Adding test for issue 1133 and improving documentation (#1134)

* Adding test.

* Saving.

* With exceptions.

* Added extensive tests.

* Better documentation.

* Tweaking CI

* Cleaning.

* Do not assume make.

* Let us make the build verbose

* Reorg

* I do not understand how circle ci works.

* Breaking it up.

* Better syntax.
Daniel Lemire 2020-08-20 14:03:14 -04:00 committed by GitHub
parent 5d355f1a8b
commit 3316df9195
6 changed files with 193 additions and 27 deletions

View File

@ -1,5 +1,8 @@
version: 2.1
# We constantly run out of memory so please do not use parallelism (-j, -j4).
# Reusable image / compiler definitions
executors:
gcc8:
@ -8,8 +11,8 @@ executors:
environment:
CXX: g++-8
CC: gcc-8
BUILD_FLAGS: -j
CTEST_FLAGS: -j4 --output-on-failure
BUILD_FLAGS:
CTEST_FLAGS: --output-on-failure
gcc9:
docker:
@ -17,8 +20,8 @@ executors:
environment:
CXX: g++-9
CC: gcc-9
BUILD_FLAGS: -j
CTEST_FLAGS: -j4 --output-on-failure
BUILD_FLAGS:
CTEST_FLAGS: --output-on-failure
gcc10:
docker:
@ -26,8 +29,8 @@ executors:
environment:
CXX: g++-10
CC: gcc-10
BUILD_FLAGS: -j
CTEST_FLAGS: -j4 --output-on-failure
BUILD_FLAGS:
CTEST_FLAGS: --output-on-failure
clang10:
docker:
@ -35,8 +38,8 @@ executors:
environment:
CXX: clang++-10
CC: clang-10
BUILD_FLAGS: -j
CTEST_FLAGS: -j4 --output-on-failure
BUILD_FLAGS:
CTEST_FLAGS: --output-on-failure
clang9:
docker:
@ -44,8 +47,8 @@ executors:
environment:
CXX: clang++-9
CC: clang-9
BUILD_FLAGS: -j
CTEST_FLAGS: -j4 --output-on-failure
BUILD_FLAGS:
CTEST_FLAGS: --output-on-failure
clang6:
docker:
@ -53,8 +56,8 @@ executors:
environment:
CXX: clang++-6.0
CC: clang-6.0
BUILD_FLAGS: -j
CTEST_FLAGS: -j4 --output-on-failure
BUILD_FLAGS:
CTEST_FLAGS: --output-on-failure
# Reusable test commands (and initializer for clang 6)
commands:
@ -68,13 +71,15 @@ commands:
- checkout
- run: mkdir -p build
cmake_build:
cmake_build_cache:
steps:
- cmake_prep
- run: |
cd build &&
cmake $CMAKE_FLAGS -DCMAKE_INSTALL_PREFIX:PATH=destination .. &&
make $BUILD_FLAGS all
- run: cmake $CMAKE_FLAGS -DCMAKE_INSTALL_PREFIX:PATH=destination -B build .
cmake_build:
steps:
- cmake_build_cache
- run: cmake --build build
cmake_test:
steps:
@ -138,12 +143,12 @@ jobs:
sanitize-gcc10:
description: Build and run tests on GCC 10 and AVX 2 with a cmake sanitize build
executor: gcc10
environment: { CMAKE_FLAGS: -DSIMDJSON_BUILD_STATIC=OFF -DSIMDJSON_SANITIZE=ON, BUILD_FLAGS: "", CTEST_FLAGS: -j4 --output-on-failure -E checkperf }
environment: { CMAKE_FLAGS: -DSIMDJSON_BUILD_STATIC=OFF -DSIMDJSON_SANITIZE=ON, BUILD_FLAGS: "", CTEST_FLAGS: --output-on-failure -E checkperf }
steps: [ cmake_test ]
sanitize-clang10:
description: Build and run tests on clang 10 and AVX 2 with a cmake sanitize build
executor: clang10
environment: { CMAKE_FLAGS: -DSIMDJSON_BUILD_STATIC=OFF -DSIMDJSON_SANITIZE=ON, CTEST_FLAGS: -j4 --output-on-failure -E checkperf }
environment: { CMAKE_FLAGS: -DSIMDJSON_BUILD_STATIC=OFF -DSIMDJSON_SANITIZE=ON, CTEST_FLAGS: --output-on-failure -E checkperf }
steps: [ cmake_test ]
# dynamic

View File

@ -592,14 +592,37 @@ Here is a simple example, given "x.json" with this content:
```c++
dom::parser parser;
dom::document_stream docs = parser.load_many(filename);
dom::document_stream docs = parser.load_many("x.json");
for (dom::element doc : docs) {
cout << doc["foo"] << endl;
}
// Prints 1 2 3
```
In-memory ndjson strings can be parsed as well, with `parser.parse_many(string)`.
In-memory ndjson strings can be parsed as well, with `parser.parse_many(string)`:
```c++
dom::parser parser;
auto json = R"({ "foo": 1 }
{ "foo": 2 }
{ "foo": 3 })"_padded;
dom::document_stream docs = parser.parse_many(json);
for (dom::element doc : docs) {
cout << doc["foo"] << endl;
}
// Prints 1 2 3
```
Unlike `parser.parse`, both `parser.load_many(filename)` and `parser.parse_many(string)` may parse
"on demand" (lazily). That is, no parsing may have been done before you enter the loop
`for (dom::element doc : docs) {` and you should expect the parser to only ever fully parse one JSON
document at a time.
1. When calling `parser.load_many(filename)`, the file's content is loaded into a memory buffer owned by the `parser` instance. Thus the file can be safely deleted after calling `parser.load_many(filename)`, as the parser instance owns all of the data.
2. When calling `parser.parse_many(string)`, no copy is made of the provided string input. The provided memory buffer may be accessed each time a JSON document is parsed. Calling `parser.parse_many(string)` on a temporary string buffer (e.g., `docs = parser.parse_many("[1,2,3]"_padded)`) is unsafe because the `document_stream` instance needs access to the buffer to return the JSON documents. In contrast, calling `doc = parser.parse("[1,2,3]"_padded)` is safe because `parser.parse` eagerly parses the input.
Both `load_many` and `parse_many` take an optional `size_t batch_size` parameter which defines the window processing size. It defaults to a large value (`1000000`, corresponding to 1 MB). None of your JSON documents should exceed this window size, or else you will get the error `simdjson::CAPACITY`; the same error results if you set the window size larger than 4 GB. The smaller the window, the less memory the function uses. Setting the window size too small (e.g., less than 100 kB) may also hurt performance. Leaving it at 1 MB is expected to be a good choice, unless you have some larger documents.

View File

@ -564,7 +564,7 @@ than 4GB), though each individual document must be no larger than 4 GB.
Here is a simple example, given "x.json" with this content:
```json
```
{ "foo": 1 }
{ "foo": 2 }
{ "foo": 3 }
@ -572,14 +572,37 @@ Here is a simple example, given "x.json" with this content:
```
dom::parser parser;
dom::document_stream docs = parser.load_many(filename);
dom::document_stream docs = parser.load_many("x.json");
for (dom::element doc : docs) {
cout << doc["foo"] << endl;
}
// Prints 1 2 3
```
In-memory ndjson strings can be parsed as well, with `parser.parse_many(string)`.
In-memory ndjson strings can be parsed as well, with `parser.parse_many(string)`:
```
dom::parser parser;
auto json = R"({ "foo": 1 }
{ "foo": 2 }
{ "foo": 3 })"_padded;
dom::document_stream docs = parser.parse_many(json);
for (dom::element doc : docs) {
cout << doc["foo"] << endl;
}
// Prints 1 2 3
```
Unlike `parser.parse`, both `parser.load_many(filename)` and `parser.parse_many(string)` may parse
"on demand" (lazily). That is, no parsing may have been done before you enter the loop
`for (dom::element doc : docs) {` and you should expect the parser to only ever fully parse one JSON
document at a time.
1. When calling `parser.load_many(filename)`, the file's content is loaded into a memory buffer owned by the `parser` instance. Thus the file can be safely deleted after calling `parser.load_many(filename)`, as the parser instance owns all of the data.
2. When calling `parser.parse_many(string)`, no copy is made of the provided string input. The provided memory buffer may be accessed each time a JSON document is parsed. Calling `parser.parse_many(string)` on a temporary string buffer (e.g., `docs = parser.parse_many("[1,2,3]"_padded)`) is unsafe because the `document_stream` instance needs access to the buffer to return the JSON documents. In contrast, calling `doc = parser.parse("[1,2,3]"_padded)` is safe because `parser.parse` eagerly parses the input.
Both `load_many` and `parse_many` take an optional `size_t batch_size` parameter which defines the window processing size. It defaults to a large value (`1000000`, corresponding to 1 MB). None of your JSON documents should exceed this window size, or else you will get the error `simdjson::CAPACITY`; the same error results if you set the window size larger than 4 GB. The smaller the window, the less memory the function uses. Setting the window size too small (e.g., less than 100 kB) may also hurt performance. Leaving it at 1 MB is expected to be a good choice, unless you have some larger documents.

View File

@ -70,7 +70,10 @@ public:
*
* dom::parser parser;
* const element doc = parser.load("jsonexamples/twitter.json");
*
*
* The function is eager: the file's content is loaded in memory inside the parser instance
* and immediately parsed. The file can be deleted after the `parser.load` call.
*
* ### IMPORTANT: Document Lifetime
*
* The JSON document still lives in the parser: this is the most efficient way to parse JSON
@ -96,6 +99,9 @@ public:
*
* dom::parser parser;
* element doc = parser.parse(buf, len);
*
* The function eagerly parses the input: the input can be modified and discarded after
* the `parser.parse(buf, len)` call has completed.
*
* ### IMPORTANT: Document Lifetime
*
@ -149,6 +155,13 @@ public:
* cout << std::string(doc["title"]) << endl;
* }
*
* The file is loaded in memory and can be safely deleted after the `parser.load_many(path)`
* function has returned. The memory is held by the `parser` instance.
*
* The function is lazy: it may be that no more than one JSON document at a time is parsed.
* And, possibly, no document may have been parsed when the `parser.load_many(path)` function
* returned.
*
* ### Format
*
* The file must contain a series of one or more JSON documents, concatenated into a single
@ -156,7 +169,7 @@ public:
* then starts parsing the next document at that point. (It does this with more parallelism and
* lookahead than you might think, though.)
*
* documents that consist of an object or array may omit the whitespace between them, concatenating
* Documents that consist of an object or array may omit the whitespace between them, concatenating
with no separator. Documents that consist of a single primitive (i.e. documents that are not
* arrays or objects) MUST be separated with whitespace.
*
@ -213,6 +226,30 @@ public:
* cout << std::string(doc["title"]) << endl;
* }
*
* No copy of the input buffer is made.
*
* The function is lazy: it may be that no more than one JSON document at a time is parsed.
* And, possibly, no document may have been parsed when the `parser.parse_many(string)` function
* returned.
*
* The caller is responsible for ensuring that the input string data remains unchanged and is
* not deleted during the loop. In particular, the following is unsafe:
*
* auto docs = parser.parse_many("[\"temporary data\"]"_padded);
* // here the string "[\"temporary data\"]" may no longer exist in memory
* // the parser instance may not have even accessed the input yet
* for (element doc : docs) {
* cout << std::string(doc["title"]) << endl;
* }
*
* The following is safe:
*
* auto json = "[\"temporary data\"]"_padded;
* auto docs = parser.parse_many(json);
* for (element doc : docs) {
* cout << std::string(doc["title"]) << endl;
* }
*
* ### Format
*
* The buffer must contain a series of one or more JSON documents, concatenated into a single

View File

@ -69,6 +69,69 @@ namespace document_stream_tests {
}
return true;
}
bool single_document() {
std::cout << "Running " << __func__ << std::endl;
simdjson::dom::parser parser;
auto json = R"({"hello": "world"})"_padded;
simdjson::dom::document_stream stream;
ASSERT_SUCCESS(parser.parse_many(json).get(stream));
size_t count = 0;
for (auto doc : stream) {
if(doc.error()) {
std::cerr << "Unexpected error: " << doc.error() << std::endl;
return false;
}
std::string expected = R"({"hello":"world"})";
simdjson::dom::element this_document;
ASSERT_SUCCESS(doc.get(this_document));
std::string answer = simdjson::minify(this_document);
if(answer != expected) {
std::cout << this_document << std::endl;
return false;
}
count += 1;
}
return count == 1;
}
#if SIMDJSON_EXCEPTIONS
bool single_document_exceptions() {
std::cout << "Running " << __func__ << std::endl;
simdjson::dom::parser parser;
auto json = R"({"hello": "world"})"_padded;
size_t count = 0;
for (simdjson::dom::element doc : parser.parse_many(json)) {
std::string expected = R"({"hello":"world"})";
std::string answer = simdjson::minify(doc);
if(answer != expected) {
std::cout << "got : " << answer << std::endl;
std::cout << "expected: " << expected << std::endl;
return false;
}
count += 1;
}
return count == 1;
}
bool issue1133() {
std::cout << "Running " << __func__ << std::endl;
simdjson::dom::parser parser;
auto json = "{\"hello\": \"world\"}"_padded;
simdjson::dom::document_stream docs = parser.parse_many(json);
size_t count = 0;
for (simdjson::dom::element doc : docs) {
std::string expected = R"({"hello":"world"})";
std::string answer = simdjson::minify(doc);
if(answer != expected) {
std::cout << "got : " << answer << std::endl;
std::cout << "expected: " << expected << std::endl;
return false;
}
count += 1;
}
return count == 1;
}
#endif
bool small_window() {
std::cout << "Running " << __func__ << std::endl;
@ -247,7 +310,12 @@ namespace document_stream_tests {
}
bool run() {
return test_current_index() &&
return test_current_index() &&
single_document() &&
#if SIMDJSON_EXCEPTIONS
single_document_exceptions() &&
issue1133() &&
#endif
#ifdef SIMDJSON_THREADS_ENABLED
threaded_disabled() &&
#endif

View File

@ -179,6 +179,16 @@ void basics_ndjson() {
// Prints 1 2 3
}
void basics_ndjson_parse_many() {
dom::parser parser;
auto json = R"({ "foo": 1 }
{ "foo": 2 }
{ "foo": 3 })"_padded;
dom::document_stream docs = parser.parse_many(json);
for (dom::element doc : docs) {
cout << doc["foo"] << endl;
}
}
void implementation_selection_1() {
cout << "simdjson v" << STRINGIFY(SIMDJSON_VERSION) << endl;
cout << "Detected the best implementation for your machine: " << simdjson::active_implementation->name();