Reenable the on-demand tests and allows us to convert a raw string into a C++ string. (#1232)

* Reenable the on-demand tests and allows us to convert a raw string into a C++ string.

* Fixing a 1-byte buffer overrun.

* More documentation.

* Adding more tests.

* Enabling the new tests

* Committing a nicer example.

* Not yet happy but this should fix our failures.

* Duh.

* Ok. Making it easier to get string_view instances from field instances.

* It is a struct.

* Trying to satisfy VS.

* Adopting John's name.
This commit is contained in:
Daniel Lemire 2020-10-19 20:22:24 -04:00 committed by GitHub
parent 3e8e797bc2
commit 0d6919dd99
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 203 additions and 13 deletions

View File

@ -209,15 +209,25 @@ jobs:
# make (test and checkperf)
arch-haswell-gcc10:
description: Build, run tests and check performance on GCC 7 with -march=haswell
description: Build, run tests and check performance on GCC 10 with -march=haswell
executor: gcc10
environment: { CXXFLAGS: -march=haswell }
steps: [ cmake_test ]
arch-nehalem-gcc10:
description: Build, run tests and check performance on GCC 7 with -march=nehalem
description: Build, run tests and check performance on GCC 10 with -march=nehalem
executor: gcc10
environment: { CXXFLAGS: -march=nehalem }
steps: [ cmake_test ]
sanitize-haswell-gcc10:
description: Build and run tests on GCC 10 and AVX 2 with a cmake sanitize build
executor: gcc10
environment: { CXXFLAGS: -march=haswell, CMAKE_FLAGS: -DSIMDJSON_BUILD_STATIC=OFF -DSIMDJSON_SANITIZE=ON, BUILD_FLAGS: "", CTEST_FLAGS: --output-on-failure -E checkperf }
steps: [ cmake_test ]
sanitize-haswell-clang10:
description: Build and run tests on clang 10 and AVX 2 with a cmake sanitize build
executor: clang10
environment: { CXXFLAGS: -march=haswell, CMAKE_FLAGS: -DSIMDJSON_BUILD_STATIC=OFF -DSIMDJSON_SANITIZE=ON, CTEST_FLAGS: --output-on-failure -E checkperf }
steps: [ cmake_test ]
workflows:
version: 2.1
@ -248,6 +258,11 @@ workflows:
- arch-haswell-gcc10
- arch-nehalem-gcc10
# sanitized single-implementation tests
- sanitize-haswell-gcc10
- sanitize-haswell-clang10
# testing "just the library"
- justlib-gcc10

View File

@ -447,6 +447,44 @@ When the user requests strings, we unescape them to a single string buffer much
so that users enjoy the same string performance as the core simdjson. We do not write the length to the
string buffer, however; that is stored in the `string_view` instance we return to the user.
```C++
ondemand::parser parser;
auto doc = parser.iterate(json);
std::set<std::string_view> default_users;
ondemand::array tweets = doc["statuses"].get_array();
for (auto tweet_value : tweets) {
auto tweet = tweet_value.get_object();
ondemand::object user = tweet["user"].get_object();
std::string_view screen_name = user["screen_name"].get_string();
bool default_profile = user["default_profile"].get_bool();
if (default_profile) { default_users.insert(screen_name); }
}
```
By using `string_view` instances, we avoid the high cost of allocating many small strings (as would be the
case with `std::string`) but be mindful that the life cycle of these `string_view` instances is tied to the
parser instance. If the parser instance is destroyed or reused for a new JSON document, these strings are no longer valid.
We iterate through object instances using `field` instances which represent key-value pairs. The value
is accessible by the `value()` method whereas the key is accessible by the `key()` method.
Keys are treated differently from values: they are made available as a special type, `raw_json_string`,
which is a lightweight type that is meant to be used on a temporary basis, almost solely for
direct raw ASCII comparisons (`field.key() == "mykey"`). If you occasionally need to access and store the
unescaped key values, you may use the `unescaped_key()` method. Once you have called the `unescaped_key()` method,
neither the `key()` nor the `unescaped_key()` method should be called again: the current field instance
no longer has a key (that is by design). Like other strings, the resulting `std::string_view` generated
from the `unescaped_key()` method has a lifecycle tied to the `parser` instance: once the parser
is destroyed or reused with another document, the `std::string_view` instance becomes invalid.
```C++
auto doc = parser.iterate(json);
for(auto field : doc.get_object()) {
std::string_view keyv = field.unescaped_key();
}
```
### Iteration Safety

View File

@ -21,7 +21,15 @@ simdjson_really_inline simdjson_result<field> field::start(json_iterator_ref &&i
return field(key, value::start(std::forward<json_iterator_ref>(iter)));
}
/**
 * Produce the unescaped form of this field's key as a string_view.
 *
 * The key is consumed in the process: the stored raw_json_string is nulled out
 * before returning, so neither key() nor unescaped_key() may be called on this
 * field afterwards.
 */
simdjson_really_inline simdjson_warn_unused simdjson_result<std::string_view> field::unescaped_key() noexcept {
  // We would like to call first.alive() here, but Visual Studio won't let us.
  SIMDJSON_ASSUME(first.buf != nullptr);
  simdjson_result<std::string_view> unescaped_view = first.unescape(second.get_iterator());
  // Invalidate the stored key now that it has been handed out in unescaped form.
  first.consume();
  return unescaped_view;
}
/**
 * Return this field's key as a raw_json_string (no unescaping is performed).
 *
 * Must not be called after unescaped_key(), which consumes the key and leaves
 * the stored pointer null.
 */
simdjson_really_inline raw_json_string field::key() const noexcept {
SIMDJSON_ASSUME(first.buf != nullptr); // We would like to call .alive() but Visual Studio won't let us.
return first;
}
@ -58,6 +66,10 @@ simdjson_really_inline simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::raw_js
if (error()) { return error(); }
return first.key();
}
/**
 * Forward unescaped_key() to the wrapped field, or propagate the stored error
 * if this result holds one.
 */
simdjson_really_inline simdjson_result<std::string_view> simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::field>::unescaped_key() noexcept {
if (error()) { return error(); } // An errored result has no field to query.
return first.unescaped_key();
}
simdjson_really_inline simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::value> simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::field>::value() noexcept {
if (error()) { return error(); }
return std::move(first.value());

View File

@ -26,7 +26,17 @@ public:
simdjson_really_inline field &operator=(const field &other) noexcept = delete;
/**
* Get the key.
* Get the key as a string_view (for higher speed, consider key(), which returns a raw_json_string).
* We deliberately use a more cumbersome name (unescaped_key) to force users
* to think twice about using it.
*
* This consumes the key: once you have called unescaped_key(), you cannot
* call it again nor can you call key().
*/
simdjson_really_inline simdjson_warn_unused simdjson_result<std::string_view> unescaped_key() noexcept;
/**
* Get the key as a raw_json_string: this is fast and allows straight comparisons.
* We want this to be the default for most users.
*/
simdjson_really_inline raw_json_string key() const noexcept;
/**
@ -62,6 +72,7 @@ public:
simdjson_really_inline simdjson_result(simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::field> &&a) noexcept = default;
simdjson_really_inline ~simdjson_result() noexcept = default; ///< @private
simdjson_really_inline simdjson_result<std::string_view> unescaped_key() noexcept;
simdjson_really_inline simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::raw_json_string> key() noexcept;
simdjson_really_inline simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::value> value() noexcept;
};

View File

@ -8,9 +8,8 @@ simdjson_warn_unused simdjson_really_inline error_code parser::allocate(size_t n
// string_capacity copied from document::allocate
_capacity = 0;
_max_depth = 0;
// The most string buffer we could possibly need is capacity-2 (a string the whole document long).
// Allocate up to capacity so we don't have to check for capacity == 0 or 1.
string_buf.reset(new (std::nothrow) uint8_t[new_capacity]);
size_t string_capacity = SIMDJSON_ROUNDUP_N(5 * new_capacity / 3 + SIMDJSON_PADDING, 64);
string_buf.reset(new (std::nothrow) uint8_t[string_capacity]);
SIMDJSON_TRY( dom_parser.set_capacity(new_capacity) );
SIMDJSON_TRY( dom_parser.set_max_depth(DEFAULT_MAX_DEPTH) );
_capacity = new_capacity;

View File

@ -8,10 +8,20 @@ class object;
class parser;
/**
* A string escaped per JSON rules, terminated with quote (")
* A string escaped per JSON rules, terminated with quote ("). Instances are used to represent
* keys inside JSON documents in their raw (still escaped) form.
*
* (In other words, a pointer to the beginning of a string, just after the start quote, inside a
* JSON file.)
*
* This class is deliberately simplistic and has little functionality. You can
* compare two raw_json_string instances, or compare a raw_json_string with a string_view, but
* that is pretty much all you can do.
*
* They typically originate from field instances, which in turn represent key-value pairs from
* object instances. From a field instance, you get the raw_json_string instance by calling key().
* If you want a more usable string_view instance, you can call the unescaped_key() method
* on the field instance.
*/
class raw_json_string {
public:
@ -35,8 +45,24 @@ public:
simdjson_really_inline raw_json_string(const uint8_t * _buf) noexcept;
/**
* Get the raw pointer to the beginning of the string in the JSON (just after the ").
*
* It is possible for this function to return a null pointer if the instance
* is no longer alive (that is, if it has been consumed).
*/
simdjson_really_inline const char * raw() const noexcept;
private:
/**
* Set the inner pointer to nullptr, effectively making
* this instance unusable: alive() returns false afterwards.
*/
simdjson_really_inline void consume() noexcept { buf = nullptr; }
/**
* Checks whether the inner pointer is non-null and thus usable.
* Returns false for an instance whose pointer is null, e.g. after consume().
*/
simdjson_really_inline simdjson_warn_unused bool alive() const noexcept { return buf != nullptr; }
/**
* Unescape this JSON string, replacing \\ with \, \n with newline, etc.
*
@ -62,9 +88,10 @@ public:
*/
simdjson_really_inline simdjson_warn_unused simdjson_result<std::string_view> unescape(json_iterator &iter) const noexcept;
private:
const uint8_t * buf{};
friend class object;
friend class field;
friend struct simdjson_result<raw_json_string>;
};
simdjson_unused simdjson_really_inline bool operator==(const raw_json_string &a, std::string_view b) noexcept;

View File

@ -73,6 +73,10 @@ simdjson_really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr,
return offset > 0;
}
/**
* Unescape a string from src to dst, stopping at a final unescaped quote. E.g., if src points at 'joe"', then
* dst needs to have four free bytes.
*/
simdjson_warn_unused simdjson_really_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst) {
while (1) {
// Copy the next n bytes, and find the backslash and quote in them.

View File

@ -15,6 +15,7 @@
#include "simdjson.h"
#include "test_ondemand.h"
// const size_t AMAZON_CELLPHONES_NDJSON_DOC_COUNT = 793;
#define SIMDJSON_SHOW_DEFINE(x) printf("%s=%s\n", #x, STRINGIFY(x))
@ -41,6 +42,42 @@ void compilation_test_1() {
}
}
}
// Do not run this, it is only meant to compile.
// Exercises the On Demand API on an empty document: walks the "statuses" array and
// collects the screen_name of every user whose default_profile flag is true.
void compilation_test_2() {
const padded_string bogus = ""_padded;
ondemand::parser parser;
auto doc = parser.iterate(bogus);
// The string_view instances are stored as-is; no std::string copies are made.
std::set<std::string_view> default_users;
ondemand::array tweets = doc["statuses"].get_array();
for (auto tweet_value : tweets) {
auto tweet = tweet_value.get_object();
ondemand::object user = tweet["user"].get_object();
std::string_view screen_name = user["screen_name"].get_string();
bool default_profile = user["default_profile"].get_bool();
if (default_profile) { default_users.insert(screen_name); }
}
}
// Do not run this, it is only meant to compile
void compilation_test_3() {
const padded_string bogus = ""_padded;
ondemand::parser parser;
auto doc = parser.iterate(bogus);
ondemand::array tweets;
if(! doc["statuses"].get(tweets)) { return; }
for (auto tweet_value : tweets) {
auto tweet = tweet_value.get_object();
for (auto field : tweet) {
std::string_view key = field.unescaped_key().value();
std::cout << "key = " << key << std::endl;
std::string_view val = std::string_view(field.value());
std::cout << "value (assuming it is a string) = " << val << std::endl;
}
}
}
#endif
#define ONDEMAND_SUBTEST(NAME, JSON, TEST) \
@ -53,6 +90,32 @@ void compilation_test_1() {
} \
}
namespace key_string_tests {
#if SIMDJSON_EXCEPTIONS
  // Parses an object in which every key equals its string value (including one pair
  // written with a \u escape) and verifies that the unescaped key matches the value.
  bool parser_key_value() {
    TEST_START();
    ondemand::parser od_parser;
    const padded_string input = R"({ "1": "1", "2": "2", "3": "3", "abc": "abc", "\u0075": "\u0075" })"_padded;
    auto document = od_parser.iterate(input);
    for (auto entry : document.get_object()) {
      // Read the key first: unescaped_key() consumes it, then the value is fetched.
      std::string_view unescaped = entry.unescaped_key();
      std::string_view expected = entry.value();
      if (unescaped == expected) { continue; }
      return false;
    }
    return true;
  }
#endif

  // Runs every test of this namespace; tests that need exceptions are skipped when
  // SIMDJSON_EXCEPTIONS is off.
  bool run() {
#if SIMDJSON_EXCEPTIONS
    if (!parser_key_value()) { return false; }
#endif
    return true;
  }
}
namespace number_tests {
// ulp distance
@ -866,10 +929,30 @@ namespace twitter_tests {
auto media = entities["media"];
if (media.error() == SUCCESS) {
for (ondemand::object image : media) {
/**
* Fun fact: id and id_str can differ:
* 505866668485386240 and 505866668485386241.
* Presumably, it is because doubles are used
* at some point in the process and the number
* 505866668485386241 cannot be represented as a double.
* (not our fault)
*/
uint64_t id_val = image["id"].get_uint64();
std::cout << "id = " <<id_val << std::endl;
auto id_string = std::string_view(image["id_str"].value());
std::cout << "id_string = " << id_string << std::endl;
auto sizes = image["sizes"].get_object();
for (auto size : sizes) {
/**
* We want to know the key that describes the size.
*/
std::string_view raw_size_key_v = size.unescaped_key().value();
std::cout << "Type of image size = " << raw_size_key_v << std::endl;
ondemand::object size_value = size.value();
image_sizes.insert(make_pair(size_value["w"], size_value["h"]));
int64_t width = size_value["w"];
int64_t height = size_value["h"];
std::cout << width << " x " << height << std::endl;
image_sizes.insert(make_pair(width, height));
}
}
}
@ -1346,12 +1429,13 @@ int main(int argc, char *argv[]) {
std::cout << "Running basic tests." << std::endl;
if (
// parse_api_tests::run() &&
// dom_api_tests::run() &&
// twitter_tests::run() &&
// number_tests::run() &&
parse_api_tests::run() &&
dom_api_tests::run() &&
twitter_tests::run() &&
number_tests::run() &&
error_tests::run() &&
ordering_tests::run() &&
key_string_tests::run() &&
true
) {
std::cout << "Basic tests are ok." << std::endl;