fixing issue 1480 (#1485)

This commit is contained in:
Daniel Lemire 2021-03-08 19:31:42 -05:00 committed by GitHub
parent 0948573e63
commit 8e8fbc4cff
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 343 additions and 33 deletions

View File

@ -632,7 +632,12 @@ We iterate through object instances using `field` instances which represent key-
is accessible by the `value()` method whereas the key is accessible by the `key()` method.
The keys are treated differently than values are made available as as special type `raw_json_string`
which is a lightweight type that is meant to be used on a temporary basis, amost solely for
direct raw ASCII comparisons (`field.key() == "mykey"`). If you occasionally need to access and store the
direct raw ASCII comparisons: `key().raw()` provides direct access to the unescaped string.
You can compare `key()` with unescaped C strings (e.g., `key()=="test"`). It is expected
that the provided string is a valid JSON string. Importantly,
the C string must not contain an unescaped quote character (`"`). For speed, the comparison is done byte-by-byte
without handling the escaped caracters.
If you occasionally need to access and store the
unescaped key values, you may use the `unescaped_key()` method. Once you have called `unescaped_key()` method,
neither the `key()` nor the `unescaped_key()` methods should be called: the current field instance
has no longer a key (that is by design). Like other strings, the resulting `std::string_view` generated

View File

@ -30,8 +30,8 @@ public:
*/
simdjson_really_inline simdjson_warn_unused simdjson_result<std::string_view> unescaped_key() noexcept;
/**
* Get the key as a raw_json_string: this is fast and allows straight comparisons.
* We want this to be the default for most users.
* Get the key as a raw_json_string. Can be used for direct comparison with
* an unescaped C string: e.g., key() == "test".
*/
simdjson_really_inline raw_json_string key() const noexcept;
/**

View File

@ -32,6 +32,8 @@ public:
* double y = obj.find_field("y");
* double x = obj.find_field("x");
* ```
* If you have multiple fields with a matching key ({"x": 1, "x": 1}) be mindful
* that only one field is returned.
*
* **Raw Keys:** The lookup will be done against the *raw* key, and will not unescape keys.
* e.g. `object["a"]` will match `{ "a": 1 }`, but will *not* match `{ "\u0061": 1 }`.
@ -59,6 +61,9 @@ public:
* Use find_field() if you are sure fields will be in order (or are willing to treat it as if the
* field wasn't there when they aren't).
*
* If you have multiple fields with a matching key ({"x": 1, "x": 1}) be mindful
* that only one field is returned.
*
* @param key The key to look up.
* @returns The value of the field, or NO_SUCH_FIELD if the field is not in the object.
*/

View File

@ -1,4 +1,5 @@
namespace simdjson {
namespace SIMDJSON_IMPLEMENTATION {
namespace ondemand {
@ -13,25 +14,145 @@ simdjson_really_inline simdjson_warn_unused simdjson_result<std::string_view> ra
return result;
}
simdjson_really_inline bool raw_json_string::is_free_from_unescaped_quote(std::string_view target) noexcept {
size_t pos{0};
// if the content has no escape character, just scan through it quickly!
for(;pos < target.size() && target[pos] != '\\';pos++) {}
// slow path may begin.
bool escaping{false};
for(;pos < target.size();pos++) {
if((target[pos] == '"') && !escaping) {
return false;
} else if(target[pos] == '\\') {
escaping = !escaping;
} else {
escaping = false;
}
}
return true;
}
simdjson_really_inline bool raw_json_string::is_free_from_unescaped_quote(const char* target) noexcept {
size_t pos{0};
// if the content has no escape character, just scan through it quickly!
for(;target[pos] && target[pos] != '\\';pos++) {}
// slow path may begin.
bool escaping{false};
for(;target[pos];pos++) {
if((target[pos] == '"') && !escaping) {
return false;
} else if(target[pos] == '\\') {
escaping = !escaping;
} else {
escaping = false;
}
}
return true;
}
simdjson_really_inline bool raw_json_string::unsafe_is_equal(size_t length, std::string_view target) const noexcept {
// If we are going to call memcmp, then we must know something about the length of the raw_json_string.
return (length >= target.size()) && (raw()[target.size()] == '"') && !memcmp(raw(), target.data(), target.size());
}
simdjson_really_inline bool raw_json_string::unsafe_is_equal(std::string_view target) const noexcept {
// Assumptions: does not contain unescaped quote characters, and
// the raw content is quote terminated within a valid JSON string.
if(target.size() <= SIMDJSON_PADDING) {
return (raw()[target.size()] == '"') && !memcmp(raw(), target.data(), target.size());
}
const char * r{raw()};
size_t pos{0};
for(;pos < target.size();pos++) {
if(r[pos] != target[pos]) { return false; }
}
if(r[pos] != '"') { return false; }
return true;
}
simdjson_really_inline bool raw_json_string::is_equal(std::string_view target) const noexcept {
const char * r{raw()};
size_t pos{0};
bool escaping{false};
for(;pos < target.size();pos++) {
if(r[pos] != target[pos]) { return false; }
// if target is a compile-time constant and it is free from
// quotes, then the next part could get optimized away through
// inlining.
if((target[pos] == '"') && !escaping) {
// We have reached the end of the raw_json_string but
// the target is not done.
return false;
} else if(target[pos] == '\\') {
escaping = !escaping;
} else {
escaping = false;
}
}
if(r[pos] != '"') { return false; }
return true;
}
simdjson_really_inline bool raw_json_string::unsafe_is_equal(const char * target) const noexcept {
// Assumptions: 'target' does not contain unescaped quote characters, is null terminated and
// the raw content is quote terminated within a valid JSON string.
const char * r{raw()};
size_t pos{0};
for(;target[pos];pos++) {
if(r[pos] != target[pos]) { return false; }
}
if(r[pos] != '"') { return false; }
return true;
}
simdjson_really_inline bool raw_json_string::is_equal(const char* target) const noexcept {
// Assumptions: does not contain unescaped quote characters, and
// the raw content is quote terminated within a valid JSON string.
const char * r{raw()};
size_t pos{0};
bool escaping{false};
for(;target[pos];pos++) {
if(r[pos] != target[pos]) { return false; }
// if target is a compile-time constant and it is free from
// quotes, then the next part could get optimized away through
// inlining.
if((target[pos] == '"') && !escaping) {
// We have reached the end of the raw_json_string but
// the target is not done.
return false;
} else if(target[pos] == '\\') {
escaping = !escaping;
} else {
escaping = false;
}
}
if(r[pos] != '"') { return false; }
return true;
}
simdjson_unused simdjson_really_inline bool operator==(const raw_json_string &a, std::string_view c) noexcept {
return a.unsafe_is_equal(c);
}
simdjson_unused simdjson_really_inline bool operator==(std::string_view c, const raw_json_string &a) noexcept {
return a == c;
}
simdjson_unused simdjson_really_inline bool operator!=(const raw_json_string &a, std::string_view c) noexcept {
return !(a == c);
}
simdjson_unused simdjson_really_inline bool operator!=(std::string_view c, const raw_json_string &a) noexcept {
return !(a == c);
}
simdjson_really_inline simdjson_warn_unused simdjson_result<std::string_view> raw_json_string::unescape(json_iterator &iter) const noexcept {
return unescape(iter.string_buf_loc());
}
simdjson_unused simdjson_really_inline bool operator==(const raw_json_string &a, std::string_view b) noexcept {
return !memcmp(a.raw(), b.data(), b.size());
}
simdjson_unused simdjson_really_inline bool operator==(std::string_view a, const raw_json_string &b) noexcept {
return b == a;
}
simdjson_unused simdjson_really_inline bool operator!=(const raw_json_string &a, std::string_view b) noexcept {
return !(a == b);
}
simdjson_unused simdjson_really_inline bool operator!=(std::string_view a, const raw_json_string &b) noexcept {
return !(a == b);
}
simdjson_unused simdjson_really_inline std::ostream &operator<<(std::ostream &out, const raw_json_string &str) noexcept {
bool in_escape = false;

View File

@ -16,7 +16,7 @@ class json_iterator;
* JSON file.)
*
* This class is deliberately simplistic and has little functionality. You can
* compare two raw_json_string instances, or compare a raw_json_string with a string_view, but
* compare a raw_json_string instance with an unescaped C string, but
* that is pretty much all you can do.
*
* They originate typically from field instance which in turn represent key-value pairs from
@ -49,7 +49,76 @@ public:
*/
simdjson_really_inline const char * raw() const noexcept;
/**
* This compares the current instance to the std::string_view target: returns true if
* they are byte-by-byte equal (no escaping is done) on target.size() characters,
* and if the raw_json_string instance has a quote character at byte index target.size().
* We never read more than length + 1 bytes in the raw_json_string instance.
* If length is smaller than target.size(), this will return false.
*
* The std::string_view instance may contain any characters. However, the caller
* is responsible for setting length so that length bytes may be read in the
* raw_json_string.
*
* Performance: the comparison may be done using memcmp which may be efficient
* for long strings.
*/
simdjson_really_inline bool unsafe_is_equal(size_t length, std::string_view target) const noexcept;
/**
* This compares the current instance to the std::string_view target: returns true if
* they are byte-by-byte equal (no escaping is done).
* The std::string_view instance should not contain unescaped quote characters:
* the caller is responsible for this check. See is_free_from_unescaped_quote.
*
* Performance: the comparison is done byte-by-byte which might be inefficient for
* long strings.
*
* If target is a compile-time constant, and your compiler likes you,
* you should be able to do the following without performance penatly...
*
* static_assert(raw_json_string::is_free_from_unescaped_quote(target), "");
* s.unsafe_is_equal(target);
*/
simdjson_really_inline bool unsafe_is_equal(std::string_view target) const noexcept;
/**
* This compares the current instance to the C string target: returns true if
* they are byte-by-byte equal (no escaping is done).
* The provided C string should not contain an unescaped quote character:
* the caller is responsible for this check. See is_free_from_unescaped_quote.
*
* If target is a compile-time constant, and your compiler likes you,
* you should be able to do the following without performance penatly...
*
* static_assert(raw_json_string::is_free_from_unescaped_quote(target), "");
* s.unsafe_is_equal(target);
*/
simdjson_really_inline bool unsafe_is_equal(const char* target) const noexcept;
/**
* This compares the current instance to the std::string_view target: returns true if
* they are byte-by-byte equal (no escaping is done).
*/
simdjson_really_inline bool is_equal(std::string_view target) const noexcept;
/**
* This compares the current instance to the C string target: returns true if
* they are byte-by-byte equal (no escaping is done).
*/
simdjson_really_inline bool is_equal(const char* target) const noexcept;
/**
* Returns true if target is free from unescaped quote. If target is known at
* compile-time, we might expect the computation to happen at compile time with
* many compilers (not all!).
*/
static simdjson_really_inline bool is_free_from_unescaped_quote(std::string_view target) noexcept;
static simdjson_really_inline bool is_free_from_unescaped_quote(const char* target) noexcept;
private:
/**
* This will set the inner pointer to zero, effectively making
* this instance unusable.
@ -92,13 +161,18 @@ private:
friend struct simdjson_result<raw_json_string>;
};
simdjson_unused simdjson_really_inline bool operator==(const raw_json_string &a, std::string_view b) noexcept;
simdjson_unused simdjson_really_inline bool operator==(std::string_view a, const raw_json_string &b) noexcept;
simdjson_unused simdjson_really_inline bool operator!=(const raw_json_string &a, std::string_view b) noexcept;
simdjson_unused simdjson_really_inline bool operator!=(std::string_view a, const raw_json_string &b) noexcept;
simdjson_unused simdjson_really_inline std::ostream &operator<<(std::ostream &, const raw_json_string &) noexcept;
/**
* Comparisons between raw_json_string and std::string_view instances are potentially unsafe: the user is responsible
* for providing a string with no unescaped quote. Note that unescaped quotes cannot be present in valid JSON strings.
*/
simdjson_unused simdjson_really_inline bool operator==(const raw_json_string &a, std::string_view c) noexcept;
simdjson_unused simdjson_really_inline bool operator==(std::string_view c, const raw_json_string &a) noexcept;
simdjson_unused simdjson_really_inline bool operator!=(const raw_json_string &a, std::string_view c) noexcept;
simdjson_unused simdjson_really_inline bool operator!=(std::string_view c, const raw_json_string &a) noexcept;
} // namespace ondemand
} // namespace SIMDJSON_IMPLEMENTATION
} // namespace simdjson

View File

@ -222,7 +222,9 @@ public:
* double y = obj.find_field("y");
* double x = obj.find_field("x");
* ```
*
* If you have multiple fields with a matching key ({"x": 1, "x": 1}) be mindful
* that only one field is returned.
* **Raw Keys:** The lookup will be done against the *raw* key, and will not unescape keys.
* e.g. `object["a"]` will match `{ "a": 1 }`, but will *not* match `{ "\u0061": 1 }`.
*
@ -246,6 +248,9 @@ public:
* default behavior failed to look up a field just because it was in the wrong order--and many
* APIs assume this. Therefore, you must be explicit if you want to treat objects as out of order.
*
* If you have multiple fields with a matching key ({"x": 1, "x": 1}) be mindful
* that only one field is returned.
*
* Use find_field() if you are sure fields will be in order (or are willing to treat it as if the
* field wasn't there when they aren't).
*

View File

@ -56,7 +56,6 @@ simdjson_warn_unused simdjson_really_inline simdjson_result<bool> value_iterator
simdjson_warn_unused simdjson_really_inline simdjson_result<bool> value_iterator::find_field_raw(const std::string_view key) noexcept {
error_code error;
bool has_value;
//
// Initially, the object can be in one of a few different places:
//
@ -113,11 +112,19 @@ simdjson_warn_unused simdjson_really_inline simdjson_result<bool> value_iterator
while (has_value) {
// Get the key and colon, stopping at the value.
raw_json_string actual_key;
// size_t max_key_length = _json_iter->peek_length() - 2; // -2 for the two quotes
if ((error = field_key().get(actual_key) )) { abandon(); return error; };
if ((error = field_value() )) { abandon(); return error; }
if ((error = field_value() )) { abandon(); return error; }
// If it matches, stop and return
if (actual_key == key) {
// We could do it this way if we wanted to allow arbitrary
// key content (including escaped quotes).
//if (actual_key.unsafe_is_equal(max_key_length, key)) {
// Instead we do the following which may trigger buffer overruns if the
// user provides an adversarial key (containing a well placed unescaped quote
// character and being longer than the number of bytes remaining in the JSON
// input).
if (actual_key.unsafe_is_equal(key)) {
logger::log_event(*this, "match", key, -2);
return true;
}
@ -135,7 +142,6 @@ simdjson_warn_unused simdjson_really_inline simdjson_result<bool> value_iterator
simdjson_warn_unused simdjson_really_inline simdjson_result<bool> value_iterator::find_field_unordered_raw(const std::string_view key) noexcept {
error_code error;
bool has_value;
//
// Initially, the object can be in one of a few different places:
//
@ -215,11 +221,20 @@ simdjson_warn_unused simdjson_really_inline simdjson_result<bool> value_iterator
// Get the key and colon, stopping at the value.
raw_json_string actual_key;
// size_t max_key_length = _json_iter->peek_length() - 2; // -2 for the two quotes
if ((error = field_key().get(actual_key) )) { abandon(); return error; };
if ((error = field_value() )) { abandon(); return error; }
// If it matches, stop and return
if (actual_key == key) {
// We could do it this way if we wanted to allow arbitrary
// key content (including escaped quotes).
// if (actual_key.unsafe_is_equal(max_key_length, key)) {
// Instead we do the following which may trigger buffer overruns if the
// user provides an adversarial key (containing a well placed unescaped quote
// character and being longer than the number of bytes remaining in the JSON
// input).
if (actual_key.unsafe_is_equal(key)) {
logger::log_event(*this, "match", key, -2);
return true;
}
@ -243,11 +258,20 @@ simdjson_warn_unused simdjson_really_inline simdjson_result<bool> value_iterator
// Get the key and colon, stopping at the value.
raw_json_string actual_key;
// size_t max_key_length = _json_iter->peek_length() - 2; // -2 for the two quotes
error = field_key().get(actual_key); SIMDJSON_ASSUME(!error);
error = field_value(); SIMDJSON_ASSUME(!error);
// If it matches, stop and return
if (actual_key == key) {
// We could do it this way if we wanted to allow arbitrary
// key content (including escaped quotes).
// if (actual_key.unsafe_is_equal(max_key_length, key)) {
// Instead we do the following which may trigger buffer overruns if the
// user provides an adversarial key (containing a well placed unescaped quote
// character and being longer than the number of bytes remaining in the JSON
// input).
if (actual_key.unsafe_is_equal(key)) {
logger::log_event(*this, "match", key, -2);
return true;
}

View File

@ -25,7 +25,6 @@ void compilation_test_1() {
}
}
// Do not run this, it is only meant to compile
void compilation_test_2() {
const padded_string bogus = ""_padded;

View File

@ -539,6 +539,82 @@ namespace object_tests {
TEST_SUCCEED();
}
bool issue_1480() {
TEST_START();
auto json = R"({ "name" : "something", "version": "0.13.2", "version_major": 0})"_padded;
SUBTEST("ondemand::issue_1480::object", test_ondemand_doc(json, [&](auto doc_result) {
ondemand::object object;
ASSERT_SUCCESS( doc_result.get(object) );
ASSERT_EQUAL( object.find_field_unordered("version_major").get_uint64().value_unsafe(), 0 );
ASSERT_EQUAL( object.find_field_unordered("version").get_string().value_unsafe(), "0.13.2");
return true;
}));
SUBTEST("ondemand::issue_1480::big-key", test_ondemand_doc(json, [&](auto doc_result) {
ondemand::object object;
ASSERT_SUCCESS( doc_result.get(object) );
std::vector<char> large_buf(512,' ');
std::string_view lb{large_buf.data(), large_buf.size()};
ASSERT_ERROR( object.find_field(lb), NO_SUCH_FIELD );
return true;
}));
#if SIMDJSON_EXCEPTIONS
SUBTEST("ondemand::issue_1480::object-iteration", test_ondemand_doc(json, [&](auto doc_result) {
ondemand::object object;
ASSERT_SUCCESS( doc_result.get(object) );
size_t counter_version{0};
size_t counter_version_major{0};
for(auto field : object) {
std::string_view keyv = field.unescaped_key();
if(keyv == "version") { counter_version++; }
if(keyv == "version_major") { counter_version_major++; }
}
ASSERT_EQUAL( counter_version, 1 );
ASSERT_EQUAL( counter_version_major, 1 );
return true;
}));
SUBTEST("ondemand::issue_1480::object-iteration-string-view", test_ondemand_doc(json, [&](auto doc_result) {
ondemand::object object;
ASSERT_SUCCESS( doc_result.get(object) );
size_t counter_version{0};
size_t counter_version_major{0};
const char * bufv = "version";
const char * bufvm = "version_major";
std::string_view v{bufv};
std::string_view vm{bufvm};
for(auto field : object) {
std::string_view keyv = field.unescaped_key();
if(keyv == v) { counter_version++; }
if(keyv == vm) { counter_version_major++; }
}
ASSERT_EQUAL( counter_version, 1 );
ASSERT_EQUAL( counter_version_major, 1 );
return true;
}));
SUBTEST("ondemand::issue_1480::original", test_ondemand_doc(json, [&](auto doc_result) {
size_t counter{0};
for (auto field : doc_result.get_object()) {
auto key = field.key().value();
if (key == "version") { counter++; }
}
ASSERT_EQUAL( counter, 1 );
return true;
}));
SUBTEST("ondemand::issue_1480::object-operator", test_ondemand_doc(json, [&](auto doc_result) {
ondemand::object object;
ASSERT_SUCCESS( doc_result.get(object) );
ASSERT_EQUAL( std::string_view(object["version"]), "0.13.2" );
ASSERT_EQUAL( uint64_t(object["version_major"]), 0 );
return true;
}));
#endif
TEST_SUCCEED();
}
bool object_find_field_unordered() {
TEST_START();
auto json = R"({ "a": 1, "b": 2, "c/d": 3})"_padded;
@ -833,6 +909,7 @@ namespace object_tests {
value_nested_object_index() &&
iterate_object_partial_children() &&
object_index_partial_children() &&
issue_1480() &&
#if SIMDJSON_EXCEPTIONS
iterate_object_exception() &&
object_index_exception() &&