Documenting and fixing the case where a string is immediately followed by a scalar (#1106)

* Documenting and fixing.

* More cleaning.

* Being a bit cleaner.
This commit is contained in:
Daniel Lemire 2020-08-14 16:19:57 -04:00 committed by GitHub
parent bee4d7a12b
commit 17f6d5208f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 74 additions and 18 deletions

1
jsonchecker/fail81.json Normal file
View File

@ -0,0 +1 @@
""n

View File

@ -4,10 +4,27 @@ namespace stage1 {
/**
* A block of scanned json, with information on operators and scalars.
*
* We seek to identify pseudo-structural characters. Anything that is inside
* a string must be omitted (hence & ~_string.string_tail()).
* Otherwise, pseudo-structural characters come in two forms.
* 1. We have the structural characters ([,],{,},:, comma). The
* term 'structural character' is from the JSON RFC.
* 2. We have the 'scalar pseudo-structural characters'.
* Scalars are quotes, and any character except structural characters and white space.
*
* To identify the scalar pseudo-structural characters, we must look at what comes
* before them: it must be a space, a quote or a structural character.
* Starting with simdjson v0.3, we identify them by
* negation: we identify everything that follows a non-quote scalar,
* and we negate that. Whatever remains must be a 'scalar pseudo-structural character'.
*/
struct json_block {
public:
/** The start of structurals */
/**
* The start of structurals.
* In simdjson prior to v0.3, these were called the pseudo-structural characters.
**/
really_inline uint64_t structural_start() { return potential_structural_start() & ~_string.string_tail(); }
/** All JSON whitespace (i.e. not in a string) */
really_inline uint64_t whitespace() { return non_quote_outside_string(_characters.whitespace()); }
@ -21,27 +38,49 @@ public:
// string and escape characters
json_string_block _string;
// whitespace, operators, scalars
// whitespace, structural characters ('operators'), scalars
json_character_block _characters;
// whether the previous character was a scalar
uint64_t _follows_potential_scalar;
uint64_t _follows_potential_nonquote_scalar;
private:
// Potential structurals (i.e. disregarding strings)
/** operators plus scalar starts like 123, true and "abc" */
/**
* structural elements ([,],{,},:, comma) plus scalar starts like 123, true and "abc".
* They may reside inside a string.
**/
really_inline uint64_t potential_structural_start() { return _characters.op() | potential_scalar_start(); }
/** the start of non-operator runs, like 123, true and "abc" */
really_inline uint64_t potential_scalar_start() { return _characters.scalar() & ~follows_potential_scalar(); }
/** whether the given character is immediately after a non-operator like 123, true or " */
really_inline uint64_t follows_potential_scalar() { return _follows_potential_scalar; }
/**
* The start of non-operator runs, like 123, true and "abc".
* It may reside inside a string.
**/
really_inline uint64_t potential_scalar_start() {
// The term "scalar" refers to anything except structural characters and white space
// (so letters, numbers, quotes).
// Whenever it is preceded by something that is neither a structural element ({,},[,],:, comma),
// nor a quote ("), nor a white-space, then we know that it is irrelevant structurally.
return _characters.scalar() & ~follows_potential_scalar();
}
/**
* Whether the given character is immediately after a non-operator like 123, true.
* The characters following a quote are not included.
* Returns the stored _follows_potential_nonquote_scalar value unchanged.
*/
really_inline uint64_t follows_potential_scalar() {
// _follows_potential_nonquote_scalar is defined as marking any character that follows a character
// that is neither a structural element ({,},[,],:, comma), nor a quote ("), nor a
// white space.
// It is understood that within a quoted region, anything at all could be marked (irrelevant).
return _follows_potential_nonquote_scalar;
}
};
/**
* Scans JSON for important bits: operators, strings, and scalars.
* Scans JSON for important bits: structural characters or 'operators', strings, and scalars.
*
* The scanner starts by calculating two distinct things:
* - string characters (taking \" into account)
* - operators ([]{},:) and scalars (runs of non-operators like 123, true and "abc")
* - structural characters or 'operators' ([]{},:, comma)
* and scalars (runs of non-operators like 123, true and "abc")
*
* To minimize data dependency (a key component of the scanner's speed), it finds these in parallel:
* in particular, the operator/scalar bit will find plenty of things that are actually part of
@ -56,7 +95,7 @@ public:
private:
// Whether the last character of the previous iteration is part of a scalar token
// (anything except whitespace or an operator).
// (anything except whitespace, a structural character/'operator', or a quote).
uint64_t prev_scalar = 0ULL;
json_string_scanner string_scanner{};
};
@ -77,12 +116,24 @@ really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t>& in) {
json_string_block strings = string_scanner.next(in);
// identifies the white-space and the structural characters
json_character_block characters = json_character_block::classify(in);
uint64_t follows_scalar = follows(characters.scalar(), prev_scalar);
// The term "scalar" refers to anything except structural characters and white space
// (so letters, numbers, quotes).
// We want follows_scalar to mark anything that follows a non-quote scalar (so letters and numbers).
//
// A terminal quote should either be followed by a structural character (comma, brace, bracket, colon)
// or nothing. However, we still want ' "a string"true ' to mark the 't' of 'true' as a potential
// pseudo-structural character just like we would if we had ' "a string" true '; otherwise we
// may need to add an extra check when parsing strings.
//
// Performance: there are many ways to skin this cat.
const uint64_t nonquote_scalar = characters.scalar() & ~strings.quote();
uint64_t follows_nonquote_scalar = follows(nonquote_scalar, prev_scalar);
return {
strings,
characters,
follows_scalar
follows_nonquote_scalar
};
}

View File

@ -10,9 +10,9 @@ struct json_string_block {
// Real (non-backslashed) quotes
really_inline uint64_t quote() const { return _quote; }
// Start quotes of strings
really_inline uint64_t string_end() const { return _quote & _in_string; }
really_inline uint64_t string_start() const { return _quote & _in_string; }
// End quotes of strings
really_inline uint64_t string_start() const { return _quote & ~_in_string; }
really_inline uint64_t string_end() const { return _quote & ~_in_string; }
// Only characters inside the string (not including the quotes)
really_inline uint64_t string_content() const { return _in_string & ~_quote; }
// Return a mask of whether the given characters are inside a string (only works on non-quotes)

View File

@ -13,15 +13,19 @@ using namespace simd;
struct json_character_block {
static really_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
// ASCII white-space ('\r','\n','\t',' ')
really_inline uint64_t whitespace() const { return _whitespace; }
// non-quote structural characters (comma, colon, braces, brackets)
really_inline uint64_t op() const { return _op; }
// neither a structural character nor a white-space, so letters, numbers and quotes
really_inline uint64_t scalar() { return ~(op() | whitespace()); }
uint64_t _whitespace;
uint64_t _op;
uint64_t _whitespace; // ASCII white-space ('\r','\n','\t',' ')
uint64_t _op; // structural characters (comma, colon, braces, brackets but not quotes)
};
// This identifies structural characters (comma, colon, braces, brackets),
// and ASCII white-space ('\r','\n','\t',' ').
really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
// These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why
// we can't use the generic lookup_16.