diff --git a/jsonchecker/fail81.json b/jsonchecker/fail81.json new file mode 100644 index 00000000..ee461671 --- /dev/null +++ b/jsonchecker/fail81.json @@ -0,0 +1 @@ +""n diff --git a/src/generic/stage1/json_scanner.h b/src/generic/stage1/json_scanner.h index 0d6e68f0..170369b9 100644 --- a/src/generic/stage1/json_scanner.h +++ b/src/generic/stage1/json_scanner.h @@ -4,10 +4,27 @@ namespace stage1 { /** * A block of scanned json, with information on operators and scalars. + * + * We seek to identify pseudo-structural characters. Anything that is inside + * a string must be omitted (hence & ~_string.string_tail()). + * Otherwise, pseudo-structural characters come in two forms. + * 1. We have the structural characters ([,],{,},:, comma). The + * term 'structural character' is from the JSON RFC. + * 2. We have the 'scalar pseudo-structural characters'. + * Scalars are quotes, and any character except structural characters and white space. + * + * To identify the scalar pseudo-structural characters, we must look at what comes + * before them: it must be a space, a quote or a structural character. + * Starting with simdjson v0.3, we identify them by + * negation: we identify everything that follows a non-quote scalar, + * and we negate that. Whatever remains must be a 'scalar pseudo-structural character'. */ struct json_block { public: - /** The start of structurals */ + /** + * The start of structurals. + * In simdjson prior to v0.3, these were called the pseudo-structural characters. + **/ really_inline uint64_t structural_start() { return potential_structural_start() & ~_string.string_tail(); } /** All JSON whitespace (i.e. 
not in a string) */ really_inline uint64_t whitespace() { return non_quote_outside_string(_characters.whitespace()); } @@ -21,27 +38,49 @@ public: // string and escape characters json_string_block _string; - // whitespace, operators, scalars + // whitespace, structural characters ('operators'), scalars json_character_block _characters; // whether the previous character was a scalar - uint64_t _follows_potential_scalar; + uint64_t _follows_potential_nonquote_scalar; private: // Potential structurals (i.e. disregarding strings) - /** operators plus scalar starts like 123, true and "abc" */ + /** + * structural elements ([,],{,},:, comma) plus scalar starts like 123, true and "abc". + * They may reside inside a string. + **/ really_inline uint64_t potential_structural_start() { return _characters.op() | potential_scalar_start(); } - /** the start of non-operator runs, like 123, true and "abc" */ - really_inline uint64_t potential_scalar_start() { return _characters.scalar() & ~follows_potential_scalar(); } - /** whether the given character is immediately after a non-operator like 123, true or " */ - really_inline uint64_t follows_potential_scalar() { return _follows_potential_scalar; } + /** + * The start of non-operator runs, like 123, true and "abc". + * It may reside inside a string. + **/ + really_inline uint64_t potential_scalar_start() { + // The term "scalar" refers to anything except structural characters and white space + // (so letters, numbers, quotes). + // Whenever it is preceded by something that is not a structural element ({,},[,],:, comma), a quote (") nor a white-space + // then we know that it is irrelevant structurally. + return _characters.scalar() & ~follows_potential_scalar(); + } + /** + * Whether the given character is immediately after a non-operator like 123, true. + * The characters following a quote are not included. 
+ */ really_inline uint64_t follows_potential_scalar() { + // _follows_potential_nonquote_scalar: is defined as marking any character that follows a character + // that is not a structural element ({,},[,],:, comma) nor a quote (") and that is not a + // white space. + // It is understood that within a quoted region, anything at all could be marked (irrelevant). + return _follows_potential_nonquote_scalar; + } }; /** - * Scans JSON for important bits: operators, strings, and scalars. + * Scans JSON for important bits: structural characters or 'operators', strings, and scalars. * * The scanner starts by calculating two distinct things: * - string characters (taking \" into account) - * - operators ([]{},:) and scalars (runs of non-operators like 123, true and "abc") + * - structural characters or 'operators' ([]{},:, comma) + * and scalars (runs of non-operators like 123, true and "abc") * * To minimize data dependency (a key component of the scanner's speed), it finds these in parallel: * in particular, the operator/scalar bit will find plenty of things that are actually part of @@ -56,7 +95,7 @@ public: private: // Whether the last character of the previous iteration is part of a scalar token - // (anything except whitespace or an operator). + // (anything except whitespace or a structural character/'operator'). uint64_t prev_scalar = 0ULL; json_string_scanner string_scanner{}; }; @@ -77,12 +116,24 @@ really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) { really_inline json_block json_scanner::next(const simd::simd8x64& in) { json_string_block strings = string_scanner.next(in); + // identifies the white-space and the structural characters json_character_block characters = json_character_block::classify(in); - uint64_t follows_scalar = follows(characters.scalar(), prev_scalar); + // The term "scalar" refers to anything except structural characters and white space + // (so letters, numbers, quotes). 
+ // We want follows_nonquote_scalar to mark anything that follows a non-quote scalar (so letters and numbers). + // + // A terminal quote should either be followed by a structural character (comma, brace, bracket, colon) + // or nothing. However, we still want ' "a string"true ' to mark the 't' of 'true' as a potential + // pseudo-structural character just like we would if we had ' "a string" true '; otherwise we + // may need to add an extra check when parsing strings. + // + // Performance: there are many ways to skin this cat. + const uint64_t nonquote_scalar = characters.scalar() & ~strings.quote(); + uint64_t follows_nonquote_scalar = follows(nonquote_scalar, prev_scalar); return { strings, characters, - follows_scalar + follows_nonquote_scalar }; } diff --git a/src/generic/stage1/json_string_scanner.h b/src/generic/stage1/json_string_scanner.h index dd9ac530..63ab9a5f 100644 --- a/src/generic/stage1/json_string_scanner.h +++ b/src/generic/stage1/json_string_scanner.h @@ -10,9 +10,9 @@ struct json_string_block { // Real (non-backslashed) quotes really_inline uint64_t quote() const { return _quote; } // Start quotes of strings - really_inline uint64_t string_end() const { return _quote & _in_string; } + really_inline uint64_t string_start() const { return _quote & _in_string; } // End quotes of strings - really_inline uint64_t string_start() const { return _quote & ~_in_string; } + really_inline uint64_t string_end() const { return _quote & ~_in_string; } // Only characters inside the string (not including the quotes) really_inline uint64_t string_content() const { return _in_string & ~_quote; } // Return a mask of whether the given characters are inside a string (only works on non-quotes) diff --git a/src/haswell/dom_parser_implementation.cpp b/src/haswell/dom_parser_implementation.cpp index 2b5af3bb..14fb5fb8 100644 --- a/src/haswell/dom_parser_implementation.cpp +++ b/src/haswell/dom_parser_implementation.cpp @@ -13,15 +13,19 @@ using namespace simd; struct 
json_character_block { static really_inline json_character_block classify(const simd::simd8x64& in); - + // ASCII white-space ('\r','\n','\t',' ') really_inline uint64_t whitespace() const { return _whitespace; } + // non-quote structural characters (comma, colon, braces, brackets) really_inline uint64_t op() const { return _op; } + // neither a structural character nor a white-space, so letters, numbers and quotes really_inline uint64_t scalar() { return ~(op() | whitespace()); } - uint64_t _whitespace; - uint64_t _op; + uint64_t _whitespace; // ASCII white-space ('\r','\n','\t',' ') + uint64_t _op; // structural characters (comma, colon, braces, brackets but not quotes) }; +// This identifies structural characters (comma, colon, braces, brackets), +// and ASCII white-space ('\r','\n','\t',' '). really_inline json_character_block json_character_block::classify(const simd::simd8x64& in) { // These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why // we can't use the generic lookup_16.