Documenting and fixing the case where a string is immediately followed by a scalar (#1106)
* Documenting and fixing. * More cleaning. * Being a bit cleaner.
This commit is contained in:
parent
bee4d7a12b
commit
17f6d5208f
|
@ -0,0 +1 @@
|
|||
""n
|
|
@ -4,10 +4,27 @@ namespace stage1 {
|
|||
|
||||
/**
|
||||
* A block of scanned json, with information on operators and scalars.
|
||||
*
|
||||
* We seek to identify pseudo-structural characters. Anything that is inside
|
||||
* a string must be omitted (hence & ~_string.string_tail()).
|
||||
* Otherwise, pseudo-structural characters come in two forms.
|
||||
* 1. We have the structural characters ([,],{,},:, comma). The
|
||||
* term 'structural character' is from the JSON RFC.
|
||||
* 2. We have the 'scalar pseudo-structural characters'.
|
||||
* Scalars are quotes, and any character except structural characters and white space.
|
||||
*
|
||||
* To identify the scalar pseudo-structural characters, we must look at what comes
|
||||
* before them: it must be a space, a quote or a structural character.
|
||||
* Starting with simdjson v0.3, we identify them by
|
||||
* negation: we identify everything that is followed by a non-quote scalar,
|
||||
* and we negate that. Whatever remains must be a 'scalar pseudo-structural character'.
|
||||
*/
|
||||
struct json_block {
|
||||
public:
|
||||
/** The start of structurals */
|
||||
/**
|
||||
* The start of structurals.
|
||||
* In simdjson prior to v0.3, these were called the pseudo-structural characters.
|
||||
**/
|
||||
really_inline uint64_t structural_start() { return potential_structural_start() & ~_string.string_tail(); }
|
||||
/** All JSON whitespace (i.e. not in a string) */
|
||||
really_inline uint64_t whitespace() { return non_quote_outside_string(_characters.whitespace()); }
|
||||
|
@ -21,27 +38,49 @@ public:
|
|||
|
||||
// string and escape characters
|
||||
json_string_block _string;
|
||||
// whitespace, operators, scalars
|
||||
// whitespace, structural characters ('operators'), scalars
|
||||
json_character_block _characters;
|
||||
// whether the previous character was a scalar
|
||||
uint64_t _follows_potential_scalar;
|
||||
uint64_t _follows_potential_nonquote_scalar;
|
||||
private:
|
||||
// Potential structurals (i.e. disregarding strings)
|
||||
|
||||
/** operators plus scalar starts like 123, true and "abc" */
|
||||
/**
|
||||
* structural elements ([,],{,},:, comma) plus scalar starts like 123, true and "abc".
|
||||
* They may reside inside a string.
|
||||
**/
|
||||
really_inline uint64_t potential_structural_start() { return _characters.op() | potential_scalar_start(); }
|
||||
/** the start of non-operator runs, like 123, true and "abc" */
|
||||
really_inline uint64_t potential_scalar_start() { return _characters.scalar() & ~follows_potential_scalar(); }
|
||||
/** whether the given character is immediately after a non-operator like 123, true or " */
|
||||
really_inline uint64_t follows_potential_scalar() { return _follows_potential_scalar; }
|
||||
/**
|
||||
* The start of non-operator runs, like 123, true and "abc".
|
||||
* It may reside inside a string.
|
||||
**/
|
||||
really_inline uint64_t potential_scalar_start() {
|
||||
// The term "scalar" refers to anything except structural characters and white space
|
||||
// (so letters, numbers, quotes).
|
||||
// Whenever it is preceded by something that is not a structural element ({,},[,],:, comma), a quote (") nor a white-space
|
||||
// then we know that it is irrelevant structurally.
|
||||
return _characters.scalar() & ~follows_potential_scalar();
|
||||
}
|
||||
/**
|
||||
* Whether the given character is immediately after a non-operator like 123, true.
|
||||
* The characters following a quote are not included.
|
||||
*/
|
||||
really_inline uint64_t follows_potential_scalar() {
|
||||
// _follows_potential_nonquote_scalar is defined as marking any character that follows a character
|
||||
// that is not a structural element ({,},[,],:, comma) nor a quote (") and that is not a
|
||||
// white space.
|
||||
// It is understood that within a quoted region, anything at all could be marked (irrelevant).
|
||||
return _follows_potential_nonquote_scalar;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Scans JSON for important bits: operators, strings, and scalars.
|
||||
* Scans JSON for important bits: structural characters or 'operators', strings, and scalars.
|
||||
*
|
||||
* The scanner starts by calculating two distinct things:
|
||||
* - string characters (taking \" into account)
|
||||
* - operators ([]{},:) and scalars (runs of non-operators like 123, true and "abc")
|
||||
* - structural characters or 'operators' ([]{},:, comma)
|
||||
* and scalars (runs of non-operators like 123, true and "abc")
|
||||
*
|
||||
* To minimize data dependency (a key component of the scanner's speed), it finds these in parallel:
|
||||
* in particular, the operator/scalar bit will find plenty of things that are actually part of
|
||||
|
@ -56,7 +95,7 @@ public:
|
|||
|
||||
private:
|
||||
// Whether the last character of the previous iteration is part of a scalar token
|
||||
// (anything except whitespace or an operator).
|
||||
// (anything except whitespace or a structural character/'operator').
|
||||
uint64_t prev_scalar = 0ULL;
|
||||
json_string_scanner string_scanner{};
|
||||
};
|
||||
|
@ -77,12 +116,24 @@ really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
|
|||
|
||||
really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t>& in) {
|
||||
json_string_block strings = string_scanner.next(in);
|
||||
// identifies the white-space and the structural characters
|
||||
json_character_block characters = json_character_block::classify(in);
|
||||
uint64_t follows_scalar = follows(characters.scalar(), prev_scalar);
|
||||
// The term "scalar" refers to anything except structural characters and white space
|
||||
// (so letters, numbers, quotes).
|
||||
// We want follows_scalar to mark anything that follows a non-quote scalar (so letters and numbers).
|
||||
//
|
||||
// A terminal quote should either be followed by a structural character (comma, brace, bracket, colon)
|
||||
// or nothing. However, we still want ' "a string"true ' to mark the 't' of 'true' as a potential
|
||||
// pseudo-structural character just like we would if we had ' "a string" true '; otherwise we
|
||||
// may need to add an extra check when parsing strings.
|
||||
//
|
||||
// Performance: there are many ways to skin this cat.
|
||||
const uint64_t nonquote_scalar = characters.scalar() & ~strings.quote();
|
||||
uint64_t follows_nonquote_scalar = follows(nonquote_scalar, prev_scalar);
|
||||
return {
|
||||
strings,
|
||||
characters,
|
||||
follows_scalar
|
||||
follows_nonquote_scalar
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
@ -10,9 +10,9 @@ struct json_string_block {
|
|||
// Real (non-backslashed) quotes
|
||||
really_inline uint64_t quote() const { return _quote; }
|
||||
// Start quotes of strings
|
||||
really_inline uint64_t string_end() const { return _quote & _in_string; }
|
||||
really_inline uint64_t string_start() const { return _quote & _in_string; }
|
||||
// End quotes of strings
|
||||
really_inline uint64_t string_start() const { return _quote & ~_in_string; }
|
||||
really_inline uint64_t string_end() const { return _quote & ~_in_string; }
|
||||
// Only characters inside the string (not including the quotes)
|
||||
really_inline uint64_t string_content() const { return _in_string & ~_quote; }
|
||||
// Return a mask of whether the given characters are inside a string (only works on non-quotes)
|
||||
|
|
|
@ -13,15 +13,19 @@ using namespace simd;
|
|||
|
||||
struct json_character_block {
|
||||
static really_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
|
||||
|
||||
// ASCII white-space ('\r','\n','\t',' ')
|
||||
really_inline uint64_t whitespace() const { return _whitespace; }
|
||||
// non-quote structural characters (comma, colon, braces, brackets)
|
||||
really_inline uint64_t op() const { return _op; }
|
||||
// neither a structural character nor a white-space, so letters, numbers and quotes
|
||||
really_inline uint64_t scalar() { return ~(op() | whitespace()); }
|
||||
|
||||
uint64_t _whitespace;
|
||||
uint64_t _op;
|
||||
uint64_t _whitespace; // ASCII white-space ('\r','\n','\t',' ')
|
||||
uint64_t _op; // structural characters (comma, colon, braces, brackets but not quotes)
|
||||
};
|
||||
|
||||
// This identifies structural characters (comma, colon, braces, brackets),
|
||||
// and ASCII white-space ('\r','\n','\t',' ').
|
||||
really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
|
||||
// These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why
|
||||
// we can't use the generic lookup_16.
|
||||
|
|
Loading…
Reference in New Issue