Store the string lengths on the string tape (#101)

* Store string length in the string-tape item.
* Files are now limited to 4GB.
* Moving detection of unescaped chars to stage 1 to reduce the burden due to string parsing.

Fixes https://github.com/lemire/simdjson/issues/114

Fixes https://github.com/lemire/simdjson/issues/87
This commit is contained in:
Daniel Lemire 2019-03-13 19:32:57 -04:00 committed by GitHub
parent 609e96b5d1
commit df8f792183
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 294 additions and 163 deletions

View File

@ -138,13 +138,13 @@ jsonstats: tools/jsonstats.cpp $(HEADERS) $(LIBFILES)
ujdecode.o: $(UJSON4C_INCLUDE)
$(CC) $(CFLAGS) -c dependencies/ujson4c/src/ujdecode.c
parseandstatcompetition: benchmark/parseandstatcompetition.cpp $(HEADERS) $(LIBFILES)
parseandstatcompetition: benchmark/parseandstatcompetition.cpp $(HEADERS) $(LIBFILES) $(LIBS)
$(CXX) $(CXXFLAGS) -o parseandstatcompetition $(LIBFILES) benchmark/parseandstatcompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE)
distinctuseridcompetition: benchmark/distinctuseridcompetition.cpp $(HEADERS) $(LIBFILES)
distinctuseridcompetition: benchmark/distinctuseridcompetition.cpp $(HEADERS) $(LIBFILES) $(LIBS)
$(CXX) $(CXXFLAGS) -o distinctuseridcompetition $(LIBFILES) benchmark/distinctuseridcompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE)
parsingcompetition: benchmark/parsingcompetition.cpp $(HEADERS) $(LIBFILES)
parsingcompetition: benchmark/parsingcompetition.cpp $(HEADERS) $(LIBFILES) $(LIBS)
$(CXX) $(CXXFLAGS) -o parsingcompetition $(LIBFILES) benchmark/parsingcompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE)
allparsingcompetition: benchmark/parsingcompetition.cpp $(HEADERS) $(LIBFILES) $(EXTRAOBJECTS) $(LIBS)

View File

@ -263,7 +263,7 @@ The parser builds a useful immutable (read-only) DOM (document-object model) whi
To simplify the engineering, we make some assumptions.
- We support UTF-8 (and thus ASCII), nothing else (no Latin, no UTF-16). We do not believe that this is a genuine limitation in the sense that we do not think that there is any serious application that needs to process JSON data without an ASCII or UTF-8 encoding.
- We store strings as NULL terminated C strings. Thus we implicitly assume that you do not include a NULL character within your string, which is allowed technically speaking if you escape it (\u0000).
- All strings in the JSON document may have up to 4294967295 bytes in UTF-8 (4GB). To enforce this constraint, we refuse to parse a document that contains more than 4294967295 bytes (4GB). This should accommodate most JSON documents.
- We assume AVX2 support which is available in all recent mainstream x86 processors produced by AMD and Intel. No support for non-x86 processors is included though it can be done. We plan to support ARM processors (help is invited).
- In cases of failure, we just report a failure without any indication as to the nature of the problem. (This can be easily improved without affecting performance.)
- As allowed by the specification, we allow repeated keys within an object (other parsers like sajson do the same).

View File

@ -120,7 +120,7 @@ int main(int argc, char *argv[]) {
if(!justdata) BEST_TIME("simdjson (dynamic mem) ", build_parsed_json(p).isValid(), true, ,
repeat, volume, !justdata);
// (static alloc)
BEST_TIME("simdjson ", json_parse(p, pj), true, , repeat,
BEST_TIME("simdjson ", json_parse(p, pj), simdjson::SUCCESS, , repeat,
volume, !justdata);

View File

@ -5,6 +5,9 @@
#include <cassert>
// we support documents up to 4GB
#define SIMDJSON_MAXSIZE_BYTES 0xFFFFFFFF
// the input buf should be readable up to buf + SIMDJSON_PADDING
#define SIMDJSON_PADDING sizeof(__m256i)

View File

@ -5,8 +5,9 @@
#include <iomanip>
#include <iostream>
// ends with zero char
static inline void print_with_escapes(const unsigned char *src) {
while (*src != 0u) {
while (*src) {
switch (*src) {
case '\b':
putchar('\\');
@ -47,8 +48,10 @@ static inline void print_with_escapes(const unsigned char *src) {
}
}
static inline void print_with_escapes(const unsigned char *src, std::ostream &os) {
while (*src != 0u) {
// ends with zero char
static inline void print_with_escapes(const unsigned char *src,
std::ostream &os) {
while (*src) {
switch (*src) {
case '\b':
os << '\\';
@ -81,7 +84,100 @@ static inline void print_with_escapes(const unsigned char *src, std::ostream &os
default:
if (*src <= 0x1F) {
std::ios::fmtflags f(os.flags());
os << std::hex << std::setw(4) << std::setfill('0') << static_cast<int>(*src);
os << std::hex << std::setw(4) << std::setfill('0')
<< static_cast<int>(*src);
os.flags(f);
} else {
os << *src;
}
}
src++;
}
}
// Writes exactly len bytes, starting at src, to standard output in escaped
// form. Zero bytes do not end the output: they are printed like any other
// control character.
//  - backspace, form feed, line feed, carriage return, tab, double quote and
//    backslash are written as a backslash followed by b, f, n, r, t, " or \
//  - any other byte <= 0x1F is written as \u followed by four hex digits
//  - every other byte is written unchanged
static inline void print_with_escapes(const unsigned char *src, size_t len) {
  for (size_t pos = 0; pos != len; ++pos) {
    const unsigned char byte = src[pos];
    // character that follows the backslash in a two-character escape;
    // stays 0 when the byte has no such short form
    char letter = '\0';
    switch (byte) {
    case '\b':
      letter = 'b';
      break;
    case '\f':
      letter = 'f';
      break;
    case '\n':
      letter = 'n';
      break;
    case '\r':
      letter = 'r';
      break;
    case '\"':
      letter = '"';
      break;
    case '\t':
      letter = 't';
      break;
    case '\\':
      letter = '\\';
      break;
    default:
      break;
    }
    if (letter != '\0') {
      putchar('\\');
      putchar(letter);
    } else if (byte <= 0x1F) {
      // remaining control characters have no short form
      printf("\\u%04x", byte);
    } else {
      putchar(byte);
    }
  }
}
// print len chars
static inline void print_with_escapes(const unsigned char *src,
std::ostream &os, size_t len) {
const unsigned char *finalsrc = src + len;
while (src < finalsrc) {
switch (*src) {
case '\b':
os << '\\';
os << 'b';
break;
case '\f':
os << '\\';
os << 'f';
break;
case '\n':
os << '\\';
os << 'n';
break;
case '\r':
os << '\\';
os << 'r';
break;
case '\"':
os << '\\';
os << '"';
break;
case '\t':
os << '\\';
os << 't';
break;
case '\\':
os << '\\';
os << '\\';
break;
default:
if (*src <= 0x1F) {
std::ios::fmtflags f(os.flags());
os << std::hex << std::setw(4) << std::setfill('0')
<< static_cast<int>(*src);
os.flags(f);
} else {
os << *src;
@ -95,4 +191,10 @@ static inline void print_with_escapes(const char *src, std::ostream &os) {
print_with_escapes(reinterpret_cast<const unsigned char *>(src), os);
}
// Convenience overload for plain char buffers: views src as unsigned bytes
// and forwards it, together with os and len, to the unsigned char overload.
static inline void print_with_escapes(const char *src, std::ostream &os,
                                      size_t len) {
  const auto *bytes = reinterpret_cast<const unsigned char *>(src);
  print_with_escapes(bytes, os, len);
}
#
#endif

View File

@ -20,7 +20,7 @@ WARN_UNUSED
int json_parse(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded = true);
// Parse a document found in buf, need to preallocate ParsedJson.
// Return false in case of a failure. You can also check validity
// Return SUCCESS (an integer = 1) in case of a success. You can also check validity
// by calling pj.isValid(). The same ParsedJson can be reused for other documents.
//
// If reallocifneeded is true (default) then a temporary buffer is created when needed during processing
@ -33,7 +33,7 @@ inline int json_parse(const char * buf, size_t len, ParsedJson &pj, bool realloc
}
// Parse a document found in buf, need to preallocate ParsedJson.
// Return false in case of a failure. You can also check validity
// Return SUCCESS (an integer = 1) in case of a success. You can also check validity
// by calling pj.isValid(). The same ParsedJson can be reused for other documents.
//
// If reallocifneeded is true (default) then a temporary buffer is created when needed during processing

View File

@ -125,8 +125,12 @@ public:
// get the string value at this node (NULL ended); valid only if we're at "
// note that tabs, and line endings are escaped in the returned value (see print_with_escapes)
// return value is valid UTF-8
// It may contain NULL chars within the string: get_string_length determines the true
// string length.
const char * get_string() const;
uint32_t get_string_length() const;
// get the double value at this node; valid only if
// we're at "d"
double get_double() const;
@ -149,6 +153,9 @@ public:
// if successful, we are left pointing at the value,
// if not, we are still pointing at the object ({)
// (in case of repeated keys, this only finds the first one)
// We seek the key using C's strcmp so if your JSON strings contain
// NULL chars, this would trigger a false positive: if you expect that
// to be the case, take extra precautions.
bool move_to_key(const char * key);
// throughout return true if we can do the navigation, false

View File

@ -79,67 +79,51 @@ really_inline bool parse_string(const uint8_t *buf, UNUSED size_t len,
pj.write_tape(0, '"');// don't bother with the string parsing at all
return true; // always succeeds
#else
pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"');
const uint8_t *src = &buf[offset + 1]; // we know that buf at offset is a "
uint8_t *dst = pj.current_string_buf_loc;
#ifdef JSON_TEST_STRINGS // for unit testing
uint8_t *const start_of_string = dst;
#endif
uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t);
const uint8_t *const start_of_string = dst;
while (1) {
__m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
auto bs_bits =
static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\'))));
auto quote_bits =
static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('"'))));
#define CHECKUNESCAPED
// All Unicode characters may be placed within the
// quotation marks, except for the characters that MUST be escaped:
// quotation mark, reverse solidus, and the control characters (U+0000
//through U+001F).
// https://tools.ietf.org/html/rfc8259
#ifdef CHECKUNESCAPED
__m256i unitsep = _mm256_set1_epi8(0x1F);
__m256i unescaped_vec = _mm256_cmpeq_epi8(_mm256_max_epu8(unitsep,v),unitsep);// could do it with saturated subtraction
#endif // CHECKUNESCAPED
uint32_t quote_dist = trailingzeroes(quote_bits);
uint32_t bs_dist = trailingzeroes(bs_bits);
// store to dest unconditionally - we can overwrite the bits we don't like
// later
_mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), v);
if (quote_dist < bs_dist) {
auto bs_bits =
static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\'))));
auto quote_mask = _mm256_cmpeq_epi8(v, _mm256_set1_epi8('"'));
auto quote_bits =
static_cast<uint32_t>(_mm256_movemask_epi8(quote_mask));
if(((bs_bits - 1) & quote_bits) != 0 ) {
// we encountered quotes first. Move dst to point to quotes and exit
dst[quote_dist] = 0; // null terminate and get out
pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"');
// find out where the quote is...
uint32_t quote_dist = trailingzeroes(quote_bits);
// NULL termination is still handy if you expect all your strings to be NULL terminated?
// It comes at a small cost
dst[quote_dist] = 0;
uint32_t str_length = (dst - start_of_string) + quote_dist;
memcpy(pj.current_string_buf_loc,&str_length, sizeof(uint32_t));
///////////////////////
// Above, check for overflow in case someone has a crazy string (>=4GB?)
// But only add the overflow check when the document itself exceeds 4GB
// Currently unneeded because we refuse to parse docs larger or equal to 4GB.
////////////////////////
// we advance the point, accounting for the fact that we have a NULl termination
pj.current_string_buf_loc = dst + quote_dist + 1;
pj.current_string_buf_loc = dst + quote_dist + 1; // the +1 is due to the 0 value
#ifdef CHECKUNESCAPED
// check that there is no unescaped char before the quote
auto unescaped_bits = static_cast<uint32_t>(_mm256_movemask_epi8(unescaped_vec));
bool is_ok = ((quote_bits - 1) & (~ quote_bits) & unescaped_bits) == 0;
#ifdef JSON_TEST_STRINGS // for unit testing
if(is_ok) foundString(buf + offset,start_of_string,pj.current_string_buf_loc - 1);
else foundBadString(buf + offset);
#endif // JSON_TEST_STRINGS
return is_ok;
#else //CHECKUNESCAPED
#ifdef JSON_TEST_STRINGS // for unit testing
foundString(buf + offset,start_of_string,pj.current_string_buf_loc - 1);
#endif // JSON_TEST_STRINGS
return true;
#endif //CHECKUNESCAPED
} if (quote_dist > bs_dist) {
uint8_t escape_char = src[bs_dist + 1];
#ifdef CHECKUNESCAPED
// we are going to need the unescaped_bits to check for unescaped chars
auto unescaped_bits = static_cast<uint32_t>(_mm256_movemask_epi8(unescaped_vec));
if(((bs_bits - 1) & (~ bs_bits) & unescaped_bits) != 0) {
#ifdef JSON_TEST_STRINGS // for unit testing
foundBadString(buf + offset);
#endif // JSON_TEST_STRINGS
return false;
}
#endif //CHECKUNESCAPED
if(((quote_bits - 1) & bs_bits ) != 0 ) {
// find out where the backslash is
uint32_t bs_dist = trailingzeroes(bs_bits);
uint8_t escape_char = src[bs_dist + 1];
// we encountered backslash first. Handle backslash
if (escape_char == 'u') {
// move src/dst up to the start; they will be further adjusted
@ -173,15 +157,6 @@ really_inline bool parse_string(const uint8_t *buf, UNUSED size_t len,
// neither.
src += 32;
dst += 32;
#ifdef CHECKUNESCAPED
// check for unescaped chars
if(_mm256_testz_si256(unescaped_vec,unescaped_vec) != 1) {
#ifdef JSON_TEST_STRINGS // for unit testing
foundBadString(buf + offset);
#endif // JSON_TEST_STRINGS
return false;
}
#endif // CHECKUNESCAPED
}
}
// can't be reached

View File

@ -38,8 +38,12 @@ bool ParsedJson::allocateCapacity(size_t len, size_t maxdepth) {
std::cerr << "capacities must be non-zero " << std::endl;
return false;
}
if ((len <= bytecapacity) && (depthcapacity < maxdepth))
if(len > SIMDJSON_MAXSIZE_BYTES) {
return false;
}
if ((len <= bytecapacity) && (depthcapacity < maxdepth)) {
return true;
}
deallocate();
isvalid = false;
bytecapacity = 0; // will only set it to len after allocations are a success
@ -47,7 +51,9 @@ bool ParsedJson::allocateCapacity(size_t len, size_t maxdepth) {
uint32_t max_structures = ROUNDUP_N(len, 64) + 2 + 7;
structural_indexes = new (std::nothrow) uint32_t[max_structures];
size_t localtapecapacity = ROUNDUP_N(len, 64);
size_t localstringcapacity = ROUNDUP_N(len + 32, 64);
// a document with only zero-length strings... could have len/3 strings
// and we would need len/3 * 5 bytes on the string buffer
// (a 4-byte length header plus the null terminator for each string)
size_t localstringcapacity = ROUNDUP_N(5 * len / 3 + 32, 64);
string_buf = new (std::nothrow) uint8_t[localstringcapacity];
tape = new (std::nothrow) uint64_t[localtapecapacity];
containing_scope_offset = new (std::nothrow) uint32_t[maxdepth];
@ -103,6 +109,7 @@ bool ParsedJson::printjson(std::ostream &os) {
if(!isvalid) {
return false;
}
uint32_t string_length;
size_t tapeidx = 0;
uint64_t tape_val = tape[tapeidx];
uint8_t type = (tape_val >> 56);
@ -146,7 +153,8 @@ bool ParsedJson::printjson(std::ostream &os) {
switch (type) {
case '"': // we have a string
os << '"';
print_with_escapes((const unsigned char *)(string_buf + payload));
memcpy(&string_length,string_buf + payload, sizeof(uint32_t));
print_with_escapes((const unsigned char *)(string_buf + payload + sizeof(uint32_t)), string_length);
os << '"';
break;
case 'l': // we have a long int
@ -215,8 +223,10 @@ bool ParsedJson::printjson(std::ostream &os) {
WARN_UNUSED
bool ParsedJson::dump_raw_tape(std::ostream &os) {
if(!isvalid) { return false;
if(!isvalid) {
return false;
}
uint32_t string_length;
size_t tapeidx = 0;
uint64_t tape_val = tape[tapeidx];
uint8_t type = (tape_val >> 56);
@ -239,7 +249,8 @@ bool ParsedJson::dump_raw_tape(std::ostream &os) {
switch (type) {
case '"': // we have a string
os << "string \"";
print_with_escapes((const unsigned char *)(string_buf + payload));
memcpy(&string_length,string_buf + payload, sizeof(uint32_t));
print_with_escapes((const unsigned char *)(string_buf + payload + sizeof(uint32_t)), string_length);
os << '"';
os << '\n';
break;

View File

@ -1,5 +1,6 @@
#include "simdjson/parsedjson.h"
#include "simdjson/common_defs.h"
#include <iterator>
ParsedJson::iterator::iterator(ParsedJson &pj_) : pj(pj_), depth(0), location(0), tape_length(0), depthindex(nullptr) {
if(pj.isValid()) {
@ -106,13 +107,15 @@ uint8_t ParsedJson::iterator::get_type() const {
int64_t ParsedJson::iterator::get_integer() const {
if(location + 1 >= tape_length) { return 0;// default value in case of error
if(location + 1 >= tape_length) {
return 0;// default value in case of error
}
return static_cast<int64_t>(pj.tape[location + 1]);
}
double ParsedJson::iterator::get_double() const {
if(location + 1 >= tape_length) { return NAN;// default value in case of error
if(location + 1 >= tape_length) {
return NAN;// default value in case of error
}
double answer;
memcpy(&answer, & pj.tape[location + 1], sizeof(answer));
@ -120,10 +123,16 @@ double ParsedJson::iterator::get_double() const {
}
const char * ParsedJson::iterator::get_string() const {
return reinterpret_cast<const char *>(pj.string_buf + (current_val & JSONVALUEMASK)) ;
return reinterpret_cast<const char *>(pj.string_buf + (current_val & JSONVALUEMASK) + sizeof(uint32_t)) ;
}
// Returns the 32-bit length header found in the string buffer at the offset
// given by the payload bits of the current tape value (current_val masked
// with JSONVALUEMASK).
uint32_t ParsedJson::iterator::get_string_length() const {
  const auto *header = pj.string_buf + (current_val & JSONVALUEMASK);
  uint32_t length;
  // copy the bytes out rather than dereferencing a uint32_t pointer, so no
  // alignment of the header is required
  memcpy(&length, header, sizeof(length));
  return length;
}
bool ParsedJson::iterator::is_object_or_array() const {
return is_object_or_array(get_type());
}
@ -156,9 +165,10 @@ bool ParsedJson::iterator::move_to_key(const char * key) {
if(down()) {
do {
assert(is_string());
bool rightkey = (strcmp(get_string(),key)==0);
bool rightkey = (strcmp(get_string(),key)==0);// null chars would fool this
next();
if(rightkey) { return true;
if(rightkey) {
return true;
}
} while(next());
assert(up());// not found
@ -260,15 +270,17 @@ void ParsedJson::iterator::to_start_scope() {
}
bool ParsedJson::iterator::print(std::ostream &os, bool escape_strings) const {
if(!isOk()) { return false;
if(!isOk()) {
return false;
}
switch (current_type) {
case '"': // we have a string
os << '"';
if(escape_strings) {
print_with_escapes(get_string(), os);
print_with_escapes(get_string(), os, get_string_length());
} else {
os << get_string();
// was: os << get_string();, but given that we can include null chars, we have to do something crazier:
std::copy(get_string(), get_string() + get_string_length(), std::ostream_iterator<char>(os));
}
os << '"';
break;

View File

@ -45,6 +45,16 @@ really_inline uint64_t cmp_mask_against_input(__m256i input_lo,
return res_0 | (res_1 << 32);
}
// Find all bytes less than or equal to the corresponding byte of maxval
// (using unsigned arithmetic). Bit i of the result is set when byte i of the
// 64-byte input satisfies the comparison: input_lo supplies bits 0-31 and
// input_hi supplies bits 32-63.
really_inline uint64_t unsigned_lteq_against_input(__m256i input_lo,
                                                   __m256i input_hi,
                                                   __m256i maxval) {
  // max(maxval, x) == maxval holds exactly when x <= maxval (unsigned)
  const __m256i lo_is_lteq =
      _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, input_lo), maxval);
  const __m256i hi_is_lteq =
      _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, input_hi), maxval);
  // movemask returns an int; go through uint32_t so that no sign bits leak
  // into the upper half of the 64-bit value
  const uint64_t lo_bits =
      static_cast<uint32_t>(_mm256_movemask_epi8(lo_is_lteq));
  const uint64_t hi_bits =
      static_cast<uint32_t>(_mm256_movemask_epi8(hi_is_lteq));
  return (hi_bits << 32) | lo_bits;
}
// return a bitvector indicating where we have characters that end an odd-length
// sequence of backslashes (and thus change the behavior of the next character
// to follow). An even-length sequence of backslashes, and, for that matter, the
@ -103,13 +113,21 @@ find_odd_backslash_sequences(__m256i input_lo, __m256i input_hi,
// backslash sequences (of any length) will be detected elsewhere.
really_inline uint64_t find_quote_mask_and_bits(
__m256i input_lo, __m256i input_hi, uint64_t odd_ends,
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits) {
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits, uint64_t &error_mask) {
quote_bits =
cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('"'));
quote_bits = quote_bits & ~odd_ends;
// remove from the valid quoted region the unescaped characters.
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
quote_mask ^= prev_iter_inside_quote;
// All Unicode characters may be placed within the
// quotation marks, except for the characters that MUST be escaped:
// quotation mark, reverse solidus, and the control characters (U+0000
//through U+001F).
// https://tools.ietf.org/html/rfc8259
uint64_t unescaped = unsigned_lteq_against_input(input_lo, input_hi, _mm256_set1_epi8(0x1F));
error_mask |= quote_mask & unescaped;
// right shift of a signed value expected to be well-defined and standard
// compliant as of C++20,
// John Regehr from Utah U. says this is fine code
@ -212,11 +230,9 @@ really_inline uint64_t finalize_structurals(
uint64_t quote_bits, uint64_t &prev_iter_ends_pseudo_pred) {
// mask off anything inside quotes
structurals &= ~quote_mask;
// add the real quote bits back into our bitmask as well, so we can
// quickly traverse the strings we've spent all this trouble gathering
structurals |= quote_bits;
// Now, establish "pseudo-structural characters". These are non-whitespace
// characters that are (a) outside quotes and (b) have a predecessor that's
// either whitespace or a structural character. This means that subsequent
@ -228,6 +244,7 @@ really_inline uint64_t finalize_structurals(
// a qualified predecessor is something that can happen 1 position before a
// pseudo-structural character
uint64_t pseudo_pred = structurals | whitespace;
uint64_t shifted_pseudo_pred =
(pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
@ -285,6 +302,7 @@ WARN_UNUSED
size_t lenminus64 = len < 64 ? 0 : len - 64;
size_t idx = 0;
uint64_t error_mask = 0; // for unescaped characters within strings (ASCII code points < 0x20)
for (; idx < lenminus64; idx += 64) {
#ifndef _MSC_VER
@ -307,7 +325,7 @@ WARN_UNUSED
// themselves
uint64_t quote_bits;
uint64_t quote_mask = find_quote_mask_and_bits(
input_lo, input_hi, odd_ends, prev_iter_inside_quote, quote_bits);
input_lo, input_hi, odd_ends, prev_iter_inside_quote, quote_bits, error_mask);
// take the previous iterations structural bits, not our current iteration,
// and flatten
@ -348,7 +366,7 @@ WARN_UNUSED
// themselves
uint64_t quote_bits;
uint64_t quote_mask = find_quote_mask_and_bits(
input_lo, input_hi, odd_ends, prev_iter_inside_quote, quote_bits);
input_lo, input_hi, odd_ends, prev_iter_inside_quote, quote_bits, error_mask);
// take the previous iterations structural bits, not our current iteration,
// and flatten
@ -383,7 +401,9 @@ WARN_UNUSED
}
// make it safe to dereference one beyond this array
base_ptr[pj.n_structural_indexes] = 0;
if (error_mask) {
return false;
}
#ifdef SIMDJSON_UTF8VALIDATE
return _mm256_testz_si256(has_error, has_error) != 0;
#else

View File

@ -103,6 +103,7 @@ Hint: we can read the first tape element to determine the length of the tape.
## Strings
We prefix the string data itself by a 32-bit header to be interpreted as a 32-bit integer. It indicates the length of the string. The actual string data starts at an offset of 4 bytes.
We store string values using UTF-8 encoding with null termination on a separate tape. A string value is represented on the main tape as the 64-bit tape element `('"'<< 56) + x` where the payload `x` is the location on the string tape of the null-terminated string.