New version (mostly setting the singleheader version in sync).
This commit is contained in:
parent
d5a185b13e
commit
bf9b1b1457
|
@ -11,8 +11,8 @@ project(simdjson)
|
||||||
set(SIMDJSON_LIB_NAME simdjson)
|
set(SIMDJSON_LIB_NAME simdjson)
|
||||||
set(PROJECT_VERSION_MAJOR 0)
|
set(PROJECT_VERSION_MAJOR 0)
|
||||||
set(PROJECT_VERSION_MINOR 1)
|
set(PROJECT_VERSION_MINOR 1)
|
||||||
set(PROJECT_VERSION_PATCH 0)
|
set(PROJECT_VERSION_PATCH 1)
|
||||||
set(SIMDJSON_LIB_VERSION "0.1.0" CACHE STRING "simdjson library version")
|
set(SIMDJSON_LIB_VERSION "0.1.1" CACHE STRING "simdjson library version")
|
||||||
set(SIMDJSON_LIB_SOVERSION "0" CACHE STRING "simdjson library soversion")
|
set(SIMDJSON_LIB_SOVERSION "0" CACHE STRING "simdjson library soversion")
|
||||||
|
|
||||||
if(NOT MSVC)
|
if(NOT MSVC)
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
// /include/simdjson/simdjson_version.h automatically generated by release.py, do not change by hand
|
// /include/simdjson/simdjson_version.h automatically generated by release.py, do not change by hand
|
||||||
#ifndef SIMDJSON_INCLUDE_SIMDJSON_VERSION
|
#ifndef SIMDJSON_INCLUDE_SIMDJSON_VERSION
|
||||||
#define SIMDJSON_INCLUDE_SIMDJSON_VERSION
|
#define SIMDJSON_INCLUDE_SIMDJSON_VERSION
|
||||||
#define SIMDJSON_VERSION 0.1.0
|
#define SIMDJSON_VERSION 0.1.1
|
||||||
enum {
|
enum {
|
||||||
SIMDJSON_VERSION_MAJOR = 0,
|
SIMDJSON_VERSION_MAJOR = 0,
|
||||||
SIMDJSON_VERSION_MINOR = 1,
|
SIMDJSON_VERSION_MINOR = 1,
|
||||||
SIMDJSON_VERSION_REVISION = 0
|
SIMDJSON_VERSION_REVISION = 1
|
||||||
};
|
};
|
||||||
#endif // SIMDJSON_INCLUDE_SIMDJSON_VERSION
|
#endif // SIMDJSON_INCLUDE_SIMDJSON_VERSION
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
/* auto-generated on Wed 13 Mar 2019 20:02:04 EDT. Do not edit! */
|
/* auto-generated on Wed 13 Mar 2019 21:02:37 EDT. Do not edit! */
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include "simdjson.h"
|
#include "simdjson.h"
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
/* auto-generated on Wed 13 Mar 2019 20:02:04 EDT. Do not edit! */
|
/* auto-generated on Wed 13 Mar 2019 21:02:37 EDT. Do not edit! */
|
||||||
#include "simdjson.h"
|
#include "simdjson.h"
|
||||||
|
|
||||||
/* used for http://dmalloc.com/ Dmalloc - Debug Malloc Library */
|
/* used for http://dmalloc.com/ Dmalloc - Debug Malloc Library */
|
||||||
|
@ -391,6 +391,16 @@ really_inline uint64_t cmp_mask_against_input(__m256i input_lo,
|
||||||
return res_0 | (res_1 << 32);
|
return res_0 | (res_1 << 32);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// find all values less than or equal to the content of maxval (using unsigned arithmetic)
|
||||||
|
really_inline uint64_t unsigned_lteq_against_input(__m256i input_lo,
|
||||||
|
__m256i input_hi, __m256i maxval) {
|
||||||
|
__m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval,input_lo),maxval);
|
||||||
|
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
|
||||||
|
__m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval,input_hi),maxval);
|
||||||
|
uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
|
||||||
|
return res_0 | (res_1 << 32);
|
||||||
|
}
|
||||||
|
|
||||||
// return a bitvector indicating where we have characters that end an odd-length
|
// return a bitvector indicating where we have characters that end an odd-length
|
||||||
// sequence of backslashes (and thus change the behavior of the next character
|
// sequence of backslashes (and thus change the behavior of the next character
|
||||||
// to follow). An even-length sequence of backslashes, and, for that matter, the
|
// to follow). An even-length sequence of backslashes, and, for that matter, the
|
||||||
|
@ -449,13 +459,21 @@ find_odd_backslash_sequences(__m256i input_lo, __m256i input_hi,
|
||||||
// backslash sequences (of any length) will be detected elsewhere.
|
// backslash sequences (of any length) will be detected elsewhere.
|
||||||
really_inline uint64_t find_quote_mask_and_bits(
|
really_inline uint64_t find_quote_mask_and_bits(
|
||||||
__m256i input_lo, __m256i input_hi, uint64_t odd_ends,
|
__m256i input_lo, __m256i input_hi, uint64_t odd_ends,
|
||||||
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits) {
|
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits, uint64_t &error_mask) {
|
||||||
quote_bits =
|
quote_bits =
|
||||||
cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('"'));
|
cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('"'));
|
||||||
quote_bits = quote_bits & ~odd_ends;
|
quote_bits = quote_bits & ~odd_ends;
|
||||||
|
// remove from the valid quoted region the unescaped characters.
|
||||||
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
|
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
|
||||||
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
|
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
|
||||||
quote_mask ^= prev_iter_inside_quote;
|
quote_mask ^= prev_iter_inside_quote;
|
||||||
|
// All Unicode characters may be placed within the
|
||||||
|
// quotation marks, except for the characters that MUST be escaped:
|
||||||
|
// quotation mark, reverse solidus, and the control characters (U+0000
|
||||||
|
//through U+001F).
|
||||||
|
// https://tools.ietf.org/html/rfc8259
|
||||||
|
uint64_t unescaped = unsigned_lteq_against_input(input_lo, input_hi, _mm256_set1_epi8(0x1F));
|
||||||
|
error_mask |= quote_mask & unescaped;
|
||||||
// right shift of a signed value expected to be well-defined and standard
|
// right shift of a signed value expected to be well-defined and standard
|
||||||
// compliant as of C++20,
|
// compliant as of C++20,
|
||||||
// John Regehr from Utah U. says this is fine code
|
// John Regehr from Utah U. says this is fine code
|
||||||
|
@ -558,11 +576,9 @@ really_inline uint64_t finalize_structurals(
|
||||||
uint64_t quote_bits, uint64_t &prev_iter_ends_pseudo_pred) {
|
uint64_t quote_bits, uint64_t &prev_iter_ends_pseudo_pred) {
|
||||||
// mask off anything inside quotes
|
// mask off anything inside quotes
|
||||||
structurals &= ~quote_mask;
|
structurals &= ~quote_mask;
|
||||||
|
|
||||||
// add the real quote bits back into our bitmask as well, so we can
|
// add the real quote bits back into our bitmask as well, so we can
|
||||||
// quickly traverse the strings we've spent all this trouble gathering
|
// quickly traverse the strings we've spent all this trouble gathering
|
||||||
structurals |= quote_bits;
|
structurals |= quote_bits;
|
||||||
|
|
||||||
// Now, establish "pseudo-structural characters". These are non-whitespace
|
// Now, establish "pseudo-structural characters". These are non-whitespace
|
||||||
// characters that are (a) outside quotes and (b) have a predecessor that's
|
// characters that are (a) outside quotes and (b) have a predecessor that's
|
||||||
// either whitespace or a structural character. This means that subsequent
|
// either whitespace or a structural character. This means that subsequent
|
||||||
|
@ -574,6 +590,7 @@ really_inline uint64_t finalize_structurals(
|
||||||
// a qualified predecessor is something that can happen 1 position before a
|
// a qualified predecessor is something that can happen 1 position before a
|
||||||
// pseudo-structural character
|
// pseudo-structural character
|
||||||
uint64_t pseudo_pred = structurals | whitespace;
|
uint64_t pseudo_pred = structurals | whitespace;
|
||||||
|
|
||||||
uint64_t shifted_pseudo_pred =
|
uint64_t shifted_pseudo_pred =
|
||||||
(pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
|
(pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
|
||||||
prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
|
prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
|
||||||
|
@ -631,6 +648,7 @@ WARN_UNUSED
|
||||||
|
|
||||||
size_t lenminus64 = len < 64 ? 0 : len - 64;
|
size_t lenminus64 = len < 64 ? 0 : len - 64;
|
||||||
size_t idx = 0;
|
size_t idx = 0;
|
||||||
|
uint64_t error_mask = 0; // for unescaped characters within strings (ASCII code points < 0x20)
|
||||||
|
|
||||||
for (; idx < lenminus64; idx += 64) {
|
for (; idx < lenminus64; idx += 64) {
|
||||||
#ifndef _MSC_VER
|
#ifndef _MSC_VER
|
||||||
|
@ -653,7 +671,7 @@ WARN_UNUSED
|
||||||
// themselves
|
// themselves
|
||||||
uint64_t quote_bits;
|
uint64_t quote_bits;
|
||||||
uint64_t quote_mask = find_quote_mask_and_bits(
|
uint64_t quote_mask = find_quote_mask_and_bits(
|
||||||
input_lo, input_hi, odd_ends, prev_iter_inside_quote, quote_bits);
|
input_lo, input_hi, odd_ends, prev_iter_inside_quote, quote_bits, error_mask);
|
||||||
|
|
||||||
// take the previous iterations structural bits, not our current iteration,
|
// take the previous iterations structural bits, not our current iteration,
|
||||||
// and flatten
|
// and flatten
|
||||||
|
@ -694,7 +712,7 @@ WARN_UNUSED
|
||||||
// themselves
|
// themselves
|
||||||
uint64_t quote_bits;
|
uint64_t quote_bits;
|
||||||
uint64_t quote_mask = find_quote_mask_and_bits(
|
uint64_t quote_mask = find_quote_mask_and_bits(
|
||||||
input_lo, input_hi, odd_ends, prev_iter_inside_quote, quote_bits);
|
input_lo, input_hi, odd_ends, prev_iter_inside_quote, quote_bits, error_mask);
|
||||||
|
|
||||||
// take the previous iterations structural bits, not our current iteration,
|
// take the previous iterations structural bits, not our current iteration,
|
||||||
// and flatten
|
// and flatten
|
||||||
|
@ -729,7 +747,9 @@ WARN_UNUSED
|
||||||
}
|
}
|
||||||
// make it safe to dereference one beyond this array
|
// make it safe to dereference one beyond this array
|
||||||
base_ptr[pj.n_structural_indexes] = 0;
|
base_ptr[pj.n_structural_indexes] = 0;
|
||||||
|
if (error_mask) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
#ifdef SIMDJSON_UTF8VALIDATE
|
#ifdef SIMDJSON_UTF8VALIDATE
|
||||||
return _mm256_testz_si256(has_error, has_error) != 0;
|
return _mm256_testz_si256(has_error, has_error) != 0;
|
||||||
#else
|
#else
|
||||||
|
@ -1297,8 +1317,12 @@ bool ParsedJson::allocateCapacity(size_t len, size_t maxdepth) {
|
||||||
std::cerr << "capacities must be non-zero " << std::endl;
|
std::cerr << "capacities must be non-zero " << std::endl;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if ((len <= bytecapacity) && (depthcapacity < maxdepth))
|
if(len > SIMDJSON_MAXSIZE_BYTES) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if ((len <= bytecapacity) && (depthcapacity < maxdepth)) {
|
||||||
return true;
|
return true;
|
||||||
|
}
|
||||||
deallocate();
|
deallocate();
|
||||||
isvalid = false;
|
isvalid = false;
|
||||||
bytecapacity = 0; // will only set it to len after allocations are a success
|
bytecapacity = 0; // will only set it to len after allocations are a success
|
||||||
|
@ -1306,7 +1330,9 @@ bool ParsedJson::allocateCapacity(size_t len, size_t maxdepth) {
|
||||||
uint32_t max_structures = ROUNDUP_N(len, 64) + 2 + 7;
|
uint32_t max_structures = ROUNDUP_N(len, 64) + 2 + 7;
|
||||||
structural_indexes = new (std::nothrow) uint32_t[max_structures];
|
structural_indexes = new (std::nothrow) uint32_t[max_structures];
|
||||||
size_t localtapecapacity = ROUNDUP_N(len, 64);
|
size_t localtapecapacity = ROUNDUP_N(len, 64);
|
||||||
size_t localstringcapacity = ROUNDUP_N(len + 32, 64);
|
// a document with only zero-length strings... could have len/3 strings
|
||||||
|
// and we would need len/3 * 5 bytes on the string buffer
|
||||||
|
size_t localstringcapacity = ROUNDUP_N(5 * len / 3 + 32, 64);
|
||||||
string_buf = new (std::nothrow) uint8_t[localstringcapacity];
|
string_buf = new (std::nothrow) uint8_t[localstringcapacity];
|
||||||
tape = new (std::nothrow) uint64_t[localtapecapacity];
|
tape = new (std::nothrow) uint64_t[localtapecapacity];
|
||||||
containing_scope_offset = new (std::nothrow) uint32_t[maxdepth];
|
containing_scope_offset = new (std::nothrow) uint32_t[maxdepth];
|
||||||
|
@ -1362,6 +1388,7 @@ bool ParsedJson::printjson(std::ostream &os) {
|
||||||
if(!isvalid) {
|
if(!isvalid) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
uint32_t string_length;
|
||||||
size_t tapeidx = 0;
|
size_t tapeidx = 0;
|
||||||
uint64_t tape_val = tape[tapeidx];
|
uint64_t tape_val = tape[tapeidx];
|
||||||
uint8_t type = (tape_val >> 56);
|
uint8_t type = (tape_val >> 56);
|
||||||
|
@ -1405,7 +1432,8 @@ bool ParsedJson::printjson(std::ostream &os) {
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case '"': // we have a string
|
case '"': // we have a string
|
||||||
os << '"';
|
os << '"';
|
||||||
print_with_escapes((const unsigned char *)(string_buf + payload));
|
memcpy(&string_length,string_buf + payload, sizeof(uint32_t));
|
||||||
|
print_with_escapes((const unsigned char *)(string_buf + payload + sizeof(uint32_t)), string_length);
|
||||||
os << '"';
|
os << '"';
|
||||||
break;
|
break;
|
||||||
case 'l': // we have a long int
|
case 'l': // we have a long int
|
||||||
|
@ -1474,8 +1502,10 @@ bool ParsedJson::printjson(std::ostream &os) {
|
||||||
|
|
||||||
WARN_UNUSED
|
WARN_UNUSED
|
||||||
bool ParsedJson::dump_raw_tape(std::ostream &os) {
|
bool ParsedJson::dump_raw_tape(std::ostream &os) {
|
||||||
if(!isvalid) { return false;
|
if(!isvalid) {
|
||||||
}
|
return false;
|
||||||
|
}
|
||||||
|
uint32_t string_length;
|
||||||
size_t tapeidx = 0;
|
size_t tapeidx = 0;
|
||||||
uint64_t tape_val = tape[tapeidx];
|
uint64_t tape_val = tape[tapeidx];
|
||||||
uint8_t type = (tape_val >> 56);
|
uint8_t type = (tape_val >> 56);
|
||||||
|
@ -1498,7 +1528,8 @@ bool ParsedJson::dump_raw_tape(std::ostream &os) {
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case '"': // we have a string
|
case '"': // we have a string
|
||||||
os << "string \"";
|
os << "string \"";
|
||||||
print_with_escapes((const unsigned char *)(string_buf + payload));
|
memcpy(&string_length,string_buf + payload, sizeof(uint32_t));
|
||||||
|
print_with_escapes((const unsigned char *)(string_buf + payload + sizeof(uint32_t)), string_length);
|
||||||
os << '"';
|
os << '"';
|
||||||
os << '\n';
|
os << '\n';
|
||||||
break;
|
break;
|
||||||
|
@ -1553,6 +1584,7 @@ bool ParsedJson::dump_raw_tape(std::ostream &os) {
|
||||||
}
|
}
|
||||||
/* end file src/parsedjson.cpp */
|
/* end file src/parsedjson.cpp */
|
||||||
/* begin file src/parsedjsoniterator.cpp */
|
/* begin file src/parsedjsoniterator.cpp */
|
||||||
|
#include <iterator>
|
||||||
|
|
||||||
ParsedJson::iterator::iterator(ParsedJson &pj_) : pj(pj_), depth(0), location(0), tape_length(0), depthindex(nullptr) {
|
ParsedJson::iterator::iterator(ParsedJson &pj_) : pj(pj_), depth(0), location(0), tape_length(0), depthindex(nullptr) {
|
||||||
if(pj.isValid()) {
|
if(pj.isValid()) {
|
||||||
|
@ -1659,24 +1691,32 @@ uint8_t ParsedJson::iterator::get_type() const {
|
||||||
|
|
||||||
|
|
||||||
int64_t ParsedJson::iterator::get_integer() const {
|
int64_t ParsedJson::iterator::get_integer() const {
|
||||||
if(location + 1 >= tape_length) { return 0;// default value in case of error
|
if(location + 1 >= tape_length) {
|
||||||
}
|
return 0;// default value in case of error
|
||||||
|
}
|
||||||
return static_cast<int64_t>(pj.tape[location + 1]);
|
return static_cast<int64_t>(pj.tape[location + 1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
double ParsedJson::iterator::get_double() const {
|
double ParsedJson::iterator::get_double() const {
|
||||||
if(location + 1 >= tape_length) { return NAN;// default value in case of error
|
if(location + 1 >= tape_length) {
|
||||||
}
|
return NAN;// default value in case of error
|
||||||
|
}
|
||||||
double answer;
|
double answer;
|
||||||
memcpy(&answer, & pj.tape[location + 1], sizeof(answer));
|
memcpy(&answer, & pj.tape[location + 1], sizeof(answer));
|
||||||
return answer;
|
return answer;
|
||||||
}
|
}
|
||||||
|
|
||||||
const char * ParsedJson::iterator::get_string() const {
|
const char * ParsedJson::iterator::get_string() const {
|
||||||
return reinterpret_cast<const char *>(pj.string_buf + (current_val & JSONVALUEMASK)) ;
|
return reinterpret_cast<const char *>(pj.string_buf + (current_val & JSONVALUEMASK) + sizeof(uint32_t)) ;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
uint32_t ParsedJson::iterator::get_string_length() const {
|
||||||
|
uint32_t answer;
|
||||||
|
memcpy(&answer, reinterpret_cast<const char *>(pj.string_buf + (current_val & JSONVALUEMASK)), sizeof(uint32_t));
|
||||||
|
return answer;
|
||||||
|
}
|
||||||
|
|
||||||
bool ParsedJson::iterator::is_object_or_array() const {
|
bool ParsedJson::iterator::is_object_or_array() const {
|
||||||
return is_object_or_array(get_type());
|
return is_object_or_array(get_type());
|
||||||
}
|
}
|
||||||
|
@ -1707,14 +1747,15 @@ bool ParsedJson::iterator::is_object_or_array(uint8_t type) {
|
||||||
|
|
||||||
bool ParsedJson::iterator::move_to_key(const char * key) {
|
bool ParsedJson::iterator::move_to_key(const char * key) {
|
||||||
if(down()) {
|
if(down()) {
|
||||||
do {
|
do {
|
||||||
assert(is_string());
|
assert(is_string());
|
||||||
bool rightkey = (strcmp(get_string(),key)==0);
|
bool rightkey = (strcmp(get_string(),key)==0);// null chars would fool this
|
||||||
next();
|
next();
|
||||||
if(rightkey) { return true;
|
if(rightkey) {
|
||||||
}
|
return true;
|
||||||
} while(next());
|
}
|
||||||
assert(up());// not found
|
} while(next());
|
||||||
|
assert(up());// not found
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -1813,15 +1854,17 @@ void ParsedJson::iterator::to_start_scope() {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ParsedJson::iterator::print(std::ostream &os, bool escape_strings) const {
|
bool ParsedJson::iterator::print(std::ostream &os, bool escape_strings) const {
|
||||||
if(!isOk()) { return false;
|
if(!isOk()) {
|
||||||
}
|
return false;
|
||||||
|
}
|
||||||
switch (current_type) {
|
switch (current_type) {
|
||||||
case '"': // we have a string
|
case '"': // we have a string
|
||||||
os << '"';
|
os << '"';
|
||||||
if(escape_strings) {
|
if(escape_strings) {
|
||||||
print_with_escapes(get_string(), os);
|
print_with_escapes(get_string(), os, get_string_length());
|
||||||
} else {
|
} else {
|
||||||
os << get_string();
|
// was: os << get_string();, but given that we can include null chars, we have to do something crazier:
|
||||||
|
std::copy(get_string(), get_string() + get_string_length(), std::ostream_iterator<char>(os));
|
||||||
}
|
}
|
||||||
os << '"';
|
os << '"';
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -1,13 +1,13 @@
|
||||||
/* auto-generated on Wed 13 Mar 2019 20:02:04 EDT. Do not edit! */
|
/* auto-generated on Wed 13 Mar 2019 21:02:37 EDT. Do not edit! */
|
||||||
/* begin file include/simdjson/simdjson_version.h */
|
/* begin file include/simdjson/simdjson_version.h */
|
||||||
// /include/simdjson/simdjson_version.h automatically generated by release.py, do not change by hand
|
// /include/simdjson/simdjson_version.h automatically generated by release.py, do not change by hand
|
||||||
#ifndef SIMDJSON_INCLUDE_SIMDJSON_VERSION
|
#ifndef SIMDJSON_INCLUDE_SIMDJSON_VERSION
|
||||||
#define SIMDJSON_INCLUDE_SIMDJSON_VERSION
|
#define SIMDJSON_INCLUDE_SIMDJSON_VERSION
|
||||||
#define SIMDJSON_VERSION 0.1.0
|
#define SIMDJSON_VERSION 0.1.1
|
||||||
enum {
|
enum {
|
||||||
SIMDJSON_VERSION_MAJOR = 0,
|
SIMDJSON_VERSION_MAJOR = 0,
|
||||||
SIMDJSON_VERSION_MINOR = 1,
|
SIMDJSON_VERSION_MINOR = 1,
|
||||||
SIMDJSON_VERSION_REVISION = 0
|
SIMDJSON_VERSION_REVISION = 1
|
||||||
};
|
};
|
||||||
#endif // SIMDJSON_INCLUDE_SIMDJSON_VERSION
|
#endif // SIMDJSON_INCLUDE_SIMDJSON_VERSION
|
||||||
/* end file include/simdjson/simdjson_version.h */
|
/* end file include/simdjson/simdjson_version.h */
|
||||||
|
@ -165,6 +165,9 @@ static inline void aligned_free(void *memblock) {
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
|
||||||
|
// we support documents up to 4GB
|
||||||
|
#define SIMDJSON_MAXSIZE_BYTES 0xFFFFFFFF
|
||||||
|
|
||||||
// the input buf should be readable up to buf + SIMDJSON_PADDING
|
// the input buf should be readable up to buf + SIMDJSON_PADDING
|
||||||
#define SIMDJSON_PADDING sizeof(__m256i)
|
#define SIMDJSON_PADDING sizeof(__m256i)
|
||||||
|
|
||||||
|
@ -349,87 +352,183 @@ inline size_t codepoint_to_utf8(uint32_t cp, uint8_t *c) {
|
||||||
#include <iomanip>
|
#include <iomanip>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
|
// ends with zero char
|
||||||
static inline void print_with_escapes(const unsigned char *src) {
|
static inline void print_with_escapes(const unsigned char *src) {
|
||||||
while (*src != 0u) {
|
while (*src) {
|
||||||
switch (*src) {
|
switch (*src) {
|
||||||
case '\b':
|
case '\b':
|
||||||
putchar('\\');
|
putchar('\\');
|
||||||
putchar('b');
|
putchar('b');
|
||||||
break;
|
break;
|
||||||
case '\f':
|
case '\f':
|
||||||
putchar('\\');
|
putchar('\\');
|
||||||
putchar('f');
|
putchar('f');
|
||||||
break;
|
break;
|
||||||
case '\n':
|
case '\n':
|
||||||
putchar('\\');
|
putchar('\\');
|
||||||
putchar('n');
|
putchar('n');
|
||||||
break;
|
break;
|
||||||
case '\r':
|
case '\r':
|
||||||
putchar('\\');
|
putchar('\\');
|
||||||
putchar('r');
|
putchar('r');
|
||||||
break;
|
break;
|
||||||
case '\"':
|
case '\"':
|
||||||
putchar('\\');
|
putchar('\\');
|
||||||
putchar('"');
|
putchar('"');
|
||||||
break;
|
break;
|
||||||
case '\t':
|
case '\t':
|
||||||
putchar('\\');
|
putchar('\\');
|
||||||
putchar('t');
|
putchar('t');
|
||||||
break;
|
break;
|
||||||
case '\\':
|
case '\\':
|
||||||
putchar('\\');
|
putchar('\\');
|
||||||
putchar('\\');
|
putchar('\\');
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
if (*src <= 0x1F) {
|
if (*src <= 0x1F) {
|
||||||
printf("\\u%04x", *src);
|
printf("\\u%04x", *src);
|
||||||
} else {
|
} else {
|
||||||
putchar(*src);
|
putchar(*src);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
src++;
|
src++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void print_with_escapes(const unsigned char *src, std::ostream &os) {
|
// ends with zero char
|
||||||
while (*src != 0u) {
|
static inline void print_with_escapes(const unsigned char *src,
|
||||||
|
std::ostream &os) {
|
||||||
|
while (*src) {
|
||||||
switch (*src) {
|
switch (*src) {
|
||||||
case '\b':
|
case '\b':
|
||||||
os << '\\';
|
os << '\\';
|
||||||
os << 'b';
|
os << 'b';
|
||||||
break;
|
break;
|
||||||
case '\f':
|
case '\f':
|
||||||
os << '\\';
|
os << '\\';
|
||||||
os << 'f';
|
os << 'f';
|
||||||
break;
|
break;
|
||||||
case '\n':
|
case '\n':
|
||||||
os << '\\';
|
os << '\\';
|
||||||
os << 'n';
|
os << 'n';
|
||||||
break;
|
break;
|
||||||
case '\r':
|
case '\r':
|
||||||
os << '\\';
|
os << '\\';
|
||||||
os << 'r';
|
os << 'r';
|
||||||
break;
|
break;
|
||||||
case '\"':
|
case '\"':
|
||||||
os << '\\';
|
os << '\\';
|
||||||
os << '"';
|
os << '"';
|
||||||
break;
|
break;
|
||||||
case '\t':
|
case '\t':
|
||||||
os << '\\';
|
os << '\\';
|
||||||
os << 't';
|
os << 't';
|
||||||
break;
|
break;
|
||||||
case '\\':
|
case '\\':
|
||||||
os << '\\';
|
os << '\\';
|
||||||
os << '\\';
|
os << '\\';
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
if (*src <= 0x1F) {
|
if (*src <= 0x1F) {
|
||||||
std::ios::fmtflags f(os.flags());
|
std::ios::fmtflags f(os.flags());
|
||||||
os << std::hex << std::setw(4) << std::setfill('0') << static_cast<int>(*src);
|
os << std::hex << std::setw(4) << std::setfill('0')
|
||||||
os.flags(f);
|
<< static_cast<int>(*src);
|
||||||
} else {
|
os.flags(f);
|
||||||
os << *src;
|
} else {
|
||||||
|
os << *src;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
src++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// print len chars
|
||||||
|
static inline void print_with_escapes(const unsigned char *src, size_t len) {
|
||||||
|
const unsigned char *finalsrc = src + len;
|
||||||
|
while (src < finalsrc) {
|
||||||
|
switch (*src) {
|
||||||
|
case '\b':
|
||||||
|
putchar('\\');
|
||||||
|
putchar('b');
|
||||||
|
break;
|
||||||
|
case '\f':
|
||||||
|
putchar('\\');
|
||||||
|
putchar('f');
|
||||||
|
break;
|
||||||
|
case '\n':
|
||||||
|
putchar('\\');
|
||||||
|
putchar('n');
|
||||||
|
break;
|
||||||
|
case '\r':
|
||||||
|
putchar('\\');
|
||||||
|
putchar('r');
|
||||||
|
break;
|
||||||
|
case '\"':
|
||||||
|
putchar('\\');
|
||||||
|
putchar('"');
|
||||||
|
break;
|
||||||
|
case '\t':
|
||||||
|
putchar('\\');
|
||||||
|
putchar('t');
|
||||||
|
break;
|
||||||
|
case '\\':
|
||||||
|
putchar('\\');
|
||||||
|
putchar('\\');
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
if (*src <= 0x1F) {
|
||||||
|
printf("\\u%04x", *src);
|
||||||
|
} else {
|
||||||
|
putchar(*src);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
src++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// print len chars
|
||||||
|
static inline void print_with_escapes(const unsigned char *src,
|
||||||
|
std::ostream &os, size_t len) {
|
||||||
|
const unsigned char *finalsrc = src + len;
|
||||||
|
while (src < finalsrc) {
|
||||||
|
switch (*src) {
|
||||||
|
case '\b':
|
||||||
|
os << '\\';
|
||||||
|
os << 'b';
|
||||||
|
break;
|
||||||
|
case '\f':
|
||||||
|
os << '\\';
|
||||||
|
os << 'f';
|
||||||
|
break;
|
||||||
|
case '\n':
|
||||||
|
os << '\\';
|
||||||
|
os << 'n';
|
||||||
|
break;
|
||||||
|
case '\r':
|
||||||
|
os << '\\';
|
||||||
|
os << 'r';
|
||||||
|
break;
|
||||||
|
case '\"':
|
||||||
|
os << '\\';
|
||||||
|
os << '"';
|
||||||
|
break;
|
||||||
|
case '\t':
|
||||||
|
os << '\\';
|
||||||
|
os << 't';
|
||||||
|
break;
|
||||||
|
case '\\':
|
||||||
|
os << '\\';
|
||||||
|
os << '\\';
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
if (*src <= 0x1F) {
|
||||||
|
std::ios::fmtflags f(os.flags());
|
||||||
|
os << std::hex << std::setw(4) << std::setfill('0')
|
||||||
|
<< static_cast<int>(*src);
|
||||||
|
os.flags(f);
|
||||||
|
} else {
|
||||||
|
os << *src;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
src++;
|
src++;
|
||||||
}
|
}
|
||||||
|
@ -439,6 +538,12 @@ static inline void print_with_escapes(const char *src, std::ostream &os) {
|
||||||
print_with_escapes(reinterpret_cast<const unsigned char *>(src), os);
|
print_with_escapes(reinterpret_cast<const unsigned char *>(src), os);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline void print_with_escapes(const char *src, std::ostream &os,
|
||||||
|
size_t len) {
|
||||||
|
print_with_escapes(reinterpret_cast<const unsigned char *>(src), os, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
#
|
||||||
#endif
|
#endif
|
||||||
/* end file include/simdjson/jsonformatutils.h */
|
/* end file include/simdjson/jsonformatutils.h */
|
||||||
/* begin file include/simdjson/jsonioutil.h */
|
/* begin file include/simdjson/jsonioutil.h */
|
||||||
|
@ -35907,8 +36012,12 @@ public:
|
||||||
// get the string value at this node (NULL ended); valid only if we're at "
|
// get the string value at this node (NULL ended); valid only if we're at "
|
||||||
// note that tabs, and line endings are escaped in the returned value (see print_with_escapes)
|
// note that tabs, and line endings are escaped in the returned value (see print_with_escapes)
|
||||||
// return value is valid UTF-8
|
// return value is valid UTF-8
|
||||||
|
// It may contain NULL chars within the string: get_string_length determines the true
|
||||||
|
// string length.
|
||||||
const char * get_string() const;
|
const char * get_string() const;
|
||||||
|
|
||||||
|
uint32_t get_string_length() const;
|
||||||
|
|
||||||
// get the double value at this node; valid only if
|
// get the double value at this node; valid only if
|
||||||
// we're at "d"
|
// we're at "d"
|
||||||
double get_double() const;
|
double get_double() const;
|
||||||
|
@ -35931,6 +36040,9 @@ public:
|
||||||
// if successful, we are left pointing at the value,
|
// if successful, we are left pointing at the value,
|
||||||
// if not, we are still pointing at the object ({)
|
// if not, we are still pointing at the object ({)
|
||||||
// (in case of repeated keys, this only finds the first one)
|
// (in case of repeated keys, this only finds the first one)
|
||||||
|
// We seek the key using C's strcmp so if your JSON strings contain
|
||||||
|
// NULL chars, this would trigger a false positive: if you expect that
|
||||||
|
// to be the case, take extra precautions.
|
||||||
bool move_to_key(const char * key);
|
bool move_to_key(const char * key);
|
||||||
|
|
||||||
// throughout return true if we can do the navigation, false
|
// throughout return true if we can do the navigation, false
|
||||||
|
@ -36129,67 +36241,51 @@ really_inline bool parse_string(const uint8_t *buf, UNUSED size_t len,
|
||||||
pj.write_tape(0, '"');// don't bother with the string parsing at all
|
pj.write_tape(0, '"');// don't bother with the string parsing at all
|
||||||
return true; // always succeeds
|
return true; // always succeeds
|
||||||
#else
|
#else
|
||||||
|
pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"');
|
||||||
const uint8_t *src = &buf[offset + 1]; // we know that buf at offset is a "
|
const uint8_t *src = &buf[offset + 1]; // we know that buf at offset is a "
|
||||||
uint8_t *dst = pj.current_string_buf_loc;
|
uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t);
|
||||||
#ifdef JSON_TEST_STRINGS // for unit testing
|
const uint8_t *const start_of_string = dst;
|
||||||
uint8_t *const start_of_string = dst;
|
|
||||||
#endif
|
|
||||||
while (1) {
|
while (1) {
|
||||||
__m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
|
__m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
|
||||||
auto bs_bits =
|
|
||||||
static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\'))));
|
|
||||||
auto quote_bits =
|
|
||||||
static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('"'))));
|
|
||||||
#define CHECKUNESCAPED
|
|
||||||
// All Unicode characters may be placed within the
|
|
||||||
// quotation marks, except for the characters that MUST be escaped:
|
|
||||||
// quotation mark, reverse solidus, and the control characters (U+0000
|
|
||||||
//through U+001F).
|
|
||||||
// https://tools.ietf.org/html/rfc8259
|
|
||||||
#ifdef CHECKUNESCAPED
|
|
||||||
__m256i unitsep = _mm256_set1_epi8(0x1F);
|
|
||||||
__m256i unescaped_vec = _mm256_cmpeq_epi8(_mm256_max_epu8(unitsep,v),unitsep);// could do it with saturated subtraction
|
|
||||||
#endif // CHECKUNESCAPED
|
|
||||||
|
|
||||||
uint32_t quote_dist = trailingzeroes(quote_bits);
|
|
||||||
uint32_t bs_dist = trailingzeroes(bs_bits);
|
|
||||||
// store to dest unconditionally - we can overwrite the bits we don't like
|
// store to dest unconditionally - we can overwrite the bits we don't like
|
||||||
// later
|
// later
|
||||||
_mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), v);
|
_mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), v);
|
||||||
if (quote_dist < bs_dist) {
|
auto bs_bits =
|
||||||
|
static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\'))));
|
||||||
|
auto quote_mask = _mm256_cmpeq_epi8(v, _mm256_set1_epi8('"'));
|
||||||
|
auto quote_bits =
|
||||||
|
static_cast<uint32_t>(_mm256_movemask_epi8(quote_mask));
|
||||||
|
if(((bs_bits - 1) & quote_bits) != 0 ) {
|
||||||
// we encountered quotes first. Move dst to point to quotes and exit
|
// we encountered quotes first. Move dst to point to quotes and exit
|
||||||
dst[quote_dist] = 0; // null terminate and get out
|
|
||||||
|
|
||||||
pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"');
|
// find out where the quote is...
|
||||||
|
uint32_t quote_dist = trailingzeroes(quote_bits);
|
||||||
|
|
||||||
|
// NULL termination is still handy if you expect all your strings to be NULL terminated?
|
||||||
|
// It comes at a small cost
|
||||||
|
dst[quote_dist] = 0;
|
||||||
|
|
||||||
|
uint32_t str_length = (dst - start_of_string) + quote_dist;
|
||||||
|
memcpy(pj.current_string_buf_loc,&str_length, sizeof(uint32_t));
|
||||||
|
///////////////////////
|
||||||
|
// Above, check for overflow in case someone has a crazy string (>=4GB?)
|
||||||
|
// But only add the overflow check when the document itself exceeds 4GB
|
||||||
|
// Currently unneeded because we refuse to parse docs larger than or equal to 4GB.
|
||||||
|
////////////////////////
|
||||||
|
|
||||||
|
|
||||||
|
// we advance the pointer, accounting for the fact that we have a NULL termination
|
||||||
|
pj.current_string_buf_loc = dst + quote_dist + 1;
|
||||||
|
|
||||||
pj.current_string_buf_loc = dst + quote_dist + 1; // the +1 is due to the 0 value
|
|
||||||
#ifdef CHECKUNESCAPED
|
|
||||||
// check that there is no unescaped char before the quote
|
|
||||||
auto unescaped_bits = static_cast<uint32_t>(_mm256_movemask_epi8(unescaped_vec));
|
|
||||||
bool is_ok = ((quote_bits - 1) & (~ quote_bits) & unescaped_bits) == 0;
|
|
||||||
#ifdef JSON_TEST_STRINGS // for unit testing
|
#ifdef JSON_TEST_STRINGS // for unit testing
|
||||||
if(is_ok) foundString(buf + offset,start_of_string,pj.current_string_buf_loc - 1);
|
foundString(buf + offset,start_of_string,pj.current_string_buf_loc - 1);
|
||||||
else foundBadString(buf + offset);
|
|
||||||
#endif // JSON_TEST_STRINGS
|
|
||||||
return is_ok;
|
|
||||||
#else //CHECKUNESCAPED
|
|
||||||
#ifdef JSON_TEST_STRINGS // for unit testing
|
|
||||||
foundString(buf + offset,start_of_string,pj.current_string_buf_loc - 1);
|
|
||||||
#endif // JSON_TEST_STRINGS
|
#endif // JSON_TEST_STRINGS
|
||||||
return true;
|
return true;
|
||||||
#endif //CHECKUNESCAPED
|
}
|
||||||
} if (quote_dist > bs_dist) {
|
if(((quote_bits - 1) & bs_bits ) != 0 ) {
|
||||||
|
// find out where the backslash is
|
||||||
|
uint32_t bs_dist = trailingzeroes(bs_bits);
|
||||||
uint8_t escape_char = src[bs_dist + 1];
|
uint8_t escape_char = src[bs_dist + 1];
|
||||||
#ifdef CHECKUNESCAPED
|
|
||||||
// we are going to need the unescaped_bits to check for unescaped chars
|
|
||||||
auto unescaped_bits = static_cast<uint32_t>(_mm256_movemask_epi8(unescaped_vec));
|
|
||||||
if(((bs_bits - 1) & (~ bs_bits) & unescaped_bits) != 0) {
|
|
||||||
#ifdef JSON_TEST_STRINGS // for unit testing
|
|
||||||
foundBadString(buf + offset);
|
|
||||||
#endif // JSON_TEST_STRINGS
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
#endif //CHECKUNESCAPED
|
|
||||||
// we encountered backslash first. Handle backslash
|
// we encountered backslash first. Handle backslash
|
||||||
if (escape_char == 'u') {
|
if (escape_char == 'u') {
|
||||||
// move src/dst up to the start; they will be further adjusted
|
// move src/dst up to the start; they will be further adjusted
|
||||||
|
@ -36223,15 +36319,6 @@ really_inline bool parse_string(const uint8_t *buf, UNUSED size_t len,
|
||||||
// neither.
|
// neither.
|
||||||
src += 32;
|
src += 32;
|
||||||
dst += 32;
|
dst += 32;
|
||||||
#ifdef CHECKUNESCAPED
|
|
||||||
// check for unescaped chars
|
|
||||||
if(_mm256_testz_si256(unescaped_vec,unescaped_vec) != 1) {
|
|
||||||
#ifdef JSON_TEST_STRINGS // for unit testing
|
|
||||||
foundBadString(buf + offset);
|
|
||||||
#endif // JSON_TEST_STRINGS
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
#endif // CHECKUNESCAPED
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// can't be reached
|
// can't be reached
|
||||||
|
@ -36789,7 +36876,7 @@ WARN_UNUSED
|
||||||
int json_parse(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded = true);
|
int json_parse(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded = true);
|
||||||
|
|
||||||
// Parse a document found in buf, need to preallocate ParsedJson.
|
// Parse a document found in buf, need to preallocate ParsedJson.
|
||||||
// Return false in case of a failure. You can also check validity
|
// Return SUCCESS (an integer = 1) in case of a success. You can also check validity
|
||||||
// by calling pj.isValid(). The same ParsedJson can be reused for other documents.
|
// by calling pj.isValid(). The same ParsedJson can be reused for other documents.
|
||||||
//
|
//
|
||||||
// If reallocifneeded is true (default) then a temporary buffer is created when needed during processing
|
// If reallocifneeded is true (default) then a temporary buffer is created when needed during processing
|
||||||
|
@ -36802,7 +36889,7 @@ inline int json_parse(const char * buf, size_t len, ParsedJson &pj, bool realloc
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parse a document found in buf, need to preallocate ParsedJson.
|
// Parse a document found in buf, need to preallocate ParsedJson.
|
||||||
// Return false in case of a failure. You can also check validity
|
// Return SUCCESS (an integer = 1) in case of a success. You can also check validity
|
||||||
// by calling pj.isValid(). The same ParsedJson can be reused for other documents.
|
// by calling pj.isValid(). The same ParsedJson can be reused for other documents.
|
||||||
//
|
//
|
||||||
// If reallocifneeded is true (default) then a temporary buffer is created when needed during processing
|
// If reallocifneeded is true (default) then a temporary buffer is created when needed during processing
|
||||||
|
|
Loading…
Reference in New Issue