Add fallback parser for pre-SSE4.2 machines
This commit is contained in:
parent
8e2c06cb0e
commit
af203aaf86
3
Makefile
3
Makefile
|
@ -61,9 +61,10 @@ endif # ifeq ($(MEMSANITIZE),1)
|
|||
SRCHEADERS_GENERIC=src/generic/atomparsing.h src/generic/numberparsing.h src/generic/stage1_find_marks.h src/generic/stage2_build_tape.h src/generic/stringparsing.h src/generic/stage2_streaming_build_tape.h src/generic/utf8_fastvalidate_algorithm.h src/generic/utf8_lookup_algorithm.h src/generic/utf8_lookup2_algorithm.h src/generic/utf8_range_algorithm.h src/generic/utf8_zwegner_algorithm.h
|
||||
SRCHEADERS_ARM64= src/arm64/bitmanipulation.h src/arm64/bitmask.h src/arm64/intrinsics.h src/arm64/numberparsing.h src/arm64/simd.h src/arm64/stage1_find_marks.h src/arm64/stage2_build_tape.h src/arm64/stringparsing.h
|
||||
SRCHEADERS_HASWELL= src/haswell/bitmanipulation.h src/haswell/bitmask.h src/haswell/intrinsics.h src/haswell/numberparsing.h src/haswell/simd.h src/haswell/stage1_find_marks.h src/haswell/stage2_build_tape.h src/haswell/stringparsing.h
|
||||
SRCHEADERS_FALLBACK= src/fallback/implementation.h src/fallback/stage1_find_marks.h src/fallback/stage2_build_tape.h
|
||||
SRCHEADERS_WESTMERE=src/westmere/bitmanipulation.h src/westmere/bitmask.h src/westmere/intrinsics.h src/westmere/numberparsing.h src/westmere/simd.h src/westmere/stage1_find_marks.h src/westmere/stage2_build_tape.h src/westmere/stringparsing.h
|
||||
SRCHEADERS_SRC=src/isadetection.h src/jsoncharutils.h src/simdprune_tables.h src/implementation.cpp src/stage1_find_marks.cpp src/stage2_build_tape.cpp src/document_parser_callbacks.h
|
||||
SRCHEADERS=$(SRCHEADERS_SRC) $(SRCHEADERS_GENERIC) $(SRCHEADERS_ARM64) $(SRCHEADERS_HASWELL) $(SRCHEADERS_WESTMERE)
|
||||
SRCHEADERS=$(SRCHEADERS_SRC) $(SRCHEADERS_GENERIC) $(SRCHEADERS_ARM64) $(SRCHEADERS_HASWELL) $(SRCHEADERS_WESTMERE) $(SRCHEADERS_FALLBACK)
|
||||
|
||||
INCLUDEHEADERS=include/simdjson.h include/simdjson/common_defs.h include/simdjson/internal/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonminifier.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/inline/padded_string.h include/simdjson/document.h include/simdjson/inline/document.h include/simdjson/document_iterator.h include/simdjson/inline/document_iterator.h include/simdjson/document_stream.h include/simdjson/inline/document_stream.h include/simdjson/implementation.h include/simdjson/parsedjson.h include/simdjson/jsonstream.h include/simdjson/inline/jsonstream.h include/simdjson/portability.h include/simdjson/error.h include/simdjson/inline/error.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h
|
||||
|
||||
|
|
|
@ -128,7 +128,7 @@ struct option_struct {
|
|||
break;
|
||||
default:
|
||||
// reaching here means an argument was given to getopt() which did not have a case label
|
||||
exit_error("Unexpected argument - missing case for option "+
|
||||
exit_usage("Unexpected argument - missing case for option "+
|
||||
std::string(1,static_cast<char>(c))+
|
||||
" (programming error)");
|
||||
}
|
||||
|
|
|
@ -48,6 +48,7 @@
|
|||
#endif
|
||||
|
||||
// under GCC and CLANG, we use these two macros
|
||||
#define TARGET_FALLBACK TARGET_REGION("")
|
||||
#define TARGET_HASWELL TARGET_REGION("avx2,bmi,pclmul,lzcnt")
|
||||
#define TARGET_WESTMERE TARGET_REGION("sse4.2,pclmul")
|
||||
#define TARGET_ARM64
|
||||
|
|
|
@ -42,6 +42,9 @@ set(SIMDJSON_SRC_HEADERS
|
|||
arm64/stage1_find_marks.h
|
||||
arm64/stage2_build_tape.h
|
||||
arm64/stringparsing.h
|
||||
fallback/implementation.h
|
||||
fallback/stage1_find_marks.h
|
||||
fallback/stage2_build_tape.h
|
||||
generic/atomparsing.h
|
||||
generic/numberparsing.h
|
||||
generic/stage1_find_marks.h
|
||||
|
|
|
@ -15,16 +15,24 @@ namespace simdjson::arm64 {
|
|||
using namespace simd;
|
||||
|
||||
// Holds backslashes and quotes locations.
|
||||
struct parse_string_helper {
|
||||
struct backslash_and_quote {
|
||||
public:
|
||||
static constexpr uint32_t BYTES_PROCESSED = 32;
|
||||
really_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst);
|
||||
|
||||
really_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; }
|
||||
really_inline bool has_backslash() { return bs_bits != 0; }
|
||||
really_inline int quote_index() { return trailing_zeroes(quote_bits); }
|
||||
really_inline int backslash_index() { return trailing_zeroes(bs_bits); }
|
||||
|
||||
uint32_t bs_bits;
|
||||
uint32_t quote_bits;
|
||||
static const uint32_t BYTES_PROCESSED = 32;
|
||||
};
|
||||
}; // struct backslash_and_quote
|
||||
|
||||
really_inline parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst) {
|
||||
really_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) {
|
||||
// this can read up to 31 bytes beyond the buffer size, but we require
|
||||
// SIMDJSON_PADDING of padding
|
||||
static_assert(SIMDJSON_PADDING >= (parse_string_helper::BYTES_PROCESSED - 1));
|
||||
static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1));
|
||||
simd8<uint8_t> v0(src);
|
||||
simd8<uint8_t> v1(src + sizeof(v0));
|
||||
v0.store(dst);
|
||||
|
|
|
@ -0,0 +1,65 @@
|
|||
#ifndef SIMDJSON_FALLBACK_BITMANIPULATION_H
|
||||
#define SIMDJSON_FALLBACK_BITMANIPULATION_H
|
||||
|
||||
#include "simdjson.h"
|
||||
#include <limits>
|
||||
|
||||
TARGET_FALLBACK
|
||||
namespace simdjson::fallback {
|
||||
|
||||
#ifndef _MSC_VER
|
||||
// We sometimes call trailing_zero on inputs that are zero,
|
||||
// but the algorithms do not end up using the returned value.
|
||||
// Sadly, sanitizers are not smart enough to figure it out.
|
||||
__attribute__((no_sanitize("undefined"))) // this is deliberate
|
||||
#endif // _MSC_VER
|
||||
/* result might be undefined when input_num is zero */
|
||||
really_inline int trailing_zeroes(uint64_t input_num) {
|
||||
|
||||
#ifdef _MSC_VER
|
||||
unsigned long ret;
|
||||
// Search the mask data from least significant bit (LSB)
|
||||
// to the most significant bit (MSB) for a set bit (1).
|
||||
_BitScanForward64(&ret, input_num);
|
||||
return (int)ret;
|
||||
#else
|
||||
return __builtin_ctzll(input_num);
|
||||
#endif // _MSC_VER
|
||||
|
||||
} // namespace simdjson::arm64
|
||||
|
||||
/* result might be undefined when input_num is zero */
|
||||
really_inline uint64_t clear_lowest_bit(uint64_t input_num) {
|
||||
return input_num & (input_num-1);
|
||||
}
|
||||
|
||||
/* result might be undefined when input_num is zero */
|
||||
really_inline int leading_zeroes(uint64_t input_num) {
|
||||
#ifdef _MSC_VER
|
||||
unsigned long leading_zero = 0;
|
||||
// Search the mask data from most significant bit (MSB)
|
||||
// to least significant bit (LSB) for a set bit (1).
|
||||
if (_BitScanReverse64(&leading_zero, input_num))
|
||||
return (int)(63 - leading_zero);
|
||||
else
|
||||
return 64;
|
||||
#else
|
||||
return __builtin_clzll(input_num);
|
||||
#endif// _MSC_VER
|
||||
}
|
||||
|
||||
/*
 * Stores the (wrapping) sum value1 + value2 in *result.
 * Returns true when the addition wrapped around, i.e. overflowed 64 bits.
 */
really_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
  const uint64_t sum = value1 + value2;
  *result = sum;
  // An unsigned sum smaller than one of its operands can only come from wrap-around.
  return sum < value1;
}
|
||||
|
||||
/*
 * Stores the (wrapping) product value1 * value2 in *result.
 * Returns true when the exact product does not fit in 64 bits.
 */
really_inline bool mul_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
  *result = value1 * value2;
  // TODO there must be a faster way
  if (value2 == 0) {
    // Multiplying by zero never overflows (and must not reach the division below).
    return false;
  }
  return value1 > std::numeric_limits<uint64_t>::max() / value2;
}
|
||||
|
||||
} // namespace simdjson::fallback
|
||||
UNTARGET_REGION
|
||||
|
||||
#endif // SIMDJSON_FALLBACK_BITMANIPULATION_H
|
|
@ -0,0 +1,26 @@
|
|||
#ifndef SIMDJSON_FALLBACK_IMPLEMENTATION_H
|
||||
#define SIMDJSON_FALLBACK_IMPLEMENTATION_H
|
||||
|
||||
#include "simdjson.h"
|
||||
#include "isadetection.h"
|
||||
|
||||
TARGET_FALLBACK
|
||||
namespace simdjson::fallback {
|
||||
|
||||
class implementation final : public simdjson::implementation {
|
||||
public:
|
||||
really_inline implementation() : simdjson::implementation(
|
||||
"fallback",
|
||||
"Generic fallback implementation",
|
||||
0
|
||||
) {}
|
||||
WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
|
||||
WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept final;
|
||||
WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
|
||||
WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser, size_t &next_json) const noexcept final;
|
||||
};
|
||||
|
||||
} // namespace simdjson::fallback
|
||||
UNTARGET_REGION
|
||||
|
||||
#endif // SIMDJSON_FALLBACK_IMPLEMENTATION_H
|
|
@ -0,0 +1,34 @@
|
|||
#ifndef SIMDJSON_FALLBACK_NUMBERPARSING_H
|
||||
#define SIMDJSON_FALLBACK_NUMBERPARSING_H
|
||||
|
||||
#include "simdjson.h"
|
||||
#include "jsoncharutils.h"
|
||||
#include "fallback/bitmanipulation.h"
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
void found_invalid_number(const uint8_t *buf);
|
||||
void found_integer(int64_t result, const uint8_t *buf);
|
||||
void found_unsigned_integer(uint64_t result, const uint8_t *buf);
|
||||
void found_float(double result, const uint8_t *buf);
|
||||
#endif
|
||||
|
||||
TARGET_FALLBACK
|
||||
namespace simdjson::fallback {
|
||||
// Converts the eight characters chars[0..7] to their base-10 value by folding in
// each character relative to '0'. The characters are not checked to be digits.
static inline uint32_t parse_eight_digits_unrolled(const char *chars) {
  uint32_t value = 0;
  const char *const end = chars + 8;
  for (const char *digit = chars; digit != end; ++digit) {
    value = value * 10 + (*digit - '0');
  }
  return value;
}
|
||||
|
||||
#define SWAR_NUMBER_PARSING
|
||||
|
||||
#include "generic/numberparsing.h"
|
||||
|
||||
} // namespace simdjson::fallback
|
||||
UNTARGET_REGION
|
||||
|
||||
#endif // SIMDJSON_FALLBACK_NUMBERPARSING_H
|
|
@ -0,0 +1,160 @@
|
|||
#ifndef SIMDJSON_FALLBACK_STAGE1_FIND_MARKS_H
|
||||
#define SIMDJSON_FALLBACK_STAGE1_FIND_MARKS_H
|
||||
|
||||
#include "simdjson.h"
|
||||
#include "fallback/implementation.h"
|
||||
|
||||
TARGET_FALLBACK
|
||||
namespace simdjson::fallback::stage1 {
|
||||
|
||||
class structural_scanner {
|
||||
public:
|
||||
|
||||
really_inline structural_scanner(const uint8_t *_buf, uint32_t _len, document::parser &_doc_parser, bool _streaming)
|
||||
: buf{_buf}, next_structural_index{_doc_parser.structural_indexes.get()}, doc_parser{_doc_parser}, idx{0}, len{_len}, error{SUCCESS}, streaming{_streaming} {}
|
||||
|
||||
really_inline void add_structural() {
|
||||
*next_structural_index = idx;
|
||||
next_structural_index++;
|
||||
}
|
||||
|
||||
really_inline bool is_continuation(uint8_t c) {
|
||||
return (c & 0b11000000) == 0b10000000;
|
||||
}
|
||||
|
||||
really_inline void validate_utf8_character() {
|
||||
// Continuation
|
||||
if (unlikely((buf[idx] & 0b01000000) == 0)) {
|
||||
// extra continuation
|
||||
error = UTF8_ERROR;
|
||||
idx++;
|
||||
return;
|
||||
}
|
||||
|
||||
// 2-byte
|
||||
if ((buf[idx] & 0b00100000) == 0) {
|
||||
// missing continuation
|
||||
if (unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) { error = UTF8_ERROR; idx++; return; }
|
||||
// overlong: 1100000_ 10______
|
||||
if (buf[idx] <= 0b11000001) { error = UTF8_ERROR; }
|
||||
idx += 2;
|
||||
return;
|
||||
}
|
||||
|
||||
// 3-byte
|
||||
if ((buf[idx] & 0b00010000) == 0) {
|
||||
// missing continuation
|
||||
if (unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) { error = UTF8_ERROR; idx++; return; }
|
||||
// overlong: 11100000 100_____ ________
|
||||
if (buf[idx] == 0b11100000 && buf[idx+1] <= 0b10011111) { error = UTF8_ERROR; }
|
||||
// surrogates: U+D800-U+DFFF 11101101 101_____
|
||||
if (buf[idx] == 0b11101101 && buf[idx+1] >= 0b10100000) { error = UTF8_ERROR; }
|
||||
idx += 3;
|
||||
return;
|
||||
}
|
||||
|
||||
// 4-byte
|
||||
// missing continuation
|
||||
if (unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) { error = UTF8_ERROR; idx++; return; }
|
||||
// overlong: 11110000 1000____ ________ ________
|
||||
if (buf[idx] == 0b11110000 && buf[idx+1] <= 0b10001111) { error = UTF8_ERROR; }
|
||||
// too large: > U+10FFFF:
|
||||
// 11110100 (1001|101_)____
|
||||
// 1111(1___|011_|0101) 10______
|
||||
// also includes 5, 6, 7 and 8 byte characters:
|
||||
// 11111___
|
||||
if (buf[idx] == 0b11110100 && buf[idx+1] >= 0b10010000) { error = UTF8_ERROR; }
|
||||
if (buf[idx] >= 0b11110101) { error = UTF8_ERROR; }
|
||||
idx += 4;
|
||||
}
|
||||
|
||||
really_inline void validate_string() {
|
||||
idx++; // skip first quote
|
||||
while (idx < len && buf[idx] != '"') {
|
||||
if (buf[idx] == '\\') {
|
||||
idx += 2;
|
||||
} else if (unlikely(buf[idx] & 0b10000000)) {
|
||||
validate_utf8_character();
|
||||
} else {
|
||||
if (buf[idx] < 0x20) { error = UNESCAPED_CHARS; }
|
||||
idx++;
|
||||
}
|
||||
}
|
||||
if (idx >= len && !streaming) { error = UNCLOSED_STRING; }
|
||||
}
|
||||
|
||||
really_inline bool is_whitespace_or_operator(uint8_t c) {
|
||||
switch (c) {
|
||||
case '{': case '}': case '[': case ']': case ',': case ':':
|
||||
case ' ': case '\r': case '\n': case '\t':
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Parse the entire input in STEP_SIZE-byte chunks.
|
||||
//
|
||||
really_inline error_code scan() {
|
||||
for (;idx<len;idx++) {
|
||||
switch (buf[idx]) {
|
||||
// String
|
||||
case '"':
|
||||
add_structural();
|
||||
validate_string();
|
||||
break;
|
||||
// Operator
|
||||
case '{': case '}': case '[': case ']': case ',': case ':':
|
||||
add_structural();
|
||||
break;
|
||||
// Whitespace
|
||||
case ' ': case '\r': case '\n': case '\t':
|
||||
break;
|
||||
// Primitive or invalid character (invalid characters will be checked in stage 2)
|
||||
default:
|
||||
// Anything else, add the structural and go until we find the next one
|
||||
add_structural();
|
||||
while (idx+1<len && !is_whitespace_or_operator(buf[idx+1])) {
|
||||
idx++;
|
||||
};
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (unlikely(next_structural_index == doc_parser.structural_indexes.get())) {
|
||||
return EMPTY;
|
||||
}
|
||||
*next_structural_index = len;
|
||||
next_structural_index++;
|
||||
doc_parser.n_structural_indexes = next_structural_index - doc_parser.structural_indexes.get();
|
||||
return error;
|
||||
}
|
||||
|
||||
private:
|
||||
const uint8_t *buf;
|
||||
uint32_t *next_structural_index;
|
||||
document::parser &doc_parser;
|
||||
uint32_t idx;
|
||||
uint32_t len;
|
||||
error_code error;
|
||||
bool streaming;
|
||||
}; // structural_scanner
|
||||
|
||||
} // simdjson::fallback::stage1
|
||||
UNTARGET_REGION
|
||||
|
||||
TARGET_FALLBACK
|
||||
namespace simdjson::fallback {
|
||||
|
||||
// Runs the scalar structural scanner over buf[0..len).
// Returns CAPACITY when the input is larger than the parser's capacity,
// otherwise whatever the scanner reports.
WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept {
  const bool exceeds_capacity = len > parser.capacity();
  if (unlikely(exceeds_capacity)) { return CAPACITY; }
  return stage1::structural_scanner(buf, len, parser, streaming).scan();
}
|
||||
|
||||
} // namespace simdjson::fallback
|
||||
UNTARGET_REGION
|
||||
|
||||
#endif // SIMDJSON_FALLBACK_STAGE1_FIND_MARKS_H
|
|
@ -0,0 +1,20 @@
|
|||
#ifndef SIMDJSON_FALLBACK_STAGE2_BUILD_TAPE_H
|
||||
#define SIMDJSON_FALLBACK_STAGE2_BUILD_TAPE_H
|
||||
|
||||
#include "simdjson.h"
|
||||
|
||||
#include "fallback/implementation.h"
|
||||
#include "fallback/stringparsing.h"
|
||||
#include "fallback/numberparsing.h"
|
||||
|
||||
TARGET_FALLBACK
|
||||
namespace simdjson::fallback {
|
||||
|
||||
#include "generic/atomparsing.h"
|
||||
#include "generic/stage2_build_tape.h"
|
||||
#include "generic/stage2_streaming_build_tape.h"
|
||||
|
||||
} // namespace simdjson::fallback
|
||||
UNTARGET_REGION
|
||||
|
||||
#endif // SIMDJSON_FALLBACK_STAGE2_BUILD_TAPE_H
|
|
@ -0,0 +1,35 @@
|
|||
#ifndef SIMDJSON_FALLBACK_STRINGPARSING_H
|
||||
#define SIMDJSON_FALLBACK_STRINGPARSING_H
|
||||
|
||||
#include "simdjson.h"
|
||||
#include "jsoncharutils.h"
|
||||
|
||||
TARGET_FALLBACK
|
||||
namespace simdjson::fallback {
|
||||
|
||||
// Holds backslashes and quotes locations.
|
||||
struct backslash_and_quote {
|
||||
public:
|
||||
static constexpr uint32_t BYTES_PROCESSED = 1;
|
||||
really_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst);
|
||||
|
||||
really_inline bool has_quote_first() { return c == '"'; }
|
||||
really_inline bool has_backslash() { return c == '\\'; }
|
||||
really_inline int quote_index() { return c == '"' ? 0 : 1; }
|
||||
really_inline int backslash_index() { return c == '\\' ? 0 : 1; }
|
||||
|
||||
uint8_t c;
|
||||
}; // struct backslash_and_quote
|
||||
|
||||
// Copies one byte from src to dst and returns a helper describing that byte.
really_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) {
  const uint8_t current = *src;
  // store to dest unconditionally - we can overwrite the bits we don't like later
  *dst = current;
  return { current };
}
|
||||
|
||||
#include "generic/stringparsing.h"
|
||||
|
||||
} // namespace simdjson::fallback
|
||||
UNTARGET_REGION
|
||||
|
||||
#endif // SIMDJSON_FALLBACK_STRINGPARSING_H
|
|
@ -71,23 +71,19 @@ really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr,
|
|||
return offset > 0;
|
||||
}
|
||||
|
||||
WARN_UNUSED really_inline uint8_t *parse_string(const uint8_t *src,
|
||||
uint8_t *dst) {
|
||||
WARN_UNUSED really_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst) {
|
||||
src++;
|
||||
while (1) {
|
||||
parse_string_helper helper = find_bs_bits_and_quote_bits(src, dst);
|
||||
if (((helper.bs_bits - 1) & helper.quote_bits) != 0) {
|
||||
/* we encountered quotes first. Move dst to point to quotes and exit
|
||||
*/
|
||||
|
||||
/* find out where the quote is... */
|
||||
auto quote_dist = trailing_zeroes(helper.quote_bits);
|
||||
|
||||
return dst + quote_dist;
|
||||
// Copy the next n bytes, and find the backslash and quote in them.
|
||||
auto bs_quote = backslash_and_quote::copy_and_find(src, dst);
|
||||
// If the next thing is the end quote, copy and return
|
||||
if (bs_quote.has_quote_first()) {
|
||||
// we encountered quotes first. Move dst to point to quotes and exit
|
||||
return dst + bs_quote.quote_index();
|
||||
}
|
||||
if (((helper.quote_bits - 1) & helper.bs_bits) != 0) {
|
||||
if (bs_quote.has_backslash()) {
|
||||
/* find out where the backslash is */
|
||||
auto bs_dist = trailing_zeroes(helper.bs_bits);
|
||||
auto bs_dist = bs_quote.backslash_index();
|
||||
uint8_t escape_char = src[bs_dist + 1];
|
||||
/* we encountered backslash first. Handle backslash */
|
||||
if (escape_char == 'u') {
|
||||
|
@ -114,8 +110,8 @@ WARN_UNUSED really_inline uint8_t *parse_string(const uint8_t *src,
|
|||
} else {
|
||||
/* they are the same. Since they can't co-occur, it means we
|
||||
* encountered neither. */
|
||||
src += parse_string_helper::BYTES_PROCESSED;
|
||||
dst += parse_string_helper::BYTES_PROCESSED;
|
||||
src += backslash_and_quote::BYTES_PROCESSED;
|
||||
dst += backslash_and_quote::BYTES_PROCESSED;
|
||||
}
|
||||
}
|
||||
/* can't be reached */
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
// are straight up concatenated into the final value. The first byte of a multibyte character is a
|
||||
// "leading byte" and starts with N 1's, where N is the total number of bytes (110_____ = 2 byte
|
||||
// lead). The remaining bytes of a multibyte character all start with 10. 1-byte characters just
|
||||
// start with 0, because that's what ASCII looks like. Here's what each size
|
||||
// start with 0, because that's what ASCII looks like. Here's what each size looks like:
|
||||
//
|
||||
// - ASCII (7 bits): 0_______
|
||||
// - 2 byte character (11 bits): 110_____ 10______
|
||||
|
|
|
@ -16,16 +16,24 @@ namespace simdjson::haswell {
|
|||
using namespace simd;
|
||||
|
||||
// Holds backslashes and quotes locations.
|
||||
struct parse_string_helper {
|
||||
struct backslash_and_quote {
|
||||
public:
|
||||
static constexpr uint32_t BYTES_PROCESSED = 32;
|
||||
really_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst);
|
||||
|
||||
really_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; }
|
||||
really_inline bool has_backslash() { return ((quote_bits - 1) & bs_bits) != 0; }
|
||||
really_inline int quote_index() { return trailing_zeroes(quote_bits); }
|
||||
really_inline int backslash_index() { return trailing_zeroes(bs_bits); }
|
||||
|
||||
uint32_t bs_bits;
|
||||
uint32_t quote_bits;
|
||||
static const uint32_t BYTES_PROCESSED = 32;
|
||||
};
|
||||
}; // struct backslash_and_quote
|
||||
|
||||
really_inline parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst) {
|
||||
really_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) {
|
||||
// this can read up to 31 bytes beyond the buffer size, but we require
|
||||
// SIMDJSON_PADDING of padding
|
||||
static_assert(SIMDJSON_PADDING >= (parse_string_helper::BYTES_PROCESSED - 1));
|
||||
static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1));
|
||||
simd8<uint8_t> v(src);
|
||||
// store to dest unconditionally - we can overwrite the bits we don't like later
|
||||
v.store(dst);
|
||||
|
|
|
@ -8,11 +8,13 @@
|
|||
|
||||
#include "haswell/implementation.h"
|
||||
#include "westmere/implementation.h"
|
||||
#include "fallback/implementation.h"
|
||||
|
||||
namespace simdjson::internal {
|
||||
const fallback::implementation fallback_singleton{};
|
||||
const haswell::implementation haswell_singleton{};
|
||||
const westmere::implementation westmere_singleton{};
|
||||
constexpr const std::initializer_list<const implementation *> available_implementation_pointers { &haswell_singleton, &westmere_singleton };
|
||||
constexpr const std::initializer_list<const implementation *> available_implementation_pointers { &haswell_singleton, &westmere_singleton, &fallback_singleton };
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -20,10 +22,12 @@ constexpr const std::initializer_list<const implementation *> available_implemen
|
|||
#ifdef IS_ARM64
|
||||
|
||||
#include "arm64/implementation.h"
|
||||
#include "fallback/implementation.h"
|
||||
|
||||
namespace simdjson::internal {
|
||||
const fallback::implementation fallback_singleton{};
|
||||
const arm64::implementation arm64_singleton{};
|
||||
constexpr const std::initializer_list<const implementation *> available_implementation_pointers { &arm64_singleton };
|
||||
constexpr const std::initializer_list<const implementation *> available_implementation_pointers { &arm64_singleton, &fallback_singleton };
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
#include "arm64/stage1_find_marks.h"
|
||||
#include "fallback/stage1_find_marks.h"
|
||||
#include "haswell/stage1_find_marks.h"
|
||||
#include "westmere/stage1_find_marks.h"
|
||||
|
|
|
@ -13,5 +13,6 @@ void found_bad_string(const uint8_t *buf);
|
|||
#endif
|
||||
|
||||
#include "arm64/stage2_build_tape.h"
|
||||
#include "fallback/stage2_build_tape.h"
|
||||
#include "haswell/stage2_build_tape.h"
|
||||
#include "westmere/stage2_build_tape.h"
|
||||
|
|
|
@ -16,16 +16,24 @@ namespace simdjson::westmere {
|
|||
using namespace simd;
|
||||
|
||||
// Holds backslashes and quotes locations.
|
||||
struct parse_string_helper {
|
||||
struct backslash_and_quote {
|
||||
public:
|
||||
static constexpr uint32_t BYTES_PROCESSED = 32;
|
||||
really_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst);
|
||||
|
||||
really_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; }
|
||||
really_inline bool has_backslash() { return bs_bits != 0; }
|
||||
really_inline int quote_index() { return trailing_zeroes(quote_bits); }
|
||||
really_inline int backslash_index() { return trailing_zeroes(bs_bits); }
|
||||
|
||||
uint32_t bs_bits;
|
||||
uint32_t quote_bits;
|
||||
static const uint32_t BYTES_PROCESSED = 32;
|
||||
};
|
||||
}; // struct backslash_and_quote
|
||||
|
||||
really_inline parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst) {
|
||||
really_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) {
|
||||
// this can read up to 31 bytes beyond the buffer size, but we require
|
||||
// SIMDJSON_PADDING of padding
|
||||
static_assert(SIMDJSON_PADDING >= (parse_string_helper::BYTES_PROCESSED - 1));
|
||||
static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1));
|
||||
simd8<uint8_t> v0(src);
|
||||
simd8<uint8_t> v1(src + 16);
|
||||
v0.store(dst);
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
#include <string_view>
|
||||
#include <sstream>
|
||||
#include <utility>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "simdjson.h"
|
||||
|
||||
|
@ -32,6 +33,7 @@ inline uint64_t f64_ulp_dist(double a, double b) {
|
|||
|
||||
|
||||
bool number_test_small_integers() {
|
||||
std::cout << __func__ << std::endl;
|
||||
char buf[1024];
|
||||
simdjson::document::parser parser;
|
||||
for (int m = 10; m < 20; m++) {
|
||||
|
@ -66,6 +68,7 @@ bool number_test_small_integers() {
|
|||
|
||||
|
||||
bool number_test_powers_of_two() {
|
||||
std::cout << __func__ << std::endl;
|
||||
char buf[1024];
|
||||
simdjson::document::parser parser;
|
||||
int maxulp = 0;
|
||||
|
@ -202,6 +205,7 @@ static const double testing_power_of_ten[] = {
|
|||
|
||||
|
||||
bool number_test_powers_of_ten() {
|
||||
std::cout << __func__ << std::endl;
|
||||
char buf[1024];
|
||||
simdjson::document::parser parser;
|
||||
for (int i = -1000000; i <= 308; ++i) {// large negative values should be zero.
|
||||
|
@ -267,6 +271,7 @@ bool number_test_powers_of_ten() {
|
|||
|
||||
// adversarial example that once triggered overruns, see https://github.com/lemire/simdjson/issues/345
|
||||
bool bad_example() {
|
||||
std::cout << __func__ << std::endl;
|
||||
std::string badjson = "[7,7,7,7,6,7,7,7,6,7,7,6,[7,7,7,7,6,7,7,7,6,7,7,6,7,7,7,7,7,7,6";
|
||||
simdjson::document::parser parser = simdjson::build_parsed_json(badjson);
|
||||
if(parser.is_valid()) {
|
||||
|
@ -277,6 +282,7 @@ bool bad_example() {
|
|||
}
|
||||
// returns true if successful
|
||||
bool stable_test() {
|
||||
std::cout << __func__ << std::endl;
|
||||
std::string json = "{"
|
||||
"\"Image\":{"
|
||||
"\"Width\":800,"
|
||||
|
@ -1438,10 +1444,10 @@ bool error_messages_in_correct_order() {
|
|||
|
||||
bool lots_of_brackets() {
|
||||
std::string input;
|
||||
for(size_t i = 0; i < 1000; i++) {
|
||||
for(size_t i = 0; i < 16; i++) {
|
||||
input += "[";
|
||||
}
|
||||
for(size_t i = 0; i < 1000; i++) {
|
||||
for(size_t i = 0; i < 16; i++) {
|
||||
input += "]";
|
||||
}
|
||||
auto [doc, error] = simdjson::document::parse(input);
|
||||
|
@ -1451,7 +1457,26 @@ bool lots_of_brackets() {
|
|||
return true;
|
||||
}
|
||||
|
||||
int main() {
|
||||
int main(int argc, char *argv[]) {
|
||||
std::cout << std::unitbuf;
|
||||
char c;
|
||||
while ((c = getopt(argc, argv, "a:")) != -1) {
|
||||
switch (c) {
|
||||
case 'a': {
|
||||
const simdjson::implementation *impl = simdjson::available_implementations[optarg];
|
||||
if (!impl) {
|
||||
fprintf(stderr, "Unsupported architecture value -a %s\n", optarg);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
simdjson::active_implementation = impl;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
fprintf(stderr, "Unexpected argument %c\n", c);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
|
||||
// this is put here deliberately to check that the documentation is correct (README),
|
||||
// should this fail to compile, you should update the documentation:
|
||||
if (simdjson::active_implementation->name() == "unsupported") {
|
||||
|
|
Loading…
Reference in New Issue