From af203aaf86e32bb08d4f295e13e1a532de5ebf86 Mon Sep 17 00:00:00 2001 From: John Keiser Date: Sun, 15 Mar 2020 12:50:09 -0700 Subject: [PATCH] Add fallback parser for pre-SSE4.2 machines --- Makefile | 3 +- benchmark/parse.cpp | 2 +- include/simdjson/portability.h | 1 + src/CMakeLists.txt | 3 + src/arm64/stringparsing.h | 18 ++- src/fallback/bitmanipulation.h | 65 +++++++++++ src/fallback/implementation.h | 26 +++++ src/fallback/numberparsing.h | 34 ++++++ src/fallback/stage1_find_marks.h | 160 +++++++++++++++++++++++++++ src/fallback/stage2_build_tape.h | 20 ++++ src/fallback/stringparsing.h | 35 ++++++ src/generic/stringparsing.h | 26 ++--- src/generic/utf8_lookup2_algorithm.h | 2 +- src/haswell/stringparsing.h | 18 ++- src/implementation.cpp | 8 +- src/stage1_find_marks.cpp | 1 + src/stage2_build_tape.cpp | 1 + src/westmere/stringparsing.h | 18 ++- tests/basictests.cpp | 31 +++++- 19 files changed, 434 insertions(+), 38 deletions(-) create mode 100644 src/fallback/bitmanipulation.h create mode 100644 src/fallback/implementation.h create mode 100644 src/fallback/numberparsing.h create mode 100644 src/fallback/stage1_find_marks.h create mode 100644 src/fallback/stage2_build_tape.h create mode 100644 src/fallback/stringparsing.h diff --git a/Makefile b/Makefile index 442b7c26..020f1ce2 100644 --- a/Makefile +++ b/Makefile @@ -61,9 +61,10 @@ endif # ifeq ($(MEMSANITIZE),1) SRCHEADERS_GENERIC=src/generic/atomparsing.h src/generic/numberparsing.h src/generic/stage1_find_marks.h src/generic/stage2_build_tape.h src/generic/stringparsing.h src/generic/stage2_streaming_build_tape.h src/generic/utf8_fastvalidate_algorithm.h src/generic/utf8_lookup_algorithm.h src/generic/utf8_lookup2_algorithm.h src/generic/utf8_range_algorithm.h src/generic/utf8_zwegner_algorithm.h SRCHEADERS_ARM64= src/arm64/bitmanipulation.h src/arm64/bitmask.h src/arm64/intrinsics.h src/arm64/numberparsing.h src/arm64/simd.h src/arm64/stage1_find_marks.h src/arm64/stage2_build_tape.h 
src/arm64/stringparsing.h SRCHEADERS_HASWELL= src/haswell/bitmanipulation.h src/haswell/bitmask.h src/haswell/intrinsics.h src/haswell/numberparsing.h src/haswell/simd.h src/haswell/stage1_find_marks.h src/haswell/stage2_build_tape.h src/haswell/stringparsing.h +SRCHEADERS_FALLBACK= src/fallback/bitmanipulation.h src/fallback/implementation.h src/fallback/numberparsing.h src/fallback/stage1_find_marks.h src/fallback/stage2_build_tape.h src/fallback/stringparsing.h SRCHEADERS_WESTMERE=src/westmere/bitmanipulation.h src/westmere/bitmask.h src/westmere/intrinsics.h src/westmere/numberparsing.h src/westmere/simd.h src/westmere/stage1_find_marks.h src/westmere/stage2_build_tape.h src/westmere/stringparsing.h SRCHEADERS_SRC=src/isadetection.h src/jsoncharutils.h src/simdprune_tables.h src/implementation.cpp src/stage1_find_marks.cpp src/stage2_build_tape.cpp src/document_parser_callbacks.h -SRCHEADERS=$(SRCHEADERS_SRC) $(SRCHEADERS_GENERIC) $(SRCHEADERS_ARM64) $(SRCHEADERS_HASWELL) $(SRCHEADERS_WESTMERE) +SRCHEADERS=$(SRCHEADERS_SRC) $(SRCHEADERS_GENERIC) $(SRCHEADERS_ARM64) $(SRCHEADERS_HASWELL) $(SRCHEADERS_WESTMERE) $(SRCHEADERS_FALLBACK) INCLUDEHEADERS=include/simdjson.h include/simdjson/common_defs.h include/simdjson/internal/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonminifier.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/inline/padded_string.h include/simdjson/document.h include/simdjson/inline/document.h include/simdjson/document_iterator.h include/simdjson/inline/document_iterator.h include/simdjson/document_stream.h include/simdjson/inline/document_stream.h include/simdjson/implementation.h include/simdjson/parsedjson.h include/simdjson/jsonstream.h include/simdjson/inline/jsonstream.h include/simdjson/portability.h include/simdjson/error.h include/simdjson/inline/error.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h diff --git a/benchmark/parse.cpp b/benchmark/parse.cpp index 8982e770..b50a95d5 100644 --- a/benchmark/parse.cpp +++ b/benchmark/parse.cpp @@ -128,7 +128,7 
@@ struct option_struct { break; default: // reaching here means an argument was given to getopt() which did not have a case label - exit_error("Unexpected argument - missing case for option "+ + exit_usage("Unexpected argument - missing case for option "+ std::string(1,static_cast(c))+ " (programming error)"); } diff --git a/include/simdjson/portability.h b/include/simdjson/portability.h index eea4f431..c4dac8be 100644 --- a/include/simdjson/portability.h +++ b/include/simdjson/portability.h @@ -48,6 +48,7 @@ #endif // under GCC and CLANG, we use these two macros +#define TARGET_FALLBACK TARGET_REGION("") #define TARGET_HASWELL TARGET_REGION("avx2,bmi,pclmul,lzcnt") #define TARGET_WESTMERE TARGET_REGION("sse4.2,pclmul") #define TARGET_ARM64 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a587face..a0abc046 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -42,6 +42,9 @@ set(SIMDJSON_SRC_HEADERS arm64/stage1_find_marks.h arm64/stage2_build_tape.h arm64/stringparsing.h + fallback/implementation.h + fallback/stage1_find_marks.h + fallback/stage2_build_tape.h generic/atomparsing.h generic/numberparsing.h generic/stage1_find_marks.h diff --git a/src/arm64/stringparsing.h b/src/arm64/stringparsing.h index 6ed01e42..3cd78dee 100644 --- a/src/arm64/stringparsing.h +++ b/src/arm64/stringparsing.h @@ -15,16 +15,24 @@ namespace simdjson::arm64 { using namespace simd; // Holds backslashes and quotes locations. 
-struct parse_string_helper { +struct backslash_and_quote { +public: + static constexpr uint32_t BYTES_PROCESSED = 32; + really_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst); + + really_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; } + really_inline bool has_backslash() { return bs_bits != 0; } + really_inline int quote_index() { return trailing_zeroes(quote_bits); } + really_inline int backslash_index() { return trailing_zeroes(bs_bits); } + uint32_t bs_bits; uint32_t quote_bits; - static const uint32_t BYTES_PROCESSED = 32; -}; +}; // struct backslash_and_quote -really_inline parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst) { +really_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) { // this can read up to 31 bytes beyond the buffer size, but we require // SIMDJSON_PADDING of padding - static_assert(SIMDJSON_PADDING >= (parse_string_helper::BYTES_PROCESSED - 1)); + static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1)); simd8 v0(src); simd8 v1(src + sizeof(v0)); v0.store(dst); diff --git a/src/fallback/bitmanipulation.h b/src/fallback/bitmanipulation.h new file mode 100644 index 00000000..85aeead4 --- /dev/null +++ b/src/fallback/bitmanipulation.h @@ -0,0 +1,65 @@ +#ifndef SIMDJSON_FALLBACK_BITMANIPULATION_H +#define SIMDJSON_FALLBACK_BITMANIPULATION_H + +#include "simdjson.h" +#include + +TARGET_FALLBACK +namespace simdjson::fallback { + +#ifndef _MSC_VER +// We sometimes call trailing_zero on inputs that are zero, +// but the algorithms do not end up using the returned value. +// Sadly, sanitizers are not smart enough to figure it out. 
+__attribute__((no_sanitize("undefined"))) // this is deliberate +#endif // _MSC_VER +/* result might be undefined when input_num is zero */ +really_inline int trailing_zeroes(uint64_t input_num) { + +#ifdef _MSC_VER + unsigned long ret; + // Search the mask data from least significant bit (LSB) + // to the most significant bit (MSB) for a set bit (1). + _BitScanForward64(&ret, input_num); + return (int)ret; +#else + return __builtin_ctzll(input_num); +#endif // _MSC_VER + +} // trailing_zeroes + +/* clears the lowest set bit; returns zero when input_num is zero */ +really_inline uint64_t clear_lowest_bit(uint64_t input_num) { + return input_num & (input_num-1); +} + +/* result might be undefined when input_num is zero (the MSVC path returns 64) */ +really_inline int leading_zeroes(uint64_t input_num) { +#ifdef _MSC_VER + unsigned long leading_zero = 0; + // Search the mask data from most significant bit (MSB) + // to least significant bit (LSB) for a set bit (1). + if (_BitScanReverse64(&leading_zero, input_num)) + return (int)(63 - leading_zero); + else + return 64; +#else + return __builtin_clzll(input_num); +#endif// _MSC_VER +} + +really_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) { + *result = value1 + value2; + return *result < value1; +} + +really_inline bool mul_overflow(uint64_t value1, uint64_t value2, uint64_t *result) { + *result = value1 * value2; + // TODO there must be a faster way + return value2 > 0 && value1 > std::numeric_limits<uint64_t>::max() / value2; +} + +} // namespace simdjson::fallback +UNTARGET_REGION + +#endif // SIMDJSON_FALLBACK_BITMANIPULATION_H diff --git a/src/fallback/implementation.h b/src/fallback/implementation.h new file mode 100644 index 00000000..74de3fa3 --- /dev/null +++ b/src/fallback/implementation.h @@ -0,0 +1,26 @@ +#ifndef SIMDJSON_FALLBACK_IMPLEMENTATION_H +#define SIMDJSON_FALLBACK_IMPLEMENTATION_H + +#include "simdjson.h" +#include "isadetection.h" + +TARGET_FALLBACK +namespace simdjson::fallback { + +class 
implementation final : public simdjson::implementation { +public: + really_inline implementation() : simdjson::implementation( + "fallback", + "Generic fallback implementation", + 0 + ) {} + WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final; + WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept final; + WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final; + WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser, size_t &next_json) const noexcept final; +}; + +} // namespace simdjson::fallback +UNTARGET_REGION + +#endif // SIMDJSON_FALLBACK_IMPLEMENTATION_H \ No newline at end of file diff --git a/src/fallback/numberparsing.h b/src/fallback/numberparsing.h new file mode 100644 index 00000000..3a5be25f --- /dev/null +++ b/src/fallback/numberparsing.h @@ -0,0 +1,34 @@ +#ifndef SIMDJSON_FALLBACK_NUMBERPARSING_H +#define SIMDJSON_FALLBACK_NUMBERPARSING_H + +#include "simdjson.h" +#include "jsoncharutils.h" +#include "fallback/bitmanipulation.h" +#include +#include + +#ifdef JSON_TEST_NUMBERS // for unit testing +void found_invalid_number(const uint8_t *buf); +void found_integer(int64_t result, const uint8_t *buf); +void found_unsigned_integer(uint64_t result, const uint8_t *buf); +void found_float(double result, const uint8_t *buf); +#endif + +TARGET_FALLBACK +namespace simdjson::fallback { +static inline uint32_t parse_eight_digits_unrolled(const char *chars) { + uint32_t result = 0; + for (int i=0;i<8;i++) { + result = result*10 + (chars[i] - '0'); + } + return result; +} + +#define SWAR_NUMBER_PARSING + +#include "generic/numberparsing.h" + +} // namespace simdjson::fallback +UNTARGET_REGION + +#endif // SIMDJSON_FALLBACK_NUMBERPARSING_H diff --git a/src/fallback/stage1_find_marks.h b/src/fallback/stage1_find_marks.h new file mode 100644 index 
00000000..5674c5b0 --- /dev/null +++ b/src/fallback/stage1_find_marks.h @@ -0,0 +1,160 @@ +#ifndef SIMDJSON_FALLBACK_STAGE1_FIND_MARKS_H +#define SIMDJSON_FALLBACK_STAGE1_FIND_MARKS_H + +#include "simdjson.h" +#include "fallback/implementation.h" + +TARGET_FALLBACK +namespace simdjson::fallback::stage1 { + +class structural_scanner { +public: + +really_inline structural_scanner(const uint8_t *_buf, uint32_t _len, document::parser &_doc_parser, bool _streaming) + : buf{_buf}, next_structural_index{_doc_parser.structural_indexes.get()}, doc_parser{_doc_parser}, idx{0}, len{_len}, error{SUCCESS}, streaming{_streaming} {} + +really_inline void add_structural() { + *next_structural_index = idx; + next_structural_index++; +} + +really_inline bool is_continuation(uint8_t c) { + return (c & 0b11000000) == 0b10000000; +} + +really_inline void validate_utf8_character() { + // Continuation + if (unlikely((buf[idx] & 0b01000000) == 0)) { + // extra continuation + error = UTF8_ERROR; + idx++; + return; + } + + // 2-byte + if ((buf[idx] & 0b00100000) == 0) { + // missing continuation + if (unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) { error = UTF8_ERROR; idx++; return; } + // overlong: 1100000_ 10______ + if (buf[idx] <= 0b11000001) { error = UTF8_ERROR; } + idx += 2; + return; + } + + // 3-byte + if ((buf[idx] & 0b00010000) == 0) { + // missing continuation + if (unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) { error = UTF8_ERROR; idx++; return; } + // overlong: 11100000 100_____ ________ + if (buf[idx] == 0b11100000 && buf[idx+1] <= 0b10011111) { error = UTF8_ERROR; } + // surrogates: U+D800-U+DFFF 11101101 101_____ + if (buf[idx] == 0b11101101 && buf[idx+1] >= 0b10100000) { error = UTF8_ERROR; } + idx += 3; + return; + } + + // 4-byte + // missing continuation + if (unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) { error = UTF8_ERROR; idx++; return; } + // 
overlong: 11110000 1000____ ________ ________ + if (buf[idx] == 0b11110000 && buf[idx+1] <= 0b10001111) { error = UTF8_ERROR; } + // too large: > U+10FFFF: + // 11110100 (1001|101_)____ + // 1111(1___|011_|0101) 10______ + // also includes 5, 6, 7 and 8 byte characters: + // 11111___ + if (buf[idx] == 0b11110100 && buf[idx+1] >= 0b10010000) { error = UTF8_ERROR; } + if (buf[idx] >= 0b11110101) { error = UTF8_ERROR; } + idx += 4; +} + +really_inline void validate_string() { + idx++; // skip first quote + while (idx < len && buf[idx] != '"') { + if (buf[idx] == '\\') { + idx += 2; + } else if (unlikely(buf[idx] & 0b10000000)) { + validate_utf8_character(); + } else { + if (buf[idx] < 0x20) { error = UNESCAPED_CHARS; } + idx++; + } + } + if (idx >= len && !streaming) { error = UNCLOSED_STRING; } +} + +really_inline bool is_whitespace_or_operator(uint8_t c) { + switch (c) { + case '{': case '}': case '[': case ']': case ',': case ':': + case ' ': case '\r': case '\n': case '\t': + return true; + default: + return false; + } +} + +// +// Parse the entire input in STEP_SIZE-byte chunks. 
+// +really_inline error_code scan() { + for (;idx parser.capacity())) { + return CAPACITY; + } + stage1::structural_scanner scanner(buf, len, parser, streaming); + return scanner.scan(); +} + +} // namespace simdjson::fallback +UNTARGET_REGION + +#endif // SIMDJSON_FALLBACK_STAGE1_FIND_MARKS_H diff --git a/src/fallback/stage2_build_tape.h b/src/fallback/stage2_build_tape.h new file mode 100644 index 00000000..48886d16 --- /dev/null +++ b/src/fallback/stage2_build_tape.h @@ -0,0 +1,20 @@ +#ifndef SIMDJSON_FALLBACK_STAGE2_BUILD_TAPE_H +#define SIMDJSON_FALLBACK_STAGE2_BUILD_TAPE_H + +#include "simdjson.h" + +#include "fallback/implementation.h" +#include "fallback/stringparsing.h" +#include "fallback/numberparsing.h" + +TARGET_FALLBACK +namespace simdjson::fallback { + +#include "generic/atomparsing.h" +#include "generic/stage2_build_tape.h" +#include "generic/stage2_streaming_build_tape.h" + +} // namespace simdjson::fallback +UNTARGET_REGION + +#endif // SIMDJSON_FALLBACK_STAGE2_BUILD_TAPE_H diff --git a/src/fallback/stringparsing.h b/src/fallback/stringparsing.h new file mode 100644 index 00000000..1b14ffcd --- /dev/null +++ b/src/fallback/stringparsing.h @@ -0,0 +1,35 @@ +#ifndef SIMDJSON_FALLBACK_STRINGPARSING_H +#define SIMDJSON_FALLBACK_STRINGPARSING_H + +#include "simdjson.h" +#include "jsoncharutils.h" + +TARGET_FALLBACK +namespace simdjson::fallback { + +// Holds the one byte just copied and reports whether it is a backslash or a quote. +struct backslash_and_quote { +public: + static constexpr uint32_t BYTES_PROCESSED = 1; + really_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst); + + really_inline bool has_quote_first() { return c == '"'; } + really_inline bool has_backslash() { return c == '\\'; } + really_inline int quote_index() { return c == '"' ? 0 : 1; } + really_inline int backslash_index() { return c == '\\' ? 
0 : 1; } + + uint8_t c; +}; // struct backslash_and_quote + +really_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) { + // store to dest unconditionally - we can overwrite the bits we don't like later + dst[0] = src[0]; + return { src[0] }; +} + +#include "generic/stringparsing.h" + +} // namespace simdjson::fallback +UNTARGET_REGION + +#endif // SIMDJSON_FALLBACK_STRINGPARSING_H diff --git a/src/generic/stringparsing.h b/src/generic/stringparsing.h index 0b03af00..1d7d7493 100644 --- a/src/generic/stringparsing.h +++ b/src/generic/stringparsing.h @@ -71,23 +71,19 @@ really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, return offset > 0; } -WARN_UNUSED really_inline uint8_t *parse_string(const uint8_t *src, - uint8_t *dst) { +WARN_UNUSED really_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst) { src++; while (1) { - parse_string_helper helper = find_bs_bits_and_quote_bits(src, dst); - if (((helper.bs_bits - 1) & helper.quote_bits) != 0) { - /* we encountered quotes first. Move dst to point to quotes and exit - */ - - /* find out where the quote is... */ - auto quote_dist = trailing_zeroes(helper.quote_bits); - - return dst + quote_dist; + // Copy the next n bytes, and find the backslash and quote in them. + auto bs_quote = backslash_and_quote::copy_and_find(src, dst); + // If the next thing is the end quote, copy and return + if (bs_quote.has_quote_first()) { + // we encountered quotes first. Move dst to point to quotes and exit + return dst + bs_quote.quote_index(); } - if (((helper.quote_bits - 1) & helper.bs_bits) != 0) { + if (bs_quote.has_backslash()) { /* find out where the backspace is */ - auto bs_dist = trailing_zeroes(helper.bs_bits); + auto bs_dist = bs_quote.backslash_index(); uint8_t escape_char = src[bs_dist + 1]; /* we encountered backslash first. 
Handle backslash */ if (escape_char == 'u') { @@ -114,8 +110,8 @@ WARN_UNUSED really_inline uint8_t *parse_string(const uint8_t *src, } else { /* they are the same. Since they can't co-occur, it means we * encountered neither. */ - src += parse_string_helper::BYTES_PROCESSED; - dst += parse_string_helper::BYTES_PROCESSED; + src += backslash_and_quote::BYTES_PROCESSED; + dst += backslash_and_quote::BYTES_PROCESSED; } } /* can't be reached */ diff --git a/src/generic/utf8_lookup2_algorithm.h b/src/generic/utf8_lookup2_algorithm.h index 7647adcc..ec64e384 100644 --- a/src/generic/utf8_lookup2_algorithm.h +++ b/src/generic/utf8_lookup2_algorithm.h @@ -6,7 +6,7 @@ // are straight up concatenated into the final value. The first byte of a multibyte character is a // "leading byte" and starts with N 1's, where N is the total number of bytes (110_____ = 2 byte // lead). The remaining bytes of a multibyte character all start with 10. 1-byte characters just -// start with 0, because that's what ASCII looks like. Here's what each size +// start with 0, because that's what ASCII looks like. Here's what each size looks like: // // - ASCII (7 bits): 0_______ // - 2 byte character (11 bits): 110_____ 10______ diff --git a/src/haswell/stringparsing.h b/src/haswell/stringparsing.h index c702115a..78cfae62 100644 --- a/src/haswell/stringparsing.h +++ b/src/haswell/stringparsing.h @@ -16,16 +16,24 @@ namespace simdjson::haswell { using namespace simd; // Holds backslashes and quotes locations. 
-struct parse_string_helper { +struct backslash_and_quote { +public: + static constexpr uint32_t BYTES_PROCESSED = 32; + really_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst); + + really_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; } + really_inline bool has_backslash() { return ((quote_bits - 1) & bs_bits) != 0; } + really_inline int quote_index() { return trailing_zeroes(quote_bits); } + really_inline int backslash_index() { return trailing_zeroes(bs_bits); } + uint32_t bs_bits; uint32_t quote_bits; - static const uint32_t BYTES_PROCESSED = 32; -}; +}; // struct backslash_and_quote -really_inline parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst) { +really_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) { // this can read up to 15 bytes beyond the buffer size, but we require // SIMDJSON_PADDING of padding - static_assert(SIMDJSON_PADDING >= (parse_string_helper::BYTES_PROCESSED - 1)); + static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1)); simd8 v(src); // store to dest unconditionally - we can overwrite the bits we don't like later v.store(dst); diff --git a/src/implementation.cpp b/src/implementation.cpp index 97936bfc..b1c446bc 100644 --- a/src/implementation.cpp +++ b/src/implementation.cpp @@ -8,11 +8,13 @@ #include "haswell/implementation.h" #include "westmere/implementation.h" +#include "fallback/implementation.h" namespace simdjson::internal { +const fallback::implementation fallback_singleton{}; const haswell::implementation haswell_singleton{}; const westmere::implementation westmere_singleton{}; -constexpr const std::initializer_list available_implementation_pointers { &haswell_singleton, &westmere_singleton }; +constexpr const std::initializer_list available_implementation_pointers { &haswell_singleton, &westmere_singleton, &fallback_singleton }; } #endif @@ -20,10 +22,12 @@ constexpr const 
std::initializer_list available_implemen #ifdef IS_ARM64 #include "arm64/implementation.h" +#include "fallback/implementation.h" namespace simdjson::internal { +const fallback::implementation fallback_singleton{}; const arm64::implementation arm64_singleton{}; -constexpr const std::initializer_list available_implementation_pointers { &arm64_singleton }; +constexpr const std::initializer_list available_implementation_pointers { &arm64_singleton, &fallback_singleton }; } #endif diff --git a/src/stage1_find_marks.cpp b/src/stage1_find_marks.cpp index ed2537b0..5ea28617 100644 --- a/src/stage1_find_marks.cpp +++ b/src/stage1_find_marks.cpp @@ -1,3 +1,4 @@ #include "arm64/stage1_find_marks.h" +#include "fallback/stage1_find_marks.h" #include "haswell/stage1_find_marks.h" #include "westmere/stage1_find_marks.h" diff --git a/src/stage2_build_tape.cpp b/src/stage2_build_tape.cpp index 26a97b78..c0d1919d 100644 --- a/src/stage2_build_tape.cpp +++ b/src/stage2_build_tape.cpp @@ -13,5 +13,6 @@ void found_bad_string(const uint8_t *buf); #endif #include "arm64/stage2_build_tape.h" +#include "fallback/stage2_build_tape.h" #include "haswell/stage2_build_tape.h" #include "westmere/stage2_build_tape.h" diff --git a/src/westmere/stringparsing.h b/src/westmere/stringparsing.h index fdfbae4e..44d382f3 100644 --- a/src/westmere/stringparsing.h +++ b/src/westmere/stringparsing.h @@ -16,16 +16,24 @@ namespace simdjson::westmere { using namespace simd; // Holds backslashes and quotes locations. 
-struct parse_string_helper { +struct backslash_and_quote { +public: + static constexpr uint32_t BYTES_PROCESSED = 32; + really_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst); + + really_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; } + really_inline bool has_backslash() { return bs_bits != 0; } + really_inline int quote_index() { return trailing_zeroes(quote_bits); } + really_inline int backslash_index() { return trailing_zeroes(bs_bits); } + uint32_t bs_bits; uint32_t quote_bits; - static const uint32_t BYTES_PROCESSED = 32; -}; +}; // struct backslash_and_quote -really_inline parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst) { +really_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) { // this can read up to 31 bytes beyond the buffer size, but we require // SIMDJSON_PADDING of padding - static_assert(SIMDJSON_PADDING >= (parse_string_helper::BYTES_PROCESSED - 1)); + static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1)); simd8 v0(src); simd8 v1(src + 16); v0.store(dst); diff --git a/tests/basictests.cpp b/tests/basictests.cpp index e82579f8..6d5599f6 100644 --- a/tests/basictests.cpp +++ b/tests/basictests.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include "simdjson.h" @@ -32,6 +33,7 @@ inline uint64_t f64_ulp_dist(double a, double b) { bool number_test_small_integers() { + std::cout << __func__ << std::endl; char buf[1024]; simdjson::document::parser parser; for (int m = 10; m < 20; m++) { @@ -66,6 +68,7 @@ bool number_test_small_integers() { bool number_test_powers_of_two() { + std::cout << __func__ << std::endl; char buf[1024]; simdjson::document::parser parser; int maxulp = 0; @@ -202,6 +205,7 @@ static const double testing_power_of_ten[] = { bool number_test_powers_of_ten() { + std::cout << __func__ << std::endl; char buf[1024]; simdjson::document::parser parser; for (int i = -1000000; i <= 308; 
++i) {// large negative values should be zero. @@ -267,6 +271,7 @@ bool number_test_powers_of_ten() { // adversarial example that once triggred overruns, see https://github.com/lemire/simdjson/issues/345 bool bad_example() { + std::cout << __func__ << std::endl; std::string badjson = "[7,7,7,7,6,7,7,7,6,7,7,6,[7,7,7,7,6,7,7,7,6,7,7,6,7,7,7,7,7,7,6"; simdjson::document::parser parser = simdjson::build_parsed_json(badjson); if(parser.is_valid()) { @@ -277,6 +282,7 @@ bool bad_example() { } // returns true if successful bool stable_test() { + std::cout << __func__ << std::endl; std::string json = "{" "\"Image\":{" "\"Width\":800," @@ -1438,10 +1444,10 @@ bool error_messages_in_correct_order() { bool lots_of_brackets() { std::string input; - for(size_t i = 0; i < 1000; i++) { + for(size_t i = 0; i < 16; i++) { input += "["; } - for(size_t i = 0; i < 1000; i++) { + for(size_t i = 0; i < 16; i++) { input += "]"; } auto [doc, error] = simdjson::document::parse(input); @@ -1451,7 +1457,26 @@ bool lots_of_brackets() { return true; } -int main() { +int main(int argc, char *argv[]) { + std::cout << std::unitbuf; + int c; /* getopt() returns int; an unsigned plain char would never equal -1 */ + while ((c = getopt(argc, argv, "a:")) != -1) { + switch (c) { + case 'a': { + const simdjson::implementation *impl = simdjson::available_implementations[optarg]; + if (!impl) { + fprintf(stderr, "Unsupported architecture value -a %s\n", optarg); + return EXIT_FAILURE; + } + simdjson::active_implementation = impl; + break; + } + default: + fprintf(stderr, "Unexpected argument %c\n", c); + return EXIT_FAILURE; + } + } + // this is put here deliberately to check that the documentation is correct (README), // should this fail to compile, you should update the documentation: if (simdjson::active_implementation->name() == "unsupported") {