From 7369339c88e57f5ae85ac022fd5370502d9edaf9 Mon Sep 17 00:00:00 2001 From: ioioioio <iodadi@gmail.com> Date: Tue, 9 Jul 2019 15:14:34 -0400 Subject: [PATCH] Neon utf8validation (#207) * utf8 validation on neon works --- Dockerfile | 10 ++ include/simdjson/numberparsing.h | 32 ++-- include/simdjson/simdutf8check_neon.h | 222 ++++++++++++++++++++++++++ include/simdjson/stage1_find_marks.h | 134 +++++++++++----- include/simdjson/stage2_build_tape.h | 2 - include/simdjson/stringparsing.h | 4 - 6 files changed, 344 insertions(+), 60 deletions(-) create mode 100644 Dockerfile create mode 100644 include/simdjson/simdutf8check_neon.h diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..96a9445a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,10 @@ +# docker build -t simdjson . +# docker run --privileged -t simdjson +FROM gcc:8.3 +COPY . /usr/src/ +WORKDIR /usr/src/ +RUN make clean +RUN make +RUN make test +RUN make parsingcompetition +CMD ["bash", "scripts/parser.sh"] diff --git a/include/simdjson/numberparsing.h b/include/simdjson/numberparsing.h index 82787ea4..280988e6 100644 --- a/include/simdjson/numberparsing.h +++ b/include/simdjson/numberparsing.h @@ -114,7 +114,8 @@ is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) { return structural_or_whitespace_or_exponent_or_decimal_negated[c]; } -#if defined (__AVX2__) || defined (__SSE4_2__) +#ifndef SIMDJSON_DISABLE_SWAR_NUMBER_PARSING +// #if defined (__AVX2__) || defined (__SSE4_2__) #define SWAR_NUMBER_PARSING #endif @@ -138,22 +139,7 @@ static inline bool is_made_of_eight_digits_fast(const char *chars) { 0x3333333333333333); } -// clang-format off -/*** -Should parse_eight_digits_unrolled be out of the question, one could -use a standard approach like the following: - -static inline uint32_t newparse_eight_digits_unrolled(const char *chars) { - uint64_t val; - memcpy(&val, chars, sizeof(uint64_t)); - val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8; - val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16; - return (val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32; -} - -credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/ -*/ -// clang-format on +#if defined (__AVX2__) || defined (__SSE4_2__) static inline uint32_t parse_eight_digits_unrolled(const char *chars) { // this actually computes *16* values so we are being wasteful. @@ -171,7 +157,19 @@ static inline uint32_t parse_eight_digits_unrolled(const char *chars) { return _mm_cvtsi128_si32( t4); // only captures the sum of the first 8 digits, drop the rest } +#else +// we don't have SSE, so let us use a scalar function +// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/ +static inline uint32_t parse_eight_digits_unrolled(const char *chars) { + uint64_t val; + memcpy(&val, chars, sizeof(uint64_t)); + val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8; + val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16; + return (val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32; +} + +#endif #endif // diff --git a/include/simdjson/simdutf8check_neon.h b/include/simdjson/simdutf8check_neon.h new file mode 100644 index 00000000..9b8015c1 --- /dev/null +++ b/include/simdjson/simdutf8check_neon.h @@ -0,0 +1,222 @@ +// From https://github.com/cyb70289/utf8/blob/master/lemire-neon.c +// Adapted from https://github.com/lemire/fastvalidate-utf-8 + +#ifndef SIMDJSON_SIMDUTF8CHECK_NEON_H +#define SIMDJSON_SIMDUTF8CHECK_NEON_H + +#ifdef __aarch64__ + +#include <stdio.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> +#include <inttypes.h> +#include <arm_neon.h> + +/* + * legal utf-8 byte sequence + * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94 + * + * Code Points 1st 2s 3s 4s + * U+0000..U+007F 00..7F + * U+0080..U+07FF C2..DF 80..BF + * U+0800..U+0FFF E0 A0..BF 80..BF + * U+1000..U+CFFF E1..EC 80..BF 80..BF + * U+D000..U+D7FF ED 80..9F 80..BF + * U+E000..U+FFFF EE..EF 80..BF 80..BF + * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF + * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF + * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF + * + */ + +#if 0 +static void print128(const char *s, const int8x16_t *v128) +{ + int8_t v8[16]; + vst1q_s8(v8, *v128); + + if (s) + printf("%s:\t", s); + for (int i = 0; i < 16; ++i) + printf("%02x ", (unsigned char)v8[i]); + printf("\n"); +} +#endif + +// all byte values must be no larger than 0xF4 +static inline void checkSmallerThan0xF4(int8x16_t current_bytes, + int8x16_t *has_error) { + // unsigned, saturates to 0 below max + *has_error = vorrq_s8(*has_error, + vreinterpretq_s8_u8(vqsubq_u8(vreinterpretq_u8_s8(current_bytes), vdupq_n_u8(0xF4)))); +} + +static const int8_t _nibbles[] = { + 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) + 0, 0, 0, 0, // 10xx (continuation) + 2, 2, // 110x + 3, // 1110 + 4, // 1111, next should be 0 (not checked here) +}; + +static inline int8x16_t continuationLengths(int8x16_t high_nibbles) { + return vqtbl1q_s8(vld1q_s8(_nibbles), vreinterpretq_u8_s8(high_nibbles)); +} + +static inline int8x16_t carryContinuations(int8x16_t initial_lengths, + int8x16_t previous_carries) { + + int8x16_t right1 = + vreinterpretq_s8_u8(vqsubq_u8(vreinterpretq_u8_s8(vextq_s8(previous_carries, initial_lengths, 16 - 1)), + vdupq_n_u8(1))); + int8x16_t sum = vaddq_s8(initial_lengths, right1); + + int8x16_t right2 = vreinterpretq_s8_u8(vqsubq_u8(vreinterpretq_u8_s8(vextq_s8(previous_carries, sum, 16 - 2)), + vdupq_n_u8(2))); + return vaddq_s8(sum, right2); +} + +static inline void checkContinuations(int8x16_t initial_lengths, int8x16_t carries, + int8x16_t *has_error) { + + // overlap || underlap + // carry > length && length > 0 || !(carry > length) && !(length > 0) + // (carries > length) == (lengths > 0) + uint8x16_t overunder = + vceqq_u8(vcgtq_s8(carries, initial_lengths), + vcgtq_s8(initial_lengths, vdupq_n_s8(0))); + + *has_error = vorrq_s8(*has_error, vreinterpretq_s8_u8(overunder)); +} + +// when 0xED is found, next byte must be no larger than 0x9F +// when 0xF4 is found, next byte must be no larger than 0x8F +// next byte must be continuation, ie sign bit is set, so signed < is ok +static inline void checkFirstContinuationMax(int8x16_t current_bytes, + int8x16_t off1_current_bytes, + int8x16_t *has_error) { + uint8x16_t maskED = vceqq_s8(off1_current_bytes, vdupq_n_s8(0xED)); + uint8x16_t maskF4 = vceqq_s8(off1_current_bytes, vdupq_n_s8(0xF4)); + + uint8x16_t badfollowED = + vandq_u8(vcgtq_s8(current_bytes, vdupq_n_s8(0x9F)), maskED); + uint8x16_t badfollowF4 = + vandq_u8(vcgtq_s8(current_bytes, vdupq_n_s8(0x8F)), maskF4); + + *has_error = vorrq_s8(*has_error, vreinterpretq_s8_u8(vorrq_u8(badfollowED, badfollowF4))); +} + +static const int8_t _initial_mins[] = { + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, // 10xx => false + (int8_t) 0xC2, -128, // 110x + (int8_t) 0xE1, // 1110 + (int8_t) 0xF1, +}; + +static const int8_t _second_mins[] = { + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, // 10xx => false + 127, 127, // 110x => true + (int8_t) 0xA0, // 1110 + (int8_t) 0x90, +}; + +// map off1_hibits => error condition +// hibits off1 cur +// C => < C2 && true +// E => < E1 && < A0 +// F => < F1 && < 90 +// else false && false +static inline void checkOverlong(int8x16_t current_bytes, + int8x16_t off1_current_bytes, int8x16_t hibits, + int8x16_t previous_hibits, int8x16_t *has_error) { + int8x16_t off1_hibits = vextq_s8(previous_hibits, hibits, 16 - 1); + int8x16_t initial_mins = vqtbl1q_s8(vld1q_s8(_initial_mins), vreinterpretq_u8_s8(off1_hibits)); + + uint8x16_t initial_under = vcgtq_s8(initial_mins, off1_current_bytes); + + int8x16_t second_mins = vqtbl1q_s8(vld1q_s8(_second_mins), vreinterpretq_u8_s8(off1_hibits)); + uint8x16_t second_under = vcgtq_s8(second_mins, current_bytes); + *has_error = + vorrq_s8(*has_error, vreinterpretq_s8_u8(vandq_u8(initial_under, second_under))); +} + +struct processed_utf_bytes { + int8x16_t rawbytes; + int8x16_t high_nibbles; + int8x16_t carried_continuations; +}; + +static inline void count_nibbles(int8x16_t bytes, + struct processed_utf_bytes *answer) { + answer->rawbytes = bytes; + answer->high_nibbles = + vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(bytes), 4)); +} + +// check whether the current bytes are valid UTF-8 +// at the end of the function, previous gets updated +static inline struct processed_utf_bytes +checkUTF8Bytes(int8x16_t current_bytes, struct processed_utf_bytes *previous, + int8x16_t *has_error) { + struct processed_utf_bytes pb; + count_nibbles(current_bytes, &pb); + + checkSmallerThan0xF4(current_bytes, has_error); + + int8x16_t initial_lengths = continuationLengths(pb.high_nibbles); + + pb.carried_continuations = + carryContinuations(initial_lengths, previous->carried_continuations); + + checkContinuations(initial_lengths, pb.carried_continuations, has_error); + + int8x16_t off1_current_bytes = + vextq_s8(previous->rawbytes, pb.rawbytes, 16 - 1); + checkFirstContinuationMax(current_bytes, off1_current_bytes, has_error); + + checkOverlong(current_bytes, off1_current_bytes, pb.high_nibbles, + previous->high_nibbles, has_error); + return pb; +} + +#if 0 +static const int8_t _verror[] = {9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1}; + +/* Return 0 on success, -1 on error */ +int utf8_lemire(const unsigned char *src, int len) { + int i = 0; + int8x16_t has_error = vdupq_n_s8(0); + struct processed_utf_bytes previous = {.rawbytes = vdupq_n_s8(0), + .high_nibbles = vdupq_n_s8(0), + .carried_continuations = + vdupq_n_s8(0)}; + if (len >= 16) { + for (; i <= len - 16; i += 16) { + int8x16_t current_bytes = vld1q_s8((int8_t*)(src + i)); + previous = checkUTF8Bytes(current_bytes, &previous, &has_error); + } + } + + // last part + if (i < len) { + char buffer[16]; + memset(buffer, 0, 16); + memcpy(buffer, src + i, len - i); + int8x16_t current_bytes = vld1q_s8((int8_t *)buffer); + previous = checkUTF8Bytes(current_bytes, &previous, &has_error); + } else { + has_error = + vorrq_s8(vreinterpretq_s8_u8(vcgtq_s8(previous.carried_continuations, + vld1q_s8(_verror))), + has_error); + } + + return vmaxvq_u8(vreinterpretq_u8_s8(has_error)) == 0 ? 0 : -1; +} +#endif + +#endif +#endif \ No newline at end of file diff --git a/include/simdjson/stage1_find_marks.h b/include/simdjson/stage1_find_marks.h index 525a39a9..b6e44c5a 100644 --- a/include/simdjson/stage1_find_marks.h +++ b/include/simdjson/stage1_find_marks.h @@ -6,30 +6,32 @@ #include "simdjson/parsedjson.h" #include "simdjson/portability.h" -#if defined (__AVX2__) || defined (__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64)) +#if defined (__AVX2__) +#elif defined (__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64)) +#elif defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64)) +#include <arm_neon.h> +#else +#warning It appears that neither ARM NEON nor AVX2 nor SSE are detected. +#endif // (__AVX2__) #ifndef SIMDJSON_SKIPUTF8VALIDATION #define SIMDJSON_UTF8VALIDATE #endif -#else -// currently we don't UTF8 validate for ARM -// also we assume that if you're not __AVX2__ -// you're ARM, which is a bit dumb. TODO: Fix... -#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64)) -#include <arm_neon.h> -#else -#warning It appears that neither ARM NEON nor AVX2 are detected. -#endif // __ARM_NEON -#endif // (__AVX2__) || (__SSE4_2__) // It seems that many parsers do UTF-8 validation. // RapidJSON does not do it by default, but a flag // allows it. #ifdef SIMDJSON_UTF8VALIDATE +#if defined (__AVX2__) #include "simdjson/simdutf8check.h" -#endif +#elif defined (__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64)) +#include "simdjson/simdutf8check.h" +#elif defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64)) +#include "simdjson/simdutf8check_neon.h" +#endif // (__AVX2__) +#endif // SIMDJSON_UTF8VALIDATE -#define TRANSPOSE +//#define TRANSPOSE namespace simdjson { template<instruction_set> @@ -221,6 +223,32 @@ struct utf8_checking_state<instruction_set::sse4_2> }; #endif +#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64)) +template<> +struct utf8_checking_state<instruction_set::neon> +{ + int8x16_t has_error {}; + processed_utf_bytes previous {}; +}; +#endif + +#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64)) +// Checks that all bytes are ascii +really_inline +bool check_ascii_neon(simd_input<instruction_set::neon> in) { + // checking if the most significant bit is always equal to 0. + uint8x16_t highbit = vdupq_n_u8(0x80); + uint8x16_t t0 = vorrq_u8(in.i0, in.i1); + uint8x16_t t1 = vorrq_u8(in.i2, in.i3); + uint8x16_t t3 = vorrq_u8(t0, t1); + uint8x16_t t4 = vandq_u8(t3, highbit); + uint64x2_t v64 = vreinterpretq_u64_u8(t4); + uint32x2_t v32 = vqmovn_u64(v64); + uint64x1_t result = vreinterpret_u64_u32(v32); + return vget_lane_u64(result, 0) == 0; +} +#endif + template<instruction_set T> void check_utf8(simd_input<T> in, utf8_checking_state<T>& state); @@ -278,6 +306,28 @@ void check_utf8<instruction_set::sse4_2>(simd_input<instruction_set::sse4_2> in, } #endif // __SSE4_2 +#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64)) +template<> really_inline +void check_utf8<instruction_set::neon>(simd_input<instruction_set::neon> in, + utf8_checking_state<instruction_set::neon>& state) { + if (check_ascii_neon(in)) { + // All bytes are ascii. Therefore the byte that was just before must be ascii too. + // We only check the byte that was just before simd_input. Nines are arbitrary values. + int8_t _verror[] = {9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1}; + state.has_error = + vorrq_s8(vreinterpretq_s8_u8(vcgtq_s8(state.previous.carried_continuations, + vld1q_s8(_verror))), + state.has_error); + } else { + // it is not ascii so we have to do heavy work + state.previous = checkUTF8Bytes(vreinterpretq_s8_u8(in.i0), &(state.previous), &(state.has_error)); + state.previous = checkUTF8Bytes(vreinterpretq_s8_u8(in.i1), &(state.previous), &(state.has_error)); + state.previous = checkUTF8Bytes(vreinterpretq_s8_u8(in.i2), &(state.previous), &(state.has_error)); + state.previous = checkUTF8Bytes(vreinterpretq_s8_u8(in.i3), &(state.previous), &(state.has_error)); + } +} +#endif // __ARM_NEON + // Checks if the utf8 validation has found any error. template<instruction_set T> errorValues check_utf8_errors(utf8_checking_state<T>& state); @@ -295,6 +345,16 @@ errorValues check_utf8_errors<instruction_set::sse4_2>(utf8_checking_state<instr return _mm_testz_si128(state.has_error, state.has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS; } #endif + +#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64)) +template<> really_inline +errorValues check_utf8_errors<instruction_set::neon>(utf8_checking_state<instruction_set::neon>& state) { + uint64x2_t v64 = vreinterpretq_u64_s8(state.has_error); + uint32x2_t v32 = vqmovn_u64(v64); + uint64x1_t result = vreinterpret_u64_u32(v32); + return vget_lane_u64(result, 0) != 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS; +} +#endif #endif // SIMDJSON_UTF8VALIDATE template<instruction_set T> @@ -375,10 +435,10 @@ uint64_t cmp_mask_against_input<instruction_set::sse4_2>(simd_input<instruction_ template<> really_inline uint64_t cmp_mask_against_input<instruction_set::neon>(simd_input<instruction_set::neon> in, uint8_t m) { const uint8x16_t mask = vmovq_n_u8(m); - uint8x16_t cmp_res_0 = vceqq_u8(in.i.val[0], mask); - uint8x16_t cmp_res_1 = vceqq_u8(in.i.val[1], mask); - uint8x16_t cmp_res_2 = vceqq_u8(in.i.val[2], mask); - uint8x16_t cmp_res_3 = vceqq_u8(in.i.val[3], mask); + uint8x16_t cmp_res_0 = vceqq_u8(in.i0, mask); + uint8x16_t cmp_res_1 = vceqq_u8(in.i1, mask); + uint8x16_t cmp_res_2 = vceqq_u8(in.i2, mask); + uint8x16_t cmp_res_3 = vceqq_u8(in.i3, mask); return neonmovemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3); } #endif @@ -419,10 +479,10 @@ uint64_t unsigned_lteq_against_input<instruction_set::sse4_2>(simd_input<instruc template<> really_inline uint64_t unsigned_lteq_against_input<instruction_set::neon>(simd_input<instruction_set::neon> in, uint8_t m) { const uint8x16_t mask = vmovq_n_u8(m); - uint8x16_t cmp_res_0 = vcleq_u8(in.i.val[0], mask); - uint8x16_t cmp_res_1 = vcleq_u8(in.i.val[1], mask); - uint8x16_t cmp_res_2 = vcleq_u8(in.i.val[2], mask); - uint8x16_t cmp_res_3 = vcleq_u8(in.i.val[3], mask); + uint8x16_t cmp_res_0 = vcleq_u8(in.i0, mask); + uint8x16_t cmp_res_1 = vcleq_u8(in.i1, mask); + uint8x16_t cmp_res_2 = vcleq_u8(in.i2, mask); + uint8x16_t cmp_res_3 = vcleq_u8(in.i3, mask); return neonmovemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3); } #endif @@ -693,26 +753,26 @@ void find_whitespace_and_structurals<instruction_set::neon>( const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18); const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf); - uint8x16_t nib_0_lo = vandq_u8(in.i.val[0], low_nib_and_mask); - uint8x16_t nib_0_hi = vshrq_n_u8(in.i.val[0], 4); + uint8x16_t nib_0_lo = vandq_u8(in.i0, low_nib_and_mask); + uint8x16_t nib_0_hi = vshrq_n_u8(in.i0, 4); uint8x16_t shuf_0_lo = vqtbl1q_u8(low_nibble_mask, nib_0_lo); uint8x16_t shuf_0_hi = vqtbl1q_u8(high_nibble_mask, nib_0_hi); uint8x16_t v_0 = vandq_u8(shuf_0_lo, shuf_0_hi); - uint8x16_t nib_1_lo = vandq_u8(in.i.val[1], low_nib_and_mask); - uint8x16_t nib_1_hi = vshrq_n_u8(in.i.val[1], 4); + uint8x16_t nib_1_lo = vandq_u8(in.i1, low_nib_and_mask); + uint8x16_t nib_1_hi = vshrq_n_u8(in.i1, 4); uint8x16_t shuf_1_lo = vqtbl1q_u8(low_nibble_mask, nib_1_lo); uint8x16_t shuf_1_hi = vqtbl1q_u8(high_nibble_mask, nib_1_hi); uint8x16_t v_1 = vandq_u8(shuf_1_lo, shuf_1_hi); - uint8x16_t nib_2_lo = vandq_u8(in.i.val[2], low_nib_and_mask); - uint8x16_t nib_2_hi = vshrq_n_u8(in.i.val[2], 4); + uint8x16_t nib_2_lo = vandq_u8(in.i2, low_nib_and_mask); + uint8x16_t nib_2_hi = vshrq_n_u8(in.i2, 4); uint8x16_t shuf_2_lo = vqtbl1q_u8(low_nibble_mask, nib_2_lo); uint8x16_t shuf_2_hi = vqtbl1q_u8(high_nibble_mask, nib_2_hi); uint8x16_t v_2 = vandq_u8(shuf_2_lo, shuf_2_hi); - uint8x16_t nib_3_lo = vandq_u8(in.i.val[3], low_nib_and_mask); - uint8x16_t nib_3_hi = vshrq_n_u8(in.i.val[3], 4); + uint8x16_t nib_3_lo = vandq_u8(in.i3, low_nib_and_mask); + uint8x16_t nib_3_hi = vshrq_n_u8(in.i3, 4); uint8x16_t shuf_3_lo = vqtbl1q_u8(low_nibble_mask, nib_3_lo); uint8x16_t shuf_3_hi = vqtbl1q_u8(high_nibble_mask, nib_3_hi); uint8x16_t v_3 = vandq_u8(shuf_3_lo, shuf_3_hi); @@ -768,29 +828,29 @@ void find_whitespace_and_structurals<instruction_set::neon>( const uint8x16_t low_3bits_and_mask = vmovq_n_u8(0x7); const uint8x16_t high_1bit_tst_mask = vmovq_n_u8(0x80); - int8x16_t low_3bits_0 = vreinterpretq_s8_u8(vandq_u8(in.i.val[0], low_3bits_and_mask)); - uint8x16_t high_5bits_0 = vshrq_n_u8(in.i.val[0], 3); + int8x16_t low_3bits_0 = vreinterpretq_s8_u8(vandq_u8(in.i0, low_3bits_and_mask)); + uint8x16_t high_5bits_0 = vshrq_n_u8(in.i0, 3); uint8x16_t shuffle_structural_0 = vshlq_u8(vqtbl1q_u8(structural_bitvec, high_5bits_0), low_3bits_0); uint8x16_t shuffle_ws_0 = vshlq_u8(vqtbl1q_u8(whitespace_bitvec, high_5bits_0), low_3bits_0); uint8x16_t tmp_0 = vtstq_u8(shuffle_structural_0, high_1bit_tst_mask); uint8x16_t tmp_ws_0 = vtstq_u8(shuffle_ws_0, high_1bit_tst_mask); - int8x16_t low_3bits_1 = vreinterpretq_s8_u8(vandq_u8(in.i.val[1], low_3bits_and_mask)); - uint8x16_t high_5bits_1 = vshrq_n_u8(in.i.val[1], 3); + int8x16_t low_3bits_1 = vreinterpretq_s8_u8(vandq_u8(in.i1, low_3bits_and_mask)); + uint8x16_t high_5bits_1 = vshrq_n_u8(in.i1, 3); uint8x16_t shuffle_structural_1 = vshlq_u8(vqtbl1q_u8(structural_bitvec, high_5bits_1), low_3bits_1); uint8x16_t shuffle_ws_1 = vshlq_u8(vqtbl1q_u8(whitespace_bitvec, high_5bits_1), low_3bits_1); uint8x16_t tmp_1 = vtstq_u8(shuffle_structural_1, high_1bit_tst_mask); uint8x16_t tmp_ws_1 = vtstq_u8(shuffle_ws_1, high_1bit_tst_mask); - int8x16_t low_3bits_2 = vreinterpretq_s8_u8(vandq_u8(in.i.val[2], low_3bits_and_mask)); - uint8x16_t high_5bits_2 = vshrq_n_u8(in.i.val[2], 3); + int8x16_t low_3bits_2 = vreinterpretq_s8_u8(vandq_u8(in.i2, low_3bits_and_mask)); + uint8x16_t high_5bits_2 = vshrq_n_u8(in.i2, 3); uint8x16_t shuffle_structural_2 = vshlq_u8(vqtbl1q_u8(structural_bitvec, high_5bits_2), low_3bits_2); uint8x16_t shuffle_ws_2 = vshlq_u8(vqtbl1q_u8(whitespace_bitvec, high_5bits_2), low_3bits_2); uint8x16_t tmp_2 = vtstq_u8(shuffle_structural_2, high_1bit_tst_mask); uint8x16_t tmp_ws_2 = vtstq_u8(shuffle_ws_2, high_1bit_tst_mask); - int8x16_t low_3bits_3 = vreinterpretq_s8_u8(vandq_u8(in.i.val[3], low_3bits_and_mask)); - uint8x16_t high_5bits_3 = vshrq_n_u8(in.i.val[3], 3); + int8x16_t low_3bits_3 = vreinterpretq_s8_u8(vandq_u8(in.i3, low_3bits_and_mask)); + uint8x16_t high_5bits_3 = vshrq_n_u8(in.i3, 3); uint8x16_t shuffle_structural_3 = vshlq_u8(vqtbl1q_u8(structural_bitvec, high_5bits_3), low_3bits_3); uint8x16_t shuffle_ws_3 = vshlq_u8(vqtbl1q_u8(whitespace_bitvec, high_5bits_3), low_3bits_3); uint8x16_t tmp_3 = vtstq_u8(shuffle_structural_3, high_1bit_tst_mask); diff --git a/include/simdjson/stage2_build_tape.h b/include/simdjson/stage2_build_tape.h index 911589d3..6f7b1b6d 100644 --- a/include/simdjson/stage2_build_tape.h +++ b/include/simdjson/stage2_build_tape.h @@ -12,8 +12,6 @@ #include "simdjson/stringparsing.h" #include "simdjson/simdjson.h" -#define PATH_SEP '/' - namespace simdjson { void init_state_machine(); diff --git a/include/simdjson/stringparsing.h b/include/simdjson/stringparsing.h index c9be1788..b0332ca6 100644 --- a/include/simdjson/stringparsing.h +++ b/include/simdjson/stringparsing.h @@ -77,10 +77,6 @@ really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, uint8_t **d return offset > 0; } -#ifdef __ARM_NEON -#include <arm_neon.h> -#endif - // Holds backslashes and quotes locations. struct parse_string_helper { uint32_t bs_bits;