From 861a6a17e4ab470148c30de5c9c032dc5832e0e2 Mon Sep 17 00:00:00 2001 From: ioioioio Date: Wed, 3 Jul 2019 17:15:21 -0400 Subject: [PATCH 1/9] SSE implementation integrated --- include/simdjson/numberparsing.h | 2 +- include/simdjson/portability.h | 2 +- include/simdjson/simdutf8check.h | 315 ++++++++++++++++++++++++--- include/simdjson/stage1_find_marks.h | 270 ++++++++++++++++++++--- include/simdjson/stringparsing.h | 26 ++- src/jsonparser.cpp | 12 +- 6 files changed, 565 insertions(+), 62 deletions(-) diff --git a/include/simdjson/numberparsing.h b/include/simdjson/numberparsing.h index aba966e0..82787ea4 100644 --- a/include/simdjson/numberparsing.h +++ b/include/simdjson/numberparsing.h @@ -114,7 +114,7 @@ is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) { return structural_or_whitespace_or_exponent_or_decimal_negated[c]; } -#ifdef __AVX2__ +#if defined (__AVX2__) || defined (__SSE4_2__) #define SWAR_NUMBER_PARSING #endif diff --git a/include/simdjson/portability.h b/include/simdjson/portability.h index 2069cf72..ec5a409e 100644 --- a/include/simdjson/portability.h +++ b/include/simdjson/portability.h @@ -40,7 +40,7 @@ static inline int hamming(uint64_t input_num) { #include #include -#if defined(__BMI2__) || defined(__POPCOUNT__) || defined(__AVX2__) +#if defined(__BMI2__) || defined(__POPCOUNT__) || defined(__AVX2__) || defined(__SSE4_2__) #include #endif namespace simdjson { diff --git a/include/simdjson/simdutf8check.h b/include/simdjson/simdutf8check.h index fe198991..ecbc235b 100644 --- a/include/simdjson/simdutf8check.h +++ b/include/simdjson/simdutf8check.h @@ -1,12 +1,12 @@ - -#ifndef SIMDJSON_SIMDUTF8CHECK_H -#define SIMDJSON_SIMDUTF8CHECK_H - - +#ifndef SIMDUTF8CHECK_H +#define SIMDUTF8CHECK_H +#include #include #include #include -#include "simdjson/portability.h" +#include + +namespace simdjson { /* * legal utf-8 byte sequence * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94 @@ -25,9 +25,171 @@ */ // all byte values must be no larger than 0xF4 +static inline void checkSmallerThan0xF4(__m128i current_bytes, + __m128i *has_error) { + // unsigned, saturates to 0 below max + *has_error = _mm_or_si128(*has_error, + _mm_subs_epu8(current_bytes, _mm_set1_epi8(0xF4))); +} + +static inline __m128i continuationLengths(__m128i high_nibbles) { + return _mm_shuffle_epi8( + _mm_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) + 0, 0, 0, 0, // 10xx (continuation) + 2, 2, // 110x + 3, // 1110 + 4), // 1111, next should be 0 (not checked here) + high_nibbles); +} + +static inline __m128i carryContinuations(__m128i initial_lengths, + __m128i previous_carries) { + + __m128i right1 = + _mm_subs_epu8(_mm_alignr_epi8(initial_lengths, previous_carries, 16 - 1), + _mm_set1_epi8(1)); + __m128i sum = _mm_add_epi8(initial_lengths, right1); + + __m128i right2 = _mm_subs_epu8(_mm_alignr_epi8(sum, previous_carries, 16 - 2), + _mm_set1_epi8(2)); + return _mm_add_epi8(sum, right2); +} + +static inline void checkContinuations(__m128i initial_lengths, __m128i carries, + __m128i *has_error) { + + // overlap || underlap + // carry > length && length > 0 || !(carry > length) && !(length > 0) + // (carries > length) == (lengths > 0) + __m128i overunder = + _mm_cmpeq_epi8(_mm_cmpgt_epi8(carries, initial_lengths), + _mm_cmpgt_epi8(initial_lengths, _mm_setzero_si128())); + + *has_error = _mm_or_si128(*has_error, overunder); +} + +// when 0xED is found, next byte must be no larger than 0x9F +// when 0xF4 is found, next byte must be no larger than 0x8F +// next byte must be continuation, ie sign bit is set, so signed < is ok +static inline void checkFirstContinuationMax(__m128i current_bytes, + __m128i off1_current_bytes, + __m128i *has_error) { + __m128i maskED = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xED)); + __m128i maskF4 = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xF4)); + + __m128i badfollowED = + _mm_and_si128(_mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x9F)), maskED); + __m128i badfollowF4 = + _mm_and_si128(_mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x8F)), maskF4); + + *has_error = _mm_or_si128(*has_error, _mm_or_si128(badfollowED, badfollowF4)); +} + +// map off1_hibits => error condition +// hibits off1 cur +// C => < C2 && true +// E => < E1 && < A0 +// F => < F1 && < 90 +// else false && false +static inline void checkOverlong(__m128i current_bytes, + __m128i off1_current_bytes, __m128i hibits, + __m128i previous_hibits, __m128i *has_error) { + __m128i off1_hibits = _mm_alignr_epi8(hibits, previous_hibits, 16 - 1); + __m128i initial_mins = _mm_shuffle_epi8( + _mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, // 10xx => false + 0xC2, -128, // 110x + 0xE1, // 1110 + 0xF1), + off1_hibits); + + __m128i initial_under = _mm_cmpgt_epi8(initial_mins, off1_current_bytes); + + __m128i second_mins = _mm_shuffle_epi8( + _mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, // 10xx => false + 127, 127, // 110x => true + 0xA0, // 1110 + 0x90), + off1_hibits); + __m128i second_under = _mm_cmpgt_epi8(second_mins, current_bytes); + *has_error = + _mm_or_si128(*has_error, _mm_and_si128(initial_under, second_under)); +} + +struct processed_utf_bytes { + __m128i rawbytes; + __m128i high_nibbles; + __m128i carried_continuations; +}; + +static inline void count_nibbles(__m128i bytes, + struct processed_utf_bytes *answer) { + answer->rawbytes = bytes; + answer->high_nibbles = + _mm_and_si128(_mm_srli_epi16(bytes, 4), _mm_set1_epi8(0x0F)); +} + +// check whether the current bytes are valid UTF-8 +// at the end of the function, previous gets updated +static struct processed_utf_bytes +checkUTF8Bytes(__m128i current_bytes, struct processed_utf_bytes *previous, + __m128i *has_error) { + struct processed_utf_bytes pb; + count_nibbles(current_bytes, &pb); + + checkSmallerThan0xF4(current_bytes, has_error); + + __m128i initial_lengths = continuationLengths(pb.high_nibbles); + + pb.carried_continuations = + carryContinuations(initial_lengths, previous->carried_continuations); + + checkContinuations(initial_lengths, pb.carried_continuations, has_error); + + __m128i off1_current_bytes = + _mm_alignr_epi8(pb.rawbytes, previous->rawbytes, 16 - 1); + checkFirstContinuationMax(current_bytes, off1_current_bytes, has_error); + + checkOverlong(current_bytes, off1_current_bytes, pb.high_nibbles, + previous->high_nibbles, has_error); + return pb; +} + +static bool validate_utf8_fast(const char *src, size_t len) { + size_t i = 0; + __m128i has_error = _mm_setzero_si128(); + struct processed_utf_bytes previous = {.rawbytes = _mm_setzero_si128(), + .high_nibbles = _mm_setzero_si128(), + .carried_continuations = + _mm_setzero_si128()}; + if (len >= 16) { + for (; i <= len - 16; i += 16) { + __m128i current_bytes = _mm_loadu_si128((const __m128i *)(src + i)); + previous = checkUTF8Bytes(current_bytes, &previous, &has_error); + } + } + + // last part + if (i < len) { + char buffer[16]; + memset(buffer, 0, 16); + memcpy(buffer, src + i, len - i); + __m128i current_bytes = _mm_loadu_si128((const __m128i *)(buffer)); + previous = checkUTF8Bytes(current_bytes, &previous, &has_error); + } else { + has_error = + _mm_or_si128(_mm_cmpgt_epi8(previous.carried_continuations, + _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 1)), + has_error); + } + + return _mm_testz_si128(has_error, has_error); +} -namespace simdjson { #ifdef __AVX2__ + /*****************************/ static inline __m256i push_last_byte_of_a_to_b(__m256i a, __m256i b) { return _mm256_alignr_epi8(b, _mm256_permute2x128_si256(a, b, 0x21), 15); @@ -119,31 +281,29 @@ static inline void avxcheckOverlong(__m256i current_bytes, __m256i *has_error) { __m256i off1_hibits = push_last_byte_of_a_to_b(previous_hibits, hibits); __m256i initial_mins = _mm256_shuffle_epi8( - _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, + _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, // 10xx => false + 0xC2, -128, // 110x + 0xE1, // 1110 + 0xF1, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, // 10xx => false 0xC2, -128, // 110x 0xE1, // 1110 - 0xF1, // 1111 - -128, -128, -128, -128, -128, -128, -128, -128, - -128, -128, -128, -128, // 10xx => false - 0xC2, -128, // 110x - 0xE1, // 1110 - 0xF1), // 1111 + 0xF1), off1_hibits); __m256i initial_under = _mm256_cmpgt_epi8(initial_mins, off1_current_bytes); __m256i second_mins = _mm256_shuffle_epi8( - _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, + _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, // 10xx => false + 127, 127, // 110x => true + 0xA0, // 1110 + 0x90, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, // 10xx => false 127, 127, // 110x => true 0xA0, // 1110 - 0x90, // 1111 - -128, -128, -128, -128, -128, -128, -128, -128, - -128, -128, -128, -128, // 10xx => false - 127, 127, // 110x => true - 0xA0, // 1110 - 0x90), // 1111 + 0x90), off1_hibits); __m256i second_under = _mm256_cmpgt_epi8(second_mins, current_bytes); *has_error = _mm256_or_si256(*has_error, @@ -165,11 +325,11 @@ static inline void avx_count_nibbles(__m256i bytes, // check whether the current bytes are valid UTF-8 // at the end of the function, previous gets updated -static inline struct avx_processed_utf_bytes +static struct avx_processed_utf_bytes avxcheckUTF8Bytes(__m256i current_bytes, struct avx_processed_utf_bytes *previous, __m256i *has_error) { - struct avx_processed_utf_bytes pb{}; + struct avx_processed_utf_bytes pb; avx_count_nibbles(current_bytes, &pb); avxcheckSmallerThan0xF4(current_bytes, has_error); @@ -190,8 +350,111 @@ avxcheckUTF8Bytes(__m256i current_bytes, return pb; } -#else // __AVX2__ -#warning "We require AVX2 support!" +// check whether the current bytes are valid UTF-8 +// at the end of the function, previous gets updated +static struct avx_processed_utf_bytes +avxcheckUTF8Bytes_asciipath(__m256i current_bytes, + struct avx_processed_utf_bytes *previous, + __m256i *has_error) { + if (_mm256_testz_si256(current_bytes, + _mm256_set1_epi8(0x80))) { // fast ascii path + *has_error = _mm256_or_si256( + _mm256_cmpgt_epi8(previous->carried_continuations, + _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 1)), + *has_error); + return *previous; + } + + struct avx_processed_utf_bytes pb; + avx_count_nibbles(current_bytes, &pb); + + avxcheckSmallerThan0xF4(current_bytes, has_error); + + __m256i initial_lengths = avxcontinuationLengths(pb.high_nibbles); + + pb.carried_continuations = + avxcarryContinuations(initial_lengths, previous->carried_continuations); + + avxcheckContinuations(initial_lengths, pb.carried_continuations, has_error); + + __m256i off1_current_bytes = + push_last_byte_of_a_to_b(previous->rawbytes, pb.rawbytes); + avxcheckFirstContinuationMax(current_bytes, off1_current_bytes, has_error); + + avxcheckOverlong(current_bytes, off1_current_bytes, pb.high_nibbles, + previous->high_nibbles, has_error); + return pb; +} + +static bool validate_utf8_fast_avx_asciipath(const char *src, size_t len) { + size_t i = 0; + __m256i has_error = _mm256_setzero_si256(); + struct avx_processed_utf_bytes previous = { + .rawbytes = _mm256_setzero_si256(), + .high_nibbles = _mm256_setzero_si256(), + .carried_continuations = _mm256_setzero_si256()}; + if (len >= 32) { + for (; i <= len - 32; i += 32) { + __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(src + i)); + previous = + avxcheckUTF8Bytes_asciipath(current_bytes, &previous, &has_error); + } + } + + // last part + if (i < len) { + char buffer[32]; + memset(buffer, 0, 32); + memcpy(buffer, src + i, len - i); + __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(buffer)); + previous = avxcheckUTF8Bytes(current_bytes, &previous, &has_error); + } else { + has_error = _mm256_or_si256( + _mm256_cmpgt_epi8(previous.carried_continuations, + _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 1)), + has_error); + } + + return _mm256_testz_si256(has_error, has_error); +} + +static bool validate_utf8_fast_avx(const char *src, size_t len) { + size_t i = 0; + __m256i has_error = _mm256_setzero_si256(); + struct avx_processed_utf_bytes previous = { + .rawbytes = _mm256_setzero_si256(), + .high_nibbles = _mm256_setzero_si256(), + .carried_continuations = _mm256_setzero_si256()}; + if (len >= 32) { + for (; i <= len - 32; i += 32) { + __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(src + i)); + previous = avxcheckUTF8Bytes(current_bytes, &previous, &has_error); + } + } + + // last part + if (i < len) { + char buffer[32]; + memset(buffer, 0, 32); + memcpy(buffer, src + i, len - i); + __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(buffer)); + previous = avxcheckUTF8Bytes(current_bytes, &previous, &has_error); + } else { + has_error = _mm256_or_si256( + _mm256_cmpgt_epi8(previous.carried_continuations, + _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 1)), + has_error); + } + + return _mm256_testz_si256(has_error, has_error); +} + #endif // __AVX2__ } -#endif +#endif \ No newline at end of file diff --git a/include/simdjson/stage1_find_marks.h b/include/simdjson/stage1_find_marks.h index 6c025080..121c9454 100644 --- a/include/simdjson/stage1_find_marks.h +++ b/include/simdjson/stage1_find_marks.h @@ -6,7 +6,7 @@ #include "simdjson/parsedjson.h" #include "simdjson/portability.h" -#ifdef __AVX2__ +#if defined (__AVX2__) || (__SSE4_2__) #ifndef SIMDJSON_SKIPUTF8VALIDATION #define SIMDJSON_UTF8VALIDATE @@ -21,7 +21,7 @@ #else #warning It appears that neither ARM NEON nor AVX2 are detected. #endif // __ARM_NEON -#endif // __AVX2__ +#endif // (__AVX2__) || (__SSE4_2__) // It seems that many parsers do UTF-8 validation. // RapidJSON does not do it by default, but a flag @@ -35,6 +35,7 @@ namespace simdjson { template struct simd_input; + #ifdef __AVX2__ template<> struct simd_input @@ -44,6 +45,17 @@ struct simd_input }; #endif +#ifdef __SSE4_2__ +template<> +struct simd_input +{ + __m128i v0; + __m128i v1; + __m128i v2; + __m128i v3; +}; +#endif + #ifdef __ARM_NEON template<> struct simd_input { @@ -122,8 +134,7 @@ uint64_t compute_quote_mask(uint64_t quote_bits); // where clmul is not supported, so check for both, to be sure. #ifdef SIMDJSON_AVOID_CLMUL template really_inline -uint64_t compute_quote_mask(uint64_t quote_bits) -{ +uint64_t compute_quote_mask(uint64_t quote_bits) { uint64_t quote_mask = quote_bits ^ (quote_bits << 1); quote_mask = quote_mask ^ (quote_mask << 2); quote_mask = quote_mask ^ (quote_mask << 4); @@ -145,6 +156,15 @@ uint64_t compute_quote_mask(uint64_t quote_bits) { } #endif +#ifdef __SSE4_2__ +template<> really_inline +uint64_t compute_quote_mask(uint64_t quote_bits) { + uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128( + _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0)); + return quote_mask; +} +#endif + #ifdef __ARM_NEON template<> really_inline uint64_t compute_quote_mask(uint64_t quote_bits) { @@ -158,30 +178,115 @@ uint64_t compute_quote_mask(uint64_t quote_bits) { return quote_mask; } #endif -#endif +#endif // SIMDJSON_AVOID_CLMUL #ifdef SIMDJSON_UTF8VALIDATE -templatereally_inline -void check_utf8(simd_input in, - __m256i &has_error, - struct avx_processed_utf_bytes &previous) { +// some hack to bypass the impossibily to overload the check_utf8() specialized template +template +struct check_utf8_helper; + +#ifdef __AVX2__ +template<> +struct check_utf8_helper +{ + __m256i has_error = _mm256_setzero_si256(); + avx_processed_utf_bytes previous { + _mm256_setzero_si256(), + _mm256_setzero_si256(), + _mm256_setzero_si256() + }; +}; +#endif + +#ifdef __SSE4_2__ +template<> +struct check_utf8_helper +{ + __m128i has_error = _mm_setzero_si128(); + processed_utf_bytes previous { + _mm_setzero_si128(), + _mm_setzero_si128(), + _mm_setzero_si128() + }; +}; +#endif + +template +void check_utf8(simd_input in, check_utf8_helper& helper); + +#ifdef __AVX2__ +template<> really_inline +void check_utf8(simd_input in, + check_utf8_helper& helper) { __m256i highbit = _mm256_set1_epi8(0x80); if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), highbit)) == 1) { // it is ascii, we just check continuation - has_error = _mm256_or_si256( + helper.has_error = _mm256_or_si256( _mm256_cmpgt_epi8( - previous.carried_continuations, + helper.previous.carried_continuations, _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)), - has_error); + helper.has_error); } else { // it is not ascii so we have to do heavy work - previous = avxcheckUTF8Bytes(in.lo, &previous, &has_error); - previous = avxcheckUTF8Bytes(in.hi, &previous, &has_error); + helper.previous = avxcheckUTF8Bytes(in.lo, &(helper.previous), &(helper.has_error)); + helper.previous = avxcheckUTF8Bytes(in.hi, &(helper.previous), &(helper.has_error)); } } +#endif //__AVX2__ + +#ifdef __SSE4_2__ +template<> really_inline +void check_utf8(simd_input in, + check_utf8_helper& helper) { + __m128i highbit = _mm_set1_epi8(0x80); + if ((_mm_testz_si128(_mm_or_si128(in.v0, in.v1), highbit)) == 1) { + // it is ascii, we just check continuation + helper.has_error = _mm_or_si128( + _mm_cmpgt_epi8( + helper.previous.carried_continuations, + _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)), + helper.has_error); + } else { + // it is not ascii so we have to do heavy work + helper.previous = checkUTF8Bytes(in.v0, &(helper.previous), &(helper.has_error)); + helper.previous = checkUTF8Bytes(in.v1, &(helper.previous), &(helper.has_error)); + } + + if ((_mm_testz_si128(_mm_or_si128(in.v2, in.v3), highbit)) == 1) { + // it is ascii, we just check continuation + helper.has_error = _mm_or_si128( + _mm_cmpgt_epi8( + helper.previous.carried_continuations, + _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)), + helper.has_error); + } else { + // it is not ascii so we have to do heavy work + helper.previous = checkUTF8Bytes(in.v2, &(helper.previous), &(helper.has_error)); + helper.previous = checkUTF8Bytes(in.v3, &(helper.previous), &(helper.has_error)); + } +} +#endif // __SSE4_2 + +// Checks if the utf8 validation has found any error. +template +errorValues check_utf8_errors(check_utf8_helper& helper); + +#ifdef __AVX2__ +template<> really_inline +errorValues check_utf8_errors(check_utf8_helper& helper) { + return _mm256_testz_si256(helper.has_error, helper.has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS; +} #endif +#ifdef __SSE4_2__ +template<> really_inline +errorValues check_utf8_errors(check_utf8_helper& helper) { + return _mm_testz_si128(helper.has_error, helper.has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS; +} +#endif +#endif // SIMDJSON_UTF8VALIDATE + template simd_input fill_input(const uint8_t * ptr); @@ -195,6 +300,18 @@ simd_input fill_input(const uint8_ } #endif +#ifdef __SSE4_2__ +template<> really_inline +simd_input fill_input(const uint8_t * ptr) { + struct simd_input in; + in.v0 = _mm_loadu_si128(reinterpret_cast(ptr + 0)); + in.v1 = _mm_loadu_si128(reinterpret_cast(ptr + 16)); + in.v2 = _mm_loadu_si128(reinterpret_cast(ptr + 32)); + in.v3 = _mm_loadu_si128(reinterpret_cast(ptr + 48)); + return in; +} +#endif + #ifdef __ARM_NEON template<> really_inline simd_input fill_input(const uint8_t * ptr) { @@ -219,7 +336,6 @@ uint64_t cmp_mask_against_input(simd_input in, uint8_t m); #ifdef __AVX2__ template<> really_inline uint64_t cmp_mask_against_input(simd_input in, uint8_t m) { - const __m256i mask = _mm256_set1_epi8(m); __m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask); uint64_t res_0 = static_cast(_mm256_movemask_epi8(cmp_res_0)); @@ -229,6 +345,23 @@ uint64_t cmp_mask_against_input(simd_input really_inline +uint64_t cmp_mask_against_input(simd_input in, uint8_t m) { + const __m128i mask = _mm_set1_epi8(m); + __m128i cmp_res_0 = _mm_cmpeq_epi8(in.v0, mask); + uint64_t res_0 = _mm_movemask_epi8(cmp_res_0); + __m128i cmp_res_1 = _mm_cmpeq_epi8(in.v1, mask); + uint64_t res_1 = _mm_movemask_epi8(cmp_res_1); + __m128i cmp_res_2 = _mm_cmpeq_epi8(in.v2, mask); + uint64_t res_2 = _mm_movemask_epi8(cmp_res_2); + __m128i cmp_res_3 = _mm_cmpeq_epi8(in.v3, mask); + uint64_t res_3 = _mm_movemask_epi8(cmp_res_3); + return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48); + return res_0 | (res_1 << 32); +} +#endif + #ifdef __ARM_NEON template<> really_inline uint64_t cmp_mask_against_input(simd_input in, uint8_t m) { @@ -257,6 +390,22 @@ uint64_t unsigned_lteq_against_input(simd_input really_inline +uint64_t unsigned_lteq_against_input(simd_input in, uint8_t m) { + const __m128i maxval = _mm_set1_epi8(m); + __m128i cmp_res_0 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v0),maxval); + uint64_t res_0 = _mm_movemask_epi8(cmp_res_0); + __m128i cmp_res_1 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v1),maxval); + uint64_t res_1 = _mm_movemask_epi8(cmp_res_1); + __m128i cmp_res_2 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v2),maxval); + uint64_t res_2 = _mm_movemask_epi8(cmp_res_2); + __m128i cmp_res_3 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v3),maxval); + uint64_t res_3 = _mm_movemask_epi8(cmp_res_3); + return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48); +} +#endif + #ifdef __ARM_NEON template<> really_inline uint64_t unsigned_lteq_against_input(simd_input in, uint8_t m) { @@ -447,7 +596,78 @@ void find_whitespace_and_structurals(simd_input really_inline +void find_whitespace_and_structurals(simd_input in, + uint64_t &whitespace, + uint64_t &structurals) { + const __m128i low_nibble_mask = _mm_setr_epi8( + 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0); + const __m128i high_nibble_mask = _mm_setr_epi8( + 8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0); + + __m128i structural_shufti_mask = _mm_set1_epi8(0x7); + __m128i whitespace_shufti_mask = _mm_set1_epi8(0x18); + + __m128i v_0 = _mm_and_si128( + _mm_shuffle_epi8(low_nibble_mask, in.v0), + _mm_shuffle_epi8(high_nibble_mask, + _mm_and_si128(_mm_srli_epi32(in.v0, 4), + _mm_set1_epi8(0x7f)))); + + __m128i v_1 = _mm_and_si128( + _mm_shuffle_epi8(low_nibble_mask, in.v1), + _mm_shuffle_epi8(high_nibble_mask, + _mm_and_si128(_mm_srli_epi32(in.v1, 4), + _mm_set1_epi8(0x7f)))); + + __m128i v_2 = _mm_and_si128( + _mm_shuffle_epi8(low_nibble_mask, in.v2), + _mm_shuffle_epi8(high_nibble_mask, + _mm_and_si128(_mm_srli_epi32(in.v2, 4), + _mm_set1_epi8(0x7f)))); + + __m128i v_3 = _mm_and_si128( + _mm_shuffle_epi8(low_nibble_mask, in.v3), + _mm_shuffle_epi8(high_nibble_mask, + _mm_and_si128(_mm_srli_epi32(in.v3, 4), + _mm_set1_epi8(0x7f)))); + + __m128i tmp_v0 = _mm_cmpeq_epi8( + _mm_and_si128(v_0, structural_shufti_mask), _mm_set1_epi8(0)); + __m128i tmp_v1 = _mm_cmpeq_epi8( + _mm_and_si128(v_1, structural_shufti_mask), _mm_set1_epi8(0)); + __m128i tmp_v2 = _mm_cmpeq_epi8( + _mm_and_si128(v_2, structural_shufti_mask), _mm_set1_epi8(0)); + __m128i tmp_v3 = _mm_cmpeq_epi8( + _mm_and_si128(v_3, structural_shufti_mask), _mm_set1_epi8(0)); + + uint64_t structural_res_0 = _mm_movemask_epi8(tmp_v0); + uint64_t structural_res_1 = _mm_movemask_epi8(tmp_v1); + uint64_t structural_res_2 = _mm_movemask_epi8(tmp_v2); + uint64_t structural_res_3 = _mm_movemask_epi8(tmp_v3); + + structurals = ~(structural_res_0 | (structural_res_1 << 16) | (structural_res_2 << 32) | (structural_res_3 << 48)); + + __m128i tmp_ws_v0 = _mm_cmpeq_epi8( + _mm_and_si128(v_0, whitespace_shufti_mask), _mm_set1_epi8(0)); + __m128i tmp_ws_v1 = _mm_cmpeq_epi8( + _mm_and_si128(v_1, whitespace_shufti_mask), _mm_set1_epi8(0)); + __m128i tmp_ws_v2 = _mm_cmpeq_epi8( + _mm_and_si128(v_2, whitespace_shufti_mask), _mm_set1_epi8(0)); + __m128i tmp_ws_v3 = _mm_cmpeq_epi8( + _mm_and_si128(v_3, whitespace_shufti_mask), _mm_set1_epi8(0)); + + uint64_t ws_res_0 = _mm_movemask_epi8(tmp_ws_v0); + uint64_t ws_res_1 = _mm_movemask_epi8(tmp_ws_v1); + uint64_t ws_res_2 = _mm_movemask_epi8(tmp_ws_v2); + uint64_t ws_res_3 = _mm_movemask_epi8(tmp_ws_v3); + + whitespace = ~(ws_res_0 | (ws_res_1 << 16) | (ws_res_2 << 32) | (ws_res_3 << 48)); +} +#endif // __SSE4_2__ #ifdef __ARM_NEON template<> really_inline @@ -569,9 +789,9 @@ void find_whitespace_and_structurals( structurals = neonmovemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3); whitespace = neonmovemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3); -#endif +#endif // FUNKY_BAD_TABLE } -#endif +#endif // __ARM_NEON #ifdef SIMDJSON_NAIVE_FLATTEN // useful for benchmarking @@ -657,7 +877,7 @@ really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, } base = next_base; } -#endif +#endif // SIMDJSON_NAIVE_FLATTEN // return a updated structural bit vector with quoted contents cleared out and // pseudo-structural characters added to the mask @@ -711,11 +931,7 @@ WARN_UNUSED uint32_t *base_ptr = pj.structural_indexes; uint32_t base = 0; #ifdef SIMDJSON_UTF8VALIDATE - __m256i has_error = _mm256_setzero_si256(); - struct avx_processed_utf_bytes previous {}; - previous.rawbytes = _mm256_setzero_si256(); - previous.high_nibbles = _mm256_setzero_si256(); - previous.carried_continuations = _mm256_setzero_si256(); + check_utf8_helper helper; #endif // we have padded the input out to 64 byte multiple with the remainder being @@ -751,7 +967,7 @@ WARN_UNUSED #endif simd_input in = fill_input(buf+idx); #ifdef SIMDJSON_UTF8VALIDATE - check_utf8(in, has_error, previous); + check_utf8(in, helper); #endif // detect odd sequences of backslashes uint64_t odd_ends = find_odd_backslash_sequences( @@ -786,7 +1002,7 @@ WARN_UNUSED memcpy(tmpbuf, buf + idx, len - idx); simd_input in = fill_input(tmpbuf); #ifdef SIMDJSON_UTF8VALIDATE - check_utf8(in, has_error, previous); + check_utf8(in, helper); #endif // detect odd sequences of backslashes @@ -843,7 +1059,7 @@ WARN_UNUSED return simdjson::UNESCAPED_CHARS; } #ifdef SIMDJSON_UTF8VALIDATE - return _mm256_testz_si256(has_error, has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS; + return check_utf8_errors(helper); #else return simdjson::SUCCESS; #endif diff --git a/include/simdjson/stringparsing.h b/include/simdjson/stringparsing.h index 148678f8..87687846 100644 --- a/include/simdjson/stringparsing.h +++ b/include/simdjson/stringparsing.h @@ -109,6 +109,23 @@ parse_string_helper find_bs_bits_and_quote_bits (const ui } #endif +#ifdef __SSE4_2__ +template<> really_inline +parse_string_helper find_bs_bits_and_quote_bits (const uint8_t *src, uint8_t *dst) { + // this can read up to 31 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + __m128i v = _mm_loadu_si128(reinterpret_cast(src)); + // store to dest unconditionally - we can overwrite the bits we don't like + // later + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), v); + auto quote_mask = _mm_cmpeq_epi8(v, _mm_set1_epi8('"')); + return { + static_cast(_mm_movemask_epi8(_mm_cmpeq_epi8(v, _mm_set1_epi8('\\')))), // bs_bits + static_cast(_mm_movemask_epi8(quote_mask)) // quote_bits + }; +} +#endif + #ifdef __ARM_NEON template<> really_inline parse_string_helper find_bs_bits_and_quote_bits (const uint8_t *src, uint8_t *dst) { @@ -221,8 +238,13 @@ bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len, } else { // they are the same. Since they can't co-occur, it means we encountered // neither. - src += 32; - dst += 32; + if constexpr(T == instruction_set::sse4_2) { + src += 16; + dst += 16; + } else { + src += 32; + dst += 32; + } } } // can't be reached diff --git a/src/jsonparser.cpp b/src/jsonparser.cpp index 538ca813..ed4e3912 100644 --- a/src/jsonparser.cpp +++ b/src/jsonparser.cpp @@ -15,7 +15,7 @@ int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool rea json_parse_functype* avx_implementation = &json_parse_implementation; #endif #ifdef __SSE4_2__ - // json_parse_functype* sse4_2_implementation = &json_parse_implementation; // not implemented yet + json_parse_functype* sse4_2_implementation = &json_parse_implementation; #endif #ifdef __ARM_NEON json_parse_functype* neon_implementation = &json_parse_implementation; @@ -39,11 +39,13 @@ int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool rea case instruction_set::avx2 : json_parse_ptr = avx_implementation; break; -#elif defined (__SSE4_2__) - /*case instruction_set::sse4_2 : +#endif +#ifdef __SSE4_2__ + case instruction_set::sse4_2 : json_parse_ptr = sse4_2_implementation; - break;*/ -#elif defined (__ARM_NEON) + break; +#endif +#ifdef __ARM_NEON case instruction_set::neon : json_parse_ptr = neon_implementation; break; From f7ea2629e4599569e688aa4dd300445fe7ffb92b Mon Sep 17 00:00:00 2001 From: ioioioio Date: Thu, 4 Jul 2019 10:13:40 -0400 Subject: [PATCH 2/9] Fixing warnings and Microsoft intinsics. --- include/simdjson/simdutf8check.h | 187 +++++-------------------------- 1 file changed, 28 insertions(+), 159 deletions(-) diff --git a/include/simdjson/simdutf8check.h b/include/simdjson/simdutf8check.h index ecbc235b..40fc921a 100644 --- a/include/simdjson/simdutf8check.h +++ b/include/simdjson/simdutf8check.h @@ -1,12 +1,12 @@ -#ifndef SIMDUTF8CHECK_H -#define SIMDUTF8CHECK_H -#include + +#ifndef SIMDJSON_SIMDUTF8CHECK_H +#define SIMDJSON_SIMDUTF8CHECK_H + + #include #include #include -#include - -namespace simdjson { +#include "simdjson/portability.h" /* * legal utf-8 byte sequence * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94 @@ -25,6 +25,9 @@ namespace simdjson { */ // all byte values must be no larger than 0xF4 + +namespace simdjson { +// all byte values must be no larger than 0xF4 static inline void checkSmallerThan0xF4(__m128i current_bytes, __m128i *has_error) { // unsigned, saturates to 0 below max @@ -156,40 +159,7 @@ checkUTF8Bytes(__m128i current_bytes, struct processed_utf_bytes *previous, return pb; } -static bool validate_utf8_fast(const char *src, size_t len) { - size_t i = 0; - __m128i has_error = _mm_setzero_si128(); - struct processed_utf_bytes previous = {.rawbytes = _mm_setzero_si128(), - .high_nibbles = _mm_setzero_si128(), - .carried_continuations = - _mm_setzero_si128()}; - if (len >= 16) { - for (; i <= len - 16; i += 16) { - __m128i current_bytes = _mm_loadu_si128((const __m128i *)(src + i)); - previous = checkUTF8Bytes(current_bytes, &previous, &has_error); - } - } - - // last part - if (i < len) { - char buffer[16]; - memset(buffer, 0, 16); - memcpy(buffer, src + i, len - i); - __m128i current_bytes = _mm_loadu_si128((const __m128i *)(buffer)); - previous = checkUTF8Bytes(current_bytes, &previous, &has_error); - } else { - has_error = - _mm_or_si128(_mm_cmpgt_epi8(previous.carried_continuations, - _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 1)), - has_error); - } - - return _mm_testz_si128(has_error, has_error); -} - #ifdef __AVX2__ - /*****************************/ static inline __m256i push_last_byte_of_a_to_b(__m256i a, __m256i b) { return _mm256_alignr_epi8(b, _mm256_permute2x128_si256(a, b, 0x21), 15); @@ -281,29 +251,31 @@ static inline void avxcheckOverlong(__m256i current_bytes, __m256i *has_error) { __m256i off1_hibits = push_last_byte_of_a_to_b(previous_hibits, hibits); __m256i initial_mins = _mm256_shuffle_epi8( - _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, - -128, -128, -128, // 10xx => false - 0xC2, -128, // 110x - 0xE1, // 1110 - 0xF1, -128, -128, -128, -128, -128, -128, -128, -128, + _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, // 10xx => false 0xC2, -128, // 110x 0xE1, // 1110 - 0xF1), + 0xF1, // 1111 + -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, // 10xx => false + 0xC2, -128, // 110x + 0xE1, // 1110 + 0xF1), // 1111 off1_hibits); __m256i initial_under = _mm256_cmpgt_epi8(initial_mins, off1_current_bytes); __m256i second_mins = _mm256_shuffle_epi8( - _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, - -128, -128, -128, // 10xx => false - 127, 127, // 110x => true - 0xA0, // 1110 - 0x90, -128, -128, -128, -128, -128, -128, -128, -128, + _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, // 10xx => false 127, 127, // 110x => true 0xA0, // 1110 - 0x90), + 0x90, // 1111 + -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, // 10xx => false + 127, 127, // 110x => true + 0xA0, // 1110 + 0x90), // 1111 off1_hibits); __m256i second_under = _mm256_cmpgt_epi8(second_mins, current_bytes); *has_error = _mm256_or_si256(*has_error, @@ -325,11 +297,11 @@ static inline void avx_count_nibbles(__m256i bytes, // check whether the current bytes are valid UTF-8 // at the end of the function, previous gets updated -static struct avx_processed_utf_bytes +static inline struct avx_processed_utf_bytes avxcheckUTF8Bytes(__m256i current_bytes, struct avx_processed_utf_bytes *previous, __m256i *has_error) { - struct avx_processed_utf_bytes pb; + struct avx_processed_utf_bytes pb{}; avx_count_nibbles(current_bytes, &pb); avxcheckSmallerThan0xF4(current_bytes, has_error); @@ -350,111 +322,8 @@ avxcheckUTF8Bytes(__m256i current_bytes, return pb; } -// check whether the current bytes are valid UTF-8 -// at the end of the function, previous gets updated -static struct avx_processed_utf_bytes -avxcheckUTF8Bytes_asciipath(__m256i current_bytes, - struct avx_processed_utf_bytes *previous, - __m256i *has_error) { - if (_mm256_testz_si256(current_bytes, - _mm256_set1_epi8(0x80))) { // fast ascii path - *has_error = _mm256_or_si256( - _mm256_cmpgt_epi8(previous->carried_continuations, - _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 1)), - *has_error); - return *previous; - } - - struct avx_processed_utf_bytes pb; - avx_count_nibbles(current_bytes, &pb); - - avxcheckSmallerThan0xF4(current_bytes, has_error); - - __m256i initial_lengths = avxcontinuationLengths(pb.high_nibbles); - - pb.carried_continuations = - avxcarryContinuations(initial_lengths, previous->carried_continuations); - - avxcheckContinuations(initial_lengths, pb.carried_continuations, has_error); - - __m256i off1_current_bytes = - push_last_byte_of_a_to_b(previous->rawbytes, pb.rawbytes); - avxcheckFirstContinuationMax(current_bytes, off1_current_bytes, has_error); - - avxcheckOverlong(current_bytes, off1_current_bytes, pb.high_nibbles, - previous->high_nibbles, has_error); - return pb; -} - -static bool validate_utf8_fast_avx_asciipath(const char *src, size_t len) { - size_t i = 0; - __m256i has_error = _mm256_setzero_si256(); - struct avx_processed_utf_bytes previous = { - .rawbytes = _mm256_setzero_si256(), - .high_nibbles = _mm256_setzero_si256(), - .carried_continuations = _mm256_setzero_si256()}; - if (len >= 32) { - for (; i <= len - 32; i += 32) { - __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(src + i)); - previous = - avxcheckUTF8Bytes_asciipath(current_bytes, &previous, &has_error); - } - } - - // last part - if (i < len) { - char buffer[32]; - memset(buffer, 0, 32); - memcpy(buffer, src + i, len - i); - __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(buffer)); - previous = avxcheckUTF8Bytes(current_bytes, &previous, &has_error); - } else { - has_error = _mm256_or_si256( - _mm256_cmpgt_epi8(previous.carried_continuations, - _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 1)), - has_error); - } - - return _mm256_testz_si256(has_error, has_error); -} - -static bool validate_utf8_fast_avx(const char *src, size_t len) { - size_t i = 0; - __m256i has_error = _mm256_setzero_si256(); - struct avx_processed_utf_bytes previous = { - .rawbytes = _mm256_setzero_si256(), - .high_nibbles = _mm256_setzero_si256(), - .carried_continuations = _mm256_setzero_si256()}; - if (len >= 32) { - for (; i <= len - 32; i += 32) { - __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(src + i)); - previous = avxcheckUTF8Bytes(current_bytes, &previous, &has_error); - } - } - - // last part - if (i < len) { - char buffer[32]; - memset(buffer, 0, 32); - memcpy(buffer, src + i, len - i); - __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(buffer)); - previous = avxcheckUTF8Bytes(current_bytes, &previous, &has_error); - } else { - has_error = _mm256_or_si256( - _mm256_cmpgt_epi8(previous.carried_continuations, - _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 1)), - has_error); - } - - return _mm256_testz_si256(has_error, has_error); -} - +#else // __AVX2__ +#warning "We require AVX2 support!" #endif // __AVX2__ } -#endif \ No newline at end of file +#endif From 2b2d93b05f1838addca9a925007d9ab1e64adb42 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Thu, 4 Jul 2019 17:19:05 -0400 Subject: [PATCH 3/9] Various minor tweaks. --- .appveyor.yml | 7 ++++- .travis.yml | 3 ++ CMakeLists.txt | 2 ++ Makefile | 17 +++++++---- README.md | 11 +++---- include/simdjson/jsonminifier.h | 2 ++ include/simdjson/simdutf8check.h | 2 -- include/simdjson/stage1_find_marks.h | 43 +++++++++++++++++----------- tests/CMakeLists.txt | 9 +++--- tools/cmake/FindOptions.cmake | 18 ++++++++---- 10 files changed, 74 insertions(+), 40 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 5372da62..30549a97 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -9,9 +9,14 @@ clone_folder: c:\projects\simdjson platform: - x64 +environment: + matrix: + - AVXFLAG: "OFF" + - AVXFLAG: "ON" + build_script: - mkdir build - cd build - - ps: cmake -DCMAKE_GENERATOR_PLATFORM=x64 .. + - ps: cmake -DSIMDJSON_DISABLE_AVX="$env:AVXFLAG" -DCMAKE_GENERATOR_PLATFORM=x64 .. - cmake --build . - ctest --verbose diff --git a/.travis.yml b/.travis.yml index b625f367..bafe5d5c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -19,3 +19,6 @@ script: - make test - make clean - make SANITIZEGOLD=1 test + - make clean + - ARCHFLAGS="-march=nehalem" make + - ARCHFLAGS="-march=nehalem" make test diff --git a/CMakeLists.txt b/CMakeLists.txt index eba12585..88e9d1d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,8 @@ if(ltoresult) set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) endif() +option(SIMDJSON_DISABLE_AVX "Forcefully disable AVX even if hardware supports it" OFF) + set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_MACOSX_RPATH OFF) diff --git a/Makefile b/Makefile index 603ac88d..a99cf7b0 100644 --- a/Makefile +++ b/Makefile @@ -9,16 +9,21 @@ COREDEPSINCLUDE = -Idependencies/rapidjson/include -Idependencies/sajson/include EXTRADEPSINCLUDE = -Idependencies/jsoncppdist -Idependencies/json11 -Idependencies/fastjson/src -Idependencies/fastjson/include -Idependencies/gason/src -Idependencies/ujson4c/3rdparty -Idependencies/ujson4c/src # users can provide their own additional flags with make EXTRAFLAGS=something architecture:=$(shell arch) -CXXFLAGS = -std=c++17 -Wall -Wextra -Wshadow -Iinclude -Ibenchmark/linux $(EXTRAFLAGS) -CFLAGS = -Idependencies/ujson4c/3rdparty -Idependencies/ujson4c/src $(EXTRAFLAGS) + +#### +# If you want to specify your own target architecture, +# then define ARCHFLAGS. Otherwise, we set good default. +### ifeq ($(architecture),aarch64) -CXXFLAGS += -march=armv8-a+crc+crypto -CFLAGS += -march=armv8-a+crc+crypto +ARCHFLAGS ?= -march=armv8-a+crc+crypto else -CXXFLAGS += -march=native -CFLAGS += -march=native +ARCHFLAGS ?= -march=native endif +CXXFLAGS = $(ARCHFLAGS) -std=c++17 -Wall -Wextra -Wshadow -Iinclude -Ibenchmark/linux $(EXTRAFLAGS) +CFLAGS = $(ARCHFLAGS) -Idependencies/ujson4c/3rdparty -Idependencies/ujson4c/src $(EXTRAFLAGS) + + # This is a convenience flag ifdef SANITIZEGOLD SANITIZE = 1 diff --git a/README.md b/README.md index 3efe5f0e..f49b9d9e 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,7 @@ On a Skylake processor, the parsing speeds (in GB/s) of various processors on th ## Requirements - We support platforms like Linux or macOS, as well as Windows through Visual Studio 2017 or later. -- A processor with AVX2 (i.e., Intel processors starting with the Haswell microarchitecture released 2013 and AMD processors starting with the Zen microarchitecture released 2017). +- A processor with AVX2 (i.e., Intel processors starting with the Haswell microarchitecture released 2013 and AMD processors starting with the Zen microarchitecture released 2017) or at least SSE 4.2 (i.e., Intel processors going back to Nehalem released in 2008 or AMD processors starting with the Jaguar used in the PS4 and XBox One). - A recent C++ compiler (e.g., GNU GCC or LLVM CLANG or Visual Studio 2017), we assume C++17. GNU GCC 7 or better or LLVM's clang 6 or better. - Some benchmark scripts assume bash and other common utilities, but they are optional. @@ -168,7 +168,7 @@ int main(int argc, char *argv[]) { } ``` -We require hardware support for AVX2 instructions. You have to make sure that you instruct your +On Intel and AMD processors, we get best performance by using the hardware support for AVX2 instructions. You have to make sure that you instruct your compiler to use these instructions as needed. Under compilers such as GNU GCC or LLVM clang, the flag `-march=native` used on a recent Intel processor (Haswell or better) is sufficient. For portability of the binary files you can also specify directly the Haswell processor (`-march=haswell`). You may @@ -260,14 +260,15 @@ make test ## Usage (CMake on Windows using Visual Studio) -We assume you have a common Windows PC with at least Visual Studio 2017 and an x64 processor with AVX2 support (2013 Intel Haswell or later). +We assume you have a common Windows PC with at least Visual Studio 2017 and an x64 processor with AVX2 support (2013 Intel Haswell or later) or SSE 4.2 (2008 Nehalem or later). - Grab the simdjson code from GitHub, e.g., by cloning it using [GitHub Desktop](https://desktop.github.com/). - Install [CMake](https://cmake.org/download/). When you install it, make sure to ask that `cmake` be made available from the command line. Please choose a recent version of cmake. - Create a subdirectory within simdjson, such as `VisualStudio`. - Using a shell, go to this newly created directory. -- Type `cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..` in the shell while in the `VisualStudio` repository. (Alternatively, if you want to build a DLL, you may use the command line `cmake -DCMAKE_GENERATOR_PLATFORM=x64 -DSIMDJSON_BUILD_STATIC=OFF ..`.) -- This last command created a Visual Studio solution file in the newly created directory (e.g., `simdjson.sln`). Open this file in Visual Studio. You should now be able to build the project and run the tests. For example, in the `Solution Explorer` window (available from the `View` menu), right-click `ALL_BUILD` and select `Build`. To test the code, still in the `Solution Explorer` window, select `RUN_TESTS` and select `Build`. +- Type `cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..` in the shell while in the `VisualStudio` repository. (Alternatively, if you want to build a DLL, you may use the command line `cmake -DCMAKE_GENERATOR_PLATFORM=x64 -DSIMDJSON_BUILD_STATIC=OFF ..`.) This will build the code with AVX2 instructions. If your target processor does not support AVX2, you need to replace `cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..` by `cmake -DSIMDJSON_DISABLE_AVX=on -DCMAKE_GENERATOR_PLATFORM=x64 ..` . That is, you need to set the flag to forcefully disable AVX support since we compile with AVX2 instructions *by default*. +- This last command (`cmake ...`) created a Visual Studio solution file in the newly created directory (e.g., `simdjson.sln`). Open this file in Visual Studio. You should now be able to build the project and run the tests. For example, in the `Solution Explorer` window (available from the `View` menu), right-click `ALL_BUILD` and select `Build`. To test the code, still in the `Solution Explorer` window, select `RUN_TESTS` and select `Build`. + ## Usage (Using `vcpkg` on Windows, Linux and MacOS) diff --git a/include/simdjson/jsonminifier.h b/include/simdjson/jsonminifier.h index a588338c..c5cf0bb4 100644 --- a/include/simdjson/jsonminifier.h +++ b/include/simdjson/jsonminifier.h @@ -5,9 +5,11 @@ #include namespace simdjson { + // Take input from buf and remove useless whitespace, write it to out; buf and // out can be the same pointer. Result is null terminated, // return the string length (minus the null termination). +// The accelerated version of this function only runs on AVX2 hardware. size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out); diff --git a/include/simdjson/simdutf8check.h b/include/simdjson/simdutf8check.h index 40fc921a..79e67567 100644 --- a/include/simdjson/simdutf8check.h +++ b/include/simdjson/simdutf8check.h @@ -322,8 +322,6 @@ avxcheckUTF8Bytes(__m256i current_bytes, return pb; } -#else // __AVX2__ -#warning "We require AVX2 support!" #endif // __AVX2__ } #endif diff --git a/include/simdjson/stage1_find_marks.h b/include/simdjson/stage1_find_marks.h index 121c9454..4b0df107 100644 --- a/include/simdjson/stage1_find_marks.h +++ b/include/simdjson/stage1_find_marks.h @@ -126,6 +126,19 @@ uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16 template uint64_t compute_quote_mask(uint64_t quote_bits); +namespace { + // for when clmul is unavailable + [[maybe_unused]] uint64_t portable_compute_quote_mask(uint64_t quote_bits) { + uint64_t quote_mask = quote_bits ^ (quote_bits << 1); + quote_mask = quote_mask ^ (quote_mask << 2); + quote_mask = quote_mask ^ (quote_mask << 4); + quote_mask = quote_mask ^ (quote_mask << 8); + quote_mask = quote_mask ^ (quote_mask << 16); + quote_mask = quote_mask ^ (quote_mask << 32); + return quote_mask; + } +} + // In practice, if you have NEON or __PCLMUL__, you would // always want to use them, but it might be useful, for research // purposes, to disable it willingly, that's what SIMDJSON_AVOID_CLMUL @@ -135,13 +148,7 @@ uint64_t compute_quote_mask(uint64_t quote_bits); #ifdef SIMDJSON_AVOID_CLMUL template really_inline uint64_t compute_quote_mask(uint64_t quote_bits) { - uint64_t quote_mask = quote_bits ^ (quote_bits << 1); - quote_mask = quote_mask ^ (quote_mask << 2); - quote_mask = quote_mask ^ (quote_mask << 4); - quote_mask = quote_mask ^ (quote_mask << 8); - quote_mask = quote_mask ^ (quote_mask << 16); - quote_mask = quote_mask ^ (quote_mask << 32); - return quote_mask; + return portable_compute_quote_mask(quote_bits); } #else template @@ -150,6 +157,8 @@ uint64_t compute_quote_mask(uint64_t quote_bits); #ifdef __AVX2__ template<> really_inline uint64_t compute_quote_mask(uint64_t quote_bits) { + // There should be no such thing with a processing supporting avx2 + // but not clmul. uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128( _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0)); return quote_mask; @@ -159,23 +168,25 @@ uint64_t compute_quote_mask(uint64_t quote_bits) { #ifdef __SSE4_2__ template<> really_inline uint64_t compute_quote_mask(uint64_t quote_bits) { - uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128( + // CLMUL is supported on some SSE42 hardware such as Sandy Bridge, + // but not on others. +#ifdef __PCLMUL__ + return _mm_cvtsi128_si64(_mm_clmulepi64_si128( _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0)); - return quote_mask; +#else + return portable_compute_quote_mask(quote_bits); +#endif } #endif #ifdef __ARM_NEON template<> really_inline uint64_t compute_quote_mask(uint64_t quote_bits) { -#ifdef __PCLMUL__ // Might cause problems on runtime dispatch - uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128( - _mm_set_epi64x(0ULL, quote_bits), - _mm_set1_epi8(0xFF), 0)); +#ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension + return vmull_p64( -1ULL, quote_bits); #else - uint64_t quote_mask = vmull_p64( -1ULL, quote_bits); -#endif - return quote_mask; + return portable_compute_quote_mask(quote_bits); +#endif } #endif #endif // SIMDJSON_AVOID_CLMUL diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index be75702b..229df38b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -7,7 +7,8 @@ endif() add_cpp_test(basictests) add_cpp_test(jsoncheck) -add_executable(singleheader ./singleheadertest.cpp ${PROJECT_SOURCE_DIR}/singleheader/simdjson.cpp) -target_compile_definitions(singleheader PRIVATE JSON_TEST_PATH="${PROJECT_SOURCE_DIR}/jsonexamples/twitter.json") -target_link_libraries(singleheader ${SIMDJSON_LIB_NAME}) -add_test(singleheader singleheader) \ No newline at end of file +## This causes problems +# add_executable(singleheader ./singleheadertest.cpp ${PROJECT_SOURCE_DIR}/singleheader/simdjson.cpp) +# target_compile_definitions(singleheader PRIVATE JSON_TEST_PATH="${PROJECT_SOURCE_DIR}/jsonexamples/twitter.json") +# target_link_libraries(singleheader ${SIMDJSON_LIB_NAME}) +# add_test(singleheader singleheader) \ No newline at end of file diff --git a/tools/cmake/FindOptions.cmake b/tools/cmake/FindOptions.cmake index 3c4596cb..89f2c611 100644 --- a/tools/cmake/FindOptions.cmake +++ b/tools/cmake/FindOptions.cmake @@ -13,15 +13,21 @@ if(SIMDJSON_SANITIZE) endif() - -# some compilers like clang do not automagically define __AVX2__ and __BMI2__ even when the hardware supports it -if(NOT MSVC) +if(SIMDJSON_DISABLE_AVX) + if(NOT MSVC) + set (OPT_FLAGS "${OPT_FLAGS} -mno-avx -mno-bmi -mno-pclmul -msse4.2") + else() + set (OPT_FLAGS "${OPT_FLAGS}") + endif() +else() + # some compilers like clang do not automagically define __AVX2__ and __BMI2__ even when the hardware supports it + if(NOT MSVC) set (OPT_FLAGS "${OPT_FLAGS} -mavx2 -mbmi -mbmi2 -mpclmul") -else() - set (OPT_FLAGS "${OPT_FLAGS} /arch:AVX2 /std:c++latest") + else() + set (OPT_FLAGS "${OPT_FLAGS} /arch:AVX2") + endif() endif() - if(NOT MSVC) set(CXXSTD_FLAGS "-std=c++17 -fPIC") endif() From 19cdc09928fa321c3d6430890d1b5269832fa8f1 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Thu, 4 Jul 2019 17:36:26 -0400 Subject: [PATCH 4/9] Improving support for VS --- include/simdjson/stage1_find_marks.h | 37 ++++++++++++++-------------- include/simdjson/stringparsing.h | 2 +- src/jsonparser.cpp | 2 +- 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/include/simdjson/stage1_find_marks.h b/include/simdjson/stage1_find_marks.h index 4b0df107..65525557 100644 --- a/include/simdjson/stage1_find_marks.h +++ b/include/simdjson/stage1_find_marks.h @@ -6,17 +6,16 @@ #include "simdjson/parsedjson.h" #include "simdjson/portability.h" -#if defined (__AVX2__) || (__SSE4_2__) +#if defined (__AVX2__) || defined (__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64)) #ifndef SIMDJSON_SKIPUTF8VALIDATION #define SIMDJSON_UTF8VALIDATE - #endif #else // currently we don't UTF8 validate for ARM // also we assume that if you're not __AVX2__ // you're ARM, which is a bit dumb. TODO: Fix... -#ifdef __ARM_NEON +#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64)) #include #else #warning It appears that neither ARM NEON nor AVX2 are detected. @@ -45,7 +44,7 @@ struct simd_input }; #endif -#ifdef __SSE4_2__ +#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64)) template<> struct simd_input { @@ -56,7 +55,7 @@ struct simd_input }; #endif -#ifdef __ARM_NEON +#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64)) template<> struct simd_input { #ifndef TRANSPOSE @@ -70,7 +69,7 @@ template<> struct simd_input }; #endif -#ifdef __ARM_NEON +#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64)) really_inline uint16_t neonmovemask(uint8x16_t input) { const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, @@ -165,7 +164,7 @@ uint64_t compute_quote_mask(uint64_t quote_bits) { } #endif -#ifdef __SSE4_2__ +#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64)) template<> really_inline uint64_t compute_quote_mask(uint64_t quote_bits) { // CLMUL is supported on some SSE42 hardware such as Sandy Bridge, @@ -179,7 +178,7 @@ uint64_t compute_quote_mask(uint64_t quote_bits) { } #endif -#ifdef __ARM_NEON +#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64)) template<> really_inline uint64_t compute_quote_mask(uint64_t quote_bits) { #ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension @@ -209,7 +208,7 @@ struct check_utf8_helper }; #endif -#ifdef __SSE4_2__ +#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64)) template<> struct check_utf8_helper { @@ -246,7 +245,7 @@ void check_utf8(simd_input in, } #endif //__AVX2__ -#ifdef __SSE4_2__ +#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64)) template<> really_inline void check_utf8(simd_input in, check_utf8_helper& helper) { @@ -290,7 +289,7 @@ errorValues check_utf8_errors(check_utf8_helper really_inline errorValues check_utf8_errors(check_utf8_helper& helper) { return _mm_testz_si128(helper.has_error, helper.has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS; @@ -311,7 +310,7 @@ simd_input fill_input(const uint8_ } #endif -#ifdef __SSE4_2__ +#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64)) template<> really_inline simd_input fill_input(const uint8_t * ptr) { struct simd_input in; @@ -323,7 +322,7 @@ simd_input fill_input(const ui } #endif -#ifdef __ARM_NEON +#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64)) template<> really_inline simd_input fill_input(const uint8_t * ptr) { struct simd_input in; @@ -356,7 +355,7 @@ uint64_t cmp_mask_against_input(simd_input really_inline uint64_t cmp_mask_against_input(simd_input in, uint8_t m) { const __m128i mask = _mm_set1_epi8(m); @@ -373,7 +372,7 @@ uint64_t cmp_mask_against_input(simd_input really_inline uint64_t cmp_mask_against_input(simd_input in, uint8_t m) { const uint8x16_t mask = vmovq_n_u8(m); @@ -401,7 +400,7 @@ uint64_t unsigned_lteq_against_input(simd_input really_inline uint64_t unsigned_lteq_against_input(simd_input in, uint8_t m) { const __m128i maxval = _mm_set1_epi8(m); @@ -417,7 +416,7 @@ uint64_t unsigned_lteq_against_input(simd_input really_inline uint64_t unsigned_lteq_against_input(simd_input in, uint8_t m) { const uint8x16_t mask = vmovq_n_u8(m); @@ -609,7 +608,7 @@ void find_whitespace_and_structurals(simd_input really_inline void find_whitespace_and_structurals(simd_input in, uint64_t &whitespace, @@ -680,7 +679,7 @@ void find_whitespace_and_structurals(simd_input really_inline void find_whitespace_and_structurals( simd_input in, diff --git a/include/simdjson/stringparsing.h b/include/simdjson/stringparsing.h index 87687846..c9be1788 100644 --- a/include/simdjson/stringparsing.h +++ b/include/simdjson/stringparsing.h @@ -109,7 +109,7 @@ parse_string_helper find_bs_bits_and_quote_bits (const ui } #endif -#ifdef __SSE4_2__ +#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64)) template<> really_inline parse_string_helper find_bs_bits_and_quote_bits (const uint8_t *src, uint8_t *dst) { // this can read up to 31 bytes beyond the buffer size, but we require diff --git a/src/jsonparser.cpp b/src/jsonparser.cpp index ed4e3912..7f82471a 100644 --- a/src/jsonparser.cpp +++ b/src/jsonparser.cpp @@ -40,7 +40,7 @@ int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool rea json_parse_ptr = avx_implementation; break; #endif -#ifdef __SSE4_2__ +#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64)) case instruction_set::sse4_2 : json_parse_ptr = sse4_2_implementation; break; From fba27ef4b99ee3e93a1e8ade22f94601db4f2a92 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Thu, 4 Jul 2019 17:45:45 -0400 Subject: [PATCH 5/9] I missed a few. Building up VS support. --- include/simdjson/simdjson.h | 2 +- src/jsonparser.cpp | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/simdjson/simdjson.h b/include/simdjson/simdjson.h index 9a16692d..4190aeaf 100644 --- a/include/simdjson/simdjson.h +++ b/include/simdjson/simdjson.h @@ -12,7 +12,7 @@ enum class instruction_set { // the 'native' enum class value should point at a good default on the current machine #ifdef __AVX2__ native = avx2 -#elif defined(__ARM_NEON) +#elif defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64)) native = neon #else // Let us assume that we have an old x64 processor, but one that has SSE (i.e., something diff --git a/src/jsonparser.cpp b/src/jsonparser.cpp index 7f82471a..be17b069 100644 --- a/src/jsonparser.cpp +++ b/src/jsonparser.cpp @@ -14,10 +14,10 @@ int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool rea #ifdef __AVX2__ json_parse_functype* avx_implementation = &json_parse_implementation; #endif -#ifdef __SSE4_2__ +#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64)) json_parse_functype* sse4_2_implementation = &json_parse_implementation; #endif -#ifdef __ARM_NEON +#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64)) json_parse_functype* neon_implementation = &json_parse_implementation; #endif @@ -25,9 +25,9 @@ int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool rea // Should be done at runtime. Does not make any sense on preprocessor. #ifdef __AVX2__ instruction_set best_implementation = instruction_set::avx2; -#elif defined (__SSE4_2__) +#elif defined (__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64)) instruction_set best_implementation = instruction_set::sse4_2; -#elif defined (__ARM_NEON) +#elif defined (__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64)) instruction_set best_implementation = instruction_set::neon; #else instruction_set best_implementation = instruction_set::none; @@ -45,7 +45,7 @@ int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool rea json_parse_ptr = sse4_2_implementation; break; #endif -#ifdef __ARM_NEON +#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64)) case instruction_set::neon : json_parse_ptr = neon_implementation; break; From 0c2f58e40c25c9cd9c5632994183efbdb22ee88c Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Thu, 4 Jul 2019 17:58:45 -0400 Subject: [PATCH 6/9] Extending the no-avx tests on circleci. --- .circleci/config.yml | 78 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index e73a9654..86889c0b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -38,6 +38,44 @@ jobs: cd build make test + "gccnoavx": + docker: + - image: ubuntu:18.04 + environment: + CXX: g++-7 + steps: + - checkout + + - run: apt-get update -qq + - run: > + apt-get install -y + build-essential + cmake + g++-7 + git + + - run: + name: Building (gcc) + command: ARCHFLAGS="-march=nehalem" make + + - run: + name: Running tests (gcc) + command: ARCHFLAGS="-march=nehalem" make quiettest amalgamate + + - run: + name: Building (gcc, cmake) + command: | + mkdir build + cd build + cmake -DSIMDJSON_DISABLE_AVX=on .. + make + + - run: + name: Running tests (gcc, cmake) + command: | + cd build + make test + "clang": docker: - image: ubuntu:18.04 @@ -76,9 +114,49 @@ jobs: cd build make test + "clangnoavx": + docker: + - image: ubuntu:18.04 + environment: + CXX: clang++-6.0 + steps: + - checkout + + - run: apt-get update -qq + - run: > + apt-get install -y + build-essential + cmake + clang-6.0 + git + + - run: + name: Building (clang) + command: ARCHFLAGS="-march=nehalem" make + + - run: + name: Running tests (clang) + command: ARCHFLAGS="-march=nehalem" make quiettest amalgamate + + - run: + name: Building (clang, cmake) + command: | + mkdir build + cd build + cmake -DSIMDJSON_DISABLE_AVX=on .. + make + + - run: + name: Running tests (clang, cmake) + command: | + cd build + make test + workflows: version: 2 build_and_test: jobs: - "clang" - "gcc" + - "clangnoavx" + - "gccnoavx" \ No newline at end of file From d7b9a29dc6f0ff73c345f19aa71245debd2d2749 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Thu, 4 Jul 2019 19:10:05 -0400 Subject: [PATCH 7/9] Adding comments. --- CMakeLists.txt | 1 + Makefile | 1 + 2 files changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 88e9d1d1..d2da0f6a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,7 @@ if(ltoresult) set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) endif() +# usage: cmake -DSIMDJSON_DISABLE_AVX=on .. option(SIMDJSON_DISABLE_AVX "Forcefully disable AVX even if hardware supports it" OFF) set(CMAKE_CXX_STANDARD 17) diff --git a/Makefile b/Makefile index a99cf7b0..e9edf55a 100644 --- a/Makefile +++ b/Makefile @@ -13,6 +13,7 @@ architecture:=$(shell arch) #### # If you want to specify your own target architecture, # then define ARCHFLAGS. Otherwise, we set good default. +# E.g., type ' ARCHFLAGS="-march=nehalem" make parse ' ### ifeq ($(architecture),aarch64) ARCHFLAGS ?= -march=armv8-a+crc+crypto From b0d9c074e1179d6ff6ef915c8f02259d20fe6cef Mon Sep 17 00:00:00 2001 From: ioioioio Date: Fri, 5 Jul 2019 11:09:28 -0400 Subject: [PATCH 8/9] check_utf8_helper has a more meaningful name --- include/simdjson/stage1_find_marks.h | 75 ++++++++++++++-------------- 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/include/simdjson/stage1_find_marks.h b/include/simdjson/stage1_find_marks.h index 65525557..e707d624 100644 --- a/include/simdjson/stage1_find_marks.h +++ b/include/simdjson/stage1_find_marks.h @@ -191,56 +191,56 @@ uint64_t compute_quote_mask(uint64_t quote_bits) { #endif // SIMDJSON_AVOID_CLMUL #ifdef SIMDJSON_UTF8VALIDATE -// some hack to bypass the impossibily to overload the check_utf8() specialized template +// Holds the state required to perform check_utf8(). template -struct check_utf8_helper; +struct utf8_checking_state; #ifdef __AVX2__ template<> -struct check_utf8_helper +struct utf8_checking_state { __m256i has_error = _mm256_setzero_si256(); avx_processed_utf_bytes previous { - _mm256_setzero_si256(), - _mm256_setzero_si256(), - _mm256_setzero_si256() + _mm256_setzero_si256(), // rawbytes + _mm256_setzero_si256(), // high_nibbles + _mm256_setzero_si256() // carried_continuations }; }; #endif #if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64)) template<> -struct check_utf8_helper +struct utf8_checking_state { __m128i has_error = _mm_setzero_si128(); processed_utf_bytes previous { - _mm_setzero_si128(), - _mm_setzero_si128(), - _mm_setzero_si128() + _mm_setzero_si128(), // rawbytes + _mm_setzero_si128(), // high_nibbles + _mm_setzero_si128() // carried_continuations }; }; #endif template -void check_utf8(simd_input in, check_utf8_helper& helper); +void check_utf8(simd_input in, utf8_checking_state& state); #ifdef __AVX2__ template<> really_inline void check_utf8(simd_input in, - check_utf8_helper& helper) { + utf8_checking_state& state) { __m256i highbit = _mm256_set1_epi8(0x80); if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), highbit)) == 1) { // it is ascii, we just check continuation - helper.has_error = _mm256_or_si256( + state.has_error = _mm256_or_si256( _mm256_cmpgt_epi8( - helper.previous.carried_continuations, + state.previous.carried_continuations, _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)), - helper.has_error); + state.has_error); } else { // it is not ascii so we have to do heavy work - helper.previous = avxcheckUTF8Bytes(in.lo, &(helper.previous), &(helper.has_error)); - helper.previous = avxcheckUTF8Bytes(in.hi, &(helper.previous), &(helper.has_error)); + state.previous = avxcheckUTF8Bytes(in.lo, &(state.previous), &(state.has_error)); + state.previous = avxcheckUTF8Bytes(in.hi, &(state.previous), &(state.has_error)); } } #endif //__AVX2__ @@ -248,51 +248,51 @@ void check_utf8(simd_input in, #if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64)) template<> really_inline void check_utf8(simd_input in, - check_utf8_helper& helper) { + utf8_checking_state& state) { __m128i highbit = _mm_set1_epi8(0x80); if ((_mm_testz_si128(_mm_or_si128(in.v0, in.v1), highbit)) == 1) { // it is ascii, we just check continuation - helper.has_error = _mm_or_si128( + state.has_error = _mm_or_si128( _mm_cmpgt_epi8( - helper.previous.carried_continuations, + state.previous.carried_continuations, _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)), - helper.has_error); + state.has_error); } else { // it is not ascii so we have to do heavy work - helper.previous = checkUTF8Bytes(in.v0, &(helper.previous), &(helper.has_error)); - helper.previous = checkUTF8Bytes(in.v1, &(helper.previous), &(helper.has_error)); + state.previous = checkUTF8Bytes(in.v0, &(state.previous), &(state.has_error)); + state.previous = checkUTF8Bytes(in.v1, &(state.previous), &(state.has_error)); } if ((_mm_testz_si128(_mm_or_si128(in.v2, in.v3), highbit)) == 1) { // it is ascii, we just check continuation - helper.has_error = _mm_or_si128( + state.has_error = _mm_or_si128( _mm_cmpgt_epi8( - helper.previous.carried_continuations, + state.previous.carried_continuations, _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)), - helper.has_error); + state.has_error); } else { // it is not ascii so we have to do heavy work - helper.previous = checkUTF8Bytes(in.v2, &(helper.previous), &(helper.has_error)); - helper.previous = checkUTF8Bytes(in.v3, &(helper.previous), &(helper.has_error)); + state.previous = checkUTF8Bytes(in.v2, &(state.previous), &(state.has_error)); + state.previous = checkUTF8Bytes(in.v3, &(state.previous), &(state.has_error)); } } #endif // __SSE4_2 // Checks if the utf8 validation has found any error. template -errorValues check_utf8_errors(check_utf8_helper& helper); +errorValues check_utf8_errors(utf8_checking_state& state); #ifdef __AVX2__ template<> really_inline -errorValues check_utf8_errors(check_utf8_helper& helper) { - return _mm256_testz_si256(helper.has_error, helper.has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS; +errorValues check_utf8_errors(utf8_checking_state& state) { + return _mm256_testz_si256(state.has_error, state.has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS; } #endif #if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64)) template<> really_inline -errorValues check_utf8_errors(check_utf8_helper& helper) { - return _mm_testz_si128(helper.has_error, helper.has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS; +errorValues check_utf8_errors(utf8_checking_state& state) { + return _mm_testz_si128(state.has_error, state.has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS; } #endif #endif // SIMDJSON_UTF8VALIDATE @@ -368,7 +368,6 @@ uint64_t cmp_mask_against_input(simd_input helper; + utf8_checking_state state; #endif // we have padded the input out to 64 byte multiple with the remainder being @@ -977,7 +976,7 @@ WARN_UNUSED #endif simd_input in = fill_input(buf+idx); #ifdef SIMDJSON_UTF8VALIDATE - check_utf8(in, helper); + check_utf8(in, state); #endif // detect odd sequences of backslashes uint64_t odd_ends = find_odd_backslash_sequences( @@ -1012,7 +1011,7 @@ WARN_UNUSED memcpy(tmpbuf, buf + idx, len - idx); simd_input in = fill_input(tmpbuf); #ifdef SIMDJSON_UTF8VALIDATE - check_utf8(in, helper); + check_utf8(in, state); #endif // detect odd sequences of backslashes @@ -1069,7 +1068,7 @@ WARN_UNUSED return simdjson::UNESCAPED_CHARS; } #ifdef SIMDJSON_UTF8VALIDATE - return check_utf8_errors(helper); + return check_utf8_errors(state); #else return simdjson::SUCCESS; #endif From a1f692408d9a3846e513ee63d57e6c18533c092a Mon Sep 17 00:00:00 2001 From: ioioioio Date: Fri, 5 Jul 2019 11:38:32 -0400 Subject: [PATCH 9/9] Adding Sunny Gleason to the contributors list --- CONTRIBUTORS | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 35fb3b30..9b11cbec 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -20,5 +20,6 @@ Tom Dyson Ihor Dotsenko Alexey Milovidov Chang Liu +Sunny Gleason # if you have contributed to the project and your name does not # appear in this list, please let us know!