diff --git a/.drone.yml b/.drone.yml index 9e8e3c02..8c6ad18c 100644 --- a/.drone.yml +++ b/.drone.yml @@ -22,7 +22,7 @@ steps: image: gcc:8 environment: CHECKPERF_REPOSITORY: https://github.com/lemire/simdjson - commands: [ make checkperf ] + commands: [ cat /proc/cpuinfo, make checkperf ] --- kind: pipeline name: x64-build @@ -72,7 +72,7 @@ steps: image: gcc:8 environment: CHECKPERF_REPOSITORY: https://github.com/lemire/simdjson - commands: [ make checkperf ] + commands: [ cat /proc/cpuinfo, make checkperf ] --- kind: pipeline name: arm64-build diff --git a/Makefile b/Makefile index 03fd64df..45ce1aee 100644 --- a/Makefile +++ b/Makefile @@ -64,7 +64,7 @@ COMPARISONEXECUTABLES=minifiercompetition parsingcompetition parseandstatcompeti SUPPLEMENTARYEXECUTABLES=parse_noutf8validation parse_nonumberparsing parse_nostringparsing # Load headers and sources -LIBHEADERS=src/simdprune_tables.h src/numberparsing.h src/jsoncharutils.h src/arm64/simd_input.h src/arm64/simdutf8check.h src/arm64/stage1_find_marks.h src/arm64/stage2_build_tape.h src/arm64/stringparsing.h src/generic/stage1_find_marks.h src/generic/stage2_build_tape.h src/generic/stringparsing.h src/haswell/simd_input.h src/haswell/simdutf8check.h src/haswell/stage1_find_marks.h src/haswell/stage2_build_tape.h src/haswell/stringparsing.h src/westmere/simd_input.h src/westmere/simdutf8check.h src/westmere/stage1_find_marks.h src/westmere/stage2_build_tape.h src/westmere/stringparsing.h +LIBHEADERS=src/simdprune_tables.h src/numberparsing.h src/jsoncharutils.h src/arm64/bitmask.h src/arm64/simd.h src/arm64/stage1_find_marks.h src/arm64/stage2_build_tape.h src/arm64/stringparsing.h src/generic/stage1_find_marks.h src/generic/stage2_build_tape.h src/generic/stringparsing.h src/haswell/bitmask.h src/haswell/simd.h src/generic/simdutf8check.h src/haswell/stage1_find_marks.h src/haswell/stage2_build_tape.h src/haswell/stringparsing.h src/westmere/bitmask.h src/westmere/simd.h src/westmere/stage1_find_marks.h src/westmere/stage2_build_tape.h src/westmere/stringparsing.h PUBHEADERS=include/simdjson/common_defs.h include/simdjson/isadetection.h include/simdjson/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonminifier.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/parsedjson.h include/simdjson/parsedjsoniterator.h include/simdjson/portability.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h include/simdjson/stage1_find_marks.h include/simdjson/stage2_build_tape.h HEADERS=$(PUBHEADERS) $(LIBHEADERS) @@ -120,12 +120,12 @@ run_issue150_sh: allparserscheckfile run_testjson2json_sh: minify json2json ./scripts/testjson2json.sh -test: run_basictests run_numberparsingcheck run_integer_tests run_stringparsingcheck run_jsoncheck run_pointercheck run_testjson2json_sh run_issue150_sh +test: run_basictests run_jsoncheck run_numberparsingcheck run_integer_tests run_stringparsingcheck run_pointercheck run_testjson2json_sh run_issue150_sh @echo "It looks like the code is good!" -quiettest: run_basictests run_numberparsingcheck run_integer_tests run_stringparsingcheck run_jsoncheck run_pointercheck run_testjson2json_sh run_issue150_sh +quiettest: run_basictests run_jsoncheck run_numberparsingcheck run_integer_tests run_stringparsingcheck run_pointercheck run_testjson2json_sh run_issue150_sh -quicktests: run_basictests run_numberparsingcheck run_integer_tests run_stringparsingcheck run_jsoncheck run_pointercheck +quicktests: run_basictests run_jsoncheck run_numberparsingcheck run_integer_tests run_stringparsingcheck run_pointercheck slowtests: run_testjson2json_sh run_issue150_sh diff --git a/amalgamation.sh b/amalgamation.sh index da9c8632..b8dbce8e 100755 --- a/amalgamation.sh +++ b/amalgamation.sh @@ -20,12 +20,12 @@ $SCRIPTPATH/src/simdjson.cpp $SCRIPTPATH/src/jsonioutil.cpp $SCRIPTPATH/src/jsonminifier.cpp $SCRIPTPATH/src/jsonparser.cpp -$SCRIPTPATH/src/arm64/simd_input.h -$SCRIPTPATH/src/haswell/simd_input.h -$SCRIPTPATH/src/westmere/simd_input.h -$SCRIPTPATH/src/arm64/simdutf8check.h -$SCRIPTPATH/src/haswell/simdutf8check.h -$SCRIPTPATH/src/westmere/simdutf8check.h +$SCRIPTPATH/src/arm64/bitmask.h +$SCRIPTPATH/src/haswell/bitmask.h +$SCRIPTPATH/src/westmere/bitmask.h +$SCRIPTPATH/src/arm64/simd.h +$SCRIPTPATH/src/haswell/simd.h +$SCRIPTPATH/src/westmere/simd.h $SCRIPTPATH/src/arm64/stage1_find_marks.h $SCRIPTPATH/src/haswell/stage1_find_marks.h $SCRIPTPATH/src/westmere/stage1_find_marks.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a16066e5..96d22a28 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -37,21 +37,22 @@ set(SIMDJSON_SRC_HEADERS jsoncharutils.h numberparsing.h simdprune_tables.h - arm64/simd_input.h - arm64/simdutf8check.h + arm64/bitmask.h + arm64/simd.h arm64/stage1_find_marks.h arm64/stage2_build_tape.h arm64/stringparsing.h generic/stage1_find_marks.h generic/stage2_build_tape.h generic/stringparsing.h - haswell/simd_input.h - haswell/simdutf8check.h + generic/simdutf8check.h + haswell/bitmask.h + haswell/simd.h haswell/stage1_find_marks.h haswell/stage2_build_tape.h haswell/stringparsing.h - westmere/simd_input.h - westmere/simdutf8check.h + westmere/bitmask.h + westmere/simd.h westmere/stage1_find_marks.h westmere/stage2_build_tape.h westmere/stringparsing.h diff --git a/src/arm64/bitmask.h b/src/arm64/bitmask.h new file mode 100644 index 00000000..4fd1e2fa --- /dev/null +++ b/src/arm64/bitmask.h @@ -0,0 +1,38 @@ +#ifndef SIMDJSON_ARM64_BITMASK_H +#define SIMDJSON_ARM64_BITMASK_H + +#include "simdjson/portability.h" + +#ifdef IS_ARM64 + +#include "haswell/bitmask.h" +#include "simdjson/common_defs.h" + +namespace simdjson::arm64 { + +// +// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered. +// +// For example, prefix_xor(00100100) == 00011100 +// +really_inline uint64_t prefix_xor(uint64_t bitmask) { + +#ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension + return vmull_p64(-1ULL, bitmask); +#else + bitmask ^= bitmask << 1; + bitmask ^= bitmask << 2; + bitmask ^= bitmask << 4; + bitmask ^= bitmask << 8; + bitmask ^= bitmask << 16; + bitmask ^= bitmask << 32; + return bitmask; +#endif + +} + +} // namespace simdjson::arm64 +UNTARGET_REGION + +#endif // IS_ARM64 +#endif diff --git a/src/arm64/simd.h b/src/arm64/simd.h new file mode 100644 index 00000000..e14da8e1 --- /dev/null +++ b/src/arm64/simd.h @@ -0,0 +1,316 @@ +#ifndef SIMDJSON_ARM64_SIMD_H +#define SIMDJSON_ARM64_SIMD_H + +#include "simdjson/portability.h" + +#ifdef IS_ARM64 + +#include "simdjson/common_defs.h" +#include "simdjson/simdjson.h" + +namespace simdjson::arm64::simd { + + template + struct simd8; + + // + // Base class of simd8 and simd8, both of which use uint8x16_t internally. + // + template> + struct base_u8 { + uint8x16_t value; + static const int SIZE = sizeof(value); + + // Conversion from/to SIMD register + really_inline base_u8(const uint8x16_t _value) : value(_value) {} + really_inline operator const uint8x16_t&() const { return this->value; } + really_inline operator uint8x16_t&() { return this->value; } + + // Bit operations + really_inline simd8 operator|(const simd8 other) const { return vorrq_u8(*this, other); } + really_inline simd8 operator&(const simd8 other) const { return vandq_u8(*this, other); } + really_inline simd8 operator^(const simd8 other) const { return veorq_u8(*this, other); } + really_inline simd8 bit_andnot(const simd8 other) const { return vbicq_u8(*this, other); } + really_inline simd8 operator~() const { return *this ^ 0xFFu; } + really_inline simd8& operator|=(const simd8 other) { auto this_cast = (simd8*)this; *this_cast = *this_cast | other; return *this_cast; } + really_inline simd8& operator&=(const simd8 other) { auto this_cast = (simd8*)this; *this_cast = *this_cast & other; return *this_cast; } + really_inline simd8& operator^=(const simd8 other) { auto this_cast = (simd8*)this; *this_cast = *this_cast ^ other; return *this_cast; } + + really_inline Mask operator==(const simd8 other) const { return vceqq_u8(*this, other); } + + template + really_inline simd8 prev(const simd8 prev_chunk) const { + return vextq_u8(prev_chunk, *this, 16 - N); + } + }; + + // SIMD byte mask type (returned by things like eq and gt) + template<> + struct simd8: base_u8 { + typedef uint32_t bitmask_t; + + static really_inline simd8 splat(bool _value) { return vmovq_n_u8(-(!!_value)); } + + really_inline simd8(const uint8x16_t _value) : base_u8(_value) {} + // False constructor + really_inline simd8() : simd8(vdupq_n_u8(0)) {} + // Splat constructor + really_inline simd8(bool _value) : simd8(splat(_value)) {} + + really_inline simd8::bitmask_t to_bitmask() const { + const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; + auto minput = *this & bit_mask; + uint8x16_t tmp = vpaddq_u8(minput, minput); + tmp = vpaddq_u8(tmp, tmp); + tmp = vpaddq_u8(tmp, tmp); + return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0); + } + really_inline bool any() const { return vmaxvq_u8(*this) != 0; } + }; + + // Unsigned bytes + template<> + struct simd8: base_u8 { + static really_inline uint8x16_t splat(uint8_t _value) { return vmovq_n_u8(_value); } + static really_inline uint8x16_t zero() { return vdupq_n_u8(0); } + static really_inline uint8x16_t load(const uint8_t* values) { return vld1q_u8(values); } + + really_inline simd8(const uint8x16_t _value) : base_u8(_value) {} + // Zero constructor + really_inline simd8() : simd8(zero()) {} + // Array constructor + really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {} + // Splat constructor + really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Member-by-member initialization + really_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) : simd8(uint8x16_t{ + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + }) {} + + // Store to array + really_inline void store(uint8_t dst[16]) { return vst1q_u8(dst, *this); } + + // Saturated math + really_inline simd8 saturating_add(const simd8 other) const { return vqaddq_u8(*this, other); } + really_inline simd8 saturating_sub(const simd8 other) const { return vqsubq_u8(*this, other); } + + // Addition/subtraction are the same for signed and unsigned + really_inline simd8 operator+(const simd8 other) const { return vaddq_u8(*this, other); } + really_inline simd8 operator-(const simd8 other) const { return vsubq_u8(*this, other); } + really_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *this; } + really_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *this; } + + // Order-specific operations + really_inline simd8 max(const simd8 other) const { return vmaxq_u8(*this, other); } + really_inline simd8 min(const simd8 other) const { return vminq_u8(*this, other); } + really_inline simd8 operator<=(const simd8 other) const { return vcleq_u8(*this, other); } + + // Bit-specific operations + really_inline simd8 any_bits_set(simd8 bits) const { return vtstq_u8(*this, bits); } + really_inline bool any_bits_set_anywhere() const { return vmaxvq_u8(*this) != 0; } + really_inline bool any_bits_set_anywhere(simd8 bits) const { return (*this & bits).any_bits_set_anywhere(); } + template + really_inline simd8 shr() const { return vshrq_n_u8(*this, N); } + template + really_inline simd8 shl() const { return vshlq_n_u8(*this, N); } + + // Perform a lookup assuming no value is larger than 16 + template + really_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + simd8 lookup_table( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + ); + return lookup_table.apply_lookup_16_to(*this); + } + + // Perform a lookup of the lower 4 bits + template + really_inline simd8 lookup_lower_4_bits( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return (*this & 0xF).lookup_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + ); + } + + really_inline simd8 apply_lookup_16_to(const simd8 original) { + return vqtbl1q_u8(*this, original); + } + }; + + // Signed bytes + template<> + struct simd8 { + int8x16_t value; + + static really_inline simd8 splat(int8_t _value) { return vmovq_n_s8(_value); } + static really_inline simd8 zero() { return vdupq_n_s8(0); } + static really_inline simd8 load(const int8_t values[16]) { return vld1q_s8(values); } + + // Conversion from/to SIMD register + really_inline simd8(const int8x16_t _value) : value{_value} {} + really_inline operator const int8x16_t&() const { return this->value; } + really_inline operator int8x16_t&() { return this->value; } + + // Zero constructor + really_inline simd8() : simd8(zero()) {} + // Splat constructor + really_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + really_inline simd8(const int8_t* values) : simd8(load(values)) {} + // Member-by-member initialization + really_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) : simd8(int8x16_t{ + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + }) {} + + // Store to array + really_inline void store(int8_t dst[16]) { return vst1q_s8(dst, *this); } + + // Explicit conversion to/from unsigned + really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {} + really_inline explicit operator simd8() const { return vreinterpretq_u8_s8(*this); } + + // Math + really_inline simd8 operator+(const simd8 other) const { return vaddq_s8(*this, other); } + really_inline simd8 operator-(const simd8 other) const { return vsubq_s8(*this, other); } + really_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *this; } + really_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *this; } + + // Order-sensitive comparisons + really_inline simd8 max(const simd8 other) const { return vmaxq_s8(*this, other); } + really_inline simd8 min(const simd8 other) const { return vminq_s8(*this, other); } + really_inline simd8 operator>(const simd8 other) const { return vcgtq_s8(*this, other); } + really_inline simd8 operator==(const simd8 other) const { return vceqq_s8(*this, other); } + + template + really_inline simd8 prev(const simd8 prev_chunk) const { + return vextq_s8(prev_chunk, *this, 16 - N); + } + + // Perform a lookup of the lower 4 bits + template + really_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return simd8(*this).lookup_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + ); + } + + really_inline simd8 apply_lookup_16_to(const simd8 original) { + return vqtbl1q_s8(*this, original); + } + }; + + template + struct simd8x64 { + const simd8 chunks[4]; + + really_inline simd8x64() : chunks{simd8(), simd8(), simd8(), simd8()} {} + really_inline simd8x64(const simd8 chunk0, const simd8 chunk1, const simd8 chunk2, const simd8 chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} + really_inline simd8x64(const T ptr[64]) : chunks{simd8::load(ptr), simd8::load(ptr+16), simd8::load(ptr+32), simd8::load(ptr+48)} {} + + really_inline void store(T ptr[64]) { + this->chunks[0].store(ptr); + this->chunks[0].store(ptr+16); + this->chunks[0].store(ptr+32); + this->chunks[0].store(ptr+48); + } + + template + really_inline void each(F const& each_chunk) const + { + each_chunk(this->chunks[0]); + each_chunk(this->chunks[1]); + each_chunk(this->chunks[2]); + each_chunk(this->chunks[3]); + } + + template + really_inline simd8x64 map(F const& map_chunk) const { + return simd8x64( + map_chunk(this->chunks[0]), + map_chunk(this->chunks[1]), + map_chunk(this->chunks[2]), + map_chunk(this->chunks[3]) + ); + } + + template + really_inline simd8x64 map(const simd8x64 b, F const& map_chunk) const { + return simd8x64( + map_chunk(this->chunks[0], b.chunks[0]), + map_chunk(this->chunks[1], b.chunks[1]), + map_chunk(this->chunks[2], b.chunks[2]), + map_chunk(this->chunks[3], b.chunks[3]) + ); + } + + template + really_inline simd8 reduce(F const& reduce_pair) const { + return reduce_pair( + reduce_pair(this->chunks[0], this->chunks[1]), + reduce_pair(this->chunks[2], this->chunks[3]) + ); + } + + really_inline uint64_t to_bitmask() const { + const uint8x16_t bit_mask = { + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + }; + // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one. + uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask); + uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask); + sum0 = vpaddq_u8(sum0, sum1); + sum0 = vpaddq_u8(sum0, sum0); + return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); + } + + really_inline simd8x64 bit_or(const T m) const { + const simd8 mask = simd8::splat(m); + return this->map( [&](auto a) { return a | mask; } ); + } + + really_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return this->map( [&](auto a) { return a == mask; } ).to_bitmask(); + } + + really_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return this->map( [&](auto a) { return a <= mask; } ).to_bitmask(); + } + + }; // struct simd8x64 + +} // namespace simdjson::arm64::simd + +#endif // IS_ARM64 +#endif // SIMDJSON_ARM64_SIMD_H diff --git a/src/arm64/simd_input.h b/src/arm64/simd_input.h deleted file mode 100644 index 458f6a52..00000000 --- a/src/arm64/simd_input.h +++ /dev/null @@ -1,119 +0,0 @@ -#ifndef SIMDJSON_ARM64_SIMD_INPUT_H -#define SIMDJSON_ARM64_SIMD_INPUT_H - -#include "simdjson/common_defs.h" -#include "simdjson/portability.h" -#include "simdjson/simdjson.h" - -#ifdef IS_ARM64 - -namespace simdjson::arm64 { - -really_inline uint16_t neon_movemask(uint8x16_t input) { - const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; - uint8x16_t minput = vandq_u8(input, bit_mask); - uint8x16_t tmp = vpaddq_u8(minput, minput); - tmp = vpaddq_u8(tmp, tmp); - tmp = vpaddq_u8(tmp, tmp); - return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0); -} - -really_inline uint64_t neon_movemask_bulk(uint8x16_t p0, uint8x16_t p1, - uint8x16_t p2, uint8x16_t p3) { - const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; - uint8x16_t t0 = vandq_u8(p0, bit_mask); - uint8x16_t t1 = vandq_u8(p1, bit_mask); - uint8x16_t t2 = vandq_u8(p2, bit_mask); - uint8x16_t t3 = vandq_u8(p3, bit_mask); - uint8x16_t sum0 = vpaddq_u8(t0, t1); - uint8x16_t sum1 = vpaddq_u8(t2, t3); - sum0 = vpaddq_u8(sum0, sum1); - sum0 = vpaddq_u8(sum0, sum0); - return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); -} - -struct simd_input { - const uint8x16_t chunks[4]; - - really_inline simd_input() - : chunks{uint8x16_t(), uint8x16_t(), uint8x16_t(), uint8x16_t() } {} - - really_inline simd_input(const uint8x16_t chunk0, const uint8x16_t chunk1, const uint8x16_t chunk2, const uint8x16_t chunk3) - : chunks{chunk0, chunk1, chunk2, chunk3 } {} - - really_inline simd_input(const uint8_t *ptr) - : chunks{ - vld1q_u8(ptr + 0*16), - vld1q_u8(ptr + 1*16), - vld1q_u8(ptr + 2*16), - vld1q_u8(ptr + 3*16) - } {} - - template - really_inline void each(F const& each_chunk) const { - each_chunk(this->chunks[0]); - each_chunk(this->chunks[1]); - each_chunk(this->chunks[2]); - each_chunk(this->chunks[3]); - } - - template - really_inline simd_input map(F const& map_chunk) const { - return simd_input( - map_chunk(this->chunks[0]), - map_chunk(this->chunks[1]), - map_chunk(this->chunks[2]), - map_chunk(this->chunks[3]) - ); - } - - template - really_inline simd_input map(simd_input b, F const& map_chunk) const { - return simd_input( - map_chunk(this->chunks[0], b.chunks[0]), - map_chunk(this->chunks[1], b.chunks[1]), - map_chunk(this->chunks[2], b.chunks[2]), - map_chunk(this->chunks[3], b.chunks[3]) - ); - } - - template - really_inline uint8x16_t reduce(F const& reduce_pair) const { - uint8x16_t r01 = reduce_pair(this->chunks[0], this->chunks[1]); - uint8x16_t r23 = reduce_pair(this->chunks[2], this->chunks[3]); - return reduce_pair(r01, r23); - } - - really_inline uint64_t to_bitmask() const { - return neon_movemask_bulk(this->chunks[0], this->chunks[1], this->chunks[2], this->chunks[3]); - } - - really_inline simd_input bit_or(const uint8_t m) const { - const uint8x16_t mask = vmovq_n_u8(m); - return this->map( [&](auto a) { - return vorrq_u8(a, mask); - }); - } - - really_inline uint64_t eq(const uint8_t m) const { - const uint8x16_t mask = vmovq_n_u8(m); - return this->map( [&](auto a) { - return vceqq_u8(a, mask); - }).to_bitmask(); - } - - really_inline uint64_t lteq(const uint8_t m) const { - const uint8x16_t mask = vmovq_n_u8(m); - return this->map( [&](auto a) { - return vcleq_u8(a, mask); - }).to_bitmask(); - } - -}; // struct simd_input - -} // namespace simdjson::arm64 - -#endif // IS_ARM64 -#endif // SIMDJSON_ARM64_SIMD_INPUT_H diff --git a/src/arm64/simdutf8check.h b/src/arm64/simdutf8check.h deleted file mode 100644 index 4cdfd79f..00000000 --- a/src/arm64/simdutf8check.h +++ /dev/null @@ -1,212 +0,0 @@ -// From https://github.com/cyb70289/utf8/blob/master/lemire-neon.c -// Adapted from https://github.com/lemire/fastvalidate-utf-8 - -#ifndef SIMDJSON_ARM64_SIMDUTF8CHECK_H -#define SIMDJSON_ARM64_SIMDUTF8CHECK_H - -// TODO this is different from IS_ARM64 in portability.h, which we use in other places ... -#if defined(_ARM_NEON) || defined(__aarch64__) || \ - (defined(_MSC_VER) && defined(_M_ARM64)) - -#include "simdjson/simdjson.h" -#include "arm64/simd_input.h" -#include -#include -#include -#include -#include -#include - -/* - * legal utf-8 byte sequence - * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94 - * - * Code Points 1st 2s 3s 4s - * U+0000..U+007F 00..7F - * U+0080..U+07FF C2..DF 80..BF - * U+0800..U+0FFF E0 A0..BF 80..BF - * U+1000..U+CFFF E1..EC 80..BF 80..BF - * U+D000..U+D7FF ED 80..9F 80..BF - * U+E000..U+FFFF EE..EF 80..BF 80..BF - * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF - * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF - * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF - * - */ -namespace simdjson::arm64 { - -static const int8_t _nibbles[] = { - 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) - 0, 0, 0, 0, // 10xx (continuation) - 2, 2, // 110x - 3, // 1110 - 4, // 1111, next should be 0 (not checked here) -}; - -static const int8_t _initial_mins[] = { - -128, -128, -128, -128, -128, -128, - -128, -128, -128, -128, -128, -128, // 10xx => false - (int8_t)0xC2, -128, // 110x - (int8_t)0xE1, // 1110 - (int8_t)0xF1, -}; - -static const int8_t _second_mins[] = { - -128, -128, -128, -128, -128, -128, - -128, -128, -128, -128, -128, -128, // 10xx => false - 127, 127, // 110x => true - (int8_t)0xA0, // 1110 - (int8_t)0x90, -}; - -struct processed_utf_bytes { - int8x16_t raw_bytes; - int8x16_t high_nibbles; - int8x16_t carried_continuations; -}; - -struct utf8_checker { - int8x16_t has_error{vdupq_n_s8(0)}; - processed_utf_bytes previous{vdupq_n_s8(0), vdupq_n_s8(0), vdupq_n_s8(0)}; - - really_inline void add_errors(int8x16_t errors) { - this->has_error = vorrq_s8(this->has_error, errors); - } - - // all byte values must be no larger than 0xF4 - really_inline void check_smaller_than_0xF4(int8x16_t current_bytes) { - // unsigned, saturates to 0 below max - this->add_errors( vreinterpretq_s8_u8(vqsubq_u8( - vreinterpretq_u8_s8(current_bytes), vdupq_n_u8(0xF4))) ); - } - - really_inline int8x16_t continuation_lengths(int8x16_t high_nibbles) { - return vqtbl1q_s8(vld1q_s8(_nibbles), vreinterpretq_u8_s8(high_nibbles)); - } - - really_inline int8x16_t carry_continuations(int8x16_t initial_lengths) { - int8x16_t right1 = vreinterpretq_s8_u8(vqsubq_u8( - vreinterpretq_u8_s8(vextq_s8(this->previous.carried_continuations, initial_lengths, 16 - 1)), - vdupq_n_u8(1))); - int8x16_t sum = vaddq_s8(initial_lengths, right1); - - int8x16_t right2 = vreinterpretq_s8_u8( - vqsubq_u8(vreinterpretq_u8_s8(vextq_s8(this->previous.carried_continuations, sum, 16 - 2)), - vdupq_n_u8(2))); - return vaddq_s8(sum, right2); - } - - really_inline void check_continuations(int8x16_t initial_lengths, int8x16_t carries) { - - // overlap || underlap - // carry > length && length > 0 || !(carry > length) && !(length > 0) - // (carries > length) == (lengths > 0) - uint8x16_t overunder = vceqq_u8(vcgtq_s8(carries, initial_lengths), - vcgtq_s8(initial_lengths, vdupq_n_s8(0))); - - this->add_errors( vreinterpretq_s8_u8(overunder) ); - } - - // when 0xED is found, next byte must be no larger than 0x9F - // when 0xF4 is found, next byte must be no larger than 0x8F - // next byte must be continuation, ie sign bit is set, so signed < is ok - really_inline void check_first_continuation_max(int8x16_t current_bytes, int8x16_t off1_current_bytes) { - uint8x16_t maskED = vceqq_s8(off1_current_bytes, vdupq_n_s8(0xED)); - uint8x16_t maskF4 = vceqq_s8(off1_current_bytes, vdupq_n_s8(0xF4)); - - uint8x16_t badfollowED = vandq_u8(vcgtq_s8(current_bytes, vdupq_n_s8(0x9F)), maskED); - uint8x16_t badfollowF4 = vandq_u8(vcgtq_s8(current_bytes, vdupq_n_s8(0x8F)), maskF4); - - this->add_errors( vreinterpretq_s8_u8(vorrq_u8(badfollowED, badfollowF4)) ); - } - - // map off1_hibits => error condition - // hibits off1 cur - // C => < C2 && true - // E => < E1 && < A0 - // F => < F1 && < 90 - // else false && false - really_inline void check_overlong(int8x16_t current_bytes, - int8x16_t off1_current_bytes, - int8x16_t high_nibbles) { - int8x16_t off1_high_nibbles = vextq_s8(this->previous.high_nibbles, high_nibbles, 16 - 1); - int8x16_t initial_mins = - vqtbl1q_s8(vld1q_s8(_initial_mins), vreinterpretq_u8_s8(off1_high_nibbles)); - - uint8x16_t initial_under = vcgtq_s8(initial_mins, off1_current_bytes); - - int8x16_t second_mins = vqtbl1q_s8(vld1q_s8(_second_mins), vreinterpretq_u8_s8(off1_high_nibbles)); - uint8x16_t second_under = vcgtq_s8(second_mins, current_bytes); - this->add_errors( vreinterpretq_s8_u8(vandq_u8(initial_under, second_under)) ); - } - - really_inline int8x16_t count_nibbles(int8x16_t bytes) { - return vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(bytes), 4)); - } - - // check whether the current bytes are valid UTF-8 - // at the end of the function, previous gets updated - really_inline void check_utf8_bytes(int8x16_t current_bytes) { - struct processed_utf_bytes pb; - pb.raw_bytes = current_bytes; - pb.high_nibbles = this->count_nibbles(current_bytes); - - this->check_smaller_than_0xF4(current_bytes); - - int8x16_t initial_lengths = this->continuation_lengths(pb.high_nibbles); - - pb.carried_continuations = this->carry_continuations(initial_lengths); - - this->check_continuations(initial_lengths, pb.carried_continuations); - - int8x16_t off1_current_bytes = vextq_s8(this->previous.raw_bytes, pb.raw_bytes, 16 - 1); - this->check_first_continuation_max(current_bytes, off1_current_bytes); - - this->check_overlong(current_bytes, off1_current_bytes, pb.high_nibbles); - this->previous = pb; - } - - // Checks that all bytes are ascii - really_inline bool check_ascii_neon(simd_input in) { - // checking if the most significant bit is always equal to 0. - uint8x16_t high_bit = vdupq_n_u8(0x80); - uint8x16_t any_bits_on = in.reduce([&](auto a, auto b) { - return vorrq_u8(a, b); - }); - uint8x16_t high_bit_on = vandq_u8(any_bits_on, high_bit); - uint64x2_t v64 = vreinterpretq_u64_u8(high_bit_on); - uint32x2_t v32 = vqmovn_u64(v64); - uint64x1_t result = vreinterpret_u64_u32(v32); - return vget_lane_u64(result, 0) == 0; - } - - really_inline void check_next_input(simd_input in) { - if (check_ascii_neon(in)) { - // All bytes are ascii. Therefore the byte that was just before must be - // ascii too. We only check the byte that was just before simd_input. Nines - // are arbitrary values. - const int8x16_t verror = - (int8x16_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1}; - this->add_errors(vreinterpretq_s8_u8( - vcgtq_s8(this->previous.carried_continuations, verror))); - } else { - // it is not ascii so we have to do heavy work - in.each([&](auto _in) { - this->check_utf8_bytes(vreinterpretq_s8_u8(_in)); - }); - } - } - - really_inline ErrorValues errors() { - uint64x2_t v64 = vreinterpretq_u64_s8(this->has_error); - uint32x2_t v32 = vqmovn_u64(v64); - uint64x1_t result = vreinterpret_u64_u32(v32); - return vget_lane_u64(result, 0) != 0 ? simdjson::UTF8_ERROR - : simdjson::SUCCESS; - } - -}; // struct utf8_checker - -} // namespace simdjson::arm64 -#endif -#endif diff --git a/src/arm64/stage1_find_marks.h b/src/arm64/stage1_find_marks.h index 672dd908..751477b7 100644 --- a/src/arm64/stage1_find_marks.h +++ b/src/arm64/stage1_find_marks.h @@ -5,49 +5,31 @@ #ifdef IS_ARM64 -#include "arm64/simd_input.h" -#include "arm64/simdutf8check.h" +#include "arm64/bitmask.h" +#include "arm64/simd.h" #include "simdjson/stage1_find_marks.h" namespace simdjson::arm64 { -really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) { - -#ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension - return vmull_p64(-1ULL, quote_bits); -#else - return portable_compute_quote_mask(quote_bits); -#endif -} +using namespace simd; really_inline void find_whitespace_and_operators( - const simd_input in, - uint64_t &whitespace, uint64_t &op) { - const uint8x16_t low_nibble_mask = - (uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0}; - const uint8x16_t high_nibble_mask = - (uint8x16_t){8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0}; - const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf); + const simd::simd8x64 in, + uint64_t &whitespace, uint64_t &op) { - auto v = in.map([&](auto chunk) { - uint8x16_t nib_lo = vandq_u8(chunk, low_nib_and_mask); - uint8x16_t nib_hi = vshrq_n_u8(chunk, 4); - uint8x16_t shuf_lo = vqtbl1q_u8(low_nibble_mask, nib_lo); - uint8x16_t shuf_hi = vqtbl1q_u8(high_nibble_mask, nib_hi); - return vandq_u8(shuf_lo, shuf_hi); + auto v = in.map([&](simd8 chunk) { + auto nib_lo = chunk & 0xf; + auto nib_hi = chunk.shr<4>(); + auto shuf_lo = nib_lo.lookup_16(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0); + auto shuf_hi = nib_hi.lookup_16(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0); + return shuf_lo & shuf_hi; }); - const uint8x16_t operator_shufti_mask = vmovq_n_u8(0x7); - op = v.map([&](auto _v) { - return vtstq_u8(_v, operator_shufti_mask); - }).to_bitmask(); - - const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18); - whitespace = v.map([&](auto _v) { - return vtstq_u8(_v, whitespace_shufti_mask); - }).to_bitmask(); + op = v.map([&](simd8 _v) { return _v.any_bits_set(0x7); }).to_bitmask(); + whitespace = v.map([&](simd8 _v) { return _v.any_bits_set(0x18); }).to_bitmask(); } +#include "generic/simdutf8check.h" #include "generic/stage1_find_marks.h" } // namespace simdjson::arm64 diff --git a/src/arm64/stringparsing.h b/src/arm64/stringparsing.h index b83ca7bd..cddb4c56 100644 --- a/src/arm64/stringparsing.h +++ b/src/arm64/stringparsing.h @@ -1,59 +1,42 @@ #ifndef SIMDJSON_ARM64_STRINGPARSING_H #define SIMDJSON_ARM64_STRINGPARSING_H +#include "simdjson/portability.h" + #ifdef IS_ARM64 +#include "arm64/simd.h" #include "simdjson/common_defs.h" #include "simdjson/parsedjson.h" #include "jsoncharutils.h" -#ifdef JSON_TEST_STRINGS -void found_string(const uint8_t *buf, const uint8_t *parsed_begin, - const uint8_t *parsed_end); -void found_bad_string(const uint8_t *buf); -#endif - namespace simdjson::arm64 { +using namespace simd; + // Holds backslashes and quotes locations. struct parse_string_helper { uint32_t bs_bits; uint32_t quote_bits; - really_inline uint32_t bytes_processed() const { return sizeof(uint8x16_t)*2; } + static const uint32_t BYTES_PROCESSED = 32; }; really_inline parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst) { // this can read up to 31 bytes beyond the buffer size, but we require // SIMDJSON_PADDING of padding - static_assert(2 * sizeof(uint8x16_t) - 1 <= SIMDJSON_PADDING); - uint8x16_t v0 = vld1q_u8(src); - uint8x16_t v1 = vld1q_u8(src + 16); - vst1q_u8(dst, v0); - vst1q_u8(dst + 16, v1); + static_assert(SIMDJSON_PADDING >= (parse_string_helper::BYTES_PROCESSED - 1)); + simd8 v0(src); + simd8 v1(src + sizeof(v0)); + v0.store(dst); + v1.store(dst + sizeof(v0)); - uint8x16_t bs_mask = vmovq_n_u8('\\'); - uint8x16_t qt_mask = vmovq_n_u8('"'); - const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; - uint8x16_t cmp_bs_0 = vceqq_u8(v0, bs_mask); - uint8x16_t cmp_bs_1 = vceqq_u8(v1, bs_mask); - uint8x16_t cmp_qt_0 = vceqq_u8(v0, qt_mask); - uint8x16_t cmp_qt_1 = vceqq_u8(v1, qt_mask); - - cmp_bs_0 = vandq_u8(cmp_bs_0, bit_mask); - cmp_bs_1 = vandq_u8(cmp_bs_1, bit_mask); - cmp_qt_0 = vandq_u8(cmp_qt_0, bit_mask); - cmp_qt_1 = vandq_u8(cmp_qt_1, bit_mask); - - uint8x16_t sum0 = vpaddq_u8(cmp_bs_0, cmp_bs_1); - uint8x16_t sum1 = vpaddq_u8(cmp_qt_0, cmp_qt_1); - sum0 = vpaddq_u8(sum0, sum1); - sum0 = vpaddq_u8(sum0, sum0); + // Getting a 64-bit bitmask is much cheaper than multiple 16-bit bitmasks on ARM; therefore, we + // smash them together into a 64-byte mask and get the bitmask from there. + uint64_t bs_and_quote = simd8x64(v0 == '\\', v1 == '\\', v0 == '"', v1 == '"').to_bitmask(); return { - vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0), // bs_bits - vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1) // quote_bits + static_cast(bs_and_quote), // bs_bits + static_cast(bs_and_quote >> 32) // quote_bits }; - } #include "generic/stringparsing.h" diff --git a/src/generic/simdutf8check.h b/src/generic/simdutf8check.h new file mode 100644 index 00000000..ad129ffd --- /dev/null +++ b/src/generic/simdutf8check.h @@ -0,0 +1,176 @@ +/* + * legal utf-8 byte sequence + * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94 + * + * Code Points 1st 2s 3s 4s + * U+0000..U+007F 00..7F + * U+0080..U+07FF C2..DF 80..BF + * U+0800..U+0FFF E0 A0..BF 80..BF + * U+1000..U+CFFF E1..EC 80..BF 80..BF + * U+D000..U+D7FF ED 80..9F 80..BF + * U+E000..U+FFFF EE..EF 80..BF 80..BF + * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF + * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF + * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF + * + */ + +// all byte values must be no larger than 0xF4 + +using namespace simd; + +struct processed_utf_bytes { + simd8 raw_bytes; + simd8 high_nibbles; + simd8 carried_continuations; +}; + +struct utf8_checker { + simd8 has_error; + processed_utf_bytes previous; + + // all byte values must be no larger than 0xF4 + really_inline void check_smaller_than_0xF4(simd8 current_bytes) { + // unsigned, saturates to 0 below max + this->has_error |= current_bytes.saturating_sub(0xF4u); + } + + really_inline simd8 continuation_lengths(simd8 high_nibbles) { + return high_nibbles.lookup_16( + 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) + 0, 0, 0, 0, // 10xx (continuation) + 2, 2, // 110x + 3, // 1110 + 4); // 1111, next should be 0 (not checked here) + } + + really_inline simd8 carry_continuations(simd8 initial_lengths) { + simd8 prev_carried_continuations = initial_lengths.prev(this->previous.carried_continuations); + simd8 right1 = simd8(simd8(prev_carried_continuations).saturating_sub(1)); + simd8 sum = initial_lengths + right1; + + simd8 prev2_carried_continuations = sum.prev<2>(this->previous.carried_continuations); + simd8 right2 = simd8(simd8(prev2_carried_continuations).saturating_sub(2)); + return sum + right2; + } + + really_inline void check_continuations(simd8 initial_lengths, simd8 carries) { + // overlap || underlap + // carry > length && length > 0 || !(carry > length) && !(length > 0) + // (carries > length) == (lengths > 0) + // (carries > current) == (current > 0) + this->has_error |= simd8( + (carries > initial_lengths) == (initial_lengths > simd8::zero())); + } + + really_inline void check_carried_continuations() { + static const int8_t last_1[32] = { + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 1 + }; + this->has_error |= simd8(this->previous.carried_continuations > simd8(last_1 + 32 - sizeof(simd8))); + } + + // when 0xED is found, next byte must be no larger than 0x9F + // when 0xF4 is found, next byte must be no larger than 0x8F + // next byte must be continuation, ie sign bit is set, so signed < is ok + really_inline void check_first_continuation_max(simd8 current_bytes, + simd8 off1_current_bytes) { + simd8 prev_ED = off1_current_bytes == 0xEDu; + simd8 prev_F4 = off1_current_bytes == 0xF4u; + // Check if ED is followed by A0 or greater + simd8 ED_too_large = (simd8(current_bytes) > simd8::splat(0x9Fu)) & prev_ED; + // Check if F4 is followed by 90 or greater + simd8 F4_too_large = (simd8(current_bytes) > simd8::splat(0x8Fu)) & prev_F4; + // These will also error if ED or F4 is followed by ASCII, but that's an error anyway + this->has_error |= simd8(ED_too_large | F4_too_large); + } + + // map off1_hibits => error condition + // hibits off1 cur + // C => < C2 && true + // E => < E1 && < A0 + // F => < F1 && < 90 + // else false && false + really_inline void check_overlong(simd8 current_bytes, + simd8 off1_current_bytes, + simd8 high_nibbles) { + simd8 off1_high_nibbles = high_nibbles.prev(this->previous.high_nibbles); + + // Two-byte characters must start with at least C2 + // Three-byte characters must start with at least E1 + // Four-byte characters must start with at least F1 + simd8 initial_mins = off1_high_nibbles.lookup_16( + -128, -128, -128, -128, -128, -128, -128, -128, // 0xxx -> false + -128, -128, -128, -128, // 10xx -> false + 0xC2, -128, // 1100 -> C2 + 0xE1, // 1110 + 0xF1 // 1111 + ); + simd8 initial_under = initial_mins > simd8(off1_current_bytes); + + // Two-byte characters starting with at least C2 are always OK + // Three-byte characters starting with at least E1 must be followed by at least A0 + // Four-byte characters starting with at least F1 must be followed by at least 90 + simd8 second_mins = off1_high_nibbles.lookup_16( + -128, -128, -128, -128, -128, -128, -128, -128, -128, // 0xxx => false + -128, -128, -128, // 10xx => false + 127, 127, // 110x => true + 0xA0, // 1110 + 0x90 // 1111 + ); + simd8 second_under = second_mins > simd8(current_bytes); + this->has_error |= simd8(initial_under & second_under); + } + + really_inline void count_nibbles(simd8 bytes, struct processed_utf_bytes *answer) { + answer->raw_bytes = bytes; + answer->high_nibbles = simd8(bytes.shr<4>()); + } + + // check whether the current bytes are valid UTF-8 + // at the end of the function, previous gets updated + really_inline void check_utf8_bytes(simd8 current_bytes) { + struct processed_utf_bytes pb {}; + this->count_nibbles(current_bytes, &pb); + + this->check_smaller_than_0xF4(current_bytes); + + simd8 initial_lengths = this->continuation_lengths(pb.high_nibbles); + + pb.carried_continuations = this->carry_continuations(initial_lengths); + + this->check_continuations(initial_lengths, pb.carried_continuations); + + simd8 off1_current_bytes = pb.raw_bytes.prev(this->previous.raw_bytes); + this->check_first_continuation_max(current_bytes, off1_current_bytes); + + this->check_overlong(current_bytes, off1_current_bytes, pb.high_nibbles); + this->previous = pb; + } + + really_inline void check_next_input(simd8 in) { + if (likely(!in.any_bits_set_anywhere(0x80u))) { + this->check_carried_continuations(); + } else { + this->check_utf8_bytes(in); + } + } + + really_inline void check_next_input(simd8x64 in) { + simd8 bits = in.reduce([&](auto a, auto b) { return a | b; }); + if (likely(!bits.any_bits_set_anywhere(0x80u))) { + // it is ascii, we just check carried continuations. + this->check_carried_continuations(); + } else { + // it is not ascii so we have to do heavy work + in.each([&](auto _in) { this->check_utf8_bytes(_in); }); + } + } + + really_inline ErrorValues errors() { + return this->has_error.any_bits_set_anywhere() ? simdjson::UTF8_ERROR : simdjson::SUCCESS; + } +}; // struct utf8_checker diff --git a/src/generic/stage1_find_marks.h b/src/generic/stage1_find_marks.h index c8e9a4f4..e7bf3369 100644 --- a/src/generic/stage1_find_marks.h +++ b/src/generic/stage1_find_marks.h @@ -72,7 +72,6 @@ public: uint64_t unescaped_chars_error = 0; bit_indexer structural_indexes; - json_structural_scanner(uint32_t *_structural_indexes) : structural_indexes{_structural_indexes} {} // return a bitvector indicating where we have characters that end an odd-length @@ -159,12 +158,12 @@ public: // // Backslash sequences outside of quotes will be detected in stage 2. // - really_inline uint64_t find_strings(const simd_input in) { + really_inline uint64_t find_strings(const simd::simd8x64 in) { const uint64_t backslash = in.eq('\\'); const uint64_t escaped = follows_odd_sequence_of(backslash, prev_escaped); const uint64_t quote = in.eq('"') & ~escaped; - // compute_quote_mask returns start quote plus string contents. - const uint64_t in_string = compute_quote_mask(quote) ^ prev_in_string; + // prefix_xor flips on bits inside the string (and flips off the end quote). + const uint64_t in_string = prefix_xor(quote) ^ prev_in_string; /* right shift of a signed value expected to be well-defined and standard * compliant as of C++20, * John Regher from Utah U. says this is fine code */ @@ -198,7 +197,7 @@ public: // contents of a string the same as content outside. Errors and structurals inside the string or on // the trailing quote will need to be removed later when the correct string information is known. // - really_inline uint64_t find_potential_structurals(const simd_input in) { + really_inline uint64_t find_potential_structurals(const simd::simd8x64 in) { // These use SIMD so let's kick them off before running the regular 64-bit stuff ... uint64_t whitespace, op; find_whitespace_and_operators(in, whitespace, op); @@ -236,8 +235,8 @@ public: // // Load up all 128 bytes into SIMD registers // - simd_input in_1(buf); - simd_input in_2(buf+64); + simd::simd8x64 in_1(buf); + simd::simd8x64 in_2(buf+64); // // Find the strings and potential structurals (operators / primitives). diff --git a/src/generic/stringparsing.h b/src/generic/stringparsing.h index 230d30b5..b71584d8 100644 --- a/src/generic/stringparsing.h +++ b/src/generic/stringparsing.h @@ -84,7 +84,7 @@ WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf, */ /* find out where the quote is... */ - uint32_t quote_dist = trailing_zeroes(helper.quote_bits); + auto quote_dist = trailing_zeroes(helper.quote_bits); /* NULL termination is still handy if you expect all your strings to * be NULL terminated? */ @@ -92,7 +92,7 @@ WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf, dst[quote_dist] = 0; uint32_t str_length = (dst - start_of_string) + quote_dist; - memcpy(pj.current_string_buf_loc, &str_length, sizeof(uint32_t)); + memcpy(pj.current_string_buf_loc, &str_length, sizeof(str_length)); /***************************** * Above, check for overflow in case someone has a crazy string * (>=4GB?) _ @@ -109,7 +109,7 @@ WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf, } if (((helper.quote_bits - 1) & helper.bs_bits) != 0) { /* find out where the backspace is */ - uint32_t bs_dist = trailing_zeroes(helper.bs_bits); + auto bs_dist = trailing_zeroes(helper.bs_bits); uint8_t escape_char = src[bs_dist + 1]; /* we encountered backslash first. Handle backslash */ if (escape_char == 'u') { @@ -136,8 +136,8 @@ WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf, } else { /* they are the same. Since they can't co-occur, it means we * encountered neither. */ - src += helper.bytes_processed(); - dst += helper.bytes_processed(); + src += parse_string_helper::BYTES_PROCESSED; + dst += parse_string_helper::BYTES_PROCESSED; } } /* can't be reached */ diff --git a/src/haswell/bitmask.h b/src/haswell/bitmask.h new file mode 100644 index 00000000..6d474601 --- /dev/null +++ b/src/haswell/bitmask.h @@ -0,0 +1,30 @@ +#ifndef SIMDJSON_HASWELL_BITMASK_H +#define SIMDJSON_HASWELL_BITMASK_H + +#include "simdjson/portability.h" + +#ifdef IS_X86_64 + +#include "simdjson/common_defs.h" + +TARGET_HASWELL +namespace simdjson::haswell { + +// +// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered. +// +// For example, prefix_xor(00100100) == 00011100 +// +really_inline uint64_t prefix_xor(const uint64_t bitmask) { + // There should be no such thing with a processing supporting avx2 + // but not clmul. + __m128i all_ones = _mm_set1_epi8('\xFF'); + __m128i result = _mm_clmulepi64_si128(_mm_set_epi64x(0ULL, bitmask), all_ones, 0); + return _mm_cvtsi128_si64(result); +} + +} // namespace simdjson::haswell +UNTARGET_REGION + +#endif // IS_X86_64 +#endif diff --git a/src/haswell/simd.h b/src/haswell/simd.h new file mode 100644 index 00000000..de342b04 --- /dev/null +++ b/src/haswell/simd.h @@ -0,0 +1,267 @@ +#ifndef SIMDJSON_HASWELL_SIMD_H +#define SIMDJSON_HASWELL_SIMD_H + +#include "simdjson/portability.h" + +#ifdef IS_X86_64 + +#include "simdjson/common_defs.h" + +TARGET_HASWELL +namespace simdjson::haswell::simd { + + // Forward-declared so they can be used by splat and friends. + template + struct base { + __m256i value; + + // Zero constructor + really_inline base() : value{__m256i()} {} + + // Conversion from SIMD register + really_inline base(const __m256i _value) : value(_value) {} + + // Conversion to SIMD register + really_inline operator const __m256i&() const { return this->value; } + really_inline operator __m256i&() { return this->value; } + + // Bit operations + really_inline Child operator|(const Child other) const { return _mm256_or_si256(*this, other); } + really_inline Child operator&(const Child other) const { return _mm256_and_si256(*this, other); } + really_inline Child operator^(const Child other) const { return _mm256_xor_si256(*this, other); } + really_inline Child bit_andnot(const Child other) const { return _mm256_andnot_si256(*this, other); } + really_inline Child operator~() const { return *this ^ 0xFFu; } + really_inline Child& operator|=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast | other; return *this_cast; } + really_inline Child& operator&=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast & other; return *this_cast; } + really_inline Child& operator^=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast ^ other; return *this_cast; } + }; + + // Forward-declared so they can be used by splat and friends. + template + struct simd8; + + template> + struct base8: base> { + really_inline base8() : base>() {} + really_inline base8(const __m256i _value) : base>(_value) {} + + really_inline Mask operator==(const simd8 other) const { return _mm256_cmpeq_epi8(*this, other); } + + static const int SIZE = sizeof(base::value); + + template + really_inline simd8 prev(const simd8 prev_chunk) const { + return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N); + } + }; + + // SIMD byte mask type (returned by things like eq and gt) + template<> + struct simd8: base8 { + typedef int bitmask_t; + static really_inline simd8 splat(bool _value) { return _mm256_set1_epi8(-(!!_value)); } + + really_inline simd8() : base8() {} + really_inline simd8(const __m256i _value) : base8(_value) {} + // Splat constructor + really_inline simd8(bool _value) : base8(splat(_value)) {} + + really_inline bitmask_t to_bitmask() const { return _mm256_movemask_epi8(*this); } + really_inline bool any() const { return !_mm256_testz_si256(*this, *this); } + }; + + template + struct base8_numeric: base8 { + static really_inline simd8 splat(T _value) { return _mm256_set1_epi8(_value); } + static really_inline simd8 zero() { return _mm256_setzero_si256(); } + static really_inline simd8 load(const T values[32]) { + return _mm256_loadu_si256(reinterpret_cast(values)); + } + + really_inline base8_numeric() : base8() {} + really_inline base8_numeric(const __m256i _value) : base8(_value) {} + + // Store to array + really_inline void store(T dst[32]) { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); } + + // Addition/subtraction are the same for signed and unsigned + really_inline simd8 operator+(const simd8 other) const { return _mm256_add_epi8(*this, other); } + really_inline simd8 operator-(const simd8 other) const { return _mm256_sub_epi8(*this, other); } + really_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *this; } + really_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *this; } + + // Perform a lookup of the lower 4 bits + template + really_inline simd8 lookup_lower_4_bits( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + simd8 lookup_table( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15, + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + ); + return _mm256_shuffle_epi8(lookup_table, *this); + } + + // Perform a lookup assuming the value is between 0 and 16 + template + really_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_lower_4_bits( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + ); + } + }; + + // Signed bytes + template<> + struct simd8 : base8_numeric { + really_inline simd8() : base8_numeric() {} + really_inline simd8(const __m256i _value) : base8_numeric(_value) {} + // Splat constructor + really_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + really_inline simd8(const int8_t values[32]) : simd8(load(values)) {} + // Member-by-member initialization + really_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15, + int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23, + int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31 + ) : simd8(_mm256_setr_epi8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v16,v17,v18,v19,v20,v21,v22,v23, + v24,v25,v26,v27,v28,v29,v30,v31 + )) {} + + // Order-sensitive comparisons + really_inline simd8 max(const simd8 other) const { return _mm256_max_epi8(*this, other); } + really_inline simd8 min(const simd8 other) const { return _mm256_min_epi8(*this, other); } + really_inline simd8 operator>(const simd8 other) const { return _mm256_cmpgt_epi8(*this, other); } + }; + + // Unsigned bytes + template<> + struct simd8: base8_numeric { + really_inline simd8() : base8_numeric() {} + really_inline simd8(const __m256i _value) : base8_numeric(_value) {} + // Splat constructor + really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {} + // Member-by-member initialization + really_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15, + uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23, + uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31 + ) : simd8(_mm256_setr_epi8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v16,v17,v18,v19,v20,v21,v22,v23, + v24,v25,v26,v27,v28,v29,v30,v31 + )) {} + + // Saturated math + really_inline simd8 saturating_add(const simd8 other) const { return _mm256_adds_epu8(*this, other); } + really_inline simd8 saturating_sub(const simd8 other) const { return _mm256_subs_epu8(*this, other); } + + // Order-specific operations + really_inline simd8 max(const simd8 other) const { return _mm256_max_epu8(*this, other); } + really_inline simd8 min(const simd8 other) const { return _mm256_min_epu8(*this, other); } + really_inline simd8 operator<=(const simd8 other) const { return other.max(*this) == other; } + + // Bit-specific operations + really_inline simd8 any_bits_set(simd8 bits) const { return (*this & bits).any_bits_set(); } + really_inline simd8 any_bits_set() const { return ~(*this == uint8_t(0)); } + really_inline bool any_bits_set_anywhere(simd8 bits) const { return !_mm256_testz_si256(*this, bits); } + really_inline bool any_bits_set_anywhere() const { return !_mm256_testz_si256(*this, *this); } + template + really_inline simd8 shr() const { return simd8(_mm256_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); } + template + really_inline simd8 shl() const { return simd8(_mm256_slli_epi16(*this, N)) & uint8_t(0xFFu << N); } + }; + + template + struct simd8x64 { + const simd8 chunks[2]; + + really_inline simd8x64() : chunks{simd8(), simd8()} {} + really_inline simd8x64(const simd8 chunk0, const simd8 chunk1) : chunks{chunk0, chunk1} {} + really_inline simd8x64(const T ptr[64]) : chunks{simd8::load(ptr), simd8::load(ptr+32)} {} + + really_inline void store(T *ptr) { + this->chunks[0].store(ptr); + this->chunks[0].store(ptr+sizeof(simd8)); + } + + template + really_inline void each(F const& each_chunk) const + { + each_chunk(this->chunks[0]); + each_chunk(this->chunks[1]); + } + + template + really_inline simd8x64 map(F const& map_chunk) const { + return simd8x64( + map_chunk(this->chunks[0]), + map_chunk(this->chunks[1]) + ); + } + + template + really_inline simd8x64 map(const simd8x64 b, F const& map_chunk) const { + return simd8x64( + map_chunk(this->chunks[0], b.chunks[0]), + map_chunk(this->chunks[1], b.chunks[1]) + ); + } + + template + really_inline simd8 reduce(F const& reduce_pair) const { + return reduce_pair(this->chunks[0], this->chunks[1]); + } + + really_inline uint64_t to_bitmask() const { + uint64_t r_lo = static_cast(this->chunks[0].to_bitmask()); + uint64_t r_hi = this->chunks[1].to_bitmask(); + return r_lo | (r_hi << 32); + } + + really_inline simd8x64 bit_or(const T m) const { + const simd8 mask = simd8::splat(m); + return this->map( [&](auto a) { return a | mask; } ); + } + + really_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return this->map( [&](auto a) { return a == mask; } ).to_bitmask(); + } + + really_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return this->map( [&](auto a) { return a <= mask; } ).to_bitmask(); + } + + }; // struct simd8x64 + +} // namespace simdjson::haswell::simd +UNTARGET_REGION + +#endif // IS_X86_64 +#endif // SIMDJSON_HASWELL_SIMD_H diff --git a/src/haswell/simd_input.h b/src/haswell/simd_input.h deleted file mode 100644 index 4b8cefc4..00000000 --- a/src/haswell/simd_input.h +++ /dev/null @@ -1,88 +0,0 @@ -#ifndef SIMDJSON_HASWELL_SIMD_INPUT_H -#define SIMDJSON_HASWELL_SIMD_INPUT_H - -#include "simdjson/common_defs.h" -#include "simdjson/portability.h" -#include "simdjson/simdjson.h" - -#ifdef IS_X86_64 - -TARGET_HASWELL -namespace simdjson::haswell { - -struct simd_input { - const __m256i chunks[2]; - - really_inline simd_input() : chunks{__m256i(), __m256i()} {} - - really_inline simd_input(const __m256i chunk0, const __m256i chunk1) - : chunks{chunk0, chunk1} {} - - really_inline simd_input(const uint8_t *ptr) - : chunks{ - _mm256_loadu_si256(reinterpret_cast(ptr + 0*32)), - _mm256_loadu_si256(reinterpret_cast(ptr + 1*32)) - } {} - - template - really_inline void each(F const& each_chunk) const - { - each_chunk(this->chunks[0]); - each_chunk(this->chunks[1]); - } - - template - really_inline simd_input map(F const& map_chunk) const { - return simd_input( - map_chunk(this->chunks[0]), - map_chunk(this->chunks[1]) - ); - } - - template - really_inline simd_input map(const simd_input b, F const& map_chunk) const { - return simd_input( - map_chunk(this->chunks[0], b.chunks[0]), - map_chunk(this->chunks[1], b.chunks[1]) - ); - } - - template - really_inline __m256i reduce(F const& reduce_pair) const { - return reduce_pair(this->chunks[0], this->chunks[1]); - } - - really_inline uint64_t to_bitmask() const { - uint64_t r_lo = static_cast(_mm256_movemask_epi8(this->chunks[0])); - uint64_t r_hi = _mm256_movemask_epi8(this->chunks[1]); - return r_lo | (r_hi << 32); - } - - really_inline simd_input bit_or(const uint8_t m) const { - const __m256i mask = _mm256_set1_epi8(m); - return this->map( [&](auto a) { - return _mm256_or_si256(a, mask); - }); - } - - really_inline uint64_t eq(const uint8_t m) const { - const __m256i mask = _mm256_set1_epi8(m); - return this->map( [&](auto a) { - return _mm256_cmpeq_epi8(a, mask); - }).to_bitmask(); - } - - really_inline uint64_t lteq(const uint8_t m) const { - const __m256i maxval = _mm256_set1_epi8(m); - return this->map( [&](auto a) { - return _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, a), maxval); - }).to_bitmask(); - } - -}; // struct simd_input - -} // namespace simdjson::haswell -UNTARGET_REGION - -#endif // IS_X86_64 -#endif // SIMDJSON_HASWELL_SIMD_INPUT_H diff --git a/src/haswell/simdutf8check.h b/src/haswell/simdutf8check.h deleted file mode 100644 index 65896e98..00000000 --- a/src/haswell/simdutf8check.h +++ /dev/null @@ -1,233 +0,0 @@ -#ifndef SIMDJSON_HASWELL_SIMDUTF8CHECK_H -#define SIMDJSON_HASWELL_SIMDUTF8CHECK_H - -#include "simdjson/portability.h" -#include "simdjson/simdjson.h" -#include "haswell/simd_input.h" -#include -#include -#include - -#ifdef IS_X86_64 -/* - * legal utf-8 byte sequence - * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94 - * - * Code Points 1st 2s 3s 4s - * U+0000..U+007F 00..7F - * U+0080..U+07FF C2..DF 80..BF - * U+0800..U+0FFF E0 A0..BF 80..BF - * U+1000..U+CFFF E1..EC 80..BF 80..BF - * U+D000..U+D7FF ED 80..9F 80..BF - * U+E000..U+FFFF EE..EF 80..BF 80..BF - * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF - * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF - * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF - * - */ - -// all byte values must be no larger than 0xF4 - -TARGET_HASWELL -namespace simdjson::haswell { - -static inline __m256i push_last_byte_of_a_to_b(__m256i a, __m256i b) { - return _mm256_alignr_epi8(b, _mm256_permute2x128_si256(a, b, 0x21), 15); -} - -static inline __m256i push_last_2bytes_of_a_to_b(__m256i a, __m256i b) { - return _mm256_alignr_epi8(b, _mm256_permute2x128_si256(a, b, 0x21), 14); -} - -struct processed_utf_bytes { - __m256i raw_bytes; - __m256i high_nibbles; - __m256i carried_continuations; -}; - -struct utf8_checker { - __m256i has_error; - processed_utf_bytes previous; - - utf8_checker() : - has_error{_mm256_setzero_si256()}, - previous{_mm256_setzero_si256(), _mm256_setzero_si256(), _mm256_setzero_si256()} {} - - really_inline void add_errors(__m256i errors) { - this->has_error = _mm256_or_si256(this->has_error, errors); - } - - // all byte values must be no larger than 0xF4 - really_inline void check_smaller_than_0xF4(__m256i current_bytes) { - // unsigned, saturates to 0 below max - this->add_errors( _mm256_subs_epu8(current_bytes, _mm256_set1_epi8(0xF4u)) ); - } - - really_inline __m256i continuation_lengths(__m256i high_nibbles) { - return _mm256_shuffle_epi8( - _mm256_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) - 0, 0, 0, 0, // 10xx (continuation) - 2, 2, // 110x - 3, // 1110 - 4, // 1111, next should be 0 (not checked here) - 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) - 0, 0, 0, 0, // 10xx (continuation) - 2, 2, // 110x - 3, // 1110 - 4), // 1111, next should be 0 (not checked here) - - high_nibbles); - } - - really_inline __m256i carry_continuations(__m256i initial_lengths) { - __m256i right1 = _mm256_subs_epu8( - push_last_byte_of_a_to_b(this->previous.carried_continuations, initial_lengths), - _mm256_set1_epi8(1)); - __m256i sum = _mm256_add_epi8(initial_lengths, right1); - - __m256i right2 = _mm256_subs_epu8( - push_last_2bytes_of_a_to_b(this->previous.carried_continuations, sum), _mm256_set1_epi8(2)); - return _mm256_add_epi8(sum, right2); - } - - really_inline void check_continuations(__m256i initial_lengths, __m256i carries) { - // overlap || underlap - // carry > length && length > 0 || !(carry > length) && !(length > 0) - // (carries > length) == (lengths > 0) - // (carries > current) == (current > 0) - __m256i overunder = _mm256_cmpeq_epi8( - _mm256_cmpgt_epi8(carries, initial_lengths), - _mm256_cmpgt_epi8(initial_lengths, _mm256_setzero_si256())); - - this->add_errors( overunder ); - } - - really_inline void check_carried_continuations() { - this->add_errors( - _mm256_cmpgt_epi8(this->previous.carried_continuations, - _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 1)) - ); - } - - // when 0xED is found, next byte must be no larger than 0x9F - // when 0xF4 is found, next byte must be no larger than 0x8F - // next byte must be continuation, ie sign bit is set, so signed < is ok - really_inline void check_first_continuation_max(__m256i current_bytes, - __m256i off1_current_bytes) { - __m256i maskED = - _mm256_cmpeq_epi8(off1_current_bytes, _mm256_set1_epi8(0xEDu)); - __m256i maskF4 = - _mm256_cmpeq_epi8(off1_current_bytes, _mm256_set1_epi8(0xF4u)); - - __m256i badfollowED = _mm256_and_si256( - _mm256_cmpgt_epi8(current_bytes, _mm256_set1_epi8(0x9Fu)), maskED); - __m256i badfollowF4 = _mm256_and_si256( - _mm256_cmpgt_epi8(current_bytes, _mm256_set1_epi8(0x8Fu)), maskF4); - - this->add_errors( _mm256_or_si256(badfollowED, badfollowF4) ); - } - - // map off1_hibits => error condition - // hibits off1 cur - // C => < C2 && true - // E => < E1 && < A0 - // F => < F1 && < 90 - // else false && false - really_inline void check_overlong(__m256i current_bytes, - __m256i off1_current_bytes, - __m256i high_nibbles) { - __m256i off1_high_nibbles = push_last_byte_of_a_to_b(this->previous.high_nibbles, high_nibbles); - __m256i initial_mins = _mm256_shuffle_epi8( - _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, - -128, -128, -128, // 10xx => false - 0xC2u, -128, // 110x - 0xE1u, // 1110 - 0xF1u, // 1111 - -128, -128, -128, -128, -128, -128, -128, -128, -128, - -128, -128, -128, // 10xx => false - 0xC2u, -128, // 110x - 0xE1u, // 1110 - 0xF1u), // 1111 - off1_high_nibbles); - - __m256i initial_under = _mm256_cmpgt_epi8(initial_mins, off1_current_bytes); - - __m256i second_mins = _mm256_shuffle_epi8( - _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, - -128, -128, -128, // 10xx => false - 127, 127, // 110x => true - 0xA0u, // 1110 - 0x90u, // 1111 - -128, -128, -128, -128, -128, -128, -128, -128, -128, - -128, -128, -128, // 10xx => false - 127, 127, // 110x => true - 0xA0u, // 1110 - 0x90u), // 1111 - off1_high_nibbles); - __m256i second_under = _mm256_cmpgt_epi8(second_mins, current_bytes); - this->add_errors( _mm256_and_si256(initial_under, second_under) ); - } - - really_inline void count_nibbles(__m256i bytes, struct processed_utf_bytes *answer) { - answer->raw_bytes = bytes; - answer->high_nibbles = _mm256_and_si256(_mm256_srli_epi16(bytes, 4), _mm256_set1_epi8(0x0F)); - } - - // check whether the current bytes are valid UTF-8 - // at the end of the function, previous gets updated - really_inline void check_utf8_bytes(__m256i current_bytes) { - struct processed_utf_bytes pb {}; - this->count_nibbles(current_bytes, &pb); - - this->check_smaller_than_0xF4(current_bytes); - - __m256i initial_lengths = this->continuation_lengths(pb.high_nibbles); - - pb.carried_continuations = this->carry_continuations(initial_lengths); - - this->check_continuations(initial_lengths, pb.carried_continuations); - - __m256i off1_current_bytes = - push_last_byte_of_a_to_b(this->previous.raw_bytes, pb.raw_bytes); - this->check_first_continuation_max(current_bytes, off1_current_bytes); - - this->check_overlong(current_bytes, off1_current_bytes, pb.high_nibbles); - this->previous = pb; - } - - really_inline void check_next_input(__m256i in) { - __m256i high_bit = _mm256_set1_epi8(0x80u); - if (likely(_mm256_testz_si256(in, high_bit) == 1)) { - this->check_carried_continuations(); - } else { - this->check_utf8_bytes(in); - } - } - - really_inline void check_next_input(simd_input in) { - __m256i high_bit = _mm256_set1_epi8(0x80u); - __m256i any_bits_on = in.reduce([&](auto a, auto b) { - return _mm256_or_si256(a, b); - }); - if (likely(_mm256_testz_si256(any_bits_on, high_bit) == 1)) { - // it is ascii, we just check carried continuations. - this->check_carried_continuations(); - } else { - // it is not ascii so we have to do heavy work - in.each([&](auto _in) { check_utf8_bytes(_in); }); - } - } - - really_inline ErrorValues errors() { - return _mm256_testz_si256(this->has_error, this->has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS; - } -}; // struct utf8_checker - -}; // namespace simdjson::haswell -UNTARGET_REGION // haswell - -#endif // IS_X86_64 - -#endif // SIMDJSON_HASWELL_SIMDUTF8CHECK_H diff --git a/src/haswell/stage1_find_marks.h b/src/haswell/stage1_find_marks.h index 42f7abad..e1761a59 100644 --- a/src/haswell/stage1_find_marks.h +++ b/src/haswell/stage1_find_marks.h @@ -5,85 +5,29 @@ #ifdef IS_X86_64 -#include "haswell/simd_input.h" -#include "haswell/simdutf8check.h" +#include "haswell/bitmask.h" +#include "haswell/simd.h" #include "simdjson/stage1_find_marks.h" TARGET_HASWELL namespace simdjson::haswell { -really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) { - // There should be no such thing with a processing supporting avx2 - // but not clmul. - uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128( - _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFFu), 0)); - return quote_mask; -} +using namespace simd; really_inline void find_whitespace_and_operators( - const simd_input in, + const simd::simd8x64 in, uint64_t &whitespace, uint64_t &op) { - #ifdef SIMDJSON_NAIVE_STRUCTURAL + whitespace = in.map([&](simd8 _in) { + return _in == _in.lookup_lower_4_bits(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100); + }).to_bitmask(); - // You should never need this naive approach, but it can be useful - // for research purposes - const __m256i mask_open_brace = _mm256_set1_epi8(0x7b); - const __m256i mask_close_brace = _mm256_set1_epi8(0x7d); - const __m256i mask_open_bracket = _mm256_set1_epi8(0x5b); - const __m256i mask_close_bracket = _mm256_set1_epi8(0x5d); - const __m256i mask_column = _mm256_set1_epi8(0x3a); - const __m256i mask_comma = _mm256_set1_epi8(0x2c); - op = in.map([&](auto in) { - __m256i op = _mm256_cmpeq_epi8(in, mask_open_brace); - op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_close_brace)); - op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_open_bracket)); - op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_close_bracket)); - op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_column)); - op = _mm256_or_si256(op, _mm256_cmpeq_epi8(in, mask_comma)); - return op; - }).to_bitmask(); - - const __m256i mask_space = _mm256_set1_epi8(0x20); - const __m256i mask_linefeed = _mm256_set1_epi8(0x0a); - const __m256i mask_tab = _mm256_set1_epi8(0x09); - const __m256i mask_carriage = _mm256_set1_epi8(0x0d); - whitespace = in.map([&](auto in) { - __m256i space = _mm256_cmpeq_epi8(in, mask_space); - space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_linefeed)); - space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_tab)); - space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_carriage)); - return space; - }).to_bitmask(); - // end of naive approach - - #else // SIMDJSON_NAIVE_STRUCTURAL - - // clang-format off - const __m256i operator_table = - _mm256_setr_epi8(44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123, - 44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123); - const __m256i white_table = _mm256_setr_epi8( - 32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100, - 32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100); - // clang-format on - const __m256i op_offset = _mm256_set1_epi8(0xd4u); - const __m256i op_mask = _mm256_set1_epi8(32); - - whitespace = in.map([&](auto _in) { - return _mm256_cmpeq_epi8(_in, _mm256_shuffle_epi8(white_table, _in)); - }).to_bitmask(); - - op = in.map([&](auto _in) { - const __m256i r1 = _mm256_add_epi8(op_offset, _in); - const __m256i r2 = _mm256_or_si256(_in, op_mask); - const __m256i r3 = _mm256_shuffle_epi8(operator_table, r1); - return _mm256_cmpeq_epi8(r2, r3); - }).to_bitmask(); - - #endif // else SIMDJSON_NAIVE_STRUCTURAL + op = in.map([&](simd8 _in) { + return (_in | 32) == (_in+0xd4u).lookup_lower_4_bits(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{'); + }).to_bitmask(); } +#include "generic/simdutf8check.h" #include "generic/stage1_find_marks.h" } // namespace haswell diff --git a/src/haswell/stringparsing.h b/src/haswell/stringparsing.h index c99a3cfa..e739a5f2 100644 --- a/src/haswell/stringparsing.h +++ b/src/haswell/stringparsing.h @@ -1,41 +1,37 @@ #ifndef SIMDJSON_HASWELL_STRINGPARSING_H #define SIMDJSON_HASWELL_STRINGPARSING_H +#include "simdjson/portability.h" + #ifdef IS_X86_64 +#include "haswell/simd.h" #include "simdjson/common_defs.h" #include "simdjson/parsedjson.h" #include "jsoncharutils.h" -#ifdef JSON_TEST_STRINGS -void found_string(const uint8_t *buf, const uint8_t *parsed_begin, - const uint8_t *parsed_end); -void found_bad_string(const uint8_t *buf); -#endif - TARGET_HASWELL namespace simdjson::haswell { +using namespace simd; + // Holds backslashes and quotes locations. struct parse_string_helper { uint32_t bs_bits; uint32_t quote_bits; - really_inline uint32_t bytes_processed() const { return sizeof(__m256i); } + static const uint32_t BYTES_PROCESSED = 32; }; really_inline parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst) { - // this can read up to 31 bytes beyond the buffer size, but we require + // this can read up to 15 bytes beyond the buffer size, but we require // SIMDJSON_PADDING of padding - static_assert(sizeof(__m256i) - 1 <= SIMDJSON_PADDING); - __m256i v = _mm256_loadu_si256(reinterpret_cast(src)); - // store to dest unconditionally - we can overwrite the bits we don't like - // later - _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), v); - auto quote_mask = _mm256_cmpeq_epi8(v, _mm256_set1_epi8('"')); + static_assert(SIMDJSON_PADDING >= (parse_string_helper::BYTES_PROCESSED - 1)); + simd8 v(src); + // store to dest unconditionally - we can overwrite the bits we don't like later + v.store(dst); return { - static_cast(_mm256_movemask_epi8( - _mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\')))), // bs_bits - static_cast(_mm256_movemask_epi8(quote_mask)) // quote_bits + (uint32_t)(v == '\\').to_bitmask(), // bs_bits + (uint32_t)(v == '"').to_bitmask(), // quote_bits }; } diff --git a/src/stage1_find_marks.cpp b/src/stage1_find_marks.cpp index a8156f88..ed2537b0 100644 --- a/src/stage1_find_marks.cpp +++ b/src/stage1_find_marks.cpp @@ -1,19 +1,3 @@ -#include "simdjson/portability.h" -#include "simdjson/common_defs.h" - -namespace { -// for when clmul is unavailable -[[maybe_unused]] really_inline uint64_t portable_compute_quote_mask(uint64_t quote_bits) { - uint64_t quote_mask = quote_bits ^ (quote_bits << 1); - quote_mask = quote_mask ^ (quote_mask << 2); - quote_mask = quote_mask ^ (quote_mask << 4); - quote_mask = quote_mask ^ (quote_mask << 8); - quote_mask = quote_mask ^ (quote_mask << 16); - quote_mask = quote_mask ^ (quote_mask << 32); - return quote_mask; -} -} // namespace - #include "arm64/stage1_find_marks.h" #include "haswell/stage1_find_marks.h" #include "westmere/stage1_find_marks.h" diff --git a/src/stage2_build_tape.cpp b/src/stage2_build_tape.cpp index 08f95638..e1372779 100644 --- a/src/stage2_build_tape.cpp +++ b/src/stage2_build_tape.cpp @@ -65,6 +65,12 @@ really_inline bool is_valid_null_atom(const uint8_t *loc) { return error == 0; } +#ifdef JSON_TEST_STRINGS +void found_string(const uint8_t *buf, const uint8_t *parsed_begin, + const uint8_t *parsed_end); +void found_bad_string(const uint8_t *buf); +#endif + #include "arm64/stage2_build_tape.h" #include "haswell/stage2_build_tape.h" #include "westmere/stage2_build_tape.h" diff --git a/src/westmere/bitmask.h b/src/westmere/bitmask.h new file mode 100644 index 00000000..e92ecad7 --- /dev/null +++ b/src/westmere/bitmask.h @@ -0,0 +1,30 @@ +#ifndef SIMDJSON_WESTMERE_BITMASK_H +#define SIMDJSON_WESTMERE_BITMASK_H + +#include "simdjson/portability.h" + +#ifdef IS_X86_64 + +#include "simdjson/common_defs.h" + +TARGET_WESTMERE +namespace simdjson::westmere { + +// +// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered. +// +// For example, prefix_xor(00100100) == 00011100 +// +really_inline uint64_t prefix_xor(const uint64_t bitmask) { + // There should be no such thing with a processing supporting avx2 + // but not clmul. + __m128i all_ones = _mm_set1_epi8('\xFF'); + __m128i result = _mm_clmulepi64_si128(_mm_set_epi64x(0ULL, bitmask), all_ones, 0); + return _mm_cvtsi128_si64(result); +} + +} // namespace simdjson::westmere +UNTARGET_REGION + +#endif // IS_X86_64 +#endif diff --git a/src/westmere/simd.h b/src/westmere/simd.h new file mode 100644 index 00000000..b169d597 --- /dev/null +++ b/src/westmere/simd.h @@ -0,0 +1,270 @@ +#ifndef SIMDJSON_WESTMERE_SIMD_H +#define SIMDJSON_WESTMERE_SIMD_H + +#include "simdjson/portability.h" + +#ifdef IS_X86_64 + +#include "simdjson/common_defs.h" +#include "simdjson/simdjson.h" + +TARGET_WESTMERE +namespace simdjson::westmere::simd { + + template + struct base { + __m128i value; + + // Zero constructor + really_inline base() : value{__m128i()} {} + + // Conversion from SIMD register + really_inline base(const __m128i _value) : value(_value) {} + + // Conversion to SIMD register + really_inline operator const __m128i&() const { return this->value; } + really_inline operator __m128i&() { return this->value; } + + // Bit operations + really_inline Child operator|(const Child other) const { return _mm_or_si128(*this, other); } + really_inline Child operator&(const Child other) const { return _mm_and_si128(*this, other); } + really_inline Child operator^(const Child other) const { return _mm_xor_si128(*this, other); } + really_inline Child bit_andnot(const Child other) const { return _mm_andnot_si128(*this, other); } + really_inline Child operator~() const { return *this ^ 0xFFu; } + really_inline Child& operator|=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast | other; return *this_cast; } + really_inline Child& operator&=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast & other; return *this_cast; } + really_inline Child& operator^=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast ^ other; return *this_cast; } + }; + + // Forward-declared so they can be used by splat and friends. + template + struct simd8; + + template> + struct base8: base> { + typedef int bitmask_t; + + really_inline base8() : base>() {} + really_inline base8(const __m128i _value) : base>(_value) {} + + really_inline Mask operator==(const simd8 other) const { return _mm_cmpeq_epi8(*this, other); } + + static const int SIZE = sizeof(base>::value); + + template + really_inline simd8 prev(const simd8 prev_chunk) const { + return _mm_alignr_epi8(*this, prev_chunk, 16 - N); + } + }; + + // SIMD byte mask type (returned by things like eq and gt) + template<> + struct simd8: base8 { + static really_inline simd8 splat(bool _value) { return _mm_set1_epi8(-(!!_value)); } + + really_inline simd8() : base8() {} + really_inline simd8(const __m128i _value) : base8(_value) {} + // Splat constructor + really_inline simd8(bool _value) : base8(splat(_value)) {} + + really_inline bitmask_t to_bitmask() const { return _mm_movemask_epi8(*this); } + really_inline bool any() const { return !_mm_testz_si128(*this, *this); } + }; + + template + struct base8_numeric: base8 { + static really_inline simd8 splat(T _value) { return _mm_set1_epi8(_value); } + static really_inline simd8 zero() { return _mm_setzero_si128(); } + static really_inline simd8 load(const T values[16]) { + return _mm_loadu_si128(reinterpret_cast(values)); + } + + really_inline base8_numeric() : base8() {} + really_inline base8_numeric(const __m128i _value) : base8(_value) {} + + // Store to array + really_inline void store(T dst[16]) { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); } + + // Addition/subtraction are the same for signed and unsigned + really_inline simd8 operator+(const simd8 other) const { return _mm_add_epi8(*this, other); } + really_inline simd8 operator-(const simd8 other) const { return _mm_sub_epi8(*this, other); } + really_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *this; } + really_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *this; } + + // Perform a lookup of the lower 4 bits + template + really_inline simd8 lookup_lower_4_bits( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + + simd8 lookup_table( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + ); + return _mm_shuffle_epi8(lookup_table, *this); + } + + // Perform a lookup assuming the value is between 0 and 16 + template + really_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_lower_4_bits( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + ); + } + }; + + // Signed bytes + template<> + struct simd8 : base8_numeric { + really_inline simd8() : base8_numeric() {} + really_inline simd8(const __m128i _value) : base8_numeric(_value) {} + // Splat constructor + really_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + really_inline simd8(const int8_t* values) : simd8(load(values)) {} + // Member-by-member initialization + really_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) : simd8(_mm_setr_epi8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + )) {} + + // Order-sensitive comparisons + really_inline simd8 max(const simd8 other) const { return _mm_max_epi8(*this, other); } + really_inline simd8 min(const simd8 other) const { return _mm_min_epi8(*this, other); } + really_inline simd8 operator>(const simd8 other) const { return _mm_cmpgt_epi8(*this, other); } + }; + + // Unsigned bytes + template<> + struct simd8: base8_numeric { + really_inline simd8() : base8_numeric() {} + really_inline simd8(const __m128i _value) : base8_numeric(_value) {} + // Splat constructor + really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + really_inline simd8(const uint8_t* values) : simd8(load(values)) {} + // Member-by-member initialization + really_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) : simd8(_mm_setr_epi8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + )) {} + + // Saturated math + really_inline simd8 saturating_add(const simd8 other) const { return _mm_adds_epu8(*this, other); } + really_inline simd8 saturating_sub(const simd8 other) const { return _mm_subs_epu8(*this, other); } + + // Order-specific operations + really_inline simd8 max(const simd8 other) const { return _mm_max_epu8(*this, other); } + really_inline simd8 min(const simd8 other) const { return _mm_min_epu8(*this, other); } + really_inline simd8 operator<=(const simd8 other) const { return other.max(*this) == other; } + + // Bit-specific operations + really_inline simd8 any_bits_set(simd8 bits) const { return (*this & bits).any_bits_set(); } + really_inline simd8 any_bits_set() const { return ~(*this == uint8_t(0)); } + really_inline bool any_bits_set_anywhere(simd8 bits) const { return !_mm_testz_si128(*this, bits); } + really_inline bool any_bits_set_anywhere() const { return !_mm_testz_si128(*this, *this); } + template + really_inline simd8 shr() const { return simd8(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); } + template + really_inline simd8 shl() const { return simd8(_mm_slli_epi16(*this, N)) & uint8_t(0xFFu << N); } + }; + + template + struct simd8x64 { + const simd8 chunks[4]; + + really_inline simd8x64() : chunks{simd8(), simd8(), simd8(), simd8()} {} + really_inline simd8x64(const simd8 chunk0, const simd8 chunk1, const simd8 chunk2, const simd8 chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} + really_inline simd8x64(const T ptr[64]) : chunks{simd8::load(ptr), simd8::load(ptr+16), simd8::load(ptr+32), simd8::load(ptr+48)} {} + + really_inline void store(T ptr[64]) { + this->chunks[0].store(ptr); + this->chunks[0].store(ptr+16); + this->chunks[0].store(ptr+32); + this->chunks[0].store(ptr+48); + } + + template + really_inline void each(F const& each_chunk) const + { + each_chunk(this->chunks[0]); + each_chunk(this->chunks[1]); + each_chunk(this->chunks[2]); + each_chunk(this->chunks[3]); + } + + template + really_inline simd8x64 map(F const& map_chunk) const { + return simd8x64( + map_chunk(this->chunks[0]), + map_chunk(this->chunks[1]), + map_chunk(this->chunks[2]), + map_chunk(this->chunks[3]) + ); + } + + template + really_inline simd8x64 map(const simd8x64 b, F const& map_chunk) const { + return simd8x64( + map_chunk(this->chunks[0], b.chunks[0]), + map_chunk(this->chunks[1], b.chunks[1]), + map_chunk(this->chunks[2], b.chunks[2]), + map_chunk(this->chunks[3], b.chunks[3]) + ); + } + + template + really_inline simd8 reduce(F const& reduce_pair) const { + return reduce_pair( + reduce_pair(this->chunks[0], this->chunks[1]), + reduce_pair(this->chunks[2], this->chunks[3]) + ); + } + + really_inline uint64_t to_bitmask() const { + uint64_t r0 = static_cast(this->chunks[0].to_bitmask()); + uint64_t r1 = this->chunks[1].to_bitmask(); + uint64_t r2 = this->chunks[2].to_bitmask(); + uint64_t r3 = this->chunks[3].to_bitmask(); + return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); + } + + really_inline simd8x64 bit_or(const T m) const { + const simd8 mask = simd8::splat(m); + return this->map( [&](auto a) { return a | mask; } ); + } + + really_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return this->map( [&](auto a) { return a == mask; } ).to_bitmask(); + } + + really_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return this->map( [&](auto a) { return a <= mask; } ).to_bitmask(); + } + + }; // struct simd8x64 + +} // namespace simdjson::westmere::simd +UNTARGET_REGION + +#endif // IS_X86_64 +#endif // SIMDJSON_WESTMERE_SIMD_INPUT_H diff --git a/src/westmere/simd_input.h b/src/westmere/simd_input.h deleted file mode 100644 index e107a0e4..00000000 --- a/src/westmere/simd_input.h +++ /dev/null @@ -1,100 +0,0 @@ -#ifndef SIMDJSON_WESTMERE_SIMD_INPUT_H -#define SIMDJSON_WESTMERE_SIMD_INPUT_H - -#include "simdjson/common_defs.h" -#include "simdjson/portability.h" -#include "simdjson/simdjson.h" - -#ifdef IS_X86_64 - -TARGET_WESTMERE -namespace simdjson::westmere { - -struct simd_input { - const __m128i chunks[4]; - - really_inline simd_input() - : chunks { __m128i(), __m128i(), __m128i(), __m128i() } {} - - really_inline simd_input(const __m128i chunk0, const __m128i chunk1, const __m128i chunk2, const __m128i chunk3) - : chunks{chunk0, chunk1, chunk2, chunk3} {} - - really_inline simd_input(const uint8_t *ptr) - : simd_input( - _mm_loadu_si128(reinterpret_cast(ptr + 0)), - _mm_loadu_si128(reinterpret_cast(ptr + 16)), - _mm_loadu_si128(reinterpret_cast(ptr + 32)), - _mm_loadu_si128(reinterpret_cast(ptr + 48)) - ) {} - - template - really_inline void each(F const& each_chunk) const { - each_chunk(this->chunks[0]); - each_chunk(this->chunks[1]); - each_chunk(this->chunks[2]); - each_chunk(this->chunks[3]); - } - - template - really_inline simd_input map(F const& map_chunk) const { - return simd_input( - map_chunk(this->chunks[0]), - map_chunk(this->chunks[1]), - map_chunk(this->chunks[2]), - map_chunk(this->chunks[3]) - ); - } - - template - really_inline simd_input map(const simd_input b, F const& map_chunk) const { - return simd_input( - map_chunk(this->chunks[0], b.chunks[0]), - map_chunk(this->chunks[1], b.chunks[1]), - map_chunk(this->chunks[2], b.chunks[2]), - map_chunk(this->chunks[3], b.chunks[3]) - ); - } - - template - really_inline __m128i reduce(F const& reduce_pair) const { - __m128i r01 = reduce_pair(this->chunks[0], this->chunks[1]); - __m128i r23 = reduce_pair(this->chunks[2], this->chunks[3]); - return reduce_pair(r01, r23); - } - - really_inline uint64_t to_bitmask() const { - uint64_t r0 = static_cast(_mm_movemask_epi8(this->chunks[0])); - uint64_t r1 = _mm_movemask_epi8(this->chunks[1]); - uint64_t r2 = _mm_movemask_epi8(this->chunks[2]); - uint64_t r3 = _mm_movemask_epi8(this->chunks[3]); - return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); - } - - really_inline simd_input bit_or(const uint8_t m) const { - const __m128i mask = _mm_set1_epi8(m); - return this->map( [&](auto a) { - return _mm_or_si128(a, mask); - }); - } - - really_inline uint64_t eq(const uint8_t m) const { - const __m128i mask = _mm_set1_epi8(m); - return this->map( [&](auto a) { - return _mm_cmpeq_epi8(a, mask); - }).to_bitmask(); - } - - really_inline uint64_t lteq(const uint8_t m) const { - const __m128i maxval = _mm_set1_epi8(m); - return this->map( [&](auto a) { - return _mm_cmpeq_epi8(_mm_max_epu8(maxval, a), maxval); - }).to_bitmask(); - } - -}; // struct simd_input - -} // namespace simdjson::westmere -UNTARGET_REGION - -#endif // IS_X86_64 -#endif // SIMDJSON_WESTMERE_SIMD_INPUT_H diff --git a/src/westmere/simdutf8check.h b/src/westmere/simdutf8check.h deleted file mode 100644 index 4c7e2801..00000000 --- a/src/westmere/simdutf8check.h +++ /dev/null @@ -1,203 +0,0 @@ -#ifndef SIMDJSON_WESTMERE_SIMDUTF8CHECK_H -#define SIMDJSON_WESTMERE_SIMDUTF8CHECK_H - -#include "simdjson/portability.h" -#include "simdjson/simdjson.h" -#include "westmere/simd_input.h" -#include -#include -#include -#ifdef IS_X86_64 - -/* - * legal utf-8 byte sequence - * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94 - * - * Code Points 1st 2s 3s 4s - * U+0000..U+007F 00..7F - * U+0080..U+07FF C2..DF 80..BF - * U+0800..U+0FFF E0 A0..BF 80..BF - * U+1000..U+CFFF E1..EC 80..BF 80..BF - * U+D000..U+D7FF ED 80..9F 80..BF - * U+E000..U+FFFF EE..EF 80..BF 80..BF - * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF - * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF - * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF - * - */ - -// all byte values must be no larger than 0xF4 - -/********** sse code **********/ -TARGET_WESTMERE -namespace simdjson::westmere { - -struct processed_utf_bytes { - __m128i raw_bytes; - __m128i high_nibbles; - __m128i carried_continuations; -}; - -struct utf8_checker { - __m128i has_error{_mm_setzero_si128()}; - processed_utf_bytes previous{_mm_setzero_si128(), _mm_setzero_si128(), _mm_setzero_si128()}; - - really_inline void add_errors(__m128i errors) { - this->has_error = _mm_or_si128(errors, this->has_error); - } - - // all byte values must be no larger than 0xF4 - really_inline void check_smaller_than_0xF4(__m128i current_bytes) { - // unsigned, saturates to 0 below max - this->add_errors( _mm_subs_epu8(current_bytes, _mm_set1_epi8(0xF4u)) ); - } - - really_inline __m128i continuation_lengths(__m128i high_nibbles) { - return _mm_shuffle_epi8( - _mm_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) - 0, 0, 0, 0, // 10xx (continuation) - 2, 2, // 110x - 3, // 1110 - 4), // 1111, next should be 0 (not checked here) - high_nibbles); - } - - really_inline __m128i carry_continuations(__m128i initial_lengths) { - - __m128i right1 = - _mm_subs_epu8(_mm_alignr_epi8(initial_lengths, this->previous.carried_continuations, 16 - 1), - _mm_set1_epi8(1)); - __m128i sum = _mm_add_epi8(initial_lengths, right1); - - __m128i right2 = _mm_subs_epu8(_mm_alignr_epi8(sum, this->previous.carried_continuations, 16 - 2), - _mm_set1_epi8(2)); - return _mm_add_epi8(sum, right2); - } - - really_inline void check_continuations(__m128i initial_lengths, __m128i carries) { - - // overlap || underlap - // carry > length && length > 0 || !(carry > length) && !(length > 0) - // (carries > length) == (lengths > 0) - __m128i overunder = - _mm_cmpeq_epi8(_mm_cmpgt_epi8(carries, initial_lengths), - _mm_cmpgt_epi8(initial_lengths, _mm_setzero_si128())); - - this->add_errors( overunder ); - } - - // when 0xED is found, next byte must be no larger than 0x9F - // when 0xF4 is found, next byte must be no larger than 0x8F - // next byte must be continuation, ie sign bit is set, so signed < is ok - really_inline void check_first_continuation_max(__m128i current_bytes, __m128i off1_current_bytes) { - __m128i maskED = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xEDu)); - __m128i maskF4 = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xF4u)); - - __m128i badfollowED = _mm_and_si128( - _mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x9Fu)), maskED); - __m128i badfollowF4 = _mm_and_si128( - _mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x8Fu)), maskF4); - - this->add_errors( _mm_or_si128(badfollowED, badfollowF4) ); - } - - // map off1_hibits => error condition - // hibits off1 cur - // C => < C2 && true - // E => < E1 && < A0 - // F => < F1 && < 90 - // else false && false - really_inline void check_overlong(__m128i current_bytes, - __m128i off1_current_bytes, __m128i high_nibbles) { - __m128i off1_hibits = _mm_alignr_epi8(high_nibbles, this->previous.high_nibbles, 16 - 1); - __m128i initial_mins = _mm_shuffle_epi8( - _mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, - -128, -128, // 10xx => false - 0xC2u, -128, // 110x - 0xE1u, // 1110 - 0xF1u), - off1_hibits); - - __m128i initial_under = _mm_cmpgt_epi8(initial_mins, off1_current_bytes); - - __m128i second_mins = _mm_shuffle_epi8( - _mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, - -128, -128, // 10xx => false - 127, 127, // 110x => true - 0xA0u, // 1110 - 0x90u), - off1_hibits); - __m128i second_under = _mm_cmpgt_epi8(second_mins, current_bytes); - this->add_errors( _mm_and_si128(initial_under, second_under) ); - } - - really_inline void count_nibbles(__m128i bytes, struct processed_utf_bytes *answer) { - answer->raw_bytes = bytes; - answer->high_nibbles = _mm_and_si128(_mm_srli_epi16(bytes, 4), _mm_set1_epi8(0x0F)); - } - - // check whether the current bytes are valid UTF-8 - // at the end of the function, previous gets updated - really_inline void check_utf8_bytes(__m128i current_bytes) { - struct processed_utf_bytes pb; - this->count_nibbles(current_bytes, &pb); - - this->check_smaller_than_0xF4(current_bytes); - - __m128i initial_lengths = this->continuation_lengths(pb.high_nibbles); - - pb.carried_continuations = this->carry_continuations(initial_lengths); - - this->check_continuations(initial_lengths, pb.carried_continuations); - - __m128i off1_current_bytes = - _mm_alignr_epi8(pb.raw_bytes, this->previous.raw_bytes, 16 - 1); - this->check_first_continuation_max(current_bytes, off1_current_bytes); - - this->check_overlong(current_bytes, off1_current_bytes, pb.high_nibbles); - this->previous = pb; - } - - really_inline void check_carried_continuations() { - this->has_error = _mm_cmpgt_epi8(this->previous.carried_continuations, - _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 1)); - } - - really_inline void check_next_input(__m128i in) { - __m128i high_bit = _mm_set1_epi8(0x80u); - if (_mm_testz_si128( in, high_bit) == 1) { - // it is ascii, we just check continuations - this->check_carried_continuations(); - } else { - // it is not ascii so we have to do heavy work - this->check_utf8_bytes(in); - } - } - - really_inline void check_next_input(simd_input in) { - __m128i high_bit = _mm_set1_epi8(0x80u); - __m128i any_bits_on = in.reduce([&](auto a, auto b) { - return _mm_or_si128(a, b); - }); - if (_mm_testz_si128(any_bits_on, high_bit) == 1) { - // it is ascii, we just check continuations - this->check_carried_continuations(); - } else { - // it is not ascii so we have to do heavy work - in.each([&](auto _in) { this->check_utf8_bytes(_in); }); - } - } - - really_inline ErrorValues errors() { - return _mm_testz_si128(this->has_error, this->has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS; - } - -}; // struct utf8_checker - -} // namespace simdjson::westmere -UNTARGET_REGION // westmere - -#endif // IS_X86_64 - -#endif diff --git a/src/westmere/stage1_find_marks.h b/src/westmere/stage1_find_marks.h index 6abaf5e4..3184fb0d 100644 --- a/src/westmere/stage1_find_marks.h +++ b/src/westmere/stage1_find_marks.h @@ -5,41 +5,29 @@ #ifdef IS_X86_64 -#include "westmere/simd_input.h" -#include "westmere/simdutf8check.h" +#include "westmere/bitmask.h" +#include "westmere/simd.h" #include "simdjson/stage1_find_marks.h" TARGET_WESTMERE namespace simdjson::westmere { -really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) { - return _mm_cvtsi128_si64(_mm_clmulepi64_si128( - _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFFu), 0)); -} +using namespace simd; really_inline void find_whitespace_and_operators( - const simd_input in, + const simd8x64 in, uint64_t &whitespace, uint64_t &op) { - const __m128i operator_table = - _mm_setr_epi8(44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123); - const __m128i white_table = _mm_setr_epi8(32, 100, 100, 100, 17, 100, 113, 2, - 100, 9, 10, 112, 100, 13, 100, 100); - const __m128i op_offset = _mm_set1_epi8(0xd4u); - const __m128i op_mask = _mm_set1_epi8(32); - - whitespace = in.map([&](auto _in) { - return _mm_cmpeq_epi8(_in, _mm_shuffle_epi8(white_table, _in)); + whitespace = in.map([&](simd8 _in) { + return _in == _in.lookup_lower_4_bits(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100); }).to_bitmask(); - op = in.map([&](auto _in) { - const __m128i r1 = _mm_add_epi8(op_offset, _in); - const __m128i r2 = _mm_or_si128(_in, op_mask); - const __m128i r3 = _mm_shuffle_epi8(operator_table, r1); - return _mm_cmpeq_epi8(r2, r3); + op = in.map([&](simd8 _in) { + return (_in | 32) == (_in+0xd4u).lookup_lower_4_bits(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{'); }).to_bitmask(); } +#include "generic/simdutf8check.h" #include "generic/stage1_find_marks.h" } // namespace westmere diff --git a/src/westmere/stringparsing.h b/src/westmere/stringparsing.h index 864e1771..dcc8deb5 100644 --- a/src/westmere/stringparsing.h +++ b/src/westmere/stringparsing.h @@ -1,40 +1,39 @@ #ifndef SIMDJSON_WESTMERE_STRINGPARSING_H #define SIMDJSON_WESTMERE_STRINGPARSING_H +#include "simdjson/portability.h" + #ifdef IS_X86_64 +#include "westmere/simd.h" #include "simdjson/common_defs.h" #include "simdjson/parsedjson.h" #include "jsoncharutils.h" -#ifdef JSON_TEST_STRINGS -void found_string(const uint8_t *buf, const uint8_t *parsed_begin, - const uint8_t *parsed_end); -void found_bad_string(const uint8_t *buf); -#endif - TARGET_WESTMERE namespace simdjson::westmere { +using namespace simd; + // Holds backslashes and quotes locations. struct parse_string_helper { uint32_t bs_bits; uint32_t quote_bits; - really_inline uint32_t bytes_processed() const { return sizeof(__m128i); } + static const uint32_t BYTES_PROCESSED = 32; }; really_inline parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst) { // this can read up to 31 bytes beyond the buffer size, but we require // SIMDJSON_PADDING of padding - __m128i v = _mm_loadu_si128(reinterpret_cast(src)); - // store to dest unconditionally - we can overwrite the bits we don't like - // later - _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), v); - auto quote_mask = _mm_cmpeq_epi8(v, _mm_set1_epi8('"')); + static_assert(SIMDJSON_PADDING >= (parse_string_helper::BYTES_PROCESSED - 1)); + simd8 v0(src); + simd8 v1(src + 16); + v0.store(dst); + v1.store(dst + 16); + uint64_t bs_and_quote = simd8x64(v0 == '\\', v1 == '\\', v0 == '"', v1 == '"').to_bitmask(); return { - static_cast( - _mm_movemask_epi8(_mm_cmpeq_epi8(v, _mm_set1_epi8('\\')))), // bs_bits - static_cast(_mm_movemask_epi8(quote_mask)) // quote_bits + static_cast(bs_and_quote), // bs_bits + static_cast(bs_and_quote >> 32) // quote_bits }; }