diff --git a/.appveyor.yml b/.appveyor.yml
index f0cda5e0..7247c92b 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -7,30 +7,37 @@ platform: x64
 environment:
   matrix:
     - job_name: VS2019
-      CMAKE_ARGS:
+      CMAKE_ARGS: -A %Platform%
     - job_name: VS2019CLANG
-      CMAKE_ARGS: -T ClangCL
+      CMAKE_ARGS: -A %Platform% -T ClangCL
+    - job_name: VS2019ARM
+      CMAKE_ARGS: -A ARM64 -DCMAKE_CROSSCOMPILING=1 -D SIMDJSON_GOOGLE_BENCHMARKS=OFF # Does Google Benchmark build under VS ARM?
     - job_name: VS2017 (Static, No Threads)
       image: Visual Studio 2017
-      CMAKE_ARGS: -DSIMDJSON_BUILD_STATIC=ON -DSIMDJSON_ENABLE_THREADS=OFF
+      CMAKE_ARGS: -A %Platform% -DSIMDJSON_BUILD_STATIC=ON -DSIMDJSON_ENABLE_THREADS=OFF
       CTEST_ARGS: -E checkperf
     - job_name: VS2019 (Win32)
       platform: Win32
-      CMAKE_ARGS: -DSIMDJSON_BUILD_STATIC=OFF -DSIMDJSON_ENABLE_THREADS=ON # This should be the default. Testing anyway.
+      CMAKE_ARGS: -A %Platform% -DSIMDJSON_BUILD_STATIC=OFF -DSIMDJSON_ENABLE_THREADS=ON # This should be the default. Testing anyway.
       CTEST_ARGS: -E checkperf
 
 build_script:
-  - set
   - mkdir build
   - cd build
   - cmake --version
-  - cmake -A %Platform% %CMAKE_ARGS% --parallel ..
+  - cmake %CMAKE_ARGS% --parallel ..
   - cmake -LH ..
   - cmake --build . --config %Configuration% --verbose --parallel
 
-test_script:
-  - ctest --output-on-failure -C %Configuration% --verbose %CTEST_ARGS% --parallel
-
+for:
+-
+  matrix:
+    except:
+      - job_name: VS2019ARM
+
+  test_script:
+    - ctest --output-on-failure -C %Configuration% --verbose %CTEST_ARGS% --parallel
+
 clone_folder: c:\projects\simdjson
 
 matrix:
diff --git a/singleheader/amalgamate_demo.cpp b/singleheader/amalgamate_demo.cpp
index 9b3e337d..6a862b91 100644
--- a/singleheader/amalgamate_demo.cpp
+++ b/singleheader/amalgamate_demo.cpp
@@ -1,4 +1,4 @@
-/* auto-generated on Mon 27 Apr 2020 21:20:37 EDT. Do not edit! */
+/* auto-generated on Tue May 5 20:03:59 EDT 2020. Do not edit! */
 #include <iostream>
 #include "simdjson.h"
diff --git a/singleheader/simdjson.cpp b/singleheader/simdjson.cpp
index 09b7f713..7c420f37 100644
--- a/singleheader/simdjson.cpp
+++ b/singleheader/simdjson.cpp
@@ -1,4 +1,4 @@
-/* auto-generated on Mon May 4 11:46:14 PDT 2020. Do not edit! */
+/* auto-generated on Tue May 5 20:03:59 EDT 2020. Do not edit! */
 /* begin file src/simdjson.cpp */
 #include "simdjson.h"
@@ -708,7 +708,7 @@ really_inline int leading_zeroes(uint64_t input_num) {
 
 /* result might be undefined when input_num is zero */
 really_inline int count_ones(uint64_t input_num) {
-  return vaddv_u8(vcnt_u8((uint8x8_t)input_num));
+  return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
 }
 
 really_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
@@ -736,11 +736,90 @@ really_inline bool mul_overflow(uint64_t value1, uint64_t value2, uint64_t *resu
 #endif // SIMDJSON_ARM64_BITMANIPULATION_H
 /* end file src/arm64/bitmanipulation.h */
 /* arm64/intrinsics.h already included: #include "arm64/intrinsics.h" */
+#include
+
 namespace simdjson {
 namespace arm64 {
 namespace simd {
 
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+namespace {
+/**
+ * make_uint8x16_t initializes a SIMD register (uint8x16_t).
+ * This is needed because, incredibly, the syntax uint8x16_t x = {1,2,3...}
+ * is not recognized under Visual Studio! This is a workaround.
+ * Using a std::initializer_list as a parameter resulted in
+ * inefficient code. With the current approach, if the parameters are
+ * compile-time constants,
+ * GNU GCC compiles it to ldr, the same as uint8x16_t x = {1,2,3...}.
+ * You should not use this function except for compile-time constants:
+ * it is not efficient.
+ */
+really_inline uint8x16_t make_uint8x16_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4,
+                                         uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8,
+                                         uint8_t x9, uint8_t x10, uint8_t x11, uint8_t x12,
+                                         uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16) {
+  // Doing a load like so ends up generating worse code.
+  // uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
+  //                      x9, x10,x11,x12,x13,x14,x15,x16};
+  // return vld1q_u8(array);
+  uint8x16_t x{};
+  // incredibly, Visual Studio does not allow x[0] = x1
+  x = vsetq_lane_u8(x1, x, 0);
+  x = vsetq_lane_u8(x2, x, 1);
+  x = vsetq_lane_u8(x3, x, 2);
+  x = vsetq_lane_u8(x4, x, 3);
+  x = vsetq_lane_u8(x5, x, 4);
+  x = vsetq_lane_u8(x6, x, 5);
+  x = vsetq_lane_u8(x7, x, 6);
+  x = vsetq_lane_u8(x8, x, 7);
+  x = vsetq_lane_u8(x9, x, 8);
+  x = vsetq_lane_u8(x10, x, 9);
+  x = vsetq_lane_u8(x11, x, 10);
+  x = vsetq_lane_u8(x12, x, 11);
+  x = vsetq_lane_u8(x13, x, 12);
+  x = vsetq_lane_u8(x14, x, 13);
+  x = vsetq_lane_u8(x15, x, 14);
+  x = vsetq_lane_u8(x16, x, 15);
+  return x;
+}
+
+
+// We have to do the same work for make_int8x16_t
+really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_t x4,
+                                       int8_t x5, int8_t x6, int8_t x7, int8_t x8,
+                                       int8_t x9, int8_t x10, int8_t x11, int8_t x12,
+                                       int8_t x13, int8_t x14, int8_t x15, int8_t x16) {
+  // Doing a load like so ends up generating worse code.
+  // int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
+  //                     x9, x10,x11,x12,x13,x14,x15,x16};
+  // return vld1q_s8(array);
+  int8x16_t x{};
+  // incredibly, Visual Studio does not allow x[0] = x1
+  x = vsetq_lane_s8(x1, x, 0);
+  x = vsetq_lane_s8(x2, x, 1);
+  x = vsetq_lane_s8(x3, x, 2);
+  x = vsetq_lane_s8(x4, x, 3);
+  x = vsetq_lane_s8(x5, x, 4);
+  x = vsetq_lane_s8(x6, x, 5);
+  x = vsetq_lane_s8(x7, x, 6);
+  x = vsetq_lane_s8(x8, x, 7);
+  x = vsetq_lane_s8(x9, x, 8);
+  x = vsetq_lane_s8(x10, x, 9);
+  x = vsetq_lane_s8(x11, x, 10);
+  x = vsetq_lane_s8(x12, x, 11);
+  x = vsetq_lane_s8(x13, x, 12);
+  x = vsetq_lane_s8(x14, x, 13);
+  x = vsetq_lane_s8(x15, x, 14);
+  x = vsetq_lane_s8(x16, x, 15);
+  return x;
+}
+
+} // namespace
+#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
+
+
   template <typename T>
   struct simd8;
 
@@ -792,8 +871,13 @@ namespace simd {
     // We return uint32_t instead of uint16_t because that seems to be more efficient for most
     // purposes (cutting it down to uint16_t costs performance in some compilers).
     really_inline uint32_t to_bitmask() const {
-      const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+      const uint8x16_t bit_mask = make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                                                  0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
+#else
+      const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                    0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+#endif
       auto minput = *this & bit_mask;
       uint8x16_t tmp = vpaddq_u8(minput, minput);
       tmp = vpaddq_u8(tmp, tmp);
@@ -818,6 +902,15 @@ namespace simd {
     // Splat constructor
     really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
     // Member-by-member initialization
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+    really_inline simd8(
+      uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
+      uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
+    ) : simd8(make_uint8x16_t(
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10,v11,v12,v13,v14,v15
+    )) {}
+#else
     really_inline simd8(
       uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
       uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
@@ -825,6 +918,8 @@ namespace simd {
       v0, v1, v2, v3, v4, v5, v6, v7,
       v8, v9, v10,v11,v12,v13,v14,v15
     }) {}
+#endif
+
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     really_inline static simd8<uint8_t> repeat_16(
       uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
@@ -898,7 +993,11 @@ namespace simd {
       uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]};
       uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64);
       // we increment by 0x08 the second half of the mask
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+      uint8x16_t inc = make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
+#else
       uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
+#endif
       shufmask = vaddq_u8(shufmask, inc);
       // this is the version "nearly pruned"
       uint8x16_t pruned = vqtbl1q_u8(*this, shufmask);
@@ -955,6 +1054,15 @@ namespace simd {
     // Array constructor
     really_inline simd8(const int8_t* values) : simd8(load(values)) {}
     // Member-by-member initialization
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+    really_inline simd8(
+      int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+      int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
+    ) : simd8(make_int8x16_t(
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10,v11,v12,v13,v14,v15
+    )) {}
+#else
     really_inline simd8(
       int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
       int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
@@ -962,6 +1070,7 @@ namespace simd {
       v0, v1, v2, v3, v4, v5, v6, v7,
       v8, v9, v10,v11,v12,v13,v14,v15
     }) {}
+#endif
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     really_inline static simd8<int8_t> repeat_16(
       int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
@@ -977,8 +1086,14 @@ namespace simd {
     really_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, *this); }
 
     // Explicit conversion to/from unsigned
+    //
+    // Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same type.
+    // In theory, we could check this occurrence with std::same_as and std::enable_if but it is C++14
+    // and relatively ugly and hard to read.
+#ifndef SIMDJSON_REGULAR_VISUAL_STUDIO
     really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
-    really_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(*this); }
+#endif
+    really_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }
 
     // Math
     really_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(*this, other); }
@@ -1092,10 +1207,17 @@ namespace simd {
     }
 
     really_inline uint64_t to_bitmask() const {
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+      const uint8x16_t bit_mask = make_uint8x16_t(
+        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+      );
+#else
       const uint8x16_t bit_mask = {
         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
       };
+#endif
       // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
       uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask);
       uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask);
@@ -5852,12 +5974,12 @@ struct value128 {
 };
 
 #if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) && \
-    !defined(_M_X64) // _umul128 for x86, arm, arm64
-#if defined(_M_ARM)
+    !defined(_M_X64) && !defined(_M_ARM64)// _umul128 for x86, arm
+// this is a slow emulation routine for 32-bit Windows
+//
 static inline uint64_t __emulu(uint32_t x, uint32_t y) {
   return x * (uint64_t)y;
 }
-#endif
 static inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
   uint64_t ad = __emulu((uint32_t)(ab >> 32), (uint32_t)cd);
   uint64_t bd = __emulu((uint32_t)ab, (uint32_t)cd);
@@ -5873,8 +5995,14 @@ static inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
 really_inline value128 full_multiplication(uint64_t value1, uint64_t value2) {
   value128 answer;
 #ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
-  answer.low = _umul128(value1, value2, &answer.high);
+#ifdef _M_ARM64
+  // ARM64 has native support for 64-bit multiplications, no need to emulate
+  answer.high = __umulh(value1, value2);
+  answer.low = value1 * value2;
 #else
+  answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64
+#endif // _M_ARM64
+#else // SIMDJSON_REGULAR_VISUAL_STUDIO
   __uint128_t r = ((__uint128_t)value1) * value2;
   answer.low = uint64_t(r);
   answer.high = uint64_t(r >> 64);
diff --git a/singleheader/simdjson.h b/singleheader/simdjson.h
index 7fee763f..30a34fcf 100644
--- a/singleheader/simdjson.h
+++ b/singleheader/simdjson.h
@@ -1,4 +1,4 @@
-/* auto-generated on Mon May 4 11:46:14 PDT 2020. Do not edit! */
+/* auto-generated on Tue May 5 20:03:59 EDT 2020. Do not edit! */
 /* begin file include/simdjson.h */
 #ifndef SIMDJSON_H
 #define SIMDJSON_H
diff --git a/src/arm64/bitmanipulation.h b/src/arm64/bitmanipulation.h
index 4d9c0f24..fe002043 100644
--- a/src/arm64/bitmanipulation.h
+++ b/src/arm64/bitmanipulation.h
@@ -45,7 +45,7 @@ really_inline int leading_zeroes(uint64_t input_num) {
 
 /* result might be undefined when input_num is zero */
 really_inline int count_ones(uint64_t input_num) {
-  return vaddv_u8(vcnt_u8((uint8x8_t)input_num));
+  return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
 }
 
 really_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
diff --git a/src/arm64/simd.h b/src/arm64/simd.h
index 1b51a7d2..ebce9d62 100644
--- a/src/arm64/simd.h
+++ b/src/arm64/simd.h
@@ -5,11 +5,94 @@
 #include "simdprune_tables.h"
 #include "arm64/bitmanipulation.h"
 #include "arm64/intrinsics.h"
+#include
+
 namespace simdjson {
 namespace arm64 {
 namespace simd {
 
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+namespace {
+// Start of private section with Visual Studio workaround
+
+
+/**
+ * make_uint8x16_t initializes a SIMD register (uint8x16_t).
+ * This is needed because, incredibly, the syntax uint8x16_t x = {1,2,3...}
+ * is not recognized under Visual Studio! This is a workaround.
+ * Using a std::initializer_list as a parameter resulted in
+ * inefficient code. With the current approach, if the parameters are
+ * compile-time constants,
+ * GNU GCC compiles it to ldr, the same as uint8x16_t x = {1,2,3...}.
+ * You should not use this function except for compile-time constants:
+ * it is not efficient.
+ */
+really_inline uint8x16_t make_uint8x16_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4,
+                                         uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8,
+                                         uint8_t x9, uint8_t x10, uint8_t x11, uint8_t x12,
+                                         uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16) {
+  // Doing a load like so ends up generating worse code.
+  // uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
+  //                      x9, x10,x11,x12,x13,x14,x15,x16};
+  // return vld1q_u8(array);
+  uint8x16_t x{};
+  // incredibly, Visual Studio does not allow x[0] = x1
+  x = vsetq_lane_u8(x1, x, 0);
+  x = vsetq_lane_u8(x2, x, 1);
+  x = vsetq_lane_u8(x3, x, 2);
+  x = vsetq_lane_u8(x4, x, 3);
+  x = vsetq_lane_u8(x5, x, 4);
+  x = vsetq_lane_u8(x6, x, 5);
+  x = vsetq_lane_u8(x7, x, 6);
+  x = vsetq_lane_u8(x8, x, 7);
+  x = vsetq_lane_u8(x9, x, 8);
+  x = vsetq_lane_u8(x10, x, 9);
+  x = vsetq_lane_u8(x11, x, 10);
+  x = vsetq_lane_u8(x12, x, 11);
+  x = vsetq_lane_u8(x13, x, 12);
+  x = vsetq_lane_u8(x14, x, 13);
+  x = vsetq_lane_u8(x15, x, 14);
+  x = vsetq_lane_u8(x16, x, 15);
+  return x;
+}
+
+
+// We have to do the same work for make_int8x16_t
+really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_t x4,
+                                       int8_t x5, int8_t x6, int8_t x7, int8_t x8,
+                                       int8_t x9, int8_t x10, int8_t x11, int8_t x12,
+                                       int8_t x13, int8_t x14, int8_t x15, int8_t x16) {
+  // Doing a load like so ends up generating worse code.
+  // int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
+  //                     x9, x10,x11,x12,x13,x14,x15,x16};
+  // return vld1q_s8(array);
+  int8x16_t x{};
+  // incredibly, Visual Studio does not allow x[0] = x1
+  x = vsetq_lane_s8(x1, x, 0);
+  x = vsetq_lane_s8(x2, x, 1);
+  x = vsetq_lane_s8(x3, x, 2);
+  x = vsetq_lane_s8(x4, x, 3);
+  x = vsetq_lane_s8(x5, x, 4);
+  x = vsetq_lane_s8(x6, x, 5);
+  x = vsetq_lane_s8(x7, x, 6);
+  x = vsetq_lane_s8(x8, x, 7);
+  x = vsetq_lane_s8(x9, x, 8);
+  x = vsetq_lane_s8(x10, x, 9);
+  x = vsetq_lane_s8(x11, x, 10);
+  x = vsetq_lane_s8(x12, x, 11);
+  x = vsetq_lane_s8(x13, x, 12);
+  x = vsetq_lane_s8(x14, x, 13);
+  x = vsetq_lane_s8(x15, x, 14);
+  x = vsetq_lane_s8(x16, x, 15);
+  return x;
+}
+
+// End of private section with Visual Studio workaround
+} // namespace
+#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
+
+
   template <typename T>
   struct simd8;
 
@@ -61,8 +144,13 @@ namespace simd {
     // We return uint32_t instead of uint16_t because that seems to be more efficient for most
     // purposes (cutting it down to uint16_t costs performance in some compilers).
     really_inline uint32_t to_bitmask() const {
-      const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-                                   0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+      const uint8x16_t bit_mask = make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                                                  0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
+#else
+      const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                                   0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+#endif
       auto minput = *this & bit_mask;
       uint8x16_t tmp = vpaddq_u8(minput, minput);
       tmp = vpaddq_u8(tmp, tmp);
@@ -87,6 +175,15 @@ namespace simd {
     // Splat constructor
     really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
     // Member-by-member initialization
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+    really_inline simd8(
+      uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
+      uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
+    ) : simd8(make_uint8x16_t(
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10,v11,v12,v13,v14,v15
+    )) {}
+#else
     really_inline simd8(
       uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
       uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
@@ -94,6 +191,8 @@ namespace simd {
       v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10,v11,v12,v13,v14,v15
     }) {}
+#endif
+
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     really_inline static simd8<uint8_t> repeat_16(
       uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
@@ -167,7 +266,11 @@ namespace simd {
       uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]};
       uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64);
       // we increment by 0x08 the second half of the mask
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+      uint8x16_t inc = make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
+#else
       uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
+#endif
       shufmask = vaddq_u8(shufmask, inc);
       // this is the version "nearly pruned"
       uint8x16_t pruned = vqtbl1q_u8(*this, shufmask);
@@ -224,6 +327,15 @@ namespace simd {
     // Array constructor
     really_inline simd8(const int8_t* values) : simd8(load(values)) {}
     // Member-by-member initialization
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+    really_inline simd8(
+      int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+      int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
+    ) : simd8(make_int8x16_t(
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10,v11,v12,v13,v14,v15
+    )) {}
+#else
     really_inline simd8(
       int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
       int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
@@ -231,6 +343,7 @@ namespace simd {
       v0, v1, v2, v3, v4, v5, v6, v7,
       v8, v9, v10,v11,v12,v13,v14,v15
     }) {}
+#endif
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     really_inline static simd8<int8_t> repeat_16(
       int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
@@ -246,8 +359,14 @@ namespace simd {
     really_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, *this); }
 
     // Explicit conversion to/from unsigned
+    //
+    // Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same type.
+    // In theory, we could check this occurrence with std::same_as and std::enable_if but it is C++14
+    // and relatively ugly and hard to read.
+#ifndef SIMDJSON_REGULAR_VISUAL_STUDIO
     really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
-    really_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(*this); }
+#endif
+    really_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }
 
     // Math
     really_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(*this, other); }
@@ -361,10 +480,17 @@ namespace simd {
     }
 
     really_inline uint64_t to_bitmask() const {
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+      const uint8x16_t bit_mask = make_uint8x16_t(
+        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+      );
+#else
       const uint8x16_t bit_mask = {
         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
       };
+#endif
       // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
       uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask);
       uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask);
diff --git a/src/jsoncharutils.h b/src/jsoncharutils.h
index 7a90266c..bf620dfc 100644
--- a/src/jsoncharutils.h
+++ b/src/jsoncharutils.h
@@ -312,12 +312,12 @@ struct value128 {
 };
 
 #if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) && \
-    !defined(_M_X64) // _umul128 for x86, arm, arm64
-#if defined(_M_ARM)
+    !defined(_M_X64) && !defined(_M_ARM64)// _umul128 for x86, arm
+// this is a slow emulation routine for 32-bit Windows
+//
 static inline uint64_t __emulu(uint32_t x, uint32_t y) {
   return x * (uint64_t)y;
 }
-#endif
 static inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
   uint64_t ad = __emulu((uint32_t)(ab >> 32), (uint32_t)cd);
   uint64_t bd = __emulu((uint32_t)ab, (uint32_t)cd);
@@ -333,8 +333,14 @@ static inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
 really_inline value128 full_multiplication(uint64_t value1, uint64_t value2) {
   value128 answer;
 #ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
-  answer.low = _umul128(value1, value2, &answer.high);
+#ifdef _M_ARM64
+  // ARM64 has native support for 64-bit multiplications, no need to emulate
+  answer.high = __umulh(value1, value2);
+  answer.low = value1 * value2;
 #else
+  answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64
+#endif // _M_ARM64
+#else // SIMDJSON_REGULAR_VISUAL_STUDIO
   __uint128_t r = ((__uint128_t)value1) * value2;
   answer.low = uint64_t(r);
   answer.high = uint64_t(r >> 64);
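
A note on the to_bitmask() hunks above, for readers unfamiliar with the NEON idiom: each input byte is 0xFF (comparison true) or 0x00, ANDing with the {0x01, 0x02, ..., 0x80, 0x01, ..., 0x80} constant keeps one distinct bit per lane within each 8-byte half, and the vpaddq_u8 pairwise additions merge those non-overlapping bits into a compact mask. The following scalar equivalent is only an illustration; the function name is made up for this sketch and is not part of the patch.

#include <cstdint>

// Scalar equivalent of the 16-lane to_bitmask(): bit i of the result is set
// exactly when cmp[i] is non-zero. Assumes cmp[i] is 0xFF or 0x00, as produced
// by the NEON comparison intrinsics.
static uint32_t to_bitmask_scalar(const uint8_t cmp[16]) {
  const uint8_t bit_mask[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
                                0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};
  uint32_t low = 0, high = 0;
  // Within each 8-byte half the masked bytes carry disjoint bits, so summing
  // them (what the vpaddq_u8 reduction chain does) is the same as ORing them.
  for (int i = 0; i < 8; i++)  { low  += (uint8_t)(cmp[i] & bit_mask[i]); }
  for (int i = 8; i < 16; i++) { high += (uint8_t)(cmp[i] & bit_mask[i]); }
  return low | (high << 8);
}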
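
The count_ones() change replaces the C-style cast (uint8x8_t)input_num, which the Visual Studio ARM64 headers reject, with vcreate_u8(input_num), the portable intrinsic for reinterpreting a 64-bit scalar as a uint8x8_t. The computation itself is unchanged: a per-byte population count (vcnt_u8) followed by a horizontal sum (vaddv_u8). A scalar sketch of the same computation, for illustration only:

#include <cstdint>

// Scalar equivalent of vaddv_u8(vcnt_u8(vcreate_u8(x))): population count of a
// 64-bit word, done byte by byte the way vcnt_u8 works per lane.
static int count_ones_scalar(uint64_t x) {
  int total = 0;
  for (int i = 0; i < 8; i++) {
    uint8_t byte = (uint8_t)(x >> (8 * i));
    while (byte) {
      byte = (uint8_t)(byte & (byte - 1)); // clear the lowest set bit
      total++;
    }
  }
  return total;
}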
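
Finally, the full_multiplication() hunks split the Visual Studio path in two: _umul128 is only available on x64, so the ARM64 build uses __umulh for the high 64 bits plus a plain (wrapping) 64-bit multiply for the low half. Below is a minimal standalone sketch of the same branch structure; the function name and the test value are illustrative and not taken from the patch.

#include <cstdint>
#include <cstdio>
#if defined(_MSC_VER)
#include <intrin.h>
#endif

struct u128_parts { uint64_t low; uint64_t high; };

// 64x64 -> 128-bit unsigned multiplication with the same dispatch as the
// patched full_multiplication(): __umulh on MSVC/ARM64, _umul128 on MSVC/x64,
// and the __uint128_t extension on GCC/Clang.
static u128_parts mul64x64(uint64_t a, uint64_t b) {
  u128_parts r;
#if defined(_MSC_VER) && defined(_M_ARM64)
  r.high = __umulh(a, b); // native high-half multiply on ARM64
  r.low = a * b;          // low half: unsigned wrap-around is exactly what we want
#elif defined(_MSC_VER) && defined(_M_X64)
  r.low = _umul128(a, b, &r.high);
#else
  __uint128_t p = (__uint128_t)a * b;
  r.low = (uint64_t)p;
  r.high = (uint64_t)(p >> 64);
#endif
  return r;
}

int main() {
  u128_parts r = mul64x64(0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL);
  // Expected output: fffffffffffffffe0000000000000001
  printf("%016llx%016llx\n", (unsigned long long)r.high, (unsigned long long)r.low);
  return 0;
}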