Compile under Visual Studio for ARM64 (#861)

* Modifications so that we can compile under Visual Studio for ARM64
* Let us throw appveyor at this beast.
Daniel Lemire 2020-05-06 20:08:10 -07:00 committed by GitHub
parent 0e6bd2224f
commit 3c3a4db54e
7 changed files with 294 additions and 27 deletions

.appveyor.yml

@@ -7,28 +7,35 @@ platform: x64
 environment:
   matrix:
     - job_name: VS2019
-      CMAKE_ARGS:
+      CMAKE_ARGS: -A %Platform%
     - job_name: VS2019CLANG
-      CMAKE_ARGS: -T ClangCL
+      CMAKE_ARGS: -A %Platform% -T ClangCL
+    - job_name: VS2019ARM
+      CMAKE_ARGS: -A ARM64 -DCMAKE_CROSSCOMPILING=1 -D SIMDJSON_GOOGLE_BENCHMARKS=OFF # Does Google Benchmark build under VS ARM?
     - job_name: VS2017 (Static, No Threads)
       image: Visual Studio 2017
-      CMAKE_ARGS: -DSIMDJSON_BUILD_STATIC=ON -DSIMDJSON_ENABLE_THREADS=OFF
+      CMAKE_ARGS: -A %Platform% -DSIMDJSON_BUILD_STATIC=ON -DSIMDJSON_ENABLE_THREADS=OFF
       CTEST_ARGS: -E checkperf
     - job_name: VS2019 (Win32)
       platform: Win32
-      CMAKE_ARGS: -DSIMDJSON_BUILD_STATIC=OFF -DSIMDJSON_ENABLE_THREADS=ON # This should be the default. Testing anyway.
+      CMAKE_ARGS: -A %Platform% -DSIMDJSON_BUILD_STATIC=OFF -DSIMDJSON_ENABLE_THREADS=ON # This should be the default. Testing anyway.
       CTEST_ARGS: -E checkperf
 build_script:
+  - set
   - mkdir build
   - cd build
   - cmake --version
-  - cmake -A %Platform% %CMAKE_ARGS% --parallel ..
+  - cmake %CMAKE_ARGS% --parallel ..
   - cmake -LH ..
   - cmake --build . --config %Configuration% --verbose --parallel
-test_script:
-  - ctest --output-on-failure -C %Configuration% --verbose %CTEST_ARGS% --parallel
+for:
+  -
+    matrix:
+      except:
+        - job_name: VS2019ARM
+    test_script:
+      - ctest --output-on-failure -C %Configuration% --verbose %CTEST_ARGS% --parallel
 clone_folder: c:\projects\simdjson

singleheader/amalgamate_demo.cpp

@@ -1,4 +1,4 @@
-/* auto-generated on Mon 27 Apr 2020 21:20:37 EDT. Do not edit! */
+/* auto-generated on Tue May 5 20:03:59 EDT 2020. Do not edit! */
 #include <iostream>
 #include "simdjson.h"

singleheader/simdjson.cpp

@@ -1,4 +1,4 @@
-/* auto-generated on Mon May 4 11:46:14 PDT 2020. Do not edit! */
+/* auto-generated on Tue May 5 20:03:59 EDT 2020. Do not edit! */
 /* begin file src/simdjson.cpp */
 #include "simdjson.h"
@@ -708,7 +708,7 @@ really_inline int leading_zeroes(uint64_t input_num) {
 /* result might be undefined when input_num is zero */
 really_inline int count_ones(uint64_t input_num) {
-   return vaddv_u8(vcnt_u8((uint8x8_t)input_num));
+   return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
 }
 really_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
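
The old line cast a 64-bit scalar directly to a vector, (uint8x8_t)input_num, which GCC and Clang accept as an extension but regular Visual Studio rejects because its NEON types are opaque. vcreate_u8 is the documented intrinsic for building a uint8x8_t from a uint64_t, so it works everywhere. A minimal standalone sketch of the resulting popcount (my harness, not part of the patch; assumes an AArch64 compiler with arm_neon.h):

#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

// vcreate_u8 reinterprets the scalar as eight bytes, vcnt_u8 counts the bits
// set in each byte, and vaddv_u8 sums the eight per-byte counts.
static inline int count_ones_sketch(uint64_t input_num) {
  return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
}

int main() {
  printf("%d\n", count_ones_sketch(0xFF00FF00FF00FF00ULL)); // prints 32
  return 0;
}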
@@ -736,11 +736,90 @@ really_inline bool mul_overflow(uint64_t value1, uint64_t value2, uint64_t *resu
 #endif // SIMDJSON_ARM64_BITMANIPULATION_H
 /* end file src/arm64/bitmanipulation.h */
 /* arm64/intrinsics.h already included: #include "arm64/intrinsics.h" */
+#include <type_traits>
 namespace simdjson {
 namespace arm64 {
 namespace simd {
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+namespace {
+/**
+ * make_uint8x16_t initializes a SIMD register (uint8x16_t).
+ * This is needed because, incredibly, the syntax uint8x16_t x = {1,2,3...}
+ * is not recognized under Visual Studio! This is a workaround.
+ * Using a std::initializer_list<uint8_t> as a parameter resulted in
+ * inefficient code. With the current approach, if the parameters are
+ * compile-time constants, GNU GCC compiles it to ldr, the same as
+ * uint8x16_t x = {1,2,3...}.
+ * You should not use this function except for compile-time constants:
+ * it is not efficient.
+ */
+really_inline uint8x16_t make_uint8x16_t(uint8_t x1,  uint8_t x2,  uint8_t x3,  uint8_t x4,
+                                         uint8_t x5,  uint8_t x6,  uint8_t x7,  uint8_t x8,
+                                         uint8_t x9,  uint8_t x10, uint8_t x11, uint8_t x12,
+                                         uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16) {
+  // Doing a load like so ends up generating worse code.
+  // uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
+  //                      x9, x10,x11,x12,x13,x14,x15,x16};
+  // return vld1q_u8(array);
+  uint8x16_t x{};
+  // incredibly, Visual Studio does not allow x[0] = x1
+  x = vsetq_lane_u8(x1, x, 0);
+  x = vsetq_lane_u8(x2, x, 1);
+  x = vsetq_lane_u8(x3, x, 2);
+  x = vsetq_lane_u8(x4, x, 3);
+  x = vsetq_lane_u8(x5, x, 4);
+  x = vsetq_lane_u8(x6, x, 5);
+  x = vsetq_lane_u8(x7, x, 6);
+  x = vsetq_lane_u8(x8, x, 7);
+  x = vsetq_lane_u8(x9, x, 8);
+  x = vsetq_lane_u8(x10, x, 9);
+  x = vsetq_lane_u8(x11, x, 10);
+  x = vsetq_lane_u8(x12, x, 11);
+  x = vsetq_lane_u8(x13, x, 12);
+  x = vsetq_lane_u8(x14, x, 13);
+  x = vsetq_lane_u8(x15, x, 14);
+  x = vsetq_lane_u8(x16, x, 15);
+  return x;
+}
+// We have to do the same work for make_int8x16_t
+really_inline int8x16_t make_int8x16_t(int8_t x1,  int8_t x2,  int8_t x3,  int8_t x4,
+                                       int8_t x5,  int8_t x6,  int8_t x7,  int8_t x8,
+                                       int8_t x9,  int8_t x10, int8_t x11, int8_t x12,
+                                       int8_t x13, int8_t x14, int8_t x15, int8_t x16) {
+  // Doing a load like so ends up generating worse code.
+  // int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
+  //                     x9, x10,x11,x12,x13,x14,x15,x16};
+  // return vld1q_s8(array);
+  int8x16_t x{};
+  // incredibly, Visual Studio does not allow x[0] = x1
+  x = vsetq_lane_s8(x1, x, 0);
+  x = vsetq_lane_s8(x2, x, 1);
+  x = vsetq_lane_s8(x3, x, 2);
+  x = vsetq_lane_s8(x4, x, 3);
+  x = vsetq_lane_s8(x5, x, 4);
+  x = vsetq_lane_s8(x6, x, 5);
+  x = vsetq_lane_s8(x7, x, 6);
+  x = vsetq_lane_s8(x8, x, 7);
+  x = vsetq_lane_s8(x9, x, 8);
+  x = vsetq_lane_s8(x10, x, 9);
+  x = vsetq_lane_s8(x11, x, 10);
+  x = vsetq_lane_s8(x12, x, 11);
+  x = vsetq_lane_s8(x13, x, 12);
+  x = vsetq_lane_s8(x14, x, 13);
+  x = vsetq_lane_s8(x15, x, 14);
+  x = vsetq_lane_s8(x16, x, 15);
+  return x;
+}
+} // namespace
+#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
 template<typename T>
 struct simd8;
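
With compile-time constants, the sixteen vsetq_lane_u8 calls above fold into a single load on GCC and Clang, so the workaround costs nothing in the cases simdjson uses it for. A hypothetical check, not in the patch, that the lane-by-lane construction matches an explicit vld1q_u8 load — it assumes make_uint8x16_t from above is in scope (i.e., compiling as regular Visual Studio for ARM64, or with the #ifdef guard removed):

#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

int main() {
  uint8x16_t a = make_uint8x16_t(1, 2, 3, 4, 5, 6, 7, 8,
                                 9, 10, 11, 12, 13, 14, 15, 16);
  const uint8_t bytes[16] = {1, 2, 3, 4, 5, 6, 7, 8,
                             9, 10, 11, 12, 13, 14, 15, 16};
  uint8x16_t b = vld1q_u8(bytes);
  uint8_t out_a[16], out_b[16];
  vst1q_u8(out_a, a); // spill both registers to memory for comparison
  vst1q_u8(out_b, b);
  for (int i = 0; i < 16; i++) {
    if (out_a[i] != out_b[i]) { printf("lane %d differs\n", i); return 1; }
  }
  printf("all 16 lanes match\n");
  return 0;
}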
@@ -792,8 +871,13 @@ namespace simd {
     // We return uint32_t instead of uint16_t because that seems to be more efficient for most
     // purposes (cutting it down to uint16_t costs performance in some compilers).
     really_inline uint32_t to_bitmask() const {
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+      const uint8x16_t bit_mask = make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                                                  0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
+#else
       const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                    0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+#endif
       auto minput = *this & bit_mask;
       uint8x16_t tmp = vpaddq_u8(minput, minput);
       tmp = vpaddq_u8(tmp, tmp);
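
NEON has no direct counterpart to SSE2's movemask, so to_bitmask emulates it: AND each lane with a per-lane bit weight (0x01 through 0x80, repeated for both halves), then fold the vector in half three times with vpaddq_u8 until the sixteen lanes collapse into two bytes that read back as the 16-bit mask. A self-contained sketch of the same trick (names are mine; it assumes each input lane is 0x00 or 0xFF, as produced by NEON comparisons):

#include <arm_neon.h>
#include <cstdint>

static inline uint32_t neon_movemask_sketch(uint8x16_t input) {
  const uint8_t weights[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
                               0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};
  uint8x16_t minput = vandq_u8(input, vld1q_u8(weights)); // one weighted bit per lane
  uint8x16_t tmp = vpaddq_u8(minput, minput); // pairwise add: 16 lanes -> 8 sums
  tmp = vpaddq_u8(tmp, tmp);                  // -> 4 sums
  tmp = vpaddq_u8(tmp, tmp);                  // -> 2 bytes: bits 0-7, bits 8-15
  return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
}

The 64-bit to_bitmask further down applies the same weights to four 16-byte chunks and pairwise-adds across the chunks before extracting a single 64-bit lane.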
@@ -818,6 +902,15 @@ namespace simd {
     // Splat constructor
     really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
     // Member-by-member initialization
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+    really_inline simd8(
+      uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
+      uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
+    ) : simd8(make_uint8x16_t(
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10,v11,v12,v13,v14,v15
+    )) {}
+#else
     really_inline simd8(
       uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
       uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
@@ -825,6 +918,8 @@ namespace simd {
       v0, v1, v2, v3, v4, v5, v6, v7,
       v8, v9, v10,v11,v12,v13,v14,v15
     }) {}
+#endif
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     really_inline static simd8<uint8_t> repeat_16(
       uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
@@ -898,7 +993,11 @@ namespace simd {
     uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]};
     uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64);
     // we increment by 0x08 the second half of the mask
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+    uint8x16_t inc = make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
+#else
     uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
+#endif
     shufmask = vaddq_u8(shufmask, inc);
     // this is the version "nearly pruned"
     uint8x16_t pruned = vqtbl1q_u8(*this, shufmask);
@@ -955,6 +1054,15 @@ namespace simd {
     // Array constructor
     really_inline simd8(const int8_t* values) : simd8(load(values)) {}
     // Member-by-member initialization
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+    really_inline simd8(
+      int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+      int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
+    ) : simd8(make_int8x16_t(
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10,v11,v12,v13,v14,v15
+    )) {}
+#else
     really_inline simd8(
       int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
       int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
@@ -962,6 +1070,7 @@ namespace simd {
       v0, v1, v2, v3, v4, v5, v6, v7,
       v8, v9, v10,v11,v12,v13,v14,v15
     }) {}
+#endif
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     really_inline static simd8<int8_t> repeat_16(
       int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
@@ -977,8 +1086,14 @@ namespace simd {
     really_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, *this); }
     // Explicit conversion to/from unsigned
+    //
+    // Under Visual Studio/ARM64, uint8x16_t and int8x16_t are apparently the same type.
+    // In theory, we could check for this occurrence with std::is_same and std::enable_if,
+    // but it is relatively ugly and hard to read.
+#ifndef SIMDJSON_REGULAR_VISUAL_STUDIO
     really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
-    really_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(*this); }
+#endif
+    really_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }
     // Math
     really_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(*this, other); }
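
The guard is needed because MSVC's ARM64 headers apparently define the 128-bit NEON vector types as aliases of one underlying __n128 type: int8x16_t and uint8x16_t are then the same type, so the constructor from uint8x16_t would be an illegal duplicate of the existing one from int8x16_t, and the old vreinterpretq_u8_s8(*this) relied on an implicit conversion that presumably becomes ambiguous; writing this->value sidesteps the conversion operator. An illustrative probe, not in the patch, using the <type_traits> include added above:

#include <arm_neon.h>
#include <type_traits>

// true on GCC/Clang (distinct vector types, both overloads may coexist);
// false on regular Visual Studio for ARM64 (same underlying type, so the
// uint8x16_t constructor must be compiled out, as done above).
constexpr bool neon_types_distinct =
    !std::is_same<int8x16_t, uint8x16_t>::value;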
@@ -1092,10 +1207,17 @@ namespace simd {
   }
   really_inline uint64_t to_bitmask() const {
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+    const uint8x16_t bit_mask = make_uint8x16_t(
+      0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+      0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+    );
+#else
     const uint8x16_t bit_mask = {
       0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
       0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
     };
+#endif
     // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
     uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask);
     uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask);
@@ -5852,12 +5974,12 @@ struct value128 {
 };
 #if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) && \
-    !defined(_M_X64) // _umul128 for x86, arm, arm64
+    !defined(_M_X64) && !defined(_M_ARM64) // _umul128 for x86, arm
+#if defined(_M_ARM) // this is a slow emulation routine for 32-bit Windows
+//
 static inline uint64_t __emulu(uint32_t x, uint32_t y) {
   return x * (uint64_t)y;
 }
+#endif
 static inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
   uint64_t ad = __emulu((uint32_t)(ab >> 32), (uint32_t)cd);
   uint64_t bd = __emulu((uint32_t)ab, (uint32_t)cd);
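
__emulu(x, y) is the 32x32 -> 64-bit multiply intrinsic on x86; it does not exist on 32-bit ARM, so the patch defines it there, and _umul128 is then emulated by schoolbook multiplication on 32-bit halves: with a = a_hi*2^32 + a_lo and b = b_hi*2^32 + b_lo, a*b = a_hi*b_hi*2^64 + (a_hi*b_lo + a_lo*b_hi)*2^32 + a_lo*b_lo. A self-contained portable version of the same decomposition (names are mine, not simdjson's):

#include <cstdint>

static inline uint64_t umul128_sketch(uint64_t a, uint64_t b, uint64_t *hi) {
  uint64_t a_lo = (uint32_t)a, a_hi = a >> 32;
  uint64_t b_lo = (uint32_t)b, b_hi = b >> 32;
  uint64_t p0 = a_lo * b_lo; // contributes to bits 0..63
  uint64_t p1 = a_lo * b_hi; // contributes to bits 32..95
  uint64_t p2 = a_hi * b_lo; // contributes to bits 32..95
  uint64_t p3 = a_hi * b_hi; // contributes to bits 64..127
  // carry out of the low 64 bits when the two middle terms are added in
  uint64_t carry = ((p0 >> 32) + (uint32_t)p1 + (uint32_t)p2) >> 32;
  *hi = p3 + (p1 >> 32) + (p2 >> 32) + carry;
  return p0 + (p1 << 32) + (p2 << 32);
}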
@@ -5873,8 +5995,14 @@ static inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
 really_inline value128 full_multiplication(uint64_t value1, uint64_t value2) {
   value128 answer;
 #ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
-  answer.low = _umul128(value1, value2, &answer.high);
+#ifdef _M_ARM64
+  // ARM64 has native support for 64-bit multiplications, no need to emulate
+  answer.high = __umulh(value1, value2);
+  answer.low = value1 * value2;
 #else
+  answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64
+#endif // _M_ARM64
+#else // SIMDJSON_REGULAR_VISUAL_STUDIO
   __uint128_t r = ((__uint128_t)value1) * value2;
   answer.low = uint64_t(r);
   answer.high = uint64_t(r >> 64);
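
On ARM64, MSVC exposes __umulh for the high 64 bits of the full product, and an ordinary multiplication supplies the low 64 bits, so no emulation is needed; _umul128 itself is only an x86-64 intrinsic. A hedged sketch of the three-way dispatch this hunk sets up (value128 as defined above; my names, not verbatim simdjson):

#include <cstdint>
#if defined(_MSC_VER)
#include <intrin.h>
#endif

struct value128_sketch { uint64_t low; uint64_t high; };

static inline value128_sketch full_multiplication_sketch(uint64_t v1, uint64_t v2) {
  value128_sketch answer;
#if defined(_MSC_VER) && defined(_M_ARM64)
  answer.high = __umulh(v1, v2); // high 64 bits of the 128-bit product
  answer.low = v1 * v2;          // ordinary multiply keeps the low 64 bits
#elif defined(_MSC_VER) && defined(_M_X64)
  answer.low = _umul128(v1, v2, &answer.high); // x86-64 intrinsic
#else
  __uint128_t r = ((__uint128_t)v1) * v2; // GCC/Clang 128-bit arithmetic
  answer.low = uint64_t(r);
  answer.high = uint64_t(r >> 64);
#endif
  return answer;
}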

singleheader/simdjson.h

@@ -1,4 +1,4 @@
-/* auto-generated on Mon May 4 11:46:14 PDT 2020. Do not edit! */
+/* auto-generated on Tue May 5 20:03:59 EDT 2020. Do not edit! */
 /* begin file include/simdjson.h */
 #ifndef SIMDJSON_H
 #define SIMDJSON_H

src/arm64/bitmanipulation.h

@@ -45,7 +45,7 @@ really_inline int leading_zeroes(uint64_t input_num) {
 /* result might be undefined when input_num is zero */
 really_inline int count_ones(uint64_t input_num) {
-   return vaddv_u8(vcnt_u8((uint8x8_t)input_num));
+   return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
 }
 really_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {

src/arm64/simd.h

@@ -5,11 +5,94 @@
 #include "simdprune_tables.h"
 #include "arm64/bitmanipulation.h"
 #include "arm64/intrinsics.h"
+#include <type_traits>
 namespace simdjson {
 namespace arm64 {
 namespace simd {
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+namespace {
+// Start of private section with Visual Studio workaround
+/**
+ * make_uint8x16_t initializes a SIMD register (uint8x16_t).
+ * This is needed because, incredibly, the syntax uint8x16_t x = {1,2,3...}
+ * is not recognized under Visual Studio! This is a workaround.
+ * Using a std::initializer_list<uint8_t> as a parameter resulted in
+ * inefficient code. With the current approach, if the parameters are
+ * compile-time constants, GNU GCC compiles it to ldr, the same as
+ * uint8x16_t x = {1,2,3...}.
+ * You should not use this function except for compile-time constants:
+ * it is not efficient.
+ */
+really_inline uint8x16_t make_uint8x16_t(uint8_t x1,  uint8_t x2,  uint8_t x3,  uint8_t x4,
+                                         uint8_t x5,  uint8_t x6,  uint8_t x7,  uint8_t x8,
+                                         uint8_t x9,  uint8_t x10, uint8_t x11, uint8_t x12,
+                                         uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16) {
+  // Doing a load like so ends up generating worse code.
+  // uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
+  //                      x9, x10,x11,x12,x13,x14,x15,x16};
+  // return vld1q_u8(array);
+  uint8x16_t x{};
+  // incredibly, Visual Studio does not allow x[0] = x1
+  x = vsetq_lane_u8(x1, x, 0);
+  x = vsetq_lane_u8(x2, x, 1);
+  x = vsetq_lane_u8(x3, x, 2);
+  x = vsetq_lane_u8(x4, x, 3);
+  x = vsetq_lane_u8(x5, x, 4);
+  x = vsetq_lane_u8(x6, x, 5);
+  x = vsetq_lane_u8(x7, x, 6);
+  x = vsetq_lane_u8(x8, x, 7);
+  x = vsetq_lane_u8(x9, x, 8);
+  x = vsetq_lane_u8(x10, x, 9);
+  x = vsetq_lane_u8(x11, x, 10);
+  x = vsetq_lane_u8(x12, x, 11);
+  x = vsetq_lane_u8(x13, x, 12);
+  x = vsetq_lane_u8(x14, x, 13);
+  x = vsetq_lane_u8(x15, x, 14);
+  x = vsetq_lane_u8(x16, x, 15);
+  return x;
+}
+// We have to do the same work for make_int8x16_t
+really_inline int8x16_t make_int8x16_t(int8_t x1,  int8_t x2,  int8_t x3,  int8_t x4,
+                                       int8_t x5,  int8_t x6,  int8_t x7,  int8_t x8,
+                                       int8_t x9,  int8_t x10, int8_t x11, int8_t x12,
+                                       int8_t x13, int8_t x14, int8_t x15, int8_t x16) {
+  // Doing a load like so ends up generating worse code.
+  // int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
+  //                     x9, x10,x11,x12,x13,x14,x15,x16};
+  // return vld1q_s8(array);
+  int8x16_t x{};
+  // incredibly, Visual Studio does not allow x[0] = x1
+  x = vsetq_lane_s8(x1, x, 0);
+  x = vsetq_lane_s8(x2, x, 1);
+  x = vsetq_lane_s8(x3, x, 2);
+  x = vsetq_lane_s8(x4, x, 3);
+  x = vsetq_lane_s8(x5, x, 4);
+  x = vsetq_lane_s8(x6, x, 5);
+  x = vsetq_lane_s8(x7, x, 6);
+  x = vsetq_lane_s8(x8, x, 7);
+  x = vsetq_lane_s8(x9, x, 8);
+  x = vsetq_lane_s8(x10, x, 9);
+  x = vsetq_lane_s8(x11, x, 10);
+  x = vsetq_lane_s8(x12, x, 11);
+  x = vsetq_lane_s8(x13, x, 12);
+  x = vsetq_lane_s8(x14, x, 13);
+  x = vsetq_lane_s8(x15, x, 14);
+  x = vsetq_lane_s8(x16, x, 15);
+  return x;
+}
+// End of private section with Visual Studio workaround
+} // namespace
+#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
 template<typename T>
 struct simd8;
@@ -61,8 +144,13 @@ namespace simd {
     // We return uint32_t instead of uint16_t because that seems to be more efficient for most
     // purposes (cutting it down to uint16_t costs performance in some compilers).
     really_inline uint32_t to_bitmask() const {
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+      const uint8x16_t bit_mask = make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                                                  0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
+#else
       const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                    0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+#endif
       auto minput = *this & bit_mask;
       uint8x16_t tmp = vpaddq_u8(minput, minput);
       tmp = vpaddq_u8(tmp, tmp);
@@ -87,6 +175,15 @@ namespace simd {
     // Splat constructor
     really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
     // Member-by-member initialization
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+    really_inline simd8(
+      uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
+      uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
+    ) : simd8(make_uint8x16_t(
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10,v11,v12,v13,v14,v15
+    )) {}
+#else
     really_inline simd8(
       uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
       uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
@@ -94,6 +191,8 @@ namespace simd {
       v0, v1, v2, v3, v4, v5, v6, v7,
       v8, v9, v10,v11,v12,v13,v14,v15
     }) {}
+#endif
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     really_inline static simd8<uint8_t> repeat_16(
       uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
@@ -167,7 +266,11 @@ namespace simd {
     uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]};
     uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64);
     // we increment by 0x08 the second half of the mask
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+    uint8x16_t inc = make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
+#else
     uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
+#endif
     shufmask = vaddq_u8(shufmask, inc);
     // this is the version "nearly pruned"
     uint8x16_t pruned = vqtbl1q_u8(*this, shufmask);
@@ -224,6 +327,15 @@ namespace simd {
     // Array constructor
     really_inline simd8(const int8_t* values) : simd8(load(values)) {}
     // Member-by-member initialization
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+    really_inline simd8(
+      int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+      int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
+    ) : simd8(make_int8x16_t(
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10,v11,v12,v13,v14,v15
+    )) {}
+#else
     really_inline simd8(
       int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
       int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
@@ -231,6 +343,7 @@ namespace simd {
       v0, v1, v2, v3, v4, v5, v6, v7,
       v8, v9, v10,v11,v12,v13,v14,v15
     }) {}
+#endif
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     really_inline static simd8<int8_t> repeat_16(
       int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
@@ -246,8 +359,14 @@ namespace simd {
     really_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, *this); }
     // Explicit conversion to/from unsigned
+    //
+    // Under Visual Studio/ARM64, uint8x16_t and int8x16_t are apparently the same type.
+    // In theory, we could check for this occurrence with std::is_same and std::enable_if,
+    // but it is relatively ugly and hard to read.
+#ifndef SIMDJSON_REGULAR_VISUAL_STUDIO
     really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
-    really_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(*this); }
+#endif
+    really_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }
     // Math
     really_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(*this, other); }
@@ -361,10 +480,17 @@ namespace simd {
   }
   really_inline uint64_t to_bitmask() const {
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+    const uint8x16_t bit_mask = make_uint8x16_t(
+      0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+      0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+    );
+#else
     const uint8x16_t bit_mask = {
       0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
       0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
     };
+#endif
     // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
     uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask);
     uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask);

src/numberparsing.h

@@ -312,12 +312,12 @@ struct value128 {
 };
 #if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) && \
-    !defined(_M_X64) // _umul128 for x86, arm, arm64
+    !defined(_M_X64) && !defined(_M_ARM64) // _umul128 for x86, arm
+#if defined(_M_ARM) // this is a slow emulation routine for 32-bit Windows
+//
 static inline uint64_t __emulu(uint32_t x, uint32_t y) {
   return x * (uint64_t)y;
 }
+#endif
 static inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
   uint64_t ad = __emulu((uint32_t)(ab >> 32), (uint32_t)cd);
   uint64_t bd = __emulu((uint32_t)ab, (uint32_t)cd);
@@ -333,8 +333,14 @@ static inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
 really_inline value128 full_multiplication(uint64_t value1, uint64_t value2) {
   value128 answer;
 #ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
-  answer.low = _umul128(value1, value2, &answer.high);
+#ifdef _M_ARM64
+  // ARM64 has native support for 64-bit multiplications, no need to emulate
+  answer.high = __umulh(value1, value2);
+  answer.low = value1 * value2;
 #else
+  answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64
+#endif // _M_ARM64
+#else // SIMDJSON_REGULAR_VISUAL_STUDIO
   __uint128_t r = ((__uint128_t)value1) * value2;
   answer.low = uint64_t(r);
   answer.high = uint64_t(r >> 64);