Compile under Visual Studio for ARM64 (#861)
* Modifications so that we can compile under Visual Studio for ARM64 * Let us throw appveyor at this beast.
This commit is contained in:
parent
0e6bd2224f
commit
3c3a4db54e
|
@ -7,30 +7,37 @@ platform: x64
|
|||
environment:
|
||||
matrix:
|
||||
- job_name: VS2019
|
||||
CMAKE_ARGS:
|
||||
CMAKE_ARGS: -A %Platform%
|
||||
- job_name: VS2019CLANG
|
||||
CMAKE_ARGS: -T ClangCL
|
||||
CMAKE_ARGS: -A %Platform% -T ClangCL
|
||||
- job_name: VS2019ARM
|
||||
CMAKE_ARGS: -A ARM64 -DCMAKE_CROSSCOMPILING=1 -D SIMDJSON_GOOGLE_BENCHMARKS=OFF # Does Google Benchmark builds under VS ARM?
|
||||
- job_name: VS2017 (Static, No Threads)
|
||||
image: Visual Studio 2017
|
||||
CMAKE_ARGS: -DSIMDJSON_BUILD_STATIC=ON -DSIMDJSON_ENABLE_THREADS=OFF
|
||||
CMAKE_ARGS: -A %Platform% -DSIMDJSON_BUILD_STATIC=ON -DSIMDJSON_ENABLE_THREADS=OFF
|
||||
CTEST_ARGS: -E checkperf
|
||||
- job_name: VS2019 (Win32)
|
||||
platform: Win32
|
||||
CMAKE_ARGS: -DSIMDJSON_BUILD_STATIC=OFF -DSIMDJSON_ENABLE_THREADS=ON # This should be the default. Testing anyway.
|
||||
CMAKE_ARGS: -A %Platform% -DSIMDJSON_BUILD_STATIC=OFF -DSIMDJSON_ENABLE_THREADS=ON # This should be the default. Testing anyway.
|
||||
CTEST_ARGS: -E checkperf
|
||||
|
||||
build_script:
|
||||
- set
|
||||
- mkdir build
|
||||
- cd build
|
||||
- cmake --version
|
||||
- cmake -A %Platform% %CMAKE_ARGS% --parallel ..
|
||||
- cmake %CMAKE_ARGS% --parallel ..
|
||||
- cmake -LH ..
|
||||
- cmake --build . --config %Configuration% --verbose --parallel
|
||||
|
||||
test_script:
|
||||
- ctest --output-on-failure -C %Configuration% --verbose %CTEST_ARGS% --parallel
|
||||
|
||||
for:
|
||||
-
|
||||
matrix:
|
||||
except:
|
||||
- job_name: VS2019ARM
|
||||
|
||||
test_script:
|
||||
- ctest --output-on-failure -C %Configuration% --verbose %CTEST_ARGS% --parallel
|
||||
|
||||
clone_folder: c:\projects\simdjson
|
||||
|
||||
matrix:
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* auto-generated on Mon 27 Apr 2020 21:20:37 EDT. Do not edit! */
|
||||
/* auto-generated on Tue May 5 20:03:59 EDT 2020. Do not edit! */
|
||||
|
||||
#include <iostream>
|
||||
#include "simdjson.h"
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* auto-generated on Mon May 4 11:46:14 PDT 2020. Do not edit! */
|
||||
/* auto-generated on Tue May 5 20:03:59 EDT 2020. Do not edit! */
|
||||
/* begin file src/simdjson.cpp */
|
||||
#include "simdjson.h"
|
||||
|
||||
|
@ -708,7 +708,7 @@ really_inline int leading_zeroes(uint64_t input_num) {
|
|||
|
||||
/* result might be undefined when input_num is zero */
|
||||
really_inline int count_ones(uint64_t input_num) {
|
||||
return vaddv_u8(vcnt_u8((uint8x8_t)input_num));
|
||||
return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
|
||||
}
|
||||
|
||||
really_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
|
||||
|
@ -736,11 +736,90 @@ really_inline bool mul_overflow(uint64_t value1, uint64_t value2, uint64_t *resu
|
|||
#endif // SIMDJSON_ARM64_BITMANIPULATION_H
|
||||
/* end file src/arm64/bitmanipulation.h */
|
||||
/* arm64/intrinsics.h already included: #include "arm64/intrinsics.h" */
|
||||
#include <type_traits>
|
||||
|
||||
|
||||
namespace simdjson {
|
||||
namespace arm64 {
|
||||
namespace simd {
|
||||
|
||||
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
||||
namespace {
|
||||
/**
|
||||
* make_uint8x16_t initializes a SIMD register (uint8x16_t).
|
||||
* This is needed because, incredibly, the syntax uint8x16_t x = {1,2,3...}
|
||||
* is not recognized under Visual Studio! This is a workaround.
|
||||
* Using a std::initializer_list<uint8_t> as a parameter resulted in
|
||||
* inefficient code. With the current approach, if the parameters are
|
||||
* compile-time constants,
|
||||
* GNU GCC compiles it to ldr, the same as uint8x16_t x = {1,2,3...}.
|
||||
* You should not use this function except for compile-time constant:
|
||||
* it is not efficient.
|
||||
*/
|
||||
really_inline uint8x16_t make_uint8x16_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4,
|
||||
uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8,
|
||||
uint8_t x9, uint8_t x10, uint8_t x11, uint8_t x12,
|
||||
uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16) {
|
||||
// Doing a load like so end ups generating worse code.
|
||||
// uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
|
||||
// x9, x10,x11,x12,x13,x14,x15,x16};
|
||||
// return vld1q_u8(array);
|
||||
uint8x16_t x{};
|
||||
// incredibly, Visual Studio does not allow x[0] = x1
|
||||
x = vsetq_lane_u8(x1, x, 0);
|
||||
x = vsetq_lane_u8(x2, x, 1);
|
||||
x = vsetq_lane_u8(x3, x, 2);
|
||||
x = vsetq_lane_u8(x4, x, 3);
|
||||
x = vsetq_lane_u8(x5, x, 4);
|
||||
x = vsetq_lane_u8(x6, x, 5);
|
||||
x = vsetq_lane_u8(x7, x, 6);
|
||||
x = vsetq_lane_u8(x8, x, 7);
|
||||
x = vsetq_lane_u8(x9, x, 8);
|
||||
x = vsetq_lane_u8(x10, x, 9);
|
||||
x = vsetq_lane_u8(x11, x, 10);
|
||||
x = vsetq_lane_u8(x12, x, 11);
|
||||
x = vsetq_lane_u8(x13, x, 12);
|
||||
x = vsetq_lane_u8(x14, x, 13);
|
||||
x = vsetq_lane_u8(x15, x, 14);
|
||||
x = vsetq_lane_u8(x16, x, 15);
|
||||
return x;
|
||||
}
|
||||
|
||||
|
||||
// We have to do the same work for make_int8x16_t
|
||||
really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_t x4,
|
||||
int8_t x5, int8_t x6, int8_t x7, int8_t x8,
|
||||
int8_t x9, int8_t x10, int8_t x11, int8_t x12,
|
||||
int8_t x13, int8_t x14, int8_t x15, int8_t x16) {
|
||||
// Doing a load like so end ups generating worse code.
|
||||
// int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
|
||||
// x9, x10,x11,x12,x13,x14,x15,x16};
|
||||
// return vld1q_s8(array);
|
||||
int8x16_t x{};
|
||||
// incredibly, Visual Studio does not allow x[0] = x1
|
||||
x = vsetq_lane_s8(x1, x, 0);
|
||||
x = vsetq_lane_s8(x2, x, 1);
|
||||
x = vsetq_lane_s8(x3, x, 2);
|
||||
x = vsetq_lane_s8(x4, x, 3);
|
||||
x = vsetq_lane_s8(x5, x, 4);
|
||||
x = vsetq_lane_s8(x6, x, 5);
|
||||
x = vsetq_lane_s8(x7, x, 6);
|
||||
x = vsetq_lane_s8(x8, x, 7);
|
||||
x = vsetq_lane_s8(x9, x, 8);
|
||||
x = vsetq_lane_s8(x10, x, 9);
|
||||
x = vsetq_lane_s8(x11, x, 10);
|
||||
x = vsetq_lane_s8(x12, x, 11);
|
||||
x = vsetq_lane_s8(x13, x, 12);
|
||||
x = vsetq_lane_s8(x14, x, 13);
|
||||
x = vsetq_lane_s8(x15, x, 14);
|
||||
x = vsetq_lane_s8(x16, x, 15);
|
||||
return x;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
|
||||
|
||||
|
||||
template<typename T>
|
||||
struct simd8;
|
||||
|
||||
|
@ -792,8 +871,13 @@ namespace simd {
|
|||
// We return uint32_t instead of uint16_t because that seems to be more efficient for most
|
||||
// purposes (cutting it down to uint16_t costs performance in some compilers).
|
||||
really_inline uint32_t to_bitmask() const {
|
||||
const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
||||
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
||||
const uint8x16_t bit_mask = make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
||||
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
|
||||
#else
|
||||
const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
||||
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
|
||||
#endif
|
||||
auto minput = *this & bit_mask;
|
||||
uint8x16_t tmp = vpaddq_u8(minput, minput);
|
||||
tmp = vpaddq_u8(tmp, tmp);
|
||||
|
@ -818,6 +902,15 @@ namespace simd {
|
|||
// Splat constructor
|
||||
really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
|
||||
// Member-by-member initialization
|
||||
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
||||
really_inline simd8(
|
||||
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
||||
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
||||
) : simd8(make_uint8x16_t(
|
||||
v0, v1, v2, v3, v4, v5, v6, v7,
|
||||
v8, v9, v10,v11,v12,v13,v14,v15
|
||||
)) {}
|
||||
#else
|
||||
really_inline simd8(
|
||||
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
||||
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
||||
|
@ -825,6 +918,8 @@ namespace simd {
|
|||
v0, v1, v2, v3, v4, v5, v6, v7,
|
||||
v8, v9, v10,v11,v12,v13,v14,v15
|
||||
}) {}
|
||||
#endif
|
||||
|
||||
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
||||
really_inline static simd8<uint8_t> repeat_16(
|
||||
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
||||
|
@ -898,7 +993,11 @@ namespace simd {
|
|||
uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]};
|
||||
uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64);
|
||||
// we increment by 0x08 the second half of the mask
|
||||
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
||||
uint8x16_t inc = make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
|
||||
#else
|
||||
uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
|
||||
#endif
|
||||
shufmask = vaddq_u8(shufmask, inc);
|
||||
// this is the version "nearly pruned"
|
||||
uint8x16_t pruned = vqtbl1q_u8(*this, shufmask);
|
||||
|
@ -955,6 +1054,15 @@ namespace simd {
|
|||
// Array constructor
|
||||
really_inline simd8(const int8_t* values) : simd8(load(values)) {}
|
||||
// Member-by-member initialization
|
||||
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
||||
really_inline simd8(
|
||||
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
||||
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
||||
) : simd8(make_int8x16_t(
|
||||
v0, v1, v2, v3, v4, v5, v6, v7,
|
||||
v8, v9, v10,v11,v12,v13,v14,v15
|
||||
)) {}
|
||||
#else
|
||||
really_inline simd8(
|
||||
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
||||
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
||||
|
@ -962,6 +1070,7 @@ namespace simd {
|
|||
v0, v1, v2, v3, v4, v5, v6, v7,
|
||||
v8, v9, v10,v11,v12,v13,v14,v15
|
||||
}) {}
|
||||
#endif
|
||||
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
||||
really_inline static simd8<int8_t> repeat_16(
|
||||
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
||||
|
@ -977,8 +1086,14 @@ namespace simd {
|
|||
really_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, *this); }
|
||||
|
||||
// Explicit conversion to/from unsigned
|
||||
//
|
||||
// Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same type.
|
||||
// In theory, we could check this occurence with std::same_as and std::enabled_if but it is C++14
|
||||
// and relatively ugly and hard to read.
|
||||
#ifndef SIMDJSON_REGULAR_VISUAL_STUDIO
|
||||
really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
|
||||
really_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(*this); }
|
||||
#endif
|
||||
really_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }
|
||||
|
||||
// Math
|
||||
really_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(*this, other); }
|
||||
|
@ -1092,10 +1207,17 @@ namespace simd {
|
|||
}
|
||||
|
||||
really_inline uint64_t to_bitmask() const {
|
||||
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
||||
const uint8x16_t bit_mask = make_uint8x16_t(
|
||||
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
||||
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
|
||||
);
|
||||
#else
|
||||
const uint8x16_t bit_mask = {
|
||||
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
||||
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
|
||||
};
|
||||
#endif
|
||||
// Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
|
||||
uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask);
|
||||
uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask);
|
||||
|
@ -5852,12 +5974,12 @@ struct value128 {
|
|||
};
|
||||
|
||||
#if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) && \
|
||||
!defined(_M_X64) // _umul128 for x86, arm, arm64
|
||||
#if defined(_M_ARM)
|
||||
!defined(_M_X64) && !defined(_M_ARM64)// _umul128 for x86, arm
|
||||
// this is a slow emulation routine for 32-bit Windows
|
||||
//
|
||||
static inline uint64_t __emulu(uint32_t x, uint32_t y) {
|
||||
return x * (uint64_t)y;
|
||||
}
|
||||
#endif
|
||||
static inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
|
||||
uint64_t ad = __emulu((uint32_t)(ab >> 32), (uint32_t)cd);
|
||||
uint64_t bd = __emulu((uint32_t)ab, (uint32_t)cd);
|
||||
|
@ -5873,8 +5995,14 @@ static inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
|
|||
really_inline value128 full_multiplication(uint64_t value1, uint64_t value2) {
|
||||
value128 answer;
|
||||
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
||||
answer.low = _umul128(value1, value2, &answer.high);
|
||||
#ifdef _M_ARM64
|
||||
// ARM64 has native support for 64-bit multiplications, no need to emultate
|
||||
answer.high = __umulh(value1, value2);
|
||||
answer.low = value1 * value2;
|
||||
#else
|
||||
answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64
|
||||
#endif // _M_ARM64
|
||||
#else // SIMDJSON_REGULAR_VISUAL_STUDIO
|
||||
__uint128_t r = ((__uint128_t)value1) * value2;
|
||||
answer.low = uint64_t(r);
|
||||
answer.high = uint64_t(r >> 64);
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* auto-generated on Mon May 4 11:46:14 PDT 2020. Do not edit! */
|
||||
/* auto-generated on Tue May 5 20:03:59 EDT 2020. Do not edit! */
|
||||
/* begin file include/simdjson.h */
|
||||
#ifndef SIMDJSON_H
|
||||
#define SIMDJSON_H
|
||||
|
|
|
@ -45,7 +45,7 @@ really_inline int leading_zeroes(uint64_t input_num) {
|
|||
|
||||
/* result might be undefined when input_num is zero */
|
||||
really_inline int count_ones(uint64_t input_num) {
|
||||
return vaddv_u8(vcnt_u8((uint8x8_t)input_num));
|
||||
return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
|
||||
}
|
||||
|
||||
really_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
|
||||
|
|
132
src/arm64/simd.h
132
src/arm64/simd.h
|
@ -5,11 +5,94 @@
|
|||
#include "simdprune_tables.h"
|
||||
#include "arm64/bitmanipulation.h"
|
||||
#include "arm64/intrinsics.h"
|
||||
#include <type_traits>
|
||||
|
||||
|
||||
namespace simdjson {
|
||||
namespace arm64 {
|
||||
namespace simd {
|
||||
|
||||
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
||||
namespace {
|
||||
// Start of private section with Visual Studio workaround
|
||||
|
||||
|
||||
/**
|
||||
* make_uint8x16_t initializes a SIMD register (uint8x16_t).
|
||||
* This is needed because, incredibly, the syntax uint8x16_t x = {1,2,3...}
|
||||
* is not recognized under Visual Studio! This is a workaround.
|
||||
* Using a std::initializer_list<uint8_t> as a parameter resulted in
|
||||
* inefficient code. With the current approach, if the parameters are
|
||||
* compile-time constants,
|
||||
* GNU GCC compiles it to ldr, the same as uint8x16_t x = {1,2,3...}.
|
||||
* You should not use this function except for compile-time constants:
|
||||
* it is not efficient.
|
||||
*/
|
||||
really_inline uint8x16_t make_uint8x16_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4,
|
||||
uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8,
|
||||
uint8_t x9, uint8_t x10, uint8_t x11, uint8_t x12,
|
||||
uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16) {
|
||||
// Doing a load like so end ups generating worse code.
|
||||
// uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
|
||||
// x9, x10,x11,x12,x13,x14,x15,x16};
|
||||
// return vld1q_u8(array);
|
||||
uint8x16_t x{};
|
||||
// incredibly, Visual Studio does not allow x[0] = x1
|
||||
x = vsetq_lane_u8(x1, x, 0);
|
||||
x = vsetq_lane_u8(x2, x, 1);
|
||||
x = vsetq_lane_u8(x3, x, 2);
|
||||
x = vsetq_lane_u8(x4, x, 3);
|
||||
x = vsetq_lane_u8(x5, x, 4);
|
||||
x = vsetq_lane_u8(x6, x, 5);
|
||||
x = vsetq_lane_u8(x7, x, 6);
|
||||
x = vsetq_lane_u8(x8, x, 7);
|
||||
x = vsetq_lane_u8(x9, x, 8);
|
||||
x = vsetq_lane_u8(x10, x, 9);
|
||||
x = vsetq_lane_u8(x11, x, 10);
|
||||
x = vsetq_lane_u8(x12, x, 11);
|
||||
x = vsetq_lane_u8(x13, x, 12);
|
||||
x = vsetq_lane_u8(x14, x, 13);
|
||||
x = vsetq_lane_u8(x15, x, 14);
|
||||
x = vsetq_lane_u8(x16, x, 15);
|
||||
return x;
|
||||
}
|
||||
|
||||
|
||||
// We have to do the same work for make_int8x16_t
|
||||
really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_t x4,
|
||||
int8_t x5, int8_t x6, int8_t x7, int8_t x8,
|
||||
int8_t x9, int8_t x10, int8_t x11, int8_t x12,
|
||||
int8_t x13, int8_t x14, int8_t x15, int8_t x16) {
|
||||
// Doing a load like so end ups generating worse code.
|
||||
// int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
|
||||
// x9, x10,x11,x12,x13,x14,x15,x16};
|
||||
// return vld1q_s8(array);
|
||||
int8x16_t x{};
|
||||
// incredibly, Visual Studio does not allow x[0] = x1
|
||||
x = vsetq_lane_s8(x1, x, 0);
|
||||
x = vsetq_lane_s8(x2, x, 1);
|
||||
x = vsetq_lane_s8(x3, x, 2);
|
||||
x = vsetq_lane_s8(x4, x, 3);
|
||||
x = vsetq_lane_s8(x5, x, 4);
|
||||
x = vsetq_lane_s8(x6, x, 5);
|
||||
x = vsetq_lane_s8(x7, x, 6);
|
||||
x = vsetq_lane_s8(x8, x, 7);
|
||||
x = vsetq_lane_s8(x9, x, 8);
|
||||
x = vsetq_lane_s8(x10, x, 9);
|
||||
x = vsetq_lane_s8(x11, x, 10);
|
||||
x = vsetq_lane_s8(x12, x, 11);
|
||||
x = vsetq_lane_s8(x13, x, 12);
|
||||
x = vsetq_lane_s8(x14, x, 13);
|
||||
x = vsetq_lane_s8(x15, x, 14);
|
||||
x = vsetq_lane_s8(x16, x, 15);
|
||||
return x;
|
||||
}
|
||||
|
||||
// End of private section with Visual Studio workaround
|
||||
} // namespace
|
||||
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
|
||||
|
||||
|
||||
template<typename T>
|
||||
struct simd8;
|
||||
|
||||
|
@ -61,8 +144,13 @@ namespace simd {
|
|||
// We return uint32_t instead of uint16_t because that seems to be more efficient for most
|
||||
// purposes (cutting it down to uint16_t costs performance in some compilers).
|
||||
really_inline uint32_t to_bitmask() const {
|
||||
const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
||||
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
|
||||
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
||||
const uint8x16_t bit_mask = make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
||||
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
|
||||
#else
|
||||
const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
||||
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
|
||||
#endif
|
||||
auto minput = *this & bit_mask;
|
||||
uint8x16_t tmp = vpaddq_u8(minput, minput);
|
||||
tmp = vpaddq_u8(tmp, tmp);
|
||||
|
@ -87,6 +175,15 @@ namespace simd {
|
|||
// Splat constructor
|
||||
really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
|
||||
// Member-by-member initialization
|
||||
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
||||
really_inline simd8(
|
||||
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
||||
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
||||
) : simd8(make_uint8x16_t(
|
||||
v0, v1, v2, v3, v4, v5, v6, v7,
|
||||
v8, v9, v10,v11,v12,v13,v14,v15
|
||||
)) {}
|
||||
#else
|
||||
really_inline simd8(
|
||||
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
||||
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
||||
|
@ -94,6 +191,8 @@ namespace simd {
|
|||
v0, v1, v2, v3, v4, v5, v6, v7,
|
||||
v8, v9, v10,v11,v12,v13,v14,v15
|
||||
}) {}
|
||||
#endif
|
||||
|
||||
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
||||
really_inline static simd8<uint8_t> repeat_16(
|
||||
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
||||
|
@ -167,7 +266,11 @@ namespace simd {
|
|||
uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]};
|
||||
uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64);
|
||||
// we increment by 0x08 the second half of the mask
|
||||
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
||||
uint8x16_t inc = make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
|
||||
#else
|
||||
uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
|
||||
#endif
|
||||
shufmask = vaddq_u8(shufmask, inc);
|
||||
// this is the version "nearly pruned"
|
||||
uint8x16_t pruned = vqtbl1q_u8(*this, shufmask);
|
||||
|
@ -224,6 +327,15 @@ namespace simd {
|
|||
// Array constructor
|
||||
really_inline simd8(const int8_t* values) : simd8(load(values)) {}
|
||||
// Member-by-member initialization
|
||||
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
||||
really_inline simd8(
|
||||
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
||||
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
||||
) : simd8(make_int8x16_t(
|
||||
v0, v1, v2, v3, v4, v5, v6, v7,
|
||||
v8, v9, v10,v11,v12,v13,v14,v15
|
||||
)) {}
|
||||
#else
|
||||
really_inline simd8(
|
||||
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
||||
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
||||
|
@ -231,6 +343,7 @@ namespace simd {
|
|||
v0, v1, v2, v3, v4, v5, v6, v7,
|
||||
v8, v9, v10,v11,v12,v13,v14,v15
|
||||
}) {}
|
||||
#endif
|
||||
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
||||
really_inline static simd8<int8_t> repeat_16(
|
||||
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
||||
|
@ -246,8 +359,14 @@ namespace simd {
|
|||
really_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, *this); }
|
||||
|
||||
// Explicit conversion to/from unsigned
|
||||
//
|
||||
// Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same type.
|
||||
// In theory, we could check this occurence with std::same_as and std::enabled_if but it is C++14
|
||||
// and relatively ugly and hard to read.
|
||||
#ifndef SIMDJSON_REGULAR_VISUAL_STUDIO
|
||||
really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
|
||||
really_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(*this); }
|
||||
#endif
|
||||
really_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }
|
||||
|
||||
// Math
|
||||
really_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(*this, other); }
|
||||
|
@ -361,10 +480,17 @@ namespace simd {
|
|||
}
|
||||
|
||||
really_inline uint64_t to_bitmask() const {
|
||||
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
||||
const uint8x16_t bit_mask = make_uint8x16_t(
|
||||
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
||||
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
|
||||
);
|
||||
#else
|
||||
const uint8x16_t bit_mask = {
|
||||
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
||||
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
|
||||
};
|
||||
#endif
|
||||
// Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
|
||||
uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask);
|
||||
uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask);
|
||||
|
|
|
@ -312,12 +312,12 @@ struct value128 {
|
|||
};
|
||||
|
||||
#if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) && \
|
||||
!defined(_M_X64) // _umul128 for x86, arm, arm64
|
||||
#if defined(_M_ARM)
|
||||
!defined(_M_X64) && !defined(_M_ARM64)// _umul128 for x86, arm
|
||||
// this is a slow emulation routine for 32-bit Windows
|
||||
//
|
||||
static inline uint64_t __emulu(uint32_t x, uint32_t y) {
|
||||
return x * (uint64_t)y;
|
||||
}
|
||||
#endif
|
||||
static inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
|
||||
uint64_t ad = __emulu((uint32_t)(ab >> 32), (uint32_t)cd);
|
||||
uint64_t bd = __emulu((uint32_t)ab, (uint32_t)cd);
|
||||
|
@ -333,8 +333,14 @@ static inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
|
|||
really_inline value128 full_multiplication(uint64_t value1, uint64_t value2) {
|
||||
value128 answer;
|
||||
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
||||
answer.low = _umul128(value1, value2, &answer.high);
|
||||
#ifdef _M_ARM64
|
||||
// ARM64 has native support for 64-bit multiplications, no need to emultate
|
||||
answer.high = __umulh(value1, value2);
|
||||
answer.low = value1 * value2;
|
||||
#else
|
||||
answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64
|
||||
#endif // _M_ARM64
|
||||
#else // SIMDJSON_REGULAR_VISUAL_STUDIO
|
||||
__uint128_t r = ((__uint128_t)value1) * value2;
|
||||
answer.low = uint64_t(r);
|
||||
answer.high = uint64_t(r >> 64);
|
||||
|
|
Loading…
Reference in New Issue