Merge pull request #202 from lemire/sse_integration

SSE integration (PR#139)
This commit is contained in:
ioioioio 2019-07-05 12:25:06 -04:00 committed by GitHub
commit 3bd3116cf8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
17 changed files with 572 additions and 87 deletions

View File

@ -9,9 +9,14 @@ clone_folder: c:\projects\simdjson
platform:
- x64
environment:
matrix:
- AVXFLAG: "OFF"
- AVXFLAG: "ON"
build_script:
- mkdir build
- cd build
- ps: cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..
- ps: cmake -DSIMDJSON_DISABLE_AVX="$env:AVXFLAG" -DCMAKE_GENERATOR_PLATFORM=x64 ..
- cmake --build .
- ctest --verbose

View File

@ -38,6 +38,44 @@ jobs:
cd build
make test
"gccnoavx":
docker:
- image: ubuntu:18.04
environment:
CXX: g++-7
steps:
- checkout
- run: apt-get update -qq
- run: >
apt-get install -y
build-essential
cmake
g++-7
git
- run:
name: Building (gcc)
command: ARCHFLAGS="-march=nehalem" make
- run:
name: Running tests (gcc)
command: ARCHFLAGS="-march=nehalem" make quiettest amalgamate
- run:
name: Building (gcc, cmake)
command: |
mkdir build
cd build
cmake -DSIMDJSON_DISABLE_AVX=on ..
make
- run:
name: Running tests (gcc, cmake)
command: |
cd build
make test
"clang":
docker:
- image: ubuntu:18.04
@ -76,9 +114,49 @@ jobs:
cd build
make test
"clangnoavx":
docker:
- image: ubuntu:18.04
environment:
CXX: clang++-6.0
steps:
- checkout
- run: apt-get update -qq
- run: >
apt-get install -y
build-essential
cmake
clang-6.0
git
- run:
name: Building (clang)
command: ARCHFLAGS="-march=nehalem" make
- run:
name: Running tests (clang)
command: ARCHFLAGS="-march=nehalem" make quiettest amalgamate
- run:
name: Building (clang, cmake)
command: |
mkdir build
cd build
cmake -DSIMDJSON_DISABLE_AVX=on ..
make
- run:
name: Running tests (clang, cmake)
command: |
cd build
make test
workflows:
version: 2
build_and_test:
jobs:
- "clang"
- "gcc"
- "clangnoavx"
- "gccnoavx"

View File

@ -19,3 +19,6 @@ script:
- make test
- make clean
- make SANITIZEGOLD=1 test
- make clean
- ARCHFLAGS="-march=nehalem" make
- ARCHFLAGS="-march=nehalem" make test

View File

@ -5,6 +5,9 @@ if(ltoresult)
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
endif()
# usage: cmake -DSIMDJSON_DISABLE_AVX=on ..
option(SIMDJSON_DISABLE_AVX "Forcefully disable AVX even if hardware supports it" OFF)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_MACOSX_RPATH OFF)

View File

@ -20,5 +20,6 @@ Tom Dyson
Ihor Dotsenko
Alexey Milovidov
Chang Liu
Sunny Gleason
# if you have contributed to the project and your name does not
# appear in this list, please let us know!

View File

@ -9,16 +9,22 @@ COREDEPSINCLUDE = -Idependencies/rapidjson/include -Idependencies/sajson/include
EXTRADEPSINCLUDE = -Idependencies/jsoncppdist -Idependencies/json11 -Idependencies/fastjson/src -Idependencies/fastjson/include -Idependencies/gason/src -Idependencies/ujson4c/3rdparty -Idependencies/ujson4c/src
# users can provide their own additional flags with make EXTRAFLAGS=something
architecture:=$(shell arch)
CXXFLAGS = -std=c++17 -Wall -Wextra -Wshadow -Iinclude -Ibenchmark/linux $(EXTRAFLAGS)
CFLAGS = -Idependencies/ujson4c/3rdparty -Idependencies/ujson4c/src $(EXTRAFLAGS)
####
# If you want to specify your own target architecture,
# then define ARCHFLAGS. Otherwise, we set good default.
# E.g., type ' ARCHFLAGS="-march=nehalem" make parse '
###
ifeq ($(architecture),aarch64)
CXXFLAGS += -march=armv8-a+crc+crypto
CFLAGS += -march=armv8-a+crc+crypto
ARCHFLAGS ?= -march=armv8-a+crc+crypto
else
CXXFLAGS += -march=native
CFLAGS += -march=native
ARCHFLAGS ?= -march=native
endif
CXXFLAGS = $(ARCHFLAGS) -std=c++17 -Wall -Wextra -Wshadow -Iinclude -Ibenchmark/linux $(EXTRAFLAGS)
CFLAGS = $(ARCHFLAGS) -Idependencies/ujson4c/3rdparty -Idependencies/ujson4c/src $(EXTRAFLAGS)
# This is a convenience flag
ifdef SANITIZEGOLD
SANITIZE = 1

View File

@ -52,7 +52,7 @@ On a Skylake processor, the parsing speeds (in GB/s) of various processors on th
## Requirements
- We support platforms like Linux or macOS, as well as Windows through Visual Studio 2017 or later.
- A processor with AVX2 (i.e., Intel processors starting with the Haswell microarchitecture released 2013 and AMD processors starting with the Zen microarchitecture released 2017).
- A processor with AVX2 (i.e., Intel processors starting with the Haswell microarchitecture released 2013 and AMD processors starting with the Zen microarchitecture released 2017) or at least SSE 4.2 (i.e., Intel processors going back to Nehalem released in 2008 or AMD processors starting with the Jaguar used in the PS4 and XBox One).
- A recent C++ compiler (e.g., GNU GCC or LLVM CLANG or Visual Studio 2017), we assume C++17. GNU GCC 7 or better or LLVM's clang 6 or better.
- Some benchmark scripts assume bash and other common utilities, but they are optional.
@ -169,7 +169,7 @@ int main(int argc, char *argv[]) {
}
```
We require hardware support for AVX2 instructions. You have to make sure that you instruct your
On Intel and AMD processors, we get best performance by using the hardware support for AVX2 instructions. You have to make sure that you instruct your
compiler to use these instructions as needed. Under compilers such as GNU GCC or LLVM clang, the
flag `-march=native` used on a recent Intel processor (Haswell or better) is sufficient. For portability
of the binary files you can also specify directly the Haswell processor (`-march=haswell`). You may
@ -261,14 +261,15 @@ make test
## Usage (CMake on Windows using Visual Studio)
We assume you have a common Windows PC with at least Visual Studio 2017 and an x64 processor with AVX2 support (2013 Intel Haswell or later).
We assume you have a common Windows PC with at least Visual Studio 2017 and an x64 processor with AVX2 support (2013 Intel Haswell or later) or SSE 4.2 (2008 Nehalem or later).
- Grab the simdjson code from GitHub, e.g., by cloning it using [GitHub Desktop](https://desktop.github.com/).
- Install [CMake](https://cmake.org/download/). When you install it, make sure to ask that `cmake` be made available from the command line. Please choose a recent version of cmake.
- Create a subdirectory within simdjson, such as `VisualStudio`.
- Using a shell, go to this newly created directory.
- Type `cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..` in the shell while in the `VisualStudio` repository. (Alternatively, if you want to build a DLL, you may use the command line `cmake -DCMAKE_GENERATOR_PLATFORM=x64 -DSIMDJSON_BUILD_STATIC=OFF ..`.)
- This last command created a Visual Studio solution file in the newly created directory (e.g., `simdjson.sln`). Open this file in Visual Studio. You should now be able to build the project and run the tests. For example, in the `Solution Explorer` window (available from the `View` menu), right-click `ALL_BUILD` and select `Build`. To test the code, still in the `Solution Explorer` window, select `RUN_TESTS` and select `Build`.
- Type `cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..` in the shell while in the `VisualStudio` repository. (Alternatively, if you want to build a DLL, you may use the command line `cmake -DCMAKE_GENERATOR_PLATFORM=x64 -DSIMDJSON_BUILD_STATIC=OFF ..`.) This will build the code with AVX2 instructions. If your target processor does not support AVX2, you need to replace `cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..` by `cmake -DSIMDJSON_DISABLE_AVX=on -DCMAKE_GENERATOR_PLATFORM=x64 ..` . That is, you need to set the flag to forcefully disable AVX support since we compile with AVX2 instructions *by default*.
- This last command (`cmake ...`) created a Visual Studio solution file in the newly created directory (e.g., `simdjson.sln`). Open this file in Visual Studio. You should now be able to build the project and run the tests. For example, in the `Solution Explorer` window (available from the `View` menu), right-click `ALL_BUILD` and select `Build`. To test the code, still in the `Solution Explorer` window, select `RUN_TESTS` and select `Build`.
## Usage (Using `vcpkg` on Windows, Linux and MacOS)

View File

@ -5,9 +5,11 @@
#include <cstdint>
namespace simdjson {
// Take input from buf and remove useless whitespace, write it to out; buf and
// out can be the same pointer. Result is null terminated,
// return the string length (minus the null termination).
// The accelerated version of this function only runs on AVX2 hardware.
size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out);

View File

@ -114,7 +114,7 @@ is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) {
return structural_or_whitespace_or_exponent_or_decimal_negated[c];
}
#ifdef __AVX2__
#if defined (__AVX2__) || defined (__SSE4_2__)
#define SWAR_NUMBER_PARSING
#endif

View File

@ -40,7 +40,7 @@ static inline int hamming(uint64_t input_num) {
#include <cstdint>
#include <cstdlib>
#if defined(__BMI2__) || defined(__POPCOUNT__) || defined(__AVX2__)
#if defined(__BMI2__) || defined(__POPCOUNT__) || defined(__AVX2__) || defined(__SSE4_2__)
#include <x86intrin.h>
#endif
namespace simdjson {

View File

@ -12,7 +12,7 @@ enum class instruction_set {
// the 'native' enum class value should point at a good default on the current machine
#ifdef __AVX2__
native = avx2
#elif defined(__ARM_NEON)
#elif defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
native = neon
#else
// Let us assume that we have an old x64 processor, but one that has SSE (i.e., something

View File

@ -27,6 +27,138 @@
// all byte values must be no larger than 0xF4
namespace simdjson {
// all byte values must be no larger than 0xF4
static inline void checkSmallerThan0xF4(__m128i current_bytes,
__m128i *has_error) {
// unsigned, saturates to 0 below max
*has_error = _mm_or_si128(*has_error,
_mm_subs_epu8(current_bytes, _mm_set1_epi8(0xF4)));
}
static inline __m128i continuationLengths(__m128i high_nibbles) {
return _mm_shuffle_epi8(
_mm_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII)
0, 0, 0, 0, // 10xx (continuation)
2, 2, // 110x
3, // 1110
4), // 1111, next should be 0 (not checked here)
high_nibbles);
}
static inline __m128i carryContinuations(__m128i initial_lengths,
__m128i previous_carries) {
__m128i right1 =
_mm_subs_epu8(_mm_alignr_epi8(initial_lengths, previous_carries, 16 - 1),
_mm_set1_epi8(1));
__m128i sum = _mm_add_epi8(initial_lengths, right1);
__m128i right2 = _mm_subs_epu8(_mm_alignr_epi8(sum, previous_carries, 16 - 2),
_mm_set1_epi8(2));
return _mm_add_epi8(sum, right2);
}
static inline void checkContinuations(__m128i initial_lengths, __m128i carries,
__m128i *has_error) {
// overlap || underlap
// carry > length && length > 0 || !(carry > length) && !(length > 0)
// (carries > length) == (lengths > 0)
__m128i overunder =
_mm_cmpeq_epi8(_mm_cmpgt_epi8(carries, initial_lengths),
_mm_cmpgt_epi8(initial_lengths, _mm_setzero_si128()));
*has_error = _mm_or_si128(*has_error, overunder);
}
// when 0xED is found, next byte must be no larger than 0x9F
// when 0xF4 is found, next byte must be no larger than 0x8F
// next byte must be continuation, ie sign bit is set, so signed < is ok
static inline void checkFirstContinuationMax(__m128i current_bytes,
__m128i off1_current_bytes,
__m128i *has_error) {
__m128i maskED = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xED));
__m128i maskF4 = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xF4));
__m128i badfollowED =
_mm_and_si128(_mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x9F)), maskED);
__m128i badfollowF4 =
_mm_and_si128(_mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x8F)), maskF4);
*has_error = _mm_or_si128(*has_error, _mm_or_si128(badfollowED, badfollowF4));
}
// map off1_hibits => error condition
// hibits off1 cur
// C => < C2 && true
// E => < E1 && < A0
// F => < F1 && < 90
// else false && false
static inline void checkOverlong(__m128i current_bytes,
__m128i off1_current_bytes, __m128i hibits,
__m128i previous_hibits, __m128i *has_error) {
__m128i off1_hibits = _mm_alignr_epi8(hibits, previous_hibits, 16 - 1);
__m128i initial_mins = _mm_shuffle_epi8(
_mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-128, -128, // 10xx => false
0xC2, -128, // 110x
0xE1, // 1110
0xF1),
off1_hibits);
__m128i initial_under = _mm_cmpgt_epi8(initial_mins, off1_current_bytes);
__m128i second_mins = _mm_shuffle_epi8(
_mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-128, -128, // 10xx => false
127, 127, // 110x => true
0xA0, // 1110
0x90),
off1_hibits);
__m128i second_under = _mm_cmpgt_epi8(second_mins, current_bytes);
*has_error =
_mm_or_si128(*has_error, _mm_and_si128(initial_under, second_under));
}
struct processed_utf_bytes {
__m128i rawbytes;
__m128i high_nibbles;
__m128i carried_continuations;
};
static inline void count_nibbles(__m128i bytes,
struct processed_utf_bytes *answer) {
answer->rawbytes = bytes;
answer->high_nibbles =
_mm_and_si128(_mm_srli_epi16(bytes, 4), _mm_set1_epi8(0x0F));
}
// check whether the current bytes are valid UTF-8
// at the end of the function, previous gets updated
static struct processed_utf_bytes
checkUTF8Bytes(__m128i current_bytes, struct processed_utf_bytes *previous,
__m128i *has_error) {
struct processed_utf_bytes pb;
count_nibbles(current_bytes, &pb);
checkSmallerThan0xF4(current_bytes, has_error);
__m128i initial_lengths = continuationLengths(pb.high_nibbles);
pb.carried_continuations =
carryContinuations(initial_lengths, previous->carried_continuations);
checkContinuations(initial_lengths, pb.carried_continuations, has_error);
__m128i off1_current_bytes =
_mm_alignr_epi8(pb.rawbytes, previous->rawbytes, 16 - 1);
checkFirstContinuationMax(current_bytes, off1_current_bytes, has_error);
checkOverlong(current_bytes, off1_current_bytes, pb.high_nibbles,
previous->high_nibbles, has_error);
return pb;
}
#ifdef __AVX2__
/*****************************/
static inline __m256i push_last_byte_of_a_to_b(__m256i a, __m256i b) {
@ -190,8 +322,6 @@ avxcheckUTF8Bytes(__m256i current_bytes,
return pb;
}
#else // __AVX2__
#warning "We require AVX2 support!"
#endif // __AVX2__
}
#endif

View File

@ -6,22 +6,21 @@
#include "simdjson/parsedjson.h"
#include "simdjson/portability.h"
#ifdef __AVX2__
#if defined (__AVX2__) || defined (__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
#ifndef SIMDJSON_SKIPUTF8VALIDATION
#define SIMDJSON_UTF8VALIDATE
#endif
#else
// currently we don't UTF8 validate for ARM
// also we assume that if you're not __AVX2__
// you're ARM, which is a bit dumb. TODO: Fix...
#ifdef __ARM_NEON
#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
#include <arm_neon.h>
#else
#warning It appears that neither ARM NEON nor AVX2 are detected.
#endif // __ARM_NEON
#endif // __AVX2__
#endif // (__AVX2__) || (__SSE4_2__)
// It seems that many parsers do UTF-8 validation.
// RapidJSON does not do it by default, but a flag
@ -35,6 +34,7 @@
namespace simdjson {
template<instruction_set>
struct simd_input;
#ifdef __AVX2__
template<>
struct simd_input<instruction_set::avx2>
@ -44,7 +44,18 @@ struct simd_input<instruction_set::avx2>
};
#endif
#ifdef __ARM_NEON
#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
template<>
struct simd_input<instruction_set::sse4_2>
{
__m128i v0;
__m128i v1;
__m128i v2;
__m128i v3;
};
#endif
#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
template<> struct simd_input<instruction_set::neon>
{
#ifndef TRANSPOSE
@ -58,7 +69,7 @@ template<> struct simd_input<instruction_set::neon>
};
#endif
#ifdef __ARM_NEON
#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
really_inline
uint16_t neonmovemask(uint8x16_t input) {
const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
@ -114,16 +125,9 @@ uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16
template<instruction_set T>
uint64_t compute_quote_mask(uint64_t quote_bits);
// In practice, if you have NEON or __PCLMUL__, you would
// always want to use them, but it might be useful, for research
// purposes, to disable it willingly, that's what SIMDJSON_AVOID_CLMUL
// does.
// Also: we don't know of an instance where AVX2 is supported but
// where clmul is not supported, so check for both, to be sure.
#ifdef SIMDJSON_AVOID_CLMUL
template<instruction_set T> really_inline
uint64_t compute_quote_mask(uint64_t quote_bits)
{
namespace {
// for when clmul is unavailable
[[maybe_unused]] uint64_t portable_compute_quote_mask(uint64_t quote_bits) {
uint64_t quote_mask = quote_bits ^ (quote_bits << 1);
quote_mask = quote_mask ^ (quote_mask << 2);
quote_mask = quote_mask ^ (quote_mask << 4);
@ -132,6 +136,19 @@ uint64_t compute_quote_mask(uint64_t quote_bits)
quote_mask = quote_mask ^ (quote_mask << 32);
return quote_mask;
}
}
// In practice, if you have NEON or __PCLMUL__, you would
// always want to use them, but it might be useful, for research
// purposes, to disable it willingly, that's what SIMDJSON_AVOID_CLMUL
// does.
// Also: we don't know of an instance where AVX2 is supported but
// where clmul is not supported, so check for both, to be sure.
#ifdef SIMDJSON_AVOID_CLMUL
template<instruction_set T> really_inline
uint64_t compute_quote_mask(uint64_t quote_bits) {
return portable_compute_quote_mask(quote_bits);
}
#else
template<instruction_set>
uint64_t compute_quote_mask(uint64_t quote_bits);
@ -139,49 +156,147 @@ uint64_t compute_quote_mask(uint64_t quote_bits);
#ifdef __AVX2__
template<> really_inline
uint64_t compute_quote_mask<instruction_set::avx2>(uint64_t quote_bits) {
// There should be no such thing with a processing supporting avx2
// but not clmul.
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
return quote_mask;
}
#endif
#ifdef __ARM_NEON
#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
template<> really_inline
uint64_t compute_quote_mask<instruction_set::neon>(uint64_t quote_bits) {
#ifdef __PCLMUL__ // Might cause problems on runtime dispatch
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
_mm_set_epi64x(0ULL, quote_bits),
_mm_set1_epi8(0xFF), 0));
uint64_t compute_quote_mask<instruction_set::sse4_2>(uint64_t quote_bits) {
// CLMUL is supported on some SSE42 hardware such as Sandy Bridge,
// but not on others.
#ifdef __PCLMUL__
return _mm_cvtsi128_si64(_mm_clmulepi64_si128(
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
#else
uint64_t quote_mask = vmull_p64( -1ULL, quote_bits);
return portable_compute_quote_mask(quote_bits);
#endif
return quote_mask;
}
#endif
#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
template<> really_inline
uint64_t compute_quote_mask<instruction_set::neon>(uint64_t quote_bits) {
#ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension
return vmull_p64( -1ULL, quote_bits);
#else
return portable_compute_quote_mask(quote_bits);
#endif
}
#endif
#endif // SIMDJSON_AVOID_CLMUL
#ifdef SIMDJSON_UTF8VALIDATE
template<instruction_set T>really_inline
void check_utf8(simd_input<T> in,
__m256i &has_error,
struct avx_processed_utf_bytes &previous) {
// Holds the state required to perform check_utf8().
template<instruction_set>
struct utf8_checking_state;
#ifdef __AVX2__
template<>
struct utf8_checking_state<instruction_set::avx2>
{
__m256i has_error = _mm256_setzero_si256();
avx_processed_utf_bytes previous {
_mm256_setzero_si256(), // rawbytes
_mm256_setzero_si256(), // high_nibbles
_mm256_setzero_si256() // carried_continuations
};
};
#endif
#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
template<>
struct utf8_checking_state<instruction_set::sse4_2>
{
__m128i has_error = _mm_setzero_si128();
processed_utf_bytes previous {
_mm_setzero_si128(), // rawbytes
_mm_setzero_si128(), // high_nibbles
_mm_setzero_si128() // carried_continuations
};
};
#endif
template<instruction_set T>
void check_utf8(simd_input<T> in, utf8_checking_state<T>& state);
#ifdef __AVX2__
template<> really_inline
void check_utf8<instruction_set::avx2>(simd_input<instruction_set::avx2> in,
utf8_checking_state<instruction_set::avx2>& state) {
__m256i highbit = _mm256_set1_epi8(0x80);
if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), highbit)) == 1) {
// it is ascii, we just check continuation
has_error = _mm256_or_si256(
state.has_error = _mm256_or_si256(
_mm256_cmpgt_epi8(
previous.carried_continuations,
state.previous.carried_continuations,
_mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)),
has_error);
state.has_error);
} else {
// it is not ascii so we have to do heavy work
previous = avxcheckUTF8Bytes(in.lo, &previous, &has_error);
previous = avxcheckUTF8Bytes(in.hi, &previous, &has_error);
state.previous = avxcheckUTF8Bytes(in.lo, &(state.previous), &(state.has_error));
state.previous = avxcheckUTF8Bytes(in.hi, &(state.previous), &(state.has_error));
}
}
#endif //__AVX2__
#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
template<> really_inline
void check_utf8<instruction_set::sse4_2>(simd_input<instruction_set::sse4_2> in,
utf8_checking_state<instruction_set::sse4_2>& state) {
__m128i highbit = _mm_set1_epi8(0x80);
if ((_mm_testz_si128(_mm_or_si128(in.v0, in.v1), highbit)) == 1) {
// it is ascii, we just check continuation
state.has_error = _mm_or_si128(
_mm_cmpgt_epi8(
state.previous.carried_continuations,
_mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)),
state.has_error);
} else {
// it is not ascii so we have to do heavy work
state.previous = checkUTF8Bytes(in.v0, &(state.previous), &(state.has_error));
state.previous = checkUTF8Bytes(in.v1, &(state.previous), &(state.has_error));
}
if ((_mm_testz_si128(_mm_or_si128(in.v2, in.v3), highbit)) == 1) {
// it is ascii, we just check continuation
state.has_error = _mm_or_si128(
_mm_cmpgt_epi8(
state.previous.carried_continuations,
_mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)),
state.has_error);
} else {
// it is not ascii so we have to do heavy work
state.previous = checkUTF8Bytes(in.v2, &(state.previous), &(state.has_error));
state.previous = checkUTF8Bytes(in.v3, &(state.previous), &(state.has_error));
}
}
#endif // __SSE4_2
// Checks if the utf8 validation has found any error.
template<instruction_set T>
errorValues check_utf8_errors(utf8_checking_state<T>& state);
#ifdef __AVX2__
template<> really_inline
errorValues check_utf8_errors<instruction_set::avx2>(utf8_checking_state<instruction_set::avx2>& state) {
return _mm256_testz_si256(state.has_error, state.has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
}
#endif
#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
template<> really_inline
errorValues check_utf8_errors<instruction_set::sse4_2>(utf8_checking_state<instruction_set::sse4_2>& state) {
return _mm_testz_si128(state.has_error, state.has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
}
#endif
#endif // SIMDJSON_UTF8VALIDATE
template<instruction_set T>
simd_input<T> fill_input(const uint8_t * ptr);
@ -195,7 +310,19 @@ simd_input<instruction_set::avx2> fill_input<instruction_set::avx2>(const uint8_
}
#endif
#ifdef __ARM_NEON
#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
template<> really_inline
simd_input<instruction_set::sse4_2> fill_input<instruction_set::sse4_2>(const uint8_t * ptr) {
struct simd_input<instruction_set::sse4_2> in;
in.v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 0));
in.v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16));
in.v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 32));
in.v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 48));
return in;
}
#endif
#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
template<> really_inline
simd_input<instruction_set::neon> fill_input<instruction_set::neon>(const uint8_t * ptr) {
struct simd_input<instruction_set::neon> in;
@ -219,7 +346,6 @@ uint64_t cmp_mask_against_input(simd_input<T> in, uint8_t m);
#ifdef __AVX2__
template<> really_inline
uint64_t cmp_mask_against_input<instruction_set::avx2>(simd_input<instruction_set::avx2> in, uint8_t m) {
const __m256i mask = _mm256_set1_epi8(m);
__m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask);
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
@ -229,7 +355,23 @@ uint64_t cmp_mask_against_input<instruction_set::avx2>(simd_input<instruction_se
}
#endif
#ifdef __ARM_NEON
#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
template<> really_inline
uint64_t cmp_mask_against_input<instruction_set::sse4_2>(simd_input<instruction_set::sse4_2> in, uint8_t m) {
const __m128i mask = _mm_set1_epi8(m);
__m128i cmp_res_0 = _mm_cmpeq_epi8(in.v0, mask);
uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
__m128i cmp_res_1 = _mm_cmpeq_epi8(in.v1, mask);
uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
__m128i cmp_res_2 = _mm_cmpeq_epi8(in.v2, mask);
uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
__m128i cmp_res_3 = _mm_cmpeq_epi8(in.v3, mask);
uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
}
#endif
#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
template<> really_inline
uint64_t cmp_mask_against_input<instruction_set::neon>(simd_input<instruction_set::neon> in, uint8_t m) {
const uint8x16_t mask = vmovq_n_u8(m);
@ -257,7 +399,23 @@ uint64_t unsigned_lteq_against_input<instruction_set::avx2>(simd_input<instructi
}
#endif
#ifdef __ARM_NEON
#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
template<> really_inline
uint64_t unsigned_lteq_against_input<instruction_set::sse4_2>(simd_input<instruction_set::sse4_2> in, uint8_t m) {
const __m128i maxval = _mm_set1_epi8(m);
__m128i cmp_res_0 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v0),maxval);
uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
__m128i cmp_res_1 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v1),maxval);
uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
__m128i cmp_res_2 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v2),maxval);
uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
__m128i cmp_res_3 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v3),maxval);
uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
}
#endif
#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
template<> really_inline
uint64_t unsigned_lteq_against_input<instruction_set::neon>(simd_input<instruction_set::neon> in, uint8_t m) {
const uint8x16_t mask = vmovq_n_u8(m);
@ -447,9 +605,80 @@ void find_whitespace_and_structurals<instruction_set::avx2>(simd_input<instructi
whitespace = ~(ws_res_0 | (ws_res_1 << 32));
#endif // SIMDJSON_NAIVE_STRUCTURAL
}
#endif
#endif // __AVX2__
#ifdef __ARM_NEON
#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
template<> really_inline
void find_whitespace_and_structurals<instruction_set::sse4_2>(simd_input<instruction_set::sse4_2> in,
uint64_t &whitespace,
uint64_t &structurals) {
const __m128i low_nibble_mask = _mm_setr_epi8(
16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
const __m128i high_nibble_mask = _mm_setr_epi8(
8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);
__m128i structural_shufti_mask = _mm_set1_epi8(0x7);
__m128i whitespace_shufti_mask = _mm_set1_epi8(0x18);
__m128i v_0 = _mm_and_si128(
_mm_shuffle_epi8(low_nibble_mask, in.v0),
_mm_shuffle_epi8(high_nibble_mask,
_mm_and_si128(_mm_srli_epi32(in.v0, 4),
_mm_set1_epi8(0x7f))));
__m128i v_1 = _mm_and_si128(
_mm_shuffle_epi8(low_nibble_mask, in.v1),
_mm_shuffle_epi8(high_nibble_mask,
_mm_and_si128(_mm_srli_epi32(in.v1, 4),
_mm_set1_epi8(0x7f))));
__m128i v_2 = _mm_and_si128(
_mm_shuffle_epi8(low_nibble_mask, in.v2),
_mm_shuffle_epi8(high_nibble_mask,
_mm_and_si128(_mm_srli_epi32(in.v2, 4),
_mm_set1_epi8(0x7f))));
__m128i v_3 = _mm_and_si128(
_mm_shuffle_epi8(low_nibble_mask, in.v3),
_mm_shuffle_epi8(high_nibble_mask,
_mm_and_si128(_mm_srli_epi32(in.v3, 4),
_mm_set1_epi8(0x7f))));
__m128i tmp_v0 = _mm_cmpeq_epi8(
_mm_and_si128(v_0, structural_shufti_mask), _mm_set1_epi8(0));
__m128i tmp_v1 = _mm_cmpeq_epi8(
_mm_and_si128(v_1, structural_shufti_mask), _mm_set1_epi8(0));
__m128i tmp_v2 = _mm_cmpeq_epi8(
_mm_and_si128(v_2, structural_shufti_mask), _mm_set1_epi8(0));
__m128i tmp_v3 = _mm_cmpeq_epi8(
_mm_and_si128(v_3, structural_shufti_mask), _mm_set1_epi8(0));
uint64_t structural_res_0 = _mm_movemask_epi8(tmp_v0);
uint64_t structural_res_1 = _mm_movemask_epi8(tmp_v1);
uint64_t structural_res_2 = _mm_movemask_epi8(tmp_v2);
uint64_t structural_res_3 = _mm_movemask_epi8(tmp_v3);
structurals = ~(structural_res_0 | (structural_res_1 << 16) | (structural_res_2 << 32) | (structural_res_3 << 48));
__m128i tmp_ws_v0 = _mm_cmpeq_epi8(
_mm_and_si128(v_0, whitespace_shufti_mask), _mm_set1_epi8(0));
__m128i tmp_ws_v1 = _mm_cmpeq_epi8(
_mm_and_si128(v_1, whitespace_shufti_mask), _mm_set1_epi8(0));
__m128i tmp_ws_v2 = _mm_cmpeq_epi8(
_mm_and_si128(v_2, whitespace_shufti_mask), _mm_set1_epi8(0));
__m128i tmp_ws_v3 = _mm_cmpeq_epi8(
_mm_and_si128(v_3, whitespace_shufti_mask), _mm_set1_epi8(0));
uint64_t ws_res_0 = _mm_movemask_epi8(tmp_ws_v0);
uint64_t ws_res_1 = _mm_movemask_epi8(tmp_ws_v1);
uint64_t ws_res_2 = _mm_movemask_epi8(tmp_ws_v2);
uint64_t ws_res_3 = _mm_movemask_epi8(tmp_ws_v3);
whitespace = ~(ws_res_0 | (ws_res_1 << 16) | (ws_res_2 << 32) | (ws_res_3 << 48));
}
#endif // __SSE4_2__
#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
template<> really_inline
void find_whitespace_and_structurals<instruction_set::neon>(
simd_input<instruction_set::neon> in,
@ -569,9 +798,9 @@ void find_whitespace_and_structurals<instruction_set::neon>(
structurals = neonmovemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
whitespace = neonmovemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
#endif
#endif // FUNKY_BAD_TABLE
}
#endif
#endif // __ARM_NEON
#ifdef SIMDJSON_NAIVE_FLATTEN // useful for benchmarking
@ -657,7 +886,7 @@ really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base,
}
base = next_base;
}
#endif
#endif // SIMDJSON_NAIVE_FLATTEN
// return a updated structural bit vector with quoted contents cleared out and
// pseudo-structural characters added to the mask
@ -711,11 +940,7 @@ WARN_UNUSED
uint32_t *base_ptr = pj.structural_indexes;
uint32_t base = 0;
#ifdef SIMDJSON_UTF8VALIDATE
__m256i has_error = _mm256_setzero_si256();
struct avx_processed_utf_bytes previous {};
previous.rawbytes = _mm256_setzero_si256();
previous.high_nibbles = _mm256_setzero_si256();
previous.carried_continuations = _mm256_setzero_si256();
utf8_checking_state<T> state;
#endif
// we have padded the input out to 64 byte multiple with the remainder being
@ -751,7 +976,7 @@ WARN_UNUSED
#endif
simd_input<T> in = fill_input<T>(buf+idx);
#ifdef SIMDJSON_UTF8VALIDATE
check_utf8(in, has_error, previous);
check_utf8<T>(in, state);
#endif
// detect odd sequences of backslashes
uint64_t odd_ends = find_odd_backslash_sequences<T>(
@ -786,7 +1011,7 @@ WARN_UNUSED
memcpy(tmpbuf, buf + idx, len - idx);
simd_input<T> in = fill_input<T>(tmpbuf);
#ifdef SIMDJSON_UTF8VALIDATE
check_utf8(in, has_error, previous);
check_utf8<T>(in, state);
#endif
// detect odd sequences of backslashes
@ -843,7 +1068,7 @@ WARN_UNUSED
return simdjson::UNESCAPED_CHARS;
}
#ifdef SIMDJSON_UTF8VALIDATE
return _mm256_testz_si256(has_error, has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
return check_utf8_errors<T>(state);
#else
return simdjson::SUCCESS;
#endif

View File

@ -109,6 +109,23 @@ parse_string_helper find_bs_bits_and_quote_bits<instruction_set::avx2> (const ui
}
#endif
#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
template<> really_inline
parse_string_helper find_bs_bits_and_quote_bits<instruction_set::sse4_2> (const uint8_t *src, uint8_t *dst) {
// this can read up to 31 bytes beyond the buffer size, but we require
// SIMDJSON_PADDING of padding
__m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
// store to dest unconditionally - we can overwrite the bits we don't like
// later
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst), v);
auto quote_mask = _mm_cmpeq_epi8(v, _mm_set1_epi8('"'));
return {
static_cast<uint32_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(v, _mm_set1_epi8('\\')))), // bs_bits
static_cast<uint32_t>(_mm_movemask_epi8(quote_mask)) // quote_bits
};
}
#endif
#ifdef __ARM_NEON
template<> really_inline
parse_string_helper find_bs_bits_and_quote_bits<instruction_set::neon> (const uint8_t *src, uint8_t *dst) {
@ -221,10 +238,15 @@ bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
} else {
// they are the same. Since they can't co-occur, it means we encountered
// neither.
if constexpr(T == instruction_set::sse4_2) {
src += 16;
dst += 16;
} else {
src += 32;
dst += 32;
}
}
}
// can't be reached
return true;
#endif // SIMDJSON_SKIPSTRINGPARSING

View File

@ -14,10 +14,10 @@ int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool rea
#ifdef __AVX2__
json_parse_functype* avx_implementation = &json_parse_implementation<instruction_set::avx2>;
#endif
#ifdef __SSE4_2__
// json_parse_functype* sse4_2_implementation = &json_parse_implementation<instruction_set::sse4_2>; // not implemented yet
#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
json_parse_functype* sse4_2_implementation = &json_parse_implementation<instruction_set::sse4_2>;
#endif
#ifdef __ARM_NEON
#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
json_parse_functype* neon_implementation = &json_parse_implementation<instruction_set::neon>;
#endif
@ -25,9 +25,9 @@ int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool rea
// Should be done at runtime. Does not make any sense on preprocessor.
#ifdef __AVX2__
instruction_set best_implementation = instruction_set::avx2;
#elif defined (__SSE4_2__)
#elif defined (__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
instruction_set best_implementation = instruction_set::sse4_2;
#elif defined (__ARM_NEON)
#elif defined (__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
instruction_set best_implementation = instruction_set::neon;
#else
instruction_set best_implementation = instruction_set::none;
@ -39,11 +39,13 @@ int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool rea
case instruction_set::avx2 :
json_parse_ptr = avx_implementation;
break;
#elif defined (__SSE4_2__)
/*case instruction_set::sse4_2 :
#endif
#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
case instruction_set::sse4_2 :
json_parse_ptr = sse4_2_implementation;
break;*/
#elif defined (__ARM_NEON)
break;
#endif
#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
case instruction_set::neon :
json_parse_ptr = neon_implementation;
break;

View File

@ -7,7 +7,8 @@ endif()
add_cpp_test(basictests)
add_cpp_test(jsoncheck)
add_executable(singleheader ./singleheadertest.cpp ${PROJECT_SOURCE_DIR}/singleheader/simdjson.cpp)
target_compile_definitions(singleheader PRIVATE JSON_TEST_PATH="${PROJECT_SOURCE_DIR}/jsonexamples/twitter.json")
target_link_libraries(singleheader ${SIMDJSON_LIB_NAME})
add_test(singleheader singleheader)
## This causes problems
# add_executable(singleheader ./singleheadertest.cpp ${PROJECT_SOURCE_DIR}/singleheader/simdjson.cpp)
# target_compile_definitions(singleheader PRIVATE JSON_TEST_PATH="${PROJECT_SOURCE_DIR}/jsonexamples/twitter.json")
# target_link_libraries(singleheader ${SIMDJSON_LIB_NAME})
# add_test(singleheader singleheader)

View File

@ -13,14 +13,20 @@ if(SIMDJSON_SANITIZE)
endif()
if(SIMDJSON_DISABLE_AVX)
if(NOT MSVC)
set (OPT_FLAGS "${OPT_FLAGS} -mno-avx -mno-bmi -mno-pclmul -msse4.2")
else()
set (OPT_FLAGS "${OPT_FLAGS}")
endif()
else()
# some compilers like clang do not automagically define __AVX2__ and __BMI2__ even when the hardware supports it
if(NOT MSVC)
set (OPT_FLAGS "${OPT_FLAGS} -mavx2 -mbmi -mbmi2 -mpclmul")
else()
set (OPT_FLAGS "${OPT_FLAGS} /arch:AVX2 /std:c++latest")
set (OPT_FLAGS "${OPT_FLAGS} /arch:AVX2")
endif()
endif()
if(NOT MSVC)
set(CXXSTD_FLAGS "-std=c++17 -fPIC")