Merge pull request #277 from jkeiser/separate_include
Move simd_input and utf8 check functions into their own headers
This commit is contained in:
commit
1740d93420
|
@ -36,6 +36,11 @@ $SCRIPTPATH/include/simdjson/jsoncharutils.h
|
|||
$SCRIPTPATH/include/simdjson/jsonformatutils.h
|
||||
$SCRIPTPATH/include/simdjson/jsonioutil.h
|
||||
$SCRIPTPATH/include/simdjson/simdprune_tables.h
|
||||
$SCRIPTPATH/include/simdjson/simd_input.h
|
||||
$SCRIPTPATH/include/simdjson/simd_input_haswell.h
|
||||
$SCRIPTPATH/include/simdjson/simd_input_westmere.h
|
||||
$SCRIPTPATH/include/simdjson/simd_input_arm64.h
|
||||
$SCRIPTPATH/include/simdjson/simdutf8check.h
|
||||
$SCRIPTPATH/include/simdjson/simdutf8check_haswell.h
|
||||
$SCRIPTPATH/include/simdjson/simdutf8check_westmere.h
|
||||
$SCRIPTPATH/include/simdjson/simdutf8check_arm64.h
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
#ifndef SIMDJSON_SIMD_INPUT_H
|
||||
#define SIMDJSON_SIMD_INPUT_H
|
||||
|
||||
#include "simdjson/common_defs.h"
|
||||
#include "simdjson/portability.h"
|
||||
#include "simdjson/simdjson.h"
|
||||
#include <cassert>
|
||||
|
||||
namespace simdjson {
|
||||
|
||||
template <Architecture> struct simd_input;
|
||||
|
||||
template <Architecture T>
|
||||
simd_input<T> fill_input(const uint8_t *ptr);
|
||||
|
||||
// a straightforward comparison of a mask against input.
|
||||
template <Architecture T>
|
||||
uint64_t cmp_mask_against_input(simd_input<T> in, uint8_t m);
|
||||
|
||||
// find all values less than or equal than the content of maxval (using unsigned
|
||||
// arithmetic)
|
||||
template <Architecture T>
|
||||
uint64_t unsigned_lteq_against_input(simd_input<T> in, uint8_t m);
|
||||
|
||||
} // namespace simdjson
|
||||
|
||||
#endif
|
|
@ -0,0 +1,78 @@
|
|||
#ifndef SIMDJSON_SIMD_INPUT_ARM64_H
|
||||
#define SIMDJSON_SIMD_INPUT_ARM64_H
|
||||
|
||||
#include "simdjson/simd_input.h"
|
||||
|
||||
#ifdef IS_ARM64
|
||||
namespace simdjson {
|
||||
|
||||
template <>
|
||||
struct simd_input<Architecture::ARM64> {
|
||||
uint8x16_t i0;
|
||||
uint8x16_t i1;
|
||||
uint8x16_t i2;
|
||||
uint8x16_t i3;
|
||||
};
|
||||
|
||||
template <>
|
||||
really_inline simd_input<Architecture::ARM64>
|
||||
fill_input<Architecture::ARM64>(const uint8_t *ptr) {
|
||||
struct simd_input<Architecture::ARM64> in;
|
||||
in.i0 = vld1q_u8(ptr + 0);
|
||||
in.i1 = vld1q_u8(ptr + 16);
|
||||
in.i2 = vld1q_u8(ptr + 32);
|
||||
in.i3 = vld1q_u8(ptr + 48);
|
||||
return in;
|
||||
}
|
||||
|
||||
really_inline uint16_t neon_movemask(uint8x16_t input) {
|
||||
const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
||||
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
|
||||
uint8x16_t minput = vandq_u8(input, bit_mask);
|
||||
uint8x16_t tmp = vpaddq_u8(minput, minput);
|
||||
tmp = vpaddq_u8(tmp, tmp);
|
||||
tmp = vpaddq_u8(tmp, tmp);
|
||||
return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
|
||||
}
|
||||
|
||||
really_inline uint64_t neon_movemask_bulk(uint8x16_t p0, uint8x16_t p1,
|
||||
uint8x16_t p2, uint8x16_t p3) {
|
||||
const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
||||
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
|
||||
uint8x16_t t0 = vandq_u8(p0, bit_mask);
|
||||
uint8x16_t t1 = vandq_u8(p1, bit_mask);
|
||||
uint8x16_t t2 = vandq_u8(p2, bit_mask);
|
||||
uint8x16_t t3 = vandq_u8(p3, bit_mask);
|
||||
uint8x16_t sum0 = vpaddq_u8(t0, t1);
|
||||
uint8x16_t sum1 = vpaddq_u8(t2, t3);
|
||||
sum0 = vpaddq_u8(sum0, sum1);
|
||||
sum0 = vpaddq_u8(sum0, sum0);
|
||||
return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline uint64_t cmp_mask_against_input<Architecture::ARM64>(
|
||||
simd_input<Architecture::ARM64> in, uint8_t m) {
|
||||
const uint8x16_t mask = vmovq_n_u8(m);
|
||||
uint8x16_t cmp_res_0 = vceqq_u8(in.i0, mask);
|
||||
uint8x16_t cmp_res_1 = vceqq_u8(in.i1, mask);
|
||||
uint8x16_t cmp_res_2 = vceqq_u8(in.i2, mask);
|
||||
uint8x16_t cmp_res_3 = vceqq_u8(in.i3, mask);
|
||||
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline uint64_t unsigned_lteq_against_input<Architecture::ARM64>(
|
||||
simd_input<Architecture::ARM64> in, uint8_t m) {
|
||||
const uint8x16_t mask = vmovq_n_u8(m);
|
||||
uint8x16_t cmp_res_0 = vcleq_u8(in.i0, mask);
|
||||
uint8x16_t cmp_res_1 = vcleq_u8(in.i1, mask);
|
||||
uint8x16_t cmp_res_2 = vcleq_u8(in.i2, mask);
|
||||
uint8x16_t cmp_res_3 = vcleq_u8(in.i3, mask);
|
||||
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
|
||||
}
|
||||
|
||||
} // namespace simdjson
|
||||
|
||||
#endif // IS_ARM64
|
||||
#endif // SIMDJSON_SIMD_INPUT_ARM64_H
|
|
@ -0,0 +1,52 @@
|
|||
#ifndef SIMDJSON_SIMD_INPUT_HASWELL_H
|
||||
#define SIMDJSON_SIMD_INPUT_HASWELL_H
|
||||
|
||||
#include "simdjson/simd_input.h"
|
||||
|
||||
#ifdef IS_X86_64
|
||||
|
||||
TARGET_HASWELL
|
||||
namespace simdjson {
|
||||
|
||||
template <>
|
||||
struct simd_input<Architecture::HASWELL> {
|
||||
__m256i lo;
|
||||
__m256i hi;
|
||||
};
|
||||
|
||||
template <>
|
||||
really_inline simd_input<Architecture::HASWELL>
|
||||
fill_input<Architecture::HASWELL>(const uint8_t *ptr) {
|
||||
struct simd_input<Architecture::HASWELL> in;
|
||||
in.lo = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0));
|
||||
in.hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
|
||||
return in;
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline uint64_t cmp_mask_against_input<Architecture::HASWELL>(
|
||||
simd_input<Architecture::HASWELL> in, uint8_t m) {
|
||||
const __m256i mask = _mm256_set1_epi8(m);
|
||||
__m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask);
|
||||
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
|
||||
__m256i cmp_res_1 = _mm256_cmpeq_epi8(in.hi, mask);
|
||||
uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
|
||||
return res_0 | (res_1 << 32);
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline uint64_t unsigned_lteq_against_input<Architecture::HASWELL>(
|
||||
simd_input<Architecture::HASWELL> in, uint8_t m) {
|
||||
const __m256i maxval = _mm256_set1_epi8(m);
|
||||
__m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.lo), maxval);
|
||||
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
|
||||
__m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.hi), maxval);
|
||||
uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
|
||||
return res_0 | (res_1 << 32);
|
||||
}
|
||||
|
||||
} // namespace simdjson
|
||||
UNTARGET_REGION
|
||||
|
||||
#endif // IS_X86_64
|
||||
#endif // SIMDJSON_SIMD_INPUT_HASWELL_H
|
|
@ -0,0 +1,64 @@
|
|||
#ifndef SIMDJSON_SIMD_INPUT_WESTMERE_H
|
||||
#define SIMDJSON_SIMD_INPUT_WESTMERE_H
|
||||
|
||||
#include "simdjson/simd_input.h"
|
||||
|
||||
#ifdef IS_X86_64
|
||||
|
||||
TARGET_WESTMERE
|
||||
namespace simdjson {
|
||||
|
||||
template <>
|
||||
struct simd_input<Architecture::WESTMERE> {
|
||||
__m128i v0;
|
||||
__m128i v1;
|
||||
__m128i v2;
|
||||
__m128i v3;
|
||||
};
|
||||
|
||||
template <>
|
||||
really_inline simd_input<Architecture::WESTMERE>
|
||||
fill_input<Architecture::WESTMERE>(const uint8_t *ptr) {
|
||||
struct simd_input<Architecture::WESTMERE> in;
|
||||
in.v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 0));
|
||||
in.v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16));
|
||||
in.v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 32));
|
||||
in.v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 48));
|
||||
return in;
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline uint64_t cmp_mask_against_input<Architecture::WESTMERE>(
|
||||
simd_input<Architecture::WESTMERE> in, uint8_t m) {
|
||||
const __m128i mask = _mm_set1_epi8(m);
|
||||
__m128i cmp_res_0 = _mm_cmpeq_epi8(in.v0, mask);
|
||||
uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
|
||||
__m128i cmp_res_1 = _mm_cmpeq_epi8(in.v1, mask);
|
||||
uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
|
||||
__m128i cmp_res_2 = _mm_cmpeq_epi8(in.v2, mask);
|
||||
uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
|
||||
__m128i cmp_res_3 = _mm_cmpeq_epi8(in.v3, mask);
|
||||
uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
|
||||
return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline uint64_t unsigned_lteq_against_input<Architecture::WESTMERE>(
|
||||
simd_input<Architecture::WESTMERE> in, uint8_t m) {
|
||||
const __m128i maxval = _mm_set1_epi8(m);
|
||||
__m128i cmp_res_0 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v0), maxval);
|
||||
uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
|
||||
__m128i cmp_res_1 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v1), maxval);
|
||||
uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
|
||||
__m128i cmp_res_2 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v2), maxval);
|
||||
uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
|
||||
__m128i cmp_res_3 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v3), maxval);
|
||||
uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
|
||||
return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
|
||||
}
|
||||
|
||||
} // namespace simdjson
|
||||
UNTARGET_REGION
|
||||
|
||||
#endif // IS_X86_64
|
||||
#endif // SIMDJSON_SIMD_INPUT_WESTMERE_H
|
|
@ -1,5 +1,5 @@
|
|||
#ifndef SIMDJSON_ERR_H
|
||||
#define SIMDJSON_ERR_H
|
||||
#ifndef SIMDJSON_SIMDJSON_H
|
||||
#define SIMDJSON_SIMDJSON_H
|
||||
|
||||
#include <string>
|
||||
|
||||
|
@ -41,4 +41,4 @@ enum ErrorValues {
|
|||
};
|
||||
const std::string &error_message(const int);
|
||||
} // namespace simdjson
|
||||
#endif
|
||||
#endif // SIMDJSON_SIMDJSON_H
|
||||
|
|
|
@ -0,0 +1,21 @@
|
|||
#ifndef SIMDJSON_SIMDUTF8CHECK_H
|
||||
#define SIMDJSON_SIMDUTF8CHECK_H
|
||||
|
||||
#include "simdjson/simdjson.h"
|
||||
#include "simdjson/simd_input.h"
|
||||
|
||||
namespace simdjson {
|
||||
|
||||
// Holds the state required to perform check_utf8().
|
||||
template <Architecture> struct utf8_checking_state;
|
||||
|
||||
template <Architecture T>
|
||||
void check_utf8(simd_input<T> in, utf8_checking_state<T> &state);
|
||||
|
||||
// Checks if the utf8 validation has found any error.
|
||||
template <Architecture T>
|
||||
ErrorValues check_utf8_errors(utf8_checking_state<T> &state);
|
||||
|
||||
} // namespace simdjson
|
||||
|
||||
#endif // SIMDJSON_SIMDUTF8CHECK_H
|
|
@ -7,6 +7,7 @@
|
|||
#if defined(_ARM_NEON) || defined(__aarch64__) || \
|
||||
(defined(_MSC_VER) && defined(_M_ARM64))
|
||||
|
||||
#include "simdjson/simdutf8check.h"
|
||||
#include <arm_neon.h>
|
||||
#include <cinttypes>
|
||||
#include <cstddef>
|
||||
|
@ -175,6 +176,64 @@ check_utf8_bytes(int8x16_t current_bytes, struct processed_utf_bytes *previous,
|
|||
previous->high_nibbles, has_error);
|
||||
return pb;
|
||||
}
|
||||
|
||||
template <>
|
||||
struct utf8_checking_state<Architecture::ARM64> {
|
||||
int8x16_t has_error{};
|
||||
processed_utf_bytes previous{};
|
||||
};
|
||||
|
||||
// Checks that all bytes are ascii
|
||||
really_inline bool check_ascii_neon(simd_input<Architecture::ARM64> in) {
|
||||
// checking if the most significant bit is always equal to 0.
|
||||
uint8x16_t high_bit = vdupq_n_u8(0x80);
|
||||
uint8x16_t t0 = vorrq_u8(in.i0, in.i1);
|
||||
uint8x16_t t1 = vorrq_u8(in.i2, in.i3);
|
||||
uint8x16_t t3 = vorrq_u8(t0, t1);
|
||||
uint8x16_t t4 = vandq_u8(t3, high_bit);
|
||||
uint64x2_t v64 = vreinterpretq_u64_u8(t4);
|
||||
uint32x2_t v32 = vqmovn_u64(v64);
|
||||
uint64x1_t result = vreinterpret_u64_u32(v32);
|
||||
return vget_lane_u64(result, 0) == 0;
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline void check_utf8<Architecture::ARM64>(
|
||||
simd_input<Architecture::ARM64> in,
|
||||
utf8_checking_state<Architecture::ARM64> &state) {
|
||||
if (check_ascii_neon(in)) {
|
||||
// All bytes are ascii. Therefore the byte that was just before must be
|
||||
// ascii too. We only check the byte that was just before simd_input. Nines
|
||||
// are arbitrary values.
|
||||
const int8x16_t verror =
|
||||
(int8x16_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1};
|
||||
state.has_error =
|
||||
vorrq_s8(vreinterpretq_s8_u8(
|
||||
vcgtq_s8(state.previous.carried_continuations, verror)),
|
||||
state.has_error);
|
||||
} else {
|
||||
// it is not ascii so we have to do heavy work
|
||||
state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i0),
|
||||
&(state.previous), &(state.has_error));
|
||||
state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i1),
|
||||
&(state.previous), &(state.has_error));
|
||||
state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i2),
|
||||
&(state.previous), &(state.has_error));
|
||||
state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i3),
|
||||
&(state.previous), &(state.has_error));
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline ErrorValues check_utf8_errors<Architecture::ARM64>(
|
||||
utf8_checking_state<Architecture::ARM64> &state) {
|
||||
uint64x2_t v64 = vreinterpretq_u64_s8(state.has_error);
|
||||
uint32x2_t v32 = vqmovn_u64(v64);
|
||||
uint64x1_t result = vreinterpret_u64_u32(v32);
|
||||
return vget_lane_u64(result, 0) != 0 ? simdjson::UTF8_ERROR
|
||||
: simdjson::SUCCESS;
|
||||
}
|
||||
|
||||
} // namespace simdjson
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
#define SIMDJSON_SIMDUTF8CHECK_HASWELL_H
|
||||
|
||||
#include "simdjson/portability.h"
|
||||
#include "simdjson/simdutf8check.h"
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
@ -190,6 +191,48 @@ avx_check_utf8_bytes(__m256i current_bytes,
|
|||
previous->high_nibbles, has_error);
|
||||
return pb;
|
||||
}
|
||||
|
||||
template <> struct utf8_checking_state<Architecture::HASWELL> {
|
||||
__m256i has_error;
|
||||
avx_processed_utf_bytes previous;
|
||||
utf8_checking_state() {
|
||||
has_error = _mm256_setzero_si256();
|
||||
previous.raw_bytes = _mm256_setzero_si256();
|
||||
previous.high_nibbles = _mm256_setzero_si256();
|
||||
previous.carried_continuations = _mm256_setzero_si256();
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
really_inline void check_utf8<Architecture::HASWELL>(
|
||||
simd_input<Architecture::HASWELL> in,
|
||||
utf8_checking_state<Architecture::HASWELL> &state) {
|
||||
__m256i high_bit = _mm256_set1_epi8(0x80u);
|
||||
if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), high_bit)) == 1) {
|
||||
// it is ascii, we just check continuation
|
||||
state.has_error = _mm256_or_si256(
|
||||
_mm256_cmpgt_epi8(state.previous.carried_continuations,
|
||||
_mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 9, 9, 1)),
|
||||
state.has_error);
|
||||
} else {
|
||||
// it is not ascii so we have to do heavy work
|
||||
state.previous =
|
||||
avx_check_utf8_bytes(in.lo, &(state.previous), &(state.has_error));
|
||||
state.previous =
|
||||
avx_check_utf8_bytes(in.hi, &(state.previous), &(state.has_error));
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline ErrorValues check_utf8_errors<Architecture::HASWELL>(
|
||||
utf8_checking_state<Architecture::HASWELL> &state) {
|
||||
return _mm256_testz_si256(state.has_error, state.has_error) == 0
|
||||
? simdjson::UTF8_ERROR
|
||||
: simdjson::SUCCESS;
|
||||
}
|
||||
|
||||
} // namespace simdjson
|
||||
UNTARGET_REGION // haswell
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
#define SIMDJSON_SIMDUTF8CHECK_WESTMERE_H
|
||||
|
||||
#include "simdjson/portability.h"
|
||||
#include "simdjson/simdutf8check.h"
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
@ -161,6 +162,61 @@ check_utf8_bytes(__m128i current_bytes, struct processed_utf_bytes *previous,
|
|||
previous->high_nibbles, has_error);
|
||||
return pb;
|
||||
}
|
||||
|
||||
template <>
|
||||
struct utf8_checking_state<Architecture::WESTMERE> {
|
||||
__m128i has_error = _mm_setzero_si128();
|
||||
processed_utf_bytes previous{
|
||||
_mm_setzero_si128(), // raw_bytes
|
||||
_mm_setzero_si128(), // high_nibbles
|
||||
_mm_setzero_si128() // carried_continuations
|
||||
};
|
||||
};
|
||||
|
||||
template <>
|
||||
really_inline void check_utf8<Architecture::WESTMERE>(
|
||||
simd_input<Architecture::WESTMERE> in,
|
||||
utf8_checking_state<Architecture::WESTMERE> &state) {
|
||||
__m128i high_bit = _mm_set1_epi8(0x80u);
|
||||
if ((_mm_testz_si128(_mm_or_si128(in.v0, in.v1), high_bit)) == 1) {
|
||||
// it is ascii, we just check continuation
|
||||
state.has_error =
|
||||
_mm_or_si128(_mm_cmpgt_epi8(state.previous.carried_continuations,
|
||||
_mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 1)),
|
||||
state.has_error);
|
||||
} else {
|
||||
// it is not ascii so we have to do heavy work
|
||||
state.previous =
|
||||
check_utf8_bytes(in.v0, &(state.previous), &(state.has_error));
|
||||
state.previous =
|
||||
check_utf8_bytes(in.v1, &(state.previous), &(state.has_error));
|
||||
}
|
||||
|
||||
if ((_mm_testz_si128(_mm_or_si128(in.v2, in.v3), high_bit)) == 1) {
|
||||
// it is ascii, we just check continuation
|
||||
state.has_error =
|
||||
_mm_or_si128(_mm_cmpgt_epi8(state.previous.carried_continuations,
|
||||
_mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 1)),
|
||||
state.has_error);
|
||||
} else {
|
||||
// it is not ascii so we have to do heavy work
|
||||
state.previous =
|
||||
check_utf8_bytes(in.v2, &(state.previous), &(state.has_error));
|
||||
state.previous =
|
||||
check_utf8_bytes(in.v3, &(state.previous), &(state.has_error));
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline ErrorValues check_utf8_errors<Architecture::WESTMERE>(
|
||||
utf8_checking_state<Architecture::WESTMERE> &state) {
|
||||
return _mm_testz_si128(state.has_error, state.has_error) == 0
|
||||
? simdjson::UTF8_ERROR
|
||||
: simdjson::SUCCESS;
|
||||
}
|
||||
|
||||
} // namespace simdjson
|
||||
UNTARGET_REGION // westmere
|
||||
|
||||
|
|
|
@ -5,12 +5,11 @@
|
|||
#include "simdjson/parsedjson.h"
|
||||
#include "simdjson/portability.h"
|
||||
#include "simdjson/simdjson.h"
|
||||
#include "simdjson/simd_input.h"
|
||||
#include <cassert>
|
||||
|
||||
namespace simdjson {
|
||||
|
||||
template <Architecture> struct simd_input;
|
||||
|
||||
template <Architecture> uint64_t compute_quote_mask(uint64_t quote_bits);
|
||||
|
||||
namespace {
|
||||
|
@ -36,17 +35,6 @@ void check_utf8(simd_input<T> in, utf8_checking_state<T> &state);
|
|||
template <Architecture T>
|
||||
ErrorValues check_utf8_errors(utf8_checking_state<T> &state);
|
||||
|
||||
// a straightforward comparison of a mask against input.
|
||||
template <Architecture T>
|
||||
uint64_t cmp_mask_against_input(simd_input<T> in, uint8_t m);
|
||||
|
||||
template <Architecture T> simd_input<T> fill_input(const uint8_t *ptr);
|
||||
|
||||
// find all values less than or equal than the content of maxval (using unsigned
|
||||
// arithmetic)
|
||||
template <Architecture T>
|
||||
uint64_t unsigned_lteq_against_input(simd_input<T> in, uint8_t m);
|
||||
|
||||
template <Architecture T>
|
||||
really_inline uint64_t find_odd_backslash_sequences(
|
||||
simd_input<T> in, uint64_t &prev_iter_ends_odd_backslash);
|
||||
|
|
|
@ -1,53 +1,12 @@
|
|||
#ifndef SIMDJSON_STAGE1_FIND_MARKS_ARM64_H
|
||||
#define SIMDJSON_STAGE1_FIND_MARKS_ARM64_H
|
||||
|
||||
#include "simdjson/simd_input_arm64.h"
|
||||
#include "simdjson/simdutf8check_arm64.h"
|
||||
#include "simdjson/stage1_find_marks.h"
|
||||
|
||||
#ifdef IS_ARM64
|
||||
namespace simdjson {
|
||||
template <> struct simd_input<Architecture::ARM64> {
|
||||
uint8x16_t i0;
|
||||
uint8x16_t i1;
|
||||
uint8x16_t i2;
|
||||
uint8x16_t i3;
|
||||
};
|
||||
|
||||
template <>
|
||||
really_inline simd_input<Architecture::ARM64>
|
||||
fill_input<Architecture::ARM64>(const uint8_t *ptr) {
|
||||
struct simd_input<Architecture::ARM64> in;
|
||||
in.i0 = vld1q_u8(ptr + 0);
|
||||
in.i1 = vld1q_u8(ptr + 16);
|
||||
in.i2 = vld1q_u8(ptr + 32);
|
||||
in.i3 = vld1q_u8(ptr + 48);
|
||||
return in;
|
||||
}
|
||||
|
||||
really_inline uint16_t neon_movemask(uint8x16_t input) {
|
||||
const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
||||
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
|
||||
uint8x16_t minput = vandq_u8(input, bit_mask);
|
||||
uint8x16_t tmp = vpaddq_u8(minput, minput);
|
||||
tmp = vpaddq_u8(tmp, tmp);
|
||||
tmp = vpaddq_u8(tmp, tmp);
|
||||
return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
|
||||
}
|
||||
|
||||
really_inline uint64_t neon_movemask_bulk(uint8x16_t p0, uint8x16_t p1,
|
||||
uint8x16_t p2, uint8x16_t p3) {
|
||||
const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
||||
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
|
||||
uint8x16_t t0 = vandq_u8(p0, bit_mask);
|
||||
uint8x16_t t1 = vandq_u8(p1, bit_mask);
|
||||
uint8x16_t t2 = vandq_u8(p2, bit_mask);
|
||||
uint8x16_t t3 = vandq_u8(p3, bit_mask);
|
||||
uint8x16_t sum0 = vpaddq_u8(t0, t1);
|
||||
uint8x16_t sum1 = vpaddq_u8(t2, t3);
|
||||
sum0 = vpaddq_u8(sum0, sum1);
|
||||
sum0 = vpaddq_u8(sum0, sum0);
|
||||
return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline uint64_t
|
||||
|
@ -59,84 +18,6 @@ compute_quote_mask<Architecture::ARM64>(uint64_t quote_bits) {
|
|||
#endif
|
||||
}
|
||||
|
||||
template <> struct utf8_checking_state<Architecture::ARM64> {
|
||||
int8x16_t has_error{};
|
||||
processed_utf_bytes previous{};
|
||||
};
|
||||
|
||||
// Checks that all bytes are ascii
|
||||
really_inline bool check_ascii_neon(simd_input<Architecture::ARM64> in) {
|
||||
// checking if the most significant bit is always equal to 0.
|
||||
uint8x16_t high_bit = vdupq_n_u8(0x80);
|
||||
uint8x16_t t0 = vorrq_u8(in.i0, in.i1);
|
||||
uint8x16_t t1 = vorrq_u8(in.i2, in.i3);
|
||||
uint8x16_t t3 = vorrq_u8(t0, t1);
|
||||
uint8x16_t t4 = vandq_u8(t3, high_bit);
|
||||
uint64x2_t v64 = vreinterpretq_u64_u8(t4);
|
||||
uint32x2_t v32 = vqmovn_u64(v64);
|
||||
uint64x1_t result = vreinterpret_u64_u32(v32);
|
||||
return vget_lane_u64(result, 0) == 0;
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline void check_utf8<Architecture::ARM64>(
|
||||
simd_input<Architecture::ARM64> in,
|
||||
utf8_checking_state<Architecture::ARM64> &state) {
|
||||
if (check_ascii_neon(in)) {
|
||||
// All bytes are ascii. Therefore the byte that was just before must be
|
||||
// ascii too. We only check the byte that was just before simd_input. Nines
|
||||
// are arbitrary values.
|
||||
const int8x16_t verror =
|
||||
(int8x16_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1};
|
||||
state.has_error =
|
||||
vorrq_s8(vreinterpretq_s8_u8(
|
||||
vcgtq_s8(state.previous.carried_continuations, verror)),
|
||||
state.has_error);
|
||||
} else {
|
||||
// it is not ascii so we have to do heavy work
|
||||
state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i0),
|
||||
&(state.previous), &(state.has_error));
|
||||
state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i1),
|
||||
&(state.previous), &(state.has_error));
|
||||
state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i2),
|
||||
&(state.previous), &(state.has_error));
|
||||
state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i3),
|
||||
&(state.previous), &(state.has_error));
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline ErrorValues check_utf8_errors<Architecture::ARM64>(
|
||||
utf8_checking_state<Architecture::ARM64> &state) {
|
||||
uint64x2_t v64 = vreinterpretq_u64_s8(state.has_error);
|
||||
uint32x2_t v32 = vqmovn_u64(v64);
|
||||
uint64x1_t result = vreinterpret_u64_u32(v32);
|
||||
return vget_lane_u64(result, 0) != 0 ? simdjson::UTF8_ERROR
|
||||
: simdjson::SUCCESS;
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline uint64_t cmp_mask_against_input<Architecture::ARM64>(
|
||||
simd_input<Architecture::ARM64> in, uint8_t m) {
|
||||
const uint8x16_t mask = vmovq_n_u8(m);
|
||||
uint8x16_t cmp_res_0 = vceqq_u8(in.i0, mask);
|
||||
uint8x16_t cmp_res_1 = vceqq_u8(in.i1, mask);
|
||||
uint8x16_t cmp_res_2 = vceqq_u8(in.i2, mask);
|
||||
uint8x16_t cmp_res_3 = vceqq_u8(in.i3, mask);
|
||||
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline uint64_t unsigned_lteq_against_input<Architecture::ARM64>(
|
||||
simd_input<Architecture::ARM64> in, uint8_t m) {
|
||||
const uint8x16_t mask = vmovq_n_u8(m);
|
||||
uint8x16_t cmp_res_0 = vcleq_u8(in.i0, mask);
|
||||
uint8x16_t cmp_res_1 = vcleq_u8(in.i1, mask);
|
||||
uint8x16_t cmp_res_2 = vcleq_u8(in.i2, mask);
|
||||
uint8x16_t cmp_res_3 = vcleq_u8(in.i3, mask);
|
||||
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline void find_whitespace_and_structurals<Architecture::ARM64>(
|
||||
simd_input<Architecture::ARM64> in, uint64_t &whitespace,
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#ifndef SIMDJSON_STAGE1_FIND_MARKS_HASWELL_H
|
||||
#define SIMDJSON_STAGE1_FIND_MARKS_HASWELL_H
|
||||
|
||||
#include "simdjson/simd_input_haswell.h"
|
||||
#include "simdjson/simdutf8check_haswell.h"
|
||||
#include "simdjson/stage1_find_marks.h"
|
||||
|
||||
|
@ -8,19 +9,6 @@
|
|||
|
||||
TARGET_HASWELL
|
||||
namespace simdjson {
|
||||
template <> struct simd_input<Architecture::HASWELL> {
|
||||
__m256i lo;
|
||||
__m256i hi;
|
||||
};
|
||||
|
||||
template <>
|
||||
really_inline simd_input<Architecture::HASWELL>
|
||||
fill_input<Architecture::HASWELL>(const uint8_t *ptr) {
|
||||
struct simd_input<Architecture::HASWELL> in;
|
||||
in.lo = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0));
|
||||
in.hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
|
||||
return in;
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline uint64_t
|
||||
|
@ -32,69 +20,6 @@ compute_quote_mask<Architecture::HASWELL>(uint64_t quote_bits) {
|
|||
return quote_mask;
|
||||
}
|
||||
|
||||
template <> struct utf8_checking_state<Architecture::HASWELL> {
|
||||
__m256i has_error;
|
||||
avx_processed_utf_bytes previous;
|
||||
utf8_checking_state() {
|
||||
has_error = _mm256_setzero_si256();
|
||||
previous.raw_bytes = _mm256_setzero_si256();
|
||||
previous.high_nibbles = _mm256_setzero_si256();
|
||||
previous.carried_continuations = _mm256_setzero_si256();
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
really_inline void check_utf8<Architecture::HASWELL>(
|
||||
simd_input<Architecture::HASWELL> in,
|
||||
utf8_checking_state<Architecture::HASWELL> &state) {
|
||||
__m256i high_bit = _mm256_set1_epi8(0x80u);
|
||||
if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), high_bit)) == 1) {
|
||||
// it is ascii, we just check continuation
|
||||
state.has_error = _mm256_or_si256(
|
||||
_mm256_cmpgt_epi8(state.previous.carried_continuations,
|
||||
_mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 9, 9, 1)),
|
||||
state.has_error);
|
||||
} else {
|
||||
// it is not ascii so we have to do heavy work
|
||||
state.previous =
|
||||
avx_check_utf8_bytes(in.lo, &(state.previous), &(state.has_error));
|
||||
state.previous =
|
||||
avx_check_utf8_bytes(in.hi, &(state.previous), &(state.has_error));
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline ErrorValues check_utf8_errors<Architecture::HASWELL>(
|
||||
utf8_checking_state<Architecture::HASWELL> &state) {
|
||||
return _mm256_testz_si256(state.has_error, state.has_error) == 0
|
||||
? simdjson::UTF8_ERROR
|
||||
: simdjson::SUCCESS;
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline uint64_t cmp_mask_against_input<Architecture::HASWELL>(
|
||||
simd_input<Architecture::HASWELL> in, uint8_t m) {
|
||||
const __m256i mask = _mm256_set1_epi8(m);
|
||||
__m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask);
|
||||
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
|
||||
__m256i cmp_res_1 = _mm256_cmpeq_epi8(in.hi, mask);
|
||||
uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
|
||||
return res_0 | (res_1 << 32);
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline uint64_t unsigned_lteq_against_input<Architecture::HASWELL>(
|
||||
simd_input<Architecture::HASWELL> in, uint8_t m) {
|
||||
const __m256i maxval = _mm256_set1_epi8(m);
|
||||
__m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.lo), maxval);
|
||||
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
|
||||
__m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.hi), maxval);
|
||||
uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
|
||||
return res_0 | (res_1 << 32);
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline void find_whitespace_and_structurals<Architecture::HASWELL>(
|
||||
simd_input<Architecture::HASWELL> in, uint64_t &whitespace,
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#ifndef SIMDJSON_STAGE1_FIND_MARKS_WESTMERE_H
|
||||
#define SIMDJSON_STAGE1_FIND_MARKS_WESTMERE_H
|
||||
|
||||
#include "simdjson/simd_input_westmere.h"
|
||||
#include "simdjson/simdutf8check_westmere.h"
|
||||
#include "simdjson/stage1_find_marks.h"
|
||||
|
||||
|
@ -8,23 +9,6 @@
|
|||
|
||||
TARGET_WESTMERE
|
||||
namespace simdjson {
|
||||
template <> struct simd_input<Architecture::WESTMERE> {
|
||||
__m128i v0;
|
||||
__m128i v1;
|
||||
__m128i v2;
|
||||
__m128i v3;
|
||||
};
|
||||
|
||||
template <>
|
||||
really_inline simd_input<Architecture::WESTMERE>
|
||||
fill_input<Architecture::WESTMERE>(const uint8_t *ptr) {
|
||||
struct simd_input<Architecture::WESTMERE> in;
|
||||
in.v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 0));
|
||||
in.v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16));
|
||||
in.v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 32));
|
||||
in.v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 48));
|
||||
return in;
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline uint64_t
|
||||
|
@ -33,89 +17,6 @@ compute_quote_mask<Architecture::WESTMERE>(uint64_t quote_bits) {
|
|||
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFFu), 0));
|
||||
}
|
||||
|
||||
template <> struct utf8_checking_state<Architecture::WESTMERE> {
|
||||
__m128i has_error = _mm_setzero_si128();
|
||||
processed_utf_bytes previous{
|
||||
_mm_setzero_si128(), // raw_bytes
|
||||
_mm_setzero_si128(), // high_nibbles
|
||||
_mm_setzero_si128() // carried_continuations
|
||||
};
|
||||
};
|
||||
|
||||
template <>
|
||||
really_inline void check_utf8<Architecture::WESTMERE>(
|
||||
simd_input<Architecture::WESTMERE> in,
|
||||
utf8_checking_state<Architecture::WESTMERE> &state) {
|
||||
__m128i high_bit = _mm_set1_epi8(0x80u);
|
||||
if ((_mm_testz_si128(_mm_or_si128(in.v0, in.v1), high_bit)) == 1) {
|
||||
// it is ascii, we just check continuation
|
||||
state.has_error =
|
||||
_mm_or_si128(_mm_cmpgt_epi8(state.previous.carried_continuations,
|
||||
_mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 1)),
|
||||
state.has_error);
|
||||
} else {
|
||||
// it is not ascii so we have to do heavy work
|
||||
state.previous =
|
||||
check_utf8_bytes(in.v0, &(state.previous), &(state.has_error));
|
||||
state.previous =
|
||||
check_utf8_bytes(in.v1, &(state.previous), &(state.has_error));
|
||||
}
|
||||
|
||||
if ((_mm_testz_si128(_mm_or_si128(in.v2, in.v3), high_bit)) == 1) {
|
||||
// it is ascii, we just check continuation
|
||||
state.has_error =
|
||||
_mm_or_si128(_mm_cmpgt_epi8(state.previous.carried_continuations,
|
||||
_mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 1)),
|
||||
state.has_error);
|
||||
} else {
|
||||
// it is not ascii so we have to do heavy work
|
||||
state.previous =
|
||||
check_utf8_bytes(in.v2, &(state.previous), &(state.has_error));
|
||||
state.previous =
|
||||
check_utf8_bytes(in.v3, &(state.previous), &(state.has_error));
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline ErrorValues check_utf8_errors<Architecture::WESTMERE>(
|
||||
utf8_checking_state<Architecture::WESTMERE> &state) {
|
||||
return _mm_testz_si128(state.has_error, state.has_error) == 0
|
||||
? simdjson::UTF8_ERROR
|
||||
: simdjson::SUCCESS;
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline uint64_t cmp_mask_against_input<Architecture::WESTMERE>(
|
||||
simd_input<Architecture::WESTMERE> in, uint8_t m) {
|
||||
const __m128i mask = _mm_set1_epi8(m);
|
||||
__m128i cmp_res_0 = _mm_cmpeq_epi8(in.v0, mask);
|
||||
uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
|
||||
__m128i cmp_res_1 = _mm_cmpeq_epi8(in.v1, mask);
|
||||
uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
|
||||
__m128i cmp_res_2 = _mm_cmpeq_epi8(in.v2, mask);
|
||||
uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
|
||||
__m128i cmp_res_3 = _mm_cmpeq_epi8(in.v3, mask);
|
||||
uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
|
||||
return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline uint64_t unsigned_lteq_against_input<Architecture::WESTMERE>(
|
||||
simd_input<Architecture::WESTMERE> in, uint8_t m) {
|
||||
const __m128i maxval = _mm_set1_epi8(m);
|
||||
__m128i cmp_res_0 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v0), maxval);
|
||||
uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
|
||||
__m128i cmp_res_1 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v1), maxval);
|
||||
uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
|
||||
__m128i cmp_res_2 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v2), maxval);
|
||||
uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
|
||||
__m128i cmp_res_3 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v3), maxval);
|
||||
uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
|
||||
return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline void find_whitespace_and_structurals<Architecture::WESTMERE>(
|
||||
simd_input<Architecture::WESTMERE> in, uint64_t &whitespace,
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* auto-generated on Sun Aug 4 15:43:41 EDT 2019. Do not edit! */
|
||||
/* auto-generated on Wed Aug 14 10:31:26 DST 2019. Do not edit! */
|
||||
|
||||
#include <iostream>
|
||||
#include "simdjson.h"
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue