Ok. Let us benchmark this thing.
This commit is contained in:
parent
a43b0772e1
commit
58ac242770
|
@ -149,7 +149,7 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
|
||||
#ifndef SQUASH_COUNTERS
|
||||
printf("number of bytes %ld number of structural chars %d ratio %.3f\n",
|
||||
printf("number of bytes %ld number of structural chars %u ratio %.3f\n",
|
||||
p.second, pj.n_structural_indexes,
|
||||
(double)pj.n_structural_indexes / p.second);
|
||||
unsigned long total = cy1 + cy2 + cy3;
|
||||
|
|
|
@ -40,7 +40,7 @@ typedef __m256i m256;
|
|||
#define unlikely(x) __builtin_expect(!!(x), 0)
|
||||
#endif
|
||||
|
||||
static inline u32 ctz64(u64 x) {
|
||||
/*static inline u32 ctz64(u64 x) {
|
||||
assert(x); // behaviour not defined for x == 0
|
||||
#if defined(_WIN64)
|
||||
unsigned long r;
|
||||
|
@ -56,4 +56,4 @@ static inline u32 ctz64(u64 x) {
|
|||
#else
|
||||
return (u32)__builtin_ctzll(x);
|
||||
#endif
|
||||
}
|
||||
}*/
|
||||
|
|
|
@ -50,7 +50,7 @@ const char digittoval[256] = {
|
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1};
|
||||
|
||||
// return true if we have a valid hex between 0000 and FFFF
|
||||
inline bool hex_to_u32(const u8 *src, u32 *res) {
|
||||
/*inline bool hex_to_u32(const u8 *src, u32 *res) {
|
||||
u8 v1 = src[0];
|
||||
u8 v2 = src[1];
|
||||
u8 v3 = src[2];
|
||||
|
@ -58,7 +58,7 @@ inline bool hex_to_u32(const u8 *src, u32 *res) {
|
|||
*res = digittoval[v1] << 12 | digittoval[v2] << 8 | digittoval[v3] << 4 |
|
||||
digittoval[v4];
|
||||
return (int32_t)(*res) >= 0;
|
||||
}
|
||||
}*/
|
||||
|
||||
// returns a value with the highest bit set if it is not valid
|
||||
uint32_t hex_to_u32_nocheck(const u8 *src) {
|
||||
|
|
|
@ -147,6 +147,8 @@ static inline uint32_t parse_eight_digits_unrolled(const char *chars) {
|
|||
//
|
||||
// This function will almost never be called!!!
|
||||
//
|
||||
// Note: a redesign could avoid this function entirely.
|
||||
//
|
||||
static never_inline bool
|
||||
parse_highprecision_float(const u8 *const buf, UNUSED size_t len,
|
||||
ParsedJson &pj, UNUSED const u32 depth, const u32 offset,
|
||||
|
|
|
@ -97,30 +97,32 @@ public:
|
|||
//
|
||||
|
||||
// this should be considered a private function
|
||||
void write_tape(u64 val, u8 c) {
|
||||
inline void write_tape(u64 val, u8 c) {
|
||||
tape[current_loc++] = val | (((u64)c) << 56);
|
||||
//tape[tape_locs[depth]] = val | (((u64)c) << 56);
|
||||
//tape_locs[depth]++;
|
||||
}
|
||||
|
||||
|
||||
void write_tape_s64(s64 i) {
|
||||
*((s64 *)current_number_buf_loc) = i;// safe because array will be 8-byte aligned, could use memcpy
|
||||
inline void write_tape_s64(s64 i) {
|
||||
memcpy(current_number_buf_loc, &i, sizeof(s64));
|
||||
//*((s64 *)current_number_buf_loc) = i;// safe because array will be 8-byte aligned, could use memcpy
|
||||
current_number_buf_loc += sizeof(s64);
|
||||
write_tape(current_number_buf_loc - number_buf, 'l');
|
||||
}
|
||||
|
||||
void write_tape_double(double d) {
|
||||
*((double *)current_number_buf_loc) = d;// safe because array will be 8-byte aligned, could use memcpy
|
||||
inline void write_tape_double(double d) {
|
||||
memcpy(current_number_buf_loc, &d, sizeof(double));
|
||||
//*((double *)current_number_buf_loc) = d;// safe because array will be 8-byte aligned, could use memcpy
|
||||
current_number_buf_loc += sizeof(double);
|
||||
write_tape(current_number_buf_loc - number_buf, 'd');
|
||||
}
|
||||
|
||||
u32 get_current_loc() {
|
||||
inline u32 get_current_loc() {
|
||||
return current_loc;
|
||||
}
|
||||
|
||||
void annotate_previousloc(u32 saved_loc,u64 val) {
|
||||
inline void annotate_previousloc(u32 saved_loc,u64 val) {
|
||||
tape[saved_loc] |= val;
|
||||
}
|
||||
|
||||
|
@ -167,7 +169,7 @@ public:
|
|||
|
||||
|
||||
#ifdef DEBUG
|
||||
inline void dump256(m256 d, const std::string msg) {
|
||||
inline void dump256(m256 d, const std::string& msg) {
|
||||
for (u32 i = 0; i < 32; i++) {
|
||||
std::cout << std::setw(3) << (int)*(((u8 *)(&d)) + i);
|
||||
if (!((i + 1) % 8))
|
||||
|
@ -181,14 +183,14 @@ inline void dump256(m256 d, const std::string msg) {
|
|||
}
|
||||
|
||||
// dump bits low to high
|
||||
inline void dumpbits(u64 v, const std::string msg) {
|
||||
inline void dumpbits(u64 v, const std::string& msg) {
|
||||
for (u32 i = 0; i < 64; i++) {
|
||||
std::cout << (((v >> (u64)i) & 0x1ULL) ? "1" : "_");
|
||||
}
|
||||
std::cout << " " << msg << "\n";
|
||||
}
|
||||
|
||||
inline void dumpbits32(u32 v, const std::string msg) {
|
||||
inline void dumpbits32(u32 v, const std::string& msg) {
|
||||
for (u32 i = 0; i < 32; i++) {
|
||||
std::cout << (((v >> (u32)i) & 0x1ULL) ? "1" : "_");
|
||||
}
|
||||
|
@ -201,14 +203,14 @@ inline void dumpbits32(u32 v, const std::string msg) {
|
|||
#endif
|
||||
|
||||
// dump bits low to high
|
||||
inline void dumpbits_always(u64 v, const std::string msg) {
|
||||
inline void dumpbits_always(u64 v, const std::string& msg) {
|
||||
for (u32 i = 0; i < 64; i++) {
|
||||
std::cout << (((v >> (u64)i) & 0x1ULL) ? "1" : "_");
|
||||
}
|
||||
std::cout << " " << msg << "\n";
|
||||
}
|
||||
|
||||
inline void dumpbits32_always(u32 v, const std::string msg) {
|
||||
inline void dumpbits32_always(u32 v, const std::string& msg) {
|
||||
for (u32 i = 0; i < 32; i++) {
|
||||
std::cout << (((v >> (u32)i) & 0x1ULL) ? "1" : "_");
|
||||
}
|
||||
|
|
|
@ -24,168 +24,7 @@
|
|||
*/
|
||||
|
||||
// all byte values must be no larger than 0xF4
|
||||
static inline void checkSmallerThan0xF4(__m128i current_bytes,
|
||||
__m128i *has_error) {
|
||||
// unsigned, saturates to 0 below max
|
||||
*has_error = _mm_or_si128(*has_error,
|
||||
_mm_subs_epu8(current_bytes, _mm_set1_epi8(0xF4)));
|
||||
}
|
||||
|
||||
static inline __m128i continuationLengths(__m128i high_nibbles) {
|
||||
return _mm_shuffle_epi8(
|
||||
_mm_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII)
|
||||
0, 0, 0, 0, // 10xx (continuation)
|
||||
2, 2, // 110x
|
||||
3, // 1110
|
||||
4), // 1111, next should be 0 (not checked here)
|
||||
high_nibbles);
|
||||
}
|
||||
|
||||
static inline __m128i carryContinuations(__m128i initial_lengths,
|
||||
__m128i previous_carries) {
|
||||
|
||||
__m128i right1 =
|
||||
_mm_subs_epu8(_mm_alignr_epi8(initial_lengths, previous_carries, 16 - 1),
|
||||
_mm_set1_epi8(1));
|
||||
__m128i sum = _mm_add_epi8(initial_lengths, right1);
|
||||
|
||||
__m128i right2 = _mm_subs_epu8(_mm_alignr_epi8(sum, previous_carries, 16 - 2),
|
||||
_mm_set1_epi8(2));
|
||||
return _mm_add_epi8(sum, right2);
|
||||
}
|
||||
|
||||
static inline void checkContinuations(__m128i initial_lengths, __m128i carries,
|
||||
__m128i *has_error) {
|
||||
|
||||
// overlap || underlap
|
||||
// carry > length && length > 0 || !(carry > length) && !(length > 0)
|
||||
// (carries > length) == (lengths > 0)
|
||||
__m128i overunder =
|
||||
_mm_cmpeq_epi8(_mm_cmpgt_epi8(carries, initial_lengths),
|
||||
_mm_cmpgt_epi8(initial_lengths, _mm_setzero_si128()));
|
||||
|
||||
*has_error = _mm_or_si128(*has_error, overunder);
|
||||
}
|
||||
|
||||
// when 0xED is found, next byte must be no larger than 0x9F
|
||||
// when 0xF4 is found, next byte must be no larger than 0x8F
|
||||
// next byte must be continuation, ie sign bit is set, so signed < is ok
|
||||
static inline void checkFirstContinuationMax(__m128i current_bytes,
|
||||
__m128i off1_current_bytes,
|
||||
__m128i *has_error) {
|
||||
__m128i maskED = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xED));
|
||||
__m128i maskF4 = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xF4));
|
||||
|
||||
__m128i badfollowED =
|
||||
_mm_and_si128(_mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x9F)), maskED);
|
||||
__m128i badfollowF4 =
|
||||
_mm_and_si128(_mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x8F)), maskF4);
|
||||
|
||||
*has_error = _mm_or_si128(*has_error, _mm_or_si128(badfollowED, badfollowF4));
|
||||
}
|
||||
|
||||
// map off1_hibits => error condition
|
||||
// hibits off1 cur
|
||||
// C => < C2 && true
|
||||
// E => < E1 && < A0
|
||||
// F => < F1 && < 90
|
||||
// else false && false
|
||||
static inline void checkOverlong(__m128i current_bytes,
|
||||
__m128i off1_current_bytes, __m128i hibits,
|
||||
__m128i previous_hibits, __m128i *has_error) {
|
||||
__m128i off1_hibits = _mm_alignr_epi8(hibits, previous_hibits, 16 - 1);
|
||||
__m128i initial_mins = _mm_shuffle_epi8(
|
||||
_mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
|
||||
-128, -128, // 10xx => false
|
||||
0xC2, -128, // 110x
|
||||
0xE1, // 1110
|
||||
0xF1),
|
||||
off1_hibits);
|
||||
|
||||
__m128i initial_under = _mm_cmpgt_epi8(initial_mins, off1_current_bytes);
|
||||
|
||||
__m128i second_mins = _mm_shuffle_epi8(
|
||||
_mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
|
||||
-128, -128, // 10xx => false
|
||||
127, 127, // 110x => true
|
||||
0xA0, // 1110
|
||||
0x90),
|
||||
off1_hibits);
|
||||
__m128i second_under = _mm_cmpgt_epi8(second_mins, current_bytes);
|
||||
*has_error =
|
||||
_mm_or_si128(*has_error, _mm_and_si128(initial_under, second_under));
|
||||
}
|
||||
|
||||
struct processed_utf_bytes {
|
||||
__m128i rawbytes;
|
||||
__m128i high_nibbles;
|
||||
__m128i carried_continuations;
|
||||
};
|
||||
|
||||
static inline void count_nibbles(__m128i bytes,
|
||||
struct processed_utf_bytes *answer) {
|
||||
answer->rawbytes = bytes;
|
||||
answer->high_nibbles =
|
||||
_mm_and_si128(_mm_srli_epi16(bytes, 4), _mm_set1_epi8(0x0F));
|
||||
}
|
||||
|
||||
// check whether the current bytes are valid UTF-8
|
||||
// at the end of the function, previous gets updated
|
||||
static struct processed_utf_bytes
|
||||
checkUTF8Bytes(__m128i current_bytes, struct processed_utf_bytes *previous,
|
||||
__m128i *has_error) {
|
||||
struct processed_utf_bytes pb;
|
||||
count_nibbles(current_bytes, &pb);
|
||||
|
||||
checkSmallerThan0xF4(current_bytes, has_error);
|
||||
|
||||
__m128i initial_lengths = continuationLengths(pb.high_nibbles);
|
||||
|
||||
pb.carried_continuations =
|
||||
carryContinuations(initial_lengths, previous->carried_continuations);
|
||||
|
||||
checkContinuations(initial_lengths, pb.carried_continuations, has_error);
|
||||
|
||||
__m128i off1_current_bytes =
|
||||
_mm_alignr_epi8(pb.rawbytes, previous->rawbytes, 16 - 1);
|
||||
checkFirstContinuationMax(current_bytes, off1_current_bytes, has_error);
|
||||
|
||||
checkOverlong(current_bytes, off1_current_bytes, pb.high_nibbles,
|
||||
previous->high_nibbles, has_error);
|
||||
return pb;
|
||||
}
|
||||
|
||||
static inline bool validate_utf8_fast(const char *src, size_t len) {
|
||||
size_t i = 0;
|
||||
__m128i has_error = _mm_setzero_si128();
|
||||
struct processed_utf_bytes previous = {.rawbytes = _mm_setzero_si128(),
|
||||
.high_nibbles = _mm_setzero_si128(),
|
||||
.carried_continuations =
|
||||
_mm_setzero_si128()};
|
||||
if (len >= 16) {
|
||||
for (; i <= len - 16; i += 16) {
|
||||
__m128i current_bytes = _mm_loadu_si128((const __m128i *)(src + i));
|
||||
previous = checkUTF8Bytes(current_bytes, &previous, &has_error);
|
||||
}
|
||||
}
|
||||
|
||||
// last part
|
||||
if (i < len) {
|
||||
char buffer[16];
|
||||
memset(buffer, 0, 16);
|
||||
memcpy(buffer, src + i, len - i);
|
||||
__m128i current_bytes = _mm_loadu_si128((const __m128i *)(buffer));
|
||||
previous = checkUTF8Bytes(current_bytes, &previous, &has_error);
|
||||
} else {
|
||||
has_error =
|
||||
_mm_or_si128(_mm_cmpgt_epi8(previous.carried_continuations,
|
||||
_mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 1)),
|
||||
has_error);
|
||||
}
|
||||
|
||||
return _mm_testz_si128(has_error, has_error);
|
||||
}
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
|
@ -349,109 +188,7 @@ avxcheckUTF8Bytes(__m256i current_bytes,
|
|||
return pb;
|
||||
}
|
||||
|
||||
static inline bool validate_utf8_fast_avx(const char *src, size_t len) {
|
||||
size_t i = 0;
|
||||
__m256i has_error = _mm256_setzero_si256();
|
||||
struct avx_processed_utf_bytes previous = {
|
||||
.rawbytes = _mm256_setzero_si256(),
|
||||
.high_nibbles = _mm256_setzero_si256(),
|
||||
.carried_continuations = _mm256_setzero_si256()};
|
||||
if (len >= 32) {
|
||||
for (; i <= len - 32; i += 32) {
|
||||
__m256i current_bytes = _mm256_loadu_si256((const __m256i *)(src + i));
|
||||
previous = avxcheckUTF8Bytes(current_bytes, &previous, &has_error);
|
||||
}
|
||||
}
|
||||
|
||||
// last part
|
||||
if (i < len) {
|
||||
char buffer[32];
|
||||
memset(buffer, 0, 32);
|
||||
memcpy(buffer, src + i, len - i);
|
||||
__m256i current_bytes = _mm256_loadu_si256((const __m256i *)(buffer));
|
||||
previous = avxcheckUTF8Bytes(current_bytes, &previous, &has_error);
|
||||
} else {
|
||||
has_error = _mm256_or_si256(
|
||||
_mm256_cmpgt_epi8(previous.carried_continuations,
|
||||
_mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 9, 9, 1)),
|
||||
has_error);
|
||||
}
|
||||
|
||||
return _mm256_testz_si256(has_error, has_error);
|
||||
}
|
||||
|
||||
|
||||
// check whether the current bytes are valid UTF-8
|
||||
// at the end of the function, previous gets updated
|
||||
static struct avx_processed_utf_bytes
|
||||
avxcheckUTF8Bytes_asciipath(__m256i current_bytes,
|
||||
struct avx_processed_utf_bytes *previous,
|
||||
__m256i *has_error) {
|
||||
if(_mm256_testz_si256(current_bytes,_mm256_set1_epi8(0x80))) { // fast ascii path
|
||||
*has_error = _mm256_or_si256(
|
||||
_mm256_cmpgt_epi8(previous->carried_continuations,
|
||||
_mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 9, 9, 1)),*has_error);
|
||||
return *previous;
|
||||
}
|
||||
struct avx_processed_utf_bytes pb;
|
||||
avx_count_nibbles(current_bytes, &pb);
|
||||
|
||||
avxcheckSmallerThan0xF4(current_bytes, has_error);
|
||||
|
||||
__m256i initial_lengths = avxcontinuationLengths(pb.high_nibbles);
|
||||
|
||||
pb.carried_continuations =
|
||||
avxcarryContinuations(initial_lengths, previous->carried_continuations);
|
||||
|
||||
avxcheckContinuations(initial_lengths, pb.carried_continuations, has_error);
|
||||
|
||||
__m256i off1_current_bytes =
|
||||
push_last_byte_of_a_to_b(previous->rawbytes, pb.rawbytes);
|
||||
avxcheckFirstContinuationMax(current_bytes, off1_current_bytes, has_error);
|
||||
|
||||
avxcheckOverlong(current_bytes, off1_current_bytes, pb.high_nibbles,
|
||||
previous->high_nibbles, has_error);
|
||||
return pb;
|
||||
}
|
||||
|
||||
static inline bool validate_utf8_fast_avx_asciipath(const char *src, size_t len) {
|
||||
size_t i = 0;
|
||||
__m256i has_error = _mm256_setzero_si256();
|
||||
struct avx_processed_utf_bytes previous = {
|
||||
.rawbytes = _mm256_setzero_si256(),
|
||||
.high_nibbles = _mm256_setzero_si256(),
|
||||
.carried_continuations = _mm256_setzero_si256()};
|
||||
if (len >= 32) {
|
||||
for (; i <= len - 32; i += 32) {
|
||||
__m256i current_bytes = _mm256_loadu_si256((const __m256i *)(src + i));
|
||||
previous = avxcheckUTF8Bytes_asciipath(current_bytes, &previous, &has_error);
|
||||
}
|
||||
}
|
||||
|
||||
// last part
|
||||
if (i < len) {
|
||||
char buffer[32];
|
||||
memset(buffer, 0, 32);
|
||||
memcpy(buffer, src + i, len - i);
|
||||
__m256i current_bytes = _mm256_loadu_si256((const __m256i *)(buffer));
|
||||
previous = avxcheckUTF8Bytes(current_bytes, &previous, &has_error);
|
||||
} else {
|
||||
has_error = _mm256_or_si256(
|
||||
_mm256_cmpgt_epi8(previous.carried_continuations,
|
||||
_mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 9, 9, 1)),
|
||||
has_error);
|
||||
}
|
||||
|
||||
return _mm256_testz_si256(has_error, has_error);
|
||||
}
|
||||
|
||||
|
||||
|
||||
#else // __AVX2__
|
||||
#warning "We require AVX2 support!"
|
||||
#endif // __AVX2__
|
||||
#endif
|
||||
|
|
|
@ -205,10 +205,10 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) {
|
|||
uint64_t odd_starts = start_edges & ~even_start_mask;
|
||||
uint64_t even_carries = bs_bits + even_starts;
|
||||
uint64_t odd_carries;
|
||||
bool iter_ends_odd_backslash = __builtin_uaddll_overflow(
|
||||
bs_bits, odd_starts, (unsigned long long *)&odd_carries);
|
||||
//bool iter_ends_odd_backslash =
|
||||
__builtin_uaddll_overflow( bs_bits, odd_starts, (unsigned long long *)&odd_carries);
|
||||
odd_carries |= prev_iter_ends_odd_backslash;
|
||||
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
|
||||
//prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; // we never use it
|
||||
uint64_t even_carry_ends = even_carries & ~bs_bits;
|
||||
uint64_t odd_carry_ends = odd_carries & ~bs_bits;
|
||||
uint64_t even_start_odd_end = even_carry_ends & odd_bits;
|
||||
|
|
|
@ -54,7 +54,7 @@ bool flatten_indexes(size_t len, ParsedJson &pj) {
|
|||
u32 *base_ptr = pj.structural_indexes;
|
||||
u32 base = 0;
|
||||
#ifdef BUILDHISTOGRAM
|
||||
uint32_t counters[65];
|
||||
uint32_t counters[66];
|
||||
uint32_t total = 0;
|
||||
for (int k = 0; k < 66; k++)
|
||||
counters[k] = 0;
|
||||
|
|
Loading…
Reference in New Issue