Ok. Let us benchmark this thing.

This commit is contained in:
Daniel Lemire 2018-11-27 15:05:50 -05:00
parent a43b0772e1
commit 58ac242770
8 changed files with 27 additions and 286 deletions

View File

@ -149,7 +149,7 @@ int main(int argc, char *argv[]) {
}
#ifndef SQUASH_COUNTERS
printf("number of bytes %ld number of structural chars %d ratio %.3f\n",
printf("number of bytes %ld number of structural chars %u ratio %.3f\n",
p.second, pj.n_structural_indexes,
(double)pj.n_structural_indexes / p.second);
unsigned long total = cy1 + cy2 + cy3;

View File

@ -40,7 +40,7 @@ typedef __m256i m256;
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif
static inline u32 ctz64(u64 x) {
/*static inline u32 ctz64(u64 x) {
assert(x); // behaviour not defined for x == 0
#if defined(_WIN64)
unsigned long r;
@ -56,4 +56,4 @@ static inline u32 ctz64(u64 x) {
#else
return (u32)__builtin_ctzll(x);
#endif
}
}*/

View File

@ -50,7 +50,7 @@ const char digittoval[256] = {
-1, -1, -1, -1, -1, -1, -1, -1, -1};
// return true if we have a valid hex between 0000 and FFFF
inline bool hex_to_u32(const u8 *src, u32 *res) {
/*inline bool hex_to_u32(const u8 *src, u32 *res) {
u8 v1 = src[0];
u8 v2 = src[1];
u8 v3 = src[2];
@ -58,7 +58,7 @@ inline bool hex_to_u32(const u8 *src, u32 *res) {
*res = digittoval[v1] << 12 | digittoval[v2] << 8 | digittoval[v3] << 4 |
digittoval[v4];
return (int32_t)(*res) >= 0;
}
}*/
// returns a value with the highest bit set if it is not valid
uint32_t hex_to_u32_nocheck(const u8 *src) {

View File

@ -147,6 +147,8 @@ static inline uint32_t parse_eight_digits_unrolled(const char *chars) {
//
// This function will almost never be called!!!
//
// Note: a redesign could avoid this function entirely.
//
static never_inline bool
parse_highprecision_float(const u8 *const buf, UNUSED size_t len,
ParsedJson &pj, UNUSED const u32 depth, const u32 offset,

View File

@ -97,30 +97,32 @@ public:
//
// this should be considered a private function
void write_tape(u64 val, u8 c) {
inline void write_tape(u64 val, u8 c) {
tape[current_loc++] = val | (((u64)c) << 56);
//tape[tape_locs[depth]] = val | (((u64)c) << 56);
//tape_locs[depth]++;
}
void write_tape_s64(s64 i) {
*((s64 *)current_number_buf_loc) = i;// safe because array will be 8-byte aligned, could use memcpy
inline void write_tape_s64(s64 i) {
memcpy(current_number_buf_loc, &i, sizeof(s64));
//*((s64 *)current_number_buf_loc) = i;// safe because array will be 8-byte aligned, could use memcpy
current_number_buf_loc += sizeof(s64);
write_tape(current_number_buf_loc - number_buf, 'l');
}
void write_tape_double(double d) {
*((double *)current_number_buf_loc) = d;// safe because array will be 8-byte aligned, could use memcpy
inline void write_tape_double(double d) {
memcpy(current_number_buf_loc, &d, sizeof(double));
//*((double *)current_number_buf_loc) = d;// safe because array will be 8-byte aligned, could use memcpy
current_number_buf_loc += sizeof(double);
write_tape(current_number_buf_loc - number_buf, 'd');
}
u32 get_current_loc() {
inline u32 get_current_loc() {
return current_loc;
}
void annotate_previousloc(u32 saved_loc,u64 val) {
inline void annotate_previousloc(u32 saved_loc,u64 val) {
tape[saved_loc] |= val;
}
@ -167,7 +169,7 @@ public:
#ifdef DEBUG
inline void dump256(m256 d, const std::string msg) {
inline void dump256(m256 d, const std::string& msg) {
for (u32 i = 0; i < 32; i++) {
std::cout << std::setw(3) << (int)*(((u8 *)(&d)) + i);
if (!((i + 1) % 8))
@ -181,14 +183,14 @@ inline void dump256(m256 d, const std::string msg) {
}
// dump bits low to high
inline void dumpbits(u64 v, const std::string msg) {
inline void dumpbits(u64 v, const std::string& msg) {
for (u32 i = 0; i < 64; i++) {
std::cout << (((v >> (u64)i) & 0x1ULL) ? "1" : "_");
}
std::cout << " " << msg << "\n";
}
inline void dumpbits32(u32 v, const std::string msg) {
inline void dumpbits32(u32 v, const std::string& msg) {
for (u32 i = 0; i < 32; i++) {
std::cout << (((v >> (u32)i) & 0x1ULL) ? "1" : "_");
}
@ -201,14 +203,14 @@ inline void dumpbits32(u32 v, const std::string msg) {
#endif
// dump bits low to high
inline void dumpbits_always(u64 v, const std::string msg) {
inline void dumpbits_always(u64 v, const std::string& msg) {
for (u32 i = 0; i < 64; i++) {
std::cout << (((v >> (u64)i) & 0x1ULL) ? "1" : "_");
}
std::cout << " " << msg << "\n";
}
inline void dumpbits32_always(u32 v, const std::string msg) {
inline void dumpbits32_always(u32 v, const std::string& msg) {
for (u32 i = 0; i < 32; i++) {
std::cout << (((v >> (u32)i) & 0x1ULL) ? "1" : "_");
}

View File

@ -24,168 +24,7 @@
*/
// Flags every byte lane whose unsigned value is greater than 0xF4.
// Offending lanes are OR-ed into *has_error as non-zero values; lanes that
// are <= 0xF4 contribute nothing.
static inline void checkSmallerThan0xF4(__m128i current_bytes,
                                        __m128i *has_error) {
  const __m128i upper_bound = _mm_set1_epi8(0xF4);
  // Unsigned saturating subtraction: anything <= 0xF4 clamps to 0, anything
  // larger leaves a non-zero remainder.
  const __m128i excess = _mm_subs_epu8(current_bytes, upper_bound);
  *has_error = _mm_or_si128(*has_error, excess);
}
// Translates each lane's high nibble into a length code through a 16-entry
// byte-shuffle lookup table: 1 for ASCII, 0 for a continuation byte, and
// 2/3/4 for the lead byte of a multi-byte sequence.
// high_nibbles is used as the shuffle index; count_nibbles produces it already
// masked with 0x0F.
static inline __m128i continuationLengths(__m128i high_nibbles) {
  const __m128i length_by_nibble =
      _mm_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII)
                    0, 0, 0, 0,             // 10xx (continuation)
                    2, 2,                   // 110x
                    3,                      // 1110
                    4); // 1111, next should be 0 (not checked here)
  return _mm_shuffle_epi8(length_by_nibble, high_nibbles);
}
// Spreads each length code forward over the lanes that follow it.
// Step 1 adds (value one lane back) - 1, step 2 adds (running sum two lanes
// back) - 2, both with unsigned saturation at zero. Lanes before the start of
// this block are taken from previous_carries, so sequences that straddle two
// blocks keep counting.
static inline __m128i carryContinuations(__m128i initial_lengths,
                                         __m128i previous_carries) {
  const __m128i one = _mm_set1_epi8(1);
  const __m128i two = _mm_set1_epi8(2);

  // initial_lengths shifted by one lane, pulling in the last previous carry.
  const __m128i back_by_one =
      _mm_alignr_epi8(initial_lengths, previous_carries, 15);
  const __m128i first_pass =
      _mm_add_epi8(initial_lengths, _mm_subs_epu8(back_by_one, one));

  // Running sum shifted by two lanes, pulling in the last two previous carries.
  const __m128i back_by_two = _mm_alignr_epi8(first_pass, previous_carries, 14);
  return _mm_add_epi8(first_pass, _mm_subs_epu8(back_by_two, two));
}
// Detects sequences that overlap (a new sequence starts while the previous
// one is still expecting continuation bytes) or underlap (a continuation byte
// appears where none is expected).
// A lane is in error exactly when (carry > length) and (length > 0) agree:
//   both true  -> overlap
//   both false -> underlap
static inline void checkContinuations(__m128i initial_lengths, __m128i carries,
                                      __m128i *has_error) {
  const __m128i carry_exceeds_length =
      _mm_cmpgt_epi8(carries, initial_lengths);
  const __m128i length_is_positive =
      _mm_cmpgt_epi8(initial_lengths, _mm_setzero_si128());
  const __m128i over_or_under =
      _mm_cmpeq_epi8(carry_exceeds_length, length_is_positive);
  *has_error = _mm_or_si128(*has_error, over_or_under);
}
// Enforces the upper bound on the byte that follows two specific lead bytes:
//   after 0xED the next byte must not exceed 0x9F
//   after 0xF4 the next byte must not exceed 0x8F
// off1_current_bytes holds, per lane, the byte one position earlier.
// The compares are signed; that is acceptable because the byte being tested
// must be a continuation byte, i.e. it has its sign bit set.
static inline void checkFirstContinuationMax(__m128i current_bytes,
                                             __m128i off1_current_bytes,
                                             __m128i *has_error) {
  const __m128i follows_ED =
      _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xED));
  const __m128i above_9F = _mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x9F));
  const __m128i bad_after_ED = _mm_and_si128(above_9F, follows_ED);

  const __m128i follows_F4 =
      _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xF4));
  const __m128i above_8F = _mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x8F));
  const __m128i bad_after_F4 = _mm_and_si128(above_8F, follows_F4);

  const __m128i violations = _mm_or_si128(bad_after_ED, bad_after_F4);
  *has_error = _mm_or_si128(*has_error, violations);
}
// Rejects overlong encodings. The high nibble of the previous byte selects a
// pair of minimums; a lane is in error when BOTH the previous byte and the
// current byte fall below their respective minimum:
//   previous high nibble   previous byte   current byte
//   C                   => < C2         && true
//   E                   => < E1         && < A0
//   F                   => < F1         && < 90
//   anything else       => false        && false
// previous_hibits supplies the nibble for the lane preceding this block.
static inline void checkOverlong(__m128i current_bytes,
                                 __m128i off1_current_bytes, __m128i hibits,
                                 __m128i previous_hibits, __m128i *has_error) {
  // High nibbles shifted by one lane so each lane sees its predecessor's.
  const __m128i off1_hibits = _mm_alignr_epi8(hibits, previous_hibits, 15);

  // Minimum allowed value of the previous (lead) byte, keyed by its nibble.
  const __m128i lead_min_table =
      _mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
                    -128, -128, // 10xx => false
                    0xC2, -128, // 110x
                    0xE1,       // 1110
                    0xF1);
  // Minimum allowed value of the current byte, keyed by the same nibble.
  const __m128i follow_min_table =
      _mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
                    -128, -128, // 10xx => false
                    127, 127,   // 110x => true
                    0xA0,       // 1110
                    0x90);

  const __m128i lead_min = _mm_shuffle_epi8(lead_min_table, off1_hibits);
  const __m128i follow_min = _mm_shuffle_epi8(follow_min_table, off1_hibits);

  const __m128i lead_too_small = _mm_cmpgt_epi8(lead_min, off1_current_bytes);
  const __m128i follow_too_small = _mm_cmpgt_epi8(follow_min, current_bytes);

  const __m128i overlong = _mm_and_si128(lead_too_small, follow_too_small);
  *has_error = _mm_or_si128(*has_error, overlong);
}
// Per-block state produced by checkUTF8Bytes and fed back in as `previous`
// when the following 16-byte block is checked.
struct processed_utf_bytes {
__m128i rawbytes; // the 16 input bytes as loaded (written by count_nibbles)
__m128i high_nibbles; // per lane: (byte >> 4) & 0x0F (written by count_nibbles)
__m128i carried_continuations; // result of carryContinuations for this block
};
// Fills answer->rawbytes with the input and answer->high_nibbles with the
// upper four bits of every byte. answer->carried_continuations is not touched.
static inline void count_nibbles(__m128i bytes,
                                 struct processed_utf_bytes *answer) {
  const __m128i nibble_mask = _mm_set1_epi8(0x0F);
  // The shift works on 16-bit lanes, so bits from the neighbouring byte leak
  // into the top of each byte; the mask removes them.
  const __m128i shifted = _mm_srli_epi16(bytes, 4);
  answer->high_nibbles = _mm_and_si128(shifted, nibble_mask);
  answer->rawbytes = bytes;
}
// check whether the current bytes are valid UTF-8
// at the end of the function, previous gets updated
static struct processed_utf_bytes
checkUTF8Bytes(__m128i current_bytes, struct processed_utf_bytes *previous,
__m128i *has_error) {
struct processed_utf_bytes pb;
count_nibbles(current_bytes, &pb);
checkSmallerThan0xF4(current_bytes, has_error);
__m128i initial_lengths = continuationLengths(pb.high_nibbles);
pb.carried_continuations =
carryContinuations(initial_lengths, previous->carried_continuations);
checkContinuations(initial_lengths, pb.carried_continuations, has_error);
__m128i off1_current_bytes =
_mm_alignr_epi8(pb.rawbytes, previous->rawbytes, 16 - 1);
checkFirstContinuationMax(current_bytes, off1_current_bytes, has_error);
checkOverlong(current_bytes, off1_current_bytes, pb.high_nibbles,
previous->high_nibbles, has_error);
return pb;
}
// Validates src[0..len) as UTF-8, 16 bytes at a time.
// Returns true when no check reported an error.
static inline bool validate_utf8_fast(const char *src, size_t len) {
  __m128i has_error = _mm_setzero_si128();

  struct processed_utf_bytes previous;
  previous.rawbytes = _mm_setzero_si128();
  previous.high_nibbles = _mm_setzero_si128();
  previous.carried_continuations = _mm_setzero_si128();

  // Full 16-byte blocks.
  size_t i = 0;
  while (len - i >= 16) {
    const __m128i block = _mm_loadu_si128((const __m128i *)(src + i));
    previous = checkUTF8Bytes(block, &previous, &has_error);
    i += 16;
  }

  if (i < len) {
    // Partial tail: copy it into a zero-filled buffer and check that as one
    // more block.
    char buffer[16] = {0};
    memcpy(buffer, src + i, len - i);
    const __m128i block = _mm_loadu_si128((const __m128i *)(buffer));
    previous = checkUTF8Bytes(block, &previous, &has_error);
  } else {
    // No tail: the input must not end in the middle of a sequence, so the
    // carry in the final lane may not exceed 1. The other lanes are compared
    // against 9 (presumably never exceeded -- verify).
    const __m128i end_limits =
        _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1);
    const __m128i unfinished =
        _mm_cmpgt_epi8(previous.carried_continuations, end_limits);
    has_error = _mm_or_si128(unfinished, has_error);
  }

  // testz yields 1 only when every bit of has_error is clear.
  return _mm_testz_si128(has_error, has_error);
}
#ifdef __AVX2__
@ -349,109 +188,7 @@ avxcheckUTF8Bytes(__m256i current_bytes,
return pb;
}
// Validates src[0..len) as UTF-8, 32 bytes at a time with 256-bit vectors.
// Returns true when no check reported an error.
static inline bool validate_utf8_fast_avx(const char *src, size_t len) {
  __m256i has_error = _mm256_setzero_si256();

  struct avx_processed_utf_bytes previous;
  previous.rawbytes = _mm256_setzero_si256();
  previous.high_nibbles = _mm256_setzero_si256();
  previous.carried_continuations = _mm256_setzero_si256();

  // Full 32-byte blocks.
  size_t i = 0;
  while (len - i >= 32) {
    const __m256i block = _mm256_loadu_si256((const __m256i *)(src + i));
    previous = avxcheckUTF8Bytes(block, &previous, &has_error);
    i += 32;
  }

  if (i < len) {
    // Partial tail: copy it into a zero-filled buffer and check that as one
    // more block.
    char buffer[32] = {0};
    memcpy(buffer, src + i, len - i);
    const __m256i block = _mm256_loadu_si256((const __m256i *)(buffer));
    previous = avxcheckUTF8Bytes(block, &previous, &has_error);
  } else {
    // No tail: the input must not end in the middle of a sequence, so the
    // carry in the final lane may not exceed 1. The other lanes are compared
    // against 9 (presumably never exceeded -- verify).
    const __m256i end_limits =
        _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
                         9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
                         9, 9, 9, 9, 9, 9, 9, 1);
    const __m256i unfinished =
        _mm256_cmpgt_epi8(previous.carried_continuations, end_limits);
    has_error = _mm256_or_si256(unfinished, has_error);
  }

  // testz yields 1 only when every bit of has_error is clear.
  return _mm256_testz_si256(has_error, has_error);
}
// check whether the current bytes are valid UTF-8
// at the end of the function, previous gets updated
static struct avx_processed_utf_bytes
avxcheckUTF8Bytes_asciipath(__m256i current_bytes,
struct avx_processed_utf_bytes *previous,
__m256i *has_error) {
if(_mm256_testz_si256(current_bytes,_mm256_set1_epi8(0x80))) { // fast ascii path
*has_error = _mm256_or_si256(
_mm256_cmpgt_epi8(previous->carried_continuations,
_mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 1)),*has_error);
return *previous;
}
struct avx_processed_utf_bytes pb;
avx_count_nibbles(current_bytes, &pb);
avxcheckSmallerThan0xF4(current_bytes, has_error);
__m256i initial_lengths = avxcontinuationLengths(pb.high_nibbles);
pb.carried_continuations =
avxcarryContinuations(initial_lengths, previous->carried_continuations);
avxcheckContinuations(initial_lengths, pb.carried_continuations, has_error);
__m256i off1_current_bytes =
push_last_byte_of_a_to_b(previous->rawbytes, pb.rawbytes);
avxcheckFirstContinuationMax(current_bytes, off1_current_bytes, has_error);
avxcheckOverlong(current_bytes, off1_current_bytes, pb.high_nibbles,
previous->high_nibbles, has_error);
return pb;
}
// Validates src[0..len) as UTF-8, 32 bytes at a time, using the
// ASCII-shortcut block checker for full blocks.
// Returns true when no check reported an error.
static inline bool validate_utf8_fast_avx_asciipath(const char *src, size_t len) {
  __m256i has_error = _mm256_setzero_si256();

  struct avx_processed_utf_bytes previous;
  previous.rawbytes = _mm256_setzero_si256();
  previous.high_nibbles = _mm256_setzero_si256();
  previous.carried_continuations = _mm256_setzero_si256();

  // Full 32-byte blocks go through the ASCII-shortcut checker.
  size_t i = 0;
  while (len - i >= 32) {
    const __m256i block = _mm256_loadu_si256((const __m256i *)(src + i));
    previous = avxcheckUTF8Bytes_asciipath(block, &previous, &has_error);
    i += 32;
  }

  if (i < len) {
    // Partial tail: copy it into a zero-filled buffer and check it with the
    // regular (non-shortcut) checker.
    char buffer[32] = {0};
    memcpy(buffer, src + i, len - i);
    const __m256i block = _mm256_loadu_si256((const __m256i *)(buffer));
    previous = avxcheckUTF8Bytes(block, &previous, &has_error);
  } else {
    // No tail: the input must not end in the middle of a sequence, so the
    // carry in the final lane may not exceed 1. The other lanes are compared
    // against 9 (presumably never exceeded -- verify).
    const __m256i end_limits =
        _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
                         9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
                         9, 9, 9, 9, 9, 9, 9, 1);
    const __m256i unfinished =
        _mm256_cmpgt_epi8(previous.carried_continuations, end_limits);
    has_error = _mm256_or_si256(unfinished, has_error);
  }

  // testz yields 1 only when every bit of has_error is clear.
  return _mm256_testz_si256(has_error, has_error);
}
#else // __AVX2__
#warning "We require AVX2 support!"
#endif // __AVX2__
#endif

View File

@ -205,10 +205,10 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) {
uint64_t odd_starts = start_edges & ~even_start_mask;
uint64_t even_carries = bs_bits + even_starts;
uint64_t odd_carries;
bool iter_ends_odd_backslash = __builtin_uaddll_overflow(
bs_bits, odd_starts, (unsigned long long *)&odd_carries);
//bool iter_ends_odd_backslash =
__builtin_uaddll_overflow( bs_bits, odd_starts, (unsigned long long *)&odd_carries);
odd_carries |= prev_iter_ends_odd_backslash;
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
//prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; // we never use it
uint64_t even_carry_ends = even_carries & ~bs_bits;
uint64_t odd_carry_ends = odd_carries & ~bs_bits;
uint64_t even_start_odd_end = even_carry_ends & odd_bits;

View File

@ -54,7 +54,7 @@ bool flatten_indexes(size_t len, ParsedJson &pj) {
u32 *base_ptr = pj.structural_indexes;
u32 base = 0;
#ifdef BUILDHISTOGRAM
uint32_t counters[65];
uint32_t counters[66];
uint32_t total = 0;
for (int k = 0; k < 66; k++)
counters[k] = 0;