We were forgetting to check the end bytes at the end of the UTF8 validation. (#1173)
* We were forgetting to check the end bytes at the end of the UTF8 validation. * Silencing the sanitizer * Better explanation.
This commit is contained in:
parent
461f7dc9f9
commit
bfbac12f76
|
@ -75,6 +75,7 @@ simdjson_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block()
|
|||
|
||||
template<size_t STEP_SIZE>
|
||||
simdjson_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
|
||||
if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
|
||||
memset(dst, 0x20, STEP_SIZE); // memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
|
||||
memcpy(dst, buf + idx, len - idx);
|
||||
return len - idx;
|
||||
|
|
|
@ -221,6 +221,7 @@ simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_imp
|
|||
}
|
||||
parser.n_structural_indexes = new_structural_indexes;
|
||||
}
|
||||
checker.check_eof();
|
||||
return checker.errors();
|
||||
}
|
||||
|
||||
|
|
|
@ -1,184 +0,0 @@
|
|||
namespace simdjson {
|
||||
namespace SIMDJSON_IMPLEMENTATION {
|
||||
|
||||
/*
|
||||
* legal utf-8 byte sequence
|
||||
* http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
|
||||
*
|
||||
* Code Points 1st 2s 3s 4s
|
||||
* U+0000..U+007F 00..7F
|
||||
* U+0080..U+07FF C2..DF 80..BF
|
||||
* U+0800..U+0FFF E0 A0..BF 80..BF
|
||||
* U+1000..U+CFFF E1..EC 80..BF 80..BF
|
||||
* U+D000..U+D7FF ED 80..9F 80..BF
|
||||
* U+E000..U+FFFF EE..EF 80..BF 80..BF
|
||||
* U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
|
||||
* U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
|
||||
* U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
|
||||
*
|
||||
*/
|
||||
|
||||
// all byte values must be no larger than 0xF4
|
||||
|
||||
using namespace simd;
|
||||
|
||||
struct processed_utf_bytes {
|
||||
simd8<uint8_t> raw_bytes;
|
||||
simd8<int8_t> high_nibbles;
|
||||
simd8<int8_t> carried_continuations;
|
||||
};
|
||||
|
||||
struct utf8_checker {
|
||||
simd8<uint8_t> has_error;
|
||||
processed_utf_bytes previous;
|
||||
|
||||
// all byte values must be no larger than 0xF4
|
||||
simdjson_really_inline void check_smaller_than_0xF4(const simd8<uint8_t> current_bytes) {
|
||||
// unsigned, saturates to 0 below max
|
||||
this->has_error |= current_bytes.saturating_sub(0xF4u);
|
||||
}
|
||||
|
||||
simdjson_really_inline simd8<int8_t> continuation_lengths(const simd8<int8_t> high_nibbles) {
|
||||
return high_nibbles.lookup_16<int8_t>(
|
||||
1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII)
|
||||
0, 0, 0, 0, // 10xx (continuation)
|
||||
2, 2, // 110x
|
||||
3, // 1110
|
||||
4); // 1111, next should be 0 (not checked here)
|
||||
}
|
||||
|
||||
simdjson_really_inline simd8<int8_t> carry_continuations(const simd8<int8_t>& initial_lengths) {
|
||||
simd8<int8_t> prev_carried_continuations = initial_lengths.prev(this->previous.carried_continuations);
|
||||
simd8<int8_t> right1 = simd8<int8_t>(simd8<uint8_t>(prev_carried_continuations).saturating_sub(1));
|
||||
simd8<int8_t> sum = initial_lengths + right1;
|
||||
|
||||
simd8<int8_t> prev2_carried_continuations = sum.prev<2>(this->previous.carried_continuations);
|
||||
simd8<int8_t> right2 = simd8<int8_t>(simd8<uint8_t>(prev2_carried_continuations).saturating_sub(2));
|
||||
return sum + right2;
|
||||
}
|
||||
|
||||
simdjson_really_inline void check_continuations(const simd8<int8_t>& initial_lengths, const simd8<int8_t>& carries) {
|
||||
// overlap || underlap
|
||||
// carry > length && length > 0 || !(carry > length) && !(length > 0)
|
||||
// (carries > length) == (lengths > 0)
|
||||
// (carries > current) == (current > 0)
|
||||
this->has_error |= simd8<uint8_t>(
|
||||
(carries > initial_lengths) == (initial_lengths > simd8<int8_t>::zero()));
|
||||
}
|
||||
|
||||
simdjson_really_inline void check_carried_continuations() {
|
||||
static const int8_t last_1[32] = {
|
||||
9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 9, 9, 1
|
||||
};
|
||||
this->has_error |= simd8<uint8_t>(this->previous.carried_continuations > simd8<int8_t>(last_1 + 32 - sizeof(simd8<int8_t>)));
|
||||
}
|
||||
|
||||
// when 0xED is found, next byte must be no larger than 0x9F
|
||||
// when 0xF4 is found, next byte must be no larger than 0x8F
|
||||
// next byte must be continuation, ie sign bit is set, so signed < is ok
|
||||
simdjson_really_inline void check_first_continuation_max(const simd8<uint8_t> current_bytes,
|
||||
const simd8<uint8_t> off1_current_bytes) {
|
||||
simd8<bool> prev_ED = off1_current_bytes == 0xEDu;
|
||||
simd8<bool> prev_F4 = off1_current_bytes == 0xF4u;
|
||||
// Check if ED is followed by A0 or greater
|
||||
simd8<bool> ED_too_large = (simd8<int8_t>(current_bytes) > simd8<int8_t>::splat(0x9Fu)) & prev_ED;
|
||||
// Check if F4 is followed by 90 or greater
|
||||
simd8<bool> F4_too_large = (simd8<int8_t>(current_bytes) > simd8<int8_t>::splat(0x8Fu)) & prev_F4;
|
||||
// These will also error if ED or F4 is followed by ASCII, but that's an error anyway
|
||||
this->has_error |= simd8<uint8_t>(ED_too_large | F4_too_large);
|
||||
}
|
||||
|
||||
// map off1_hibits => error condition
|
||||
// hibits off1 cur
|
||||
// C => < C2 && true
|
||||
// E => < E1 && < A0
|
||||
// F => < F1 && < 90
|
||||
// else false && false
|
||||
simdjson_really_inline void check_overlong(const simd8<uint8_t> current_bytes,
|
||||
const simd8<uint8_t> off1_current_bytes,
|
||||
const simd8<int8_t>& high_nibbles) {
|
||||
simd8<int8_t> off1_high_nibbles = high_nibbles.prev(this->previous.high_nibbles);
|
||||
|
||||
// Two-byte characters must start with at least C2
|
||||
// Three-byte characters must start with at least E1
|
||||
// Four-byte characters must start with at least F1
|
||||
simd8<int8_t> initial_mins = off1_high_nibbles.lookup_16<int8_t>(
|
||||
-128, -128, -128, -128, -128, -128, -128, -128, // 0xxx -> false
|
||||
-128, -128, -128, -128, // 10xx -> false
|
||||
0xC2, -128, // 1100 -> C2
|
||||
0xE1, // 1110
|
||||
0xF1 // 1111
|
||||
);
|
||||
simd8<bool> initial_under = initial_mins > simd8<int8_t>(off1_current_bytes);
|
||||
|
||||
// Two-byte characters starting with at least C2 are always OK
|
||||
// Three-byte characters starting with at least E1 must be followed by at least A0
|
||||
// Four-byte characters starting with at least F1 must be followed by at least 90
|
||||
simd8<int8_t> second_mins = off1_high_nibbles.lookup_16<int8_t>(
|
||||
-128, -128, -128, -128, -128, -128, -128, -128, -128, // 0xxx => false
|
||||
-128, -128, -128, // 10xx => false
|
||||
127, 127, // 110x => true
|
||||
0xA0, // 1110
|
||||
0x90 // 1111
|
||||
);
|
||||
simd8<bool> second_under = second_mins > simd8<int8_t>(current_bytes);
|
||||
this->has_error |= simd8<uint8_t>(initial_under & second_under);
|
||||
}
|
||||
|
||||
simdjson_really_inline void count_nibbles(simd8<uint8_t> bytes, struct processed_utf_bytes *answer) {
|
||||
answer->raw_bytes = bytes;
|
||||
answer->high_nibbles = simd8<int8_t>(bytes.shr<4>());
|
||||
}
|
||||
|
||||
// check whether the current bytes are valid UTF-8
|
||||
// at the end of the function, previous gets updated
|
||||
simdjson_really_inline void check_utf8_bytes(const simd8<uint8_t> current_bytes) {
|
||||
struct processed_utf_bytes pb {};
|
||||
this->count_nibbles(current_bytes, &pb);
|
||||
|
||||
this->check_smaller_than_0xF4(current_bytes);
|
||||
|
||||
simd8<int8_t> initial_lengths = this->continuation_lengths(pb.high_nibbles);
|
||||
|
||||
pb.carried_continuations = this->carry_continuations(initial_lengths);
|
||||
|
||||
this->check_continuations(initial_lengths, pb.carried_continuations);
|
||||
|
||||
simd8<uint8_t> off1_current_bytes = pb.raw_bytes.prev(this->previous.raw_bytes);
|
||||
this->check_first_continuation_max(current_bytes, off1_current_bytes);
|
||||
|
||||
this->check_overlong(current_bytes, off1_current_bytes, pb.high_nibbles);
|
||||
this->previous = pb;
|
||||
}
|
||||
|
||||
simdjson_really_inline void check_next_input(Dconst simd8<uint8_t> in) {
|
||||
if (simdjson_likely(!in.any_bits_set_anywhere(0x80u))) {
|
||||
this->check_carried_continuations();
|
||||
} else {
|
||||
this->check_utf8_bytes(in);
|
||||
}
|
||||
}
|
||||
|
||||
simdjson_really_inline void check_next_input(const simd8x64<uint8_t>& in) {
|
||||
simd8<uint8_t> bits = in.reduce_or();
|
||||
if (simdjson_likely(!bits.any_bits_set_anywhere(0x80u))) {
|
||||
// it is ascii, we just check carried continuations.
|
||||
this->check_carried_continuations();
|
||||
} else {
|
||||
// it is not ascii so we have to do heavy work
|
||||
for (int i=0; i<simd8x64<uint8_t>::NUM_CHUNKS; i++) {
|
||||
this->check_utf8_bytes(in.chunks[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
simdjson_really_inline error_code errors() {
|
||||
return this->has_error.any_bits_set_anywhere() ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
|
||||
}
|
||||
}; // struct utf8_checker
|
||||
|
||||
} // namespace SIMDJSON_IMPLEMENTATION
|
||||
} // unnamed namespace
|
|
@ -1,225 +0,0 @@
|
|||
namespace {
|
||||
namespace SIMDJSON_IMPLEMENTATION {
|
||||
namespace utf8_validation {
|
||||
|
||||
//
|
||||
// Detect Unicode errors.
|
||||
//
|
||||
// UTF-8 is designed to allow multiple bytes and be compatible with ASCII. It's a fairly basic
|
||||
// encoding that uses the first few bits on each byte to denote a "byte type", and all other bits
|
||||
// are straight up concatenated into the final value. The first byte of a multibyte character is a
|
||||
// "leading byte" and starts with N 1's, where N is the total number of bytes (110_____ = 2 byte
|
||||
// lead). The remaining bytes of a multibyte character all start with 10. 1-byte characters just
|
||||
// start with 0, because that's what ASCII looks like. Here's what each size looks like:
|
||||
//
|
||||
// - ASCII (7 bits): 0_______
|
||||
// - 2 byte character (11 bits): 110_____ 10______
|
||||
// - 3 byte character (16 bits): 1110____ 10______ 10______
|
||||
// - 4 byte character (21 bits): 11110___ 10______ 10______ 10______
|
||||
// - 5+ byte character (illegal): 11111___ <illegal>
|
||||
//
|
||||
// There are 6 classes of error that can happen in Unicode:
|
||||
//
|
||||
// - TOO_SHORT: when you have a multibyte character with too few bytes (i.e. missing continuation).
|
||||
// We detect this by looking for new characters (lead bytes) inside the range of a multibyte
|
||||
// character.
|
||||
//
|
||||
// e.g. 11000000 01100001 (2-byte character where second byte is ASCII)
|
||||
//
|
||||
// - TOO_LONG: when there are more bytes in your character than you need (i.e. extra continuation).
|
||||
// We detect this by requiring that the next byte after your multibyte character be a new
|
||||
// character--so a continuation after your character is wrong.
|
||||
//
|
||||
// e.g. 11011111 10111111 10111111 (2-byte character followed by *another* continuation byte)
|
||||
//
|
||||
// - TOO_LARGE: Unicode only goes up to U+10FFFF. These characters are too large.
|
||||
//
|
||||
// e.g. 11110111 10111111 10111111 10111111 (bigger than 10FFFF).
|
||||
//
|
||||
// - OVERLONG: multibyte characters with a bunch of leading zeroes, where you could have
|
||||
// used fewer bytes to make the same character. Like encoding an ASCII character in 4 bytes is
|
||||
// technically possible, but UTF-8 disallows it so that there is only one way to write an "a".
|
||||
//
|
||||
// e.g. 11000001 10100001 (2-byte encoding of "a", which only requires 1 byte: 01100001)
|
||||
//
|
||||
// - SURROGATE: Unicode U+D800-U+DFFF is a *surrogate* character, reserved for use in UCS-2 and
|
||||
// WTF-8 encodings for characters with > 2 bytes. These are illegal in pure UTF-8.
|
||||
//
|
||||
// e.g. 11101101 10100000 10000000 (U+D800)
|
||||
//
|
||||
// - INVALID_5_BYTE: 5-byte, 6-byte, 7-byte and 8-byte characters are unsupported; Unicode does not
|
||||
// support values with more than 21 bits (which a 4-byte character supports).
|
||||
//
|
||||
// e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000)
|
||||
//
|
||||
// Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94:
|
||||
//
|
||||
// Code Points 1st 2s 3s 4s
|
||||
// U+0000..U+007F 00..7F
|
||||
// U+0080..U+07FF C2..DF 80..BF
|
||||
// U+0800..U+0FFF E0 A0..BF 80..BF
|
||||
// U+1000..U+CFFF E1..EC 80..BF 80..BF
|
||||
// U+D000..U+D7FF ED 80..9F 80..BF
|
||||
// U+E000..U+FFFF EE..EF 80..BF 80..BF
|
||||
// U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
|
||||
// U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
|
||||
// U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
|
||||
//
|
||||
using namespace simd;
|
||||
|
||||
// For a detailed description of the lookup2 algorithm, see the file HACKING.md under "UTF-8 validation (lookup2)".
|
||||
|
||||
//
|
||||
// Find special case UTF-8 errors where the character is technically readable (has the right length)
|
||||
// but the *value* is disallowed.
|
||||
//
|
||||
// This includes overlong encodings, surrogates and values too large for Unicode.
|
||||
//
|
||||
// It turns out the bad character ranges can all be detected by looking at the first 12 bits of the
|
||||
// UTF-8 encoded character (i.e. all of byte 1, and the high 4 bits of byte 2). This algorithm does a
|
||||
// 3 4-bit table lookups, identifying which errors that 4 bits could match, and then &'s them together.
|
||||
// If all 3 lookups detect the same error, it's an error.
|
||||
//
|
||||
simdjson_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
|
||||
//
|
||||
// These are the errors we're going to match for bytes 1-2, by looking at the first three
|
||||
// nibbles of the character: <high bits of byte 1>> & <low bits of byte 1> & <high bits of byte 2>
|
||||
//
|
||||
static const int OVERLONG_2 = 0x01; // 1100000_ 10______ (technically we match 10______ but we could match ________, they both yield errors either way)
|
||||
static const int OVERLONG_3 = 0x02; // 11100000 100_____ ________
|
||||
static const int OVERLONG_4 = 0x04; // 11110000 1000____ ________ ________
|
||||
static const int SURROGATE = 0x08; // 11101101 [101_]____
|
||||
static const int TOO_LARGE = 0x10; // 11110100 (1001|101_)____
|
||||
static const int TOO_LARGE_2 = 0x20; // 1111(1___|011_|0101) 10______
|
||||
|
||||
// After processing the rest of byte 1 (the low bits), we're still not done--we have to check
|
||||
// byte 2 to be sure which things are errors and which aren't.
|
||||
// Since high_bits is byte 5, byte 2 is high_bits.prev<3>
|
||||
static const int CARRY = OVERLONG_2 | TOO_LARGE_2;
|
||||
const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
|
||||
// ASCII: ________ [0___]____
|
||||
CARRY, CARRY, CARRY, CARRY,
|
||||
// ASCII: ________ [0___]____
|
||||
CARRY, CARRY, CARRY, CARRY,
|
||||
// Continuations: ________ [10__]____
|
||||
CARRY | OVERLONG_3 | OVERLONG_4, // ________ [1000]____
|
||||
CARRY | OVERLONG_3 | TOO_LARGE, // ________ [1001]____
|
||||
CARRY | TOO_LARGE | SURROGATE, // ________ [1010]____
|
||||
CARRY | TOO_LARGE | SURROGATE, // ________ [1011]____
|
||||
// Multibyte Leads: ________ [11__]____
|
||||
CARRY, CARRY, CARRY, CARRY
|
||||
);
|
||||
|
||||
const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
|
||||
// [0___]____ (ASCII)
|
||||
0, 0, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
// [10__]____ (continuation)
|
||||
0, 0, 0, 0,
|
||||
// [11__]____ (2+-byte leads)
|
||||
OVERLONG_2, 0, // [110_]____ (2-byte lead)
|
||||
OVERLONG_3 | SURROGATE, // [1110]____ (3-byte lead)
|
||||
OVERLONG_4 | TOO_LARGE | TOO_LARGE_2 // [1111]____ (4+-byte lead)
|
||||
);
|
||||
|
||||
const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
|
||||
// ____[00__] ________
|
||||
OVERLONG_2 | OVERLONG_3 | OVERLONG_4, // ____[0000] ________
|
||||
OVERLONG_2, // ____[0001] ________
|
||||
0, 0,
|
||||
// ____[01__] ________
|
||||
TOO_LARGE, // ____[0100] ________
|
||||
TOO_LARGE_2,
|
||||
TOO_LARGE_2,
|
||||
TOO_LARGE_2,
|
||||
// ____[10__] ________
|
||||
TOO_LARGE_2, TOO_LARGE_2, TOO_LARGE_2, TOO_LARGE_2,
|
||||
// ____[11__] ________
|
||||
TOO_LARGE_2,
|
||||
TOO_LARGE_2 | SURROGATE, // ____[1101] ________
|
||||
TOO_LARGE_2, TOO_LARGE_2
|
||||
);
|
||||
|
||||
return byte_1_high & byte_1_low & byte_2_high;
|
||||
}
|
||||
|
||||
// Compare, byte by byte, whether a continuation byte is required there with
// whether one is actually present. The result is nonzero in every lane where
// the two disagree.
simdjson_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input, const simd8<uint8_t> prev_input, const simd8<uint8_t> prev1) {
  // Continuation bytes are 10000000-10111111, i.e. -128...-65 when read as
  // signed, hence strictly below -64.
  const simd8<bool> found_continuation = simd8<int8_t>(input) < int8_t(-64);

  const simd8<uint8_t> two_back = input.prev<2>(prev_input);
  const simd8<uint8_t> three_back = input.prev<3>(prev_input);

  // must_be_continuation is architecture-specific because Intel doesn't have unsigned comparisons
  return simd8<uint8_t>(must_be_continuation(prev1, two_back, three_back) ^ found_continuation);
}
|
||||
|
||||
//
|
||||
// Return nonzero if there are incomplete multibyte characters at the end of the block:
|
||||
// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
|
||||
//
|
||||
simdjson_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
|
||||
// If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
|
||||
// ... 1111____ 111_____ 11______
|
||||
static const uint8_t max_array[32] = {
|
||||
255, 255, 255, 255, 255, 255, 255, 255,
|
||||
255, 255, 255, 255, 255, 255, 255, 255,
|
||||
255, 255, 255, 255, 255, 255, 255, 255,
|
||||
255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
|
||||
};
|
||||
const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
|
||||
return input.gt_bits(max_value);
|
||||
}
|
||||
|
||||
struct utf8_checker {
|
||||
// If this is nonzero, there has been a UTF-8 error.
|
||||
simd8<uint8_t> error;
|
||||
// The last input we received
|
||||
simd8<uint8_t> prev_input_block;
|
||||
// Whether the last input we received was incomplete (used for ASCII fast path)
|
||||
simd8<uint8_t> prev_incomplete;
|
||||
|
||||
//
|
||||
// Check whether the current bytes are valid UTF-8.
|
||||
//
|
||||
simdjson_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
|
||||
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
|
||||
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
|
||||
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
|
||||
this->error |= check_special_cases(input, prev1);
|
||||
this->error |= check_multibyte_lengths(input, prev_input, prev1);
|
||||
}
|
||||
|
||||
// The only problem that can happen at EOF is that a multibyte character is too short.
|
||||
simdjson_really_inline void check_eof() {
|
||||
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
|
||||
// possibly finish them.
|
||||
this->error |= this->prev_incomplete;
|
||||
}
|
||||
|
||||
simdjson_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
|
||||
if (likely(is_ascii(input))) {
|
||||
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
|
||||
// possibly finish them.
|
||||
this->error |= this->prev_incomplete;
|
||||
} else {
|
||||
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
||||
for (int i=1; i<simd8x64<uint8_t>::NUM_CHUNKS; i++) {
|
||||
this->check_utf8_bytes(input.chunks[i], input.chunks[i-1]);
|
||||
}
|
||||
this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
|
||||
this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
|
||||
}
|
||||
}
|
||||
|
||||
simdjson_really_inline error_code errors() {
|
||||
return this->error.any_bits_set_anywhere() ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
|
||||
}
|
||||
|
||||
}; // struct utf8_checker
|
||||
} // namespace utf8_validation
|
||||
} // unnamed namespace
|
||||
|
||||
using utf8_validation::utf8_checker;
|
||||
|
||||
} // namespace SIMDJSON_IMPLEMENTATION
|
||||
} // namespace simdjson
|
|
@ -1,245 +0,0 @@
|
|||
namespace {
|
||||
namespace SIMDJSON_IMPLEMENTATION {
|
||||
namespace utf8_validation {
|
||||
|
||||
//
|
||||
// Detect Unicode errors.
|
||||
//
|
||||
// UTF-8 is designed to allow multiple bytes and be compatible with ASCII. It's a fairly basic
|
||||
// encoding that uses the first few bits on each byte to denote a "byte type", and all other bits
|
||||
// are straight up concatenated into the final value. The first byte of a multibyte character is a
|
||||
// "leading byte" and starts with N 1's, where N is the total number of bytes (110_____ = 2 byte
|
||||
// lead). The remaining bytes of a multibyte character all start with 10. 1-byte characters just
|
||||
// start with 0, because that's what ASCII looks like. Here's what each size looks like:
|
||||
//
|
||||
// - ASCII (7 bits): 0_______
|
||||
// - 2 byte character (11 bits): 110_____ 10______
|
||||
// - 3 byte character (16 bits): 1110____ 10______ 10______
|
||||
// - 4 byte character (21 bits): 11110___ 10______ 10______ 10______
|
||||
// - 5+ byte character (illegal): 11111___ <illegal>
|
||||
//
|
||||
// There are 6 classes of error that can happen in Unicode:
|
||||
//
|
||||
// - TOO_SHORT: when you have a multibyte character with too few bytes (i.e. missing continuation).
|
||||
// We detect this by looking for new characters (lead bytes) inside the range of a multibyte
|
||||
// character.
|
||||
//
|
||||
// e.g. 11000000 01100001 (2-byte character where second byte is ASCII)
|
||||
//
|
||||
// - TOO_LONG: when there are more bytes in your character than you need (i.e. extra continuation).
|
||||
// We detect this by requiring that the next byte after your multibyte character be a new
|
||||
// character--so a continuation after your character is wrong.
|
||||
//
|
||||
// e.g. 11011111 10111111 10111111 (2-byte character followed by *another* continuation byte)
|
||||
//
|
||||
// - TOO_LARGE: Unicode only goes up to U+10FFFF. These characters are too large.
|
||||
//
|
||||
// e.g. 11110111 10111111 10111111 10111111 (bigger than 10FFFF).
|
||||
//
|
||||
// - OVERLONG: multibyte characters with a bunch of leading zeroes, where you could have
|
||||
// used fewer bytes to make the same character. Like encoding an ASCII character in 4 bytes is
|
||||
// technically possible, but UTF-8 disallows it so that there is only one way to write an "a".
|
||||
//
|
||||
// e.g. 11000001 10100001 (2-byte encoding of "a", which only requires 1 byte: 01100001)
|
||||
//
|
||||
// - SURROGATE: Unicode U+D800-U+DFFF is a *surrogate* character, reserved for use in UCS-2 and
|
||||
// WTF-8 encodings for characters with > 2 bytes. These are illegal in pure UTF-8.
|
||||
//
|
||||
// e.g. 11101101 10100000 10000000 (U+D800)
|
||||
//
|
||||
// - INVALID_5_BYTE: 5-byte, 6-byte, 7-byte and 8-byte characters are unsupported; Unicode does not
|
||||
// support values with more than 21 bits (which a 4-byte character supports).
|
||||
//
|
||||
// e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000)
|
||||
//
|
||||
// Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94:
|
||||
//
|
||||
// Code Points 1st 2s 3s 4s
|
||||
// U+0000..U+007F 00..7F
|
||||
// U+0080..U+07FF C2..DF 80..BF
|
||||
// U+0800..U+0FFF E0 A0..BF 80..BF
|
||||
// U+1000..U+CFFF E1..EC 80..BF 80..BF
|
||||
// U+D000..U+D7FF ED 80..9F 80..BF
|
||||
// U+E000..U+FFFF EE..EF 80..BF 80..BF
|
||||
// U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
|
||||
// U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
|
||||
// U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
|
||||
//
|
||||
using namespace simd;
|
||||
|
||||
// For a detailed description of the lookup2 algorithm, see the file HACKING.md under "UTF-8 validation (lookup2)".
|
||||
|
||||
//
|
||||
// Find special case UTF-8 errors where the character is technically readable (has the right length)
|
||||
// but the *value* is disallowed.
|
||||
//
|
||||
// This includes overlong encodings, surrogates and values too large for Unicode.
|
||||
//
|
||||
// It turns out the bad character ranges can all be detected by looking at the first 12 bits of the
|
||||
// UTF-8 encoded character (i.e. all of byte 1, and the high 4 bits of byte 2). This algorithm does a
|
||||
// 3 4-bit table lookups, identifying which errors that 4 bits could match, and then &'s them together.
|
||||
// If all 3 lookups detect the same error, it's an error.
|
||||
//
|
||||
simdjson_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
|
||||
//
|
||||
// These are the errors we're going to match for bytes 1-2, by looking at the first three
|
||||
// nibbles of the character: <high bits of byte 1>> & <low bits of byte 1> & <high bits of byte 2>
|
||||
//
|
||||
static const int OVERLONG_2 = 0x01; // 1100000_ 10______ (technically we match 10______ but we could match ________, they both yield errors either way)
|
||||
static const int OVERLONG_3 = 0x02; // 11100000 100_____ ________
|
||||
static const int OVERLONG_4 = 0x04; // 11110000 1000____ ________ ________
|
||||
static const int SURROGATE = 0x08; // 11101101 [101_]____
|
||||
static const int TOO_LARGE = 0x10; // 11110100 (1001|101_)____
|
||||
static const int TOO_LARGE_2 = 0x20; // 1111(1___|011_|0101) 10______
|
||||
|
||||
// New with lookup3. We want to catch the case where an non-continuation
|
||||
// follows a leading byte
|
||||
static const int TOO_SHORT_2_3_4 = 0x40; // (110_|1110|1111) ____ (0___|110_|1111) ____
|
||||
// We also want to catch a continuation that is preceded by an ASCII byte
|
||||
static const int LONELY_CONTINUATION = 0x80; // 0___ ____ 01__ ____
|
||||
|
||||
// After processing the rest of byte 1 (the low bits), we're still not done--we have to check
|
||||
// byte 2 to be sure which things are errors and which aren't.
|
||||
// Since high_bits is byte 5, byte 2 is high_bits.prev<3>
|
||||
static const int CARRY = OVERLONG_2 | TOO_LARGE_2;
|
||||
const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
|
||||
// ASCII: ________ [0___]____
|
||||
CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4,
|
||||
CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4,
|
||||
// ASCII: ________ [0___]____
|
||||
CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4,
|
||||
CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4,
|
||||
// Continuations: ________ [10__]____
|
||||
CARRY | OVERLONG_3 | OVERLONG_4 | LONELY_CONTINUATION, // ________ [1000]____
|
||||
CARRY | OVERLONG_3 | TOO_LARGE | LONELY_CONTINUATION, // ________ [1001]____
|
||||
CARRY | TOO_LARGE | SURROGATE | LONELY_CONTINUATION, // ________ [1010]____
|
||||
CARRY | TOO_LARGE | SURROGATE | LONELY_CONTINUATION, // ________ [1011]____
|
||||
// Multibyte Leads: ________ [11__]____
|
||||
CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4, // 110_
|
||||
CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4
|
||||
);
|
||||
const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
|
||||
// [0___]____ (ASCII)
|
||||
LONELY_CONTINUATION, LONELY_CONTINUATION, LONELY_CONTINUATION, LONELY_CONTINUATION,
|
||||
LONELY_CONTINUATION, LONELY_CONTINUATION, LONELY_CONTINUATION, LONELY_CONTINUATION,
|
||||
// [10__]____ (continuation)
|
||||
0, 0, 0, 0,
|
||||
// [11__]____ (2+-byte leads)
|
||||
OVERLONG_2 | TOO_SHORT_2_3_4, TOO_SHORT_2_3_4, // [110_]____ (2-byte lead)
|
||||
OVERLONG_3 | SURROGATE | TOO_SHORT_2_3_4, // [1110]____ (3-byte lead)
|
||||
OVERLONG_4 | TOO_LARGE | TOO_LARGE_2 | TOO_SHORT_2_3_4 // [1111]____ (4+-byte lead)
|
||||
);
|
||||
const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
|
||||
// ____[00__] ________
|
||||
OVERLONG_2 | OVERLONG_3 | OVERLONG_4 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, // ____[0000] ________
|
||||
OVERLONG_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, // ____[0001] ________
|
||||
TOO_SHORT_2_3_4 | LONELY_CONTINUATION,
|
||||
TOO_SHORT_2_3_4 | LONELY_CONTINUATION,
|
||||
// ____[01__] ________
|
||||
TOO_LARGE | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, // ____[0100] ________
|
||||
TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION,
|
||||
TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION,
|
||||
TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION,
|
||||
// ____[10__] ________
|
||||
TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION,
|
||||
TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION,
|
||||
TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION,
|
||||
TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION,
|
||||
// ____[11__] ________
|
||||
TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION,
|
||||
TOO_LARGE_2 | SURROGATE | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, // ____[1101] ________
|
||||
TOO_LARGE_2 | TOO_SHORT_2_3_4| LONELY_CONTINUATION,
|
||||
TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION
|
||||
);
|
||||
return byte_1_high & byte_1_low & byte_2_high;
|
||||
}
|
||||
|
||||
// Compare, byte by byte, whether the byte must be the 2nd or 3rd continuation
// of a character (judging from the bytes two and three positions back) with
// whether this byte and the one before it are in fact both continuations. The
// result is nonzero in every lane where the two disagree.
simdjson_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input, const simd8<uint8_t> prev_input,
                                                              simd8<uint8_t> prev1) {
  // Continuation bytes are below -64 when read as signed, so the signed
  // maximum of a byte and its predecessor is below -64 only when both are
  // continuations.
  // is_2_3_continuation uses one more instruction than lookup2
  const simd8<int8_t> larger_of_pair = simd8<int8_t>(input).max(simd8<int8_t>(prev1));
  const simd8<bool> found_2_3_continuation = larger_of_pair < int8_t(-64);

  const simd8<uint8_t> two_back = input.prev<2>(prev_input);
  const simd8<uint8_t> three_back = input.prev<3>(prev_input);

  // must_be_2_3_continuation has two fewer instructions than lookup 2
  return simd8<uint8_t>(must_be_2_3_continuation(two_back, three_back) ^ found_2_3_continuation);
}
|
||||
|
||||
|
||||
//
|
||||
// Return nonzero if there are incomplete multibyte characters at the end of the block:
|
||||
// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
|
||||
//
|
||||
simdjson_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
|
||||
// If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
|
||||
// ... 1111____ 111_____ 11______
|
||||
static const uint8_t max_array[32] = {
|
||||
255, 255, 255, 255, 255, 255, 255, 255,
|
||||
255, 255, 255, 255, 255, 255, 255, 255,
|
||||
255, 255, 255, 255, 255, 255, 255, 255,
|
||||
255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
|
||||
};
|
||||
const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
|
||||
return input.gt_bits(max_value);
|
||||
}
|
||||
|
||||
struct utf8_checker {
|
||||
// If this is nonzero, there has been a UTF-8 error.
|
||||
simd8<uint8_t> error;
|
||||
// The last input we received
|
||||
simd8<uint8_t> prev_input_block;
|
||||
// Whether the last input we received was incomplete (used for ASCII fast path)
|
||||
simd8<uint8_t> prev_incomplete;
|
||||
|
||||
//
|
||||
// Check whether the current bytes are valid UTF-8.
|
||||
//
|
||||
simdjson_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
|
||||
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
|
||||
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
|
||||
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
|
||||
this->error |= check_special_cases(input, prev1);
|
||||
this->error |= check_multibyte_lengths(input, prev_input, prev1);
|
||||
}
|
||||
|
||||
// The only problem that can happen at EOF is that a multibyte character is too short.
|
||||
simdjson_really_inline void check_eof() {
|
||||
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
|
||||
// possibly finish them.
|
||||
this->error |= this->prev_incomplete;
|
||||
}
|
||||
|
||||
simdjson_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
|
||||
if(simdjson_likely(is_ascii(input))) {
|
||||
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
|
||||
// possibly finish them.
|
||||
this->error |= this->prev_incomplete;
|
||||
} else {
|
||||
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
||||
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
||||
"We support either two or four chunks per 64-byte block.");
|
||||
if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
||||
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
||||
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
||||
} else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
||||
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
||||
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
||||
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
||||
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
||||
}
|
||||
this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
|
||||
this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
|
||||
}
|
||||
}
|
||||
|
||||
simdjson_really_inline error_code errors() {
|
||||
return this->error.any_bits_set_anywhere() ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
|
||||
}
|
||||
|
||||
}; // struct utf8_checker
|
||||
} // namespace utf8_validation
|
||||
} // unnamed namespace
|
||||
|
||||
using utf8_validation::utf8_checker;
|
||||
|
||||
} // namespace SIMDJSON_IMPLEMENTATION
|
||||
} // namespace simdjson
|
|
@ -139,7 +139,9 @@ using namespace simd;
|
|||
this->error |= check_multibyte_lengths(input, prev_input, sc);
|
||||
}
|
||||
|
||||
// The only problem that can happen at EOF is that a multibyte character is too short.
|
||||
// The only problem that can happen at EOF is that a multibyte character is too short
|
||||
// or a byte value too large in the last bytes: check_special_cases only checks for bytes
|
||||
// too large in the first of two bytes.
|
||||
simdjson_really_inline void check_eof() {
|
||||
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
|
||||
// possibly finish them.
|
||||
|
@ -167,7 +169,7 @@ using namespace simd;
|
|||
|
||||
}
|
||||
}
|
||||
|
||||
// do not forget to call check_eof!
|
||||
// Translate the accumulated error bits into an error_code: UTF8_ERROR when any bit is set,
// SUCCESS otherwise.
simdjson_really_inline error_code errors() {
  if (this->error.any_bits_set_anywhere()) {
    return error_code::UTF8_ERROR;
  }
  return error_code::SUCCESS;
}
|
||||
|
|
|
@ -1,303 +0,0 @@
|
|||
namespace simdjson {
|
||||
namespace SIMDJSON_IMPLEMENTATION {
|
||||
namespace utf8_validation {
|
||||
|
||||
//
|
||||
// Detect Unicode errors.
|
||||
//
|
||||
// UTF-8 is designed to allow multiple bytes and be compatible with ASCII. It's a fairly basic
|
||||
// encoding that uses the first few bits on each byte to denote a "byte type", and all other bits
|
||||
// are straight up concatenated into the final value. The first byte of a multibyte character is a
|
||||
// "leading byte" and starts with N 1's, where N is the total number of bytes (110_____ = 2 byte
|
||||
// lead). The remaining bytes of a multibyte character all start with 10. 1-byte characters just
|
||||
// start with 0, because that's what ASCII looks like. Here's what each size looks like:
|
||||
//
|
||||
// - ASCII (7 bits): 0_______
|
||||
// - 2 byte character (11 bits): 110_____ 10______
|
||||
// - 3 byte character (16 bits): 1110____ 10______ 10______
|
||||
// - 4 byte character (21 bits): 11110___ 10______ 10______ 10______
|
||||
// - 5+ byte character (illegal): 11111___ <illegal>
|
||||
//
|
||||
// There are 6 classes of error that can happen in Unicode:
|
||||
//
|
||||
// - TOO_SHORT: when you have a multibyte character with too few bytes (i.e. missing continuation).
|
||||
// We detect this by looking for new characters (lead bytes) inside the range of a multibyte
|
||||
// character.
|
||||
//
|
||||
// e.g. 11000000 01100001 (2-byte character where second byte is ASCII)
|
||||
//
|
||||
// - TOO_LONG: when there are more bytes in your character than you need (i.e. extra continuation).
|
||||
// We detect this by requiring that the next byte after your multibyte character be a new
|
||||
// character--so a continuation after your character is wrong.
|
||||
//
|
||||
// e.g. 11011111 10111111 10111111 (2-byte character followed by *another* continuation byte)
|
||||
//
|
||||
// - TOO_LARGE: Unicode only goes up to U+10FFFF. These characters are too large.
|
||||
//
|
||||
// e.g. 11110111 10111111 10111111 10111111 (bigger than 10FFFF).
|
||||
//
|
||||
// - OVERLONG: multibyte characters with a bunch of leading zeroes, where you could have
|
||||
// used fewer bytes to make the same character. Like encoding an ASCII character in 4 bytes is
|
||||
// technically possible, but UTF-8 disallows it so that there is only one way to write an "a".
|
||||
//
|
||||
// e.g. 11000001 10100001 (2-byte encoding of "a", which only requires 1 byte: 01100001)
|
||||
//
|
||||
// - SURROGATE: Unicode U+D800-U+DFFF is a *surrogate* character, reserved for use in UCS-2 and
|
||||
// WTF-8 encodings for characters with > 2 bytes. These are illegal in pure UTF-8.
|
||||
//
|
||||
// e.g. 11101101 10100000 10000000 (U+D800)
|
||||
//
|
||||
// - INVALID_5_BYTE: 5-byte, 6-byte, 7-byte and 8-byte characters are unsupported; Unicode does not
|
||||
// support values with more than 21 bits (which a 4-byte character supports).
|
||||
//
|
||||
// e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000)
|
||||
//
|
||||
// Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94:
|
||||
//
|
||||
// Code Points 1st 2s 3s 4s
|
||||
// U+0000..U+007F 00..7F
|
||||
// U+0080..U+07FF C2..DF 80..BF
|
||||
// U+0800..U+0FFF E0 A0..BF 80..BF
|
||||
// U+1000..U+CFFF E1..EC 80..BF 80..BF
|
||||
// U+D000..U+D7FF ED 80..9F 80..BF
|
||||
// U+E000..U+FFFF EE..EF 80..BF 80..BF
|
||||
// U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
|
||||
// U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
|
||||
// U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
|
||||
//
|
||||
using namespace simd;
|
||||
|
||||
struct utf8_checker {
|
||||
// If this is nonzero, there has been a UTF-8 error.
|
||||
simd8<uint8_t> error;
|
||||
// The last input we received.
|
||||
simd8<uint8_t> prev_input_block;
|
||||
// If there were leads at the end of the previous block, to be continued in the next.
|
||||
simd8<uint8_t> prev_incomplete;
|
||||
|
||||
//
|
||||
// These are the bits in lead_flags. Its main purpose is to tell you what kind of lead character
|
||||
// it is (1,2,3 or 4--or none if it's continuation), but it also maps 4 other bytes that will be
|
||||
// used to detect other kinds of errors.
|
||||
//
|
||||
// LEAD_4 is first because we use a << trick in get_byte_3_4_5_errors to turn LEAD_2 -> LEAD_3,
|
||||
// LEAD_3 -> LEAD_4, and we want LEAD_4 to turn into nothing since there is no LEAD_5. This trick
|
||||
// lets us use one constant table instead of 3, possibly saving registers on systems with fewer
|
||||
// registers.
|
||||
//
|
||||
static const uint8_t LEAD_4 = 0x01; // [1111]____ 10______ 10______ 10______ (0_|11)__
|
||||
static const uint8_t LEAD_3 = 0x02; // [1110]____ 10______ 10______ (0|11)__
|
||||
static const uint8_t LEAD_2 = 0x04; // [110_]____ 10______ (0|11)__
|
||||
static const uint8_t LEAD_1 = 0x08; // [0___]____ (0|11)__
|
||||
static const uint8_t LEAD_2_PLUS = 0x10; // [11__]____ ...
|
||||
static const uint8_t LEAD_1100 = 0x20; // [1100]____ ...
|
||||
static const uint8_t LEAD_1110 = 0x40; // [1110]____ ...
|
||||
static const uint8_t LEAD_1111 = 0x80; // [1111]____ ...
|
||||
|
||||
// Prepare fast_path_error in case the next block is ASCII
|
||||
simdjson_really_inline void set_fast_path_error() {
|
||||
// If any of the last 3 bytes in the input needs a continuation at the start of the next input,
|
||||
// it is an error for the next input to be ASCII.
|
||||
// static const uint8_t incomplete_long[32] = {
|
||||
// 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
// 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
// 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
// 0, 0, 0, 0, 0, LEAD_4, LEAD_4 | LEAD_3, LEAD_4 | LEAD_3 | LEAD_2
|
||||
// };
|
||||
// const simd8<uint8_t> incomplete(&incomplete_long[sizeof(incomplete_long) - sizeof(simd8<uint8_t>)]);
|
||||
// this->prev_incomplete = lead_flags & incomplete;
|
||||
// If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
|
||||
// ... 1111____ 111_____ 11______
|
||||
static const uint8_t last_len[32] = {
|
||||
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
||||
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
||||
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
||||
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
|
||||
};
|
||||
const simd8<uint8_t> max_value(&last_len[sizeof(last_len)-sizeof(simd8<uint8_t>)]);
|
||||
// If anything is > the desired value, there will be a nonzero value in the result.
|
||||
this->prev_incomplete = this->prev_input_block.saturating_sub(max_value);
|
||||
}
|
||||
|
||||
simdjson_really_inline simd8<uint8_t> get_lead_flags(const simd8<uint8_t> high_bits, const simd8<uint8_t> prev_high_bits) {
|
||||
// Total: 2 instructions, 1 constant
|
||||
// - 1 byte shift (shuffle)
|
||||
// - 1 table lookup (shuffle)
|
||||
// - 1 table constant
|
||||
|
||||
// high_bits is byte 5, so lead is high_bits.prev<4>()
|
||||
return high_bits.prev<4>(prev_high_bits).lookup_16<uint8_t>(
|
||||
LEAD_1, LEAD_1, LEAD_1, LEAD_1, // [0___]____ (ASCII)
|
||||
LEAD_1, LEAD_1, LEAD_1, LEAD_1, // [0___]____ (ASCII)
|
||||
0, 0, 0, 0, // [10__]____ (continuation)
|
||||
LEAD_2 | LEAD_2_PLUS | LEAD_1100, // [1100]____
|
||||
LEAD_2 | LEAD_2_PLUS, // [110_]____
|
||||
LEAD_3 | LEAD_2_PLUS | LEAD_1110, // [1110]____
|
||||
LEAD_4 | LEAD_2_PLUS | LEAD_1111 // [1111]____
|
||||
);
|
||||
}
|
||||
|
||||
// Find errors in bytes 1 and 2 together (one single multi-nibble &)
|
||||
simdjson_really_inline simd8<uint8_t> get_byte_1_2_errors(const simd8<uint8_t> input, const simd8<uint8_t> prev_input, const simd8<uint8_t> high_bits, const simd8<uint8_t> prev_high_bits) {
|
||||
//
|
||||
// These are the errors we're going to match for bytes 1-2, by looking at the first three
|
||||
// nibbles of the character: lead_flags & <low bits of byte 1> & <high bits of byte 2>
|
||||
//
|
||||
// The important thing here is that these constants all take up *different* bits, since they
|
||||
// match different patterns. This is why there are 2 LEAD_4 and 2 LEAD_3s in lead_flags, among
|
||||
// other things.
|
||||
//
|
||||
static const int TOO_SHORT_2 = LEAD_2_PLUS; // 11______ (0___|11__)____
|
||||
static const int TOO_LONG_1 = LEAD_1; // 0_______ 10______
|
||||
static const int OVERLONG_2 = LEAD_1100; // 1100000_ ________ (technically we match 10______ but we could match ________, they both yield errors either way)
|
||||
static const int OVERLONG_3 = LEAD_3; // 11100000 100_____ ________
|
||||
static const int OVERLONG_4 = LEAD_4; // 11110000 1000____ ________ ________
|
||||
static const int TOO_LARGE = LEAD_1111; // 11110100 (1001|101_)____
|
||||
static const int SURROGATE = LEAD_1110; // 11101101 [101_]____
|
||||
|
||||
// Total: 4 instructions, 2 constants
|
||||
// - 2 table lookups (shuffles)
|
||||
// - 1 byte shift (shuffle)
|
||||
// - 1 "and"
|
||||
// - 2 table constants
|
||||
|
||||
// After processing the rest of byte 1 (the low bits), we're still not done--we have to check
|
||||
// byte 2 to be sure which things are errors and which aren't.
|
||||
// Since input is byte 5, byte 1 is input.prev<4>
|
||||
const simd8<uint8_t> byte_1_flags = (input.prev<4>(prev_input) & 0x0F).lookup_16<uint8_t>(
|
||||
// ____[00__] ________
|
||||
TOO_SHORT_2 | TOO_LONG_1 | OVERLONG_2 | OVERLONG_3 | OVERLONG_4, // ____[0000] ________
|
||||
TOO_SHORT_2 | TOO_LONG_1 | OVERLONG_2, // ____[0001] ________
|
||||
TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1,
|
||||
// ____[01__] ________
|
||||
TOO_SHORT_2 | TOO_LONG_1 | TOO_LARGE, // ____[0100] ________
|
||||
TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1,
|
||||
// ____[10__] ________
|
||||
TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1,
|
||||
// ____[11__] ________
|
||||
TOO_SHORT_2 | TOO_LONG_1,
|
||||
TOO_SHORT_2 | TOO_LONG_1 | SURROGATE, // ____[1101] ________
|
||||
TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1
|
||||
);
|
||||
// Since high_bits is byte 5, byte 2 is high_bits.prev<3>
|
||||
const simd8<uint8_t> byte_2_flags = high_bits.prev<3>(prev_high_bits).lookup_16<uint8_t>(
|
||||
// ASCII: ________ [0___]____
|
||||
OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2,
|
||||
// ASCII: ________ [0___]____
|
||||
OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2,
|
||||
// Continuations: ________ [10__]____
|
||||
OVERLONG_2 | TOO_LONG_1 | OVERLONG_3 | OVERLONG_4, // ________ [1000]____
|
||||
OVERLONG_2 | TOO_LONG_1 | OVERLONG_3 | SURROGATE, // ________ [1001]____
|
||||
OVERLONG_2 | TOO_LONG_1 | TOO_LARGE | SURROGATE, // ________ [1010]____
|
||||
OVERLONG_2 | TOO_LONG_1 | TOO_LARGE | SURROGATE, // ________ [1011]____
|
||||
// Multibyte Leads: ________ [11__]____
|
||||
OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2
|
||||
);
|
||||
return byte_1_flags & byte_2_flags;
|
||||
}
|
||||
|
||||
simdjson_really_inline simd8<uint8_t> get_byte_3_4_5_errors(const simd8<uint8_t> high_bits, const simd8<uint8_t> prev_high_bits) {
|
||||
// Total 7 instructions, 3 simd constants:
|
||||
// - 3 table lookups (shuffles)
|
||||
// - 2 byte shifts (shuffles)
|
||||
// - 2 "or"
|
||||
// - 1 table constant
|
||||
|
||||
const simd8<uint8_t> byte_3_table = simd8<uint8_t>::repeat_16(
|
||||
// TOO_SHORT ASCII: 111_____ ________ [0___]____
|
||||
LEAD_3, LEAD_3, LEAD_3, LEAD_3,
|
||||
LEAD_3, LEAD_3, LEAD_3, LEAD_3,
|
||||
// TOO_LONG Continuations: 110_____ ________ [10__]____
|
||||
LEAD_2, LEAD_2, LEAD_2, LEAD_2,
|
||||
// TOO_SHORT Multibyte Leads: 111_____ ________ [11__]____
|
||||
LEAD_3, LEAD_3, LEAD_3, LEAD_3
|
||||
);
|
||||
const simd8<uint8_t> byte_4_table = byte_3_table.shr<1>(); // TOO_SHORT: LEAD_4, TOO_LONG: LEAD_3
|
||||
const simd8<uint8_t> byte_5_table = byte_3_table.shr<2>(); // TOO_SHORT: <none>, TOO_LONG: LEAD_4
|
||||
|
||||
// high_bits is byte 5, high_bits.prev<2> is byte 3 and high_bits.prev<1> is byte 4
|
||||
return high_bits.prev<2>(prev_high_bits).lookup_16(byte_3_table) |
|
||||
high_bits.prev<1>(prev_high_bits).lookup_16(byte_4_table) |
|
||||
high_bits.lookup_16(byte_5_table);
|
||||
}
|
||||
|
||||
// Check whether the current bytes are valid UTF-8.
|
||||
// At the end of the function, previous gets updated
|
||||
// This should come down to 22 instructions if table definitions are in registers--30 if not.
|
||||
simdjson_really_inline simd8<uint8_t> check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
|
||||
// When we process bytes M through N, we look for lead characters in M-4 through N-4. This allows
|
||||
// us to look for all errors related to any lead character at one time (since UTF-8 characters
|
||||
// can only be up to 4 bytes, and the next byte after a character finishes must be another lead,
|
||||
// we never need to look more than 4 bytes past the current one to fully validate).
|
||||
// This way, we have all relevant bytes around and can save ourselves a little overflow and
|
||||
// several instructions on each loop.
|
||||
|
||||
// Total: 22 instructions, 7 simd constants
|
||||
// Local: 8 instructions, 1 simd constant
|
||||
// - 2 bit shifts
|
||||
// - 1 byte shift (shuffle)
|
||||
// - 3 "or"
|
||||
// - 1 "and"
|
||||
// - 1 saturating_sub
|
||||
// - 1 constant (0b11111000-1)
|
||||
// lead_flags: 2 instructions, 1 simd constant
|
||||
// - 1 byte shift (shuffle)
|
||||
// - 1 table lookup (shuffle)
|
||||
// - 1 table constant
|
||||
// byte_1_2_errors: 5 instructions, 2 simd constants
|
||||
// - 2 table lookups (shuffles)
|
||||
// - 2 byte shifts (shuffles)
|
||||
// - 1 "and"
|
||||
// - 2 table constants
|
||||
// byte_3_4_5_errors: 7 instructions, 3 simd constants
|
||||
// - 3 table lookups (shuffles)
|
||||
// - 2 byte shifts (shuffles)
|
||||
// - 2 "or"
|
||||
// - 3 table constants
|
||||
|
||||
const simd8<uint8_t> high_bits = input.shr<4>();
|
||||
const simd8<uint8_t> prev_high_bits = prev_input.shr<4>();
|
||||
const simd8<uint8_t> lead_flags = get_lead_flags(high_bits, prev_high_bits);
|
||||
const simd8<uint8_t> byte_1_2_errors = get_byte_1_2_errors(input, prev_input, high_bits, prev_high_bits);
|
||||
const simd8<uint8_t> byte_3_4_5_errors = get_byte_3_4_5_errors(high_bits, prev_high_bits);
|
||||
// Detect illegal 5-byte+ Unicode values. We can't do this as part of byte_1_2_errors because
|
||||
// it would need a third lead_flag = 1111, and we've already used up all 8 between
|
||||
// byte_1_2_errors and byte_3_4_5_errors.
|
||||
const simd8<uint8_t> too_large = input.saturating_sub(0b11111000-1); // too-large values will be nonzero
|
||||
return too_large | (lead_flags & (byte_1_2_errors | byte_3_4_5_errors));
|
||||
}
|
||||
|
||||
// TODO special case start of file, too, so that small documents are efficient! No shifting needed ...
|
||||
|
||||
// The only problem that can happen at EOF is that a multibyte character is too short.
|
||||
simdjson_really_inline void check_eof() {
|
||||
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
|
||||
// possibly finish them.
|
||||
this->error |= this->prev_incomplete;
|
||||
}
|
||||
|
||||
simdjson_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
|
||||
simd8<uint8_t> bits = input.reduce_or();
|
||||
if (simdjson_likely(!bits.any_bits_set_anywhere(0b10000000u))) {
|
||||
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
|
||||
// possibly finish them.
|
||||
this->error |= this->prev_incomplete;
|
||||
} else {
|
||||
this->error |= this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
||||
for (int i=1; i<simd8x64<uint8_t>::NUM_CHUNKS; i++) {
|
||||
this->error |= this->check_utf8_bytes(input.chunks[i], input.chunks[i-1]);
|
||||
}
|
||||
this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
|
||||
this->set_fast_path_error();
|
||||
}
|
||||
}
|
||||
|
||||
simdjson_really_inline error_code errors() {
|
||||
return this->error.any_bits_set_anywhere() ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
|
||||
}
|
||||
|
||||
}; // struct utf8_checker
|
||||
|
||||
} // namespace utf8_validation
|
||||
} // namespace SIMDJSON_IMPLEMENTATION
|
||||
} // unnamed namespace
|
|
@ -1,186 +0,0 @@
|
|||
namespace simdjson {
|
||||
namespace SIMDJSON_IMPLEMENTATION {
|
||||
|
||||
/*
|
||||
* legal utf-8 byte sequence
|
||||
* http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
|
||||
*
|
||||
* Code Points 1st 2s 3s 4s
|
||||
* U+0000..U+007F 00..7F
|
||||
* U+0080..U+07FF C2..DF 80..BF
|
||||
* U+0800..U+0FFF E0 A0..BF 80..BF
|
||||
* U+1000..U+CFFF E1..EC 80..BF 80..BF
|
||||
* U+D000..U+D7FF ED 80..9F 80..BF
|
||||
* U+E000..U+FFFF EE..EF 80..BF 80..BF
|
||||
* U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
|
||||
* U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
|
||||
* U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
|
||||
*
|
||||
*/
|
||||
|
||||
// all byte values must be no larger than 0xF4
|
||||
|
||||
using namespace simd;
|
||||
|
||||
struct processed_utf_bytes {
|
||||
simd8<uint8_t> raw_bytes;
|
||||
simd8<uint8_t> first_len;
|
||||
};
|
||||
|
||||
struct utf8_checker {
|
||||
simd8<bool> has_error;
|
||||
processed_utf_bytes previous;
|
||||
|
||||
simdjson_really_inline void check_carried_continuations() {
|
||||
static const int8_t last_len[32] = {
|
||||
9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 2, 1, 0
|
||||
};
|
||||
this->has_error |= simd8<int8_t>(this->previous.first_len) > simd8<int8_t>(last_len + 32 - sizeof(simd8<int8_t>));
|
||||
}
|
||||
|
||||
// check whether the current bytes are valid UTF-8
|
||||
// at the end of the function, previous gets updated
|
||||
simdjson_really_inline void check_utf8_bytes(const simd8<uint8_t> current_bytes) {
|
||||
|
||||
/* high_nibbles = input >> 4 */
|
||||
const simd8<uint8_t> high_nibbles = current_bytes.shr<4>();
|
||||
|
||||
/*
|
||||
* Map high nibble of "First Byte" to legal character length minus 1
|
||||
* 0x00 ~ 0xBF --> 0
|
||||
* 0xC0 ~ 0xDF --> 1
|
||||
* 0xE0 ~ 0xEF --> 2
|
||||
* 0xF0 ~ 0xFF --> 3
|
||||
*/
|
||||
/* first_len = legal character length minus 1 */
|
||||
/* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
|
||||
/* first_len = first_len_tbl[high_nibbles] */
|
||||
simd8<uint8_t> first_len = high_nibbles.lookup_16<uint8_t>(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3);
|
||||
|
||||
/* Map "First Byte" to 8-th item of range table (0xC2 ~ 0xF4) */
|
||||
/* First Byte: set range index to 8 for bytes within 0xC0 ~ 0xFF */
|
||||
/* range = first_range_tbl[high_nibbles] */
|
||||
simd8<uint8_t> range = high_nibbles.lookup_16<uint8_t>(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8);
|
||||
|
||||
/* Second Byte: set range index to first_len */
|
||||
/* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
|
||||
/* range |= (first_len, previous->first_len) << 1 byte */
|
||||
range |= first_len.prev(this->previous.first_len);
|
||||
|
||||
/* Third Byte: set range index to saturate_sub(first_len, 1) */
|
||||
/* 0 for 00~7F, 0 for C0~DF, 1 for E0~EF, 2 for F0~FF */
|
||||
/* range |= (first_len - 1) << 2 bytes */
|
||||
range |= first_len.saturating_sub(1).prev<2>(this->previous.first_len.saturating_sub(1));
|
||||
|
||||
/* Fourth Byte: set range index to saturate_sub(first_len, 2) */
|
||||
/* 0 for 00~7F, 0 for C0~DF, 0 for E0~EF, 1 for F0~FF */
|
||||
/* range |= (first_len - 2) << 3 bytes */
|
||||
range |= first_len.saturating_sub(2).prev<3>(this->previous.first_len.saturating_sub(2));
|
||||
|
||||
/*
|
||||
* Now we have below range indices caluclated
|
||||
* Correct cases:
|
||||
* - 8 for C0~FF
|
||||
* - 3 for 1st byte after F0~FF
|
||||
* - 2 for 1st byte after E0~EF or 2nd byte after F0~FF
|
||||
* - 1 for 1st byte after C0~DF or 2nd byte after E0~EF or
|
||||
* 3rd byte after F0~FF
|
||||
* - 0 for others
|
||||
* Error cases:
|
||||
* 9,10,11 if non ascii First Byte overlaps
|
||||
* E.g., F1 80 C2 90 --> 8 3 10 2, where 10 indicates error
|
||||
*/
|
||||
|
||||
/* Adjust Second Byte range for special First Bytes(E0,ED,F0,F4) */
|
||||
/* Overlaps lead to index 9~15, which are illegal in range table */
|
||||
/* shift1 = (input, previous->input) << 1 byte */
|
||||
simd8<uint8_t> shift1 = current_bytes.prev(this->previous.raw_bytes);
|
||||
/*
|
||||
* shift1: | EF F0 ... FE | FF 00 ... ... DE | DF E0 ... EE |
|
||||
* pos: | 0 1 15 | 16 17 239| 240 241 255|
|
||||
* pos-240: | 0 0 0 | 0 0 0 | 0 1 15 |
|
||||
* pos+112: | 112 113 127| >= 128 | >= 128 |
|
||||
*/
|
||||
simd8<uint8_t> pos = shift1 - 0xEF;
|
||||
|
||||
/*
|
||||
* Tables for fast handling of four special First Bytes(E0,ED,F0,F4), after
|
||||
* which the Second Byte are not 80~BF. It contains "range index adjustment".
|
||||
* +------------+---------------+------------------+----------------+
|
||||
* | First Byte | original range| range adjustment | adjusted range |
|
||||
* +------------+---------------+------------------+----------------+
|
||||
* | E0 | 2 | 2 | 4 |
|
||||
* +------------+---------------+------------------+----------------+
|
||||
* | ED | 2 | 3 | 5 |
|
||||
* +------------+---------------+------------------+----------------+
|
||||
* | F0 | 3 | 3 | 6 |
|
||||
* +------------+---------------+------------------+----------------+
|
||||
* | F4 | 4 | 4 | 8 |
|
||||
* +------------+---------------+------------------+----------------+
|
||||
*/
|
||||
/* index1 -> E0, index14 -> ED */
|
||||
simd8<uint8_t> range2 = pos.saturating_sub(240).lookup_16<uint8_t>(0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0);
|
||||
/* index1 -> F0, index5 -> F4 */
|
||||
range2 += pos.saturating_add(112).lookup_16<uint8_t>(0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||
range += range2;
|
||||
|
||||
/* Load min and max values per calculated range index */
|
||||
/*
|
||||
* Range table, map range index to min and max values
|
||||
* Index 0 : 00 ~ 7F (First Byte, ascii)
|
||||
* Index 1,2,3: 80 ~ BF (Second, Third, Fourth Byte)
|
||||
* Index 4 : A0 ~ BF (Second Byte after E0)
|
||||
* Index 5 : 80 ~ 9F (Second Byte after ED)
|
||||
* Index 6 : 90 ~ BF (Second Byte after F0)
|
||||
* Index 7 : 80 ~ 8F (Second Byte after F4)
|
||||
* Index 8 : C2 ~ F4 (First Byte, non ascii)
|
||||
* Index 9~15 : illegal: i >= 127 && i <= -128
|
||||
*/
|
||||
simd8<uint8_t> minv = range.lookup_16<uint8_t>(
|
||||
0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80,
|
||||
0xC2, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F
|
||||
);
|
||||
simd8<uint8_t> maxv = range.lookup_16<uint8_t>(
|
||||
0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F,
|
||||
0xF4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
|
||||
);
|
||||
|
||||
// We're fine with high-bit wraparound here, so we use int comparison since it's faster on Intel
|
||||
this->has_error |= simd8<int8_t>(minv) > simd8<int8_t>(current_bytes);
|
||||
this->has_error |= simd8<int8_t>(current_bytes) > simd8<int8_t>(maxv);
|
||||
|
||||
this->previous.raw_bytes = current_bytes;
|
||||
this->previous.first_len = first_len;
|
||||
}
|
||||
|
||||
simdjson_really_inline void check_next_input(const simd8<uint8_t> in) {
|
||||
if (simdjson_likely(!in.any_bits_set_anywhere(0x80u))) {
|
||||
this->check_carried_continuations();
|
||||
} else {
|
||||
this->check_utf8_bytes(in);
|
||||
}
|
||||
}
|
||||
|
||||
simdjson_really_inline void check_next_input(const simd8x64<uint8_t>& in) {
|
||||
simd8<uint8_t> bits = in.reduce_or();
|
||||
if (simdjson_likely(!bits.any_bits_set_anywhere(0x80u))) {
|
||||
// it is ascii, we just check carried continuations.
|
||||
this->check_carried_continuations();
|
||||
} else {
|
||||
// it is not ascii so we have to do heavy work
|
||||
for (int i=0; i<simd8x64<uint8_t>::NUM_CHUNKS; i++) {
|
||||
this->check_utf8_bytes(in.chunks[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
simdjson_really_inline error_code errors() {
|
||||
return this->has_error.any() ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
|
||||
}
|
||||
}; // struct utf8_checker
|
||||
|
||||
} // namespace SIMDJSON_IMPLEMENTATION
|
||||
} // unnamed namespace
|
|
@ -19,6 +19,7 @@ bool generic_validate_utf8(const uint8_t * input, size_t length) {
|
|||
simd::simd8x64<uint8_t> in(block);
|
||||
c.check_next_input(in);
|
||||
reader.advance();
|
||||
c.check_eof();
|
||||
return c.errors() == error_code::SUCCESS;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,365 +0,0 @@
|
|||
namespace simdjson {
|
||||
namespace SIMDJSON_IMPLEMENTATION {
|
||||
//
|
||||
// Detect UTF-8 errors.
|
||||
//
|
||||
// Copied and adapted from algorithm by @zwegner: https://github.com/zwegner/faster-utf8-validator
|
||||
//
|
||||
// UTF-8 Refresher
|
||||
// ---------------
|
||||
//
|
||||
// UTF-8 is designed to allow multiple bytes and be compatible with ASCII. It's a fairly basic
|
||||
// encoding that uses the first few bits on each byte to denote a "byte type", and all other bits
|
||||
// are straight up concatenated into the final value. The first byte of a multibyte character is a
|
||||
// "leading byte" and starts with N 1's, where N is the total number of bytes (110_____ = 2 byte
|
||||
// lead). The remaining bytes of a multibyte character all start with 10. 1-byte characters just
|
||||
// start with 0, because that's what ASCII looks like. Here's what each size looks like:
|
||||
//
|
||||
// | Character Length | UTF-8 Byte Sequence |
|
||||
// |-----------------------------|---------------------------------------|
|
||||
// | ASCII (7 bits): | `0_______` |
|
||||
// | 2 byte character (11 bits) | `110_____ 10______` |
|
||||
// | 3 byte character (16 bits) | `1110____ 10______ 10______` |
|
||||
// | 4 byte character (21 bits) | `11110___ 10______ 10______ 10______` |
|
||||
// | 5+ byte character (illegal) | `11111___` <illegal> |
|
||||
//
|
||||
// UTF-8 Error Classes
|
||||
// -------------------
|
||||
//
|
||||
// There are 6 classes of error that can happen in UTF-8:
|
||||
//
|
||||
// ### Too short (missing continuations)
|
||||
//
|
||||
// TOO_SHORT: when you have a multibyte character with too few bytes (i.e. missing continuation).
|
||||
// We detect this by looking for new characters (lead bytes) inside the range of a multibyte
|
||||
// character.
|
||||
//
|
||||
// e.g. `11000000 01100001` (2-byte character where second byte is ASCII)
|
||||
//
|
||||
// ### Too long (stray continuations)
|
||||
//
|
||||
// TOO_LONG: when there are more bytes in your character than you need (i.e. extra continuation).
|
||||
// We detect this by requiring that the next byte after your multibyte character be a new
|
||||
// character--so a continuation after your character is wrong.
|
||||
//
|
||||
// e.g. `11011111 10111111 10111111` (2-byte character followed by *another* continuation byte)
|
||||
//
|
||||
// ### Too large (out of range for unicode)
|
||||
//
|
||||
// TOO_LARGE: Unicode only goes up to U+10FFFF. These characters are too large.
|
||||
//
|
||||
// e.g. `11110111 10111111 10111111 10111111` (bigger than 10FFFF).
|
||||
//
|
||||
// ### Overlong encoding (used more bytes than needed)
|
||||
//
|
||||
// Multibyte characters with a bunch of leading zeroes, where you could have
|
||||
// used fewer bytes to make the same character, are considered *overlong encodings*. They are
|
||||
// disallowed in UTF-8 to ensure there is only one way to write a single Unicode codepoint, making strings
|
||||
// easier to search. Like encoding an ASCII character in 2 bytes is technically possible, but UTF-8
|
||||
// disallows it so that you only have to search for the ASCII character `a` to find it.
|
||||
//
|
||||
// e.g. `11000001 10100001` (2-byte encoding of "a", which only requires 1 byte: 01100001)
|
||||
//
|
||||
// ### Surrogate characters
|
||||
//
|
||||
// Unicode U+D800-U+DFFF is a *surrogate* character, reserved for use in UCS-2 and WTF-8 encodings
|
||||
// for characters with > 2 bytes. These are illegal in pure UTF-8.
|
||||
//
|
||||
// e.g. `11101101 10100000 10000000` (U+D800)
|
||||
//
|
||||
// ### 5+ byte characters
|
||||
//
|
||||
// INVALID_5_BYTE: 5-byte, 6-byte, 7-byte and 8-byte characters are unsupported; Unicode does not
|
||||
// support values with more than 21 bits (which a 4-byte character supports).
|
||||
//
|
||||
// Even if these were supported, anything with 5 bytes would be either too large (bigger than the
|
||||
// Unicode max value), or overlong (could fit in 4 or fewer bytes).
|
||||
//
|
||||
// e.g. `11111000 10100000 10000000 10000000 10000000` (U+800000)
|
||||
//
|
||||
// Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94:
|
||||
//
|
||||
// | Code Points | 1st | 2nd | 3rd | 4th |
|
||||
// |--------------------|--------|--------|--------|--------|
|
||||
// | U+0000..U+007F | 00..7F | | | |
|
||||
// | U+0080..U+07FF | C2..DF | 80..BF | | |
|
||||
// | U+0800..U+0FFF | E0 | A0..BF | 80..BF | |
|
||||
// | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | |
|
||||
// | U+D000..U+D7FF | ED | 80..9F | 80..BF | |
|
||||
// | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | |
|
||||
// | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF |
|
||||
// | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF |
|
||||
// | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF |
|
||||
//
|
||||
// Algorithm
|
||||
// ---------
|
||||
//
|
||||
// This validator works in two basic steps: checking continuation bytes, and
|
||||
// handling special cases. Each step works on one vector's worth of input
|
||||
// bytes at a time.
|
||||
//
|
||||
using namespace simd;
|
||||
|
||||
using vmask_t = simd8<bool>::bitmask_t;
|
||||
using vmask2_t = simd8<bool>::bitmask2_t;
|
||||
|
||||
struct utf8_checker {
|
||||
simd8<uint8_t> special_case_errors;
|
||||
simd8<uint8_t> prev_bytes;
|
||||
vmask2_t last_cont;
|
||||
vmask_t length_errors;
|
||||
|
||||
//
|
||||
// Check for missing / extra continuation bytes.
|
||||
//
|
||||
// The continuation bytes are handled in a fairly straightforward manner in
|
||||
// the scalar domain. A mask is created from the input byte vector for each
|
||||
// of the highest four bits of every byte. The first mask allows us to quickly
|
||||
// skip pure ASCII input vectors, which have no bits set. The first and
|
||||
// (inverted) second masks together give us every continuation byte (10xxxxxx).
|
||||
// The other masks are used to find prefixes of multi-byte code points (110,
|
||||
// 1110, 11110). For these, we keep a "required continuation" mask, by shifting
|
||||
// these masks 1, 2, and 3 bits respectively forward in the byte stream. That
|
||||
// is, we take a mask of all bytes that start with 11, and shift it left one
|
||||
// bit forward to get the mask of all the first continuation bytes, then do the
|
||||
// same for the second and third continuation bytes. Here's an example input
|
||||
// sequence along with the corresponding masks:
|
||||
//
|
||||
// bytes: 61 C3 80 62 E0 A0 80 63 F0 90 80 80 00
|
||||
// code points: 61|C3 80|62|E0 A0 80|63|F0 90 80 80|00
|
||||
// # of bytes: 1 |2 - |1 |3 - - |1 |4 - - - |1
|
||||
// cont. mask 1: - - 1 - - 1 - - - 1 - - -
|
||||
// cont. mask 2: - - - - - - 1 - - - 1 - -
|
||||
// cont. mask 3: - - - - - - - - - - - 1 -
|
||||
// cont. mask *: 0 0 1 0 0 1 1 0 0 1 1 1 0
|
||||
//
|
||||
// The final required continuation mask is then compared with the mask of
|
||||
// actual continuation bytes, and must match exactly in valid UTF-8. The only
|
||||
// complication in this step is that the shifted masks can cross vector
|
||||
// boundaries, so we need to keep a "carry" mask of the bits that were shifted
|
||||
// past the boundary in the last loop iteration.
|
||||
//
|
||||
simdjson_really_inline void check_length_errors(const simd8<uint8_t> bytes, const vmask_t bit_7) {
  // Records missing or unexpected continuation bytes for one input vector.
  //
  // bytes: the vector being validated.
  // bit_7: bitmask of the bytes whose top bit is set, already computed by the caller.
  //
  // Nothing is tested or branched on here: mismatches are OR-ed into
  // this->length_errors, and errors() looks at the accumulated mask later.

  // Lead-byte masks; each one narrows the previous one by one more high bit.
  const vmask_t starts_11   = bit_7 & bytes.get_bit<6>();     // 11______ (2+ byte lead)
  const vmask_t starts_111  = starts_11 & bytes.get_bit<5>(); // 111_____ (3+ byte lead)
  const vmask_t starts_1111 = starts_111 & bytes.get_bit<4>(); // 1111____ (4+ byte lead)

  // Positions that must hold a continuation byte. Start from the bits carried
  // over from the previous vector, then add each lead mask moved 1, 2 and 3
  // positions forward in the byte stream.
  //
  // The masks are added rather than OR-ed (the original note gives the reason:
  // the step can then be done with one lea). A carry can only happen when two
  // runs of required continuations overlap, which means a lead byte sits where
  // a continuation is required; that lead byte is not in the continuation mask
  // below, so such input still fails the comparison.
  vmask2_t expected_cont = this->last_cont;
  expected_cont += vmask2_t(starts_11) << 1;
  expected_cont += vmask2_t(starts_111) << 2;
  expected_cont += vmask2_t(starts_1111) << 3;

  // Actual continuation bytes: top bit set but not a lead (10______).
  // TODO: bit_7 &~ bit_6 might be fine, and involve less data dependency.
  const vmask_t actual_cont = bit_7 ^ starts_11;

  // XOR leaves a 1 wherever required and actual continuations disagree.
  // Narrowing expected_cont to vmask_t drops the upper half, which holds the
  // bits that belong to the next vector.
  this->length_errors |= actual_cont ^ vmask_t(expected_cont);

  // Keep the bits that were shifted past the end of this vector for the next call.
  this->last_cont = expected_cont >> sizeof(simd8<uint8_t>);
}
|
||||
|
||||
//
|
||||
// These constants define the set of error flags in check_special_cases().
|
||||
//
|
||||
static const uint8_t OVERLONG_2 = 0x01; // 1100000_ ________ Could have been encoded in 1 byte
|
||||
static const uint8_t OVERLONG_3 = 0x02; // 11100000 100_____ Could have been encoded in 2 bytes
|
||||
static const uint8_t SURROGATE = 0x04; // 11101101 101_____ Surrogate pairs (lead byte ED followed by A0..BF)
|
||||
static const uint8_t TOO_LARGE = 0x08; // 11110100 (1001|101_)____ > U+10FFFF
|
||||
static const uint8_t TOO_LARGE_2 = 0x10; // 1111(0101..1111) ________ > U+10FFFF
|
||||
static const uint8_t OVERLONG_4 = 0x20; // 11110000 1000____ Could have been encoded in 3 bytes
|
||||
|
||||
//
|
||||
// Check for special-case errors with table lookups on the first 3 nibbles (first 2 bytes).
|
||||
//
|
||||
// Besides the basic prefix coding of UTF-8, there are several invalid byte
|
||||
// sequences that need special handling. These are due to three factors:
|
||||
// code points that could be described in fewer bytes, code points that are
|
||||
// part of a surrogate pair (which are only valid in UTF-16), and code points
|
||||
// that are past the highest valid code point U+10FFFF.
|
||||
//
|
||||
// All of the invalid sequences can be detected by independently observing
|
||||
// the first three nibbles of each code point. Since AVX2 can do a 4-bit/16-byte
|
||||
// lookup in parallel for all 32 bytes in a vector, we can create bit masks
|
||||
// for all of these error conditions, look up the bit masks for the three
|
||||
// nibbles for all input bytes, and AND them together to get a final error mask,
|
||||
// that must be all zero for valid UTF-8. This is somewhat complicated by
|
||||
// needing to shift the error masks from the first and second nibbles forward in
|
||||
// the byte stream to line up with the third nibble.
|
||||
//
|
||||
// We have these possible values for valid UTF-8 sequences, broken down
|
||||
// by the first three nibbles:
|
||||
//
|
||||
// 1st 2nd 3rd comment
|
||||
// 0..7 0..F ASCII
|
||||
// 8..B 0..F continuation bytes
|
||||
// C 2..F 8..B C0 xx and C1 xx can be encoded in 1 byte
|
||||
// D 0..F 8..B D0..DF are valid with a continuation byte
|
||||
// E 0 A..B E0 8x and E0 9x can be encoded with 2 bytes
|
||||
// 1..C 8..B E1..EC are valid with continuation bytes
|
||||
// D 8..9 ED Ax and ED Bx correspond to surrogate pairs
|
||||
// E..F 8..B EE..EF are valid with continuation bytes
|
||||
// F 0 9..B F0 8x can be encoded with 3 bytes
|
||||
// 1..3 8..B F1..F3 are valid with continuation bytes
|
||||
// 4 8 F4 8F BF BF is the maximum valid code point
|
||||
//
|
||||
// That leaves us with these invalid sequences, which would otherwise fit
|
||||
// into UTF-8's prefix encoding. Each of these invalid sequences needs to
|
||||
// be detected separately, with their own bits in the error mask.
|
||||
//
|
||||
// 1st 2nd 3rd error bit
|
||||
// C 0..1 0..F 0x01 (OVERLONG_2)
|
||||
// E 0 8..9 0x02 (OVERLONG_3)
|
||||
// D A..B 0x04 (SURROGATE)
|
||||
// F 0 8 0x20 (OVERLONG_4)
|
||||
// 4 9..B 0x08 (TOO_LARGE)
|
||||
// 5..F 0..F 0x10 (TOO_LARGE_2)
|
||||
//
|
||||
// For every possible value of the first, second, and third nibbles, we keep
|
||||
// a lookup table that contains the bitwise OR of all errors that that nibble
|
||||
// value can cause. For example, the first nibble has zeroes in every entry
|
||||
// except for C, E, and F, and the third nibble lookup has the 0x11 bits in
|
||||
// every entry, since those errors don't depend on the third nibble. After
|
||||
// doing a parallel lookup of the first/second/third nibble values for all
|
||||
// bytes, we AND them together. Only when all three have an error bit in common
|
||||
// do we fail validation.
|
||||
//
|
||||
simdjson_really_inline void check_special_cases(const simd8<uint8_t> bytes) {
  // Flags overlong encodings, surrogates and code points above U+10FFFF.
  //
  // Three 16-entry tables are consulted per position: one indexed by the high
  // nibble of the preceding byte, one by its low nibble, and one by the high
  // nibble of the current byte. An error bit survives only if all three
  // lookups agree on it. The result is OR-ed into this->special_case_errors;
  // errors() checks the accumulated value later, so there is no branch here.

  // The preceding byte for every position: this vector moved back by one,
  // filled from the vector remembered on the previous call.
  const simd8<uint8_t> lead_bytes = bytes.prev<1>(this->prev_bytes);
  this->prev_bytes = bytes;

  // Errors that apply no matter what the following byte is.
  const uint8_t CARRY = OVERLONG_2 | TOO_LARGE_2; // 1100000_ [____]____ Could have been encoded in 1 byte
                                                  // 1111(0101..1111) [____]____ > U+10FFFF

  // High nibble of the preceding byte (shr<4> leaves an index in 0..15).
  const simd8<uint8_t> lead_high_flags = lead_bytes.shr<4>().lookup_16<uint8_t>(
    0,                                     // 0
    0,                                     // 1
    0,                                     // 2
    0,                                     // 3
    0,                                     // 4
    0,                                     // 5
    0,                                     // 6
    0,                                     // 7
    0,                                     // 8
    0,                                     // 9
    0,                                     // A
    0,                                     // B
    OVERLONG_2,                            // C  [1100]000_ ________
    0,                                     // D
    OVERLONG_3 | SURROGATE,                // E  [1110]0000 100_____ / [1110]1101 101_____
    OVERLONG_4 | TOO_LARGE | TOO_LARGE_2   // F  [1111]0000 1000____ / [1111]0100 (1001|101_)____
  );

  // Low nibble of the preceding byte (masked with 0x0F so the index is 0..15).
  const simd8<uint8_t> lead_low_flags = (lead_bytes & 0x0F).lookup_16<uint8_t>(
    OVERLONG_2 | OVERLONG_3 | OVERLONG_4,  // 0  1100[000_] / 1110[0000] / 1111[0000]
    OVERLONG_2,                            // 1  1100[000_]
    0,                                     // 2
    0,                                     // 3
    TOO_LARGE,                             // 4  1111[0100] (1001|101_)____
    TOO_LARGE_2,                           // 5  1111[0101..1111] ________
    TOO_LARGE_2,                           // 6
    TOO_LARGE_2,                           // 7
    TOO_LARGE_2,                           // 8
    TOO_LARGE_2,                           // 9
    TOO_LARGE_2,                           // A
    TOO_LARGE_2,                           // B
    TOO_LARGE_2,                           // C
    TOO_LARGE_2 | SURROGATE,               // D  1110[1101] 101_____
    TOO_LARGE_2,                           // E
    TOO_LARGE_2                            // F
  );

  // High nibble of the current byte.
  const simd8<uint8_t> next_high_flags = bytes.shr<4>().lookup_16<uint8_t>(
    CARRY,                                 // 0
    CARRY,                                 // 1
    CARRY,                                 // 2
    CARRY,                                 // 3
    CARRY,                                 // 4
    CARRY,                                 // 5
    CARRY,                                 // 6
    CARRY,                                 // 7
    CARRY | OVERLONG_3 | OVERLONG_4,       // 8  11100000 [100_]____ / 11110000 [1000]____
    CARRY | OVERLONG_3 | TOO_LARGE,        // 9  11100000 [100_]____ / 11110100 [1001]____
    CARRY | SURROGATE | TOO_LARGE,         // A  11101101 [101_]____ / 11110100 [101_]____
    CARRY | SURROGATE | TOO_LARGE,         // B
    CARRY,                                 // C
    CARRY,                                 // D
    CARRY,                                 // E
    CARRY                                  // F
  );

  // Only bits present in all three lookups describe a real special-case error.
  this->special_case_errors |= lead_high_flags & lead_low_flags & next_high_flags;
}
|
||||
|
||||
// check whether the current bytes are valid UTF-8
|
||||
// at the end of the function, previous gets updated
|
||||
simdjson_really_inline void check_utf8_bytes(const simd8<uint8_t> bytes, const vmask_t bit_7) {
  // Runs both validation passes on one vector: first the continuation-byte
  // accounting, then the table-driven special cases (which also stores `bytes`
  // as the previous vector for the next call).
  check_length_errors(bytes, bit_7);
  check_special_cases(bytes);
}
|
||||
|
||||
simdjson_really_inline void check_next_input(const simd8<uint8_t> bytes) {
  // Validates one vector, skipping the full checks when no byte has its top bit set.
  const vmask_t bit_7 = bytes.get_bit<7>();

  if (!simdjson_unlikely(bit_7)) {
    // Pure ASCII: the only possible error is a continuation byte still owed by
    // the previous vector. prev_bytes is left untouched on this path.
    this->length_errors |= this->last_cont;
    return;
  }

  // TODO (@jkeiser): To work with simdjson's caller model, the calculation of
  // shifted_bytes was moved inside check_utf8_bytes. This may add an extra
  // instruction to the hot path (saving prev_bytes), though 2 register accesses
  // vs. 1 memory access might be a wash. Come back and try the other way.
  this->check_utf8_bytes(bytes, bit_7);
}
|
||||
|
||||
simdjson_really_inline void check_next_input(const simd8x64<uint8_t>& in) {
  // Feeds each chunk of the block to the single-vector overload, in order.
  int chunk = 0;
  while (chunk < simd8x64<uint8_t>::NUM_CHUNKS) {
    this->check_next_input(in.chunks[chunk]);
    chunk++;
  }
}
|
||||
|
||||
simdjson_really_inline error_code errors() {
  // Reports UTF8_ERROR if any special-case or length error has been
  // accumulated so far, SUCCESS otherwise. Continuation bytes still owed past
  // the last vector are NOT included here; call check_eof() first once the
  // whole input has been fed in.
  return (this->special_case_errors.any_bits_set_anywhere() | this->length_errors) ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
}

simdjson_really_inline void check_eof() {
  // To be called once, after the final vector. If the input ended while a
  // multi-byte sequence was still waiting for continuation bytes, last_cont is
  // non-zero and no later check_next_input() call will ever look at it, so
  // fold it into length_errors here (same operation as the ASCII fast path).
  this->length_errors |= vmask_t(this->last_cont);
}
|
||||
}; // struct utf8_checker
|
||||
|
||||
} // namespace SIMDJSON_IMPLEMENTATION
|
||||
} // unnamed namespace
|
|
@ -1270,6 +1270,14 @@ namespace validate_tests {
|
|||
}
|
||||
return true;
|
||||
}
|
||||
bool test_range() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
for(size_t len = 0; len <= 128; len++) {
|
||||
std::vector<uint8_t> source(len,' ');
|
||||
if(!simdjson::validate_utf8((const char*)source.data(), source.size())) { return false; }
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool test_bad_validate() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
|
@ -1279,8 +1287,46 @@ namespace validate_tests {
|
|||
}
|
||||
return true;
|
||||
}
|
||||
bool test_issue1169() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
std::vector<uint8_t> source(64,' ');
|
||||
for(size_t idx = 0; idx < 64; idx++) {
|
||||
source[idx] = 255;
|
||||
if(simdjson::validate_utf8((const char*)source.data(), source.size())) { return false; }
|
||||
source[idx] = 0;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
bool test_issue1169_long() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
for(size_t len = 1; len <= 128; len++) {
|
||||
std::vector<uint8_t> source(len,' ');
|
||||
source[len-1] = 255;
|
||||
if(simdjson::validate_utf8((const char*)source.data(), source.size())) { return false; }
|
||||
}
|
||||
return true;
|
||||
}
|
||||
bool test_random() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
std::vector<uint8_t> source(64,' ');
|
||||
const simdjson::implementation *impl_fallback = simdjson::available_implementations["fallback"];
|
||||
if(!impl_fallback) { return true; }
|
||||
for(size_t i = 0; i < 10000; i++) {
|
||||
std::vector<uint8_t>& s(source);
|
||||
s[i%64] ^= uint8_t(1235 * i);
|
||||
const bool active_ok = simdjson::active_implementation->validate_utf8((const char*)s.data(), s.size());
|
||||
const bool fallback_ok = impl_fallback->validate_utf8((const char*)s.data(), s.size());
|
||||
if(active_ok != fallback_ok) { return false; }
|
||||
s[i%64] ^= uint8_t(1235 * i);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
bool run() {
|
||||
return test_validate() &&
|
||||
return test_range() &&
|
||||
test_issue1169_long() &&
|
||||
test_issue1169() &&
|
||||
test_random() &&
|
||||
test_validate() &&
|
||||
test_bad_validate();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue