Precalculate the ASCII path

This commit is contained in:
John Keiser 2019-11-20 20:19:39 -08:00
parent 7356b4532f
commit 9b6377fd80
4 changed files with 226 additions and 301 deletions

View File

@ -166,7 +166,9 @@ struct utf8_checker {
this->check_carried_continuations();
} else {
// it is not ascii so we have to do heavy work
in.each([&](auto _in) { this->check_utf8_bytes(_in); });
for (int i=0; i<simd8x64<uint8_t>::NUM_CHUNKS; i++) {
this->check_utf8_bytes(in.chunks[i]);
}
}
}

View File

@ -65,309 +65,231 @@ using namespace simd;
namespace utf8_validation {
//
// These are the bits in lead_flags. Its main purpose is to tell you what kind of lead character
// it is (1,2,3 or 4--or none if it's continuation), but it also maps 4 other bytes that will be
// used to detect other kinds of errors.
//
// LEAD_4 is first because we use a << trick in get_byte_3_4_5_errors to turn LEAD_2 -> LEAD_3,
// LEAD_3 -> LEAD_4, and we want LEAD_4 to turn into nothing since there is no LEAD_5. This trick
// lets us use one constant table instead of 3, possibly saving registers on systems with fewer
// registers.
//
const uint8_t LEAD_4 = 0x01; // [1111]____ 10______ 10______ 10______ (0_|11)__
const uint8_t LEAD_3 = 0x02; // [1110]____ 10______ 10______ (0|11)__
const uint8_t LEAD_2 = 0x04; // [110_]____ 10______ (0|11)__
const uint8_t LEAD_1 = 0x08; // [0___]____ (0|11)__
const uint8_t LEAD_2_PLUS = 0x10; // [11__]____ ...
const uint8_t LEAD_1100 = 0x20; // [1100]____ ...
const uint8_t LEAD_1110 = 0x40; // [1110]____ ...
const uint8_t LEAD_1111 = 0x80; // [1111]____ ...
really_inline simd8<uint8_t> get_lead_flags(const simd8<uint8_t> high_bits, const simd8<uint8_t> prev_high_bits) {
// Total: 2 instructions, 1 constant
// - 1 byte shift (shuffle)
// - 1 table lookup (shuffle)
// - 1 table constant
// high_bits is byte 5, so lead is high_bits.prev<4>()
return high_bits.prev<4>(prev_high_bits).lookup_16<uint8_t>(
LEAD_1, LEAD_1, LEAD_1, LEAD_1, // [0___]____ (ASCII)
LEAD_1, LEAD_1, LEAD_1, LEAD_1, // [0___]____ (ASCII)
0, 0, 0, 0, // [10__]____ (continuation)
LEAD_2 | LEAD_2_PLUS | LEAD_1100, // [1100]____
LEAD_2 | LEAD_2_PLUS, // [110_]____
LEAD_3 | LEAD_2_PLUS | LEAD_1110, // [1110]____
LEAD_4 | LEAD_2_PLUS | LEAD_1111 // [1111]____
);
}
// Find errors in bytes 1 and 2 together (one single multi-nibble &)
really_inline simd8<uint8_t> get_byte_1_2_errors(const simd8<uint8_t> input, const simd8<uint8_t> prev_input, const simd8<uint8_t> high_bits, const simd8<uint8_t> prev_high_bits) {
//
// These are the errors we're going to match for bytes 1-2, by looking at the first three
// nibbles of the character: lead_flags & <low bits of byte 1> & <high bits of byte 2>
//
// The important thing here is that these constants all take up *different* bits, since they
// match different patterns. This is why there are 2 LEAD_4 and 2 LEAD_3s in lead_flags, among
// other things.
//
static const int TOO_SHORT_2 = LEAD_2_PLUS; // 11______ (0___|11__)____
static const int TOO_LONG_1 = LEAD_1; // 0_______ 10______
static const int OVERLONG_2 = LEAD_1100; // 1100000_ ________ (technically we match 10______ but we could match ________, they both yield errors either way)
static const int OVERLONG_3 = LEAD_3; // 11100000 100_____ ________
static const int OVERLONG_4 = LEAD_4; // 11110000 1000____ ________ ________
static const int TOO_LARGE = LEAD_1111; // 11110100 (1001|101_)____
static const int SURROGATE = LEAD_1110; // 11101101 [101_]____
// Total: 4 instructions, 2 constants
// - 2 table lookups (shuffles)
// - 1 byte shift (shuffle)
// - 1 "and"
// - 2 table constants
// After processing the rest of byte 1 (the low bits), we're still not done--we have to check
// byte 2 to be sure which things are errors and which aren't.
// Since input is byte 5, byte 1 is input.prev<4>
const simd8<uint8_t> byte_1_flags = (input.prev<4>(prev_input) & 0x0F).lookup_16<uint8_t>(
// ____[00__] ________
TOO_SHORT_2 | TOO_LONG_1 | OVERLONG_2 | OVERLONG_3 | OVERLONG_4, // ____[0000] ________
TOO_SHORT_2 | TOO_LONG_1 | OVERLONG_2, // ____[0001] ________
TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1,
// ____[01__] ________
TOO_SHORT_2 | TOO_LONG_1 | TOO_LARGE, // ____[0100] ________
TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1,
// ____[10__] ________
TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1,
// ____[11__] ________
TOO_SHORT_2 | TOO_LONG_1,
TOO_SHORT_2 | TOO_LONG_1 | SURROGATE, // ____[1101] ________
TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1
);
// Since high_bits is byte 5, byte 2 is high_bits.prev<3>
const simd8<uint8_t> byte_2_flags = high_bits.prev<3>(prev_high_bits).lookup_16<uint8_t>(
// ASCII: ________ [0___]____
OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2,
// ASCII: ________ [0___]____
OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2,
// Continuations: ________ [10__]____
OVERLONG_2 | TOO_LONG_1 | OVERLONG_3 | OVERLONG_4, // ________ [1000]____
OVERLONG_2 | TOO_LONG_1 | OVERLONG_3 | SURROGATE, // ________ [1001]____
OVERLONG_2 | TOO_LONG_1 | TOO_LARGE | SURROGATE, // ________ [1010]____
OVERLONG_2 | TOO_LONG_1 | TOO_LARGE | SURROGATE, // ________ [1011]____
// Multibyte Leads: ________ [11__]____
OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2
);
return byte_1_flags & byte_2_flags;
}
really_inline simd8<uint8_t> get_byte_3_4_5_errors(const simd8<uint8_t> high_bits, const simd8<uint8_t> prev_high_bits) {
// Total 7 instructions, 3 simd constants:
// - 3 table lookups (shuffles)
// - 2 byte shifts (shuffles)
// - 2 "or"
// - 1 table constant
const simd8<uint8_t> byte_3_table = simd8<uint8_t>::repeat_16(
// TOO_SHORT ASCII: 111_____ ________ [0___]____
LEAD_3, LEAD_3, LEAD_3, LEAD_3,
LEAD_3, LEAD_3, LEAD_3, LEAD_3,
// TOO_LONG Continuations: 110_____ ________ [10__]____
LEAD_2, LEAD_2, LEAD_2, LEAD_2,
// TOO_SHORT Multibyte Leads: 111_____ ________ [11__]____
LEAD_3, LEAD_3, LEAD_3, LEAD_3
);
const simd8<uint8_t> byte_4_table = byte_3_table.shr<1>(); // TOO_SHORT: LEAD_4, TOO_LONG: LEAD_3
const simd8<uint8_t> byte_5_table = byte_3_table.shr<2>(); // TOO_SHORT: <none>, TOO_LONG: LEAD_4
// high_bits is byte 5, high_bits.prev<2> is byte 3 and high_bits.prev<1> is byte 4
return high_bits.prev<2>(prev_high_bits).lookup_16(byte_3_table) |
high_bits.prev<1>(prev_high_bits).lookup_16(byte_4_table) |
high_bits.lookup_16(byte_5_table);
}
// Check whether the current bytes are valid UTF-8.
// At the end of the function, previous gets updated
// This should come down to 22 instructions if table definitions are in registers--30 if not.
really_inline simd8<uint8_t> check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
// When we process bytes M through N, we look for lead characters in M-4 through N-4. This allows
// us to look for all errors related to any lead character at one time (since UTF-8 characters
// can only be up to 4 bytes, and the next byte after a character finishes must be another lead,
// we never need to look more than 4 bytes past the current one to fully validate).
// This way, we have all relevant bytes around and can save ourselves a little overflow and
// several instructions on each loop.
// Total: 22 instructions, 7 simd constants
// Local: 8 instructions, 1 simd constant
// - 2 bit shifts
// - 1 byte shift (shuffle)
// - 3 "or"
// - 1 "and"
// - 1 saturating_sub
// - 1 constant (0b11111000-1)
// lead_flags: 2 instructions, 1 simd constant
// - 1 byte shift (shuffle)
// - 1 table lookup (shuffle)
// - 1 table constant
// byte_1_2_errors: 5 instructions, 2 simd constants
// - 2 table lookups (shuffles)
// - 2 byte shifts (shuffles)
// - 1 "and"
// - 2 table constants
// byte_3_4_5_errors: 7 instructions, 3 simd constants
// - 3 table lookups (shuffles)
// - 2 byte shifts (shuffles)
// - 2 "or"
// - 3 table constants
const simd8<uint8_t> high_bits = input.shr<4>();
const simd8<uint8_t> prev_high_bits = prev_input.shr<4>();
const simd8<uint8_t> lead_flags = get_lead_flags(high_bits, prev_high_bits);
const simd8<uint8_t> byte_1_2_errors = get_byte_1_2_errors(input, prev_input, high_bits, prev_high_bits);
const simd8<uint8_t> byte_3_4_5_errors = get_byte_3_4_5_errors(high_bits, prev_high_bits);
// Detect illegal 5-byte+ Unicode values. We can't do this as part of byte_1_2_errors because
// it would need a third lead_flag = 1111, and we've already used up all 8 between
// byte_1_2_errors and byte_3_4_5_errors.
const simd8<uint8_t> too_large = input.saturating_sub(0b11111000-1); // too-large values will be nonzero
return too_large | (lead_flags & (byte_1_2_errors | byte_3_4_5_errors));
}
// TODO special case start of file, too, so that small documents are efficient! No shifting needed ...
// The only problem that can happen at EOF is that a multibyte character is too short.
really_inline simd8<uint8_t> check_eof(simd8<uint8_t> prev_input) {
// Total: 1 instruction, 1 simd constant
// - 1 saturating_sub
// - 1 simd constant
// If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
// ... 1111____ 111_____ 11______
static const uint8_t last_len[32] = {
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
};
const simd8<uint8_t> max_value(last_len+sizeof(last_len)-sizeof(simd8<uint8_t>));
// If anything is > the desired value, there will be a nonzero value in the result.
return prev_input.saturating_sub(max_value);
}
really_inline simd8<uint8_t> check_input(simd8<uint8_t> input1, simd8<uint8_t> prev_input) {
// Total: 9 simd constants
// - ASCII: 3 instructions, 2 simd constants
// - UTF-8: 24 instructions, 8 simd constants
simd8<uint8_t> bits = input1;
if (likely(!bits.any_bits_set_anywhere(0b10000000u))) {
// This has the same semantics as EOF: we only have to check for multibyte characters in part
// 1 that got cut off
return check_eof(prev_input);
} else {
return check_utf8_bytes(input1, prev_input);
}
}
really_inline simd8<uint8_t> check_input(simd8<uint8_t> input1, simd8<uint8_t> input2, simd8<uint8_t> prev_input) {
// Total: 9 simd constants
// - ASCII: 3 instructions, 2 simd constants
// - UTF-8: 24 instructions, 8 simd constants
simd8<uint8_t> bits = input1 | input2;
if (likely(!bits.any_bits_set_anywhere(0b10000000u))) {
// This has the same semantics as EOF: we only have to check for multibyte characters in part
// 1 that got cut off
return check_eof(prev_input);
} else {
return check_utf8_bytes(input1, prev_input) |
check_utf8_bytes(input2, input1);
}
}
really_inline simd8<uint8_t> check_input(simd8<uint8_t> input1, simd8<uint8_t> input2, simd8<uint8_t> input3, simd8<uint8_t> input4, simd8<uint8_t> prev_input) {
// Total: 9 simd constants
// - ASCII: 3 instructions, 2 simd constants
// - UTF-8: 24 instructions, 8 simd constants
simd8<uint8_t> bits = input1 | input2 | input3 | input4;
if (likely(!bits.any_bits_set_anywhere(0b10000000u))) {
// This has the same semantics as EOF: we only have to check for multibyte characters in part
// 1 that got cut off
return check_eof(prev_input);
} else {
return check_utf8_bytes(input1, prev_input) |
check_utf8_bytes(input2, input1) |
check_utf8_bytes(input3, input2) |
check_utf8_bytes(input4, input3);
}
}
really_inline simd8<uint8_t> check_input(simd8<uint8_t> input1, simd8<uint8_t> input2, simd8<uint8_t> input3, simd8<uint8_t> input4, simd8<uint8_t> input5, simd8<uint8_t> input6, simd8<uint8_t> input7, simd8<uint8_t> input8, simd8<uint8_t> prev_input) {
// Total: 9 simd constants
// - ASCII: 3 instructions, 2 simd constants
// - UTF-8: 24 instructions, 8 simd constants
simd8<uint8_t> bits = input1 | input2 | input3 | input4 | input5 | input6 | input7 | input8;
if (likely(!bits.any_bits_set_anywhere(0b10000000u))) {
// This has the same semantics as EOF: we only have to check for multibyte characters in part
// 1 that got cut off
return check_eof(prev_input);
} else {
return check_utf8_bytes(input1, prev_input) |
check_utf8_bytes(input2, input1) |
check_utf8_bytes(input3, input2) |
check_utf8_bytes(input4, input3) |
check_utf8_bytes(input5, input4) |
check_utf8_bytes(input6, input5) |
check_utf8_bytes(input7, input6) |
check_utf8_bytes(input8, input7);
}
}
template<int N=simd8x64<uint8_t>::NUM_CHUNKS>
really_inline simd8<uint8_t> check_input(simd8x64<uint8_t> input, simd8<uint8_t> &prev_input);
template<>
really_inline simd8<uint8_t> check_input<2>(simd8x64<uint8_t> input, simd8<uint8_t> &prev_input) {
simd8<uint8_t> error = check_input(input.chunks[0], input.chunks[1], prev_input);
prev_input = input.chunks[1];
return error;
}
template<>
really_inline simd8<uint8_t> check_input<4>(simd8x64<uint8_t> input, simd8<uint8_t> &prev_input) {
simd8<uint8_t> error = check_input(input.chunks[0], input.chunks[1], input.chunks[2], input.chunks[3], prev_input);
prev_input = input.chunks[3];
return error;
}
template<int N=simd8x64<uint8_t>::NUM_CHUNKS>
really_inline simd8<uint8_t> check_input(simd8x64<uint8_t> input, simd8x64<uint8_t> input2, simd8<uint8_t> &prev_input);
template<>
really_inline simd8<uint8_t> check_input<2>(simd8x64<uint8_t> input, simd8x64<uint8_t> input2, simd8<uint8_t> &prev_input) {
simd8<uint8_t> error = check_input(input.chunks[0], input.chunks[1], input2.chunks[0], input2.chunks[1], prev_input);
prev_input = input2.chunks[1];
return error;
}
template<>
really_inline simd8<uint8_t> check_input<4>(simd8x64<uint8_t> input, simd8x64<uint8_t> input2, simd8<uint8_t> &prev_input) {
simd8<uint8_t> error = check_input(input.chunks[0], input.chunks[1], input.chunks[2], input.chunks[3], input2.chunks[0], input2.chunks[1], input2.chunks[2], input2.chunks[3], prev_input);
prev_input = input2.chunks[3];
return error;
}
} // namespace utf8_validation
struct utf8_checker {
// If this is nonzero, there has been a UTF-8 error.
simd8<uint8_t> error;
simd8<uint8_t> prev_input;
// The last input we received.
simd8<uint8_t> prev_input_block;
// If there were leads at the end of the previous block, to be continued in the next.
simd8<uint8_t> prev_incomplete;
really_inline void check_next_input(simd8x64<uint8_t> input) {
// Total: 9 simd constants
// [256-bit]
// - ASCII: 4 instructions, 2 simd constants
// - UTF-8: 47 instructions, 8 simd constants (7 of them used twice)
// [128-bit]
// - ASCII: 6 instructions, 2 simd constants
// - UTF-8: 93 instructions, 8 simd constants (7 of them used four times)
//
// These are the bits in lead_flags. Its main purpose is to tell you what kind of lead character
// it is (1,2,3 or 4--or none if it's continuation), but it also maps 4 other bytes that will be
// used to detect other kinds of errors.
//
// LEAD_4 is first because we use a << trick in get_byte_3_4_5_errors to turn LEAD_2 -> LEAD_3,
// LEAD_3 -> LEAD_4, and we want LEAD_4 to turn into nothing since there is no LEAD_5. This trick
// lets us use one constant table instead of 3, possibly saving registers on systems with fewer
// registers.
//
static const uint8_t LEAD_4 = 0x01; // [1111]____ 10______ 10______ 10______ (0_|11)__
static const uint8_t LEAD_3 = 0x02; // [1110]____ 10______ 10______ (0|11)__
static const uint8_t LEAD_2 = 0x04; // [110_]____ 10______ (0|11)__
static const uint8_t LEAD_1 = 0x08; // [0___]____ (0|11)__
static const uint8_t LEAD_2_PLUS = 0x10; // [11__]____ ...
static const uint8_t LEAD_1100 = 0x20; // [1100]____ ...
static const uint8_t LEAD_1110 = 0x40; // [1110]____ ...
static const uint8_t LEAD_1111 = 0x80; // [1111]____ ...
// it is not ascii so we have to do heavy work
this->error |= utf8_validation::check_input(input, this->prev_input);
// Prepare fast_path_error in case the next block is ASCII
really_inline void set_fast_path_error() {
// If any of the last 3 bytes in the input needs a continuation at the start of the next input,
// it is an error for the next input to be ASCII.
// static const uint8_t incomplete_long[32] = {
// 0, 0, 0, 0, 0, 0, 0, 0,
// 0, 0, 0, 0, 0, 0, 0, 0,
// 0, 0, 0, 0, 0, 0, 0, 0,
// 0, 0, 0, 0, 0, LEAD_4, LEAD_4 | LEAD_3, LEAD_4 | LEAD_3 | LEAD_2
// };
// const simd8<uint8_t> incomplete(&incomplete_long[sizeof(incomplete_long) - sizeof(simd8<uint8_t>)]);
// this->prev_incomplete = lead_flags & incomplete;
// If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
// ... 1111____ 111_____ 11______
static const uint8_t last_len[32] = {
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
};
const simd8<uint8_t> max_value(&last_len[sizeof(last_len)-sizeof(simd8<uint8_t>)]);
// If anything is > the desired value, there will be a nonzero value in the result.
this->prev_incomplete = this->prev_input_block.saturating_sub(max_value);
}
really_inline void check_next_input(simd8x64<uint8_t> input, simd8x64<uint8_t> input2) {
this->error |= utf8_validation::check_input(input, input2, this->prev_input);
really_inline simd8<uint8_t> get_lead_flags(const simd8<uint8_t> high_bits, const simd8<uint8_t> prev_high_bits) {
// Total: 2 instructions, 1 constant
// - 1 byte shift (shuffle)
// - 1 table lookup (shuffle)
// - 1 table constant
// high_bits is byte 5, so lead is high_bits.prev<4>()
return high_bits.prev<4>(prev_high_bits).lookup_16<uint8_t>(
LEAD_1, LEAD_1, LEAD_1, LEAD_1, // [0___]____ (ASCII)
LEAD_1, LEAD_1, LEAD_1, LEAD_1, // [0___]____ (ASCII)
0, 0, 0, 0, // [10__]____ (continuation)
LEAD_2 | LEAD_2_PLUS | LEAD_1100, // [1100]____
LEAD_2 | LEAD_2_PLUS, // [110_]____
LEAD_3 | LEAD_2_PLUS | LEAD_1110, // [1110]____
LEAD_4 | LEAD_2_PLUS | LEAD_1111 // [1111]____
);
}
// Find errors in bytes 1 and 2 together (one single multi-nibble &)
really_inline simd8<uint8_t> get_byte_1_2_errors(const simd8<uint8_t> input, const simd8<uint8_t> prev_input, const simd8<uint8_t> high_bits, const simd8<uint8_t> prev_high_bits) {
//
// These are the errors we're going to match for bytes 1-2, by looking at the first three
// nibbles of the character: lead_flags & <low bits of byte 1> & <high bits of byte 2>
//
// The important thing here is that these constants all take up *different* bits, since they
// match different patterns. This is why there are 2 LEAD_4 and 2 LEAD_3s in lead_flags, among
// other things.
//
static const int TOO_SHORT_2 = LEAD_2_PLUS; // 11______ (0___|11__)____
static const int TOO_LONG_1 = LEAD_1; // 0_______ 10______
static const int OVERLONG_2 = LEAD_1100; // 1100000_ ________ (technically we match 10______ but we could match ________, they both yield errors either way)
static const int OVERLONG_3 = LEAD_3; // 11100000 100_____ ________
static const int OVERLONG_4 = LEAD_4; // 11110000 1000____ ________ ________
static const int TOO_LARGE = LEAD_1111; // 11110100 (1001|101_)____
static const int SURROGATE = LEAD_1110; // 11101101 [101_]____
// Total: 4 instructions, 2 constants
// - 2 table lookups (shuffles)
// - 1 byte shift (shuffle)
// - 1 "and"
// - 2 table constants
// After processing the rest of byte 1 (the low bits), we're still not done--we have to check
// byte 2 to be sure which things are errors and which aren't.
// Since input is byte 5, byte 1 is input.prev<4>
const simd8<uint8_t> byte_1_flags = (input.prev<4>(prev_input) & 0x0F).lookup_16<uint8_t>(
// ____[00__] ________
TOO_SHORT_2 | TOO_LONG_1 | OVERLONG_2 | OVERLONG_3 | OVERLONG_4, // ____[0000] ________
TOO_SHORT_2 | TOO_LONG_1 | OVERLONG_2, // ____[0001] ________
TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1,
// ____[01__] ________
TOO_SHORT_2 | TOO_LONG_1 | TOO_LARGE, // ____[0100] ________
TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1,
// ____[10__] ________
TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1,
// ____[11__] ________
TOO_SHORT_2 | TOO_LONG_1,
TOO_SHORT_2 | TOO_LONG_1 | SURROGATE, // ____[1101] ________
TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1
);
// Since high_bits is byte 5, byte 2 is high_bits.prev<3>
const simd8<uint8_t> byte_2_flags = high_bits.prev<3>(prev_high_bits).lookup_16<uint8_t>(
// ASCII: ________ [0___]____
OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2,
// ASCII: ________ [0___]____
OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2,
// Continuations: ________ [10__]____
OVERLONG_2 | TOO_LONG_1 | OVERLONG_3 | OVERLONG_4, // ________ [1000]____
OVERLONG_2 | TOO_LONG_1 | OVERLONG_3 | SURROGATE, // ________ [1001]____
OVERLONG_2 | TOO_LONG_1 | TOO_LARGE | SURROGATE, // ________ [1010]____
OVERLONG_2 | TOO_LONG_1 | TOO_LARGE | SURROGATE, // ________ [1011]____
// Multibyte Leads: ________ [11__]____
OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2
);
return byte_1_flags & byte_2_flags;
}
really_inline simd8<uint8_t> get_byte_3_4_5_errors(const simd8<uint8_t> high_bits, const simd8<uint8_t> prev_high_bits) {
// Total 7 instructions, 3 simd constants:
// - 3 table lookups (shuffles)
// - 2 byte shifts (shuffles)
// - 2 "or"
// - 1 table constant
const simd8<uint8_t> byte_3_table = simd8<uint8_t>::repeat_16(
// TOO_SHORT ASCII: 111_____ ________ [0___]____
LEAD_3, LEAD_3, LEAD_3, LEAD_3,
LEAD_3, LEAD_3, LEAD_3, LEAD_3,
// TOO_LONG Continuations: 110_____ ________ [10__]____
LEAD_2, LEAD_2, LEAD_2, LEAD_2,
// TOO_SHORT Multibyte Leads: 111_____ ________ [11__]____
LEAD_3, LEAD_3, LEAD_3, LEAD_3
);
const simd8<uint8_t> byte_4_table = byte_3_table.shr<1>(); // TOO_SHORT: LEAD_4, TOO_LONG: LEAD_3
const simd8<uint8_t> byte_5_table = byte_3_table.shr<2>(); // TOO_SHORT: <none>, TOO_LONG: LEAD_4
// high_bits is byte 5, high_bits.prev<2> is byte 3 and high_bits.prev<1> is byte 4
return high_bits.prev<2>(prev_high_bits).lookup_16(byte_3_table) |
high_bits.prev<1>(prev_high_bits).lookup_16(byte_4_table) |
high_bits.lookup_16(byte_5_table);
}
// Check whether the current bytes are valid UTF-8.
// At the end of the function, previous gets updated
// This should come down to 22 instructions if table definitions are in registers--30 if not.
really_inline simd8<uint8_t> check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
// When we process bytes M through N, we look for lead characters in M-4 through N-4. This allows
// us to look for all errors related to any lead character at one time (since UTF-8 characters
// can only be up to 4 bytes, and the next byte after a character finishes must be another lead,
// we never need to look more than 4 bytes past the current one to fully validate).
// This way, we have all relevant bytes around and can save ourselves a little overflow and
// several instructions on each loop.
// Total: 22 instructions, 7 simd constants
// Local: 8 instructions, 1 simd constant
// - 2 bit shifts
// - 1 byte shift (shuffle)
// - 3 "or"
// - 1 "and"
// - 1 saturating_sub
// - 1 constant (0b11111000-1)
// lead_flags: 2 instructions, 1 simd constant
// - 1 byte shift (shuffle)
// - 1 table lookup (shuffle)
// - 1 table constant
// byte_1_2_errors: 5 instructions, 2 simd constants
// - 2 table lookups (shuffles)
// - 2 byte shifts (shuffles)
// - 1 "and"
// - 2 table constants
// byte_3_4_5_errors: 7 instructions, 3 simd constants
// - 3 table lookups (shuffles)
// - 2 byte shifts (shuffles)
// - 2 "or"
// - 3 table constants
const simd8<uint8_t> high_bits = input.shr<4>();
const simd8<uint8_t> prev_high_bits = prev_input.shr<4>();
const simd8<uint8_t> lead_flags = get_lead_flags(high_bits, prev_high_bits);
const simd8<uint8_t> byte_1_2_errors = get_byte_1_2_errors(input, prev_input, high_bits, prev_high_bits);
const simd8<uint8_t> byte_3_4_5_errors = get_byte_3_4_5_errors(high_bits, prev_high_bits);
// Detect illegal 5-byte+ Unicode values. We can't do this as part of byte_1_2_errors because
// it would need a third lead_flag = 1111, and we've already used up all 8 between
// byte_1_2_errors and byte_3_4_5_errors.
const simd8<uint8_t> too_large = input.saturating_sub(0b11111000-1); // too-large values will be nonzero
return too_large | (lead_flags & (byte_1_2_errors | byte_3_4_5_errors));
}
// TODO special case start of file, too, so that small documents are efficient! No shifting needed ...
// The only problem that can happen at EOF is that a multibyte character is too short.
really_inline void check_eof() {
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
// possibly finish them.
this->error |= this->prev_incomplete;
}
really_inline void check_next_input(simd8x64<uint8_t> input) {
simd8<uint8_t> bits = input.reduce([&](auto a,auto b) { return a|b; });
if (likely(!bits.any_bits_set_anywhere(0b10000000u))) {
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
// possibly finish them.
this->error |= this->prev_incomplete;
} else {
this->error |= this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
for (int i=1; i<simd8x64<uint8_t>::NUM_CHUNKS; i++) {
this->error |= this->check_utf8_bytes(input.chunks[i], input.chunks[i-1]);
}
this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
this->set_fast_path_error();
}
}
really_inline ErrorValues errors() {
@ -375,6 +297,3 @@ struct utf8_checker {
}
}; // struct utf8_checker
struct utf8_checker;

View File

@ -168,7 +168,9 @@ struct utf8_checker {
this->check_carried_continuations();
} else {
// it is not ascii so we have to do heavy work
in.each([&](auto _in) { this->check_utf8_bytes(_in); });
for (int i=0; i<simd8x64<uint8_t>::NUM_CHUNKS; i++) {
this->check_utf8_bytes(in.chunks[i]);
}
}
}

View File

@ -349,7 +349,9 @@ struct utf8_checker {
}
really_inline void check_next_input(simd8x64<uint8_t> in) {
in.each([&](auto bytes) { this->check_next_input(bytes); });
for (int i=0; i<simd8x64<uint8_t>::NUM_CHUNKS; i++) {
this->check_next_input(in.chunks[i]);
}
}
really_inline ErrorValues errors() {