Unneeded.

This commit is contained in:
Daniel Lemire 2021-02-02 17:58:10 -05:00
parent 5449365658
commit 26b5b19f88
1 changed files with 0 additions and 367 deletions

View File

@ -1,367 +0,0 @@
namespace simdjson {
namespace simdjson {
namespace SIMDJSON_IMPLEMENTATION {
//
// Detect UTF-8 errors.
//
// Copied and adapted from algorithm by @zwegner: https://github.com/zwegner/faster-utf8-validator
//
// UTF-8 Refresher
// ---------------
//
// UTF-8 is designed to allow multiple bytes and be compatible with ASCII. It's a fairly basic
// encoding that uses the first few bits on each byte to denote a "byte type", and all other bits
// are straight up concatenated into the final value. The first byte of a multibyte character is a
// "leading byte" and starts with N 1's, where N is the total number of bytes (110_____ = 2 byte
// lead). The remaining bytes of a multibyte character all start with 10. 1-byte characters just
// start with 0, because that's what ASCII looks like. Here's what each size
//
// | Character Length | UTF-8 Byte Sequence |
// |-----------------------------|---------------------------------------|
// | ASCII (7 bits): | `0_______` |
// | 2 byte character (11 bits) | `110_____ 10______` |
// | 3 byte character (17 bits) | `1110____ 10______ 10______` |
// | 4 byte character (23 bits) | `11110___ 10______ 10______ 10______` |
// | 5+ byte character (illegal) | `11111___` <illegal> |
//
// UTF-8 Error Classes
// -------------------
//
// There are 5 classes of error that can happen in UTF-8:
//
// ### Too short (missing continuations)
//
// TOO_SHORT: when you have a multibyte character with too few bytes (i.e. missing continuation).
// We detect this by looking for new characters (lead bytes) inside the range of a multibyte
// character.
//
// e.g. `11000000 01100001` (2-byte character where second byte is ASCII)
//
// ### Too long (stray continuations)
//
// TOO_LONG: when there are more bytes in your character than you need (i.e. extra continuation).
// We detect this by requiring that the next byte after your multibyte character be a new
// character--so a continuation after your character is wrong.
//
// e.g. `11011111 10111111 10111111` (2-byte character followed by *another* continuation byte)
//
// ### Too large (out of range for unicode)
//
// TOO_LARGE: Unicode only goes up to U+10FFFF. These characters are too large.
//
// e.g. `11110111 10111111 10111111 10111111` (bigger than 10FFFF).
//
// ### Overlong encoding (used more bytes than needed)
//
// Multibyte characters with a bunch of leading zeroes, where you could have
// used fewer bytes to make the same character, are considered *overlong encodings*. They are
// disallowed in UTF-8 to ensure there is only one way to write a single Unicode codepoint, making strings
// easier to search. Like encoding an ASCII character in 2 bytes is technically possible, but UTF-8
// disallows it so that you only have to search for the ASCII character `a` to find it.
//
// e.g. `11000001 10100001` (2-byte encoding of "a", which only requires 1 byte: 01100001)
//
// ### Surrogate characters
//
// Unicode U+D800-U+DFFF is a *surrogate* character, reserved for use in UCS-2 and WTF-8 encodings
// for characters with > 2 bytes. These are illegal in pure UTF-8.
//
// e.g. `11101101 10100000 10000000` (U+D800)
//
// ### 5+ byte characters
//
// INVALID_5_BYTE: 5-byte, 6-byte, 7-byte and 8-byte characters are unsupported; Unicode does not
// support values with more than 23 bits (which a 4-byte character supports).
//
// Even if these were supported, anything with 5 bytes would be either too large (bigger than the
// Unicode max value), or overlong (could fit in 4+ bytes).
//
// e.g. `11111000 10100000 10000000 10000000 10000000` (U+800000)
//
// Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94:
//
// | Code Points | 1st | 2nd | 3s | 4s |
// |--------------------|--------|--------|--------|--------|
// | U+0000..U+007F | 00..7F | | | |
// | U+0080..U+07FF | C2..DF | 80..BF | | |
// | U+0800..U+0FFF | E0 | A0..BF | 80..BF | |
// | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | |
// | U+D000..U+D7FF | ED | 80..9F | 80..BF | |
// | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | |
// | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF |
// | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF |
// | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF |
//
// Algorithm
// ---------
//
// This validator works in two basic steps: checking continuation bytes, and
// handling special cases. Each step works on one vector's worth of input
// bytes at a time.
//
using namespace simd;
using vmask_t = simd8<bool>::bitmask_t;
using vmask2_t = simd8<bool>::bitmask2_t;
struct utf8_checker {
simd8<uint8_t> special_case_errors;
simd8<uint8_t> prev_bytes;
vmask2_t last_cont;
vmask_t length_errors;
//
// Check for missing / extra continuation bytes.
//
// The continuation bytes are handled in a fairly straightforward manner in
// the scalar domain. A mask is created from the input byte vector for each
// of the highest four bits of every byte. The first mask allows us to quickly
// skip pure ASCII input vectors, which have no bits set. The first and
// (inverted) second masks together give us every continuation byte (10xxxxxx).
// The other masks are used to find prefixes of multi-byte code points (110,
// 1110, 11110). For these, we keep a "required continuation" mask, by shifting
// these masks 1, 2, and 3 bits respectively forward in the byte stream. That
// is, we take a mask of all bytes that start with 11, and shift it left one
// bit forward to get the mask of all the first continuation bytes, then do the
// same for the second and third continuation bytes. Here's an example input
// sequence along with the corresponding masks:
//
// bytes: 61 C3 80 62 E0 A0 80 63 F0 90 80 80 00
// code points: 61|C3 80|62|E0 A0 80|63|F0 90 80 80|00
// # of bytes: 1 |2 - |1 |3 - - |1 |4 - - - |1
// cont. mask 1: - - 1 - - 1 - - - 1 - - -
// cont. mask 2: - - - - - - 1 - - - 1 - -
// cont. mask 3: - - - - - - - - - - - 1 -
// cont. mask *: 0 0 1 0 0 1 1 0 0 1 1 1 0
//
// The final required continuation mask is then compared with the mask of
// actual continuation bytes, and must match exactly in valid UTF-8. The only
// complication in this step is that the shifted masks can cross vector
// boundaries, so we need to keep a "carry" mask of the bits that were shifted
// past the boundary in the last loop iteration.
//
simdjson_really_inline void check_length_errors(const simd8<uint8_t> bytes, const vmask_t bit_7) {
// Compute the continuation byte mask by finding bytes that start with
// 11x, 111x, and 1111. For each of these prefixes, we get a bitmask
// and shift it forward by 1, 2, or 3. This loop should be unrolled by
// the compiler, and the (n == 1) branch inside eliminated.
//
// NOTE (@jkeiser): I unrolled the for(i=1..3) loop because I don't trust compiler unrolling
// anymore. This should be exactly equivalent and yield the same optimizations (and also lets
// us rearrange statements if we so desire).
// We add the shifted mask here instead of ORing it, which would
// be the more natural operation, so that this line can be done
// with one lea. While adding could give a different result due
// to carries, this will only happen for invalid UTF-8 sequences,
// and in a way that won't cause it to pass validation. Reasoning:
// Any bits for required continuation bytes come after the bits
// for their leader bytes, and are all contiguous. For a carry to
// happen, two of these bit sequences would have to overlap. If
// this is the case, there is a leader byte before the second set
// of required continuation bytes (and thus before the bit that
// will be cleared by a carry). This leader byte will not be
// in the continuation mask, despite being required. QEDish.
// Which bytes are required to be continuation bytes
vmask2_t cont_required = this->last_cont;
// 2-byte lead: 11______
const vmask_t bit_6 = bytes.get_bit<6>();
const vmask_t lead_2_plus = bit_7 & bit_6; // 11______
cont_required += vmask2_t(lead_2_plus) << 1;
// 3-byte lead: 111_____
const vmask_t bit_5 = bytes.get_bit<5>();
const vmask_t lead_3_plus = lead_2_plus & bit_5; // 111_____
cont_required += vmask2_t(lead_3_plus) << 2;
// 4-byte lead: 1111____
const vmask_t bit_4 = bytes.get_bit<4>();
const vmask_t lead_4_plus = lead_3_plus & bit_4;
cont_required += vmask2_t(lead_4_plus) << 3;
const vmask_t cont = bit_7 ^ lead_2_plus; // 10______ TODO &~ bit_6 might be fine, and involve less data dependency
// Check that continuation bytes match. We must cast req from vmask2_t
// (which holds the carry mask in the upper half) to vmask_t, which
// zeroes out the upper bits
//
// NOTE (@jkeiser): I turned the if() statement here into this->has_error for performance in
// success cases: instead of spending time testing the result and introducing a branch (which
// can affect performance even if it's easily predictable), we test once at the end.
// The ^ is equivalent to !=, however, leaving a 1 where the bits are different and 0 where they
// are the same.
this->length_errors |= cont ^ vmask_t(cont_required);
this->last_cont = cont_required >> sizeof(simd8<uint8_t>);
}
//
// These constants define the set of error flags in check_special_cases().
//
static const uint8_t OVERLONG_2 = 0x01; // 1100000_ ________ Could have been encoded in 1 byte
static const uint8_t OVERLONG_3 = 0x02; // 11100000 100_____ Could have been encoded in 2 bytes
static const uint8_t SURROGATE = 0x04; // 11101010 101_____ Surrogate pairs
static const uint8_t TOO_LARGE = 0x08; // 11110100 (1001|101_)____ > U+10FFFF
static const uint8_t TOO_LARGE_2 = 0x10; // 1111(0101..1111) ________ > U+10FFFF
static const uint8_t OVERLONG_4 = 0x20; // 11110000 1000____ Could have been encoded in 3 bytes
//
// Check for special-case errors with table lookups on the first 3 nibbles (first 2 bytes).
//
// Besides the basic prefix coding of UTF-8, there are several invalid byte
// sequences that need special handling. These are due to three factors:
// code points that could be described in fewer bytes, code points that are
// part of a surrogate pair (which are only valid in UTF-16), and code points
// that are past the highest valid code point U+10FFFF.
//
// All of the invalid sequences can be detected by independently observing
// the first three nibbles of each code point. Since AVX2 can do a 4-bit/16-byte
// lookup in parallel for all 32 bytes in a vector, we can create bit masks
// for all of these error conditions, look up the bit masks for the three
// nibbles for all input bytes, and AND them together to get a final error mask,
// that must be all zero for valid UTF-8. This is somewhat complicated by
// needing to shift the error masks from the first and second nibbles forward in
// the byte stream to line up with the third nibble.
//
// We have these possible values for valid UTF-8 sequences, broken down
// by the first three nibbles:
//
// 1st 2nd 3rd comment
// 0..7 0..F ASCII
// 8..B 0..F continuation bytes
// C 2..F 8..B C0 xx and C1 xx can be encoded in 1 byte
// D 0..F 8..B D0..DF are valid with a continuation byte
// E 0 A..B E0 8x and E0 9x can be encoded with 2 bytes
// 1..C 8..B E1..EC are valid with continuation bytes
// D 8..9 ED Ax and ED Bx correspond to surrogate pairs
// E..F 8..B EE..EF are valid with continuation bytes
// F 0 9..B F0 8x can be encoded with 3 bytes
// 1..3 8..B F1..F3 are valid with continuation bytes
// 4 8 F4 8F BF BF is the maximum valid code point
//
// That leaves us with these invalid sequences, which would otherwise fit
// into UTF-8's prefix encoding. Each of these invalid sequences needs to
// be detected separately, with their own bits in the error mask.
//
// 1st 2nd 3rd error bit
// C 0..1 0..F 0x01
// E 0 8..9 0x02
// D A..B 0x04
// F 0 0..8 0x08
// 4 9..F 0x10
// 5..F 0..F 0x20
//
// For every possible value of the first, second, and third nibbles, we keep
// a lookup table that contains the bitwise OR of all errors that that nibble
// value can cause. For example, the first nibble has zeroes in every entry
// except for C, E, and F, and the third nibble lookup has the 0x21 bits in
// every entry, since those errors don't depend on the third nibble. After
// doing a parallel lookup of the first/second/third nibble values for all
// bytes, we AND them together. Only when all three have an error bit in common
// do we fail validation.
//
simdjson_really_inline void check_special_cases(const simd8<uint8_t> bytes) {
const simd8<uint8_t> shifted_bytes = bytes.prev<1>(this->prev_bytes);
this->prev_bytes = bytes;
// Look up error masks for three consecutive nibbles. We need to
// AND with 0x0F for each one, because vpshufb has the neat
// "feature" that negative values in an index byte will result in
// a zero.
simd8<uint8_t> nibble_1_error = shifted_bytes.shr<4>().lookup_16<uint8_t>(
0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 0,
OVERLONG_2, // [1100]000_ ________ Could have been encoded in 1 byte
0,
OVERLONG_3 | SURROGATE, // [1110]0000 100_____ Could have been encoded in 2 bytes
// [1110]1010 101_____ Surrogate pairs
OVERLONG_4 | TOO_LARGE | TOO_LARGE_2 // [1111]0000 1000____ Could have been encoded in 3 bytes
// [1111]0100 (1001|101_)____ > U+10FFFF
);
simd8<uint8_t> nibble_2_error = (shifted_bytes & 0x0F).lookup_16<uint8_t>(
OVERLONG_2 | OVERLONG_3 | OVERLONG_4, // 1100[000_] ________ Could have been encoded in 1 byte
// 1110[0000] 100_____ Could have been encoded in 2 bytes
// 1111[0000] 1000____ Could have been encoded in 3 bytes
OVERLONG_2,
0,
0,
TOO_LARGE, // 1111[0100] (1001|101_)____ > U+10FFFF
TOO_LARGE_2, // 1111[0101..1111] ________ > U+10FFFF
TOO_LARGE_2,
TOO_LARGE_2,
TOO_LARGE_2,
TOO_LARGE_2,
TOO_LARGE_2,
TOO_LARGE_2,
TOO_LARGE_2,
TOO_LARGE_2 | SURROGATE, // 1110[1010] 101_____ Surrogate pairs
TOO_LARGE_2, TOO_LARGE_2
);
// Errors that apply no matter what the third byte is
const uint8_t CARRY = OVERLONG_2 | TOO_LARGE_2; // 1100000_ [____]____ Could have been encoded in 1 byte
// 1111(0101..1111) [____]____ > U+10FFFF
simd8<uint8_t> nibble_3_error = bytes.shr<4>().lookup_16<uint8_t>(
CARRY, CARRY, CARRY, CARRY,
CARRY, CARRY, CARRY, CARRY,
CARRY | OVERLONG_3 | OVERLONG_4, // 11100000 [100_]____ Could have been encoded in 2 bytes
// 11110000 [1000]____ Could have been encoded in 3 bytes
CARRY | OVERLONG_3 | TOO_LARGE, // 11100000 [100_]____ Could have been encoded in 2 bytes
// 11110100 [1001|101_]____ > U+10FFFF
CARRY | SURROGATE | TOO_LARGE, // 11101010 [101_]____ Surrogate pairs
CARRY | SURROGATE | TOO_LARGE,
CARRY, CARRY, CARRY, CARRY
);
// Check if any bits are set in all three error masks
//
// NOTE (@jkeiser): I turned the if() statement here into this->has_error for performance in
// success cases: instead of spending time testing the result and introducing a branch (which
// can affect performance even if it's easily predictable), we test once at the end.
this->special_case_errors |= nibble_1_error & nibble_2_error & nibble_3_error;
}
// check whether the current bytes are valid UTF-8
// at the end of the function, previous gets updated
simdjson_really_inline void check_utf8_bytes(const simd8<uint8_t> bytes, const vmask_t bit_7) {
this->check_length_errors(bytes, bit_7);
this->check_special_cases(bytes);
}
simdjson_really_inline void check_next_input(const simd8<uint8_t> bytes) {
vmask_t bit_7 = bytes.get_bit<7>();
if (simdjson_unlikely(bit_7)) {
// TODO (@jkeiser): To work with simdjson's caller model, I moved the calculation of
// shifted_bytes inside check_utf8_bytes. I believe this adds an extra instruction to the hot
// path (saving prev_bytes), which is undesirable, though 2 register accesses vs. 1 memory
// access might be a wash. Come back and try the other way.
this->check_utf8_bytes(bytes, bit_7);
} else {
this->length_errors |= this->last_cont;
}
}
simdjson_really_inline void check_next_input(const simd8x64<uint8_t>& in) {
for (int i=0; i<simd8x64<uint8_t>::NUM_CHUNKS; i++) {
this->check_next_input(in.chunks[i]);
}
}
simdjson_really_inline error_code errors() {
return (this->special_case_errors.any_bits_set_anywhere() | this->length_errors) ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
}
}; // struct utf8_checker
} // unnamed namespace
} // namespace SIMDJSON_IMPLEMENTATION
} // namespace simdjson