diff --git a/src/generic/stage1/utf8_zwegner_algorithm.h b/src/generic/stage1/utf8_zwegner_algorithm.h deleted file mode 100644 index 3c55b61f..00000000 --- a/src/generic/stage1/utf8_zwegner_algorithm.h +++ /dev/null @@ -1,367 +0,0 @@ -namespace simdjson { -namespace simdjson { -namespace SIMDJSON_IMPLEMENTATION { -// -// Detect UTF-8 errors. -// -// Copied and adapted from algorithm by @zwegner: https://github.com/zwegner/faster-utf8-validator -// -// UTF-8 Refresher -// --------------- -// -// UTF-8 is designed to allow multiple bytes and be compatible with ASCII. It's a fairly basic -// encoding that uses the first few bits on each byte to denote a "byte type", and all other bits -// are straight up concatenated into the final value. The first byte of a multibyte character is a -// "leading byte" and starts with N 1's, where N is the total number of bytes (110_____ = 2 byte -// lead). The remaining bytes of a multibyte character all start with 10. 1-byte characters just -// start with 0, because that's what ASCII looks like. Here's what each size -// -// | Character Length | UTF-8 Byte Sequence | -// |-----------------------------|---------------------------------------| -// | ASCII (7 bits): | `0_______` | -// | 2 byte character (11 bits) | `110_____ 10______` | -// | 3 byte character (17 bits) | `1110____ 10______ 10______` | -// | 4 byte character (23 bits) | `11110___ 10______ 10______ 10______` | -// | 5+ byte character (illegal) | `11111___` | -// -// UTF-8 Error Classes -// ------------------- -// -// There are 5 classes of error that can happen in UTF-8: -// -// ### Too short (missing continuations) -// -// TOO_SHORT: when you have a multibyte character with too few bytes (i.e. missing continuation). -// We detect this by looking for new characters (lead bytes) inside the range of a multibyte -// character. -// -// e.g. `11000000 01100001` (2-byte character where second byte is ASCII) -// -// ### Too long (stray continuations) -// -// TOO_LONG: when there are more bytes in your character than you need (i.e. extra continuation). -// We detect this by requiring that the next byte after your multibyte character be a new -// character--so a continuation after your character is wrong. -// -// e.g. `11011111 10111111 10111111` (2-byte character followed by *another* continuation byte) -// -// ### Too large (out of range for unicode) -// -// TOO_LARGE: Unicode only goes up to U+10FFFF. These characters are too large. -// -// e.g. `11110111 10111111 10111111 10111111` (bigger than 10FFFF). -// -// ### Overlong encoding (used more bytes than needed) -// -// Multibyte characters with a bunch of leading zeroes, where you could have -// used fewer bytes to make the same character, are considered *overlong encodings*. They are -// disallowed in UTF-8 to ensure there is only one way to write a single Unicode codepoint, making strings -// easier to search. Like encoding an ASCII character in 2 bytes is technically possible, but UTF-8 -// disallows it so that you only have to search for the ASCII character `a` to find it. -// -// e.g. `11000001 10100001` (2-byte encoding of "a", which only requires 1 byte: 01100001) -// -// ### Surrogate characters -// -// Unicode U+D800-U+DFFF is a *surrogate* character, reserved for use in UCS-2 and WTF-8 encodings -// for characters with > 2 bytes. These are illegal in pure UTF-8. -// -// e.g. `11101101 10100000 10000000` (U+D800) -// -// ### 5+ byte characters -// -// INVALID_5_BYTE: 5-byte, 6-byte, 7-byte and 8-byte characters are unsupported; Unicode does not -// support values with more than 23 bits (which a 4-byte character supports). -// -// Even if these were supported, anything with 5 bytes would be either too large (bigger than the -// Unicode max value), or overlong (could fit in 4+ bytes). -// -// e.g. `11111000 10100000 10000000 10000000 10000000` (U+800000) -// -// Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94: -// -// | Code Points | 1st | 2nd | 3s | 4s | -// |--------------------|--------|--------|--------|--------| -// | U+0000..U+007F | 00..7F | | | | -// | U+0080..U+07FF | C2..DF | 80..BF | | | -// | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | -// | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | -// | U+D000..U+D7FF | ED | 80..9F | 80..BF | | -// | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | -// | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | -// | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | -// | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | -// -// Algorithm -// --------- -// -// This validator works in two basic steps: checking continuation bytes, and -// handling special cases. Each step works on one vector's worth of input -// bytes at a time. -// -using namespace simd; - -using vmask_t = simd8::bitmask_t; -using vmask2_t = simd8::bitmask2_t; - -struct utf8_checker { - simd8 special_case_errors; - simd8 prev_bytes; - vmask2_t last_cont; - vmask_t length_errors; - - // - // Check for missing / extra continuation bytes. - // - // The continuation bytes are handled in a fairly straightforward manner in - // the scalar domain. A mask is created from the input byte vector for each - // of the highest four bits of every byte. The first mask allows us to quickly - // skip pure ASCII input vectors, which have no bits set. The first and - // (inverted) second masks together give us every continuation byte (10xxxxxx). - // The other masks are used to find prefixes of multi-byte code points (110, - // 1110, 11110). For these, we keep a "required continuation" mask, by shifting - // these masks 1, 2, and 3 bits respectively forward in the byte stream. That - // is, we take a mask of all bytes that start with 11, and shift it left one - // bit forward to get the mask of all the first continuation bytes, then do the - // same for the second and third continuation bytes. Here's an example input - // sequence along with the corresponding masks: - // - // bytes: 61 C3 80 62 E0 A0 80 63 F0 90 80 80 00 - // code points: 61|C3 80|62|E0 A0 80|63|F0 90 80 80|00 - // # of bytes: 1 |2 - |1 |3 - - |1 |4 - - - |1 - // cont. mask 1: - - 1 - - 1 - - - 1 - - - - // cont. mask 2: - - - - - - 1 - - - 1 - - - // cont. mask 3: - - - - - - - - - - - 1 - - // cont. mask *: 0 0 1 0 0 1 1 0 0 1 1 1 0 - // - // The final required continuation mask is then compared with the mask of - // actual continuation bytes, and must match exactly in valid UTF-8. The only - // complication in this step is that the shifted masks can cross vector - // boundaries, so we need to keep a "carry" mask of the bits that were shifted - // past the boundary in the last loop iteration. - // - simdjson_really_inline void check_length_errors(const simd8 bytes, const vmask_t bit_7) { - // Compute the continuation byte mask by finding bytes that start with - // 11x, 111x, and 1111. For each of these prefixes, we get a bitmask - // and shift it forward by 1, 2, or 3. This loop should be unrolled by - // the compiler, and the (n == 1) branch inside eliminated. - // - // NOTE (@jkeiser): I unrolled the for(i=1..3) loop because I don't trust compiler unrolling - // anymore. This should be exactly equivalent and yield the same optimizations (and also lets - // us rearrange statements if we so desire). - - // We add the shifted mask here instead of ORing it, which would - // be the more natural operation, so that this line can be done - // with one lea. While adding could give a different result due - // to carries, this will only happen for invalid UTF-8 sequences, - // and in a way that won't cause it to pass validation. Reasoning: - // Any bits for required continuation bytes come after the bits - // for their leader bytes, and are all contiguous. For a carry to - // happen, two of these bit sequences would have to overlap. If - // this is the case, there is a leader byte before the second set - // of required continuation bytes (and thus before the bit that - // will be cleared by a carry). This leader byte will not be - // in the continuation mask, despite being required. QEDish. - // Which bytes are required to be continuation bytes - vmask2_t cont_required = this->last_cont; - - // 2-byte lead: 11______ - const vmask_t bit_6 = bytes.get_bit<6>(); - const vmask_t lead_2_plus = bit_7 & bit_6; // 11______ - cont_required += vmask2_t(lead_2_plus) << 1; - - // 3-byte lead: 111_____ - const vmask_t bit_5 = bytes.get_bit<5>(); - const vmask_t lead_3_plus = lead_2_plus & bit_5; // 111_____ - cont_required += vmask2_t(lead_3_plus) << 2; - - // 4-byte lead: 1111____ - const vmask_t bit_4 = bytes.get_bit<4>(); - const vmask_t lead_4_plus = lead_3_plus & bit_4; - cont_required += vmask2_t(lead_4_plus) << 3; - - const vmask_t cont = bit_7 ^ lead_2_plus; // 10______ TODO &~ bit_6 might be fine, and involve less data dependency - - // Check that continuation bytes match. We must cast req from vmask2_t - // (which holds the carry mask in the upper half) to vmask_t, which - // zeroes out the upper bits - // - // NOTE (@jkeiser): I turned the if() statement here into this->has_error for performance in - // success cases: instead of spending time testing the result and introducing a branch (which - // can affect performance even if it's easily predictable), we test once at the end. - // The ^ is equivalent to !=, however, leaving a 1 where the bits are different and 0 where they - // are the same. - this->length_errors |= cont ^ vmask_t(cont_required); - - this->last_cont = cont_required >> sizeof(simd8); - } - - // - // These constants define the set of error flags in check_special_cases(). - // - static const uint8_t OVERLONG_2 = 0x01; // 1100000_ ________ Could have been encoded in 1 byte - static const uint8_t OVERLONG_3 = 0x02; // 11100000 100_____ Could have been encoded in 2 bytes - static const uint8_t SURROGATE = 0x04; // 11101010 101_____ Surrogate pairs - static const uint8_t TOO_LARGE = 0x08; // 11110100 (1001|101_)____ > U+10FFFF - static const uint8_t TOO_LARGE_2 = 0x10; // 1111(0101..1111) ________ > U+10FFFF - static const uint8_t OVERLONG_4 = 0x20; // 11110000 1000____ Could have been encoded in 3 bytes - - // - // Check for special-case errors with table lookups on the first 3 nibbles (first 2 bytes). - // - // Besides the basic prefix coding of UTF-8, there are several invalid byte - // sequences that need special handling. These are due to three factors: - // code points that could be described in fewer bytes, code points that are - // part of a surrogate pair (which are only valid in UTF-16), and code points - // that are past the highest valid code point U+10FFFF. - // - // All of the invalid sequences can be detected by independently observing - // the first three nibbles of each code point. Since AVX2 can do a 4-bit/16-byte - // lookup in parallel for all 32 bytes in a vector, we can create bit masks - // for all of these error conditions, look up the bit masks for the three - // nibbles for all input bytes, and AND them together to get a final error mask, - // that must be all zero for valid UTF-8. This is somewhat complicated by - // needing to shift the error masks from the first and second nibbles forward in - // the byte stream to line up with the third nibble. - // - // We have these possible values for valid UTF-8 sequences, broken down - // by the first three nibbles: - // - // 1st 2nd 3rd comment - // 0..7 0..F ASCII - // 8..B 0..F continuation bytes - // C 2..F 8..B C0 xx and C1 xx can be encoded in 1 byte - // D 0..F 8..B D0..DF are valid with a continuation byte - // E 0 A..B E0 8x and E0 9x can be encoded with 2 bytes - // 1..C 8..B E1..EC are valid with continuation bytes - // D 8..9 ED Ax and ED Bx correspond to surrogate pairs - // E..F 8..B EE..EF are valid with continuation bytes - // F 0 9..B F0 8x can be encoded with 3 bytes - // 1..3 8..B F1..F3 are valid with continuation bytes - // 4 8 F4 8F BF BF is the maximum valid code point - // - // That leaves us with these invalid sequences, which would otherwise fit - // into UTF-8's prefix encoding. Each of these invalid sequences needs to - // be detected separately, with their own bits in the error mask. - // - // 1st 2nd 3rd error bit - // C 0..1 0..F 0x01 - // E 0 8..9 0x02 - // D A..B 0x04 - // F 0 0..8 0x08 - // 4 9..F 0x10 - // 5..F 0..F 0x20 - // - // For every possible value of the first, second, and third nibbles, we keep - // a lookup table that contains the bitwise OR of all errors that that nibble - // value can cause. For example, the first nibble has zeroes in every entry - // except for C, E, and F, and the third nibble lookup has the 0x21 bits in - // every entry, since those errors don't depend on the third nibble. After - // doing a parallel lookup of the first/second/third nibble values for all - // bytes, we AND them together. Only when all three have an error bit in common - // do we fail validation. - // - simdjson_really_inline void check_special_cases(const simd8 bytes) { - const simd8 shifted_bytes = bytes.prev<1>(this->prev_bytes); - this->prev_bytes = bytes; - - // Look up error masks for three consecutive nibbles. We need to - // AND with 0x0F for each one, because vpshufb has the neat - // "feature" that negative values in an index byte will result in - // a zero. - simd8 nibble_1_error = shifted_bytes.shr<4>().lookup_16( - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - - OVERLONG_2, // [1100]000_ ________ Could have been encoded in 1 byte - 0, - OVERLONG_3 | SURROGATE, // [1110]0000 100_____ Could have been encoded in 2 bytes - // [1110]1010 101_____ Surrogate pairs - OVERLONG_4 | TOO_LARGE | TOO_LARGE_2 // [1111]0000 1000____ Could have been encoded in 3 bytes - // [1111]0100 (1001|101_)____ > U+10FFFF - ); - - simd8 nibble_2_error = (shifted_bytes & 0x0F).lookup_16( - OVERLONG_2 | OVERLONG_3 | OVERLONG_4, // 1100[000_] ________ Could have been encoded in 1 byte - // 1110[0000] 100_____ Could have been encoded in 2 bytes - // 1111[0000] 1000____ Could have been encoded in 3 bytes - OVERLONG_2, - 0, - 0, - - TOO_LARGE, // 1111[0100] (1001|101_)____ > U+10FFFF - TOO_LARGE_2, // 1111[0101..1111] ________ > U+10FFFF - TOO_LARGE_2, - TOO_LARGE_2, - - TOO_LARGE_2, - TOO_LARGE_2, - TOO_LARGE_2, - TOO_LARGE_2, - - TOO_LARGE_2, - TOO_LARGE_2 | SURROGATE, // 1110[1010] 101_____ Surrogate pairs - TOO_LARGE_2, TOO_LARGE_2 - ); - - // Errors that apply no matter what the third byte is - const uint8_t CARRY = OVERLONG_2 | TOO_LARGE_2; // 1100000_ [____]____ Could have been encoded in 1 byte - // 1111(0101..1111) [____]____ > U+10FFFF - simd8 nibble_3_error = bytes.shr<4>().lookup_16( - CARRY, CARRY, CARRY, CARRY, - - CARRY, CARRY, CARRY, CARRY, - - CARRY | OVERLONG_3 | OVERLONG_4, // 11100000 [100_]____ Could have been encoded in 2 bytes - // 11110000 [1000]____ Could have been encoded in 3 bytes - CARRY | OVERLONG_3 | TOO_LARGE, // 11100000 [100_]____ Could have been encoded in 2 bytes - // 11110100 [1001|101_]____ > U+10FFFF - CARRY | SURROGATE | TOO_LARGE, // 11101010 [101_]____ Surrogate pairs - CARRY | SURROGATE | TOO_LARGE, - - CARRY, CARRY, CARRY, CARRY - ); - - // Check if any bits are set in all three error masks - // - // NOTE (@jkeiser): I turned the if() statement here into this->has_error for performance in - // success cases: instead of spending time testing the result and introducing a branch (which - // can affect performance even if it's easily predictable), we test once at the end. - this->special_case_errors |= nibble_1_error & nibble_2_error & nibble_3_error; - } - - // check whether the current bytes are valid UTF-8 - // at the end of the function, previous gets updated - simdjson_really_inline void check_utf8_bytes(const simd8 bytes, const vmask_t bit_7) { - this->check_length_errors(bytes, bit_7); - this->check_special_cases(bytes); - } - - simdjson_really_inline void check_next_input(const simd8 bytes) { - vmask_t bit_7 = bytes.get_bit<7>(); - if (simdjson_unlikely(bit_7)) { - // TODO (@jkeiser): To work with simdjson's caller model, I moved the calculation of - // shifted_bytes inside check_utf8_bytes. I believe this adds an extra instruction to the hot - // path (saving prev_bytes), which is undesirable, though 2 register accesses vs. 1 memory - // access might be a wash. Come back and try the other way. - this->check_utf8_bytes(bytes, bit_7); - } else { - this->length_errors |= this->last_cont; - } - } - - simdjson_really_inline void check_next_input(const simd8x64& in) { - for (int i=0; i::NUM_CHUNKS; i++) { - this->check_next_input(in.chunks[i]); - } - } - - simdjson_really_inline error_code errors() { - return (this->special_case_errors.any_bits_set_anywhere() | this->length_errors) ? simdjson::UTF8_ERROR : simdjson::SUCCESS; - } -}; // struct utf8_checker - -} // unnamed namespace -} // namespace SIMDJSON_IMPLEMENTATION -} // namespace simdjson