Unneeded.

2021-02-02 17:58:10 -05:00 · 2021-02-02 17:58:10 -05:00 · 26b5b19f88
parent 5449365658
commit 26b5b19f88
1 changed files with 0 additions and 367 deletions
--- a/src/generic/stage1/utf8_zwegner_algorithm.h
+++ b/src/generic/stage1/utf8_zwegner_algorithm.h
@ -1,367 +0,0 @@
-namespace simdjson {
-namespace simdjson {
-namespace SIMDJSON_IMPLEMENTATION {
-//
-// Detect UTF-8 errors.
-//
-// Copied and adapted from algorithm by @zwegner: https://github.com/zwegner/faster-utf8-validator
-//
-// UTF-8 Refresher
-// ---------------
-//
-// UTF-8 is designed to allow multiple bytes and be compatible with ASCII. It's a fairly basic
-// encoding that uses the first few bits on each byte to denote a "byte type", and all other bits
-// are straight up concatenated into the final value. The first byte of a multibyte character is a
-// "leading byte" and starts with N 1's, where N is the total number of bytes (110_____ = 2 byte
-// lead). The remaining bytes of a multibyte character all start with 10. 1-byte characters just
-// start with 0, because that's what ASCII looks like. Here's what each size
-//
-// | Character Length            | UTF-8 Byte Sequence                   |
-// |-----------------------------|---------------------------------------|
-// | ASCII (7 bits):             | `0_______`                            |
-// | 2 byte character (11 bits)  | `110_____ 10______`                   |
-// | 3 byte character (17 bits)  | `1110____ 10______ 10______`          |
-// | 4 byte character (23 bits)  | `11110___ 10______ 10______ 10______` |
-// | 5+ byte character (illegal) | `11111___` <illegal>                  |
-//
-// UTF-8 Error Classes
-// -------------------
-//
-// There are 5 classes of error that can happen in UTF-8:
-//
-// ### Too short (missing continuations)
-//
-// TOO_SHORT: when you have a multibyte character with too few bytes (i.e. missing continuation).
-// We detect this by looking for new characters (lead bytes) inside the range of a multibyte
-// character.
-//
-// e.g. `11000000 01100001` (2-byte character where second byte is ASCII)
-//
-// ### Too long (stray continuations)
-//
-// TOO_LONG: when there are more bytes in your character than you need (i.e. extra continuation).
-// We detect this by requiring that the next byte after your multibyte character be a new
-// character--so a continuation after your character is wrong.
-//
-// e.g. `11011111 10111111 10111111` (2-byte character followed by *another* continuation byte)
-//
-// ### Too large (out of range for unicode)
-//
-// TOO_LARGE: Unicode only goes up to U+10FFFF. These characters are too large.
-//
-// e.g. `11110111 10111111 10111111 10111111` (bigger than 10FFFF).
-//
-// ### Overlong encoding (used more bytes than needed)
-//
-// Multibyte characters with a bunch of leading zeroes, where you could have
-// used fewer bytes to make the same character, are considered *overlong encodings*. They are
-// disallowed in UTF-8 to ensure there is only one way to write a single Unicode codepoint, making strings
-// easier to search. Like encoding an ASCII character in 2 bytes is technically possible, but UTF-8
-// disallows it so that you only have to search for the ASCII character `a` to find it.
-//
-// e.g. `11000001 10100001` (2-byte encoding of "a", which only requires 1 byte: 01100001)
-//
-// ### Surrogate characters
-//
-// Unicode U+D800-U+DFFF is a *surrogate* character, reserved for use in UCS-2 and WTF-8 encodings
-// for characters with > 2 bytes. These are illegal in pure UTF-8.
-//
-// e.g. `11101101 10100000 10000000` (U+D800)
-//
-// ### 5+ byte characters
-//
-// INVALID_5_BYTE: 5-byte, 6-byte, 7-byte and 8-byte characters are unsupported; Unicode does not
-// support values with more than 23 bits (which a 4-byte character supports).
-//
-// Even if these were supported, anything with 5 bytes would be either too large (bigger than the
-// Unicode max value), or overlong (could fit in 4+ bytes).
-//
-// e.g. `11111000 10100000 10000000 10000000 10000000` (U+800000)
-//
-// Legal utf-8 byte sequences per  http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94:
-//
-//  |  Code Points       |  1st   |  2nd   |   3s   |   4s   |
-//  |--------------------|--------|--------|--------|--------|
-//  | U+0000..U+007F     | 00..7F |        |        |        |
-//  | U+0080..U+07FF     | C2..DF | 80..BF |        |        |
-//  | U+0800..U+0FFF     | E0     | A0..BF | 80..BF |        |
-//  | U+1000..U+CFFF     | E1..EC | 80..BF | 80..BF |        |
-//  | U+D000..U+D7FF     | ED     | 80..9F | 80..BF |        |
-//  | U+E000..U+FFFF     | EE..EF | 80..BF | 80..BF |        |
-//  | U+10000..U+3FFFF   | F0     | 90..BF | 80..BF | 80..BF |
-//  | U+40000..U+FFFFF   | F1..F3 | 80..BF | 80..BF | 80..BF |
-//  | U+100000..U+10FFFF | F4     | 80..8F | 80..BF | 80..BF |
-//
-// Algorithm
-// ---------
-//
-// This validator works in two basic steps: checking continuation bytes, and
-// handling special cases. Each step works on one vector's worth of input
-// bytes at a time.
-//
-using namespace simd;
-
-using vmask_t = simd8<bool>::bitmask_t;
-using vmask2_t = simd8<bool>::bitmask2_t;
-
-struct utf8_checker {
-  simd8<uint8_t> special_case_errors;
-  simd8<uint8_t> prev_bytes;
-  vmask2_t last_cont;
-  vmask_t length_errors;
-
-  //
-  // Check for missing / extra continuation bytes.
-  //
-  // The continuation bytes are handled in a fairly straightforward manner in
-  // the scalar domain. A mask is created from the input byte vector for each
-  // of the highest four bits of every byte. The first mask allows us to quickly
-  // skip pure ASCII input vectors, which have no bits set. The first and
-  // (inverted) second masks together give us every continuation byte (10xxxxxx).
-  // The other masks are used to find prefixes of multi-byte code points (110,
-  // 1110, 11110). For these, we keep a "required continuation" mask, by shifting
-  // these masks 1, 2, and 3 bits respectively forward in the byte stream. That
-  // is, we take a mask of all bytes that start with 11, and shift it left one
-  // bit forward to get the mask of all the first continuation bytes, then do the
-  // same for the second and third continuation bytes. Here's an example input
-  // sequence along with the corresponding masks:
-  //
-  //   bytes:        61 C3 80 62 E0 A0 80 63 F0 90 80 80 00
-  //   code points:  61|C3 80|62|E0 A0 80|63|F0 90 80 80|00
-  //   # of bytes:   1 |2  - |1 |3  -  - |1 |4  -  -  - |1
-  //   cont. mask 1: -  -  1  -  -  1  -  -  -  1  -  -  -
-  //   cont. mask 2: -  -  -  -  -  -  1  -  -  -  1  -  -
-  //   cont. mask 3: -  -  -  -  -  -  -  -  -  -  -  1  -
-  //   cont. mask *: 0  0  1  0  0  1  1  0  0  1  1  1  0
-  //
-  // The final required continuation mask is then compared with the mask of
-  // actual continuation bytes, and must match exactly in valid UTF-8. The only
-  // complication in this step is that the shifted masks can cross vector
-  // boundaries, so we need to keep a "carry" mask of the bits that were shifted
-  // past the boundary in the last loop iteration.
-  //
-  simdjson_really_inline void check_length_errors(const simd8<uint8_t> bytes, const vmask_t bit_7) {
-    // Compute the continuation byte mask by finding bytes that start with
-    // 11x, 111x, and 1111. For each of these prefixes, we get a bitmask
-    // and shift it forward by 1, 2, or 3. This loop should be unrolled by
-    // the compiler, and the (n == 1) branch inside eliminated.
-    //
-    // NOTE (@jkeiser): I unrolled the for(i=1..3) loop because I don't trust compiler unrolling
-    // anymore. This should be exactly equivalent and yield the same optimizations (and also lets
-    // us rearrange statements if we so desire).
-
-    // We add the shifted mask here instead of ORing it, which would
-    // be the more natural operation, so that this line can be done
-    // with one lea. While adding could give a different result due
-    // to carries, this will only happen for invalid UTF-8 sequences,
-    // and in a way that won't cause it to pass validation. Reasoning:
-    // Any bits for required continuation bytes come after the bits
-    // for their leader bytes, and are all contiguous. For a carry to
-    // happen, two of these bit sequences would have to overlap. If
-    // this is the case, there is a leader byte before the second set
-    // of required continuation bytes (and thus before the bit that
-    // will be cleared by a carry). This leader byte will not be
-    // in the continuation mask, despite being required. QEDish.
-    // Which bytes are required to be continuation bytes
-    vmask2_t cont_required = this->last_cont;
-
-    // 2-byte lead: 11______
-    const vmask_t bit_6 = bytes.get_bit<6>();
-    const vmask_t lead_2_plus = bit_7 & bit_6;       // 11______
-    cont_required += vmask2_t(lead_2_plus) << 1;
-
-    // 3-byte lead: 111_____
-    const vmask_t bit_5 = bytes.get_bit<5>();
-    const vmask_t lead_3_plus = lead_2_plus & bit_5; // 111_____
-    cont_required += vmask2_t(lead_3_plus) << 2;
-
-    // 4-byte lead: 1111____
-    const vmask_t bit_4 = bytes.get_bit<4>();
-    const vmask_t lead_4_plus = lead_3_plus & bit_4;
-    cont_required += vmask2_t(lead_4_plus) << 3;
-
-    const vmask_t cont = bit_7 ^ lead_2_plus;        // 10______ TODO &~ bit_6 might be fine, and involve less data dependency
-
-    // Check that continuation bytes match. We must cast req from vmask2_t
-    // (which holds the carry mask in the upper half) to vmask_t, which
-    // zeroes out the upper bits
-    //
-    // NOTE (@jkeiser): I turned the if() statement here into this->has_error for performance in
-    // success cases: instead of spending time testing the result and introducing a branch (which
-    // can affect performance even if it's easily predictable), we test once at the end.
-    // The ^ is equivalent to !=, however, leaving a 1 where the bits are different and 0 where they
-    // are the same.
-    this->length_errors |= cont ^ vmask_t(cont_required);
-
-    this->last_cont = cont_required >> sizeof(simd8<uint8_t>);
-  }
-
-  //
-  // These constants define the set of error flags in check_special_cases().
-  //
-  static const uint8_t OVERLONG_2  = 0x01; // 1100000_         ________         Could have been encoded in 1 byte
-  static const uint8_t OVERLONG_3  = 0x02; // 11100000         100_____         Could have been encoded in 2 bytes
-  static const uint8_t SURROGATE   = 0x04; // 11101010         101_____         Surrogate pairs
-  static const uint8_t TOO_LARGE   = 0x08; // 11110100         (1001|101_)____ > U+10FFFF
-  static const uint8_t TOO_LARGE_2 = 0x10; // 1111(0101..1111) ________       > U+10FFFF
-  static const uint8_t OVERLONG_4  = 0x20; // 11110000         1000____         Could have been encoded in 3 bytes
-
-  //
-  // Check for special-case errors with table lookups on the first 3 nibbles (first 2 bytes).
-  //
-  // Besides the basic prefix coding of UTF-8, there are several invalid byte
-  // sequences that need special handling. These are due to three factors:
-  // code points that could be described in fewer bytes, code points that are
-  // part of a surrogate pair (which are only valid in UTF-16), and code points
-  // that are past the highest valid code point U+10FFFF.
-  //
-  // All of the invalid sequences can be detected by independently observing
-  // the first three nibbles of each code point. Since AVX2 can do a 4-bit/16-byte
-  // lookup in parallel for all 32 bytes in a vector, we can create bit masks
-  // for all of these error conditions, look up the bit masks for the three
-  // nibbles for all input bytes, and AND them together to get a final error mask,
-  // that must be all zero for valid UTF-8. This is somewhat complicated by
-  // needing to shift the error masks from the first and second nibbles forward in
-  // the byte stream to line up with the third nibble.
-  //
-  // We have these possible values for valid UTF-8 sequences, broken down
-  // by the first three nibbles:
-  //
-  //   1st   2nd   3rd   comment
-  //   0..7  0..F        ASCII
-  //   8..B  0..F        continuation bytes
-  //   C     2..F  8..B  C0 xx and C1 xx can be encoded in 1 byte
-  //   D     0..F  8..B  D0..DF are valid with a continuation byte
-  //   E     0     A..B  E0 8x and E0 9x can be encoded with 2 bytes
-  //         1..C  8..B  E1..EC are valid with continuation bytes
-  //         D     8..9  ED Ax and ED Bx correspond to surrogate pairs
-  //         E..F  8..B  EE..EF are valid with continuation bytes
-  //   F     0     9..B  F0 8x can be encoded with 3 bytes
-  //         1..3  8..B  F1..F3 are valid with continuation bytes
-  //         4     8     F4 8F BF BF is the maximum valid code point
-  //
-  // That leaves us with these invalid sequences, which would otherwise fit
-  // into UTF-8's prefix encoding. Each of these invalid sequences needs to
-  // be detected separately, with their own bits in the error mask.
-  //
-  //   1st   2nd   3rd   error bit
-  //   C     0..1  0..F  0x01
-  //   E     0     8..9  0x02
-  //         D     A..B  0x04
-  //   F     0     0..8  0x08
-  //         4     9..F  0x10
-  //         5..F  0..F  0x20
-  //
-  // For every possible value of the first, second, and third nibbles, we keep
-  // a lookup table that contains the bitwise OR of all errors that that nibble
-  // value can cause. For example, the first nibble has zeroes in every entry
-  // except for C, E, and F, and the third nibble lookup has the 0x21 bits in
-  // every entry, since those errors don't depend on the third nibble. After
-  // doing a parallel lookup of the first/second/third nibble values for all
-  // bytes, we AND them together. Only when all three have an error bit in common
-  // do we fail validation.
-  //
-  simdjson_really_inline void check_special_cases(const simd8<uint8_t> bytes) {
-    const simd8<uint8_t> shifted_bytes = bytes.prev<1>(this->prev_bytes);
-    this->prev_bytes = bytes;
-
-    // Look up error masks for three consecutive nibbles. We need to
-    // AND with 0x0F for each one, because vpshufb has the neat
-    // "feature" that negative values in an index byte will result in
-    // a zero.
-    simd8<uint8_t> nibble_1_error = shifted_bytes.shr<4>().lookup_16<uint8_t>(
-        0, 0, 0, 0,
-        0, 0, 0, 0,
-        0, 0, 0, 0,
-
-        OVERLONG_2,                          // [1100]000_         ________        Could have been encoded in 1 byte
-        0,
-        OVERLONG_3 | SURROGATE,              // [1110]0000         100_____        Could have been encoded in 2 bytes
-                                             // [1110]1010         101_____        Surrogate pairs
-        OVERLONG_4 | TOO_LARGE | TOO_LARGE_2 // [1111]0000         1000____        Could have been encoded in 3 bytes
-                                             // [1111]0100         (1001|101_)____ > U+10FFFF
-    );
-
-    simd8<uint8_t> nibble_2_error = (shifted_bytes & 0x0F).lookup_16<uint8_t>(
-      OVERLONG_2 | OVERLONG_3 | OVERLONG_4,  // 1100[000_]       ________        Could have been encoded in 1 byte
-                                             // 1110[0000]       100_____        Could have been encoded in 2 bytes
-                                             // 1111[0000]       1000____        Could have been encoded in 3 bytes
-      OVERLONG_2,
-      0,
-      0,
-
-      TOO_LARGE,                             // 1111[0100]       (1001|101_)____ > U+10FFFF
-      TOO_LARGE_2,                           // 1111[0101..1111] ________        > U+10FFFF
-      TOO_LARGE_2,
-      TOO_LARGE_2,
-
-      TOO_LARGE_2,
-      TOO_LARGE_2,
-      TOO_LARGE_2,
-      TOO_LARGE_2,
-
-      TOO_LARGE_2,
-      TOO_LARGE_2 | SURROGATE,               // 1110[1010]       101_____        Surrogate pairs
-      TOO_LARGE_2, TOO_LARGE_2
-    );
-
-    // Errors that apply no matter what the third byte is
-    const uint8_t CARRY = OVERLONG_2 | TOO_LARGE_2; // 1100000_         [____]____        Could have been encoded in 1 byte
-                                                    // 1111(0101..1111) [____]____        > U+10FFFF
-    simd8<uint8_t> nibble_3_error = bytes.shr<4>().lookup_16<uint8_t>(
-      CARRY, CARRY, CARRY, CARRY,
-
-      CARRY, CARRY, CARRY, CARRY,
-
-      CARRY | OVERLONG_3 | OVERLONG_4,        // 11100000       [100_]____       Could have been encoded in 2 bytes
-                                              // 11110000       [1000]____       Could have been encoded in 3 bytes
-      CARRY | OVERLONG_3 | TOO_LARGE,         // 11100000       [100_]____       Could have been encoded in 2 bytes
-                                              // 11110100       [1001|101_]____  > U+10FFFF
-      CARRY | SURROGATE | TOO_LARGE,          // 11101010       [101_]____       Surrogate pairs
-      CARRY | SURROGATE | TOO_LARGE,
-
-      CARRY, CARRY, CARRY, CARRY
-    );
-
-    // Check if any bits are set in all three error masks
-    //
-    // NOTE (@jkeiser): I turned the if() statement here into this->has_error for performance in
-    // success cases: instead of spending time testing the result and introducing a branch (which
-    // can affect performance even if it's easily predictable), we test once at the end.
-    this->special_case_errors |= nibble_1_error & nibble_2_error & nibble_3_error;
-  }
-
-  // check whether the current bytes are valid UTF-8
-  // at the end of the function, previous gets updated
-  simdjson_really_inline void check_utf8_bytes(const simd8<uint8_t> bytes, const vmask_t bit_7) {
-    this->check_length_errors(bytes, bit_7);
-    this->check_special_cases(bytes);
-  }
-
-  simdjson_really_inline void check_next_input(const simd8<uint8_t> bytes) {
-    vmask_t bit_7 = bytes.get_bit<7>();
-    if (simdjson_unlikely(bit_7)) {
-      // TODO (@jkeiser): To work with simdjson's caller model, I moved the calculation of
-      // shifted_bytes inside check_utf8_bytes. I believe this adds an extra instruction to the hot
-      // path (saving prev_bytes), which is undesirable, though 2 register accesses vs. 1 memory
-      // access might be a wash. Come back and try the other way.
-      this->check_utf8_bytes(bytes, bit_7);
-    } else {
-      this->length_errors |= this->last_cont;
-    }
-  }
-
-  simdjson_really_inline void check_next_input(const simd8x64<uint8_t>& in) {
-    for (int i=0; i<simd8x64<uint8_t>::NUM_CHUNKS; i++) {
-      this->check_next_input(in.chunks[i]);
-    }
-  }
-
-  simdjson_really_inline error_code errors() {
-    return (this->special_case_errors.any_bits_set_anywhere() | this->length_errors) ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
-  }
-}; // struct utf8_checker
-
-} // unnamed namespace
-} // namespace SIMDJSON_IMPLEMENTATION
-} // namespace simdjson