Precalculate the ASCII path

2019-11-20 20:19:39 -08:00 · 2019-11-20 20:19:39 -08:00 · 9b6377fd80
parent 7356b4532f
commit 9b6377fd80
4 changed files with 226 additions and 301 deletions
--- a/src/generic/utf8_fastvalidate_algorithm.h
+++ b/src/generic/utf8_fastvalidate_algorithm.h
@ -166,7 +166,9 @@ struct utf8_checker {
      this->check_carried_continuations();
    } else {
      // it is not ascii so we have to do heavy work
-      in.each([&](auto _in) { this->check_utf8_bytes(_in); });
+      for (int i=0; i<simd8x64<uint8_t>::NUM_CHUNKS; i++) {
+        this->check_utf8_bytes(in.chunks[i]);
+      }
    }
  }

--- a/src/generic/utf8_lookup_algorithm.h
+++ b/src/generic/utf8_lookup_algorithm.h
@ -65,309 +65,231 @@ using namespace simd;

 namespace utf8_validation {

-//
-// These are the bits in lead_flags. Its main purpose is to tell you what kind of lead character
-// it is (1,2,3 or 4--or none if it's continuation), but it also maps 4 other bytes that will be
-// used to detect other kinds of errors.
-//
-// LEAD_4 is first because we use a << trick in get_byte_3_4_5_errors to turn LEAD_2 -> LEAD_3,
-// LEAD_3 -> LEAD_4, and we want LEAD_4 to turn into nothing since there is no LEAD_5. This trick
-// lets us use one constant table instead of 3, possibly saving registers on systems with fewer
-// registers.
-//
-const uint8_t LEAD_4      = 0x01; // [1111]____ 10______ 10______ 10______ (0_|11)__
-const uint8_t LEAD_3      = 0x02; // [1110]____ 10______ 10______ (0|11)__
-const uint8_t LEAD_2      = 0x04; // [110_]____ 10______ (0|11)__
-const uint8_t LEAD_1      = 0x08; // [0___]____ (0|11)__
-const uint8_t LEAD_2_PLUS = 0x10; // [11__]____ ...
-const uint8_t LEAD_1100   = 0x20; // [1100]____ ...
-const uint8_t LEAD_1110   = 0x40; // [1110]____ ...
-const uint8_t LEAD_1111   = 0x80; // [1111]____ ...
-
-really_inline simd8<uint8_t> get_lead_flags(const simd8<uint8_t> high_bits, const simd8<uint8_t> prev_high_bits) {
-  // Total: 2 instructions, 1 constant
-  // - 1 byte shift (shuffle)
-  // - 1 table lookup (shuffle)
-  // - 1 table constant
-
-  // high_bits is byte 5, so lead is high_bits.prev<4>()
-  return high_bits.prev<4>(prev_high_bits).lookup_16<uint8_t>(
-    LEAD_1, LEAD_1, LEAD_1, LEAD_1,   // [0___]____ (ASCII)
-    LEAD_1, LEAD_1, LEAD_1, LEAD_1,   // [0___]____ (ASCII)
-    0,      0,      0,      0,        // [10__]____ (continuation)
-    LEAD_2 | LEAD_2_PLUS | LEAD_1100, // [1100]____
-    LEAD_2 | LEAD_2_PLUS,             // [110_]____
-    LEAD_3 | LEAD_2_PLUS | LEAD_1110, // [1110]____
-    LEAD_4 | LEAD_2_PLUS | LEAD_1111  // [1111]____
-  );
-}
-
-// Find errors in bytes 1 and 2 together (one single multi-nibble &)
-really_inline simd8<uint8_t> get_byte_1_2_errors(const simd8<uint8_t> input, const simd8<uint8_t> prev_input, const simd8<uint8_t> high_bits, const simd8<uint8_t> prev_high_bits) {
-  //
-  // These are the errors we're going to match for bytes 1-2, by looking at the first three
-  // nibbles of the character: lead_flags & <low bits of byte 1> & <high bits of byte 2>
-  //
-  // The important thing here is that these constants all take up *different* bits, since they
-  // match different patterns. This is why there are 2 LEAD_4 and 2 LEAD_3s in lead_flags, among
-  // other things.
-  //
-  static const int TOO_SHORT_2 = LEAD_2_PLUS; // 11______ (0___|11__)____
-  static const int TOO_LONG_1  = LEAD_1;      // 0_______ 10______
-  static const int OVERLONG_2  = LEAD_1100;   // 1100000_ ________ (technically we match 10______ but we could match ________, they both yield errors either way)
-  static const int OVERLONG_3  = LEAD_3;      // 11100000 100_____ ________
-  static const int OVERLONG_4  = LEAD_4;      // 11110000 1000____ ________ ________
-  static const int TOO_LARGE   = LEAD_1111;   // 11110100 (1001|101_)____
-  static const int SURROGATE   = LEAD_1110;   // 11101101 [101_]____
-
-  // Total: 4 instructions, 2 constants
-  // - 2 table lookups (shuffles)
-  // - 1 byte shift (shuffle)
-  // - 1 "and"
-  // - 2 table constants
-
-  // After processing the rest of byte 1 (the low bits), we're still not done--we have to check
-  // byte 2 to be sure which things are errors and which aren't.
-  // Since input is byte 5, byte 1 is input.prev<4>
-  const simd8<uint8_t> byte_1_flags = (input.prev<4>(prev_input) & 0x0F).lookup_16<uint8_t>(
-    // ____[00__] ________
-    TOO_SHORT_2 | TOO_LONG_1 | OVERLONG_2 | OVERLONG_3 | OVERLONG_4, // ____[0000] ________
-    TOO_SHORT_2 | TOO_LONG_1 | OVERLONG_2,                           // ____[0001] ________
-    TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1,
-    // ____[01__] ________
-    TOO_SHORT_2 | TOO_LONG_1 | TOO_LARGE,                            // ____[0100] ________
-    TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1,
-    // ____[10__] ________
-    TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1,
-    // ____[11__] ________
-    TOO_SHORT_2 | TOO_LONG_1,
-    TOO_SHORT_2 | TOO_LONG_1 | SURROGATE,                            // ____[1101] ________
-    TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1
-  );
-  // Since high_bits is byte 5, byte 2 is high_bits.prev<3>
-  const simd8<uint8_t> byte_2_flags = high_bits.prev<3>(prev_high_bits).lookup_16<uint8_t>(
-      // ASCII: ________ [0___]____
-      OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2,
-      // ASCII: ________ [0___]____
-      OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2,
-      // Continuations: ________ [10__]____
-      OVERLONG_2 | TOO_LONG_1 | OVERLONG_3 | OVERLONG_4, // ________ [1000]____
-      OVERLONG_2 | TOO_LONG_1 | OVERLONG_3 | SURROGATE,  // ________ [1001]____
-      OVERLONG_2 | TOO_LONG_1 | TOO_LARGE  | SURROGATE,  // ________ [1010]____
-      OVERLONG_2 | TOO_LONG_1 | TOO_LARGE  | SURROGATE,  // ________ [1011]____
-      // Multibyte Leads: ________ [11__]____
-      OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2
-  );
-  return byte_1_flags & byte_2_flags;
-}
-
-really_inline simd8<uint8_t> get_byte_3_4_5_errors(const simd8<uint8_t> high_bits, const simd8<uint8_t> prev_high_bits) {
-  // Total 7 instructions, 3 simd constants:
-  // - 3 table lookups (shuffles)
-  // - 2 byte shifts (shuffles)
-  // - 2 "or"
-  // - 1 table constant
-
-  const simd8<uint8_t> byte_3_table = simd8<uint8_t>::repeat_16(
-      // TOO_SHORT ASCII:           111_____ ________ [0___]____
-      LEAD_3, LEAD_3, LEAD_3, LEAD_3,
-      LEAD_3, LEAD_3, LEAD_3, LEAD_3,
-      // TOO_LONG  Continuations:   110_____ ________ [10__]____
-      LEAD_2, LEAD_2, LEAD_2, LEAD_2,
-      // TOO_SHORT Multibyte Leads: 111_____ ________ [11__]____
-      LEAD_3, LEAD_3, LEAD_3, LEAD_3
-  );
-  const simd8<uint8_t> byte_4_table = byte_3_table.shr<1>(); // TOO_SHORT: LEAD_4, TOO_LONG: LEAD_3
-  const simd8<uint8_t> byte_5_table = byte_3_table.shr<2>(); // TOO_SHORT: <none>, TOO_LONG: LEAD_4
-
-  // high_bits is byte 5, high_bits.prev<2> is byte 3 and high_bits.prev<1> is byte 4
-  return high_bits.prev<2>(prev_high_bits).lookup_16(byte_3_table) |
-          high_bits.prev<1>(prev_high_bits).lookup_16(byte_4_table) |
-          high_bits.lookup_16(byte_5_table);
-}
-
-// Check whether the current bytes are valid UTF-8.
-// At the end of the function, previous gets updated
-// This should come down to 22 instructions if table definitions are in registers--30 if not.
-really_inline simd8<uint8_t> check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
-  // When we process bytes M through N, we look for lead characters in M-4 through N-4. This allows
-  // us to look for all errors related to any lead character at one time (since UTF-8 characters
-  // can only be up to 4 bytes, and the next byte after a character finishes must be another lead,
-  // we never need to look more than 4 bytes past the current one to fully validate).
-  // This way, we have all relevant bytes around and can save ourselves a little overflow and
-  // several instructions on each loop.
-
-  // Total: 22 instructions, 7 simd constants
-  // Local: 8 instructions, 1 simd constant
-  // - 2 bit shifts
-  // - 1 byte shift (shuffle)
-  // - 3 "or"
-  // - 1 "and"
-  // - 1 saturating_sub
-  // - 1 constant (0b11111000-1)
-  // lead_flags: 2 instructions, 1 simd constant
-  // - 1 byte shift (shuffle)
-  // - 1 table lookup (shuffle)
-  // - 1 table constant
-  // byte_1_2_errors: 5 instructions, 2 simd constants
-  // - 2 table lookups (shuffles)
-  // - 2 byte shifts (shuffles)
-  // - 1 "and"
-  // - 2 table constants
-  // byte_3_4_5_errors: 7 instructions, 3 simd constants
-  // - 3 table lookups (shuffles)
-  // - 2 byte shifts (shuffles)
-  // - 2 "or"
-  // - 3 table constants
-
-  const simd8<uint8_t> high_bits = input.shr<4>();
-  const simd8<uint8_t> prev_high_bits = prev_input.shr<4>();
-  const simd8<uint8_t> lead_flags = get_lead_flags(high_bits, prev_high_bits);
-  const simd8<uint8_t> byte_1_2_errors = get_byte_1_2_errors(input, prev_input, high_bits, prev_high_bits);
-  const simd8<uint8_t> byte_3_4_5_errors = get_byte_3_4_5_errors(high_bits, prev_high_bits);
-  // Detect illegal 5-byte+ Unicode values. We can't do this as part of byte_1_2_errors  because
-  // it would need a third lead_flag = 1111, and we've already used up all 8 between
-  // byte_1_2_errors and byte_3_4_5_errors.
-  const simd8<uint8_t> too_large = input.saturating_sub(0b11111000-1); // too-large values will be nonzero
-  return too_large | (lead_flags & (byte_1_2_errors | byte_3_4_5_errors));
-}
-
-// TODO special case start of file, too, so that small documents are efficient! No shifting needed ...
-
-// The only problem that can happen at EOF is that a multibyte character is too short.
-really_inline simd8<uint8_t> check_eof(simd8<uint8_t> prev_input) {
-  // Total: 1 instruction, 1 simd constant
-  // - 1 saturating_sub
-  // - 1 simd constant
-
-  // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
-  // ... 1111____ 111_____ 11______
-  static const uint8_t last_len[32] = {
-    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
-    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
-    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
-    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
-  };
-  const simd8<uint8_t> max_value(last_len+sizeof(last_len)-sizeof(simd8<uint8_t>));
-  // If anything is > the desired value, there will be a nonzero value in the result.
-  return prev_input.saturating_sub(max_value);
-}
-
-really_inline simd8<uint8_t> check_input(simd8<uint8_t> input1, simd8<uint8_t> prev_input) {
-  // Total: 9 simd constants
-  // - ASCII: 3 instructions, 2 simd constants
-  // - UTF-8: 24 instructions, 8 simd constants
-  simd8<uint8_t> bits = input1;
-  if (likely(!bits.any_bits_set_anywhere(0b10000000u))) {
-    // This has the same semantics as EOF: we only have to check for multibyte characters in part
-    // 1 that got cut off
-    return check_eof(prev_input);
-  } else {
-    return check_utf8_bytes(input1, prev_input);
-  }
-}
-really_inline simd8<uint8_t> check_input(simd8<uint8_t> input1, simd8<uint8_t> input2, simd8<uint8_t> prev_input) {
-  // Total: 9 simd constants
-  // - ASCII: 3 instructions, 2 simd constants
-  // - UTF-8: 24 instructions, 8 simd constants
-  simd8<uint8_t> bits = input1 | input2;
-  if (likely(!bits.any_bits_set_anywhere(0b10000000u))) {
-    // This has the same semantics as EOF: we only have to check for multibyte characters in part
-    // 1 that got cut off
-    return check_eof(prev_input);
-  } else {
-    return check_utf8_bytes(input1, prev_input) |
-           check_utf8_bytes(input2, input1);
-  }
-}
-really_inline simd8<uint8_t> check_input(simd8<uint8_t> input1, simd8<uint8_t> input2, simd8<uint8_t> input3, simd8<uint8_t> input4, simd8<uint8_t> prev_input) {
-  // Total: 9 simd constants
-  // - ASCII: 3 instructions, 2 simd constants
-  // - UTF-8: 24 instructions, 8 simd constants
-  simd8<uint8_t> bits = input1 | input2 | input3 | input4;
-  if (likely(!bits.any_bits_set_anywhere(0b10000000u))) {
-    // This has the same semantics as EOF: we only have to check for multibyte characters in part
-    // 1 that got cut off
-    return check_eof(prev_input);
-  } else {
-    return check_utf8_bytes(input1, prev_input) |
-           check_utf8_bytes(input2, input1) |
-           check_utf8_bytes(input3, input2) |
-           check_utf8_bytes(input4, input3);
-  }
-}
-really_inline simd8<uint8_t> check_input(simd8<uint8_t> input1, simd8<uint8_t> input2, simd8<uint8_t> input3, simd8<uint8_t> input4, simd8<uint8_t> input5, simd8<uint8_t> input6, simd8<uint8_t> input7, simd8<uint8_t> input8, simd8<uint8_t> prev_input) {
-  // Total: 9 simd constants
-  // - ASCII: 3 instructions, 2 simd constants
-  // - UTF-8: 24 instructions, 8 simd constants
-  simd8<uint8_t> bits = input1 | input2 | input3 | input4 | input5 | input6 | input7 | input8;
-  if (likely(!bits.any_bits_set_anywhere(0b10000000u))) {
-    // This has the same semantics as EOF: we only have to check for multibyte characters in part
-    // 1 that got cut off
-    return check_eof(prev_input);
-  } else {
-    return check_utf8_bytes(input1, prev_input) |
-           check_utf8_bytes(input2, input1) |
-           check_utf8_bytes(input3, input2) |
-           check_utf8_bytes(input4, input3) |
-           check_utf8_bytes(input5, input4) |
-           check_utf8_bytes(input6, input5) |
-           check_utf8_bytes(input7, input6) |
-           check_utf8_bytes(input8, input7);
-  }
-}
-
-template<int N=simd8x64<uint8_t>::NUM_CHUNKS>
-really_inline simd8<uint8_t> check_input(simd8x64<uint8_t> input, simd8<uint8_t> &prev_input);
-template<>
-really_inline simd8<uint8_t> check_input<2>(simd8x64<uint8_t> input, simd8<uint8_t> &prev_input) {
-  simd8<uint8_t> error = check_input(input.chunks[0], input.chunks[1], prev_input);
-  prev_input = input.chunks[1];
-  return error;
-}
-template<>
-really_inline simd8<uint8_t> check_input<4>(simd8x64<uint8_t> input, simd8<uint8_t> &prev_input) {
-  simd8<uint8_t> error = check_input(input.chunks[0], input.chunks[1], input.chunks[2], input.chunks[3], prev_input);
-  prev_input = input.chunks[3];
-  return error;
-}
-
-template<int N=simd8x64<uint8_t>::NUM_CHUNKS>
-really_inline simd8<uint8_t> check_input(simd8x64<uint8_t> input, simd8x64<uint8_t> input2, simd8<uint8_t> &prev_input);
-template<>
-really_inline simd8<uint8_t> check_input<2>(simd8x64<uint8_t> input, simd8x64<uint8_t> input2, simd8<uint8_t> &prev_input) {
-  simd8<uint8_t> error = check_input(input.chunks[0], input.chunks[1], input2.chunks[0], input2.chunks[1], prev_input);
-  prev_input = input2.chunks[1];
-  return error;
-}
-template<>
-really_inline simd8<uint8_t> check_input<4>(simd8x64<uint8_t> input, simd8x64<uint8_t> input2, simd8<uint8_t> &prev_input) {
-  simd8<uint8_t> error = check_input(input.chunks[0], input.chunks[1], input.chunks[2], input.chunks[3], input2.chunks[0], input2.chunks[1], input2.chunks[2], input2.chunks[3], prev_input);
-  prev_input = input2.chunks[3];
-  return error;
-}
-
 } // namespace utf8_validation

 struct utf8_checker {
+  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;
-  simd8<uint8_t> prev_input;
+  // The last chunk of the most recent non-ASCII block (ASCII blocks do not update it).
+  simd8<uint8_t> prev_input_block;
+  // Nonzero if the previous block ended with an incomplete multibyte character that the next block must finish.
+  simd8<uint8_t> prev_incomplete;

-  really_inline void check_next_input(simd8x64<uint8_t> input) {
-    // Total: 9 simd constants
-    // [256-bit]
-    // - ASCII: 4 instructions, 2 simd constants
-    // - UTF-8: 47 instructions, 8 simd constants (7 of them used twice)
-    // [128-bit] 
-    // - ASCII: 6 instructions, 2 simd constants
-    // - UTF-8: 93 instructions, 8 simd constants (7 of them used four times)
+  //
+  // These are the bits in lead_flags. Their main purpose is to tell you what kind of lead character
+  // it is (1,2,3 or 4--or none if it's a continuation), but they also include 4 other bits that are
+  // used to detect other kinds of errors.
+  //
+  // LEAD_4 is the lowest bit because we use a >> trick in get_byte_3_4_5_errors to turn LEAD_2 -> LEAD_3,
+  // LEAD_3 -> LEAD_4, and we want LEAD_4 to turn into nothing since there is no LEAD_5. This trick
+  // lets us use one constant table instead of 3, possibly saving registers on systems with fewer
+  // registers.
+  //
+  static const uint8_t LEAD_4      = 0x01; // [1111]____ 10______ 10______ 10______ (0_|11)__
+  static const uint8_t LEAD_3      = 0x02; // [1110]____ 10______ 10______ (0|11)__
+  static const uint8_t LEAD_2      = 0x04; // [110_]____ 10______ (0|11)__
+  static const uint8_t LEAD_1      = 0x08; // [0___]____ (0|11)__
+  static const uint8_t LEAD_2_PLUS = 0x10; // [11__]____ ...
+  static const uint8_t LEAD_1100   = 0x20; // [1100]____ ...
+  static const uint8_t LEAD_1110   = 0x40; // [1110]____ ...
+  static const uint8_t LEAD_1111   = 0x80; // [1111]____ ...

-    // it is not ascii so we have to do heavy work
-    this->error |= utf8_validation::check_input(input, this->prev_input);
+  // Precompute prev_incomplete in case the next block is ASCII
+  really_inline void set_fast_path_error() {
+    // If any of the last 3 bytes in the input needs a continuation at the start of the next input,
+    // it is an error for the next input to be ASCII.
+    // static const uint8_t incomplete_long[32] = {
+    //   0, 0, 0, 0, 0, 0, 0, 0,
+    //   0, 0, 0, 0, 0, 0, 0, 0,
+    //   0, 0, 0, 0, 0, 0, 0, 0,
+    //   0, 0, 0, 0, 0, LEAD_4, LEAD_4 | LEAD_3, LEAD_4 | LEAD_3 | LEAD_2
+    // };
+    // const simd8<uint8_t> incomplete(&incomplete_long[sizeof(incomplete_long) - sizeof(simd8<uint8_t>)]);
+    // this->prev_incomplete = lead_flags & incomplete;
+    // If the previous input's last 3 bytes match this, they're too short (the input ended mid-character):
+    // ... 1111____ 111_____ 11______
+    static const uint8_t last_len[32] = {
+      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
+    };
+    const simd8<uint8_t> max_value(&last_len[sizeof(last_len)-sizeof(simd8<uint8_t>)]);
+    // If anything is > the desired value, there will be a nonzero value in the result.
+    this->prev_incomplete = this->prev_input_block.saturating_sub(max_value);
  }

-  really_inline void check_next_input(simd8x64<uint8_t> input, simd8x64<uint8_t> input2) {
-    this->error |= utf8_validation::check_input(input, input2, this->prev_input);
+  really_inline simd8<uint8_t> get_lead_flags(const simd8<uint8_t> high_bits, const simd8<uint8_t> prev_high_bits) {
+    // Total: 2 instructions, 1 constant
+    // - 1 byte shift (shuffle)
+    // - 1 table lookup (shuffle)
+    // - 1 table constant
+
+    // high_bits is byte 5, so lead is high_bits.prev<4>()
+    return high_bits.prev<4>(prev_high_bits).lookup_16<uint8_t>(
+      LEAD_1, LEAD_1, LEAD_1, LEAD_1,   // [0___]____ (ASCII)
+      LEAD_1, LEAD_1, LEAD_1, LEAD_1,   // [0___]____ (ASCII)
+      0,      0,      0,      0,        // [10__]____ (continuation)
+      LEAD_2 | LEAD_2_PLUS | LEAD_1100, // [1100]____
+      LEAD_2 | LEAD_2_PLUS,             // [110_]____
+      LEAD_3 | LEAD_2_PLUS | LEAD_1110, // [1110]____
+      LEAD_4 | LEAD_2_PLUS | LEAD_1111  // [1111]____
+    );
+  }
+
+  // Find errors in bytes 1 and 2 together (one single multi-nibble &)
+  really_inline simd8<uint8_t> get_byte_1_2_errors(const simd8<uint8_t> input, const simd8<uint8_t> prev_input, const simd8<uint8_t> high_bits, const simd8<uint8_t> prev_high_bits) {
+    //
+    // These are the errors we're going to match for bytes 1-2, by looking at the first three
+    // nibbles of the character: lead_flags & <low bits of byte 1> & <high bits of byte 2>
+    //
+    // The important thing here is that these constants all take up *different* bits, since they
+    // match different patterns. This is why there are 2 LEAD_4 and 2 LEAD_3s in lead_flags, among
+    // other things.
+    //
+    static const int TOO_SHORT_2 = LEAD_2_PLUS; // 11______ (0___|11__)____
+    static const int TOO_LONG_1  = LEAD_1;      // 0_______ 10______
+    static const int OVERLONG_2  = LEAD_1100;   // 1100000_ ________ (technically we match 10______ but we could match ________, they both yield errors either way)
+    static const int OVERLONG_3  = LEAD_3;      // 11100000 100_____ ________
+    static const int OVERLONG_4  = LEAD_4;      // 11110000 1000____ ________ ________
+    static const int TOO_LARGE   = LEAD_1111;   // 11110100 (1001|101_)____
+    static const int SURROGATE   = LEAD_1110;   // 11101101 [101_]____
+
+    // Total: 4 instructions, 2 constants
+    // - 2 table lookups (shuffles)
+    // - 1 byte shift (shuffle)
+    // - 1 "and"
+    // - 2 table constants
+
+    // After processing the rest of byte 1 (the low bits), we're still not done--we have to check
+    // byte 2 to be sure which things are errors and which aren't.
+    // Since input is byte 5, byte 1 is input.prev<4>
+    const simd8<uint8_t> byte_1_flags = (input.prev<4>(prev_input) & 0x0F).lookup_16<uint8_t>(
+      // ____[00__] ________
+      TOO_SHORT_2 | TOO_LONG_1 | OVERLONG_2 | OVERLONG_3 | OVERLONG_4, // ____[0000] ________
+      TOO_SHORT_2 | TOO_LONG_1 | OVERLONG_2,                           // ____[0001] ________
+      TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1,
+      // ____[01__] ________
+      TOO_SHORT_2 | TOO_LONG_1 | TOO_LARGE,                            // ____[0100] ________
+      TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1,
+      // ____[10__] ________
+      TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1,
+      // ____[11__] ________
+      TOO_SHORT_2 | TOO_LONG_1,
+      TOO_SHORT_2 | TOO_LONG_1 | SURROGATE,                            // ____[1101] ________
+      TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1
+    );
+    // Since high_bits is byte 5, byte 2 is high_bits.prev<3>
+    const simd8<uint8_t> byte_2_flags = high_bits.prev<3>(prev_high_bits).lookup_16<uint8_t>(
+        // ASCII: ________ [0___]____
+        OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2,
+        // ASCII: ________ [0___]____
+        OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2,
+        // Continuations: ________ [10__]____
+        OVERLONG_2 | TOO_LONG_1 | OVERLONG_3 | OVERLONG_4, // ________ [1000]____
+        OVERLONG_2 | TOO_LONG_1 | OVERLONG_3 | SURROGATE,  // ________ [1001]____
+        OVERLONG_2 | TOO_LONG_1 | TOO_LARGE  | SURROGATE,  // ________ [1010]____
+        OVERLONG_2 | TOO_LONG_1 | TOO_LARGE  | SURROGATE,  // ________ [1011]____
+        // Multibyte Leads: ________ [11__]____
+        OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2
+    );
+    return byte_1_flags & byte_2_flags;
+  }
+
+  really_inline simd8<uint8_t> get_byte_3_4_5_errors(const simd8<uint8_t> high_bits, const simd8<uint8_t> prev_high_bits) {
+    // Total 7 instructions, 3 simd constants:
+    // - 3 table lookups (shuffles)
+    // - 2 byte shifts (shuffles)
+    // - 2 "or"
+    // - 1 table constant
+
+    const simd8<uint8_t> byte_3_table = simd8<uint8_t>::repeat_16(
+        // TOO_SHORT ASCII:           111_____ ________ [0___]____
+        LEAD_3, LEAD_3, LEAD_3, LEAD_3,
+        LEAD_3, LEAD_3, LEAD_3, LEAD_3,
+        // TOO_LONG  Continuations:   110_____ ________ [10__]____
+        LEAD_2, LEAD_2, LEAD_2, LEAD_2,
+        // TOO_SHORT Multibyte Leads: 111_____ ________ [11__]____
+        LEAD_3, LEAD_3, LEAD_3, LEAD_3
+    );
+    const simd8<uint8_t> byte_4_table = byte_3_table.shr<1>(); // TOO_SHORT: LEAD_4, TOO_LONG: LEAD_3
+    const simd8<uint8_t> byte_5_table = byte_3_table.shr<2>(); // TOO_SHORT: <none>, TOO_LONG: LEAD_4
+
+    // high_bits is byte 5, high_bits.prev<2> is byte 3 and high_bits.prev<1> is byte 4
+    return high_bits.prev<2>(prev_high_bits).lookup_16(byte_3_table) |
+           high_bits.prev<1>(prev_high_bits).lookup_16(byte_4_table) |
+           high_bits.lookup_16(byte_5_table);
+  }
+
+  // Check whether the current bytes are valid UTF-8.
+  // Returns the error bits; the caller (check_next_input) is responsible for updating the previous input.
+  // This should come down to 22 instructions if table definitions are in registers--30 if not.
+  really_inline simd8<uint8_t> check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
+    // When we process bytes M through N, we look for lead characters in M-4 through N-4. This allows
+    // us to look for all errors related to any lead character at one time (since UTF-8 characters
+    // can only be up to 4 bytes, and the next byte after a character finishes must be another lead,
+    // we never need to look more than 4 bytes past the current one to fully validate).
+    // This way, we have all relevant bytes around and can save ourselves a little overflow and
+    // several instructions on each loop.
+
+    // Total: 22 instructions, 7 simd constants
+    // Local: 8 instructions, 1 simd constant
+    // - 2 bit shifts
+    // - 1 byte shift (shuffle)
+    // - 3 "or"
+    // - 1 "and"
+    // - 1 saturating_sub
+    // - 1 constant (0b11111000-1)
+    // lead_flags: 2 instructions, 1 simd constant
+    // - 1 byte shift (shuffle)
+    // - 1 table lookup (shuffle)
+    // - 1 table constant
+    // byte_1_2_errors: 5 instructions, 2 simd constants
+    // - 2 table lookups (shuffles)
+    // - 2 byte shifts (shuffles)
+    // - 1 "and"
+    // - 2 table constants
+    // byte_3_4_5_errors: 7 instructions, 3 simd constants
+    // - 3 table lookups (shuffles)
+    // - 2 byte shifts (shuffles)
+    // - 2 "or"
+    // - 3 table constants
+
+    const simd8<uint8_t> high_bits = input.shr<4>();
+    const simd8<uint8_t> prev_high_bits = prev_input.shr<4>();
+    const simd8<uint8_t> lead_flags = get_lead_flags(high_bits, prev_high_bits);
+    const simd8<uint8_t> byte_1_2_errors = get_byte_1_2_errors(input, prev_input, high_bits, prev_high_bits);
+    const simd8<uint8_t> byte_3_4_5_errors = get_byte_3_4_5_errors(high_bits, prev_high_bits);
+    // Detect illegal 5-byte+ Unicode values. We can't do this as part of byte_1_2_errors  because
+    // it would need a third lead_flag = 1111, and we've already used up all 8 between
+    // byte_1_2_errors and byte_3_4_5_errors.
+    const simd8<uint8_t> too_large = input.saturating_sub(0b11111000-1); // too-large values will be nonzero
+    return too_large | (lead_flags & (byte_1_2_errors | byte_3_4_5_errors));
+  }
+
+  // TODO special case start of file, too, so that small documents are efficient! No shifting needed ...
+
+  // The only problem that can happen at EOF is that a multibyte character is too short.
+  really_inline void check_eof() {
+    // If the previous block had incomplete UTF-8 characters at the end, reaching EOF means
+    // nothing can finish them.
+    this->error |= this->prev_incomplete;
+  }
+
+  really_inline void check_next_input(simd8x64<uint8_t> input) {
+    simd8<uint8_t> bits = input.reduce([&](auto a,auto b) { return a|b; });
+    if (likely(!bits.any_bits_set_anywhere(0b10000000u))) {
+      // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
+      // possibly finish them.
+      this->error |= this->prev_incomplete;
+    } else {
+      this->error |= this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+      for (int i=1; i<simd8x64<uint8_t>::NUM_CHUNKS; i++) {
+        this->error |= this->check_utf8_bytes(input.chunks[i], input.chunks[i-1]);
+      }
+      this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
+      this->set_fast_path_error();
+    }
  }

  really_inline ErrorValues errors() {
@ -375,6 +297,3 @@ struct utf8_checker {
  }

 }; // struct utf8_checker
-
-struct utf8_checker;
-
--- a/src/generic/utf8_range_algorithm.h
+++ b/src/generic/utf8_range_algorithm.h
@ -168,7 +168,9 @@ struct utf8_checker {
      this->check_carried_continuations();
    } else {
      // it is not ascii so we have to do heavy work
-      in.each([&](auto _in) { this->check_utf8_bytes(_in); });
+      for (int i=0; i<simd8x64<uint8_t>::NUM_CHUNKS; i++) {
+        this->check_utf8_bytes(in.chunks[i]);
+      }
    }
  }

--- a/src/generic/utf8_zwegner_algorithm.h
+++ b/src/generic/utf8_zwegner_algorithm.h
@ -349,7 +349,9 @@ struct utf8_checker {
  }

  really_inline void check_next_input(simd8x64<uint8_t> in) {
-    in.each([&](auto bytes) { this->check_next_input(bytes); });
+    for (int i=0; i<simd8x64<uint8_t>::NUM_CHUNKS; i++) {
+      this->check_next_input(in.chunks[i]);
+    }
  }

  really_inline ErrorValues errors() {