Merge branch 'master' into dlemire/better_doxygen_home_page

2020-06-17 16:02:49 -04:00 · 2020-06-17 16:02:49 -04:00 · 3f00e79bcb
parent b5ea504ad2 c9a6bbeb64
commit 3f00e79bcb
9 changed files with 322 additions and 14 deletions
--- a/benchmark/parse.cpp
+++ b/benchmark/parse.cpp
@ -83,7 +83,7 @@ struct option_struct {
  bool stage1_only = false;

  int32_t iterations = 200;
-  int32_t iteration_step = 50;
+  int32_t iteration_step = -1;

  bool verbose = false;
  bool tabbed_output = false;
@ -149,6 +149,12 @@ struct option_struct {
      }
    }

+    if (iteration_step == -1) {
+      iteration_step = iterations / 50;
+      if (iteration_step < 200) { iteration_step = 200; }
+      if (iteration_step > iterations) { iteration_step = iterations; }
+    }
+
    // All remaining arguments are considered to be files
    for (int i=optind; i<argc; i++) {
      files.push_back(argv[i]);
@ -156,11 +162,6 @@ struct option_struct {
    if (files.empty()) {
      exit_usage("No files specified");
    }
-
-    // Keeps the numbers the same for CI (old ./parse didn't have a two-stage loop)
-    if (files.size() == 1) {
-      iteration_step = iterations;
-    }
  }
 };

--- a/doc/basics.md
+++ b/doc/basics.md
@ -513,8 +513,8 @@ parser for your CPU, is transparent and thread-safe.
 Backwards Compatibility
 -----------------------

-The only header file supported by simdjson is simdjson.h. Older versions of simdjson published a
-number of other include files such as document.h or ParsedJson.h alongside simdjson.h; these headers
+The only header file supported by simdjson is `simdjson.h`. Older versions of simdjson published a
+number of other include files such as `document.h` or `ParsedJson.h` alongside `simdjson.h`; these headers
 may be moved or removed in future versions.


--- a/doc/parse_many.md
+++ b/doc/parse_many.md
@ -97,6 +97,10 @@ of magnitude cheaper. Ain't that awesome!
 Thread support is only active if thread supported is detected in which case the macro
 SIMDJSON_THREADS_ENABLED is set. Otherwise the library runs in  single-thread mode.

+A `document_stream` instance uses at most two threads: there is a main thread and a worker thread.
+You should expect the main thread to be fully occupied while the worker thread is partially busy
+(e.g., 80% of the time).
+
 Support
 -------

--- a/doc/performance.md
+++ b/doc/performance.md
@ -11,13 +11,16 @@ are still some scenarios where tuning can enhance performance.
 * [Computed GOTOs](#computed-gotos)
 * [Number parsing](#number-parsing)
 * [Visual Studio](#visual-studio)
+* [Downclocking](#downclocking)
+

 Reusing the parser for maximum efficiency
 -----------------------------------------

 If you're using simdjson to parse multiple documents, or in a loop, you should make a parser once
 and reuse it. The simdjson library will allocate and retain internal buffers between parses, keeping
-buffers hot in cache and keeping memory allocation and initialization to a minimum.
+buffers hot in cache and keeping memory allocation and initialization to a minimum. In this manner,
+you can parse terabytes of JSON data without doing any new allocation.

 ```c++
 dom::parser parser;
@ -154,6 +157,19 @@ We do not recommend that you compile simdjson with architecture-specific flags s
 Recent versions of Microsoft Visual Studio on Windows provides support for the LLVM Clang compiler. You  only need to install the "Clang compiler" optional component. You may also get a copy of the 64-bit LLVM CLang compiler for [Windows directly from LLVM](https://releases.llvm.org/download.html). The simdjson library fully supports the LLVM Clang compiler under Windows. In fact, you may get better performance out of simdjson with the LLVM Clang compiler than with the regular Visual Studio compiler.


+Downclocking
+--------------
+
+You should not expect the simdjson library to cause downclocking of your recent Intel CPU cores.
+
+On some Intel processors, using SIMD instructions in a sustained manner on the same CPU core may result in a phenomenon called downclocking whereby the processor initially runs these instructions at a slow speed before reducing the frequency of the core for a short time (milliseconds). Intel refers to these states as licenses. On some current Intel processors, it occurs under two scenarios:
+
+- [Whenever 512-bit AVX-512 instructions are used](https://lemire.me/blog/2018/09/07/avx-512-when-and-how-to-use-these-new-instructions/).
+- Whenever heavy 256-bit or wider instructions are used. Heavy instructions are those involving floating point operations or integer multiplications (since these execute on the floating point unit).
+
+The simdjson library does not currently support AVX-512 instructions and it does not make use of heavy 256-bit instructions. Thus there should be no downclocking due to simdjson on recent processors. You may still be worried about which SIMD instruction set is used by simdjson.  Thankfully,  [you can always determine and change which architecture-specific implementation is used](implementation-selection.md). Thus even if your CPU supports AVX2, you do not need to use AVX2. You are in control.
+
+
 Further Reading
 -------------

--- a/src/arm64/dom_parser_implementation.cpp
+++ b/src/arm64/dom_parser_implementation.cpp
@ -73,17 +73,32 @@ really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8
    return is_second_byte ^ is_third_byte ^ is_fourth_byte;
 }

+really_inline simd8<bool> must_be_2_3_continuation(simd8<uint8_t> prev2, simd8<uint8_t> prev3) { // Marks lanes that should hold the 3rd or 4th byte of a multibyte character
+    simd8<bool> is_third_byte  = prev2 >= uint8_t(0b11100000u); // byte two positions back is 111_____ (a 3- or 4-byte lead)
+    simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u); // byte three positions back is 1111____ (a 4+-byte lead)
+    return is_third_byte ^ is_fourth_byte; // XOR differs from OR only when both are set (1111____ directly followed by 111_____) -- NOTE(review): presumably that pair is rejected by the special-case lookup; confirm
+}
+
 #include "generic/stage1/buf_block_reader.h"
 #include "generic/stage1/json_string_scanner.h"
 #include "generic/stage1/json_scanner.h"

+namespace stage1 {
+really_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) {
+  // ARM64 definition of the per-implementation find_escaped hook: it always takes the branchless
+  // path. We don't short-circuit when there are no backslashes, because on ARM the branch gives us
+  // no benefit and therefore makes things worse. The short-circuit is kept here for reference:
+  // if (!backslash) { uint64_t escaped = prev_escaped; prev_escaped = 0; return escaped; }
+  return find_escaped_branchless(backslash);
+}
+} // namespace stage1
+
 #include "generic/stage1/json_minifier.h"
 WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
  return arm64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
 }

 #include "generic/stage1/find_next_document_index.h"
-#include "generic/stage1/utf8_lookup2_algorithm.h"
+#include "generic/stage1/utf8_lookup3_algorithm.h"
 #include "generic/stage1/json_structural_indexer.h"
 WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
  this->buf = _buf;
--- a/src/generic/stage1/json_string_scanner.h
+++ b/src/generic/stage1/json_string_scanner.h
@ -37,7 +37,9 @@ public:
  really_inline error_code finish(bool streaming);

 private:
+  // Intended to be defined by the implementation
  really_inline uint64_t find_escaped(uint64_t escape);
+  really_inline uint64_t find_escaped_branchless(uint64_t escape);

  // Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
  uint64_t prev_in_string = 0ULL;
@ -72,7 +74,7 @@ private:
 // desired        |   x  | x x  x x  x x  x  x  |
 // text           |  \\\ | \\\"\\\" \\\" \\"\\" |
 //
-really_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) {
+really_inline uint64_t json_string_scanner::find_escaped_branchless(uint64_t backslash) {
  // If there was overflow, pretend the first character isn't a backslash
  backslash &= ~prev_escaped;
  uint64_t follows_escape = backslash << 1 | prev_escaped;
@ -101,13 +103,23 @@ really_inline json_string_block json_string_scanner::next(const simd::simd8x64<u
  const uint64_t backslash = in.eq('\\');
  const uint64_t escaped = find_escaped(backslash);
  const uint64_t quote = in.eq('"') & ~escaped;
+
+  //
  // prefix_xor flips on bits inside the string (and flips off the end quote).
+  //
  // Then we xor with prev_in_string: if we were in a string already, its effect is flipped
  // (characters inside strings are outside, and characters outside strings are inside).
+  //
  const uint64_t in_string = prefix_xor(quote) ^ prev_in_string;
+
+  //
+  // Check if we're still in a string at the end of the block so the next block will know
+  //
  // right shift of a signed value expected to be well-defined and standard
  // compliant as of C++20, John Regher from Utah U. says this is fine code
+  //
  prev_in_string = uint64_t(static_cast<int64_t>(in_string) >> 63);
+
  // Use ^ to turn the beginning quote off, and the end quote on.
  return {
    backslash,
--- a/src/generic/stage1/utf8_lookup3_algorithm.h
+++ b/src/generic/stage1/utf8_lookup3_algorithm.h
@ -0,0 +1,230 @@
+//
+// Detect Unicode errors.
+//
+// UTF-8 is designed to allow multiple bytes and be compatible with ASCII. It's a fairly basic
+// encoding that uses the first few bits on each byte to denote a "byte type", and all other bits
+// are straight up concatenated into the final value. The first byte of a multibyte character is a
+// "leading byte" and starts with N 1's, where N is the total number of bytes (110_____ = 2 byte
+// lead). The remaining bytes of a multibyte character all start with 10. 1-byte characters just
+// start with 0, because that's what ASCII looks like. Here's what each size looks like:
+//
+// - ASCII (7 bits):              0_______
+// - 2 byte character (11 bits):  110_____ 10______
+// - 3 byte character (16 bits):  1110____ 10______ 10______
+// - 4 byte character (21 bits):  11110___ 10______ 10______ 10______
+// - 5+ byte character (illegal): 11111___ <illegal>
+//
+// There are 6 classes of error that can happen in Unicode:
+//
+// - TOO_SHORT: when you have a multibyte character with too few bytes (i.e. missing continuation).
+//   We detect this by looking for new characters (lead bytes) inside the range of a multibyte
+//   character.
+//
+//   e.g. 11000000 01100001 (2-byte character where second byte is ASCII)
+//
+// - TOO_LONG: when there are more bytes in your character than you need (i.e. extra continuation).
+//   We detect this by requiring that the next byte after your multibyte character be a new
+//   character--so a continuation after your character is wrong.
+//
+//   e.g. 11011111 10111111 10111111 (2-byte character followed by *another* continuation byte)
+//
+// - TOO_LARGE: Unicode only goes up to U+10FFFF. These characters are too large.
+//
+//   e.g. 11110111 10111111 10111111 10111111 (bigger than 10FFFF).
+//
+// - OVERLONG: multibyte characters with a bunch of leading zeroes, where you could have
+//   used fewer bytes to make the same character. Like encoding an ASCII character in 4 bytes is
+//   technically possible, but UTF-8 disallows it so that there is only one way to write an "a".
+//
+//   e.g. 11000001 10100001 (2-byte encoding of "a", which only requires 1 byte: 01100001)
+//
+// - SURROGATE: Unicode U+D800-U+DFFF are *surrogate* code points, reserved for use in UTF-16 (and
+//   WTF-8) to encode characters that need more than 2 bytes. These are illegal in pure UTF-8.
+//
+//   e.g. 11101101 10100000 10000000 (U+D800)
+//
+// - INVALID_5_BYTE: 5-byte, 6-byte, 7-byte and 8-byte characters are unsupported; Unicode does not
+//   support values with more than 21 bits (which a 4-byte character supports).
+//
+//   e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000)
+//
+// Legal utf-8 byte sequences per  http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94:
+//
+//   Code Points        1st       2s       3s       4s
+//  U+0000..U+007F     00..7F
+//  U+0080..U+07FF     C2..DF   80..BF
+//  U+0800..U+0FFF     E0       A0..BF   80..BF
+//  U+1000..U+CFFF     E1..EC   80..BF   80..BF
+//  U+D000..U+D7FF     ED       80..9F   80..BF
+//  U+E000..U+FFFF     EE..EF   80..BF   80..BF
+//  U+10000..U+3FFFF   F0       90..BF   80..BF   80..BF
+//  U+40000..U+FFFFF   F1..F3   80..BF   80..BF   80..BF
+//  U+100000..U+10FFFF F4       80..8F   80..BF   80..BF
+//
+using namespace simd;
+
+namespace utf8_validation {
+  // For a detailed description of the lookup2 algorithm, see the file HACKING.md under "UTF-8 validation (lookup2)".
+
+  //
+  // Find special case UTF-8 errors where the character is technically readable (has the right length)
+  // but the *value* is disallowed.
+  //
+  // This includes overlong encodings, surrogates and values too large for Unicode.
+  //
+  // It turns out the bad character ranges can all be detected by looking at the first 12 bits of the
+  // UTF-8 encoded character (i.e. all of byte 1, and the high 4 bits of byte 2). This algorithm does
+  // three 4-bit table lookups, identifying which errors each group of 4 bits could match, and then
+  // &'s them together. If all 3 lookups detect the same error, it's an error.
+  //
+  really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+    //
+    // These are the errors we're going to match for bytes 1-2, by looking at the first three
+    // nibbles of the character: <high bits of byte 1> & <low bits of byte 1> & <high bits of byte 2>
+    //
+    static const int OVERLONG_2  = 0x01; // 1100000_ ________ (byte 2 is not inspected: CARRY is set for every byte-2 nibble, and any byte after 1100000_ yields an error either way)
+    static const int OVERLONG_3  = 0x02; // 11100000 100_____ ________
+    static const int OVERLONG_4  = 0x04; // 11110000 1000____ ________ ________
+    static const int SURROGATE   = 0x08; // 11101101 [101_]____
+    static const int TOO_LARGE   = 0x10; // 11110100 (1001|101_)____
+    static const int TOO_LARGE_2 = 0x20; // 1111(1___|011_|0101) 10______
+
+    // New with lookup3. We want to catch the case where a non-continuation
+    // (ASCII or another lead byte) follows a leading byte
+    static const int TOO_SHORT_2_3_4 = 0x40; //  (110_|1110|1111) ____    (0___|11__) ____
+    // We also want to catch a continuation that is preceded by an ASCII byte
+    static const int LONELY_CONTINUATION = 0x80; //  0___ ____    10__ ____
+
+    // After processing the rest of byte 1 (the low bits), we're still not done--we have to check
+    // byte 2 to be sure which things are errors and which aren't.
+    // Here byte 2 is `input` itself and byte 1 is `prev1`.
+    static const int CARRY = OVERLONG_2 | TOO_LARGE_2; // errors already decided by byte 1: every byte-2 entry below passes them through
+    const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
+        // ASCII: ________ [00__]____
+        CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4,
+        CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4,
+        // ASCII: ________ [01__]____
+        CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4,
+        CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4,
+        // Continuations: ________ [10__]____
+        CARRY | OVERLONG_3 | OVERLONG_4 | LONELY_CONTINUATION, // ________ [1000]____
+        CARRY | OVERLONG_3 | TOO_LARGE | LONELY_CONTINUATION,  // ________ [1001]____
+        CARRY | TOO_LARGE  | SURROGATE | LONELY_CONTINUATION,  // ________ [1010]____
+        CARRY | TOO_LARGE  | SURROGATE | LONELY_CONTINUATION,  // ________ [1011]____
+        // Multibyte Leads: ________ [11__]____
+        CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4,  // ________ [110_]____
+        CARRY | TOO_SHORT_2_3_4, CARRY | TOO_SHORT_2_3_4   // ________ [111_]____
+    );
+    const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
+      // [0___]____ (ASCII)
+      LONELY_CONTINUATION, LONELY_CONTINUATION, LONELY_CONTINUATION, LONELY_CONTINUATION,
+      LONELY_CONTINUATION, LONELY_CONTINUATION, LONELY_CONTINUATION, LONELY_CONTINUATION,
+      // [10__]____ (continuation)
+      0, 0, 0, 0,
+      // [11__]____ (2+-byte leads)
+      OVERLONG_2 | TOO_SHORT_2_3_4, TOO_SHORT_2_3_4,         // [110_]____ (2-byte lead)
+      OVERLONG_3 | SURROGATE | TOO_SHORT_2_3_4,              // [1110]____ (3-byte lead)
+      OVERLONG_4 | TOO_LARGE | TOO_LARGE_2 | TOO_SHORT_2_3_4 // [1111]____ (4+-byte lead)
+    );
+    const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
+      // ____[00__] ________
+      OVERLONG_2 | OVERLONG_3 | OVERLONG_4 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION, // ____[0000] ________
+      OVERLONG_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION,                           // ____[0001] ________
+      TOO_SHORT_2_3_4 | LONELY_CONTINUATION,
+      TOO_SHORT_2_3_4 | LONELY_CONTINUATION,
+      // ____[01__] ________
+      TOO_LARGE | TOO_SHORT_2_3_4 | LONELY_CONTINUATION,                            // ____[0100] ________
+      TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION,
+      TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION,
+      TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION,
+      // ____[10__] ________
+      TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION,
+      TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION,
+      TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION,
+      TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION,
+      // ____[11__] ________
+      TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION,
+      TOO_LARGE_2 | SURROGATE | TOO_SHORT_2_3_4 | LONELY_CONTINUATION,              // ____[1101] ________
+      TOO_LARGE_2 | TOO_SHORT_2_3_4| LONELY_CONTINUATION,
+      TOO_LARGE_2 | TOO_SHORT_2_3_4 | LONELY_CONTINUATION
+    );
+    return byte_1_high & byte_1_low & byte_2_high; // an error bit survives only if all three nibble lookups allow that same error class
+  }
+
+  really_inline simd8<uint8_t> check_multibyte_lengths(simd8<uint8_t> input, simd8<uint8_t> prev_input,
+      simd8<uint8_t> prev1) {
+    simd8<uint8_t> prev2 = input.prev<2>(prev_input); // presumably input shifted back 2 bytes, filled from the end of prev_input -- confirm against simd8::prev
+    simd8<uint8_t> prev3 = input.prev<3>(prev_input); // likewise, shifted back 3 bytes
+    // True where both this byte and the one before it are continuations: as signed int8, 10______ is
+    // below -64, so max(input, prev1) < -64 holds only if both are. One more instruction than lookup2.
+    simd8<bool> is_2_3_continuation = (simd8<int8_t>(input).max(simd8<int8_t>(prev1))) < int8_t(-64);
+    // Nonzero wherever "should be a 3rd/4th byte" and "is one" disagree. must_be_2_3_continuation has two fewer instructions than lookup2
+    return simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3) ^ is_2_3_continuation);
+  }
+
+
+  //
+  // Return nonzero if there are incomplete multibyte characters at the end of the block:
+  // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
+  //
+  really_inline simd8<uint8_t> is_incomplete(simd8<uint8_t> input) {
+    // If the last 3 bytes of `input` match this pattern, a multibyte character starts there but
+    // cannot finish inside this block: ... 1111____ 111_____ 11______
+    static const uint8_t max_array[32] = { // per-lane upper bounds; 255 can never be exceeded, so only the last 3 lanes matter
+      255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
+    };
+    const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]); // load the tail of the table so the three thresholds sit in the last 3 lanes; NOTE(review): assumes sizeof(simd8<uint8_t>) <= 32
+    return input.gt_bits(max_value); // presumably nonzero in each lane where input > max_value -- confirm against simd8::gt_bits
+  }
+
+  struct utf8_checker {
+    // If this is nonzero, there has been a UTF-8 error. Errors accumulate (|=) and are never cleared here.
+    simd8<uint8_t> error;
+    // The last chunk of the most recent non-ASCII input (not updated on the ASCII fast path)
+    simd8<uint8_t> prev_input_block;
+    // Nonzero if the most recent non-ASCII input ended mid-character (consulted on the ASCII fast path and at EOF)
+    simd8<uint8_t> prev_incomplete;
+
+    //
+    // Check whether the current bytes are valid UTF-8.
+    //
+    really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
+      // prev1 is input shifted back by one byte, so each lane holds the byte that precedes the one in
+      // input (presumably the first lane is filled from the end of prev_input -- see simd8::prev)
+      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+      this->error |= check_special_cases(input, prev1);
+      this->error |= check_multibyte_lengths(input, prev_input, prev1);
+    }
+
+    // The only problem that can happen at EOF is that a multibyte character is too short.
+    really_inline void check_eof() {
+      // If the last non-ASCII block ended in the middle of a multibyte character, no further input
+      // is coming to complete it, so flag the error.
+      this->error |= this->prev_incomplete;
+    }
+
+    really_inline void check_next_input(simd8x64<uint8_t> input) {
+      if (likely(is_ascii(input))) {
+        // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
+        // possibly finish them. Note: prev_input_block and prev_incomplete are left as-is on this path.
+        this->error |= this->prev_incomplete;
+      } else {
+        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+        for (int i=1; i<simd8x64<uint8_t>::NUM_CHUNKS; i++) {
+          this->check_utf8_bytes(input.chunks[i], input.chunks[i-1]);
+        }
+        this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
+        this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
+      }
+    }
+
+    really_inline error_code errors() {
+      return this->error.any_bits_set_anywhere() ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
+    }
+
+  }; // struct utf8_checker
+}
+
+using utf8_validation::utf8_checker;
--- a/src/haswell/dom_parser_implementation.cpp
+++ b/src/haswell/dom_parser_implementation.cpp
@ -61,17 +61,32 @@ really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8
  return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
 }

+really_inline simd8<bool> must_be_2_3_continuation(simd8<uint8_t> prev2, simd8<uint8_t> prev3) {
+  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
+  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
+  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 32 (255 - 223), so the OR stays below 128 and a signed comparison is fine.
+  return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
+}
+
+
 #include "generic/stage1/buf_block_reader.h"
 #include "generic/stage1/json_string_scanner.h"
 #include "generic/stage1/json_scanner.h"

+namespace stage1 {
+really_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) { // Haswell definition of the per-implementation find_escaped hook
+  if (!backslash) { uint64_t escaped = prev_escaped; prev_escaped = 0; return escaped; } // fast path: no backslashes in this block, so return the carried-over prev_escaped and clear it
+  return find_escaped_branchless(backslash);
+}
+} // namespace stage1
+
 #include "generic/stage1/json_minifier.h"
 WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
  return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len);
 }

 #include "generic/stage1/find_next_document_index.h"
-#include "generic/stage1/utf8_lookup2_algorithm.h"
+#include "generic/stage1/utf8_lookup3_algorithm.h"
 #include "generic/stage1/json_structural_indexer.h"
 WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
  this->buf = _buf;
--- a/src/westmere/dom_parser_implementation.cpp
+++ b/src/westmere/dom_parser_implementation.cpp
@ -62,17 +62,32 @@ really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8
  return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
 }

+really_inline simd8<bool> must_be_2_3_continuation(simd8<uint8_t> prev2, simd8<uint8_t> prev3) {
+  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
+  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
+  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 32 (255 - 223), so the OR stays below 128 and a signed comparison is fine.
+  return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
+}
+
+
 #include "generic/stage1/buf_block_reader.h"
 #include "generic/stage1/json_string_scanner.h"
 #include "generic/stage1/json_scanner.h"

+namespace stage1 {
+really_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) { // Westmere definition of the per-implementation find_escaped hook
+  if (!backslash) { uint64_t escaped = prev_escaped; prev_escaped = 0; return escaped; } // fast path: no backslashes in this block, so return the carried-over prev_escaped and clear it
+  return find_escaped_branchless(backslash);
+}
+} // namespace stage1
+
 #include "generic/stage1/json_minifier.h"
 WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
  return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
 }

 #include "generic/stage1/find_next_document_index.h"
-#include "generic/stage1/utf8_lookup2_algorithm.h"
+#include "generic/stage1/utf8_lookup3_algorithm.h"
 #include "generic/stage1/json_structural_indexer.h"
 WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
  this->buf = _buf;