Using just ASCII. (#899)

* Using just ASCII. * Let us prune checkperf. * Moving the description of lookup2 to the HACKING.md file.
2020-05-21 21:59:06 -04:00 · 2020-05-21 21:59:06 -04:00 · 12150baa5e
parent 219b02c1e5
commit 12150baa5e
9 changed files with 259 additions and 242 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -38,13 +38,27 @@ add_subdirectory(singleheader)
 #
 # Compile tools / tests / benchmarks
 #
-
 add_subdirectory(dependencies)
 add_subdirectory(tests)
 add_subdirectory(examples)
 add_subdirectory(benchmark)
 add_subdirectory(fuzz)

+#
+# Source files should be just ASCII
+#
+find_program(FIND find)
+find_program(FILE file)
+find_program(GREP grep)
+if((FIND) AND (FILE) AND (GREP))
+    add_test(
+      NAME "just_ascii"
+      COMMAND sh -c "${FIND}  include src windows tools singleheader tests examples benchmark -path benchmark/checkperf-reference -prune -name '*.h'  -o -name '*.cpp' -type f  -exec ${FILE} '{}' \; |${GREP} -v ASCII || exit 0  && exit 1"
+      WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+    )
+endif()
+
+
 #
 # CPack
 #
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -40,6 +40,7 @@ We have few hard rules, but we have some:

 - Printing to standard output or standard error (`stderr`, `stdout`, `std::cerr`, `std::cout`) in the core library is forbidden. This follows from the [Writing R Extensions](https://cran.r-project.org/doc/manuals/R-exts.html) manual which states that "Compiled code should not write to stdout or stderr".
 - Calls to `abort()` are forbidden in the core library. This follows from the [Writing R Extensions](https://cran.r-project.org/doc/manuals/R-exts.html) manual which states that "Under no circumstances should your compiled code ever call abort or exit".
+- All source code files (.h, .cpp) must be ASCII.

 Tools, tests and benchmarks are not held to these same strict rules.

--- a/HACKING.md
+++ b/HACKING.md
@ -369,6 +369,213 @@ This helps as we redefine some new characters as pseudo-structural such as the c

 > { "foo" : 1.5, "bar" : 1.5 GEOFF_IS_A_DUMMY bla bla , "baz", null }

+
+
+### UTF-8 validation (lookup2)
+
+The simdjson library relies on the lookup2 algorithm for UTF-8 validation on x64 platforms.
+
+This algorithm validates the length of multibyte characters (that each multibyte character has the right number of continuation characters, and that all continuation characters are part of a multibyte character).
+
+####  Algorithm
+
+This algorithm compares *expected* continuation characters with *actual* continuation bytes, and emits an error anytime there is a mismatch.
+
+For example, in the string "𝄞₿֏ab", which has 4-, 3-, 2- and 1-byte
+characters, the file will look like this:
+
+| Character             | 𝄞  |    |    |    | ₿  |    |    | ֏  |    | a  | b  |
+|-----------------------|----|----|----|----|----|----|----|----|----|----|----|
+| Character Length      |  4 |    |    |    |  3 |    |    |  2 |    |  1 |  1 |
+| Byte                  | F0 | 9D | 84 | 9E | E2 | 82 | BF | D6 | 8F | 61 | 62 |
+| is_second_byte        |    |  X |    |    |    |  X |    |    |  X |    |    |
+| is_third_byte         |    |    |  X |    |    |    |  X |    |    |    |    |
+| is_fourth_byte        |    |    |    |  X |    |    |    |    |    |    |    |
+| expected_continuation |    |  X |  X |  X |    |  X |  X |    |  X |    |    |
+| is_continuation       |    |  X |  X |  X |    |  X |  X |    |  X |    |    |
+
+An error is any position where (Second Byte OR Third Byte OR Fourth Byte) != Continuation. There are two kinds:
+
+- **Extra Continuations:** Any continuation that is not a second, third or fourth byte is not
+  part of a valid 2-, 3- or 4-byte character and is thus an error. It could be that it's just
+  floating around extra outside of any character, or that there is an illegal 5-byte character,
+  or maybe it's at the beginning of the file before any characters have started; but it's an
+  error in all these cases.
+- **Missing Continuations:** Any second, third or fourth byte that *isn't* a continuation is an error, because that means
+  we started a new character before we were finished with the current one.
+
+####  Getting the Previous Bytes
+
+Because we want to know if a byte is the *second* (or third, or fourth) byte of a multibyte
+character, we need to "shift the bytes" to find that out. This is what they mean:
+
+- `is_continuation`: if the current byte is a continuation.
+- `is_second_byte`: if 1 byte back is the start of a 2-, 3- or 4-byte character.
+- `is_third_byte`: if 2 bytes back is the start of a 3- or 4-byte character.
+- `is_fourth_byte`: if 3 bytes back is the start of a 4-byte character.
+
+We use shuffles to go n bytes back, selecting part of the current `input` and part of the
+`prev_input` (search for `.prev<1>`, `.prev<2>`, etc.). These are passed in by the caller
+function, because the 1-byte-back data is used by other checks as well.
+
+####   Getting the Continuation Mask
+
+Once we have the right bytes, we have to get the masks. To do this, we treat UTF-8 bytes as
+numbers, using signed `<` and `>` operations to check if they are continuations or leads.
+In fact, we treat the numbers as *signed*, partly because it helps us, and partly because
+Intel's SIMD presently only offers signed `<` and `>` operations (not unsigned ones).
+
+In UTF-8, bytes that start with the bits 110, 1110 and 11110 are 2-, 3- and 4-byte "leads,"
+respectively, meaning they expect to have 1, 2 and 3 "continuation bytes" after them.
+Continuation bytes start with 10, and ASCII (1-byte characters) starts with 0.
+
+When treated as signed numbers, they look like this:
+
+| Type         | High Bits  | Binary Range | Signed |
+|--------------|------------|--------------|--------|
+| ASCII        | `0`        | `01111111`   |   127  |
+|              |            | `00000000`   |     0  |
+| 4+-Byte Lead | `1111`     | `11111111`   |    -1  |
+|              |            | `11110000`   |   -16  |
+| 3-Byte Lead  | `1110`     | `11101111`   |   -17  |
+|              |            | `11100000`   |   -32  |
+| 2-Byte Lead  | `110`      | `11011111`   |   -33  |
+|              |            | `11000000`   |   -64  |
+| Continuation | `10`       | `10111111`   |   -65  |
+|              |            | `10000000`   |  -128  |
+
+This makes it pretty easy to get the continuation mask! It's just a single comparison:
+
+```
+is_continuation = input < -64
+```
+
+We can do something similar for the others, but it takes two comparisons instead of one: "is
+the start of a 2-byte character" is `< -32` and `> -65`, for example. And 2+ bytes is `< 0` and
+`> -65`. Surely we can do better, they're right next to each other!
+
+####  Getting the is_xxx Masks: Shifting the Range
+
+Notice *why* continuations were a single comparison. The actual *range* would require two
+comparisons--`< -64` and `> -129`--but all characters are always at least -128, so we get
+that for free. In fact, if we had *unsigned* comparisons, 2+, 3+ and 4+ comparisons would be
+just as easy: 4+ would be `> 239`, 3+ would be `> 223`, and 2+ would be `> 191`.
+
+Instead, we add 128 to each byte, shifting the range up to make comparison easy. This wraps
+ASCII down into the negative, and puts 4+-Byte Lead at the top:
+
+| Type                 | High Bits  | Binary Range | Signed |
+|----------------------|------------|--------------|-------|
+| 4+-Byte Lead (+ 128) | `0111`     | `01111111`   |   127 |
+|                      |            | `01110000`   |   112 |
+|----------------------|------------|--------------|-------|
+| 3-Byte Lead (+ 128)  | `0110`     | `01101111`   |   111 |
+|                      |            | `01100000`   |    96 |
+|----------------------|------------|--------------|-------|
+| 2-Byte Lead (+ 128)  | `010`      | `01011111`   |    95 |
+|                      |            | `01000000`   |    64 |
+|----------------------|------------|--------------|-------|
+| Continuation (+ 128) | `00`       | `00111111`   |    63 |
+|                      |            | `00000000`   |     0 |
+|----------------------|------------|--------------|-------|
+| ASCII (+ 128)        | `1`        | `11111111`   |    -1 |
+|                      |            | `10000000`   |  -128 |
+|----------------------|------------|--------------|-------|
+
+*Now* we can use signed `>` on all of them:
+
+```
+prev1 = input.prev<1>
+prev2 = input.prev<2>
+prev3 = input.prev<3>
+prev1_flipped = input.prev<1>(prev_input) ^ 0x80; // Same as `+ 128`
+prev2_flipped = input.prev<2>(prev_input) ^ 0x80; // Same as `+ 128`
+prev3_flipped = input.prev<3>(prev_input) ^ 0x80; // Same as `+ 128`
+is_second_byte = prev1_flipped > 63;  // 2+-byte lead
+is_third_byte  = prev2_flipped > 95;  // 3+-byte lead
+is_fourth_byte = prev3_flipped > 111; // 4+-byte lead
+```
+
+NOTE: we use `^ 0x80` instead of `+ 128` in the code, which accomplishes the same thing, and even takes the same number
+of cycles as `+`, but on many Intel architectures can be parallelized better (you can do 3
+`^`'s at a time on Haswell, but only 2 `+`'s).
+
+That doesn't look like it saved us any instructions, did it? Well, because we're adding the
+same number to all of them, we can save one of those `+ 128` operations by assembling
+`prev2_flipped` out of prev 1 and prev 3 instead of assembling it from input and adding 128
+to it. One more instruction saved!
+
+```
+prev1 = input.prev<1>
+prev3 = input.prev<3>
+prev1_flipped = prev1 ^ 0x80; // Same as `+ 128`
+prev3_flipped = prev3 ^ 0x80; // Same as `+ 128`
+prev2_flipped = prev1_flipped.concat<2>(prev3_flipped); // shuffle: take the first 2 bytes from prev1 and the rest from prev3
+```
+
+####  Bringing It All Together: Detecting the Errors
+
+At this point, we have `is_continuation`, `is_second_byte`, `is_third_byte` and `is_fourth_byte`.
+All we have left to do is check if they match!
+
+```
+return (is_second_byte | is_third_byte | is_fourth_byte) ^ is_continuation;
+```
+
+But wait--there's more. The above statement is only 3 operations, but they *cannot be done in
+parallel*. You have to do 2 `|`'s and then 1 `^`. Haswell, at least, has 3 ports that can do
+bitwise operations, and we're only using 1!
+
+####  Epilogue: Addition For Booleans
+
+There is one big case the above code doesn't explicitly talk about--what if is_second_byte
+and is_third_byte are BOTH true? That means there is a 3-byte and 2-byte character right next
+to each other (or any combination), and the continuation could be part of either of them!
+Our algorithm using `^` and `|` won't detect that the continuation byte is problematic.
+
+Never fear, though. If that situation occurs, we'll already have detected that the second
+leading byte was an error, because it was supposed to be a part of the preceding multibyte
+character, but it *wasn't a continuation*.
+
+We could stop here, but it turns out that we can fix it using `+` and `-` instead of `|` and
+`^`, which is both interesting and possibly useful (even though we're not using it here). It
+exploits the fact that in SIMD, a *true* value is -1, and a *false* value is 0. So those
+comparisons were giving us numbers!
+
+Given that, if you do `is_second_byte + is_third_byte + is_fourth_byte`, under normal
+circumstances you will either get 0 (0 + 0 + 0) or -1 (-1 + 0 + 0, etc.). Thus,
+`(is_second_byte + is_third_byte + is_fourth_byte) - is_continuation` will yield 0 only if
+*both* or *neither* are 0 (0-0 or -1 - -1). You'll get 1 or -1 if they are different. Because
+*any* nonzero value is treated as an error (not just -1), we're just fine here :)
+
+Further, if *more than one* multibyte character overlaps,
+`is_second_byte + is_third_byte + is_fourth_byte` will be -2 or -3! Subtracting `is_continuation`
+from *that* is guaranteed to give you a nonzero value (-1, -2 or -3). So it'll always be
+considered an error.
+
+One reason you might want to do this is parallelism. ^ and | cannot be regrouped with each other, so
+(A | B | C) ^ D will always be three operations in a row: either you do A | B -> | C -> ^ D, or
+you do B | C -> | A -> ^ D. But addition and subtraction *can* be regrouped: (A + B + C) - D can
+be written as `(A + B) + (C - D)`. This means you can do A + B and C - D at the same time, and
+then add the results together. Same number of operations, but if the processor can run
+independent things in parallel (which most can), it runs faster.
+
+This doesn't help us on Intel, but might help us elsewhere: on Haswell, at least, | and ^ have
+a super nice advantage in that more of them can be run at the same time (they can run on 3
+ports, while + and - can run on 2)! This means that we can do A | B while we're still doing C,
+saving us the cycle we would have earned by using +. Even more, using an instruction with a
+wider array of ports can help *other* code run ahead, too, since these instructions can "get
+out of the way," running on a port other instructions can't.
+
+####  Epilogue II: One More Trick
+
+There's one more relevant trick up our sleeve: it turns out that on Intel we can "pay
+for" the (prev<1> + 128) instruction, because it can be used to save an instruction in
+check_special_cases()--but we'll talk about that there :)
+
+
+
+
 ## About the Project

 ### Bindings and Ports of simdjson
@ -420,6 +627,8 @@ make allparsingcompetition
 Both the `parsingcompetition` and `allparsingcompetition` tools take a `-t` flag which produces
 a table-oriented output that can be conveniently parsed by other tools.

+
+
 ### Various References

 - [Google double-conv](https://github.com/google/double-conversion/)
--- a/singleheader/amalgamate_demo.cpp
+++ b/singleheader/amalgamate_demo.cpp
@ -1,4 +1,4 @@
-/* auto-generated on Wed May 20 10:23:07 EDT 2020. Do not edit! */
+/* auto-generated on Thu 21 May 2020 14:01:15 EDT. Do not edit! */

 #include <iostream>
 #include "simdjson.h"
--- a/singleheader/simdjson.cpp
+++ b/singleheader/simdjson.cpp
@ -1,4 +1,4 @@
-/* auto-generated on Wed May 20 10:23:07 EDT 2020. Do not edit! */
+/* auto-generated on Thu 21 May 2020 14:01:15 EDT. Do not edit! */
 /* begin file src/simdjson.cpp */
 #include "simdjson.h"

@ -3180,10 +3180,10 @@ namespace utf8_validation {
  // This algorithm compares *expected* continuation characters with *actual* continuation bytes,
  // and emits an error anytime there is a mismatch.
  //
-  // For example, in the string "𝄞₿֏ab", which has a 4-, 3-, 2- and 1-byte
+  // For example, in the string "ab", which has a 4-, 3-, 2- and 1-byte
  // characters, the file will look like this:
  //
-  // | Character             | 𝄞  |    |    |    | ₿  |    |    | ֏  |    | a  | b  |
+  // | Character             |   |    |    |    |   |    |    |   |    | a  | b  |
  // |-----------------------|----|----|----|----|----|----|----|----|----|----|----|
  // | Character Length      |  4 |    |    |    |  3 |    |    |  2 |    |  1 |  1 |
  // | Byte                  | F0 | 9D | 84 | 9E | E2 | 82 | BF | D6 | 8F | 61 | 62 |
@ -4049,10 +4049,10 @@ static bool parse_float_strtod(const char *ptr, double *outDouble) {
  // If you consume a large value and you map it to "infinity", you will no
  // longer be able to serialize back a standard-compliant JSON. And there is
  // no realistic application where you might need values so large than they
-  // can't fit in binary64. The maximal value is about  1.7976931348623157 ×
+  // can't fit in binary64. The maximal value is about  1.7976931348623157 x
  // 10^308 It is an unimaginable large number. There will never be any piece of
  // engineering involving as many as 10^308 parts. It is estimated that there
-  // are about 10^80 atoms in the universe.  The estimate for the total number
+  // are about 10^80 atoms in the universe.  The estimate for the total number
  // of electrons is similar. Using a double-precision floating-point value, we
  // can represent easily the number of atoms in the universe. We could  also
  // represent the number of ways you can pick any three individual atoms at
@ -5872,10 +5872,10 @@ static bool parse_float_strtod(const char *ptr, double *outDouble) {
  // If you consume a large value and you map it to "infinity", you will no
  // longer be able to serialize back a standard-compliant JSON. And there is
  // no realistic application where you might need values so large than they
-  // can't fit in binary64. The maximal value is about  1.7976931348623157 ×
+  // can't fit in binary64. The maximal value is about  1.7976931348623157 x
  // 10^308 It is an unimaginable large number. There will never be any piece of
  // engineering involving as many as 10^308 parts. It is estimated that there
-  // are about 10^80 atoms in the universe.  The estimate for the total number
+  // are about 10^80 atoms in the universe.  The estimate for the total number
  // of electrons is similar. Using a double-precision floating-point value, we
  // can represent easily the number of atoms in the universe. We could  also
  // represent the number of ways you can pick any three individual atoms at
@ -8142,10 +8142,10 @@ namespace utf8_validation {
  // This algorithm compares *expected* continuation characters with *actual* continuation bytes,
  // and emits an error anytime there is a mismatch.
  //
-  // For example, in the string "𝄞₿֏ab", which has a 4-, 3-, 2- and 1-byte
+  // For example, in the string "ab", which has a 4-, 3-, 2- and 1-byte
  // characters, the file will look like this:
  //
-  // | Character             | 𝄞  |    |    |    | ₿  |    |    | ֏  |    | a  | b  |
+  // | Character             |   |    |    |    |   |    |    |   |    | a  | b  |
  // |-----------------------|----|----|----|----|----|----|----|----|----|----|----|
  // | Character Length      |  4 |    |    |    |  3 |    |    |  2 |    |  1 |  1 |
  // | Byte                  | F0 | 9D | 84 | 9E | E2 | 82 | BF | D6 | 8F | 61 | 62 |
@ -9015,10 +9015,10 @@ static bool parse_float_strtod(const char *ptr, double *outDouble) {
  // If you consume a large value and you map it to "infinity", you will no
  // longer be able to serialize back a standard-compliant JSON. And there is
  // no realistic application where you might need values so large than they
-  // can't fit in binary64. The maximal value is about  1.7976931348623157 ×
+  // can't fit in binary64. The maximal value is about  1.7976931348623157 x
  // 10^308 It is an unimaginable large number. There will never be any piece of
  // engineering involving as many as 10^308 parts. It is estimated that there
-  // are about 10^80 atoms in the universe.  The estimate for the total number
+  // are about 10^80 atoms in the universe.  The estimate for the total number
  // of electrons is similar. Using a double-precision floating-point value, we
  // can represent easily the number of atoms in the universe. We could  also
  // represent the number of ways you can pick any three individual atoms at
@ -11254,10 +11254,10 @@ namespace utf8_validation {
  // This algorithm compares *expected* continuation characters with *actual* continuation bytes,
  // and emits an error anytime there is a mismatch.
  //
-  // For example, in the string "𝄞₿֏ab", which has a 4-, 3-, 2- and 1-byte
+  // For example, in the string "ab", which has a 4-, 3-, 2- and 1-byte
  // characters, the file will look like this:
  //
-  // | Character             | 𝄞  |    |    |    | ₿  |    |    | ֏  |    | a  | b  |
+  // | Character             |   |    |    |    |   |    |    |   |    | a  | b  |
  // |-----------------------|----|----|----|----|----|----|----|----|----|----|----|
  // | Character Length      |  4 |    |    |    |  3 |    |    |  2 |    |  1 |  1 |
  // | Byte                  | F0 | 9D | 84 | 9E | E2 | 82 | BF | D6 | 8F | 61 | 62 |
@ -12130,10 +12130,10 @@ static bool parse_float_strtod(const char *ptr, double *outDouble) {
  // If you consume a large value and you map it to "infinity", you will no
  // longer be able to serialize back a standard-compliant JSON. And there is
  // no realistic application where you might need values so large than they
-  // can't fit in binary64. The maximal value is about  1.7976931348623157 ×
+  // can't fit in binary64. The maximal value is about  1.7976931348623157 x
  // 10^308 It is an unimaginable large number. There will never be any piece of
  // engineering involving as many as 10^308 parts. It is estimated that there
-  // are about 10^80 atoms in the universe.  The estimate for the total number
+  // are about 10^80 atoms in the universe.  The estimate for the total number
  // of electrons is similar. Using a double-precision floating-point value, we
  // can represent easily the number of atoms in the universe. We could  also
  // represent the number of ways you can pick any three individual atoms at
--- a/singleheader/simdjson.h
+++ b/singleheader/simdjson.h
@ -1,4 +1,4 @@
-/* auto-generated on Wed May 20 10:23:07 EDT 2020. Do not edit! */
+/* auto-generated on Thu 21 May 2020 14:01:15 EDT. Do not edit! */
 /* begin file include/simdjson.h */
 #ifndef SIMDJSON_H
 #define SIMDJSON_H
--- a/src/generic/stage1/utf8_lookup2_algorithm.h
+++ b/src/generic/stage1/utf8_lookup2_algorithm.h
@ -47,9 +47,9 @@
 //   support values with more than 23 bits (which a 4-byte character supports).
 //
 //   e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000)
-//   
+//
 // Legal utf-8 byte sequences per  http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94:
-// 
+//
 //   Code Points        1st       2s       3s       4s
 //  U+0000..U+007F     00..7F
 //  U+0080..U+07FF     C2..DF   80..BF
@ -64,6 +64,7 @@
 using namespace simd;

 namespace utf8_validation {
+  // For a detailed description of the lookup2 algorithm, see the file HACKING.md under "UTF-8 validation (lookup2)".

  //
  // Find special case UTF-8 errors where the character is technically readable (has the right length)
@ -108,7 +109,7 @@ namespace utf8_validation {

    const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // [0___]____ (ASCII)
-      0, 0, 0, 0,                          
+      0, 0, 0, 0,
      0, 0, 0, 0,
      // [10__]____ (continuation)
      0, 0, 0, 0,
@ -139,214 +140,6 @@ namespace utf8_validation {
    return byte_1_high & byte_1_low & byte_2_high;
  }

-  //
-  // Validate the length of multibyte characters (that each multibyte character has the right number
-  // of continuation characters, and that all continuation characters are part of a multibyte
-  // character).
-  //
-  // Algorithm
-  // =========
-  //
-  // This algorithm compares *expected* continuation characters with *actual* continuation bytes,
-  // and emits an error anytime there is a mismatch.
-  //
-  // For example, in the string "𝄞₿֏ab", which has a 4-, 3-, 2- and 1-byte
-  // characters, the file will look like this:
-  //
-  // | Character             | 𝄞  |    |    |    | ₿  |    |    | ֏  |    | a  | b  |
-  // |-----------------------|----|----|----|----|----|----|----|----|----|----|----|
-  // | Character Length      |  4 |    |    |    |  3 |    |    |  2 |    |  1 |  1 |
-  // | Byte                  | F0 | 9D | 84 | 9E | E2 | 82 | BF | D6 | 8F | 61 | 62 |
-  // | is_second_byte        |    |  X |    |    |    |  X |    |    |  X |    |    |
-  // | is_third_byte         |    |    |  X |    |    |    |  X |    |    |    |    |
-  // | is_fourth_byte        |    |    |    |  X |    |    |    |    |    |    |    |
-  // | expected_continuation |    |  X |  X |  X |    |  X |  X |    |  X |    |    |
-  // | is_continuation       |    |  X |  X |  X |    |  X |  X |    |  X |    |    |
-  //
-  // The errors here are basically (Second Byte OR Third Byte OR Fourth Byte == Continuation):
-  //
-  // - **Extra Continuations:** Any continuation that is not a second, third or fourth byte is not
-  //   part of a valid 2-, 3- or 4-byte character and is thus an error. It could be that it's just
-  //   floating around extra outside of any character, or that there is an illegal 5-byte character,
-  //   or maybe it's at the beginning of the file before any characters have started; but it's an
-  //   error in all these cases.
-  // - **Missing Continuations:** Any second, third or fourth byte that *isn't* a continuation is an error, because that means
-  //   we started a new character before we were finished with the current one.
-  //
-  // Getting the Previous Bytes
-  // --------------------------
-  //
-  // Because we want to know if a byte is the *second* (or third, or fourth) byte of a multibyte
-  // character, we need to "shift the bytes" to find that out. This is what they mean:
-  //
-  // - `is_continuation`: if the current byte is a continuation.
-  // - `is_second_byte`: if 1 byte back is the start of a 2-, 3- or 4-byte character.
-  // - `is_third_byte`: if 2 bytes back is the start of a 3- or 4-byte character.
-  // - `is_fourth_byte`: if 3 bytes back is the start of a 4-byte character.
-  //
-  // We use shuffles to go n bytes back, selecting part of the current `input` and part of the
-  // `prev_input` (search for `.prev<1>`, `.prev<2>`, etc.). These are passed in by the caller
-  // function, because the 1-byte-back data is used by other checks as well.
-  //
-  // Getting the Continuation Mask
-  // -----------------------------
-  //
-  // Once we have the right bytes, we have to get the masks. To do this, we treat UTF-8 bytes as
-  // numbers, using signed `<` and `>` operations to check if they are continuations or leads.
-  // In fact, we treat the numbers as *signed*, partly because it helps us, and partly because
-  // Intel's SIMD presently only offers signed `<` and `>` operations (not unsigned ones).
-  //
-  // In UTF-8, bytes that start with the bits 110, 1110 and 11110 are 2-, 3- and 4-byte "leads,"
-  // respectively, meaning they expect to have 1, 2 and 3 "continuation bytes" after them.
-  // Continuation bytes start with 10, and ASCII (1-byte characters) starts with 0.
-  //
-  // When treated as signed numbers, they look like this:
-  //
-  // | Type         | High Bits  | Binary Range | Signed |
-  // |--------------|------------|--------------|--------|
-  // | ASCII        | `0`        | `01111111`   |   127  |
-  // |              |            | `00000000`   |     0  |
-  // | 4+-Byte Lead | `1111`     | `11111111`   |    -1  |
-  // |              |            | `11110000    |   -16  |
-  // | 3-Byte Lead  | `1110`     | `11101111`   |   -17  |
-  // |              |            | `11100000    |   -32  |
-  // | 2-Byte Lead  | `110`      | `11011111`   |   -33  |
-  // |              |            | `11000000    |   -64  |
-  // | Continuation | `10`       | `10111111`   |   -65  |
-  // |              |            | `10000000    |  -128  |
-  //
-  // This makes it pretty easy to get the continuation mask! It's just a single comparison:
-  //
-  // ```
-  // is_continuation = input < -64`
-  // ```
-  //
-  // We can do something similar for the others, but it takes two comparisons instead of one: "is
-  // the start of a 4-byte character" is `< -32` and `> -65`, for example. And 2+ bytes is `< 0` and
-  // `> -64`. Surely we can do better, they're right next to each other!
-  //
-  // Getting the is_xxx Masks: Shifting the Range
-  // --------------------------------------------
-  //
-  // Notice *why* continuations were a single comparison. The actual *range* would require two
-  // comparisons--`< -64` and `> -129`--but all characters are always greater than -128, so we get
-  // that for free. In fact, if we had *unsigned* comparisons, 2+, 3+ and 4+ comparisons would be
-  // just as easy: 4+ would be `> 239`, 3+ would be `> 223`, and 2+ would be `> 191`.
-  //
-  // Instead, we add 128 to each byte, shifting the range up to make comparison easy. This wraps
-  // ASCII down into the negative, and puts 4+-Byte Lead at the top:
-  //
-  // | Type                 | High Bits  | Binary Range | Signed |
-  // |----------------------|------------|--------------|-------|
-  // | 4+-Byte Lead (+ 127) | `0111`     | `01111111`   |   127 |
-  // |                      |            | `01110000    |   112 |
-  // |----------------------|------------|--------------|-------|
-  // | 3-Byte Lead (+ 127)  | `0110`     | `01101111`   |   111 |
-  // |                      |            | `01100000    |    96 |
-  // |----------------------|------------|--------------|-------|
-  // | 2-Byte Lead (+ 127)  | `010`      | `01011111`   |    95 |
-  // |                      |            | `01000000    |    64 |
-  // |----------------------|------------|--------------|-------|
-  // | Continuation (+ 127) | `00`       | `00111111`   |    63 |
-  // |                      |            | `00000000    |     0 |
-  // |----------------------|------------|--------------|-------|
-  // | ASCII (+ 127)        | `1`        | `11111111`   |    -1 |
-  // |                      |            | `10000000`   |  -128 |
-  // |----------------------|------------|--------------|-------|
-  // 
-  // *Now* we can use signed `>` on all of them:
-  //
-  // ```
-  // prev1 = input.prev<1>
-  // prev2 = input.prev<2>
-  // prev3 = input.prev<3>
-  // prev1_flipped = input.prev<1>(prev_input) ^ 0x80; // Same as `+ 128`
-  // prev2_flipped = input.prev<2>(prev_input) ^ 0x80; // Same as `+ 128`
-  // prev3_flipped = input.prev<3>(prev_input) ^ 0x80; // Same as `+ 128`
-  // is_second_byte = prev1_flipped > 63;  // 2+-byte lead
-  // is_third_byte  = prev2_flipped > 95;  // 3+-byte lead
-  // is_fourth_byte = prev3_flipped > 111; // 4+-byte lead
-  // ```
-  //
-  // NOTE: we use `^ 0x80` instead of `+ 128` in the code, which accomplishes the same thing, and even takes the same number
-  // of cycles as `+`, but on many Intel architectures can be parallelized better (you can do 3
-  // `^`'s at a time on Haswell, but only 2 `+`'s).
-  //
-  // That doesn't look like it saved us any instructions, did it? Well, because we're adding the
-  // same number to all of them, we can save one of those `+ 128` operations by assembling
-  // `prev2_flipped` out of prev 1 and prev 3 instead of assembling it from input and adding 128
-  // to it. One more instruction saved!
-  //
-  // ```
-  // prev1 = input.prev<1>
-  // prev3 = input.prev<3>
-  // prev1_flipped = prev1 ^ 0x80; // Same as `+ 128`
-  // prev3_flipped = prev3 ^ 0x80; // Same as `+ 128`
-  // prev2_flipped = prev1_flipped.concat<2>(prev3_flipped): // <shuffle: take the first 2 bytes from prev1 and the rest from prev3  
-  // ```
-  //
-  // ### Bringing It All Together: Detecting the Errors
-  //
-  // At this point, we have `is_continuation`, `is_first_byte`, `is_second_byte` and `is_third_byte`.
-  // All we have left to do is check if they match!
-  //
-  // ```
-  // return (is_second_byte | is_third_byte | is_fourth_byte) ^ is_continuation;
-  // ```
-  //
-  // But wait--there's more. The above statement is only 3 operations, but they *cannot be done in
-  // parallel*. You have to do 2 `|`'s and then 1 `&`. Haswell, at least, has 3 ports that can do
-  // bitwise operations, and we're only using 1!
-  //
-  // Epilogue: Addition For Booleans
-  // -------------------------------
-  //
-  // There is one big case the above code doesn't explicitly talk about--what if is_second_byte
-  // and is_third_byte are BOTH true? That means there is a 3-byte and 2-byte character right next
-  // to each other (or any combination), and the continuation could be part of either of them!
-  // Our algorithm using `&` and `|` won't detect that the continuation byte is problematic.
-  //
-  // Never fear, though. If that situation occurs, we'll already have detected that the second
-  // leading byte was an error, because it was supposed to be a part of the preceding multibyte
-  // character, but it *wasn't a continuation*.
-  //
-  // We could stop here, but it turns out that we can fix it using `+` and `-` instead of `|` and
-  // `&`, which is both interesting and possibly useful (even though we're not using it here). It
-  // exploits the fact that in SIMD, a *true* value is -1, and a *false* value is 0. So those
-  // comparisons were giving us numbers!
-  //
-  // Given that, if you do `is_second_byte + is_third_byte + is_fourth_byte`, under normal
-  // circumstances you will either get 0 (0 + 0 + 0) or -1 (-1 + 0 + 0, etc.). Thus,
-  // `(is_second_byte + is_third_byte + is_fourth_byte) - is_continuation` will yield 0 only if
-  // *both* or *neither* are 0 (0-0 or -1 - -1). You'll get 1 or -1 if they are different. Because
-  // *any* nonzero value is treated as an error (not just -1), we're just fine here :)
-  //
-  // Further, if *more than one* multibyte character overlaps,
-  // `is_second_byte + is_third_byte + is_fourth_byte` will be -2 or -3! Subtracting `is_continuation`
-  // from *that* is guaranteed to give you a nonzero value (-1, -2 or -3). So it'll always be
-  // considered an error.
-  //
-  // One reason you might want to do this is parallelism. ^ and | are not associative, so
-  // (A | B | C) ^ D will always be three operations in a row: either you do A | B -> | C -> ^ D, or
-  // you do B | C -> | A -> ^ D. But addition and subtraction *are* associative: (A + B + C) - D can
-  // be written as `(A + B) + (C - D)`. This means you can do A + B and C - D at the same time, and
-  // then adds the result together. Same number of operations, but if the processor can run
-  // independent things in parallel (which most can), it runs faster.
-  //
-  // This doesn't help us on Intel, but might help us elsewhere: on Haswell, at least, | and ^ have
-  // a super nice advantage in that more of them can be run at the same time (they can run on 3
-  // ports, while + and - can run on 2)! This means that we can do A | B while we're still doing C,
-  // saving us the cycle we would have earned by using +. Even more, using an instruction with a
-  // wider array of ports can help *other* code run ahead, too, since these instructions can "get
-  // out of the way," running on a port other instructions can't.
-  // 
-  // Epilogue II: One More Trick
-  // ---------------------------
-  //
-  // There's one more relevant trick up our sleeve, it turns out: it turns out on Intel we can "pay
-  // for" the (prev<1> + 128) instruction, because it can be used to save an instruction in
-  // check_special_cases()--but we'll talk about that there :)
-  //
  really_inline simd8<uint8_t> check_multibyte_lengths(simd8<uint8_t> input, simd8<uint8_t> prev_input, simd8<uint8_t> prev1) {
    simd8<uint8_t> prev2 = input.prev<2>(prev_input);
    simd8<uint8_t> prev3 = input.prev<3>(prev_input);
@ -422,4 +215,4 @@ namespace utf8_validation {
  }; // struct utf8_checker
 }

-using utf8_validation::utf8_checker;
+using utf8_validation::utf8_checker;
--- a/src/generic/stage2/numberparsing.h
+++ b/src/generic/stage2/numberparsing.h
@ -191,10 +191,10 @@ static bool parse_float_strtod(const char *ptr, double *outDouble) {
  // If you consume a large value and you map it to "infinity", you will no
  // longer be able to serialize back a standard-compliant JSON. And there is
  // no realistic application where you might need values so large that they
-  // can't fit in binary64. The maximal value is about  1.7976931348623157 ×
+  // can't fit in binary64. The maximal value is about  1.7976931348623157 x
  // 10^308. It is an unimaginably large number. There will never be any piece of
  // engineering involving as many as 10^308 parts. It is estimated that there
-  // are about 10^80 atoms in the universe.  The estimate for the total number
+  // are about 10^80 atoms in the universe.  The estimate for the total number
  // of electrons is similar. Using a double-precision floating-point value, we
  // can represent easily the number of atoms in the universe. We could  also
  // represent the number of ways you can pick any three individual atoms at
--- a/tests/basictests.cpp
+++ b/tests/basictests.cpp
@ -10,7 +10,7 @@
 #include <string_view>
 #include <sstream>
 #include <utility>
-#include <ciso646> 
+#include <ciso646>
 #include <unistd.h>

 #include "simdjson.h"
@ -62,7 +62,7 @@ namespace number_tests {
          std::cerr << "JSON '" << str << "' parsed to " << actual << " instead of " << i << std::endl;
          return false;
        }
-      } 
+      }
    }
    return true;
  }
@ -79,7 +79,7 @@ namespace number_tests {
      fflush(NULL);
      auto [actual, error] = parser.parse(buf, n).get<double>();
      if (error) { std::cerr << error << std::endl; return false; }
-      uint64_t ulp = f64_ulp_dist(actual,expected);  
+      uint64_t ulp = f64_ulp_dist(actual,expected);
      if(ulp > maxulp) maxulp = ulp;
      if(ulp > 0) {
        std::cerr << "JSON '" << buf << " parsed to " << actual << " instead of " << expected << std::endl;
@ -452,8 +452,8 @@ namespace document_stream_tests {
      size_t n = snprintf(buf,
                        sizeof(buf),
                      "{\"id\": %zu, \"name\": \"name%zu\", \"gender\": \"%s\", "
-                      "\"été\": {\"id\": %zu, \"name\": \"éventail%zu\"}}",
-                      i, i, (i % 2) ? "⺃" : "⺕", i % 10, i % 10);
+                      "\"\xC3\xA9t\xC3\xA9\": {\"id\": %zu, \"name\": \"\xC3\xA9ventail%zu\"}}",
+                      i, i, (i % 2) ? "\xE2\xBA\x83" : "\xE2\xBA\x95", i % 10, i % 10);
      if (n >= sizeof(buf)) { abort(); }
      data += std::string(buf, n);
    }
@ -678,7 +678,7 @@ namespace dom_api_tests {
    }
    if (iter.move_to_key_insensitive("bad key")) {
      printf("We should not move to a non-existing key\n");
-      return false;    
+      return false;
    }
    if (!iter.is_object()) {
      printf("We should have remained at the object.\n");
@ -726,7 +726,7 @@ namespace dom_api_tests {
    }
    if (!iter.move_to_key("IDs")) {
      printf("We should be able to move to an existing key\n");
-      return false;    
+      return false;
    }
    if (!iter.is_array()) {
      printf("Value of IDs should be array, it is %c \n", iter.get_type());
@ -734,7 +734,7 @@ namespace dom_api_tests {
    }
    if (iter.move_to_index(4)) {
      printf("We should not be able to move to a non-existing index\n");
-      return false;    
+      return false;
    }
    if (!iter.is_array()) {
      printf("We should have remained at the array\n");
@ -1930,8 +1930,8 @@ int main(int argc, char *argv[]) {

  // this is put here deliberately to check that the documentation is correct (README),
  // should this fail to compile, you should update the documentation:
-  if (simdjson::active_implementation->name() == "unsupported") { 
-    printf("unsupported CPU\n"); 
+  if (simdjson::active_implementation->name() == "unsupported") {
+    printf("unsupported CPU\n");
  }
  std::cout << "Running basic tests." << std::endl;
  if (parse_api_tests::run() &&