Merge pull request #959 from simdjson/dlemire/utf8_val

Expose the UTF-8 string validation functions
2020-06-22 21:34:02 -04:00 · 2020-06-22 21:34:02 -04:00 · 0062e54e93
parent c25928e44f dada5090b0
commit 0062e54e93
17 changed files with 478 additions and 8 deletions
--- a/doc/basics.md
+++ b/doc/basics.md
@ -188,6 +188,23 @@ In some cases, you may have valid JSON strings that you do not wish to parse but

 Though it does not validate the JSON input, it will detect when the document ends with an unterminated string. E.g., it would refuse to minify the string `"this string is not terminated` because of the missing final quote.

+
+UTF-8 validation (alone)
+----------------------
+
+The simdjson library has fast functions to validate UTF-8 strings. They are many times faster than most functions commonly found in libraries. You can use our fast functions, even if you do not care about JSON.
+
+```C++
+  const char * some_string = "[ 1, 2, 3, 4] ";
+  size_t length = strlen(some_string);
+  bool is_ok = simdjson::validate_utf8(some_string, length);
+```
+
+The UTF-8 validation function merely checks that the input is valid UTF-8: it works with strings in general, not just JSON strings.
+
+Your input string does not need any padding. Any string will do. The `validate_utf8` function does not do any memory allocation on the heap, and it does not throw exceptions.
+
+
 C++17 Support
 -------------

--- a/include/simdjson/implementation.h
+++ b/include/simdjson/implementation.h
@ -10,6 +10,36 @@

 namespace simdjson {

+/**
+ * Validate the UTF-8 string.
+ *
+ * The request is forwarded to the currently active implementation. The
+ * input does not need any padding, and the function is noexcept.
+ *
+ * @param buf the string to validate.
+ * @param len the length of the string in bytes.
+ * @return true if the string is valid UTF-8.
+ */
+WARN_UNUSED bool validate_utf8(const char * buf, size_t len) noexcept;
+
+
+/**
+ * Check whether the bytes referenced by a string_view form valid UTF-8.
+ *
+ * Forwards to the pointer/length overload.
+ *
+ * @param sv view over the bytes to check.
+ * @return true if the viewed bytes are valid UTF-8.
+ */
+really_inline WARN_UNUSED bool validate_utf8(const std::string_view sv) noexcept {
+  const char * const bytes = sv.data();
+  const size_t byte_count = sv.size();
+  return validate_utf8(bytes, byte_count);
+}
+
+/**
+ * Validate the UTF-8 string.
+ *
+ * Forwards the string's data pointer and size to the pointer/length overload.
+ *
+ * @param s the string to validate.
+ * @return true if the string is valid UTF-8.
+ */
+really_inline WARN_UNUSED bool validate_utf8(const std::string& s) noexcept {
+  return validate_utf8(s.data(), s.size());
+}
+
 namespace dom {
  class document;
 } // namespace dom
@ -83,6 +113,18 @@ public:
   * @return the error code, or SUCCESS if there was no error.
   */
  WARN_UNUSED virtual error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept = 0;
+  
+  
+  /**
+   * Validate the UTF-8 string.
+   *
+   * Overridden by each implementation. Note that the placeholder used for
+   * unsupported CPUs refuses to validate and returns false for every input.
+   *
+   * @param buf the string to validate.
+   * @param len the length of the string in bytes.
+   * @return true if and only if the string is valid UTF-8.
+   */
+  WARN_UNUSED virtual bool validate_utf8(const char *buf, size_t len) const noexcept = 0;

 protected:
  /** @private Construct an implementation with the given name and description. For subclasses. */
--- a/include/simdjson/simdjson.h
+++ b/include/simdjson/simdjson.h
@ -7,5 +7,4 @@

 #include "simdjson/compiler_check.h"
 #include "simdjson/error.h"
-
 #endif // SIMDJSON_H
--- a/src/arm64/dom_parser_implementation.cpp
+++ b/src/arm64/dom_parser_implementation.cpp
@ -105,7 +105,10 @@ WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, si
  this->len = _len;
  return arm64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming);
 }
-
+#include "generic/stage1/utf8_validator.h"
+WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+  // Hand the bytes to the generic validator compiled for the arm64 kernel.
+  const bool is_valid_utf8 = simdjson::arm64::stage1::generic_validate_utf8(buf, len);
+  return is_valid_utf8;
+}
 } // namespace arm64
 } // namespace simdjson

--- a/src/arm64/implementation.h
+++ b/src/arm64/implementation.h
@ -18,6 +18,7 @@ public:
    std::unique_ptr<internal::dom_parser_implementation>& dst
  ) const noexcept final;
  WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
+  WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final; // UTF-8 check for this kernel; defined in dom_parser_implementation.cpp
 };

 } // namespace arm64
--- a/src/fallback/dom_parser_implementation.cpp
+++ b/src/fallback/dom_parser_implementation.cpp
@ -244,6 +244,70 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
  return SUCCESS;
 }

+// credit: based on code from Google Fuchsia (Apache Licensed)
+/**
+ * Scalar UTF-8 validator.
+ *
+ * Walks the input one encoded character at a time and rejects truncated
+ * sequences, bad continuation bytes, overlong encodings, surrogates
+ * (U+D800..U+DFFF) and values above U+10FFFF. Runs of 16 ASCII bytes are
+ * skipped in a single step.
+ *
+ * @param buf the bytes to validate.
+ * @param len the number of bytes in buf.
+ * @return true if and only if the len bytes at buf are valid UTF-8.
+ */
+WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+  uint64_t pos = 0;
+  uint64_t next_pos = 0;
+  uint32_t code_point = 0;
+  while (pos < len) {
+    // Fast path: check whether the next 16 bytes are all ASCII.
+    next_pos = pos + 16;
+    if (next_pos <= len) { // it is safe to read 16 more bytes
+      uint64_t v1;
+      memcpy(&v1, data + pos, sizeof(uint64_t));
+      uint64_t v2;
+      memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+      uint64_t v{v1 | v2};
+      // No byte has its high bit set: all 16 bytes are ASCII.
+      if ((v & 0x8080808080808080) == 0) {
+        pos = next_pos;
+        continue;
+      }
+    }
+    unsigned char byte = data[pos];
+    if (byte < 0b10000000) {
+      // ASCII byte
+      pos++;
+      continue;
+    } else if ((byte & 0b11100000) == 0b11000000) {
+      // 2-byte sequence
+      next_pos = pos + 2;
+      if (next_pos > len) { return false; }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
+      // range check (rejects overlong encodings of values below 0x80)
+      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+      if (code_point < 0x80 || 0x7ff < code_point) { return false; }
+    } else if ((byte & 0b11110000) == 0b11100000) {
+      // 3-byte sequence
+      next_pos = pos + 3;
+      if (next_pos > len) { return false; }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
+      // range check (rejects overlong encodings and surrogates)
+      code_point = (byte & 0b00001111) << 12 |
+                   (data[pos + 1] & 0b00111111) << 6 |
+                   (data[pos + 2] & 0b00111111);
+      if (code_point < 0x800 || 0xffff < code_point ||
+          (0xd7ff < code_point && code_point < 0xe000)) {
+        return false;
+      }
+    } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
+      // 4-byte sequence
+      next_pos = pos + 4;
+      if (next_pos > len) { return false; }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
+      if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; }
+      // range check: a 4-byte sequence must encode U+10000..U+10FFFF.
+      // U+FFFF itself fits in 3 bytes, so its 4-byte form is overlong.
+      code_point =
+          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
+          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+      if (code_point <= 0xffff || 0x10ffff < code_point) { return false; }
+    } else {
+      // we may have a continuation
+      return false;
+    }
+    pos = next_pos;
+  }
+  return true;
+}
+
 } // namespace fallback
 } // namespace simdjson

--- a/src/fallback/implementation.h
+++ b/src/fallback/implementation.h
@ -22,6 +22,7 @@ public:
    std::unique_ptr<internal::dom_parser_implementation>& dst
  ) const noexcept final;
  WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
+  WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final; // scalar UTF-8 check; defined in dom_parser_implementation.cpp
 };

 } // namespace fallback
--- a/src/generic/stage1/utf8_validator.h
+++ b/src/generic/stage1/utf8_validator.h
@ -0,0 +1,26 @@
+namespace stage1 {
+/**
+ * Feeds `length` bytes of `input` to a `checker`, 64 bytes at a time, and
+ * reports whether the checker ended up without an error.
+ */
+template<class checker>
+bool generic_validate_utf8(const uint8_t * input, size_t length) {
+    checker validator{};
+    buf_block_reader<64> blocks(input, length);
+    // Complete 64-byte blocks are handed to the checker as they come.
+    for (; blocks.has_full_block(); blocks.advance()) {
+      simd::simd8x64<uint8_t> chunk(blocks.full_block());
+      validator.check_next_input(chunk);
+    }
+    // The leftover goes through a local 64-byte buffer filled by get_remainder().
+    uint8_t tail[64]{};
+    blocks.get_remainder(tail);
+    simd::simd8x64<uint8_t> last_chunk(tail);
+    validator.check_next_input(last_chunk);
+    blocks.advance();
+    return validator.errors() == error_code::SUCCESS;
+}
+
+/**
+ * Convenience overload for `char` input: validates with `utf8_checker`.
+ */
+bool generic_validate_utf8(const char * input, size_t length) {
+    const uint8_t * const bytes = (const uint8_t *)input;
+    return generic_validate_utf8<utf8_checker>(bytes, length);
+}
+
+} // namespace stage1
--- a/src/haswell/dom_parser_implementation.cpp
+++ b/src/haswell/dom_parser_implementation.cpp
@ -93,7 +93,10 @@ WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, si
  this->len = _len;
  return haswell::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming);
 }
-
+#include "generic/stage1/utf8_validator.h"
+WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+  // Hand the bytes to the generic validator compiled for the haswell kernel.
+  const bool is_valid_utf8 = simdjson::haswell::stage1::generic_validate_utf8(buf, len);
+  return is_valid_utf8;
+}
 } // namespace haswell
 } // namespace simdjson
 UNTARGET_REGION
--- a/src/haswell/implementation.h
+++ b/src/haswell/implementation.h
@ -20,6 +20,7 @@ public:
    std::unique_ptr<internal::dom_parser_implementation>& dst
  ) const noexcept final;
  WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
+  WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final; // UTF-8 check for this kernel; defined in dom_parser_implementation.cpp
 };

 } // namespace haswell
--- a/src/implementation.cpp
+++ b/src/implementation.cpp
@ -48,7 +48,9 @@ public:
  WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final {
    return set_best()->minify(buf, len, dst, dst_len);
  }
-
+  WARN_UNUSED bool validate_utf8(const char * buf, size_t len) const noexcept final override {
+    // Ask set_best() for the implementation to use, then let it do the validation.
+    const implementation *best = set_best();
+    return best->validate_utf8(buf, len);
+  }
  really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {}
 private:
  const implementation *set_best() const noexcept;
@ -83,10 +85,19 @@ public:
  ) const noexcept final {
    return UNSUPPORTED_ARCHITECTURE;
  }
-  WARN_UNUSED error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final {
+  WARN_UNUSED error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final override {
   return UNSUPPORTED_ARCHITECTURE; // the arguments are ignored: this placeholder never minifies
  }
-
+  WARN_UNUSED bool validate_utf8(const char *, size_t) const noexcept final override {
+    // Just refuse to validate: every string is flagged as invalid.
+    //
+    // Since a fallback implementation exists, unsupported_implementation is unlikely
+    // to ever be used. The alternative would be to return an error_code from which
+    // the user has to work out whether the string is valid UTF-8, which is a lot of
+    // work for a very unlikely case. And if it does happen, the programmer would
+    // need a fallback for our own fallback.
+    return false;
+  }
  unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {}
 };

@ -137,6 +148,9 @@ SIMDJSON_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> active_imple
 WARN_UNUSED error_code minify(const char *buf, size_t len, char *dst, size_t &dst_len) noexcept {
  return active_implementation->minify((const uint8_t *)buf, len, (uint8_t *)dst, dst_len);
 }
+WARN_UNUSED bool validate_utf8(const char *buf, size_t len) noexcept {
+  // Route the request through whichever implementation is currently active.
+  const bool is_valid_utf8 = active_implementation->validate_utf8(buf, len);
+  return is_valid_utf8;
+}


 } // namespace simdjson
--- a/src/westmere/dom_parser_implementation.cpp
+++ b/src/westmere/dom_parser_implementation.cpp
@ -94,7 +94,10 @@ WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, si
  this->len = _len;
  return westmere::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming);
 }
-
+#include "generic/stage1/utf8_validator.h"
+WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+  // Hand the bytes to the generic validator compiled for the westmere kernel.
+  const bool is_valid_utf8 = simdjson::westmere::stage1::generic_validate_utf8(buf, len);
+  return is_valid_utf8;
+}
 } // namespace westmere
 } // namespace simdjson
 UNTARGET_REGION
--- a/src/westmere/implementation.h
+++ b/src/westmere/implementation.h
@ -19,6 +19,7 @@ public:
    std::unique_ptr<internal::dom_parser_implementation>& dst
  ) const noexcept final;
  WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
+  WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final; // UTF-8 check for this kernel; defined in dom_parser_implementation.cpp
 };

 } // namespace westmere
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -58,6 +58,8 @@ add_cpp_test(jsoncheck LABELS acceptance per_implementation)
 add_cpp_test(parse_many_test LABELS acceptance per_implementation)
 add_cpp_test(pointercheck LABELS acceptance per_implementation)
 add_cpp_test(extracting_values_example LABELS acceptance per_implementation)
+add_cpp_test(unicode_tests LABELS acceptance per_implementation)
+
 find_program(BASH bash)


--- a/tests/basictests.cpp
+++ b/tests/basictests.cpp
@ -1649,6 +1649,31 @@ namespace type_tests {
 }


+namespace validate_tests {
+  // An ASCII-only JSON document must be reported as valid UTF-8.
+  bool test_validate() {
+    std::cout << "Running " << __func__ << std::endl;
+    const std::string test = R"({ "foo" : 1, "bar" : [ 1, 2, 3 ], "baz": { "a": 1, "b": 2, "c": 3 } })";
+    const bool accepted = simdjson::validate_utf8(test.data(), test.size());
+    return accepted;
+  }
+
+  // Two bare continuation bytes must be reported as invalid UTF-8.
+  bool test_bad_validate() {
+    std::cout << "Running " << __func__ << std::endl;
+    const std::string test = "\x80\x81";
+    const bool accepted = simdjson::validate_utf8(test.data(), test.size());
+    return !accepted;
+  }
+
+  // Runs both checks in order, stopping at the first failure.
+  bool run() {
+    if (!test_validate()) { return false; }
+    return test_bad_validate();
+  }
+}
+
+

 namespace minify_tests {

@ -1960,7 +1985,8 @@ int main(int argc, char *argv[]) {
    printf("unsupported CPU\n");
  }
  std::cout << "Running basic tests." << std::endl;
-  if (minify_tests::run() &&
+  if (validate_tests::run() &&
+      minify_tests::run() &&
      parse_api_tests::run() &&
      dom_api_tests::run() &&
      type_tests::run() &&
--- a/tests/readme_examples.cpp
+++ b/tests/readme_examples.cpp
@ -265,6 +265,27 @@ void minify() {
  }
 }

+bool is_correct() { // mirrors the UTF-8 validation snippet in doc/basics.md
+  const char * some_string = "[ 1, 2, 3, 4] ";
+  size_t length = strlen(some_string); // length in bytes, without the terminating NUL
+  bool is_ok = simdjson::validate_utf8(some_string, length);
+  return is_ok;
+}
+
+// Example: validate a C string through the std::string_view overload.
+bool is_correct_string_view() {
+  const char * const text = "[ 1, 2, 3, 4] ";
+  const std::string_view view(text, strlen(text));
+  return simdjson::validate_utf8(view);
+}
+
+// Example: validate a std::string directly.
+bool is_correct_string() {
+  const std::string text{"[ 1, 2, 3, 4] "};
+  return simdjson::validate_utf8(text);
+}
+
 int main() {
  basics_dom_1();
  basics_dom_2();
--- a/tests/unicode_tests.cpp
+++ b/tests/unicode_tests.cpp
@ -0,0 +1,246 @@
+#include "simdjson.h"
+#include <cstddef>
+#include <cstdint>
+#include <random>
+
+class RandomUTF8 final { // generator of random UTF-8 byte sequences for the tests in this file
+public:
+  RandomUTF8(std::random_device &rd, int prob_1byte, int prob_2bytes,
+             int prob_3bytes, int prob_4bytes); // prob_* are the relative weights of 1- to 4-byte sequences
+
+  std::vector<uint8_t> generate(size_t output_bytes); // at least output_bytes of UTF-8, followed by one 0 byte
+  std::vector<uint8_t> generate(size_t output_bytes, long seed); // same, after reseeding the engine with seed
+
+private:
+  uint32_t generate(); // NOTE(review): declared, but no definition is visible in this file
+
+  std::mt19937 gen; // engine, seeded from the random_device in the constructor
+  std::discrete_distribution<> bytes_count; // draws 0..3, i.e. a sequence length of 1..4 bytes
+  std::uniform_int_distribution<int> val_7bit{0x00, 0x7f}; // 0b0xxxxxxx
+  std::uniform_int_distribution<int> val_6bit{0x00, 0x3f}; // 0b10xxxxxx
+  std::uniform_int_distribution<int> val_5bit{0x00, 0x1f}; // 0b110xxxxx
+  std::uniform_int_distribution<int> val_4bit{0x00, 0x0f}; // 0b1110xxxx
+  std::uniform_int_distribution<int> val_3bit{0x00, 0x07}; // 0b11110xxx
+};
+
+// Seeds the engine from rd and uses the four arguments as the relative
+// weights of 1-, 2-, 3- and 4-byte sequences.
+RandomUTF8::RandomUTF8(std::random_device &rd, int prob_1byte, int prob_2bytes,
+                       int prob_3bytes, int prob_4bytes)
+    : gen(rd()),
+      bytes_count({static_cast<double>(prob_1byte),
+                   static_cast<double>(prob_2bytes),
+                   static_cast<double>(prob_3bytes),
+                   static_cast<double>(prob_4bytes)}) {}
+
+// Produces random valid UTF-8: sequences are appended until at least
+// output_bytes bytes are present, then a single 0 byte is added.
+std::vector<uint8_t> RandomUTF8::generate(size_t output_bytes) {
+  std::vector<uint8_t> bytes;
+  bytes.reserve(output_bytes);
+  // Draws one random continuation byte (0b10xxxxxx).
+  auto next_continuation = [this]() { return uint8_t(0x80 | val_6bit(gen)); };
+  while (bytes.size() < output_bytes) {
+    switch (bytes_count(gen)) {
+    case 0: { // 1 byte
+      uint8_t ascii;
+      do { // though strictly speaking, a stream of nulls is
+           // UTF8, it tends to break some code
+        ascii = uint8_t(val_7bit(gen));
+      } while (ascii == 0);
+      bytes.push_back(ascii);
+      break;
+    }
+    case 1: { // 2 bytes
+      uint8_t lead;
+      do { // leads below 0xC2 are drawn again
+        lead = uint8_t(0xc0 | val_5bit(gen));
+      } while (lead < 0xC2);
+      bytes.push_back(lead);
+      bytes.push_back(next_continuation());
+      break;
+    }
+    case 2: { // 3 bytes
+      const uint8_t lead = uint8_t(0xe0 | val_4bit(gen));
+      bytes.push_back(lead);
+      uint8_t second = next_continuation();
+      if (lead == 0xE0) {
+        // after 0xE0 the second byte must be at least 0xA0
+        while (second < 0xA0) { second = next_continuation(); }
+      } else if (lead == 0xED) {
+        // after 0xED the second byte must be at most 0x9F
+        while (second > 0x9F) { second = next_continuation(); }
+      }
+      bytes.push_back(second);
+      bytes.push_back(next_continuation());
+      break;
+    }
+    case 3: { // 4 bytes
+      uint8_t lead;
+      do { // leads above 0xF4 are drawn again
+        lead = uint8_t(0xf0 | val_3bit(gen));
+      } while (lead > 0xF4);
+      bytes.push_back(lead);
+      uint8_t second = next_continuation();
+      if (lead == 0xF0) {
+        // after 0xF0 the second byte must be at least 0x90
+        while (second < 0x90) { second = next_continuation(); }
+      } else if (lead == 0xF4) {
+        // after 0xF4 the second byte must be at most 0x8F
+        while (second > 0x8F) { second = next_continuation(); }
+      }
+      bytes.push_back(second);
+      bytes.push_back(next_continuation());
+      bytes.push_back(next_continuation());
+      break;
+    }
+    }
+  }
+  bytes.push_back(0); // EOS for scalar code
+
+  return bytes;
+}
+
+// Reseeds the engine with seed (truncated to 32 bits), then generates as usual.
+std::vector<uint8_t> RandomUTF8::generate(size_t output_bytes, long seed) {
+  const uint32_t engine_seed = uint32_t(seed);
+  gen.seed(engine_seed);
+  return generate(output_bytes);
+}
+
+// credit: based on code from Google Fuchsia (Apache Licensed)
+/**
+ * Reference scalar UTF-8 validator that the brute-force test compares
+ * simdjson::validate_utf8 against.
+ *
+ * Rejects truncated sequences, bad continuation bytes, overlong encodings,
+ * surrogates (U+D800..U+DFFF) and values above U+10FFFF.
+ *
+ * @param buf the bytes to validate.
+ * @param len the number of bytes in buf.
+ * @return true if and only if the len bytes at buf are valid UTF-8.
+ */
+WARN_UNUSED bool basic_validate_utf8(const char *buf, size_t len) noexcept {
+  const uint8_t *data = (const uint8_t *)buf;
+  uint64_t pos = 0;
+  uint64_t next_pos = 0;
+  uint32_t code_point = 0;
+  while (pos < len) {
+    unsigned char byte = data[pos];
+    if (byte < 0b10000000) {
+      // ASCII byte
+      pos++;
+      continue;
+    } else if ((byte & 0b11100000) == 0b11000000) {
+      // 2-byte sequence
+      next_pos = pos + 2;
+      if (next_pos > len) { return false; }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+        return false;
+      }
+      // range check
+      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+      if (code_point < 0x80 || 0x7ff < code_point) { return false; }
+    } else if ((byte & 0b11110000) == 0b11100000) {
+      // 3-byte sequence
+      next_pos = pos + 3;
+      if (next_pos > len) { return false; }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
+      // range check (rejects overlong encodings and surrogates)
+      code_point = (byte & 0b00001111) << 12 |
+                   (data[pos + 1] & 0b00111111) << 6 |
+                   (data[pos + 2] & 0b00111111);
+      if (code_point < 0x800 || 0xffff < code_point ||
+          (0xd7ff < code_point && code_point < 0xe000)) {
+        return false;
+      }
+    } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
+      // 4-byte sequence
+      next_pos = pos + 4;
+      if (next_pos > len) { return false; }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
+      if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; }
+      // range check: a 4-byte sequence must encode U+10000..U+10FFFF.
+      // U+FFFF itself fits in 3 bytes, so its 4-byte form is overlong.
+      code_point =
+          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
+          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+      if (code_point <= 0xffff || 0x10ffff < code_point) {
+        return false;
+      }
+    } else {
+      // we may have a continuation
+      return false;
+    }
+    pos = next_pos;
+  }
+  return true;
+}
+
+/**
+ * Differential test: generates random valid UTF-8, checks that simdjson
+ * accepts it, then repeatedly flips single bits and checks that simdjson and
+ * basic_validate_utf8 agree on the mutated buffer. Aborts on any mismatch.
+ */
+void brute_force_tests() {
+  printf("running brute-force UTF-8 tests... ");
+  fflush(NULL);
+  std::random_device rd{};
+  RandomUTF8 gen_1_2_3_4(rd, 1, 1, 1, 1);
+  size_t total = 1000;
+  for (size_t i = 0; i < total; i++) {
+
+    // The buffer is never empty: generate() always appends a trailing 0 byte.
+    auto UTF8 = gen_1_2_3_4.generate(rand() % 256);
+    if (!simdjson::validate_utf8((const char *)UTF8.data(), UTF8.size())) {
+      std::cerr << "bug" << std::endl;
+      abort();
+    }
+    for (size_t flip = 0; flip < 1000; ++flip) {
+      // we keep hacking the same buffer; it may or may not remain UTF-8
+      const int bitflip{1 << (rand() % 8)};
+      UTF8[rand() % UTF8.size()] ^= uint8_t(bitflip); // we flip exactly one bit
+      bool is_ok =
+          simdjson::validate_utf8((const char *)UTF8.data(), UTF8.size());
+      bool is_ok_basic =
+          basic_validate_utf8((const char *)UTF8.data(), UTF8.size());
+      if (is_ok != is_ok_basic) {
+        std::cerr << "bug" << std::endl;
+        abort();
+      }
+    }
+  }
+  printf("tests ok.\n");
+}
+
+/**
+ * Runs simdjson::validate_utf8 over hard-coded sequences: every entry of
+ * goodsequences must be accepted and every entry of badsequences rejected.
+ * Aborts on the first unexpected result.
+ */
+void test() {
+  printf("running hard-coded UTF-8 tests... ");
+  fflush(NULL);
+  // additional tests are from autobahn websocket testsuite
+  // https://github.com/crossbario/autobahn-testsuite/tree/master/autobahntestsuite/autobahntestsuite/case
+  const char *goodsequences[] = {"a",
+                                 "\xc3\xb1",
+                                 "\xe2\x82\xa1",
+                                 "\xf0\x90\x8c\xbc",
+                                 "\xc2\x80",         // 6.7.2
+                                 "\xf0\x90\x80\x80", // 6.7.4
+                                 "\xee\x80\x80",     // 6.11.2
+                                 "\xef\xbb\xbf"};
+  const char *badsequences[] = {
+      "\xc3\x28",                                 // 0
+      "\xa0\xa1",                                 // 1
+      "\xe2\x28\xa1",                             // 2
+      "\xe2\x82\x28",                             // 3
+      "\xf0\x28\x8c\xbc",                         // 4
+      "\xf0\x90\x28\xbc",                         // 5
+      "\xf0\x28\x8c\x28",                         // 6
+      "\xc0\x9f",                                 // 7
+      "\xf5\xff\xff\xff",                         // 8
+      "\xed\xa0\x81",                             // 9
+      "\xf8\x90\x80\x80\x80",                     // 10
+      "123456789012345\xed",                      // 11
+      "123456789012345\xf1",                      // 12
+      "123456789012345\xc2",                      // 13
+      "\xC2\x7F",                                 // 14
+      "\xce",                                     // 6.6.1
+      "\xce\xba\xe1",                             // 6.6.3
+      "\xce\xba\xe1\xbd",                         // 6.6.4
+      "\xce\xba\xe1\xbd\xb9\xcf",                 // 6.6.6
+      "\xce\xba\xe1\xbd\xb9\xcf\x83\xce",         // 6.6.8
+      "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce", // 6.6.10
+      "\xdf",                                     // 6.14.6
+      "\xef\xbf",                                 // 6.14.7
+      "\x80",
+      "\x91\x85\x95\x9e",
+      "\x6c\x02\x8e\x18"};
+  // Derive the counts from the arrays so that added entries are always
+  // exercised and removed entries never cause an out-of-bounds read.
+  const size_t good_count = sizeof(goodsequences) / sizeof(goodsequences[0]);
+  const size_t bad_count = sizeof(badsequences) / sizeof(badsequences[0]);
+  for (size_t i = 0; i < good_count; i++) {
+    size_t len = strlen(goodsequences[i]);
+    if (!simdjson::validate_utf8(goodsequences[i], len)) {
+      printf("bug goodsequences[%zu]\n", i);
+      abort();
+    }
+  }
+  for (size_t i = 0; i < bad_count; i++) {
+    size_t len = strlen(badsequences[i]);
+    if (simdjson::validate_utf8(badsequences[i], len)) {
+      printf("bug lookup2 badsequences[%zu]\n", i);
+      abort();
+    }
+  }
+  printf("tests ok.\n");
+}
+int main() { // runs both test suites; each one aborts the process on failure
+  brute_force_tests(); // randomized comparison against basic_validate_utf8
+  test();              // hard-coded good and bad sequences
+  return EXIT_SUCCESS;
+}