Merge pull request #959 from simdjson/dlemire/utf8_val

Expose the UTF-8 string validation functions
This commit is contained in:
Daniel Lemire 2020-06-22 21:34:02 -04:00 committed by GitHub
commit 0062e54e93
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
17 changed files with 478 additions and 8 deletions

View File

@ -188,6 +188,23 @@ In some cases, you may have valid JSON strings that you do not wish to parse but
Though it does not validate the JSON input, it will detect when the document ends with an unterminated string. E.g., it would refuse to minify the string `"this string is not terminated` because of the missing final quote.
UTF-8 validation (alone)
----------------------
The simdjson library has fast functions to validate UTF-8 strings. They are many times faster than most functions commonly found in libraries. You can use our fast functions, even if you do not care about JSON.
```C++
const char * some_string = "[ 1, 2, 3, 4] ";
size_t length = strlen(some_string);
bool is_ok = simdjson::validate_utf8(some_string, length);
```
The UTF-8 validation function merely checks that the input is valid UTF-8: it works with strings in general, not just JSON strings.
Your input string does not need any padding. Any string will do. The `validate_utf8` function does not do any memory allocation on the heap, and it does not throw exceptions.
C++17 Support
-------------

View File

@ -10,6 +10,36 @@
namespace simdjson {
/**
 * Validate the UTF-8 string.
 *
 * Only the declaration lives here. The function is declared noexcept and its
 * result is marked WARN_UNUSED, so callers are expected to check it.
 *
 * @param buf the string to validate.
 * @param len the length of the string in bytes.
 * @return true if the string is valid UTF-8.
 */
WARN_UNUSED bool validate_utf8(const char * buf, size_t len) noexcept;
/**
 * Validate the UTF-8 string.
 *
 * Convenience overload: forwards the view's data pointer and size to the
 * (buf, len) overload.
 *
 * @param sv the string_view to validate.
 * @return true if the string is valid UTF-8.
 */
really_inline WARN_UNUSED bool validate_utf8(const std::string_view sv) noexcept {
  return validate_utf8(sv.data(), sv.size());
}
/**
 * Validate the UTF-8 string.
 *
 * Convenience overload: forwards the string's data pointer and size to the
 * (buf, len) overload.
 *
 * @param s the string to validate.
 * @return true if the string is valid UTF-8.
 */
really_inline WARN_UNUSED bool validate_utf8(const std::string& s) noexcept {
  return validate_utf8(s.data(), s.size());
}
namespace dom {
  // Forward declaration only; the class is defined elsewhere.
  class document;
} // namespace dom
@ -83,6 +113,18 @@ public:
* @return the error code, or SUCCESS if there was no error.
*/
WARN_UNUSED virtual error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept = 0;
/**
 * Validate the UTF-8 string.
 *
 * Overridden by each implementation. Pure virtual: this base class provides
 * no default behavior.
 *
 * @param buf the string to validate.
 * @param len the length of the string in bytes.
 * @return true if and only if the string is valid UTF-8.
 */
WARN_UNUSED virtual bool validate_utf8(const char *buf, size_t len) const noexcept = 0;
protected:
/** @private Construct an implementation with the given name and description. For subclasses. */

View File

@ -7,5 +7,4 @@
#include "simdjson/compiler_check.h"
#include "simdjson/error.h"
#endif // SIMDJSON_H

View File

@ -105,7 +105,10 @@ WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, si
this->len = _len;
return arm64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming);
}
#include "generic/stage1/utf8_validator.h"
// arm64 UTF-8 validation: hands the buffer to the stage1 generic validator
// compiled inside the arm64 namespace (presumably supplied by the
// generic/stage1/utf8_validator.h include just above) and returns its verdict.
WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  return simdjson::arm64::stage1::generic_validate_utf8(buf,len);
}
} // namespace arm64
} // namespace simdjson

View File

@ -18,6 +18,7 @@ public:
std::unique_ptr<internal::dom_parser_implementation>& dst
) const noexcept final;
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
// UTF-8 validation entry point for this implementation; returns true when the
// len bytes at buf are valid UTF-8. Declared final, so it cannot be overridden further.
WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final;
};
} // namespace arm64

View File

@ -244,6 +244,70 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
return SUCCESS;
}
// credit: based on code from Google Fuchsia (Apache Licensed)
/**
 * Scalar (non-SIMD) UTF-8 validation.
 *
 * Walks the buffer one encoded character at a time, with a fast path that
 * skips 16 bytes at once when they are all ASCII. A sequence is rejected when
 * it is truncated, has a malformed continuation byte, is an overlong encoding,
 * encodes a surrogate (U+D800..U+DFFF) or exceeds U+10FFFF.
 *
 * @param buf the bytes to validate.
 * @param len the number of bytes in buf.
 * @return true if and only if the whole buffer is valid UTF-8.
 */
WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  const uint8_t *data = (const uint8_t *)buf;
  uint64_t pos = 0;
  uint64_t next_pos = 0;
  uint32_t code_point = 0;
  while (pos < len) {
    // Fast path: check whether the next 16 bytes are all ASCII.
    next_pos = pos + 16;
    if (next_pos <= len) { // it is safe to read 16 more bytes
      // memcpy keeps the loads well-defined regardless of alignment.
      uint64_t v1;
      memcpy(&v1, data + pos, sizeof(uint64_t));
      uint64_t v2;
      memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
      uint64_t v{v1 | v2};
      // No byte has its high bit set: all 16 bytes are ASCII.
      if ((v & 0x8080808080808080) == 0) {
        pos = next_pos;
        continue;
      }
    }
    unsigned char byte = data[pos];
    if (byte < 0b10000000) {
      // Single ASCII byte.
      pos++;
      continue;
    } else if ((byte & 0b11100000) == 0b11000000) {
      // Two-byte sequence: 110xxxxx 10xxxxxx.
      next_pos = pos + 2;
      if (next_pos > len) { return false; }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
      // range check (rejects overlong encodings of U+0000..U+007F)
      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
      if (code_point < 0x80 || 0x7ff < code_point) { return false; }
    } else if ((byte & 0b11110000) == 0b11100000) {
      // Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx.
      next_pos = pos + 3;
      if (next_pos > len) { return false; }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
      // range check (rejects overlong encodings and surrogates)
      code_point = (byte & 0b00001111) << 12 |
                   (data[pos + 1] & 0b00111111) << 6 |
                   (data[pos + 2] & 0b00111111);
      if (code_point < 0x800 || 0xffff < code_point ||
          (0xd7ff < code_point && code_point < 0xe000)) {
        return false;
      }
    } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
      // Four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
      next_pos = pos + 4;
      if (next_pos > len) { return false; }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
      if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; }
      // range check: four bytes may only encode U+10000..U+10FFFF. The lower
      // bound is inclusive of 0xffff, otherwise the overlong form of U+FFFF
      // (F0 8F BF BF) would be accepted.
      code_point =
          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
      if (code_point <= 0xffff || 0x10ffff < code_point) { return false; }
    } else {
      // we may have a continuation
      return false;
    }
    pos = next_pos;
  }
  return true;
}
} // namespace fallback
} // namespace simdjson

View File

@ -22,6 +22,7 @@ public:
std::unique_ptr<internal::dom_parser_implementation>& dst
) const noexcept final;
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
// UTF-8 validation entry point for this implementation; returns true when the
// len bytes at buf are valid UTF-8. Declared final, so it cannot be overridden further.
WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final;
};
} // namespace fallback

View File

@ -0,0 +1,26 @@
namespace stage1 {
/**
* Validates that the string is actual UTF-8.
*/
template<class checker>
bool generic_validate_utf8(const uint8_t * input, size_t length) {
checker c{};
buf_block_reader<64> reader(input, length);
while (reader.has_full_block()) {
simd::simd8x64<uint8_t> in(reader.full_block());
c.check_next_input(in);
reader.advance();
}
uint8_t block[64]{};
reader.get_remainder(block);
simd::simd8x64<uint8_t> in(block);
c.check_next_input(in);
reader.advance();
return c.errors() == error_code::SUCCESS;
}
bool generic_validate_utf8(const char * input, size_t length) {
return generic_validate_utf8<utf8_checker>((const uint8_t *)input,length);
}
} // namespace stage1

View File

@ -93,7 +93,10 @@ WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, si
this->len = _len;
return haswell::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming);
}
#include "generic/stage1/utf8_validator.h"
// haswell UTF-8 validation: hands the buffer to the stage1 generic validator
// compiled inside the haswell namespace (presumably supplied by the
// generic/stage1/utf8_validator.h include just above) and returns its verdict.
WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  return simdjson::haswell::stage1::generic_validate_utf8(buf,len);
}
} // namespace haswell
} // namespace simdjson
UNTARGET_REGION

View File

@ -20,6 +20,7 @@ public:
std::unique_ptr<internal::dom_parser_implementation>& dst
) const noexcept final;
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
// UTF-8 validation entry point for this implementation; returns true when the
// len bytes at buf are valid UTF-8. Declared final, so it cannot be overridden further.
WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final;
};
} // namespace haswell

View File

@ -48,7 +48,9 @@ public:
// Resolves the implementation to use through set_best() and forwards the
// minify request, unchanged, to it.
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final {
  return set_best()->minify(buf, len, dst, dst_len);
}
// Resolves the implementation to use through set_best() and forwards the
// UTF-8 validation request, unchanged, to it.
WARN_UNUSED bool validate_utf8(const char * buf, size_t len) const noexcept final override {
  return set_best()->validate_utf8(buf, len);
}
// Registers this detector with the base class under the name
// "best_supported_detector"; the third base-constructor argument is 0
// (its meaning is defined by the base class, not visible here).
really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {}
private:
const implementation *set_best() const noexcept;
@ -83,10 +85,19 @@ public:
) const noexcept final {
return UNSUPPORTED_ARCHITECTURE;
}
WARN_UNUSED error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final {
// Minification is not available on an unsupported CPU: all arguments are
// ignored and UNSUPPORTED_ARCHITECTURE is always returned.
WARN_UNUSED error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final override {
  return UNSUPPORTED_ARCHITECTURE;
}
// Always refuses to validate: every input is reported as invalid UTF-8.
//
// Rationale: given that we have a fallback implementation, it seems unlikely
// that unsupported_implementation will ever be used. If it is used, then it
// will flag all strings as invalid. The alternative is to return an error_code
// from which the user has to figure out whether the string is valid UTF-8...
// which seems like a lot of work just to handle the very unlikely case that we
// have an unsupported implementation. And, when it does happen (that we have
// an unsupported implementation), what are the chances that the programmer
// has a fallback? Given that *we* provide the fallback, it implies that the
// programmer would need a fallback for our fallback.
WARN_UNUSED bool validate_utf8(const char *, size_t) const noexcept final override {
  return false;
}
// Registers this placeholder with the base class under the name "unsupported";
// the third base-constructor argument is 0 (its meaning is defined by the base
// class, not visible here).
unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {}
};
@ -137,6 +148,9 @@ SIMDJSON_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> active_imple
// Free-function entry point for minification: reinterprets the char buffers
// as uint8_t buffers and dispatches to the currently active implementation.
WARN_UNUSED error_code minify(const char *buf, size_t len, char *dst, size_t &dst_len) noexcept {
  return active_implementation->minify((const uint8_t *)buf, len, (uint8_t *)dst, dst_len);
}
// Free-function entry point for UTF-8 validation: dispatches to the currently
// active implementation and returns its verdict.
WARN_UNUSED bool validate_utf8(const char *buf, size_t len) noexcept {
  return active_implementation->validate_utf8(buf, len);
}
} // namespace simdjson

View File

@ -94,7 +94,10 @@ WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, si
this->len = _len;
return westmere::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming);
}
#include "generic/stage1/utf8_validator.h"
// westmere UTF-8 validation: hands the buffer to the stage1 generic validator
// compiled inside the westmere namespace (presumably supplied by the
// generic/stage1/utf8_validator.h include just above) and returns its verdict.
WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  return simdjson::westmere::stage1::generic_validate_utf8(buf,len);
}
} // namespace westmere
} // namespace simdjson
UNTARGET_REGION

View File

@ -19,6 +19,7 @@ public:
std::unique_ptr<internal::dom_parser_implementation>& dst
) const noexcept final;
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
// UTF-8 validation entry point for this implementation; returns true when the
// len bytes at buf are valid UTF-8. Declared final, so it cannot be overridden further.
WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final;
};
} // namespace westmere

View File

@ -58,6 +58,8 @@ add_cpp_test(jsoncheck LABELS acceptance per_implementation)
add_cpp_test(parse_many_test LABELS acceptance per_implementation)
add_cpp_test(pointercheck LABELS acceptance per_implementation)
add_cpp_test(extracting_values_example LABELS acceptance per_implementation)
add_cpp_test(unicode_tests LABELS acceptance per_implementation)
find_program(BASH bash)

View File

@ -1649,6 +1649,31 @@ namespace type_tests {
}
namespace validate_tests {
  // A plain ASCII JSON document must be accepted as valid UTF-8.
  bool test_validate() {
    std::cout << "Running " << __func__ << std::endl;
    const std::string json_text = R"({ "foo" : 1, "bar" : [ 1, 2, 3 ], "baz": { "a": 1, "b": 2, "c": 3 } })";
    return simdjson::validate_utf8(json_text.data(), json_text.size());
  }

  // Two continuation bytes with no leading byte must be rejected.
  bool test_bad_validate() {
    std::cout << "Running " << __func__ << std::endl;
    const std::string stray_continuations = "\x80\x81";
    return !simdjson::validate_utf8(stray_continuations.data(), stray_continuations.size());
  }

  // Runs the tests in order and stops at the first failure.
  bool run() {
    if (!test_validate()) { return false; }
    return test_bad_validate();
  }
}
namespace minify_tests {
@ -1960,7 +1985,8 @@ int main(int argc, char *argv[]) {
printf("unsupported CPU\n");
}
std::cout << "Running basic tests." << std::endl;
if (minify_tests::run() &&
if (validate_tests::run() &&
minify_tests::run() &&
parse_api_tests::run() &&
dom_api_tests::run() &&
type_tests::run() &&

View File

@ -265,6 +265,27 @@ void minify() {
}
}
// Compilable copy of the documentation's UTF-8 validation snippet: validates a
// C string whose length comes from strlen. The statements are kept identical
// to the documented example on purpose.
bool is_correct() {
  const char * some_string = "[ 1, 2, 3, 4] ";
  size_t length = strlen(some_string);
  bool is_ok = simdjson::validate_utf8(some_string, length);
  return is_ok;
}
// Exercises the std::string_view overload of validate_utf8 on a short ASCII
// JSON array; the view covers the text up to (not including) the terminating NUL.
bool is_correct_string_view() {
  const std::string_view text{"[ 1, 2, 3, 4] "};
  return simdjson::validate_utf8(text);
}
// Exercises the std::string overload of validate_utf8 on a short ASCII JSON array.
bool is_correct_string() {
  const std::string text{"[ 1, 2, 3, 4] "};
  return simdjson::validate_utf8(text);
}
int main() {
basics_dom_1();
basics_dom_2();

246
tests/unicode_tests.cpp Normal file
View File

@ -0,0 +1,246 @@
#include "simdjson.h"
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <random>
#include <vector>
/**
 * Produces random byte strings that are well-formed UTF-8.
 *
 * Each generated character is 1, 2, 3 or 4 bytes long; the relative
 * frequency of the four lengths is set by the weights given to the
 * constructor.
 */
class RandomUTF8 final {
public:
  RandomUTF8(std::random_device &rd, int prob_1byte, int prob_2bytes,
             int prob_3bytes, int prob_4bytes);
  std::vector<uint8_t> generate(size_t output_bytes);
  std::vector<uint8_t> generate(size_t output_bytes, long seed);

private:
  uint32_t generate(); // declared only; no definition accompanies these members
  std::mt19937 gen;
  std::discrete_distribution<> bytes_count; // yields 0..3: sequence length minus one
  std::uniform_int_distribution<int> val_7bit{0x00, 0x7f}; // 0b0xxxxxxx
  std::uniform_int_distribution<int> val_6bit{0x00, 0x3f}; // 0b10xxxxxx
  std::uniform_int_distribution<int> val_5bit{0x00, 0x1f}; // 0b110xxxxx
  std::uniform_int_distribution<int> val_4bit{0x00, 0x0f}; // 0b1110xxxx
  std::uniform_int_distribution<int> val_3bit{0x00, 0x07}; // 0b11110xxx
};

/**
 * Seeds the engine with one draw from rd and turns the four integer weights
 * into the distribution that selects the sequence length.
 */
RandomUTF8::RandomUTF8(std::random_device &rd, int prob_1byte, int prob_2bytes,
                       int prob_3bytes, int prob_4bytes)
    : gen(rd()),
      bytes_count({static_cast<double>(prob_1byte),
                   static_cast<double>(prob_2bytes),
                   static_cast<double>(prob_3bytes),
                   static_cast<double>(prob_4bytes)}) {}

/**
 * Appends random well-formed sequences until at least output_bytes bytes have
 * been produced, then appends one terminating zero byte. The result can
 * therefore be up to 4 bytes longer than output_bytes.
 *
 * Draws that would yield an ill-formed sequence (NUL, overlong forms,
 * surrogates, values above U+10FFFF) are discarded and redrawn.
 */
std::vector<uint8_t> RandomUTF8::generate(size_t output_bytes) {
  std::vector<uint8_t> bytes;
  bytes.reserve(output_bytes);
  // Every continuation byte has the form 0b10xxxxxx.
  auto continuation = [this]() -> uint8_t {
    return uint8_t(0x80 | val_6bit(gen));
  };
  while (bytes.size() < output_bytes) {
    const int sequence_kind = bytes_count(gen);
    if (sequence_kind == 0) { // 1 byte
      // though strictly speaking, a stream of nulls is UTF8, it tends to
      // break some code, so zero is never emitted here
      uint8_t ascii;
      do {
        ascii = uint8_t(val_7bit(gen));
      } while (ascii == 0);
      bytes.push_back(ascii);
    } else if (sequence_kind == 1) { // 2 bytes
      // Lead bytes 0xC0 and 0xC1 would be overlong encodings.
      uint8_t lead;
      do {
        lead = uint8_t(0xc0 | val_5bit(gen));
      } while (lead < 0xC2);
      bytes.push_back(lead);
      bytes.push_back(continuation());
    } else if (sequence_kind == 2) { // 3 bytes
      const uint8_t lead = uint8_t(0xe0 | val_4bit(gen));
      bytes.push_back(lead);
      // 0xE0 needs a second byte >= 0xA0 (no overlong form);
      // 0xED needs a second byte <= 0x9F (no surrogates).
      const uint8_t lowest = (lead == 0xE0) ? 0xA0 : 0x80;
      const uint8_t highest = (lead == 0xED) ? 0x9F : 0xBF;
      uint8_t second;
      do {
        second = continuation();
      } while (second < lowest || second > highest);
      bytes.push_back(second);
      bytes.push_back(continuation());
    } else if (sequence_kind == 3) { // 4 bytes
      // Lead bytes above 0xF4 cannot start a valid sequence.
      uint8_t lead;
      do {
        lead = uint8_t(0xf0 | val_3bit(gen));
      } while (lead > 0xF4);
      bytes.push_back(lead);
      // 0xF0 needs a second byte >= 0x90 (no overlong form);
      // 0xF4 needs a second byte <= 0x8F (nothing above U+10FFFF).
      const uint8_t lowest = (lead == 0xF0) ? 0x90 : 0x80;
      const uint8_t highest = (lead == 0xF4) ? 0x8F : 0xBF;
      uint8_t second;
      do {
        second = continuation();
      } while (second < lowest || second > highest);
      bytes.push_back(second);
      bytes.push_back(continuation());
      bytes.push_back(continuation());
    }
  }
  bytes.push_back(0); // EOS for scalar code
  return bytes;
}

/**
 * Same as generate(output_bytes), but first re-seeds the engine with the
 * given seed truncated to 32 bits.
 */
std::vector<uint8_t> RandomUTF8::generate(size_t output_bytes, long seed) {
  const uint32_t engine_seed = uint32_t(seed);
  gen.seed(engine_seed);
  return generate(output_bytes);
}
// credit: based on code from Google Fuchsia (Apache Licensed)
/**
 * Straightforward byte-by-byte UTF-8 validator used as the reference that the
 * library's validate_utf8 is compared against.
 *
 * A sequence is rejected when it is truncated, has a malformed continuation
 * byte, is an overlong encoding, encodes a surrogate (U+D800..U+DFFF) or
 * exceeds U+10FFFF.
 *
 * @param buf the bytes to validate.
 * @param len the number of bytes in buf.
 * @return true if and only if the whole buffer is valid UTF-8.
 */
WARN_UNUSED bool basic_validate_utf8(const char *buf, size_t len) noexcept {
  const uint8_t *data = (const uint8_t *)buf;
  uint64_t pos = 0;
  uint64_t next_pos = 0;
  uint32_t code_point = 0;
  while (pos < len) {
    unsigned char byte = data[pos];
    if (byte < 0b10000000) {
      // Single ASCII byte.
      pos++;
      continue;
    } else if ((byte & 0b11100000) == 0b11000000) {
      // Two-byte sequence: 110xxxxx 10xxxxxx.
      next_pos = pos + 2;
      if (next_pos > len) { return false; }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return false;
      }
      // range check (rejects overlong encodings of U+0000..U+007F)
      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
      if (code_point < 0x80 || 0x7ff < code_point) { return false; }
    } else if ((byte & 0b11110000) == 0b11100000) {
      // Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx.
      next_pos = pos + 3;
      if (next_pos > len) { return false; }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
      // range check (rejects overlong encodings and surrogates)
      code_point = (byte & 0b00001111) << 12 |
                   (data[pos + 1] & 0b00111111) << 6 |
                   (data[pos + 2] & 0b00111111);
      if (code_point < 0x800 || 0xffff < code_point ||
          (0xd7ff < code_point && code_point < 0xe000)) {
        return false;
      }
    } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
      // Four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
      next_pos = pos + 4;
      if (next_pos > len) { return false; }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
      if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; }
      // range check: four bytes may only encode U+10000..U+10FFFF. The lower
      // bound is inclusive of 0xffff, otherwise the overlong form of U+FFFF
      // (F0 8F BF BF) would be accepted.
      code_point =
          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
      if (code_point <= 0xffff || 0x10ffff < code_point) {
        return false;
      }
    } else {
      // we may have a continuation
      return false;
    }
    pos = next_pos;
  }
  return true;
}
/**
 * Randomized differential test of simdjson::validate_utf8.
 *
 * For each of 1000 rounds a random well-formed UTF-8 string is generated and
 * must be accepted. The string is then corrupted step by step, one bit flip
 * at a time, and after every flip the library's verdict must match that of
 * the reference validator basic_validate_utf8. Any disagreement prints "bug"
 * and aborts.
 */
void brute_force_tests() {
  printf("running brute-force UTF-8 tests... ");
  fflush(NULL);
  std::random_device rd{};
  RandomUTF8 gen_1_2_3_4(rd, 1, 1, 1, 1);
  size_t total = 1000;
  for (size_t i = 0; i < total; i++) {
    // generate() always appends a terminating zero, so UTF8 is never empty.
    auto UTF8 = gen_1_2_3_4.generate(rand() % 256);
    if (!simdjson::validate_utf8((const char *)UTF8.data(), UTF8.size())) {
      std::cerr << "bug" << std::endl;
      abort();
    }
    for (size_t flip = 0; flip < 1000; ++flip) {
      // Corrupt the string a little more on every iteration; it may or may
      // not remain valid UTF-8, and both validators must agree either way.
      const int bitflip{1 << (rand() % 8)};
      // XOR with a one-bit mask so that exactly one bit of the chosen byte
      // is flipped (plain assignment would overwrite the whole byte).
      uint8_t &victim = UTF8[rand() % UTF8.size()];
      victim = uint8_t(victim ^ bitflip);
      bool is_ok =
          simdjson::validate_utf8((const char *)UTF8.data(), UTF8.size());
      bool is_ok_basic =
          basic_validate_utf8((const char *)UTF8.data(), UTF8.size());
      if (is_ok != is_ok_basic) {
        std::cerr << "bug" << std::endl;
        abort();
      }
    }
  }
  printf("tests ok.\n");
}
/**
 * Hard-coded acceptance/rejection tests for simdjson::validate_utf8.
 *
 * Every entry of goodsequences must validate and every entry of badsequences
 * must be rejected; the first violation prints the offending index and
 * aborts. Sequence lengths are taken with strlen, so entries must not contain
 * embedded NUL bytes.
 */
void test() {
  printf("running hard-coded UTF-8 tests... ");
  fflush(NULL);
  // additional tests are from autobahn websocket testsuite
  // https://github.com/crossbario/autobahn-testsuite/tree/master/autobahntestsuite/autobahntestsuite/case
  const char *goodsequences[] = {"a",
                                 "\xc3\xb1",
                                 "\xe2\x82\xa1",
                                 "\xf0\x90\x8c\xbc",
                                 "\xc2\x80",         // 6.7.2
                                 "\xf0\x90\x80\x80", // 6.7.4
                                 "\xee\x80\x80",     // 6.11.2
                                 "\xef\xbb\xbf"};
  const char *badsequences[] = {
      "\xc3\x28",                                 // 0
      "\xa0\xa1",                                 // 1
      "\xe2\x28\xa1",                             // 2
      "\xe2\x82\x28",                             // 3
      "\xf0\x28\x8c\xbc",                         // 4
      "\xf0\x90\x28\xbc",                         // 5
      "\xf0\x28\x8c\x28",                         // 6
      "\xc0\x9f",                                 // 7
      "\xf5\xff\xff\xff",                         // 8
      "\xed\xa0\x81",                             // 9
      "\xf8\x90\x80\x80\x80",                     // 10
      "123456789012345\xed",                      // 11
      "123456789012345\xf1",                      // 12
      "123456789012345\xc2",                      // 13
      "\xC2\x7F",                                 // 14
      "\xce",                                     // 6.6.1
      "\xce\xba\xe1",                             // 6.6.3
      "\xce\xba\xe1\xbd",                         // 6.6.4
      "\xce\xba\xe1\xbd\xb9\xcf",                 // 6.6.6
      "\xce\xba\xe1\xbd\xb9\xcf\x83\xce",         // 6.6.8
      "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce", // 6.6.10
      "\xdf",                                     // 6.14.6
      "\xef\xbf",                                 // 6.14.7
      "\x80",
      "\x91\x85\x95\x9e",
      "\x6c\x02\x8e\x18"};
  // Derive the counts from the arrays so that adding or removing a sequence
  // can never leave the loop bounds out of date.
  const size_t good_count = sizeof(goodsequences) / sizeof(goodsequences[0]);
  const size_t bad_count = sizeof(badsequences) / sizeof(badsequences[0]);
  for (size_t i = 0; i < good_count; i++) {
    size_t len = strlen(goodsequences[i]);
    if (!simdjson::validate_utf8(goodsequences[i], len)) {
      printf("bug goodsequences[%zu]\n", i);
      abort();
    }
  }
  for (size_t i = 0; i < bad_count; i++) {
    size_t len = strlen(badsequences[i]);
    if (simdjson::validate_utf8(badsequences[i], len)) {
      printf("bug lookup2 badsequences[%zu]\n", i);
      abort();
    }
  }
  printf("tests ok.\n");
}
// Runs the randomized tests, then the hard-coded ones. Both abort the process
// on failure, so reaching the return means every check passed.
int main() {
  brute_force_tests();
  test();
  return EXIT_SUCCESS;
}