Merge pull request #959 from simdjson/dlemire/utf8_val
Expose the UTF-8 string validation functions
This commit is contained in:
commit
0062e54e93
|
@ -188,6 +188,23 @@ In some cases, you may have valid JSON strings that you do not wish to parse but
|
|||
|
||||
Though it does not validate the JSON input, it will detect when the document ends with an unterminated string. E.g., it would refuse to minify the string `"this string is not terminated` because of the missing final quote.
|
||||
|
||||
|
||||
UTF-8 validation (alone)
|
||||
----------------------
|
||||
|
||||
The simdjson library has fast functions to validate UTF-8 strings. They are many times faster than most functions commonly found in libraries. You can use our fast functions, even if you do not care about JSON.
|
||||
|
||||
```C++
|
||||
const char * some_string = "[ 1, 2, 3, 4] ";
|
||||
size_t length = strlen(some_string);
|
||||
bool is_ok = simdjson::validate_utf8(some_string, length);
|
||||
```
|
||||
|
||||
The UTF-8 validation function merely checks that the input is valid UTF-8: it works with strings in general, not just JSON strings.
|
||||
|
||||
Your input string does not need any padding. Any string will do. The `validate_utf8` function does not do any memory allocation on the heap, and it does not throw exceptions.
|
||||
|
||||
|
||||
C++17 Support
|
||||
-------------
|
||||
|
||||
|
|
|
@ -10,6 +10,36 @@
|
|||
|
||||
namespace simdjson {
|
||||
|
||||
/**
|
||||
* Validate the UTF-8 string.
|
||||
*
|
||||
* @param buf the string to validate.
|
||||
* @param len the length of the string in bytes.
|
||||
* @return true if the string is valid UTF-8.
|
||||
*/
|
||||
WARN_UNUSED bool validate_utf8(const char * buf, size_t len) noexcept;
|
||||
|
||||
|
||||
/**
|
||||
* Validate the UTF-8 string.
|
||||
*
|
||||
* @param sv the string_view to validate.
|
||||
* @return true if the string is valid UTF-8.
|
||||
*/
|
||||
really_inline WARN_UNUSED bool validate_utf8(const std::string_view sv) noexcept {
|
||||
return validate_utf8(sv.data(), sv.size());
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate the UTF-8 string.
|
||||
*
|
||||
* @param s the string to validate.
|
||||
* @return true if the string is valid UTF-8.
|
||||
*/
|
||||
really_inline WARN_UNUSED bool validate_utf8(const std::string& s) noexcept {
|
||||
return validate_utf8(s.data(), s.size());
|
||||
}
|
||||
|
||||
namespace dom {
|
||||
class document;
|
||||
} // namespace dom
|
||||
|
@ -83,6 +113,18 @@ public:
|
|||
* @return the error code, or SUCCESS if there was no error.
|
||||
*/
|
||||
WARN_UNUSED virtual error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept = 0;
|
||||
|
||||
|
||||
/**
|
||||
* Validate the UTF-8 string.
|
||||
*
|
||||
* Overridden by each implementation.
|
||||
*
|
||||
* @param buf the string to validate.
|
||||
* @param len the length of the string in bytes.
|
||||
* @return true if and only if the string is valid UTF-8.
|
||||
*/
|
||||
WARN_UNUSED virtual bool validate_utf8(const char *buf, size_t len) const noexcept = 0;
|
||||
|
||||
protected:
|
||||
/** @private Construct an implementation with the given name and description. For subclasses. */
|
||||
|
|
|
@ -7,5 +7,4 @@
|
|||
|
||||
#include "simdjson/compiler_check.h"
|
||||
#include "simdjson/error.h"
|
||||
|
||||
#endif // SIMDJSON_H
|
||||
|
|
|
@ -105,7 +105,10 @@ WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, si
|
|||
this->len = _len;
|
||||
return arm64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming);
|
||||
}
|
||||
|
||||
#include "generic/stage1/utf8_validator.h"
|
||||
// arm64 implementation of validate_utf8: forwards buf/len unchanged to the
// arm64 stage1 generic_validate_utf8 and returns its result.
WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
|
||||
return simdjson::arm64::stage1::generic_validate_utf8(buf,len);
|
||||
}
|
||||
} // namespace arm64
|
||||
} // namespace simdjson
|
||||
|
||||
|
|
|
@ -18,6 +18,7 @@ public:
|
|||
std::unique_ptr<internal::dom_parser_implementation>& dst
|
||||
) const noexcept final;
|
||||
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
|
||||
WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
||||
};
|
||||
|
||||
} // namespace arm64
|
||||
|
|
|
@ -244,6 +244,70 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
|
|||
return SUCCESS;
|
||||
}
|
||||
|
||||
// credit: based on code from Google Fuchsia (Apache Licensed)
|
||||
WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
|
||||
const uint8_t *data = (const uint8_t *)buf;
|
||||
uint64_t pos = 0;
|
||||
uint64_t next_pos = 0;
|
||||
uint32_t code_point = 0;
|
||||
while (pos < len) {
|
||||
// check of the next 8 bytes are ascii.
|
||||
next_pos = pos + 16;
|
||||
if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii
|
||||
uint64_t v1;
|
||||
memcpy(&v1, data + pos, sizeof(uint64_t));
|
||||
uint64_t v2;
|
||||
memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
|
||||
uint64_t v{v1 | v2};
|
||||
if ((v & 0x8080808080808080) == 0) {
|
||||
pos = next_pos;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
unsigned char byte = data[pos];
|
||||
if (byte < 0b10000000) {
|
||||
pos++;
|
||||
continue;
|
||||
} else if ((byte & 0b11100000) == 0b11000000) {
|
||||
next_pos = pos + 2;
|
||||
if (next_pos > len) { return false; }
|
||||
if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
|
||||
// range check
|
||||
code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
|
||||
if (code_point < 0x80 || 0x7ff < code_point) { return false; }
|
||||
} else if ((byte & 0b11110000) == 0b11100000) {
|
||||
next_pos = pos + 3;
|
||||
if (next_pos > len) { return false; }
|
||||
if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
|
||||
if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
|
||||
// range check
|
||||
code_point = (byte & 0b00001111) << 12 |
|
||||
(data[pos + 1] & 0b00111111) << 6 |
|
||||
(data[pos + 2] & 0b00111111);
|
||||
if (code_point < 0x800 || 0xffff < code_point ||
|
||||
(0xd7ff < code_point && code_point < 0xe000)) {
|
||||
return false;
|
||||
}
|
||||
} else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
|
||||
next_pos = pos + 4;
|
||||
if (next_pos > len) { return false; }
|
||||
if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
|
||||
if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
|
||||
if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; }
|
||||
// range check
|
||||
code_point =
|
||||
(byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
|
||||
(data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
|
||||
if (code_point < 0xffff || 0x10ffff < code_point) { return false; }
|
||||
} else {
|
||||
// we may have a continuation
|
||||
return false;
|
||||
}
|
||||
pos = next_pos;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace fallback
|
||||
} // namespace simdjson
|
||||
|
||||
|
|
|
@ -22,6 +22,7 @@ public:
|
|||
std::unique_ptr<internal::dom_parser_implementation>& dst
|
||||
) const noexcept final;
|
||||
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
|
||||
WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
||||
};
|
||||
|
||||
} // namespace fallback
|
||||
|
|
|
@ -0,0 +1,26 @@
|
|||
namespace stage1 {
|
||||
/**
|
||||
* Validates that the string is actual UTF-8.
|
||||
*/
|
||||
template<class checker>
|
||||
bool generic_validate_utf8(const uint8_t * input, size_t length) {
|
||||
// A single checker instance is reused for every block of the input.
checker c{};
|
||||
buf_block_reader<64> reader(input, length);
|
||||
// Feed each complete 64-byte block to the checker.
while (reader.has_full_block()) {
|
||||
simd::simd8x64<uint8_t> in(reader.full_block());
|
||||
c.check_next_input(in);
|
||||
reader.advance();
|
||||
}
|
||||
// Tail: run the checker once more on a local 64-byte buffer filled by get_remainder().
// NOTE(review): presumably this holds the leftover (< 64) input bytes plus padding -- confirm in buf_block_reader.
uint8_t block[64]{};
|
||||
reader.get_remainder(block);
|
||||
simd::simd8x64<uint8_t> in(block);
|
||||
c.check_next_input(in);
|
||||
reader.advance();
|
||||
// The input is reported valid only when the checker's error state is SUCCESS.
return c.errors() == error_code::SUCCESS;
|
||||
}
|
||||
|
||||
// Convenience overload for char buffers: reinterprets the bytes as uint8_t and
// runs the templated validator with utf8_checker as the checker type.
bool generic_validate_utf8(const char * input, size_t length) {
|
||||
return generic_validate_utf8<utf8_checker>((const uint8_t *)input,length);
|
||||
}
|
||||
|
||||
} // namespace stage1
|
|
@ -93,7 +93,10 @@ WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, si
|
|||
this->len = _len;
|
||||
return haswell::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming);
|
||||
}
|
||||
|
||||
#include "generic/stage1/utf8_validator.h"
|
||||
// haswell implementation of validate_utf8: forwards buf/len unchanged to the
// haswell stage1 generic_validate_utf8 and returns its result.
WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
|
||||
return simdjson::haswell::stage1::generic_validate_utf8(buf,len);
|
||||
}
|
||||
} // namespace haswell
|
||||
} // namespace simdjson
|
||||
UNTARGET_REGION
|
||||
|
|
|
@ -20,6 +20,7 @@ public:
|
|||
std::unique_ptr<internal::dom_parser_implementation>& dst
|
||||
) const noexcept final;
|
||||
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
|
||||
WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
||||
};
|
||||
|
||||
} // namespace haswell
|
||||
|
|
|
@ -48,7 +48,9 @@ public:
|
|||
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final {
|
||||
return set_best()->minify(buf, len, dst, dst_len);
|
||||
}
|
||||
|
||||
WARN_UNUSED bool validate_utf8(const char * buf, size_t len) const noexcept final override {
|
||||
return set_best()->validate_utf8(buf, len);
|
||||
}
|
||||
really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {}
|
||||
private:
|
||||
const implementation *set_best() const noexcept;
|
||||
|
@ -83,10 +85,19 @@ public:
|
|||
) const noexcept final {
|
||||
return UNSUPPORTED_ARCHITECTURE;
|
||||
}
|
||||
WARN_UNUSED error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final {
|
||||
WARN_UNUSED error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final override {
|
||||
return UNSUPPORTED_ARCHITECTURE;
|
||||
}
|
||||
|
||||
WARN_UNUSED bool validate_utf8(const char *, size_t) const noexcept final override {
|
||||
return false; // Just refuse to validate. Given that we have a fallback implementation
|
||||
// it seems unlikely that unsupported_implementation will ever be used. If it is used,
|
||||
// then it will flag all strings as invalid. The alternative is to return an error_code
|
||||
// from which the user has to figure out whether the string is valid UTF-8... which seems
|
||||
// like a lot of work just to handle the very unlikely case that we have an unsupported
|
||||
// implementation. And, when it does happen (that we have an unsupported implementation),
|
||||
// what are the chances that the programmer has a fallback? Given that *we* provide the
|
||||
// fallback, it implies that the programmer would need a fallback for our fallback.
|
||||
}
|
||||
unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {}
|
||||
};
|
||||
|
||||
|
@ -137,6 +148,9 @@ SIMDJSON_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> active_imple
|
|||
WARN_UNUSED error_code minify(const char *buf, size_t len, char *dst, size_t &dst_len) noexcept {
|
||||
return active_implementation->minify((const uint8_t *)buf, len, (uint8_t *)dst, dst_len);
|
||||
}
|
||||
// Public entry point: dispatches the request to whichever implementation
// active_implementation currently points to and returns its answer.
WARN_UNUSED bool validate_utf8(const char *buf, size_t len) noexcept {
|
||||
return active_implementation->validate_utf8(buf, len);
|
||||
}
|
||||
|
||||
|
||||
} // namespace simdjson
|
||||
|
|
|
@ -94,7 +94,10 @@ WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, si
|
|||
this->len = _len;
|
||||
return westmere::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming);
|
||||
}
|
||||
|
||||
#include "generic/stage1/utf8_validator.h"
|
||||
// westmere implementation of validate_utf8: forwards buf/len unchanged to the
// westmere stage1 generic_validate_utf8 and returns its result.
WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
|
||||
return simdjson::westmere::stage1::generic_validate_utf8(buf,len);
|
||||
}
|
||||
} // namespace westmere
|
||||
} // namespace simdjson
|
||||
UNTARGET_REGION
|
||||
|
|
|
@ -19,6 +19,7 @@ public:
|
|||
std::unique_ptr<internal::dom_parser_implementation>& dst
|
||||
) const noexcept final;
|
||||
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
|
||||
WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
||||
};
|
||||
|
||||
} // namespace westmere
|
||||
|
|
|
@ -58,6 +58,8 @@ add_cpp_test(jsoncheck LABELS acceptance per_implementation)
|
|||
add_cpp_test(parse_many_test LABELS acceptance per_implementation)
|
||||
add_cpp_test(pointercheck LABELS acceptance per_implementation)
|
||||
add_cpp_test(extracting_values_example LABELS acceptance per_implementation)
|
||||
add_cpp_test(unicode_tests LABELS acceptance per_implementation)
|
||||
|
||||
find_program(BASH bash)
|
||||
|
||||
|
||||
|
|
|
@ -1649,6 +1649,31 @@ namespace type_tests {
|
|||
}
|
||||
|
||||
|
||||
namespace validate_tests {
|
||||
bool test_validate() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
const std::string test = R"({ "foo" : 1, "bar" : [ 1, 2, 3 ], "baz": { "a": 1, "b": 2, "c": 3 } })";
|
||||
if(!simdjson::validate_utf8(test.data(), test.size())) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool test_bad_validate() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
const std::string test = "\x80\x81";
|
||||
if(simdjson::validate_utf8(test.data(), test.size())) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
bool run() {
|
||||
return test_validate() &&
|
||||
test_bad_validate();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
namespace minify_tests {
|
||||
|
||||
|
@ -1960,7 +1985,8 @@ int main(int argc, char *argv[]) {
|
|||
printf("unsupported CPU\n");
|
||||
}
|
||||
std::cout << "Running basic tests." << std::endl;
|
||||
if (minify_tests::run() &&
|
||||
if (validate_tests::run() &&
|
||||
minify_tests::run() &&
|
||||
parse_api_tests::run() &&
|
||||
dom_api_tests::run() &&
|
||||
type_tests::run() &&
|
||||
|
|
|
@ -265,6 +265,27 @@ void minify() {
|
|||
}
|
||||
}
|
||||
|
||||
bool is_correct() {
|
||||
const char * some_string = "[ 1, 2, 3, 4] ";
|
||||
size_t length = strlen(some_string);
|
||||
bool is_ok = simdjson::validate_utf8(some_string, length);
|
||||
return is_ok;
|
||||
}
|
||||
|
||||
bool is_correct_string_view() {
|
||||
const char * some_string = "[ 1, 2, 3, 4] ";
|
||||
size_t length = strlen(some_string);
|
||||
std::string_view v(some_string, length);
|
||||
bool is_ok = simdjson::validate_utf8(v);
|
||||
return is_ok;
|
||||
}
|
||||
|
||||
bool is_correct_string() {
|
||||
const std::string some_string = "[ 1, 2, 3, 4] ";
|
||||
bool is_ok = simdjson::validate_utf8(some_string);
|
||||
return is_ok;
|
||||
}
|
||||
|
||||
int main() {
|
||||
basics_dom_1();
|
||||
basics_dom_2();
|
||||
|
|
|
@ -0,0 +1,246 @@
|
|||
#include "simdjson.h"
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <random>
|
||||
|
||||
// Test helper that produces random byte strings built from 1-, 2-, 3- and
// 4-byte UTF-8 sequences, chosen according to caller-supplied weights.
class RandomUTF8 final {
|
||||
public:
|
||||
// rd seeds the generator; the four ints are the relative weights of each sequence length.
RandomUTF8(std::random_device &rd, int prob_1byte, int prob_2bytes,
|
||||
int prob_3bytes, int prob_4bytes);
|
||||
|
||||
// Generates at least output_bytes bytes of content followed by one 0 byte.
std::vector<uint8_t> generate(size_t output_bytes);
|
||||
// Same as above, after re-seeding the generator with seed.
std::vector<uint8_t> generate(size_t output_bytes, long seed);
|
||||
|
||||
private:
|
||||
// NOTE(review): declared but no definition is visible in this file -- confirm whether it is still needed.
uint32_t generate();
|
||||
|
||||
std::mt19937 gen;
|
||||
// Picks how many bytes the next sequence has (index 0..3 => 1..4 bytes).
std::discrete_distribution<> bytes_count;
|
||||
// Payload-bit distributions, one per byte pattern shown on the right.
std::uniform_int_distribution<int> val_7bit{0x00, 0x7f}; // 0b0xxxxxxx
|
||||
std::uniform_int_distribution<int> val_6bit{0x00, 0x3f}; // 0b10xxxxxx
|
||||
std::uniform_int_distribution<int> val_5bit{0x00, 0x1f}; // 0b110xxxxx
|
||||
std::uniform_int_distribution<int> val_4bit{0x00, 0x0f}; // 0b1110xxxx
|
||||
std::uniform_int_distribution<int> val_3bit{0x00, 0x07}; // 0b11110xxx
|
||||
};
|
||||
|
||||
// Seeds the Mersenne Twister with one draw from rd and uses the four
// arguments, converted to double, as the weights of bytes_count.
RandomUTF8::RandomUTF8(std::random_device &rd, int prob_1byte, int prob_2bytes,
|
||||
int prob_3bytes, int prob_4bytes)
|
||||
: gen(rd()), bytes_count({double(prob_1byte), double(prob_2bytes),
|
||||
double(prob_3bytes), double(prob_4bytes)}) {}
|
||||
|
||||
std::vector<uint8_t> RandomUTF8::generate(size_t output_bytes) {
|
||||
std::vector<uint8_t> result;
|
||||
result.reserve(output_bytes);
|
||||
uint8_t candidate, head;
|
||||
while (result.size() < output_bytes) {
|
||||
switch (bytes_count(gen)) {
|
||||
case 0: // 1 byte
|
||||
candidate = uint8_t(val_7bit(gen));
|
||||
while (candidate == 0) { // though strictly speaking, a stream of nulls is
|
||||
// UTF8, it tends to break some code
|
||||
candidate = uint8_t(val_7bit(gen));
|
||||
}
|
||||
result.push_back(candidate);
|
||||
break;
|
||||
case 1: // 2 bytes
|
||||
candidate = 0xc0 | uint8_t(val_5bit(gen));
|
||||
while (candidate < 0xC2) {
|
||||
candidate = 0xc0 | uint8_t(val_5bit(gen));
|
||||
}
|
||||
result.push_back(candidate);
|
||||
result.push_back(0x80 | uint8_t(val_6bit(gen)));
|
||||
break;
|
||||
case 2: // 3 bytes
|
||||
head = 0xe0 | uint8_t(val_4bit(gen));
|
||||
result.push_back(head);
|
||||
candidate = 0x80 | uint8_t(val_6bit(gen));
|
||||
if (head == 0xE0) {
|
||||
while (candidate < 0xA0) {
|
||||
candidate = 0x80 | uint8_t(val_6bit(gen));
|
||||
}
|
||||
} else if (head == 0xED) {
|
||||
while (candidate > 0x9F) {
|
||||
candidate = 0x80 | uint8_t(val_6bit(gen));
|
||||
}
|
||||
}
|
||||
result.push_back(candidate);
|
||||
result.push_back(0x80 | uint8_t(val_6bit(gen)));
|
||||
break;
|
||||
case 3: // 4 bytes
|
||||
head = 0xf0 | uint8_t(val_3bit(gen));
|
||||
while (head > 0xF4) {
|
||||
head = 0xf0 | uint8_t(val_3bit(gen));
|
||||
}
|
||||
result.push_back(head);
|
||||
candidate = 0x80 | uint8_t(val_6bit(gen));
|
||||
if (head == 0xF0) {
|
||||
while (candidate < 0x90) {
|
||||
candidate = 0x80 | uint8_t(val_6bit(gen));
|
||||
}
|
||||
} else if (head == 0xF4) {
|
||||
while (candidate > 0x8F) {
|
||||
candidate = 0x80 | uint8_t(val_6bit(gen));
|
||||
}
|
||||
}
|
||||
result.push_back(candidate);
|
||||
result.push_back(0x80 | uint8_t(val_6bit(gen)));
|
||||
result.push_back(0x80 | uint8_t(val_6bit(gen)));
|
||||
break;
|
||||
}
|
||||
}
|
||||
result.push_back(0); // EOS for scalar code
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Re-seeds the generator with seed (truncated to 32 bits) and then generates
// a string exactly as generate(output_bytes) does.
std::vector<uint8_t> RandomUTF8::generate(size_t output_bytes, long seed) {
|
||||
gen.seed(uint32_t(seed));
|
||||
return generate(output_bytes);
|
||||
}
|
||||
|
||||
// credit: based on code from Google Fuchsia (Apache Licensed)
/**
 * Straightforward byte-by-byte UTF-8 validator used as the reference that the
 * library's validate_utf8 is compared against in the brute-force test.
 *
 * Rejects truncated sequences, stray continuation bytes, overlong encodings,
 * surrogates (U+D800..U+DFFF) and code points above U+10FFFF.
 *
 * @param buf the bytes to validate.
 * @param len the number of bytes in buf.
 * @return true if and only if buf[0..len) is valid UTF-8.
 */
WARN_UNUSED bool basic_validate_utf8(const char *buf, size_t len) noexcept {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
  uint64_t pos = 0;
  uint64_t next_pos = 0;
  uint32_t code_point = 0;
  while (pos < len) {
    unsigned char byte = data[pos];
    if (byte < 0b10000000) {
      // Single ASCII byte.
      pos++;
      continue;
    } else if ((byte & 0b11100000) == 0b11000000) {
      // Two-byte sequence: 110xxxxx 10xxxxxx
      next_pos = pos + 2;
      if (next_pos > len) { return false; }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return false;
      }
      // range check (rejects overlong encodings of U+0000..U+007F)
      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
      if (code_point < 0x80 || 0x7ff < code_point) { return false; }
    } else if ((byte & 0b11110000) == 0b11100000) {
      // Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
      next_pos = pos + 3;
      if (next_pos > len) { return false; }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
      // range check (rejects overlong encodings and UTF-16 surrogates)
      code_point = (byte & 0b00001111) << 12 |
                   (data[pos + 1] & 0b00111111) << 6 |
                   (data[pos + 2] & 0b00111111);
      if (code_point < 0x800 || 0xffff < code_point ||
          (0xd7ff < code_point && code_point < 0xe000)) {
        return false;
      }
    } else if ((byte & 0b11111000) == 0b11110000) {
      // Four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      next_pos = pos + 4;
      if (next_pos > len) { return false; }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
      if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; }
      // range check: a four-byte sequence must encode U+10000..U+10FFFF.
      // Anything at or below U+FFFF (including U+FFFF itself) is overlong.
      code_point =
          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
      if (code_point <= 0xffff || 0x10ffff < code_point) {
        return false;
      }
    } else {
      // stray continuation byte or an invalid leading byte (0xF8..0xFF)
      return false;
    }
    pos = next_pos;
  }
  return true;
}
|
||||
|
||||
void brute_force_tests() {
|
||||
printf("running brute-force UTF-8 tests... ");
|
||||
fflush(NULL);
|
||||
std::random_device rd{};
|
||||
RandomUTF8 gen_1_2_3_4(rd, 1, 1, 1, 1);
|
||||
size_t total = 1000;
|
||||
for (size_t i = 0; i < total; i++) {
|
||||
|
||||
auto UTF8 = gen_1_2_3_4.generate(rand() % 256);
|
||||
if (!simdjson::validate_utf8((const char *)UTF8.data(), UTF8.size())) {
|
||||
std::cerr << "bug" << std::endl;
|
||||
abort();
|
||||
}
|
||||
for (size_t flip = 0; flip < 1000; ++flip) {
|
||||
// we are going to hack the string as long as it is UTF-8
|
||||
const int bitflip{1 << (rand() % 8)};
|
||||
UTF8[rand() % UTF8.size()] = uint8_t(bitflip); // we flip exactly one bit
|
||||
bool is_ok =
|
||||
simdjson::validate_utf8((const char *)UTF8.data(), UTF8.size());
|
||||
bool is_ok_basic =
|
||||
basic_validate_utf8((const char *)UTF8.data(), UTF8.size());
|
||||
if (is_ok != is_ok_basic) {
|
||||
std::cerr << "bug" << std::endl;
|
||||
abort();
|
||||
}
|
||||
}
|
||||
}
|
||||
printf("tests ok.\n");
|
||||
}
|
||||
|
||||
void test() {
|
||||
printf("running hard-coded UTF-8 tests... ");
|
||||
fflush(NULL);
|
||||
// additional tests are from autobahn websocket testsuite
|
||||
// https://github.com/crossbario/autobahn-testsuite/tree/master/autobahntestsuite/autobahntestsuite/case
|
||||
const char *goodsequences[] = {"a",
|
||||
"\xc3\xb1",
|
||||
"\xe2\x82\xa1",
|
||||
"\xf0\x90\x8c\xbc",
|
||||
"\xc2\x80", // 6.7.2
|
||||
"\xf0\x90\x80\x80", // 6.7.4
|
||||
"\xee\x80\x80", // 6.11.2
|
||||
"\xef\xbb\xbf"};
|
||||
const char *badsequences[] = {
|
||||
"\xc3\x28", // 0
|
||||
"\xa0\xa1", // 1
|
||||
"\xe2\x28\xa1", // 2
|
||||
"\xe2\x82\x28", // 3
|
||||
"\xf0\x28\x8c\xbc", // 4
|
||||
"\xf0\x90\x28\xbc", // 5
|
||||
"\xf0\x28\x8c\x28", // 6
|
||||
"\xc0\x9f", // 7
|
||||
"\xf5\xff\xff\xff", // 8
|
||||
"\xed\xa0\x81", // 9
|
||||
"\xf8\x90\x80\x80\x80", // 10
|
||||
"123456789012345\xed", // 11
|
||||
"123456789012345\xf1", // 12
|
||||
"123456789012345\xc2", // 13
|
||||
"\xC2\x7F", // 14
|
||||
"\xce", // 6.6.1
|
||||
"\xce\xba\xe1", // 6.6.3
|
||||
"\xce\xba\xe1\xbd", // 6.6.4
|
||||
"\xce\xba\xe1\xbd\xb9\xcf", // 6.6.6
|
||||
"\xce\xba\xe1\xbd\xb9\xcf\x83\xce", // 6.6.8
|
||||
"\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce", // 6.6.10
|
||||
"\xdf", // 6.14.6
|
||||
"\xef\xbf", // 6.14.7
|
||||
"\x80",
|
||||
"\x91\x85\x95\x9e",
|
||||
"\x6c\x02\x8e\x18"};
|
||||
for (size_t i = 0; i < 8; i++) {
|
||||
size_t len = strlen(goodsequences[i]);
|
||||
if (!simdjson::validate_utf8(goodsequences[i], len)) {
|
||||
printf("bug goodsequences[%zu]\n", i);
|
||||
abort();
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i < 26; i++) {
|
||||
size_t len = strlen(badsequences[i]);
|
||||
if (simdjson::validate_utf8(badsequences[i], len)) {
|
||||
printf("bug lookup2 badsequences[%zu]\n", i);
|
||||
abort();
|
||||
}
|
||||
}
|
||||
printf("tests ok.\n");
|
||||
}
|
||||
// Runs the randomized cross-check first, then the hard-coded sequences.
// Both helpers abort() on failure, so reaching the return means success.
int main() {
|
||||
brute_force_tests();
|
||||
test();
|
||||
return EXIT_SUCCESS;
|
||||
}
|
Loading…
Reference in New Issue