Tweaking.

This commit is contained in:
Daniel Lemire 2020-06-21 17:39:24 -04:00
parent 5dc07ed295
commit f03a6ab5a4
7 changed files with 97 additions and 7 deletions

View File

@ -94,7 +94,7 @@ public:
* @param len the length of the string in bytes.
* @return true if and only if the string is valid UTF-8.
*/
WARN_UNUSED virtual bool utf8_validate(const char *buf, size_t len) const noexcept = 0;
WARN_UNUSED virtual bool validate_utf8(const char *buf, size_t len) const noexcept = 0;
protected:
/** @private Construct an implementation with the given name and description. For subclasses. */

View File

@ -243,6 +243,93 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
// for fear of aliasing
return SUCCESS;
}
/**
 * Scalar UTF-8 validator.
 *
 * Walks the buffer one encoded character at a time and rejects:
 *  - truncated sequences (a lead byte without enough bytes left),
 *  - bytes that are not valid continuation bytes (10xxxxxx) where one is required,
 *  - stray continuation bytes and lead bytes of the form 11111xxx,
 *  - overlong encodings (a code point stored in more bytes than necessary),
 *  - UTF-16 surrogates (U+D800..U+DFFF) and code points above U+10FFFF.
 *
 * @param buf the string to validate; only read when len > 0.
 * @param len the length of the string in bytes.
 * @return true if and only if the string is valid UTF-8 (an empty string is valid).
 */
WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
  size_t pos = 0;
  while (pos < len) {
    // Fast path: if at least 16 bytes remain, check all of them for ASCII at once.
    // `len - pos` cannot underflow because pos < len here.
    if (len - pos >= 16) {
      uint64_t v1;
      memcpy(&v1, data + pos, sizeof(uint64_t));
      uint64_t v2;
      memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
      // A byte is ASCII exactly when its most significant bit is clear.
      if (((v1 | v2) & 0x8080808080808080) == 0) {
        pos += 16;
        continue;
      }
    }

    const uint8_t byte = data[pos];
    if (byte < 0b10000000) {
      // Single ASCII byte.
      pos++;
      continue;
    }

    const size_t remaining = len - pos;
    if ((byte & 0b11100000) == 0b11000000) {
      // Two-byte sequence: 110xxxxx 10xxxxxx
      if (remaining < 2) {
        return false;
      }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return false;
      }
      const uint32_t code_point =
          (uint32_t(byte & 0b00011111) << 6) | uint32_t(data[pos + 1] & 0b00111111);
      // Overlong check. The 11 payload bits can never exceed 0x7ff, so only
      // the lower bound needs testing.
      if (code_point < 0x80) {
        return false;
      }
      pos += 2;
    } else if ((byte & 0b11110000) == 0b11100000) {
      // Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
      if (remaining < 3) {
        return false;
      }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return false;
      }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
        return false;
      }
      const uint32_t code_point = (uint32_t(byte & 0b00001111) << 12) |
                                  (uint32_t(data[pos + 1] & 0b00111111) << 6) |
                                  uint32_t(data[pos + 2] & 0b00111111);
      // Reject overlong encodings and the surrogate range U+D800..U+DFFF.
      // The 16 payload bits can never exceed 0xffff.
      if (code_point < 0x800 || (0xd7ff < code_point && code_point < 0xe000)) {
        return false;
      }
      pos += 3;
    } else if ((byte & 0b11111000) == 0b11110000) {
      // Four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      if (remaining < 4) {
        return false;
      }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return false;
      }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
        return false;
      }
      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
        return false;
      }
      const uint32_t code_point = (uint32_t(byte & 0b00000111) << 18) |
                                  (uint32_t(data[pos + 1] & 0b00111111) << 12) |
                                  (uint32_t(data[pos + 2] & 0b00111111) << 6) |
                                  uint32_t(data[pos + 3] & 0b00111111);
      // Four bytes are only legal for U+10000..U+10FFFF. The lower bound is
      // inclusive of 0xffff: U+FFFF fits in three bytes, so F0 8F BF BF is an
      // overlong encoding and must be rejected.
      if (code_point <= 0xffff || 0x10ffff < code_point) {
        return false;
      }
      pos += 4;
    } else {
      // Either a continuation byte with no lead byte, or an invalid lead
      // byte of the form 11111xxx.
      return false;
    }
  }
  return true;
}
} // namespace fallback
} // namespace simdjson

View File

@ -22,7 +22,7 @@ public:
std::unique_ptr<internal::dom_parser_implementation>& dst
) const noexcept final;
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
WARN_UNUSED bool utf8_validate(const char *buf, size_t len) const noexcept final;
WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final;
};
} // namespace fallback

View File

@ -94,6 +94,9 @@ WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, si
return haswell::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming);
}
#include "generic/stage1/utf8_validator.h"
// UTF-8 validation entry point for this implementation: hands the buffer
// unchanged to simdjson::haswell::stage1::utf8_validate and reports its verdict.
WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  const bool is_valid_utf8 = simdjson::haswell::stage1::utf8_validate(buf, len);
  return is_valid_utf8;
}
} // namespace haswell
} // namespace simdjson
UNTARGET_REGION

View File

@ -20,7 +20,7 @@ public:
std::unique_ptr<internal::dom_parser_implementation>& dst
) const noexcept final;
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
WARN_UNUSED bool utf8_validate(const char *buf, size_t len) const noexcept final;
WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final;
};
} // namespace haswell

View File

@ -48,8 +48,8 @@ public:
// Forwards the minify request, with all arguments unchanged, to the
// implementation object returned by set_best().
// NOTE(review): set_best() is defined outside this hunk; presumably it selects
// the best supported implementation — confirm.
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final {
return set_best()->minify(buf, len, dst, dst_len);
}
WARN_UNUSED bool utf8_validate(const char * buf, size_t len) const noexcept final override {
return set_best()->utf8_validate(buf, len);
// Forwards UTF-8 validation of buf[0..len) to the implementation object
// returned by set_best() and returns its result.
// NOTE(review): set_best() is defined outside this hunk; presumably it selects
// the best supported implementation — confirm.
WARN_UNUSED bool validate_utf8(const char * buf, size_t len) const noexcept final override {
return set_best()->validate_utf8(buf, len);
}
// Constructs the detector, registering it with the base class under the name
// "best_supported_detector" together with its human-readable description.
// NOTE(review): the meaning of the trailing 0 argument is not visible here — confirm against the base constructor.
really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {}
private:
@ -88,7 +88,7 @@ public:
// Ignores every argument and always reports UNSUPPORTED_ARCHITECTURE.
WARN_UNUSED error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final override {
return UNSUPPORTED_ARCHITECTURE;
}
WARN_UNUSED bool utf8_validate(const char *, size_t) const noexcept final override {
// Ignores its arguments and always returns false, so no input is ever
// reported as valid UTF-8 by this implementation.
WARN_UNUSED bool validate_utf8(const char *, size_t) const noexcept final override {
return false; // always refuse to validate
}
// Constructs the placeholder implementation, registering it with the base
// class under the name "unsupported" and a description for unsupported CPUs.
// NOTE(review): the meaning of the trailing 0 argument is not visible here — confirm against the base constructor.
unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {}

View File

@ -19,7 +19,7 @@ public:
std::unique_ptr<internal::dom_parser_implementation>& dst
) const noexcept final;
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
WARN_UNUSED bool utf8_validate(const char *buf, size_t len) const noexcept final;
WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final;
};
} // namespace westmere