Tweaking.
This commit is contained in:
parent
5dc07ed295
commit
f03a6ab5a4
|
@ -94,7 +94,7 @@ public:
|
|||
* @param len the length of the string in bytes.
|
||||
* @return true if and only if the string is valid UTF-8.
|
||||
*/
|
||||
WARN_UNUSED virtual bool utf8_validate(const char *buf, size_t len) const noexcept = 0;
|
||||
WARN_UNUSED virtual bool validate_utf8(const char *buf, size_t len) const noexcept = 0;
|
||||
|
||||
protected:
|
||||
/** @private Construct an implementation with the given name and description. For subclasses. */
|
||||
|
|
|
@ -243,6 +243,93 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
|
|||
// for fear of aliasing
|
||||
return SUCCESS;
|
||||
}
|
||||
/**
 * Scalar (fallback) UTF-8 validation.
 *
 * Walks the input one code point at a time. The input is rejected when it
 * contains a truncated sequence, a missing continuation byte, an overlong
 * encoding, a UTF-16 surrogate (U+D800..U+DFFF), a code point above U+10FFFF,
 * or a byte that cannot start a sequence (stray continuation / invalid lead).
 *
 * @param buf the string to validate.
 * @param len the length of the string in bytes.
 * @return true if and only if the string is valid UTF-8.
 */
WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
  uint64_t pos = 0;
  uint64_t next_pos = 0;
  uint32_t code_point = 0;
  while (pos < len) {
    // Fast path: if it is safe to read 16 more bytes, check whether they are
    // all ASCII (no byte has its high bit set) and skip them in one step.
    next_pos = pos + 16;
    if (next_pos <= len) {
      uint64_t v1;
      memcpy(&v1, data + pos, sizeof(uint64_t));
      uint64_t v2;
      memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
      const uint64_t v{v1 | v2};
      if ((v & 0x8080808080808080) == 0) {
        pos = next_pos;
        continue;
      }
    }

    const unsigned char byte = data[pos];

    if (byte < 0b10000000) {
      // Single-byte (ASCII) code point.
      pos++;
      continue;
    } else if ((byte & 0b11100000) == 0b11000000) {
      // Two-byte sequence: 110xxxxx 10xxxxxx
      next_pos = pos + 2;
      if (next_pos > len) {
        return false; // truncated sequence
      }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return false; // missing continuation byte
      }
      // Range check: anything below U+0080 is an overlong encoding.
      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
      if (code_point < 0x80 || 0x7ff < code_point) {
        return false;
      }
    } else if ((byte & 0b11110000) == 0b11100000) {
      // Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
      next_pos = pos + 3;
      if (next_pos > len) {
        return false; // truncated sequence
      }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return false;
      }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
        return false;
      }
      // Range check: reject overlong encodings (below U+0800) and the
      // UTF-16 surrogate range U+D800..U+DFFF.
      code_point = (byte & 0b00001111) << 12 |
                   (data[pos + 1] & 0b00111111) << 6 |
                   (data[pos + 2] & 0b00111111);
      if (code_point < 0x800 || 0xffff < code_point ||
          (0xd7ff < code_point && code_point < 0xe000)) {
        return false;
      }
    } else if ((byte & 0b11111000) == 0b11110000) {
      // Four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      next_pos = pos + 4;
      if (next_pos > len) {
        return false; // truncated sequence
      }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return false;
      }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
        return false;
      }
      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
        return false;
      }
      // Range check: a four-byte sequence must encode U+10000..U+10FFFF.
      // U+FFFF itself fits in three bytes, so encoding it with four bytes
      // (F0 8F BF BF) is overlong and must be rejected: hence `<=`, not `<`.
      code_point =
          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
      if (code_point <= 0xffff || 0x10ffff < code_point) {
        return false;
      }
    } else {
      // Stray continuation byte (10xxxxxx) or invalid lead byte (11111xxx).
      return false;
    }
    pos = next_pos;
  }
  return true;
}
|
||||
|
||||
} // namespace fallback
|
||||
} // namespace simdjson
|
||||
|
|
|
@ -22,7 +22,7 @@ public:
|
|||
std::unique_ptr<internal::dom_parser_implementation>& dst
|
||||
) const noexcept final;
|
||||
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
|
||||
WARN_UNUSED bool utf8_validate(const char *buf, size_t len) const noexcept final;
|
||||
WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
||||
};
|
||||
|
||||
} // namespace fallback
|
||||
|
|
|
@ -94,6 +94,9 @@ WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, si
|
|||
return haswell::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming);
|
||||
}
|
||||
#include "generic/stage1/utf8_validator.h"
|
||||
/**
 * UTF-8 validation entry point for this implementation: forwards the buffer
 * to simdjson::haswell::stage1::utf8_validate and returns its verdict.
 */
WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  const bool is_valid = simdjson::haswell::stage1::utf8_validate(buf, len);
  return is_valid;
}
|
||||
} // namespace haswell
|
||||
} // namespace simdjson
|
||||
UNTARGET_REGION
|
||||
|
|
|
@ -20,7 +20,7 @@ public:
|
|||
std::unique_ptr<internal::dom_parser_implementation>& dst
|
||||
) const noexcept final;
|
||||
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
|
||||
WARN_UNUSED bool utf8_validate(const char *buf, size_t len) const noexcept final;
|
||||
WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
||||
};
|
||||
|
||||
} // namespace haswell
|
||||
|
|
|
@ -48,8 +48,8 @@ public:
|
|||
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final {
|
||||
return set_best()->minify(buf, len, dst, dst_len);
|
||||
}
|
||||
WARN_UNUSED bool utf8_validate(const char * buf, size_t len) const noexcept final override {
|
||||
return set_best()->utf8_validate(buf, len);
|
||||
/** Obtains an implementation from set_best() and forwards the UTF-8 validation request to it. */
WARN_UNUSED bool validate_utf8(const char * buf, size_t len) const noexcept final override {
  const auto best = set_best();
  return best->validate_utf8(buf, len);
}
|
||||
really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {}
|
||||
private:
|
||||
|
@ -88,7 +88,7 @@ public:
|
|||
WARN_UNUSED error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final override {
|
||||
return UNSUPPORTED_ARCHITECTURE;
|
||||
}
|
||||
WARN_UNUSED bool utf8_validate(const char *, size_t) const noexcept final override {
|
||||
/** No validation is performed here: both arguments are ignored and every input is reported as invalid. */
WARN_UNUSED bool validate_utf8(const char *, size_t) const noexcept final override {
  return false; // always refuse to validate
}
|
||||
unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {}
|
||||
|
|
|
@ -19,7 +19,7 @@ public:
|
|||
std::unique_ptr<internal::dom_parser_implementation>& dst
|
||||
) const noexcept final;
|
||||
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
|
||||
WARN_UNUSED bool utf8_validate(const char *buf, size_t len) const noexcept final;
|
||||
WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
||||
};
|
||||
|
||||
} // namespace westmere
|
||||
|
|
Loading…
Reference in New Issue