diff --git a/singleheader/amalgamate_demo.cpp b/singleheader/amalgamate_demo.cpp index 3dfbddcf..bd230b82 100644 --- a/singleheader/amalgamate_demo.cpp +++ b/singleheader/amalgamate_demo.cpp @@ -1,4 +1,4 @@ -/* auto-generated on Sun Jun 21 11:49:12 PDT 2020. Do not edit! */ +/* auto-generated on Tue Jun 23 09:15:19 PDT 2020. Do not edit! */ #include #include "simdjson.h" diff --git a/singleheader/simdjson.cpp b/singleheader/simdjson.cpp index d78ad5df..909e14be 100644 --- a/singleheader/simdjson.cpp +++ b/singleheader/simdjson.cpp @@ -1,4 +1,4 @@ -/* auto-generated on Sun Jun 21 11:49:12 PDT 2020. Do not edit! */ +/* auto-generated on Tue Jun 23 09:15:19 PDT 2020. Do not edit! */ /* begin file src/simdjson.cpp */ #include "simdjson.h" @@ -371,6 +371,7 @@ public: std::unique_ptr& dst ) const noexcept final; WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; + WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final; }; } // namespace haswell @@ -402,6 +403,7 @@ public: std::unique_ptr& dst ) const noexcept final; WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; + WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final; }; } // namespace westmere @@ -433,6 +435,7 @@ public: std::unique_ptr& dst ) const noexcept final; WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; + WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final; }; } // namespace arm64 @@ -468,6 +471,7 @@ public: std::unique_ptr& dst ) const noexcept final; WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; + WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final; }; } // namespace fallback @@ -500,7 +504,9 @@ public: WARN_UNUSED error_code 
minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final { return set_best()->minify(buf, len, dst, dst_len); } - + WARN_UNUSED bool validate_utf8(const char * buf, size_t len) const noexcept final override { + return set_best()->validate_utf8(buf, len); + } really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {} private: const implementation *set_best() const noexcept; @@ -535,10 +541,19 @@ public: ) const noexcept final { return UNSUPPORTED_ARCHITECTURE; } - WARN_UNUSED error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final { + WARN_UNUSED error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final override { return UNSUPPORTED_ARCHITECTURE; } - + WARN_UNUSED bool validate_utf8(const char *, size_t) const noexcept final override { + return false; // Just refuse to validate. Given that we have a fallback implementation + // it seems unlikely that unsupported_implementation will ever be used. If it is used, + // then it will flag all strings as invalid. The alternative is to return an error_code + // from which the user has to figure out whether the string is valid UTF-8... which seems + // like a lot of work just to handle the very unlikely case that we have an unsupported + // implementation. And, when it does happen (that we have an unsupported implementation), + // what are the chances that the programmer has a fallback? Given that *we* provide the + // fallback, it implies that the programmer would need a fallback for our fallback. 
+ } unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {} }; @@ -589,6 +604,9 @@ SIMDJSON_DLLIMPORTEXPORT internal::atomic_ptr active_imple WARN_UNUSED error_code minify(const char *buf, size_t len, char *dst, size_t &dst_len) noexcept { return active_implementation->minify((const uint8_t *)buf, len, (uint8_t *)dst, dst_len); } +WARN_UNUSED bool validate_utf8(const char *buf, size_t len) noexcept { + return active_implementation->validate_utf8(buf, len); +} } // namespace simdjson @@ -3757,7 +3775,37 @@ WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, si this->len = _len; return arm64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming); } +/* begin file src/generic/stage1/utf8_validator.h */ +namespace stage1 { +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + return c.errors() == error_code::SUCCESS; +} +bool generic_validate_utf8(const char * input, size_t length) { + return generic_validate_utf8((const uint8_t *)input,length); +} + +} // namespace stage1 +/* end file src/generic/stage1/utf8_validator.h */ +WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return simdjson::arm64::stage1::generic_validate_utf8(buf,len); +} } // namespace arm64 } // namespace simdjson @@ -5798,6 +5846,70 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui return SUCCESS; } +// scalar UTF-8 validation: rejects truncated sequences, overlong forms, surrogates and code points above U+10FFFF. Credit: based on code from Google Fuchsia (Apache Licensed) +WARN_UNUSED bool implementation::validate_utf8(const char *buf, 
size_t len) const noexcept { + const uint8_t *data = (const uint8_t *)buf; + uint64_t pos = 0; + uint64_t next_pos = 0; + uint32_t code_point = 0; + while (pos < len) { + // check if the next 16 bytes are ascii. + next_pos = pos + 16; + if (next_pos <= len) { // if it is safe to read 16 more bytes, check that they are ascii + uint64_t v1; + memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) == 0) { + pos = next_pos; + continue; + } + } + unsigned char byte = data[pos]; + if (byte < 0b10000000) { + pos++; + continue; + } else if ((byte & 0b11100000) == 0b11000000) { + next_pos = pos + 2; + if (next_pos > len) { return false; } + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; } + // range check + code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); + if (code_point < 0x80 || 0x7ff < code_point) { return false; } + } else if ((byte & 0b11110000) == 0b11100000) { + next_pos = pos + 3; + if (next_pos > len) { return false; } + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; } + // range check + code_point = (byte & 0b00001111) << 12 | + (data[pos + 1] & 0b00111111) << 6 | + (data[pos + 2] & 0b00111111); + if (code_point < 0x800 || 0xffff < code_point || + (0xd7ff < code_point && code_point < 0xe000)) { + return false; + } + } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000 + next_pos = pos + 4; + if (next_pos > len) { return false; } + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; } + if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; } + // range check: a 4-byte sequence must encode U+10000..U+10FFFF + code_point = + (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | + (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); + if (code_point <= 
0xffff || 0x10ffff < code_point) { return false; } + } else { + // stray continuation byte (10xxxxxx) or invalid lead byte (11111xxx) + return false; + } + pos = next_pos; + } + return true; +} + } // namespace fallback } // namespace simdjson @@ -9121,7 +9233,37 @@ WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, si this->len = _len; return haswell::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming); } +/* begin file src/generic/stage1/utf8_validator.h */ +namespace stage1 { +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + return c.errors() == error_code::SUCCESS; +} +bool generic_validate_utf8(const char * input, size_t length) { + return generic_validate_utf8((const uint8_t *)input,length); +} + +} // namespace stage1 +/* end file src/generic/stage1/utf8_validator.h */ +WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return simdjson::haswell::stage1::generic_validate_utf8(buf,len); +} } // namespace haswell } // namespace simdjson UNTARGET_REGION @@ -12368,7 +12510,37 @@ WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, si this->len = _len; return westmere::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming); } +/* begin file src/generic/stage1/utf8_validator.h */ +namespace stage1 { +/** + * Validates that the string is actual UTF-8. 
+ */ +template +bool generic_validate_utf8(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + return c.errors() == error_code::SUCCESS; +} +bool generic_validate_utf8(const char * input, size_t length) { + return generic_validate_utf8((const uint8_t *)input,length); +} + +} // namespace stage1 +/* end file src/generic/stage1/utf8_validator.h */ +WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return simdjson::westmere::stage1::generic_validate_utf8(buf,len); +} } // namespace westmere } // namespace simdjson UNTARGET_REGION diff --git a/singleheader/simdjson.h b/singleheader/simdjson.h index 63a8f14e..c089c622 100644 --- a/singleheader/simdjson.h +++ b/singleheader/simdjson.h @@ -1,4 +1,4 @@ -/* auto-generated on Sun Jun 21 11:49:12 PDT 2020. Do not edit! */ +/* auto-generated on Tue Jun 23 09:15:19 PDT 2020. Do not edit! */ /* begin file include/simdjson.h */ #ifndef SIMDJSON_H #define SIMDJSON_H @@ -2637,6 +2637,36 @@ inline error_code dom_parser_implementation::allocate(size_t capacity, size_t ma namespace simdjson { +/** + * Validate the UTF-8 string. + * + * @param buf the string to validate. + * @param len the length of the string in bytes. + * @return true if the string is valid UTF-8. + */ +WARN_UNUSED bool validate_utf8(const char * buf, size_t len) noexcept; + + +/** + * Validate the UTF-8 string. + * + * @param sv the string_view to validate. + * @return true if the string is valid UTF-8. + */ +really_inline WARN_UNUSED bool validate_utf8(const std::string_view sv) noexcept { + return validate_utf8(sv.data(), sv.size()); +} + +/** + * Validate the UTF-8 string. + * + * @param s the string to validate. 
+ * @return true if the string is valid UTF-8. + */ +really_inline WARN_UNUSED bool validate_utf8(const std::string& s) noexcept { + return validate_utf8(s.data(), s.size()); +} + namespace dom { class document; } // namespace dom @@ -2710,6 +2740,18 @@ public: * @return the error code, or SUCCESS if there was no error. */ WARN_UNUSED virtual error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept = 0; + + + /** + * Validate the UTF-8 string. + * + * Overridden by each implementation. + * + * @param buf the string to validate. + * @param len the length of the string in bytes. + * @return true if and only if the string is valid UTF-8. + */ + WARN_UNUSED virtual bool validate_utf8(const char *buf, size_t len) const noexcept = 0; protected: /** @private Construct an implementation with the given name and description. For subclasses. */ @@ -3984,7 +4026,7 @@ public: * Returns INCORRECT_TYPE if the JSON element is not an integer, or NUMBER_OUT_OF_RANGE * if it is negative. */ - inline simdjson_result get_int64_t() const noexcept; + inline simdjson_result get_int64() const noexcept; /** * Cast this element to an unsigned integer. * @@ -3994,7 +4036,7 @@ public: * Returns INCORRECT_TYPE if the JSON element is not an integer, or NUMBER_OUT_OF_RANGE * if it is too large. */ - inline simdjson_result get_uint64_t() const noexcept; + inline simdjson_result get_uint64() const noexcept; /** * Cast this element to an double floating-point. * @@ -4037,13 +4079,13 @@ public: * * Equivalent to is(). */ - inline bool is_int64_t() const noexcept; + inline bool is_int64() const noexcept; /** * Whether this element is a json number that fits in an unsigned 64-bit integer. * * Equivalent to is(). */ - inline bool is_uint64_t() const noexcept; + inline bool is_uint64() const noexcept; /** * Whether this element is a json number that fits in a double. 
* @@ -4078,6 +4120,7 @@ public: * - Object: dom::object * * @tparam T bool, double, uint64_t, int64_t, std::string_view, const char *, dom::array, dom::object + * @returns true if the value can be cast to the given type, false if not. */ template really_inline bool is() const noexcept; @@ -4349,7 +4392,7 @@ public: really_inline simdjson_result type() const noexcept; template - really_inline simdjson_result is() const noexcept; + really_inline bool is() const noexcept; template really_inline simdjson_result get() const noexcept; template @@ -4359,19 +4402,19 @@ public: really_inline simdjson_result get_object() const noexcept; really_inline simdjson_result get_c_str() const noexcept; really_inline simdjson_result get_string() const noexcept; - really_inline simdjson_result get_int64_t() const noexcept; - really_inline simdjson_result get_uint64_t() const noexcept; + really_inline simdjson_result get_int64() const noexcept; + really_inline simdjson_result get_uint64() const noexcept; really_inline simdjson_result get_double() const noexcept; really_inline simdjson_result get_bool() const noexcept; - really_inline simdjson_result is_array() const noexcept; - really_inline simdjson_result is_object() const noexcept; - really_inline simdjson_result is_string() const noexcept; - really_inline simdjson_result is_int64_t() const noexcept; - really_inline simdjson_result is_uint64_t() const noexcept; - really_inline simdjson_result is_double() const noexcept; - really_inline simdjson_result is_bool() const noexcept; - really_inline simdjson_result is_null() const noexcept; + really_inline bool is_array() const noexcept; + really_inline bool is_object() const noexcept; + really_inline bool is_string() const noexcept; + really_inline bool is_int64() const noexcept; + really_inline bool is_uint64() const noexcept; + really_inline bool is_double() const noexcept; + really_inline bool is_bool() const noexcept; + really_inline bool is_null() const noexcept; really_inline 
simdjson_result operator[](const std::string_view &key) const noexcept; really_inline simdjson_result operator[](const char *key) const noexcept; @@ -5739,9 +5782,8 @@ inline simdjson_result simdjson_result::type() } template -really_inline simdjson_result simdjson_result::is() const noexcept { - if (error()) { return error(); } - return first.is(); +really_inline bool simdjson_result::is() const noexcept { + return !error() && first.is(); } template really_inline simdjson_result simdjson_result::get() const noexcept { @@ -5770,13 +5812,13 @@ really_inline simdjson_result simdjson_result::g if (error()) { return error(); } return first.get_string(); } -really_inline simdjson_result simdjson_result::get_int64_t() const noexcept { +really_inline simdjson_result simdjson_result::get_int64() const noexcept { if (error()) { return error(); } - return first.get_int64_t(); + return first.get_int64(); } -really_inline simdjson_result simdjson_result::get_uint64_t() const noexcept { +really_inline simdjson_result simdjson_result::get_uint64() const noexcept { if (error()) { return error(); } - return first.get_uint64_t(); + return first.get_uint64(); } really_inline simdjson_result simdjson_result::get_double() const noexcept { if (error()) { return error(); } @@ -5787,38 +5829,30 @@ really_inline simdjson_result simdjson_result::get_bool() co return first.get_bool(); } -really_inline simdjson_result simdjson_result::is_array() const noexcept { - if (error()) { return error(); } - return first.is_array(); +really_inline bool simdjson_result::is_array() const noexcept { + return !error() && first.is_array(); } -really_inline simdjson_result simdjson_result::is_object() const noexcept { - if (error()) { return error(); } - return first.is_object(); +really_inline bool simdjson_result::is_object() const noexcept { + return !error() && first.is_object(); } -really_inline simdjson_result simdjson_result::is_string() const noexcept { - if (error()) { return error(); } - return 
first.is_string(); +really_inline bool simdjson_result::is_string() const noexcept { + return !error() && first.is_string(); } -really_inline simdjson_result simdjson_result::is_int64_t() const noexcept { - if (error()) { return error(); } - return first.is_int64_t(); +really_inline bool simdjson_result::is_int64() const noexcept { + return !error() && first.is_int64(); } -really_inline simdjson_result simdjson_result::is_uint64_t() const noexcept { - if (error()) { return error(); } - return first.is_uint64_t(); +really_inline bool simdjson_result::is_uint64() const noexcept { + return !error() && first.is_uint64(); } -really_inline simdjson_result simdjson_result::is_double() const noexcept { - if (error()) { return error(); } - return first.is_double(); +really_inline bool simdjson_result::is_double() const noexcept { + return !error() && first.is_double(); } -really_inline simdjson_result simdjson_result::is_bool() const noexcept { - if (error()) { return error(); } - return first.is_bool(); +really_inline bool simdjson_result::is_bool() const noexcept { + return !error() && first.is_bool(); } -really_inline simdjson_result simdjson_result::is_null() const noexcept { - if (error()) { return error(); } - return first.is_null(); +really_inline bool simdjson_result::is_null() const noexcept { + return !error() && first.is_null(); } really_inline simdjson_result simdjson_result::operator[](const std::string_view &key) const noexcept { @@ -5922,7 +5956,7 @@ inline simdjson_result element::get_string() const noexcept { return INCORRECT_TYPE; } } -inline simdjson_result element::get_uint64_t() const noexcept { +inline simdjson_result element::get_uint64() const noexcept { if(unlikely(!tape.is_uint64())) { // branch rarely taken if(tape.is_int64()) { int64_t result = tape.next_tape_value(); @@ -5935,7 +5969,7 @@ inline simdjson_result element::get_uint64_t() const noexcept { } return tape.next_tape_value(); } -inline simdjson_result element::get_int64_t() const 
noexcept { +inline simdjson_result element::get_int64() const noexcept { if(unlikely(!tape.is_int64())) { // branch rarely taken if(tape.is_uint64()) { uint64_t result = tape.next_tape_value(); @@ -6008,16 +6042,16 @@ template<> inline simdjson_result element::get() const noexcept { template<> inline simdjson_result element::get() const noexcept { return get_object(); } template<> inline simdjson_result element::get() const noexcept { return get_c_str(); } template<> inline simdjson_result element::get() const noexcept { return get_string(); } -template<> inline simdjson_result element::get() const noexcept { return get_int64_t(); } -template<> inline simdjson_result element::get() const noexcept { return get_uint64_t(); } +template<> inline simdjson_result element::get() const noexcept { return get_int64(); } +template<> inline simdjson_result element::get() const noexcept { return get_uint64(); } template<> inline simdjson_result element::get() const noexcept { return get_double(); } template<> inline simdjson_result element::get() const noexcept { return get_bool(); } inline bool element::is_array() const noexcept { return is(); } inline bool element::is_object() const noexcept { return is(); } inline bool element::is_string() const noexcept { return is(); } -inline bool element::is_int64_t() const noexcept { return is(); } -inline bool element::is_uint64_t() const noexcept { return is(); } +inline bool element::is_int64() const noexcept { return is(); } +inline bool element::is_uint64() const noexcept { return is(); } inline bool element::is_double() const noexcept { return is(); } inline bool element::is_bool() const noexcept { return is(); }