From 7356b4532faa5d822b8bce02c1a58905db59b8f3 Mon Sep 17 00:00:00 2001 From: John Keiser Date: Sun, 10 Nov 2019 20:44:51 -0800 Subject: [PATCH] Perform UTF-8 detection via flag lookup algorithm - adds the alternative zwegner, range and lookup utf8 algorithms as well, for ability to do "shootouts" --- Makefile | 2 +- src/CMakeLists.txt | 5 +- src/arm64/simd.h | 32 +- src/arm64/stage1_find_marks.h | 2 +- ...8check.h => utf8_fastvalidate_algorithm.h} | 0 src/generic/utf8_lookup_algorithm.h | 380 ++++++++++++++++++ src/generic/utf8_range_algorithm.h | 178 ++++++++ src/generic/utf8_zwegner_algorithm.h | 358 +++++++++++++++++ src/haswell/simd.h | 36 +- src/haswell/stage1_find_marks.h | 2 +- src/westmere/simd.h | 31 +- src/westmere/stage1_find_marks.h | 2 +- 12 files changed, 1000 insertions(+), 28 deletions(-) rename src/generic/{simdutf8check.h => utf8_fastvalidate_algorithm.h} (100%) create mode 100644 src/generic/utf8_lookup_algorithm.h create mode 100644 src/generic/utf8_range_algorithm.h create mode 100644 src/generic/utf8_zwegner_algorithm.h diff --git a/Makefile b/Makefile index c7fbe71a..20b9121a 100644 --- a/Makefile +++ b/Makefile @@ -64,7 +64,7 @@ COMPARISONEXECUTABLES=minifiercompetition parsingcompetition parseandstatcompeti SUPPLEMENTARYEXECUTABLES=parse_noutf8validation parse_nonumberparsing parse_nostringparsing # Load headers and sources -LIBHEADERS=src/simdprune_tables.h src/numberparsing.h src/jsoncharutils.h src/arm64/bitmask.h src/arm64/simd.h src/arm64/stage1_find_marks.h src/arm64/stage2_build_tape.h src/arm64/stringparsing.h src/generic/stage1_find_marks.h src/generic/stage2_build_tape.h src/generic/stringparsing.h src/haswell/bitmask.h src/haswell/simd.h src/generic/simdutf8check.h src/haswell/stage1_find_marks.h src/haswell/stage2_build_tape.h src/haswell/stringparsing.h src/westmere/bitmask.h src/westmere/simd.h src/westmere/stage1_find_marks.h src/westmere/stage2_build_tape.h src/westmere/stringparsing.h 
src/generic/stage2_streaming_build_tape.h +LIBHEADERS=src/simdprune_tables.h src/numberparsing.h src/jsoncharutils.h src/arm64/bitmask.h src/arm64/simd.h src/arm64/stage1_find_marks.h src/arm64/stage2_build_tape.h src/arm64/stringparsing.h src/generic/stage1_find_marks.h src/generic/stage2_build_tape.h src/generic/stringparsing.h src/haswell/bitmask.h src/haswell/simd.h src/generic/utf8_fastvalidate_algorithm.h src/generic/utf8_lookup_algorithm.h src/generic/utf8_range_algorithm.h src/generic/utf8_zwegner_algorithm.h src/haswell/stage1_find_marks.h src/haswell/stage2_build_tape.h src/haswell/stringparsing.h src/westmere/bitmask.h src/westmere/simd.h src/westmere/stage1_find_marks.h src/westmere/stage2_build_tape.h src/westmere/stringparsing.h src/generic/stage2_streaming_build_tape.h PUBHEADERS=include/simdjson/common_defs.h include/simdjson/isadetection.h include/simdjson/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonminifier.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/parsedjson.h include/simdjson/parsedjsoniterator.h include/simdjson/portability.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h include/simdjson/stage1_find_marks.h include/simdjson/stage2_build_tape.h HEADERS=$(PUBHEADERS) $(LIBHEADERS) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 231e2382..9f957e67 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -47,7 +47,10 @@ set(SIMDJSON_SRC_HEADERS generic/stage2_build_tape.h generic/stage2_streaming_build_tape.h generic/stringparsing.h - generic/simdutf8check.h + generic/utf8_fastvalidate_algorithm.h + generic/utf8_lookup_algorithm.h + generic/utf8_range_algorithm.h + generic/utf8_zwegner_algorithm.h haswell/bitmask.h haswell/simd.h haswell/stage1_find_marks.h diff --git a/src/arm64/simd.h b/src/arm64/simd.h index 525e302c..5a3f6683 100644 --- a/src/arm64/simd.h +++ b/src/arm64/simd.h @@ -47,7 +47,8 @@ namespace simdjson::arm64::simd { // SIMD byte mask 
type (returned by things like eq and gt) template<> struct simd8: base_u8 { - typedef uint32_t bitmask_t; + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; static really_inline simd8 splat(bool _value) { return vmovq_n_u8(-(!!_value)); } @@ -57,7 +58,9 @@ namespace simdjson::arm64::simd { // Splat constructor really_inline simd8(bool _value) : simd8(splat(_value)) {} - really_inline simd8::bitmask_t to_bitmask() const { + // We return uint32_t instead of uint16_t because that seems to be more efficient for most + // purposes (cutting it down to uint16_t costs performance in some compilers). + really_inline uint32_t to_bitmask() const { const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; auto minput = *this & bit_mask; @@ -119,6 +122,8 @@ namespace simdjson::arm64::simd { really_inline simd8 max(const simd8 other) const { return vmaxq_u8(*this, other); } really_inline simd8 min(const simd8 other) const { return vminq_u8(*this, other); } really_inline simd8 operator<=(const simd8 other) const { return vcleq_u8(*this, other); } + really_inline simd8 operator>=(const simd8 other) const { return vcgeq_u8(*this, other); } + really_inline simd8 operator>(const simd8 other) const { return vcgtq_u8(*this, other); } // Bit-specific operations really_inline simd8 any_bits_set(simd8 bits) const { return vtstq_u8(*this, bits); } @@ -131,18 +136,21 @@ namespace simdjson::arm64::simd { // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) template + really_inline simd8 lookup_16(simd8 lookup_table) const { + return lookup_table.apply_lookup_16_to(*this); + } + template really_inline simd8 lookup_16( L replace0, L replace1, L replace2, L replace3, L replace4, L replace5, L replace6, L replace7, L replace8, L replace9, L replace10, L replace11, L replace12, L replace13, L replace14, L replace15) const { - simd8 lookup_table( + return 
lookup_16(simd8::repeat_16( replace0, replace1, replace2, replace3, replace4, replace5, replace6, replace7, replace8, replace9, replace10, replace11, replace12, replace13, replace14, replace15 - ); - return lookup_table.apply_lookup_16_to(*this); + )); } template @@ -178,7 +186,7 @@ namespace simdjson::arm64::simd { ) : simd8(int8x16_t{ v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,v11,v12,v13,v14,v15 - }) {} + }) {} // Repeat 16 values as many times as necessary (usually for lookup tables) really_inline static simd8 repeat_16( int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, @@ -214,24 +222,28 @@ namespace simdjson::arm64::simd { return vextq_s8(prev_chunk, *this, 16 - N); } - // Perform a lookup of the lower 4 bits + // Perform a lookup assuming no value is larger than 16 + template + really_inline simd8 lookup_16(simd8 lookup_table) const { + return lookup_table.apply_lookup_16_to(*this); + } template really_inline simd8 lookup_16( L replace0, L replace1, L replace2, L replace3, L replace4, L replace5, L replace6, L replace7, L replace8, L replace9, L replace10, L replace11, L replace12, L replace13, L replace14, L replace15) const { - return simd8(*this).lookup_16( + return lookup_16(simd8::repeat_16( replace0, replace1, replace2, replace3, replace4, replace5, replace6, replace7, replace8, replace9, replace10, replace11, replace12, replace13, replace14, replace15 - ); + )); } template really_inline simd8 apply_lookup_16_to(const simd8 original) { - return vqtbl1q_s8(*this, original); + return vqtbl1q_s8(*this, simd8(original)); } }; diff --git a/src/arm64/stage1_find_marks.h b/src/arm64/stage1_find_marks.h index caae3b7d..955d99cb 100644 --- a/src/arm64/stage1_find_marks.h +++ b/src/arm64/stage1_find_marks.h @@ -29,7 +29,7 @@ really_inline void find_whitespace_and_operators( whitespace = v.map([&](simd8 _v) { return _v.any_bits_set(0x18); }).to_bitmask(); } -#include "generic/simdutf8check.h" +#include 
"generic/utf8_fastvalidate_algorithm.h" #include "generic/stage1_find_marks.h" } // namespace simdjson::arm64 diff --git a/src/generic/simdutf8check.h b/src/generic/utf8_fastvalidate_algorithm.h similarity index 100% rename from src/generic/simdutf8check.h rename to src/generic/utf8_fastvalidate_algorithm.h diff --git a/src/generic/utf8_lookup_algorithm.h b/src/generic/utf8_lookup_algorithm.h new file mode 100644 index 00000000..d76e5c88 --- /dev/null +++ b/src/generic/utf8_lookup_algorithm.h @@ -0,0 +1,380 @@ +// +// Detect Unicode errors. +// +// UTF-8 is designed to allow multiple bytes and be compatible with ASCII. It's a fairly basic +// encoding that uses the first few bits on each byte to denote a "byte type", and all other bits +// are straight up concatenated into the final value. The first byte of a multibyte character is a +// "leading byte" and starts with N 1's, where N is the total number of bytes (110_____ = 2 byte +// lead). The remaining bytes of a multibyte character all start with 10. 1-byte characters just +// start with 0, because that's what ASCII looks like. Here's what each size +// +// - ASCII (7 bits): 0_______ +// - 2 byte character (11 bits): 110_____ 10______ +// - 3 byte character (17 bits): 1110____ 10______ 10______ +// - 4 byte character (23 bits): 11110___ 10______ 10______ 10______ +// - 5+ byte character (illegal): 11111___ +// +// There are 5 classes of error that can happen in Unicode: +// +// - TOO_SHORT: when you have a multibyte character with too few bytes (i.e. missing continuation). +// We detect this by looking for new characters (lead bytes) inside the range of a multibyte +// character. +// +// e.g. 11000000 01100001 (2-byte character where second byte is ASCII) +// +// - TOO_LONG: when there are more bytes in your character than you need (i.e. extra continuation). +// We detect this by requiring that the next byte after your multibyte character be a new +// character--so a continuation after your character is wrong. 
+// +// e.g. 11011111 10111111 10111111 (2-byte character followed by *another* continuation byte) +// +// - TOO_LARGE: Unicode only goes up to U+10FFFF. These characters are too large. +// +// e.g. 11110111 10111111 10111111 10111111 (bigger than 10FFFF). +// +// - OVERLONG: multibyte characters with a bunch of leading zeroes, where you could have +// used fewer bytes to make the same character. Like encoding an ASCII character in 4 bytes is +// technically possible, but UTF-8 disallows it so that there is only one way to write an "a". +// +// e.g. 11000001 10100001 (2-byte encoding of "a", which only requires 1 byte: 01100001) +// +// - SURROGATE: Unicode U+D800-U+DFFF is a *surrogate* character, reserved for use in UCS-2 and +// WTF-8 encodings for characters with > 2 bytes. These are illegal in pure UTF-8. +// +// e.g. 11101101 10100000 10000000 (U+D800) +// +// - INVALID_5_BYTE: 5-byte, 6-byte, 7-byte and 8-byte characters are unsupported; Unicode does not +// support values with more than 21 bits (which a 4-byte character supports). +// +// e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000) +// +// Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94: +// +// Code Points 1st 2s 3s 4s +// U+0000..U+007F 00..7F +// U+0080..U+07FF C2..DF 80..BF +// U+0800..U+0FFF E0 A0..BF 80..BF +// U+1000..U+CFFF E1..EC 80..BF 80..BF +// U+D000..U+D7FF ED 80..9F 80..BF +// U+E000..U+FFFF EE..EF 80..BF 80..BF +// U+10000..U+3FFFF F0 90..BF 80..BF 80..BF +// U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF +// U+100000..U+10FFFF F4 80..8F 80..BF 80..BF +// +using namespace simd; + +namespace utf8_validation { + +// +// These are the bits in lead_flags. Its main purpose is to tell you what kind of lead character +// it is (1,2,3 or 4--or none if it's continuation), but it also maps 4 other bits that will be +// used to detect other kinds of errors. 
+// +// LEAD_4 is first because we use a << trick in get_byte_3_4_5_errors to turn LEAD_2 -> LEAD_3, +// LEAD_3 -> LEAD_4, and we want LEAD_4 to turn into nothing since there is no LEAD_5. This trick +// lets us use one constant table instead of 3, possibly saving registers on systems with fewer +// registers. +// +const uint8_t LEAD_4 = 0x01; // [1111]____ 10______ 10______ 10______ (0_|11)__ +const uint8_t LEAD_3 = 0x02; // [1110]____ 10______ 10______ (0|11)__ +const uint8_t LEAD_2 = 0x04; // [110_]____ 10______ (0|11)__ +const uint8_t LEAD_1 = 0x08; // [0___]____ (0|11)__ +const uint8_t LEAD_2_PLUS = 0x10; // [11__]____ ... +const uint8_t LEAD_1100 = 0x20; // [1100]____ ... +const uint8_t LEAD_1110 = 0x40; // [1110]____ ... +const uint8_t LEAD_1111 = 0x80; // [1111]____ ... + +really_inline simd8 get_lead_flags(const simd8 high_bits, const simd8 prev_high_bits) { + // Total: 2 instructions, 1 constant + // - 1 byte shift (shuffle) + // - 1 table lookup (shuffle) + // - 1 table constant + + // high_bits is byte 5, so lead is high_bits.prev<4>() + return high_bits.prev<4>(prev_high_bits).lookup_16( + LEAD_1, LEAD_1, LEAD_1, LEAD_1, // [0___]____ (ASCII) + LEAD_1, LEAD_1, LEAD_1, LEAD_1, // [0___]____ (ASCII) + 0, 0, 0, 0, // [10__]____ (continuation) + LEAD_2 | LEAD_2_PLUS | LEAD_1100, // [1100]____ + LEAD_2 | LEAD_2_PLUS, // [110_]____ + LEAD_3 | LEAD_2_PLUS | LEAD_1110, // [1110]____ + LEAD_4 | LEAD_2_PLUS | LEAD_1111 // [1111]____ + ); +} + +// Find errors in bytes 1 and 2 together (one single multi-nibble &) +really_inline simd8 get_byte_1_2_errors(const simd8 input, const simd8 prev_input, const simd8 high_bits, const simd8 prev_high_bits) { + // + // These are the errors we're going to match for bytes 1-2, by looking at the first three + // nibbles of the character: lead_flags & & + // + // The important thing here is that these constants all take up *different* bits, since they + // match different patterns. 
This is why there are 2 LEAD_4 and 2 LEAD_3s in lead_flags, among + // other things. + // + static const int TOO_SHORT_2 = LEAD_2_PLUS; // 11______ (0___|11__)____ + static const int TOO_LONG_1 = LEAD_1; // 0_______ 10______ + static const int OVERLONG_2 = LEAD_1100; // 1100000_ ________ (technically we match 10______ but we could match ________, they both yield errors either way) + static const int OVERLONG_3 = LEAD_3; // 11100000 100_____ ________ + static const int OVERLONG_4 = LEAD_4; // 11110000 1000____ ________ ________ + static const int TOO_LARGE = LEAD_1111; // 11110100 (1001|101_)____ + static const int SURROGATE = LEAD_1110; // 11101101 [101_]____ + + // Total: 4 instructions, 2 constants + // - 2 table lookups (shuffles) + // - 1 byte shift (shuffle) + // - 1 "and" + // - 2 table constants + + // After processing the rest of byte 1 (the low bits), we're still not done--we have to check + // byte 2 to be sure which things are errors and which aren't. + // Since input is byte 5, byte 1 is input.prev<4> + const simd8 byte_1_flags = (input.prev<4>(prev_input) & 0x0F).lookup_16( + // ____[00__] ________ + TOO_SHORT_2 | TOO_LONG_1 | OVERLONG_2 | OVERLONG_3 | OVERLONG_4, // ____[0000] ________ + TOO_SHORT_2 | TOO_LONG_1 | OVERLONG_2, // ____[0001] ________ + TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1, + // ____[01__] ________ + TOO_SHORT_2 | TOO_LONG_1 | TOO_LARGE, // ____[0100] ________ + TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1, + // ____[10__] ________ + TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1, + // ____[11__] ________ + TOO_SHORT_2 | TOO_LONG_1, + TOO_SHORT_2 | TOO_LONG_1 | SURROGATE, // ____[1101] ________ + TOO_SHORT_2 | TOO_LONG_1, TOO_SHORT_2 | TOO_LONG_1 + ); + // Since high_bits is byte 5, byte 2 is high_bits.prev<3> + const simd8 byte_2_flags = high_bits.prev<3>(prev_high_bits).lookup_16( + // ASCII: ________ [0___]____ + OVERLONG_2 | 
TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, + // ASCII: ________ [0___]____ + OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, + // Continuations: ________ [10__]____ + OVERLONG_2 | TOO_LONG_1 | OVERLONG_3 | OVERLONG_4, // ________ [1000]____ + OVERLONG_2 | TOO_LONG_1 | OVERLONG_3 | SURROGATE, // ________ [1001]____ + OVERLONG_2 | TOO_LONG_1 | TOO_LARGE | SURROGATE, // ________ [1010]____ + OVERLONG_2 | TOO_LONG_1 | TOO_LARGE | SURROGATE, // ________ [1011]____ + // Multibyte Leads: ________ [11__]____ + OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2, OVERLONG_2 | TOO_SHORT_2 + ); + return byte_1_flags & byte_2_flags; +} + +really_inline simd8 get_byte_3_4_5_errors(const simd8 high_bits, const simd8 prev_high_bits) { + // Total 7 instructions, 3 simd constants: + // - 3 table lookups (shuffles) + // - 2 byte shifts (shuffles) + // - 2 "or" + // - 1 table constant + + const simd8 byte_3_table = simd8::repeat_16( + // TOO_SHORT ASCII: 111_____ ________ [0___]____ + LEAD_3, LEAD_3, LEAD_3, LEAD_3, + LEAD_3, LEAD_3, LEAD_3, LEAD_3, + // TOO_LONG Continuations: 110_____ ________ [10__]____ + LEAD_2, LEAD_2, LEAD_2, LEAD_2, + // TOO_SHORT Multibyte Leads: 111_____ ________ [11__]____ + LEAD_3, LEAD_3, LEAD_3, LEAD_3 + ); + const simd8 byte_4_table = byte_3_table.shr<1>(); // TOO_SHORT: LEAD_4, TOO_LONG: LEAD_3 + const simd8 byte_5_table = byte_3_table.shr<2>(); // TOO_SHORT: , TOO_LONG: LEAD_4 + + // high_bits is byte 5, high_bits.prev<2> is byte 3 and high_bits.prev<1> is byte 4 + return high_bits.prev<2>(prev_high_bits).lookup_16(byte_3_table) | + high_bits.prev<1>(prev_high_bits).lookup_16(byte_4_table) | + high_bits.lookup_16(byte_5_table); +} + +// Check whether the current bytes are valid UTF-8. 
+// Returns a nonzero byte wherever an error is detected; prev_input is not updated here (callers do that) +// This should come down to 22 instructions if table definitions are in registers--30 if not. +really_inline simd8 check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // When we process bytes M through N, we look for lead characters in M-4 through N-4. This allows + // us to look for all errors related to any lead character at one time (since UTF-8 characters + // can only be up to 4 bytes, and the next byte after a character finishes must be another lead, + // we never need to look more than 4 bytes past the current one to fully validate). + // This way, we have all relevant bytes around and can save ourselves a little overflow and + // several instructions on each loop. + + // Total: 22 instructions, 7 simd constants + // Local: 8 instructions, 1 simd constant + // - 2 bit shifts + // - 1 byte shift (shuffle) + // - 3 "or" + // - 1 "and" + // - 1 saturating_sub + // - 1 constant (0b11111000-1) + // lead_flags: 2 instructions, 1 simd constant + // - 1 byte shift (shuffle) + // - 1 table lookup (shuffle) + // - 1 table constant + // byte_1_2_errors: 5 instructions, 2 simd constants + // - 2 table lookups (shuffles) + // - 2 byte shifts (shuffles) + // - 1 "and" + // - 2 table constants + // byte_3_4_5_errors: 7 instructions, 3 simd constants + // - 3 table lookups (shuffles) + // - 2 byte shifts (shuffles) + // - 2 "or" + // - 3 table constants + + const simd8 high_bits = input.shr<4>(); + const simd8 prev_high_bits = prev_input.shr<4>(); + const simd8 lead_flags = get_lead_flags(high_bits, prev_high_bits); + const simd8 byte_1_2_errors = get_byte_1_2_errors(input, prev_input, high_bits, prev_high_bits); + const simd8 byte_3_4_5_errors = get_byte_3_4_5_errors(high_bits, prev_high_bits); + // Detect illegal 5-byte+ Unicode values. 
We can't do this as part of byte_1_2_errors because + // it would need a third lead_flag = 1111, and we've already used up all 8 between + // byte_1_2_errors and byte_3_4_5_errors. + const simd8 too_large = input.saturating_sub(0b11111000-1); // too-large values will be nonzero + return too_large | (lead_flags & (byte_1_2_errors | byte_3_4_5_errors)); +} + +// TODO special case start of file, too, so that small documents are efficient! No shifting needed ... + +// The only problem that can happen at EOF is that a multibyte character is too short. +really_inline simd8 check_eof(simd8 prev_input) { + // Total: 1 instruction, 1 simd constant + // - 1 saturating_sub + // - 1 simd constant + + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t last_len[32] = { + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 + }; + const simd8 max_value(last_len+sizeof(last_len)-sizeof(simd8)); + // If anything is > the desired value, there will be a nonzero value in the result. 
+ return prev_input.saturating_sub(max_value); +} + +really_inline simd8 check_input(simd8 input1, simd8 prev_input) { + // Total: 9 simd constants + // - ASCII: 3 instructions, 2 simd constants + // - UTF-8: 24 instructions, 8 simd constants + simd8 bits = input1; + if (likely(!bits.any_bits_set_anywhere(0b10000000u))) { + // This has the same semantics as EOF: we only have to check for multibyte characters in part + // 1 that got cut off + return check_eof(prev_input); + } else { + return check_utf8_bytes(input1, prev_input); + } +} +really_inline simd8 check_input(simd8 input1, simd8 input2, simd8 prev_input) { + // Total: 9 simd constants + // - ASCII: 3 instructions, 2 simd constants + // - UTF-8: 24 instructions, 8 simd constants + simd8 bits = input1 | input2; + if (likely(!bits.any_bits_set_anywhere(0b10000000u))) { + // This has the same semantics as EOF: we only have to check for multibyte characters in part + // 1 that got cut off + return check_eof(prev_input); + } else { + return check_utf8_bytes(input1, prev_input) | + check_utf8_bytes(input2, input1); + } +} +really_inline simd8 check_input(simd8 input1, simd8 input2, simd8 input3, simd8 input4, simd8 prev_input) { + // Total: 9 simd constants + // - ASCII: 3 instructions, 2 simd constants + // - UTF-8: 24 instructions, 8 simd constants + simd8 bits = input1 | input2 | input3 | input4; + if (likely(!bits.any_bits_set_anywhere(0b10000000u))) { + // This has the same semantics as EOF: we only have to check for multibyte characters in part + // 1 that got cut off + return check_eof(prev_input); + } else { + return check_utf8_bytes(input1, prev_input) | + check_utf8_bytes(input2, input1) | + check_utf8_bytes(input3, input2) | + check_utf8_bytes(input4, input3); + } +} +really_inline simd8 check_input(simd8 input1, simd8 input2, simd8 input3, simd8 input4, simd8 input5, simd8 input6, simd8 input7, simd8 input8, simd8 prev_input) { + // Total: 9 simd constants + // - ASCII: 3 instructions, 2 simd constants 
+ // - UTF-8: 24 instructions, 8 simd constants + simd8 bits = input1 | input2 | input3 | input4 | input5 | input6 | input7 | input8; + if (likely(!bits.any_bits_set_anywhere(0b10000000u))) { + // This has the same semantics as EOF: we only have to check for multibyte characters in part + // 1 that got cut off + return check_eof(prev_input); + } else { + return check_utf8_bytes(input1, prev_input) | + check_utf8_bytes(input2, input1) | + check_utf8_bytes(input3, input2) | + check_utf8_bytes(input4, input3) | + check_utf8_bytes(input5, input4) | + check_utf8_bytes(input6, input5) | + check_utf8_bytes(input7, input6) | + check_utf8_bytes(input8, input7); + } +} + +template::NUM_CHUNKS> +really_inline simd8 check_input(simd8x64 input, simd8 &prev_input); +template<> +really_inline simd8 check_input<2>(simd8x64 input, simd8 &prev_input) { + simd8 error = check_input(input.chunks[0], input.chunks[1], prev_input); + prev_input = input.chunks[1]; + return error; +} +template<> +really_inline simd8 check_input<4>(simd8x64 input, simd8 &prev_input) { + simd8 error = check_input(input.chunks[0], input.chunks[1], input.chunks[2], input.chunks[3], prev_input); + prev_input = input.chunks[3]; + return error; +} + +template::NUM_CHUNKS> +really_inline simd8 check_input(simd8x64 input, simd8x64 input2, simd8 &prev_input); +template<> +really_inline simd8 check_input<2>(simd8x64 input, simd8x64 input2, simd8 &prev_input) { + simd8 error = check_input(input.chunks[0], input.chunks[1], input2.chunks[0], input2.chunks[1], prev_input); + prev_input = input2.chunks[1]; + return error; +} +template<> +really_inline simd8 check_input<4>(simd8x64 input, simd8x64 input2, simd8 &prev_input) { + simd8 error = check_input(input.chunks[0], input.chunks[1], input.chunks[2], input.chunks[3], input2.chunks[0], input2.chunks[1], input2.chunks[2], input2.chunks[3], prev_input); + prev_input = input2.chunks[3]; + return error; +} + +} // namespace utf8_validation + +struct utf8_checker { + simd8 
error; + simd8 prev_input; + + really_inline void check_next_input(simd8x64 input) { + // Total: 9 simd constants + // [256-bit] + // - ASCII: 4 instructions, 2 simd constants + // - UTF-8: 47 instructions, 8 simd constants (7 of them used twice) + // [128-bit] + // - ASCII: 6 instructions, 2 simd constants + // - UTF-8: 93 instructions, 8 simd constants (7 of them used four times) + + // it is not ascii so we have to do heavy work + this->error |= utf8_validation::check_input(input, this->prev_input); + } + + really_inline void check_next_input(simd8x64 input, simd8x64 input2) { + this->error |= utf8_validation::check_input(input, input2, this->prev_input); + } + + really_inline ErrorValues errors() { + return this->error.any_bits_set_anywhere() ? simdjson::UTF8_ERROR : simdjson::SUCCESS; + } + +}; // struct utf8_checker + +struct utf8_checker; + diff --git a/src/generic/utf8_range_algorithm.h b/src/generic/utf8_range_algorithm.h new file mode 100644 index 00000000..68030eb1 --- /dev/null +++ b/src/generic/utf8_range_algorithm.h @@ -0,0 +1,178 @@ +/* + * legal utf-8 byte sequence + * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94 + * + * Code Points 1st 2s 3s 4s + * U+0000..U+007F 00..7F + * U+0080..U+07FF C2..DF 80..BF + * U+0800..U+0FFF E0 A0..BF 80..BF + * U+1000..U+CFFF E1..EC 80..BF 80..BF + * U+D000..U+D7FF ED 80..9F 80..BF + * U+E000..U+FFFF EE..EF 80..BF 80..BF + * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF + * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF + * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF + * + */ + +// all byte values must be no larger than 0xF4 + +using namespace simd; + +struct processed_utf_bytes { + simd8 raw_bytes; + simd8 first_len; +}; + +struct utf8_checker { + simd8 has_error; + processed_utf_bytes previous; + + really_inline void check_carried_continuations() { + static const int8_t last_len[32] = { + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 2, 1, 0 + }; + 
this->has_error |= simd8(this->previous.first_len) > simd8(last_len + 32 - sizeof(simd8)); + } + + // check whether the current bytes are valid UTF-8 + // at the end of the function, previous gets updated + really_inline void check_utf8_bytes(simd8 current_bytes) { + + /* high_nibbles = input >> 4 */ + const simd8 high_nibbles = current_bytes.shr<4>(); + + /* + * Map high nibble of "First Byte" to legal character length minus 1 + * 0x00 ~ 0xBF --> 0 + * 0xC0 ~ 0xDF --> 1 + * 0xE0 ~ 0xEF --> 2 + * 0xF0 ~ 0xFF --> 3 + */ + /* first_len = legal character length minus 1 */ + /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */ + /* first_len = first_len_tbl[high_nibbles] */ + simd8 first_len = high_nibbles.lookup_16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3); + + /* Map "First Byte" to 8-th item of range table (0xC2 ~ 0xF4) */ + /* First Byte: set range index to 8 for bytes within 0xC0 ~ 0xFF */ + /* range = first_range_tbl[high_nibbles] */ + simd8 range = high_nibbles.lookup_16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8); + + /* Second Byte: set range index to first_len */ + /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */ + /* range |= (first_len, previous->first_len) << 1 byte */ + range |= first_len.prev(this->previous.first_len); + + /* Third Byte: set range index to saturate_sub(first_len, 1) */ + /* 0 for 00~7F, 0 for C0~DF, 1 for E0~EF, 2 for F0~FF */ + /* range |= (first_len - 1) << 2 bytes */ + range |= first_len.saturating_sub(1).prev<2>(this->previous.first_len.saturating_sub(1)); + + /* Fourth Byte: set range index to saturate_sub(first_len, 2) */ + /* 0 for 00~7F, 0 for C0~DF, 0 for E0~EF, 1 for F0~FF */ + /* range |= (first_len - 2) << 3 bytes */ + range |= first_len.saturating_sub(2).prev<3>(this->previous.first_len.saturating_sub(2)); + + /* + * Now we have below range indices calculated + * Correct cases: + * - 8 for C0~FF + * - 3 for 1st byte after F0~FF + * - 2 for 1st byte after E0~EF or 2nd byte after F0~FF + * - 1 for 1st 
byte after C0~DF or 2nd byte after E0~EF or + * 3rd byte after F0~FF + * - 0 for others + * Error cases: + * 9,10,11 if non ascii First Byte overlaps + * E.g., F1 80 C2 90 --> 8 3 10 2, where 10 indicates error + */ + + /* Adjust Second Byte range for special First Bytes(E0,ED,F0,F4) */ + /* Overlaps lead to index 9~15, which are illegal in range table */ + /* shift1 = (input, previous->input) << 1 byte */ + simd8 shift1 = current_bytes.prev(this->previous.raw_bytes); + /* + * shift1: | EF F0 ... FE | FF 00 ... ... DE | DF E0 ... EE | + * pos: | 0 1 15 | 16 17 239| 240 241 255| + * pos-240: | 0 0 0 | 0 0 0 | 0 1 15 | + * pos+112: | 112 113 127| >= 128 | >= 128 | + */ + simd8 pos = shift1 - 0xEF; + + /* + * Tables for fast handling of four special First Bytes(E0,ED,F0,F4), after + * which the Second Byte are not 80~BF. It contains "range index adjustment". + * +------------+---------------+------------------+----------------+ + * | First Byte | original range| range adjustment | adjusted range | + * +------------+---------------+------------------+----------------+ + * | E0 | 2 | 2 | 4 | + * +------------+---------------+------------------+----------------+ + * | ED | 2 | 3 | 5 | + * +------------+---------------+------------------+----------------+ + * | F0 | 3 | 3 | 6 | + * +------------+---------------+------------------+----------------+ + * | F4 | 3 | 4 | 7 | + * +------------+---------------+------------------+----------------+ + */ + /* index1 -> E0, index14 -> ED */ + simd8 range2 = pos.saturating_sub(240).lookup_16(0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0); + /* index1 -> F0, index5 -> F4 */ + range2 += pos.saturating_add(112).lookup_16(0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + range += range2; + + /* Load min and max values per calculated range index */ + /* + * Range table, map range index to min and max values + * Index 0 : 00 ~ 7F (First Byte, ascii) + * Index 1,2,3: 80 ~ BF (Second, Third, Fourth Byte) + * Index 4 : A0 ~ BF (Second Byte 
after E0) + * Index 5 : 80 ~ 9F (Second Byte after ED) + * Index 6 : 90 ~ BF (Second Byte after F0) + * Index 7 : 80 ~ 8F (Second Byte after F4) + * Index 8 : C2 ~ F4 (First Byte, non ascii) + * Index 9~15 : illegal: i >= 127 && i <= -128 + */ + simd8 minv = range.lookup_16( + 0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80, + 0xC2, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F + ); + simd8 maxv = range.lookup_16( + 0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F, + 0xF4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + ); + + // We're fine with high-bit wraparound here, so we use int comparison since it's faster on Intel + this->has_error |= simd8(minv) > simd8(current_bytes); + this->has_error |= simd8(current_bytes) > simd8(maxv); + + this->previous.raw_bytes = current_bytes; + this->previous.first_len = first_len; + } + + really_inline void check_next_input(simd8 in) { + if (likely(!in.any_bits_set_anywhere(0x80u))) { + this->check_carried_continuations(); + } else { + this->check_utf8_bytes(in); + } + } + + really_inline void check_next_input(simd8x64 in) { + simd8 bits = in.reduce([&](auto a, auto b) { return a | b; }); + if (likely(!bits.any_bits_set_anywhere(0x80u))) { + // it is ascii, we just check carried continuations. + this->check_carried_continuations(); + } else { + // it is not ascii so we have to do heavy work + in.each([&](auto _in) { this->check_utf8_bytes(_in); }); + } + } + + really_inline ErrorValues errors() { + return this->has_error.any() ? simdjson::UTF8_ERROR : simdjson::SUCCESS; + } +}; // struct utf8_checker diff --git a/src/generic/utf8_zwegner_algorithm.h b/src/generic/utf8_zwegner_algorithm.h new file mode 100644 index 00000000..e84d7f4f --- /dev/null +++ b/src/generic/utf8_zwegner_algorithm.h @@ -0,0 +1,358 @@ +// +// Detect UTF-8 errors. 
+// +// Copied and adapted from algorithm by @zwegner: https://github.com/zwegner/faster-utf8-validator +// +// UTF-8 Refresher +// --------------- +// +// UTF-8 is designed to allow multiple bytes and be compatible with ASCII. It's a fairly basic +// encoding that uses the first few bits on each byte to denote a "byte type", and all other bits +// are straight up concatenated into the final value. The first byte of a multibyte character is a +// "leading byte" and starts with N 1's, where N is the total number of bytes (110_____ = 2 byte +// lead). The remaining bytes of a multibyte character all start with 10. 1-byte characters just +// start with 0, because that's what ASCII looks like. Here's what each size looks like: +// +// | Character Length | UTF-8 Byte Sequence | +// |-----------------------------|---------------------------------------| +// | ASCII (7 bits): | `0_______` | +// | 2 byte character (11 bits) | `110_____ 10______` | +// | 3 byte character (16 bits) | `1110____ 10______ 10______` | +// | 4 byte character (21 bits) | `11110___ 10______ 10______ 10______` | +// | 5+ byte character (illegal) | `11111___` | +// +// UTF-8 Error Classes +// ------------------- +// +// There are 6 classes of error that can happen in UTF-8: +// +// ### Too short (missing continuations) +// +// TOO_SHORT: when you have a multibyte character with too few bytes (i.e. missing continuation). +// We detect this by looking for new characters (lead bytes) inside the range of a multibyte +// character. +// +// e.g. `11000000 01100001` (2-byte character where second byte is ASCII) +// +// ### Too long (stray continuations) +// +// TOO_LONG: when there are more bytes in your character than you need (i.e. extra continuation). +// We detect this by requiring that the next byte after your multibyte character be a new +// character--so a continuation after your character is wrong. +// +// e.g.
`11011111 10111111 10111111` (2-byte character followed by *another* continuation byte) +// +// ### Too large (out of range for unicode) +// +// TOO_LARGE: Unicode only goes up to U+10FFFF. These characters are too large. +// +// e.g. `11110111 10111111 10111111 10111111` (bigger than 10FFFF). +// +// ### Overlong encoding (used more bytes than needed) +// +// Multibyte characters with a bunch of leading zeroes, where you could have +// used fewer bytes to make the same character, are considered *overlong encodings*. They are +// disallowed in UTF-8 to ensure there is only one way to write a single Unicode codepoint, making strings +// easier to search. For example, encoding an ASCII character in 2 bytes is technically possible, but UTF-8 +// disallows it so that you only have to search for the ASCII character `a` to find it. +// +// e.g. `11000001 10100001` (2-byte encoding of "a", which only requires 1 byte: 01100001) +// +// ### Surrogate characters +// +// Unicode U+D800-U+DFFF are *surrogate* code points, reserved for use in UTF-16 (and tolerated by WTF-8) +// to encode characters that need more than 2 bytes. These are illegal in pure UTF-8. +// +// e.g. `11101101 10100000 10000000` (U+D800) +// +// ### 5+ byte characters +// +// INVALID_5_BYTE: 5-byte, 6-byte, 7-byte and 8-byte characters are unsupported; Unicode does not +// support values with more than 21 bits (which a 4-byte character supports). +// +// Even if these were supported, anything with 5 bytes would be either too large (bigger than the +// Unicode max value), or overlong (could fit in 4 or fewer bytes). +// +// e.g.
`11111000 10100000 10000000 10000000 10000000` (U+800000) +// +// Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94: +// +// | Code Points | 1st | 2nd | 3s | 4s | +// |--------------------|--------|--------|--------|--------| +// | U+0000..U+007F | 00..7F | | | | +// | U+0080..U+07FF | C2..DF | 80..BF | | | +// | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | +// | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | +// | U+D000..U+D7FF | ED | 80..9F | 80..BF | | +// | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | +// | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | +// | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | +// | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | +// +// Algorithm +// --------- +// +// This validator works in two basic steps: checking continuation bytes, and +// handling special cases. Each step works on one vector's worth of input +// bytes at a time. +// +using namespace simd; + +using vmask_t = simd8::bitmask_t; +using vmask2_t = simd8::bitmask2_t; + +struct utf8_checker { + simd8 special_case_errors; + simd8 prev_bytes; + vmask2_t last_cont; + vmask_t length_errors; + + // + // Check for missing / extra continuation bytes. + // + // The continuation bytes are handled in a fairly straightforward manner in + // the scalar domain. A mask is created from the input byte vector for each + // of the highest four bits of every byte. The first mask allows us to quickly + // skip pure ASCII input vectors, which have no bits set. The first and + // (inverted) second masks together give us every continuation byte (10xxxxxx). + // The other masks are used to find prefixes of multi-byte code points (110, + // 1110, 11110). For these, we keep a "required continuation" mask, by shifting + // these masks 1, 2, and 3 bits respectively forward in the byte stream. 
That + // is, we take a mask of all bytes that start with 11, and shift it left one + // bit forward to get the mask of all the first continuation bytes, then do the + // same for the second and third continuation bytes. Here's an example input + // sequence along with the corresponding masks: + // + // bytes: 61 C3 80 62 E0 A0 80 63 F0 90 80 80 00 + // code points: 61|C3 80|62|E0 A0 80|63|F0 90 80 80|00 + // # of bytes: 1 |2 - |1 |3 - - |1 |4 - - - |1 + // cont. mask 1: - - 1 - - 1 - - - 1 - - - + // cont. mask 2: - - - - - - 1 - - - 1 - - + // cont. mask 3: - - - - - - - - - - - 1 - + // cont. mask *: 0 0 1 0 0 1 1 0 0 1 1 1 0 + // + // The final required continuation mask is then compared with the mask of + // actual continuation bytes, and must match exactly in valid UTF-8. The only + // complication in this step is that the shifted masks can cross vector + // boundaries, so we need to keep a "carry" mask of the bits that were shifted + // past the boundary in the last loop iteration (this->last_cont). `bit_7` is the per-byte high-bit mask of `bytes`, computed by the caller. + // + really_inline void check_length_errors(const simd8 bytes, const vmask_t bit_7) { + // Compute the continuation byte mask by finding bytes that start with + // 11x, 111x, and 1111. For each of these prefixes, we get a bitmask + // and shift it forward by 1, 2, or 3. In the original algorithm this was a loop expected to be unrolled by + // the compiler, with the (n == 1) branch inside eliminated. + // + // NOTE (@jkeiser): I unrolled the for(i=1..3) loop because I don't trust compiler unrolling + // anymore. This should be exactly equivalent and yield the same optimizations (and also lets + // us rearrange statements if we so desire). + + // We add the shifted mask here instead of ORing it, which would + // be the more natural operation, so that this line can be done + // with one lea. While adding could give a different result due + // to carries, this will only happen for invalid UTF-8 sequences, + // and in a way that won't cause it to pass validation.
Reasoning: + // Any bits for required continuation bytes come after the bits + // for their leader bytes, and are all contiguous. For a carry to + // happen, two of these bit sequences would have to overlap. If + // this is the case, there is a leader byte before the second set + // of required continuation bytes (and thus before the bit that + // will be cleared by a carry). This leader byte will not be + // in the continuation mask, despite being required. QEDish. + // Which bytes are required to be continuation bytes (starts from the carry left by the previous vector) + vmask2_t cont_required = this->last_cont; + + // Lead of a 2+ byte character: 11______ + const vmask_t bit_6 = bytes.get_bit<6>(); + const vmask_t lead_2_plus = bit_7 & bit_6; // 11______ + cont_required += vmask2_t(lead_2_plus) << 1; + + // Lead of a 3+ byte character: 111_____ + const vmask_t bit_5 = bytes.get_bit<5>(); + const vmask_t lead_3_plus = lead_2_plus & bit_5; // 111_____ + cont_required += vmask2_t(lead_3_plus) << 2; + + // Lead of a 4+ byte character: 1111____ + const vmask_t bit_4 = bytes.get_bit<4>(); + const vmask_t lead_4_plus = lead_3_plus & bit_4; + cont_required += vmask2_t(lead_4_plus) << 3; + + const vmask_t cont = bit_7 ^ lead_2_plus; // 10______ TODO &~ bit_6 might be fine, and involve less data dependency + + // Check that continuation bytes match. We must cast cont_required from vmask2_t + // (which holds the carry mask in the upper half) to vmask_t, which + // zeroes out the upper bits + // + // NOTE (@jkeiser): I turned the if() statement here into an update of this->length_errors for performance in + // success cases: instead of spending time testing the result and introducing a branch (which + // can affect performance even if it's easily predictable), we test once at the end. + // The ^ is equivalent to !=, however, leaving a 1 where the bits are different and 0 where they + // are the same. + this->length_errors |= cont ^ vmask_t(cont_required); + + this->last_cont = cont_required >> sizeof(simd8); // keep only the bits shifted past this vector; assumes one mask bit per vector byte, so sizeof() equals the mask width -- TODO confirm for every backend + } + + // + // These constants define the set of error flags in check_special_cases().
+ // + static const uint8_t OVERLONG_2 = 0x01; // 1100000_ ________ Could have been encoded in 1 byte + static const uint8_t OVERLONG_3 = 0x02; // 11100000 100_____ Could have been encoded in 2 bytes + static const uint8_t SURROGATE = 0x04; // 11101101 101_____ Surrogate pairs (ED A0..BF) + static const uint8_t TOO_LARGE = 0x08; // 11110100 (1001|101_)____ > U+10FFFF + static const uint8_t TOO_LARGE_2 = 0x10; // 1111(0101..1111) ________ > U+10FFFF + static const uint8_t OVERLONG_4 = 0x20; // 11110000 1000____ Could have been encoded in 3 bytes + + // + // Check for special-case errors with table lookups on the first 3 nibbles (first 2 bytes). + // + // Besides the basic prefix coding of UTF-8, there are several invalid byte + // sequences that need special handling. These are due to three factors: + // code points that could be described in fewer bytes, code points that are + // part of a surrogate pair (which are only valid in UTF-16), and code points + // that are past the highest valid code point U+10FFFF. + // + // All of the invalid sequences can be detected by independently observing + // the first three nibbles of each code point. Since AVX2 can do a 4-bit/16-byte + // lookup in parallel for all 32 bytes in a vector, we can create bit masks + // for all of these error conditions, look up the bit masks for the three + // nibbles for all input bytes, and AND them together to get a final error mask, + // that must be all zero for valid UTF-8. This is somewhat complicated by + // needing to shift the error masks from the first and second nibbles forward in + // the byte stream to line up with the third nibble.
+ // + // We have these possible values for valid UTF-8 sequences, broken down + // by the first three nibbles: + // + // 1st 2nd 3rd comment + // 0..7 0..F ASCII + // 8..B 0..F continuation bytes + // C 2..F 8..B C0 xx and C1 xx can be encoded in 1 byte + // D 0..F 8..B D0..DF are valid with a continuation byte + // E 0 A..B E0 8x and E0 9x can be encoded with 2 bytes + // 1..C 8..B E1..EC are valid with continuation bytes + // D 8..9 ED Ax and ED Bx correspond to surrogate pairs + // E..F 8..B EE..EF are valid with continuation bytes + // F 0 9..B F0 8x can be encoded with 3 bytes + // 1..3 8..B F1..F3 are valid with continuation bytes + // 4 8 F4 8F BF BF is the maximum valid code point + // + // That leaves us with these invalid sequences, which would otherwise fit + // into UTF-8's prefix encoding. Each of these invalid sequences needs to + // be detected separately, with their own bits in the error mask. + // + // 1st 2nd 3rd error bit + // C 0..1 0..F 0x01 (OVERLONG_2) + // E 0 8..9 0x02 (OVERLONG_3) + // D A..B 0x04 (SURROGATE) + // F 0 0..8 0x20 (OVERLONG_4) + // 4 9..F 0x08 (TOO_LARGE) + // 5..F 0..F 0x10 (TOO_LARGE_2) + // + // For every possible value of the first, second, and third nibbles, we keep + // a lookup table that contains the bitwise OR of all errors that that nibble + // value can cause. For example, the first nibble has zeroes in every entry + // except for C, E, and F, and the third nibble lookup has the 0x11 bits (OVERLONG_2 | TOO_LARGE_2) in + // every entry, since those errors don't depend on the third nibble. After + // doing a parallel lookup of the first/second/third nibble values for all + // bytes, we AND them together. Only when all three have an error bit in common + // do we fail validation. + // + really_inline void check_special_cases(const simd8 bytes) { + const simd8 shifted_bytes = bytes.prev<1>(this->prev_bytes); + this->prev_bytes = bytes; + + // Look up error masks for three consecutive nibbles.
We need to + // AND with 0x0F for each one, because vpshufb has the neat + // "feature" that negative values in an index byte will result in + // a zero. + simd8 nibble_1_error = shifted_bytes.shr<4>().lookup_16( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + + OVERLONG_2, // [1100]000_ ________ Could have been encoded in 1 byte + 0, + OVERLONG_3 | SURROGATE, // [1110]0000 100_____ Could have been encoded in 2 bytes + // [1110]1101 101_____ Surrogate pairs + OVERLONG_4 | TOO_LARGE | TOO_LARGE_2 // [1111]0000 1000____ Could have been encoded in 3 bytes + // [1111]0100 (1001|101_)____ > U+10FFFF + ); + + simd8 nibble_2_error = (shifted_bytes & 0x0F).lookup_16( + OVERLONG_2 | OVERLONG_3 | OVERLONG_4, // 1100[000_] ________ Could have been encoded in 1 byte + // 1110[0000] 100_____ Could have been encoded in 2 bytes + // 1111[0000] 1000____ Could have been encoded in 3 bytes + OVERLONG_2, + 0, + 0, + + TOO_LARGE, // 1111[0100] (1001|101_)____ > U+10FFFF + TOO_LARGE_2, // 1111[0101..1111] ________ > U+10FFFF + TOO_LARGE_2, + TOO_LARGE_2, + + TOO_LARGE_2, + TOO_LARGE_2, + TOO_LARGE_2, + TOO_LARGE_2, + + TOO_LARGE_2, + TOO_LARGE_2 | SURROGATE, // 1110[1101] 101_____ Surrogate pairs + TOO_LARGE_2, TOO_LARGE_2 + ); + + // Errors that apply no matter what the third nibble is + const uint8_t CARRY = OVERLONG_2 | TOO_LARGE_2; // 1100000_ [____]____ Could have been encoded in 1 byte + // 1111(0101..1111) [____]____ > U+10FFFF + simd8 nibble_3_error = bytes.shr<4>().lookup_16( + CARRY, CARRY, CARRY, CARRY, + + CARRY, CARRY, CARRY, CARRY, + + CARRY | OVERLONG_3 | OVERLONG_4, // 11100000 [100_]____ Could have been encoded in 2 bytes + // 11110000 [1000]____ Could have been encoded in 3 bytes + CARRY | OVERLONG_3 | TOO_LARGE, // 11100000 [100_]____ Could have been encoded in 2 bytes + // 11110100 [1001|101_]____ > U+10FFFF + CARRY | SURROGATE | TOO_LARGE, // 11101101 [101_]____ Surrogate pairs + CARRY | SURROGATE | TOO_LARGE, + + CARRY, CARRY, CARRY, CARRY + ); + + // Check if any bits
are set in all three error masks + // + // NOTE (@jkeiser): I turned the if() statement here into an update of this->special_case_errors for performance in + // success cases: instead of spending time testing the result and introducing a branch (which + // can affect performance even if it's easily predictable), we test once at the end. + this->special_case_errors |= nibble_1_error & nibble_2_error & nibble_3_error; + } + + // check whether the current bytes are valid UTF-8 + // at the end of the function, last_cont and prev_bytes have been updated by the two checks below + really_inline void check_utf8_bytes(const simd8 bytes, const vmask_t bit_7) { + this->check_length_errors(bytes, bit_7); + this->check_special_cases(bytes); + } + + really_inline void check_next_input(simd8 bytes) { + vmask_t bit_7 = bytes.get_bit<7>(); + if (unlikely(bit_7)) { + // TODO (@jkeiser): To work with simdjson's caller model, I moved the calculation of + // shifted_bytes inside check_utf8_bytes (it now lives in check_special_cases). I believe this adds an extra instruction to the hot + // path (saving prev_bytes), which is undesirable, though 2 register accesses vs. 1 memory + // access might be a wash. Come back and try the other way. + this->check_utf8_bytes(bytes, bit_7); + } else { + this->length_errors |= this->last_cont; // no high bits in this vector: any continuation still required from the previous vector is an error + } + } + + really_inline void check_next_input(simd8x64 in) { + in.each([&](auto bytes) { this->check_next_input(bytes); }); + } + + really_inline ErrorValues errors() { + return (this->special_case_errors.any_bits_set_anywhere() | this->length_errors) ?
simdjson::UTF8_ERROR : simdjson::SUCCESS; + } +}; // struct utf8_checker diff --git a/src/haswell/simd.h b/src/haswell/simd.h index 17192b1b..87193871 100644 --- a/src/haswell/simd.h +++ b/src/haswell/simd.h @@ -42,6 +42,9 @@ namespace simdjson::haswell::simd { template> struct base8: base> { + typedef uint32_t bitmask_t; + typedef uint64_t bitmask2_t; + really_inline base8() : base>() {} really_inline base8(const __m256i _value) : base>(_value) {} @@ -58,7 +61,6 @@ namespace simdjson::haswell::simd { // SIMD byte mask type (returned by things like eq and gt) template<> struct simd8: base8 { - typedef int bitmask_t; static really_inline simd8 splat(bool _value) { return _mm256_set1_epi8(-(!!_value)); } really_inline simd8() : base8() {} @@ -66,7 +68,7 @@ namespace simdjson::haswell::simd { // Splat constructor really_inline simd8(bool _value) : base8(splat(_value)) {} - really_inline bitmask_t to_bitmask() const { return _mm256_movemask_epi8(*this); } + really_inline int to_bitmask() const { return _mm256_movemask_epi8(*this); } really_inline bool any() const { return !_mm256_testz_si256(*this, *this); } }; @@ -77,6 +79,18 @@ namespace simdjson::haswell::simd { static really_inline simd8 load(const T values[32]) { return _mm256_loadu_si256(reinterpret_cast(values)); } + // Repeat 16 values as many times as necessary (usually for lookup tables) + static really_inline simd8 repeat_16( + T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } really_inline base8_numeric() : base8() {} really_inline base8_numeric(const __m256i _value) : base8(_value) {} @@ -87,8 +101,8 @@ namespace simdjson::haswell::simd { // Addition/subtraction are the same for signed and unsigned really_inline simd8 operator+(const simd8 other) const { return _mm256_add_epi8(*this, 
other); } really_inline simd8 operator-(const simd8 other) const { return _mm256_sub_epi8(*this, other); } - really_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *this; } - really_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *this; } + really_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *(simd8*)this; } + really_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *(simd8*)this; } // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) template @@ -192,16 +206,24 @@ namespace simdjson::haswell::simd { really_inline simd8 max(const simd8 other) const { return _mm256_max_epu8(*this, other); } really_inline simd8 min(const simd8 other) const { return _mm256_min_epu8(*this, other); } really_inline simd8 operator<=(const simd8 other) const { return other.max(*this) == other; } + really_inline simd8 operator>=(const simd8 other) const { return other.min(*this) == other; } + really_inline simd8 operator>(const simd8 other) const { return this->saturating_sub(other).any_bits_set(); } // Bit-specific operations - really_inline simd8 any_bits_set(simd8 bits) const { return (*this & bits).any_bits_set(); } really_inline simd8 any_bits_set() const { return ~(*this == uint8_t(0)); } - really_inline bool any_bits_set_anywhere(simd8 bits) const { return !_mm256_testz_si256(*this, bits); } - really_inline bool any_bits_set_anywhere() const { return !_mm256_testz_si256(*this, *this); } + really_inline simd8 any_bits_set(simd8 bits) const { return (*this & bits).any_bits_set(); } + really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); } + really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } + really_inline bool bits_not_set_anywhere(simd8 bits) const { return _mm256_testz_si256(*this, bits); } + really_inline bool any_bits_set_anywhere(simd8 bits) const 
{ return !bits_not_set_anywhere(bits); } template really_inline simd8 shr() const { return simd8(_mm256_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); } template really_inline simd8 shl() const { return simd8(_mm256_slli_epi16(*this, N)) & uint8_t(0xFFu << N); } + // Get one of the bits and make a bitmask out of it. + // e.g. value.get_bit<7>() gets the high bit + template + really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7-N)); } }; template diff --git a/src/haswell/stage1_find_marks.h b/src/haswell/stage1_find_marks.h index a6aca2d4..2af45954 100644 --- a/src/haswell/stage1_find_marks.h +++ b/src/haswell/stage1_find_marks.h @@ -33,7 +33,7 @@ really_inline void find_whitespace_and_operators( }).to_bitmask(); } -#include "generic/simdutf8check.h" +#include "generic/utf8_lookup_algorithm.h" #include "generic/stage1_find_marks.h" } // namespace haswell diff --git a/src/westmere/simd.h b/src/westmere/simd.h index 25a20387..ddc515a8 100644 --- a/src/westmere/simd.h +++ b/src/westmere/simd.h @@ -42,7 +42,8 @@ namespace simdjson::westmere::simd { template> struct base8: base> { - typedef int bitmask_t; + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; really_inline base8() : base>() {} really_inline base8(const __m128i _value) : base>(_value) {} @@ -67,7 +68,7 @@ namespace simdjson::westmere::simd { // Splat constructor really_inline simd8(bool _value) : base8(splat(_value)) {} - really_inline bitmask_t to_bitmask() const { return _mm_movemask_epi8(*this); } + really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); } really_inline bool any() const { return !_mm_testz_si128(*this, *this); } }; @@ -78,6 +79,16 @@ namespace simdjson::westmere::simd { static really_inline simd8 load(const T values[16]) { return _mm_loadu_si128(reinterpret_cast(values)); } + // Repeat 16 values as many times as necessary (usually for lookup tables) + static really_inline simd8 repeat_16( + T v0, T v1, T v2, T v3, T v4, T v5, T 
v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } really_inline base8_numeric() : base8() {} really_inline base8_numeric(const __m128i _value) : base8(_value) {} @@ -88,8 +99,8 @@ namespace simdjson::westmere::simd { // Addition/subtraction are the same for signed and unsigned really_inline simd8 operator+(const simd8 other) const { return _mm_add_epi8(*this, other); } really_inline simd8 operator-(const simd8 other) const { return _mm_sub_epi8(*this, other); } - really_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *this; } - really_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *this; } + really_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *(simd8*)this; } + really_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *(simd8*)this; } // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) template @@ -181,16 +192,24 @@ namespace simdjson::westmere::simd { really_inline simd8 max(const simd8 other) const { return _mm_max_epu8(*this, other); } really_inline simd8 min(const simd8 other) const { return _mm_min_epu8(*this, other); } really_inline simd8 operator<=(const simd8 other) const { return other.max(*this) == other; } + really_inline simd8 operator>=(const simd8 other) const { return other.min(*this) == other; } + really_inline simd8 operator>(const simd8 other) const { return this->saturating_sub(other).any_bits_set(); } // Bit-specific operations really_inline simd8 any_bits_set(simd8 bits) const { return (*this & bits).any_bits_set(); } really_inline simd8 any_bits_set() const { return ~(*this == uint8_t(0)); } - really_inline bool any_bits_set_anywhere(simd8 bits) const { return !_mm_testz_si128(*this, bits); } - really_inline bool any_bits_set_anywhere() const { return 
!_mm_testz_si128(*this, *this); } + really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); } + really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } + really_inline bool bits_not_set_anywhere(simd8 bits) const { return _mm_testz_si128(*this, bits); } + really_inline bool any_bits_set_anywhere(simd8 bits) const { return !bits_not_set_anywhere(bits); } template really_inline simd8 shr() const { return simd8(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); } template really_inline simd8 shl() const { return simd8(_mm_slli_epi16(*this, N)) & uint8_t(0xFFu << N); } + // Get one of the bits and make a bitmask out of it. + // e.g. value.get_bit<7>() gets the high bit + template + really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); } }; template diff --git a/src/westmere/stage1_find_marks.h b/src/westmere/stage1_find_marks.h index 58e3a91c..65be11d7 100644 --- a/src/westmere/stage1_find_marks.h +++ b/src/westmere/stage1_find_marks.h @@ -33,7 +33,7 @@ really_inline void find_whitespace_and_operators( }).to_bitmask(); } -#include "generic/simdutf8check.h" +#include "generic/utf8_lookup_algorithm.h" #include "generic/stage1_find_marks.h" } // namespace westmere