From 81f224957501c548fbb5aa750a8110759f5955c6 Mon Sep 17 00:00:00 2001 From: John Keiser Date: Sun, 13 Oct 2019 11:36:52 -0700 Subject: [PATCH] Move stage1 into a class to pass fewer parameters --- src/generic/stage1_find_marks.h | 471 ++++++++++++------------ src/generic/stage1_find_marks_flatten.h | 95 +++-- src/haswell/stage1_find_marks.h | 78 ++-- src/stage1_find_marks.cpp | 3 +- 4 files changed, 319 insertions(+), 328 deletions(-) diff --git a/src/generic/stage1_find_marks.h b/src/generic/stage1_find_marks.h index a6727bd2..3a87e672 100644 --- a/src/generic/stage1_find_marks.h +++ b/src/generic/stage1_find_marks.h @@ -3,221 +3,10 @@ // We assume the file in which it is included already includes // "simdjson/stage1_find_marks.h" (this simplifies amalgation) -// return a bitvector indicating where we have characters that end an odd-length -// sequence of backslashes (and thus change the behavior of the next character -// to follow). A even-length sequence of backslashes, and, for that matter, the -// largest even-length prefix of our odd-length sequence of backslashes, simply -// modify the behavior of the backslashes themselves. -// We also update the prev_iter_ends_odd_backslash reference parameter to -// indicate whether we end an iteration on an odd-length sequence of -// backslashes, which modifies our subsequent search for odd-length -// sequences of backslashes in an obvious way. -really_inline uint64_t follows_odd_sequence_of(const uint64_t match, uint64_t &overflow) { - const uint64_t even_bits = 0x5555555555555555ULL; - const uint64_t odd_bits = ~even_bits; - uint64_t start_edges = match & ~(match << 1); - /* flip lowest if we have an odd-length run at the end of the prior - * iteration */ - uint64_t even_start_mask = even_bits ^ overflow; - uint64_t even_starts = start_edges & even_start_mask; - uint64_t odd_starts = start_edges & ~even_start_mask; - uint64_t even_carries = match + even_starts; - - uint64_t odd_carries; - /* must record the carry-out of our odd-carries out of bit 63; this - * indicates whether the sense of any edge going to the next iteration - * should be flipped */ - bool new_overflow = add_overflow(match, odd_starts, &odd_carries); - - odd_carries |= overflow; /* push in bit zero as a - * potential end if we had an - * odd-numbered run at the - * end of the previous - * iteration */ - overflow = new_overflow ? 0x1ULL : 0x0ULL; - uint64_t even_carry_ends = even_carries & ~match; - uint64_t odd_carry_ends = odd_carries & ~match; - uint64_t even_start_odd_end = even_carry_ends & odd_bits; - uint64_t odd_start_even_end = odd_carry_ends & even_bits; - uint64_t odd_ends = even_start_odd_end | odd_start_even_end; - return odd_ends; -} - -// -// Check if the current character immediately follows a matching character. -// -// For example, this checks for quotes with backslashes in front of them: -// -// const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash); -// -really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) { - const uint64_t result = match << 1 | overflow; - overflow = match >> 63; - return result; -} - -// -// Check if the current character follows a matching character, with possible "filler" between. -// For example, this checks for empty curly braces, e.g. -// -// in.eq('}') & follows(in.eq('['), in.eq(' '), prev_empty_array) // { * } -// -really_inline uint64_t follows(const uint64_t match, const uint64_t filler, uint64_t &overflow ) { - uint64_t follows_match = follows(match, overflow); - uint64_t result; - overflow |= add_overflow(follows_match, filler, &result); - return result; -} - -really_inline ErrorValues detect_errors_on_eof( - uint64_t &unescaped_chars_error, - const uint64_t prev_in_string) { - if (prev_in_string) { - return UNCLOSED_STRING; - } - if (unescaped_chars_error) { - return UNESCAPED_CHARS; - } - return SUCCESS; -} - -// -// Return a mask of all string characters plus end quotes. -// -// prev_escaped is overflow saying whether the next character is escaped. -// prev_in_string is overflow saying whether we're still in a string. -// -// Backslash sequences outside of quotes will be detected in stage 2. -// -really_inline uint64_t find_strings(const simd_input in, uint64_t &prev_escaped, uint64_t &prev_in_string) { - const uint64_t backslash = in.eq('\\'); - const uint64_t escaped = follows_odd_sequence_of(backslash, prev_escaped); - const uint64_t quote = in.eq('"') & ~escaped; - // compute_quote_mask returns start quote plus string contents. - const uint64_t in_string = compute_quote_mask(quote) ^ prev_in_string; - /* right shift of a signed value expected to be well-defined and standard - * compliant as of C++20, - * John Regher from Utah U. says this is fine code */ - prev_in_string = static_cast(static_cast(in_string) >> 63); - // Use ^ to turn the beginning quote off, and the end quote on. - return in_string ^ quote; -} - -really_inline uint64_t invalid_string_bytes(const uint64_t unescaped, const uint64_t quote_mask) { - /* All Unicode characters may be placed within the - * quotation marks, except for the characters that MUST be escaped: - * quotation mark, reverse solidus, and the control characters (U+0000 - * through U+001F). - * https://tools.ietf.org/html/rfc8259 */ - return quote_mask & unescaped; -} - -// -// Determine which characters are *structural*: -// - braces: [] and {} -// - the start of primitives (123, true, false, null) -// - the start of invalid non-whitespace (+, &, ture, UTF-8) -// -// Also detects value sequence errors: -// - two values with no separator between ("hello" "world") -// - separators with no values ([1,] [1,,]and [,2]) -// -// This method will find all of the above whether it is in a string or not. -// -// To reduce dependency on the expensive "what is in a string" computation, this method treats the -// contents of a string the same as content outside. Errors and structurals inside the string or on -// the trailing quote will need to be removed later when the correct string information is known. -// -really_inline uint64_t find_potential_structurals(const simd_input in, uint64_t &prev_primitive) { - // These use SIMD so let's kick them off before running the regular 64-bit stuff ... - uint64_t whitespace, op; - find_whitespace_and_operators(in, whitespace, op); - - // Detect the start of a run of primitive characters. Includes numbers, booleans, and strings ("). - // Everything except whitespace, braces, colon and comma. - const uint64_t primitive = ~(op | whitespace); - const uint64_t follows_primitive = follows(primitive, prev_primitive); - const uint64_t start_primitive = primitive & ~follows_primitive; - - // Return final structurals - return op | start_primitive; -} - static const size_t STEP_SIZE = 128; -// -// Find the important bits of JSON in a 128-byte chunk, and add them to : -// -// -// -// PERF NOTES: -// We pipe 2 inputs through these stages: -// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load -// 2 inputs' worth at once so that by the time step 2 is looking for them input, it's available. -// 2. Scan the JSON for critical data: strings, primitives and operators. This is the critical path. -// The output of step 1 depends entirely on this information. These functions don't quite use -// up enough CPU: the second half of the functions is highly serial, only using 1 execution core -// at a time. The second input's scans has some dependency on the first ones finishing it, but -// they can make a lot of progress before they need that information. -// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that -// to finish: utf-8 checks and generating the output from the last iteration. -// -// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all -// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough -// workout. -// -really_inline void find_structural_bits_128( - const uint8_t *buf, const size_t idx, uint32_t *&base_ptr, - uint64_t &prev_escaped, uint64_t &prev_in_string, - uint64_t &prev_primitive, - uint64_t &prev_structurals, - uint64_t &unescaped_chars_error, - utf8_checker &utf8_state) { - // - // Load up all 128 bytes into SIMD registers - // - simd_input in_1(buf); - simd_input in_2(buf+64); - - // - // Find the strings and potential structurals (operators / primitives). - // - // This will include false structurals that are *inside* strings--we'll filter strings out - // before we return. - // - uint64_t string_1 = find_strings(in_1, prev_escaped, prev_in_string); - uint64_t structurals_1 = find_potential_structurals(in_1, prev_primitive); - uint64_t string_2 = find_strings(in_2, prev_escaped, prev_in_string); - uint64_t structurals_2 = find_potential_structurals(in_2, prev_primitive); - - // - // Do miscellaneous work while the processor is busy calculating strings and structurals. - // - // After that, weed out structurals that are inside strings and find invalid string characters. - // - uint64_t unescaped_1 = in_1.lteq(0x1F); - utf8_state.check_next_input(in_1); - flatten_bits(base_ptr, idx, prev_structurals); // Output *last* iteration's structurals to ParsedJson - prev_structurals = structurals_1 & ~string_1; - unescaped_chars_error |= unescaped_1 & string_1; - - uint64_t unescaped_2 = in_2.lteq(0x1F); - utf8_state.check_next_input(in_2); - flatten_bits(base_ptr, idx+64, prev_structurals); // Output *last* iteration's structurals to ParsedJson - prev_structurals = structurals_2 & ~string_2; - unescaped_chars_error |= unescaped_2 & string_2; -} - -int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &pj) { - if (unlikely(len > pj.byte_capacity)) { - std::cerr << "Your ParsedJson object only supports documents up to " - << pj.byte_capacity << " bytes but you are trying to process " - << len << " bytes" << std::endl; - return simdjson::CAPACITY; - } - uint32_t *base_ptr = pj.structural_indexes; - utf8_checker utf8_state; - +class json_structural_scanner { +public: // Whether the first character of the next iteration is escaped. uint64_t prev_escaped = 0ULL; // Whether the last iteration was still inside a string (all 1's = true, all 0's = false). @@ -228,41 +17,249 @@ int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &p // Mask of structural characters from the last iteration. // Kept around for performance reasons, so we can call flatten_bits to soak up some unused // CPU capacity while the next iteration is busy with an expensive clmul in compute_quote_mask. - uint64_t structurals = 0; - - size_t lenminusstep = len < STEP_SIZE ? 0 : len - STEP_SIZE; - size_t idx = 0; + uint64_t prev_structurals = 0; // Errors with unescaped characters in strings (ASCII codepoints < 0x20) uint64_t unescaped_chars_error = 0; + bit_indexer structural_indexes; - for (; idx < lenminusstep; idx += STEP_SIZE) { - find_structural_bits_128(&buf[idx], idx, base_ptr, - prev_escaped, prev_in_string, prev_primitive, - structurals, unescaped_chars_error, utf8_state); + + json_structural_scanner(uint32_t *_structural_indexes) : structural_indexes{_structural_indexes} {} + + // return a bitvector indicating where we have characters that end an odd-length + // sequence of backslashes (and thus change the behavior of the next character + // to follow). A even-length sequence of backslashes, and, for that matter, the + // largest even-length prefix of our odd-length sequence of backslashes, simply + // modify the behavior of the backslashes themselves. + // We also update the prev_iter_ends_odd_backslash reference parameter to + // indicate whether we end an iteration on an odd-length sequence of + // backslashes, which modifies our subsequent search for odd-length + // sequences of backslashes in an obvious way. + really_inline uint64_t follows_odd_sequence_of(const uint64_t match, uint64_t &overflow) { + const uint64_t even_bits = 0x5555555555555555ULL; + const uint64_t odd_bits = ~even_bits; + uint64_t start_edges = match & ~(match << 1); + /* flip lowest if we have an odd-length run at the end of the prior + * iteration */ + uint64_t even_start_mask = even_bits ^ overflow; + uint64_t even_starts = start_edges & even_start_mask; + uint64_t odd_starts = start_edges & ~even_start_mask; + uint64_t even_carries = match + even_starts; + + uint64_t odd_carries; + /* must record the carry-out of our odd-carries out of bit 63; this + * indicates whether the sense of any edge going to the next iteration + * should be flipped */ + bool new_overflow = add_overflow(match, odd_starts, &odd_carries); + + odd_carries |= overflow; /* push in bit zero as a + * potential end if we had an + * odd-numbered run at the + * end of the previous + * iteration */ + overflow = new_overflow ? 0x1ULL : 0x0ULL; + uint64_t even_carry_ends = even_carries & ~match; + uint64_t odd_carry_ends = odd_carries & ~match; + uint64_t even_start_odd_end = even_carry_ends & odd_bits; + uint64_t odd_start_even_end = odd_carry_ends & even_bits; + uint64_t odd_ends = even_start_odd_end | odd_start_even_end; + return odd_ends; } - /* If we have a final chunk of less than 64 bytes, pad it to 64 with - * spaces before processing it (otherwise, we risk invalidating the UTF-8 - * checks). */ - if (likely(idx < len)) { - uint8_t tmp_buf[STEP_SIZE]; - memset(tmp_buf, 0x20, STEP_SIZE); - memcpy(tmp_buf, buf + idx, len - idx); - find_structural_bits_128(&tmp_buf[0], idx, base_ptr, - prev_escaped, prev_in_string, prev_primitive, - structurals, unescaped_chars_error, utf8_state); - idx += STEP_SIZE; + // + // Check if the current character immediately follows a matching character. + // + // For example, this checks for quotes with backslashes in front of them: + // + // const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash); + // + really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) { + const uint64_t result = match << 1 | overflow; + overflow = match >> 63; + return result; } - /* finally, flatten out the remaining structurals from the last iteration */ - flatten_bits(base_ptr, idx, structurals); + // + // Check if the current character follows a matching character, with possible "filler" between. + // For example, this checks for empty curly braces, e.g. + // + // in.eq('}') & follows(in.eq('['), in.eq(' '), prev_empty_array) // { * } + // + really_inline uint64_t follows(const uint64_t match, const uint64_t filler, uint64_t &overflow) { + uint64_t follows_match = follows(match, overflow); + uint64_t result; + overflow |= add_overflow(follows_match, filler, &result); + return result; + } - simdjson::ErrorValues error = detect_errors_on_eof(unescaped_chars_error, prev_in_string); + really_inline ErrorValues detect_errors_on_eof() { + if (prev_in_string) { + return UNCLOSED_STRING; + } + if (unescaped_chars_error) { + return UNESCAPED_CHARS; + } + return SUCCESS; + } + + // + // Return a mask of all string characters plus end quotes. + // + // prev_escaped is overflow saying whether the next character is escaped. + // prev_in_string is overflow saying whether we're still in a string. + // + // Backslash sequences outside of quotes will be detected in stage 2. + // + really_inline uint64_t find_strings(const simd_input in) { + const uint64_t backslash = in.eq('\\'); + const uint64_t escaped = follows_odd_sequence_of(backslash, prev_escaped); + const uint64_t quote = in.eq('"') & ~escaped; + // compute_quote_mask returns start quote plus string contents. + const uint64_t in_string = compute_quote_mask(quote) ^ prev_in_string; + /* right shift of a signed value expected to be well-defined and standard + * compliant as of C++20, + * John Regher from Utah U. says this is fine code */ + prev_in_string = static_cast(static_cast(in_string) >> 63); + // Use ^ to turn the beginning quote off, and the end quote on. + return in_string ^ quote; + } + + really_inline uint64_t invalid_string_bytes(const uint64_t unescaped, const uint64_t quote_mask) { + /* All Unicode characters may be placed within the + * quotation marks, except for the characters that MUST be escaped: + * quotation mark, reverse solidus, and the control characters (U+0000 + * through U+001F). + * https://tools.ietf.org/html/rfc8259 */ + return quote_mask & unescaped; + } + + // + // Determine which characters are *structural*: + // - braces: [] and {} + // - the start of primitives (123, true, false, null) + // - the start of invalid non-whitespace (+, &, ture, UTF-8) + // + // Also detects value sequence errors: + // - two values with no separator between ("hello" "world") + // - separators with no values ([1,] [1,,]and [,2]) + // + // This method will find all of the above whether it is in a string or not. + // + // To reduce dependency on the expensive "what is in a string" computation, this method treats the + // contents of a string the same as content outside. Errors and structurals inside the string or on + // the trailing quote will need to be removed later when the correct string information is known. + // + really_inline uint64_t find_potential_structurals(const simd_input in) { + // These use SIMD so let's kick them off before running the regular 64-bit stuff ... + uint64_t whitespace, op; + find_whitespace_and_operators(in, whitespace, op); + + // Detect the start of a run of primitive characters. Includes numbers, booleans, and strings ("). + // Everything except whitespace, braces, colon and comma. + const uint64_t primitive = ~(op | whitespace); + const uint64_t follows_primitive = follows(primitive, prev_primitive); + const uint64_t start_primitive = primitive & ~follows_primitive; + + // Return final structurals + return op | start_primitive; + } + + // + // Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes. + // + // PERF NOTES: + // We pipe 2 inputs through these stages: + // 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load + // 2 inputs' worth at once so that by the time step 2 is looking for them input, it's available. + // 2. Scan the JSON for critical data: strings, primitives and operators. This is the critical path. + // The output of step 1 depends entirely on this information. These functions don't quite use + // up enough CPU: the second half of the functions is highly serial, only using 1 execution core + // at a time. The second input's scans has some dependency on the first ones finishing it, but + // they can make a lot of progress before they need that information. + // 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that + // to finish: utf-8 checks and generating the output from the last iteration. + // + // The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all + // available capacity with just one input. Running 2 at a time seems to give the CPU a good enough + // workout. + // + really_inline void scan_step(const uint8_t *buf, const size_t idx, utf8_checker &utf8_checker) { + // + // Load up all 128 bytes into SIMD registers + // + simd_input in_1(buf); + simd_input in_2(buf+64); + + // + // Find the strings and potential structurals (operators / primitives). + // + // This will include false structurals that are *inside* strings--we'll filter strings out + // before we return. + // + uint64_t string_1 = this->find_strings(in_1); + uint64_t structurals_1 = this->find_potential_structurals(in_1); + uint64_t string_2 = this->find_strings(in_2); + uint64_t structurals_2 = this->find_potential_structurals(in_2); + + // + // Do miscellaneous work while the processor is busy calculating strings and structurals. + // + // After that, weed out structurals that are inside strings and find invalid string characters. + // + uint64_t unescaped_1 = in_1.lteq(0x1F); + utf8_checker.check_next_input(in_1); + this->structural_indexes.write_indexes(idx-64, prev_structurals); // Output *last* iteration's structurals to ParsedJson + this->prev_structurals = structurals_1 & ~string_1; + this->unescaped_chars_error |= unescaped_1 & string_1; + + uint64_t unescaped_2 = in_2.lteq(0x1F); + utf8_checker.check_next_input(in_2); + this->structural_indexes.write_indexes(idx, prev_structurals); // Output *last* iteration's structurals to ParsedJson + this->prev_structurals = structurals_2 & ~string_2; + this->unescaped_chars_error |= unescaped_2 & string_2; + } + + really_inline void scan(const uint8_t *buf, const size_t len, utf8_checker &utf8_checker) { + size_t lenminusstep = len < STEP_SIZE ? 0 : len - STEP_SIZE; + size_t idx = 0; + + for (; idx < lenminusstep; idx += STEP_SIZE) { + this->scan_step(&buf[idx], idx, utf8_checker); + } + + /* If we have a final chunk of less than 64 bytes, pad it to 64 with + * spaces before processing it (otherwise, we risk invalidating the UTF-8 + * checks). */ + if (likely(idx < len)) { + uint8_t tmp_buf[STEP_SIZE]; + memset(tmp_buf, 0x20, STEP_SIZE); + memcpy(tmp_buf, buf + idx, len - idx); + this->scan_step(&tmp_buf[0], idx, utf8_checker); + idx += STEP_SIZE; + } + + /* finally, flatten out the remaining structurals from the last iteration */ + this->structural_indexes.write_indexes(idx-64, this->prev_structurals); + } + +}; + +int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &pj) { + if (unlikely(len > pj.byte_capacity)) { + std::cerr << "Your ParsedJson object only supports documents up to " + << pj.byte_capacity << " bytes but you are trying to process " + << len << " bytes" << std::endl; + return simdjson::CAPACITY; + } + utf8_checker utf8_checker{}; + json_structural_scanner scanner{pj.structural_indexes}; + scanner.scan(buf, len, utf8_checker); + + simdjson::ErrorValues error = scanner.detect_errors_on_eof(); if (unlikely(error != simdjson::SUCCESS)) { return error; } - pj.n_structural_indexes = base_ptr - pj.structural_indexes; + pj.n_structural_indexes = scanner.structural_indexes.tail - pj.structural_indexes; /* a valid JSON file cannot have zero structural indexes - we should have * found something */ if (unlikely(pj.n_structural_indexes == 0u)) { @@ -278,5 +275,5 @@ int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &p } /* make it safe to dereference one beyond this array */ pj.structural_indexes[pj.n_structural_indexes] = 0; - return utf8_state.errors(); + return utf8_checker.errors(); } diff --git a/src/generic/stage1_find_marks_flatten.h b/src/generic/stage1_find_marks_flatten.h index 3159229a..1d5d1b2c 100644 --- a/src/generic/stage1_find_marks_flatten.h +++ b/src/generic/stage1_find_marks_flatten.h @@ -3,65 +3,52 @@ // We assume the file in which it is include already includes // "simdjson/stage1_find_marks.h" (this simplifies amalgation) -#ifdef SIMDJSON_NAIVE_FLATTEN // useful for benchmarking +class bit_indexer { +public: + uint32_t *tail; -// This is just a naive implementation. It should be normally -// disable, but can be used for research purposes to compare -// again our optimized version. -really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint32_t idx, uint64_t bits) { - uint32_t *out_ptr = base_ptr + base; - idx -= 64; - while (bits != 0) { - out_ptr[0] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - out_ptr++; - } - base = (out_ptr - base_ptr); -} + bit_indexer(uint32_t *index_buf) : tail(index_buf) {} -#else // SIMDJSON_NAIVE_FLATTEN + // flatten out values in 'bits' assuming that they are are to have values of idx + // plus their position in the bitvector, and store these indexes at + // this->tail[base] incrementing base as we go + // will potentially store extra values beyond end of valid bits, so this->tail + // needs to be large enough to handle this + really_inline void write_indexes(uint32_t idx, uint64_t bits) { + // In some instances, the next branch is expensive because it is mispredicted. + // Unfortunately, in other cases, + // it helps tremendously. + if (bits == 0) + return; + uint32_t cnt = hamming(bits); -// flatten out values in 'bits' assuming that they are are to have values of idx -// plus their position in the bitvector, and store these indexes at -// base_ptr[base] incrementing base as we go -// will potentially store extra values beyond end of valid bits, so base_ptr -// needs to be large enough to handle this -really_inline void flatten_bits(uint32_t *&base_ptr, uint32_t idx, uint64_t bits) { - // In some instances, the next branch is expensive because it is mispredicted. - // Unfortunately, in other cases, - // it helps tremendously. - if (bits == 0) - return; - uint32_t cnt = hamming(bits); - idx -= 64; - - // Do the first 8 all together - for (int i=0; i<8; i++) { - base_ptr[i] = idx + trailing_zeroes(bits); - bits = bits & (bits - 1); - } - - // Do the next 8 all together (we hope in most cases it won't happen at all - // and the branch is easily predicted). - if (unlikely(cnt > 8)) { - for (int i=8; i<16; i++) { - base_ptr[i] = idx + trailing_zeroes(bits); + // Do the first 8 all together + for (int i=0; i<8; i++) { + this->tail[i] = idx + trailing_zeroes(bits); bits = bits & (bits - 1); } - // Most files don't have 16+ structurals per block, so we take several basically guaranteed - // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) - // or the start of a value ("abc" true 123) every 4 characters. - if (unlikely(cnt > 16)) { - uint32_t i = 16; - do { - base_ptr[i] = idx + trailing_zeroes(bits); + // Do the next 8 all together (we hope in most cases it won't happen at all + // and the branch is easily predicted). + if (unlikely(cnt > 8)) { + for (int i=8; i<16; i++) { + this->tail[i] = idx + trailing_zeroes(bits); bits = bits & (bits - 1); - i++; - } while (i < cnt); - } - } + } - base_ptr += cnt; -} -#endif // SIMDJSON_NAIVE_FLATTEN + // Most files don't have 16+ structurals per block, so we take several basically guaranteed + // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) + // or the start of a value ("abc" true 123) every 4 characters. + if (unlikely(cnt > 16)) { + uint32_t i = 16; + do { + this->tail[i] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + i++; + } while (i < cnt); + } + } + + this->tail += cnt; + } +}; \ No newline at end of file diff --git a/src/haswell/stage1_find_marks.h b/src/haswell/stage1_find_marks.h index d9d99cc5..9d1b696e 100644 --- a/src/haswell/stage1_find_marks.h +++ b/src/haswell/stage1_find_marks.h @@ -84,49 +84,55 @@ really_inline void find_whitespace_and_operators( #endif // else SIMDJSON_NAIVE_STRUCTURAL } -// flatten out values in 'bits' assuming that they are are to have values of idx -// plus their position in the bitvector, and store these indexes at -// base_ptr[base] incrementing base as we go -// will potentially store extra values beyond end of valid bits, so base_ptr -// needs to be large enough to handle this -really_inline void flatten_bits(uint32_t *&base_ptr, uint32_t idx, uint64_t bits) { - // In some instances, the next branch is expensive because it is mispredicted. - // Unfortunately, in other cases, - // it helps tremendously. - if (bits == 0) - return; - uint32_t cnt = _mm_popcnt_u64(bits); - idx -= 64; +class bit_indexer { +public: + uint32_t *tail; - // Do the first 8 all together - for (int i=0; i<8; i++) { - base_ptr[i] = idx + trailing_zeroes(bits); - bits = _blsr_u64(bits); - } + bit_indexer(uint32_t *index_buf) : tail(index_buf) {} - // Do the next 8 all together (we hope in most cases it won't happen at all - // and the branch is easily predicted). - if (unlikely(cnt > 8)) { - for (int i=8; i<16; i++) { - base_ptr[i] = idx + trailing_zeroes(bits); + // flatten out values in 'bits' assuming that they are are to have values of idx + // plus their position in the bitvector, and store these indexes at + // base_ptr[base] incrementing base as we go + // will potentially store extra values beyond end of valid bits, so base_ptr + // needs to be large enough to handle this + really_inline void write_indexes(uint32_t idx, uint64_t bits) { + // In some instances, the next branch is expensive because it is mispredicted. + // Unfortunately, in other cases, + // it helps tremendously. + if (bits == 0) + return; + uint32_t cnt = _mm_popcnt_u64(bits); + + // Do the first 8 all together + for (int i=0; i<8; i++) { + this->tail[i] = idx + trailing_zeroes(bits); bits = _blsr_u64(bits); } - // Most files don't have 16+ structurals per block, so we take several basically guaranteed - // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) - // or the start of a value ("abc" true 123) every four characters. - if (unlikely(cnt > 16)) { - uint32_t i = 16; - do { - base_ptr[i] = idx + trailing_zeroes(bits); + // Do the next 8 all together (we hope in most cases it won't happen at all + // and the branch is easily predicted). + if (unlikely(cnt > 8)) { + for (int i=8; i<16; i++) { + this->tail[i] = idx + trailing_zeroes(bits); bits = _blsr_u64(bits); - i++; - } while (i < cnt); - } - } + } - base_ptr += cnt; -} + // Most files don't have 16+ structurals per block, so we take several basically guaranteed + // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) + // or the start of a value ("abc" true 123) every four characters. + if (unlikely(cnt > 16)) { + uint32_t i = 16; + do { + this->tail[i] = idx + trailing_zeroes(bits); + bits = _blsr_u64(bits); + i++; + } while (i < cnt); + } + } + + this->tail += cnt; + } +}; #include "generic/stage1_find_marks.h" diff --git a/src/stage1_find_marks.cpp b/src/stage1_find_marks.cpp index 067a2279..a8156f88 100644 --- a/src/stage1_find_marks.cpp +++ b/src/stage1_find_marks.cpp @@ -1,8 +1,9 @@ #include "simdjson/portability.h" +#include "simdjson/common_defs.h" namespace { // for when clmul is unavailable -[[maybe_unused]] uint64_t portable_compute_quote_mask(uint64_t quote_bits) { +[[maybe_unused]] really_inline uint64_t portable_compute_quote_mask(uint64_t quote_bits) { uint64_t quote_mask = quote_bits ^ (quote_bits << 1); quote_mask = quote_mask ^ (quote_mask << 2); quote_mask = quote_mask ^ (quote_mask << 4);