Separate definition from declaration, arrange top down

This commit is contained in:
John Keiser 2020-06-02 08:09:54 -07:00
parent 89332e1696
commit 9be4a17687
2 changed files with 104 additions and 80 deletions

View File

@ -2,24 +2,13 @@
template<size_t STEP_SIZE>
struct buf_block_reader {
public:
really_inline buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
really_inline size_t block_index() { return idx; }
really_inline bool has_full_block() const {
return idx < lenminusstep;
}
really_inline const uint8_t *full_block() const {
return &buf[idx];
}
really_inline bool has_remainder() const {
return idx < len;
}
really_inline void get_remainder(uint8_t *tmp_buf) const {
memset(tmp_buf, 0x20, STEP_SIZE);
memcpy(tmp_buf, buf + idx, len - idx);
}
really_inline void advance() {
idx += STEP_SIZE;
}
really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
really_inline size_t block_index();
really_inline bool has_full_block() const;
really_inline const uint8_t *full_block() const;
really_inline bool has_remainder() const;
really_inline void get_remainder(uint8_t *tmp_buf) const;
really_inline void advance();
private:
const uint8_t *buf;
const size_t len;
@ -27,6 +16,38 @@ private:
size_t idx;
};
// Sets up a reader over the `_len` bytes at `_buf`, positioned at index 0.
// lenminusstep is clamped to 0 when the input is shorter than STEP_SIZE so the
// unsigned subtraction cannot wrap around.
template<size_t STEP_SIZE>
really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len)
  : buf{_buf},
    len{_len},
    lenminusstep{_len < STEP_SIZE ? 0 : _len - STEP_SIZE},
    idx{0}
{}
// Returns the reader's current position (idx) within the buffer.
template<size_t STEP_SIZE>
really_inline size_t buf_block_reader<STEP_SIZE>::block_index() {
  return this->idx;
}
// True while the current index is strictly below lenminusstep.
// Because the comparison is strict, a block that ends exactly at len is not
// reported as a full block here.
template<size_t STEP_SIZE>
really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
  return lenminusstep > idx;
}
// Returns a pointer into the underlying buffer at the current index.
// No bounds check is performed here.
template<size_t STEP_SIZE>
really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
  return buf + idx;
}
// True when at least one byte of the buffer lies at or after the current index.
template<size_t STEP_SIZE>
really_inline bool buf_block_reader<STEP_SIZE>::has_remainder() const {
  return len > idx;
}
// Writes STEP_SIZE bytes of 0x20 into tmp_buf, then copies the bytes from the
// current index up to len over the start of it.
// Nothing is checked here: tmp_buf must have room for STEP_SIZE bytes, and the
// caller must ensure idx <= len and len - idx <= STEP_SIZE.
template<size_t STEP_SIZE>
really_inline void buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *tmp_buf) const {
  const size_t remaining = len - idx;
  memset(tmp_buf, 0x20, STEP_SIZE); // 0x20 is the ASCII space character
  memcpy(tmp_buf, &buf[idx], remaining);
}
// Moves the current index forward by one STEP_SIZE block.
template<size_t STEP_SIZE>
really_inline void buf_block_reader<STEP_SIZE>::advance() {
  this->idx += STEP_SIZE;
}
// Routines to print masks and text for debugging bitmask operations
UNUSED static char * format_input_text(const simd8x64<uint8_t> in) {
static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1);

View File

@ -57,12 +57,18 @@ public:
class json_structural_indexer {
public:
/**
* Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
*
* @param streaming Setting the streaming parameter to true allows the find_structural_bits to
* tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If
* you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8.
*/
template<size_t STEP_SIZE>
static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool streaming) noexcept;
private:
really_inline json_structural_indexer(uint32_t *structural_indexes)
: indexer{structural_indexes} {}
really_inline json_structural_indexer(uint32_t *structural_indexes);
template<size_t STEP_SIZE>
really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
really_inline void next(simd::simd8x64<uint8_t> in, json_block block, size_t idx);
@ -75,6 +81,63 @@ private:
uint64_t unescaped_chars_error = 0;
};
// Constructs the indexer by handing structural_indexes to the `indexer` member.
really_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes)
  : indexer{structural_indexes}
{}
//
// PERF NOTES:
// We pipe 2 inputs through these stages:
// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
// 2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
// The output of step 1 depends entirely on this information. These functions don't quite use
// up enough CPU: the second half of the functions is highly serial, only using 1 execution core
// at a time. The second input's scans have some dependency on the first one's finishing, but
// they can make a lot of progress before they need that information.
// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
// to finish: utf-8 checks and generating the output from the last iteration.
//
// The reason we run 2 inputs at a time is that steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool streaming) noexcept {
  // Refuse inputs larger than the parser's capacity.
  if (unlikely(len > parser.capacity())) {
    return CAPACITY;
  }

  buf_block_reader<STEP_SIZE> blocks(buf, len);
  json_structural_indexer worker(parser.structural_indexes.get());

  // Full blocks are processed directly from the input buffer.
  while (blocks.has_full_block()) {
    worker.step<STEP_SIZE>(blocks.full_block(), blocks);
  }

  // Whatever is left is copied into a space-padded STEP_SIZE scratch block
  // and run through one more step.
  if (likely(blocks.has_remainder())) {
    uint8_t padded_block[STEP_SIZE];
    blocks.get_remainder(padded_block);
    worker.step<STEP_SIZE>(padded_block, blocks);
  }

  return worker.finish(parser, blocks.block_index(), len, streaming);
}
// 128-byte step: handles the inputs at block and block+64, then advances the reader.
template<>
really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
  simd::simd8x64<uint8_t> first_half(block);
  simd::simd8x64<uint8_t> second_half(block + 64);
  // Both scans are issued before either result is consumed; keep this order.
  json_block first_scan = scanner.next(first_half);
  json_block second_scan = scanner.next(second_half);
  this->next(first_half, first_scan, reader.block_index());
  this->next(second_half, second_scan, reader.block_index() + 64);
  reader.advance();
}
// 64-byte step: scans the single input at block, passes it to next() with the
// reader's current index, then advances the reader.
template<>
really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
  simd::simd8x64<uint8_t> input(block);
  json_block scanned = scanner.next(input);
  this->next(input, scanned, reader.block_index());
  reader.advance();
}
really_inline void json_structural_indexer::next(simd::simd8x64<uint8_t> in, json_block block, size_t idx) {
uint64_t unescaped = in.lteq(0x1F);
checker.check_next_input(in);
@ -128,64 +191,4 @@ really_inline error_code json_structural_indexer::finish(dom_parser_implementati
return checker.errors();
}
// 128-byte step: handles the inputs at block and block+64, then advances the reader.
template<>
really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
  simd::simd8x64<uint8_t> first_half(block);
  simd::simd8x64<uint8_t> second_half(block + 64);
  // Both scans are issued before either result is consumed; keep this order.
  json_block first_scan = scanner.next(first_half);
  json_block second_scan = scanner.next(second_half);
  this->next(first_half, first_scan, reader.block_index());
  this->next(second_half, second_scan, reader.block_index() + 64);
  reader.advance();
}
// 64-byte step: scans the single input at block, passes it to next() with the
// reader's current index, then advances the reader.
template<>
really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
  simd::simd8x64<uint8_t> input(block);
  json_block scanned = scanner.next(input);
  this->next(input, scanned, reader.block_index());
  reader.advance();
}
//
// Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
//
// PERF NOTES:
// We pipe 2 inputs through these stages:
// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
// 2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
// The output of step 1 depends entirely on this information. These functions don't quite use
// up enough CPU: the second half of the functions is highly serial, only using 1 execution core
// at a time. The second input's scans have some dependency on the first one's finishing, but
// they can make a lot of progress before they need that information.
// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
// to finish: utf-8 checks and generating the output from the last iteration.
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
// Setting the streaming parameter to true allows the find_structural_bits to tolerate unclosed strings.
// The caller should still ensure that the input is valid UTF-8. If you are processing substrings,
// you may want to call on a function like trimmed_length_safe_utf8.
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool streaming) noexcept {
  // Refuse inputs larger than the parser's capacity.
  if (unlikely(len > parser.capacity())) {
    return CAPACITY;
  }

  buf_block_reader<STEP_SIZE> blocks(buf, len);
  json_structural_indexer worker(parser.structural_indexes.get());

  // Full blocks are processed directly from the input buffer.
  while (blocks.has_full_block()) {
    worker.step<STEP_SIZE>(blocks.full_block(), blocks);
  }

  // Whatever is left is copied into a space-padded STEP_SIZE scratch block
  // and run through one more step.
  if (likely(blocks.has_remainder())) {
    uint8_t padded_block[STEP_SIZE];
    blocks.get_remainder(padded_block);
    worker.step<STEP_SIZE>(padded_block, blocks);
  }

  return worker.finish(parser, blocks.block_index(), len, streaming);
}
} // namespace stage1