From 81f224957501c548fbb5aa750a8110759f5955c6 Mon Sep 17 00:00:00 2001
From: John Keiser <john@johnkeiser.com>
Date: Sun, 13 Oct 2019 11:36:52 -0700
Subject: [PATCH] Move stage1 into a class to pass fewer parameters

---
 src/generic/stage1_find_marks.h         | 471 ++++++++++++------------
 src/generic/stage1_find_marks_flatten.h |  95 +++--
 src/haswell/stage1_find_marks.h         |  78 ++--
 src/stage1_find_marks.cpp               |   3 +-
 4 files changed, 319 insertions(+), 328 deletions(-)
diff --git a/src/generic/stage1_find_marks.h b/src/generic/stage1_find_marks.h
index a6727bd2..3a87e672 100644
--- a/src/generic/stage1_find_marks.h
+++ b/src/generic/stage1_find_marks.h
@@ -3,221 +3,10 @@
 // We assume the file in which it is included already includes
 // "simdjson/stage1_find_marks.h" (this simplifies amalgation)
 
-// return a bitvector indicating where we have characters that end an odd-length
-// sequence of backslashes (and thus change the behavior of the next character
-// to follow). A even-length sequence of backslashes, and, for that matter, the
-// largest even-length prefix of our odd-length sequence of backslashes, simply
-// modify the behavior of the backslashes themselves.
-// We also update the prev_iter_ends_odd_backslash reference parameter to
-// indicate whether we end an iteration on an odd-length sequence of
-// backslashes, which modifies our subsequent search for odd-length
-// sequences of backslashes in an obvious way.
-really_inline uint64_t follows_odd_sequence_of(const uint64_t match, uint64_t &overflow) {
-  const uint64_t even_bits = 0x5555555555555555ULL;
-  const uint64_t odd_bits = ~even_bits;
-  uint64_t start_edges = match & ~(match << 1);
-  /* flip lowest if we have an odd-length run at the end of the prior
-   * iteration */
-  uint64_t even_start_mask = even_bits ^ overflow;
-  uint64_t even_starts = start_edges & even_start_mask;
-  uint64_t odd_starts = start_edges & ~even_start_mask;
-  uint64_t even_carries = match + even_starts;
-
-  uint64_t odd_carries;
-  /* must record the carry-out of our odd-carries out of bit 63; this
-   * indicates whether the sense of any edge going to the next iteration
-   * should be flipped */
-  bool new_overflow = add_overflow(match, odd_starts, &odd_carries);
-
-  odd_carries |= overflow; /* push in bit zero as a
-                              * potential end if we had an
-                              * odd-numbered run at the
-                              * end of the previous
-                              * iteration */
-  overflow = new_overflow ? 0x1ULL : 0x0ULL;
-  uint64_t even_carry_ends = even_carries & ~match;
-  uint64_t odd_carry_ends = odd_carries & ~match;
-  uint64_t even_start_odd_end = even_carry_ends & odd_bits;
-  uint64_t odd_start_even_end = odd_carry_ends & even_bits;
-  uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
-  return odd_ends;
-}
-
-//
-// Check if the current character immediately follows a matching character.
-//
-// For example, this checks for quotes with backslashes in front of them:
-//
-//     const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash);
-//
-really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
-  const uint64_t result = match << 1 | overflow;
-  overflow = match >> 63;
-  return result;
-}
-
-//
-// Check if the current character follows a matching character, with possible "filler" between.
-// For example, this checks for empty curly braces, e.g. 
-//
-//     in.eq('}') & follows(in.eq('['), in.eq(' '), prev_empty_array) // { <whitespace>* }
-//
-really_inline uint64_t follows(const uint64_t match, const uint64_t filler, uint64_t &overflow ) {
-  uint64_t follows_match = follows(match, overflow);
-  uint64_t result;
-  overflow |= add_overflow(follows_match, filler, &result);
-  return result;
-}
-
-really_inline ErrorValues detect_errors_on_eof(
-  uint64_t &unescaped_chars_error,
-  const uint64_t prev_in_string) {
-  if (prev_in_string) {
-    return UNCLOSED_STRING;
-  }
-  if (unescaped_chars_error) {
-    return UNESCAPED_CHARS;
-  }
-  return SUCCESS;
-}
-
-//
-// Return a mask of all string characters plus end quotes.
-//
-// prev_escaped is overflow saying whether the next character is escaped.
-// prev_in_string is overflow saying whether we're still in a string.
-//
-// Backslash sequences outside of quotes will be detected in stage 2.
-//
-really_inline uint64_t find_strings(const simd_input in, uint64_t &prev_escaped, uint64_t &prev_in_string) {
-  const uint64_t backslash = in.eq('\\');
-  const uint64_t escaped = follows_odd_sequence_of(backslash, prev_escaped);
-  const uint64_t quote = in.eq('"') & ~escaped;
-  // compute_quote_mask returns start quote plus string contents.
-  const uint64_t in_string = compute_quote_mask(quote) ^ prev_in_string;
-  /* right shift of a signed value expected to be well-defined and standard
-   * compliant as of C++20,
-   * John Regher from Utah U. says this is fine code */
-  prev_in_string = static_cast<uint64_t>(static_cast<int64_t>(in_string) >> 63);
-  // Use ^ to turn the beginning quote off, and the end quote on.
-  return in_string ^ quote;
-}
-
-really_inline uint64_t invalid_string_bytes(const uint64_t unescaped, const uint64_t quote_mask) {
-  /* All Unicode characters may be placed within the
-   * quotation marks, except for the characters that MUST be escaped:
-   * quotation mark, reverse solidus, and the control characters (U+0000
-   * through U+001F).
-   * https://tools.ietf.org/html/rfc8259 */
-  return quote_mask & unescaped;
-}
-
-//
-// Determine which characters are *structural*:
-// - braces: [] and {}
-// - the start of primitives (123, true, false, null)
-// - the start of invalid non-whitespace (+, &, ture, UTF-8)
-//
-// Also detects value sequence errors:
-// - two values with no separator between ("hello" "world")
-// - separators with no values ([1,] [1,,]and [,2])
-//
-// This method will find all of the above whether it is in a string or not.
-//
-// To reduce dependency on the expensive "what is in a string" computation, this method treats the
-// contents of a string the same as content outside. Errors and structurals inside the string or on
-// the trailing quote will need to be removed later when the correct string information is known.
-//
-really_inline uint64_t find_potential_structurals(const simd_input in, uint64_t &prev_primitive) {
-  // These use SIMD so let's kick them off before running the regular 64-bit stuff ...
-  uint64_t whitespace, op;
-  find_whitespace_and_operators(in, whitespace, op);
-
-  // Detect the start of a run of primitive characters. Includes numbers, booleans, and strings (").
-  // Everything except whitespace, braces, colon and comma.
-  const uint64_t primitive = ~(op | whitespace);
-  const uint64_t follows_primitive = follows(primitive, prev_primitive);
-  const uint64_t start_primitive = primitive & ~follows_primitive;
-
-  // Return final structurals
-  return op | start_primitive;
-}
-
 static const size_t STEP_SIZE = 128;
 
-//
-// Find the important bits of JSON in a 128-byte chunk, and add them to :
-//
-//
-//
-// PERF NOTES:
-// We pipe 2 inputs through these stages:
-// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
-//    2 inputs' worth at once so that by the time step 2 is looking for them input, it's available.
-// 2. Scan the JSON for critical data: strings, primitives and operators. This is the critical path.
-//    The output of step 1 depends entirely on this information. These functions don't quite use
-//    up enough CPU: the second half of the functions is highly serial, only using 1 execution core
-//    at a time. The second input's scans has some dependency on the first ones finishing it, but
-//    they can make a lot of progress before they need that information.
-// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
-//    to finish: utf-8 checks and generating the output from the last iteration.
-// 
-// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
-// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
-// workout.
-//
-really_inline void find_structural_bits_128(
-    const uint8_t *buf, const size_t idx, uint32_t *&base_ptr,
-    uint64_t &prev_escaped, uint64_t &prev_in_string,
-    uint64_t &prev_primitive,
-    uint64_t &prev_structurals,
-    uint64_t &unescaped_chars_error,
-    utf8_checker &utf8_state) {
-  //
-  // Load up all 128 bytes into SIMD registers
-  //
-  simd_input in_1(buf);
-  simd_input in_2(buf+64);
-
-  //
-  // Find the strings and potential structurals (operators / primitives).
-  //
-  // This will include false structurals that are *inside* strings--we'll filter strings out
-  // before we return.
-  //
-  uint64_t string_1 = find_strings(in_1, prev_escaped, prev_in_string);
-  uint64_t structurals_1 = find_potential_structurals(in_1, prev_primitive);
-  uint64_t string_2 = find_strings(in_2, prev_escaped, prev_in_string);
-  uint64_t structurals_2 = find_potential_structurals(in_2, prev_primitive);
-
-  //
-  // Do miscellaneous work while the processor is busy calculating strings and structurals.
-  //
-  // After that, weed out structurals that are inside strings and find invalid string characters.
-  //
-  uint64_t unescaped_1 = in_1.lteq(0x1F);
-  utf8_state.check_next_input(in_1);
-  flatten_bits(base_ptr, idx, prev_structurals); // Output *last* iteration's structurals to ParsedJson
-  prev_structurals = structurals_1 & ~string_1;
-  unescaped_chars_error |= unescaped_1 & string_1;
-
-  uint64_t unescaped_2 = in_2.lteq(0x1F);
-  utf8_state.check_next_input(in_2);
-  flatten_bits(base_ptr, idx+64, prev_structurals); // Output *last* iteration's structurals to ParsedJson
-  prev_structurals = structurals_2 & ~string_2;
-  unescaped_chars_error |= unescaped_2 & string_2;
-}
-
-int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &pj) {
-  if (unlikely(len > pj.byte_capacity)) {
-    std::cerr << "Your ParsedJson object only supports documents up to "
-              << pj.byte_capacity << " bytes but you are trying to process "
-              << len << " bytes" << std::endl;
-    return simdjson::CAPACITY;
-  }
-  uint32_t *base_ptr = pj.structural_indexes;
-  utf8_checker utf8_state;
-
+class json_structural_scanner {
+public:
   // Whether the first character of the next iteration is escaped.
   uint64_t prev_escaped = 0ULL;
   // Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
@@ -228,41 +17,249 @@ int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &p
   // Mask of structural characters from the last iteration.
   // Kept around for performance reasons, so we can call flatten_bits to soak up some unused
   // CPU capacity while the next iteration is busy with an expensive clmul in compute_quote_mask.
-  uint64_t structurals = 0;
-
-  size_t lenminusstep = len < STEP_SIZE ? 0 : len - STEP_SIZE;
-  size_t idx = 0;
+  uint64_t prev_structurals = 0;
   // Errors with unescaped characters in strings (ASCII codepoints < 0x20)
   uint64_t unescaped_chars_error = 0;
+  bit_indexer structural_indexes;
 
-  for (; idx < lenminusstep; idx += STEP_SIZE) {
-    find_structural_bits_128(&buf[idx], idx, base_ptr,
-                             prev_escaped, prev_in_string, prev_primitive,
-                             structurals, unescaped_chars_error, utf8_state);
+
+  json_structural_scanner(uint32_t *_structural_indexes) : structural_indexes{_structural_indexes} {}
+
+  // return a bitvector indicating where we have characters that end an odd-length
+  // sequence of backslashes (and thus change the behavior of the next character
+  // to follow). A even-length sequence of backslashes, and, for that matter, the
+  // largest even-length prefix of our odd-length sequence of backslashes, simply
+  // modify the behavior of the backslashes themselves.
+  // We also update the prev_iter_ends_odd_backslash reference parameter to
+  // indicate whether we end an iteration on an odd-length sequence of
+  // backslashes, which modifies our subsequent search for odd-length
+  // sequences of backslashes in an obvious way.
+  really_inline uint64_t follows_odd_sequence_of(const uint64_t match, uint64_t &overflow) {
+    const uint64_t even_bits = 0x5555555555555555ULL;
+    const uint64_t odd_bits = ~even_bits;
+    uint64_t start_edges = match & ~(match << 1);
+    /* flip lowest if we have an odd-length run at the end of the prior
+    * iteration */
+    uint64_t even_start_mask = even_bits ^ overflow;
+    uint64_t even_starts = start_edges & even_start_mask;
+    uint64_t odd_starts = start_edges & ~even_start_mask;
+    uint64_t even_carries = match + even_starts;
+
+    uint64_t odd_carries;
+    /* must record the carry-out of our odd-carries out of bit 63; this
+    * indicates whether the sense of any edge going to the next iteration
+    * should be flipped */
+    bool new_overflow = add_overflow(match, odd_starts, &odd_carries);
+
+    odd_carries |= overflow; /* push in bit zero as a
+                                * potential end if we had an
+                                * odd-numbered run at the
+                                * end of the previous
+                                * iteration */
+    overflow = new_overflow ? 0x1ULL : 0x0ULL;
+    uint64_t even_carry_ends = even_carries & ~match;
+    uint64_t odd_carry_ends = odd_carries & ~match;
+    uint64_t even_start_odd_end = even_carry_ends & odd_bits;
+    uint64_t odd_start_even_end = odd_carry_ends & even_bits;
+    uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
+    return odd_ends;
   }
 
-  /* If we have a final chunk of less than 64 bytes, pad it to 64 with
-   * spaces  before processing it (otherwise, we risk invalidating the UTF-8
-   * checks). */
-  if (likely(idx < len)) {
-    uint8_t tmp_buf[STEP_SIZE];
-    memset(tmp_buf, 0x20, STEP_SIZE);
-    memcpy(tmp_buf, buf + idx, len - idx);
-    find_structural_bits_128(&tmp_buf[0], idx, base_ptr,
-                             prev_escaped, prev_in_string, prev_primitive,
-                             structurals, unescaped_chars_error, utf8_state);
-    idx += STEP_SIZE;
+  //
+  // Check if the current character immediately follows a matching character.
+  //
+  // For example, this checks for quotes with backslashes in front of them:
+  //
+  //     const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash);
+  //
+  really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
+    const uint64_t result = match << 1 | overflow;
+    overflow = match >> 63;
+    return result;
   }
 
-  /* finally, flatten out the remaining structurals from the last iteration */
-  flatten_bits(base_ptr, idx, structurals);
+  //
+  // Check if the current character follows a matching character, with possible "filler" between.
+  // For example, this checks for empty curly braces, e.g. 
+  //
+  //     in.eq('}') & follows(in.eq('['), in.eq(' '), prev_empty_array) // { <whitespace>* }
+  //
+  really_inline uint64_t follows(const uint64_t match, const uint64_t filler, uint64_t &overflow) {
+    uint64_t follows_match = follows(match, overflow);
+    uint64_t result;
+    overflow |= add_overflow(follows_match, filler, &result);
+    return result;
+  }
 
-  simdjson::ErrorValues error = detect_errors_on_eof(unescaped_chars_error, prev_in_string);
+  really_inline ErrorValues detect_errors_on_eof() {
+    if (prev_in_string) {
+      return UNCLOSED_STRING;
+    }
+    if (unescaped_chars_error) {
+      return UNESCAPED_CHARS;
+    }
+    return SUCCESS;
+  }
+
+  //
+  // Return a mask of all string characters plus end quotes.
+  //
+  // prev_escaped is overflow saying whether the next character is escaped.
+  // prev_in_string is overflow saying whether we're still in a string.
+  //
+  // Backslash sequences outside of quotes will be detected in stage 2.
+  //
+  really_inline uint64_t find_strings(const simd_input in) {
+    const uint64_t backslash = in.eq('\\');
+    const uint64_t escaped = follows_odd_sequence_of(backslash, prev_escaped);
+    const uint64_t quote = in.eq('"') & ~escaped;
+    // compute_quote_mask returns start quote plus string contents.
+    const uint64_t in_string = compute_quote_mask(quote) ^ prev_in_string;
+    /* right shift of a signed value expected to be well-defined and standard
+    * compliant as of C++20,
+    * John Regher from Utah U. says this is fine code */
+    prev_in_string = static_cast<uint64_t>(static_cast<int64_t>(in_string) >> 63);
+    // Use ^ to turn the beginning quote off, and the end quote on.
+    return in_string ^ quote;
+  }
+
+  really_inline uint64_t invalid_string_bytes(const uint64_t unescaped, const uint64_t quote_mask) {
+    /* All Unicode characters may be placed within the
+    * quotation marks, except for the characters that MUST be escaped:
+    * quotation mark, reverse solidus, and the control characters (U+0000
+    * through U+001F).
+    * https://tools.ietf.org/html/rfc8259 */
+    return quote_mask & unescaped;
+  }
+
+  //
+  // Determine which characters are *structural*:
+  // - braces: [] and {}
+  // - the start of primitives (123, true, false, null)
+  // - the start of invalid non-whitespace (+, &, ture, UTF-8)
+  //
+  // Also detects value sequence errors:
+  // - two values with no separator between ("hello" "world")
+  // - separators with no values ([1,] [1,,]and [,2])
+  //
+  // This method will find all of the above whether it is in a string or not.
+  //
+  // To reduce dependency on the expensive "what is in a string" computation, this method treats the
+  // contents of a string the same as content outside. Errors and structurals inside the string or on
+  // the trailing quote will need to be removed later when the correct string information is known.
+  //
+  really_inline uint64_t find_potential_structurals(const simd_input in) {
+    // These use SIMD so let's kick them off before running the regular 64-bit stuff ...
+    uint64_t whitespace, op;
+    find_whitespace_and_operators(in, whitespace, op);
+
+    // Detect the start of a run of primitive characters. Includes numbers, booleans, and strings (").
+    // Everything except whitespace, braces, colon and comma.
+    const uint64_t primitive = ~(op | whitespace);
+    const uint64_t follows_primitive = follows(primitive, prev_primitive);
+    const uint64_t start_primitive = primitive & ~follows_primitive;
+
+    // Return final structurals
+    return op | start_primitive;
+  }
+
+  //
+  // Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
+  //
+  // PERF NOTES:
+  // We pipe 2 inputs through these stages:
+  // 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
+  //    2 inputs' worth at once so that by the time step 2 is looking for them input, it's available.
+  // 2. Scan the JSON for critical data: strings, primitives and operators. This is the critical path.
+  //    The output of step 1 depends entirely on this information. These functions don't quite use
+  //    up enough CPU: the second half of the functions is highly serial, only using 1 execution core
+  //    at a time. The second input's scans has some dependency on the first ones finishing it, but
+  //    they can make a lot of progress before they need that information.
+  // 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
+  //    to finish: utf-8 checks and generating the output from the last iteration.
+  // 
+  // The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
+  // available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
+  // workout.
+  //
+  really_inline void scan_step(const uint8_t *buf, const size_t idx, utf8_checker &utf8_checker) {
+    //
+    // Load up all 128 bytes into SIMD registers
+    //
+    simd_input in_1(buf);
+    simd_input in_2(buf+64);
+
+    //
+    // Find the strings and potential structurals (operators / primitives).
+    //
+    // This will include false structurals that are *inside* strings--we'll filter strings out
+    // before we return.
+    //
+    uint64_t string_1 = this->find_strings(in_1);
+    uint64_t structurals_1 = this->find_potential_structurals(in_1);
+    uint64_t string_2 = this->find_strings(in_2);
+    uint64_t structurals_2 = this->find_potential_structurals(in_2);
+
+    //
+    // Do miscellaneous work while the processor is busy calculating strings and structurals.
+    //
+    // After that, weed out structurals that are inside strings and find invalid string characters.
+    //
+    uint64_t unescaped_1 = in_1.lteq(0x1F);
+    utf8_checker.check_next_input(in_1);
+    this->structural_indexes.write_indexes(idx-64, prev_structurals); // Output *last* iteration's structurals to ParsedJson
+    this->prev_structurals = structurals_1 & ~string_1;
+    this->unescaped_chars_error |= unescaped_1 & string_1;
+
+    uint64_t unescaped_2 = in_2.lteq(0x1F);
+    utf8_checker.check_next_input(in_2);
+    this->structural_indexes.write_indexes(idx, prev_structurals); // Output *last* iteration's structurals to ParsedJson
+    this->prev_structurals = structurals_2 & ~string_2;
+    this->unescaped_chars_error |= unescaped_2 & string_2;
+  }
+
+  really_inline void scan(const uint8_t *buf, const size_t len, utf8_checker &utf8_checker) {
+    size_t lenminusstep = len < STEP_SIZE ? 0 : len - STEP_SIZE;
+    size_t idx = 0;
+
+    for (; idx < lenminusstep; idx += STEP_SIZE) {
+      this->scan_step(&buf[idx], idx, utf8_checker);
+    }
+
+    /* If we have a final chunk of less than 64 bytes, pad it to 64 with
+    * spaces  before processing it (otherwise, we risk invalidating the UTF-8
+    * checks). */
+    if (likely(idx < len)) {
+      uint8_t tmp_buf[STEP_SIZE];
+      memset(tmp_buf, 0x20, STEP_SIZE);
+      memcpy(tmp_buf, buf + idx, len - idx);
+      this->scan_step(&tmp_buf[0], idx, utf8_checker);
+      idx += STEP_SIZE;
+    }
+
+    /* finally, flatten out the remaining structurals from the last iteration */
+    this->structural_indexes.write_indexes(idx-64, this->prev_structurals);
+  }
+
+};
+
+int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &pj) {
+  if (unlikely(len > pj.byte_capacity)) {
+    std::cerr << "Your ParsedJson object only supports documents up to "
+              << pj.byte_capacity << " bytes but you are trying to process "
+              << len << " bytes" << std::endl;
+    return simdjson::CAPACITY;
+  }
+  utf8_checker utf8_checker{};
+  json_structural_scanner scanner{pj.structural_indexes};
+  scanner.scan(buf, len, utf8_checker);
+
+  simdjson::ErrorValues error = scanner.detect_errors_on_eof();
   if (unlikely(error != simdjson::SUCCESS)) {
     return error;
   }
 
-  pj.n_structural_indexes = base_ptr - pj.structural_indexes;
+  pj.n_structural_indexes = scanner.structural_indexes.tail - pj.structural_indexes;
   /* a valid JSON file cannot have zero structural indexes - we should have
    * found something */
   if (unlikely(pj.n_structural_indexes == 0u)) {
@@ -278,5 +275,5 @@ int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &p
   }
   /* make it safe to dereference one beyond this array */
   pj.structural_indexes[pj.n_structural_indexes] = 0;
-  return utf8_state.errors();
+  return utf8_checker.errors();
 }
diff --git a/src/generic/stage1_find_marks_flatten.h b/src/generic/stage1_find_marks_flatten.h
index 3159229a..1d5d1b2c 100644
--- a/src/generic/stage1_find_marks_flatten.h
+++ b/src/generic/stage1_find_marks_flatten.h
@@ -3,65 +3,52 @@
 // We assume the file in which it is include already includes
 // "simdjson/stage1_find_marks.h" (this simplifies amalgation)
 
-#ifdef SIMDJSON_NAIVE_FLATTEN // useful for benchmarking
+class bit_indexer {
+public:
+  uint32_t *tail;
 
-// This is just a naive implementation. It should be normally
-// disable, but can be used for research purposes to compare
-// again our optimized version.
-really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint32_t idx, uint64_t bits) {
-  uint32_t *out_ptr = base_ptr + base;
-  idx -= 64;
-  while (bits != 0) {
-    out_ptr[0] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-    out_ptr++;
-  }
-  base = (out_ptr - base_ptr);
-}
+  bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
 
-#else // SIMDJSON_NAIVE_FLATTEN
+  // flatten out values in 'bits' assuming that they are are to have values of idx
+  // plus their position in the bitvector, and store these indexes at
+  // this->tail[base] incrementing base as we go
+  // will potentially store extra values beyond end of valid bits, so this->tail
+  // needs to be large enough to handle this
+  really_inline void write_indexes(uint32_t idx, uint64_t bits) {
+    // In some instances, the next branch is expensive because it is mispredicted.
+    // Unfortunately, in other cases,
+    // it helps tremendously.
+    if (bits == 0)
+      return;
+    uint32_t cnt = hamming(bits);
 
-// flatten out values in 'bits' assuming that they are are to have values of idx
-// plus their position in the bitvector, and store these indexes at
-// base_ptr[base] incrementing base as we go
-// will potentially store extra values beyond end of valid bits, so base_ptr
-// needs to be large enough to handle this
-really_inline void flatten_bits(uint32_t *&base_ptr, uint32_t idx, uint64_t bits) {
-  // In some instances, the next branch is expensive because it is mispredicted.
-  // Unfortunately, in other cases,
-  // it helps tremendously.
-  if (bits == 0)
-    return;
-  uint32_t cnt = hamming(bits);
-  idx -= 64;
-
-  // Do the first 8 all together
-  for (int i=0; i<8; i++) {
-    base_ptr[i] = idx + trailing_zeroes(bits);
-    bits = bits & (bits - 1);
-  }
-
-  // Do the next 8 all together (we hope in most cases it won't happen at all
-  // and the branch is easily predicted).
-  if (unlikely(cnt > 8)) {
-    for (int i=8; i<16; i++) {
-      base_ptr[i] = idx + trailing_zeroes(bits);
+    // Do the first 8 all together
+    for (int i=0; i<8; i++) {
+      this->tail[i] = idx + trailing_zeroes(bits);
       bits = bits & (bits - 1);
     }
 
-    // Most files don't have 16+ structurals per block, so we take several basically guaranteed
-    // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
-    // or the start of a value ("abc" true 123) every 4 characters.
-    if (unlikely(cnt > 16)) {
-      uint32_t i = 16;
-      do {
-        base_ptr[i] = idx + trailing_zeroes(bits);
+    // Do the next 8 all together (we hope in most cases it won't happen at all
+    // and the branch is easily predicted).
+    if (unlikely(cnt > 8)) {
+      for (int i=8; i<16; i++) {
+        this->tail[i] = idx + trailing_zeroes(bits);
         bits = bits & (bits - 1);
-        i++;
-      } while (i < cnt);
-    }
-  }
+      }
 
-  base_ptr += cnt;
-}
-#endif // SIMDJSON_NAIVE_FLATTEN
+      // Most files don't have 16+ structurals per block, so we take several basically guaranteed
+      // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
+      // or the start of a value ("abc" true 123) every 4 characters.
+      if (unlikely(cnt > 16)) {
+        uint32_t i = 16;
+        do {
+          this->tail[i] = idx + trailing_zeroes(bits);
+          bits = bits & (bits - 1);
+          i++;
+        } while (i < cnt);
+      }
+    }
+
+    this->tail += cnt;
+  }
+};
\ No newline at end of file
diff --git a/src/haswell/stage1_find_marks.h b/src/haswell/stage1_find_marks.h
index d9d99cc5..9d1b696e 100644
--- a/src/haswell/stage1_find_marks.h
+++ b/src/haswell/stage1_find_marks.h
@@ -84,49 +84,55 @@ really_inline void find_whitespace_and_operators(
   #endif // else SIMDJSON_NAIVE_STRUCTURAL
 }
 
-// flatten out values in 'bits' assuming that they are are to have values of idx
-// plus their position in the bitvector, and store these indexes at
-// base_ptr[base] incrementing base as we go
-// will potentially store extra values beyond end of valid bits, so base_ptr
-// needs to be large enough to handle this
-really_inline void flatten_bits(uint32_t *&base_ptr, uint32_t idx, uint64_t bits) {
-  // In some instances, the next branch is expensive because it is mispredicted.
-  // Unfortunately, in other cases,
-  // it helps tremendously.
-  if (bits == 0)
-      return;
-  uint32_t cnt = _mm_popcnt_u64(bits);
-  idx -= 64;
+class bit_indexer {
+public:
+  uint32_t *tail;
 
-  // Do the first 8 all together
-  for (int i=0; i<8; i++) {
-    base_ptr[i] = idx + trailing_zeroes(bits);
-    bits = _blsr_u64(bits);
-  }
+  bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
 
-  // Do the next 8 all together (we hope in most cases it won't happen at all
-  // and the branch is easily predicted).
-  if (unlikely(cnt > 8)) {
-    for (int i=8; i<16; i++) {
-      base_ptr[i] = idx + trailing_zeroes(bits);
+  // flatten out values in 'bits' assuming that they are are to have values of idx
+  // plus their position in the bitvector, and store these indexes at
+  // base_ptr[base] incrementing base as we go
+  // will potentially store extra values beyond end of valid bits, so base_ptr
+  // needs to be large enough to handle this
+  really_inline void write_indexes(uint32_t idx, uint64_t bits) {
+    // In some instances, the next branch is expensive because it is mispredicted.
+    // Unfortunately, in other cases,
+    // it helps tremendously.
+    if (bits == 0)
+        return;
+    uint32_t cnt = _mm_popcnt_u64(bits);
+
+    // Do the first 8 all together
+    for (int i=0; i<8; i++) {
+      this->tail[i] = idx + trailing_zeroes(bits);
       bits = _blsr_u64(bits);
     }
 
-    // Most files don't have 16+ structurals per block, so we take several basically guaranteed
-    // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
-    // or the start of a value ("abc" true 123) every four characters.
-    if (unlikely(cnt > 16)) {
-      uint32_t i = 16;
-      do {
-        base_ptr[i] = idx + trailing_zeroes(bits);
+    // Do the next 8 all together (we hope in most cases it won't happen at all
+    // and the branch is easily predicted).
+    if (unlikely(cnt > 8)) {
+      for (int i=8; i<16; i++) {
+        this->tail[i] = idx + trailing_zeroes(bits);
         bits = _blsr_u64(bits);
-        i++;
-      } while (i < cnt);
-    }
-  }
+      }
 
-  base_ptr += cnt;
-}
+      // Most files don't have 16+ structurals per block, so we take several basically guaranteed
+      // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
+      // or the start of a value ("abc" true 123) every four characters.
+      if (unlikely(cnt > 16)) {
+        uint32_t i = 16;
+        do {
+          this->tail[i] = idx + trailing_zeroes(bits);
+          bits = _blsr_u64(bits);
+          i++;
+        } while (i < cnt);
+      }
+    }
+
+    this->tail += cnt;
+  }
+};
 
 #include "generic/stage1_find_marks.h"
 
diff --git a/src/stage1_find_marks.cpp b/src/stage1_find_marks.cpp
index 067a2279..a8156f88 100644
--- a/src/stage1_find_marks.cpp
+++ b/src/stage1_find_marks.cpp
@@ -1,8 +1,9 @@
 #include "simdjson/portability.h"
+#include "simdjson/common_defs.h"
 
 namespace {
 // for when clmul is unavailable
-[[maybe_unused]] uint64_t portable_compute_quote_mask(uint64_t quote_bits) {
+[[maybe_unused]] really_inline uint64_t portable_compute_quote_mask(uint64_t quote_bits) {
   uint64_t quote_mask = quote_bits ^ (quote_bits << 1);
   quote_mask = quote_mask ^ (quote_mask << 2);
   quote_mask = quote_mask ^ (quote_mask << 4);