Measure impact of utf-8 blocks and structurals per block directly

This commit is contained in:
John Keiser 2019-09-11 08:38:13 -07:00
parent 102262c7ab
commit e2f349e7bd
14 changed files with 21366 additions and 384 deletions

4
.gitignore vendored
View File

@ -53,6 +53,7 @@ objs
# Build outputs (TODO build to a subdir so we can exclude that instead) # Build outputs (TODO build to a subdir so we can exclude that instead)
/allparserscheckfile /allparserscheckfile
/basictests /basictests
/benchfeatures
/benchmark/parse /benchmark/parse
/benchmark/perfdiff /benchmark/perfdiff
/benchmark/statisticalmodel /benchmark/statisticalmodel
@ -86,6 +87,9 @@ objs
/tools/jsonstats /tools/jsonstats
/tools/minify /tools/minify
# Don't check in generated examples
/jsonexamples/generated
# C++ ignore from https://github.com/github/gitignore/blob/master/C%2B%2B.gitignore # C++ ignore from https://github.com/github/gitignore/blob/master/C%2B%2B.gitignore
# Prerequisites # Prerequisites

View File

@ -126,6 +126,12 @@ run_issue150_sh: allparserscheckfile
run_testjson2json_sh: minify json2json run_testjson2json_sh: minify json2json
./scripts/testjson2json.sh ./scripts/testjson2json.sh
generate_featurejson:
ruby ./benchmark/genfeaturejson.rb
run_benchfeatures: benchfeatures generate_featurejson
./benchfeatures -n 1000
test: run_basictests run_jsoncheck run_numberparsingcheck run_integer_tests run_stringparsingcheck run_jsonstream_test run_pointercheck run_testjson2json_sh run_issue150_sh run_jsoncheck_noavx test: run_basictests run_jsoncheck run_numberparsingcheck run_integer_tests run_stringparsingcheck run_jsonstream_test run_pointercheck run_testjson2json_sh run_issue150_sh run_jsoncheck_noavx
@echo "It looks like the code is good!" @echo "It looks like the code is good!"
@ -145,9 +151,12 @@ submodules:
$(JSON_INCLUDE) $(SAJSON_INCLUDE) $(RAPIDJSON_INCLUDE) $(JSON11_INCLUDE) $(FASTJSON_INCLUDE) $(GASON_INCLUDE) $(UJSON4C_INCLUDE) $(CJSON_INCLUDE) $(JSMN_INCLUDE) : submodules $(JSON_INCLUDE) $(SAJSON_INCLUDE) $(RAPIDJSON_INCLUDE) $(JSON11_INCLUDE) $(FASTJSON_INCLUDE) $(GASON_INCLUDE) $(UJSON4C_INCLUDE) $(CJSON_INCLUDE) $(JSMN_INCLUDE) : submodules
parse: benchmark/parse.cpp $(HEADERS) $(LIBFILES) parse: benchmark/parse.cpp benchmark/json_parser.h benchmark/event_counter.h benchmark/benchmarker.h $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o parse $(LIBFILES) benchmark/parse.cpp $(LIBFLAGS) $(CXX) $(CXXFLAGS) -o parse $(LIBFILES) benchmark/parse.cpp $(LIBFLAGS)
benchfeatures: benchmark/benchfeatures.cpp benchmark/json_parser.h benchmark/event_counter.h benchmark/benchmarker.h $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o benchfeatures $(LIBFILES) benchmark/benchfeatures.cpp $(LIBFLAGS)
perfdiff: benchmark/perfdiff.cpp perfdiff: benchmark/perfdiff.cpp
$(CXX) $(CXXFLAGS) -o perfdiff benchmark/perfdiff.cpp $(LIBFLAGS) $(CXX) $(CXXFLAGS) -o perfdiff benchmark/perfdiff.cpp $(LIBFLAGS)

326
benchmark/benchfeatures.cpp Normal file
View File

@ -0,0 +1,326 @@
#include "json_parser.h"
#include "event_counter.h"
#include <cassert>
#include <cctype>
#ifndef _MSC_VER
#include <dirent.h>
#include <unistd.h>
#endif
#include <cinttypes>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <algorithm>
#include <chrono>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <vector>
#include "linux-perf-events.h"
#ifdef __linux__
#include <libgen.h>
#endif
//#define DEBUG
#include "simdjson/common_defs.h"
#include "simdjson/isadetection.h"
#include "simdjson/jsonioutil.h"
#include "simdjson/jsonparser.h"
#include "simdjson/parsedjson.h"
#include "simdjson/stage1_find_marks.h"
#include "simdjson/stage2_build_tape.h"
#include <functional>
#include "benchmarker.h"
using namespace simdjson;
using std::cerr;
using std::cout;
using std::endl;
using std::string;
using std::to_string;
using std::vector;
using std::ostream;
using std::ofstream;
using std::exception;
// Stash the exe_name in main() for functions to use
char* exe_name;
// Print command-line help for benchfeatures to the given stream.
void print_usage(ostream& out) {
  out << "Usage: " << exe_name << " [-v] [-n #] [-s STAGE] [-a ARCH]" << endl;
  out << endl;
  out << "Runs the parser against jsonexamples/generated json files in a loop, measuring speed and other statistics." << endl;
  out << endl;
  out << "Options:" << endl;
  out << endl;
  out << "-n # - Number of iterations per file. Default: 400" << endl;
  // BUG FIX: the help text claimed the default was 20, but option_struct
  // initializes iteration_step to 50.
  out << "-i # - Number of times to iterate a single file before moving to the next. Default: 50" << endl;
  out << "-v - Verbose output." << endl;
  out << "-s STAGE - Stop after the given stage." << endl;
  out << " -s stage1 - Stop after find_structural_bits." << endl;
  out << " -s all - Run all stages." << endl;
  out << "-a ARCH - Use the parser with the designated architecture (HASWELL, WESTMERE" << endl;
  out << " or ARM64). By default, detects best supported architecture." << endl;
}
// Report a usage error on stderr, print the help text, and terminate
// the process with a failure status.
void exit_usage(string message) {
  cerr << message << endl << endl;
  print_usage(cerr);
  exit(EXIT_FAILURE);
}
// Parsed command-line options for benchfeatures.
struct option_struct {
  // Parser architecture to use (-a); auto-detected when left UNSUPPORTED.
  Architecture architecture = Architecture::UNSUPPORTED;
  // When true (-s stage1), only stage 1 is run/timed.
  bool stage1_only = false;

  // Total iterations per file (-n).
  int32_t iterations = 400;
  // Iterations spent on a single file before moving to the next (-i).
  int32_t iteration_step = 50;

  // Verbose logging (-v).
  bool verbose = false;

  // Parse argv; exits the process (with usage text) on invalid options.
  option_struct(int argc, char **argv) {
#ifndef _MSC_VER
    int c;

    while ((c = getopt(argc, argv, "vtn:i:a:s:")) != -1) {
      switch (c) {
      case 'n':
        iterations = atoi(optarg);
        break;
      case 'i':
        iteration_step = atoi(optarg);
        break;
      case 'v':
        verbose = true;
        break;
      case 'a':
        architecture = parse_architecture(optarg);
        if (architecture == Architecture::UNSUPPORTED) {
          exit_usage(string("Unsupported option value -a ") + optarg + ": expected -a HASWELL, WESTMERE or ARM64");
        }
        break;
      case 's':
        if (!strcmp(optarg, "stage1")) {
          stage1_only = true;
        } else if (!strcmp(optarg, "all")) {
          stage1_only = false;
        } else {
          exit_usage(string("Unsupported option value -s ") + optarg + ": expected -s stage1 or all");
        }
        break;
      default:
        // BUG FIX: the original wrote ("Unexpected argument " + c), which
        // performs pointer arithmetic on the string literal instead of
        // appending the offending option character.
        exit_error(string("Unexpected argument ") + (char)c);
      }
    }
#else
    int optind = 1;
#endif

    // If architecture is not specified, pick the best supported architecture by default
    if (architecture == Architecture::UNSUPPORTED) {
      architecture = find_best_supported_architecture();
    }
  }
};
// Measured best-run stage 1 time, in nanoseconds per 64-byte block.
double actual(const benchmarker& feature) {
  const double stage1_ns = feature.stage1.best.elapsed_ns();
  return stage1_ns / feature.stats->blocks;
}
// Per-block stage 1 difference between `feature` and the `struct7` baseline.
// When the two files have the same number of blocks, the totals are
// subtracted before dividing; otherwise the per-block averages are compared.
double diff(const benchmarker& feature, const benchmarker& struct7) {
  const double feature_ns = feature.stage1.best.elapsed_ns();
  const double base_ns = struct7.stage1.best.elapsed_ns();
  if (feature.stats->blocks == struct7.stats->blocks) {
    return (feature_ns - base_ns) / struct7.stats->blocks;
  }
  return (feature_ns / feature.stats->blocks) - (base_ns / struct7.stats->blocks);
}
// Per-miss cost estimate for a "miss" file variant.
// There are roughly 2650 branch mispredicts, so we have to scale it so it represents a per block amount
double diff_miss(const benchmarker& feature, const benchmarker& struct7) {
  const double per_block = diff(feature, struct7);
  return per_block * 10000.0 / 2650.0;
}
// Benchmarks a family of generated JSON files (produced by
// benchmark/genfeaturejson.rb), each isolating one stage 1 feature:
// utf-8 content, empty blocks, and blocks with 1-7 / 8-15 / 16+ structurals.
// The *_miss variants alternate block content between iterations of the file
// to provoke branch mispredictions; *_full keeps every block "on".
struct feature_benchmarker {
  benchmarker utf8;
  benchmarker utf8_miss;
  benchmarker empty;
  benchmarker empty_miss;
  benchmarker struct7;
  benchmarker struct7_miss;
  benchmarker struct7_full;
  benchmarker struct15;
  benchmarker struct15_miss;
  benchmarker struct23;
  benchmarker struct23_miss;
  // Loads every generated file up front (benchmarker's constructor reads the
  // file from disk and exits on failure).
  feature_benchmarker(json_parser& parser, event_collector& collector) :
  utf8 ("jsonexamples/generated/utf-8.json", parser, collector),
  utf8_miss ("jsonexamples/generated/utf-8-miss.json", parser, collector),
  empty ("jsonexamples/generated/0-structurals.json", parser, collector),
  empty_miss ("jsonexamples/generated/0-structurals-miss.json", parser, collector),
  struct7 ("jsonexamples/generated/7-structurals.json", parser, collector),
  struct7_miss ("jsonexamples/generated/7-structurals-miss.json", parser, collector),
  struct7_full ("jsonexamples/generated/7-structurals-full.json", parser, collector),
  struct15 ("jsonexamples/generated/15-structurals.json", parser, collector),
  struct15_miss("jsonexamples/generated/15-structurals-miss.json", parser, collector),
  struct23 ("jsonexamples/generated/23-structurals.json", parser, collector),
  struct23_miss("jsonexamples/generated/23-structurals-miss.json", parser, collector)
  {
  }
  // Run `iterations` parses of every feature file.
  really_inline void run_iterations(size_t iterations, bool stage1_only=false) {
    struct7.run_iterations(iterations, stage1_only);
    struct7_miss.run_iterations(iterations, stage1_only);
    struct7_full.run_iterations(iterations, stage1_only);
    utf8.run_iterations(iterations, stage1_only);
    utf8_miss.run_iterations(iterations, stage1_only);
    empty.run_iterations(iterations, stage1_only);
    empty_miss.run_iterations(iterations, stage1_only);
    struct15.run_iterations(iterations, stage1_only);
    struct15_miss.run_iterations(iterations, stage1_only);
    struct23.run_iterations(iterations, stage1_only);
    struct23_miss.run_iterations(iterations, stage1_only);
  }
  // Print a two-row CSV: header row of feature names, then the measured
  // per-block ns cost of each feature relative to its baseline.
  void print() {
    printf("base (ns/block)");
    printf(",struct 1-7");
    printf(",struct 1-7 miss");
    printf(",utf-8");
    printf(",utf-8 miss");
    printf(",struct 8-15");
    printf(",struct 8-15 miss");
    printf(",struct 16+");
    printf(",struct 16+ miss");
    printf("\n");
    printf("%g", actual(empty));
    printf(",%+g", diff(struct7, empty));
    printf(",%+g", diff(struct7_miss, struct7));
    printf(",%+g", diff(utf8, struct7));
    printf(",%+g", diff(utf8_miss, utf8));
    printf(",%+g", diff(struct15, struct7));
    printf(",%+g", diff(struct15_miss, struct15));
    printf(",%+g", diff(struct23, struct15));
    printf(",%+g", diff(struct23_miss, struct23));
    printf("\n");
  }
  // Extra cost of `feature` over `base`, attributed to `feature_blocks`
  // (the number of blocks that actually exhibit the feature).
  double cost_per_block(benchmarker& feature, size_t feature_blocks, benchmarker& base) {
    return (feature.stage1.best.elapsed_ns() - base.stage1.best.elapsed_ns()) / feature_blocks;
  }
  // Base cost of any block (including empty ones)
  double base_cost() {
    return (empty.stage1.best.elapsed_ns() / empty.stats->blocks);
  }
  // Extra cost of a 1-7 structural block over an empty block
  double struct1_7_cost() {
    return cost_per_block(struct7, struct7.stats->blocks_with_1_structural, empty);
  }
  // Extra cost of an 1-7-structural miss
  double struct1_7_miss_cost() {
    return cost_per_block(struct7_miss, struct7_miss.stats->blocks_with_1_structural, struct7);
  }
  // Extra cost of an 8-15 structural block over a 1-7 structural block
  double struct8_15_cost() {
    return cost_per_block(struct15, struct15.stats->blocks_with_8_structurals, struct7);
  }
  // Extra cost of an 8-15-structural miss over a 1-7 miss
  double struct8_15_miss_cost() {
    return cost_per_block(struct15_miss, struct15_miss.stats->blocks_with_8_structurals_flipped, struct15);
  }
  // Extra cost of a 16+-structural block over an 8-15 structural block (actual varies based on # of structurals!)
  double struct16_cost() {
    return cost_per_block(struct23, struct23.stats->blocks_with_16_structurals, struct15);
  }
  // Extra cost of a 16-structural miss over an 8-15 miss
  double struct16_miss_cost() {
    return cost_per_block(struct23_miss, struct23_miss.stats->blocks_with_16_structurals_flipped, struct23);
  }
  // Extra cost of having UTF-8 in a block
  double utf8_cost() {
    return cost_per_block(utf8, utf8.stats->blocks_with_utf8, struct7_full);
  }
  // Extra cost of a UTF-8 miss
  double utf8_miss_cost() {
    return cost_per_block(utf8_miss, utf8_miss.stats->blocks_with_utf8_flipped, utf8);
  }
  // Predict ns/block for an arbitrary file by summing each feature's
  // per-block cost weighted by how often that feature occurs in the file.
  double calc_expected(benchmarker& file) {
    // Expected base ns/block (empty)
    json_stats& stats = *file.stats;
    double expected = base_cost() * stats.blocks;
    expected += struct1_7_cost() * stats.blocks_with_1_structural;
    expected += struct1_7_miss_cost() * stats.blocks_with_1_structural_flipped;
    expected += utf8_cost() * stats.blocks_with_utf8;
    expected += utf8_miss_cost() * stats.blocks_with_utf8_flipped;
    expected += struct8_15_cost() * stats.blocks_with_8_structurals;
    expected += struct8_15_miss_cost() * stats.blocks_with_8_structurals_flipped;
    expected += struct16_cost() * stats.blocks_with_16_structurals;
    expected += struct16_miss_cost() * stats.blocks_with_16_structurals_flipped;
    return expected / stats.blocks;
  }
};
// Entry point: parse options, load every benchmark file, run the measurement
// loop, then print the per-feature cost table and compare the cost model's
// predicted ns/block against measurements for three real-world files.
int main(int argc, char *argv[]) {
  // Read options
  exe_name = argv[0];
  option_struct options(argc, argv);
  if (options.verbose) {
    verbose_stream = &cout;
  }

  // Initialize the event collector. We put this early so if it prints an error message, it's the
  // first thing printed.
  event_collector collector;

  // Set up benchmarkers by reading all files
  json_parser parser(options.architecture);
  feature_benchmarker features(parser, collector);
  benchmarker gsoc_2018("jsonexamples/gsoc-2018.json", parser, collector);
  benchmarker twitter("jsonexamples/twitter.json", parser, collector);
  benchmarker random("jsonexamples/random.json", parser, collector);

  // Run the benchmarks
  progress_bar progress(options.iterations, 100);
  // Put the if (options.stage1_only) *outside* the loop so that run_iterations will be optimized
  if (options.stage1_only) {
    for (int iteration = 0; iteration < options.iterations; iteration += options.iteration_step) {
      if (!options.verbose) { progress.print(iteration); }
      features.run_iterations(options.iteration_step, true);
      gsoc_2018.run_iterations(options.iteration_step, true);
      twitter.run_iterations(options.iteration_step, true);
      random.run_iterations(options.iteration_step, true);
    }
  } else {
    for (int iteration = 0; iteration < options.iterations; iteration += options.iteration_step) {
      if (!options.verbose) { progress.print(iteration); }
      features.run_iterations(options.iteration_step, false);
      gsoc_2018.run_iterations(options.iteration_step, false);
      twitter.run_iterations(options.iteration_step, false);
      random.run_iterations(options.iteration_step, false);
    }
  }
  if (!options.verbose) { progress.erase(); }

  // Per-feature cost table (CSV).
  features.print();

  // Gauge effectiveness
  printf("gsoc-2018.json expected/actual: %g/%g\n", features.calc_expected(gsoc_2018), actual(gsoc_2018));
  printf("twitter.json expected/actual: %g/%g\n", features.calc_expected(twitter), actual(twitter));
  printf("random.json expected/actual: %g/%g\n", features.calc_expected(random), actual(random));

  return EXIT_SUCCESS;
}

424
benchmark/benchmarker.h Normal file
View File

@ -0,0 +1,424 @@
#ifndef __BENCHMARKER_H
#define __BENCHMARKER_H
#include "json_parser.h"
#include "event_counter.h"
#include <cassert>
#include <cctype>
#ifndef _MSC_VER
#include <dirent.h>
#include <unistd.h>
#endif
#include <cinttypes>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <algorithm>
#include <chrono>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <vector>
#include "linux-perf-events.h"
#ifdef __linux__
#include <libgen.h>
#endif
//#define DEBUG
#include "simdjson/common_defs.h"
#include "simdjson/isadetection.h"
#include "simdjson/jsonioutil.h"
#include "simdjson/jsonparser.h"
#include "simdjson/parsedjson.h"
#include "simdjson/stage1_find_marks.h"
#include "simdjson/stage2_build_tape.h"
#include <functional>
using namespace simdjson;
using std::cerr;
using std::cout;
using std::endl;
using std::string;
using std::to_string;
using std::vector;
using std::ostream;
using std::ofstream;
using std::exception;
// Initialize "verbose" to go nowhere. We'll read options in main() and set to cout if verbose is true.
// (An unopened ofstream discards everything written to it.)
std::ofstream dev_null;
ostream *verbose_stream = &dev_null;

// Unit of measurement throughout: stage 1 processes input in 64-byte blocks.
const size_t BYTES_PER_BLOCK = 64;

// Accessor for the verbose logging stream (dev_null unless -v was given).
ostream& verbose() {
  return *verbose_stream;
}
// Print an error message to stderr and terminate the process with a failure
// status. Never returns.
void exit_error(string message) {
  cerr << message << endl;
  exit(EXIT_FAILURE);
  // BUG FIX: removed the unreachable abort() that followed exit() —
  // exit() is [[noreturn]], so that call was dead code.
}
// Static per-file statistics derived from the JSON text and the structural
// indexes produced by stage 1: how many 64-byte blocks contain utf-8 and how
// many contain 0 / 1+ / 8+ / 16+ structurals, plus how often each property
// "flips" between consecutive blocks (a proxy for branch mispredictions).
struct json_stats {
  size_t bytes = 0;
  size_t blocks = 0;
  size_t structurals = 0;
  // For each property: number of blocks that have it, and number of
  // consecutive-block transitions where the property changed ("flipped").
  size_t blocks_with_utf8 = 0;
  size_t blocks_with_utf8_flipped = 0;
  size_t blocks_with_0_structurals = 0;
  size_t blocks_with_0_structurals_flipped = 0;
  size_t blocks_with_1_structural = 0;
  size_t blocks_with_1_structural_flipped = 0;
  size_t blocks_with_8_structurals = 0;
  size_t blocks_with_8_structurals_flipped = 0;
  size_t blocks_with_16_structurals = 0;
  size_t blocks_with_16_structurals_flipped = 0;
  json_stats(const padded_string& json, const ParsedJson& pj) {
    bytes = json.size();
    blocks = bytes / BYTES_PER_BLOCK;
    if (bytes % BYTES_PER_BLOCK > 0) { blocks++; } // Account for remainder block
    // NOTE(review): the -1 assumes the last structural index is an
    // end-of-input marker rather than real content — confirm against
    // ParsedJson's stage 1 output.
    structurals = pj.n_structural_indexes-1;

    // Calculate stats on blocks that will trigger utf-8 if statements / mispredictions
    bool last_block_has_utf8 = false;
    for (size_t block=0; block<blocks; block++) {
      // Find utf-8 in the block (any byte with the high bit set)
      size_t block_start = block*BYTES_PER_BLOCK;
      size_t block_end = block_start+BYTES_PER_BLOCK;
      if (block_end > json.size()) { block_end = json.size(); }

      bool block_has_utf8 = false;
      for (size_t i=block_start; i<block_end; i++) {
        if (json.data()[i] & 0x80) {
          block_has_utf8 = true;
          break;
        }
      }
      if (block_has_utf8) {
        blocks_with_utf8++;
      }
      if (block > 0 && last_block_has_utf8 != block_has_utf8) {
        blocks_with_utf8_flipped++;
      }
      last_block_has_utf8 = block_has_utf8;
    }

    // Calculate stats on blocks that will trigger structural count if statements / mispredictions
    bool last_block_has_0_structurals = false;
    bool last_block_has_1_structural = false;
    bool last_block_has_8_structurals = false;
    bool last_block_has_16_structurals = false;
    size_t structural=0;
    for (size_t block=0; block<blocks; block++) {
      // Count structurals in the block (indexes are sorted, so a single
      // cursor walks them once across all blocks)
      int block_structurals=0;
      while (structural < pj.n_structural_indexes && pj.structural_indexes[structural] < (block+1)*BYTES_PER_BLOCK) {
        block_structurals++;
        structural++;
      }

      bool block_has_0_structurals = block_structurals == 0;
      if (block_has_0_structurals) {
        blocks_with_0_structurals++;
      }
      if (block > 0 && last_block_has_0_structurals != block_has_0_structurals) {
        blocks_with_0_structurals_flipped++;
      }
      last_block_has_0_structurals = block_has_0_structurals;

      bool block_has_1_structural = block_structurals >= 1;
      if (block_has_1_structural) {
        blocks_with_1_structural++;
      }
      if (block > 0 && last_block_has_1_structural != block_has_1_structural) {
        blocks_with_1_structural_flipped++;
      }
      last_block_has_1_structural = block_has_1_structural;

      bool block_has_8_structurals = block_structurals >= 8;
      if (block_has_8_structurals) {
        blocks_with_8_structurals++;
      }
      if (block > 0 && last_block_has_8_structurals != block_has_8_structurals) {
        blocks_with_8_structurals_flipped++;
      }
      last_block_has_8_structurals = block_has_8_structurals;

      bool block_has_16_structurals = block_structurals >= 16;
      if (block_has_16_structurals) {
        blocks_with_16_structurals++;
      }
      if (block > 0 && last_block_has_16_structurals != block_has_16_structurals) {
        blocks_with_16_structurals_flipped++;
      }
      last_block_has_16_structurals = block_has_16_structurals;
    }
  }
};
// Read the given file into a padded_string, terminating the process (via
// exit_error) if it cannot be loaded.
padded_string load_json(const char *filename) {
  try {
    verbose() << "[verbose] loading " << filename << endl;
    padded_string json = simdjson::get_corpus(filename);
    verbose() << "[verbose] loaded " << filename << " (" << json.size() << " bytes)" << endl;
    return json;
  } catch (const exception &) { // caught by reference to base
    exit_error(string("Could not load the file ") + filename);
    exit(EXIT_FAILURE); // This is not strictly necessary but removes the warning
  }
}
// Simple single-line text progress bar drawn on stderr.
struct progress_bar {
  int max_value;         // value that corresponds to a full bar
  int total_ticks;       // width of the bar in characters
  double ticks_per_value;
  int next_tick;         // first tick not yet drawn

  // Draw an empty bar ("[    ...    ]") and leave the cursor just inside
  // the opening bracket, ready for print() to fill in '=' characters.
  progress_bar(int _max_value, int _total_ticks)
      : max_value(_max_value),
        total_ticks(_total_ticks),
        ticks_per_value(double(_total_ticks)/_max_value),
        next_tick(0) {
    fputc('[', stderr);
    for (int col = 0; col < total_ticks; col++) {
      fputc(' ', stderr);
    }
    fputc(']', stderr);
    for (int col = 0; col < total_ticks+1; col++) {
      fputc('\b', stderr);
    }
  }

  // Advance the bar to reflect `value` (out of max_value), drawing any
  // '=' characters not yet printed. Never exceeds the bar width.
  void print(int value) {
    double target = value*ticks_per_value;
    if (target >= total_ticks) {
      target = total_ticks-1;
    }
    int tick = next_tick;
    while (tick <= target && tick <= total_ticks) {
      fputc('=', stderr);
      tick++;
    }
    next_tick = tick;
  }

  // Blank out the bar: backspace over what was drawn, overwrite with
  // spaces, then backspace again so later output starts at the left edge.
  void erase() {
    for (int col = 0; col < next_tick+1; col++) {
      fputc('\b', stderr);
    }
    for (int col = 0; col <= total_ticks+2; col++) {
      fputc(' ', stderr);
    }
    for (int col = 0; col <= total_ticks+2; col++) {
      fputc('\b', stderr);
    }
  }
};
struct benchmarker {
// JSON text from loading the file. Owns the memory.
const padded_string json;
// JSON filename
const char *filename;
// Parser that will parse the JSON file
const json_parser& parser;
// Event collector that can be turned on to measure cycles, missed branches, etc.
event_collector& collector;
// Statistics about the JSON file independent of its speed (amount of utf-8, structurals, etc.).
// Loaded on first parse.
json_stats* stats;
// Speed and event summary for full parse (not including allocation)
event_aggregate all_stages;
// Speed and event summary for stage 1
event_aggregate stage1;
// Speed and event summary for stage 2
event_aggregate stage2;
// Speed and event summary for allocation
event_aggregate allocate_stage;
benchmarker(const char *_filename, const json_parser& _parser, event_collector& _collector)
: json(load_json(_filename)), filename(_filename), parser(_parser), collector(_collector), stats(NULL) {}
~benchmarker() {
if (stats) {
delete stats;
}
}
int iterations() const {
return all_stages.iterations;
}
really_inline void run_iteration(bool stage1_only=false) {
// Allocate ParsedJson
collector.start();
ParsedJson pj;
bool allocok = pj.allocate_capacity(json.size());
event_count allocate_count = collector.end();
allocate_stage << allocate_count;
if (!allocok) {
exit_error(string("Unable to allocate_stage ") + to_string(json.size()) + " bytes for the JSON result.");
}
verbose() << "[verbose] allocated memory for parsed JSON " << endl;
// Stage 1 (find structurals)
collector.start();
int result = parser.stage1((const uint8_t *)json.data(), json.size(), pj);
event_count stage1_count = collector.end();
stage1 << stage1_count;
if (result != simdjson::SUCCESS) {
exit_error(string("Failed to parse ") + filename + " during stage 1: " + pj.get_error_message());
}
// Stage 2 (unified machine)
event_count stage2_count;
if (!stage1_only || stats == NULL) {
if (!stage1_only) {
collector.start();
}
result = parser.stage2((const uint8_t *)json.data(), json.size(), pj);
if (!stage1_only) {
stage2_count = collector.end();
stage2 << stage2_count;
}
if (result != simdjson::SUCCESS) {
exit_error(string("Failed to parse ") + filename + " during stage 2: " + pj.get_error_message());
}
}
all_stages << (stage1_count + stage2_count);
// Calculate stats the first time we parse
if (stats == NULL) {
stats = new json_stats(json, pj);
}
}
really_inline void run_iterations(size_t iterations, bool stage1_only=false) {
for (size_t i = 0; i<iterations; i++) {
run_iteration(stage1_only);
}
}
double stage1_ns_per_block() {
return stage1.elapsed_ns() / stats->blocks;
}
template<typename T>
void print_aggregate(const char* prefix, const T& stage) const {
printf("%s%-13s: %8.4f ns per block (%5.1f %%) - %8.4f ns per byte - %8.4f ns per structural - %8.3f GB/s\n",
prefix,
"Speed",
stage.elapsed_ns() / stats->blocks, // per block
100.0 * stage.elapsed_sec() / all_stages.elapsed_sec(), // %
stage.elapsed_ns() / stats->bytes, // per byte
stage.elapsed_ns() / stats->structurals, // per structural
(json.size() / 1000000000.0) / stage.elapsed_sec() // GB/s
);
if (collector.has_events()) {
printf("%s%-13s: %2.3f per block (%5.2f %%) - %2.3f per byte - %2.3f per structural - %2.3f GHz est. frequency\n",
prefix,
"Cycles",
stage.cycles() / stats->blocks,
100.0 * stage.cycles() / all_stages.cycles(),
stage.cycles() / stats->bytes,
stage.cycles() / stats->structurals,
(stage.cycles() / stage.elapsed_sec()) / 1000000000.0
);
printf("%s%-13s: %2.2f per block (%5.2f %%) - %2.2f per byte - %2.2f per structural - %2.2f per cycle\n",
prefix,
"Instructions",
stage.instructions() / stats->blocks,
100.0 * stage.instructions() / all_stages.instructions(),
stage.instructions() / stats->bytes,
stage.instructions() / stats->structurals,
stage.instructions() / stage.cycles()
);
// NOTE: removed cycles/miss because it is a somewhat misleading stat
printf("%s%-13s: %2.2f branch misses (%5.2f %%) - %2.2f cache misses (%5.2f %%) - %2.2f cache references\n",
prefix,
"Misses",
stage.branch_misses(),
100.0 * stage.branch_misses() / all_stages.branch_misses(),
stage.cache_misses(),
100.0 * stage.cache_misses() / all_stages.cache_misses(),
stage.cache_references()
);
}
}
void print(bool tabbed_output) const {
if (tabbed_output) {
char* filename_copy = (char*)malloc(strlen(filename)+1);
strcpy(filename_copy, filename);
#if defined(__linux__)
char* base = ::basename(filename_copy);
#else
char* base = filename_copy;
#endif
if (strlen(base) >= 5 && !strcmp(base+strlen(base)-5, ".json")) {
base[strlen(base)-5] = '\0';
}
double gb = json.size() / 1000000000.0;
if (collector.has_events()) {
printf("\"%s\"\t%f\t%f\t%f\t%f\t%f\t%f\t%f\n",
base,
allocate_stage.best.cycles() / json.size(),
stage1.best.cycles() / json.size(),
stage2.best.cycles() / json.size(),
all_stages.best.cycles() / json.size(),
gb / all_stages.best.elapsed_sec(),
gb / stage1.best.elapsed_sec(),
gb / stage2.best.elapsed_sec());
} else {
printf("\"%s\"\t\t\t\t\t%f\t%f\t%f\n",
base,
gb / all_stages.best.elapsed_sec(),
gb / stage1.best.elapsed_sec(),
gb / stage2.best.elapsed_sec());
}
free(filename_copy);
} else {
printf("\n");
printf("%s\n", filename);
printf("%s\n", string(strlen(filename), '=').c_str());
printf("%9zu blocks - %10zu bytes - %5zu structurals (%5.1f %%)\n", stats->bytes / BYTES_PER_BLOCK, stats->bytes, stats->structurals, 100.0 * stats->structurals / stats->bytes);
if (stats) {
printf("special blocks with: utf8 %9zu (%5.1f %%) - 0 structurals %9zu (%5.1f %%) - 1+ structurals %9zu (%5.1f %%) - 8+ structurals %9zu (%5.1f %%) - 16+ structurals %9zu (%5.1f %%)\n",
stats->blocks_with_utf8, 100.0 * stats->blocks_with_utf8 / stats->blocks,
stats->blocks_with_0_structurals, 100.0 * stats->blocks_with_0_structurals / stats->blocks,
stats->blocks_with_1_structural, 100.0 * stats->blocks_with_1_structural / stats->blocks,
stats->blocks_with_8_structurals, 100.0 * stats->blocks_with_8_structurals / stats->blocks,
stats->blocks_with_16_structurals, 100.0 * stats->blocks_with_16_structurals / stats->blocks);
printf("special block flips: utf8 %9zu (%5.1f %%) - 0 structurals %9zu (%5.1f %%) - 1+ structurals %9zu (%5.1f %%) - 8+ structurals %9zu (%5.1f %%) - 16+ structurals %9zu (%5.1f %%)\n",
stats->blocks_with_utf8_flipped, 100.0 * stats->blocks_with_utf8_flipped / stats->blocks,
stats->blocks_with_1_structural_flipped, 100.0 * stats->blocks_with_1_structural_flipped / stats->blocks,
stats->blocks_with_0_structurals_flipped, 100.0 * stats->blocks_with_0_structurals_flipped / stats->blocks,
stats->blocks_with_8_structurals_flipped, 100.0 * stats->blocks_with_8_structurals_flipped / stats->blocks,
stats->blocks_with_16_structurals_flipped, 100.0 * stats->blocks_with_16_structurals_flipped / stats->blocks);
}
printf("\n");
printf("All Stages\n");
print_aggregate("| " , all_stages.best);
// printf("|- Allocation\n");
// print_aggregate("| ", allocate_stage.best);
printf("|- Stage 1\n");
print_aggregate("| ", stage1.best);
printf("|- Stage 2\n");
print_aggregate("| ", stage2.best);
}
}
};
#endif

152
benchmark/event_counter.h Normal file
View File

@ -0,0 +1,152 @@
#ifndef __EVENT_COUNTER_H
#define __EVENT_COUNTER_H
#include <cassert>
#include <cctype>
#ifndef _MSC_VER
#include <dirent.h>
#include <unistd.h>
#endif
#include <cinttypes>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <algorithm>
#include <chrono>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <vector>
#include "linux-perf-events.h"
#ifdef __linux__
#include <libgen.h>
#endif
//#define DEBUG
#include "simdjson/common_defs.h"
#include "simdjson/isadetection.h"
using std::string;
using std::vector;
using std::chrono::steady_clock;
using std::chrono::time_point;
using std::chrono::duration;
// One measurement sample: wall-clock time plus the five hardware counter
// values read by event_collector (indexed by event_counter_types).
struct event_count {
  std::chrono::duration<double> elapsed;
  std::vector<unsigned long long> event_counts;

  event_count() : elapsed(0), event_counts{0,0,0,0,0} {}
  // PERF FIX: the counter vector is taken by const reference; the original
  // took it by value and copied it an extra time on every sample.
  event_count(const std::chrono::duration<double> _elapsed, const std::vector<unsigned long long>& _event_counts) : elapsed(_elapsed), event_counts(_event_counts) {}
  event_count(const event_count& other): elapsed(other.elapsed), event_counts(other.event_counts) { }

  // The types of counters (so we can read the getter more easily)
  enum event_counter_types {
    CPU_CYCLES,
    INSTRUCTIONS,
    BRANCH_MISSES,
    CACHE_REFERENCES,
    CACHE_MISSES
  };

  // Elapsed wall-clock time in seconds / nanoseconds.
  double elapsed_sec() const { return std::chrono::duration<double>(elapsed).count(); }
  double elapsed_ns() const { return std::chrono::duration<double, std::nano>(elapsed).count(); }
  // Counter accessors (returned as double for easy ratio arithmetic).
  double cycles() const { return event_counts[CPU_CYCLES]; }
  double instructions() const { return event_counts[INSTRUCTIONS]; }
  double branch_misses() const { return event_counts[BRANCH_MISSES]; }
  double cache_references() const { return event_counts[CACHE_REFERENCES]; }
  double cache_misses() const { return event_counts[CACHE_MISSES]; }

  // PERF FIX: operators take const& instead of by-value — each by-value call
  // copied the five-element counter vector in the benchmark hot loop.
  event_count& operator=(const event_count& other) {
    this->elapsed = other.elapsed;
    this->event_counts = other.event_counts;
    return *this;
  }
  event_count operator+(const event_count& other) const {
    return event_count(elapsed+other.elapsed, {
      event_counts[0]+other.event_counts[0],
      event_counts[1]+other.event_counts[1],
      event_counts[2]+other.event_counts[2],
      event_counts[3]+other.event_counts[3],
      event_counts[4]+other.event_counts[4],
    });
  }
  void operator+=(const event_count& other) {
    *this = *this + other;
  }
};
// Running aggregate over many event_count samples: keeps the sum plus the
// best (fastest) and worst (slowest) sample by elapsed time.
struct event_aggregate {
  int iterations = 0;   // number of samples recorded so far
  event_count total;    // sum of all samples
  event_count best;     // fastest sample
  event_count worst;    // slowest sample

  event_aggregate() {}

  // Record one sample. PERF FIX: takes const& — the original passed the
  // event_count (and its counter vector) by value on every call in the
  // benchmark loop.
  void operator<<(const event_count& other) {
    if (iterations == 0 || other.elapsed < best.elapsed) {
      best = other;
    }
    if (iterations == 0 || other.elapsed > worst.elapsed) {
      worst = other;
    }
    iterations++;
    total += other;
  }

  // Averages over all recorded samples (callers must record at least one
  // sample first; otherwise these divide by zero).
  double elapsed_sec() const { return total.elapsed_sec() / iterations; }
  double elapsed_ns() const { return total.elapsed_ns() / iterations; }
  double cycles() const { return total.cycles() / iterations; }
  double instructions() const { return total.instructions() / iterations; }
  double branch_misses() const { return total.branch_misses() / iterations; }
  double cache_references() const { return total.cache_references() / iterations; }
  double cache_misses() const { return total.cache_misses() / iterations; }
};
// Wraps Linux perf hardware counters (when available) plus a steady_clock
// timer behind a start()/end() pair. On non-Linux platforms only wall-clock
// time is collected.
struct event_collector {
  // Reused result buffer; end() returns a reference to it.
  event_count count;
  time_point<steady_clock> start_clock;

#if defined(__linux__)
  LinuxEvents<PERF_TYPE_HARDWARE> linux_events;
  event_collector() : linux_events(vector<int>{
    PERF_COUNT_HW_CPU_CYCLES,
    PERF_COUNT_HW_INSTRUCTIONS,
    PERF_COUNT_HW_BRANCH_MISSES,
    PERF_COUNT_HW_CACHE_REFERENCES,
    PERF_COUNT_HW_CACHE_MISSES
  }) {}
  // True when the perf counters opened successfully.
  bool has_events() {
    return linux_events.is_working();
  }
#else
  bool has_events() {
    return false;
  }
#endif

  // NOTE(review): the guards below test __linux while the member section
  // above tests __linux__ — both are defined by gcc/clang on Linux, but
  // they should be made consistent; verify on other compilers.
  really_inline void start() {
#if defined(__linux)
    linux_events.start();
#endif
    start_clock = steady_clock::now();
  }
  // Stop the measurement and return the counters/elapsed time. The clock is
  // read before the perf counters are so the timing excludes counter readout.
  really_inline event_count& end() {
    time_point<steady_clock> end_clock = steady_clock::now();
#if defined(__linux)
    linux_events.end(count.event_counts);
#endif
    count.elapsed = end_clock - start_clock;
    return count;
  }
};
#endif

114
benchmark/genfeaturejson.rb Normal file
View File

@ -0,0 +1,114 @@
# Generates fixed-size synthetic JSON files composed of 64-byte chunks, used
# by benchfeatures.cpp to measure the per-block cost of stage 1 features.
class ChunkWriter
  # BUG FIX: use instance variables instead of class variables (@@...) —
  # class variables are shared across every ChunkWriter instance, so a second
  # writer with different settings would clobber the first.
  def initialize(output_dir, miss_templates, file_size=640*1000, block_size=64)
    @output_dir = output_dir
    @miss_templates = miss_templates
    @file_size = file_size
    @block_size = block_size
  end

  # Pad each chunk to exactly @block_size bytes, terminated by a newline
  # (or a space when include_newline is false).
  def prepare_chunk(chunks, include_newline)
    Array(chunks).map do |chunk|
      "#{chunk}#{' '*(@block_size-chunk.bytesize-1)}#{include_newline ? "\n" : " "}"
    end.join("")
  end

  # Write the three variants for a feature: <name>-full.json (every block has
  # the feature), <name>.json (half/half), <name>-miss.json (alternating per
  # the miss template, to provoke branch mispredictions).
  # BUG FIX: restored the "#{filename}" interpolations, which were mangled
  # to "#(unknown)" in this copy of the file.
  def write_files(filename, start1, repeat1, end1, repeat2: '', include_newline: true)
    start1 = prepare_chunk(start1, include_newline)
    repeat1 = prepare_chunk(repeat1, include_newline)
    end1 = prepare_chunk(end1, include_newline)
    write_full(File.join(@output_dir, "#{filename}-full.json"), start1, repeat1, end1)
    repeat2 = prepare_chunk(repeat2, include_newline)
    repeat2 = repeat2 * (repeat1.bytesize/repeat2.bytesize)
    write_half(File.join(@output_dir, "#{filename}.json"), start1, repeat1, end1, repeat2)
    write_half_miss(File.join(@output_dir, "#{filename}-miss.json"), start1, repeat1, end1, repeat2)
  end

  # Every block between start1 and end1 is repeat1.
  def write_full(filename, start1, repeat1, end1)
    puts "Writing #{filename} ..."
    File.open(filename, "w") do |file|
      write_chunks(file, start1, repeat1, end1, @file_size)
    end
    raise "OMG wrong file size #{File.size(filename)} (should be #{@file_size})" if File.size(filename) != @file_size
  end

  # First half repeat1, second half repeat2.
  def write_half(filename, start1, repeat1, end1, repeat2)
    # repeat1 is already represented in start1 and end1, so it doesn't need quite
    # half the iterations.
    repeat1_len = (@file_size/2) - start1.bytesize - end1.bytesize
    halfway_point = start1.bytesize + repeat1_len + repeat2.bytesize
    puts "Writing #{filename} ..."
    File.open(filename, "w") do |file|
      write_chunks(file, start1, repeat1, repeat2, halfway_point)
      write_chunks(file, repeat2, repeat2, end1, @file_size-halfway_point)
    end
    raise "OMG wrong file size #{File.size(filename)} (should be #{@file_size})" if File.size(filename) != @file_size
  end

  # Alternate repeat1/repeat2 according to the precomputed 0/1 miss template
  # for this repeat size.
  def write_half_miss(filename, start1, repeat1, end1, repeat2)
    miss_template = Array(File.read(File.join(@miss_templates, "#{repeat1.bytesize}.txt")).chomp.split("\n"))
    # Take the start and end out of the template
    repeat_template = miss_template[(start1.bytesize/64)..(-end1.bytesize/64-1)]
    # If repeat is 128 bytes, each *pair* of elements is set. Use that.
    repeat_chunks = repeat1.bytesize/64
    repeat_template = (repeat_chunks - 1).step(repeat_template.size - 1, repeat_chunks).map { |i| repeat_template[i] }
    puts "Writing #{filename} ..."
    File.open(filename, "w") do |file|
      file.write(start1)
      repeat_template.each do |should_repeat|
        file.write(should_repeat == "1" ? repeat1 : repeat2)
      end
      file.write(end1)
    end
    raise "OMG wrong file size #{File.size(filename)} (should be #{@file_size})" if File.size(filename) != @file_size
  end

  # Write start1, then repeat1 until `size` minus end1 is filled, then end1.
  # Returns the number of bytes written.
  def write_chunks(file, start1, repeat1, end1, size)
    pos = 0
    file.write(start1)
    pos += start1.bytesize
    repeat_end = size-end1.bytesize
    loop do
      file.write(repeat1)
      pos += repeat1.bytesize
      break if pos >= repeat_end
    end
    file.write(end1)
    pos += end1.bytesize
    return pos
  end
end
# Generate the benchmark inputs under jsonexamples/generated/ using the
# precomputed miss templates that live next to this script.
output_dir = File.expand_path("../jsonexamples/generated", File.dirname(__FILE__))
miss_templates = File.expand_path("miss-templates", File.dirname(__FILE__))
Dir.mkdir(output_dir) unless File.directory?(output_dir)
w = ChunkWriter.new(output_dir, miss_templates)
# utf-8: same structural layout as 7-structurals, but with multi-byte
# characters inside the strings.
w.write_files "utf-8", '["֏","֏",{}', ',"֏","֏",{}', ',"֏","֏","֏"]', repeat2: ',"ab","ab",{}'
w.write_files "0-structurals", '"ab"', '', ''
# Only a sample of structural densities is generated; the commented-out
# lines document the rest of the series.
# w.write_files "1-structurals", [ '[', '"ab"' ], [ ',', '"ab"' ], [ ',', '{', '}', ']' ]
# w.write_files "2-structurals", '["ab"', ',"ab"', [',{', '}]']
# w.write_files "3-structurals", '[{}', ',{}', ',"ab"]'
# w.write_files "4-structurals", '["ab","ab"', ',"ab","ab"', ',{}]'
# w.write_files "5-structurals", '["ab",{}', ',"ab",{}', ',"ab","ab"]'
# w.write_files "6-structurals", '["ab","ab","ab"', ',"ab","ab","ab"', ',"ab",{}]'
w.write_files "7-structurals", '["ab","ab",{}', ',"ab","ab",{}', ',"ab","ab","ab"]'
# w.write_files "8-structurals", '["ab","ab","ab","ab"', ',"ab","ab","ab","ab"', ',"ab","ab",{}]'
# w.write_files "9-structurals", '["ab","ab","ab",{}', ',"ab","ab","ab",{}', ',"ab","ab","ab","ab"]'
# w.write_files "10-structurals", '["ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab","ab"', ',"ab","ab","ab",{}]'
# w.write_files "11-structurals", '["ab","ab","ab","ab",{}', ',"ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab"]'
# w.write_files "12-structurals", '["ab","ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab",{}]'
# w.write_files "13-structurals", '["ab","ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab","ab"]'
# w.write_files "14-structurals", '["ab","ab","ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab","ab",{}]'
w.write_files "15-structurals", '["ab","ab","ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab","ab","ab"]'
# w.write_files "16-structurals", '["ab","ab","ab","ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab","ab","ab",{}]'
# w.write_files "17-structurals", '["ab","ab","ab","ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab","ab","ab","ab"]'
# w.write_files "18-structurals", '["ab","ab","ab","ab","ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab","ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab","ab","ab","ab",{}]'
# w.write_files "19-structurals", '["ab","ab","ab","ab","ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab","ab","ab","ab","ab"]'
# w.write_files "20-structurals", '["ab","ab","ab","ab","ab","ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab","ab","ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab","ab","ab","ab","ab",{}]'
# w.write_files "21-structurals", '["ab","ab","ab","ab","ab","ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab","ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab","ab","ab","ab","ab","ab"]'
# w.write_files "22-structurals", '["ab","ab","ab","ab","ab","ab","ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab","ab","ab","ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab","ab","ab","ab","ab","ab",{}]'
w.write_files "23-structurals", '["ab","ab","ab","ab","ab","ab","ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab","ab","ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab","ab","ab","ab","ab","ab","ab"]'

View File

@ -0,0 +1,49 @@
# Searches seeds 1..1000000 for Random seeds that, when used to flip between
# two block types every repeat_blocks blocks, produce exactly half the file
# as "type 1" blocks with exactly a 25% flip rate. Matching seeds are
# printed as they are found, then again as a list at the end.
#
# start_blocks / end_blocks: blocks pinned to type 1 at the file's start and
# end; repeat_blocks: granularity at which the type may flip.
# (Presumably these seeds feed the miss-template generation — TODO confirm.)
#
# Cleanup vs. previous revision: removed the unused `closest_flips`
# variable and the dead `type1 = true` store after the loop; merged the
# nested target checks. Output is unchanged.
def gen_seeds(start_blocks, repeat_blocks, end_blocks)
  total_size = 640*1000
  total_blocks = total_size/64
  seed_space = 1..1000000
  # Targets: half the blocks are type 1, and a quarter of the block groups
  # flip type. Scale the per-group flip probability by the group size.
  target_blocks = total_blocks*0.5
  target_flips = total_blocks*0.25
  percent_flips = 0.25*repeat_blocks
  puts "Seeds for #{start_blocks} start blocks, #{end_blocks} end blocks and #{repeat_blocks} repeat blocks: #{percent_flips*100}% flips"
  closest_seeds = []
  seed_space.each do |seed|
    r = Random.new(seed)
    # First block is always type 1
    flips = 0
    type1 = true
    type1_blocks = start_blocks
    finished_blocks = start_blocks
    last_repeat = total_blocks-end_blocks
    while finished_blocks < last_repeat
      if r.rand < percent_flips
        flips += 1
        type1 = !type1
      end
      type1_blocks += repeat_blocks if type1
      finished_blocks += repeat_blocks
    end
    # Last one is always type 1, so count a forced flip back if we ended on
    # type 2.
    flips += 1 if !type1
    type1_blocks += end_blocks
    finished_blocks += end_blocks
    raise "simulated the wrong number of blocks #{finished_blocks}" if finished_blocks != total_blocks
    if type1_blocks == target_blocks && flips == target_flips
      puts seed
      closest_seeds << seed
    end
  end
  puts closest_seeds
end
# Search for usable seeds at each (start, repeat, end) block configuration.
gen_seeds(1,1,1)
gen_seeds(1,1,2)
gen_seeds(2,2,4)

113
benchmark/json_parser.h Normal file
View File

@ -0,0 +1,113 @@
#ifndef __JSON_PARSER_H
#define __JSON_PARSER_H
#include <cassert>
#include <cctype>
#ifndef _MSC_VER
#include <dirent.h>
#include <unistd.h>
#endif
#include <cinttypes>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <algorithm>
#include <chrono>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <vector>
#include "linux-perf-events.h"
#ifdef __linux__
#include <libgen.h>
#endif
//#define DEBUG
#include "simdjson/common_defs.h"
#include "simdjson/isadetection.h"
#include "simdjson/jsonioutil.h"
#include "simdjson/jsonparser.h"
#include "simdjson/parsedjson.h"
#include "simdjson/stage1_find_marks.h"
#include "simdjson/stage2_build_tape.h"
// NOTE(review): `using namespace` / `using std::` in a header leaks into
// every includer — acceptable for a benchmark-only helper, but confirm.
using namespace simdjson;
using std::string;

// Common signature of the stage 1 (find_structural_bits) and stage 2
// (unified_machine) entry points dispatched below.
using stage2_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj);
using stage1_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj);
// Returns the stage 1 (find_structural_bits) implementation compiled for
// the requested architecture. Architectures not compiled into this binary
// are unsupported: print an error and terminate the process.
stage1_functype* get_stage1_func(const Architecture architecture) {
#ifdef IS_X86_64
  if (architecture == Architecture::HASWELL) {
    return &find_structural_bits<Architecture::HASWELL>;
  }
  if (architecture == Architecture::WESTMERE) {
    return &find_structural_bits<Architecture::WESTMERE>;
  }
#endif
#ifdef IS_ARM64
  if (architecture == Architecture::ARM64) {
    return &find_structural_bits<Architecture::ARM64>;
  }
#endif
  std::cerr << "The processor is not supported by simdjson." << std::endl;
  exit(EXIT_FAILURE);
}
// Returns the stage 2 (unified_machine) implementation compiled for the
// requested architecture. Architectures not compiled into this binary are
// unsupported: print an error and terminate the process.
// Fix: removed the unreachable `break;` statements after each `return`.
stage2_functype* get_stage2_func(const Architecture architecture) {
  switch (architecture) {
#ifdef IS_X86_64
  case Architecture::HASWELL:
    return &unified_machine<Architecture::HASWELL>;
  case Architecture::WESTMERE:
    return &unified_machine<Architecture::WESTMERE>;
#endif
#ifdef IS_ARM64
  case Architecture::ARM64:
    return &unified_machine<Architecture::ARM64>;
#endif
  default:
    std::cerr << "The processor is not supported by simdjson." << std::endl;
    exit(EXIT_FAILURE);
  }
}
struct json_parser {
const Architecture architecture;
const stage1_functype *stage1_func;
const stage2_functype *stage2_func;
json_parser(const Architecture _architecture) : architecture(_architecture) {
this->stage1_func = get_stage1_func(architecture);
this->stage2_func = get_stage2_func(architecture);
}
json_parser() : json_parser(find_best_supported_architecture()) {}
int stage1(const uint8_t *buf, const size_t len, ParsedJson &pj) const {
return this->stage1_func(buf, len, pj);
}
int stage2(const uint8_t *buf, const size_t len, ParsedJson &pj) const {
return this->stage2_func(buf, len, pj);
}
int parse(const uint8_t *buf, const size_t len, ParsedJson &pj) const {
int result = this->stage1(buf, len, pj);
if (result == SUCCESS) {
result = this->stage2(buf, len, pj);
}
return result;
}
};
#endif

View File

@ -83,6 +83,10 @@ public:
} }
} }
bool is_working() {
return working;
}
private: private:
void report_error(const std::string &context) { void report_error(const std::string &context) {
if (working) if (working)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,6 @@
#include "json_parser.h"
#include "event_counter.h"
#include <cassert> #include <cassert>
#include <cctype> #include <cctype>
#ifndef _MSC_VER #ifndef _MSC_VER
@ -35,405 +38,179 @@
#include "simdjson/stage1_find_marks.h" #include "simdjson/stage1_find_marks.h"
#include "simdjson/stage2_build_tape.h" #include "simdjson/stage2_build_tape.h"
// Global arguments #include <functional>
bool find_marks_only = false;
bool verbose = false;
bool dump = false;
bool json_output = false;
bool force_one_iteration = false;
bool just_data = false;
bool force_sse = false;
int32_t iterations = -1;
int32_t warmup_iterations = -1;
namespace simdjson { #include "benchmarker.h"
Architecture _find_best_supported_implementation() {
constexpr uint32_t haswell_flags =
instruction_set::AVX2 | instruction_set::PCLMULQDQ |
instruction_set::BMI1 | instruction_set::BMI2;
constexpr uint32_t westmere_flags =
instruction_set::SSE42 | instruction_set::PCLMULQDQ;
uint32_t supports = detect_supported_architectures();
// Order from best to worst (within architecture)
if ((haswell_flags & supports) == haswell_flags && !force_sse) {
return Architecture::HASWELL;
}
if ((westmere_flags & supports) == westmere_flags) {
return Architecture::WESTMERE;
}
if (instruction_set::NEON)
return Architecture::ARM64;
return Architecture::NONE; using namespace simdjson;
using std::cerr;
using std::cout;
using std::endl;
using std::string;
using std::to_string;
using std::vector;
using std::ostream;
using std::ofstream;
using std::exception;
// Stash the exe_name in main() for functions to use
char* exe_name;
void print_usage(ostream& out) {
out << "Usage: " << exe_name << " [-vt] [-n #] [-s STAGE] [-a ARCH] <jsonfile> ..." << endl;
out << endl;
out << "Runs the parser against the given json files in a loop, measuring speed and other statistics." << endl;
out << endl;
out << "Options:" << endl;
out << endl;
out << "-n # - Number of iterations per file. Default: 200" << endl;
out << "-i # - Number of times to iterate a single file before moving to the next. Default: 20" << endl;
out << "-t - Tabbed data output" << endl;
out << "-v - Verbose output." << endl;
out << "-s STAGE - Stop after the given stage." << endl;
out << " -s stage1 - Stop after find_structural_bits." << endl;
out << " -s all - Run all stages." << endl;
out << "-a ARCH - Use the parser with the designated architecture (HASWELL, WESTMERE" << endl;
out << " or ARM64). By default, detects best supported architecture." << endl;
} }
using unified_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj); void exit_usage(string message) {
using stage1_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj); cerr << message << endl;
cerr << endl;
extern unified_functype *unified_ptr; print_usage(cerr);
exit(EXIT_FAILURE);
extern stage1_functype *stage1_ptr;
int unified_machine_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj) {
if (find_marks_only) {
return simdjson::SUCCESS;
}
Architecture best_implementation = _find_best_supported_implementation();
// Selecting the best implementation
switch (best_implementation) {
#ifdef IS_X86_64
case Architecture::HASWELL:
unified_ptr = &unified_machine<Architecture::HASWELL>;
break;
case Architecture::WESTMERE:
unified_ptr = &unified_machine<Architecture::WESTMERE>;
break;
#endif
#ifdef IS_ARM64
case Architecture::ARM64:
unified_ptr = &unified_machine<Architecture::ARM64>;
break;
#endif
default:
std::cerr << "The processor is not supported by simdjson." << std::endl;
return simdjson::UNEXPECTED_ERROR;
}
return unified_ptr(buf, len, pj);
} }
// Responsible to select the best json_parse implementation struct option_struct {
int find_structural_bits_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj) { vector<char*> files;
Architecture best_implementation = _find_best_supported_implementation(); Architecture architecture = Architecture::UNSUPPORTED;
// Selecting the best implementation bool stage1_only = false;
switch (best_implementation) {
#ifdef IS_X86_64 int32_t iterations = 200;
case Architecture::HASWELL: int32_t iteration_step = 50;
stage1_ptr = &find_structural_bits<Architecture::HASWELL>;
break; bool verbose = false;
case Architecture::WESTMERE: bool tabbed_output = false;
stage1_ptr = &find_structural_bits<Architecture::WESTMERE>;
break; option_struct(int argc, char **argv) {
#endif #ifndef _MSC_VER
#ifdef IS_ARM64 int c;
case Architecture::ARM64:
stage1_ptr = &find_structural_bits<Architecture::ARM64>; while ((c = getopt(argc, argv, "vtn:i:a:s:")) != -1) {
break; switch (c) {
#endif case 'n':
default: iterations = atoi(optarg);
std::cerr << "The processor is not supported by simdjson." << std::endl; break;
return simdjson::UNEXPECTED_ERROR; case 'i':
iteration_step = atoi(optarg);
break;
case 't':
tabbed_output = true;
break;
case 'v':
verbose = true;
break;
case 'a':
architecture = parse_architecture(optarg);
if (architecture == Architecture::UNSUPPORTED) {
exit_usage(string("Unsupported option value -a ") + optarg + ": expected -a HASWELL, WESTMERE or ARM64");
}
break;
case 's':
if (!strcmp(optarg, "stage1")) {
stage1_only = true;
} else if (!strcmp(optarg, "all")) {
stage1_only = false;
} else {
exit_usage(string("Unsupported option value -s ") + optarg + ": expected -s stage1 or all");
}
break;
default:
exit_error("Unexpected argument " + c);
}
}
#else
int optind = 1;
#endif
// If architecture is not specified, pick the best supported architecture by default
if (architecture == Architecture::UNSUPPORTED) {
architecture = find_best_supported_architecture();
}
// All remaining arguments are considered to be files
for (int i=optind; i<argc; i++) {
files.push_back(argv[i]);
}
if (files.empty()) {
exit_usage("No files specified");
}
// Keeps the numbers the same for CI (old ./parse didn't have a two-stage loop)
if (files.size() == 1) {
iteration_step = iterations;
}
#if !defined(__linux__)
if (tabbed_output) {
exit_error("tabbed_output (-t) flag only works under linux.\n");
}
#endif
} }
};
return stage1_ptr(buf, len, pj);
}
stage1_functype *stage1_ptr = &find_structural_bits_dispatch;
unified_functype *unified_ptr = &unified_machine_dispatch;
} // namespace simdjson
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
// Read options
#ifndef _MSC_VER exe_name = argv[0];
int c; option_struct options(argc, argv);
if (options.verbose) {
while ((c = getopt(argc, argv, "1vdtn:w:fs")) != -1) { verbose_stream = &cout;
switch (c) {
case 'n':
iterations = atoi(optarg);
break;
case 'w':
warmup_iterations = atoi(optarg);
break;
case 's':
force_sse = true;
break;
case 't':
just_data = true;
break;
case 'v':
verbose = true;
break;
case 'd':
dump = true;
break;
case 'j':
json_output = true;
break;
case '1':
force_one_iteration = true;
break;
case 'f':
find_marks_only = true;
break;
default:
abort();
}
}
#else
int optind = 1;
#endif
if (optind >= argc) {
std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
exit(1);
}
const char *filename = argv[optind];
if (optind + 1 < argc) {
std::cerr << "warning: ignoring everything after " << argv[optind + 1]
<< std::endl;
}
if (verbose) {
std::cout << "[verbose] loading " << filename << std::endl;
}
simdjson::padded_string p;
try {
simdjson::get_corpus(filename).swap(p);
} catch (const std::exception &) { // caught by reference to base
std::cout << "Could not load the file " << filename << std::endl;
return EXIT_FAILURE;
}
if (verbose) {
std::cout << "[verbose] loaded " << filename << " (" << p.size()
<< " bytes)" << std::endl;
}
if (iterations == -1) {
#if defined(DEBUG)
iterations = 1;
#else
iterations = force_one_iteration ? 1 : (p.size() < 1 * 1000 * 1000 ? 1000 : 10);
#endif
}
if (warmup_iterations == -1) {
#if defined(DEBUG)
warmup_iterations = 0;
#else
warmup_iterations = (p.size() < 1 * 1000 * 1000) ? 10 : 1;
#endif
} }
std::vector<double> res; // Start collecting events. We put this early so if it prints an error message, it's the
res.resize(iterations); // first thing printed.
if (!just_data) event_collector collector;
printf("number of iterations %u \n", iterations);
#if !defined(__linux__)
#define SQUASH_COUNTERS
if (just_data) {
printf("just_data (-t) flag only works under linux.\n");
}
#endif
{ // practice run
simdjson::ParsedJson pj;
bool allocok = pj.allocate_capacity(p.size());
if (allocok) {
simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj);
simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj);
}
}
#ifndef SQUASH_COUNTERS
std::vector<int> evts;
evts.push_back(PERF_COUNT_HW_CPU_CYCLES);
evts.push_back(PERF_COUNT_HW_INSTRUCTIONS);
evts.push_back(PERF_COUNT_HW_BRANCH_MISSES);
evts.push_back(PERF_COUNT_HW_CACHE_REFERENCES);
evts.push_back(PERF_COUNT_HW_CACHE_MISSES);
LinuxEvents<PERF_TYPE_HARDWARE> unified(evts);
std::vector<unsigned long long> results;
results.resize(evts.size());
unsigned long cy0 = 0, cy1 = 0, cy2 = 0;
unsigned long cl0 = 0, cl1 = 0, cl2 = 0;
unsigned long mis0 = 0, mis1 = 0, mis2 = 0;
unsigned long cref0 = 0, cref1 = 0, cref2 = 0;
unsigned long cmis0 = 0, cmis1 = 0, cmis2 = 0;
#endif
// Do warmup iterations // Print preamble
bool isok = true; if (!options.tabbed_output) {
for (int32_t i = 0; i < warmup_iterations; i++) { printf("number of iterations %u \n", options.iterations);
if (verbose) {
std::cout << "[verbose] warmup iteration # " << i << std::endl;
}
simdjson::ParsedJson pj;
bool allocok = pj.allocate_capacity(p.size());
if (!allocok) {
std::cerr << "failed to allocate memory" << std::endl;
return EXIT_FAILURE;
}
isok = (simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj) ==
simdjson::SUCCESS);
isok = isok &&
(simdjson::SUCCESS ==
simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj));
if (!isok) {
std::cerr << pj.get_error_message() << std::endl;
std::cerr << "Could not parse. " << std::endl;
return EXIT_FAILURE;
}
} }
#ifndef SQUASH_COUNTERS // Set up benchmarkers by reading all files
for (int32_t i = 0; i < iterations; i++) { json_parser parser(options.architecture);
if (verbose) { vector<benchmarker*> benchmarkers;
std::cout << "[verbose] iteration # " << i << std::endl; for (size_t i=0; i<options.files.size(); i++) {
} benchmarkers.push_back(new benchmarker(options.files[i], parser, collector));
unified.start();
simdjson::ParsedJson pj;
bool allocok = pj.allocate_capacity(p.size());
if (!allocok) {
std::cerr << "failed to allocate memory" << std::endl;
return EXIT_FAILURE;
}
unified.end(results);
cy0 += results[0];
cl0 += results[1];
mis0 += results[2];
cref0 += results[3];
cmis0 += results[4];
if (verbose) {
std::cout << "[verbose] allocated memory for parsed JSON " << std::endl;
}
unified.start();
isok = (simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj) ==
simdjson::SUCCESS);
unified.end(results);
cy1 += results[0];
cl1 += results[1];
mis1 += results[2];
cref1 += results[3];
cmis1 += results[4];
if (!isok) {
std::cout << "Failed during stage 1" << std::endl;
break;
}
unified.start();
isok = isok &&
(simdjson::SUCCESS ==
simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj));
unified.end(results);
cy2 += results[0];
cl2 += results[1];
mis2 += results[2];
cref2 += results[3];
cmis2 += results[4];
if (!isok) {
std::cout << "Failed during stage 2" << std::endl;
break;
}
} }
#endif
// we do it again, this time just measuring the elapsed time // Run the benchmarks
for (int32_t i = 0; i < iterations; i++) { progress_bar progress(options.iterations, 50);
if (verbose) { // Put the if (options.stage1_only) *outside* the loop so that run_iterations will be optimized
std::cout << "[verbose] iteration # " << i << std::endl; if (options.stage1_only) {
} for (int iteration = 0; iteration < options.iterations; iteration += options.iteration_step) {
simdjson::ParsedJson pj; if (!options.verbose) { progress.print(iteration); }
bool allocok = pj.allocate_capacity(p.size()); // Benchmark each file once per iteration
if (!allocok) { for (size_t f=0; f<options.files.size(); f++) {
std::cerr << "failed to allocate memory" << std::endl; verbose() << "[verbose] " << benchmarkers[f]->filename << " iterations #" << iteration << "-" << (iteration+options.iteration_step-1) << endl;
return EXIT_FAILURE; benchmarkers[f]->run_iterations(options.iteration_step, true);
}
if (verbose) {
std::cout << "[verbose] allocated memory for parsed JSON " << std::endl;
}
auto start = std::chrono::steady_clock::now();
isok = (simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj) ==
simdjson::SUCCESS);
isok = isok &&
(simdjson::SUCCESS ==
simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj));
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> secs = end - start;
res[i] = secs.count();
if (!isok) {
std::cerr << pj.get_error_message() << std::endl;
std::cerr << "Could not parse. " << std::endl;
return EXIT_FAILURE;
}
}
simdjson::ParsedJson pj =
build_parsed_json(p); // do the parsing again to get the stats
if (!pj.is_valid()) {
std::cerr << pj.get_error_message() << std::endl;
std::cerr << "Could not parse. " << std::endl;
return EXIT_FAILURE;
}
double min_result = *min_element(res.begin(), res.end());
double speedinGBs = (p.size()) / (min_result * 1000000000.0);
#ifndef SQUASH_COUNTERS
unsigned long total = cy0 + cy1 + cy2;
if (just_data) {
float cpb0 = (double)cy0 / (iterations * p.size());
float cpb1 = (double)cy1 / (iterations * p.size());
float cpb2 = (double)cy2 / (iterations * p.size());
float cpbtotal = (double)total / (iterations * p.size());
char *newfile = (char *)malloc(strlen(filename) + 1);
if (newfile == NULL) {
return EXIT_FAILURE;
}
::strcpy(newfile, filename);
char *snewfile = ::basename(newfile);
size_t nl = strlen(snewfile);
for (size_t j = nl - 1; j > 0; j--) {
if (snewfile[j] == '.') {
snewfile[j] = '\0';
break;
} }
} }
printf("\"%s\"\t%f\t%f\t%f\t%f\t%f\n", snewfile, cpb0, cpb1, cpb2, cpbtotal,
speedinGBs);
free(newfile);
} else { } else {
printf("number of bytes %ld number of structural chars %u ratio %.3f\n", for (int iteration = 0; iteration < options.iterations; iteration += options.iteration_step) {
p.size(), pj.n_structural_indexes, if (!options.verbose) { progress.print(iteration); }
(double)pj.n_structural_indexes / p.size()); // Benchmark each file once per iteration
printf("mem alloc instructions: %10lu cycles: %10lu (%.2f %%) ins/cycles: " for (size_t f=0; f<options.files.size(); f++) {
"%.2f mis. branches: %10lu (cycles/mis.branch %.2f) cache accesses: " verbose() << "[verbose] " << benchmarkers[f]->filename << " iterations #" << iteration << "-" << (iteration+options.iteration_step-1) << endl;
"%10lu (failure %10lu)\n", benchmarkers[f]->run_iterations(options.iteration_step, false);
cl0 / iterations, cy0 / iterations, 100. * cy0 / total, }
(double)cl0 / cy0, mis0 / iterations, (double)cy0 / mis0, }
cref1 / iterations, cmis0 / iterations); }
printf(" mem alloc runs at %.2f cycles per input byte.\n", if (!options.verbose) { progress.erase(); }
(double)cy0 / (iterations * p.size()));
printf("stage 1 instructions: %10lu cycles: %10lu (%.2f %%) ins/cycles: "
"%.2f mis. branches: %10lu (cycles/mis.branch %.2f) cache accesses: "
"%10lu (failure %10lu)\n",
cl1 / iterations, cy1 / iterations, 100. * cy1 / total,
(double)cl1 / cy1, mis1 / iterations, (double)cy1 / mis1,
cref1 / iterations, cmis1 / iterations);
printf(" stage 1 runs at %.2f cycles per input byte.\n",
(double)cy1 / (iterations * p.size()));
printf("stage 2 instructions: %10lu cycles: %10lu (%.2f %%) ins/cycles: " for (size_t i=0; i<options.files.size(); i++) {
"%.2f mis. branches: %10lu (cycles/mis.branch %.2f) cache " benchmarkers[i]->print(options.tabbed_output);
"accesses: %10lu (failure %10lu)\n", delete benchmarkers[i];
cl2 / iterations, cy2 / iterations, 100. * cy2 / total, }
(double)cl2 / cy2, mis2 / iterations, (double)cy2 / mis2,
cref2 / iterations, cmis2 / iterations);
printf(" stage 2 runs at %.2f cycles per input byte and ",
(double)cy2 / (iterations * p.size()));
printf("%.2f cycles per structural character.\n",
(double)cy2 / (iterations * pj.n_structural_indexes));
printf(" all stages: %.2f cycles per input byte.\n",
(double)total / (iterations * p.size()));
printf("Estimated average frequency: %.3f GHz.\n",
(double)total / (iterations * min_result * 1000000000.0));
}
#endif
if (!just_data) {
std::cout << "Min: " << min_result << " bytes read: " << p.size()
<< " Gigabytes/second: " << speedinGBs << std::endl;
}
if (json_output) {
isok = isok && pj.print_json(std::cout);
}
if (dump) {
isok = isok && pj.dump_raw_tape(std::cout);
}
if (!isok) {
fprintf(stderr, " Parsing failed. \n ");
return EXIT_FAILURE;
}
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }

View File

@ -6,10 +6,10 @@
namespace simdjson { namespace simdjson {
// Represents the minimal architecture that would support an implementation // Represents the minimal architecture that would support an implementation
enum class Architecture { enum class Architecture {
UNSUPPORTED,
WESTMERE, WESTMERE,
HASWELL, HASWELL,
ARM64, ARM64,
NONE,
// TODO remove 'native' in favor of runtime dispatch? // TODO remove 'native' in favor of runtime dispatch?
// the 'native' enum class value should point at a good default on the current // the 'native' enum class value should point at a good default on the current
// machine // machine
@ -20,6 +20,9 @@ enum class Architecture {
#endif #endif
}; };
Architecture find_best_supported_architecture();
Architecture parse_architecture(char *architecture);
enum ErrorValues { enum ErrorValues {
SUCCESS = 0, SUCCESS = 0,
SUCCESS_AND_HAS_MORE, //No errors and buffer still has more data SUCCESS_AND_HAS_MORE, //No errors and buffer still has more data

View File

@ -29,7 +29,7 @@ int json_parse(const char *buf, size_t len, ParsedJson &pj,
realloc); realloc);
} }
Architecture find_best_supported_implementation() { Architecture find_best_supported_architecture() {
constexpr uint32_t haswell_flags = constexpr uint32_t haswell_flags =
instruction_set::AVX2 | instruction_set::PCLMULQDQ | instruction_set::AVX2 | instruction_set::PCLMULQDQ |
instruction_set::BMI1 | instruction_set::BMI2; instruction_set::BMI1 | instruction_set::BMI2;
@ -45,13 +45,20 @@ Architecture find_best_supported_implementation() {
if (supports & instruction_set::NEON) if (supports & instruction_set::NEON)
return Architecture::ARM64; return Architecture::ARM64;
return Architecture::NONE; return Architecture::UNSUPPORTED;
}
Architecture parse_architecture(char *architecture) {
if (!strcmp(architecture, "HASWELL")) { return Architecture::HASWELL; }
if (!strcmp(architecture, "WESTMERE")) { return Architecture::WESTMERE; }
if (!strcmp(architecture, "ARM64")) { return Architecture::ARM64; }
return Architecture::UNSUPPORTED;
} }
// Responsible to select the best json_parse implementation // Responsible to select the best json_parse implementation
int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj,
bool realloc) { bool realloc) {
Architecture best_implementation = find_best_supported_implementation(); Architecture best_implementation = find_best_supported_architecture();
// Selecting the best implementation // Selecting the best implementation
switch (best_implementation) { switch (best_implementation) {
#ifdef IS_X86_64 #ifdef IS_X86_64