Measure impact of utf-8 blocks and structurals per block directly
This commit is contained in:
parent
102262c7ab
commit
e2f349e7bd
|
@ -53,6 +53,7 @@ objs
|
|||
# Build outputs (TODO build to a subdir so we can exclude that instead)
|
||||
/allparserscheckfile
|
||||
/basictests
|
||||
/benchfeatures
|
||||
/benchmark/parse
|
||||
/benchmark/perfdiff
|
||||
/benchmark/statisticalmodel
|
||||
|
@ -86,6 +87,9 @@ objs
|
|||
/tools/jsonstats
|
||||
/tools/minify
|
||||
|
||||
# Don't check in generated examples
|
||||
/jsonexamples/generated
|
||||
|
||||
# C++ ignore from https://github.com/github/gitignore/blob/master/C%2B%2B.gitignore
|
||||
|
||||
# Prerequisites
|
||||
|
|
11
Makefile
11
Makefile
|
@ -126,6 +126,12 @@ run_issue150_sh: allparserscheckfile
|
|||
run_testjson2json_sh: minify json2json
|
||||
./scripts/testjson2json.sh
|
||||
|
||||
generate_featurejson:
|
||||
ruby ./benchmark/genfeaturejson.rb
|
||||
|
||||
run_benchfeatures: benchfeatures generate_featurejson
|
||||
./benchfeatures -n 1000
|
||||
|
||||
test: run_basictests run_jsoncheck run_numberparsingcheck run_integer_tests run_stringparsingcheck run_jsonstream_test run_pointercheck run_testjson2json_sh run_issue150_sh run_jsoncheck_noavx
|
||||
@echo "It looks like the code is good!"
|
||||
|
||||
|
@ -145,9 +151,12 @@ submodules:
|
|||
|
||||
$(JSON_INCLUDE) $(SAJSON_INCLUDE) $(RAPIDJSON_INCLUDE) $(JSON11_INCLUDE) $(FASTJSON_INCLUDE) $(GASON_INCLUDE) $(UJSON4C_INCLUDE) $(CJSON_INCLUDE) $(JSMN_INCLUDE) : submodules
|
||||
|
||||
parse: benchmark/parse.cpp $(HEADERS) $(LIBFILES)
|
||||
parse: benchmark/parse.cpp benchmark/json_parser.h benchmark/event_counter.h benchmark/benchmarker.h $(HEADERS) $(LIBFILES)
|
||||
$(CXX) $(CXXFLAGS) -o parse $(LIBFILES) benchmark/parse.cpp $(LIBFLAGS)
|
||||
|
||||
benchfeatures: benchmark/benchfeatures.cpp benchmark/json_parser.h benchmark/event_counter.h benchmark/benchmarker.h $(HEADERS) $(LIBFILES)
|
||||
$(CXX) $(CXXFLAGS) -o benchfeatures $(LIBFILES) benchmark/benchfeatures.cpp $(LIBFLAGS)
|
||||
|
||||
perfdiff: benchmark/perfdiff.cpp
|
||||
$(CXX) $(CXXFLAGS) -o perfdiff benchmark/perfdiff.cpp $(LIBFLAGS)
|
||||
|
||||
|
|
|
@ -0,0 +1,326 @@
|
|||
#include "json_parser.h"
|
||||
#include "event_counter.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cctype>
|
||||
#ifndef _MSC_VER
|
||||
#include <dirent.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include <cinttypes>
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "linux-perf-events.h"
|
||||
#ifdef __linux__
|
||||
#include <libgen.h>
|
||||
#endif
|
||||
//#define DEBUG
|
||||
#include "simdjson/common_defs.h"
|
||||
#include "simdjson/isadetection.h"
|
||||
#include "simdjson/jsonioutil.h"
|
||||
#include "simdjson/jsonparser.h"
|
||||
#include "simdjson/parsedjson.h"
|
||||
#include "simdjson/stage1_find_marks.h"
|
||||
#include "simdjson/stage2_build_tape.h"
|
||||
|
||||
#include <functional>
|
||||
|
||||
#include "benchmarker.h"
|
||||
|
||||
using namespace simdjson;
|
||||
using std::cerr;
|
||||
using std::cout;
|
||||
using std::endl;
|
||||
using std::string;
|
||||
using std::to_string;
|
||||
using std::vector;
|
||||
using std::ostream;
|
||||
using std::ofstream;
|
||||
using std::exception;
|
||||
|
||||
// Stash the exe_name in main() for functions to use
|
||||
char* exe_name;
|
||||
|
||||
// Write the command-line help text for this benchmark tool to `out`.
void print_usage(ostream& out) {
  // First line interpolates the executable name; the rest are fixed text.
  out << "Usage: " << exe_name << " [-v] [-n #] [-s STAGE] [-a ARCH]" << endl;
  static const char* const help_lines[] = {
    "",
    "Runs the parser against jsonexamples/generated json files in a loop, measuring speed and other statistics.",
    "",
    "Options:",
    "",
    "-n # - Number of iterations per file. Default: 400",
    "-i # - Number of times to iterate a single file before moving to the next. Default: 20",
    "-v - Verbose output.",
    "-s STAGE - Stop after the given stage.",
    "  -s stage1 - Stop after find_structural_bits.",
    "  -s all - Run all stages.",
    "-a ARCH - Use the parser with the designated architecture (HASWELL, WESTMERE",
    "  or ARM64). By default, detects best supported architecture.",
  };
  for (const char* line : help_lines) {
    out << line << endl;
  }
}
|
||||
|
||||
// Report a usage error on stderr, show the help text, and terminate
// the process with a failure status. Does not return.
void exit_usage(string message) {
  cerr << message << endl << endl;
  print_usage(cerr);
  exit(EXIT_FAILURE);
}
|
||||
|
||||
struct option_struct {
|
||||
Architecture architecture = Architecture::UNSUPPORTED;
|
||||
bool stage1_only = false;
|
||||
|
||||
int32_t iterations = 400;
|
||||
int32_t iteration_step = 50;
|
||||
|
||||
bool verbose = false;
|
||||
|
||||
option_struct(int argc, char **argv) {
|
||||
#ifndef _MSC_VER
|
||||
int c;
|
||||
|
||||
while ((c = getopt(argc, argv, "vtn:i:a:s:")) != -1) {
|
||||
switch (c) {
|
||||
case 'n':
|
||||
iterations = atoi(optarg);
|
||||
break;
|
||||
case 'i':
|
||||
iteration_step = atoi(optarg);
|
||||
break;
|
||||
case 'v':
|
||||
verbose = true;
|
||||
break;
|
||||
case 'a':
|
||||
architecture = parse_architecture(optarg);
|
||||
if (architecture == Architecture::UNSUPPORTED) {
|
||||
exit_usage(string("Unsupported option value -a ") + optarg + ": expected -a HASWELL, WESTMERE or ARM64");
|
||||
}
|
||||
break;
|
||||
case 's':
|
||||
if (!strcmp(optarg, "stage1")) {
|
||||
stage1_only = true;
|
||||
} else if (!strcmp(optarg, "all")) {
|
||||
stage1_only = false;
|
||||
} else {
|
||||
exit_usage(string("Unsupported option value -s ") + optarg + ": expected -s stage1 or all");
|
||||
}
|
||||
break;
|
||||
default:
|
||||
exit_error("Unexpected argument " + c);
|
||||
}
|
||||
}
|
||||
#else
|
||||
int optind = 1;
|
||||
#endif
|
||||
|
||||
// If architecture is not specified, pick the best supported architecture by default
|
||||
if (architecture == Architecture::UNSUPPORTED) {
|
||||
architecture = find_best_supported_architecture();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
double actual(const benchmarker& feature) {
|
||||
return feature.stage1.best.elapsed_ns() / feature.stats->blocks;
|
||||
}
|
||||
double diff(const benchmarker& feature, const benchmarker& struct7) {
|
||||
if (feature.stats->blocks == struct7.stats->blocks) {
|
||||
return (feature.stage1.best.elapsed_ns() - struct7.stage1.best.elapsed_ns()) / struct7.stats->blocks;
|
||||
} else {
|
||||
return (feature.stage1.best.elapsed_ns() / feature.stats->blocks) - (struct7.stage1.best.elapsed_ns() / struct7.stats->blocks);
|
||||
}
|
||||
}
|
||||
double diff_miss(const benchmarker& feature, const benchmarker& struct7) {
|
||||
// There are roughly 2650 branch mispredicts, so we have to scale it so it represents a per block amount
|
||||
return diff(feature, struct7) * 10000.0 / 2650.0;
|
||||
}
|
||||
|
||||
struct feature_benchmarker {
|
||||
benchmarker utf8;
|
||||
benchmarker utf8_miss;
|
||||
benchmarker empty;
|
||||
benchmarker empty_miss;
|
||||
benchmarker struct7;
|
||||
benchmarker struct7_miss;
|
||||
benchmarker struct7_full;
|
||||
benchmarker struct15;
|
||||
benchmarker struct15_miss;
|
||||
benchmarker struct23;
|
||||
benchmarker struct23_miss;
|
||||
|
||||
feature_benchmarker(json_parser& parser, event_collector& collector) :
|
||||
utf8 ("jsonexamples/generated/utf-8.json", parser, collector),
|
||||
utf8_miss ("jsonexamples/generated/utf-8-miss.json", parser, collector),
|
||||
empty ("jsonexamples/generated/0-structurals.json", parser, collector),
|
||||
empty_miss ("jsonexamples/generated/0-structurals-miss.json", parser, collector),
|
||||
struct7 ("jsonexamples/generated/7-structurals.json", parser, collector),
|
||||
struct7_miss ("jsonexamples/generated/7-structurals-miss.json", parser, collector),
|
||||
struct7_full ("jsonexamples/generated/7-structurals-full.json", parser, collector),
|
||||
struct15 ("jsonexamples/generated/15-structurals.json", parser, collector),
|
||||
struct15_miss("jsonexamples/generated/15-structurals-miss.json", parser, collector),
|
||||
struct23 ("jsonexamples/generated/23-structurals.json", parser, collector),
|
||||
struct23_miss("jsonexamples/generated/23-structurals-miss.json", parser, collector)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
really_inline void run_iterations(size_t iterations, bool stage1_only=false) {
|
||||
struct7.run_iterations(iterations, stage1_only);
|
||||
struct7_miss.run_iterations(iterations, stage1_only);
|
||||
struct7_full.run_iterations(iterations, stage1_only);
|
||||
utf8.run_iterations(iterations, stage1_only);
|
||||
utf8_miss.run_iterations(iterations, stage1_only);
|
||||
empty.run_iterations(iterations, stage1_only);
|
||||
empty_miss.run_iterations(iterations, stage1_only);
|
||||
struct15.run_iterations(iterations, stage1_only);
|
||||
struct15_miss.run_iterations(iterations, stage1_only);
|
||||
struct23.run_iterations(iterations, stage1_only);
|
||||
struct23_miss.run_iterations(iterations, stage1_only);
|
||||
}
|
||||
|
||||
void print() {
|
||||
printf("base (ns/block)");
|
||||
printf(",struct 1-7");
|
||||
printf(",struct 1-7 miss");
|
||||
printf(",utf-8");
|
||||
printf(",utf-8 miss");
|
||||
printf(",struct 8-15");
|
||||
printf(",struct 8-15 miss");
|
||||
printf(",struct 16+");
|
||||
printf(",struct 16+ miss");
|
||||
printf("\n");
|
||||
|
||||
printf("%g", actual(empty));
|
||||
printf(",%+g", diff(struct7, empty));
|
||||
printf(",%+g", diff(struct7_miss, struct7));
|
||||
printf(",%+g", diff(utf8, struct7));
|
||||
printf(",%+g", diff(utf8_miss, utf8));
|
||||
printf(",%+g", diff(struct15, struct7));
|
||||
printf(",%+g", diff(struct15_miss, struct15));
|
||||
printf(",%+g", diff(struct23, struct15));
|
||||
printf(",%+g", diff(struct23_miss, struct23));
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
double cost_per_block(benchmarker& feature, size_t feature_blocks, benchmarker& base) {
|
||||
return (feature.stage1.best.elapsed_ns() - base.stage1.best.elapsed_ns()) / feature_blocks;
|
||||
}
|
||||
|
||||
// Base cost of any block (including empty ones)
|
||||
double base_cost() {
|
||||
return (empty.stage1.best.elapsed_ns() / empty.stats->blocks);
|
||||
}
|
||||
// Extra cost of a 1-7 structural block over an empty block
|
||||
double struct1_7_cost() {
|
||||
return cost_per_block(struct7, struct7.stats->blocks_with_1_structural, empty);
|
||||
}
|
||||
// Extra cost of an 1-7-structural miss
|
||||
double struct1_7_miss_cost() {
|
||||
return cost_per_block(struct7_miss, struct7_miss.stats->blocks_with_1_structural, struct7);
|
||||
}
|
||||
// Extra cost of an 8-15 structural block over a 1-7 structural block
|
||||
double struct8_15_cost() {
|
||||
return cost_per_block(struct15, struct15.stats->blocks_with_8_structurals, struct7);
|
||||
}
|
||||
// Extra cost of an 8-15-structural miss over a 1-7 miss
|
||||
double struct8_15_miss_cost() {
|
||||
return cost_per_block(struct15_miss, struct15_miss.stats->blocks_with_8_structurals_flipped, struct15);
|
||||
}
|
||||
// Extra cost of a 16+-structural block over an 8-15 structural block (actual varies based on # of structurals!)
|
||||
double struct16_cost() {
|
||||
return cost_per_block(struct23, struct23.stats->blocks_with_16_structurals, struct15);
|
||||
}
|
||||
// Extra cost of a 16-structural miss over an 8-15 miss
|
||||
double struct16_miss_cost() {
|
||||
return cost_per_block(struct23_miss, struct23_miss.stats->blocks_with_16_structurals_flipped, struct23);
|
||||
}
|
||||
// Extra cost of having UTF-8 in a block
|
||||
double utf8_cost() {
|
||||
return cost_per_block(utf8, utf8.stats->blocks_with_utf8, struct7_full);
|
||||
}
|
||||
// Extra cost of a UTF-8 miss
|
||||
double utf8_miss_cost() {
|
||||
return cost_per_block(utf8_miss, utf8_miss.stats->blocks_with_utf8_flipped, utf8);
|
||||
}
|
||||
|
||||
double calc_expected(benchmarker& file) {
|
||||
// Expected base ns/block (empty)
|
||||
json_stats& stats = *file.stats;
|
||||
double expected = base_cost() * stats.blocks;
|
||||
expected += struct1_7_cost() * stats.blocks_with_1_structural;
|
||||
expected += struct1_7_miss_cost() * stats.blocks_with_1_structural_flipped;
|
||||
expected += utf8_cost() * stats.blocks_with_utf8;
|
||||
expected += utf8_miss_cost() * stats.blocks_with_utf8_flipped;
|
||||
expected += struct8_15_cost() * stats.blocks_with_8_structurals;
|
||||
expected += struct8_15_miss_cost() * stats.blocks_with_8_structurals_flipped;
|
||||
expected += struct16_cost() * stats.blocks_with_16_structurals;
|
||||
expected += struct16_miss_cost() * stats.blocks_with_16_structurals_flipped;
|
||||
return expected / stats.blocks;
|
||||
}
|
||||
};
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
// Read options
|
||||
exe_name = argv[0];
|
||||
option_struct options(argc, argv);
|
||||
if (options.verbose) {
|
||||
verbose_stream = &cout;
|
||||
}
|
||||
|
||||
// Initialize the event collector. We put this early so if it prints an error message, it's the
|
||||
// first thing printed.
|
||||
event_collector collector;
|
||||
|
||||
// Set up benchmarkers by reading all files
|
||||
json_parser parser(options.architecture);
|
||||
|
||||
feature_benchmarker features(parser, collector);
|
||||
benchmarker gsoc_2018("jsonexamples/gsoc-2018.json", parser, collector);
|
||||
benchmarker twitter("jsonexamples/twitter.json", parser, collector);
|
||||
benchmarker random("jsonexamples/random.json", parser, collector);
|
||||
|
||||
// Run the benchmarks
|
||||
progress_bar progress(options.iterations, 100);
|
||||
// Put the if (options.stage1_only) *outside* the loop so that run_iterations will be optimized
|
||||
if (options.stage1_only) {
|
||||
for (int iteration = 0; iteration < options.iterations; iteration += options.iteration_step) {
|
||||
if (!options.verbose) { progress.print(iteration); }
|
||||
features.run_iterations(options.iteration_step, true);
|
||||
gsoc_2018.run_iterations(options.iteration_step, true);
|
||||
twitter.run_iterations(options.iteration_step, true);
|
||||
random.run_iterations(options.iteration_step, true);
|
||||
}
|
||||
} else {
|
||||
for (int iteration = 0; iteration < options.iterations; iteration += options.iteration_step) {
|
||||
if (!options.verbose) { progress.print(iteration); }
|
||||
features.run_iterations(options.iteration_step, false);
|
||||
gsoc_2018.run_iterations(options.iteration_step, false);
|
||||
twitter.run_iterations(options.iteration_step, false);
|
||||
random.run_iterations(options.iteration_step, false);
|
||||
}
|
||||
}
|
||||
if (!options.verbose) { progress.erase(); }
|
||||
|
||||
features.print();
|
||||
|
||||
// Gauge effectiveness
|
||||
printf("gsoc-2018.json expected/actual: %g/%g\n", features.calc_expected(gsoc_2018), actual(gsoc_2018));
|
||||
printf("twitter.json expected/actual: %g/%g\n", features.calc_expected(twitter), actual(twitter));
|
||||
printf("random.json expected/actual: %g/%g\n", features.calc_expected(random), actual(random));
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
|
@ -0,0 +1,424 @@
|
|||
#ifndef __BENCHMARKER_H
|
||||
#define __BENCHMARKER_H
|
||||
|
||||
#include "json_parser.h"
|
||||
#include "event_counter.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cctype>
|
||||
#ifndef _MSC_VER
|
||||
#include <dirent.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include <cinttypes>
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "linux-perf-events.h"
|
||||
#ifdef __linux__
|
||||
#include <libgen.h>
|
||||
#endif
|
||||
//#define DEBUG
|
||||
#include "simdjson/common_defs.h"
|
||||
#include "simdjson/isadetection.h"
|
||||
#include "simdjson/jsonioutil.h"
|
||||
#include "simdjson/jsonparser.h"
|
||||
#include "simdjson/parsedjson.h"
|
||||
#include "simdjson/stage1_find_marks.h"
|
||||
#include "simdjson/stage2_build_tape.h"
|
||||
|
||||
#include <functional>
|
||||
|
||||
using namespace simdjson;
|
||||
using std::cerr;
|
||||
using std::cout;
|
||||
using std::endl;
|
||||
using std::string;
|
||||
using std::to_string;
|
||||
using std::vector;
|
||||
using std::ostream;
|
||||
using std::ofstream;
|
||||
using std::exception;
|
||||
|
||||
// Initialize "verbose" to go nowhere. We'll read options in main() and set to cout if verbose is true.
|
||||
// Initialize "verbose" to go nowhere. We'll read options in main() and set to cout if verbose is true.
// (An unopened ofstream silently discards everything written to it.)
std::ofstream dev_null;
std::ostream *verbose_stream = &dev_null;
// Stage 1 processes the input in 64-byte chunks.
const size_t BYTES_PER_BLOCK = 64;

// Accessor for the currently-selected verbose output stream.
std::ostream& verbose() {
  return *verbose_stream;
}
|
||||
|
||||
// Print an error message to stderr and terminate the process with a failure
// status. Does not return.
// NOTE: the original also called abort() after exit(); since exit() never
// returns, that call was unreachable dead code and has been removed.
void exit_error(string message) {
  cerr << message << endl;
  exit(EXIT_FAILURE);
}
|
||||
|
||||
struct json_stats {
|
||||
size_t bytes = 0;
|
||||
size_t blocks = 0;
|
||||
size_t structurals = 0;
|
||||
size_t blocks_with_utf8 = 0;
|
||||
size_t blocks_with_utf8_flipped = 0;
|
||||
size_t blocks_with_0_structurals = 0;
|
||||
size_t blocks_with_0_structurals_flipped = 0;
|
||||
size_t blocks_with_1_structural = 0;
|
||||
size_t blocks_with_1_structural_flipped = 0;
|
||||
size_t blocks_with_8_structurals = 0;
|
||||
size_t blocks_with_8_structurals_flipped = 0;
|
||||
size_t blocks_with_16_structurals = 0;
|
||||
size_t blocks_with_16_structurals_flipped = 0;
|
||||
|
||||
json_stats(const padded_string& json, const ParsedJson& pj) {
|
||||
bytes = json.size();
|
||||
blocks = bytes / BYTES_PER_BLOCK;
|
||||
if (bytes % BYTES_PER_BLOCK > 0) { blocks++; } // Account for remainder block
|
||||
structurals = pj.n_structural_indexes-1;
|
||||
|
||||
// Calculate stats on blocks that will trigger utf-8 if statements / mispredictions
|
||||
bool last_block_has_utf8 = false;
|
||||
for (size_t block=0; block<blocks; block++) {
|
||||
// Find utf-8 in the block
|
||||
size_t block_start = block*BYTES_PER_BLOCK;
|
||||
size_t block_end = block_start+BYTES_PER_BLOCK;
|
||||
if (block_end > json.size()) { block_end = json.size(); }
|
||||
bool block_has_utf8 = false;
|
||||
for (size_t i=block_start; i<block_end; i++) {
|
||||
if (json.data()[i] & 0x80) {
|
||||
block_has_utf8 = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (block_has_utf8) {
|
||||
blocks_with_utf8++;
|
||||
}
|
||||
if (block > 0 && last_block_has_utf8 != block_has_utf8) {
|
||||
blocks_with_utf8_flipped++;
|
||||
}
|
||||
last_block_has_utf8 = block_has_utf8;
|
||||
}
|
||||
|
||||
// Calculate stats on blocks that will trigger structural count if statements / mispredictions
|
||||
bool last_block_has_0_structurals = false;
|
||||
bool last_block_has_1_structural = false;
|
||||
bool last_block_has_8_structurals = false;
|
||||
bool last_block_has_16_structurals = false;
|
||||
size_t structural=0;
|
||||
for (size_t block=0; block<blocks; block++) {
|
||||
// Count structurals in the block
|
||||
int block_structurals=0;
|
||||
while (structural < pj.n_structural_indexes && pj.structural_indexes[structural] < (block+1)*BYTES_PER_BLOCK) {
|
||||
block_structurals++;
|
||||
structural++;
|
||||
}
|
||||
|
||||
bool block_has_0_structurals = block_structurals == 0;
|
||||
if (block_has_0_structurals) {
|
||||
blocks_with_0_structurals++;
|
||||
}
|
||||
if (block > 0 && last_block_has_0_structurals != block_has_0_structurals) {
|
||||
blocks_with_0_structurals_flipped++;
|
||||
}
|
||||
last_block_has_0_structurals = block_has_0_structurals;
|
||||
|
||||
bool block_has_1_structural = block_structurals >= 1;
|
||||
if (block_has_1_structural) {
|
||||
blocks_with_1_structural++;
|
||||
}
|
||||
if (block > 0 && last_block_has_1_structural != block_has_1_structural) {
|
||||
blocks_with_1_structural_flipped++;
|
||||
}
|
||||
last_block_has_1_structural = block_has_1_structural;
|
||||
|
||||
bool block_has_8_structurals = block_structurals >= 8;
|
||||
if (block_has_8_structurals) {
|
||||
blocks_with_8_structurals++;
|
||||
}
|
||||
if (block > 0 && last_block_has_8_structurals != block_has_8_structurals) {
|
||||
blocks_with_8_structurals_flipped++;
|
||||
}
|
||||
last_block_has_8_structurals = block_has_8_structurals;
|
||||
|
||||
bool block_has_16_structurals = block_structurals >= 16;
|
||||
if (block_has_16_structurals) {
|
||||
blocks_with_16_structurals++;
|
||||
}
|
||||
if (block > 0 && last_block_has_16_structurals != block_has_16_structurals) {
|
||||
blocks_with_16_structurals_flipped++;
|
||||
}
|
||||
last_block_has_16_structurals = block_has_16_structurals;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Load the named JSON file into a padded_string, exiting the process if the
// file cannot be read.
padded_string load_json(const char *filename) {
  try {
    verbose() << "[verbose] loading " << filename << endl;
    padded_string json = simdjson::get_corpus(filename);
    verbose() << "[verbose] loaded " << filename << " (" << json.size() << " bytes)" << endl;
    return json;
  } catch (const exception &) { // caught by reference to base
    exit_error(string("Could not load the file ") + filename);
    // Unreachable (exit_error terminates); kept to silence missing-return warnings.
    exit(EXIT_FAILURE);
  }
}
|
||||
|
||||
// Text progress bar drawn on stderr: the constructor draws "[    ...    ]" and
// backspaces inside the brackets; print() fills in '=' characters as the
// reported value grows; erase() wipes the whole bar from the terminal.
struct progress_bar {
  int max_value;
  int total_ticks;
  double ticks_per_value;
  int next_tick;

  progress_bar(int _max_value, int _total_ticks)
    : max_value(_max_value), total_ticks(_total_ticks),
      ticks_per_value(double(_total_ticks)/_max_value), next_tick(0) {
    fputc('[', stderr);
    for (int i = 0; i < total_ticks; i++) {
      fputc(' ', stderr);
    }
    fputc(']', stderr);
    // Back up past the ']' and the spaces, to just after the '['.
    for (int i = 0; i < total_ticks + 1; i++) {
      fputc('\b', stderr);
    }
  }

  // Advance the bar so it represents `value` out of max_value.
  void print(int value) {
    double ticks = value * ticks_per_value;
    // Never draw over the closing bracket.
    if (ticks >= total_ticks) {
      ticks = total_ticks - 1;
    }
    int tick = next_tick;
    while (tick <= ticks && tick <= total_ticks) {
      fputc('=', stderr);
      tick++;
    }
    next_tick = tick;
  }

  // Wipe the bar from the terminal: back up, overwrite with spaces, back up again.
  void erase() {
    for (int i = 0; i < next_tick + 1; i++) {
      fputc('\b', stderr);
    }
    for (int i = 0; i <= total_ticks + 2; i++) {
      fputc(' ', stderr);
    }
    for (int i = 0; i <= total_ticks + 2; i++) {
      fputc('\b', stderr);
    }
  }
};
|
||||
|
||||
struct benchmarker {
|
||||
// JSON text from loading the file. Owns the memory.
|
||||
const padded_string json;
|
||||
// JSON filename
|
||||
const char *filename;
|
||||
// Parser that will parse the JSON file
|
||||
const json_parser& parser;
|
||||
// Event collector that can be turned on to measure cycles, missed branches, etc.
|
||||
event_collector& collector;
|
||||
|
||||
// Statistics about the JSON file independent of its speed (amount of utf-8, structurals, etc.).
|
||||
// Loaded on first parse.
|
||||
json_stats* stats;
|
||||
// Speed and event summary for full parse (not including allocation)
|
||||
event_aggregate all_stages;
|
||||
// Speed and event summary for stage 1
|
||||
event_aggregate stage1;
|
||||
// Speed and event summary for stage 2
|
||||
event_aggregate stage2;
|
||||
// Speed and event summary for allocation
|
||||
event_aggregate allocate_stage;
|
||||
|
||||
benchmarker(const char *_filename, const json_parser& _parser, event_collector& _collector)
|
||||
: json(load_json(_filename)), filename(_filename), parser(_parser), collector(_collector), stats(NULL) {}
|
||||
|
||||
~benchmarker() {
|
||||
if (stats) {
|
||||
delete stats;
|
||||
}
|
||||
}
|
||||
|
||||
int iterations() const {
|
||||
return all_stages.iterations;
|
||||
}
|
||||
|
||||
really_inline void run_iteration(bool stage1_only=false) {
|
||||
// Allocate ParsedJson
|
||||
collector.start();
|
||||
ParsedJson pj;
|
||||
bool allocok = pj.allocate_capacity(json.size());
|
||||
event_count allocate_count = collector.end();
|
||||
allocate_stage << allocate_count;
|
||||
|
||||
if (!allocok) {
|
||||
exit_error(string("Unable to allocate_stage ") + to_string(json.size()) + " bytes for the JSON result.");
|
||||
}
|
||||
verbose() << "[verbose] allocated memory for parsed JSON " << endl;
|
||||
|
||||
// Stage 1 (find structurals)
|
||||
collector.start();
|
||||
int result = parser.stage1((const uint8_t *)json.data(), json.size(), pj);
|
||||
event_count stage1_count = collector.end();
|
||||
stage1 << stage1_count;
|
||||
|
||||
if (result != simdjson::SUCCESS) {
|
||||
exit_error(string("Failed to parse ") + filename + " during stage 1: " + pj.get_error_message());
|
||||
}
|
||||
|
||||
// Stage 2 (unified machine)
|
||||
event_count stage2_count;
|
||||
if (!stage1_only || stats == NULL) {
|
||||
if (!stage1_only) {
|
||||
collector.start();
|
||||
}
|
||||
result = parser.stage2((const uint8_t *)json.data(), json.size(), pj);
|
||||
if (!stage1_only) {
|
||||
stage2_count = collector.end();
|
||||
stage2 << stage2_count;
|
||||
}
|
||||
|
||||
if (result != simdjson::SUCCESS) {
|
||||
exit_error(string("Failed to parse ") + filename + " during stage 2: " + pj.get_error_message());
|
||||
}
|
||||
}
|
||||
|
||||
all_stages << (stage1_count + stage2_count);
|
||||
|
||||
// Calculate stats the first time we parse
|
||||
if (stats == NULL) {
|
||||
stats = new json_stats(json, pj);
|
||||
}
|
||||
}
|
||||
|
||||
really_inline void run_iterations(size_t iterations, bool stage1_only=false) {
|
||||
for (size_t i = 0; i<iterations; i++) {
|
||||
run_iteration(stage1_only);
|
||||
}
|
||||
}
|
||||
|
||||
double stage1_ns_per_block() {
|
||||
return stage1.elapsed_ns() / stats->blocks;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void print_aggregate(const char* prefix, const T& stage) const {
|
||||
printf("%s%-13s: %8.4f ns per block (%5.1f %%) - %8.4f ns per byte - %8.4f ns per structural - %8.3f GB/s\n",
|
||||
prefix,
|
||||
"Speed",
|
||||
stage.elapsed_ns() / stats->blocks, // per block
|
||||
100.0 * stage.elapsed_sec() / all_stages.elapsed_sec(), // %
|
||||
stage.elapsed_ns() / stats->bytes, // per byte
|
||||
stage.elapsed_ns() / stats->structurals, // per structural
|
||||
(json.size() / 1000000000.0) / stage.elapsed_sec() // GB/s
|
||||
);
|
||||
|
||||
if (collector.has_events()) {
|
||||
printf("%s%-13s: %2.3f per block (%5.2f %%) - %2.3f per byte - %2.3f per structural - %2.3f GHz est. frequency\n",
|
||||
prefix,
|
||||
"Cycles",
|
||||
stage.cycles() / stats->blocks,
|
||||
100.0 * stage.cycles() / all_stages.cycles(),
|
||||
stage.cycles() / stats->bytes,
|
||||
stage.cycles() / stats->structurals,
|
||||
(stage.cycles() / stage.elapsed_sec()) / 1000000000.0
|
||||
);
|
||||
|
||||
printf("%s%-13s: %2.2f per block (%5.2f %%) - %2.2f per byte - %2.2f per structural - %2.2f per cycle\n",
|
||||
prefix,
|
||||
"Instructions",
|
||||
stage.instructions() / stats->blocks,
|
||||
100.0 * stage.instructions() / all_stages.instructions(),
|
||||
stage.instructions() / stats->bytes,
|
||||
stage.instructions() / stats->structurals,
|
||||
stage.instructions() / stage.cycles()
|
||||
);
|
||||
|
||||
// NOTE: removed cycles/miss because it is a somewhat misleading stat
|
||||
printf("%s%-13s: %2.2f branch misses (%5.2f %%) - %2.2f cache misses (%5.2f %%) - %2.2f cache references\n",
|
||||
prefix,
|
||||
"Misses",
|
||||
stage.branch_misses(),
|
||||
100.0 * stage.branch_misses() / all_stages.branch_misses(),
|
||||
stage.cache_misses(),
|
||||
100.0 * stage.cache_misses() / all_stages.cache_misses(),
|
||||
stage.cache_references()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
void print(bool tabbed_output) const {
|
||||
if (tabbed_output) {
|
||||
char* filename_copy = (char*)malloc(strlen(filename)+1);
|
||||
strcpy(filename_copy, filename);
|
||||
#if defined(__linux__)
|
||||
char* base = ::basename(filename_copy);
|
||||
#else
|
||||
char* base = filename_copy;
|
||||
#endif
|
||||
if (strlen(base) >= 5 && !strcmp(base+strlen(base)-5, ".json")) {
|
||||
base[strlen(base)-5] = '\0';
|
||||
}
|
||||
|
||||
double gb = json.size() / 1000000000.0;
|
||||
if (collector.has_events()) {
|
||||
printf("\"%s\"\t%f\t%f\t%f\t%f\t%f\t%f\t%f\n",
|
||||
base,
|
||||
allocate_stage.best.cycles() / json.size(),
|
||||
stage1.best.cycles() / json.size(),
|
||||
stage2.best.cycles() / json.size(),
|
||||
all_stages.best.cycles() / json.size(),
|
||||
gb / all_stages.best.elapsed_sec(),
|
||||
gb / stage1.best.elapsed_sec(),
|
||||
gb / stage2.best.elapsed_sec());
|
||||
} else {
|
||||
printf("\"%s\"\t\t\t\t\t%f\t%f\t%f\n",
|
||||
base,
|
||||
gb / all_stages.best.elapsed_sec(),
|
||||
gb / stage1.best.elapsed_sec(),
|
||||
gb / stage2.best.elapsed_sec());
|
||||
}
|
||||
free(filename_copy);
|
||||
} else {
|
||||
printf("\n");
|
||||
printf("%s\n", filename);
|
||||
printf("%s\n", string(strlen(filename), '=').c_str());
|
||||
printf("%9zu blocks - %10zu bytes - %5zu structurals (%5.1f %%)\n", stats->bytes / BYTES_PER_BLOCK, stats->bytes, stats->structurals, 100.0 * stats->structurals / stats->bytes);
|
||||
if (stats) {
|
||||
printf("special blocks with: utf8 %9zu (%5.1f %%) - 0 structurals %9zu (%5.1f %%) - 1+ structurals %9zu (%5.1f %%) - 8+ structurals %9zu (%5.1f %%) - 16+ structurals %9zu (%5.1f %%)\n",
|
||||
stats->blocks_with_utf8, 100.0 * stats->blocks_with_utf8 / stats->blocks,
|
||||
stats->blocks_with_0_structurals, 100.0 * stats->blocks_with_0_structurals / stats->blocks,
|
||||
stats->blocks_with_1_structural, 100.0 * stats->blocks_with_1_structural / stats->blocks,
|
||||
stats->blocks_with_8_structurals, 100.0 * stats->blocks_with_8_structurals / stats->blocks,
|
||||
stats->blocks_with_16_structurals, 100.0 * stats->blocks_with_16_structurals / stats->blocks);
|
||||
printf("special block flips: utf8 %9zu (%5.1f %%) - 0 structurals %9zu (%5.1f %%) - 1+ structurals %9zu (%5.1f %%) - 8+ structurals %9zu (%5.1f %%) - 16+ structurals %9zu (%5.1f %%)\n",
|
||||
stats->blocks_with_utf8_flipped, 100.0 * stats->blocks_with_utf8_flipped / stats->blocks,
|
||||
stats->blocks_with_1_structural_flipped, 100.0 * stats->blocks_with_1_structural_flipped / stats->blocks,
|
||||
stats->blocks_with_0_structurals_flipped, 100.0 * stats->blocks_with_0_structurals_flipped / stats->blocks,
|
||||
stats->blocks_with_8_structurals_flipped, 100.0 * stats->blocks_with_8_structurals_flipped / stats->blocks,
|
||||
stats->blocks_with_16_structurals_flipped, 100.0 * stats->blocks_with_16_structurals_flipped / stats->blocks);
|
||||
}
|
||||
printf("\n");
|
||||
printf("All Stages\n");
|
||||
print_aggregate("| " , all_stages.best);
|
||||
// printf("|- Allocation\n");
|
||||
// print_aggregate("| ", allocate_stage.best);
|
||||
printf("|- Stage 1\n");
|
||||
print_aggregate("| ", stage1.best);
|
||||
printf("|- Stage 2\n");
|
||||
print_aggregate("| ", stage2.best);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
|
@ -0,0 +1,152 @@
|
|||
#ifndef __EVENT_COUNTER_H
|
||||
#define __EVENT_COUNTER_H
|
||||
|
||||
#include <cassert>
|
||||
#include <cctype>
|
||||
#ifndef _MSC_VER
|
||||
#include <dirent.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include <cinttypes>
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "linux-perf-events.h"
|
||||
#ifdef __linux__
|
||||
#include <libgen.h>
|
||||
#endif
|
||||
//#define DEBUG
|
||||
#include "simdjson/common_defs.h"
|
||||
#include "simdjson/isadetection.h"
|
||||
|
||||
using std::string;
|
||||
using std::vector;
|
||||
using std::chrono::steady_clock;
|
||||
using std::chrono::time_point;
|
||||
using std::chrono::duration;
|
||||
|
||||
// One sample of performance measurements for a timed section of code:
// wall-clock elapsed time plus five hardware event counters.
struct event_count {
  std::chrono::duration<double> elapsed;        // wall-clock time of the measured section
  std::vector<unsigned long long> event_counts; // raw counter values, indexed by event_counter_types

  event_count() : elapsed(0), event_counts{0,0,0,0,0} {}
  // Counters are taken by const reference: the original signature took the
  // vector by value, copying it on every construction.
  event_count(const std::chrono::duration<double> _elapsed, const std::vector<unsigned long long> &_event_counts) : elapsed(_elapsed), event_counts(_event_counts) {}
  event_count(const event_count& other) = default;

  // The types of counters (so we can read the getter more easily)
  enum event_counter_types {
    CPU_CYCLES,
    INSTRUCTIONS,
    BRANCH_MISSES,
    CACHE_REFERENCES,
    CACHE_MISSES
  };

  // Elapsed wall-clock time in seconds / nanoseconds.
  double elapsed_sec() const { return std::chrono::duration<double>(elapsed).count(); }
  double elapsed_ns() const { return std::chrono::duration<double, std::nano>(elapsed).count(); }
  // Per-counter accessors (returned as double for ratio arithmetic).
  double cycles() const { return event_counts[CPU_CYCLES]; }
  double instructions() const { return event_counts[INSTRUCTIONS]; }
  double branch_misses() const { return event_counts[BRANCH_MISSES]; }
  double cache_references() const { return event_counts[CACHE_REFERENCES]; }
  double cache_misses() const { return event_counts[CACHE_MISSES]; }

  // Pass by const reference to avoid copying the counter vector on every
  // assignment/addition (the original took these parameters by value).
  event_count& operator=(const event_count &other) {
    this->elapsed = other.elapsed;
    this->event_counts = other.event_counts;
    return *this;
  }
  // Element-wise sum of two samples.
  event_count operator+(const event_count &other) const {
    return event_count(elapsed+other.elapsed, {
      event_counts[0]+other.event_counts[0],
      event_counts[1]+other.event_counts[1],
      event_counts[2]+other.event_counts[2],
      event_counts[3]+other.event_counts[3],
      event_counts[4]+other.event_counts[4],
    });
  }

  void operator+=(const event_count &other) {
    *this = *this + other;
  }
};
|
||||
|
||||
struct event_aggregate {
|
||||
int iterations = 0;
|
||||
event_count total;
|
||||
event_count best;
|
||||
event_count worst;
|
||||
|
||||
event_aggregate() {}
|
||||
|
||||
void operator<<(const event_count other) {
|
||||
if (iterations == 0 || other.elapsed < best.elapsed) {
|
||||
best = other;
|
||||
}
|
||||
if (iterations == 0 || other.elapsed > worst.elapsed) {
|
||||
worst = other;
|
||||
}
|
||||
iterations++;
|
||||
total += other;
|
||||
}
|
||||
|
||||
double elapsed_sec() const { return total.elapsed_sec() / iterations; }
|
||||
double elapsed_ns() const { return total.elapsed_ns() / iterations; }
|
||||
double cycles() const { return total.cycles() / iterations; }
|
||||
double instructions() const { return total.instructions() / iterations; }
|
||||
double branch_misses() const { return total.branch_misses() / iterations; }
|
||||
double cache_references() const { return total.cache_references() / iterations; }
|
||||
double cache_misses() const { return total.cache_misses() / iterations; }
|
||||
};
|
||||
|
||||
struct event_collector {
|
||||
event_count count;
|
||||
time_point<steady_clock> start_clock;
|
||||
|
||||
#if defined(__linux__)
|
||||
LinuxEvents<PERF_TYPE_HARDWARE> linux_events;
|
||||
event_collector() : linux_events(vector<int>{
|
||||
PERF_COUNT_HW_CPU_CYCLES,
|
||||
PERF_COUNT_HW_INSTRUCTIONS,
|
||||
PERF_COUNT_HW_BRANCH_MISSES,
|
||||
PERF_COUNT_HW_CACHE_REFERENCES,
|
||||
PERF_COUNT_HW_CACHE_MISSES
|
||||
}) {}
|
||||
bool has_events() {
|
||||
return linux_events.is_working();
|
||||
}
|
||||
#else
|
||||
bool has_events() {
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
really_inline void start() {
|
||||
#if defined(__linux)
|
||||
linux_events.start();
|
||||
#endif
|
||||
start_clock = steady_clock::now();
|
||||
}
|
||||
really_inline event_count& end() {
|
||||
time_point<steady_clock> end_clock = steady_clock::now();
|
||||
#if defined(__linux)
|
||||
linux_events.end(count.event_counts);
|
||||
#endif
|
||||
count.elapsed = end_clock - start_clock;
|
||||
return count;
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
|
@ -0,0 +1,114 @@
|
|||
# Writes fixed-size (640KB) JSON files built from padded 64-byte blocks, used
# by benchfeatures to measure the per-block cost of specific parser features.
class ChunkWriter
  def initialize(output_dir, miss_templates, file_size=640*1000, block_size=64)
    @@output_dir = output_dir
    @@miss_templates = miss_templates # directory of precomputed 0/1 block patterns
    @@file_size = file_size
    @@block_size = block_size
  end

  # Pad each chunk with spaces to exactly one block (block_size bytes),
  # terminated by a newline (or a space), and concatenate them.
  def prepare_chunk(chunks, include_newline)
    Array(chunks).map do |chunk|
      "#{chunk}#{' '*(@@block_size-chunk.bytesize-1)}#{include_newline ? "\n" : " "}"
    end.join("")
  end

  # Write the three variants for one scenario: <filename>-full.json (all
  # repeat1 blocks), <filename>.json (half repeat1 / half repeat2) and
  # <filename>-miss.json (branch-miss-inducing random interleaving).
  # BUG FIX: the interpolations below had been corrupted to "#(unknown)";
  # they must interpolate the filename argument.
  def write_files(filename, start1, repeat1, end1, repeat2: '', include_newline: true)
    start1 = prepare_chunk(start1, include_newline)
    repeat1 = prepare_chunk(repeat1, include_newline)
    end1 = prepare_chunk(end1, include_newline)
    write_full(File.join(@@output_dir, "#{filename}-full.json"), start1, repeat1, end1)

    repeat2 = prepare_chunk(repeat2, include_newline)
    repeat2 = repeat2 * (repeat1.bytesize/repeat2.bytesize)
    write_half(File.join(@@output_dir, "#{filename}.json"), start1, repeat1, end1, repeat2)
    write_half_miss(File.join(@@output_dir, "#{filename}-miss.json"), start1, repeat1, end1, repeat2)
  end

  # Fill the whole file with repeat1 blocks between start1 and end1.
  def write_full(filename, start1, repeat1, end1)
    puts "Writing #{filename} ..."
    File.open(filename, "w") do |file|
      write_chunks(file, start1, repeat1, end1, @@file_size)
    end
    raise "OMG wrong file size #{File.size(filename)} (should be #{@@file_size})" if File.size(filename) != @@file_size
  end

  # First half of the file uses repeat1 blocks, second half repeat2 blocks.
  def write_half(filename, start1, repeat1, end1, repeat2)
    # repeat1 is already represented in start1 and end1, so it doesn't need quite
    # half the iterations.
    repeat1_len = (@@file_size/2) - start1.bytesize - end1.bytesize
    halfway_point = start1.bytesize + repeat1_len + repeat2.bytesize

    puts "Writing #{filename} ..."
    File.open(filename, "w") do |file|
      write_chunks(file, start1, repeat1, repeat2, halfway_point)
      write_chunks(file, repeat2, repeat2, end1, @@file_size-halfway_point)
    end
    raise "OMG wrong file size #{File.size(filename)} (should be #{@@file_size})" if File.size(filename) != @@file_size
  end

  # Interleave repeat1/repeat2 following a precomputed random 0/1 template so
  # the block-type branch is unpredictable for the CPU's branch predictor.
  def write_half_miss(filename, start1, repeat1, end1, repeat2)
    miss_template = Array(File.read(File.join(@@miss_templates, "#{repeat1.bytesize}.txt")).chomp.split("\n"))
    # Take the start and end out of the template
    repeat_template = miss_template[(start1.bytesize/64)..(-end1.bytesize/64-1)]
    # If repeat is 128 bytes, each *pair* of elements is set. Use that.
    repeat_chunks = repeat1.bytesize/64
    repeat_template = (repeat_chunks - 1).step(repeat_template.size - 1, repeat_chunks).map { |i| repeat_template[i] }

    puts "Writing #{filename} ..."
    File.open(filename, "w") do |file|
      file.write(start1)
      repeat_template.each do |should_repeat|
        file.write(should_repeat == "1" ? repeat1 : repeat2)
      end
      file.write(end1)
    end
    raise "OMG wrong file size #{File.size(filename)} (should be #{@@file_size})" if File.size(filename) != @@file_size
  end

  # Write start1, then repeat1 blocks until size-end1.bytesize is reached,
  # then end1. Returns the number of bytes written.
  def write_chunks(file, start1, repeat1, end1, size)
    pos = 0
    file.write(start1)
    pos += start1.bytesize

    repeat_end = size-end1.bytesize
    loop do
      file.write(repeat1)
      pos += repeat1.bytesize
      break if pos >= repeat_end
    end

    file.write(end1)
    pos += end1.bytesize
    return pos
  end
end
|
||||
|
||||
# Generate the benchmark inputs for ./benchfeatures into jsonexamples/generated.
output_dir = File.expand_path("../jsonexamples/generated", File.dirname(__FILE__))
miss_templates = File.expand_path("miss-templates", File.dirname(__FILE__))
Dir.mkdir(output_dir) unless File.directory?(output_dir)
w = ChunkWriter.new(output_dir, miss_templates)
# Each call emits <name>-full.json, <name>.json and <name>-miss.json; the
# commented-out scenarios are kept for reference but not currently generated.
w.write_files "utf-8", '["֏","֏",{}', ',"֏","֏",{}', ',"֏","֏","֏"]', repeat2: ',"ab","ab",{}'
w.write_files "0-structurals", '"ab"', '', ''
# w.write_files "1-structurals", [ '[', '"ab"' ], [ ',', '"ab"' ], [ ',', '{', '}', ']' ]
# w.write_files "2-structurals", '["ab"', ',"ab"', [',{', '}]']
# w.write_files "3-structurals", '[{}', ',{}', ',"ab"]'
# w.write_files "4-structurals", '["ab","ab"', ',"ab","ab"', ',{}]'
# w.write_files "5-structurals", '["ab",{}', ',"ab",{}', ',"ab","ab"]'
# w.write_files "6-structurals", '["ab","ab","ab"', ',"ab","ab","ab"', ',"ab",{}]'
w.write_files "7-structurals", '["ab","ab",{}', ',"ab","ab",{}', ',"ab","ab","ab"]'
# w.write_files "8-structurals", '["ab","ab","ab","ab"', ',"ab","ab","ab","ab"', ',"ab","ab",{}]'
# w.write_files "9-structurals", '["ab","ab","ab",{}', ',"ab","ab","ab",{}', ',"ab","ab","ab","ab"]'
# w.write_files "10-structurals", '["ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab","ab"', ',"ab","ab","ab",{}]'
# w.write_files "11-structurals", '["ab","ab","ab","ab",{}', ',"ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab"]'
# w.write_files "12-structurals", '["ab","ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab",{}]'
# w.write_files "13-structurals", '["ab","ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab","ab"]'
# w.write_files "14-structurals", '["ab","ab","ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab","ab",{}]'
w.write_files "15-structurals", '["ab","ab","ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab","ab","ab"]'
# w.write_files "16-structurals", '["ab","ab","ab","ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab","ab","ab",{}]'
# w.write_files "17-structurals", '["ab","ab","ab","ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab","ab","ab","ab"]'
# w.write_files "18-structurals", '["ab","ab","ab","ab","ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab","ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab","ab","ab","ab",{}]'
# w.write_files "19-structurals", '["ab","ab","ab","ab","ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab","ab","ab","ab","ab"]'
# w.write_files "20-structurals", '["ab","ab","ab","ab","ab","ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab","ab","ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab","ab","ab","ab","ab",{}]'
# w.write_files "21-structurals", '["ab","ab","ab","ab","ab","ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab","ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab","ab","ab","ab","ab","ab"]'
# w.write_files "22-structurals", '["ab","ab","ab","ab","ab","ab","ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab","ab","ab","ab","ab","ab","ab","ab"', ',"ab","ab","ab","ab","ab","ab","ab","ab","ab",{}]'
w.write_files "23-structurals", '["ab","ab","ab","ab","ab","ab","ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab","ab","ab","ab","ab","ab",{}', ',"ab","ab","ab","ab","ab","ab","ab","ab","ab","ab","ab"]'
|
|
@ -0,0 +1,49 @@
|
|||
# Search seeds 1..1,000,000 for Random seeds whose simulated block sequence
# hits exactly 50% type-1 blocks with 25% flip transitions, for a layout with
# the given number of start/repeat/end blocks. Prints each matching seed as it
# is found, then the full list at the end.
# FIX: removed the unused local `closest_flips` (assigned nil, never read).
def gen_seeds(start_blocks, repeat_blocks, end_blocks)
  total_size = 640*1000
  total_blocks = total_size/64
  seed_space = 1..1000000
  target_blocks = total_blocks*0.5   # want half of all blocks to be type 1
  target_flips = total_blocks*0.25   # want a quarter of blocks to flip type
  percent_flips = 0.25*repeat_blocks # per-repeat flip probability

  puts "Seeds for #{start_blocks} start blocks, #{end_blocks} end blocks and #{repeat_blocks} repeat blocks: #{percent_flips*100}% flips"
  closest_seeds = []
  seed_space.each do |seed|
    r = Random.new(seed)
    # First block is always type 1
    flips = 0
    type1 = true
    type1_blocks = start_blocks
    finished_blocks = start_blocks
    last_repeat = total_blocks-end_blocks
    while finished_blocks < last_repeat
      if r.rand < percent_flips
        flips += 1
        type1 = !type1
      end
      type1_blocks += repeat_blocks if type1
      finished_blocks += repeat_blocks
    end

    # Last one is always type 1
    flips += 1 if !type1
    type1 = true
    type1_blocks += end_blocks
    finished_blocks += end_blocks

    raise "simulated the wrong number of blocks #{finished_blocks}" if finished_blocks != total_blocks

    if type1_blocks == target_blocks
      if flips == target_flips
        puts seed
        closest_seeds << seed
      end
    end
  end
  puts closest_seeds
end
|
||||
|
||||
# Find seeds for the three block layouts used by the generated benchmarks.
gen_seeds(1,1,1)
gen_seeds(1,1,2)
gen_seeds(2,2,4)
|
|
@ -0,0 +1,113 @@
|
|||
#ifndef __JSON_PARSER_H
|
||||
#define __JSON_PARSER_H
|
||||
|
||||
#include <cassert>
|
||||
#include <cctype>
|
||||
#ifndef _MSC_VER
|
||||
#include <dirent.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include <cinttypes>
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "linux-perf-events.h"
|
||||
#ifdef __linux__
|
||||
#include <libgen.h>
|
||||
#endif
|
||||
//#define DEBUG
|
||||
#include "simdjson/common_defs.h"
|
||||
#include "simdjson/isadetection.h"
|
||||
#include "simdjson/jsonioutil.h"
|
||||
#include "simdjson/jsonparser.h"
|
||||
#include "simdjson/parsedjson.h"
|
||||
#include "simdjson/stage1_find_marks.h"
|
||||
#include "simdjson/stage2_build_tape.h"
|
||||
|
||||
using namespace simdjson;
|
||||
using std::string;
|
||||
|
||||
using stage2_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj);
|
||||
using stage1_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj);
|
||||
|
||||
// Resolve the stage 1 (find_structural_bits) implementation for the given
// architecture; prints an error and exits if the architecture has no
// implementation compiled in.
stage1_functype* get_stage1_func(const Architecture architecture) {
#ifdef IS_X86_64
  if (architecture == Architecture::HASWELL) {
    return &find_structural_bits<Architecture::HASWELL>;
  }
  if (architecture == Architecture::WESTMERE) {
    return &find_structural_bits<Architecture::WESTMERE>;
  }
#endif
#ifdef IS_ARM64
  if (architecture == Architecture::ARM64) {
    return &find_structural_bits<Architecture::ARM64>;
  }
#endif
  // Unsupported architecture: same fatal-error path as before.
  std::cerr << "The processor is not supported by simdjson." << std::endl;
  exit(EXIT_FAILURE);
}
|
||||
|
||||
// Resolve the stage 2 (unified_machine) implementation for the given
// architecture; prints an error and exits if the architecture has no
// implementation compiled in.
// FIX: removed the unreachable `break;` statements that followed each
// `return` (dead code), matching the style of get_stage1_func.
stage2_functype* get_stage2_func(const Architecture architecture) {
  switch (architecture) {
#ifdef IS_X86_64
  case Architecture::HASWELL:
    return &unified_machine<Architecture::HASWELL>;
  case Architecture::WESTMERE:
    return &unified_machine<Architecture::WESTMERE>;
#endif
#ifdef IS_ARM64
  case Architecture::ARM64:
    return &unified_machine<Architecture::ARM64>;
#endif
  default:
    std::cerr << "The processor is not supported by simdjson." << std::endl;
    exit(EXIT_FAILURE);
  }
}
|
||||
|
||||
struct json_parser {
|
||||
const Architecture architecture;
|
||||
const stage1_functype *stage1_func;
|
||||
const stage2_functype *stage2_func;
|
||||
|
||||
json_parser(const Architecture _architecture) : architecture(_architecture) {
|
||||
this->stage1_func = get_stage1_func(architecture);
|
||||
this->stage2_func = get_stage2_func(architecture);
|
||||
}
|
||||
json_parser() : json_parser(find_best_supported_architecture()) {}
|
||||
|
||||
int stage1(const uint8_t *buf, const size_t len, ParsedJson &pj) const {
|
||||
return this->stage1_func(buf, len, pj);
|
||||
}
|
||||
int stage2(const uint8_t *buf, const size_t len, ParsedJson &pj) const {
|
||||
return this->stage2_func(buf, len, pj);
|
||||
}
|
||||
|
||||
int parse(const uint8_t *buf, const size_t len, ParsedJson &pj) const {
|
||||
int result = this->stage1(buf, len, pj);
|
||||
if (result == SUCCESS) {
|
||||
result = this->stage2(buf, len, pj);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
|
@ -83,6 +83,10 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
// True when the perf counter file descriptors opened successfully and the
// counters can actually be read (i.e. no error was reported during setup).
bool is_working() {
return working;
}
|
||||
|
||||
private:
|
||||
void report_error(const std::string &context) {
|
||||
if (working)
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -1,3 +1,6 @@
|
|||
#include "json_parser.h"
|
||||
#include "event_counter.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cctype>
|
||||
#ifndef _MSC_VER
|
||||
|
@ -35,405 +38,179 @@
|
|||
#include "simdjson/stage1_find_marks.h"
|
||||
#include "simdjson/stage2_build_tape.h"
|
||||
|
||||
// Global arguments
|
||||
bool find_marks_only = false;
|
||||
bool verbose = false;
|
||||
bool dump = false;
|
||||
bool json_output = false;
|
||||
bool force_one_iteration = false;
|
||||
bool just_data = false;
|
||||
bool force_sse = false;
|
||||
int32_t iterations = -1;
|
||||
int32_t warmup_iterations = -1;
|
||||
#include <functional>
|
||||
|
||||
namespace simdjson {
|
||||
Architecture _find_best_supported_implementation() {
|
||||
constexpr uint32_t haswell_flags =
|
||||
instruction_set::AVX2 | instruction_set::PCLMULQDQ |
|
||||
instruction_set::BMI1 | instruction_set::BMI2;
|
||||
constexpr uint32_t westmere_flags =
|
||||
instruction_set::SSE42 | instruction_set::PCLMULQDQ;
|
||||
uint32_t supports = detect_supported_architectures();
|
||||
// Order from best to worst (within architecture)
|
||||
if ((haswell_flags & supports) == haswell_flags && !force_sse) {
|
||||
return Architecture::HASWELL;
|
||||
}
|
||||
if ((westmere_flags & supports) == westmere_flags) {
|
||||
return Architecture::WESTMERE;
|
||||
}
|
||||
if (instruction_set::NEON)
|
||||
return Architecture::ARM64;
|
||||
#include "benchmarker.h"
|
||||
|
||||
return Architecture::NONE;
|
||||
using namespace simdjson;
|
||||
using std::cerr;
|
||||
using std::cout;
|
||||
using std::endl;
|
||||
using std::string;
|
||||
using std::to_string;
|
||||
using std::vector;
|
||||
using std::ostream;
|
||||
using std::ofstream;
|
||||
using std::exception;
|
||||
|
||||
// Stash the exe_name in main() for functions to use
|
||||
char* exe_name;
|
||||
|
||||
// Print the command-line help text for this tool to the given stream.
void print_usage(ostream& out) {
  out << "Usage: " << exe_name << " [-vt] [-n #] [-s STAGE] [-a ARCH] <jsonfile> ..." << endl
      << endl
      << "Runs the parser against the given json files in a loop, measuring speed and other statistics." << endl
      << endl
      << "Options:" << endl
      << endl
      << "-n # - Number of iterations per file. Default: 200" << endl
      << "-i # - Number of times to iterate a single file before moving to the next. Default: 20" << endl
      << "-t - Tabbed data output" << endl
      << "-v - Verbose output." << endl
      << "-s STAGE - Stop after the given stage." << endl
      << " -s stage1 - Stop after find_structural_bits." << endl
      << " -s all - Run all stages." << endl
      << "-a ARCH - Use the parser with the designated architecture (HASWELL, WESTMERE" << endl
      << " or ARM64). By default, detects best supported architecture." << endl;
}
|
||||
|
||||
using unified_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj);
|
||||
using stage1_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj);
|
||||
|
||||
extern unified_functype *unified_ptr;
|
||||
|
||||
extern stage1_functype *stage1_ptr;
|
||||
|
||||
int unified_machine_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj) {
|
||||
if (find_marks_only) {
|
||||
return simdjson::SUCCESS;
|
||||
}
|
||||
Architecture best_implementation = _find_best_supported_implementation();
|
||||
// Selecting the best implementation
|
||||
switch (best_implementation) {
|
||||
#ifdef IS_X86_64
|
||||
case Architecture::HASWELL:
|
||||
unified_ptr = &unified_machine<Architecture::HASWELL>;
|
||||
break;
|
||||
case Architecture::WESTMERE:
|
||||
unified_ptr = &unified_machine<Architecture::WESTMERE>;
|
||||
break;
|
||||
#endif
|
||||
#ifdef IS_ARM64
|
||||
case Architecture::ARM64:
|
||||
unified_ptr = &unified_machine<Architecture::ARM64>;
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
std::cerr << "The processor is not supported by simdjson." << std::endl;
|
||||
return simdjson::UNEXPECTED_ERROR;
|
||||
}
|
||||
|
||||
return unified_ptr(buf, len, pj);
|
||||
// Print an error message followed by the usage help, then terminate with a
// failure exit code.
void exit_usage(string message) {
  cerr << message << endl << endl;
  print_usage(cerr);
  exit(EXIT_FAILURE);
}
|
||||
|
||||
// Responsible to select the best json_parse implementation
|
||||
int find_structural_bits_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj) {
|
||||
Architecture best_implementation = _find_best_supported_implementation();
|
||||
// Selecting the best implementation
|
||||
switch (best_implementation) {
|
||||
#ifdef IS_X86_64
|
||||
case Architecture::HASWELL:
|
||||
stage1_ptr = &find_structural_bits<Architecture::HASWELL>;
|
||||
break;
|
||||
case Architecture::WESTMERE:
|
||||
stage1_ptr = &find_structural_bits<Architecture::WESTMERE>;
|
||||
break;
|
||||
#endif
|
||||
#ifdef IS_ARM64
|
||||
case Architecture::ARM64:
|
||||
stage1_ptr = &find_structural_bits<Architecture::ARM64>;
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
std::cerr << "The processor is not supported by simdjson." << std::endl;
|
||||
return simdjson::UNEXPECTED_ERROR;
|
||||
struct option_struct {
|
||||
vector<char*> files;
|
||||
Architecture architecture = Architecture::UNSUPPORTED;
|
||||
bool stage1_only = false;
|
||||
|
||||
int32_t iterations = 200;
|
||||
int32_t iteration_step = 50;
|
||||
|
||||
bool verbose = false;
|
||||
bool tabbed_output = false;
|
||||
|
||||
option_struct(int argc, char **argv) {
|
||||
#ifndef _MSC_VER
|
||||
int c;
|
||||
|
||||
while ((c = getopt(argc, argv, "vtn:i:a:s:")) != -1) {
|
||||
switch (c) {
|
||||
case 'n':
|
||||
iterations = atoi(optarg);
|
||||
break;
|
||||
case 'i':
|
||||
iteration_step = atoi(optarg);
|
||||
break;
|
||||
case 't':
|
||||
tabbed_output = true;
|
||||
break;
|
||||
case 'v':
|
||||
verbose = true;
|
||||
break;
|
||||
case 'a':
|
||||
architecture = parse_architecture(optarg);
|
||||
if (architecture == Architecture::UNSUPPORTED) {
|
||||
exit_usage(string("Unsupported option value -a ") + optarg + ": expected -a HASWELL, WESTMERE or ARM64");
|
||||
}
|
||||
break;
|
||||
case 's':
|
||||
if (!strcmp(optarg, "stage1")) {
|
||||
stage1_only = true;
|
||||
} else if (!strcmp(optarg, "all")) {
|
||||
stage1_only = false;
|
||||
} else {
|
||||
exit_usage(string("Unsupported option value -s ") + optarg + ": expected -s stage1 or all");
|
||||
}
|
||||
break;
|
||||
default:
|
||||
exit_error("Unexpected argument " + c);
|
||||
}
|
||||
}
|
||||
#else
|
||||
int optind = 1;
|
||||
#endif
|
||||
|
||||
// If architecture is not specified, pick the best supported architecture by default
|
||||
if (architecture == Architecture::UNSUPPORTED) {
|
||||
architecture = find_best_supported_architecture();
|
||||
}
|
||||
|
||||
// All remaining arguments are considered to be files
|
||||
for (int i=optind; i<argc; i++) {
|
||||
files.push_back(argv[i]);
|
||||
}
|
||||
if (files.empty()) {
|
||||
exit_usage("No files specified");
|
||||
}
|
||||
|
||||
// Keeps the numbers the same for CI (old ./parse didn't have a two-stage loop)
|
||||
if (files.size() == 1) {
|
||||
iteration_step = iterations;
|
||||
}
|
||||
|
||||
#if !defined(__linux__)
|
||||
if (tabbed_output) {
|
||||
exit_error("tabbed_output (-t) flag only works under linux.\n");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
return stage1_ptr(buf, len, pj);
|
||||
}
|
||||
|
||||
stage1_functype *stage1_ptr = &find_structural_bits_dispatch;
|
||||
unified_functype *unified_ptr = &unified_machine_dispatch;
|
||||
} // namespace simdjson
|
||||
};
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
#ifndef _MSC_VER
|
||||
int c;
|
||||
|
||||
while ((c = getopt(argc, argv, "1vdtn:w:fs")) != -1) {
|
||||
switch (c) {
|
||||
case 'n':
|
||||
iterations = atoi(optarg);
|
||||
break;
|
||||
case 'w':
|
||||
warmup_iterations = atoi(optarg);
|
||||
break;
|
||||
case 's':
|
||||
force_sse = true;
|
||||
break;
|
||||
case 't':
|
||||
just_data = true;
|
||||
break;
|
||||
case 'v':
|
||||
verbose = true;
|
||||
break;
|
||||
case 'd':
|
||||
dump = true;
|
||||
break;
|
||||
case 'j':
|
||||
json_output = true;
|
||||
break;
|
||||
case '1':
|
||||
force_one_iteration = true;
|
||||
break;
|
||||
case 'f':
|
||||
find_marks_only = true;
|
||||
break;
|
||||
default:
|
||||
abort();
|
||||
}
|
||||
}
|
||||
#else
|
||||
int optind = 1;
|
||||
#endif
|
||||
if (optind >= argc) {
|
||||
std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
const char *filename = argv[optind];
|
||||
if (optind + 1 < argc) {
|
||||
std::cerr << "warning: ignoring everything after " << argv[optind + 1]
|
||||
<< std::endl;
|
||||
}
|
||||
if (verbose) {
|
||||
std::cout << "[verbose] loading " << filename << std::endl;
|
||||
}
|
||||
simdjson::padded_string p;
|
||||
try {
|
||||
simdjson::get_corpus(filename).swap(p);
|
||||
} catch (const std::exception &) { // caught by reference to base
|
||||
std::cout << "Could not load the file " << filename << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
if (verbose) {
|
||||
std::cout << "[verbose] loaded " << filename << " (" << p.size()
|
||||
<< " bytes)" << std::endl;
|
||||
}
|
||||
if (iterations == -1) {
|
||||
#if defined(DEBUG)
|
||||
iterations = 1;
|
||||
#else
|
||||
iterations = force_one_iteration ? 1 : (p.size() < 1 * 1000 * 1000 ? 1000 : 10);
|
||||
#endif
|
||||
}
|
||||
if (warmup_iterations == -1) {
|
||||
#if defined(DEBUG)
|
||||
warmup_iterations = 0;
|
||||
#else
|
||||
warmup_iterations = (p.size() < 1 * 1000 * 1000) ? 10 : 1;
|
||||
#endif
|
||||
// Read options
|
||||
exe_name = argv[0];
|
||||
option_struct options(argc, argv);
|
||||
if (options.verbose) {
|
||||
verbose_stream = &cout;
|
||||
}
|
||||
|
||||
std::vector<double> res;
|
||||
res.resize(iterations);
|
||||
if (!just_data)
|
||||
printf("number of iterations %u \n", iterations);
|
||||
#if !defined(__linux__)
|
||||
#define SQUASH_COUNTERS
|
||||
if (just_data) {
|
||||
printf("just_data (-t) flag only works under linux.\n");
|
||||
}
|
||||
#endif
|
||||
{ // practice run
|
||||
simdjson::ParsedJson pj;
|
||||
bool allocok = pj.allocate_capacity(p.size());
|
||||
if (allocok) {
|
||||
simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj);
|
||||
simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj);
|
||||
}
|
||||
}
|
||||
#ifndef SQUASH_COUNTERS
|
||||
std::vector<int> evts;
|
||||
evts.push_back(PERF_COUNT_HW_CPU_CYCLES);
|
||||
evts.push_back(PERF_COUNT_HW_INSTRUCTIONS);
|
||||
evts.push_back(PERF_COUNT_HW_BRANCH_MISSES);
|
||||
evts.push_back(PERF_COUNT_HW_CACHE_REFERENCES);
|
||||
evts.push_back(PERF_COUNT_HW_CACHE_MISSES);
|
||||
LinuxEvents<PERF_TYPE_HARDWARE> unified(evts);
|
||||
std::vector<unsigned long long> results;
|
||||
results.resize(evts.size());
|
||||
unsigned long cy0 = 0, cy1 = 0, cy2 = 0;
|
||||
unsigned long cl0 = 0, cl1 = 0, cl2 = 0;
|
||||
unsigned long mis0 = 0, mis1 = 0, mis2 = 0;
|
||||
unsigned long cref0 = 0, cref1 = 0, cref2 = 0;
|
||||
unsigned long cmis0 = 0, cmis1 = 0, cmis2 = 0;
|
||||
#endif
|
||||
// Start collecting events. We put this early so if it prints an error message, it's the
|
||||
// first thing printed.
|
||||
event_collector collector;
|
||||
|
||||
// Do warmup iterations
|
||||
bool isok = true;
|
||||
for (int32_t i = 0; i < warmup_iterations; i++) {
|
||||
if (verbose) {
|
||||
std::cout << "[verbose] warmup iteration # " << i << std::endl;
|
||||
}
|
||||
simdjson::ParsedJson pj;
|
||||
bool allocok = pj.allocate_capacity(p.size());
|
||||
if (!allocok) {
|
||||
std::cerr << "failed to allocate memory" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
isok = (simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj) ==
|
||||
simdjson::SUCCESS);
|
||||
isok = isok &&
|
||||
(simdjson::SUCCESS ==
|
||||
simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj));
|
||||
if (!isok) {
|
||||
std::cerr << pj.get_error_message() << std::endl;
|
||||
std::cerr << "Could not parse. " << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
// Print preamble
|
||||
if (!options.tabbed_output) {
|
||||
printf("number of iterations %u \n", options.iterations);
|
||||
}
|
||||
|
||||
#ifndef SQUASH_COUNTERS
|
||||
for (int32_t i = 0; i < iterations; i++) {
|
||||
if (verbose) {
|
||||
std::cout << "[verbose] iteration # " << i << std::endl;
|
||||
}
|
||||
unified.start();
|
||||
simdjson::ParsedJson pj;
|
||||
bool allocok = pj.allocate_capacity(p.size());
|
||||
if (!allocok) {
|
||||
std::cerr << "failed to allocate memory" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
unified.end(results);
|
||||
cy0 += results[0];
|
||||
cl0 += results[1];
|
||||
mis0 += results[2];
|
||||
cref0 += results[3];
|
||||
cmis0 += results[4];
|
||||
if (verbose) {
|
||||
std::cout << "[verbose] allocated memory for parsed JSON " << std::endl;
|
||||
}
|
||||
unified.start();
|
||||
isok = (simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj) ==
|
||||
simdjson::SUCCESS);
|
||||
unified.end(results);
|
||||
cy1 += results[0];
|
||||
cl1 += results[1];
|
||||
mis1 += results[2];
|
||||
cref1 += results[3];
|
||||
cmis1 += results[4];
|
||||
if (!isok) {
|
||||
std::cout << "Failed during stage 1" << std::endl;
|
||||
break;
|
||||
}
|
||||
unified.start();
|
||||
isok = isok &&
|
||||
(simdjson::SUCCESS ==
|
||||
simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj));
|
||||
unified.end(results);
|
||||
cy2 += results[0];
|
||||
cl2 += results[1];
|
||||
mis2 += results[2];
|
||||
cref2 += results[3];
|
||||
cmis2 += results[4];
|
||||
if (!isok) {
|
||||
std::cout << "Failed during stage 2" << std::endl;
|
||||
break;
|
||||
}
|
||||
// Set up benchmarkers by reading all files
|
||||
json_parser parser(options.architecture);
|
||||
vector<benchmarker*> benchmarkers;
|
||||
for (size_t i=0; i<options.files.size(); i++) {
|
||||
benchmarkers.push_back(new benchmarker(options.files[i], parser, collector));
|
||||
}
|
||||
#endif
|
||||
|
||||
// we do it again, this time just measuring the elapsed time
|
||||
for (int32_t i = 0; i < iterations; i++) {
|
||||
if (verbose) {
|
||||
std::cout << "[verbose] iteration # " << i << std::endl;
|
||||
}
|
||||
simdjson::ParsedJson pj;
|
||||
bool allocok = pj.allocate_capacity(p.size());
|
||||
if (!allocok) {
|
||||
std::cerr << "failed to allocate memory" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
if (verbose) {
|
||||
std::cout << "[verbose] allocated memory for parsed JSON " << std::endl;
|
||||
}
|
||||
|
||||
auto start = std::chrono::steady_clock::now();
|
||||
isok = (simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj) ==
|
||||
simdjson::SUCCESS);
|
||||
isok = isok &&
|
||||
(simdjson::SUCCESS ==
|
||||
simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj));
|
||||
auto end = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double> secs = end - start;
|
||||
res[i] = secs.count();
|
||||
if (!isok) {
|
||||
std::cerr << pj.get_error_message() << std::endl;
|
||||
std::cerr << "Could not parse. " << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
simdjson::ParsedJson pj =
|
||||
build_parsed_json(p); // do the parsing again to get the stats
|
||||
if (!pj.is_valid()) {
|
||||
std::cerr << pj.get_error_message() << std::endl;
|
||||
std::cerr << "Could not parse. " << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
double min_result = *min_element(res.begin(), res.end());
|
||||
double speedinGBs = (p.size()) / (min_result * 1000000000.0);
|
||||
#ifndef SQUASH_COUNTERS
|
||||
unsigned long total = cy0 + cy1 + cy2;
|
||||
if (just_data) {
|
||||
float cpb0 = (double)cy0 / (iterations * p.size());
|
||||
float cpb1 = (double)cy1 / (iterations * p.size());
|
||||
float cpb2 = (double)cy2 / (iterations * p.size());
|
||||
float cpbtotal = (double)total / (iterations * p.size());
|
||||
char *newfile = (char *)malloc(strlen(filename) + 1);
|
||||
if (newfile == NULL) {
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
::strcpy(newfile, filename);
|
||||
char *snewfile = ::basename(newfile);
|
||||
size_t nl = strlen(snewfile);
|
||||
for (size_t j = nl - 1; j > 0; j--) {
|
||||
if (snewfile[j] == '.') {
|
||||
snewfile[j] = '\0';
|
||||
break;
|
||||
// Run the benchmarks
|
||||
progress_bar progress(options.iterations, 50);
|
||||
// Put the if (options.stage1_only) *outside* the loop so that run_iterations will be optimized
|
||||
if (options.stage1_only) {
|
||||
for (int iteration = 0; iteration < options.iterations; iteration += options.iteration_step) {
|
||||
if (!options.verbose) { progress.print(iteration); }
|
||||
// Benchmark each file once per iteration
|
||||
for (size_t f=0; f<options.files.size(); f++) {
|
||||
verbose() << "[verbose] " << benchmarkers[f]->filename << " iterations #" << iteration << "-" << (iteration+options.iteration_step-1) << endl;
|
||||
benchmarkers[f]->run_iterations(options.iteration_step, true);
|
||||
}
|
||||
}
|
||||
printf("\"%s\"\t%f\t%f\t%f\t%f\t%f\n", snewfile, cpb0, cpb1, cpb2, cpbtotal,
|
||||
speedinGBs);
|
||||
free(newfile);
|
||||
} else {
|
||||
printf("number of bytes %ld number of structural chars %u ratio %.3f\n",
|
||||
p.size(), pj.n_structural_indexes,
|
||||
(double)pj.n_structural_indexes / p.size());
|
||||
printf("mem alloc instructions: %10lu cycles: %10lu (%.2f %%) ins/cycles: "
|
||||
"%.2f mis. branches: %10lu (cycles/mis.branch %.2f) cache accesses: "
|
||||
"%10lu (failure %10lu)\n",
|
||||
cl0 / iterations, cy0 / iterations, 100. * cy0 / total,
|
||||
(double)cl0 / cy0, mis0 / iterations, (double)cy0 / mis0,
|
||||
cref1 / iterations, cmis0 / iterations);
|
||||
printf(" mem alloc runs at %.2f cycles per input byte.\n",
|
||||
(double)cy0 / (iterations * p.size()));
|
||||
printf("stage 1 instructions: %10lu cycles: %10lu (%.2f %%) ins/cycles: "
|
||||
"%.2f mis. branches: %10lu (cycles/mis.branch %.2f) cache accesses: "
|
||||
"%10lu (failure %10lu)\n",
|
||||
cl1 / iterations, cy1 / iterations, 100. * cy1 / total,
|
||||
(double)cl1 / cy1, mis1 / iterations, (double)cy1 / mis1,
|
||||
cref1 / iterations, cmis1 / iterations);
|
||||
printf(" stage 1 runs at %.2f cycles per input byte.\n",
|
||||
(double)cy1 / (iterations * p.size()));
|
||||
for (int iteration = 0; iteration < options.iterations; iteration += options.iteration_step) {
|
||||
if (!options.verbose) { progress.print(iteration); }
|
||||
// Benchmark each file once per iteration
|
||||
for (size_t f=0; f<options.files.size(); f++) {
|
||||
verbose() << "[verbose] " << benchmarkers[f]->filename << " iterations #" << iteration << "-" << (iteration+options.iteration_step-1) << endl;
|
||||
benchmarkers[f]->run_iterations(options.iteration_step, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!options.verbose) { progress.erase(); }
|
||||
|
||||
printf("stage 2 instructions: %10lu cycles: %10lu (%.2f %%) ins/cycles: "
|
||||
"%.2f mis. branches: %10lu (cycles/mis.branch %.2f) cache "
|
||||
"accesses: %10lu (failure %10lu)\n",
|
||||
cl2 / iterations, cy2 / iterations, 100. * cy2 / total,
|
||||
(double)cl2 / cy2, mis2 / iterations, (double)cy2 / mis2,
|
||||
cref2 / iterations, cmis2 / iterations);
|
||||
printf(" stage 2 runs at %.2f cycles per input byte and ",
|
||||
(double)cy2 / (iterations * p.size()));
|
||||
printf("%.2f cycles per structural character.\n",
|
||||
(double)cy2 / (iterations * pj.n_structural_indexes));
|
||||
for (size_t i=0; i<options.files.size(); i++) {
|
||||
benchmarkers[i]->print(options.tabbed_output);
|
||||
delete benchmarkers[i];
|
||||
}
|
||||
|
||||
printf(" all stages: %.2f cycles per input byte.\n",
|
||||
(double)total / (iterations * p.size()));
|
||||
printf("Estimated average frequency: %.3f GHz.\n",
|
||||
(double)total / (iterations * min_result * 1000000000.0));
|
||||
}
|
||||
#endif
|
||||
if (!just_data) {
|
||||
std::cout << "Min: " << min_result << " bytes read: " << p.size()
|
||||
<< " Gigabytes/second: " << speedinGBs << std::endl;
|
||||
}
|
||||
if (json_output) {
|
||||
isok = isok && pj.print_json(std::cout);
|
||||
}
|
||||
if (dump) {
|
||||
isok = isok && pj.dump_raw_tape(std::cout);
|
||||
}
|
||||
if (!isok) {
|
||||
fprintf(stderr, " Parsing failed. \n ");
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
|
|
@ -6,10 +6,10 @@
|
|||
namespace simdjson {
|
||||
// Represents the minimal architecture that would support an implementation
|
||||
enum class Architecture {
|
||||
UNSUPPORTED,
|
||||
WESTMERE,
|
||||
HASWELL,
|
||||
ARM64,
|
||||
NONE,
|
||||
// TODO remove 'native' in favor of runtime dispatch?
|
||||
// the 'native' enum class value should point at a good default on the current
|
||||
// machine
|
||||
|
@ -20,6 +20,9 @@ enum class Architecture {
|
|||
#endif
|
||||
};
|
||||
|
||||
Architecture find_best_supported_architecture();
|
||||
Architecture parse_architecture(char *architecture);
|
||||
|
||||
enum ErrorValues {
|
||||
SUCCESS = 0,
|
||||
SUCCESS_AND_HAS_MORE, //No errors and buffer still has more data
|
||||
|
|
|
@ -29,7 +29,7 @@ int json_parse(const char *buf, size_t len, ParsedJson &pj,
|
|||
realloc);
|
||||
}
|
||||
|
||||
Architecture find_best_supported_implementation() {
|
||||
Architecture find_best_supported_architecture() {
|
||||
constexpr uint32_t haswell_flags =
|
||||
instruction_set::AVX2 | instruction_set::PCLMULQDQ |
|
||||
instruction_set::BMI1 | instruction_set::BMI2;
|
||||
|
@ -45,13 +45,20 @@ Architecture find_best_supported_implementation() {
|
|||
if (supports & instruction_set::NEON)
|
||||
return Architecture::ARM64;
|
||||
|
||||
return Architecture::NONE;
|
||||
return Architecture::UNSUPPORTED;
|
||||
}
|
||||
|
||||
Architecture parse_architecture(char *architecture) {
|
||||
if (!strcmp(architecture, "HASWELL")) { return Architecture::HASWELL; }
|
||||
if (!strcmp(architecture, "WESTMERE")) { return Architecture::WESTMERE; }
|
||||
if (!strcmp(architecture, "ARM64")) { return Architecture::ARM64; }
|
||||
return Architecture::UNSUPPORTED;
|
||||
}
|
||||
|
||||
// Responsible to select the best json_parse implementation
|
||||
int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj,
|
||||
bool realloc) {
|
||||
Architecture best_implementation = find_best_supported_implementation();
|
||||
Architecture best_implementation = find_best_supported_architecture();
|
||||
// Selecting the best implementation
|
||||
switch (best_implementation) {
|
||||
#ifdef IS_X86_64
|
||||
|
|
Loading…
Reference in New Issue