// simdjson/benchmark/benchfeatures.cpp

#include "event_counter.h"

#include <cassert>
#include <cctype>
#ifndef _MSC_VER
#include <dirent.h>
#include <unistd.h>
#endif
#include <cinttypes>
#include <initializer_list>

#include <cstdio>
#include <cstdlib>
#include <cstring>

#include <algorithm>
#include <chrono>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <vector>

#include "linux-perf-events.h"
#ifdef __linux__
#include <libgen.h>
#endif

#include "simdjson.h"

#include <functional>

#include "benchmarker.h"

using namespace simdjson;
using std::cerr;
using std::cout;
using std::endl;
using std::string;
using std::to_string;
using std::vector;
using std::ostream;
using std::ofstream;
using std::exception;

// Program name as invoked. main() assigns argv[0] to this before parsing options so that
// print_usage() can include it in the usage line.
char* exe_name;

// Writes the command-line help text to `out`.
//
// Reads the global exe_name for the synopsis line, so main() must have assigned it first.
// The defaults quoted here must stay in sync with the member initializers of option_struct
// (iterations = 400, iteration_step = 50).
void print_usage(ostream& out) {
  out << "Usage: " << exe_name << " [-v] [-n #] [-i #] [-s STAGE] [-a ARCH]" << endl;
  out << endl;
  out << "Runs the parser against jsonexamples/generated json files in a loop, measuring speed and other statistics." << endl;
  out << endl;
  out << "Options:" << endl;
  out << endl;
  out << "-n #       - Number of iterations per file. Default: 400" << endl;
  out << "-i #       - Number of times to iterate a single file before moving to the next. Default: 50" << endl;
  out << "-v         - Verbose output." << endl;
  out << "-s STAGE   - Stop after the given stage." << endl;
  out << "             -s stage1 - Stop after find_structural_bits." << endl;
  out << "             -s all    - Run all stages." << endl;
  out << "-a ARCH    - Use the parser with the designated architecture (HASWELL, WESTMERE" << endl;
  out << "             or ARM64). By default, detects best supported architecture." << endl;
}

// Reports `message` on stderr, followed by a blank line and the usage text, then terminates
// the process with EXIT_FAILURE. This function does not return.
void exit_usage(string message) {
  ostream& err = cerr;
  err << message << endl << endl;
  print_usage(err);
  exit(EXIT_FAILURE);
}

// Command-line options for this benchmark. The constructor parses argv (via getopt() on
// non-MSVC builds), validates the values, and selects the parser implementation to use.
struct option_struct {
  // Implementation requested with -a. UNSUPPORTED doubles as "not specified"; the constructor
  // replaces it with find_best_supported_architecture() in that case.
  architecture arch = architecture::UNSUPPORTED;
  // Set by "-s stage1". main() forwards it to run_iterations() and each_stage() uses it to
  // limit reporting to STAGE1.
  bool stage1_only = false;

  // -n: total number of iterations per file.
  int32_t iterations = 400;
  // -i: iterations run on one file before moving on to the next one. Must be positive: main()
  // advances its loop counter by this amount.
  int32_t iteration_step = 50;

  // -v: verbose output.
  bool verbose = false;

  // Parses the options out of argc/argv. Any invalid option or option value prints a message
  // plus the usage text and exits the process with a failure status.
  option_struct(int argc, char **argv) {
    #ifndef _MSC_VER
      int c;

      while ((c = getopt(argc, argv, "vn:i:a:s:")) != -1) {
        switch (c) {
        case 'n':
          iterations = atoi(optarg);
          // atoi() yields 0 for non-numeric input; zero or negative counts would run nothing.
          if (iterations <= 0) {
            exit_usage(string("Unsupported option value -n ") + optarg + ": expected a positive integer");
          }
          break;
        case 'i':
          iteration_step = atoi(optarg);
          // A step of 0 (or less) would keep the benchmark loops in main() from ever finishing.
          if (iteration_step <= 0) {
            exit_usage(string("Unsupported option value -i ") + optarg + ": expected a positive integer");
          }
          break;
        case 'v':
          verbose = true;
          break;
        case 'a':
          arch = parse_architecture(optarg);
          if (arch == architecture::UNSUPPORTED) {
            exit_usage(string("Unsupported option value -a ") + optarg + ": expected -a HASWELL, WESTMERE or ARM64");
          }
          break;
        case 's':
          if (!strcmp(optarg, "stage1")) {
            stage1_only = true;
          } else if (!strcmp(optarg, "all")) {
            stage1_only = false;
          } else {
            exit_usage(string("Unsupported option value -s ") + optarg + ": expected -s stage1 or all");
          }
          break;
        default: {
          // getopt() returns '?' for an unknown option or a missing option value and leaves the
          // offending option character in optopt. Build the message as a std::string: adding an
          // int to a string literal is pointer arithmetic, not concatenation.
          const char offending = static_cast<char>(c == '?' ? optopt : c);
          exit_usage(string("Unexpected argument -") + offending);
          break;
        }
        }
      }
    #else
      // No getopt() on MSVC: the defaults above are used as-is.
      (void)argc;
      (void)argv;
    #endif

    // If architecture is not specified, pick the best supported architecture by default
    if (arch == architecture::UNSUPPORTED) {
      arch = find_best_supported_architecture();
    }
    document::parser::use_implementation(arch);
  }

  // Calls f(stage) for every stage that should be reported: STAGE1 always, then STAGE2 and ALL
  // unless only stage 1 was requested.
  template<typename F>
  void each_stage(const F& f) const {
    f(BenchmarkStage::STAGE1);
    if (!this->stage1_only) {
      f(BenchmarkStage::STAGE2);
      f(BenchmarkStage::ALL);
    }
  }

};

struct feature_benchmarker {
  benchmarker utf8;
  benchmarker utf8_miss;
  benchmarker escape;
  benchmarker escape_miss;
  benchmarker empty;
  benchmarker empty_miss;
  benchmarker struct7;
  benchmarker struct7_miss;
  benchmarker struct7_full;
  benchmarker struct15;
  benchmarker struct15_miss;
  benchmarker struct23;
  benchmarker struct23_miss;

  feature_benchmarker(const simdjson::implementation &parser, event_collector& collector) :
    utf8               ("jsonexamples/generated/utf-8.json", parser, collector),
    utf8_miss          ("jsonexamples/generated/utf-8-miss.json", parser, collector),
    escape               ("jsonexamples/generated/escape.json", parser, collector),
    escape_miss          ("jsonexamples/generated/escape-miss.json", parser, collector),
    empty              ("jsonexamples/generated/0-structurals.json", parser, collector),
    empty_miss         ("jsonexamples/generated/0-structurals-miss.json", parser, collector),
    struct7           ("jsonexamples/generated/7-structurals.json", parser, collector),
    struct7_miss      ("jsonexamples/generated/7-structurals-miss.json", parser, collector),
    struct7_full       ("jsonexamples/generated/7-structurals-full.json", parser, collector),
    struct15     ("jsonexamples/generated/15-structurals.json", parser, collector),
    struct15_miss("jsonexamples/generated/15-structurals-miss.json", parser, collector),
    struct23     ("jsonexamples/generated/23-structurals.json", parser, collector),
    struct23_miss("jsonexamples/generated/23-structurals-miss.json", parser, collector)
  {

  }

  really_inline void run_iterations(size_t iterations, bool stage1_only=false) {
    struct7.run_iterations(iterations, stage1_only);
    struct7_miss.run_iterations(iterations, stage1_only);
    struct7_full.run_iterations(iterations, stage1_only);
    utf8.run_iterations(iterations, stage1_only);
    utf8_miss.run_iterations(iterations, stage1_only);
    escape.run_iterations(iterations, stage1_only);
    escape_miss.run_iterations(iterations, stage1_only);
    empty.run_iterations(iterations, stage1_only);
    empty_miss.run_iterations(iterations, stage1_only);
    struct15.run_iterations(iterations, stage1_only);
    struct15_miss.run_iterations(iterations, stage1_only);
    struct23.run_iterations(iterations, stage1_only);
    struct23_miss.run_iterations(iterations, stage1_only);
  }

  double cost_per_block(BenchmarkStage stage, const benchmarker& feature, size_t feature_blocks, const benchmarker& base) const {
    return (feature[stage].best.elapsed_ns() - base[stage].best.elapsed_ns()) / feature_blocks;
  }

  // Whether we're recording cache miss and branch miss events
  bool has_events() const {
    return empty.collector.has_events();
  }

  // Base cost of any block (including empty ones)
  double base_cost(BenchmarkStage stage) const {
    return (empty[stage].best.elapsed_ns() / empty.stats->blocks);
  }

  // Extra cost of a 1-7 structural block over an empty block
  double struct1_7_cost(BenchmarkStage stage) const {
    return cost_per_block(stage, struct7, struct7.stats->blocks_with_1_structural, empty);
  }
  // Extra cost of an 1-7-structural miss
  double struct1_7_miss_cost(BenchmarkStage stage) const {
    return cost_per_block(stage, struct7_miss, struct7_miss.stats->blocks_with_1_structural, struct7);
  }
  // Rate of 1-7-structural misses per 8-structural flip
  double struct1_7_miss_rate(BenchmarkStage stage) const {
    if (!has_events()) { return 1; }
    return double(struct7_miss[stage].best.branch_misses() - struct7[stage].best.branch_misses()) / struct7_miss.stats->blocks_with_1_structural_flipped;
  }

  // Extra cost of an 8-15 structural block over a 1-7 structural block
  double struct8_15_cost(BenchmarkStage stage) const {
    return cost_per_block(stage, struct15, struct15.stats->blocks_with_8_structurals, struct7);
  }
  // Extra cost of an 8-15-structural miss over a 1-7 miss
  double struct8_15_miss_cost(BenchmarkStage stage) const {
    return cost_per_block(stage, struct15_miss, struct15_miss.stats->blocks_with_8_structurals_flipped, struct15);
  }
  // Rate of 8-15-structural misses per 8-structural flip
  double struct8_15_miss_rate(BenchmarkStage stage) const {
    if (!has_events()) { return 1; }
    return double(struct15_miss[stage].best.branch_misses() - struct15[stage].best.branch_misses()) / struct15_miss.stats->blocks_with_8_structurals_flipped;
  }

  // Extra cost of a 16+-structural block over an 8-15 structural block (actual varies based on # of structurals!)
  double struct16_cost(BenchmarkStage stage) const {
    return cost_per_block(stage, struct23, struct23.stats->blocks_with_16_structurals, struct15);
  }
  // Extra cost of a 16-structural miss over an 8-15 miss
  double struct16_miss_cost(BenchmarkStage stage) const {
    return cost_per_block(stage, struct23_miss, struct23_miss.stats->blocks_with_16_structurals_flipped, struct23);
  }
  // Rate of 16-structural misses per 16-structural flip
  double struct16_miss_rate(BenchmarkStage stage) const {
    if (!has_events()) { return 1; }
    return double(struct23_miss[stage].best.branch_misses() - struct23[stage].best.branch_misses()) / struct23_miss.stats->blocks_with_16_structurals_flipped;
  }

  // Extra cost of having UTF-8 in a block
  double utf8_cost(BenchmarkStage stage) const {
    return cost_per_block(stage, utf8, utf8.stats->blocks_with_utf8, struct7_full);
  }
  // Extra cost of a UTF-8 miss
  double utf8_miss_cost(BenchmarkStage stage) const {
    return cost_per_block(stage, utf8_miss, utf8_miss.stats->blocks_with_utf8_flipped, utf8);
  }
  // Rate of UTF-8 misses per UTF-8 flip
  double utf8_miss_rate(BenchmarkStage stage) const {
    if (!has_events()) { return 1; }
    return double(utf8_miss[stage].best.branch_misses() - utf8[stage].best.branch_misses()) / utf8_miss.stats->blocks_with_utf8_flipped;
  }

  // Extra cost of having escapes in a block
  double escape_cost(BenchmarkStage stage) const {
    return cost_per_block(stage, escape, escape.stats->blocks_with_escapes, struct7_full);
  }
  // Extra cost of an escape miss
  double escape_miss_cost(BenchmarkStage stage) const {
    return cost_per_block(stage, escape_miss, escape_miss.stats->blocks_with_escapes_flipped, escape);
  }
  // Rate of escape misses per escape flip
  double escape_miss_rate(BenchmarkStage stage) const {
    if (!has_events()) { return 1; }
    return double(escape_miss[stage].best.branch_misses() - escape[stage].best.branch_misses()) / escape_miss.stats->blocks_with_escapes_flipped;
  }

  double calc_expected_feature_cost(BenchmarkStage stage, const benchmarker& file) const {
    // Expected base ns/block (empty)
    json_stats& stats = *file.stats;
    double expected = base_cost(stage)       * stats.blocks;
    expected +=       struct1_7_cost(stage)  * stats.blocks_with_1_structural;
    expected +=       utf8_cost(stage)       * stats.blocks_with_utf8;
    expected +=       escape_cost(stage)     * stats.blocks_with_escapes;
    expected +=       struct8_15_cost(stage) * stats.blocks_with_8_structurals;
    expected +=       struct16_cost(stage)   * stats.blocks_with_16_structurals;
    return expected / stats.blocks;
  }

  double calc_expected_miss_cost(BenchmarkStage stage, const benchmarker& file) const {
    // Expected base ns/block (empty)
    json_stats& stats = *file.stats;
    double expected = struct1_7_miss_cost(stage)  * stats.blocks_with_1_structural_flipped * struct1_7_miss_rate(stage);
    expected +=       utf8_miss_cost(stage)       * stats.blocks_with_utf8_flipped * utf8_miss_rate(stage);
    expected +=       escape_miss_cost(stage)     * stats.blocks_with_escapes_flipped * escape_miss_rate(stage);
    expected +=       struct8_15_miss_cost(stage) * stats.blocks_with_8_structurals_flipped * struct8_15_miss_rate(stage);
    expected +=       struct16_miss_cost(stage)   * stats.blocks_with_16_structurals_flipped * struct16_miss_rate(stage);
    return expected / stats.blocks;
  }

  double calc_expected_misses(BenchmarkStage stage, const benchmarker& file) const {
    json_stats& stats = *file.stats;
    double expected = stats.blocks_with_1_structural_flipped   * struct1_7_miss_rate(stage);
    expected +=       stats.blocks_with_utf8_flipped           * utf8_miss_rate(stage);
    expected +=       stats.blocks_with_escapes_flipped        * escape_miss_rate(stage);
    expected +=       stats.blocks_with_8_structurals_flipped  * struct8_15_miss_rate(stage);
    expected +=       stats.blocks_with_16_structurals_flipped * struct16_miss_rate(stage);
    return expected;
  }

  double calc_expected(BenchmarkStage stage, const benchmarker& file) const {
    return calc_expected_feature_cost(stage, file) + calc_expected_miss_cost(stage, file);
  }

  void print(const option_struct& options) const {
    printf("\n");
    printf("Features in ns/block (64 bytes):\n");
    printf("\n");
    printf("| %-8s ",   "Stage");
    printf("| %8s ",  "Base");
    printf("| %8s ",  "7 Struct");
    printf("| %8s ",  "UTF-8");
    printf("| %8s ",  "Escape");
    printf("| %8s ",  "15 Str.");
    printf("| %8s ",  "16+ Str.");
    printf("| %15s ", "7 Struct Miss");
    printf("| %15s ", "UTF-8 Miss");
    printf("| %15s ", "Escape Miss");
    printf("| %15s ", "15 Str. Miss");
    printf("| %15s ", "16+ Str. Miss");
    printf("|\n");

    printf("|%.10s",  "---------------------------------------");
    printf("|%.10s",  "---------------------------------------");
    printf("|%.10s",  "---------------------------------------");
    printf("|%.10s",  "---------------------------------------");
    printf("|%.10s",  "---------------------------------------");
    printf("|%.10s",  "---------------------------------------");
    printf("|%.10s",  "---------------------------------------");
    printf("|%.17s", "---------------------------------------");
    printf("|%.17s", "---------------------------------------");
    printf("|%.17s", "---------------------------------------");
    printf("|%.17s", "---------------------------------------");
    printf("|%.17s", "---------------------------------------");
    printf("|\n");

    options.each_stage([&](auto stage) {
      printf("| %-8s ",         benchmark_stage_name(stage));
      printf("| %8.3g ",        base_cost(stage));
      printf("| %8.3g ",        struct1_7_cost(stage));
      printf("| %8.3g ",        utf8_cost(stage));
      printf("| %8.3g ",        escape_cost(stage));
      printf("| %8.3g ",        struct8_15_cost(stage));
      printf("| %8.3g ",        struct16_cost(stage));
      if (has_events()) {
        printf("| %8.3g (%3d%%) ", struct1_7_miss_cost(stage), int(struct1_7_miss_rate(stage)*100));
        printf("| %8.3g (%3d%%) ", utf8_miss_cost(stage), int(utf8_miss_rate(stage)*100));
        printf("| %8.3g (%3d%%) ", escape_miss_cost(stage), int(escape_miss_rate(stage)*100));
        printf("| %8.3g (%3d%%) ", struct8_15_miss_cost(stage), int(struct8_15_miss_rate(stage)*100));
        printf("| %8.3g (%3d%%) ", struct16_miss_cost(stage), int(struct16_miss_rate(stage)*100));
      } else {
        printf("|        %8.3g ", struct1_7_miss_cost(stage));
        printf("|        %8.3g ", utf8_miss_cost(stage));
        printf("|        %8.3g ", escape_miss_cost(stage));
        printf("|        %8.3g ", struct8_15_miss_cost(stage));
        printf("|        %8.3g ", struct16_miss_cost(stage));
      }
      printf("|\n");
    });
  }
};

// Prints one row of the "Effectiveness Check" table to stdout for `stage` and one real file.
//
//   stage     stage whose best run is compared
//   filename  label printed in the File column
//   results   benchmarker holding the measurements and json_stats of that file
//   features  feature model used to compute the estimates
//
// The estimate columns are always printed; the actual-miss and adjusted columns only when
// `features` has event counters.
void print_file_effectiveness(BenchmarkStage stage, const char* filename, const benchmarker& results, const feature_benchmarker& features) {
  double actual = results[stage].best.elapsed_ns() / results.stats->blocks;
  double calc = features.calc_expected(stage, results);
  uint64_t actual_misses = results[stage].best.branch_misses();
  // The estimate is built from measured differences and can come out negative (or NaN);
  // converting such a value to an unsigned integer is undefined, so clamp it to 0 first.
  const double expected_misses = features.calc_expected_misses(stage, results);
  uint64_t calc_misses = expected_misses > 0 ? uint64_t(expected_misses) : 0;
  double calc_miss_cost = features.calc_expected_miss_cost(stage, results);
  printf("        | %-8s ", benchmark_stage_name(stage));
  printf("| %-15s ",   filename);
  printf("|    %8.3g ", features.calc_expected_feature_cost(stage, results));
  printf("|    %8.3g ", calc_miss_cost);
  printf("| %8.3g ",  calc);
  printf("| %8.3g ",  actual);
  printf("| %+8.3g ", actual - calc);
  // PRIu64/PRId64 keep the format in step with the fixed-width types on every platform
  // (%lu / %ld are only correct where long is 64 bits).
  printf("| %13" PRIu64 " ",  calc_misses);
  if (features.has_events()) {
    const int64_t miss_diff = int64_t(actual_misses - calc_misses);
    printf("| %13" PRIu64 " ",  actual_misses);
    printf("| %+13" PRId64 " ", miss_diff);
    // Scale the estimated miss cost by the relative error in the miss count. With no estimated
    // misses there is nothing to scale (and the ratio would divide by zero), so no adjustment.
    double miss_adjustment = 0.0;
    if (calc_misses != 0) {
      miss_adjustment = calc_miss_cost * (double(miss_diff) / calc_misses);
    }
    printf("|      %8.3g ", calc_miss_cost + miss_adjustment);
    printf("|      %+8.3g ", actual - (calc + miss_adjustment));
  }
  printf("|\n");
}

// Entry point. Parses the options, benchmarks the generated feature files together with three
// files from jsonexamples/, prints the feature cost table and, in verbose mode, a table
// comparing the estimated against the measured ns/block for those three files.
int main(int argc, char *argv[]) {
  // Read options
  exe_name = argv[0];
  option_struct options(argc, argv);
  if (options.verbose) {
    verbose_stream = &cout;
  }

  // Initialize the event collector. We put this early so if it prints an error message, it's the
  // first thing printed.
  event_collector collector;

  // Set up benchmarkers by reading all files
  feature_benchmarker features(collector);
  benchmarker gsoc_2018("jsonexamples/gsoc-2018.json", collector);
  benchmarker twitter("jsonexamples/twitter.json", collector);
  benchmarker random("jsonexamples/random.json", collector);

  // Run the benchmarks. Each pass runs iteration_step iterations on every file before moving on,
  // until at least options.iterations iterations have been run per file.
  progress_bar progress(options.iterations, 100);
  // Put the if (options.stage1_only) *outside* the loop so that run_iterations will be optimized
  if (options.stage1_only) {
    for (int iteration = 0; iteration < options.iterations; iteration += options.iteration_step) {
      if (!options.verbose) { progress.print(iteration); }
      features.run_iterations(options.iteration_step, true);
      gsoc_2018.run_iterations(options.iteration_step, true);
      twitter.run_iterations(options.iteration_step, true);
      random.run_iterations(options.iteration_step, true);
    }
  } else {
    for (int iteration = 0; iteration < options.iterations; iteration += options.iteration_step) {
      if (!options.verbose) { progress.print(iteration); }
      features.run_iterations(options.iteration_step, false);
      gsoc_2018.run_iterations(options.iteration_step, false);
      twitter.run_iterations(options.iteration_step, false);
      random.run_iterations(options.iteration_step, false);
    }
  }
  if (!options.verbose) { progress.erase(); }

  features.print(options);

  // Gauge effectiveness
  if (options.verbose) {
    printf("\n");
    printf("        Effectiveness Check: Estimated vs. Actual ns/block for real files:\n");
    printf("\n");
    // Header row. "Stage" is left-aligned to line up with the rows printed by
    // print_file_effectiveness(), which use %-8s for that column.
    printf("        | %-8s ", "Stage");
    printf("| %-15s ", "File");
    printf("| %11s ", "Est. (Base)");
    printf("| %11s ", "Est. (Miss)");
    printf("| %8s ",  "Est.");
    printf("| %8s ",  "Actual");
    printf("| %8s ",  "Diff");
    printf("| %13s ", "Est. Misses");
    if (features.has_events()) {
      printf("| %13s ", "Actual Misses");
      printf("| %13s ", "Diff (Misses)");
      printf("| %13s ", "Adjusted Miss");
      printf("| %13s ", "Adjusted Diff");
    }
    printf("|\n");
    // Separator row: the precision in %.Ns truncates the dash string to the column width.
    printf("        |%.10s",  "---------------------------------------");
    printf("|%.17s",  "---------------------------------------");
    printf("|%.13s",  "---------------------------------------");
    printf("|%.13s",  "---------------------------------------");
    printf("|%.10s",  "---------------------------------------");
    printf("|%.10s",  "---------------------------------------");
    printf("|%.10s",  "---------------------------------------");
    printf("|%.15s",  "---------------------------------------");
    if (features.has_events()) {
      printf("|%.15s",  "---------------------------------------");
      printf("|%.15s",  "---------------------------------------");
      printf("|%.15s",  "---------------------------------------");
      printf("|%.15s",  "---------------------------------------");
    }
    printf("|\n");

    // One row per reported stage and file.
    options.each_stage([&](auto stage) {
      print_file_effectiveness(stage, "gsoc-2018.json", gsoc_2018, features);
      print_file_effectiveness(stage, "twitter.json", twitter, features);
      print_file_effectiveness(stage, "random.json", random, features);
    });
  }

  return EXIT_SUCCESS;
}