simdjson/benchmark/parse.cpp

270 lines
7.8 KiB
C++
Raw Normal View History

2019-02-24 00:28:20 +08:00
#include <cassert>
#include <cctype>
#ifndef _MSC_VER
2019-02-24 00:28:20 +08:00
#include <dirent.h>
#include <unistd.h>
#endif
2019-02-24 00:28:20 +08:00
#include <cinttypes>
#include <cstdio>
#include <cstdlib>
#include <cstring>
2018-11-30 22:37:57 +08:00
#include <algorithm>
2018-08-21 05:27:25 +08:00
#include <chrono>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <set>
#include <sstream>
2018-03-23 12:05:32 +08:00
#include <string>
#include <vector>
2018-11-30 22:37:57 +08:00
#include "linux-perf-events.h"
2018-12-19 11:18:23 +08:00
#ifdef __linux__
#include <libgen.h>
#endif
2018-05-31 10:46:28 +08:00
//#define DEBUG
2018-11-30 22:37:57 +08:00
#include "simdjson/common_defs.h"
#include "simdjson/jsonioutil.h"
2018-12-19 11:48:24 +08:00
#include "simdjson/jsonparser.h"
2018-11-30 22:37:57 +08:00
#include "simdjson/parsedjson.h"
#include "simdjson/stage1_find_marks.h"
2019-01-01 06:13:32 +08:00
#include "simdjson/stage2_build_tape.h"
2018-03-23 12:05:32 +08:00
2018-08-21 05:27:25 +08:00
int main(int argc, char *argv[]) {
2018-11-10 10:31:14 +08:00
bool verbose = false;
2018-11-24 11:20:57 +08:00
bool dump = false;
2018-12-07 06:40:32 +08:00
bool jsonoutput = false;
2018-11-28 03:37:59 +08:00
bool forceoneiteration = false;
2018-12-19 11:18:23 +08:00
bool justdata = false;
#ifndef _MSC_VER
2018-11-10 10:31:14 +08:00
int c;
2019-02-24 00:28:20 +08:00
while ((c = getopt(argc, argv, "1vdt")) != -1) {
2018-12-19 11:48:24 +08:00
switch (c) {
case 't':
justdata = true;
break;
case 'v':
verbose = true;
break;
case 'd':
dump = true;
break;
case 'j':
jsonoutput = true;
break;
case '1':
forceoneiteration = true;
break;
default:
abort();
}
2019-02-24 00:28:20 +08:00
}
#else
int optind = 1;
#endif
2018-11-10 10:31:14 +08:00
if (optind >= argc) {
std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
2018-08-21 05:27:25 +08:00
exit(1);
}
2018-12-19 11:48:24 +08:00
const char *filename = argv[optind];
if (optind + 1 < argc) {
std::cerr << "warning: ignoring everything after " << argv[optind + 1] << std::endl;
2018-08-21 05:27:25 +08:00
}
2019-02-24 00:28:20 +08:00
if (verbose) {
std::cout << "[verbose] loading " << filename << std::endl;
}
padded_string p;
2018-11-28 03:37:59 +08:00
try {
get_corpus(filename).swap(p);
2018-12-19 11:48:24 +08:00
} catch (const std::exception &e) { // caught by reference to base
2018-11-28 03:37:59 +08:00
std::cout << "Could not load the file " << filename << std::endl;
return EXIT_FAILURE;
}
2019-02-24 00:28:20 +08:00
if (verbose) {
std::cout << "[verbose] loaded " << filename << " (" << p.size() << " bytes)"
<< std::endl;
2019-02-24 00:28:20 +08:00
}
2018-07-14 10:22:30 +08:00
#if defined(DEBUG)
const uint32_t iterations = 1;
2018-03-23 12:05:32 +08:00
#else
const uint32_t iterations =
2018-12-19 11:48:24 +08:00
forceoneiteration ? 1 : (p.size() < 1 * 1000 * 1000 ? 1000 : 10);
2018-03-23 12:05:32 +08:00
#endif
std::vector<double> res;
2018-08-21 05:27:25 +08:00
res.resize(iterations);
#if !defined(__linux__)
#define SQUASH_COUNTERS
2018-12-19 11:48:24 +08:00
if (justdata) {
2018-12-19 11:18:23 +08:00
printf("justdata (-t) flag only works under linux.\n");
}
#endif
#ifndef SQUASH_COUNTERS
std::vector<int> evts;
2018-08-21 05:27:25 +08:00
evts.push_back(PERF_COUNT_HW_CPU_CYCLES);
evts.push_back(PERF_COUNT_HW_INSTRUCTIONS);
2018-10-04 21:47:34 +08:00
evts.push_back(PERF_COUNT_HW_BRANCH_MISSES);
2018-11-28 23:53:57 +08:00
evts.push_back(PERF_COUNT_HW_CACHE_REFERENCES);
evts.push_back(PERF_COUNT_HW_CACHE_MISSES);
2018-08-21 05:27:25 +08:00
LinuxEvents<PERF_TYPE_HARDWARE> unified(evts);
std::vector<unsigned long long> results;
2018-08-21 05:27:25 +08:00
results.resize(evts.size());
2019-01-01 06:39:06 +08:00
unsigned long cy0 = 0, cy1 = 0, cy2 = 0;
unsigned long cl0 = 0, cl1 = 0, cl2 = 0;
unsigned long mis0 = 0, mis1 = 0, mis2 = 0;
unsigned long cref0 = 0, cref1 = 0, cref2 = 0;
unsigned long cmis0 = 0, cmis1 = 0, cmis2 = 0;
#endif
2018-08-21 05:27:25 +08:00
bool isok = true;
2018-11-10 10:31:14 +08:00
for (uint32_t i = 0; i < iterations; i++) {
2019-02-24 00:28:20 +08:00
if (verbose) {
std::cout << "[verbose] iteration # " << i << std::endl;
}
#ifndef SQUASH_COUNTERS
unified.start();
#endif
ParsedJson pj;
bool allocok = pj.allocateCapacity(p.size());
2018-12-19 11:48:24 +08:00
if (!allocok) {
std::cerr << "failed to allocate memory" << std::endl;
return EXIT_FAILURE;
}
#ifndef SQUASH_COUNTERS
unified.end(results);
cy0 += results[0];
cl0 += results[1];
mis0 += results[2];
cref0 += results[3];
cmis0 += results[4];
#endif
2019-02-24 00:28:20 +08:00
if (verbose) {
std::cout << "[verbose] allocated memory for parsed JSON " << std::endl;
2019-02-24 00:28:20 +08:00
}
2018-08-21 05:27:25 +08:00
auto start = std::chrono::steady_clock::now();
#ifndef SQUASH_COUNTERS
2018-08-21 05:27:25 +08:00
unified.start();
#endif
2018-12-01 10:31:05 +08:00
isok = find_structural_bits(p.data(), p.size(), pj);
#ifndef SQUASH_COUNTERS
2018-08-21 05:27:25 +08:00
unified.end(results);
cy1 += results[0];
cl1 += results[1];
2018-10-04 21:47:34 +08:00
mis1 += results[2];
2018-11-28 23:53:57 +08:00
cref1 += results[3];
cmis1 += results[4];
if (!isok) {
std::cout << "Failed during stage 1" << std::endl;
2018-08-21 05:27:25 +08:00
break;
}
2018-08-21 05:27:25 +08:00
unified.start();
#endif
isok = isok && (simdjson::SUCCESS == unified_machine(p.data(), p.size(), pj));
#ifndef SQUASH_COUNTERS
unified.end(results);
2019-01-01 06:39:06 +08:00
cy2 += results[0];
cl2 += results[1];
mis2 += results[2];
cref2 += results[3];
cmis2 += results[4];
if (!isok) {
std::cout << "Failed during stage 2" << std::endl;
2018-08-21 05:27:25 +08:00
break;
}
#endif
2018-08-21 05:27:25 +08:00
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> secs = end - start;
res[i] = secs.count();
}
ParsedJson pj = build_parsed_json(p); // do the parsing again to get the stats
2018-12-19 11:48:24 +08:00
if (!pj.isValid()) {
2018-12-11 03:25:49 +08:00
std::cerr << "Could not parse. " << std::endl;
return EXIT_FAILURE;
}
#ifndef SQUASH_COUNTERS
2019-01-01 06:39:06 +08:00
unsigned long total = cy0 + cy1 + cy2;
2018-12-19 11:48:24 +08:00
if (justdata) {
float cpb0 = (double)cy0 / (iterations * p.size());
float cpb1 = (double)cy1 / (iterations * p.size());
float cpb2 = (double)cy2 / (iterations * p.size());
2018-12-19 11:18:23 +08:00
float cpbtotal = (double)total / (iterations * p.size());
2018-12-19 11:48:24 +08:00
char *newfile = (char *)malloc(strlen(filename) + 1);
if (newfile == NULL) {
2018-12-19 11:48:24 +08:00
return EXIT_FAILURE;
}
2018-12-19 11:48:24 +08:00
::strcpy(newfile, filename);
char *snewfile = ::basename(newfile);
size_t nl = strlen(snewfile);
for (size_t j = nl - 1; j > 0; j--) {
if (snewfile[j] == '.') {
snewfile[j] = '\0';
break;
}
}
2019-01-01 06:39:06 +08:00
printf("\"%s\"\t%f\t%f\t%f\t%f\n", snewfile, cpb0, cpb1, cpb2,
2018-12-19 11:48:24 +08:00
cpbtotal);
free(newfile);
2018-12-19 11:18:23 +08:00
} else {
2018-12-19 11:48:24 +08:00
printf("number of bytes %ld number of structural chars %u ratio %.3f\n",
p.size(), pj.n_structural_indexes,
(double)pj.n_structural_indexes / p.size());
printf("mem alloc instructions: %10lu cycles: %10lu (%.2f %%) ins/cycles: "
"%.2f mis. branches: %10lu (cycles/mis.branch %.2f) cache accesses: "
"%10lu (failure %10lu)\n",
cl0 / iterations, cy0 / iterations, 100. * cy0 / total,
(double)cl0 / cy0, mis0 / iterations, (double)cy0 / mis0,
cref1 / iterations, cmis0 / iterations);
printf(" mem alloc runs at %.2f cycles per input byte.\n",
(double)cy0 / (iterations * p.size()));
printf("stage 1 instructions: %10lu cycles: %10lu (%.2f %%) ins/cycles: "
"%.2f mis. branches: %10lu (cycles/mis.branch %.2f) cache accesses: "
"%10lu (failure %10lu)\n",
cl1 / iterations, cy1 / iterations, 100. * cy1 / total,
(double)cl1 / cy1, mis1 / iterations, (double)cy1 / mis1,
cref1 / iterations, cmis1 / iterations);
printf(" stage 1 runs at %.2f cycles per input byte.\n",
(double)cy1 / (iterations * p.size()));
2018-08-21 05:27:25 +08:00
2018-12-19 11:48:24 +08:00
printf("stage 2 instructions: %10lu cycles: %10lu (%.2f %%) ins/cycles: "
"%.2f mis. branches: %10lu (cycles/mis.branch %.2f) cache "
"accesses: %10lu (failure %10lu)\n",
2019-01-01 06:39:06 +08:00
cl2 / iterations, cy2 / iterations, 100. * cy2 / total,
(double)cl2 / cy2, mis2 / iterations, (double)cy2 / mis2,
cref2 / iterations, cmis2 / iterations);
2019-01-01 06:13:32 +08:00
printf(" stage 2 runs at %.2f cycles per input byte and ",
2019-01-01 06:39:06 +08:00
(double)cy2 / (iterations * p.size()));
2018-12-19 11:48:24 +08:00
printf("%.2f cycles per structural character.\n",
2019-01-01 06:39:06 +08:00
(double)cy2 / (iterations * pj.n_structural_indexes));
2018-08-21 05:27:25 +08:00
2018-12-19 11:48:24 +08:00
printf(" all stages: %.2f cycles per input byte.\n",
(double)total / (iterations * p.size()));
2018-12-19 11:18:23 +08:00
}
#endif
2018-08-21 05:27:25 +08:00
double min_result = *min_element(res.begin(), res.end());
2019-02-24 00:28:20 +08:00
if (!justdata) {
std::cout << "Min: " << min_result << " bytes read: " << p.size()
2018-12-19 11:48:24 +08:00
<< " Gigabytes/second: " << (p.size()) / (min_result * 1000000000.0)
<< std::endl;
2019-02-24 00:28:20 +08:00
}
2018-12-19 11:48:24 +08:00
if (jsonoutput) {
2018-12-11 04:16:31 +08:00
isok = isok && pj.printjson(std::cout);
2018-12-07 06:40:32 +08:00
}
2018-12-19 11:48:24 +08:00
if (dump) {
2018-12-11 04:16:31 +08:00
isok = isok && pj.dump_raw_tape(std::cout);
2018-12-07 06:40:32 +08:00
}
2018-08-21 05:27:25 +08:00
if (!isok) {
fprintf(stderr, " Parsing failed. \n ");
2018-08-21 05:27:25 +08:00
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
2018-03-23 12:05:32 +08:00
}