New benchmark

This commit is contained in:
Daniel Lemire 2018-04-06 17:14:04 -04:00
parent b55e8c01a3
commit 980f69dc67
8 changed files with 395 additions and 28 deletions

6
scalarvssimd/Makefile Normal file
View File

@ -0,0 +1,6 @@
# Builds the scalar-vs-SIMD JSON benchmark from bench.cpp.
# Every header is listed as a prerequisite so editing one triggers a rebuild.
HEADERS:=avxprocessing.h benchmark.h common_defs.h jsonstruct.h scalarprocessing.h util.h

# 'clean' names an action, not a file: without .PHONY a stray file called
# 'clean' in this directory would make the target look up to date.
.PHONY: clean

bench: bench.cpp $(HEADERS)
	$(CXX) -O3 -o $@ bench.cpp -march=native -lm -Wall -Wextra

clean:
	rm -f bench

View File

@ -18,7 +18,7 @@ using namespace std;
// a straightforward comparison of a mask against input. 5 uops; would be cheaper in AVX512.
really_inline u64 cmp_mask_against_input(m256 input_lo, m256 input_hi, m256 mask) {
static inline u64 cmp_mask_against_input(m256 input_lo, m256 input_hi, m256 mask) {
m256 cmp_res_0 = _mm256_cmpeq_epi8(input_lo, mask);
u64 res_0 = (u32)_mm256_movemask_epi8(cmp_res_0);
m256 cmp_res_1 = _mm256_cmpeq_epi8(input_hi, mask);
@ -26,7 +26,7 @@ really_inline u64 cmp_mask_against_input(m256 input_lo, m256 input_hi, m256 mask
return res_0 | (res_1 << 32);
}
never_inline bool find_structural_bits(const u8 * buf, size_t len, ParsedJson & pj) {
static bool find_structural_bits(const u8 * buf, size_t len, ParsedJson & pj) {
// Useful constant masks
const u64 even_bits = 0x5555555555555555ULL;
const u64 odd_bits = ~even_bits;
@ -81,12 +81,10 @@ never_inline bool find_structural_bits(const u8 * buf, size_t len, ParsedJson &
u64 quote_bits = cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('"'));
quote_bits = quote_bits & ~odd_ends;
dumpbits(quote_bits, "quote_bits");
u64 quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(_mm_set_epi64x(0ULL, quote_bits),
_mm_set1_epi8(0xFF), 0));
quote_mask ^= prev_iter_inside_quote;
prev_iter_inside_quote = (u64)((s64)quote_mask>>63);
dumpbits(quote_mask, "quote_mask");
// How do we build up a user traversable data structure
// first, do a 'shufti' to detect structural JSON characters
@ -184,17 +182,31 @@ const u32 ROOT_NODE = 1;
// just transform the bitmask to a big list of 32-bit integers for now
// that's all; the type of character the offset points to will
// tell us exactly what we need to know. Naive but straightforward implementation
never_inline bool flatten_indexes(size_t len, ParsedJson & pj) {
static bool flatten_indexes(size_t len, ParsedJson & pj) {
u32 base = NUM_RESERVED_NODES;
u32 * base_ptr = pj.structural_indexes;
base_ptr[DUMMY_NODE] = base_ptr[ROOT_NODE] = 0; // really shouldn't matter
for (size_t idx = 0; idx < len; idx+=64) {
u64 s = *(u64 *)(pj.structurals + idx/8);
u32 cnt = __builtin_popcountll(s);
u32 next_base = base + cnt;
while (s) {
u32 si = (u32)idx + __builtin_ctzll(s);
base_ptr[base++] = si;
s &= s - 1ULL;
// spoil the suspense
u64 s3 = _pdep_u64(~0x7ULL, s); // s3 will have bottom 3 1-bits unset
u64 s5 = _pdep_u64(~0x1fULL, s); // s5 will have bottom 5 1-bits unset
base_ptr[base+0] = (u32)idx + __builtin_ctzll(s); u64 s1 = s & (s - 1ULL);
base_ptr[base+1] = (u32)idx + __builtin_ctzll(s1); u64 s2 = s1 & (s1 - 1ULL);
base_ptr[base+2] = (u32)idx + __builtin_ctzll(s2); //u64 s3 = s2 & (s2 - 1ULL);
base_ptr[base+3] = (u32)idx + __builtin_ctzll(s3); u64 s4 = s3 & (s3 - 1ULL);
base_ptr[base+4] = (u32)idx + __builtin_ctzll(s4); //u64 s5 = s4 & (s4 - 1ULL);
base_ptr[base+5] = (u32)idx + __builtin_ctzll(s5); u64 s6 = s5 & (s5 - 1ULL);
base_ptr[base+6] = (u32)idx + __builtin_ctzll(s6); u64 s7 = s6 & (s6 - 1ULL);
s = s7;
base += 7;
}
base = next_base;
}
pj.n_structural_indexes = base;
return true;
@ -202,7 +214,7 @@ never_inline bool flatten_indexes(size_t len, ParsedJson & pj) {
// Parse our json given a big array of 32-bit integers telling us where
// the interesting stuff is
bool avx_json_parse(const u8 * buf, UNUSED size_t len, ParsedJson & pj) {
static bool json_parse(const u8 * buf, UNUSED size_t len, ParsedJson & pj) {
u32 last; // index of previous structure at this level or 0 if none
u32 up; // index of structure that contains this one
@ -240,16 +252,13 @@ bool avx_json_parse(const u8 * buf, UNUSED size_t len, ParsedJson & pj) {
nodes[n.prev].next = i;
}
dummy.next = DUMMY_NODE; // dummy.next is a sump for meaningless 'nexts', clear it
#ifdef DEBUG
for (u32 i = 0; i < pj.n_structural_indexes; i++) {
u32 idx = pj.structural_indexes[i];
JsonNode & n = nodes[i];
cout << "i: " << i;
cout << " n.up: " << n.up;
cout << " n.next: " << n.next;
cout << " n.prev: " << n.prev;
cout << " idx: " << idx << " buf[idx] " << buf[idx] << "\n";
}
#endif
return true;
}
// Runs the three SIMD stages back to back on `buf`:
//   1. find_structural_bits  - fills the structural bitmask in `pj`
//   2. flatten_indexes       - turns the bitmask into a list of offsets
//   3. json_parse            - links the offsets into the node structure
// Returns true only if every stage reports success. A failing stage stops
// the pipeline so later stages never run on incomplete data.
static bool avx_json_parse(const u8 * buf, size_t len, ParsedJson & pj) {
    return find_structural_bits(buf, len, pj)
        && flatten_indexes(len, pj)
        && json_parse(buf, len, pj);
}

54
scalarvssimd/bench.cpp Normal file
View File

@ -0,0 +1,54 @@
#include "jsonstruct.h"
#include "scalarprocessing.h"
#include "avxprocessing.h"
#include "benchmark.h"
#include "util.h"
//colorfuldisplay(ParsedJson & pj, const u8 * buf)
//BEST_TIME_NOCHECK(dividearray32(array, N), , repeat, N, timings,true);
// Benchmark driver.
//   bench <jsonfile>       time the SIMD and scalar parsers
//   bench -v <jsonfile>    additionally run each parser once and dump its output
// The JSON file is always taken from the last command-line argument.
int main(int argc, char * argv[]) {
    if (argc < 2) {
        cerr << "Usage: " << argv[0] << " <jsonfile>\n";
        cerr << "Or " << argv[0] << " -v <jsonfile>\n";
        exit(1);
    }
    // strcmp returns 0 when the strings match, so the result must be
    // compared against 0 to detect the "-v" flag.
    bool verbose = false;
    if (argc > 2) {
        if (strcmp(argv[1], "-v") == 0) verbose = true;
    }
    pair<u8 *, size_t> p = get_corpus(argv[argc - 1]);
    ParsedJson pj;
    std::cout << "Input has "<< p.second << " bytes."<<std::endl;
    // one bit of structural mask per input byte, rounded up to whole 64-bit words
    if (posix_memalign( (void **)&pj.structurals, 8, ROUNDUP_N(p.second, 64)/8)) {
        cerr << "Allocation failed\n";
        exit(1);
    }
    pj.n_structural_indexes = 0;
    // we have potentially 1 structure per byte of input
    // as well as a dummy structure and a root structure
    // we also potentially write up to 7 iterations beyond
    // in our 'cheesy flatten', so make some worst-case
    // space for that too
    u32 max_structures = ROUNDUP_N(p.second, 64) + 2 + 7;
    pj.structural_indexes = new u32[max_structures];
    pj.nodes = new JsonNode[max_structures];
    if(verbose) {
        std::cout << "Parsing SIMD (once) " << std::endl;
        avx_json_parse(p.first, p.second, pj);
        colorfuldisplay(pj, p.first);
        debugdisplay(pj,p.first);
        std::cout << "Parsing scalar (once) " << std::endl;
        scalar_json_parse(p.first, p.second, pj);
        colorfuldisplay(pj, p.first);
        debugdisplay(pj,p.first);
    }
    int repeat = 5;
    // keep the byte count in its native width; the timing macro widens it to 64 bits
    size_t volume = p.second;
    BEST_TIME_NOCHECK(avx_json_parse(p.first, p.second, pj), , repeat, volume, true);
    BEST_TIME_NOCHECK(scalar_json_parse(p.first, p.second, pj), , repeat, volume, true);
    // release the buffers allocated above (the corpus buffer's ownership is
    // decided by get_corpus and is left alone here)
    delete[] pj.nodes;
    delete[] pj.structural_indexes;
    free(pj.structurals);
    return 0;
}

196
scalarvssimd/benchmark.h Normal file
View File

@ -0,0 +1,196 @@
#ifndef _BENCHMARK_H_
#define _BENCHMARK_H_
#include <stdint.h>
#include <time.h>
// Timestamp helpers. On x86-64 they read the time-stamp counter with inline
// assembly; everywhere else they fall back to clock() from <time.h>.
// `unitname` is the label the reporting macros print next to the numbers.
#ifdef __x86_64__
const char *unitname = "cycles";
// RDTSC_START(cycles): executes cpuid followed by rdtsc, copies edx/eax into
// two 32-bit temporaries and stores the combined 64-bit value in `cycles`.
// NOTE(review): cpuid is presumably there as a serializing fence so earlier
// instructions cannot drift into the timed region - confirm against the
// Intel benchmarking guidance.
#define RDTSC_START(cycles) \
do { \
uint32_t cyc_high, cyc_low; \
__asm volatile("cpuid\n" \
"rdtsc\n" \
"mov %%edx, %0\n" \
"mov %%eax, %1" \
: "=r"(cyc_high), "=r"(cyc_low) \
: \
: /* no read only */ \
"%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \
); \
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
} while (0)
// RDTSC_STOP(cycles): executes rdtscp, copies edx/eax out, then executes
// cpuid, and stores the combined 64-bit value in `cycles`. The instruction
// order is the mirror image of RDTSC_START.
#define RDTSC_STOP(cycles) \
do { \
uint32_t cyc_high, cyc_low; \
__asm volatile("rdtscp\n" \
"mov %%edx, %0\n" \
"mov %%eax, %1\n" \
"cpuid" \
: "=r"(cyc_high), "=r"(cyc_low) \
: /* no read only registers */ \
: "%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \
); \
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
} while (0)
#else
// Portable fallback: both macros simply store the result of clock(), so the
// reported numbers are in clock() units rather than cycles.
const char *unitname = " (clock units) ";
#define RDTSC_START(cycles) \
do { \
cycles = clock(); \
} while (0)
#define RDTSC_STOP(cycles) \
do { \
cycles = clock(); \
} while (0)
#endif
// Identity function that the compiler is told not to inline. The BEST_TIME*
// macros time a call to it (via RDTSC_SET_OVERHEAD) to obtain the baseline
// that they subtract from every measurement.
static __attribute__((noinline)) uint64_t rdtsc_overhead_func(uint64_t dummy) {
    const uint64_t echoed = dummy;
    return echoed;
}
// Measured cost of an empty RDTSC_START/RDTSC_STOP pair. UINT64_MAX means
// "not measured yet"; the BEST_TIME* macros test for that value and call
// RDTSC_SET_OVERHEAD on first use.
uint64_t global_rdtsc_overhead = (uint64_t)UINT64_MAX;
// RDTSC_SET_OVERHEAD(test, repeat): times `test` `repeat` times and stores
// the smallest observed start-to-stop difference in global_rdtsc_overhead.
// The empty asm with a "memory" clobber before each run stops the compiler
// from moving memory accesses across the start of the timed region.
#define RDTSC_SET_OVERHEAD(test, repeat) \
do { \
uint64_t cycles_start, cycles_final, cycles_diff; \
uint64_t min_diff = UINT64_MAX; \
for (int i = 0; i < repeat; i++) { \
__asm volatile("" ::: /* pretend to clobber */ "memory"); \
RDTSC_START(cycles_start); \
test; \
RDTSC_STOP(cycles_final); \
cycles_diff = (cycles_final - cycles_start); \
if (cycles_diff < min_diff) \
min_diff = cycles_diff; \
} \
global_rdtsc_overhead = min_diff; \
} while (0)
/*
 * Times `test` and prints the best and the average cost per operation.
 *
 *   test     expression to time; its value is compared against `expected`
 *   expected value `test` must produce; on a mismatch a message is printed
 *            and the timing loop stops
 *   pre      statement executed before every timed run (may be empty)
 *   repeat   number of timed runs
 *   size     number of operations one run of `test` represents
 *   verbose  non-zero: print the stringified test and labelled figures;
 *            zero: print only the best figure
 *
 * The timer overhead is measured once, on first use, and subtracted from
 * every run. Note that `test` and `expected` are evaluated again when the
 * mismatch message is printed.
 */
#define BEST_TIME(test, expected, pre, repeat, size, verbose) \
  do { \
    if (global_rdtsc_overhead == UINT64_MAX) { \
      RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), repeat); \
    } \
    if (verbose) \
      printf("%-60s\t: ", #test); \
    fflush(NULL); \
    uint64_t cycles_start, cycles_final, cycles_diff; \
    uint64_t min_diff = (uint64_t)-1; \
    uint64_t sum_diff = 0; \
    for (int i = 0; i < repeat; i++) { \
      pre; \
      __asm volatile("" ::: /* pretend to clobber */ "memory"); \
      RDTSC_START(cycles_start); \
      /* arguments are parenthesized so compound expressions compare as a whole */ \
      if ((test) != (expected)) { \
        printf("not expected (%d , %d )", (int)(test), (int)(expected)); \
        break; \
      } \
      RDTSC_STOP(cycles_final); \
      cycles_diff = (cycles_final - cycles_start - global_rdtsc_overhead); \
      if (cycles_diff < min_diff) \
        min_diff = cycles_diff; \
      sum_diff += cycles_diff; \
    } \
    uint64_t S = size; \
    float cycle_per_op = (min_diff) / (double)S; \
    /* previously missing: the average was printed without being computed */ \
    float avg_cycle_per_op = (sum_diff) / ((double)S * repeat); \
    if (verbose) \
      printf(" %.3f %s per operation (best) ", cycle_per_op, unitname); \
    if (verbose) \
      printf("\t%.3f %s per operation (avg) ", avg_cycle_per_op, unitname); \
    if (verbose) \
      printf("\n"); \
    if (!verbose) \
      printf(" %.3f ", cycle_per_op); \
    fflush(NULL); \
  } while (0)
// like BEST_TIME, but no check: the value of `test` is discarded.
//   test     statement/expression to time
//   pre      statement executed before every timed run (may be empty)
//   repeat   number of timed runs
//   size     number of operations one run of `test` represents
//   verbose  non-zero: print the stringified test plus labelled best and
//            average figures; zero: print only the best figure
// The timer overhead is measured on first use and subtracted from each run.
// All arithmetic is on uint64_t, so a run that measures shorter than the
// recorded overhead wraps around rather than going negative.
#define BEST_TIME_NOCHECK(test, pre, repeat, size, verbose) \
do { \
if (global_rdtsc_overhead == UINT64_MAX) { \
RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), repeat); \
} \
if (verbose) \
printf("%-40s\t: ", #test); \
fflush(NULL); \
uint64_t cycles_start, cycles_final, cycles_diff; \
uint64_t min_diff = (uint64_t)-1; \
uint64_t sum_diff = 0; \
for (int i = 0; i < repeat; i++) { \
pre; \
__asm volatile("" ::: /* pretend to clobber */ "memory"); \
RDTSC_START(cycles_start); \
test; \
RDTSC_STOP(cycles_final); \
cycles_diff = (cycles_final - cycles_start - global_rdtsc_overhead); \
if (cycles_diff < min_diff) \
min_diff = cycles_diff; \
sum_diff += cycles_diff; \
} \
uint64_t S = size; \
float cycle_per_op = (min_diff) / (double)S; \
float avg_cycle_per_op = (sum_diff) / ((double)S * repeat); \
if (verbose) \
printf(" %.3f %s per operation (best) ", cycle_per_op, unitname); \
if (verbose) \
printf(" %.3f %s per operation (avg) ", avg_cycle_per_op, unitname); \
if (verbose) \
printf("\n"); \
if (!verbose) \
printf(" %.3f ", cycle_per_op); \
fflush(NULL); \
} while (0)
// like BEST_TIME except that we run a function to check the result
//   test     statement/expression to time
//   check    expression evaluated after each timed run; a false value prints
//            "error" and stops the timing loop
//   pre      statement executed before every timed run (may be empty)
//   repeat   number of timed runs
//   size     number of operations one run of `test` represents
//   verbose  non-zero: print the stringified test plus labelled best and
//            average figures; zero: print only the best figure
// `check` runs outside the timed region, so its cost is not measured.
#define BEST_TIME_CHECK(test, check, pre, repeat, size, verbose) \
  do { \
    if (global_rdtsc_overhead == UINT64_MAX) { \
      RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), repeat); \
    } \
    if (verbose) \
      printf("%-60s\t: ", #test); \
    fflush(NULL); \
    uint64_t cycles_start, cycles_final, cycles_diff; \
    uint64_t min_diff = (uint64_t)-1; \
    uint64_t sum_diff = 0; \
    for (int i = 0; i < repeat; i++) { \
      pre; \
      __asm volatile("" ::: /* pretend to clobber */ "memory"); \
      RDTSC_START(cycles_start); \
      test; \
      RDTSC_STOP(cycles_final); \
      /* parenthesized so a compound check such as a == b is negated as a whole */ \
      if (!(check)) { \
        printf("error"); \
        break; \
      } \
      cycles_diff = (cycles_final - cycles_start - global_rdtsc_overhead); \
      if (cycles_diff < min_diff) \
        min_diff = cycles_diff; \
      sum_diff += cycles_diff; \
    } \
    uint64_t S = size; \
    float cycle_per_op = (min_diff) / (double)S; \
    float avg_cycle_per_op = (sum_diff) / ((double)S * repeat); \
    /* unitname keeps the label right on the clock() fallback; it is "cycles" on x86-64 */ \
    if (verbose) \
      printf(" %.3f %s per operation (best) ", cycle_per_op, unitname); \
    if (verbose) \
      printf("\t%.3f %s per operation (avg) ", avg_cycle_per_op, unitname); \
    if (verbose) \
      printf("\n"); \
    if (!verbose) \
      printf(" %.3f ", cycle_per_op); \
    fflush(NULL); \
  } while (0)
#endif

View File

@ -1,5 +1,5 @@
#pragma once
#include <cassert>
typedef unsigned char u8;
typedef unsigned short u16;
typedef unsigned int u32;

View File

View File

@ -1,5 +1,6 @@
#pragma once
#include "common_defs.h"
struct JsonNode {
u32 up;
@ -13,3 +14,69 @@ struct ParsedJson {
u32 * structural_indexes;
JsonNode * nodes;
};
#include <algorithm>
#include <iostream>
#include <iterator>
// https://stackoverflow.com/questions/2616906/how-do-i-output-coloured-text-to-a-linux-terminal
// ANSI terminal colour helpers. Streaming a Modifier writes the escape
// sequence ESC '[' <code> 'm' for the wrapped colour code.
namespace Color {

// Numeric colour codes as they appear inside the escape sequence.
enum Code {
    // foreground colours
    FG_BLACK = 30,
    FG_RED = 31,
    FG_GREEN = 32,
    FG_YELLOW = 33,
    FG_BLUE = 34,
    FG_MAGENTA = 35,
    FG_CYAN = 36,
    FG_LIGHT_GRAY = 37,
    FG_DEFAULT = 39,
    // background colours
    BG_RED = 41,
    BG_GREEN = 42,
    BG_BLUE = 44,
    BG_DEFAULT = 49,
    // bright foreground colours
    FG_DARK_GRAY = 90,
    FG_LIGHT_RED = 91,
    FG_LIGHT_GREEN = 92,
    FG_LIGHT_YELLOW = 93,
    FG_LIGHT_BLUE = 94,
    FG_LIGHT_MAGENTA = 95,
    FG_LIGHT_CYAN = 96,
    FG_WHITE = 97
};

// Wraps one colour code so it can be inserted into an output stream.
class Modifier {
public:
    Modifier(Code selected) : code(selected) {}

    // Emits the escape sequence for `mod` and returns the stream for chaining.
    friend std::ostream& operator<<(std::ostream& os, const Modifier& mod) {
        os << "\033[";
        os << static_cast<int>(mod.code);
        os << "m";
        return os;
    }

private:
    Code code;
};

}
// Prints the input to std::cout with every structural character coloured:
// opening and closing brackets/braces in green, every other structural
// character in yellow. The bytes between two consecutive structural offsets
// are printed uncoloured; nothing after the last structural offset is printed.
void colorfuldisplay(ParsedJson & pj, const u8 * buf) {
    const Color::Modifier bracket_color(Color::FG_GREEN);
    const Color::Modifier other_color(Color::FG_YELLOW);
    const Color::Modifier reset_color(Color::FG_DEFAULT);
    const size_t count = pj.n_structural_indexes;

    // skip leading entries that repeat the same offset
    size_t first = 0;
    while ((first + 1 < count) &&
           (pj.structural_indexes[first] == pj.structural_indexes[first + 1])) {
        ++first;
    }

    for (size_t k = first; k < count; ++k) {
        const u32 here = pj.structural_indexes[k];
        // clearing bit 0x20 folds '{' onto '[' (0x5b) and '}' onto ']' (0x5d)
        const int folded = buf[here] & 0xdf;
        const bool is_bracket = (folded == 0x5b) || (folded == 0x5d);
        std::cout << (is_bracket ? bracket_color : other_color) << buf[here] << reset_color;

        if (k + 1 >= count) {
            continue; // last structural character: no following gap to print
        }
        const u32 stop = pj.structural_indexes[k + 1];
        for (u32 pos = here + 1; pos < stop; ++pos) {
            std::cout << buf[pos];
        }
    }
    std::cout << std::endl;
}
// Dumps one line per structural index to std::cout: the index, the node's
// up/next/prev links, the byte offset into `buf`, and the byte found there.
void debugdisplay(ParsedJson & pj, const u8 * buf) {
    u32 i = 0;
    while (i < pj.n_structural_indexes) {
        const u32 offset = pj.structural_indexes[i];
        const JsonNode & node = pj.nodes[i];
        std::cout << "i: " << i
                  << " n.up: " << node.up
                  << " n.next: " << node.next
                  << " n.prev: " << node.prev
                  << " idx: " << offset << " buf[idx] " << buf[offset] << std::endl;
        ++i;
    }
}

View File

@ -1,44 +1,79 @@
#include "common_defs.h"
#include "jsonstruct.h"
// Returns true when `c` is a character allowed directly after a backslash:
// one of  "  \  /  b  f  n  r  t  u .
bool is_valid_escape(char c) {
    switch (c) {
        case '"':
        case '\\':
        case '/':
        case 'b':
        case 'f':
        case 'n':
        case 'r':
        case 't':
        case 'u':
            return true;
        default:
            return false;
    }
}
bool scalar_json_parse(const u8 * buf, size_t len, ParsedJson & pj) {
// this is a naive attempt at this point
// it will probably be subject to failures given adversarial inputs
size_t pos = 0;
size_t last = 0;
size_t up = 0;
const u32 DUMMY_NODE = 0;
const u32 ROOT_NODE = 1;
pj.structural_indexes[DUMMY_NODE] = 0;
pj.structural_indexes[ROOT_NODE] = 0;
JsonNode & dummy = pj.nodes[DUMMY_NODE];
JsonNode & root = pj.nodes[ROOT_NODE];
dummy.prev = dummy.up = DUMMY_NODE;
dummy.next = 0;
root.prev = DUMMY_NODE;
root.up = ROOT_NODE;
root.next = 0;
last = up = ROOT_NODE;
pos = 2;
for(size_t i = 0; i < len; i++) {
JsonNode & n = pj.nodes[pos];
switch buf[i] {
switch (buf[i]) {
case '[':
case '{':
pj.structural_indexes[pos] = i;
n.prev = last;
pj.nodes[last].next = pos;// two-way linked list
n.up = up;
up = pos;
up = pos;// new possible up
last = 0;
pos += 1;
break;
case ']':
case '}':
pj.structural_indexes[pos] = i;
n.prev = up;
n.next = 0;// necessary?
pj.nodes[up].next = pos;// two-way linked list
n.up = pj.nodes[up].up;
up = pj.nodes[up].up;
last = pos;
last = pos;// potential previous
pos += 1;
break;
case '"':
case ':':
case ',':
pj.structural_indexes[pos] = i;
n.prev = last;
n.next = 0;// necessary
pj.nodes[last].next = pos;// two-way linked list
n.up = up;
last = pos;
last = pos;// potential previous
pos += 1;
break;
case '\\':
if(i == len - 1) return false;
if(!is_valid_escape(buf[i+1])) return false;
i = i + 1; // skip valid escape
default:
// nothing
break;
}
n.next = 0;
nodes[n.prev].next = pos;
}
pj.n_structural_indexes = pos;
dummy.next = DUMMY_NODE; // dummy.next is a sump for meaningless 'nexts', clear it
return true;
}