228 lines
15 KiB
C
228 lines
15 KiB
C
#ifndef _BENCHMARK_H_
|
|
#define _BENCHMARK_H_
|
|
#include <float.h>
|
|
#include <stdint.h>
|
|
#include <time.h>
|
|
#ifdef __x86_64__
|
|
|
|
const char *unitname = "cycles";
|
|
|
|
#define RDTSC_START(cycles) \
|
|
do { \
|
|
uint32_t cyc_high, cyc_low; \
|
|
__asm volatile("cpuid\n" \
|
|
"rdtsc\n" \
|
|
"mov %%edx, %0\n" \
|
|
"mov %%eax, %1" \
|
|
: "=r"(cyc_high), "=r"(cyc_low) \
|
|
: \
|
|
: /* no read only */ \
|
|
"%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \
|
|
); \
|
|
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
|
|
} while (0)
|
|
|
|
#define RDTSC_STOP(cycles) \
|
|
do { \
|
|
uint32_t cyc_high, cyc_low; \
|
|
__asm volatile("rdtscp\n" \
|
|
"mov %%edx, %0\n" \
|
|
"mov %%eax, %1\n" \
|
|
"cpuid" \
|
|
: "=r"(cyc_high), "=r"(cyc_low) \
|
|
: /* no read only registers */ \
|
|
: "%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \
|
|
); \
|
|
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
|
|
} while (0)
|
|
|
|
#else
|
|
const char *unitname = " (clock units) ";
|
|
|
|
#define RDTSC_START(cycles) \
|
|
do { \
|
|
cycles = clock(); \
|
|
} while (0)
|
|
|
|
#define RDTSC_STOP(cycles) \
|
|
do { \
|
|
cycles = clock(); \
|
|
} while (0)
|
|
#endif
|
|
|
|
static __attribute__((noinline)) uint64_t rdtsc_overhead_func(uint64_t dummy) {
|
|
return dummy;
|
|
}
|
|
|
|
uint64_t global_rdtsc_overhead = (uint64_t)UINT64_MAX;
|
|
|
|
#define RDTSC_SET_OVERHEAD(test, repeat) \
|
|
do { \
|
|
uint64_t cycles_start, cycles_final, cycles_diff; \
|
|
uint64_t min_diff = UINT64_MAX; \
|
|
for (decltype(repeat) i = 0; i < repeat; i++) { \
|
|
__asm volatile("" ::: /* pretend to clobber */ "memory"); \
|
|
RDTSC_START(cycles_start); \
|
|
test; \
|
|
RDTSC_STOP(cycles_final); \
|
|
cycles_diff = (cycles_final - cycles_start); \
|
|
if (cycles_diff < min_diff) \
|
|
min_diff = cycles_diff; \
|
|
} \
|
|
global_rdtsc_overhead = min_diff; \
|
|
} while (0)
|
|
|
|
double diff(timespec start, timespec end) {
|
|
return static_cast<double>((end.tv_nsec + 1000000000 * end.tv_sec) -
|
|
(start.tv_nsec + 1000000000 * start.tv_sec)) /
|
|
1000000000.0;
|
|
}
|
|
|
|
/*
|
|
* Prints the best number of operations per cycle where
|
|
* test is the function call, answer is the expected answer generated by
|
|
* test, repeat is the number of times we should repeat and size is the
|
|
* number of operations represented by test.
|
|
*/
|
|
#define BEST_TIME(name, test, expected, pre, repeat, size, verbose) \
|
|
do { \
|
|
if (global_rdtsc_overhead == UINT64_MAX) { \
|
|
RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), repeat); \
|
|
} \
|
|
if (verbose) \
|
|
printf("%-40s\t: ", name); \
|
|
else \
|
|
printf("\"%-40s\"", name); \
|
|
fflush(NULL); \
|
|
uint64_t cycles_start, cycles_final, cycles_diff; \
|
|
uint64_t min_diff = (uint64_t)-1; \
|
|
double min_sumclockdiff = DBL_MAX; \
|
|
uint64_t sum_diff = 0; \
|
|
double sumclockdiff = 0; \
|
|
struct timespec time1, time2; \
|
|
for (decltype(repeat) i = 0; i < repeat; i++) { \
|
|
pre; \
|
|
__asm volatile("" ::: /* pretend to clobber */ "memory"); \
|
|
clock_gettime(CLOCK_REALTIME, &time1); \
|
|
RDTSC_START(cycles_start); \
|
|
if (test != expected) { \
|
|
fprintf(stderr, "not expected (%d , %d )", (int)test, (int)expected); \
|
|
break; \
|
|
} \
|
|
RDTSC_STOP(cycles_final); \
|
|
clock_gettime(CLOCK_REALTIME, &time2); \
|
|
double thistiming = diff(time1, time2); \
|
|
sumclockdiff += thistiming; \
|
|
if (thistiming < min_sumclockdiff) \
|
|
min_sumclockdiff = thistiming; \
|
|
cycles_diff = (cycles_final - cycles_start - global_rdtsc_overhead); \
|
|
if (cycles_diff < min_diff) \
|
|
min_diff = cycles_diff; \
|
|
sum_diff += cycles_diff; \
|
|
} \
|
|
uint64_t S = size; \
|
|
double cycle_per_op = static_cast<double>(min_diff) / static_cast<double>(S); \
|
|
double avg_cycle_per_op = static_cast<double>(sum_diff) / (static_cast<double>(S) * static_cast<double>(repeat)); \
|
|
double avg_gb_per_s = \
|
|
(static_cast<double>(S) * static_cast<double>(repeat)) / ((sumclockdiff)*1000.0 * 1000.0 * 1000.0); \
|
|
double max_gb_per_s = \
|
|
static_cast<double>(S) / (min_sumclockdiff * 1000.0 * 1000.0 * 1000.0); \
|
|
if (verbose) \
|
|
printf(" %7.3f %s per input byte (best) ", cycle_per_op, unitname); \
|
|
if (verbose) \
|
|
printf(" %7.3f %s (avg) ", avg_cycle_per_op, unitname); \
|
|
if (verbose) \
|
|
printf(" %7.3f GB/s (error margin: %5.3f GB/s)", max_gb_per_s, \
|
|
-avg_gb_per_s + max_gb_per_s); \
|
|
if (verbose) \
|
|
printf(" %13.0f documents/s (best)", 1.0/min_sumclockdiff); \
|
|
if (verbose) \
|
|
printf(" %13.0f documents/s (avg)", 1.0/(sumclockdiff/static_cast<double>(repeat))); \
|
|
if (!verbose) \
|
|
printf(" %20.3f %20.3f %20.3f %20.3f", cycle_per_op, \
|
|
avg_cycle_per_op - cycle_per_op, max_gb_per_s, \
|
|
-avg_gb_per_s + max_gb_per_s); \
|
|
printf("\n"); \
|
|
fflush(NULL); \
|
|
} while (0)
|
|
|
|
// like BEST_TIME, but no check
|
|
#define BEST_TIME_NOCHECK(name, test, pre, repeat, size, verbose) \
|
|
do { \
|
|
if (global_rdtsc_overhead == UINT64_MAX) { \
|
|
RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), repeat); \
|
|
} \
|
|
if (verbose) \
|
|
printf("%-40s\t: ", name); \
|
|
fflush(NULL); \
|
|
uint64_t cycles_start, cycles_final, cycles_diff; \
|
|
uint64_t min_diff = (uint64_t)-1; \
|
|
uint64_t sum_diff = 0; \
|
|
for (int i = 0; i < repeat; i++) { \
|
|
pre; \
|
|
__asm volatile("" ::: /* pretend to clobber */ "memory"); \
|
|
RDTSC_START(cycles_start); \
|
|
test; \
|
|
RDTSC_STOP(cycles_final); \
|
|
cycles_diff = (cycles_final - cycles_start - global_rdtsc_overhead); \
|
|
if (cycles_diff < min_diff) \
|
|
min_diff = cycles_diff; \
|
|
sum_diff += cycles_diff; \
|
|
} \
|
|
uint64_t S = size; \
|
|
double cycle_per_op = static_cast<double>(min_diff) / static_cast<double>(S); \
|
|
double avg_cycle_per_op = static_cast<double>(sum_diff) / (static_cast<double>(S) * static_cast<double>(repeat)); \
|
|
if (verbose) \
|
|
printf(" %.3f %s per input byte (best) ", cycle_per_op, unitname); \
|
|
if (verbose) \
|
|
printf(" %.3f %s per input byte (avg) ", avg_cycle_per_op, unitname); \
|
|
if (verbose) \
|
|
printf("\n"); \
|
|
if (!verbose) \
|
|
printf(" %.3f ", cycle_per_op); \
|
|
fflush(NULL); \
|
|
} while (0)
|
|
|
|
// like BEST_TIME except that we run a function to check the result
|
|
#define BEST_TIME_CHECK(test, check, pre, repeat, size, verbose) \
|
|
do { \
|
|
if (global_rdtsc_overhead == UINT64_MAX) { \
|
|
RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), repeat); \
|
|
} \
|
|
if (verbose) \
|
|
printf("%-60s\t:\n", #test); \
|
|
fflush(NULL); \
|
|
uint64_t cycles_start, cycles_final, cycles_diff; \
|
|
uint64_t min_diff = (uint64_t)-1; \
|
|
uint64_t sum_diff = 0; \
|
|
for (int i = 0; i < repeat; i++) { \
|
|
pre; \
|
|
__asm volatile("" ::: /* pretend to clobber */ "memory"); \
|
|
RDTSC_START(cycles_start); \
|
|
test; \
|
|
RDTSC_STOP(cycles_final); \
|
|
if (!check) { \
|
|
printf("error"); \
|
|
break; \
|
|
} \
|
|
cycles_diff = (cycles_final - cycles_start - global_rdtsc_overhead); \
|
|
if (cycles_diff < min_diff) \
|
|
min_diff = cycles_diff; \
|
|
sum_diff += cycles_diff; \
|
|
} \
|
|
uint64_t S = size; \
|
|
float cycle_per_op = (min_diff) / (double)S; \
|
|
float avg_cycle_per_op = (sum_diff) / ((double)S * repeat); \
|
|
if (verbose) \
|
|
printf(" %.3f cycles per operation (best) ", cycle_per_op); \
|
|
if (verbose) \
|
|
printf("\t%.3f cycles per operation (avg) ", avg_cycle_per_op); \
|
|
if (verbose) \
|
|
printf("\n"); \
|
|
if (!verbose) \
|
|
printf(" %.3f ", cycle_per_op); \
|
|
fflush(NULL); \
|
|
} while (0)
|
|
|
|
#endif
|