use unique_ptr in class parsedjson (#417)

* refactor ParsedJson to use unique_ptr instead of owning raw pointers
* fix potential undefined behavior
* output only the first CPU entry of /proc/cpuinfo in the CI checkperf step
This commit is contained in:
Paul Dreik 2019-12-31 20:31:45 +01:00 committed by Daniel Lemire
parent 6f799435b6
commit 399d08c86c
7 changed files with 49 additions and 115 deletions

View File

@ -22,7 +22,7 @@ steps:
image: gcc:8
environment:
CHECKPERF_REPOSITORY: https://github.com/lemire/simdjson
commands: [ cat /proc/cpuinfo, make checkperf ]
commands: [ sed '/^$/Q' /proc/cpuinfo, make checkperf ]
---
kind: pipeline
name: x64-build

View File

@ -5,6 +5,7 @@
#include "simdjson/simdjson.h"
#include <cstring>
#include <iostream>
#include <memory>
#define JSON_VALUE_MASK 0xFFFFFFFFFFFFFF
@ -21,10 +22,14 @@ class ParsedJson {
public:
// create a ParsedJson container with zero capacity, call allocate_capacity to
// allocate memory
ParsedJson();
~ParsedJson();
ParsedJson(ParsedJson &&p);
ParsedJson &operator=(ParsedJson &&o);
ParsedJson()=default;
~ParsedJson()=default;
// this is a move only class
ParsedJson(ParsedJson &&p) = default;
ParsedJson(const ParsedJson &p) = delete;
ParsedJson &operator=(ParsedJson &&o) = default;
ParsedJson &operator=(const ParsedJson &o) = delete;
// if needed, allocate memory so that the object is able to process JSON
// documents having up to len bytes and max_depth "depth"
@ -77,7 +82,8 @@ public:
really_inline void write_tape_s64(int64_t i) {
write_tape(0, 'l');
tape[current_loc++] = *(reinterpret_cast<uint64_t *>(&i));
std::memcpy(&tape[current_loc], &i, sizeof(i));
++current_loc;
}
really_inline void write_tape_u64(uint64_t i) {
@ -113,27 +119,22 @@ public:
uint32_t current_loc{0};
uint32_t n_structural_indexes{0};
uint32_t *structural_indexes;
std::unique_ptr<uint32_t[]> structural_indexes;
std::unique_ptr<uint64_t[]> tape;
std::unique_ptr<uint32_t[]> containing_scope_offset;
uint64_t *tape;
uint32_t *containing_scope_offset;
#ifdef SIMDJSON_USE_COMPUTED_GOTO
void **ret_address;
std::unique_ptr<void*[]> ret_address;
#else
char *ret_address;
std::unique_ptr<char[]> ret_address;
#endif
uint8_t *string_buf; // should be at least byte_capacity
std::unique_ptr<uint8_t[]> string_buf;// should be at least byte_capacity
uint8_t *current_string_buf_loc;
bool valid{false};
int error_code{simdjson::UNITIALIZED};
private:
// we don't want the default constructor to be called
ParsedJson(const ParsedJson &p) =
delete; // we don't want the default constructor to be called
// we don't want the assignment to be called
ParsedJson &operator=(const ParsedJson &o) = delete;
};
// dump bits low to high

View File

@ -64,14 +64,14 @@ public:
// within the string: get_string_length determines the true string length.
inline const char *get_string() const {
return reinterpret_cast<const char *>(
pj->string_buf + (current_val & JSON_VALUE_MASK) + sizeof(uint32_t));
pj->string_buf.get() + (current_val & JSON_VALUE_MASK) + sizeof(uint32_t));
}
// return the length of the string in bytes
inline uint32_t get_string_length() const {
uint32_t answer;
memcpy(&answer,
reinterpret_cast<const char *>(pj->string_buf +
reinterpret_cast<const char *>(pj->string_buf.get() +
(current_val & JSON_VALUE_MASK)),
sizeof(uint32_t));
return answer;

View File

@ -371,7 +371,7 @@ int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &p
return simdjson::CAPACITY;
}
utf8_checker utf8_checker{};
json_structural_scanner scanner{pj.structural_indexes};
json_structural_scanner scanner{pj.structural_indexes.get()};
scanner.scan<STEP_SIZE>(buf, len, utf8_checker);
simdjson::ErrorValues error = scanner.detect_errors_on_eof();
@ -379,7 +379,7 @@ int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &p
return error;
}
pj.n_structural_indexes = scanner.structural_indexes.tail - pj.structural_indexes;
pj.n_structural_indexes = scanner.structural_indexes.tail - pj.structural_indexes.get();
/* a valid JSON file cannot have zero structural indexes - we should have
* found something */
if (unlikely(pj.n_structural_indexes == 0u)) {

View File

@ -73,7 +73,7 @@ WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf,
UNUSED size_t len, ParsedJson &pj,
UNUSED const uint32_t depth,
UNUSED uint32_t offset) {
pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"');
pj.write_tape(pj.current_string_buf_loc - pj.string_buf.get(), '"');
const uint8_t *src = &buf[offset + 1]; /* we know that buf at offset is a " */
uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t);
const uint8_t *const start_of_string = dst;

View File

@ -2,58 +2,6 @@
#include "simdjson/jsonformatutils.h"
namespace simdjson {
ParsedJson::ParsedJson()
: structural_indexes(nullptr), tape(nullptr),
containing_scope_offset(nullptr), ret_address(nullptr),
string_buf(nullptr), current_string_buf_loc(nullptr) {}
ParsedJson::~ParsedJson() { deallocate(); }
ParsedJson::ParsedJson(ParsedJson &&p)
: byte_capacity(p.byte_capacity), depth_capacity(p.depth_capacity),
tape_capacity(p.tape_capacity), string_capacity(p.string_capacity),
current_loc(p.current_loc), n_structural_indexes(p.n_structural_indexes),
structural_indexes(p.structural_indexes), tape(p.tape),
containing_scope_offset(p.containing_scope_offset),
ret_address(p.ret_address), string_buf(p.string_buf),
current_string_buf_loc(p.current_string_buf_loc), valid(p.valid) {
p.structural_indexes = nullptr;
p.tape = nullptr;
p.containing_scope_offset = nullptr;
p.ret_address = nullptr;
p.string_buf = nullptr;
p.current_string_buf_loc = nullptr;
}
ParsedJson &ParsedJson::operator=(ParsedJson &&p) {
byte_capacity = p.byte_capacity;
p.byte_capacity = 0;
depth_capacity = p.depth_capacity;
p.depth_capacity = 0;
tape_capacity = p.tape_capacity;
p.tape_capacity = 0;
string_capacity = p.string_capacity;
p.string_capacity = 0;
current_loc = p.current_loc;
p.current_loc = 0;
n_structural_indexes = p.n_structural_indexes;
p.n_structural_indexes = 0;
structural_indexes = p.structural_indexes;
p.structural_indexes = nullptr;
tape = p.tape;
p.tape = nullptr;
containing_scope_offset = p.containing_scope_offset;
p.containing_scope_offset = nullptr;
ret_address = p.ret_address;
p.ret_address = nullptr;
string_buf = p.string_buf;
p.string_buf = nullptr;
current_string_buf_loc = p.current_string_buf_loc;
p.current_string_buf_loc = nullptr;
valid = p.valid;
p.valid = false;
return *this;
}
WARN_UNUSED
bool ParsedJson::allocate_capacity(size_t len, size_t max_depth) {
@ -74,7 +22,8 @@ bool ParsedJson::allocate_capacity(size_t len, size_t max_depth) {
byte_capacity = 0; // will only set it to len after allocations are a success
n_structural_indexes = 0;
uint32_t max_structures = ROUNDUP_N(len, 64) + 2 + 7;
structural_indexes = new (std::nothrow) uint32_t[max_structures];
structural_indexes.reset( new (std::nothrow) uint32_t[max_structures]);
// a pathological input like "[[[[..." would generate len tape elements, so
// need a capacity of at least len + 1, but it is also possible to do
// worse with "[7,7,7,7,6,7,7,7,6,7,7,6,[7,7,7,7,6,7,7,7,6,7,7,6,7,7,7,7,7,7,6"
@ -84,24 +33,19 @@ bool ParsedJson::allocate_capacity(size_t len, size_t max_depth) {
// a document with only zero-length strings... could have len/3 string
// and we would need len/3 * 5 bytes on the string buffer
size_t local_string_capacity = ROUNDUP_N(5 * len / 3 + 32, 64);
string_buf = new (std::nothrow) uint8_t[local_string_capacity];
tape = new (std::nothrow) uint64_t[local_tape_capacity];
containing_scope_offset = new (std::nothrow) uint32_t[max_depth];
string_buf.reset( new (std::nothrow) uint8_t[local_string_capacity]);
tape.reset(new (std::nothrow) uint64_t[local_tape_capacity]);
containing_scope_offset.reset(new (std::nothrow) uint32_t[max_depth]);
#ifdef SIMDJSON_USE_COMPUTED_GOTO
ret_address = new (std::nothrow) void *[max_depth];
//ret_address = new (std::nothrow) void *[max_depth];
ret_address.reset(new (std::nothrow) void *[max_depth]);
#else
ret_address = new (std::nothrow) char[max_depth];
ret_address.reset(new (std::nothrow) char[max_depth]);
#endif
if ((string_buf == nullptr) || (tape == nullptr) ||
(containing_scope_offset == nullptr) || (ret_address == nullptr) ||
(structural_indexes == nullptr)) {
if (!string_buf || !tape ||
!containing_scope_offset || !ret_address ||
!structural_indexes) {
std::cerr << "Could not allocate memory" << std::endl;
delete[] ret_address;
delete[] containing_scope_offset;
delete[] tape;
delete[] string_buf;
delete[] structural_indexes;
return false;
}
/*
@ -131,16 +75,16 @@ void ParsedJson::deallocate() {
depth_capacity = 0;
tape_capacity = 0;
string_capacity = 0;
delete[] ret_address;
delete[] containing_scope_offset;
delete[] tape;
delete[] string_buf;
delete[] structural_indexes;
ret_address.reset();
containing_scope_offset.reset();
tape.reset();
string_buf.reset();
structural_indexes.reset();
valid = false;
}
void ParsedJson::init() {
current_string_buf_loc = string_buf;
current_string_buf_loc = string_buf.get();
current_loc = 0;
valid = false;
}
@ -168,8 +112,8 @@ bool ParsedJson::print_json(std::ostream &os) const {
return false;
}
tape_idx++;
bool *in_object = new bool[depth_capacity];
auto *in_object_idx = new size_t[depth_capacity];
std::unique_ptr<bool[]> in_object(new bool[depth_capacity]);
std::unique_ptr<size_t[]> in_object_idx(new size_t[depth_capacity]);
int depth = 1; // only root at level 0
in_object_idx[depth] = 0;
in_object[depth] = false;
@ -195,32 +139,26 @@ bool ParsedJson::print_json(std::ostream &os) const {
switch (type) {
case '"': // we have a string
os << '"';
memcpy(&string_length, string_buf + payload, sizeof(uint32_t));
memcpy(&string_length, string_buf.get() + payload, sizeof(uint32_t));
print_with_escapes(
(const unsigned char *)(string_buf + payload + sizeof(uint32_t)),
(const unsigned char *)(string_buf.get() + payload + sizeof(uint32_t)),
os, string_length);
os << '"';
break;
case 'l': // we have a long int
if (tape_idx + 1 >= how_many) {
delete[] in_object;
delete[] in_object_idx;
return false;
}
os << static_cast<int64_t>(tape[++tape_idx]);
break;
case 'u':
if (tape_idx + 1 >= how_many) {
delete[] in_object;
delete[] in_object_idx;
return false;
}
os << tape[++tape_idx];
break;
case 'd': // we have a double
if (tape_idx + 1 >= how_many) {
delete[] in_object;
delete[] in_object_idx;
return false;
}
double answer;
@ -258,18 +196,12 @@ bool ParsedJson::print_json(std::ostream &os) const {
break;
case 'r': // we start and end with the root node
fprintf(stderr, "should we be hitting the root node?\n");
delete[] in_object;
delete[] in_object_idx;
return false;
default:
fprintf(stderr, "bug %c\n", type);
delete[] in_object;
delete[] in_object_idx;
return false;
}
}
delete[] in_object;
delete[] in_object_idx;
return true;
}
@ -301,9 +233,10 @@ bool ParsedJson::dump_raw_tape(std::ostream &os) const {
switch (type) {
case '"': // we have a string
os << "string \"";
memcpy(&string_length, string_buf + payload, sizeof(uint32_t));
memcpy(&string_length, string_buf.get() + payload, sizeof(uint32_t));
print_with_escapes(
(const unsigned char *)(string_buf + payload + sizeof(uint32_t)),
(const unsigned char *)(string_buf.get() + payload + sizeof(uint32_t)),
os,
string_length);
os << '"';
os << '\n';