Merge pull request #1101 from simdjson/jkeiser/yakety-sax

Basic SAX interface with benchmarks
This commit is contained in:
John Keiser 2020-08-19 09:05:44 -07:00 committed by GitHub
commit 0a2bca3f73
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 1901 additions and 580 deletions

View File

@ -1,5 +1,13 @@
include_directories( . linux )
link_libraries(simdjson simdjson-flags simdjson-windows-headers test-data)
link_libraries(simdjson-windows-headers test-data)
if (TARGET benchmark::benchmark)
add_executable(bench_sax bench_sax.cpp)
target_link_libraries(bench_sax simdjson-internal-flags simdjson-include-source benchmark::benchmark)
endif (TARGET benchmark::benchmark)
link_libraries(simdjson simdjson-flags)
add_executable(benchfeatures benchfeatures.cpp)
add_executable(get_corpus_benchmark get_corpus_benchmark.cpp)
add_executable(perfdiff perfdiff.cpp)
@ -14,12 +22,6 @@ target_compile_definitions(parse_nonumberparsing PRIVATE SIMDJSON_SKIPNUMBERPARS
add_executable(parse_nostringparsing parse.cpp)
target_compile_definitions(parse_nostringparsing PRIVATE SIMDJSON_SKIPSTRINGPARSING)
if (TARGET benchmark::benchmark)
link_libraries(benchmark::benchmark)
add_executable(bench_parse_call bench_parse_call.cpp)
add_executable(bench_dom_api bench_dom_api.cpp)
endif()
if (TARGET competition-all)
add_executable(distinctuseridcompetition distinctuseridcompetition.cpp)
target_link_libraries(distinctuseridcompetition competition-core)
@ -34,4 +36,10 @@ if (TARGET competition-all)
target_compile_definitions(allparsingcompetition PRIVATE ALLPARSER)
endif()
if (TARGET benchmark::benchmark)
link_libraries(benchmark::benchmark)
add_executable(bench_parse_call bench_parse_call.cpp)
add_executable(bench_dom_api bench_dom_api.cpp)
endif()
include(checkperf.cmake)

View File

@ -22,7 +22,7 @@ static void recover_one_string(State& state) {
return;
}
dom::element doc;
if (error = parser.parse(docdata).get(doc)) {
if ((error = parser.parse(docdata).get(doc))) {
cerr << "could not parse string" << error << endl;
return;
}
@ -48,8 +48,7 @@ static void serialize_twitter(State& state) {
return;
}
// we do not want mem. alloc. in the loop.
error = parser.allocate(docdata.size());
if(error) {
if((error = parser.allocate(docdata.size()))) {
cout << error << endl;
return;
}

359
benchmark/bench_sax.cpp Normal file
View File

@ -0,0 +1,359 @@
#define SIMDJSON_IMPLEMENTATION_FALLBACK 0
#define SIMDJSON_IMPLEMENTATION_WESTMERE 0
#define SIMDJSON_IMPLEMENTATION_AMD64 0
#include <iostream>
#include <sstream>
#include <random>
#include "simdjson.h"
SIMDJSON_PUSH_DISABLE_ALL_WARNINGS
#include <benchmark/benchmark.h>
SIMDJSON_POP_DISABLE_WARNINGS
#include "simdjson.cpp"
#if SIMDJSON_EXCEPTIONS
using namespace benchmark;
using namespace simdjson;
using std::cerr;
using std::endl;
const char *TWITTER_JSON = SIMDJSON_BENCHMARK_DATA_DIR "twitter.json";
const int REPETITIONS = 10;
#if SIMDJSON_IMPLEMENTATION_HASWELL
#include "twitter/sax_tweet_reader.h"
/**
 * Benchmark: extract the tweets from twitter.json with twitter::sax_tweet_reader.
 *
 * Setup failures (loading the file, sizing the reader) are printed to cerr and end the
 * benchmark early; failures while reading are thrown as error_code values.
 * Reports bytes, documents and tweets as rate counters.
 */
static void sax_tweets(State &state) {
  // Read the whole file into a padded buffer.
  padded_string json;
  auto load_error = padded_string::load(TWITTER_JSON).get(json);
  if (load_error) { cerr << load_error << endl; return; }

  // Size the reader for this document.
  twitter::sax_tweet_reader reader;
  auto capacity_error = reader.set_capacity(json.size());
  if (capacity_error) { cerr << capacity_error << endl; return; }

  // One untimed pass so the tweet vector has already been used once.
  auto warmup_error = reader.read_tweets(json);
  if (warmup_error) { throw warmup_error; }

  // Timed passes.
  size_t total_bytes = 0;
  size_t total_tweets = 0;
  for (SIMDJSON_UNUSED auto _ : state) {
    auto read_error = reader.read_tweets(json);
    if (read_error) { throw read_error; }
    total_bytes += json.size();
    total_tweets += reader.tweets.size();
  }

  // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
  state.counters["Gigabytes"] = benchmark::Counter(
      double(total_bytes),
      benchmark::Counter::kIsRate,
      benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024
  state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate);
  state.counters["tweets"] = Counter(double(total_tweets), benchmark::Counter::kIsRate);
}
// Run REPETITIONS times and additionally report the maximum across repetitions.
BENCHMARK(sax_tweets)
    ->Repetitions(REPETITIONS)
    ->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
      return *(std::max_element(v.begin(), v.end()));
    })
    ->DisplayAggregatesOnly(true);
#endif // SIMDJSON_IMPLEMENTATION_HASWELL
#include "twitter/tweet.h"
/**
 * Convert a DOM element to an unsigned integer, mapping JSON null to 0.
 * Any other element goes through the element's own conversion to uint64_t.
 */
simdjson_really_inline uint64_t nullable_int(dom::element element) {
  if (!element.is_null()) {
    uint64_t value = element;
    return value;
  }
  return 0;
}
/**
 * Parse `json` with the DOM parser and append one twitter::tweet per entry of the
 * top-level "statuses" array to `tweets`.
 *
 * The vector is appended to, not cleared. Fields are read in declaration order of
 * twitter::tweet; "in_reply_to_status_id" may be null and is then stored as 0.
 */
simdjson_really_inline void read_dom_tweets(dom::parser &parser, padded_string &json, std::vector<twitter::tweet> &tweets) {
  for (dom::element status : parser.parse(json)["statuses"]) {
    auto author = status["user"];
    tweets.push_back(twitter::tweet{
      status["id"],
      status["text"],
      status["created_at"],
      nullable_int(status["in_reply_to_status_id"]),
      status["retweet_count"],
      status["favorite_count"],
      { author["id"], author["screen_name"] }
    });
  }
}
/**
 * Benchmark: extract the tweets from twitter.json by building the DOM and walking it
 * with read_dom_tweets().
 *
 * Setup failures are printed to cerr and end the benchmark early.
 * Reports bytes, documents and tweets as rate counters.
 */
static void dom_tweets(State &state) {
  // Read the whole file into a padded buffer.
  padded_string json;
  auto load_error = padded_string::load(TWITTER_JSON).get(json);
  if (load_error) { cerr << load_error << endl; return; }

  // Pre-size the parser so the timed loop does not allocate parser memory.
  dom::parser parser;
  auto alloc_error = parser.allocate(json.size());
  if (alloc_error) { cerr << alloc_error << endl; return; }

  // One untimed pass so the tweet vector has already grown.
  std::vector<twitter::tweet> tweets;
  read_dom_tweets(parser, json, tweets);

  // Timed passes; the vector is emptied (capacity kept) before each one.
  size_t total_bytes = 0;
  size_t total_tweets = 0;
  for (SIMDJSON_UNUSED auto _ : state) {
    tweets.clear();
    read_dom_tweets(parser, json, tweets);
    total_bytes += json.size();
    total_tweets += tweets.size();
  }

  // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
  state.counters["Gigabytes"] = benchmark::Counter(
      double(total_bytes),
      benchmark::Counter::kIsRate,
      benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024
  state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate);
  state.counters["tweets"] = Counter(double(total_tweets), benchmark::Counter::kIsRate);
}
// Run REPETITIONS times and additionally report the maximum across repetitions.
BENCHMARK(dom_tweets)
    ->Repetitions(REPETITIONS)
    ->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
      return *(std::max_element(v.begin(), v.end()));
    })
    ->DisplayAggregatesOnly(true);
/**
 * Benchmark: parse twitter.json into a DOM without reading anything back out of it.
 *
 * Setup failures are printed to cerr and end the benchmark early; a parse failure inside
 * the timed loop throws the string literal "Parsing failed".
 */
static void dom_parse(State &state) {
  // Read the whole file into a padded buffer.
  padded_string json;
  auto load_error = padded_string::load(TWITTER_JSON).get(json);
  if (load_error) { cerr << load_error << endl; return; }

  // Pre-size the parser for this document.
  dom::parser parser;
  auto alloc_error = parser.allocate(json.size());
  if (alloc_error) { cerr << alloc_error << endl; return; }

  // Timed passes.
  size_t total_bytes = 0;
  for (SIMDJSON_UNUSED auto _ : state) {
    auto parse_error = parser.parse(json).error();
    if (parse_error) { throw "Parsing failed"; }
    total_bytes += json.size();
  }

  // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
  state.counters["Gigabytes"] = benchmark::Counter(
      double(total_bytes),
      benchmark::Counter::kIsRate,
      benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024
  state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate);
}
// Run REPETITIONS times and additionally report the maximum across repetitions.
BENCHMARK(dom_parse)
    ->Repetitions(REPETITIONS)
    ->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
      return *(std::max_element(v.begin(), v.end()));
    })
    ->DisplayAggregatesOnly(true);
/********************
* Large file parsing benchmarks:
********************/
/**
 * Build the text of a JSON array holding N objects of the form
 * { "x":<double>, "y":<double>, "z":<double>}, with coordinates drawn from a
 * default-seeded engine over [0, 1).
 *
 * Also prints the size of the generated text (rounded to the nearest KB) to std::cout.
 */
static std::string build_json_array(size_t N) {
  std::default_random_engine engine;
  std::uniform_real_distribution<> unit(0, 1);
  std::stringstream out;

  // Appends one object; the coordinates are drawn in x, y, z order.
  auto append_point = [&]() {
    out << "{ \"x\":" << unit(engine);
    out << ", \"y\":" << unit(engine);
    out << ", \"z\":" << unit(engine);
    out << "}";
  };

  out << "[\n";
  for (size_t i = 0; i < N; i++) {
    if (i != 0) { out << ",\n"; }
    append_point();
    // Only the first object is followed directly by a newline.
    if (i == 0) { out << "\n"; }
  }
  out << "\n]\n";

  std::string answer = out.str();
  std::cout << "Creating a source file spanning " << (answer.size() + 512) / 1024 << " KB " << std::endl;
  return answer;
}
/**
 * Return the generated one-million-point JSON document.
 * It is built on the first call and the same object is returned afterwards.
 */
static const simdjson::padded_string& get_my_json_str() {
  static simdjson::padded_string generated_document = build_json_array(1000000);
  return generated_document;
}
// One point of the generated "largerandom" document.
// Kept as a plain aggregate: the benchmarks build it with my_point{x, y, z}, so the
// member order is part of how it is used.
struct my_point {
double x;
double y;
double z;
};
// ./benchmark/bench_sax --benchmark_filter=largerandom
/***
* We start with the naive DOM-based approach.
**/
/**
 * Benchmark: parse the generated point array into a DOM, then copy every point into a
 * std::vector<my_point>.
 *
 * A parser allocation failure is printed to cerr and ends the benchmark early; a parse
 * failure inside the timed loop is printed and then throws "Parsing failed".
 */
static void dom_parse_largerandom(State &state) {
  // Use the generated document (built once by get_my_json_str()).
  const padded_string& json = get_my_json_str();

  // Pre-size the parser for this document.
  dom::parser parser;
  auto alloc_error = parser.allocate(json.size());
  if (alloc_error) { cerr << alloc_error << endl; return; }

  // Timed passes; a fresh vector is filled on every iteration.
  size_t total_bytes = 0;
  for (SIMDJSON_UNUSED auto _ : state) {
    std::vector<my_point> container;
    dom::element doc;
    const simdjson::error_code parse_error = parser.parse(json).get(doc);
    if (parse_error) {
      std::cerr << "failure: " << parse_error << std::endl;
      throw "Parsing failed";
    }
    for (dom::element point : doc) {
      container.emplace_back(my_point{point["x"], point["y"], point["z"]});
    }
    total_bytes += json.size();
    // Keep the compiler from discarding the extracted points.
    benchmark::DoNotOptimize(container.data());
  }

  // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
  state.counters["Gigabytes"] = benchmark::Counter(
      double(total_bytes),
      benchmark::Counter::kIsRate,
      benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024
  state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate);
}
// Run REPETITIONS times and additionally report the maximum across repetitions.
BENCHMARK(dom_parse_largerandom)
    ->Repetitions(REPETITIONS)
    ->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
      return *(std::max_element(v.begin(), v.end()));
    })
    ->DisplayAggregatesOnly(true);
#if SIMDJSON_IMPLEMENTATION_HASWELL
/***
* Next we are going to code the SAX approach.
**/
SIMDJSON_TARGET_HASWELL
namespace largerandom {
namespace {
using namespace simdjson;
using namespace haswell;
using namespace haswell::stage2;
/**
 * SAX visitor for the generated point array: remembers which coordinate the most recent
 * key named and parses each primitive value into the matching slot of `buffer`.
 *
 * Every other callback is a no-op that returns SUCCESS.
 *
 * NOTE(review): nothing in this visitor appends to `points`; the reference is only stored.
 */
struct sax_point_reader_visitor {
public:
  sax_point_reader_visitor(std::vector<my_point> &_points) : points(_points) {
  }
  simdjson_really_inline error_code visit_document_start(json_iterator &) { return SUCCESS; }
  simdjson_really_inline error_code visit_object_start(json_iterator &) { return SUCCESS; }
  /**
   * Select the buffer slot for the coordinate named by `key`.
   *
   * `key` points at the raw JSON key, starting with its opening quote (the tweet visitor's
   * lookup table registers its keys with the surrounding quotes for the same reason), so
   * the first character of the name is key[1]. Slots are 0, 1, 2 for x, y, z, which keeps
   * every index inside the three-element buffer. Unknown keys leave the slot unchanged.
   */
  simdjson_really_inline error_code visit_key(json_iterator &, const uint8_t *key) {
    switch(key[1]) {
      case 'x':
        idx = 0;
        break;
      case 'y':
        idx = 1;
        break;
      case 'z':
        idx = 2;
        break;
    }
    return SUCCESS;
  }
  /** Parse `value` as a double into the slot chosen by the last visit_key(). */
  simdjson_really_inline error_code visit_primitive(json_iterator &, const uint8_t *value) {
    return numberparsing::parse_double(value).get(buffer[idx]);
  }
  simdjson_really_inline error_code visit_array_start(json_iterator &) { return SUCCESS; }
  simdjson_really_inline error_code visit_array_end(json_iterator &) { return SUCCESS; }
  simdjson_really_inline error_code visit_object_end(json_iterator &) { return SUCCESS; }
  simdjson_really_inline error_code visit_document_end(json_iterator &) { return SUCCESS; }
  simdjson_really_inline error_code visit_empty_array(json_iterator &) { return SUCCESS; }
  simdjson_really_inline error_code visit_empty_object(json_iterator &) { return SUCCESS; }
  simdjson_really_inline error_code visit_root_primitive(json_iterator &, const uint8_t *) { return SUCCESS; }
  simdjson_really_inline error_code increment_count(json_iterator &) { return SUCCESS; }
  std::vector<my_point> &points;
  // Slot of `buffer` that the next primitive is written to; always 0, 1 or 2.
  size_t idx{0};
  // Most recently parsed x, y, z values.
  double buffer[3]{};
};
// Owns the state needed to read the generated point array through the SAX interface:
// the output vector, a scratch buffer, and the stage 1 / stage 2 parser implementation.
struct sax_point_reader {
// Output vector; read_points() clears it at the start of every call.
std::vector<my_point> points;
// Scratch buffer allocated by set_capacity().
std::unique_ptr<uint8_t[]> string_buf;
// Document size (bytes) the reader was last sized for; 0 until set_capacity() succeeds.
size_t capacity;
dom_parser_implementation dom_parser;
sax_point_reader();
// Allocates string_buf and sizes dom_parser for documents of up to new_capacity bytes.
error_code set_capacity(size_t new_capacity);
// Runs stage 1 on json, then walks the document with a sax_point_reader_visitor.
error_code read_points(const padded_string &json);
}; // struct sax_point_reader
/** Start with no points, no string buffer and zero capacity. */
sax_point_reader::sax_point_reader()
  : points(),
    string_buf(),
    capacity(0),
    dom_parser()
{}
/**
 * Allocate the string buffer and size the parser for documents of up to
 * `new_capacity` bytes.
 *
 * @return SUCCESS, MEMALLOC if the string buffer could not be allocated, or whatever error
 *         the parser implementation reports. `capacity` is only updated on success.
 */
error_code sax_point_reader::set_capacity(size_t new_capacity) {
  // string_capacity copied from document::allocate
  size_t string_capacity = SIMDJSON_ROUNDUP_N(5 * new_capacity / 3 + 32, 64);
  string_buf.reset(new (std::nothrow) uint8_t[string_capacity]);
  // new (std::nothrow) reports failure by returning null rather than throwing.
  if (!string_buf) { return MEMALLOC; }
  if (auto error = dom_parser.set_capacity(new_capacity)) { return error; }
  if (capacity == 0) { // set max depth the first time only
    if (auto error = dom_parser.set_max_depth(DEFAULT_MAX_DEPTH)) { return error; }
  }
  capacity = new_capacity;
  return SUCCESS;
}
/**
 * Read `json` through the SAX interface.
 *
 * Clears `points`, grows the reader if the document is larger than the current capacity,
 * runs stage 1, then walks the document with a sax_point_reader_visitor.
 *
 * @return SUCCESS, or the first error reported by set_capacity(), stage 1 or the walk.
 */
error_code sax_point_reader::read_points(const padded_string &json) {
  points.clear();
  // Grow to the size of this document; passing the old capacity would never grow anything.
  if (capacity < json.size()) {
    if (auto error = set_capacity(json.size())) { return error; }
  }
  // Run stage 1 first.
  if (auto error = dom_parser.stage1(reinterpret_cast<const uint8_t *>(json.data()), json.size(), false)) { return error; }
  // Then walk the document, handing every event to the point visitor.
  json_iterator iter(dom_parser, 0);
  sax_point_reader_visitor visitor(points);
  if (auto error = iter.walk_document<false>(visitor)) { return error; }
  return SUCCESS;
}
} // unnamed namespace
} // namespace largerandom
SIMDJSON_UNTARGET_REGION
// ./benchmark/bench_sax --benchmark_filter=largerandom
/**
 * Benchmark: read the generated point array through largerandom::sax_point_reader.
 *
 * Any error from the reader (sizing, warm-up or timed reads) is thrown as an error_code.
 */
static void sax_parse_largerandom(State &state) {
  // Use the generated document (built once by get_my_json_str()).
  const padded_string& json = get_my_json_str();

  // Size the reader for this document.
  largerandom::sax_point_reader reader;
  auto capacity_error = reader.set_capacity(json.size());
  if (capacity_error) { throw capacity_error; }

  // Ten untimed warm-up passes.
  size_t warmup_pass = 0;
  while (warmup_pass < 10) {
    auto warmup_error = reader.read_points(json);
    if (warmup_error) { throw warmup_error; }
    warmup_pass++;
  }

  // Timed passes.
  size_t total_bytes = 0;
  for (SIMDJSON_UNUSED auto _ : state) {
    auto read_error = reader.read_points(json);
    if (read_error) { throw read_error; }
    total_bytes += json.size();
    // Keep the compiler from discarding the reader's output.
    benchmark::DoNotOptimize(reader.points.data());
  }

  // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
  state.counters["Gigabytes"] = benchmark::Counter(
      double(total_bytes),
      benchmark::Counter::kIsRate,
      benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024
  state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate);
}
// Run REPETITIONS times and additionally report the maximum across repetitions.
BENCHMARK(sax_parse_largerandom)
    ->Repetitions(REPETITIONS)
    ->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
      return *(std::max_element(v.begin(), v.end()));
    })
    ->DisplayAggregatesOnly(true);
#endif // SIMDJSON_IMPLEMENTATION_HASWELL
#endif // SIMDJSON_EXCEPTIONS
BENCHMARK_MAIN();

View File

@ -0,0 +1,67 @@
#ifndef TWITTER_SAX_TWEET_READER_H
#define TWITTER_SAX_TWEET_READER_H
#include "simdjson.h"
#include "sax_tweet_reader_visitor.h"
#include "tweet.h"
#include <vector>
SIMDJSON_TARGET_HASWELL
namespace twitter {
namespace {
using namespace simdjson;
using namespace haswell;
using namespace haswell::stage2;
// Owns the state needed to read tweets through the SAX interface: the output vector,
// the buffer string values are written into, and the parser implementation.
struct sax_tweet_reader {
// Output vector; read_tweets() clears it at the start of every call.
std::vector<tweet> tweets;
// Buffer handed to sax_tweet_reader_visitor for parsed strings; allocated by set_capacity().
std::unique_ptr<uint8_t[]> string_buf;
// Document size (bytes) the reader was last sized for; 0 until set_capacity() succeeds.
size_t capacity;
dom_parser_implementation dom_parser;
sax_tweet_reader();
// Allocates string_buf and sizes dom_parser for documents of up to new_capacity bytes.
error_code set_capacity(size_t new_capacity);
// Runs stage 1 on json, then walks the document with a sax_tweet_reader_visitor.
error_code read_tweets(padded_string &json);
}; // struct sax_tweet_reader
/** Start with no tweets, no string buffer and zero capacity. */
sax_tweet_reader::sax_tweet_reader()
  : tweets(),
    string_buf(),
    capacity(0),
    dom_parser()
{}
/**
 * Allocate the string buffer and size the parser for documents of up to
 * `new_capacity` bytes.
 *
 * @return SUCCESS, MEMALLOC if the string buffer could not be allocated, or whatever error
 *         the parser implementation reports. `capacity` is only updated on success.
 */
error_code sax_tweet_reader::set_capacity(size_t new_capacity) {
  // string_capacity copied from document::allocate
  size_t string_capacity = SIMDJSON_ROUNDUP_N(5 * new_capacity / 3 + 32, 64);
  string_buf.reset(new (std::nothrow) uint8_t[string_capacity]);
  // new (std::nothrow) reports failure by returning null; the visitor would otherwise
  // be handed a null buffer to write strings into.
  if (!string_buf) { return MEMALLOC; }
  if (auto error = dom_parser.set_capacity(new_capacity)) { return error; }
  if (capacity == 0) { // set max depth the first time only
    if (auto error = dom_parser.set_max_depth(DEFAULT_MAX_DEPTH)) { return error; }
  }
  capacity = new_capacity;
  return SUCCESS;
}
/**
 * Read the tweets in `json` through the SAX interface.
 *
 * Clears `tweets`, grows the reader if the document is larger than the current capacity,
 * runs stage 1, then walks the document with a sax_tweet_reader_visitor that writes
 * strings into `string_buf`.
 *
 * @return SUCCESS, or the first error reported by set_capacity(), stage 1 or the walk.
 */
error_code sax_tweet_reader::read_tweets(padded_string &json) {
  tweets.clear();
  // Grow to the size of this document; passing the old capacity would never grow anything.
  if (capacity < json.size()) {
    if (auto error = set_capacity(json.size())) { return error; }
  }
  // Run stage 1 first.
  if (auto error = dom_parser.stage1(reinterpret_cast<const uint8_t *>(json.data()), json.size(), false)) { return error; }
  // Then walk the document, parsing the tweets as we go
  json_iterator iter(dom_parser, 0);
  sax_tweet_reader_visitor visitor(tweets, string_buf.get());
  if (auto error = iter.walk_document<false>(visitor)) { return error; }
  return SUCCESS;
}
} // unnamed namespace
} // namespace twitter
SIMDJSON_UNTARGET_REGION
#endif // TWITTER_SAX_TWEET_READER_H

View File

@ -0,0 +1,519 @@
#ifndef TWITTER_SAX_TWEET_READER_VISITOR_H
#define TWITTER_SAX_TWEET_READER_VISITOR_H
#include "simdjson.h"
#include "tweet.h"
#include <vector>
SIMDJSON_TARGET_HASWELL
namespace twitter {
using namespace simdjson;
using namespace haswell;
using namespace haswell::stage2;
// SAX visitor that fills a std::vector<tweet> from a twitter.json-shaped document.
// It tracks which of a small set of containers it is inside (by depth) and uses a
// 256-slot hash table (field_lookup) to map a JSON key to a member of tweet.
struct sax_tweet_reader_visitor {
public:
// _tweets receives the parsed tweets; string_buf is where parsed string values are written.
sax_tweet_reader_visitor(std::vector<tweet> &_tweets, uint8_t *string_buf);
// Event callbacks. Each returns SUCCESS or an error_code describing the problem.
simdjson_really_inline error_code visit_document_start(json_iterator &iter);
simdjson_really_inline error_code visit_object_start(json_iterator &iter);
simdjson_really_inline error_code visit_key(json_iterator &iter, const uint8_t *key);
simdjson_really_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value);
simdjson_really_inline error_code visit_array_start(json_iterator &iter);
simdjson_really_inline error_code visit_array_end(json_iterator &iter);
simdjson_really_inline error_code visit_object_end(json_iterator &iter);
simdjson_really_inline error_code visit_document_end(json_iterator &iter);
simdjson_really_inline error_code visit_empty_array(json_iterator &iter);
simdjson_really_inline error_code visit_empty_object(json_iterator &iter);
simdjson_really_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value);
simdjson_really_inline error_code increment_count(json_iterator &iter);
private:
// Since we only care about one thing at each level, we just use depth as the marker for what
// object/array we're nested inside.
// The numeric value of each enumerator is the iterator depth of that container.
enum class containers {
document = 0, //
top_object = 1, // {
statuses = 2, // { "statuses": [
tweet = 3, // { "statuses": [ {
user = 4 // { "statuses": [ { "user": {
};
/**
* The largest depth we care about.
* Deeper levels can exist in the document; the visit_* callbacks check the depth first
* and return SUCCESS without doing any work when it does not match the tracked container.
*/
static constexpr uint32_t MAX_SUPPORTED_DEPTH = uint32_t(containers::user);
// Log labels, indexed by the numeric value of `containers`.
static constexpr const char *STATE_NAMES[] = {
"document",
"top object",
"statuses",
"tweet",
"user"
};
// What kind of value a key is expected to hold. `any` is also the type of an empty
// lookup slot, i.e. of every key that was not registered.
enum class field_type {
any,
unsigned_integer,
string,
nullable_unsigned_integer,
object,
array
};
// One entry of the lookup table.
struct field {
// Key text as registered in field_lookup(), including the surrounding double quotes.
const char * key{};
// Length of `key` in bytes.
size_t len{0};
// Byte offset of the destination member inside `tweet` (computed with offsetof).
size_t offset;
// Container the key must appear in.
containers container{containers::document};
field_type type{field_type::any};
};
// Container currently being tracked.
containers container{containers::document};
std::vector<tweet> &tweets;
// Location passed to the string parser as the destination for parsed string values.
uint8_t *current_string_buf_loc;
// Most recent key passed to visit_key(). Within this class it is only ever assigned
// there and is never reset to null.
const uint8_t *current_key{};
// True when the iterator depth equals the depth of the tracked container.
simdjson_really_inline bool in_container(json_iterator &iter);
// True when the iterator depth is exactly one deeper than the tracked container.
simdjson_really_inline bool in_container_child(json_iterator &iter);
simdjson_really_inline void start_container(json_iterator &iter);
simdjson_really_inline void end_container(json_iterator &iter);
simdjson_really_inline error_code parse_nullable_unsigned(json_iterator &iter, const uint8_t *value, const field &f);
simdjson_really_inline error_code parse_unsigned(json_iterator &iter, const uint8_t *value, const field &f);
simdjson_really_inline error_code parse_string(json_iterator &iter, const uint8_t *value, const field &f);
// Hash table from (key, container) to field. Slot 0 is reserved as the "not found" entry.
struct field_lookup {
field entries[256]{};
// Registers the known fields and checks the hash against other twitter.json keys.
field_lookup();
// Returns the entry registered for `key` in `container`, or entries[0] if there is none.
simdjson_really_inline field get(const uint8_t * key, containers container);
private:
// Hashes the first four bytes of `key` together with the depth into a slot index.
simdjson_really_inline uint8_t hash(const char * key, uint32_t depth);
// Registers a field; reports and asserts on a zero hash or an occupied slot.
simdjson_really_inline void add(const char * key, size_t len, containers container, field_type type, size_t offset);
// Reports and asserts if `key` at `depth` hashes onto a registered field.
simdjson_really_inline void neg(const char * const key, uint32_t depth);
};
// Single lookup table shared by all visitors.
static field_lookup fields;
}; // sax_tweet_reader_visitor
/**
 * Bind the visitor to its output vector and to the buffer string values are written into.
 */
sax_tweet_reader_visitor::sax_tweet_reader_visitor(std::vector<tweet> &_tweets, uint8_t *string_buf)
  : tweets(_tweets), current_string_buf_loc(string_buf)
{}
/** Document start: begin tracking the container at the iterator's current depth. */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_document_start(json_iterator &iter) {
  // start_container() derives the container from iter.depth.
  start_container(iter);
  return SUCCESS;
}
/**
 * Array start.
 *
 * Arrays that are not direct children of the tracked container are ignored. Otherwise
 * the current key decides: an `array` field starts a new tracked container, an `any`
 * field is accepted and skipped, and every other field type is an INCORRECT_TYPE error.
 * With no key seen yet, the array is rejected as INCORRECT_TYPE.
 */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_array_start(json_iterator &iter) {
  // Not a direct child of the container we are tracking: nothing to do.
  if (!in_container_child(iter)) { return SUCCESS; }

  if (current_key) {
    const field_type expected = fields.get(current_key, container).type;
    if (expected == field_type::array) { // { "statuses": [
      start_container(iter);
      return SUCCESS;
    }
    if (expected == field_type::any) { return SUCCESS; }
    switch (expected) {
      case field_type::object:
      case field_type::unsigned_integer:
      case field_type::nullable_unsigned_integer:
      case field_type::string:
        iter.log_error("unexpected array field");
        return INCORRECT_TYPE;
      default:
        break;
    }
  }

  // No key has been seen, so this array is not the value of a field.
  iter.log_error("unexpected array");
  return INCORRECT_TYPE;
}
/**
 * Object start.
 *
 * Objects that are not direct children of the tracked container are ignored. If a key has
 * been seen, its registered type decides: `object` starts a new tracked container, `any`
 * is accepted and skipped, anything else is INCORRECT_TYPE. With no key seen yet, an object
 * directly inside the document or the statuses array starts a new tracked container and
 * an object anywhere else is INCORRECT_TYPE.
 *
 * NOTE(review): current_key is never reset after a key has been seen, so the key branch
 * below also runs for objects that are array elements. Confirm this is intended.
 */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_object_start(json_iterator &iter) {
  // Not a direct child of the container we are tracking: nothing to do.
  if (!in_container_child(iter)) { return SUCCESS; }

  // Known fields
  if (current_key) {
    const field_type expected = fields.get(current_key, container).type;
    if (expected == field_type::object) { // { "statuses": [ { "user": {
      start_container(iter);
      return SUCCESS;
    }
    if (expected == field_type::any) { return SUCCESS; }
    switch (expected) {
      case field_type::array:
      case field_type::unsigned_integer:
      case field_type::nullable_unsigned_integer:
      case field_type::string:
        iter.log_error("unexpected object field");
        return INCORRECT_TYPE;
      default:
        break;
    }
  }

  // Not a field: the object is a child of the document or of an array.
  const bool opens_tracked_object =
      container == containers::document ||  // top_object: {
      container == containers::statuses;    // tweet: { "statuses": [ {
  if (opens_tracked_object) {
    start_container(iter);
    return SUCCESS;
  }
  switch (container) {
    case containers::top_object:
    case containers::tweet:
    case containers::user:
      iter.log_error("unexpected object");
      return INCORRECT_TYPE;
    default:
      break;
  }
  SIMDJSON_UNREACHABLE();
  return UNINITIALIZED;
}
/**
 * Key: remember the key so that the callback for the value that follows can look it up.
 * Always returns SUCCESS.
 */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_key(json_iterator &, const uint8_t *key) {
  // Only the pointer is stored; the key bytes are not copied.
  this->current_key = key;
  return SUCCESS;
}
/**
 * Primitive value.
 *
 * Values outside the tracked container are ignored. Otherwise the current key's registered
 * type picks the parser (unsigned, nullable unsigned or string); `any` fields are skipped,
 * and a primitive where an array or object is expected is INCORRECT_TYPE. A primitive seen
 * before any key is also INCORRECT_TYPE.
 */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_primitive(json_iterator &iter, const uint8_t *value) {
  // Only values directly inside the tracked container matter.
  if (!in_container(iter)) { return SUCCESS; }

  if (current_key) {
    const field target = fields.get(current_key, container);
    switch (target.type) {
      case field_type::any:
        return SUCCESS;
      case field_type::string:
        return parse_string(iter, value, target);
      case field_type::unsigned_integer:
        return parse_unsigned(iter, value, target);
      case field_type::nullable_unsigned_integer:
        return parse_nullable_unsigned(iter, value, target);
      case field_type::array:
      case field_type::object:
        iter.log_error("unexpected primitive");
        return INCORRECT_TYPE;
    }
  }

  // Not the value of a field, so it would have to be an array element.
  // The only array we support is statuses, which must contain objects.
  iter.log_error("unexpected primitive");
  return INCORRECT_TYPE;
}
/** Array end: if this closes the tracked container, step back out to its parent. */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_array_end(json_iterator &iter) {
  const bool closes_tracked_container = in_container(iter);
  if (closes_tracked_container) {
    end_container(iter);
  }
  return SUCCESS;
}
/** Object end: if this closes the tracked container, step back out to its parent. */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_object_end(json_iterator &iter) {
  const bool closes_tracked_container = in_container(iter);
  if (closes_tracked_container) {
    end_container(iter);
  }
  return SUCCESS;
}
/** Document end: log it; the tracked container is left as it is. */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_document_end(json_iterator &iter) {
  iter.log_end_value("document");
  return SUCCESS;
}

/** Empty array: accepted, nothing to record. */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_empty_array(json_iterator &) { return SUCCESS; }

/** Empty object: accepted, nothing to record. */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_empty_object(json_iterator &) { return SUCCESS; }

/** A document whose root is a primitive is rejected with INCORRECT_TYPE. */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_root_primitive(json_iterator &iter, const uint8_t *) {
  iter.log_error("unexpected root primitive");
  return INCORRECT_TYPE;
}

/** Element counting is not used by this visitor. */
simdjson_really_inline error_code sax_tweet_reader_visitor::increment_count(json_iterator &) {
  return SUCCESS;
}
/** True when the iterator is at exactly the depth of the tracked container. */
simdjson_really_inline bool sax_tweet_reader_visitor::in_container(json_iterator &iter) {
  const uint32_t tracked_depth = uint32_t(container);
  return iter.depth == tracked_depth;
}
/** True when the iterator is exactly one level deeper than the tracked container. */
simdjson_really_inline bool sax_tweet_reader_visitor::in_container_child(json_iterator &iter) {
  const uint32_t child_depth = uint32_t(container) + 1;
  return iter.depth == child_depth;
}
/**
 * Start tracking the container at the iterator's current depth.
 * The depth must not exceed MAX_SUPPORTED_DEPTH.
 */
simdjson_really_inline void sax_tweet_reader_visitor::start_container(json_iterator &iter) {
  SIMDJSON_ASSUME(iter.depth <= MAX_SUPPORTED_DEPTH); // Asserts in debug mode
  container = static_cast<containers>(iter.depth);
  if (logger::LOG_ENABLED) {
    iter.log_start_value(STATE_NAMES[iter.depth]);
  }
}
/**
 * Stop tracking the current container and go back to the one a level above it.
 */
simdjson_really_inline void sax_tweet_reader_visitor::end_container(json_iterator &iter) {
  const int closing = int(container);
  if (logger::LOG_ENABLED) {
    iter.log_end_value(STATE_NAMES[closing]);
  }
  container = static_cast<containers>(closing - 1);
}
/**
 * Parse `value` as an unsigned integer, or as null (stored as 0), into the member of the
 * last tweet that `f` describes.
 *
 * @return SUCCESS, or the number parsing error if the value is neither a number nor null.
 *
 * NOTE(review): assumes `tweets` is non-empty; nothing in this visitor appends to it.
 */
simdjson_really_inline error_code sax_tweet_reader_visitor::parse_nullable_unsigned(json_iterator &iter, const uint8_t *value, const field &f) {
  iter.log_value(f.key);
  // f.offset is a byte offset (from offsetof), so step in bytes: convert to char* first,
  // then add. Adding to the tweet* directly would advance by whole tweets.
  auto i = reinterpret_cast<uint64_t *>(reinterpret_cast<char *>(&tweets.back()) + f.offset);
  if (auto error = numberparsing::parse_unsigned(value).get(*i)) {
    // If number parsing failed, check if it's null before returning the error
    if (!atomparsing::is_valid_null_atom(value)) { iter.log_error("expected number or null"); return error; }
    // Store 0 in the member for null (assigning to `i` itself would only null the pointer).
    *i = 0;
  }
  return SUCCESS;
}
/**
 * Parse `value` as an unsigned integer into the member of the last tweet that `f` describes.
 *
 * @return the result of the number parser.
 *
 * NOTE(review): assumes `tweets` is non-empty; nothing in this visitor appends to it.
 */
simdjson_really_inline error_code sax_tweet_reader_visitor::parse_unsigned(json_iterator &iter, const uint8_t *value, const field &f) {
  iter.log_value(f.key);
  // f.offset is a byte offset (from offsetof), so step in bytes: convert to char* first,
  // then add. Adding to the tweet* directly would advance by whole tweets.
  auto i = reinterpret_cast<uint64_t *>(reinterpret_cast<char *>(&tweets.back()) + f.offset);
  return numberparsing::parse_unsigned(value).get(*i);
}
/**
 * Parse `value` as a string into the string_view member of the last tweet that `f`
 * describes, using current_string_buf_loc as the parser's output location.
 *
 * @return the result of the string parser.
 *
 * NOTE(review): assumes `tweets` is non-empty; nothing in this visitor appends to it.
 */
simdjson_really_inline error_code sax_tweet_reader_visitor::parse_string(json_iterator &iter, const uint8_t *value, const field &f) {
  iter.log_value(f.key);
  // f.offset is a byte offset (from offsetof), so step in bytes: convert to char* first,
  // then add. Adding to the tweet* directly would advance by whole tweets.
  auto s = reinterpret_cast<std::string_view *>(reinterpret_cast<char *>(&tweets.back()) + f.offset);
  return stringparsing::parse_string_to_buffer(value, current_string_buf_loc, *s);
}
// Definition of the lookup table shared by all visitors. Its constructor registers the
// known fields and runs the collision checks.
sax_tweet_reader_visitor::field_lookup sax_tweet_reader_visitor::fields{};
/**
 * Hash the first four bytes of `key` (which include the opening quote) together with
 * `depth` into a table slot.
 *
 * The caller must make sure at least four bytes are readable at `key`.
 */
simdjson_really_inline uint8_t sax_tweet_reader_visitor::field_lookup::hash(const char * key, uint32_t depth) {
  // Read the key as unsigned bytes: left-shifting a negative `char` (any byte >= 0x80 where
  // char is signed) is undefined before C++20. The low 8 bits, and so the result, are the
  // same as before for every input.
  const uint8_t *k = reinterpret_cast<const uint8_t *>(key);
  // These shift numbers were chosen specifically because this yields only 2 collisions between
  // keys in twitter.json, leaves 0 as a distinct value, and has 0 collisions between keys we
  // actually care about.
  return uint8_t((k[0] << 0) ^ (k[1] << 3) ^ (k[2] << 3) ^ (k[3] << 1) ^ depth);
}
/**
 * Look up the field registered for `key` inside container `c`.
 *
 * @return the matching entry, or entries[0] (the reserved "missing value" entry, whose type
 *         is field_type::any) when the slot is empty, belongs to another container, or
 *         holds a different key.
 */
simdjson_really_inline sax_tweet_reader_visitor::field sax_tweet_reader_visitor::field_lookup::get(const uint8_t * key, containers c) {
  auto index = hash(reinterpret_cast<const char *>(key), uint32_t(c));
  // Bind by reference; the entry is only copied when it is actually returned.
  const field &entry = entries[index];
  // An empty slot has a null key, which must not be handed to memcmp.
  if (entry.key == nullptr) { return entries[0]; }
  // TODO if any key is > SIMDJSON_PADDING, this will access inaccessible memory!
  if (c != entry.container || memcmp(key, entry.key, entry.len) != 0) { return entries[0]; }
  return entry;
}
/**
 * Register a field: `key` (with its quotes) of length `len`, expected inside container `c`
 * with the given `type`, stored at byte `offset` inside a tweet.
 *
 * A key that hashes to slot 0 (reserved for "missing value") or to an occupied slot is
 * reported on stderr and trips an assert; the slot is then written regardless.
 */
simdjson_really_inline void sax_tweet_reader_visitor::field_lookup::add(const char * key, size_t len, containers c, field_type type, size_t offset) {
  const uint8_t slot = hash(key, uint32_t(c));
  if (slot == 0) {
    fprintf(stderr, "%s (depth %d) hashes to zero, which is used as 'missing value'\n", key, int(c));
    assert(false);
  }
  field &target = entries[slot];
  if (target.key) {
    fprintf(stderr, "%s (depth %d) collides with %s (depth %d) !\n", key, int(c), target.key, int(target.container));
    assert(false);
  }
  target = { key, len, offset, c, type };
}
/**
 * Negative check: verify that `key` at `depth`, a key we do not read, does not hash onto
 * the slot of a registered field. A conflict is reported on stderr and trips an assert.
 */
simdjson_really_inline void sax_tweet_reader_visitor::field_lookup::neg(const char * const key, uint32_t depth) {
  auto index = hash(key, depth);
  if (entries[index].key) {
    // %d expects an int, so convert the unsigned depth explicitly.
    fprintf(stderr, "%s (depth %d) conflicts with %s (depth %d) !\n", key, int(depth), entries[index].key, int(entries[index].container));
    assert(false);
  }
}
// Builds the lookup table.
// First registers the fields the visitor reads (the top-level "statuses" array, the tweet
// members and the user members), then runs neg() over other keys at each depth so that a
// hash conflict with a registered field is reported when the table is constructed.
// The registrations must come before the neg() checks: neg() only inspects slots that are
// already filled.
sax_tweet_reader_visitor::field_lookup::field_lookup() {
add("\"statuses\"", strlen("\"statuses\""), containers::top_object, field_type::array, 0); // { "statuses": [...]
// TWEET_FIELD(KEY, TYPE): registers "KEY" (quoted) inside a tweet, stored at offsetof(tweet, KEY).
// The macro body ends in ';', so a use without a trailing semicolon is still a complete statement.
#define TWEET_FIELD(KEY, TYPE) add("\"" #KEY "\"", strlen("\"" #KEY "\""), containers::tweet, TYPE, offsetof(tweet, KEY));
TWEET_FIELD(id, field_type::unsigned_integer);
TWEET_FIELD(in_reply_to_status_id, field_type::nullable_unsigned_integer);
TWEET_FIELD(retweet_count, field_type::unsigned_integer);
TWEET_FIELD(favorite_count, field_type::unsigned_integer);
TWEET_FIELD(text, field_type::string);
TWEET_FIELD(created_at, field_type::string);
TWEET_FIELD(user, field_type::object)
#undef TWEET_FIELD
// USER_FIELD(KEY, TYPE): registers "KEY" (quoted) inside a tweet's user object; the offset is
// relative to the start of the tweet (offset of `user` plus offset within twitter_user).
#define USER_FIELD(KEY, TYPE) add("\"" #KEY "\"", strlen("\"" #KEY "\""), containers::user, TYPE, offsetof(tweet, user)+offsetof(twitter_user, KEY));
USER_FIELD(id, field_type::unsigned_integer);
USER_FIELD(screen_name, field_type::string);
#undef USER_FIELD
// Check for collisions with other (unused) hash keys in typical twitter JSON
// NEG(key, depth) quotes the key; the direct neg("\"h\":", ...) calls pass one-letter keys
// together with the following ':' so that four bytes are available to the hash.
#define NEG(key, depth) neg("\"" #key "\"", depth);
// Depth 9
NEG(display_url, 9);
NEG(expanded_url, 9);
neg("\"h\":", 9);
NEG(indices, 9);
NEG(resize, 9);
NEG(url, 9);
neg("\"w\":", 9);
// Depth 8
NEG(display_url, 8);
NEG(expanded_url, 8);
neg("\"h\":", 8);
NEG(indices, 8);
NEG(large, 8);
NEG(medium, 8);
NEG(resize, 8);
NEG(small, 8);
NEG(thumb, 8);
NEG(url, 8);
neg("\"w\":", 8);
// Depth 7
NEG(display_url, 7);
NEG(expanded_url, 7);
NEG(id_str, 7);
NEG(id, 7);
NEG(indices, 7);
NEG(large, 7);
NEG(media_url_https, 7);
NEG(media_url, 7);
NEG(medium, 7);
NEG(name, 7);
NEG(sizes, 7);
NEG(small, 7);
NEG(source_status_id_str, 7);
NEG(source_status_id, 7);
NEG(thumb, 7);
NEG(type, 7);
NEG(url, 7);
NEG(urls, 7);
// Depth 6
NEG(description, 6);
NEG(display_url, 6);
NEG(expanded_url, 6);
NEG(id_str, 6);
NEG(id, 6);
NEG(indices, 6);
NEG(media_url_https, 6);
NEG(media_url, 6);
NEG(name, 6);
NEG(sizes, 6);
NEG(source_status_id_str, 6);
NEG(source_status_id, 6);
NEG(type, 6);
NEG(url, 6);
NEG(urls, 6);
// Depth 5
NEG(contributors_enabled, 5);
NEG(default_profile_image, 5);
NEG(default_profile, 5);
NEG(description, 5);
NEG(entities, 5);
NEG(favourites_count, 5);
NEG(follow_request_sent, 5);
NEG(followers_count, 5);
NEG(following, 5);
NEG(friends_count, 5);
NEG(geo_enabled, 5);
NEG(hashtags, 5);
NEG(id_str, 5);
NEG(id, 5);
NEG(is_translation_enabled, 5);
NEG(is_translator, 5);
NEG(iso_language_code, 5);
NEG(lang, 5);
NEG(listed_count, 5);
NEG(location, 5);
NEG(media, 5);
NEG(name, 5);
NEG(notifications, 5);
NEG(profile_background_color, 5);
NEG(profile_background_image_url_https, 5);
NEG(profile_background_image_url, 5);
NEG(profile_background_tile, 5);
NEG(profile_banner_url, 5);
NEG(profile_image_url_https, 5);
NEG(profile_image_url, 5);
NEG(profile_link_color, 5);
NEG(profile_sidebar_border_color, 5);
NEG(profile_sidebar_fill_color, 5);
NEG(profile_text_color, 5);
NEG(profile_use_background_image, 5);
NEG(protected, 5);
NEG(result_type, 5);
NEG(statuses_count, 5);
NEG(symbols, 5);
NEG(time_zone, 5);
NEG(url, 5);
NEG(urls, 5);
NEG(user_mentions, 5);
NEG(utc_offset, 5);
NEG(verified, 5);
// Depth 4 (the depth of containers::user)
NEG(contributors_enabled, 4);
NEG(contributors, 4);
NEG(coordinates, 4);
NEG(default_profile_image, 4);
NEG(default_profile, 4);
NEG(description, 4);
NEG(entities, 4);
NEG(favorited, 4);
NEG(favourites_count, 4);
NEG(follow_request_sent, 4);
NEG(followers_count, 4);
NEG(following, 4);
NEG(friends_count, 4);
NEG(geo_enabled, 4);
NEG(geo, 4);
NEG(hashtags, 4);
NEG(id_str, 4);
NEG(in_reply_to_screen_name, 4);
NEG(in_reply_to_status_id_str, 4);
NEG(in_reply_to_user_id_str, 4);
NEG(in_reply_to_user_id, 4);
NEG(is_translation_enabled, 4);
NEG(is_translator, 4);
NEG(iso_language_code, 4);
NEG(lang, 4);
NEG(listed_count, 4);
NEG(location, 4);
NEG(media, 4);
NEG(metadata, 4);
NEG(name, 4);
NEG(notifications, 4);
NEG(place, 4);
NEG(possibly_sensitive, 4);
NEG(profile_background_color, 4);
NEG(profile_background_image_url_https, 4);
NEG(profile_background_image_url, 4);
NEG(profile_background_tile, 4);
NEG(profile_banner_url, 4);
NEG(profile_image_url_https, 4);
NEG(profile_image_url, 4);
NEG(profile_link_color, 4);
NEG(profile_sidebar_border_color, 4);
NEG(profile_sidebar_fill_color, 4);
NEG(profile_text_color, 4);
NEG(profile_use_background_image, 4);
NEG(protected, 4);
NEG(result_type, 4);
NEG(retweeted, 4);
NEG(source, 4);
NEG(statuses_count, 4);
NEG(symbols, 4);
NEG(time_zone, 4);
NEG(truncated, 4);
NEG(url, 4);
NEG(urls, 4);
NEG(user_mentions, 4);
NEG(utc_offset, 4);
NEG(verified, 4);
// Depth 3 (the depth of containers::tweet)
NEG(contributors, 3);
NEG(coordinates, 3);
NEG(entities, 3);
NEG(favorited, 3);
NEG(geo, 3);
NEG(id_str, 3);
NEG(in_reply_to_screen_name, 3);
NEG(in_reply_to_status_id_str, 3);
NEG(in_reply_to_user_id_str, 3);
NEG(in_reply_to_user_id, 3);
NEG(lang, 3);
NEG(metadata, 3);
NEG(place, 3);
NEG(possibly_sensitive, 3);
NEG(retweeted_status, 3);
NEG(retweeted, 3);
NEG(source, 3);
NEG(truncated, 3);
// Depth 2
NEG(completed_in, 2);
NEG(count, 2);
NEG(max_id_str, 2);
NEG(max_id, 2);
NEG(next_results, 2);
NEG(query, 2);
NEG(refresh_url, 2);
NEG(since_id_str, 2);
NEG(since_id, 2);
// Depth 1 (the depth of containers::top_object)
NEG(search_metadata, 1);
#undef NEG
}
// sax_tweet_reader_visitor::field_lookup::find_min() {
// int min_count = 100000;
// for (int a=0;a<4;a++) {
// for (int b=0;b<4;b++) {
// for (int c=0;c<4;c++) {
// twitter::sax_tweet_reader_visitor::field_lookup fields(a,b,c);
// if (fields.collision_count) { continue; }
// if (fields.zero_emission) { continue; }
// if (fields.conflict_count < min_count) { printf("min=%d,%d,%d (%d)", a, b, c, fields.conflict_count); }
// }
// }
// }
// }
} // namespace twitter
SIMDJSON_UNTARGET_REGION
#endif // TWITTER_SAX_TWEET_READER_VISITOR_H

21
benchmark/twitter/tweet.h Normal file
View File

@ -0,0 +1,21 @@
#ifndef TWEET_H
#define TWEET_H
#include "simdjson.h"
#include "twitter_user.h"
namespace twitter {
/**
 * Plain aggregate holding the fields of a single tweet that the benchmark extracts.
 *
 * The string_view members are non-owning: the characters they refer to must outlive this struct.
 * NOTE(review): presumably they point into the parser's string buffer -- confirm against the reader.
 */
struct tweet {
  /** Numeric tweet id; 0 until set. */
  uint64_t id = 0;
  /** Tweet text; empty until set. */
  std::string_view text = {};
  /** Creation timestamp exactly as it appears in the source (not parsed); empty until set. */
  std::string_view created_at = {};
  /** Id of the status this tweet replies to; 0 until set. */
  uint64_t in_reply_to_status_id = 0;
  /** Retweet counter; 0 until set. */
  uint64_t retweet_count = 0;
  /** Favorite counter; 0 until set. */
  uint64_t favorite_count = 0;
  /** Author of the tweet; value-initialized until set. */
  twitter_user user = {};
};
} // namespace twitter
#endif // TWEET_H

View File

@ -0,0 +1,15 @@
#ifndef TWITTER_USER_H
#define TWITTER_USER_H
#include "simdjson.h"
namespace twitter {
/**
 * Plain aggregate holding the fields of a Twitter user that the benchmark extracts.
 *
 * screen_name is a non-owning view: the characters it refers to must outlive this struct.
 */
struct twitter_user {
  /** Numeric user id; 0 until set. */
  uint64_t id = 0;
  /** User's screen name; empty until set. */
  std::string_view screen_name = {};
};
} // namespace twitter
#endif // TWITTER_USER_H

View File

@ -111,7 +111,6 @@ simdjson_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t>
#include "arm64/stringparsing.h"
#include "arm64/numberparsing.h"
#include "generic/stage2/structural_parser.h"
#include "generic/stage2/tape_builder.h"
//
@ -145,15 +144,11 @@ SIMDJSON_WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t
}
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
doc = &_doc;
stage2::tape_builder builder(*doc);
return stage2::structural_parser::parse<false>(*this, builder);
return stage2::tape_builder::parse_document<false>(*this, _doc);
}
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
doc = &_doc;
stage2::tape_builder builder(_doc);
return stage2::structural_parser::parse<true>(*this, builder);
return stage2::tape_builder::parse_document<true>(*this, _doc);
}
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {

View File

@ -315,22 +315,17 @@ SIMDJSON_WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t
//
#include "fallback/stringparsing.h"
#include "fallback/numberparsing.h"
#include "generic/stage2/structural_parser.h"
#include "generic/stage2/tape_builder.h"
namespace {
namespace SIMDJSON_IMPLEMENTATION {
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
doc = &_doc;
stage2::tape_builder builder(*doc);
return stage2::structural_parser::parse<false>(*this, builder);
return stage2::tape_builder::parse_document<false>(*this, _doc);
}
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
doc = &_doc;
stage2::tape_builder builder(_doc);
return stage2::structural_parser::parse<true>(*this, builder);
return stage2::tape_builder::parse_document<true>(*this, _doc);
}
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {

View File

@ -4,16 +4,18 @@
namespace {
namespace SIMDJSON_IMPLEMENTATION {
// expectation: sizeof(scope_descriptor) = 64/8.
struct scope_descriptor {
// expectation: sizeof(open_container) = 64/8.
struct open_container {
uint32_t tape_index; // where, on the tape, does the scope ([,{) begins
uint32_t count; // how many elements in the scope
}; // struct scope_descriptor
}; // struct open_container
static_assert(sizeof(open_container) == 64/8, "Open container must be 64 bits");
class dom_parser_implementation final : public internal::dom_parser_implementation {
public:
/** Tape location of each open { or [ */
std::unique_ptr<scope_descriptor[]> containing_scope{};
std::unique_ptr<open_container[]> open_containers{};
/** Whether each open container is a [ or { */
std::unique_ptr<bool[]> is_array{};
/** Buffer passed to stage 1 */

View File

@ -7,10 +7,10 @@ namespace allocate {
// Allocates stage 2 internal state and outputs in the parser
//
simdjson_really_inline error_code set_max_depth(dom_parser_implementation &parser, size_t max_depth) {
parser.containing_scope.reset(new (std::nothrow) scope_descriptor[max_depth]);
parser.open_containers.reset(new (std::nothrow) open_container[max_depth]);
parser.is_array.reset(new (std::nothrow) bool[max_depth]);
if (!parser.is_array || !parser.containing_scope) {
if (!parser.is_array || !parser.open_containers) {
return MEMALLOC;
}
return SUCCESS;

View File

@ -0,0 +1,315 @@
#include "generic/stage2/logger.h"
namespace {
namespace SIMDJSON_IMPLEMENTATION {
namespace stage2 {
/**
 * Walks the tokens found by stage 1 and drives a visitor with callbacks for each JSON value.
 *
 * The iterator keeps a cursor into the parser's structural index array and translates each
 * index into a pointer into the JSON buffer.
 */
class json_iterator {
public:
/** The JSON buffer; pointers returned by peek() and advance() point into it. */
const uint8_t* const buf;
/** Cursor into the parser's structural indexes: the entry for the next token advance() returns. */
uint32_t *next_structural;
/** The parser whose structural indexes, max depth and is_array stack are used while walking. */
dom_parser_implementation &dom_parser;
/** Current container nesting depth: 0 at the root, incremented for each open non-empty array/object. */
uint32_t depth{0};
/**
* Walk the JSON document.
*
* The visitor receives callbacks when values are encountered. All callbacks pass the iterator as
* the first parameter; some callbacks have other parameters as well:
*
* - visit_document_start() - at the beginning.
* - visit_document_end() - at the end (if things were successful).
*
* - visit_array_start() - at the start `[` of a non-empty array.
* - visit_array_end() - at the end `]` of a non-empty array.
* - visit_empty_array() - when an empty array is encountered.
*
* - visit_object_start() - at the start `{` of a non-empty object.
* - visit_object_end() - at the end `}` of a non-empty object.
* - visit_empty_object() - when an empty object is encountered.
* - visit_key(const uint8_t *key) - when a key in an object field is encountered. key is
* guaranteed to point at the first quote of the string (`"key"`).
* - visit_primitive(const uint8_t *value) - when a value is a string, number, boolean or null.
* - visit_root_primitive(const uint8_t *value) - when the top-level value is a string, number, boolean or null.
*
* - increment_count() - each time a value is found in an array or object.
*
* Template parameter STREAMING: when false, the walk additionally requires that a root array or
* object is closed by the last structural character and that no tokens remain after the root value.
*/
template<bool STREAMING, typename V>
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code walk_document(V &visitor) noexcept;
/**
* Create an iterator capable of walking a JSON document.
*
* The document must have already passed through stage 1.
*
* start_structural_index is the position in the parser's structural indexes where walking begins.
*/
simdjson_really_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index);
/**
* Look at the next token.
*
* Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
*
* They may include invalid JSON as well (such as `1.2.3` or `ture`).
*/
simdjson_really_inline const uint8_t *peek() const noexcept;
/**
* Advance to the next token.
*
* Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
*
* They may include invalid JSON as well (such as `1.2.3` or `ture`).
*/
simdjson_really_inline const uint8_t *advance() noexcept;
/**
* Get the remaining length of the document, from the start of the current token.
*
* The current token is the one most recently returned by advance().
*/
simdjson_really_inline size_t remaining_len() const noexcept;
/**
* Check if we are at the end of the document.
*
* If this is true, there are no more tokens.
*/
simdjson_really_inline bool at_eof() const noexcept;
/**
* Check if we are at the beginning of the document.
*/
simdjson_really_inline bool at_beginning() const noexcept;
/**
* Get the character in the buffer at the last structural index found by stage 1.
*/
simdjson_really_inline uint8_t last_structural() const noexcept;
/**
* Log that a value has been found.
*
* Set LOG_ENABLED=true in logger.h to see logging.
*/
simdjson_really_inline void log_value(const char *type) const noexcept;
/**
* Log the start of a multipart value.
*
* Set LOG_ENABLED=true in logger.h to see logging.
*/
simdjson_really_inline void log_start_value(const char *type) const noexcept;
/**
* Log the end of a multipart value.
*
* Set LOG_ENABLED=true in logger.h to see logging.
*/
simdjson_really_inline void log_end_value(const char *type) const noexcept;
/**
* Log an error.
*
* Set LOG_ENABLED=true in logger.h to see logging.
*/
simdjson_really_inline void log_error(const char *error) const noexcept;
/**
* Dispatch a top-level primitive to the matching visit_root_xxx() callback of the visitor, based
* on the first character of the value. Returns TAPE_ERROR if the character cannot start a value.
*/
template<typename V>
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept;
/**
* Dispatch a primitive inside an array or object to the matching visit_xxx() callback of the
* visitor, based on the first character of the value. Returns TAPE_ERROR if the character cannot
* start a value.
*/
template<typename V>
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept;
};
//
// Walk the whole document, invoking visitor callbacks for every value.
//
// Implemented as a goto-based state machine. The labels are the states:
//   object_begin / object_field / object_continue - inside a non-empty object
//   array_begin / array_value / array_continue    - inside a non-empty array
//   scope_end                                     - a container just closed; resume the parent
//   document_end                                  - the root value is complete
//
// Returns EMPTY if there are no tokens, DEPTH_ERROR if nesting reaches max_depth(), TAPE_ERROR on
// malformed structure, SUCCESS otherwise. Visitor callbacks are wrapped in SIMDJSON_TRY
// (NOTE(review): expected to return the callback's error code early on failure -- macro is defined elsewhere).
//
template<bool STREAMING, typename V>
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code json_iterator::walk_document(V &visitor) noexcept {
logger::log_start();
//
// Start the document
//
if (at_eof()) { return EMPTY; }
log_start_value("document");
SIMDJSON_TRY( visitor.visit_document_start(*this) );
//
// Read first value
//
{
auto value = advance();
// Make sure the outer hash or array is closed before continuing; otherwise, there are ways we
// could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906
if (!STREAMING) {
switch (*value) {
case '{': if (last_structural() != '}') { return TAPE_ERROR; }; break;
case '[': if (last_structural() != ']') { return TAPE_ERROR; }; break;
}
}
// Empty containers are reported with a single callback and never change depth;
// non-empty containers jump into the matching container state.
switch (*value) {
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break;
}
}
goto document_end;
//
// Object parser states
//
object_begin:
log_start_value("object");
depth++;
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
// Remember the container kind at this depth so scope_end knows which state to resume.
dom_parser.is_array[depth] = false;
SIMDJSON_TRY( visitor.visit_object_start(*this) );
{
// The object is known to be non-empty here, so the next token must be the first key.
auto key = advance();
if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; }
SIMDJSON_TRY( visitor.increment_count(*this) );
SIMDJSON_TRY( visitor.visit_key(*this, key) );
}
object_field:
// A key has just been visited: expect ':' followed by the field's value.
if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; }
{
auto value = advance();
switch (*value) {
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
}
}
object_continue:
// A field value has just been completed: expect ',' (another field) or '}' (end of object).
switch (*advance()) {
case ',':
SIMDJSON_TRY( visitor.increment_count(*this) );
{
auto key = advance();
if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; }
SIMDJSON_TRY( visitor.visit_key(*this, key) );
}
goto object_field;
case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end;
default: log_error("No comma between object fields"); return TAPE_ERROR;
}
scope_end:
// A non-empty container was closed: pop one level and resume the parent's "continue" state,
// or finish the document if the closed container was the root value.
depth--;
if (depth == 0) { goto document_end; }
if (dom_parser.is_array[depth]) { goto array_continue; }
goto object_continue;
//
// Array parser states
//
array_begin:
log_start_value("array");
depth++;
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
// Remember the container kind at this depth so scope_end knows which state to resume.
dom_parser.is_array[depth] = true;
SIMDJSON_TRY( visitor.visit_array_start(*this) );
// The array is known to be non-empty here, so count its first element up front.
SIMDJSON_TRY( visitor.increment_count(*this) );
array_value:
{
auto value = advance();
switch (*value) {
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
}
}
array_continue:
// An element has just been completed: expect ',' (another element) or ']' (end of array).
switch (*advance()) {
case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value;
case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end;
default: log_error("Missing comma between array values"); return TAPE_ERROR;
}
document_end:
log_end_value("document");
SIMDJSON_TRY( visitor.visit_document_end(*this) );
// Record how far we got, as an index into the parser's structural indexes.
dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]);
// If we didn't make it to the end, it's an error
if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) {
log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
return TAPE_ERROR;
}
return SUCCESS;
} // walk_document()
// Bind the iterator to a parser and position its cursor on the structural index at
// start_structural_index. Nothing is read from the buffer here.
simdjson_really_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
  : buf(_dom_parser.buf),
    next_structural(&_dom_parser.structural_indexes[start_structural_index]),
    dom_parser(_dom_parser)
{}
// Return a pointer to the next token in the buffer without moving the cursor.
simdjson_really_inline const uint8_t *json_iterator::peek() const noexcept {
  const uint32_t token_offset = *next_structural;
  return buf + token_offset;
}
// Return a pointer to the next token in the buffer and move the cursor past it.
simdjson_really_inline const uint8_t *json_iterator::advance() noexcept {
  const uint32_t token_offset = *next_structural;
  next_structural++;
  return buf + token_offset;
}
// Number of bytes from the start of the current token (the one most recently returned by
// advance()) to the end of the document.
simdjson_really_inline size_t json_iterator::remaining_len() const noexcept {
  const uint32_t current_token_offset = next_structural[-1];
  return dom_parser.len - current_token_offset;
}
// True when the cursor sits one past the last structural index, i.e. no tokens are left.
simdjson_really_inline bool json_iterator::at_eof() const noexcept {
  const uint32_t *const end_of_structurals = &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
  return next_structural == end_of_structurals;
}
// True when the cursor sits on the very first structural index.
simdjson_really_inline bool json_iterator::at_beginning() const noexcept {
  const uint32_t *const first_structural = dom_parser.structural_indexes.get();
  return first_structural == next_structural;
}
// Character in the buffer at the last structural index recorded by stage 1.
simdjson_really_inline uint8_t json_iterator::last_structural() const noexcept {
  const auto final_slot = dom_parser.n_structural_indexes - 1;
  const auto final_offset = dom_parser.structural_indexes[final_slot];
  return buf[final_offset];
}
// Log a complete (single-part) value of the given type: no title prefix, no detail text.
simdjson_really_inline void json_iterator::log_value(const char *type) const noexcept {
  const char *const no_prefix = "";
  const char *const no_detail = "";
  logger::log_line(*this, no_prefix, type, no_detail);
}
// Log the opening of a multipart value ("+" prefix), then indent subsequent log lines by one
// level when logging is enabled.
simdjson_really_inline void json_iterator::log_start_value(const char *type) const noexcept {
  const char *const open_marker = "+";
  const char *const no_detail = "";
  logger::log_line(*this, open_marker, type, no_detail);
  if (logger::LOG_ENABLED) {
    ++logger::log_depth;
  }
}
// Un-indent by one level when logging is enabled, then log the closing of a multipart value
// ("-" prefix) at the same indentation as its opening line.
simdjson_really_inline void json_iterator::log_end_value(const char *type) const noexcept {
  if (logger::LOG_ENABLED) {
    --logger::log_depth;
  }
  const char *const close_marker = "-";
  const char *const no_detail = "";
  logger::log_line(*this, close_marker, type, no_detail);
}
// Log an error line: the title is the fixed word "ERROR" and the message goes in the detail column.
simdjson_really_inline void json_iterator::log_error(const char *error) const noexcept {
  const char *const no_prefix = "";
  const char *const title = "ERROR";
  logger::log_line(*this, no_prefix, title, error);
}
// Route a top-level primitive to the visitor's visit_root_xxx() callback, chosen by the first
// character of the token. Anything that cannot start a JSON value is logged and rejected with
// TAPE_ERROR.
template<typename V>
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept {
  const uint8_t first = *value;
  if (first == '"') { return visitor.visit_root_string(*this, value); }
  if (first == 't') { return visitor.visit_root_true_atom(*this, value); }
  if (first == 'f') { return visitor.visit_root_false_atom(*this, value); }
  if (first == 'n') { return visitor.visit_root_null_atom(*this, value); }
  // A number starts with a minus sign or a decimal digit.
  const bool starts_number = (first == '-') || (first >= '0' && first <= '9');
  if (starts_number) { return visitor.visit_root_number(*this, value); }
  log_error("Document starts with a non-value character");
  return TAPE_ERROR;
}
// Route a primitive found inside an array or object to the visitor's visit_xxx() callback,
// chosen by the first character of the token. Anything that cannot start a JSON value is logged
// and rejected with TAPE_ERROR.
template<typename V>
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
  const uint8_t first = *value;
  if (first == '"') { return visitor.visit_string(*this, value); }
  if (first == 't') { return visitor.visit_true_atom(*this, value); }
  if (first == 'f') { return visitor.visit_false_atom(*this, value); }
  if (first == 'n') { return visitor.visit_null_atom(*this, value); }
  // A number starts with a minus sign or a decimal digit.
  const bool starts_number = (first == '-') || (first >= '0' && first <= '9');
  if (starts_number) { return visitor.visit_number(*this, value); }
  log_error("Non-value found when value was expected!");
  return TAPE_ERROR;
}
} // namespace stage2
} // namespace SIMDJSON_IMPLEMENTATION
} // unnamed namespace

View File

@ -8,7 +8,7 @@ namespace logger {
static constexpr const bool LOG_ENABLED = false;
static constexpr const int LOG_EVENT_LEN = 20;
static constexpr const int LOG_BUFFER_LEN = 10;
static constexpr const int LOG_BUFFER_LEN = 30;
static constexpr const int LOG_SMALL_BUFFER_LEN = 10;
static constexpr const int LOG_INDEX_LEN = 5;
@ -33,12 +33,6 @@ namespace logger {
}
}
static simdjson_really_inline void log_string(const char *message) {
if (LOG_ENABLED) {
printf("%s\n", message);
}
}
// Logs a single line of
template<typename S>
static simdjson_really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {

View File

@ -7,15 +7,15 @@ namespace stage2 {
namespace numberparsing {
#ifdef JSON_TEST_NUMBERS
#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), false)
#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), writer.append_s64((VALUE)))
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), writer.append_u64((VALUE)))
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), writer.append_double((VALUE)))
#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), NUMBER_ERROR)
#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), (WRITER).append_s64((VALUE)))
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), (WRITER).append_u64((VALUE)))
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), (WRITER).append_double((VALUE)))
#else
#define INVALID_NUMBER(SRC) (false)
#define WRITE_INTEGER(VALUE, SRC, WRITER) writer.append_s64((VALUE))
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) writer.append_u64((VALUE))
#define WRITE_DOUBLE(VALUE, SRC, WRITER) writer.append_double((VALUE))
#define INVALID_NUMBER(SRC) (NUMBER_ERROR)
#define WRITE_INTEGER(VALUE, SRC, WRITER) (WRITER).append_s64((VALUE))
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (WRITER).append_u64((VALUE))
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (WRITER).append_double((VALUE))
#endif
// Attempts to compute i * 10^(power) exactly; and if "negative" is
@ -24,7 +24,7 @@ namespace numberparsing {
// set to false. This should work *most of the time* (like 99% of the time).
// We assume that power is in the [FASTFLOAT_SMALLEST_POWER,
// FASTFLOAT_LARGEST_POWER] interval: the caller is responsible for this check.
simdjson_really_inline double compute_float_64(int64_t power, uint64_t i, bool negative, bool *success) {
simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, double &d) {
// we start with a fast path
// It was described in
// Clinger WD. How to read floating point numbers accurately.
@ -40,7 +40,7 @@ simdjson_really_inline double compute_float_64(int64_t power, uint64_t i, bool n
#endif
// convert the integer into a double. This is lossless since
// 0 <= i <= 2^53 - 1.
double d = double(i);
d = double(i);
//
// The general idea is as follows.
// If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then
@ -59,8 +59,7 @@ simdjson_really_inline double compute_float_64(int64_t power, uint64_t i, bool n
if (negative) {
d = -d;
}
*success = true;
return d;
return true;
}
// When 22 < power && power < 22 + 16, we could
// hope for another, secondary fast path. It wa
@ -85,7 +84,8 @@ simdjson_really_inline double compute_float_64(int64_t power, uint64_t i, bool n
// In the slow path, we need to adjust i so that it is > 1<<63 which is always
// possible, except if i == 0, so we handle i == 0 separately.
if(i == 0) {
return 0.0;
d = 0.0;
return true;
}
// We are going to need to do some 64-bit arithmetic to get a more precise product.
@ -135,8 +135,7 @@ simdjson_really_inline double compute_float_64(int64_t power, uint64_t i, bool n
// This does happen, e.g. with 7.3177701707893310e+15.
if (((product_middle + 1 == 0) && ((product_high & 0x1FF) == 0x1FF) &&
(product_low + i < product_low))) { // let us be prudent and bail out.
*success = false;
return 0;
return false;
}
upper = product_high;
lower = product_middle;
@ -157,25 +156,24 @@ simdjson_really_inline double compute_float_64(int64_t power, uint64_t i, bool n
// floating-point values.
if (simdjson_unlikely((lower == 0) && ((upper & 0x1FF) == 0) &&
((mantissa & 3) == 1))) {
// if mantissa & 1 == 1 we might need to round up.
//
// Scenarios:
// 1. We are not in the middle. Then we should round up.
//
// 2. We are right in the middle. Whether we round up depends
// on the last significant bit: if it is "one" then we round
// up (round to even) otherwise, we do not.
//
// So if the last significant bit is 1, we can safely round up.
// Hence we only need to bail out if (mantissa & 3) == 1.
// Otherwise we may need more accuracy or analysis to determine whether
// we are exactly between two floating-point numbers.
// It can be triggered with 1e23.
// Note: because the factor_mantissa and factor_mantissa_low are
// almost always rounded down (except for small positive powers),
// almost always should round up.
*success = false;
return 0;
// if mantissa & 1 == 1 we might need to round up.
//
// Scenarios:
// 1. We are not in the middle. Then we should round up.
//
// 2. We are right in the middle. Whether we round up depends
// on the last significant bit: if it is "one" then we round
// up (round to even) otherwise, we do not.
//
// So if the last significant bit is 1, we can safely round up.
// Hence we only need to bail out if (mantissa & 3) == 1.
// Otherwise we may need more accuracy or analysis to determine whether
// we are exactly between two floating-point numbers.
// It can be triggered with 1e23.
// Note: because the factor_mantissa and factor_mantissa_low are
// almost always rounded down (except for small positive powers),
// almost always should round up.
return false;
}
mantissa += mantissa & 1;
@ -193,15 +191,12 @@ simdjson_really_inline double compute_float_64(int64_t power, uint64_t i, bool n
uint64_t real_exponent = c.exp - lz;
// we have to check that real_exponent is in range, otherwise we bail out
if (simdjson_unlikely((real_exponent < 1) || (real_exponent > 2046))) {
*success = false;
return 0;
return false;
}
mantissa |= real_exponent << 52;
mantissa |= (((uint64_t)negative) << 63);
double d;
memcpy(&d, &mantissa, sizeof(d));
*success = true;
return d;
return true;
}
static bool parse_float_strtod(const uint8_t *ptr, double *outDouble) {
@ -252,11 +247,11 @@ simdjson_really_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) {
}
template<typename W>
bool slow_float_parsing(SIMDJSON_UNUSED const uint8_t * src, W writer) {
error_code slow_float_parsing(SIMDJSON_UNUSED const uint8_t * src, W writer) {
double d;
if (parse_float_strtod(src, &d)) {
WRITE_DOUBLE(d, src, writer);
return true;
writer.append_double(d);
return SUCCESS;
}
return INVALID_NUMBER(src);
}
@ -273,7 +268,7 @@ simdjson_really_inline bool parse_digit(const uint8_t c, I &i) {
return true;
}
simdjson_really_inline bool parse_decimal(SIMDJSON_UNUSED const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) {
simdjson_really_inline error_code parse_decimal(SIMDJSON_UNUSED const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) {
// we continue with the fiction that we have an integer. If the
// floating point number is representable as x * 10^z for some integer
// z that fits in 53 bits, then we will be able to convert back the
@ -296,10 +291,10 @@ simdjson_really_inline bool parse_decimal(SIMDJSON_UNUSED const uint8_t *const s
if (exponent == 0) {
return INVALID_NUMBER(src);
}
return true;
return SUCCESS;
}
simdjson_really_inline bool parse_exponent(SIMDJSON_UNUSED const uint8_t *const src, const uint8_t *&p, int64_t &exponent) {
simdjson_really_inline error_code parse_exponent(SIMDJSON_UNUSED const uint8_t *const src, const uint8_t *&p, int64_t &exponent) {
// Exp Sign: -123.456e[-]78
bool neg_exp = ('-' == *p);
if (neg_exp || '+' == *p) { p++; } // Skip + as well
@ -312,11 +307,11 @@ simdjson_really_inline bool parse_exponent(SIMDJSON_UNUSED const uint8_t *const
// In particular, it could overflow to INT64_MIN, and we cannot do - INT64_MIN.
// Thus we *must* check for possible overflow before we negate exp_number.
// Performance notes: it may seem like combining the two "unlikely checks" below into
// a single unlikely path would be faster. The reasoning is sound, but the compiler may
// Performance notes: it may seem like combining the two "simdjson_unlikely checks" below into
// a single simdjson_unlikely path would be faster. The reasoning is sound, but the compiler may
// not oblige and may, in fact, generate two distinct paths in any case. It might be
// possible to do uint64_t(p - start_exp - 1) >= 18 but it could end up trading off
// instructions for a likely branch, an unconclusive gain.
// instructions for a simdjson_likely branch, an unconclusive gain.
// If there were no digits, it's an error.
if (simdjson_unlikely(p == start_exp)) {
@ -347,68 +342,74 @@ simdjson_really_inline bool parse_exponent(SIMDJSON_UNUSED const uint8_t *const
// is bounded in magnitude by the size of the JSON input, we are fine in this universe.
// To sum it up: the next line should never overflow.
exponent += (neg_exp ? -exp_number : exp_number);
return true;
return SUCCESS;
}
simdjson_really_inline int significant_digits(const uint8_t * start_digits, int digit_count) {
// It is possible that the integer had an overflow.
// We have to handle the case where we have 0.0000somenumber.
const uint8_t *start = start_digits;
while ((*start == '0') || (*start == '.')) {
start++;
}
// we over-decrement by one when there is a '.'
return digit_count - int(start - start_digits);
}
template<typename W>
simdjson_really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, int digit_count, int64_t exponent, W &writer) {
simdjson_really_inline error_code write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, int digit_count, int64_t exponent, W &writer) {
// If we frequently had to deal with long strings of digits,
// we could extend our code by using a 128-bit integer instead
// of a 64-bit integer. However, this is uncommon in practice.
// digit count is off by 1 because of the decimal (assuming there was one).
if (simdjson_unlikely((digit_count-1 >= 19))) { // this is uncommon
// It is possible that the integer had an overflow.
// We have to handle the case where we have 0.0000somenumber.
const uint8_t *start = start_digits;
while ((*start == '0') || (*start == '.')) {
start++;
}
// we over-decrement by one when there is a '.'
digit_count -= int(start - start_digits);
if (digit_count >= 19) {
// Ok, chances are good that we had an overflow!
// this is almost never going to get called!!!
// we start anew, going slowly!!!
// This will happen in the following examples:
// 10000000000000000000000000000000000000000000e+308
// 3.1415926535897932384626433832795028841971693993751
//
bool success = slow_float_parsing(src, writer);
// The number was already written, but we made a copy of the writer
// when we passed it to the parse_large_integer() function, so
writer.skip_double();
return success;
}
if (simdjson_unlikely(digit_count-1 >= 19 && significant_digits(start_digits, digit_count) >= 19)) {
// Ok, chances are good that we had an overflow!
// this is almost never going to get called!!!
// we start anew, going slowly!!!
// This will happen in the following examples:
// 10000000000000000000000000000000000000000000e+308
// 3.1415926535897932384626433832795028841971693993751
//
// NOTE: This makes a *copy* of the writer and passes it to slow_float_parsing. This happens
// because slow_float_parsing is a non-inlined function. If we passed our writer reference to
// it, it would force it to be stored in memory, preventing the compiler from picking it apart
// and putting into registers. i.e. if we pass it as reference, it gets slow.
// This is what forces the skip_double, as well.
error_code error = slow_float_parsing(src, writer);
writer.skip_double();
return error;
}
// NOTE: it's weird that the unlikely() only wraps half the if, but it seems to get slower any other
// NOTE: it's weird that the simdjson_unlikely() only wraps half the if, but it seems to get slower any other
// way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331
// To future reader: we'd love if someone found a better way, or at least could explain this result!
if (simdjson_unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || (exponent > FASTFLOAT_LARGEST_POWER)) {
// this is almost never going to get called!!!
// we start anew, going slowly!!!
bool success = slow_float_parsing(src, writer);
// The number was already written, but we made a copy of the writer when we passed it to the
// slow_float_parsing() function, so we have to skip those tape spots now that we've returned
// NOTE: This makes a *copy* of the writer and passes it to slow_float_parsing. This happens
// because slow_float_parsing is a non-inlined function. If we passed our writer reference to
// it, it would force it to be stored in memory, preventing the compiler from picking it apart
// and putting into registers. i.e. if we pass it as reference, it gets slow.
// This is what forces the skip_double, as well.
error_code error = slow_float_parsing(src, writer);
writer.skip_double();
return success;
return error;
}
bool success = true;
double d = compute_float_64(exponent, i, negative, &success);
if (!success) {
double d;
if (!compute_float_64(exponent, i, negative, d)) {
// we are almost never going to get here.
if (!parse_float_strtod(src, &d)) { return INVALID_NUMBER(src); }
}
WRITE_DOUBLE(d, src, writer);
return true;
return SUCCESS;
}
// for performance analysis, it is sometimes useful to skip parsing
#ifdef SIMDJSON_SKIPNUMBERPARSING
template<typename W>
simdjson_really_inline bool parse_number(const uint8_t *const, W &writer) {
simdjson_really_inline error_code parse_number(const uint8_t *const, W &writer) {
writer.append_s64(0); // always write zero
return true; // always succeeds
return SUCCESS; // always succeeds
}
#else
@ -423,7 +424,7 @@ simdjson_really_inline bool parse_number(const uint8_t *const, W &writer) {
//
// Our objective is accurate parsing (ULP of 0) at high speed.
template<typename W>
simdjson_really_inline bool parse_number(const uint8_t *const src, W &writer) {
simdjson_really_inline error_code parse_number(const uint8_t *const src, W &writer) {
//
// Check for minus sign
@ -451,17 +452,19 @@ simdjson_really_inline bool parse_number(const uint8_t *const src, W &writer) {
if ('.' == *p) {
is_float = true;
++p;
if (!parse_decimal(src, p, i, exponent)) { return false; }
SIMDJSON_TRY( parse_decimal(src, p, i, exponent) );
digit_count = int(p - start_digits); // used later to guard against overflows
}
if (('e' == *p) || ('E' == *p)) {
is_float = true;
++p;
if (!parse_exponent(src, p, exponent)) { return false; }
SIMDJSON_TRY( parse_exponent(src, p, exponent) );
}
if (is_float) {
const bool clean_end = is_structural_or_whitespace(*p);
return write_float(src, negative, i, start_digits, digit_count, exponent, writer) && clean_end;
SIMDJSON_TRY( write_float(src, negative, i, start_digits, digit_count, exponent, writer) );
if (!clean_end) { return INVALID_NUMBER(src); }
return SUCCESS;
}
// The longest negative 64-bit number is 19 digits.
@ -470,13 +473,12 @@ simdjson_really_inline bool parse_number(const uint8_t *const src, W &writer) {
int longest_digit_count = negative ? 19 : 20;
if (digit_count > longest_digit_count) { return INVALID_NUMBER(src); }
if (digit_count == longest_digit_count) {
if(negative) {
if (negative) {
// Anything negative above INT64_MAX+1 is invalid
if (i > uint64_t(INT64_MAX)+1) {
return INVALID_NUMBER(src);
}
if (i > uint64_t(INT64_MAX)+1) { return INVALID_NUMBER(src); }
WRITE_INTEGER(~i+1, src, writer);
return is_structural_or_whitespace(*p);
if (!is_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
return SUCCESS;
// Positive overflow check:
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
// biggest uint64_t.
@ -498,9 +500,230 @@ simdjson_really_inline bool parse_number(const uint8_t *const src, W &writer) {
} else {
WRITE_INTEGER(negative ? (~i+1) : i, src, writer);
}
return is_structural_or_whitespace(*p);
if (!is_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
return SUCCESS;
}
// SAX functions
namespace {
// Parse an unsigned integer in the range 0 to 18,446,744,073,709,551,615 (UINT64_MAX).
//
// The digit run starting at src must be non-empty, must not carry a redundant leading zero, and
// must be followed by a structural or whitespace character. Anything else, or a value that does
// not fit in a uint64_t, yields NUMBER_ERROR.
SIMDJSON_UNUSED simdjson_really_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept {
  // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
  uint64_t value = 0;
  const uint8_t *cursor = src;
  while (parse_digit(*cursor, value)) { ++cursor; }

  const int num_digits = int(cursor - src);
  if (num_digits == 0) { return NUMBER_ERROR; }                      // no digits at all
  if (num_digits > 1 && *src == '0') { return NUMBER_ERROR; }        // 0123: zero must be solo
  if (!is_structural_or_whitespace(*cursor)) { return NUMBER_ERROR; } // junk after the number

  // 19 digits or fewer always fit in 64 bits; 21 or more never do.
  if (num_digits < 20) { return value; }
  if (num_digits > 20) { return NUMBER_ERROR; }

  // Exactly 20 digits:
  // - A first digit of 2-9 is always above UINT64_MAX.
  // - With a first digit of 1, an accumulator that wrapped around ends up below
  //   1,553,255,926,290,448,384, whereas a genuine 20-digit value is at least
  //   10,000,000,000,000,000,000. Comparing against INT64_MAX separates the two cases.
  const bool starts_with_one = (src[0] == uint8_t('1'));
  if (!starts_with_one || value <= uint64_t(INT64_MAX)) { return NUMBER_ERROR; }
  return value;
}
// Parse any number from 0 to 18,446,744,073,709,551,615
// Call this version of the method if you regularly expect 8- or 16-digit numbers.
//
// The digits are consumed eight at a time where possible. The number must be followed by a
// structural or whitespace character and must not have a redundant leading zero; otherwise, or
// when the value does not fit in a uint64_t, NUMBER_ERROR is returned.
SIMDJSON_UNUSED simdjson_really_inline simdjson_result<uint64_t> parse_large_unsigned(const uint8_t * const src) noexcept {
  const uint8_t *p = src;

  //
  // Parse the integer part.
  //
  uint64_t i = 0;
  if (is_made_of_eight_digits_fast(p)) {
    i = i * 100000000 + parse_eight_digits_unrolled(p);
    p += 8;
    if (is_made_of_eight_digits_fast(p)) {
      i = i * 100000000 + parse_eight_digits_unrolled(p);
      p += 8;
      // Digits 17 through 20, if present.
      int extra_digits = 0;
      while (extra_digits < 4 && parse_digit(*p, i)) { p++; extra_digits++; }
      if (extra_digits == 4) {
        if (parse_digit(*p, i)) { return NUMBER_ERROR; } // 21 digits is an error
        // Positive overflow check for exactly 20 digits:
        // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615
        //   is the biggest uint64_t.
        // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be
        //   smaller than 1,553,255,926,290,448,384, which is below the smallest 20-digit number
        //   the user could write (10,000,000,000,000,000,000). So a wrapped accumulator is
        //   recognizable by being no larger than INT64_MAX.
        if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return NUMBER_ERROR; }
      }
    } else {
      // 8 to 15 digits: the second eight-digit probe failed, so consume whatever digits are
      // left one at a time. Without this, any 9-15 digit number would stop on a digit and be
      // rejected by the terminator check below. This many digits cannot overflow.
      while (parse_digit(*p, i)) { p++; }
    }
  } else {
    // Fewer than 8 digits can't overflow, simpler logic here. At least one digit is required.
    if (parse_digit(*p, i)) { p++; } else { return NUMBER_ERROR; }
    while (parse_digit(*p, i)) { p++; }
  }
  if (!is_structural_or_whitespace(*p)) { return NUMBER_ERROR; }
  // Every path above consumed at least one digit, so only the leading-zero rule remains:
  // an integer that starts with 0 and has more than one digit is an error.
  if ('0' == *src && (p - src) > 1) { return NUMBER_ERROR; }
  return i;
}
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
//
// Accepts an optional leading '-' followed by digits. The digit run must be non-empty, must not
// have a redundant leading zero, and must be followed by a structural or whitespace character.
// Returns NUMBER_ERROR for malformed input and for any value outside the int64_t range.
SIMDJSON_UNUSED simdjson_really_inline simdjson_result<int64_t> parse_integer(const uint8_t *src) noexcept {
  //
  // Check for minus sign
  //
  const bool negative = (*src == '-');
  const uint8_t *p = src + negative;

  //
  // Parse the integer part.
  //
  // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
  const uint8_t *const start_digits = p;
  uint64_t i = 0;
  while (parse_digit(*p, i)) { p++; }

  // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
  const int digit_count = int(p - start_digits);
  if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return NUMBER_ERROR; }
  if (!is_structural_or_whitespace(*p)) { return NUMBER_ERROR; }

  // Both INT64_MIN and INT64_MAX have 19 digits, so anything longer is out of range.
  // 19 digits cannot wrap the uint64_t accumulator, so a plain magnitude comparison is enough,
  // and it is only needed when we actually have 19 digits.
  constexpr int longest_digit_count = 19;
  if (digit_count > longest_digit_count) { return NUMBER_ERROR; }
  if (digit_count == longest_digit_count) {
    // The largest allowed magnitude is INT64_MAX for positives and INT64_MAX+1 for negatives.
    // Previously positives up to UINT64_MAX were accepted and silently wrapped to a negative
    // int64_t on return.
    const uint64_t max_magnitude = negative ? uint64_t(INT64_MAX) + 1 : uint64_t(INT64_MAX);
    if (i > max_magnitude) { return NUMBER_ERROR; }
  }
  // Two's complement negation in unsigned arithmetic avoids signed overflow for INT64_MIN.
  return static_cast<int64_t>(negative ? (~i + 1) : i);
}
// Parse a JSON number as a double.
//
// Accepts an optional leading '-', an integer part (a lone 0, or digits not starting with 0), an
// optional fraction and an optional exponent. Returns NUMBER_ERROR when a required digit is
// missing, when the integer part has a redundant leading zero, when the exponent has more than
// 19 digits, or when the strtod fallback fails.
//
// NOTE(review): unlike parse_unsigned/parse_integer, this function does not verify that the
// number is followed by a structural or whitespace character -- confirm callers do not need it.
SIMDJSON_UNUSED simdjson_really_inline simdjson_result<double> parse_double(const uint8_t * src) noexcept {
  //
  // Check for minus sign
  //
  bool negative = (*src == '-');
  src += negative;

  //
  // Parse the integer part.
  //
  uint64_t i = 0;
  const uint8_t *p = src;
  p += parse_digit(*p, i);
  bool leading_zero = (i == 0);
  while (parse_digit(*p, i)) { p++; }
  // no integer digits, or 0123 (zero must be solo)
  if ( p == src || (leading_zero && p != src+1)) { return NUMBER_ERROR; }

  //
  // Parse the decimal part.
  //
  int64_t exponent = 0;
  bool overflow;
  if (simdjson_likely(*p == '.')) {
    p++;
    const uint8_t *start_decimal_digits = p;
    if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits
    p++;
    while (parse_digit(*p, i)) { p++; }
    exponent = -(p - start_decimal_digits);

    // Overflow check. 19 digits (minus the decimal) may be overflow.
    overflow = p-src-1 >= 19;
    if (simdjson_unlikely(overflow && leading_zero)) {
      // Leading zeros (0.00000...) add nothing to the mantissa, so skip them and re-evaluate
      // using only the digits that remain. The count must run from the first non-zero digit to
      // the end of the digits (p); measuring from src would count the skipped zeros instead
      // and let an overflowed mantissa through to the fast path.
      const uint8_t *start_digits = src + 2;
      while (*start_digits == '0') { start_digits++; }
      overflow = p-start_digits >= 19;
    }
  } else {
    overflow = p-src >= 19;
  }

  //
  // Parse the exponent
  //
  if (*p == 'e' || *p == 'E') {
    p++;
    bool exp_neg = *p == '-';
    p += exp_neg || *p == '+';

    uint64_t exp = 0;
    const uint8_t *start_exp_digits = p;
    while (parse_digit(*p, exp)) { p++; }
    // no exp digits, or 20+ exp digits
    if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; }

    exponent += exp_neg ? 0-exp : exp;
    overflow = overflow || exponent < FASTFLOAT_SMALLEST_POWER || exponent > FASTFLOAT_LARGEST_POWER;
  }

  //
  // Assemble (or slow-parse) the float
  //
  double d;
  if (simdjson_likely(!overflow)) {
    if (compute_float_64(exponent, i, negative, d)) { return d; }
  }
  // Fallback: re-parse from the original start of the number (including any minus sign).
  if (!parse_float_strtod(src-negative, &d)) {
    return NUMBER_ERROR;
  }
  return d;
}
} //namespace {}
#endif // SIMDJSON_SKIPNUMBERPARSING
} // namespace numberparsing

View File

@ -119,6 +119,15 @@ SIMDJSON_WARN_UNUSED simdjson_really_inline uint8_t *parse_string(const uint8_t
return nullptr;
}
// Parses the JSON string starting at src (which must point at the opening quote) into the
// buffer at current_string_buf_loc. On success, s views the bytes just written and
// current_string_buf_loc is advanced past them. Returns STRING_ERROR if src does not start
// with '"' or if stringparsing::parse_string returns null.
SIMDJSON_UNUSED SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_string_to_buffer(const uint8_t *src, uint8_t *&current_string_buf_loc, std::string_view &s) {
  if (*src != '"') { return STRING_ERROR; }

  uint8_t *const written_end = stringparsing::parse_string(src, current_string_buf_loc);
  if (written_end == nullptr) { return STRING_ERROR; }

  const char *const text = reinterpret_cast<const char *>(current_string_buf_loc);
  s = std::string_view(text, written_end - current_string_buf_loc);
  current_string_buf_loc = written_end;
  return SUCCESS;
}
} // namespace stringparsing
} // namespace stage2
} // namespace SIMDJSON_IMPLEMENTATION

View File

@ -1,245 +0,0 @@
// This file contains the common code every implementation uses for stage2
// It is intended to be included multiple times and compiled multiple times
// We assume the file in which it is include already includes
// "simdjson/stage2.h" (this simplifies amalgation)
#include "generic/stage2/logger.h"
#include "generic/stage2/structural_iterator.h"
namespace { // Make everything here private
namespace SIMDJSON_IMPLEMENTATION {
namespace stage2 {
// Evaluates EXPR exactly once; if the resulting error code is truthy, returns it from the
// enclosing function. The do { } while (0) wrapper makes `SIMDJSON_TRY(x);` a single statement,
// so it stays well-formed inside an unbraced if/else.
#define SIMDJSON_TRY(EXPR) do { auto _err = (EXPR); if (_err) { return _err; } } while (0)
/**
 * Walks the structural characters of a document (via the structural_iterator base) and reports
 * what it finds to a builder object, tracking the current nesting depth as it goes.
 */
struct structural_parser : structural_iterator {
/** Current depth (nested objects and arrays) */
uint32_t depth{0};
/** Runs the parse state machine, invoking the builder's callbacks; defined after the struct. */
template<bool STREAMING, typename T>
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse(T &builder) noexcept;
/**
 * Convenience entry point: constructs a parser over dom_parser and runs parse() on it.
 * In streaming mode parsing resumes at dom_parser.next_structural_index, otherwise at 0.
 */
template<bool STREAMING, typename T>
SIMDJSON_WARN_UNUSED static simdjson_really_inline error_code parse(dom_parser_implementation &dom_parser, T &builder) noexcept {
structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
return parser.parse<STREAMING>(builder);
}
// For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations
simdjson_really_inline structural_parser(dom_parser_implementation &_dom_parser, uint32_t start_structural_index)
: structural_iterator(_dom_parser, start_structural_index) {
}
/** Marks the current (root) depth as "not an array". Always returns SUCCESS. */
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code start_document() {
dom_parser.is_array[depth] = false;
return SUCCESS;
}
/**
 * Enters a new array scope: bumps depth, fails with DEPTH_ERROR once max_depth() is reached,
 * otherwise notifies the builder and records that this depth is an array.
 */
template<typename T>
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code start_array(T &builder) {
depth++;
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
builder.start_array(*this);
dom_parser.is_array[depth] = true;
return SUCCESS;
}
/**
 * If the next character is '}', consumes it, reports an empty object to the builder and
 * returns true; otherwise consumes nothing and returns false.
 */
template<typename T>
SIMDJSON_WARN_UNUSED simdjson_really_inline bool empty_object(T &builder) {
if (peek_next_char() == '}') {
advance_char();
builder.empty_object(*this);
return true;
}
return false;
}
/**
 * If the next character is ']', consumes it, reports an empty array to the builder and
 * returns true; otherwise consumes nothing and returns false.
 */
template<typename T>
SIMDJSON_WARN_UNUSED simdjson_really_inline bool empty_array(T &builder) {
if (peek_next_char() == ']') {
advance_char();
builder.empty_array(*this);
return true;
}
return false;
}
/**
 * Records in dom_parser.next_structural_index how far parsing got, then returns TAPE_ERROR if
 * containers are still open or (non-streaming only) if unconsumed structurals remain.
 */
template<bool STREAMING>
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code finish() {
dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]);
if (depth != 0) {
log_error("Unclosed objects or arrays!");
return TAPE_ERROR;
}
// If we didn't make it to the end, it's an error
if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) {
logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
return TAPE_ERROR;
}
return SUCCESS;
}
/** Returns the byte in buf at the last recorded structural index. */
simdjson_really_inline uint8_t last_structural() {
return buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]];
}
/** Logs a single value of the given type. */
simdjson_really_inline void log_value(const char *type) {
logger::log_line(*this, "", type, "");
}
/** Logs the start of a container ("+") and, when logging is enabled, increases the log depth. */
simdjson_really_inline void log_start_value(const char *type) {
logger::log_line(*this, "+", type, "");
if (logger::LOG_ENABLED) { logger::log_depth++; }
}
/** When logging is enabled, decreases the log depth; then logs the end of a container ("-"). */
simdjson_really_inline void log_end_value(const char *type) {
if (logger::LOG_ENABLED) { logger::log_depth--; }
logger::log_line(*this, "-", type, "");
}
/** Logs an "ERROR" line carrying the given message. */
simdjson_really_inline void log_error(const char *error) {
logger::log_line(*this, "", "ERROR", error);
}
}; // struct structural_parser
/**
 * Goto-based state machine that walks the document's structurals and drives the builder.
 *
 * The builder receives start/end callbacks for the document, objects and arrays, an
 * increment_count call per field/element, and parse_key / parse_primitive /
 * parse_root_primitive calls for leaves. Returns EMPTY when there are no structurals,
 * DEPTH_ERROR when nesting reaches max_depth(), TAPE_ERROR on structural mistakes, any error
 * a builder parse_* call returns, and otherwise the result of finish<STREAMING>().
 */
template<bool STREAMING, typename T>
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code structural_parser::parse(T &builder) noexcept {
logger::log_start();
//
// Start the document
//
if (at_end()) { return EMPTY; }
SIMDJSON_TRY( start_document() );
builder.start_document(*this);
//
// Read first value
//
{
const uint8_t *value = advance();
// Make sure the outer hash or array is closed before continuing; otherwise, there are ways we
// could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906
if (!STREAMING) {
switch (*value) {
case '{':
if (last_structural() != '}') {
return TAPE_ERROR;
}
break;
case '[':
if (last_structural() != ']') {
return TAPE_ERROR;
}
break;
}
}
// Empty containers are handled inline; non-empty ones jump into the matching state.
switch (*value) {
case '{': if (!empty_object(builder)) { goto object_begin; }; break;
case '[': if (!empty_array(builder)) { goto array_begin; }; break;
default: SIMDJSON_TRY( builder.parse_root_primitive(*this, value) );
}
goto document_end;
}
//
// Object parser states
//
object_begin: {
depth++;
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
builder.start_object(*this);
dom_parser.is_array[depth] = false;
// A non-empty object must begin with a string key.
const uint8_t *key = advance();
if (*key != '"') {
log_error("Object does not start with a key");
return TAPE_ERROR;
}
builder.increment_count(*this);
SIMDJSON_TRY( builder.parse_key(*this, key) );
goto object_field;
} // object_begin:
object_field: {
if (simdjson_unlikely( advance_char() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; }
const uint8_t *value = advance();
switch (*value) {
case '{': if (!empty_object(builder)) { goto object_begin; }; break;
case '[': if (!empty_array(builder)) { goto array_begin; }; break;
default: SIMDJSON_TRY( builder.parse_primitive(*this, value) );
}
// Falls through to object_continue.
} // object_field:
object_continue: {
switch (advance_char()) {
case ',': {
builder.increment_count(*this);
const uint8_t *key = advance();
if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; }
SIMDJSON_TRY( builder.parse_key(*this, key) );
goto object_field;
}
case '}':
builder.end_object(*this);
goto scope_end;
default:
log_error("No comma between object fields");
return TAPE_ERROR;
}
} // object_continue:
// Leaves one container and resumes whichever kind of container encloses it.
scope_end: {
depth--;
if (depth == 0) { goto document_end; }
if (dom_parser.is_array[depth]) { goto array_continue; }
goto object_continue;
} // scope_end:
//
// Array parser states
//
array_begin: {
depth++;
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
builder.start_array(*this);
dom_parser.is_array[depth] = true;
builder.increment_count(*this);
// Falls through to array_value.
} // array_begin:
array_value: {
const uint8_t *value = advance();
switch (*value) {
case '{': if (!empty_object(builder)) { goto object_begin; }; break;
case '[': if (!empty_array(builder)) { goto array_begin; }; break;
default: SIMDJSON_TRY( builder.parse_primitive(*this, value) );
}
// Falls through to array_continue.
} // array_value:
array_continue: {
switch (advance_char()) {
case ',':
builder.increment_count(*this);
goto array_value;
case ']':
builder.end_array(*this);
goto scope_end;
default:
log_error("Missing comma between array values");
return TAPE_ERROR;
}
} // array_continue:
document_end: {
builder.end_document(*this);
return finish<STREAMING>();
} // document_end:
} // structural_parser::parse()
} // namespace stage2
} // namespace SIMDJSON_IMPLEMENTATION
} // unnamed namespace

View File

@ -1,3 +1,4 @@
#include "generic/stage2/json_iterator.h"
#include "generic/stage2/tape_writer.h"
#include "generic/stage2/atomparsing.h"
@ -6,224 +7,276 @@ namespace SIMDJSON_IMPLEMENTATION {
namespace stage2 {
/**
 * Visitor that writes a parsed document: values are appended through `tape` and string bytes
 * are written at `current_string_buf_loc`.
 */
struct tape_builder {
/**
 * Builds a json_iterator over dom_parser and a tape_builder over doc, then walks the document.
 * In streaming mode the walk starts at dom_parser.next_structural_index, otherwise at 0.
 */
template<bool STREAMING>
SIMDJSON_WARN_UNUSED static simdjson_really_inline error_code parse_document(
dom_parser_implementation &dom_parser,
dom::document &doc) noexcept;
/** Called when a non-empty document starts. */
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_document_start(json_iterator &iter) noexcept;
/** Called when a non-empty document ends without error. */
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_document_end(json_iterator &iter) noexcept;
/** Called when a non-empty array starts. */
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_array_start(json_iterator &iter) noexcept;
/** Called when a non-empty array ends. */
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_array_end(json_iterator &iter) noexcept;
/** Called when an empty array is found. */
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_empty_array(json_iterator &iter) noexcept;
/** Called when a non-empty object starts. */
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_object_start(json_iterator &iter) noexcept;
/**
* Called when a key in a field is encountered.
*
* primitive, visit_object_start, visit_empty_object, visit_array_start, or visit_empty_array
* will be called after this with the field value.
*/
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept;
/** Called when a non-empty object ends. */
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_object_end(json_iterator &iter) noexcept;
/** Called when an empty object is found. */
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_empty_object(json_iterator &iter) noexcept;
/**
* Called when a string, number, boolean or null is found.
*/
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept;
/**
* Called when a string, number, boolean or null is found at the top level of a document (i.e.
* when there is no array or object and the entire document is a single string, number, boolean or
* null.
*
* This is separate from primitive() because simdjson's normal primitive parsing routines assume
* there is at least one more token after the value, which is only true in an array or object.
*/
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept;
// Per-type value visitors. `value` points at the first byte of the value in the input.
/** Parses a string value; `key` only selects the label ("key" or "string") passed to the logger. */
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept;
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept;
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
// Root-level variants of the value visitors, used when the whole document is one primitive.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept;
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept;
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
/** Called each time a new field or element in an array or object is found. */
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code increment_count(json_iterator &iter) noexcept;
/** Next location to write to tape */
tape_writer tape;
private:
/** Next write location in the string buf for stage 2 parsing */
uint8_t *current_string_buf_loc;
simdjson_really_inline tape_builder(dom::document &doc) noexcept : tape{doc.tape.get()}, current_string_buf_loc{doc.string_buf.get()} {}
simdjson_really_inline tape_builder(dom::document &doc) noexcept;
private:
friend struct structural_parser;
/** Index of the next tape slot, measured from the start of the document's tape. */
simdjson_really_inline uint32_t next_tape_index(json_iterator &iter) const noexcept;
// Shared helpers for opening, closing and writing empty containers.
simdjson_really_inline void start_container(json_iterator &iter) noexcept;
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
// String bracketing helpers: on_start_string returns where to write, on_end_string is passed
// the end of what was written.
simdjson_really_inline uint8_t *on_start_string(json_iterator &iter) noexcept;
simdjson_really_inline void on_end_string(uint8_t *dst) noexcept;
}; // class tape_builder
// Dispatches on the first byte of a root-level value to the matching root parse routine.
// Any byte that cannot start a JSON value is logged and reported as TAPE_ERROR.
simdjson_really_inline error_code parse_root_primitive(structural_parser &parser, const uint8_t *value) {
switch (*value) {
case '"': return parse_string(parser, value);
case 't': return parse_root_true_atom(parser, value);
case 'f': return parse_root_false_atom(parser, value);
case 'n': return parse_root_null_atom(parser, value);
case '-':
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
return parse_root_number(parser, value);
default:
parser.log_error("Document starts with a non-value character");
return TAPE_ERROR;
}
}
// Dispatches on the first byte of a value inside an array or object to the matching parse
// routine. Any byte that cannot start a JSON value is logged and reported as TAPE_ERROR.
simdjson_really_inline error_code parse_primitive(structural_parser &parser, const uint8_t *value) {
switch (*value) {
case '"': return parse_string(parser, value);
case 't': return parse_true_atom(parser, value);
case 'f': return parse_false_atom(parser, value);
case 'n': return parse_null_atom(parser, value);
case '-':
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
return parse_number(parser, value);
default:
parser.log_error("Non-value found when value was expected!");
return TAPE_ERROR;
}
}
// Logs an empty object and writes it as a START_OBJECT/END_OBJECT container pair.
simdjson_really_inline void empty_object(structural_parser &parser) {
parser.log_value("empty object");
empty_container(parser, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
}
// Logs an empty array and writes it as a START_ARRAY/END_ARRAY container pair.
simdjson_really_inline void empty_array(structural_parser &parser) {
parser.log_value("empty array");
empty_container(parser, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
}
// Entry point: points dom_parser at doc, then walks the document with a fresh json_iterator
// and tape_builder. In streaming mode the walk resumes at dom_parser.next_structural_index;
// otherwise it starts at structural index 0.
template<bool STREAMING>
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::parse_document(
dom_parser_implementation &dom_parser,
dom::document &doc) noexcept {
// Must be set before walking: visit_document_end and next_tape_index read dom_parser.doc.
dom_parser.doc = &doc;
json_iterator iter(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
tape_builder builder(doc);
return iter.walk_document<STREAMING>(builder);
}
// Logs the start of the document and opens a container for it.
simdjson_really_inline void start_document(structural_parser &parser) {
parser.log_start_value("document");
start_container(parser);
}
// Logs the start of a non-empty object and opens a container for it.
simdjson_really_inline void start_object(structural_parser &parser) {
parser.log_start_value("object");
start_container(parser);
}
// Logs the start of a non-empty array and opens a container for it.
simdjson_really_inline void start_array(structural_parser &parser) {
parser.log_start_value("array");
start_container(parser);
}
// Hands the root-level primitive back to the iterator, passing this builder as the visitor.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept {
return iter.visit_root_primitive(*this, value);
}
// Hands the primitive back to the iterator, passing this builder as the visitor.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept {
return iter.visit_primitive(*this, value);
}
// Writes an empty object as a START_OBJECT/END_OBJECT container pair.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept {
return empty_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
}
// Writes an empty array as a START_ARRAY/END_ARRAY container pair.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept {
return empty_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
}
// Logs the end of a non-empty object and closes its START_OBJECT/END_OBJECT container.
simdjson_really_inline void end_object(structural_parser &parser) {
parser.log_end_value("object");
end_container(parser, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
}
// Logs the end of a non-empty array and closes its START_ARRAY/END_ARRAY container.
simdjson_really_inline void end_array(structural_parser &parser) {
parser.log_end_value("array");
end_container(parser, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
}
// Logs the end of the document, appends a closing ROOT entry that refers to tape index 0, and
// rewrites the entry at index 0 as a ROOT entry carrying the next tape index.
simdjson_really_inline void end_document(structural_parser &parser) {
parser.log_end_value("document");
constexpr uint32_t start_tape_index = 0;
tape.append(start_tape_index, internal::tape_type::ROOT);
tape_writer::write(parser.dom_parser.doc->tape[start_tape_index], next_tape_index(parser), internal::tape_type::ROOT);
}
// Opens a container for the document. Always returns SUCCESS.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept {
start_container(iter);
return SUCCESS;
}
// Opens a container for a non-empty object. Always returns SUCCESS.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept {
start_container(iter);
return SUCCESS;
}
// Opens a container for a non-empty array. Always returns SUCCESS.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept {
start_container(iter);
return SUCCESS;
}
// Parses an object key: the same as parse_string, but with key = true.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_key(structural_parser &parser, const uint8_t *value) {
return parse_string(parser, value, true);
}
// Parses the string at `value` into the location returned by on_start_string, then finishes
// it with on_end_string. `key` only selects the label passed to the logger.
// Returns STRING_ERROR when stringparsing::parse_string returns null.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_string(structural_parser &parser, const uint8_t *value, bool key = false) {
parser.log_value(key ? "key" : "string");
uint8_t *dst = on_start_string(parser);
dst = stringparsing::parse_string(value, dst);
if (dst == nullptr) {
parser.log_error("Invalid escape in string");
return STRING_ERROR;
}
on_end_string(dst);
return SUCCESS;
}
// Closes the START_OBJECT/END_OBJECT container of a non-empty object.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept {
return end_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
}
// Closes the START_ARRAY/END_ARRAY container of a non-empty array.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept {
return end_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
}
// Appends a closing ROOT entry that refers to tape index 0, then rewrites the entry at index 0
// as a ROOT entry carrying the next tape index. Always returns SUCCESS.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept {
constexpr uint32_t start_tape_index = 0;
tape.append(start_tape_index, internal::tape_type::ROOT);
tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter), internal::tape_type::ROOT);
return SUCCESS;
}
// Parses an object key: the same as visit_string, but with key = true.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept {
return visit_string(iter, key, true);
}
// Parses the number at `value` onto the tape. This form treats numberparsing::parse_number's
// result as a boolean: a false result is logged and reported as NUMBER_ERROR.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_number(structural_parser &parser, const uint8_t *value) {
parser.log_value("number");
if (!numberparsing::parse_number(value, tape)) { parser.log_error("Invalid number"); return NUMBER_ERROR; }
return SUCCESS;
}
// Bumps the element/field counter of the container open at the iterator's current depth.
// Always returns SUCCESS.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept {
iter.dom_parser.open_containers[iter.depth].count++; // one more field or element in the container at iter.depth
return SUCCESS;
}
// Parses a number that is the entire document, working on a space-padded heap copy of the
// remaining input. Returns MEMALLOC if the copy cannot be allocated, otherwise whatever
// parse_number returns.
simdjson_really_inline error_code parse_root_number(structural_parser &parser, const uint8_t *value) {
//
// We need to make a copy to make sure that the string is space terminated.
// This is not about padding the input, which should already be padded up
// to len + SIMDJSON_PADDING. However, we have no control at this stage
// on how the padding was done. What if the input string was padded with nulls?
// It is quite common for an input string to have an extra null character (C string).
// We do not want to allow 9\0 (where \0 is the null character) inside a JSON
// document, but the string "9\0" by itself is fine. So we make a copy and
// pad the input with spaces when we know that there is just one input element.
// This copy is relatively expensive, but it will almost never be called in
// practice unless you are in the strange scenario where you have many JSON
// documents made of single atoms.
//
uint8_t *copy = static_cast<uint8_t *>(malloc(parser.remaining_len() + SIMDJSON_PADDING));
if (copy == nullptr) {
return MEMALLOC;
}
memcpy(copy, value, parser.remaining_len());
memset(copy + parser.remaining_len(), ' ', SIMDJSON_PADDING);
error_code error = parse_number(parser, copy);
// The copy is only needed for the duration of the parse.
free(copy);
return error;
}
// Starts writing at the beginning of the document's tape and string buffer.
simdjson_really_inline tape_builder::tape_builder(dom::document &doc) noexcept : tape{doc.tape.get()}, current_string_buf_loc{doc.string_buf.get()} {}
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_true_atom(structural_parser &parser, const uint8_t *value) {
parser.log_value("true");
if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; }
tape.append(0, internal::tape_type::TRUE_VALUE);
return SUCCESS;
// Parses the string at `value` into the location returned by on_start_string and finishes it
// with on_end_string. `key` only selects the label passed to the logger.
// Returns STRING_ERROR when stringparsing::parse_string returns null.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept {
  const char *const label = key ? "key" : "string";
  iter.log_value(label);

  uint8_t *const write_start = on_start_string(iter);
  uint8_t *const write_end = stringparsing::parse_string(value, write_start);
  if (write_end != nullptr) {
    on_end_string(write_end);
    return SUCCESS;
  }
  iter.log_error("Invalid escape in string");
  return STRING_ERROR;
}
// Root-level `true`: validated against the remaining input length, then appended as
// TRUE_VALUE. Returns T_ATOM_ERROR if validation fails.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_root_true_atom(structural_parser &parser, const uint8_t *value) {
parser.log_value("true");
if (!atomparsing::is_valid_true_atom(value, parser.remaining_len())) { return T_ATOM_ERROR; }
tape.append(0, internal::tape_type::TRUE_VALUE);
return SUCCESS;
}
// A root-level string is handled exactly like any other string value.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept {
return visit_string(iter, value);
}
// `false` inside an array or object: validated, then appended as FALSE_VALUE.
// Returns F_ATOM_ERROR if validation fails.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_false_atom(structural_parser &parser, const uint8_t *value) {
parser.log_value("false");
if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; }
tape.append(0, internal::tape_type::FALSE_VALUE);
return SUCCESS;
}
// Logs the value, then parses the number at `value` onto the tape, returning the error code
// from numberparsing::parse_number unchanged.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept {
iter.log_value("number");
return numberparsing::parse_number(value, tape);
}
// Root-level `false`: validated against the remaining input length, then appended as
// FALSE_VALUE. Returns F_ATOM_ERROR if validation fails.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_root_false_atom(structural_parser &parser, const uint8_t *value) {
parser.log_value("false");
if (!atomparsing::is_valid_false_atom(value, parser.remaining_len())) { return F_ATOM_ERROR; }
tape.append(0, internal::tape_type::FALSE_VALUE);
return SUCCESS;
}
// Parses a number that is the entire document.
//
// The number parser needs the value to be followed by a terminator it accepts, but at the root
// we cannot rely on how the caller padded the input: a trailing null (C string) is common, and
// while "9\0" must be accepted as a whole document, 9\0 must not be accepted inside one. So the
// remaining input is copied into a scratch buffer padded with SIMDJSON_PADDING spaces and the
// number is parsed from there. The copy is relatively expensive, but only happens for documents
// that consist of a single number.
//
// Returns MEMALLOC if the scratch buffer cannot be allocated, otherwise visit_number's result.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept {
  uint8_t *const padded = static_cast<uint8_t *>(malloc(iter.remaining_len() + SIMDJSON_PADDING));
  if (padded == nullptr) {
    return MEMALLOC;
  }
  memcpy(padded, value, iter.remaining_len());
  memset(padded + iter.remaining_len(), ' ', SIMDJSON_PADDING);
  const error_code result = visit_number(iter, padded);
  free(padded);
  return result;
}
// `null` inside an array or object: validated, then appended as NULL_VALUE.
// Returns N_ATOM_ERROR if validation fails.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_null_atom(structural_parser &parser, const uint8_t *value) {
parser.log_value("null");
if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; }
tape.append(0, internal::tape_type::NULL_VALUE);
return SUCCESS;
}
// `true` inside an array or object: validated, then appended to the tape as TRUE_VALUE.
// Returns T_ATOM_ERROR if validation fails.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("true");
  const bool well_formed = atomparsing::is_valid_true_atom(value);
  if (well_formed) {
    tape.append(0, internal::tape_type::TRUE_VALUE);
    return SUCCESS;
  }
  return T_ATOM_ERROR;
}
// Root-level `null`: validated against the remaining input length, then appended as
// NULL_VALUE. Returns N_ATOM_ERROR if validation fails.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_root_null_atom(structural_parser &parser, const uint8_t *value) {
parser.log_value("null");
if (!atomparsing::is_valid_null_atom(value, parser.remaining_len())) { return N_ATOM_ERROR; }
tape.append(0, internal::tape_type::NULL_VALUE);
return SUCCESS;
}
// Root-level `true`: validated against the remaining input length, then appended to the tape
// as TRUE_VALUE. Returns T_ATOM_ERROR if validation fails.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("true");
  const bool well_formed = atomparsing::is_valid_true_atom(value, iter.remaining_len());
  if (well_formed) {
    tape.append(0, internal::tape_type::TRUE_VALUE);
    return SUCCESS;
  }
  return T_ATOM_ERROR;
}
// Bumps the element counter of the scope stored at
// parser.dom_parser.containing_scope[parser.depth] by one. The counter tracks
// keys in an object or values in an array.
simdjson_really_inline void increment_count(structural_parser &parser) {
  auto &scope = parser.dom_parser.containing_scope[parser.depth];
  ++scope.count;
}
// Handles a `false` atom: logs it on the iterator, then appends a FALSE_VALUE
// tape element (payload 0) if atomparsing::is_valid_false_atom accepts the
// bytes at `value`. Returns F_ATOM_ERROR without writing to the tape otherwise.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("false");
  const bool atom_ok = atomparsing::is_valid_false_atom(value);
  if (atom_ok) {
    tape.append(0, internal::tape_type::FALSE_VALUE);
    return SUCCESS;
  }
  return F_ATOM_ERROR;
}
// Root-level variant of visit_false_atom: the atom check also receives
// iter.remaining_len(). On acceptance a FALSE_VALUE tape element (payload 0) is
// appended and SUCCESS returned; otherwise F_ATOM_ERROR is returned.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("false");
  const auto bytes_left = iter.remaining_len();
  if (atomparsing::is_valid_false_atom(value, bytes_left)) {
    tape.append(0, internal::tape_type::FALSE_VALUE);
    return SUCCESS;
  }
  return F_ATOM_ERROR;
}
// Handles a `null` atom: logs it on the iterator, then appends a NULL_VALUE
// tape element (payload 0) if atomparsing::is_valid_null_atom accepts the bytes
// at `value`. Returns N_ATOM_ERROR without writing to the tape otherwise.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("null");
  const bool atom_ok = atomparsing::is_valid_null_atom(value);
  if (atom_ok) {
    tape.append(0, internal::tape_type::NULL_VALUE);
    return SUCCESS;
  }
  return N_ATOM_ERROR;
}
// Root-level variant of visit_null_atom: the atom check also receives
// iter.remaining_len(). On acceptance a NULL_VALUE tape element (payload 0) is
// appended and SUCCESS returned; otherwise N_ATOM_ERROR is returned.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("null");
  const auto bytes_left = iter.remaining_len();
  if (atomparsing::is_valid_null_atom(value, bytes_left)) {
    tape.append(0, internal::tape_type::NULL_VALUE);
    return SUCCESS;
  }
  return N_ATOM_ERROR;
}
// private:
// Returns the index of the next tape slot to be written, i.e. the distance
// (in tape elements) from the start of the document's tape to
// tape.next_tape_loc, narrowed to uint32_t.
simdjson_really_inline uint32_t next_tape_index(structural_parser &parser) {
  const auto tape_begin = parser.dom_parser.doc->tape.get();
  return uint32_t(tape.next_tape_loc - tape_begin);
}
// Returns the index of the next tape slot to be written, i.e. the distance
// (in tape elements) from the start of the document's tape to
// tape.next_tape_loc, narrowed to uint32_t.
simdjson_really_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept {
  const auto tape_begin = iter.dom_parser.doc->tape.get();
  return uint32_t(tape.next_tape_loc - tape_begin);
}
// Writes an empty container as two consecutive tape elements: a `start`
// element whose payload is the index just past the pair, followed by an `end`
// element whose payload is the index of the `start` element.
simdjson_really_inline void empty_container(structural_parser &parser, internal::tape_type start, internal::tape_type end) {
  const uint32_t opening_slot = next_tape_index(parser);
  const uint32_t past_closing_slot = opening_slot + 2;
  tape.append(past_closing_slot, start);
  tape.append(opening_slot, end);
}
// Writes an empty container as two consecutive tape elements: a `start`
// element whose payload is the index just past the pair, followed by an `end`
// element whose payload is the index of the `start` element.
// Always returns SUCCESS.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
  const uint32_t opening_slot = next_tape_index(iter);
  const uint32_t past_closing_slot = opening_slot + 2;
  tape.append(past_closing_slot, start);
  tape.append(opening_slot, end);
  return SUCCESS;
}
// Opens a container at the current depth: remembers the tape index where its
// start element belongs, resets its element counter, and reserves that tape
// slot without writing it.
simdjson_really_inline void start_container(structural_parser &parser) {
  auto &scope = parser.dom_parser.containing_scope[parser.depth];
  scope.tape_index = next_tape_index(parser);
  scope.count = 0;
  // The start element itself is only written once the container is closed.
  tape.skip();
}
// Opens a container at the current depth: remembers the tape index where its
// start element belongs, resets its element counter, and reserves that tape
// slot without writing it.
simdjson_really_inline void tape_builder::start_container(json_iterator &iter) noexcept {
  auto &container = iter.dom_parser.open_containers[iter.depth];
  container.tape_index = next_tape_index(iter);
  container.count = 0;
  // The start element itself is only written once the container is closed.
  tape.skip();
}
// Closes the container open at the current depth.
//  1. Appends the `end` tape element; its payload is the index of the slot
//     reserved for the matching start element.
//  2. Fills in that reserved start slot: low 32 bits hold the tape index just
//     past the end element, and the element count is stored shifted left by 32.
// The count is saturated at 0xFFFFFF (24 bits); a stored value of 0xFFFFFF
// therefore means "0xFFFFFF or more".
simdjson_really_inline void end_container(structural_parser &parser, internal::tape_type start, internal::tape_type end) noexcept {
  const auto &scope = parser.dom_parser.containing_scope[parser.depth];
  const uint32_t opening_slot = scope.tape_index;
  tape.append(opening_slot, end);

  uint32_t saturated_count = scope.count;
  if (saturated_count > 0xFFFFFF) { saturated_count = 0xFFFFFF; }

  tape_writer::write(parser.dom_parser.doc->tape[opening_slot], next_tape_index(parser) | (uint64_t(saturated_count) << 32), start);
}
// Closes the container open at the current depth.
//  1. Appends the `end` tape element; its payload is the index of the slot
//     reserved for the matching start element.
//  2. Fills in that reserved start slot: low 32 bits hold the tape index just
//     past the end element, and the element count is stored shifted left by 32.
// The count is saturated at 0xFFFFFF (24 bits); a stored value of 0xFFFFFF
// therefore means "0xFFFFFF or more". Always returns SUCCESS.
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
  const auto &container = iter.dom_parser.open_containers[iter.depth];
  const uint32_t opening_slot = container.tape_index;
  tape.append(opening_slot, end);

  uint32_t saturated_count = container.count;
  if (saturated_count > 0xFFFFFF) { saturated_count = 0xFFFFFF; }

  tape_writer::write(iter.dom_parser.doc->tape[opening_slot], next_tape_index(iter) | (uint64_t(saturated_count) << 32), start);
  return SUCCESS;
}
// Begins a string: appends a STRING tape element whose payload is the offset
// of current_string_buf_loc within the document's string buffer, and returns
// the address where the string bytes should be written. That address sits
// sizeof(uint32_t) bytes past current_string_buf_loc; on_end_string later
// stores the string length in the skipped bytes.
simdjson_really_inline uint8_t *on_start_string(structural_parser &parser) noexcept {
  const auto buf_offset = current_string_buf_loc - parser.dom_parser.doc->string_buf.get();
  tape.append(buf_offset, internal::tape_type::STRING);
  uint8_t *const payload_begin = current_string_buf_loc + sizeof(uint32_t);
  return payload_begin;
}
// Begins a string: appends a STRING tape element whose payload is the offset
// of current_string_buf_loc within the document's string buffer, and returns
// the address where the string bytes should be written. That address sits
// sizeof(uint32_t) bytes past current_string_buf_loc; on_end_string later
// stores the string length in the skipped bytes.
simdjson_really_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept {
  const auto buf_offset = current_string_buf_loc - iter.dom_parser.doc->string_buf.get();
  tape.append(buf_offset, internal::tape_type::STRING);
  uint8_t *const payload_begin = current_string_buf_loc + sizeof(uint32_t);
  return payload_begin;
}
// Finishes the string whose bytes end at `dst` (one past the last byte written).
//  - Stores the byte length as a uint32_t at current_string_buf_loc, i.e. in
//    the slot preceding the string bytes.
//  - Writes a terminating zero byte at `dst`.
//  - Moves current_string_buf_loc to the byte after that terminator.
// TODO: no overflow check on the length (strings >= 4GB). Only worth adding if
// documents of 4GB or more are ever accepted; today they are refused.
simdjson_really_inline void on_end_string(uint8_t *dst) noexcept {
  uint8_t *const payload_begin = current_string_buf_loc + sizeof(uint32_t);
  const uint32_t byte_length = uint32_t(dst - payload_begin);
  memcpy(current_string_buf_loc, &byte_length, sizeof(uint32_t));
  // Zero termination is kept for callers that expect C-style strings; it costs
  // one extra byte per string.
  *dst = 0;
  current_string_buf_loc = dst + 1;
}
}; // class tape_builder
// Finishes the string whose bytes end at `dst` (one past the last byte written).
//  - Stores the byte length as a uint32_t at current_string_buf_loc, i.e. in
//    the slot preceding the string bytes.
//  - Writes a terminating zero byte at `dst`.
//  - Moves current_string_buf_loc to the byte after that terminator.
// TODO: no overflow check on the length (strings >= 4GB). Only worth adding if
// documents of 4GB or more are ever accepted; today they are refused.
simdjson_really_inline void tape_builder::on_end_string(uint8_t *dst) noexcept {
  uint8_t *const payload_begin = current_string_buf_loc + sizeof(uint32_t);
  const uint32_t byte_length = uint32_t(dst - payload_begin);
  memcpy(current_string_buf_loc, &byte_length, sizeof(uint32_t));
  // Zero termination is kept for callers that expect C-style strings; it costs
  // one extra byte per string.
  *dst = 0;
  current_string_buf_loc = dst + 1;
}
} // namespace stage2
} // namespace SIMDJSON_IMPLEMENTATION

View File

@ -80,7 +80,6 @@ simdjson_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t>
//
#include "haswell/stringparsing.h"
#include "haswell/numberparsing.h"
#include "generic/stage2/structural_parser.h"
#include "generic/stage2/tape_builder.h"
//
@ -112,15 +111,11 @@ SIMDJSON_WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t
}
// Stage 2 entry point: stores the address of `_doc` in `doc`, then returns the
// error_code produced by the stage2 parse call (instantiated with <false>).
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
doc = &_doc;
stage2::tape_builder builder(_doc);
return stage2::structural_parser::parse<false>(*this, builder);
// NOTE(review): unreachable as written; control has already returned above.
// Looks like tape_builder::parse_document<false> is meant to replace the two
// statements before it; confirm which entry point is intended.
return stage2::tape_builder::parse_document<false>(*this, _doc);
}
// Same as stage2 but with the stage2 parse call instantiated with <true>:
// stores the address of `_doc` in `doc`, then returns the resulting error_code.
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
doc = &_doc;
stage2::tape_builder builder(_doc);
return stage2::structural_parser::parse<true>(*this, builder);
// NOTE(review): unreachable as written; control has already returned above.
// Looks like tape_builder::parse_document<true> is meant to replace the two
// statements before it; confirm which entry point is intended.
return stage2::tape_builder::parse_document<true>(*this, _doc);
}
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {

View File

@ -4,6 +4,8 @@
#include <initializer_list>
// SIMDJSON_TRY(EXPR): evaluates EXPR exactly once, stores the result in a
// block-local `_err`, and returns it from the enclosing function when it
// converts to true. Otherwise execution continues after the macro.
// NOTE(review): this expands to a bare `{ ... }` block rather than
// `do { ... } while (0)`, so `if (c) SIMDJSON_TRY(x); else ...` will not
// compile; confirm no call site needs that form before relying on it.
#define SIMDJSON_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } }
// Static array of known implementations. We're hoping these get baked into the executable
// without requiring a static initializer.

View File

@ -81,7 +81,6 @@ simdjson_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t>
//
#include "westmere/stringparsing.h"
#include "westmere/numberparsing.h"
#include "generic/stage2/structural_parser.h"
#include "generic/stage2/tape_builder.h"
//
@ -114,15 +113,11 @@ SIMDJSON_WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t
}
// Stage 2 entry point: stores the address of `_doc` in `doc`, then returns the
// error_code produced by the stage2 parse call (instantiated with <false>).
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
doc = &_doc;
stage2::tape_builder builder(*doc);
return stage2::structural_parser::parse<false>(*this, builder);
// NOTE(review): unreachable as written; control has already returned above.
// Looks like tape_builder::parse_document<false> is meant to replace the two
// statements before it; confirm which entry point is intended.
return stage2::tape_builder::parse_document<false>(*this, _doc);
}
// Same as stage2 but with the stage2 parse call instantiated with <true>:
// stores the address of `_doc` in `doc`, then returns the resulting error_code.
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
doc = &_doc;
stage2::tape_builder builder(_doc);
return stage2::structural_parser::parse<true>(*this, builder);
// NOTE(review): unreachable as written; control has already returned above.
// Looks like tape_builder::parse_document<true> is meant to replace the two
// statements before it; confirm which entry point is intended.
return stage2::tape_builder::parse_document<true>(*this, _doc);
}
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {