Merge pull request #1101 from simdjson/jkeiser/yakety-sax
Basic SAX interface with benchmarks
This commit is contained in:
commit
0a2bca3f73
|
@ -1,5 +1,13 @@
|
||||||
include_directories( . linux )
|
include_directories( . linux )
|
||||||
link_libraries(simdjson simdjson-flags simdjson-windows-headers test-data)
|
link_libraries(simdjson-windows-headers test-data)
|
||||||
|
|
||||||
|
|
||||||
|
if (TARGET benchmark::benchmark)
|
||||||
|
add_executable(bench_sax bench_sax.cpp)
|
||||||
|
target_link_libraries(bench_sax simdjson-internal-flags simdjson-include-source benchmark::benchmark)
|
||||||
|
endif (TARGET benchmark::benchmark)
|
||||||
|
|
||||||
|
link_libraries(simdjson simdjson-flags)
|
||||||
add_executable(benchfeatures benchfeatures.cpp)
|
add_executable(benchfeatures benchfeatures.cpp)
|
||||||
add_executable(get_corpus_benchmark get_corpus_benchmark.cpp)
|
add_executable(get_corpus_benchmark get_corpus_benchmark.cpp)
|
||||||
add_executable(perfdiff perfdiff.cpp)
|
add_executable(perfdiff perfdiff.cpp)
|
||||||
|
@ -14,12 +22,6 @@ target_compile_definitions(parse_nonumberparsing PRIVATE SIMDJSON_SKIPNUMBERPARS
|
||||||
add_executable(parse_nostringparsing parse.cpp)
|
add_executable(parse_nostringparsing parse.cpp)
|
||||||
target_compile_definitions(parse_nostringparsing PRIVATE SIMDJSON_SKIPSTRINGPARSING)
|
target_compile_definitions(parse_nostringparsing PRIVATE SIMDJSON_SKIPSTRINGPARSING)
|
||||||
|
|
||||||
if (TARGET benchmark::benchmark)
|
|
||||||
link_libraries(benchmark::benchmark)
|
|
||||||
add_executable(bench_parse_call bench_parse_call.cpp)
|
|
||||||
add_executable(bench_dom_api bench_dom_api.cpp)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (TARGET competition-all)
|
if (TARGET competition-all)
|
||||||
add_executable(distinctuseridcompetition distinctuseridcompetition.cpp)
|
add_executable(distinctuseridcompetition distinctuseridcompetition.cpp)
|
||||||
target_link_libraries(distinctuseridcompetition competition-core)
|
target_link_libraries(distinctuseridcompetition competition-core)
|
||||||
|
@ -34,4 +36,10 @@ if (TARGET competition-all)
|
||||||
target_compile_definitions(allparsingcompetition PRIVATE ALLPARSER)
|
target_compile_definitions(allparsingcompetition PRIVATE ALLPARSER)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if (TARGET benchmark::benchmark)
|
||||||
|
link_libraries(benchmark::benchmark)
|
||||||
|
add_executable(bench_parse_call bench_parse_call.cpp)
|
||||||
|
add_executable(bench_dom_api bench_dom_api.cpp)
|
||||||
|
endif()
|
||||||
|
|
||||||
include(checkperf.cmake)
|
include(checkperf.cmake)
|
||||||
|
|
|
@ -22,7 +22,7 @@ static void recover_one_string(State& state) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
dom::element doc;
|
dom::element doc;
|
||||||
if (error = parser.parse(docdata).get(doc)) {
|
if ((error = parser.parse(docdata).get(doc))) {
|
||||||
cerr << "could not parse string" << error << endl;
|
cerr << "could not parse string" << error << endl;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -48,8 +48,7 @@ static void serialize_twitter(State& state) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// we do not want mem. alloc. in the loop.
|
// we do not want mem. alloc. in the loop.
|
||||||
error = parser.allocate(docdata.size());
|
if((error = parser.allocate(docdata.size()))) {
|
||||||
if(error) {
|
|
||||||
cout << error << endl;
|
cout << error << endl;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,359 @@
|
||||||
|
#define SIMDJSON_IMPLEMENTATION_FALLBACK 0
|
||||||
|
#define SIMDJSON_IMPLEMENTATION_WESTMERE 0
|
||||||
|
#define SIMDJSON_IMPLEMENTATION_AMD64 0
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <sstream>
|
||||||
|
#include <random>
|
||||||
|
|
||||||
|
#include "simdjson.h"
|
||||||
|
|
||||||
|
SIMDJSON_PUSH_DISABLE_ALL_WARNINGS
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
SIMDJSON_POP_DISABLE_WARNINGS
|
||||||
|
|
||||||
|
#include "simdjson.cpp"
|
||||||
|
|
||||||
|
#if SIMDJSON_EXCEPTIONS
|
||||||
|
|
||||||
|
using namespace benchmark;
|
||||||
|
using namespace simdjson;
|
||||||
|
using std::cerr;
|
||||||
|
using std::endl;
|
||||||
|
|
||||||
|
const char *TWITTER_JSON = SIMDJSON_BENCHMARK_DATA_DIR "twitter.json";
|
||||||
|
const int REPETITIONS = 10;
|
||||||
|
|
||||||
|
#if SIMDJSON_IMPLEMENTATION_HASWELL
|
||||||
|
|
||||||
|
#include "twitter/sax_tweet_reader.h"
|
||||||
|
|
||||||
|
/**
 * Benchmark: read the tweets of twitter.json with twitter::sax_tweet_reader.
 *
 * Publishes three rate counters: "Gigabytes" (input bytes processed), "docs"
 * (benchmark iterations) and "tweets" (tweets read). A load or allocation failure is
 * printed to stderr and ends the function early; a failure while reading tweets is
 * thrown as the error code.
 */
static void sax_tweets(State &state) {
  // Bring the input file into a padded buffer
  padded_string json;
  const auto load_error = padded_string::load(TWITTER_JSON).get(json);
  if (load_error) {
    cerr << load_error << endl;
    return;
  }

  // Size the reader for this document up front
  twitter::sax_tweet_reader reader;
  const auto capacity_error = reader.set_capacity(json.size());
  if (capacity_error) {
    cerr << capacity_error << endl;
    return;
  }

  // One untimed pass so the tweets vector is already grown when timing starts
  const auto warmup_error = reader.read_tweets(json);
  if (warmup_error) { throw warmup_error; }

  // Timed passes
  size_t total_bytes = 0;
  size_t total_tweets = 0;
  for (SIMDJSON_UNUSED auto _ : state) {
    const auto read_error = reader.read_tweets(json);
    if (read_error) { throw read_error; }
    total_bytes += json.size();
    total_tweets += reader.tweets.size();
  }

  // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
  // (decimal units; use kIs1024 for GiB)
  state.counters["Gigabytes"] = Counter(double(total_bytes), Counter::kIsRate, Counter::OneK::kIs1000);
  state.counters["docs"] = Counter(double(state.iterations()), Counter::kIsRate);
  state.counters["tweets"] = Counter(double(total_tweets), Counter::kIsRate);
}
|
||||||
|
BENCHMARK(sax_tweets)->Repetitions(REPETITIONS)->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
|
||||||
|
return *(std::max_element(std::begin(v), std::end(v)));
|
||||||
|
})->DisplayAggregatesOnly(true);
|
||||||
|
|
||||||
|
#endif // SIMDJSON_IMPLEMENTATION_HASWELL
|
||||||
|
|
||||||
|
#include "twitter/tweet.h"
|
||||||
|
|
||||||
|
/**
 * Convert a DOM element to an unsigned integer, mapping JSON null to 0.
 * Any non-null element is converted through the element's own conversion.
 */
simdjson_really_inline uint64_t nullable_int(dom::element element) {
  if (!element.is_null()) {
    return element;
  }
  return 0;
}
|
||||||
|
/**
 * Parse `json` with the DOM parser and append one twitter::tweet to `tweets` for every
 * element of the top-level "statuses" field.
 *
 * The caller is responsible for clearing `tweets` beforehand if it wants a fresh list.
 */
simdjson_really_inline void read_dom_tweets(dom::parser &parser, padded_string &json, std::vector<twitter::tweet> &tweets) {
  for (dom::element status : parser.parse(json)["statuses"]) {
    auto author = status["user"];
    // Field order follows the member order of twitter::tweet
    tweets.push_back(twitter::tweet{
      status["id"],
      status["text"],
      status["created_at"],
      nullable_int(status["in_reply_to_status_id"]), // may be null in the input
      status["retweet_count"],
      status["favorite_count"],
      { author["id"], author["screen_name"] }
    });
  }
}
|
||||||
|
|
||||||
|
/**
 * Benchmark: read the tweets of twitter.json through the DOM API (read_dom_tweets).
 *
 * Publishes "Gigabytes", "docs" and "tweets" rate counters. A load or allocation failure
 * is printed to stderr and ends the function early.
 */
static void dom_tweets(State &state) {
  // Bring the input file into a padded buffer
  padded_string json;
  const auto load_error = padded_string::load(TWITTER_JSON).get(json);
  if (load_error) {
    cerr << load_error << endl;
    return;
  }

  // Size the parser for this document up front
  dom::parser parser;
  const auto alloc_error = parser.allocate(json.size());
  if (alloc_error) {
    cerr << alloc_error << endl;
    return;
  }

  // One untimed pass so the vector is already grown when timing starts
  std::vector<twitter::tweet> tweets;
  read_dom_tweets(parser, json, tweets);

  // Timed passes
  size_t total_bytes = 0;
  size_t total_tweets = 0;
  for (SIMDJSON_UNUSED auto _ : state) {
    tweets.clear();
    read_dom_tweets(parser, json, tweets);
    total_bytes += json.size();
    total_tweets += tweets.size();
  }

  // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
  // (decimal units; use kIs1024 for GiB)
  state.counters["Gigabytes"] = Counter(double(total_bytes), Counter::kIsRate, Counter::OneK::kIs1000);
  state.counters["docs"] = Counter(double(state.iterations()), Counter::kIsRate);
  state.counters["tweets"] = Counter(double(total_tweets), Counter::kIsRate);
}
|
||||||
|
BENCHMARK(dom_tweets)->Repetitions(REPETITIONS)->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
|
||||||
|
return *(std::max_element(std::begin(v), std::end(v)));
|
||||||
|
})->DisplayAggregatesOnly(true);
|
||||||
|
|
||||||
|
/**
 * Benchmark: parse twitter.json with the DOM parser without reading anything out of it.
 *
 * Publishes "Gigabytes" and "docs" rate counters. A load or allocation failure is
 * printed to stderr and ends the function early; a parse failure throws a string literal.
 */
static void dom_parse(State &state) {
  // Bring the input file into a padded buffer
  padded_string json;
  const auto load_error = padded_string::load(TWITTER_JSON).get(json);
  if (load_error) {
    cerr << load_error << endl;
    return;
  }

  // Size the parser for this document up front
  dom::parser parser;
  const auto alloc_error = parser.allocate(json.size());
  if (alloc_error) {
    cerr << alloc_error << endl;
    return;
  }

  // Timed passes
  size_t total_bytes = 0;
  for (SIMDJSON_UNUSED auto _ : state) {
    const bool failed = bool(parser.parse(json).error());
    if (failed) { throw "Parsing failed"; }
    total_bytes += json.size();
  }

  // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
  // (decimal units; use kIs1024 for GiB)
  state.counters["Gigabytes"] = Counter(double(total_bytes), Counter::kIsRate, Counter::OneK::kIs1000);
  state.counters["docs"] = Counter(double(state.iterations()), Counter::kIsRate);
}
|
||||||
|
BENCHMARK(dom_parse)->Repetitions(REPETITIONS)->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
|
||||||
|
return *(std::max_element(std::begin(v), std::end(v)));
|
||||||
|
})->DisplayAggregatesOnly(true);
|
||||||
|
|
||||||
|
|
||||||
|
/********************
|
||||||
|
* Large file parsing benchmarks:
|
||||||
|
********************/
|
||||||
|
|
||||||
|
/**
 * Build a JSON array of N objects of the form { "x":<num>, "y":<num>, "z":<num>}.
 *
 * The numbers come from a default-constructed std::default_random_engine, so the same N
 * yields the same text on every call. The size of the result (rounded to the nearest KB)
 * is printed to stdout.
 *
 * @param N number of objects to put in the array (0 gives an empty array)
 * @return the JSON text
 */
static std::string build_json_array(size_t N) {
  std::default_random_engine engine;
  std::uniform_real_distribution<> unit(0, 1);
  std::stringstream out;

  out << "[" << '\n';
  for (size_t i = 0; i < N; ++i) {
    // Every object after the first is preceded by a comma on its own line
    if (i > 0) { out << "," << '\n'; }
    // Draw the coordinates in x, y, z order
    const double x = unit(engine);
    const double y = unit(engine);
    const double z = unit(engine);
    out << "{ \"x\":" << x << ", \"y\":" << y << ", \"z\":" << z << "}";
    // Only the first object is directly followed by a newline
    if (i == 0) { out << '\n'; }
  }
  out << '\n';
  out << "]" << '\n';

  std::string answer = out.str();
  std::cout << "Creating a source file spanning " << (answer.size() + 512) / 1024 << " KB " << std::endl;
  return answer;
}
|
||||||
|
|
||||||
|
/**
 * Return the generated JSON used by the "largerandom" benchmarks.
 *
 * The text (an array of 1000000 objects from build_json_array) is held in a function-local
 * static, so it is built on the first call and the same object is returned afterwards.
 */
static const simdjson::padded_string& get_my_json_str() {
  static simdjson::padded_string cached_json = build_json_array(1000000);
  return cached_json;
}
|
||||||
|
|
||||||
|
/** One element of the generated "largerandom" array: the three numeric fields of an object. */
struct my_point {
  double x; // value of the "x" field
  double y; // value of the "y" field
  double z; // value of the "z" field
};
|
||||||
|
|
||||||
|
// ./benchmark/bench_sax --benchmark_filter=largerandom
|
||||||
|
|
||||||
|
|
||||||
|
/***
|
||||||
|
* We start with the naive DOM-based approach.
|
||||||
|
**/
|
||||||
|
/**
 * Benchmark: parse the generated array of points with the DOM API and copy every
 * object into a std::vector<my_point>.
 *
 * Publishes "Gigabytes" and "docs" rate counters. An allocation failure is printed to
 * stderr and ends the function early; a parse failure is printed and then a string
 * literal is thrown.
 */
static void dom_parse_largerandom(State &state) {
  // Fetch the generated JSON (not a file on disk)
  const padded_string& json = get_my_json_str();

  // Size the parser for this document up front
  dom::parser parser;
  const auto alloc_error = parser.allocate(json.size());
  if (alloc_error) {
    cerr << alloc_error << endl;
    return;
  }

  // Timed passes
  size_t total_bytes = 0;
  for (SIMDJSON_UNUSED auto _ : state) {
    std::vector<my_point> parsed_points;
    dom::element root;
    const simdjson::error_code parse_error = parser.parse(json).get(root);
    if (parse_error) {
      std::cerr << "failure: " << parse_error << std::endl;
      throw "Parsing failed";
    }
    for (auto point : root) {
      parsed_points.emplace_back(my_point{point["x"], point["y"], point["z"]});
    }
    total_bytes += json.size();
    // Keep the compiler from discarding the vector that was just filled
    benchmark::DoNotOptimize(parsed_points.data());
  }

  // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
  // (decimal units; use kIs1024 for GiB)
  state.counters["Gigabytes"] = Counter(double(total_bytes), Counter::kIsRate, Counter::OneK::kIs1000);
  state.counters["docs"] = Counter(double(state.iterations()), Counter::kIsRate);
}
|
||||||
|
|
||||||
|
BENCHMARK(dom_parse_largerandom)->Repetitions(REPETITIONS)->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
|
||||||
|
return *(std::max_element(std::begin(v), std::end(v)));
|
||||||
|
})->DisplayAggregatesOnly(true);
|
||||||
|
|
||||||
|
#if SIMDJSON_IMPLEMENTATION_HASWELL
|
||||||
|
|
||||||
|
/***
|
||||||
|
* Next we are going to code the SAX approach.
|
||||||
|
**/
|
||||||
|
|
||||||
|
SIMDJSON_TARGET_HASWELL
|
||||||
|
|
||||||
|
namespace largerandom {
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
using namespace simdjson;
|
||||||
|
using namespace haswell;
|
||||||
|
using namespace haswell::stage2;
|
||||||
|
struct sax_point_reader_visitor {
|
||||||
|
public:
|
||||||
|
sax_point_reader_visitor(std::vector<my_point> &_points) : points(_points) {
|
||||||
|
}
|
||||||
|
|
||||||
|
simdjson_really_inline error_code visit_document_start(json_iterator &) { return SUCCESS; }
|
||||||
|
simdjson_really_inline error_code visit_object_start(json_iterator &) { return SUCCESS; }
|
||||||
|
simdjson_really_inline error_code visit_key(json_iterator &, const uint8_t *key) {
|
||||||
|
switch(key[0]) {
|
||||||
|
case 'x':
|
||||||
|
idx = 0;
|
||||||
|
break;
|
||||||
|
case 'y':
|
||||||
|
idx = 2;
|
||||||
|
break;
|
||||||
|
case 'z':
|
||||||
|
idx = 3;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return SUCCESS;
|
||||||
|
}
|
||||||
|
simdjson_really_inline error_code visit_primitive(json_iterator &, const uint8_t *value) {
|
||||||
|
return numberparsing::parse_double(value).get(buffer[idx]);
|
||||||
|
}
|
||||||
|
simdjson_really_inline error_code visit_array_start(json_iterator &) { return SUCCESS; }
|
||||||
|
simdjson_really_inline error_code visit_array_end(json_iterator &) { return SUCCESS; }
|
||||||
|
simdjson_really_inline error_code visit_object_end(json_iterator &) { return SUCCESS; }
|
||||||
|
simdjson_really_inline error_code visit_document_end(json_iterator &) { return SUCCESS; }
|
||||||
|
simdjson_really_inline error_code visit_empty_array(json_iterator &) { return SUCCESS; }
|
||||||
|
simdjson_really_inline error_code visit_empty_object(json_iterator &) { return SUCCESS; }
|
||||||
|
simdjson_really_inline error_code visit_root_primitive(json_iterator &, const uint8_t *) { return SUCCESS; }
|
||||||
|
simdjson_really_inline error_code increment_count(json_iterator &) { return SUCCESS; }
|
||||||
|
std::vector<my_point> &points;
|
||||||
|
size_t idx{0};
|
||||||
|
double buffer[3];
|
||||||
|
};
|
||||||
|
|
||||||
|
struct sax_point_reader {
|
||||||
|
std::vector<my_point> points;
|
||||||
|
std::unique_ptr<uint8_t[]> string_buf;
|
||||||
|
size_t capacity;
|
||||||
|
dom_parser_implementation dom_parser;
|
||||||
|
|
||||||
|
sax_point_reader();
|
||||||
|
error_code set_capacity(size_t new_capacity);
|
||||||
|
error_code read_points(const padded_string &json);
|
||||||
|
}; // struct sax_point_reader
|
||||||
|
|
||||||
|
/** Start with an empty point list, no string buffer and zero capacity. */
sax_point_reader::sax_point_reader()
  : points{},
    string_buf{},
    capacity{0},
    dom_parser() {
}
|
||||||
|
|
||||||
|
/**
 * Size the reader for documents of up to `new_capacity` bytes.
 *
 * Reallocates the string buffer, resizes the parser implementation and, on the first
 * call only (while `capacity` is still 0), sets the maximum depth to DEFAULT_MAX_DEPTH.
 *
 * @return SUCCESS, MEMALLOC if the string buffer cannot be allocated, or the error
 *         reported by the parser implementation. `capacity` is only updated on success.
 */
error_code sax_point_reader::set_capacity(size_t new_capacity) {
  // string_capacity copied from document::allocate
  size_t string_capacity = SIMDJSON_ROUNDUP_N(5 * new_capacity / 3 + 32, 64);
  string_buf.reset(new (std::nothrow) uint8_t[string_capacity]);
  // nothrow new reports failure by returning null, so it has to be checked explicitly
  if (!string_buf) { return MEMALLOC; }
  if (auto error = dom_parser.set_capacity(new_capacity)) { return error; }
  if (capacity == 0) { // set max depth the first time only
    if (auto error = dom_parser.set_max_depth(DEFAULT_MAX_DEPTH)) { return error; }
  }
  capacity = new_capacity;
  return SUCCESS;
}
|
||||||
|
|
||||||
|
/**
 * Clear `points`, then run stage 1 over `json` and walk the document with a
 * sax_point_reader_visitor.
 *
 * If the reader is currently sized for a smaller document it is first grown to
 * json.size().
 *
 * @return SUCCESS, or the first error from allocation, stage 1 or the walk.
 */
error_code sax_point_reader::read_points(const padded_string &json) {
  points.clear();

  // Allocate capacity if needed. The new capacity must be the document size;
  // passing the current capacity would leave the buffers too small.
  if (capacity < json.size()) {
    if (auto error = set_capacity(json.size())) { return error; }
  }

  // Run stage 1 first.
  if (auto error = dom_parser.stage1((uint8_t *)json.data(), json.size(), false)) { return error; }

  // Then walk the document, handing each event to the point visitor
  json_iterator iter(dom_parser, 0);
  sax_point_reader_visitor visitor(points);
  if (auto error = iter.walk_document<false>(visitor)) { return error; }
  return SUCCESS;
}
|
||||||
|
|
||||||
|
} // unnamed namespace
|
||||||
|
} // namespace largerandom
|
||||||
|
|
||||||
|
SIMDJSON_UNTARGET_REGION
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// ./benchmark/bench_sax --benchmark_filter=largerandom
|
||||||
|
/**
 * Benchmark: walk the generated array of points with largerandom::sax_point_reader.
 *
 * Publishes "Gigabytes" and "docs" rate counters. Any error from the reader is thrown
 * as the error code.
 */
static void sax_parse_largerandom(State &state) {
  // Fetch the generated JSON (not a file on disk)
  const padded_string& json = get_my_json_str();

  // Size the reader for this document up front
  largerandom::sax_point_reader reader;
  const auto capacity_error = reader.set_capacity(json.size());
  if (capacity_error) { throw capacity_error; }

  // Ten untimed passes before measuring
  for (size_t pass = 0; pass < 10; pass++) {
    const auto warmup_error = reader.read_points(json);
    if (warmup_error) { throw warmup_error; }
  }

  // Timed passes
  size_t total_bytes = 0;
  for (SIMDJSON_UNUSED auto _ : state) {
    const auto read_error = reader.read_points(json);
    if (read_error) { throw read_error; }
    total_bytes += json.size();
    // Keep the compiler from discarding the reader's output
    benchmark::DoNotOptimize(reader.points.data());
  }

  // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
  // (decimal units; use kIs1024 for GiB)
  state.counters["Gigabytes"] = Counter(double(total_bytes), Counter::kIsRate, Counter::OneK::kIs1000);
  state.counters["docs"] = Counter(double(state.iterations()), Counter::kIsRate);
}
|
||||||
|
BENCHMARK(sax_parse_largerandom)->Repetitions(REPETITIONS)->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
|
||||||
|
return *(std::max_element(std::begin(v), std::end(v)));
|
||||||
|
})->DisplayAggregatesOnly(true);
|
||||||
|
|
||||||
|
#endif // SIMDJSON_IMPLEMENTATION_HASWELL
|
||||||
|
|
||||||
|
#endif // SIMDJSON_EXCEPTIONS
|
||||||
|
|
||||||
|
BENCHMARK_MAIN();
|
|
@ -0,0 +1,67 @@
|
||||||
|
#ifndef TWITTER_SAX_TWEET_READER_H
|
||||||
|
#define TWITTER_SAX_TWEET_READER_H
|
||||||
|
|
||||||
|
#include "simdjson.h"
|
||||||
|
#include "sax_tweet_reader_visitor.h"
|
||||||
|
#include "tweet.h"
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
SIMDJSON_TARGET_HASWELL
|
||||||
|
|
||||||
|
namespace twitter {
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
using namespace simdjson;
|
||||||
|
using namespace haswell;
|
||||||
|
using namespace haswell::stage2;
|
||||||
|
|
||||||
|
struct sax_tweet_reader {
|
||||||
|
std::vector<tweet> tweets;
|
||||||
|
std::unique_ptr<uint8_t[]> string_buf;
|
||||||
|
size_t capacity;
|
||||||
|
dom_parser_implementation dom_parser;
|
||||||
|
|
||||||
|
sax_tweet_reader();
|
||||||
|
error_code set_capacity(size_t new_capacity);
|
||||||
|
error_code read_tweets(padded_string &json);
|
||||||
|
}; // struct tweet_reader
|
||||||
|
|
||||||
|
/** Start with an empty tweet list, no string buffer and zero capacity. */
sax_tweet_reader::sax_tweet_reader()
  : tweets{},
    string_buf{},
    capacity{0},
    dom_parser() {
}
|
||||||
|
|
||||||
|
/**
 * Size the reader for documents of up to `new_capacity` bytes.
 *
 * Reallocates the string buffer, resizes the parser implementation and, on the first
 * call only (while `capacity` is still 0), sets the maximum depth to DEFAULT_MAX_DEPTH.
 *
 * @return SUCCESS, MEMALLOC if the string buffer cannot be allocated, or the error
 *         reported by the parser implementation. `capacity` is only updated on success.
 */
error_code sax_tweet_reader::set_capacity(size_t new_capacity) {
  // string_capacity copied from document::allocate
  size_t string_capacity = SIMDJSON_ROUNDUP_N(5 * new_capacity / 3 + 32, 64);
  string_buf.reset(new (std::nothrow) uint8_t[string_capacity]);
  // nothrow new reports failure by returning null; the visitor writes strings through
  // this pointer, so a failed allocation must not go unnoticed
  if (!string_buf) { return MEMALLOC; }
  if (auto error = dom_parser.set_capacity(new_capacity)) { return error; }
  if (capacity == 0) { // set max depth the first time only
    if (auto error = dom_parser.set_max_depth(DEFAULT_MAX_DEPTH)) { return error; }
  }
  capacity = new_capacity;
  return SUCCESS;
}
|
||||||
|
|
||||||
|
// NOTE: this assumes the dom_parser is already allocated
|
||||||
|
error_code sax_tweet_reader::read_tweets(padded_string &json) {
|
||||||
|
// Allocate capacity if needed
|
||||||
|
tweets.clear();
|
||||||
|
if (capacity < json.size()) {
|
||||||
|
if (auto error = set_capacity(capacity)) { return error; }
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run stage 1 first.
|
||||||
|
if (auto error = dom_parser.stage1((uint8_t *)json.data(), json.size(), false)) { return error; }
|
||||||
|
|
||||||
|
// Then walk the document, parsing the tweets as we go
|
||||||
|
json_iterator iter(dom_parser, 0);
|
||||||
|
sax_tweet_reader_visitor visitor(tweets, string_buf.get());
|
||||||
|
if (auto error = iter.walk_document<false>(visitor)) { return error; }
|
||||||
|
return SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // unnamed namespace
|
||||||
|
} // namespace twitter
|
||||||
|
|
||||||
|
SIMDJSON_UNTARGET_REGION
|
||||||
|
|
||||||
|
#endif // TWITTER_SAX_TWEET_READER_H
|
|
@ -0,0 +1,519 @@
|
||||||
|
#ifndef TWITTER_SAX_TWEET_READER_VISITOR_H
|
||||||
|
#define TWITTER_SAX_TWEET_READER_VISITOR_H
|
||||||
|
|
||||||
|
#include "simdjson.h"
|
||||||
|
#include "tweet.h"
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
SIMDJSON_TARGET_HASWELL
|
||||||
|
|
||||||
|
namespace twitter {
|
||||||
|
|
||||||
|
using namespace simdjson;
|
||||||
|
using namespace haswell;
|
||||||
|
using namespace haswell::stage2;
|
||||||
|
|
||||||
|
struct sax_tweet_reader_visitor {
|
||||||
|
public:
|
||||||
|
sax_tweet_reader_visitor(std::vector<tweet> &_tweets, uint8_t *string_buf);
|
||||||
|
|
||||||
|
simdjson_really_inline error_code visit_document_start(json_iterator &iter);
|
||||||
|
simdjson_really_inline error_code visit_object_start(json_iterator &iter);
|
||||||
|
simdjson_really_inline error_code visit_key(json_iterator &iter, const uint8_t *key);
|
||||||
|
simdjson_really_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value);
|
||||||
|
simdjson_really_inline error_code visit_array_start(json_iterator &iter);
|
||||||
|
simdjson_really_inline error_code visit_array_end(json_iterator &iter);
|
||||||
|
simdjson_really_inline error_code visit_object_end(json_iterator &iter);
|
||||||
|
simdjson_really_inline error_code visit_document_end(json_iterator &iter);
|
||||||
|
simdjson_really_inline error_code visit_empty_array(json_iterator &iter);
|
||||||
|
simdjson_really_inline error_code visit_empty_object(json_iterator &iter);
|
||||||
|
simdjson_really_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value);
|
||||||
|
simdjson_really_inline error_code increment_count(json_iterator &iter);
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Since we only care about one thing at each level, we just use depth as the marker for what
|
||||||
|
// object/array we're nested inside.
|
||||||
|
enum class containers {
|
||||||
|
document = 0, //
|
||||||
|
top_object = 1, // {
|
||||||
|
statuses = 2, // { "statuses": [
|
||||||
|
tweet = 3, // { "statuses": [ {
|
||||||
|
user = 4 // { "statuses": [ { "user": {
|
||||||
|
};
|
||||||
|
/**
|
||||||
|
* The largest depth we care about.
|
||||||
|
* There can be things at lower depths.
|
||||||
|
*/
|
||||||
|
static constexpr uint32_t MAX_SUPPORTED_DEPTH = uint32_t(containers::user);
|
||||||
|
static constexpr const char *STATE_NAMES[] = {
|
||||||
|
"document",
|
||||||
|
"top object",
|
||||||
|
"statuses",
|
||||||
|
"tweet",
|
||||||
|
"user"
|
||||||
|
};
|
||||||
|
enum class field_type {
|
||||||
|
any,
|
||||||
|
unsigned_integer,
|
||||||
|
string,
|
||||||
|
nullable_unsigned_integer,
|
||||||
|
object,
|
||||||
|
array
|
||||||
|
};
|
||||||
|
struct field {
|
||||||
|
const char * key{};
|
||||||
|
size_t len{0};
|
||||||
|
size_t offset;
|
||||||
|
containers container{containers::document};
|
||||||
|
field_type type{field_type::any};
|
||||||
|
};
|
||||||
|
|
||||||
|
containers container{containers::document};
|
||||||
|
std::vector<tweet> &tweets;
|
||||||
|
uint8_t *current_string_buf_loc;
|
||||||
|
const uint8_t *current_key{};
|
||||||
|
|
||||||
|
simdjson_really_inline bool in_container(json_iterator &iter);
|
||||||
|
simdjson_really_inline bool in_container_child(json_iterator &iter);
|
||||||
|
simdjson_really_inline void start_container(json_iterator &iter);
|
||||||
|
simdjson_really_inline void end_container(json_iterator &iter);
|
||||||
|
simdjson_really_inline error_code parse_nullable_unsigned(json_iterator &iter, const uint8_t *value, const field &f);
|
||||||
|
simdjson_really_inline error_code parse_unsigned(json_iterator &iter, const uint8_t *value, const field &f);
|
||||||
|
simdjson_really_inline error_code parse_string(json_iterator &iter, const uint8_t *value, const field &f);
|
||||||
|
|
||||||
|
struct field_lookup {
|
||||||
|
field entries[256]{};
|
||||||
|
|
||||||
|
field_lookup();
|
||||||
|
simdjson_really_inline field get(const uint8_t * key, containers container);
|
||||||
|
private:
|
||||||
|
simdjson_really_inline uint8_t hash(const char * key, uint32_t depth);
|
||||||
|
simdjson_really_inline void add(const char * key, size_t len, containers container, field_type type, size_t offset);
|
||||||
|
simdjson_really_inline void neg(const char * const key, uint32_t depth);
|
||||||
|
};
|
||||||
|
static field_lookup fields;
|
||||||
|
}; // sax_tweet_reader_visitor
|
||||||
|
|
||||||
|
sax_tweet_reader_visitor::sax_tweet_reader_visitor(std::vector<tweet> &_tweets, uint8_t *string_buf)
|
||||||
|
: tweets{_tweets},
|
||||||
|
current_string_buf_loc{string_buf} {
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Enter the container that corresponds to the iterator's current depth. Always succeeds. */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_document_start(json_iterator &iter) {
  start_container(iter);

  return SUCCESS;
}
|
||||||
|
/**
 * Handle the start of an array.
 *
 * Arrays that are not direct children of the current container are ignored. Otherwise
 * the field type of the current key decides: `array` fields become the new container,
 * `any` fields are accepted, every other type is an INCORRECT_TYPE error.
 */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_array_start(json_iterator &iter) {
  // Anything that is not a direct child of the tracked container is skipped
  const bool is_child = in_container_child(iter);
  if (!is_child) { return SUCCESS; }

  // A key has been seen: treat the array as that field's value
  if (current_key) {
    const auto expected = fields.get(current_key, container).type;
    switch (expected) {
      case field_type::any:
        return SUCCESS;
      case field_type::array: // { "statuses": [
        start_container(iter);
        return SUCCESS;
      case field_type::string:
      case field_type::unsigned_integer:
      case field_type::nullable_unsigned_integer:
      case field_type::object:
        iter.log_error("unexpected array field");
        return INCORRECT_TYPE;
    }
  }

  // No key has been seen, so this array is not a field value; it is reported as an error.
  iter.log_error("unexpected array");
  return INCORRECT_TYPE;
}
|
||||||
|
/**
 * Handle the start of an object.
 *
 * Objects that are not direct children of the current container are ignored. With a
 * current key, the field type decides (`object` enters a new container, `any` is
 * accepted, the rest are errors). Without a key, only objects directly inside the
 * document or the statuses array are accepted.
 */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_object_start(json_iterator &iter) {
  // Anything that is not a direct child of the tracked container is skipped
  const bool is_child = in_container_child(iter);
  if (!is_child) { return SUCCESS; }

  // A key has been seen: treat the object as that field's value
  if (current_key) {
    const auto known = fields.get(current_key, container);
    switch (known.type) {
      case field_type::any:
        return SUCCESS;
      case field_type::object: // { "statuses": [ { "user": {
        start_container(iter);
        return SUCCESS;
      case field_type::string:
      case field_type::unsigned_integer:
      case field_type::nullable_unsigned_integer:
      case field_type::array:
        iter.log_error("unexpected object field");
        return INCORRECT_TYPE;
    }
  }

  // No key: the object is a child of the document or of an array
  switch (container) {
    case containers::statuses: // tweet: { "statuses": [ {
    case containers::document: // top_object: {
      start_container(iter);
      return SUCCESS;
    case containers::user:
    case containers::tweet:
    case containers::top_object:
      iter.log_error("unexpected object");
      return INCORRECT_TYPE;
  }
  SIMDJSON_UNREACHABLE();
  return UNINITIALIZED;
}
|
||||||
|
/** Remember `key` as the current key for the value events that follow. Always succeeds. */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_key(json_iterator &, const uint8_t *key) {
  this->current_key = key;

  return SUCCESS;
}
|
||||||
|
/**
 * Handle a primitive value.
 *
 * Primitives outside the current container are ignored. With a current key, the field
 * type selects the parser (unsigned, nullable unsigned or string); `any` fields are
 * accepted unparsed, and `array`/`object` fields are an INCORRECT_TYPE error. Without a
 * key the primitive is reported as an error.
 */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_primitive(json_iterator &iter, const uint8_t *value) {
  // Only values directly inside the tracked container matter
  const bool relevant = in_container(iter);
  if (!relevant) { return SUCCESS; }

  // A key has been seen: parse the value according to the field table
  if (current_key) {
    const auto known = fields.get(current_key, container);
    switch (known.type) {
      case field_type::any:
        return SUCCESS;
      case field_type::string:
        return parse_string(iter, value, known);
      case field_type::unsigned_integer:
        return parse_unsigned(iter, value, known);
      case field_type::nullable_unsigned_integer:
        return parse_nullable_unsigned(iter, value, known);
      case field_type::object:
      case field_type::array:
        iter.log_error("unexpected primitive");
        return INCORRECT_TYPE;
    }
  }

  // If it's not a field, it's a child of an array.
  // The only array we support is statuses, which must contain objects.
  iter.log_error("unexpected primitive");
  return INCORRECT_TYPE;
}
|
||||||
|
/** Leave the current container if the array that just ended is that container. Always succeeds. */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_array_end(json_iterator &iter) {
  if (!in_container(iter)) { return SUCCESS; }
  end_container(iter);
  return SUCCESS;
}
|
||||||
|
/** Leave the current container if the object that just ended is that container. Always succeeds. */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_object_end(json_iterator &iter) {
  if (!in_container(iter)) { return SUCCESS; }
  end_container(iter);
  return SUCCESS;
}
|
||||||
|
|
||||||
|
/** Log the end of the document. Always succeeds. */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_document_end(json_iterator &iter) {
  iter.log_end_value("document");

  return SUCCESS;
}
|
||||||
|
|
||||||
|
/** Empty arrays carry nothing to record; accept them. */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_empty_array(json_iterator &) { return SUCCESS; }
|
||||||
|
/** Empty objects carry nothing to record; accept them. */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_empty_object(json_iterator &) { return SUCCESS; }
|
||||||
|
/** A document whose root is a primitive is rejected: log it and return INCORRECT_TYPE. */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_root_primitive(json_iterator &iter, const uint8_t *) {
  iter.log_error("unexpected root primitive");

  return INCORRECT_TYPE;
}
|
||||||
|
|
||||||
|
/** Element counts are not tracked by this visitor; the event is accepted and ignored. */
simdjson_really_inline error_code sax_tweet_reader_visitor::increment_count(json_iterator &) {
  return SUCCESS;
}
|
||||||
|
|
||||||
|
/** True when the iterator's depth equals the numeric value of the current container. */
simdjson_really_inline bool sax_tweet_reader_visitor::in_container(json_iterator &iter) {
  const uint32_t container_depth = uint32_t(container);
  return iter.depth == container_depth;
}
|
||||||
|
/** True when the iterator's depth is exactly one more than the current container's. */
simdjson_really_inline bool sax_tweet_reader_visitor::in_container_child(json_iterator &iter) {
  const uint32_t child_depth = uint32_t(container) + 1;
  return iter.depth == child_depth;
}
|
||||||
|
/**
 * Make the container at the iterator's current depth the current one and, when logging
 * is enabled, log its name.
 */
simdjson_really_inline void sax_tweet_reader_visitor::start_container(json_iterator &iter) {
  SIMDJSON_ASSUME(iter.depth <= MAX_SUPPORTED_DEPTH); // Asserts in debug mode
  container = containers(iter.depth);
  if (!logger::LOG_ENABLED) { return; }
  iter.log_start_value(STATE_NAMES[iter.depth]);
}
|
||||||
|
/**
 * Log the end of the current container (when logging is enabled) and step back to the
 * container one level up.
 */
simdjson_really_inline void sax_tweet_reader_visitor::end_container(json_iterator &iter) {
  const int leaving = int(container);
  if (logger::LOG_ENABLED) { iter.log_end_value(STATE_NAMES[leaving]); }
  container = containers(leaving - 1);
}
|
||||||
|
/**
 * Parse `value` as an unsigned integer, or as null (stored as 0), into the member of
 * the last tweet that lies `f.offset` bytes from its start.
 *
 * @return SUCCESS, or the number-parsing error when the value is neither a number nor null.
 */
simdjson_really_inline error_code sax_tweet_reader_visitor::parse_nullable_unsigned(json_iterator &iter, const uint8_t *value, const field &f) {
  iter.log_value(f.key);
  // The offset is applied after the cast to char * so that it counts bytes, not tweets
  auto i = reinterpret_cast<uint64_t *>(reinterpret_cast<char *>(&tweets.back()) + f.offset);
  if (auto error = numberparsing::parse_unsigned(value).get(*i)) {
    // If number parsing failed, check if it's null before returning the error
    if (!atomparsing::is_valid_null_atom(value)) { iter.log_error("expected number or null"); return error; }
    // null is stored as 0 in the tweet (write through the pointer, do not reseat it)
    *i = 0;
  }
  return SUCCESS;
}
|
||||||
|
// Parses `value` as an unsigned integer into the field of the most recent tweet described by `f`.
//
// @param iter  iterator, used only for logging
// @param value pointer to the first character of the JSON value
// @param f     field descriptor; f.offset is a byte offset from the start of the tweet
// @return the result of numberparsing::parse_unsigned
simdjson_really_inline error_code sax_tweet_reader_visitor::parse_unsigned(json_iterator &iter, const uint8_t *value, const field &f) {
  iter.log_value(f.key);
  // Add the byte offset to the char* view of the tweet; adding it to a tweet* would scale
  // it by sizeof(tweet).
  auto i = reinterpret_cast<uint64_t *>(reinterpret_cast<char *>(&tweets.back()) + f.offset);
  return numberparsing::parse_unsigned(value).get(*i);
}
|
||||||
|
// Parses the JSON string at `value` into the string buffer and points the std::string_view
// field of the most recent tweet described by `f` at the result.
//
// @param iter  iterator, used only for logging
// @param value pointer to the opening quote of the JSON string
// @param f     field descriptor; f.offset is a byte offset from the start of the tweet
// @return the result of stringparsing::parse_string_to_buffer
simdjson_really_inline error_code sax_tweet_reader_visitor::parse_string(json_iterator &iter, const uint8_t *value, const field &f) {
  iter.log_value(f.key);
  // Add the byte offset to the char* view of the tweet; adding it to a tweet* would scale
  // it by sizeof(tweet).
  auto s = reinterpret_cast<std::string_view *>(reinterpret_cast<char *>(&tweets.back()) + f.offset);
  return stringparsing::parse_string_to_buffer(value, current_string_buf_loc, *s);
}
|
||||||
|
|
||||||
|
// Out-of-class definition of the static key lookup table; its default constructor
// registers the known keys and runs the collision checks.
sax_tweet_reader_visitor::field_lookup sax_tweet_reader_visitor::fields{};
|
||||||
|
|
||||||
|
// Hashes the first four characters of `key` together with `depth` into a single byte.
// `key` must have at least four readable characters.
simdjson_really_inline uint8_t sax_tweet_reader_visitor::field_lookup::hash(const char * key, uint32_t depth) {
  // These shift numbers were chosen specifically because this yields only 2 collisions between
  // keys in twitter.json, leaves 0 as a distinct value, and has 0 collisions between keys we
  // actually care about.
  const int mixed_chars = (key[0] << 0) ^ (key[1] << 3) ^ (key[2] << 3) ^ (key[3] << 1);
  return uint8_t(mixed_chars ^ depth);
}
|
||||||
|
// Looks up the field registered for `key` inside container `c`.
// Returns entries[0] (the "missing value" slot) when the hashed slot belongs to a different
// container or its stored key does not match the first entry.len bytes of `key`.
simdjson_really_inline sax_tweet_reader_visitor::field sax_tweet_reader_visitor::field_lookup::get(const uint8_t * key, containers c) {
  const auto slot = hash((const char *)key, uint32_t(c));
  auto candidate = entries[slot];
  // TODO if any key is > SIMDJSON_PADDING, this will access inaccessible memory!
  // The key bytes are only compared once the container already matches.
  const bool is_match = (c == candidate.container) && (memcmp(key, candidate.key, candidate.len) == 0);
  if (is_match) { return candidate; }
  return entries[0];
}
|
||||||
|
// Registers `key` (with `len` characters) for container `c` in the slot chosen by hash().
// Prints a diagnostic and asserts when the key hashes to slot 0 (reserved for "missing
// value") or when the slot already holds a key.
simdjson_really_inline void sax_tweet_reader_visitor::field_lookup::add(const char * key, size_t len, containers c, field_type type, size_t offset) {
  const auto slot = hash(key, uint32_t(c));

  const bool hashes_to_reserved_slot = (slot == 0);
  if (hashes_to_reserved_slot) {
    fprintf(stderr, "%s (depth %d) hashes to zero, which is used as 'missing value'\n", key, int(c));
    assert(false);
  }

  auto &target = entries[slot];
  if (target.key) {
    fprintf(stderr, "%s (depth %d) collides with %s (depth %d) !\n", key, int(c), target.key, int(target.container));
    assert(false);
  }

  target = { key, len, offset, c, type };
}
|
||||||
|
// Checks that an unused key at the given depth does not hash to a slot that holds a
// registered key. Prints a diagnostic and asserts when it does; the table is not modified.
//
// @param key   key text, at least four readable characters (see hash())
// @param depth depth at which the unused key appears
simdjson_really_inline void sax_tweet_reader_visitor::field_lookup::neg(const char * const key, uint32_t depth) {
  auto index = hash(key, depth);
  if (entries[index].key) {
    // %d expects an int: convert the unsigned depth explicitly, as add() does for its depths.
    fprintf(stderr, "%s (depth %d) conflicts with %s (depth %d) !\n", key, int(depth), entries[index].key, int(entries[index].container));
    assert(false);
  }
}
|
||||||
|
|
||||||
|
// Builds the lookup table: registers every key the reader extracts, then checks that none of
// the other keys listed below hashes onto a registered slot at the depth where it appears.
sax_tweet_reader_visitor::field_lookup::field_lookup() {
  // Registers a key (written with its surrounding quotes) for the tweet container.
  const auto tweet_field = [this](const char *quoted_key, field_type type, size_t offset) {
    add(quoted_key, strlen(quoted_key), containers::tweet, type, offset);
  };
  // Registers a key for the user container; the stored offset is relative to the tweet.
  const auto user_field = [this](const char *quoted_key, field_type type, size_t offset_in_user) {
    add(quoted_key, strlen(quoted_key), containers::user, type, offsetof(tweet, user) + offset_in_user);
  };

  add("\"statuses\"", strlen("\"statuses\""), containers::top_object, field_type::array, 0); // { "statuses": [...]

  tweet_field("\"id\"", field_type::unsigned_integer, offsetof(tweet, id));
  tweet_field("\"in_reply_to_status_id\"", field_type::nullable_unsigned_integer, offsetof(tweet, in_reply_to_status_id));
  tweet_field("\"retweet_count\"", field_type::unsigned_integer, offsetof(tweet, retweet_count));
  tweet_field("\"favorite_count\"", field_type::unsigned_integer, offsetof(tweet, favorite_count));
  tweet_field("\"text\"", field_type::string, offsetof(tweet, text));
  tweet_field("\"created_at\"", field_type::string, offsetof(tweet, created_at));
  tweet_field("\"user\"", field_type::object, offsetof(tweet, user));

  user_field("\"id\"", field_type::unsigned_integer, offsetof(twitter_user, id));
  user_field("\"screen_name\"", field_type::string, offsetof(twitter_user, screen_name));

  // Check for collisions with other (unused) hash keys in typical twitter JSON
  #define UNUSED_KEY(KEY, DEPTH) { "\"" #KEY "\"", DEPTH }
  static const struct { const char *key; uint32_t depth; } unused_keys[] = {
    // depth 9
    UNUSED_KEY(display_url, 9), UNUSED_KEY(expanded_url, 9), { "\"h\":", 9 },
    UNUSED_KEY(indices, 9), UNUSED_KEY(resize, 9), UNUSED_KEY(url, 9), { "\"w\":", 9 },
    // depth 8
    UNUSED_KEY(display_url, 8), UNUSED_KEY(expanded_url, 8), { "\"h\":", 8 },
    UNUSED_KEY(indices, 8), UNUSED_KEY(large, 8), UNUSED_KEY(medium, 8),
    UNUSED_KEY(resize, 8), UNUSED_KEY(small, 8), UNUSED_KEY(thumb, 8),
    UNUSED_KEY(url, 8), { "\"w\":", 8 },
    // depth 7
    UNUSED_KEY(display_url, 7), UNUSED_KEY(expanded_url, 7), UNUSED_KEY(id_str, 7),
    UNUSED_KEY(id, 7), UNUSED_KEY(indices, 7), UNUSED_KEY(large, 7),
    UNUSED_KEY(media_url_https, 7), UNUSED_KEY(media_url, 7), UNUSED_KEY(medium, 7),
    UNUSED_KEY(name, 7), UNUSED_KEY(sizes, 7), UNUSED_KEY(small, 7),
    UNUSED_KEY(source_status_id_str, 7), UNUSED_KEY(source_status_id, 7), UNUSED_KEY(thumb, 7),
    UNUSED_KEY(type, 7), UNUSED_KEY(url, 7), UNUSED_KEY(urls, 7),
    // depth 6
    UNUSED_KEY(description, 6), UNUSED_KEY(display_url, 6), UNUSED_KEY(expanded_url, 6),
    UNUSED_KEY(id_str, 6), UNUSED_KEY(id, 6), UNUSED_KEY(indices, 6),
    UNUSED_KEY(media_url_https, 6), UNUSED_KEY(media_url, 6), UNUSED_KEY(name, 6),
    UNUSED_KEY(sizes, 6), UNUSED_KEY(source_status_id_str, 6), UNUSED_KEY(source_status_id, 6),
    UNUSED_KEY(type, 6), UNUSED_KEY(url, 6), UNUSED_KEY(urls, 6),
    // depth 5
    UNUSED_KEY(contributors_enabled, 5), UNUSED_KEY(default_profile_image, 5), UNUSED_KEY(default_profile, 5),
    UNUSED_KEY(description, 5), UNUSED_KEY(entities, 5), UNUSED_KEY(favourites_count, 5),
    UNUSED_KEY(follow_request_sent, 5), UNUSED_KEY(followers_count, 5), UNUSED_KEY(following, 5),
    UNUSED_KEY(friends_count, 5), UNUSED_KEY(geo_enabled, 5), UNUSED_KEY(hashtags, 5),
    UNUSED_KEY(id_str, 5), UNUSED_KEY(id, 5), UNUSED_KEY(is_translation_enabled, 5),
    UNUSED_KEY(is_translator, 5), UNUSED_KEY(iso_language_code, 5), UNUSED_KEY(lang, 5),
    UNUSED_KEY(listed_count, 5), UNUSED_KEY(location, 5), UNUSED_KEY(media, 5),
    UNUSED_KEY(name, 5), UNUSED_KEY(notifications, 5), UNUSED_KEY(profile_background_color, 5),
    UNUSED_KEY(profile_background_image_url_https, 5), UNUSED_KEY(profile_background_image_url, 5),
    UNUSED_KEY(profile_background_tile, 5), UNUSED_KEY(profile_banner_url, 5),
    UNUSED_KEY(profile_image_url_https, 5), UNUSED_KEY(profile_image_url, 5),
    UNUSED_KEY(profile_link_color, 5), UNUSED_KEY(profile_sidebar_border_color, 5),
    UNUSED_KEY(profile_sidebar_fill_color, 5), UNUSED_KEY(profile_text_color, 5),
    UNUSED_KEY(profile_use_background_image, 5), UNUSED_KEY(protected, 5), UNUSED_KEY(result_type, 5),
    UNUSED_KEY(statuses_count, 5), UNUSED_KEY(symbols, 5), UNUSED_KEY(time_zone, 5),
    UNUSED_KEY(url, 5), UNUSED_KEY(urls, 5), UNUSED_KEY(user_mentions, 5),
    UNUSED_KEY(utc_offset, 5), UNUSED_KEY(verified, 5),
    // depth 4
    UNUSED_KEY(contributors_enabled, 4), UNUSED_KEY(contributors, 4), UNUSED_KEY(coordinates, 4),
    UNUSED_KEY(default_profile_image, 4), UNUSED_KEY(default_profile, 4), UNUSED_KEY(description, 4),
    UNUSED_KEY(entities, 4), UNUSED_KEY(favorited, 4), UNUSED_KEY(favourites_count, 4),
    UNUSED_KEY(follow_request_sent, 4), UNUSED_KEY(followers_count, 4), UNUSED_KEY(following, 4),
    UNUSED_KEY(friends_count, 4), UNUSED_KEY(geo_enabled, 4), UNUSED_KEY(geo, 4),
    UNUSED_KEY(hashtags, 4), UNUSED_KEY(id_str, 4), UNUSED_KEY(in_reply_to_screen_name, 4),
    UNUSED_KEY(in_reply_to_status_id_str, 4), UNUSED_KEY(in_reply_to_user_id_str, 4),
    UNUSED_KEY(in_reply_to_user_id, 4), UNUSED_KEY(is_translation_enabled, 4),
    UNUSED_KEY(is_translator, 4), UNUSED_KEY(iso_language_code, 4), UNUSED_KEY(lang, 4),
    UNUSED_KEY(listed_count, 4), UNUSED_KEY(location, 4), UNUSED_KEY(media, 4),
    UNUSED_KEY(metadata, 4), UNUSED_KEY(name, 4), UNUSED_KEY(notifications, 4),
    UNUSED_KEY(place, 4), UNUSED_KEY(possibly_sensitive, 4), UNUSED_KEY(profile_background_color, 4),
    UNUSED_KEY(profile_background_image_url_https, 4), UNUSED_KEY(profile_background_image_url, 4),
    UNUSED_KEY(profile_background_tile, 4), UNUSED_KEY(profile_banner_url, 4),
    UNUSED_KEY(profile_image_url_https, 4), UNUSED_KEY(profile_image_url, 4),
    UNUSED_KEY(profile_link_color, 4), UNUSED_KEY(profile_sidebar_border_color, 4),
    UNUSED_KEY(profile_sidebar_fill_color, 4), UNUSED_KEY(profile_text_color, 4),
    UNUSED_KEY(profile_use_background_image, 4), UNUSED_KEY(protected, 4), UNUSED_KEY(result_type, 4),
    UNUSED_KEY(retweeted, 4), UNUSED_KEY(source, 4), UNUSED_KEY(statuses_count, 4),
    UNUSED_KEY(symbols, 4), UNUSED_KEY(time_zone, 4), UNUSED_KEY(truncated, 4),
    UNUSED_KEY(url, 4), UNUSED_KEY(urls, 4), UNUSED_KEY(user_mentions, 4),
    UNUSED_KEY(utc_offset, 4), UNUSED_KEY(verified, 4),
    // depth 3
    UNUSED_KEY(contributors, 3), UNUSED_KEY(coordinates, 3), UNUSED_KEY(entities, 3),
    UNUSED_KEY(favorited, 3), UNUSED_KEY(geo, 3), UNUSED_KEY(id_str, 3),
    UNUSED_KEY(in_reply_to_screen_name, 3), UNUSED_KEY(in_reply_to_status_id_str, 3),
    UNUSED_KEY(in_reply_to_user_id_str, 3), UNUSED_KEY(in_reply_to_user_id, 3),
    UNUSED_KEY(lang, 3), UNUSED_KEY(metadata, 3), UNUSED_KEY(place, 3),
    UNUSED_KEY(possibly_sensitive, 3), UNUSED_KEY(retweeted_status, 3), UNUSED_KEY(retweeted, 3),
    UNUSED_KEY(source, 3), UNUSED_KEY(truncated, 3),
    // depth 2
    UNUSED_KEY(completed_in, 2), UNUSED_KEY(count, 2), UNUSED_KEY(max_id_str, 2),
    UNUSED_KEY(max_id, 2), UNUSED_KEY(next_results, 2), UNUSED_KEY(query, 2),
    UNUSED_KEY(refresh_url, 2), UNUSED_KEY(since_id_str, 2), UNUSED_KEY(since_id, 2),
    // depth 1
    UNUSED_KEY(search_metadata, 1),
  };
  #undef UNUSED_KEY

  for (const auto &unused : unused_keys) {
    neg(unused.key, unused.depth);
  }
}
|
||||||
|
|
||||||
|
// sax_tweet_reader_visitor::field_lookup::find_min() {
|
||||||
|
// int min_count = 100000;
|
||||||
|
// for (int a=0;a<4;a++) {
|
||||||
|
// for (int b=0;b<4;b++) {
|
||||||
|
// for (int c=0;c<4;c++) {
|
||||||
|
// twitter::sax_tweet_reader_visitor::field_lookup fields(a,b,c);
|
||||||
|
// if (fields.collision_count) { continue; }
|
||||||
|
// if (fields.zero_emission) { continue; }
|
||||||
|
// if (fields.conflict_count < min_count) { printf("min=%d,%d,%d (%d)", a, b, c, fields.conflict_count); }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
|
||||||
|
} // namespace twitter
|
||||||
|
|
||||||
|
SIMDJSON_UNTARGET_REGION
|
||||||
|
|
||||||
|
#endif // TWITTER_SAX_TWEET_READER_VISITOR_H
|
|
@ -0,0 +1,21 @@
|
||||||
|
#ifndef TWEET_H
|
||||||
|
#define TWEET_H
|
||||||
|
|
||||||
|
#include "simdjson.h"
|
||||||
|
#include "twitter_user.h"
|
||||||
|
|
||||||
|
namespace twitter {
|
||||||
|
|
||||||
|
struct tweet {
|
||||||
|
uint64_t id{};
|
||||||
|
std::string_view text{};
|
||||||
|
std::string_view created_at{};
|
||||||
|
uint64_t in_reply_to_status_id{};
|
||||||
|
uint64_t retweet_count{};
|
||||||
|
uint64_t favorite_count{};
|
||||||
|
twitter_user user{};
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace twitter
|
||||||
|
|
||||||
|
#endif // TWEET_H
|
|
@ -0,0 +1,15 @@
|
||||||
|
#ifndef TWITTER_USER_H
|
||||||
|
#define TWITTER_USER_H
|
||||||
|
|
||||||
|
#include "simdjson.h"
|
||||||
|
|
||||||
|
namespace twitter {
|
||||||
|
|
||||||
|
// Plain record of a twitter user: numeric id and screen name.
// A default-constructed user has id 0 and an empty screen name.
struct twitter_user {
  uint64_t id = 0;
  std::string_view screen_name = {};
};
|
||||||
|
|
||||||
|
} // namespace twitter
|
||||||
|
|
||||||
|
#endif // TWITTER_USER_H
|
|
@ -111,7 +111,6 @@ simdjson_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t>
|
||||||
|
|
||||||
#include "arm64/stringparsing.h"
|
#include "arm64/stringparsing.h"
|
||||||
#include "arm64/numberparsing.h"
|
#include "arm64/numberparsing.h"
|
||||||
#include "generic/stage2/structural_parser.h"
|
|
||||||
#include "generic/stage2/tape_builder.h"
|
#include "generic/stage2/tape_builder.h"
|
||||||
|
|
||||||
//
|
//
|
||||||
|
@ -145,15 +144,11 @@ SIMDJSON_WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t
|
||||||
}
|
}
|
||||||
|
|
||||||
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
|
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
|
||||||
doc = &_doc;
|
return stage2::tape_builder::parse_document<false>(*this, _doc);
|
||||||
stage2::tape_builder builder(*doc);
|
|
||||||
return stage2::structural_parser::parse<false>(*this, builder);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
|
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
|
||||||
doc = &_doc;
|
return stage2::tape_builder::parse_document<true>(*this, _doc);
|
||||||
stage2::tape_builder builder(_doc);
|
|
||||||
return stage2::structural_parser::parse<true>(*this, builder);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
|
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
|
||||||
|
|
|
@ -315,22 +315,17 @@ SIMDJSON_WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t
|
||||||
//
|
//
|
||||||
#include "fallback/stringparsing.h"
|
#include "fallback/stringparsing.h"
|
||||||
#include "fallback/numberparsing.h"
|
#include "fallback/numberparsing.h"
|
||||||
#include "generic/stage2/structural_parser.h"
|
|
||||||
#include "generic/stage2/tape_builder.h"
|
#include "generic/stage2/tape_builder.h"
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
namespace SIMDJSON_IMPLEMENTATION {
|
namespace SIMDJSON_IMPLEMENTATION {
|
||||||
|
|
||||||
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
|
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
|
||||||
doc = &_doc;
|
return stage2::tape_builder::parse_document<false>(*this, _doc);
|
||||||
stage2::tape_builder builder(*doc);
|
|
||||||
return stage2::structural_parser::parse<false>(*this, builder);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
|
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
|
||||||
doc = &_doc;
|
return stage2::tape_builder::parse_document<true>(*this, _doc);
|
||||||
stage2::tape_builder builder(_doc);
|
|
||||||
return stage2::structural_parser::parse<true>(*this, builder);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
|
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
|
||||||
|
|
|
@ -4,16 +4,18 @@
|
||||||
namespace {
|
namespace {
|
||||||
namespace SIMDJSON_IMPLEMENTATION {
|
namespace SIMDJSON_IMPLEMENTATION {
|
||||||
|
|
||||||
// expectation: sizeof(scope_descriptor) = 64/8.
|
// expectation: sizeof(open_container) = 64/8.
|
||||||
struct scope_descriptor {
|
struct open_container {
|
||||||
uint32_t tape_index; // where, on the tape, does the scope ([,{) begins
|
uint32_t tape_index; // where, on the tape, does the scope ([,{) begins
|
||||||
uint32_t count; // how many elements in the scope
|
uint32_t count; // how many elements in the scope
|
||||||
}; // struct scope_descriptor
|
}; // struct open_container
|
||||||
|
|
||||||
|
static_assert(sizeof(open_container) == 64/8, "Open container must be 64 bits");
|
||||||
|
|
||||||
class dom_parser_implementation final : public internal::dom_parser_implementation {
|
class dom_parser_implementation final : public internal::dom_parser_implementation {
|
||||||
public:
|
public:
|
||||||
/** Tape location of each open { or [ */
|
/** Tape location of each open { or [ */
|
||||||
std::unique_ptr<scope_descriptor[]> containing_scope{};
|
std::unique_ptr<open_container[]> open_containers{};
|
||||||
/** Whether each open container is a [ or { */
|
/** Whether each open container is a [ or { */
|
||||||
std::unique_ptr<bool[]> is_array{};
|
std::unique_ptr<bool[]> is_array{};
|
||||||
/** Buffer passed to stage 1 */
|
/** Buffer passed to stage 1 */
|
||||||
|
|
|
@ -7,10 +7,10 @@ namespace allocate {
|
||||||
// Allocates stage 2 internal state and outputs in the parser
|
// Allocates stage 2 internal state and outputs in the parser
|
||||||
//
|
//
|
||||||
simdjson_really_inline error_code set_max_depth(dom_parser_implementation &parser, size_t max_depth) {
|
simdjson_really_inline error_code set_max_depth(dom_parser_implementation &parser, size_t max_depth) {
|
||||||
parser.containing_scope.reset(new (std::nothrow) scope_descriptor[max_depth]);
|
parser.open_containers.reset(new (std::nothrow) open_container[max_depth]);
|
||||||
parser.is_array.reset(new (std::nothrow) bool[max_depth]);
|
parser.is_array.reset(new (std::nothrow) bool[max_depth]);
|
||||||
|
|
||||||
if (!parser.is_array || !parser.containing_scope) {
|
if (!parser.is_array || !parser.open_containers) {
|
||||||
return MEMALLOC;
|
return MEMALLOC;
|
||||||
}
|
}
|
||||||
return SUCCESS;
|
return SUCCESS;
|
||||||
|
|
|
@ -0,0 +1,315 @@
|
||||||
|
#include "generic/stage2/logger.h"
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
namespace SIMDJSON_IMPLEMENTATION {
|
||||||
|
namespace stage2 {
|
||||||
|
|
||||||
|
class json_iterator {
|
||||||
|
public:
|
||||||
|
const uint8_t* const buf;
|
||||||
|
uint32_t *next_structural;
|
||||||
|
dom_parser_implementation &dom_parser;
|
||||||
|
uint32_t depth{0};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Walk the JSON document.
|
||||||
|
*
|
||||||
|
* The visitor receives callbacks when values are encountered. All callbacks pass the iterator as
|
||||||
|
* the first parameter; some callbacks have other parameters as well:
|
||||||
|
*
|
||||||
|
* - visit_document_start() - at the beginning.
|
||||||
|
* - visit_document_end() - at the end (if things were successful).
|
||||||
|
*
|
||||||
|
* - visit_array_start() - at the start `[` of a non-empty array.
|
||||||
|
* - visit_array_end() - at the end `]` of a non-empty array.
|
||||||
|
* - visit_empty_array() - when an empty array is encountered.
|
||||||
|
*
|
||||||
|
* - visit_object_start() - at the start `{` of a non-empty object.
|
||||||
|
* - visit_object_end() - at the end `}` of a non-empty object.
|
||||||
|
* - visit_empty_object() - when an empty object is encountered.
|
||||||
|
* - visit_key(const uint8_t *key) - when a key in an object field is encountered. key is
|
||||||
|
* guaranteed to point at the first quote of the string (`"key"`).
|
||||||
|
* - visit_primitive(const uint8_t *value) - when a value is a string, number, boolean or null.
|
||||||
|
* - visit_root_primitive(iter, uint8_t *value) - when the top-level value is a string, number, boolean or null.
|
||||||
|
*
|
||||||
|
* - increment_count(iter) - each time a value is found in an array or object.
|
||||||
|
*/
|
||||||
|
template<bool STREAMING, typename V>
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code walk_document(V &visitor) noexcept;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create an iterator capable of walking a JSON document.
|
||||||
|
*
|
||||||
|
* The document must have already passed through stage 1.
|
||||||
|
*/
|
||||||
|
simdjson_really_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Look at the next token.
|
||||||
|
*
|
||||||
|
* Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`).
|
||||||
|
*
|
||||||
|
* They may include invalid JSON as well (such as `1.2.3` or `ture`).
|
||||||
|
*/
|
||||||
|
simdjson_really_inline const uint8_t *peek() const noexcept;
|
||||||
|
/**
|
||||||
|
* Advance to the next token.
|
||||||
|
*
|
||||||
|
* Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`).
|
||||||
|
*
|
||||||
|
* They may include invalid JSON as well (such as `1.2.3` or `ture`).
|
||||||
|
*/
|
||||||
|
simdjson_really_inline const uint8_t *advance() noexcept;
|
||||||
|
/**
|
||||||
|
* Get the remaining length of the document, from the start of the current token.
|
||||||
|
*/
|
||||||
|
simdjson_really_inline size_t remaining_len() const noexcept;
|
||||||
|
/**
|
||||||
|
* Check if we are at the end of the document.
|
||||||
|
*
|
||||||
|
* If this is true, there are no more tokens.
|
||||||
|
*/
|
||||||
|
simdjson_really_inline bool at_eof() const noexcept;
|
||||||
|
/**
|
||||||
|
* Check if we are at the beginning of the document.
|
||||||
|
*/
|
||||||
|
simdjson_really_inline bool at_beginning() const noexcept;
|
||||||
|
simdjson_really_inline uint8_t last_structural() const noexcept;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Log that a value has been found.
|
||||||
|
*
|
||||||
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
||||||
|
*/
|
||||||
|
simdjson_really_inline void log_value(const char *type) const noexcept;
|
||||||
|
/**
|
||||||
|
* Log the start of a multipart value.
|
||||||
|
*
|
||||||
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
||||||
|
*/
|
||||||
|
simdjson_really_inline void log_start_value(const char *type) const noexcept;
|
||||||
|
/**
|
||||||
|
* Log the end of a multipart value.
|
||||||
|
*
|
||||||
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
||||||
|
*/
|
||||||
|
simdjson_really_inline void log_end_value(const char *type) const noexcept;
|
||||||
|
/**
|
||||||
|
* Log an error.
|
||||||
|
*
|
||||||
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
||||||
|
*/
|
||||||
|
simdjson_really_inline void log_error(const char *error) const noexcept;
|
||||||
|
|
||||||
|
template<typename V>
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept;
|
||||||
|
template<typename V>
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<bool STREAMING, typename V>
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code json_iterator::walk_document(V &visitor) noexcept {
|
||||||
|
logger::log_start();
|
||||||
|
|
||||||
|
//
|
||||||
|
// Start the document
|
||||||
|
//
|
||||||
|
if (at_eof()) { return EMPTY; }
|
||||||
|
log_start_value("document");
|
||||||
|
SIMDJSON_TRY( visitor.visit_document_start(*this) );
|
||||||
|
|
||||||
|
//
|
||||||
|
// Read first value
|
||||||
|
//
|
||||||
|
{
|
||||||
|
auto value = advance();
|
||||||
|
|
||||||
|
// Make sure the outer hash or array is closed before continuing; otherwise, there are ways we
|
||||||
|
// could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906
|
||||||
|
if (!STREAMING) {
|
||||||
|
switch (*value) {
|
||||||
|
case '{': if (last_structural() != '}') { return TAPE_ERROR; }; break;
|
||||||
|
case '[': if (last_structural() != ']') { return TAPE_ERROR; }; break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (*value) {
|
||||||
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
||||||
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
||||||
|
default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
goto document_end;
|
||||||
|
|
||||||
|
//
|
||||||
|
// Object parser states
|
||||||
|
//
|
||||||
|
object_begin:
|
||||||
|
log_start_value("object");
|
||||||
|
depth++;
|
||||||
|
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
|
||||||
|
dom_parser.is_array[depth] = false;
|
||||||
|
SIMDJSON_TRY( visitor.visit_object_start(*this) );
|
||||||
|
|
||||||
|
{
|
||||||
|
auto key = advance();
|
||||||
|
if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; }
|
||||||
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
||||||
|
SIMDJSON_TRY( visitor.visit_key(*this, key) );
|
||||||
|
}
|
||||||
|
|
||||||
|
object_field:
|
||||||
|
if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; }
|
||||||
|
{
|
||||||
|
auto value = advance();
|
||||||
|
switch (*value) {
|
||||||
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
||||||
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
||||||
|
default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
object_continue:
|
||||||
|
switch (*advance()) {
|
||||||
|
case ',':
|
||||||
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
||||||
|
{
|
||||||
|
auto key = advance();
|
||||||
|
if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; }
|
||||||
|
SIMDJSON_TRY( visitor.visit_key(*this, key) );
|
||||||
|
}
|
||||||
|
goto object_field;
|
||||||
|
case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end;
|
||||||
|
default: log_error("No comma between object fields"); return TAPE_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
scope_end:
|
||||||
|
depth--;
|
||||||
|
if (depth == 0) { goto document_end; }
|
||||||
|
if (dom_parser.is_array[depth]) { goto array_continue; }
|
||||||
|
goto object_continue;
|
||||||
|
|
||||||
|
//
|
||||||
|
// Array parser states
|
||||||
|
//
|
||||||
|
array_begin:
|
||||||
|
log_start_value("array");
|
||||||
|
depth++;
|
||||||
|
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
|
||||||
|
dom_parser.is_array[depth] = true;
|
||||||
|
SIMDJSON_TRY( visitor.visit_array_start(*this) );
|
||||||
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
||||||
|
|
||||||
|
array_value:
|
||||||
|
{
|
||||||
|
auto value = advance();
|
||||||
|
switch (*value) {
|
||||||
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
||||||
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
||||||
|
default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
array_continue:
|
||||||
|
switch (*advance()) {
|
||||||
|
case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value;
|
||||||
|
case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end;
|
||||||
|
default: log_error("Missing comma between array values"); return TAPE_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
document_end:
|
||||||
|
log_end_value("document");
|
||||||
|
SIMDJSON_TRY( visitor.visit_document_end(*this) );
|
||||||
|
|
||||||
|
dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]);
|
||||||
|
|
||||||
|
// If we didn't make it to the end, it's an error
|
||||||
|
if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) {
|
||||||
|
log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
|
||||||
|
return TAPE_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
return SUCCESS;
|
||||||
|
|
||||||
|
} // walk_document()
|
||||||
|
|
||||||
|
// Binds the iterator to a parser's buffer and positions it at the structural with the given
// index. `depth` keeps its in-class default of 0.
simdjson_really_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
  : buf(_dom_parser.buf),
    next_structural(_dom_parser.structural_indexes.get() + start_structural_index),
    dom_parser(_dom_parser)
{}
|
||||||
|
|
||||||
|
// Returns a pointer to the next token in the buffer without consuming it.
simdjson_really_inline const uint8_t *json_iterator::peek() const noexcept {
  const uint32_t token_offset = *next_structural;
  return buf + token_offset;
}
|
||||||
|
// Returns a pointer to the next token in the buffer and moves past it.
simdjson_really_inline const uint8_t *json_iterator::advance() noexcept {
  const uint32_t token_offset = *next_structural;
  ++next_structural;
  return buf + token_offset;
}
|
||||||
|
// Number of bytes from the start of the current token (the one most recently advanced past)
// to the end of the document.
simdjson_really_inline size_t json_iterator::remaining_len() const noexcept {
  const uint32_t current_token_offset = next_structural[-1];
  return dom_parser.len - current_token_offset;
}
|
||||||
|
|
||||||
|
// True when the iterator sits one past the last structural index, i.e. no tokens remain.
simdjson_really_inline bool json_iterator::at_eof() const noexcept {
  const uint32_t *one_past_last = dom_parser.structural_indexes.get() + dom_parser.n_structural_indexes;
  return one_past_last == next_structural;
}
|
||||||
|
// True when the iterator still points at the first structural index.
simdjson_really_inline bool json_iterator::at_beginning() const noexcept {
  const uint32_t *first_structural = dom_parser.structural_indexes.get();
  return first_structural == next_structural;
}
|
||||||
|
// Returns the buffer byte at the position of the final structural index.
simdjson_really_inline uint8_t json_iterator::last_structural() const noexcept {
  const auto last_index = dom_parser.n_structural_indexes - 1;
  const auto last_offset = dom_parser.structural_indexes[last_index];
  return buf[last_offset];
}
|
||||||
|
|
||||||
|
// Logs one line for a value of the given type, with no prefix and no detail text.
simdjson_really_inline void json_iterator::log_value(const char *type) const noexcept
{
  logger::log_line(*this, "", type, "");
}
|
||||||
|
|
||||||
|
// Logs the start of a multipart value ("+" prefix), then deepens the log nesting level
// when logging is compiled in.
simdjson_really_inline void json_iterator::log_start_value(const char *type) const noexcept
{
  logger::log_line(*this, "+", type, "");
  if (!logger::LOG_ENABLED) { return; }
  logger::log_depth++;
}
|
||||||
|
|
||||||
|
// Reduces the log nesting level (when logging is compiled in), then logs the end of a
// multipart value ("-" prefix).
simdjson_really_inline void json_iterator::log_end_value(const char *type) const noexcept
{
  if (logger::LOG_ENABLED) {
    logger::log_depth--;
  }
  logger::log_line(*this, "-", type, "");
}
|
||||||
|
|
||||||
|
simdjson_really_inline void json_iterator::log_error(const char *error) const noexcept {
|
||||||
|
logger::log_line(*this, "", "ERROR", error);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename V>
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept {
|
||||||
|
switch (*value) {
|
||||||
|
case '"': return visitor.visit_root_string(*this, value);
|
||||||
|
case 't': return visitor.visit_root_true_atom(*this, value);
|
||||||
|
case 'f': return visitor.visit_root_false_atom(*this, value);
|
||||||
|
case 'n': return visitor.visit_root_null_atom(*this, value);
|
||||||
|
case '-':
|
||||||
|
case '0': case '1': case '2': case '3': case '4':
|
||||||
|
case '5': case '6': case '7': case '8': case '9':
|
||||||
|
return visitor.visit_root_number(*this, value);
|
||||||
|
default:
|
||||||
|
log_error("Document starts with a non-value character");
|
||||||
|
return TAPE_ERROR;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
template<typename V>
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
|
||||||
|
switch (*value) {
|
||||||
|
case '"': return visitor.visit_string(*this, value);
|
||||||
|
case 't': return visitor.visit_true_atom(*this, value);
|
||||||
|
case 'f': return visitor.visit_false_atom(*this, value);
|
||||||
|
case 'n': return visitor.visit_null_atom(*this, value);
|
||||||
|
case '-':
|
||||||
|
case '0': case '1': case '2': case '3': case '4':
|
||||||
|
case '5': case '6': case '7': case '8': case '9':
|
||||||
|
return visitor.visit_number(*this, value);
|
||||||
|
default:
|
||||||
|
log_error("Non-value found when value was expected!");
|
||||||
|
return TAPE_ERROR;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace stage2
|
||||||
|
} // namespace SIMDJSON_IMPLEMENTATION
|
||||||
|
} // unnamed namespace
|
|
@ -8,7 +8,7 @@ namespace logger {
|
||||||
|
|
||||||
static constexpr const bool LOG_ENABLED = false;
|
static constexpr const bool LOG_ENABLED = false;
|
||||||
static constexpr const int LOG_EVENT_LEN = 20;
|
static constexpr const int LOG_EVENT_LEN = 20;
|
||||||
static constexpr const int LOG_BUFFER_LEN = 10;
|
static constexpr const int LOG_BUFFER_LEN = 30;
|
||||||
static constexpr const int LOG_SMALL_BUFFER_LEN = 10;
|
static constexpr const int LOG_SMALL_BUFFER_LEN = 10;
|
||||||
static constexpr const int LOG_INDEX_LEN = 5;
|
static constexpr const int LOG_INDEX_LEN = 5;
|
||||||
|
|
||||||
|
@ -33,12 +33,6 @@ namespace logger {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static simdjson_really_inline void log_string(const char *message) {
|
|
||||||
if (LOG_ENABLED) {
|
|
||||||
printf("%s\n", message);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Logs a single line of
|
// Logs a single line of
|
||||||
template<typename S>
|
template<typename S>
|
||||||
static simdjson_really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
|
static simdjson_really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
|
||||||
|
|
|
@ -7,15 +7,15 @@ namespace stage2 {
|
||||||
namespace numberparsing {
|
namespace numberparsing {
|
||||||
|
|
||||||
#ifdef JSON_TEST_NUMBERS
|
#ifdef JSON_TEST_NUMBERS
|
||||||
#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), false)
|
#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), NUMBER_ERROR)
|
||||||
#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), writer.append_s64((VALUE)))
|
#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), (WRITER).append_s64((VALUE)))
|
||||||
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), writer.append_u64((VALUE)))
|
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), (WRITER).append_u64((VALUE)))
|
||||||
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), writer.append_double((VALUE)))
|
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), (WRITER).append_double((VALUE)))
|
||||||
#else
|
#else
|
||||||
#define INVALID_NUMBER(SRC) (false)
|
#define INVALID_NUMBER(SRC) (NUMBER_ERROR)
|
||||||
#define WRITE_INTEGER(VALUE, SRC, WRITER) writer.append_s64((VALUE))
|
#define WRITE_INTEGER(VALUE, SRC, WRITER) (WRITER).append_s64((VALUE))
|
||||||
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) writer.append_u64((VALUE))
|
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (WRITER).append_u64((VALUE))
|
||||||
#define WRITE_DOUBLE(VALUE, SRC, WRITER) writer.append_double((VALUE))
|
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (WRITER).append_double((VALUE))
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Attempts to compute i * 10^(power) exactly; and if "negative" is
|
// Attempts to compute i * 10^(power) exactly; and if "negative" is
|
||||||
|
@ -24,7 +24,7 @@ namespace numberparsing {
|
||||||
// set to false. This should work *most of the time* (like 99% of the time).
|
// set to false. This should work *most of the time* (like 99% of the time).
|
||||||
// We assume that power is in the [FASTFLOAT_SMALLEST_POWER,
|
// We assume that power is in the [FASTFLOAT_SMALLEST_POWER,
|
||||||
// FASTFLOAT_LARGEST_POWER] interval: the caller is responsible for this check.
|
// FASTFLOAT_LARGEST_POWER] interval: the caller is responsible for this check.
|
||||||
simdjson_really_inline double compute_float_64(int64_t power, uint64_t i, bool negative, bool *success) {
|
simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, double &d) {
|
||||||
// we start with a fast path
|
// we start with a fast path
|
||||||
// It was described in
|
// It was described in
|
||||||
// Clinger WD. How to read floating point numbers accurately.
|
// Clinger WD. How to read floating point numbers accurately.
|
||||||
|
@ -40,7 +40,7 @@ simdjson_really_inline double compute_float_64(int64_t power, uint64_t i, bool n
|
||||||
#endif
|
#endif
|
||||||
// convert the integer into a double. This is lossless since
|
// convert the integer into a double. This is lossless since
|
||||||
// 0 <= i <= 2^53 - 1.
|
// 0 <= i <= 2^53 - 1.
|
||||||
double d = double(i);
|
d = double(i);
|
||||||
//
|
//
|
||||||
// The general idea is as follows.
|
// The general idea is as follows.
|
||||||
// If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then
|
// If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then
|
||||||
|
@ -59,8 +59,7 @@ simdjson_really_inline double compute_float_64(int64_t power, uint64_t i, bool n
|
||||||
if (negative) {
|
if (negative) {
|
||||||
d = -d;
|
d = -d;
|
||||||
}
|
}
|
||||||
*success = true;
|
return true;
|
||||||
return d;
|
|
||||||
}
|
}
|
||||||
// When 22 < power && power < 22 + 16, we could
|
// When 22 < power && power < 22 + 16, we could
|
||||||
// hope for another, secondary fast path. It wa
|
// hope for another, secondary fast path. It wa
|
||||||
|
@ -85,7 +84,8 @@ simdjson_really_inline double compute_float_64(int64_t power, uint64_t i, bool n
|
||||||
// In the slow path, we need to adjust i so that it is > 1<<63 which is always
|
// In the slow path, we need to adjust i so that it is > 1<<63 which is always
|
||||||
// possible, except if i == 0, so we handle i == 0 separately.
|
// possible, except if i == 0, so we handle i == 0 separately.
|
||||||
if(i == 0) {
|
if(i == 0) {
|
||||||
return 0.0;
|
d = 0.0;
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// We are going to need to do some 64-bit arithmetic to get a more precise product.
|
// We are going to need to do some 64-bit arithmetic to get a more precise product.
|
||||||
|
@ -135,8 +135,7 @@ simdjson_really_inline double compute_float_64(int64_t power, uint64_t i, bool n
|
||||||
// This does happen, e.g. with 7.3177701707893310e+15.
|
// This does happen, e.g. with 7.3177701707893310e+15.
|
||||||
if (((product_middle + 1 == 0) && ((product_high & 0x1FF) == 0x1FF) &&
|
if (((product_middle + 1 == 0) && ((product_high & 0x1FF) == 0x1FF) &&
|
||||||
(product_low + i < product_low))) { // let us be prudent and bail out.
|
(product_low + i < product_low))) { // let us be prudent and bail out.
|
||||||
*success = false;
|
return false;
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
upper = product_high;
|
upper = product_high;
|
||||||
lower = product_middle;
|
lower = product_middle;
|
||||||
|
@ -174,8 +173,7 @@ simdjson_really_inline double compute_float_64(int64_t power, uint64_t i, bool n
|
||||||
// Note: because the factor_mantissa and factor_mantissa_low are
|
// Note: because the factor_mantissa and factor_mantissa_low are
|
||||||
// almost always rounded down (except for small positive powers),
|
// almost always rounded down (except for small positive powers),
|
||||||
// almost always should round up.
|
// almost always should round up.
|
||||||
*success = false;
|
return false;
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
mantissa += mantissa & 1;
|
mantissa += mantissa & 1;
|
||||||
|
@ -193,15 +191,12 @@ simdjson_really_inline double compute_float_64(int64_t power, uint64_t i, bool n
|
||||||
uint64_t real_exponent = c.exp - lz;
|
uint64_t real_exponent = c.exp - lz;
|
||||||
// we have to check that real_exponent is in range, otherwise we bail out
|
// we have to check that real_exponent is in range, otherwise we bail out
|
||||||
if (simdjson_unlikely((real_exponent < 1) || (real_exponent > 2046))) {
|
if (simdjson_unlikely((real_exponent < 1) || (real_exponent > 2046))) {
|
||||||
*success = false;
|
return false;
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
mantissa |= real_exponent << 52;
|
mantissa |= real_exponent << 52;
|
||||||
mantissa |= (((uint64_t)negative) << 63);
|
mantissa |= (((uint64_t)negative) << 63);
|
||||||
double d;
|
|
||||||
memcpy(&d, &mantissa, sizeof(d));
|
memcpy(&d, &mantissa, sizeof(d));
|
||||||
*success = true;
|
return true;
|
||||||
return d;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool parse_float_strtod(const uint8_t *ptr, double *outDouble) {
|
static bool parse_float_strtod(const uint8_t *ptr, double *outDouble) {
|
||||||
|
@ -252,11 +247,11 @@ simdjson_really_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) {
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename W>
|
template<typename W>
|
||||||
bool slow_float_parsing(SIMDJSON_UNUSED const uint8_t * src, W writer) {
|
error_code slow_float_parsing(SIMDJSON_UNUSED const uint8_t * src, W writer) {
|
||||||
double d;
|
double d;
|
||||||
if (parse_float_strtod(src, &d)) {
|
if (parse_float_strtod(src, &d)) {
|
||||||
WRITE_DOUBLE(d, src, writer);
|
writer.append_double(d);
|
||||||
return true;
|
return SUCCESS;
|
||||||
}
|
}
|
||||||
return INVALID_NUMBER(src);
|
return INVALID_NUMBER(src);
|
||||||
}
|
}
|
||||||
|
@ -273,7 +268,7 @@ simdjson_really_inline bool parse_digit(const uint8_t c, I &i) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
simdjson_really_inline bool parse_decimal(SIMDJSON_UNUSED const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) {
|
simdjson_really_inline error_code parse_decimal(SIMDJSON_UNUSED const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) {
|
||||||
// we continue with the fiction that we have an integer. If the
|
// we continue with the fiction that we have an integer. If the
|
||||||
// floating point number is representable as x * 10^z for some integer
|
// floating point number is representable as x * 10^z for some integer
|
||||||
// z that fits in 53 bits, then we will be able to convert back the
|
// z that fits in 53 bits, then we will be able to convert back the
|
||||||
|
@ -296,10 +291,10 @@ simdjson_really_inline bool parse_decimal(SIMDJSON_UNUSED const uint8_t *const s
|
||||||
if (exponent == 0) {
|
if (exponent == 0) {
|
||||||
return INVALID_NUMBER(src);
|
return INVALID_NUMBER(src);
|
||||||
}
|
}
|
||||||
return true;
|
return SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
simdjson_really_inline bool parse_exponent(SIMDJSON_UNUSED const uint8_t *const src, const uint8_t *&p, int64_t &exponent) {
|
simdjson_really_inline error_code parse_exponent(SIMDJSON_UNUSED const uint8_t *const src, const uint8_t *&p, int64_t &exponent) {
|
||||||
// Exp Sign: -123.456e[-]78
|
// Exp Sign: -123.456e[-]78
|
||||||
bool neg_exp = ('-' == *p);
|
bool neg_exp = ('-' == *p);
|
||||||
if (neg_exp || '+' == *p) { p++; } // Skip + as well
|
if (neg_exp || '+' == *p) { p++; } // Skip + as well
|
||||||
|
@ -312,11 +307,11 @@ simdjson_really_inline bool parse_exponent(SIMDJSON_UNUSED const uint8_t *const
|
||||||
// In particular, it could overflow to INT64_MIN, and we cannot do - INT64_MIN.
|
// In particular, it could overflow to INT64_MIN, and we cannot do - INT64_MIN.
|
||||||
// Thus we *must* check for possible overflow before we negate exp_number.
|
// Thus we *must* check for possible overflow before we negate exp_number.
|
||||||
|
|
||||||
// Performance notes: it may seem like combining the two "unlikely checks" below into
|
// Performance notes: it may seem like combining the two "simdjson_unlikely checks" below into
|
||||||
// a single unlikely path would be faster. The reasoning is sound, but the compiler may
|
// a single simdjson_unlikely path would be faster. The reasoning is sound, but the compiler may
|
||||||
// not oblige and may, in fact, generate two distinct paths in any case. It might be
|
// not oblige and may, in fact, generate two distinct paths in any case. It might be
|
||||||
// possible to do uint64_t(p - start_exp - 1) >= 18 but it could end up trading off
|
// possible to do uint64_t(p - start_exp - 1) >= 18 but it could end up trading off
|
||||||
// instructions for a likely branch, an unconclusive gain.
|
// instructions for a likely branch, an inconclusive gain.
|
||||||
|
|
||||||
// If there were no digits, it's an error.
|
// If there were no digits, it's an error.
|
||||||
if (simdjson_unlikely(p == start_exp)) {
|
if (simdjson_unlikely(p == start_exp)) {
|
||||||
|
@ -347,16 +342,10 @@ simdjson_really_inline bool parse_exponent(SIMDJSON_UNUSED const uint8_t *const
|
||||||
// is bounded in magnitude by the size of the JSON input, we are fine in this universe.
|
// is bounded in magnitude by the size of the JSON input, we are fine in this universe.
|
||||||
// To sum it up: the next line should never overflow.
|
// To sum it up: the next line should never overflow.
|
||||||
exponent += (neg_exp ? -exp_number : exp_number);
|
exponent += (neg_exp ? -exp_number : exp_number);
|
||||||
return true;
|
return SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename W>
|
simdjson_really_inline int significant_digits(const uint8_t * start_digits, int digit_count) {
|
||||||
simdjson_really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, int digit_count, int64_t exponent, W &writer) {
|
|
||||||
// If we frequently had to deal with long strings of digits,
|
|
||||||
// we could extend our code by using a 128-bit integer instead
|
|
||||||
// of a 64-bit integer. However, this is uncommon in practice.
|
|
||||||
// digit count is off by 1 because of the decimal (assuming there was one).
|
|
||||||
if (simdjson_unlikely((digit_count-1 >= 19))) { // this is uncommon
|
|
||||||
// It is possible that the integer had an overflow.
|
// It is possible that the integer had an overflow.
|
||||||
// We have to handle the case where we have 0.0000somenumber.
|
// We have to handle the case where we have 0.0000somenumber.
|
||||||
const uint8_t *start = start_digits;
|
const uint8_t *start = start_digits;
|
||||||
|
@ -364,8 +353,16 @@ simdjson_really_inline bool write_float(const uint8_t *const src, bool negative,
|
||||||
start++;
|
start++;
|
||||||
}
|
}
|
||||||
// we over-decrement by one when there is a '.'
|
// we over-decrement by one when there is a '.'
|
||||||
digit_count -= int(start - start_digits);
|
return digit_count - int(start - start_digits);
|
||||||
if (digit_count >= 19) {
|
}
|
||||||
|
|
||||||
|
template<typename W>
|
||||||
|
simdjson_really_inline error_code write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, int digit_count, int64_t exponent, W &writer) {
|
||||||
|
// If we frequently had to deal with long strings of digits,
|
||||||
|
// we could extend our code by using a 128-bit integer instead
|
||||||
|
// of a 64-bit integer. However, this is uncommon in practice.
|
||||||
|
// digit count is off by 1 because of the decimal (assuming there was one).
|
||||||
|
if (simdjson_unlikely(digit_count-1 >= 19 && significant_digits(start_digits, digit_count) >= 19)) {
|
||||||
// Ok, chances are good that we had an overflow!
|
// Ok, chances are good that we had an overflow!
|
||||||
// this is almost never going to get called!!!
|
// this is almost never going to get called!!!
|
||||||
// we start anew, going slowly!!!
|
// we start anew, going slowly!!!
|
||||||
|
@ -373,42 +370,46 @@ simdjson_really_inline bool write_float(const uint8_t *const src, bool negative,
|
||||||
// 10000000000000000000000000000000000000000000e+308
|
// 10000000000000000000000000000000000000000000e+308
|
||||||
// 3.1415926535897932384626433832795028841971693993751
|
// 3.1415926535897932384626433832795028841971693993751
|
||||||
//
|
//
|
||||||
bool success = slow_float_parsing(src, writer);
|
// NOTE: This makes a *copy* of the writer and passes it to slow_float_parsing. This happens
|
||||||
// The number was already written, but we made a copy of the writer
|
// because slow_float_parsing is a non-inlined function. If we passed our writer reference to
|
||||||
// when we passed it to the parse_large_integer() function, so
|
// it, it would force it to be stored in memory, preventing the compiler from picking it apart
|
||||||
|
// and putting into registers. i.e. if we pass it as reference, it gets slow.
|
||||||
|
// This is what forces the skip_double, as well.
|
||||||
|
error_code error = slow_float_parsing(src, writer);
|
||||||
writer.skip_double();
|
writer.skip_double();
|
||||||
return success;
|
return error;
|
||||||
}
|
}
|
||||||
}
|
// NOTE: it's weird that the simdjson_unlikely() only wraps half the if, but it seems to get slower any other
|
||||||
// NOTE: it's weird that the unlikely() only wraps half the if, but it seems to get slower any other
|
|
||||||
// way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331
|
// way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331
|
||||||
// To future reader: we'd love if someone found a better way, or at least could explain this result!
|
// To future reader: we'd love if someone found a better way, or at least could explain this result!
|
||||||
if (simdjson_unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || (exponent > FASTFLOAT_LARGEST_POWER)) {
|
if (simdjson_unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || (exponent > FASTFLOAT_LARGEST_POWER)) {
|
||||||
// this is almost never going to get called!!!
|
// this is almost never going to get called!!!
|
||||||
// we start anew, going slowly!!!
|
// we start anew, going slowly!!!
|
||||||
bool success = slow_float_parsing(src, writer);
|
// NOTE: This makes a *copy* of the writer and passes it to slow_float_parsing. This happens
|
||||||
// The number was already written, but we made a copy of the writer when we passed it to the
|
// because slow_float_parsing is a non-inlined function. If we passed our writer reference to
|
||||||
// slow_float_parsing() function, so we have to skip those tape spots now that we've returned
|
// it, it would force it to be stored in memory, preventing the compiler from picking it apart
|
||||||
|
// and putting into registers. i.e. if we pass it as reference, it gets slow.
|
||||||
|
// This is what forces the skip_double, as well.
|
||||||
|
error_code error = slow_float_parsing(src, writer);
|
||||||
writer.skip_double();
|
writer.skip_double();
|
||||||
return success;
|
return error;
|
||||||
}
|
}
|
||||||
bool success = true;
|
double d;
|
||||||
double d = compute_float_64(exponent, i, negative, &success);
|
if (!compute_float_64(exponent, i, negative, d)) {
|
||||||
if (!success) {
|
|
||||||
// we are almost never going to get here.
|
// we are almost never going to get here.
|
||||||
if (!parse_float_strtod(src, &d)) { return INVALID_NUMBER(src); }
|
if (!parse_float_strtod(src, &d)) { return INVALID_NUMBER(src); }
|
||||||
}
|
}
|
||||||
WRITE_DOUBLE(d, src, writer);
|
WRITE_DOUBLE(d, src, writer);
|
||||||
return true;
|
return SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
// for performance analysis, it is sometimes useful to skip parsing
|
// for performance analysis, it is sometimes useful to skip parsing
|
||||||
#ifdef SIMDJSON_SKIPNUMBERPARSING
|
#ifdef SIMDJSON_SKIPNUMBERPARSING
|
||||||
|
|
||||||
template<typename W>
|
template<typename W>
|
||||||
simdjson_really_inline bool parse_number(const uint8_t *const, W &writer) {
|
simdjson_really_inline error_code parse_number(const uint8_t *const, W &writer) {
|
||||||
writer.append_s64(0); // always write zero
|
writer.append_s64(0); // always write zero
|
||||||
return true; // always succeeds
|
return SUCCESS; // always succeeds
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
@ -423,7 +424,7 @@ simdjson_really_inline bool parse_number(const uint8_t *const, W &writer) {
|
||||||
//
|
//
|
||||||
// Our objective is accurate parsing (ULP of 0) at high speed.
|
// Our objective is accurate parsing (ULP of 0) at high speed.
|
||||||
template<typename W>
|
template<typename W>
|
||||||
simdjson_really_inline bool parse_number(const uint8_t *const src, W &writer) {
|
simdjson_really_inline error_code parse_number(const uint8_t *const src, W &writer) {
|
||||||
|
|
||||||
//
|
//
|
||||||
// Check for minus sign
|
// Check for minus sign
|
||||||
|
@ -451,17 +452,19 @@ simdjson_really_inline bool parse_number(const uint8_t *const src, W &writer) {
|
||||||
if ('.' == *p) {
|
if ('.' == *p) {
|
||||||
is_float = true;
|
is_float = true;
|
||||||
++p;
|
++p;
|
||||||
if (!parse_decimal(src, p, i, exponent)) { return false; }
|
SIMDJSON_TRY( parse_decimal(src, p, i, exponent) );
|
||||||
digit_count = int(p - start_digits); // used later to guard against overflows
|
digit_count = int(p - start_digits); // used later to guard against overflows
|
||||||
}
|
}
|
||||||
if (('e' == *p) || ('E' == *p)) {
|
if (('e' == *p) || ('E' == *p)) {
|
||||||
is_float = true;
|
is_float = true;
|
||||||
++p;
|
++p;
|
||||||
if (!parse_exponent(src, p, exponent)) { return false; }
|
SIMDJSON_TRY( parse_exponent(src, p, exponent) );
|
||||||
}
|
}
|
||||||
if (is_float) {
|
if (is_float) {
|
||||||
const bool clean_end = is_structural_or_whitespace(*p);
|
const bool clean_end = is_structural_or_whitespace(*p);
|
||||||
return write_float(src, negative, i, start_digits, digit_count, exponent, writer) && clean_end;
|
SIMDJSON_TRY( write_float(src, negative, i, start_digits, digit_count, exponent, writer) );
|
||||||
|
if (!clean_end) { return INVALID_NUMBER(src); }
|
||||||
|
return SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
// The longest negative 64-bit number is 19 digits.
|
// The longest negative 64-bit number is 19 digits.
|
||||||
|
@ -470,13 +473,12 @@ simdjson_really_inline bool parse_number(const uint8_t *const src, W &writer) {
|
||||||
int longest_digit_count = negative ? 19 : 20;
|
int longest_digit_count = negative ? 19 : 20;
|
||||||
if (digit_count > longest_digit_count) { return INVALID_NUMBER(src); }
|
if (digit_count > longest_digit_count) { return INVALID_NUMBER(src); }
|
||||||
if (digit_count == longest_digit_count) {
|
if (digit_count == longest_digit_count) {
|
||||||
if(negative) {
|
if (negative) {
|
||||||
// Anything negative above INT64_MAX+1 is invalid
|
// Anything negative above INT64_MAX+1 is invalid
|
||||||
if (i > uint64_t(INT64_MAX)+1) {
|
if (i > uint64_t(INT64_MAX)+1) { return INVALID_NUMBER(src); }
|
||||||
return INVALID_NUMBER(src);
|
|
||||||
}
|
|
||||||
WRITE_INTEGER(~i+1, src, writer);
|
WRITE_INTEGER(~i+1, src, writer);
|
||||||
return is_structural_or_whitespace(*p);
|
if (!is_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
|
||||||
|
return SUCCESS;
|
||||||
// Positive overflow check:
|
// Positive overflow check:
|
||||||
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
||||||
// biggest uint64_t.
|
// biggest uint64_t.
|
||||||
|
@ -498,9 +500,230 @@ simdjson_really_inline bool parse_number(const uint8_t *const src, W &writer) {
|
||||||
} else {
|
} else {
|
||||||
WRITE_INTEGER(negative ? (~i+1) : i, src, writer);
|
WRITE_INTEGER(negative ? (~i+1) : i, src, writer);
|
||||||
}
|
}
|
||||||
return is_structural_or_whitespace(*p);
|
if (!is_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
|
||||||
|
return SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SAX functions
|
||||||
|
namespace {
|
||||||
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
||||||
|
SIMDJSON_UNUSED simdjson_really_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept {
|
||||||
|
const uint8_t *p = src;
|
||||||
|
|
||||||
|
//
|
||||||
|
// Parse the integer part.
|
||||||
|
//
|
||||||
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
||||||
|
const uint8_t *const start_digits = p;
|
||||||
|
uint64_t i = 0;
|
||||||
|
while (parse_digit(*p, i)) { p++; }
|
||||||
|
|
||||||
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
||||||
|
int digit_count = int(p - start_digits);
|
||||||
|
if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return NUMBER_ERROR; }
|
||||||
|
if (!is_structural_or_whitespace(*p)) { return NUMBER_ERROR; }
|
||||||
|
|
||||||
|
// The longest positive 64-bit number is 20 digits.
|
||||||
|
// We do it this way so we don't trigger this branch unless we must.
|
||||||
|
if (digit_count > 20) { return NUMBER_ERROR; }
|
||||||
|
if (digit_count == 20) {
|
||||||
|
// Positive overflow check:
|
||||||
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
||||||
|
// biggest uint64_t.
|
||||||
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
||||||
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
||||||
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
||||||
|
// than 1,553,255,926,290,448,384.
|
||||||
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
||||||
|
// 10,000,000,000,000,000,000.
|
||||||
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
||||||
|
// - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX).
|
||||||
|
//
|
||||||
|
if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return NUMBER_ERROR; }
|
||||||
|
}
|
||||||
|
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
||||||
|
// Call this version of the method if you regularly expect 8- or 16-digit numbers.
|
||||||
|
SIMDJSON_UNUSED simdjson_really_inline simdjson_result<uint64_t> parse_large_unsigned(const uint8_t * const src) noexcept {
|
||||||
|
const uint8_t *p = src;
|
||||||
|
|
||||||
|
//
|
||||||
|
// Parse the integer part.
|
||||||
|
//
|
||||||
|
uint64_t i = 0;
|
||||||
|
if (is_made_of_eight_digits_fast(p)) {
|
||||||
|
i = i * 100000000 + parse_eight_digits_unrolled(p);
|
||||||
|
p += 8;
|
||||||
|
if (is_made_of_eight_digits_fast(p)) {
|
||||||
|
i = i * 100000000 + parse_eight_digits_unrolled(p);
|
||||||
|
p += 8;
|
||||||
|
if (parse_digit(*p, i)) { // digit 17
|
||||||
|
p++;
|
||||||
|
if (parse_digit(*p, i)) { // digit 18
|
||||||
|
p++;
|
||||||
|
if (parse_digit(*p, i)) { // digit 19
|
||||||
|
p++;
|
||||||
|
if (parse_digit(*p, i)) { // digit 20
|
||||||
|
p++;
|
||||||
|
if (parse_digit(*p, i)) { return NUMBER_ERROR; } // 21 digits is an error
|
||||||
|
// Positive overflow check:
|
||||||
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
||||||
|
// biggest uint64_t.
|
||||||
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
||||||
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
||||||
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
||||||
|
// than 1,553,255,926,290,448,384.
|
||||||
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
||||||
|
// 10,000,000,000,000,000,000.
|
||||||
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
||||||
|
// - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX).
|
||||||
|
//
|
||||||
|
if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return NUMBER_ERROR; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} // 16 digits
|
||||||
|
} else { // 8 digits
|
||||||
|
// Less than 8 digits can't overflow, simpler logic here.
|
||||||
|
if (parse_digit(*p, i)) { p++; } else { return NUMBER_ERROR; }
|
||||||
|
while (parse_digit(*p, i)) { p++; }
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!is_structural_or_whitespace(*p)) { return NUMBER_ERROR; }
|
||||||
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
||||||
|
int digit_count = int(p - src);
|
||||||
|
if (digit_count == 0 || ('0' == *src && digit_count > 1)) { return NUMBER_ERROR; }
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
||||||
|
SIMDJSON_UNUSED simdjson_really_inline simdjson_result<int64_t> parse_integer(const uint8_t *src) noexcept {
|
||||||
|
//
|
||||||
|
// Check for minus sign
|
||||||
|
//
|
||||||
|
bool negative = (*src == '-');
|
||||||
|
const uint8_t *p = src + negative;
|
||||||
|
|
||||||
|
//
|
||||||
|
// Parse the integer part.
|
||||||
|
//
|
||||||
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
||||||
|
const uint8_t *const start_digits = p;
|
||||||
|
uint64_t i = 0;
|
||||||
|
while (parse_digit(*p, i)) { p++; }
|
||||||
|
|
||||||
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
||||||
|
int digit_count = int(p - start_digits);
|
||||||
|
if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return NUMBER_ERROR; }
|
||||||
|
if (!is_structural_or_whitespace(*p)) { return NUMBER_ERROR; }
|
||||||
|
|
||||||
|
// The longest negative 64-bit number is 19 digits.
|
||||||
|
// The longest positive 64-bit number is 20 digits.
|
||||||
|
// We do it this way so we don't trigger this branch unless we must.
|
||||||
|
int longest_digit_count = negative ? 19 : 20;
|
||||||
|
if (digit_count > longest_digit_count) { return NUMBER_ERROR; }
|
||||||
|
if (digit_count == longest_digit_count) {
|
||||||
|
if(negative) {
|
||||||
|
// Anything negative above INT64_MAX+1 is invalid
|
||||||
|
if (i > uint64_t(INT64_MAX)+1) { return NUMBER_ERROR; }
|
||||||
|
return ~i+1;
|
||||||
|
|
||||||
|
// Positive overflow check:
|
||||||
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
||||||
|
// biggest uint64_t.
|
||||||
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
||||||
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
||||||
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
||||||
|
// than 1,553,255,926,290,448,384.
|
||||||
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
||||||
|
// 10,000,000,000,000,000,000.
|
||||||
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
||||||
|
// - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX).
|
||||||
|
//
|
||||||
|
} else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return NUMBER_ERROR; }
|
||||||
|
}
|
||||||
|
|
||||||
|
return negative ? (~i+1) : i;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Parse the JSON number starting at `src` as a double.
 *
 * Accepted shape: optional '-', integer digits (a leading zero must stand alone), optional '.'
 * followed by at least one digit, optional exponent ('e' or 'E', optional sign, 1 to 19 digits).
 * The character that follows the number is not validated here.
 *
 * All integer and fraction digits are accumulated into one uint64_t. When that accumulator may
 * have wrapped, when the decimal exponent falls outside
 * [FASTFLOAT_SMALLEST_POWER, FASTFLOAT_LARGEST_POWER], or when compute_float_64() reports
 * failure, the original text (including the sign) is handed to parse_float_strtod() instead.
 *
 * @param src Pointer to the first character of the number (the '-' when there is one).
 * @return The parsed value, or NUMBER_ERROR when the text is malformed or parse_float_strtod()
 *         fails.
 */
SIMDJSON_UNUSED simdjson_really_inline simdjson_result<double> parse_double(const uint8_t * src) noexcept {
  //
  // Check for minus sign
  //
  bool negative = (*src == '-');
  src += negative;

  //
  // Parse the integer part.
  //
  uint64_t i = 0;
  const uint8_t *p = src;
  p += parse_digit(*p, i);
  bool leading_zero = (i == 0);
  while (parse_digit(*p, i)) { p++; }
  // no integer digits, or 0123 (zero must be solo)
  if ( p == src || (leading_zero && p != src+1)) { return NUMBER_ERROR; }

  //
  // Parse the decimal part.
  //
  int64_t exponent = 0;
  bool overflow;
  if (simdjson_likely(*p == '.')) {
    p++;
    const uint8_t *start_decimal_digits = p;
    if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits
    p++;
    while (parse_digit(*p, i)) { p++; }
    // Every fraction digit was folded into i, so scale back down by that many powers of ten.
    exponent = -(p - start_decimal_digits);

    // Overflow check. 19 digits (minus the decimal) may be overflow.
    overflow = p-src-1 >= 19;
    if (simdjson_unlikely(overflow && leading_zero)) {
      // The text starts with "0.": zeros right after the point do not change i, so only the
      // digits from the first non-zero fraction digit up to p can make the accumulator wrap.
      // Count those digits (p - start_digits), not the length of the skipped "0.000" prefix.
      const uint8_t *start_digits = src + 2;
      while (*start_digits == '0') { start_digits++; }
      overflow = p-start_digits >= 19;
    }
  } else {
    overflow = p-src >= 19;
  }

  //
  // Parse the exponent
  //
  if (*p == 'e' || *p == 'E') {
    p++;
    bool exp_neg = *p == '-';
    p += exp_neg || *p == '+';

    uint64_t exp = 0;
    const uint8_t *start_exp_digits = p;
    while (parse_digit(*p, exp)) { p++; }
    // no exp digits, or 20+ exp digits
    if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; }

    exponent += exp_neg ? 0-exp : exp;
    // An exponent outside the fast-path range also forces the slow path below.
    overflow = overflow || exponent < FASTFLOAT_SMALLEST_POWER || exponent > FASTFLOAT_LARGEST_POWER;
  }

  //
  // Assemble (or slow-parse) the float
  //
  double d;
  if (simdjson_likely(!overflow)) {
    if (compute_float_64(exponent, i, negative, d)) { return d; }
  }
  // Slow path: re-parse from the original start of the number, sign included.
  if (!parse_float_strtod(src-negative, &d)) {
    return NUMBER_ERROR;
  }
  return d;
}
|
||||||
|
} //namespace {}
|
||||||
#endif // SIMDJSON_SKIPNUMBERPARSING
|
#endif // SIMDJSON_SKIPNUMBERPARSING
|
||||||
|
|
||||||
} // namespace numberparsing
|
} // namespace numberparsing
|
||||||
|
|
|
@ -119,6 +119,15 @@ SIMDJSON_WARN_UNUSED simdjson_really_inline uint8_t *parse_string(const uint8_t
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Parse the JSON string at `src` into the string buffer and expose the result as a string_view.
 *
 * @param src Pointer to the opening '"' of the string; anything else yields STRING_ERROR.
 * @param current_string_buf_loc In/out write cursor into the string buffer. On success it is
 *        moved to the pointer returned by stringparsing::parse_string(); on failure it is left
 *        untouched.
 * @param s On success, set to view the bytes between the old and the new cursor position.
 * @return SUCCESS, or STRING_ERROR when `src` does not start with '"' or
 *         stringparsing::parse_string() returns nullptr.
 */
SIMDJSON_UNUSED SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_string_to_buffer(const uint8_t *src, uint8_t *&current_string_buf_loc, std::string_view &s) {
  if (src[0] != '"') { return STRING_ERROR; }
  auto end = stringparsing::parse_string(src, current_string_buf_loc);
  if (!end) { return STRING_ERROR; }
  // The view covers exactly what was just written: [old cursor, end).
  s = std::string_view((const char *)current_string_buf_loc, end-current_string_buf_loc);
  current_string_buf_loc = end;
  return SUCCESS;
}
|
||||||
|
|
||||||
} // namespace stringparsing
|
} // namespace stringparsing
|
||||||
} // namespace stage2
|
} // namespace stage2
|
||||||
} // namespace SIMDJSON_IMPLEMENTATION
|
} // namespace SIMDJSON_IMPLEMENTATION
|
||||||
|
|
|
@ -1,245 +0,0 @@
|
||||||
// This file contains the common code every implementation uses for stage2
|
|
||||||
// It is intended to be included multiple times and compiled multiple times
|
|
||||||
// We assume the file in which it is included already includes
|
|
||||||
// "simdjson/stage2.h" (this simplifies amalgamation)
|
|
||||||
|
|
||||||
#include "generic/stage2/logger.h"
|
|
||||||
#include "generic/stage2/structural_iterator.h"
|
|
||||||
|
|
||||||
namespace { // Make everything here private
|
|
||||||
namespace SIMDJSON_IMPLEMENTATION {
|
|
||||||
namespace stage2 {
|
|
||||||
|
|
||||||
// Evaluate EXPR once; if the result tests true (i.e. it is an error), return it from the enclosing function.
#define SIMDJSON_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } }
|
|
||||||
|
|
||||||
struct structural_parser : structural_iterator {
|
|
||||||
/** Current depth (nested objects and arrays) */
|
|
||||||
uint32_t depth{0};
|
|
||||||
|
|
||||||
template<bool STREAMING, typename T>
|
|
||||||
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse(T &builder) noexcept;
|
|
||||||
template<bool STREAMING, typename T>
|
|
||||||
SIMDJSON_WARN_UNUSED static simdjson_really_inline error_code parse(dom_parser_implementation &dom_parser, T &builder) noexcept {
|
|
||||||
structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
|
|
||||||
return parser.parse<STREAMING>(builder);
|
|
||||||
}
|
|
||||||
|
|
||||||
// For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations
|
|
||||||
simdjson_really_inline structural_parser(dom_parser_implementation &_dom_parser, uint32_t start_structural_index)
|
|
||||||
: structural_iterator(_dom_parser, start_structural_index) {
|
|
||||||
}
|
|
||||||
|
|
||||||
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code start_document() {
|
|
||||||
dom_parser.is_array[depth] = false;
|
|
||||||
return SUCCESS;
|
|
||||||
}
|
|
||||||
template<typename T>
|
|
||||||
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code start_array(T &builder) {
|
|
||||||
depth++;
|
|
||||||
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
|
|
||||||
builder.start_array(*this);
|
|
||||||
dom_parser.is_array[depth] = true;
|
|
||||||
return SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
template<typename T>
|
|
||||||
SIMDJSON_WARN_UNUSED simdjson_really_inline bool empty_object(T &builder) {
|
|
||||||
if (peek_next_char() == '}') {
|
|
||||||
advance_char();
|
|
||||||
builder.empty_object(*this);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
template<typename T>
|
|
||||||
SIMDJSON_WARN_UNUSED simdjson_really_inline bool empty_array(T &builder) {
|
|
||||||
if (peek_next_char() == ']') {
|
|
||||||
advance_char();
|
|
||||||
builder.empty_array(*this);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
template<bool STREAMING>
|
|
||||||
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code finish() {
|
|
||||||
dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]);
|
|
||||||
|
|
||||||
if (depth != 0) {
|
|
||||||
log_error("Unclosed objects or arrays!");
|
|
||||||
return TAPE_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
// If we didn't make it to the end, it's an error
|
|
||||||
if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) {
|
|
||||||
logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
|
|
||||||
return TAPE_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
return SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
simdjson_really_inline uint8_t last_structural() {
|
|
||||||
return buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]];
|
|
||||||
}
|
|
||||||
|
|
||||||
simdjson_really_inline void log_value(const char *type) {
|
|
||||||
logger::log_line(*this, "", type, "");
|
|
||||||
}
|
|
||||||
|
|
||||||
simdjson_really_inline void log_start_value(const char *type) {
|
|
||||||
logger::log_line(*this, "+", type, "");
|
|
||||||
if (logger::LOG_ENABLED) { logger::log_depth++; }
|
|
||||||
}
|
|
||||||
|
|
||||||
simdjson_really_inline void log_end_value(const char *type) {
|
|
||||||
if (logger::LOG_ENABLED) { logger::log_depth--; }
|
|
||||||
logger::log_line(*this, "-", type, "");
|
|
||||||
}
|
|
||||||
|
|
||||||
simdjson_really_inline void log_error(const char *error) {
|
|
||||||
logger::log_line(*this, "", "ERROR", error);
|
|
||||||
}
|
|
||||||
}; // struct structural_parser
|
|
||||||
|
|
||||||
template<bool STREAMING, typename T>
|
|
||||||
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code structural_parser::parse(T &builder) noexcept {
|
|
||||||
logger::log_start();
|
|
||||||
|
|
||||||
//
|
|
||||||
// Start the document
|
|
||||||
//
|
|
||||||
if (at_end()) { return EMPTY; }
|
|
||||||
SIMDJSON_TRY( start_document() );
|
|
||||||
builder.start_document(*this);
|
|
||||||
|
|
||||||
//
|
|
||||||
// Read first value
|
|
||||||
//
|
|
||||||
{
|
|
||||||
const uint8_t *value = advance();
|
|
||||||
|
|
||||||
// Make sure the outer hash or array is closed before continuing; otherwise, there are ways we
|
|
||||||
// could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906
|
|
||||||
if (!STREAMING) {
|
|
||||||
switch (*value) {
|
|
||||||
case '{':
|
|
||||||
if (last_structural() != '}') {
|
|
||||||
return TAPE_ERROR;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case '[':
|
|
||||||
if (last_structural() != ']') {
|
|
||||||
return TAPE_ERROR;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
switch (*value) {
|
|
||||||
case '{': if (!empty_object(builder)) { goto object_begin; }; break;
|
|
||||||
case '[': if (!empty_array(builder)) { goto array_begin; }; break;
|
|
||||||
default: SIMDJSON_TRY( builder.parse_root_primitive(*this, value) );
|
|
||||||
}
|
|
||||||
goto document_end;
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// Object parser states
|
|
||||||
//
|
|
||||||
object_begin: {
|
|
||||||
depth++;
|
|
||||||
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
|
|
||||||
builder.start_object(*this);
|
|
||||||
dom_parser.is_array[depth] = false;
|
|
||||||
|
|
||||||
const uint8_t *key = advance();
|
|
||||||
if (*key != '"') {
|
|
||||||
log_error("Object does not start with a key");
|
|
||||||
return TAPE_ERROR;
|
|
||||||
}
|
|
||||||
builder.increment_count(*this);
|
|
||||||
SIMDJSON_TRY( builder.parse_key(*this, key) );
|
|
||||||
goto object_field;
|
|
||||||
} // object_begin:
|
|
||||||
|
|
||||||
object_field: {
|
|
||||||
if (simdjson_unlikely( advance_char() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; }
|
|
||||||
const uint8_t *value = advance();
|
|
||||||
switch (*value) {
|
|
||||||
case '{': if (!empty_object(builder)) { goto object_begin; }; break;
|
|
||||||
case '[': if (!empty_array(builder)) { goto array_begin; }; break;
|
|
||||||
default: SIMDJSON_TRY( builder.parse_primitive(*this, value) );
|
|
||||||
}
|
|
||||||
} // object_field:
|
|
||||||
|
|
||||||
object_continue: {
|
|
||||||
switch (advance_char()) {
|
|
||||||
case ',': {
|
|
||||||
builder.increment_count(*this);
|
|
||||||
const uint8_t *key = advance();
|
|
||||||
if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; }
|
|
||||||
SIMDJSON_TRY( builder.parse_key(*this, key) );
|
|
||||||
goto object_field;
|
|
||||||
}
|
|
||||||
case '}':
|
|
||||||
builder.end_object(*this);
|
|
||||||
goto scope_end;
|
|
||||||
default:
|
|
||||||
log_error("No comma between object fields");
|
|
||||||
return TAPE_ERROR;
|
|
||||||
}
|
|
||||||
} // object_continue:
|
|
||||||
|
|
||||||
scope_end: {
|
|
||||||
depth--;
|
|
||||||
if (depth == 0) { goto document_end; }
|
|
||||||
if (dom_parser.is_array[depth]) { goto array_continue; }
|
|
||||||
goto object_continue;
|
|
||||||
} // scope_end:
|
|
||||||
|
|
||||||
//
|
|
||||||
// Array parser states
|
|
||||||
//
|
|
||||||
array_begin: {
|
|
||||||
depth++;
|
|
||||||
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
|
|
||||||
builder.start_array(*this);
|
|
||||||
dom_parser.is_array[depth] = true;
|
|
||||||
|
|
||||||
builder.increment_count(*this);
|
|
||||||
} // array_begin:
|
|
||||||
|
|
||||||
array_value: {
|
|
||||||
const uint8_t *value = advance();
|
|
||||||
switch (*value) {
|
|
||||||
case '{': if (!empty_object(builder)) { goto object_begin; }; break;
|
|
||||||
case '[': if (!empty_array(builder)) { goto array_begin; }; break;
|
|
||||||
default: SIMDJSON_TRY( builder.parse_primitive(*this, value) );
|
|
||||||
}
|
|
||||||
} // array_value:
|
|
||||||
|
|
||||||
array_continue: {
|
|
||||||
switch (advance_char()) {
|
|
||||||
case ',':
|
|
||||||
builder.increment_count(*this);
|
|
||||||
goto array_value;
|
|
||||||
case ']':
|
|
||||||
builder.end_array(*this);
|
|
||||||
goto scope_end;
|
|
||||||
default:
|
|
||||||
log_error("Missing comma between array values");
|
|
||||||
return TAPE_ERROR;
|
|
||||||
}
|
|
||||||
} // array_continue:
|
|
||||||
|
|
||||||
document_end: {
|
|
||||||
builder.end_document(*this);
|
|
||||||
return finish<STREAMING>();
|
|
||||||
} // document_end:
|
|
||||||
|
|
||||||
} // parse_structurals()
|
|
||||||
|
|
||||||
} // namespace stage2
|
|
||||||
} // namespace SIMDJSON_IMPLEMENTATION
|
|
||||||
} // unnamed namespace
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
#include "generic/stage2/json_iterator.h"
|
||||||
#include "generic/stage2/tape_writer.h"
|
#include "generic/stage2/tape_writer.h"
|
||||||
#include "generic/stage2/atomparsing.h"
|
#include "generic/stage2/atomparsing.h"
|
||||||
|
|
||||||
|
@ -6,105 +7,163 @@ namespace SIMDJSON_IMPLEMENTATION {
|
||||||
namespace stage2 {
|
namespace stage2 {
|
||||||
|
|
||||||
struct tape_builder {
|
struct tape_builder {
|
||||||
|
template<bool STREAMING>
|
||||||
|
SIMDJSON_WARN_UNUSED static simdjson_really_inline error_code parse_document(
|
||||||
|
dom_parser_implementation &dom_parser,
|
||||||
|
dom::document &doc) noexcept;
|
||||||
|
|
||||||
|
/** Called when a non-empty document starts. */
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_document_start(json_iterator &iter) noexcept;
|
||||||
|
/** Called when a non-empty document ends without error. */
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_document_end(json_iterator &iter) noexcept;
|
||||||
|
|
||||||
|
/** Called when a non-empty array starts. */
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_array_start(json_iterator &iter) noexcept;
|
||||||
|
/** Called when a non-empty array ends. */
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_array_end(json_iterator &iter) noexcept;
|
||||||
|
/** Called when an empty array is found. */
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_empty_array(json_iterator &iter) noexcept;
|
||||||
|
|
||||||
|
/** Called when a non-empty object starts. */
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_object_start(json_iterator &iter) noexcept;
|
||||||
|
/**
|
||||||
|
* Called when a key in a field is encountered.
|
||||||
|
*
|
||||||
|
* primitive, visit_object_start, visit_empty_object, visit_array_start, or visit_empty_array
|
||||||
|
* will be called after this with the field value.
|
||||||
|
*/
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept;
|
||||||
|
/** Called when a non-empty object ends. */
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_object_end(json_iterator &iter) noexcept;
|
||||||
|
/** Called when an empty object is found. */
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_empty_object(json_iterator &iter) noexcept;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Called when a string, number, boolean or null is found.
|
||||||
|
*/
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept;
|
||||||
|
/**
|
||||||
|
* Called when a string, number, boolean or null is found at the top level of a document (i.e.
|
||||||
|
* when there is no array or object and the entire document is a single string, number, boolean or
|
||||||
|
* null.
|
||||||
|
*
|
||||||
|
* This is separate from primitive() because simdjson's normal primitive parsing routines assume
|
||||||
|
* there is at least one more token after the value, which is only true in an array or object.
|
||||||
|
*/
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept;
|
||||||
|
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept;
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept;
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
||||||
|
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept;
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept;
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
||||||
|
|
||||||
|
/** Called each time a new field or element in an array or object is found. */
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code increment_count(json_iterator &iter) noexcept;
|
||||||
|
|
||||||
/** Next location to write to tape */
|
/** Next location to write to tape */
|
||||||
tape_writer tape;
|
tape_writer tape;
|
||||||
|
private:
|
||||||
/** Next write location in the string buf for stage 2 parsing */
|
/** Next write location in the string buf for stage 2 parsing */
|
||||||
uint8_t *current_string_buf_loc;
|
uint8_t *current_string_buf_loc;
|
||||||
|
|
||||||
simdjson_really_inline tape_builder(dom::document &doc) noexcept : tape{doc.tape.get()}, current_string_buf_loc{doc.string_buf.get()} {}
|
simdjson_really_inline tape_builder(dom::document &doc) noexcept;
|
||||||
|
|
||||||
private:
|
simdjson_really_inline uint32_t next_tape_index(json_iterator &iter) const noexcept;
|
||||||
friend struct structural_parser;
|
simdjson_really_inline void start_container(json_iterator &iter) noexcept;
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
|
||||||
|
simdjson_really_inline uint8_t *on_start_string(json_iterator &iter) noexcept;
|
||||||
|
simdjson_really_inline void on_end_string(uint8_t *dst) noexcept;
|
||||||
|
}; // class tape_builder
|
||||||
|
|
||||||
simdjson_really_inline error_code parse_root_primitive(structural_parser &parser, const uint8_t *value) {
|
template<bool STREAMING>
|
||||||
switch (*value) {
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::parse_document(
|
||||||
case '"': return parse_string(parser, value);
|
dom_parser_implementation &dom_parser,
|
||||||
case 't': return parse_root_true_atom(parser, value);
|
dom::document &doc) noexcept {
|
||||||
case 'f': return parse_root_false_atom(parser, value);
|
dom_parser.doc = &doc;
|
||||||
case 'n': return parse_root_null_atom(parser, value);
|
json_iterator iter(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
|
||||||
case '-':
|
tape_builder builder(doc);
|
||||||
case '0': case '1': case '2': case '3': case '4':
|
return iter.walk_document<STREAMING>(builder);
|
||||||
case '5': case '6': case '7': case '8': case '9':
|
}
|
||||||
return parse_root_number(parser, value);
|
|
||||||
default:
|
|
||||||
parser.log_error("Document starts with a non-value character");
|
|
||||||
return TAPE_ERROR;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
simdjson_really_inline error_code parse_primitive(structural_parser &parser, const uint8_t *value) {
|
|
||||||
switch (*value) {
|
|
||||||
case '"': return parse_string(parser, value);
|
|
||||||
case 't': return parse_true_atom(parser, value);
|
|
||||||
case 'f': return parse_false_atom(parser, value);
|
|
||||||
case 'n': return parse_null_atom(parser, value);
|
|
||||||
case '-':
|
|
||||||
case '0': case '1': case '2': case '3': case '4':
|
|
||||||
case '5': case '6': case '7': case '8': case '9':
|
|
||||||
return parse_number(parser, value);
|
|
||||||
default:
|
|
||||||
parser.log_error("Non-value found when value was expected!");
|
|
||||||
return TAPE_ERROR;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
simdjson_really_inline void empty_object(structural_parser &parser) {
|
|
||||||
parser.log_value("empty object");
|
|
||||||
empty_container(parser, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
|
|
||||||
}
|
|
||||||
simdjson_really_inline void empty_array(structural_parser &parser) {
|
|
||||||
parser.log_value("empty array");
|
|
||||||
empty_container(parser, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
|
|
||||||
}
|
|
||||||
|
|
||||||
simdjson_really_inline void start_document(structural_parser &parser) {
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept {
|
||||||
parser.log_start_value("document");
|
return iter.visit_root_primitive(*this, value);
|
||||||
start_container(parser);
|
}
|
||||||
}
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept {
|
||||||
simdjson_really_inline void start_object(structural_parser &parser) {
|
return iter.visit_primitive(*this, value);
|
||||||
parser.log_start_value("object");
|
}
|
||||||
start_container(parser);
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept {
|
||||||
}
|
return empty_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
|
||||||
simdjson_really_inline void start_array(structural_parser &parser) {
|
}
|
||||||
parser.log_start_value("array");
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept {
|
||||||
start_container(parser);
|
return empty_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
|
||||||
}
|
}
|
||||||
|
|
||||||
simdjson_really_inline void end_object(structural_parser &parser) {
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept {
|
||||||
parser.log_end_value("object");
|
start_container(iter);
|
||||||
end_container(parser, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
|
return SUCCESS;
|
||||||
}
|
}
|
||||||
simdjson_really_inline void end_array(structural_parser &parser) {
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept {
|
||||||
parser.log_end_value("array");
|
start_container(iter);
|
||||||
end_container(parser, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
|
return SUCCESS;
|
||||||
}
|
}
|
||||||
simdjson_really_inline void end_document(structural_parser &parser) {
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept {
|
||||||
parser.log_end_value("document");
|
start_container(iter);
|
||||||
|
return SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept {
|
||||||
|
return end_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
|
||||||
|
}
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept {
|
||||||
|
return end_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
|
||||||
|
}
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept {
|
||||||
constexpr uint32_t start_tape_index = 0;
|
constexpr uint32_t start_tape_index = 0;
|
||||||
tape.append(start_tape_index, internal::tape_type::ROOT);
|
tape.append(start_tape_index, internal::tape_type::ROOT);
|
||||||
tape_writer::write(parser.dom_parser.doc->tape[start_tape_index], next_tape_index(parser), internal::tape_type::ROOT);
|
tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter), internal::tape_type::ROOT);
|
||||||
}
|
return SUCCESS;
|
||||||
|
}
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept {
|
||||||
|
return visit_string(iter, key, true);
|
||||||
|
}
|
||||||
|
|
||||||
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_key(structural_parser &parser, const uint8_t *value) {
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept {
|
||||||
return parse_string(parser, value, true);
|
iter.dom_parser.open_containers[iter.depth].count++; // we have a key value pair in the object at parser.dom_parser.depth - 1
|
||||||
}
|
return SUCCESS;
|
||||||
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_string(structural_parser &parser, const uint8_t *value, bool key = false) {
|
}
|
||||||
parser.log_value(key ? "key" : "string");
|
|
||||||
uint8_t *dst = on_start_string(parser);
|
simdjson_really_inline tape_builder::tape_builder(dom::document &doc) noexcept : tape{doc.tape.get()}, current_string_buf_loc{doc.string_buf.get()} {}
|
||||||
|
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept {
|
||||||
|
iter.log_value(key ? "key" : "string");
|
||||||
|
uint8_t *dst = on_start_string(iter);
|
||||||
dst = stringparsing::parse_string(value, dst);
|
dst = stringparsing::parse_string(value, dst);
|
||||||
if (dst == nullptr) {
|
if (dst == nullptr) {
|
||||||
parser.log_error("Invalid escape in string");
|
iter.log_error("Invalid escape in string");
|
||||||
return STRING_ERROR;
|
return STRING_ERROR;
|
||||||
}
|
}
|
||||||
on_end_string(dst);
|
on_end_string(dst);
|
||||||
return SUCCESS;
|
return SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_number(structural_parser &parser, const uint8_t *value) {
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept {
|
||||||
parser.log_value("number");
|
return visit_string(iter, value);
|
||||||
if (!numberparsing::parse_number(value, tape)) { parser.log_error("Invalid number"); return NUMBER_ERROR; }
|
}
|
||||||
return SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
simdjson_really_inline error_code parse_root_number(structural_parser &parser, const uint8_t *value) {
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept {
|
||||||
|
iter.log_value("number");
|
||||||
|
return numberparsing::parse_number(value, tape);
|
||||||
|
}
|
||||||
|
|
||||||
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept {
|
||||||
//
|
//
|
||||||
// We need to make a copy to make sure that the string is space terminated.
|
// We need to make a copy to make sure that the string is space terminated.
|
||||||
// This is not about padding the input, which should already be padded up
|
// This is not about padding the input, which should already be padded up
|
||||||
|
@ -118,101 +177,96 @@ private:
|
||||||
// practice unless you are in the strange scenario where you have many JSON
|
// practice unless you are in the strange scenario where you have many JSON
|
||||||
// documents made of single atoms.
|
// documents made of single atoms.
|
||||||
//
|
//
|
||||||
uint8_t *copy = static_cast<uint8_t *>(malloc(parser.remaining_len() + SIMDJSON_PADDING));
|
uint8_t *copy = static_cast<uint8_t *>(malloc(iter.remaining_len() + SIMDJSON_PADDING));
|
||||||
if (copy == nullptr) {
|
if (copy == nullptr) { return MEMALLOC; }
|
||||||
return MEMALLOC;
|
memcpy(copy, value, iter.remaining_len());
|
||||||
}
|
memset(copy + iter.remaining_len(), ' ', SIMDJSON_PADDING);
|
||||||
memcpy(copy, value, parser.remaining_len());
|
error_code error = visit_number(iter, copy);
|
||||||
memset(copy + parser.remaining_len(), ' ', SIMDJSON_PADDING);
|
|
||||||
error_code error = parse_number(parser, copy);
|
|
||||||
free(copy);
|
free(copy);
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
|
||||||
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_true_atom(structural_parser &parser, const uint8_t *value) {
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
|
||||||
parser.log_value("true");
|
iter.log_value("true");
|
||||||
if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; }
|
if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; }
|
||||||
tape.append(0, internal::tape_type::TRUE_VALUE);
|
tape.append(0, internal::tape_type::TRUE_VALUE);
|
||||||
return SUCCESS;
|
return SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_root_true_atom(structural_parser &parser, const uint8_t *value) {
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
|
||||||
parser.log_value("true");
|
iter.log_value("true");
|
||||||
if (!atomparsing::is_valid_true_atom(value, parser.remaining_len())) { return T_ATOM_ERROR; }
|
if (!atomparsing::is_valid_true_atom(value, iter.remaining_len())) { return T_ATOM_ERROR; }
|
||||||
tape.append(0, internal::tape_type::TRUE_VALUE);
|
tape.append(0, internal::tape_type::TRUE_VALUE);
|
||||||
return SUCCESS;
|
return SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_false_atom(structural_parser &parser, const uint8_t *value) {
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
|
||||||
parser.log_value("false");
|
iter.log_value("false");
|
||||||
if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; }
|
if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; }
|
||||||
tape.append(0, internal::tape_type::FALSE_VALUE);
|
tape.append(0, internal::tape_type::FALSE_VALUE);
|
||||||
return SUCCESS;
|
return SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_root_false_atom(structural_parser &parser, const uint8_t *value) {
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
|
||||||
parser.log_value("false");
|
iter.log_value("false");
|
||||||
if (!atomparsing::is_valid_false_atom(value, parser.remaining_len())) { return F_ATOM_ERROR; }
|
if (!atomparsing::is_valid_false_atom(value, iter.remaining_len())) { return F_ATOM_ERROR; }
|
||||||
tape.append(0, internal::tape_type::FALSE_VALUE);
|
tape.append(0, internal::tape_type::FALSE_VALUE);
|
||||||
return SUCCESS;
|
return SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_null_atom(structural_parser &parser, const uint8_t *value) {
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
|
||||||
parser.log_value("null");
|
iter.log_value("null");
|
||||||
if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; }
|
if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; }
|
||||||
tape.append(0, internal::tape_type::NULL_VALUE);
|
tape.append(0, internal::tape_type::NULL_VALUE);
|
||||||
return SUCCESS;
|
return SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_root_null_atom(structural_parser &parser, const uint8_t *value) {
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
|
||||||
parser.log_value("null");
|
iter.log_value("null");
|
||||||
if (!atomparsing::is_valid_null_atom(value, parser.remaining_len())) { return N_ATOM_ERROR; }
|
if (!atomparsing::is_valid_null_atom(value, iter.remaining_len())) { return N_ATOM_ERROR; }
|
||||||
tape.append(0, internal::tape_type::NULL_VALUE);
|
tape.append(0, internal::tape_type::NULL_VALUE);
|
||||||
return SUCCESS;
|
return SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
// increment_count increments the count of keys in an object or values in an array.
|
|
||||||
simdjson_really_inline void increment_count(structural_parser &parser) {
|
|
||||||
parser.dom_parser.containing_scope[parser.depth].count++; // we have a key value pair in the object at parser.dom_parser.depth - 1
|
|
||||||
}
|
|
||||||
|
|
||||||
// private:
|
// private:
|
||||||
|
|
||||||
simdjson_really_inline uint32_t next_tape_index(structural_parser &parser) {
|
simdjson_really_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept {
|
||||||
return uint32_t(tape.next_tape_loc - parser.dom_parser.doc->tape.get());
|
return uint32_t(tape.next_tape_loc - iter.dom_parser.doc->tape.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
simdjson_really_inline void empty_container(structural_parser &parser, internal::tape_type start, internal::tape_type end) {
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
|
||||||
auto start_index = next_tape_index(parser);
|
auto start_index = next_tape_index(iter);
|
||||||
tape.append(start_index+2, start);
|
tape.append(start_index+2, start);
|
||||||
tape.append(start_index, end);
|
tape.append(start_index, end);
|
||||||
}
|
return SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
simdjson_really_inline void start_container(structural_parser &parser) {
|
simdjson_really_inline void tape_builder::start_container(json_iterator &iter) noexcept {
|
||||||
parser.dom_parser.containing_scope[parser.depth].tape_index = next_tape_index(parser);
|
iter.dom_parser.open_containers[iter.depth].tape_index = next_tape_index(iter);
|
||||||
parser.dom_parser.containing_scope[parser.depth].count = 0;
|
iter.dom_parser.open_containers[iter.depth].count = 0;
|
||||||
tape.skip(); // We don't actually *write* the start element until the end.
|
tape.skip(); // We don't actually *write* the start element until the end.
|
||||||
}
|
}
|
||||||
|
|
||||||
simdjson_really_inline void end_container(structural_parser &parser, internal::tape_type start, internal::tape_type end) noexcept {
|
SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
|
||||||
// Write the ending tape element, pointing at the start location
|
// Write the ending tape element, pointing at the start location
|
||||||
const uint32_t start_tape_index = parser.dom_parser.containing_scope[parser.depth].tape_index;
|
const uint32_t start_tape_index = iter.dom_parser.open_containers[iter.depth].tape_index;
|
||||||
tape.append(start_tape_index, end);
|
tape.append(start_tape_index, end);
|
||||||
// Write the start tape element, pointing at the end location (and including count)
|
// Write the start tape element, pointing at the end location (and including count)
|
||||||
// count can overflow if it exceeds 24 bits... so we saturate
|
// count can overflow if it exceeds 24 bits... so we saturate
|
||||||
// the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff).
|
// the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff).
|
||||||
const uint32_t count = parser.dom_parser.containing_scope[parser.depth].count;
|
const uint32_t count = iter.dom_parser.open_containers[iter.depth].count;
|
||||||
const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
|
const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
|
||||||
tape_writer::write(parser.dom_parser.doc->tape[start_tape_index], next_tape_index(parser) | (uint64_t(cntsat) << 32), start);
|
tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter) | (uint64_t(cntsat) << 32), start);
|
||||||
}
|
return SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
simdjson_really_inline uint8_t *on_start_string(structural_parser &parser) noexcept {
|
simdjson_really_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept {
|
||||||
// we advance the point, accounting for the fact that we have a NULL termination
|
// we advance the point, accounting for the fact that we have a NULL termination
|
||||||
tape.append(current_string_buf_loc - parser.dom_parser.doc->string_buf.get(), internal::tape_type::STRING);
|
tape.append(current_string_buf_loc - iter.dom_parser.doc->string_buf.get(), internal::tape_type::STRING);
|
||||||
return current_string_buf_loc + sizeof(uint32_t);
|
return current_string_buf_loc + sizeof(uint32_t);
|
||||||
}
|
}
|
||||||
|
|
||||||
simdjson_really_inline void on_end_string(uint8_t *dst) noexcept {
|
simdjson_really_inline void tape_builder::on_end_string(uint8_t *dst) noexcept {
|
||||||
uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
|
uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
|
||||||
// TODO check for overflow in case someone has a crazy string (>=4GB?)
|
// TODO check for overflow in case someone has a crazy string (>=4GB?)
|
||||||
// But only add the overflow check when the document itself exceeds 4GB
|
// But only add the overflow check when the document itself exceeds 4GB
|
||||||
|
@ -222,8 +276,7 @@ private:
|
||||||
// be NULL terminated? It comes at a small cost
|
// be NULL terminated? It comes at a small cost
|
||||||
*dst = 0;
|
*dst = 0;
|
||||||
current_string_buf_loc = dst + 1;
|
current_string_buf_loc = dst + 1;
|
||||||
}
|
}
|
||||||
}; // class tape_builder
|
|
||||||
|
|
||||||
} // namespace stage2
|
} // namespace stage2
|
||||||
} // namespace SIMDJSON_IMPLEMENTATION
|
} // namespace SIMDJSON_IMPLEMENTATION
|
||||||
|
|
|
@ -80,7 +80,6 @@ simdjson_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t>
|
||||||
//
|
//
|
||||||
#include "haswell/stringparsing.h"
|
#include "haswell/stringparsing.h"
|
||||||
#include "haswell/numberparsing.h"
|
#include "haswell/numberparsing.h"
|
||||||
#include "generic/stage2/structural_parser.h"
|
|
||||||
#include "generic/stage2/tape_builder.h"
|
#include "generic/stage2/tape_builder.h"
|
||||||
|
|
||||||
//
|
//
|
||||||
|
@ -112,15 +111,11 @@ SIMDJSON_WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t
|
||||||
}
|
}
|
||||||
|
|
||||||
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
|
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
|
||||||
doc = &_doc;
|
return stage2::tape_builder::parse_document<false>(*this, _doc);
|
||||||
stage2::tape_builder builder(_doc);
|
|
||||||
return stage2::structural_parser::parse<false>(*this, builder);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
|
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
|
||||||
doc = &_doc;
|
return stage2::tape_builder::parse_document<true>(*this, _doc);
|
||||||
stage2::tape_builder builder(_doc);
|
|
||||||
return stage2::structural_parser::parse<true>(*this, builder);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
|
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
|
||||||
|
|
|
@ -4,6 +4,8 @@
|
||||||
|
|
||||||
#include <initializer_list>
|
#include <initializer_list>
|
||||||
|
|
||||||
|
#define SIMDJSON_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } }
|
||||||
|
|
||||||
// Static array of known implementations. We're hoping these get baked into the executable
|
// Static array of known implementations. We're hoping these get baked into the executable
|
||||||
// without requiring a static initializer.
|
// without requiring a static initializer.
|
||||||
|
|
||||||
|
|
|
@ -81,7 +81,6 @@ simdjson_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t>
|
||||||
//
|
//
|
||||||
#include "westmere/stringparsing.h"
|
#include "westmere/stringparsing.h"
|
||||||
#include "westmere/numberparsing.h"
|
#include "westmere/numberparsing.h"
|
||||||
#include "generic/stage2/structural_parser.h"
|
|
||||||
#include "generic/stage2/tape_builder.h"
|
#include "generic/stage2/tape_builder.h"
|
||||||
|
|
||||||
//
|
//
|
||||||
|
@ -114,15 +113,11 @@ SIMDJSON_WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t
|
||||||
}
|
}
|
||||||
|
|
||||||
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
|
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
|
||||||
doc = &_doc;
|
return stage2::tape_builder::parse_document<false>(*this, _doc);
|
||||||
stage2::tape_builder builder(*doc);
|
|
||||||
return stage2::structural_parser::parse<false>(*this, builder);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
|
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
|
||||||
doc = &_doc;
|
return stage2::tape_builder::parse_document<true>(*this, _doc);
|
||||||
stage2::tape_builder builder(_doc);
|
|
||||||
return stage2::structural_parser::parse<true>(*this, builder);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
|
SIMDJSON_WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
|
||||||
|
|
Loading…
Reference in New Issue