Style uniformization (#238)
* massive clang-format -style=LLVM * naming harmonization * adding commentary about sysinfoapi.h
This commit is contained in:
parent
065805d6e1
commit
c2eea8abba
34
README.md
34
README.md
|
@ -67,7 +67,7 @@ Under Windows, we build some tools using the windows/dirent_portable.h file (whi
|
|||
|
||||
## Code usage and example
|
||||
|
||||
The main API involves populating a `ParsedJson` object which hosts a fully navigable document-object-model (DOM) view of the JSON document. The DOM can be accessed using [JSON Pointer](https://tools.ietf.org/html/rfc6901) paths, for example. The main function is `json_parse` which takes a string containing the JSON document as well as a reference to a pre-allocated `ParsedJson` object (which can be reused multiple times). Once you have populated the `ParsedJson` object, you can navigate through the DOM with an iterator (e.g., created by `ParsedJson::iterator pjh(pj)`, see 'Navigating the parsed document').
|
||||
The main API involves populating a `ParsedJson` object which hosts a fully navigable document-object-model (DOM) view of the JSON document. The DOM can be accessed using [JSON Pointer](https://tools.ietf.org/html/rfc6901) paths, for example. The main function is `json_parse` which takes a string containing the JSON document as well as a reference to a pre-allocated `ParsedJson` object (which can be reused multiple times). Once you have populated the `ParsedJson` object, you can navigate through the DOM with an iterator (e.g., created by `ParsedJson::Iterator pjh(pj)`, see 'Navigating the parsed document').
|
||||
|
||||
```C
|
||||
#include "simdjson/jsonparser.h"
|
||||
|
@ -80,12 +80,12 @@ const char * filename = ... //
|
|||
// use whatever means you want to get a string (UTF-8) of your JSON document
|
||||
padded_string p = get_corpus(filename);
|
||||
ParsedJson pj;
|
||||
pj.allocateCapacity(p.size()); // allocate memory for parsing up to p.size() bytes
|
||||
pj.allocate_capacity(p.size()); // allocate memory for parsing up to p.size() bytes
|
||||
const int res = json_parse(p, pj); // do the parsing, return 0 on success
|
||||
// parsing is done!
|
||||
if (res != 0) {
|
||||
// You can use the "simdjson/simdjson.h" header to access the error message
|
||||
std::cout << "Error parsing:" << simdjson::errorMsg(res) << std::endl;
|
||||
std::cout << "Error parsing:" << simdjson::error_message(res) << std::endl;
|
||||
}
|
||||
// the ParsedJson document can be used here
|
||||
// pj can be reused with other json_parse calls.
|
||||
|
@ -103,9 +103,9 @@ using namespace simdjson;
|
|||
const char * filename = ... //
|
||||
padded_string p = get_corpus(filename);
|
||||
ParsedJson pj = build_parsed_json(p); // do the parsing
|
||||
if( ! pj.isValid() ) {
|
||||
if( ! pj.is_valid() ) {
|
||||
// something went wrong
|
||||
std::cout << pj.getErrorMsg() << std::endl;
|
||||
std::cout << pj.get_error_message() << std::endl;
|
||||
}
|
||||
```
|
||||
|
||||
|
@ -119,13 +119,13 @@ using namespace simdjson;
|
|||
/...
|
||||
std::string mystring = ... //
|
||||
ParsedJson pj;
|
||||
pj.allocateCapacity(mystring.size()); // allocate memory for parsing up to mystring.size() bytes
|
||||
pj.allocate_capacity(mystring.size()); // allocate memory for parsing up to mystring.size() bytes
|
||||
// std::string may not overallocate so a copy will be needed
|
||||
const int res = json_parse(mystring, pj); // do the parsing, return 0 on success
|
||||
// parsing is done!
|
||||
if (res != 0) {
|
||||
// You can use the "simdjson/simdjson.h" header to access the error message
|
||||
std::cout << "Error parsing:" << simdjson::errorMsg(res) << std::endl;
|
||||
std::cout << "Error parsing:" << simdjson::error_message(res) << std::endl;
|
||||
}
|
||||
// pj can be reused with other json_parse calls.
|
||||
```
|
||||
|
@ -141,9 +141,9 @@ using namespace simdjson;
|
|||
std::string mystring = ... //
|
||||
// std::string may not overallocate so a copy will be needed
|
||||
ParsedJson pj = build_parsed_json(mystring); // do the parsing
|
||||
if( ! pj.isValid() ) {
|
||||
if( ! pj.is_valid() ) {
|
||||
// something went wrong
|
||||
std::cout << pj.getErrorMsg() << std::endl;
|
||||
std::cout << pj.get_error_message() << std::endl;
|
||||
}
|
||||
```
|
||||
|
||||
|
@ -164,9 +164,9 @@ int main(int argc, char *argv[]) {
|
|||
const char * filename = argv[1];
|
||||
padded_string p = get_corpus(filename);
|
||||
ParsedJson pj = build_parsed_json(p); // do the parsing
|
||||
if( ! pj.isValid() ) {
|
||||
if( ! pj.is_valid() ) {
|
||||
std::cout << "not valid" << std::endl;
|
||||
std::cout << pj.getErrorMsg() << std::endl;
|
||||
std::cout << pj.get_error_message() << std::endl;
|
||||
} else {
|
||||
std::cout << "valid" << std::endl;
|
||||
}
|
||||
|
@ -370,8 +370,8 @@ In C++, given a `ParsedJson`, we can move to a node with the `move_to` method, p
|
|||
Here is a code sample to dump back the parsed JSON to a string:
|
||||
|
||||
```c
|
||||
ParsedJson::iterator pjh(pj);
|
||||
if (!pjh.isOk()) {
|
||||
ParsedJson::Iterator pjh(pj);
|
||||
if (!pjh.is_ok()) {
|
||||
std::cerr << " Could not iterate parsed result. " << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
@ -379,7 +379,7 @@ Here is a code sample to dump back the parsed JSON to a string:
|
|||
//
|
||||
// where compute_dump is :
|
||||
|
||||
void compute_dump(ParsedJson::iterator &pjh) {
|
||||
void compute_dump(ParsedJson::Iterator &pjh) {
|
||||
if (pjh.is_object()) {
|
||||
std::cout << "{";
|
||||
if (pjh.down()) {
|
||||
|
@ -417,12 +417,12 @@ void compute_dump(ParsedJson::iterator &pjh) {
|
|||
The following function will find all user.id integers:
|
||||
|
||||
```C
|
||||
void simdjson_scan(std::vector<int64_t> &answer, ParsedJson::iterator &i) {
|
||||
void simdjson_scan(std::vector<int64_t> &answer, ParsedJson::Iterator &i) {
|
||||
while(i.move_forward()) {
|
||||
if(i.get_scope_type() == '{') {
|
||||
bool founduser = (i.get_string_length() == 4) && (memcmp(i.get_string(), "user", 4) == 0);
|
||||
bool found_user = (i.get_string_length() == 4) && (memcmp(i.get_string(), "user", 4) == 0);
|
||||
i.move_to_value();
|
||||
if(founduser) {
|
||||
if(found_user) {
|
||||
if(i.is_object() && i.move_to_key("id",2)) {
|
||||
if (i.is_integer()) {
|
||||
answer.push_back(i.get_integer());
|
||||
|
|
|
@ -117,7 +117,7 @@ int main(int argc, char *argv[]) {
|
|||
const char * filename = argv[1];
|
||||
simdjson::padded_string p = simdjson::get_corpus(filename);
|
||||
simdjson::ParsedJson pj = simdjson::build_parsed_json(p); // do the parsing
|
||||
if( ! pj.isValid() ) {
|
||||
if( ! pj.is_valid() ) {
|
||||
std::cout << "not valid" << std::endl;
|
||||
} else {
|
||||
std::cout << "valid" << std::endl;
|
||||
|
|
|
@ -18,7 +18,7 @@ const char *unitname = "cycles";
|
|||
: \
|
||||
: /* no read only */ \
|
||||
"%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \
|
||||
); \
|
||||
); \
|
||||
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
|
||||
} while (0)
|
||||
|
||||
|
@ -32,7 +32,7 @@ const char *unitname = "cycles";
|
|||
: "=r"(cyc_high), "=r"(cyc_low) \
|
||||
: /* no read only registers */ \
|
||||
: "%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \
|
||||
); \
|
||||
); \
|
||||
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
|
||||
} while (0)
|
||||
|
||||
|
|
|
@ -30,49 +30,51 @@ void print_vec(const std::vector<int64_t> &v) {
|
|||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
void simdjson_scan(std::vector<int64_t> &answer, simdjson::ParsedJson::iterator &i) {
|
||||
while(i.move_forward()) {
|
||||
if(i.get_scope_type() == '{') {
|
||||
bool founduser = (i.get_string_length() == 4) && (memcmp(i.get_string(), "user", 4) == 0);
|
||||
i.move_to_value();
|
||||
if(founduser) {
|
||||
if(i.is_object() && i.move_to_key("id",2)) {
|
||||
if (i.is_integer()) {
|
||||
answer.push_back(i.get_integer());
|
||||
}
|
||||
i.up();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
void simdjson_scan(std::vector<int64_t> &answer,
|
||||
simdjson::ParsedJson::Iterator &i) {
|
||||
while (i.move_forward()) {
|
||||
if (i.get_scope_type() == '{') {
|
||||
bool found_user = (i.get_string_length() == 4) &&
|
||||
(memcmp(i.get_string(), "user", 4) == 0);
|
||||
i.move_to_value();
|
||||
if (found_user) {
|
||||
if (i.is_object() && i.move_to_key("id", 2)) {
|
||||
if (i.is_integer()) {
|
||||
answer.push_back(i.get_integer());
|
||||
}
|
||||
i.up();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__attribute__ ((noinline))
|
||||
std::vector<int64_t> simdjson_justdom(simdjson::ParsedJson &pj) {
|
||||
__attribute__((noinline)) std::vector<int64_t>
|
||||
simdjson_just_dom(simdjson::ParsedJson &pj) {
|
||||
std::vector<int64_t> answer;
|
||||
simdjson::ParsedJson::iterator i(pj);
|
||||
simdjson_scan(answer,i);
|
||||
simdjson::ParsedJson::Iterator i(pj);
|
||||
simdjson_scan(answer, i);
|
||||
remove_duplicates(answer);
|
||||
return answer;
|
||||
}
|
||||
|
||||
__attribute__ ((noinline))
|
||||
std::vector<int64_t> simdjson_computestats(const simdjson::padded_string &p) {
|
||||
__attribute__((noinline)) std::vector<int64_t>
|
||||
simdjson_compute_stats(const simdjson::padded_string &p) {
|
||||
std::vector<int64_t> answer;
|
||||
simdjson::ParsedJson pj = simdjson::build_parsed_json(p);
|
||||
if (!pj.isValid()) {
|
||||
if (!pj.is_valid()) {
|
||||
return answer;
|
||||
}
|
||||
simdjson::ParsedJson::iterator i(pj);
|
||||
simdjson_scan(answer,i);
|
||||
simdjson::ParsedJson::Iterator i(pj);
|
||||
simdjson_scan(answer, i);
|
||||
remove_duplicates(answer);
|
||||
return answer;
|
||||
}
|
||||
|
||||
__attribute__ ((noinline))
|
||||
bool simdjson_justparse(const simdjson::padded_string &p) {
|
||||
__attribute__((noinline)) bool
|
||||
simdjson_just_parse(const simdjson::padded_string &p) {
|
||||
simdjson::ParsedJson pj = simdjson::build_parsed_json(p);
|
||||
bool answer = !pj.isValid();
|
||||
bool answer = !pj.is_valid();
|
||||
return answer;
|
||||
}
|
||||
|
||||
|
@ -88,25 +90,27 @@ void sajson_traverse(std::vector<int64_t> &answer, const sajson::value &node) {
|
|||
}
|
||||
case TYPE_OBJECT: {
|
||||
auto length = node.get_length();
|
||||
// sajson has O(log n) find_object_key, but we still visit each node anyhow because we
|
||||
// need to visit all values.
|
||||
// sajson has O(log n) find_object_key, but we still visit each node anyhow
|
||||
// because we need to visit all values.
|
||||
for (auto i = 0u; i < length; ++i) {
|
||||
auto key = node.get_object_key(i); // expected: sajson::string
|
||||
bool founduser = (key.length() == 4) && (memcmp(key.data(), "user", 4) == 0);
|
||||
if (founduser) { // found a user!!!
|
||||
auto uservalue = node.get_object_value(i); // get the value
|
||||
if (uservalue.get_type() ==
|
||||
bool found_user =
|
||||
(key.length() == 4) && (memcmp(key.data(), "user", 4) == 0);
|
||||
if (found_user) { // found a user!!!
|
||||
auto user_value = node.get_object_value(i); // get the value
|
||||
if (user_value.get_type() ==
|
||||
TYPE_OBJECT) { // the value should be an object
|
||||
// now we know that we only need one value
|
||||
auto uservaluelength = uservalue.get_length();
|
||||
auto rightindex = uservalue.find_object_key(sajson::string("id",2));
|
||||
if(rightindex < uservaluelength) {
|
||||
auto v = uservalue.get_object_value(rightindex);
|
||||
if (v.get_type() == TYPE_INTEGER) { // check that it is an integer
|
||||
answer.push_back(v.get_integer_value()); // record it!
|
||||
} else if (v.get_type() == TYPE_DOUBLE) {
|
||||
answer.push_back((int64_t)v.get_double_value()); // record it!
|
||||
}
|
||||
auto user_value_length = user_value.get_length();
|
||||
auto right_index =
|
||||
user_value.find_object_key(sajson::string("id", 2));
|
||||
if (right_index < user_value_length) {
|
||||
auto v = user_value.get_object_value(right_index);
|
||||
if (v.get_type() == TYPE_INTEGER) { // check that it is an integer
|
||||
answer.push_back(v.get_integer_value()); // record it!
|
||||
} else if (v.get_type() == TYPE_DOUBLE) {
|
||||
answer.push_back((int64_t)v.get_double_value()); // record it!
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -126,16 +130,16 @@ void sajson_traverse(std::vector<int64_t> &answer, const sajson::value &node) {
|
|||
}
|
||||
}
|
||||
|
||||
__attribute__ ((noinline))
|
||||
std::vector<int64_t> sasjon_justdom(sajson::document & d) {
|
||||
__attribute__((noinline)) std::vector<int64_t>
|
||||
sasjon_just_dom(sajson::document &d) {
|
||||
std::vector<int64_t> answer;
|
||||
sajson_traverse(answer, d.get_root());
|
||||
remove_duplicates(answer);
|
||||
return answer;
|
||||
}
|
||||
|
||||
__attribute__ ((noinline))
|
||||
std::vector<int64_t> sasjon_computestats(const simdjson::padded_string &p) {
|
||||
__attribute__((noinline)) std::vector<int64_t>
|
||||
sasjon_compute_stats(const simdjson::padded_string &p) {
|
||||
std::vector<int64_t> answer;
|
||||
char *buffer = (char *)malloc(p.size());
|
||||
memcpy(buffer, p.data(), p.size());
|
||||
|
@ -151,8 +155,8 @@ std::vector<int64_t> sasjon_computestats(const simdjson::padded_string &p) {
|
|||
return answer;
|
||||
}
|
||||
|
||||
__attribute__ ((noinline))
|
||||
bool sasjon_justparse(const simdjson::padded_string &p) {
|
||||
__attribute__((noinline)) bool
|
||||
sasjon_just_parse(const simdjson::padded_string &p) {
|
||||
char *buffer = (char *)malloc(p.size());
|
||||
memcpy(buffer, p.data(), p.size());
|
||||
auto d = sajson::parse(sajson::dynamic_allocation(),
|
||||
|
@ -167,8 +171,9 @@ void rapid_traverse(std::vector<int64_t> &answer, const rapidjson::Value &v) {
|
|||
case kObjectType:
|
||||
for (Value::ConstMemberIterator m = v.MemberBegin(); m != v.MemberEnd();
|
||||
++m) {
|
||||
bool founduser = (m->name.GetStringLength() == 4) && (memcmp(m->name.GetString(), "user", 4) == 0);
|
||||
if (founduser) {
|
||||
bool found_user = (m->name.GetStringLength() == 4) &&
|
||||
(memcmp(m->name.GetString(), "user", 4) == 0);
|
||||
if (found_user) {
|
||||
const rapidjson::Value &child = m->value;
|
||||
if (child.GetType() == kObjectType) {
|
||||
for (Value::ConstMemberIterator k = child.MemberBegin();
|
||||
|
@ -201,16 +206,16 @@ void rapid_traverse(std::vector<int64_t> &answer, const rapidjson::Value &v) {
|
|||
}
|
||||
}
|
||||
|
||||
__attribute__ ((noinline))
|
||||
std::vector<int64_t> rapid_justdom(rapidjson::Document &d) {
|
||||
__attribute__((noinline)) std::vector<int64_t>
|
||||
rapid_just_dom(rapidjson::Document &d) {
|
||||
std::vector<int64_t> answer;
|
||||
rapid_traverse(answer, d);
|
||||
remove_duplicates(answer);
|
||||
return answer;
|
||||
}
|
||||
|
||||
__attribute__ ((noinline))
|
||||
std::vector<int64_t> rapid_computestats(const simdjson::padded_string &p) {
|
||||
__attribute__((noinline)) std::vector<int64_t>
|
||||
rapid_compute_stats(const simdjson::padded_string &p) {
|
||||
std::vector<int64_t> answer;
|
||||
char *buffer = (char *)malloc(p.size() + 1);
|
||||
memcpy(buffer, p.data(), p.size());
|
||||
|
@ -218,8 +223,8 @@ std::vector<int64_t> rapid_computestats(const simdjson::padded_string &p) {
|
|||
rapidjson::Document d;
|
||||
d.ParseInsitu<kParseValidateEncodingFlag>(buffer);
|
||||
if (d.HasParseError()) {
|
||||
free(buffer);
|
||||
return answer;
|
||||
free(buffer);
|
||||
return answer;
|
||||
}
|
||||
rapid_traverse(answer, d);
|
||||
free(buffer);
|
||||
|
@ -227,8 +232,8 @@ std::vector<int64_t> rapid_computestats(const simdjson::padded_string &p) {
|
|||
return answer;
|
||||
}
|
||||
|
||||
__attribute__ ((noinline))
|
||||
bool rapid_justparse(const simdjson::padded_string &p) {
|
||||
__attribute__((noinline)) bool
|
||||
rapid_just_parse(const simdjson::padded_string &p) {
|
||||
char *buffer = (char *)malloc(p.size() + 1);
|
||||
memcpy(buffer, p.data(), p.size());
|
||||
buffer[p.size()] = '\0';
|
||||
|
@ -239,16 +244,15 @@ bool rapid_justparse(const simdjson::padded_string &p) {
|
|||
return answer;
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
bool verbose = false;
|
||||
bool justdata = false;
|
||||
bool just_data = false;
|
||||
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "vt")) != -1)
|
||||
switch (c) {
|
||||
case 't':
|
||||
justdata = true;
|
||||
just_data = true;
|
||||
break;
|
||||
case 'v':
|
||||
verbose = true;
|
||||
|
@ -257,15 +261,18 @@ int main(int argc, char *argv[]) {
|
|||
abort();
|
||||
}
|
||||
if (optind >= argc) {
|
||||
std::cerr << "Using different parsers, we compute the content statistics of "
|
||||
"JSON documents." << std::endl;
|
||||
std::cerr
|
||||
<< "Using different parsers, we compute the content statistics of "
|
||||
"JSON documents."
|
||||
<< std::endl;
|
||||
std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
|
||||
std::cerr << "Or " << argv[0] << " -v <jsonfile>" << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
const char *filename = argv[optind];
|
||||
if (optind + 1 < argc) {
|
||||
std::cerr << "warning: ignoring everything after " << argv[optind + 1] << std::endl;
|
||||
std::cerr << "warning: ignoring everything after " << argv[optind + 1]
|
||||
<< std::endl;
|
||||
}
|
||||
simdjson::padded_string p;
|
||||
try {
|
||||
|
@ -285,17 +292,17 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << p.size() << " B ";
|
||||
std::cout << std::endl;
|
||||
}
|
||||
std::vector<int64_t> s1 = simdjson_computestats(p);
|
||||
std::vector<int64_t> s1 = simdjson_compute_stats(p);
|
||||
if (verbose) {
|
||||
printf("simdjson: ");
|
||||
print_vec(s1);
|
||||
}
|
||||
std::vector<int64_t> s2 = rapid_computestats(p);
|
||||
std::vector<int64_t> s2 = rapid_compute_stats(p);
|
||||
if (verbose) {
|
||||
printf("rapid: ");
|
||||
print_vec(s2);
|
||||
}
|
||||
std::vector<int64_t> s3 = sasjon_computestats(p);
|
||||
std::vector<int64_t> s3 = sasjon_compute_stats(p);
|
||||
if (verbose) {
|
||||
printf("sasjon: ");
|
||||
print_vec(s3);
|
||||
|
@ -306,34 +313,35 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
int repeat = 500;
|
||||
int volume = p.size();
|
||||
if(justdata) {
|
||||
printf("name cycles_per_byte cycles_per_byte_err gb_per_s gb_per_s_err \n");
|
||||
if (just_data) {
|
||||
printf(
|
||||
"name cycles_per_byte cycles_per_byte_err gb_per_s gb_per_s_err \n");
|
||||
}
|
||||
BEST_TIME("simdjson ", simdjson_computestats(p).size(), size, , repeat,
|
||||
volume, !justdata);
|
||||
BEST_TIME("rapid ", rapid_computestats(p).size(), size, , repeat, volume,
|
||||
!justdata);
|
||||
BEST_TIME("sasjon ", sasjon_computestats(p).size(), size, , repeat, volume,
|
||||
!justdata);
|
||||
BEST_TIME("simdjson (just parse) ", simdjson_justparse(p), false, , repeat,
|
||||
volume, !justdata);
|
||||
BEST_TIME("rapid (just parse) ", rapid_justparse(p), false, , repeat, volume,
|
||||
!justdata);
|
||||
BEST_TIME("sasjon (just parse) ", sasjon_justparse(p), false, , repeat, volume,
|
||||
!justdata);
|
||||
BEST_TIME("simdjson ", simdjson_compute_stats(p).size(), size, , repeat,
|
||||
volume, !just_data);
|
||||
BEST_TIME("rapid ", rapid_compute_stats(p).size(), size, , repeat, volume,
|
||||
!just_data);
|
||||
BEST_TIME("sasjon ", sasjon_compute_stats(p).size(), size, , repeat, volume,
|
||||
!just_data);
|
||||
BEST_TIME("simdjson (just parse) ", simdjson_just_parse(p), false, , repeat,
|
||||
volume, !just_data);
|
||||
BEST_TIME("rapid (just parse) ", rapid_just_parse(p), false, , repeat,
|
||||
volume, !just_data);
|
||||
BEST_TIME("sasjon (just parse) ", sasjon_just_parse(p), false, , repeat,
|
||||
volume, !just_data);
|
||||
simdjson::ParsedJson dsimdjson = simdjson::build_parsed_json(p);
|
||||
BEST_TIME("simdjson (just dom) ", simdjson_justdom(dsimdjson).size(), size, , repeat,
|
||||
volume, !justdata);
|
||||
BEST_TIME("simdjson (just dom) ", simdjson_just_dom(dsimdjson).size(), size,
|
||||
, repeat, volume, !just_data);
|
||||
char *buffer = (char *)malloc(p.size());
|
||||
memcpy(buffer, p.data(), p.size());
|
||||
rapidjson::Document drapid;
|
||||
drapid.ParseInsitu<kParseValidateEncodingFlag>(buffer);
|
||||
BEST_TIME("rapid (just dom) ", rapid_justdom(drapid).size(), size, , repeat, volume,
|
||||
!justdata);
|
||||
BEST_TIME("rapid (just dom) ", rapid_just_dom(drapid).size(), size, , repeat,
|
||||
volume, !just_data);
|
||||
memcpy(buffer, p.data(), p.size());
|
||||
auto dsasjon = sajson::parse(sajson::dynamic_allocation(),
|
||||
sajson::mutable_string_view(p.size(), buffer));
|
||||
BEST_TIME("sasjon (just dom) ", sasjon_justdom(dsasjon).size(), size, , repeat, volume,
|
||||
!justdata);
|
||||
sajson::mutable_string_view(p.size(), buffer));
|
||||
BEST_TIME("sasjon (just dom) ", sasjon_just_dom(dsasjon).size(), size, ,
|
||||
repeat, volume, !just_data);
|
||||
free(buffer);
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
#include <unistd.h>
|
||||
#include <iostream>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "benchmark.h"
|
||||
#include "simdjson/jsonioutil.h"
|
||||
|
@ -17,7 +17,7 @@
|
|||
using namespace simdjson;
|
||||
using namespace rapidjson;
|
||||
|
||||
std::string rapidstringmeInsitu(char *json) {
|
||||
std::string rapid_stringme_insitu(char *json) {
|
||||
Document d;
|
||||
d.ParseInsitu(json);
|
||||
if (d.HasParseError()) {
|
||||
|
@ -30,7 +30,7 @@ std::string rapidstringmeInsitu(char *json) {
|
|||
return buffer.GetString();
|
||||
}
|
||||
|
||||
std::string rapidstringme(char *json) {
|
||||
std::string rapid_stringme(char *json) {
|
||||
Document d;
|
||||
d.Parse(json);
|
||||
if (d.HasParseError()) {
|
||||
|
@ -46,29 +46,28 @@ std::string rapidstringme(char *json) {
|
|||
int main(int argc, char *argv[]) {
|
||||
int c;
|
||||
bool verbose = false;
|
||||
bool justdata = false;
|
||||
bool just_data = false;
|
||||
|
||||
while ((c = getopt (argc, argv, "vt")) != -1)
|
||||
switch (c)
|
||||
{
|
||||
case 't':
|
||||
justdata = true;
|
||||
break;
|
||||
case 'v':
|
||||
verbose = true;
|
||||
break;
|
||||
default:
|
||||
abort ();
|
||||
}
|
||||
while ((c = getopt(argc, argv, "vt")) != -1)
|
||||
switch (c) {
|
||||
case 't':
|
||||
just_data = true;
|
||||
break;
|
||||
case 'v':
|
||||
verbose = true;
|
||||
break;
|
||||
default:
|
||||
abort();
|
||||
}
|
||||
if (optind >= argc) {
|
||||
std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
const char * filename = argv[optind];
|
||||
const char *filename = argv[optind];
|
||||
simdjson::padded_string p;
|
||||
try {
|
||||
simdjson::get_corpus(filename).swap(p);
|
||||
} catch (const std::exception& e) { // caught by reference to base
|
||||
} catch (const std::exception &e) { // caught by reference to base
|
||||
std::cout << "Could not load the file " << filename << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
@ -88,71 +87,95 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
int repeat = 50;
|
||||
int volume = p.size();
|
||||
if(justdata) {
|
||||
printf("name cycles_per_byte cycles_per_byte_err gb_per_s gb_per_s_err \n");
|
||||
if (just_data) {
|
||||
printf(
|
||||
"name cycles_per_byte cycles_per_byte_err gb_per_s gb_per_s_err \n");
|
||||
}
|
||||
size_t strlength = rapidstringme((char *)p.data()).size();
|
||||
size_t strlength = rapid_stringme((char *)p.data()).size();
|
||||
if (verbose)
|
||||
std::cout << "input length is " << p.size() << " stringified length is "
|
||||
<< strlength << std::endl;
|
||||
BEST_TIME_NOCHECK("despacing with RapidJSON", rapidstringme((char *)p.data()), , repeat, volume, !justdata);
|
||||
BEST_TIME_NOCHECK("despacing with RapidJSON Insitu", rapidstringmeInsitu((char *)buffer),
|
||||
memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
|
||||
BEST_TIME_NOCHECK("despacing with RapidJSON",
|
||||
rapid_stringme((char *)p.data()), , repeat, volume,
|
||||
!just_data);
|
||||
BEST_TIME_NOCHECK(
|
||||
"despacing with RapidJSON Insitu", rapid_stringme_insitu((char *)buffer),
|
||||
memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data);
|
||||
memcpy(buffer, p.data(), p.size());
|
||||
|
||||
size_t outlength =
|
||||
simdjson::jsonminify((const uint8_t *)buffer, p.size(), (uint8_t *)buffer);
|
||||
size_t outlength = simdjson::json_minify((const uint8_t *)buffer, p.size(),
|
||||
(uint8_t *)buffer);
|
||||
if (verbose)
|
||||
std::cout << "jsonminify length is " << outlength << std::endl;
|
||||
std::cout << "json_minify length is " << outlength << std::endl;
|
||||
|
||||
uint8_t *cbuffer = (uint8_t *)buffer;
|
||||
BEST_TIME("jsonminify", simdjson::jsonminify(cbuffer, p.size(), cbuffer), outlength,
|
||||
memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
|
||||
printf("minisize = %zu, original size = %zu (minified down to %.2f percent of original) \n", outlength, p.size(), outlength * 100.0 / p.size());
|
||||
BEST_TIME("json_minify", simdjson::json_minify(cbuffer, p.size(), cbuffer),
|
||||
outlength, memcpy(buffer, p.data(), p.size()), repeat, volume,
|
||||
!just_data);
|
||||
printf("minisize = %zu, original size = %zu (minified down to %.2f percent "
|
||||
"of original) \n",
|
||||
outlength, p.size(), outlength * 100.0 / p.size());
|
||||
|
||||
/***
|
||||
* Is it worth it to minify before parsing?
|
||||
***/
|
||||
rapidjson::Document d;
|
||||
BEST_TIME("RapidJSON Insitu orig", d.ParseInsitu(buffer).HasParseError(), false,
|
||||
memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
|
||||
BEST_TIME("RapidJSON Insitu orig", d.ParseInsitu(buffer).HasParseError(),
|
||||
false, memcpy(buffer, p.data(), p.size()), repeat, volume,
|
||||
!just_data);
|
||||
|
||||
char *minibuffer = simdjson::allocate_padded_buffer(p.size() + 1);
|
||||
size_t minisize = simdjson::jsonminify((const uint8_t *)p.data(), p.size(), (uint8_t*) minibuffer);
|
||||
minibuffer[minisize] = '\0';
|
||||
char *mini_buffer = simdjson::allocate_padded_buffer(p.size() + 1);
|
||||
size_t minisize = simdjson::json_minify((const uint8_t *)p.data(), p.size(),
|
||||
(uint8_t *)mini_buffer);
|
||||
mini_buffer[minisize] = '\0';
|
||||
|
||||
BEST_TIME("RapidJSON Insitu despaced", d.ParseInsitu(buffer).HasParseError(), false,
|
||||
memcpy(buffer, minibuffer, p.size()),
|
||||
repeat, volume, !justdata);
|
||||
BEST_TIME("RapidJSON Insitu despaced", d.ParseInsitu(buffer).HasParseError(),
|
||||
false, memcpy(buffer, mini_buffer, p.size()), repeat, volume,
|
||||
!just_data);
|
||||
|
||||
size_t astbuffersize = p.size() * 2;
|
||||
size_t * ast_buffer = (size_t *) malloc(astbuffersize * sizeof(size_t));
|
||||
size_t ast_buffer_size = p.size() * 2;
|
||||
size_t *ast_buffer = (size_t *)malloc(ast_buffer_size * sizeof(size_t));
|
||||
|
||||
BEST_TIME("sajson orig", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
|
||||
BEST_TIME(
|
||||
"sajson orig",
|
||||
sajson::parse(sajson::bounded_allocation(ast_buffer, ast_buffer_size),
|
||||
sajson::mutable_string_view(p.size(), buffer))
|
||||
.is_valid(),
|
||||
true, memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data);
|
||||
|
||||
|
||||
BEST_TIME("sajson despaced", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(minisize, buffer)).is_valid(), true, memcpy(buffer, minibuffer, p.size()), repeat, volume, !justdata);
|
||||
BEST_TIME(
|
||||
"sajson despaced",
|
||||
sajson::parse(sajson::bounded_allocation(ast_buffer, ast_buffer_size),
|
||||
sajson::mutable_string_view(minisize, buffer))
|
||||
.is_valid(),
|
||||
true, memcpy(buffer, mini_buffer, p.size()), repeat, volume, !just_data);
|
||||
|
||||
simdjson::ParsedJson pj;
|
||||
bool isallocok = pj.allocateCapacity(p.size(), 1024);
|
||||
if(!isallocok) {
|
||||
bool is_alloc_ok = pj.allocate_capacity(p.size(), 1024);
|
||||
if (!is_alloc_ok) {
|
||||
fprintf(stderr, "failed to allocate memory\n");
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
bool automated_reallocation = false;
|
||||
BEST_TIME("simdjson orig", simdjson::json_parse((const uint8_t*)buffer, p.size(), pj, automated_reallocation), true, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
|
||||
|
||||
}
|
||||
bool automated_reallocation = false;
|
||||
BEST_TIME("simdjson orig",
|
||||
simdjson::json_parse((const uint8_t *)buffer, p.size(), pj,
|
||||
automated_reallocation),
|
||||
true, memcpy(buffer, p.data(), p.size()), repeat, volume,
|
||||
!just_data);
|
||||
|
||||
simdjson::ParsedJson pj2;
|
||||
bool isallocok2 = pj2.allocateCapacity(p.size(), 1024);
|
||||
if(!isallocok2) {
|
||||
bool is_alloc_ok2 = pj2.allocate_capacity(p.size(), 1024);
|
||||
if (!is_alloc_ok2) {
|
||||
fprintf(stderr, "failed to allocate memory\n");
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
automated_reallocation = false;
|
||||
BEST_TIME("simdjson despaced", simdjson::json_parse((const uint8_t*)buffer, minisize, pj2, automated_reallocation), true, memcpy(buffer, minibuffer, p.size()), repeat, volume, !justdata);
|
||||
}
|
||||
automated_reallocation = false;
|
||||
BEST_TIME("simdjson despaced",
|
||||
simdjson::json_parse((const uint8_t *)buffer, minisize, pj2,
|
||||
automated_reallocation),
|
||||
true, memcpy(buffer, mini_buffer, p.size()), repeat, volume,
|
||||
!just_data);
|
||||
free(buffer);
|
||||
free(ast_buffer);
|
||||
free(minibuffer);
|
||||
|
||||
|
||||
free(mini_buffer);
|
||||
}
|
||||
|
|
|
@ -28,57 +28,58 @@
|
|||
#endif
|
||||
//#define DEBUG
|
||||
#include "simdjson/common_defs.h"
|
||||
#include "simdjson/isadetection.h"
|
||||
#include "simdjson/jsonioutil.h"
|
||||
#include "simdjson/jsonparser.h"
|
||||
#include "simdjson/parsedjson.h"
|
||||
#include "simdjson/stage1_find_marks.h"
|
||||
#include "simdjson/stage2_build_tape.h"
|
||||
#include "simdjson/isadetection.h"
|
||||
namespace simdjson {
|
||||
architecture _find_best_supported_implementation() {
|
||||
constexpr uint32_t haswell_flags = SIMDExtensions::AVX2 | SIMDExtensions::PCLMULQDQ
|
||||
| SIMDExtensions::BMI1 | SIMDExtensions::BMI2;
|
||||
constexpr uint32_t westmere_flags = SIMDExtensions::SSE42 | SIMDExtensions::PCLMULQDQ;
|
||||
Architecture _find_best_supported_implementation() {
|
||||
constexpr uint32_t haswell_flags =
|
||||
instruction_set::AVX2 | instruction_set::PCLMULQDQ |
|
||||
instruction_set::BMI1 | instruction_set::BMI2;
|
||||
constexpr uint32_t westmere_flags =
|
||||
instruction_set::SSE42 | instruction_set::PCLMULQDQ;
|
||||
uint32_t supports = detect_supported_architectures();
|
||||
// Order from best to worst (within architecture)
|
||||
if ((haswell_flags & supports) == haswell_flags) {
|
||||
return architecture::haswell;
|
||||
return Architecture::HASWELL;
|
||||
}
|
||||
if ((westmere_flags & supports) == westmere_flags) {
|
||||
return architecture::westmere;
|
||||
return Architecture::WESTMERE;
|
||||
}
|
||||
if (SIMDExtensions::NEON) return architecture::arm64;
|
||||
if (instruction_set::NEON)
|
||||
return Architecture::ARM64;
|
||||
|
||||
return architecture::none;
|
||||
return Architecture::NONE;
|
||||
}
|
||||
|
||||
|
||||
using unified_functype = int (const uint8_t *buf, size_t len, ParsedJson &pj);
|
||||
using stage1_functype = int (const uint8_t *buf, size_t len, ParsedJson &pj);
|
||||
|
||||
using unified_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj);
|
||||
using stage1_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj);
|
||||
|
||||
extern unified_functype *unified_ptr;
|
||||
|
||||
extern stage1_functype *stage1_ptr;
|
||||
|
||||
int unified_machine_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj) {
|
||||
architecture best_implementation = _find_best_supported_implementation();
|
||||
Architecture best_implementation = _find_best_supported_implementation();
|
||||
// Selecting the best implementation
|
||||
switch (best_implementation) {
|
||||
#ifdef IS_X86_64
|
||||
case architecture::haswell:
|
||||
unified_ptr = &unified_machine<architecture::haswell>;
|
||||
case Architecture::HASWELL:
|
||||
unified_ptr = &unified_machine<Architecture::HASWELL>;
|
||||
break;
|
||||
case architecture::westmere:
|
||||
unified_ptr = &unified_machine<architecture::westmere>;
|
||||
case Architecture::WESTMERE:
|
||||
unified_ptr = &unified_machine<Architecture::WESTMERE>;
|
||||
break;
|
||||
#endif
|
||||
#ifdef IS_ARM64
|
||||
case architecture::arm64:
|
||||
unified_ptr = &unified_machine<architecture::arm64>;
|
||||
case Architecture::ARM64:
|
||||
unified_ptr = &unified_machine<Architecture::ARM64>;
|
||||
break;
|
||||
#endif
|
||||
default :
|
||||
default:
|
||||
std::cerr << "The processor is not supported by simdjson." << std::endl;
|
||||
return simdjson::UNEXPECTED_ERROR;
|
||||
}
|
||||
|
@ -87,24 +88,25 @@ int unified_machine_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj) {
|
|||
}
|
||||
|
||||
// Responsible for selecting the best json_parse implementation
|
||||
int find_structural_bits_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj) {
|
||||
architecture best_implementation = _find_best_supported_implementation();
|
||||
int find_structural_bits_dispatch(const uint8_t *buf, size_t len,
|
||||
ParsedJson &pj) {
|
||||
Architecture best_implementation = _find_best_supported_implementation();
|
||||
// Selecting the best implementation
|
||||
switch (best_implementation) {
|
||||
#ifdef IS_X86_64
|
||||
case architecture::haswell:
|
||||
stage1_ptr = &find_structural_bits<architecture::haswell>;
|
||||
case Architecture::HASWELL:
|
||||
stage1_ptr = &find_structural_bits<Architecture::HASWELL>;
|
||||
break;
|
||||
case architecture::westmere:
|
||||
stage1_ptr = &find_structural_bits<architecture::westmere>;
|
||||
case Architecture::WESTMERE:
|
||||
stage1_ptr = &find_structural_bits<Architecture::WESTMERE>;
|
||||
break;
|
||||
#endif
|
||||
#ifdef IS_ARM64
|
||||
case architecture::arm64:
|
||||
stage1_ptr = &find_structural_bits<architecture::arm64>;
|
||||
case Architecture::ARM64:
|
||||
stage1_ptr = &find_structural_bits<Architecture::ARM64>;
|
||||
break;
|
||||
#endif
|
||||
default :
|
||||
default:
|
||||
std::cerr << "The processor is not supported by simdjson." << std::endl;
|
||||
return simdjson::UNEXPECTED_ERROR;
|
||||
}
|
||||
|
@ -114,23 +116,21 @@ int find_structural_bits_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj
|
|||
|
||||
stage1_functype *stage1_ptr = &find_structural_bits_dispatch;
|
||||
unified_functype *unified_ptr = &unified_machine_dispatch;
|
||||
}
|
||||
|
||||
|
||||
} // namespace simdjson
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
bool verbose = false;
|
||||
bool dump = false;
|
||||
bool jsonoutput = false;
|
||||
bool forceoneiteration = false;
|
||||
bool justdata = false;
|
||||
bool json_output = false;
|
||||
bool force_one_iteration = false;
|
||||
bool just_data = false;
|
||||
#ifndef _MSC_VER
|
||||
int c;
|
||||
|
||||
while ((c = getopt(argc, argv, "1vdt")) != -1) {
|
||||
switch (c) {
|
||||
case 't':
|
||||
justdata = true;
|
||||
just_data = true;
|
||||
break;
|
||||
case 'v':
|
||||
verbose = true;
|
||||
|
@ -139,15 +139,15 @@ int main(int argc, char *argv[]) {
|
|||
dump = true;
|
||||
break;
|
||||
case 'j':
|
||||
jsonoutput = true;
|
||||
json_output = true;
|
||||
break;
|
||||
case '1':
|
||||
forceoneiteration = true;
|
||||
force_one_iteration = true;
|
||||
break;
|
||||
default:
|
||||
abort();
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
int optind = 1;
|
||||
#endif
|
||||
|
@ -157,7 +157,8 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
const char *filename = argv[optind];
|
||||
if (optind + 1 < argc) {
|
||||
std::cerr << "warning: ignoring everything after " << argv[optind + 1] << std::endl;
|
||||
std::cerr << "warning: ignoring everything after " << argv[optind + 1]
|
||||
<< std::endl;
|
||||
}
|
||||
if (verbose) {
|
||||
std::cout << "[verbose] loading " << filename << std::endl;
|
||||
|
@ -170,30 +171,41 @@ int main(int argc, char *argv[]) {
|
|||
return EXIT_FAILURE;
|
||||
}
|
||||
if (verbose) {
|
||||
std::cout << "[verbose] loaded " << filename << " (" << p.size() << " bytes)"
|
||||
<< std::endl;
|
||||
}
|
||||
std::cout << "[verbose] loaded " << filename << " (" << p.size()
|
||||
<< " bytes)" << std::endl;
|
||||
}
|
||||
#if defined(DEBUG)
|
||||
const uint32_t iterations = 1;
|
||||
#else
|
||||
const uint32_t iterations =
|
||||
forceoneiteration ? 1 : (p.size() < 1 * 1000 * 1000 ? 1000 : 10);
|
||||
force_one_iteration ? 1 : (p.size() < 1 * 1000 * 1000 ? 1000 : 10);
|
||||
#endif
|
||||
std::vector<double> res;
|
||||
res.resize(iterations);
|
||||
if(!justdata) printf("number of iterations %u \n", iterations);
|
||||
if (!just_data)
|
||||
printf("number of iterations %u \n", iterations);
|
||||
#if !defined(__linux__)
|
||||
#define SQUASH_COUNTERS
|
||||
if (justdata) {
|
||||
printf("justdata (-t) flag only works under linux.\n");
|
||||
if (just_data) {
|
||||
printf("just_data (-t) flag only works under linux.\n");
|
||||
}
|
||||
#endif
|
||||
{// practice run
|
||||
{ // practice run
|
||||
simdjson::ParsedJson pj;
|
||||
bool allocok = pj.allocateCapacity(p.size());
|
||||
if(allocok) {
|
||||
simdjson::stage1_ptr((const uint8_t*)p.data(), p.size(), pj);
|
||||
simdjson::unified_ptr((const uint8_t*)(const uint8_t*)(const uint8_t*)(const uint8_t*)(const uint8_t*)(const uint8_t*)(const uint8_t*)(const uint8_t*)p.data(), p.size(), pj);
|
||||
bool allocok = pj.allocate_capacity(p.size());
|
||||
if (allocok) {
|
||||
simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj);
|
||||
simdjson::unified_ptr(
|
||||
(const uint8_t
|
||||
*)(const uint8_t
|
||||
*)(const uint8_t
|
||||
*)(const uint8_t
|
||||
*)(const uint8_t
|
||||
*)(const uint8_t
|
||||
*)(const uint8_t
|
||||
*)(const uint8_t *)
|
||||
p.data(),
|
||||
p.size(), pj);
|
||||
}
|
||||
}
|
||||
#ifndef SQUASH_COUNTERS
|
||||
|
@ -220,7 +232,7 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
unified.start();
|
||||
simdjson::ParsedJson pj;
|
||||
bool allocok = pj.allocateCapacity(p.size());
|
||||
bool allocok = pj.allocate_capacity(p.size());
|
||||
if (!allocok) {
|
||||
std::cerr << "failed to allocate memory" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
|
@ -235,7 +247,8 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "[verbose] allocated memory for parsed JSON " << std::endl;
|
||||
}
|
||||
unified.start();
|
||||
isok = (simdjson::stage1_ptr((const uint8_t*)p.data(), p.size(), pj) == simdjson::SUCCESS);
|
||||
isok = (simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj) ==
|
||||
simdjson::SUCCESS);
|
||||
unified.end(results);
|
||||
cy1 += results[0];
|
||||
cl1 += results[1];
|
||||
|
@ -247,7 +260,9 @@ int main(int argc, char *argv[]) {
|
|||
break;
|
||||
}
|
||||
unified.start();
|
||||
isok = isok && (simdjson::SUCCESS == simdjson::unified_ptr((const uint8_t*)p.data(), p.size(), pj));
|
||||
isok = isok &&
|
||||
(simdjson::SUCCESS ==
|
||||
simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj));
|
||||
unified.end(results);
|
||||
cy2 += results[0];
|
||||
cl2 += results[1];
|
||||
|
@ -266,7 +281,7 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "[verbose] iteration # " << i << std::endl;
|
||||
}
|
||||
simdjson::ParsedJson pj;
|
||||
bool allocok = pj.allocateCapacity(p.size());
|
||||
bool allocok = pj.allocate_capacity(p.size());
|
||||
if (!allocok) {
|
||||
std::cerr << "failed to allocate memory" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
|
@ -276,20 +291,24 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
|
||||
auto start = std::chrono::steady_clock::now();
|
||||
isok = (simdjson::stage1_ptr((const uint8_t*)p.data(), p.size(), pj) == simdjson::SUCCESS);
|
||||
isok = isok && (simdjson::SUCCESS == simdjson::unified_ptr((const uint8_t*)p.data(), p.size(), pj));
|
||||
isok = (simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj) ==
|
||||
simdjson::SUCCESS);
|
||||
isok = isok &&
|
||||
(simdjson::SUCCESS ==
|
||||
simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj));
|
||||
auto end = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double> secs = end - start;
|
||||
res[i] = secs.count();
|
||||
if(! isok) {
|
||||
std::cerr << pj.getErrorMsg() << std::endl;
|
||||
if (!isok) {
|
||||
std::cerr << pj.get_error_message() << std::endl;
|
||||
std::cerr << "Could not parse. " << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
simdjson::ParsedJson pj = build_parsed_json(p); // do the parsing again to get the stats
|
||||
if (!pj.isValid()) {
|
||||
std::cerr << pj.getErrorMsg() << std::endl;
|
||||
}
|
||||
simdjson::ParsedJson pj =
|
||||
build_parsed_json(p); // do the parsing again to get the stats
|
||||
if (!pj.is_valid()) {
|
||||
std::cerr << pj.get_error_message() << std::endl;
|
||||
std::cerr << "Could not parse. " << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
@ -297,7 +316,7 @@ int main(int argc, char *argv[]) {
|
|||
double speedinGBs = (p.size()) / (min_result * 1000000000.0);
|
||||
#ifndef SQUASH_COUNTERS
|
||||
unsigned long total = cy0 + cy1 + cy2;
|
||||
if (justdata) {
|
||||
if (just_data) {
|
||||
float cpb0 = (double)cy0 / (iterations * p.size());
|
||||
float cpb1 = (double)cy1 / (iterations * p.size());
|
||||
float cpb2 = (double)cy2 / (iterations * p.size());
|
||||
|
@ -315,8 +334,8 @@ int main(int argc, char *argv[]) {
|
|||
break;
|
||||
}
|
||||
}
|
||||
printf("\"%s\"\t%f\t%f\t%f\t%f\t%f\n", snewfile, cpb0, cpb1, cpb2,
|
||||
cpbtotal, speedinGBs);
|
||||
printf("\"%s\"\t%f\t%f\t%f\t%f\t%f\n", snewfile, cpb0, cpb1, cpb2, cpbtotal,
|
||||
speedinGBs);
|
||||
free(newfile);
|
||||
} else {
|
||||
printf("number of bytes %ld number of structural chars %u ratio %.3f\n",
|
||||
|
@ -352,16 +371,16 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
printf(" all stages: %.2f cycles per input byte.\n",
|
||||
(double)total / (iterations * p.size()));
|
||||
printf("Estimated average frequency: %.3f GHz.\n", (double)total / (iterations * min_result * 1000000000.0));
|
||||
printf("Estimated average frequency: %.3f GHz.\n",
|
||||
(double)total / (iterations * min_result * 1000000000.0));
|
||||
}
|
||||
#endif
|
||||
if (!justdata) {
|
||||
if (!just_data) {
|
||||
std::cout << "Min: " << min_result << " bytes read: " << p.size()
|
||||
<< " Gigabytes/second: " << speedinGBs
|
||||
<< std::endl;
|
||||
<< " Gigabytes/second: " << speedinGBs << std::endl;
|
||||
}
|
||||
if (jsonoutput) {
|
||||
isok = isok && pj.printjson(std::cout);
|
||||
if (json_output) {
|
||||
isok = isok && pj.print_json(std::cout);
|
||||
}
|
||||
if (dump) {
|
||||
isok = isok && pj.dump_raw_tape(std::cout);
|
||||
|
|
|
@ -43,11 +43,11 @@ void print_stat(const stat_t &s) {
|
|||
s.true_count, s.false_count);
|
||||
}
|
||||
|
||||
__attribute__ ((noinline))
|
||||
stat_t simdjson_computestats(const simdjson::padded_string &p) {
|
||||
__attribute__((noinline)) stat_t
|
||||
simdjson_compute_stats(const simdjson::padded_string &p) {
|
||||
stat_t answer;
|
||||
simdjson::ParsedJson pj = build_parsed_json(p);
|
||||
answer.valid = pj.isValid();
|
||||
answer.valid = pj.is_valid();
|
||||
if (!answer.valid) {
|
||||
return answer;
|
||||
}
|
||||
|
@ -57,24 +57,24 @@ stat_t simdjson_computestats(const simdjson::padded_string &p) {
|
|||
answer.null_count = 0;
|
||||
answer.true_count = 0;
|
||||
answer.false_count = 0;
|
||||
size_t tapeidx = 0;
|
||||
uint64_t tape_val = pj.tape[tapeidx++];
|
||||
size_t tape_idx = 0;
|
||||
uint64_t tape_val = pj.tape[tape_idx++];
|
||||
uint8_t type = (tape_val >> 56);
|
||||
size_t howmany = 0;
|
||||
size_t how_many = 0;
|
||||
assert(type == 'r');
|
||||
howmany = tape_val & JSONVALUEMASK;
|
||||
for (; tapeidx < howmany; tapeidx++) {
|
||||
tape_val = pj.tape[tapeidx];
|
||||
// uint64_t payload = tape_val & JSONVALUEMASK;
|
||||
how_many = tape_val & JSON_VALUE_MASK;
|
||||
for (; tape_idx < how_many; tape_idx++) {
|
||||
tape_val = pj.tape[tape_idx];
|
||||
// uint64_t payload = tape_val & JSON_VALUE_MASK;
|
||||
type = (tape_val >> 56);
|
||||
switch (type) {
|
||||
case 'l': // we have a long int
|
||||
answer.number_count++;
|
||||
tapeidx++; // skipping the integer
|
||||
tape_idx++; // skipping the integer
|
||||
break;
|
||||
case 'd': // we have a double
|
||||
answer.number_count++;
|
||||
tapeidx++; // skipping the double
|
||||
tape_idx++; // skipping the double
|
||||
break;
|
||||
case 'n': // we have a null
|
||||
answer.null_count++;
|
||||
|
@ -145,8 +145,8 @@ void sajson_traverse(stat_t &stats, const sajson::value &node) {
|
|||
}
|
||||
}
|
||||
|
||||
__attribute__ ((noinline))
|
||||
stat_t sasjon_computestats(const simdjson::padded_string &p) {
|
||||
__attribute__((noinline)) stat_t
|
||||
sasjon_compute_stats(const simdjson::padded_string &p) {
|
||||
stat_t answer;
|
||||
char *buffer = (char *)malloc(p.size());
|
||||
memcpy(buffer, p.data(), p.size());
|
||||
|
@ -203,8 +203,8 @@ void rapid_traverse(stat_t &stats, const rapidjson::Value &v) {
|
|||
}
|
||||
}
|
||||
|
||||
__attribute__ ((noinline))
|
||||
stat_t rapid_computestats(const simdjson::padded_string &p) {
|
||||
__attribute__((noinline)) stat_t
|
||||
rapid_compute_stats(const simdjson::padded_string &p) {
|
||||
stat_t answer;
|
||||
char *buffer = (char *)malloc(p.size() + 1);
|
||||
memcpy(buffer, p.data(), p.size());
|
||||
|
@ -228,13 +228,13 @@ stat_t rapid_computestats(const simdjson::padded_string &p) {
|
|||
|
||||
int main(int argc, char *argv[]) {
|
||||
bool verbose = false;
|
||||
bool justdata = false;
|
||||
bool just_data = false;
|
||||
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "vt")) != -1)
|
||||
switch (c) {
|
||||
case 't':
|
||||
justdata = true;
|
||||
just_data = true;
|
||||
break;
|
||||
case 'v':
|
||||
verbose = true;
|
||||
|
@ -243,15 +243,18 @@ int main(int argc, char *argv[]) {
|
|||
abort();
|
||||
}
|
||||
if (optind >= argc) {
|
||||
std::cerr << "Using different parsers, we compute the content statistics of "
|
||||
"JSON documents." << std::endl;
|
||||
std::cerr
|
||||
<< "Using different parsers, we compute the content statistics of "
|
||||
"JSON documents."
|
||||
<< std::endl;
|
||||
std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
|
||||
std::cerr << "Or " << argv[0] << " -v <jsonfile>" << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
const char *filename = argv[optind];
|
||||
if (optind + 1 < argc) {
|
||||
std::cerr << "warning: ignoring everything after " << argv[optind + 1] << std::endl;
|
||||
std::cerr << "warning: ignoring everything after " << argv[optind + 1]
|
||||
<< std::endl;
|
||||
}
|
||||
simdjson::padded_string p;
|
||||
try {
|
||||
|
@ -271,17 +274,17 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << p.size() << " B ";
|
||||
std::cout << std::endl;
|
||||
}
|
||||
stat_t s1 = simdjson_computestats(p);
|
||||
stat_t s1 = simdjson_compute_stats(p);
|
||||
if (verbose) {
|
||||
printf("simdjson: ");
|
||||
print_stat(s1);
|
||||
}
|
||||
stat_t s2 = rapid_computestats(p);
|
||||
stat_t s2 = rapid_compute_stats(p);
|
||||
if (verbose) {
|
||||
printf("rapid: ");
|
||||
print_stat(s2);
|
||||
}
|
||||
stat_t s3 = sasjon_computestats(p);
|
||||
stat_t s3 = sasjon_compute_stats(p);
|
||||
if (verbose) {
|
||||
printf("sasjon: ");
|
||||
print_stat(s3);
|
||||
|
@ -290,13 +293,13 @@ int main(int argc, char *argv[]) {
|
|||
assert(stat_equal(s1, s3));
|
||||
int repeat = 50;
|
||||
int volume = p.size();
|
||||
if(justdata) {
|
||||
if (just_data) {
|
||||
printf("name cycles_per_byte cycles_per_byte_err gb_per_s gb_per_s_err \n");
|
||||
}
|
||||
BEST_TIME("simdjson ", simdjson_computestats(p).valid, true, , repeat,
|
||||
volume, !justdata);
|
||||
BEST_TIME("RapidJSON ", rapid_computestats(p).valid, true, , repeat, volume,
|
||||
!justdata);
|
||||
BEST_TIME("sasjon ", sasjon_computestats(p).valid, true, , repeat, volume,
|
||||
!justdata);
|
||||
BEST_TIME("simdjson ", simdjson_compute_stats(p).valid, true, , repeat,
|
||||
volume, !just_data);
|
||||
BEST_TIME("RapidJSON ", rapid_compute_stats(p).valid, true, , repeat, volume,
|
||||
!just_data);
|
||||
BEST_TIME("sasjon ", sasjon_compute_stats(p).valid, true, , repeat, volume,
|
||||
!just_data);
|
||||
}
|
||||
|
|
|
@ -59,12 +59,12 @@ bool fastjson_parse(const char *input) {
|
|||
|
||||
int main(int argc, char *argv[]) {
|
||||
bool verbose = false;
|
||||
bool justdata = false;
|
||||
bool just_data = false;
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "vt")) != -1)
|
||||
switch (c) {
|
||||
case 't':
|
||||
justdata = true;
|
||||
just_data = true;
|
||||
break;
|
||||
case 'v':
|
||||
verbose = true;
|
||||
|
@ -102,24 +102,24 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << std::endl;
|
||||
}
|
||||
simdjson::ParsedJson pj;
|
||||
bool allocok = pj.allocateCapacity(p.size(), 1024);
|
||||
bool allocok = pj.allocate_capacity(p.size(), 1024);
|
||||
|
||||
if (!allocok) {
|
||||
std::cerr << "can't allocate memory" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
int repeat = (p.size() < 1 * 1000 * 1000 ? 1000 : 10);
|
||||
int repeat = (p.size() < 1 * 1000 * 1000 ? 1000 : 10);
|
||||
int volume = p.size();
|
||||
if (justdata) {
|
||||
if (just_data) {
|
||||
printf("%-42s %20s %20s %20s %20s \n", "name", "cycles_per_byte",
|
||||
"cycles_per_byte_err", "gb_per_s", "gb_per_s_err");
|
||||
}
|
||||
if (!justdata)
|
||||
BEST_TIME("simdjson (dynamic mem) ", build_parsed_json(p).isValid(), true, ,
|
||||
repeat, volume, !justdata);
|
||||
if (!just_data)
|
||||
BEST_TIME("simdjson (dynamic mem) ", build_parsed_json(p).is_valid(), true,
|
||||
, repeat, volume, !just_data);
|
||||
// (static alloc)
|
||||
BEST_TIME("simdjson ", json_parse(p, pj), simdjson::SUCCESS, , repeat, volume,
|
||||
!justdata);
|
||||
!just_data);
|
||||
|
||||
rapidjson::Document d;
|
||||
|
||||
|
@ -127,56 +127,57 @@ int main(int argc, char *argv[]) {
|
|||
memcpy(buffer, p.data(), p.size());
|
||||
buffer[p.size()] = '\0';
|
||||
#ifndef ALLPARSER
|
||||
if (!justdata)
|
||||
if (!just_data)
|
||||
#endif
|
||||
BEST_TIME(
|
||||
"RapidJSON ", d.Parse<kParseValidateEncodingFlag>((const char *)buffer)
|
||||
.HasParseError(),
|
||||
false, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
|
||||
BEST_TIME("RapidJSON ",
|
||||
d.Parse<kParseValidateEncodingFlag>((const char *)buffer)
|
||||
.HasParseError(),
|
||||
false, memcpy(buffer, p.data(), p.size()), repeat, volume,
|
||||
!just_data);
|
||||
BEST_TIME("RapidJSON (insitu)",
|
||||
d.ParseInsitu<kParseValidateEncodingFlag>(buffer).HasParseError(),
|
||||
false,
|
||||
memcpy(buffer, p.data(), p.size()) && (buffer[p.size()] = '\0'),
|
||||
repeat, volume, !justdata);
|
||||
repeat, volume, !just_data);
|
||||
#ifndef ALLPARSER
|
||||
if (!justdata)
|
||||
if (!just_data)
|
||||
#endif
|
||||
BEST_TIME("sajson (dynamic mem)",
|
||||
sajson::parse(sajson::dynamic_allocation(),
|
||||
sajson::mutable_string_view(p.size(), buffer))
|
||||
.is_valid(),
|
||||
true, memcpy(buffer, p.data(), p.size()), repeat, volume,
|
||||
!justdata);
|
||||
!just_data);
|
||||
|
||||
size_t astbuffersize = p.size();
|
||||
size_t *ast_buffer = (size_t *)malloc(astbuffersize * sizeof(size_t));
|
||||
size_t ast_buffer_size = p.size();
|
||||
size_t *ast_buffer = (size_t *)malloc(ast_buffer_size * sizeof(size_t));
|
||||
// (static alloc, insitu)
|
||||
BEST_TIME("sajson",
|
||||
sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize),
|
||||
sajson::mutable_string_view(p.size(), buffer))
|
||||
.is_valid(),
|
||||
true, memcpy(buffer, p.data(), p.size()), repeat, volume,
|
||||
!justdata);
|
||||
BEST_TIME(
|
||||
"sajson",
|
||||
sajson::parse(sajson::bounded_allocation(ast_buffer, ast_buffer_size),
|
||||
sajson::mutable_string_view(p.size(), buffer))
|
||||
.is_valid(),
|
||||
true, memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data);
|
||||
#ifdef ALLPARSER
|
||||
std::string json11err;
|
||||
BEST_TIME("dropbox (json11) ",
|
||||
((json11::Json::parse(buffer, json11err).is_null()) ||
|
||||
(!json11err.empty())),
|
||||
false, memcpy(buffer, p.data(), p.size()), repeat, volume,
|
||||
!justdata);
|
||||
!just_data);
|
||||
|
||||
BEST_TIME("fastjson ", fastjson_parse(buffer), true,
|
||||
memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
|
||||
memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data);
|
||||
JsonValue value;
|
||||
JsonAllocator allocator;
|
||||
char *endptr;
|
||||
BEST_TIME("gason ", jsonParse(buffer, &endptr, &value, allocator),
|
||||
JSON_OK, memcpy(buffer, p.data(), p.size()), repeat, volume,
|
||||
!justdata);
|
||||
!just_data);
|
||||
void *state;
|
||||
BEST_TIME("ultrajson ",
|
||||
(UJDecode(buffer, p.size(), NULL, &state) == NULL), false,
|
||||
memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
|
||||
memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data);
|
||||
|
||||
{
|
||||
std::unique_ptr<jsmntok_t[]> tokens =
|
||||
|
@ -185,32 +186,33 @@ int main(int argc, char *argv[]) {
|
|||
jsmn_init(&parser);
|
||||
memcpy(buffer, p.data(), p.size());
|
||||
buffer[p.size()] = '\0';
|
||||
BEST_TIME("jsmn ", (jsmn_parse(&parser, buffer, p.size(),
|
||||
tokens.get(), p.size()) > 0),
|
||||
true, jsmn_init(&parser), repeat, volume, !justdata);
|
||||
BEST_TIME(
|
||||
"jsmn ",
|
||||
(jsmn_parse(&parser, buffer, p.size(), tokens.get(), p.size()) > 0),
|
||||
true, jsmn_init(&parser), repeat, volume, !just_data);
|
||||
}
|
||||
memcpy(buffer, p.data(), p.size());
|
||||
buffer[p.size()] = '\0';
|
||||
cJSON *tree = cJSON_Parse(buffer);
|
||||
BEST_TIME("cJSON ", ((tree = cJSON_Parse(buffer)) != NULL), true,
|
||||
cJSON_Delete(tree), repeat, volume, !justdata);
|
||||
cJSON_Delete(tree), repeat, volume, !just_data);
|
||||
cJSON_Delete(tree);
|
||||
|
||||
Json::CharReaderBuilder b;
|
||||
Json::CharReader *jsoncppreader = b.newCharReader();
|
||||
Json::CharReader *json_cpp_reader = b.newCharReader();
|
||||
Json::Value root;
|
||||
Json::String errs;
|
||||
BEST_TIME("jsoncpp ",
|
||||
jsoncppreader->parse(buffer, buffer + volume, &root, &errs), true, ,
|
||||
repeat, volume, !justdata);
|
||||
delete jsoncppreader;
|
||||
json_cpp_reader->parse(buffer, buffer + volume, &root, &errs), true,
|
||||
, repeat, volume, !just_data);
|
||||
delete json_cpp_reader;
|
||||
#endif
|
||||
if (!justdata)
|
||||
if (!just_data)
|
||||
BEST_TIME("memcpy ",
|
||||
(memcpy(buffer, p.data(), p.size()) == buffer), true, , repeat,
|
||||
volume, !justdata);
|
||||
volume, !just_data);
|
||||
#ifdef __linux__
|
||||
if (!justdata) {
|
||||
if (!just_data) {
|
||||
printf("\n \n <doing additional analysis with performance counters (Linux "
|
||||
"only)>\n");
|
||||
std::vector<int> evts;
|
||||
|
@ -265,7 +267,7 @@ int main(int argc, char *argv[]) {
|
|||
for (int i = 0; i < repeat; i++) {
|
||||
memcpy(buffer, p.data(), p.size());
|
||||
unified.start();
|
||||
if (sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize),
|
||||
if (sajson::parse(sajson::bounded_allocation(ast_buffer, ast_buffer_size),
|
||||
sajson::mutable_string_view(p.size(), buffer))
|
||||
.is_valid() != true)
|
||||
printf("bug\n");
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
#include <iostream>
|
||||
#ifndef _MSC_VER
|
||||
#ifndef _MSC_VER
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include "simdjson/jsonioutil.h"
|
||||
|
@ -29,7 +29,7 @@ struct stat_s {
|
|||
size_t float_count;
|
||||
size_t string_count;
|
||||
size_t backslash_count;
|
||||
size_t nonasciibyte_count;
|
||||
size_t non_ascii_byte_count;
|
||||
size_t object_count;
|
||||
size_t array_count;
|
||||
size_t null_count;
|
||||
|
@ -42,16 +42,17 @@ struct stat_s {
|
|||
|
||||
using stat_t = struct stat_s;
|
||||
|
||||
stat_t simdjson_computestats(const simdjson::padded_string &p) {
|
||||
stat_t simdjson_compute_stats(const simdjson::padded_string &p) {
|
||||
stat_t answer;
|
||||
simdjson::ParsedJson pj = simdjson::build_parsed_json(p);
|
||||
answer.valid = pj.isValid();
|
||||
answer.valid = pj.is_valid();
|
||||
if (!answer.valid) {
|
||||
return answer;
|
||||
}
|
||||
answer.backslash_count = count_backslash(reinterpret_cast<const uint8_t *>(p.data()), p.size());
|
||||
answer.nonasciibyte_count =
|
||||
count_nonasciibytes(reinterpret_cast<const uint8_t *>(p.data()), p.size());
|
||||
answer.backslash_count =
|
||||
count_backslash(reinterpret_cast<const uint8_t *>(p.data()), p.size());
|
||||
answer.non_ascii_byte_count = count_nonasciibytes(
|
||||
reinterpret_cast<const uint8_t *>(p.data()), p.size());
|
||||
answer.byte_count = p.size();
|
||||
answer.integer_count = 0;
|
||||
answer.float_count = 0;
|
||||
|
@ -62,24 +63,24 @@ stat_t simdjson_computestats(const simdjson::padded_string &p) {
|
|||
answer.false_count = 0;
|
||||
answer.string_count = 0;
|
||||
answer.structural_indexes_count = pj.n_structural_indexes;
|
||||
size_t tapeidx = 0;
|
||||
uint64_t tape_val = pj.tape[tapeidx++];
|
||||
size_t tape_idx = 0;
|
||||
uint64_t tape_val = pj.tape[tape_idx++];
|
||||
uint8_t type = (tape_val >> 56);
|
||||
size_t howmany = 0;
|
||||
size_t how_many = 0;
|
||||
assert(type == 'r');
|
||||
howmany = tape_val & JSONVALUEMASK;
|
||||
for (; tapeidx < howmany; tapeidx++) {
|
||||
tape_val = pj.tape[tapeidx];
|
||||
// uint64_t payload = tape_val & JSONVALUEMASK;
|
||||
how_many = tape_val & JSON_VALUE_MASK;
|
||||
for (; tape_idx < how_many; tape_idx++) {
|
||||
tape_val = pj.tape[tape_idx];
|
||||
// uint64_t payload = tape_val & JSON_VALUE_MASK;
|
||||
type = (tape_val >> 56);
|
||||
switch (type) {
|
||||
case 'l': // we have a long int
|
||||
answer.integer_count++;
|
||||
tapeidx++; // skipping the integer
|
||||
tape_idx++; // skipping the integer
|
||||
break;
|
||||
case 'd': // we have a double
|
||||
answer.float_count++;
|
||||
tapeidx++; // skipping the double
|
||||
tape_idx++; // skipping the double
|
||||
break;
|
||||
case 'n': // we have a null
|
||||
answer.null_count++;
|
||||
|
@ -112,14 +113,14 @@ stat_t simdjson_computestats(const simdjson::padded_string &p) {
|
|||
|
||||
int main(int argc, char *argv[]) {
|
||||
#ifndef _MSC_VER
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "")) != -1) {
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "")) != -1) {
|
||||
switch (c) {
|
||||
|
||||
default:
|
||||
abort();
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
int optind = 1;
|
||||
#endif
|
||||
|
@ -141,30 +142,30 @@ int main(int argc, char *argv[]) {
|
|||
std::cerr << "Could not load the file " << filename << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
stat_t s = simdjson_computestats(p);
|
||||
stat_t s = simdjson_compute_stats(p);
|
||||
if (!s.valid) {
|
||||
std::cerr << "not a valid JSON" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
printf("# integer_count float_count string_count backslash_count "
|
||||
"nonasciibyte_count object_count array_count null_count true_count "
|
||||
"non_ascii_byte_count object_count array_count null_count true_count "
|
||||
"false_count byte_count structural_indexes_count ");
|
||||
#ifdef __linux__
|
||||
printf(
|
||||
" stage1_cycle_count stage1_instruction_count stage2_cycle_count "
|
||||
" stage2_instruction_count stage3_cycle_count stage3_instruction_count ");
|
||||
printf(" stage1_cycle_count stage1_instruction_count stage2_cycle_count "
|
||||
" stage2_instruction_count stage3_cycle_count "
|
||||
"stage3_instruction_count ");
|
||||
#else
|
||||
printf("(you are not under linux, so perf counters are disaabled)");
|
||||
#endif
|
||||
printf("\n");
|
||||
printf("%zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu ", s.integer_count,
|
||||
s.float_count, s.string_count, s.backslash_count, s.nonasciibyte_count,
|
||||
s.object_count, s.array_count, s.null_count, s.true_count,
|
||||
s.false_count, s.byte_count, s.structural_indexes_count);
|
||||
s.float_count, s.string_count, s.backslash_count,
|
||||
s.non_ascii_byte_count, s.object_count, s.array_count, s.null_count,
|
||||
s.true_count, s.false_count, s.byte_count, s.structural_indexes_count);
|
||||
#ifdef __linux__
|
||||
simdjson::ParsedJson pj;
|
||||
bool allocok = pj.allocateCapacity(p.size());
|
||||
bool allocok = pj.allocate_capacity(p.size());
|
||||
if (!allocok) {
|
||||
std::cerr << "failed to allocate memory" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
|
@ -180,20 +181,22 @@ int main(int argc, char *argv[]) {
|
|||
results.resize(evts.size());
|
||||
for (uint32_t i = 0; i < iterations; i++) {
|
||||
unified.start();
|
||||
// The default template is simdjson::architecture::native.
|
||||
bool isok = (simdjson::find_structural_bits<>(p.data(), p.size(), pj) == simdjson::SUCCESS);
|
||||
// The default template is simdjson::Architecture::NATIVE.
|
||||
bool isok = (simdjson::find_structural_bits<>(p.data(), p.size(), pj) ==
|
||||
simdjson::SUCCESS);
|
||||
unified.end(results);
|
||||
|
||||
|
||||
cy1 += results[0];
|
||||
cl1 += results[1];
|
||||
|
||||
|
||||
unified.start();
|
||||
isok = isok && (simdjson::SUCCESS == unified_machine(p.data(), p.size(), pj));
|
||||
isok =
|
||||
isok && (simdjson::SUCCESS == unified_machine(p.data(), p.size(), pj));
|
||||
unified.end(results);
|
||||
|
||||
|
||||
cy2 += results[0];
|
||||
cl2 += results[1];
|
||||
if(!isok) {
|
||||
if (!isok) {
|
||||
std::cerr << "failure?" << std::endl;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -10,11 +10,11 @@
|
|||
|
||||
// the input buf should be readable up to buf + SIMDJSON_PADDING
|
||||
#ifdef __AVX2__
|
||||
#define SIMDJSON_PADDING sizeof(__m256i)
|
||||
#define SIMDJSON_PADDING sizeof(__m256i)
|
||||
#else
|
||||
// this is a stopgap; there should be a better description of the
|
||||
// main loop and its behavior that abstracts over this
|
||||
#define SIMDJSON_PADDING 32
|
||||
#define SIMDJSON_PADDING 32
|
||||
#endif
|
||||
|
||||
#ifndef _MSC_VER
|
||||
|
@ -23,7 +23,6 @@
|
|||
#define SIMDJSON_USE_COMPUTED_GOTO
|
||||
#endif
|
||||
|
||||
|
||||
// Align to N-byte boundary
|
||||
#define ROUNDUP_N(a, n) (((a) + ((n)-1)) & ~((n)-1))
|
||||
#define ROUNDDOWN_N(a, n) ((a) & ~((n)-1))
|
||||
|
@ -49,13 +48,13 @@
|
|||
|
||||
#else
|
||||
|
||||
// For non-Visual Studio compilers, we may assume that same-page buffer overrun is fine.
|
||||
// However, it will make it difficult to be "valgrind clean".
|
||||
// For non-Visual Studio compilers, we may assume that same-page buffer overrun
|
||||
// is fine. However, it will make it difficult to be "valgrind clean".
|
||||
//#ifndef ALLOW_SAME_PAGE_BUFFER_OVERRUN
|
||||
//#define ALLOW_SAME_PAGE_BUFFER_OVERRUN true
|
||||
//#else
|
||||
#define ALLOW_SAME_PAGE_BUFFER_OVERRUN false
|
||||
//#endif
|
||||
//#endif
|
||||
|
||||
// The following is likely unnecessarily complex.
|
||||
#ifdef __SANITIZE_ADDRESS__
|
||||
|
@ -63,16 +62,18 @@
|
|||
#define ALLOW_SAME_PAGE_BUFFER_OVERRUN false
|
||||
#elif defined(__has_feature)
|
||||
// we have CLANG?
|
||||
// todo: if we're setting ALLOW_SAME_PAGE_BUFFER_OVERRUN to false, why do we have a non-empty qualifier?
|
||||
# if (__has_feature(address_sanitizer))
|
||||
#define ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER __attribute__((no_sanitize("address")))
|
||||
# endif
|
||||
#endif
|
||||
// todo: if we're setting ALLOW_SAME_PAGE_BUFFER_OVERRUN to false, why do we
|
||||
// have a non-empty qualifier?
|
||||
#if (__has_feature(address_sanitizer))
|
||||
#define ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER \
|
||||
__attribute__((no_sanitize("address")))
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__has_feature)
|
||||
# if (__has_feature(memory_sanitizer))
|
||||
#if (__has_feature(memory_sanitizer))
|
||||
#define LENIENT_MEM_SANITIZER __attribute__((no_sanitize("memory")))
|
||||
# endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define really_inline inline __attribute__((always_inline, unused))
|
||||
|
@ -88,7 +89,7 @@
|
|||
#define unlikely(x) __builtin_expect(!!(x), 0)
|
||||
#endif
|
||||
|
||||
#endif // MSC_VER
|
||||
#endif // MSC_VER
|
||||
|
||||
// if it does not apply, make it an empty macro
|
||||
#ifndef ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/* From https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h
|
||||
Highly modified.
|
||||
/* From
|
||||
https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h
|
||||
Highly modified.
|
||||
|
||||
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
|
||||
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
|
||||
|
@ -7,9 +8,10 @@ Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
|
|||
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
|
||||
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
|
||||
Copyright (c) 2011-2013 NYU (Clement Farabet)
|
||||
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
|
||||
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
|
||||
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
|
||||
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
|
||||
Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute
|
||||
(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
|
||||
Samy Bengio, Johnny Mariethoz)
|
||||
|
||||
All rights reserved.
|
||||
|
||||
|
@ -23,8 +25,8 @@ modification, are permitted provided that the following conditions are met:
|
|||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
|
||||
and IDIAP Research Institute nor the names of its contributors may be
|
||||
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories
|
||||
America and IDIAP Research Institute nor the names of its contributors may be
|
||||
used to endorse or promote products derived from this software without
|
||||
specific prior written permission.
|
||||
|
||||
|
@ -60,51 +62,48 @@ constexpr uint32_t cpuid_bmi2_bit = 1 << 8; // bit 8 of EBX for EAX=0x7
|
|||
constexpr uint32_t cpuid_sse42_bit = 1 << 20; // bit 20 of ECX for EAX=0x1
|
||||
constexpr uint32_t cpuid_pclmulqdq_bit = 1 << 1; // bit 1 of ECX for EAX=0x1
|
||||
|
||||
enum SIMDExtensions {
|
||||
DEFAULT = 0x0,
|
||||
NEON = 0x1,
|
||||
AVX2 = 0x4,
|
||||
SSE42 = 0x8,
|
||||
enum instruction_set {
|
||||
DEFAULT = 0x0,
|
||||
NEON = 0x1,
|
||||
AVX2 = 0x4,
|
||||
SSE42 = 0x8,
|
||||
PCLMULQDQ = 0x10,
|
||||
BMI1 = 0x20,
|
||||
BMI2 = 0x40
|
||||
BMI1 = 0x20,
|
||||
BMI2 = 0x40
|
||||
};
|
||||
|
||||
#if defined(__arm__) || defined(__aarch64__) // incl. armel, armhf, arm64
|
||||
|
||||
#if defined(__NEON__)
|
||||
#if defined(__NEON__)
|
||||
|
||||
static inline uint32_t detect_supported_architectures()
|
||||
{
|
||||
return SIMDExtensions::NEON;
|
||||
static inline uint32_t detect_supported_architectures() {
|
||||
return instruction_set::NEON;
|
||||
}
|
||||
|
||||
#else //ARM without NEON
|
||||
#else // ARM without NEON
|
||||
|
||||
static inline uint32_t detect_supported_architectures()
|
||||
{
|
||||
return SIMDExtensions::DEFAULT;
|
||||
static inline uint32_t detect_supported_architectures() {
|
||||
return instruction_set::DEFAULT;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#else // x86
|
||||
static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
|
||||
{
|
||||
#endif
|
||||
|
||||
#else // x86
|
||||
static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx,
|
||||
uint32_t *edx) {
|
||||
#if defined(_MSC_VER)
|
||||
int cpuInfo[4];
|
||||
__cpuid(cpuInfo, *eax);
|
||||
*eax = cpuInfo[0];
|
||||
*ebx = cpuInfo[1];
|
||||
*ecx = cpuInfo[2];
|
||||
*edx = cpuInfo[3];
|
||||
int cpu_info[4];
|
||||
__cpuid(cpu_info, *eax);
|
||||
*eax = cpu_info[0];
|
||||
*ebx = cpu_info[1];
|
||||
*ecx = cpu_info[2];
|
||||
*edx = cpu_info[3];
|
||||
#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
|
||||
uint32_t level = *eax;
|
||||
__get_cpuid (level, eax, ebx, ecx, edx);
|
||||
__get_cpuid(level, eax, ebx, ecx, edx);
|
||||
#else
|
||||
uint32_t a = *eax, b, c = *ecx, d;
|
||||
asm volatile ( "cpuid\n\t"
|
||||
: "+a"(a), "=b"(b), "+c"(c), "=d"(d) );
|
||||
asm volatile("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d));
|
||||
*eax = a;
|
||||
*ebx = b;
|
||||
*ecx = c;
|
||||
|
@ -112,10 +111,9 @@ static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *
|
|||
#endif
|
||||
}
|
||||
|
||||
static inline uint32_t detect_supported_architectures()
|
||||
{
|
||||
static inline uint32_t detect_supported_architectures() {
|
||||
uint32_t eax, ebx, ecx, edx;
|
||||
uint32_t hostSimdExts = 0x0;
|
||||
uint32_t host_isa = 0x0;
|
||||
|
||||
// ECX for EAX=0x7
|
||||
eax = 0x7;
|
||||
|
@ -123,15 +121,15 @@ static inline uint32_t detect_supported_architectures()
|
|||
cpuid(&eax, &ebx, &ecx, &edx);
|
||||
|
||||
if (ebx & cpuid_avx2_bit) {
|
||||
hostSimdExts |= SIMDExtensions::AVX2;
|
||||
host_isa |= instruction_set::AVX2;
|
||||
}
|
||||
|
||||
if (ebx & cpuid_bmi1_bit) {
|
||||
hostSimdExts |= SIMDExtensions::BMI1;
|
||||
host_isa |= instruction_set::BMI1;
|
||||
}
|
||||
|
||||
if (ebx & cpuid_bmi2_bit) {
|
||||
hostSimdExts |= SIMDExtensions::BMI2;
|
||||
host_isa |= instruction_set::BMI2;
|
||||
}
|
||||
|
||||
// ECX for EAX=0x1
|
||||
|
@ -139,16 +137,16 @@ static inline uint32_t detect_supported_architectures()
|
|||
cpuid(&eax, &ebx, &ecx, &edx);
|
||||
|
||||
if (ecx & cpuid_sse42_bit) {
|
||||
hostSimdExts |= SIMDExtensions::SSE42;
|
||||
host_isa |= instruction_set::SSE42;
|
||||
}
|
||||
|
||||
if (ecx & cpuid_pclmulqdq_bit) {
|
||||
hostSimdExts |= SIMDExtensions::PCLMULQDQ;
|
||||
host_isa |= instruction_set::PCLMULQDQ;
|
||||
}
|
||||
|
||||
return hostSimdExts;
|
||||
return host_isa;
|
||||
}
|
||||
|
||||
#endif // end SIMD extension detection code
|
||||
}
|
||||
} // namespace simdjson
|
||||
#endif
|
||||
|
|
|
@ -35,7 +35,6 @@ really_inline uint32_t is_not_structural_or_whitespace_or_null(uint8_t c) {
|
|||
return structural_or_whitespace_or_null_negated[c];
|
||||
}
|
||||
|
||||
|
||||
const uint32_t structural_or_whitespace_negated[256] = {
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
|
@ -76,7 +75,6 @@ really_inline uint32_t is_structural_or_whitespace_or_null(uint8_t c) {
|
|||
return structural_or_whitespace_or_null[c];
|
||||
}
|
||||
|
||||
|
||||
const uint32_t structural_or_whitespace[256] = {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
|
||||
|
@ -94,7 +92,7 @@ really_inline uint32_t is_structural_or_whitespace(uint8_t c) {
|
|||
return structural_or_whitespace[c];
|
||||
}
|
||||
|
||||
const uint32_t digittoval32[886] = {
|
||||
const uint32_t digit_to_val32[886] = {
|
||||
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
|
||||
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
|
||||
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
|
||||
|
@ -103,7 +101,7 @@ const uint32_t digittoval32[886] = {
|
|||
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
|
||||
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
|
||||
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
|
||||
0x0, 0x1, 0x2, 0x3, 0x4, 0x5,
|
||||
0x0, 0x1, 0x2, 0x3, 0x4, 0x5,
|
||||
0x6, 0x7, 0x8, 0x9, 0xFFFFFFFF, 0xFFFFFFFF,
|
||||
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa,
|
||||
0xb, 0xc, 0xd, 0xe, 0xf, 0xFFFFFFFF,
|
||||
|
@ -138,7 +136,7 @@ const uint32_t digittoval32[886] = {
|
|||
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
|
||||
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
|
||||
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
|
||||
0x0, 0x10, 0x20, 0x30, 0x40, 0x50,
|
||||
0x0, 0x10, 0x20, 0x30, 0x40, 0x50,
|
||||
0x60, 0x70, 0x80, 0x90, 0xFFFFFFFF, 0xFFFFFFFF,
|
||||
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa0,
|
||||
0xb0, 0xc0, 0xd0, 0xe0, 0xf0, 0xFFFFFFFF,
|
||||
|
@ -173,7 +171,7 @@ const uint32_t digittoval32[886] = {
|
|||
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
|
||||
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
|
||||
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
|
||||
0x0, 0x100, 0x200, 0x300, 0x400, 0x500,
|
||||
0x0, 0x100, 0x200, 0x300, 0x400, 0x500,
|
||||
0x600, 0x700, 0x800, 0x900, 0xFFFFFFFF, 0xFFFFFFFF,
|
||||
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa00,
|
||||
0xb00, 0xc00, 0xd00, 0xe00, 0xf00, 0xFFFFFFFF,
|
||||
|
@ -208,7 +206,7 @@ const uint32_t digittoval32[886] = {
|
|||
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
|
||||
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
|
||||
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
|
||||
0x0, 0x1000, 0x2000, 0x3000, 0x4000, 0x5000,
|
||||
0x0, 0x1000, 0x2000, 0x3000, 0x4000, 0x5000,
|
||||
0x6000, 0x7000, 0x8000, 0x9000, 0xFFFFFFFF, 0xFFFFFFFF,
|
||||
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa000,
|
||||
0xb000, 0xc000, 0xd000, 0xe000, 0xf000, 0xFFFFFFFF,
|
||||
|
@ -244,15 +242,17 @@ const uint32_t digittoval32[886] = {
|
|||
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
|
||||
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF};
|
||||
// returns a value with the high 16 bits set if not valid
|
||||
// otherwise returns the conversion of the 4 hex digits at src into the bottom 16 bits of the 32-bit
|
||||
// return register
|
||||
// otherwise returns the conversion of the 4 hex digits at src into the bottom
|
||||
// 16 bits of the 32-bit return register
|
||||
//
|
||||
// see https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/
|
||||
static inline uint32_t hex_to_u32_nocheck(const uint8_t *src) {// strictly speaking, static inline is a C-ism
|
||||
uint32_t v1 = digittoval32[630 + src[0]];
|
||||
uint32_t v2 = digittoval32[420 + src[1]];
|
||||
uint32_t v3 = digittoval32[210 + src[2]];
|
||||
uint32_t v4 = digittoval32[0 + src[3]];
|
||||
// see
|
||||
// https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/
|
||||
static inline uint32_t hex_to_u32_nocheck(
|
||||
const uint8_t *src) { // strictly speaking, static inline is a C-ism
|
||||
uint32_t v1 = digit_to_val32[630 + src[0]];
|
||||
uint32_t v2 = digit_to_val32[420 + src[1]];
|
||||
uint32_t v3 = digit_to_val32[210 + src[2]];
|
||||
uint32_t v4 = digit_to_val32[0 + src[3]];
|
||||
return v1 | v2 | v3 | v4;
|
||||
}
|
||||
|
||||
|
@ -272,19 +272,21 @@ inline size_t codepoint_to_utf8(uint32_t cp, uint8_t *c) {
|
|||
if (cp <= 0x7F) {
|
||||
c[0] = cp;
|
||||
return 1; // ascii
|
||||
} if (cp <= 0x7FF) {
|
||||
}
|
||||
if (cp <= 0x7FF) {
|
||||
c[0] = (cp >> 6) + 192;
|
||||
c[1] = (cp & 63) + 128;
|
||||
return 2; // universal plane
|
||||
// Surrogates are treated elsewhere...
|
||||
//} //else if (0xd800 <= cp && cp <= 0xdfff) {
|
||||
// return 0; // surrogates // could put assert here
|
||||
// Surrogates are treated elsewhere...
|
||||
//} //else if (0xd800 <= cp && cp <= 0xdfff) {
|
||||
// return 0; // surrogates // could put assert here
|
||||
} else if (cp <= 0xFFFF) {
|
||||
c[0] = (cp >> 12) + 224;
|
||||
c[1] = ((cp >> 6) & 63) + 128;
|
||||
c[2] = (cp & 63) + 128;
|
||||
return 3;
|
||||
} else if (cp <= 0x10FFFF) { // if you know you have a valid code point, this is not needed
|
||||
} else if (cp <= 0x10FFFF) { // if you know you have a valid code point, this
|
||||
// is not needed
|
||||
c[0] = (cp >> 18) + 240;
|
||||
c[1] = ((cp >> 12) & 63) + 128;
|
||||
c[2] = ((cp >> 6) & 63) + 128;
|
||||
|
@ -294,6 +296,6 @@ inline size_t codepoint_to_utf8(uint32_t cp, uint8_t *c) {
|
|||
// will return 0 when the code point was too large.
|
||||
return 0; // bad r
|
||||
}
|
||||
}
|
||||
} // namespace simdjson
|
||||
|
||||
#endif
|
||||
|
|
|
@ -10,40 +10,40 @@ namespace simdjson {
|
|||
static inline void print_with_escapes(const unsigned char *src) {
|
||||
while (*src) {
|
||||
switch (*src) {
|
||||
case '\b':
|
||||
putchar('\\');
|
||||
putchar('b');
|
||||
break;
|
||||
case '\f':
|
||||
putchar('\\');
|
||||
putchar('f');
|
||||
break;
|
||||
case '\n':
|
||||
putchar('\\');
|
||||
putchar('n');
|
||||
break;
|
||||
case '\r':
|
||||
putchar('\\');
|
||||
putchar('r');
|
||||
break;
|
||||
case '\"':
|
||||
putchar('\\');
|
||||
putchar('"');
|
||||
break;
|
||||
case '\t':
|
||||
putchar('\\');
|
||||
putchar('t');
|
||||
break;
|
||||
case '\\':
|
||||
putchar('\\');
|
||||
putchar('\\');
|
||||
break;
|
||||
default:
|
||||
if (*src <= 0x1F) {
|
||||
printf("\\u%04x", *src);
|
||||
} else {
|
||||
putchar(*src);
|
||||
}
|
||||
case '\b':
|
||||
putchar('\\');
|
||||
putchar('b');
|
||||
break;
|
||||
case '\f':
|
||||
putchar('\\');
|
||||
putchar('f');
|
||||
break;
|
||||
case '\n':
|
||||
putchar('\\');
|
||||
putchar('n');
|
||||
break;
|
||||
case '\r':
|
||||
putchar('\\');
|
||||
putchar('r');
|
||||
break;
|
||||
case '\"':
|
||||
putchar('\\');
|
||||
putchar('"');
|
||||
break;
|
||||
case '\t':
|
||||
putchar('\\');
|
||||
putchar('t');
|
||||
break;
|
||||
case '\\':
|
||||
putchar('\\');
|
||||
putchar('\\');
|
||||
break;
|
||||
default:
|
||||
if (*src <= 0x1F) {
|
||||
printf("\\u%04x", *src);
|
||||
} else {
|
||||
putchar(*src);
|
||||
}
|
||||
}
|
||||
src++;
|
||||
}
|
||||
|
@ -54,43 +54,43 @@ static inline void print_with_escapes(const unsigned char *src,
|
|||
std::ostream &os) {
|
||||
while (*src) {
|
||||
switch (*src) {
|
||||
case '\b':
|
||||
os << '\\';
|
||||
os << 'b';
|
||||
break;
|
||||
case '\f':
|
||||
os << '\\';
|
||||
os << 'f';
|
||||
break;
|
||||
case '\n':
|
||||
os << '\\';
|
||||
os << 'n';
|
||||
break;
|
||||
case '\r':
|
||||
os << '\\';
|
||||
os << 'r';
|
||||
break;
|
||||
case '\"':
|
||||
os << '\\';
|
||||
os << '"';
|
||||
break;
|
||||
case '\t':
|
||||
os << '\\';
|
||||
os << 't';
|
||||
break;
|
||||
case '\\':
|
||||
os << '\\';
|
||||
os << '\\';
|
||||
break;
|
||||
default:
|
||||
if (*src <= 0x1F) {
|
||||
std::ios::fmtflags f(os.flags());
|
||||
os << std::hex << std::setw(4) << std::setfill('0')
|
||||
<< static_cast<int>(*src);
|
||||
os.flags(f);
|
||||
} else {
|
||||
os << *src;
|
||||
}
|
||||
case '\b':
|
||||
os << '\\';
|
||||
os << 'b';
|
||||
break;
|
||||
case '\f':
|
||||
os << '\\';
|
||||
os << 'f';
|
||||
break;
|
||||
case '\n':
|
||||
os << '\\';
|
||||
os << 'n';
|
||||
break;
|
||||
case '\r':
|
||||
os << '\\';
|
||||
os << 'r';
|
||||
break;
|
||||
case '\"':
|
||||
os << '\\';
|
||||
os << '"';
|
||||
break;
|
||||
case '\t':
|
||||
os << '\\';
|
||||
os << 't';
|
||||
break;
|
||||
case '\\':
|
||||
os << '\\';
|
||||
os << '\\';
|
||||
break;
|
||||
default:
|
||||
if (*src <= 0x1F) {
|
||||
std::ios::fmtflags f(os.flags());
|
||||
os << std::hex << std::setw(4) << std::setfill('0')
|
||||
<< static_cast<int>(*src);
|
||||
os.flags(f);
|
||||
} else {
|
||||
os << *src;
|
||||
}
|
||||
}
|
||||
src++;
|
||||
}
|
||||
|
@ -101,40 +101,40 @@ static inline void print_with_escapes(const unsigned char *src, size_t len) {
|
|||
const unsigned char *finalsrc = src + len;
|
||||
while (src < finalsrc) {
|
||||
switch (*src) {
|
||||
case '\b':
|
||||
putchar('\\');
|
||||
putchar('b');
|
||||
break;
|
||||
case '\f':
|
||||
putchar('\\');
|
||||
putchar('f');
|
||||
break;
|
||||
case '\n':
|
||||
putchar('\\');
|
||||
putchar('n');
|
||||
break;
|
||||
case '\r':
|
||||
putchar('\\');
|
||||
putchar('r');
|
||||
break;
|
||||
case '\"':
|
||||
putchar('\\');
|
||||
putchar('"');
|
||||
break;
|
||||
case '\t':
|
||||
putchar('\\');
|
||||
putchar('t');
|
||||
break;
|
||||
case '\\':
|
||||
putchar('\\');
|
||||
putchar('\\');
|
||||
break;
|
||||
default:
|
||||
if (*src <= 0x1F) {
|
||||
printf("\\u%04x", *src);
|
||||
} else {
|
||||
putchar(*src);
|
||||
}
|
||||
case '\b':
|
||||
putchar('\\');
|
||||
putchar('b');
|
||||
break;
|
||||
case '\f':
|
||||
putchar('\\');
|
||||
putchar('f');
|
||||
break;
|
||||
case '\n':
|
||||
putchar('\\');
|
||||
putchar('n');
|
||||
break;
|
||||
case '\r':
|
||||
putchar('\\');
|
||||
putchar('r');
|
||||
break;
|
||||
case '\"':
|
||||
putchar('\\');
|
||||
putchar('"');
|
||||
break;
|
||||
case '\t':
|
||||
putchar('\\');
|
||||
putchar('t');
|
||||
break;
|
||||
case '\\':
|
||||
putchar('\\');
|
||||
putchar('\\');
|
||||
break;
|
||||
default:
|
||||
if (*src <= 0x1F) {
|
||||
printf("\\u%04x", *src);
|
||||
} else {
|
||||
putchar(*src);
|
||||
}
|
||||
}
|
||||
src++;
|
||||
}
|
||||
|
@ -146,43 +146,43 @@ static inline void print_with_escapes(const unsigned char *src,
|
|||
const unsigned char *finalsrc = src + len;
|
||||
while (src < finalsrc) {
|
||||
switch (*src) {
|
||||
case '\b':
|
||||
os << '\\';
|
||||
os << 'b';
|
||||
break;
|
||||
case '\f':
|
||||
os << '\\';
|
||||
os << 'f';
|
||||
break;
|
||||
case '\n':
|
||||
os << '\\';
|
||||
os << 'n';
|
||||
break;
|
||||
case '\r':
|
||||
os << '\\';
|
||||
os << 'r';
|
||||
break;
|
||||
case '\"':
|
||||
os << '\\';
|
||||
os << '"';
|
||||
break;
|
||||
case '\t':
|
||||
os << '\\';
|
||||
os << 't';
|
||||
break;
|
||||
case '\\':
|
||||
os << '\\';
|
||||
os << '\\';
|
||||
break;
|
||||
default:
|
||||
if (*src <= 0x1F) {
|
||||
std::ios::fmtflags f(os.flags());
|
||||
os << std::hex << std::setw(4) << std::setfill('0')
|
||||
<< static_cast<int>(*src);
|
||||
os.flags(f);
|
||||
} else {
|
||||
os << *src;
|
||||
}
|
||||
case '\b':
|
||||
os << '\\';
|
||||
os << 'b';
|
||||
break;
|
||||
case '\f':
|
||||
os << '\\';
|
||||
os << 'f';
|
||||
break;
|
||||
case '\n':
|
||||
os << '\\';
|
||||
os << 'n';
|
||||
break;
|
||||
case '\r':
|
||||
os << '\\';
|
||||
os << 'r';
|
||||
break;
|
||||
case '\"':
|
||||
os << '\\';
|
||||
os << '"';
|
||||
break;
|
||||
case '\t':
|
||||
os << '\\';
|
||||
os << 't';
|
||||
break;
|
||||
case '\\':
|
||||
os << '\\';
|
||||
os << '\\';
|
||||
break;
|
||||
default:
|
||||
if (*src <= 0x1F) {
|
||||
std::ios::fmtflags f(os.flags());
|
||||
os << std::hex << std::setw(4) << std::setfill('0')
|
||||
<< static_cast<int>(*src);
|
||||
os.flags(f);
|
||||
} else {
|
||||
os << *src;
|
||||
}
|
||||
}
|
||||
src++;
|
||||
}
|
||||
|
@ -196,7 +196,7 @@ static inline void print_with_escapes(const char *src, std::ostream &os,
|
|||
size_t len) {
|
||||
print_with_escapes(reinterpret_cast<const unsigned char *>(src), os, len);
|
||||
}
|
||||
}
|
||||
} // namespace simdjson
|
||||
|
||||
#
|
||||
#endif
|
||||
|
|
|
@ -8,10 +8,8 @@
|
|||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
|
||||
#include "simdjson/padded_string.h"
|
||||
|
||||
|
||||
namespace simdjson {
|
||||
|
||||
// load a file in memory...
|
||||
|
@ -20,15 +18,15 @@ namespace simdjson {
|
|||
// first element of the pair is a string (null terminated)
|
||||
// whereas the second element is the length.
|
||||
// caller is responsible for freeing it (aligned_free((void*)result.data()))
|
||||
//
|
||||
//
|
||||
// throws an exception if the file cannot be opened, use try/catch
|
||||
// try {
|
||||
// p = get_corpus(filename);
|
||||
// } catch (const std::exception& e) {
|
||||
// } catch (const std::exception& e) {
|
||||
// aligned_free((void*)p.data());
|
||||
// std::cout << "Could not load the file " << filename << std::endl;
|
||||
// }
|
||||
padded_string get_corpus(const std::string& filename);
|
||||
}
|
||||
padded_string get_corpus(const std::string &filename);
|
||||
} // namespace simdjson
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
#ifndef SIMDJSON_JSONMINIFIER_H
|
||||
#define SIMDJSON_JSONMINIFIER_H
|
||||
|
||||
#include "simdjson/padded_string.h"
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <string_view>
|
||||
#include "simdjson/padded_string.h"
|
||||
|
||||
namespace simdjson {
|
||||
|
||||
|
@ -12,20 +12,19 @@ namespace simdjson {
|
|||
// out can be the same pointer. Result is null terminated,
|
||||
// return the string length (minus the null termination).
|
||||
// The accelerated version of this function only runs on AVX2 hardware.
|
||||
size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out);
|
||||
size_t json_minify(const uint8_t *buf, size_t len, uint8_t *out);
|
||||
|
||||
|
||||
static inline size_t jsonminify(const char *buf, size_t len, char *out) {
|
||||
return jsonminify(reinterpret_cast<const uint8_t *>(buf), len, reinterpret_cast<uint8_t *>(out));
|
||||
static inline size_t json_minify(const char *buf, size_t len, char *out) {
|
||||
return json_minify(reinterpret_cast<const uint8_t *>(buf), len,
|
||||
reinterpret_cast<uint8_t *>(out));
|
||||
}
|
||||
|
||||
|
||||
static inline size_t jsonminify(const std::string_view & p, char *out) {
|
||||
return jsonminify(p.data(), p.size(), out);
|
||||
static inline size_t json_minify(const std::string_view &p, char *out) {
|
||||
return json_minify(p.data(), p.size(), out);
|
||||
}
|
||||
|
||||
static inline size_t jsonminify(const padded_string & p, char *out) {
|
||||
return jsonminify(p.data(), p.size(), out);
|
||||
}
|
||||
static inline size_t json_minify(const padded_string &p, char *out) {
|
||||
return json_minify(p.data(), p.size(), out);
|
||||
}
|
||||
} // namespace simdjson
|
||||
#endif
|
||||
|
|
|
@ -1,136 +1,161 @@
|
|||
#ifndef SIMDJSON_JSONPARSER_H
|
||||
#define SIMDJSON_JSONPARSER_H
|
||||
#include <string>
|
||||
#include "simdjson/common_defs.h"
|
||||
#include "simdjson/padded_string.h"
|
||||
#include "simdjson/jsonioutil.h"
|
||||
#include "simdjson/padded_string.h"
|
||||
#include "simdjson/parsedjson.h"
|
||||
#include "simdjson/simdjson.h"
|
||||
#include "simdjson/stage1_find_marks.h"
|
||||
#include "simdjson/stage2_build_tape.h"
|
||||
#include "simdjson/simdjson.h"
|
||||
#include <string>
|
||||
#ifdef _MSC_VER
|
||||
#include <windows.h>
|
||||
#include <sysinfoapi.h>
|
||||
#include <sysinfoapi.h> // must be included after windows.h
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
namespace simdjson {
|
||||
// The function that users are expected to call is json_parse.
|
||||
// We have more than one such function because we want to support several
|
||||
// We have more than one such function because we want to support several
|
||||
// instruction sets.
|
||||
|
||||
// function pointer type for json_parse
|
||||
using json_parse_functype = int (const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded);
|
||||
using json_parse_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj,
|
||||
bool realloc_if_needed);
|
||||
|
||||
// Pointer that holds the json_parse implementation corresponding to the available SIMD instruction set
|
||||
// Pointer that holds the json_parse implementation corresponding to the
|
||||
// available SIMD instruction set
|
||||
extern json_parse_functype *json_parse_ptr;
|
||||
|
||||
// json_parse_implementation is the generic function, it is specialized for various
|
||||
// architectures, e.g., as json_parse_implementation<architecture::haswell>
|
||||
// or json_parse_implementation<architecture::arm64>
|
||||
template<architecture T>
|
||||
int json_parse_implementation(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded = true) {
|
||||
if (pj.bytecapacity < len) {
|
||||
// json_parse_implementation is the generic function, it is specialized for
|
||||
// various architectures, e.g., as
|
||||
// json_parse_implementation<Architecture::HASWELL> or
|
||||
// json_parse_implementation<Architecture::ARM64>
|
||||
template <Architecture T>
|
||||
int json_parse_implementation(const uint8_t *buf, size_t len, ParsedJson &pj,
|
||||
bool realloc_if_needed = true) {
|
||||
if (pj.byte_capacity < len) {
|
||||
return simdjson::CAPACITY;
|
||||
}
|
||||
bool reallocated = false;
|
||||
if(reallocifneeded) {
|
||||
if (realloc_if_needed) {
|
||||
#if ALLOW_SAME_PAGE_BUFFER_OVERRUN
|
||||
// realloc is needed if the end of the memory crosses a page
|
||||
#ifdef _MSC_VER
|
||||
SYSTEM_INFO sysInfo;
|
||||
GetSystemInfo(&sysInfo);
|
||||
long pagesize = sysInfo.dwPageSize;
|
||||
SYSTEM_INFO sysInfo;
|
||||
GetSystemInfo(&sysInfo);
|
||||
long page_size = sysInfo.dwPageSize;
|
||||
#else
|
||||
long pagesize = sysconf (_SC_PAGESIZE);
|
||||
long page_size = sysconf(_SC_PAGESIZE);
|
||||
#endif
|
||||
//////////////
|
||||
// We want to check that buf + len - 1 and buf + len - 1 + SIMDJSON_PADDING
|
||||
// are in the same page.
|
||||
// That is, we want to check that
|
||||
// (buf + len - 1) / pagesize == (buf + len - 1 + SIMDJSON_PADDING) / pagesize
|
||||
// That's true if (buf + len - 1) % pagesize + SIMDJSON_PADDING < pagesize.
|
||||
// That is, we want to check that
|
||||
// (buf + len - 1) / page_size == (buf + len - 1 + SIMDJSON_PADDING) /
|
||||
// page_size. That's true if (buf + len - 1) % page_size + SIMDJSON_PADDING <
|
||||
// page_size.
|
||||
///////////
|
||||
if ( (reinterpret_cast<uintptr_t>(buf + len - 1) % pagesize ) + SIMDJSON_PADDING < static_cast<uintptr_t>(pagesize) ) {
|
||||
if ((reinterpret_cast<uintptr_t>(buf + len - 1) % page_size) +
|
||||
SIMDJSON_PADDING <
|
||||
static_cast<uintptr_t>(page_size)) {
|
||||
#else // SIMDJSON_SAFE_SAME_PAGE_READ_OVERRUN
|
||||
if(true) { // if not SIMDJSON_SAFE_SAME_PAGE_READ_OVERRUN, we always reallocate
|
||||
if (true) { // if not SIMDJSON_SAFE_SAME_PAGE_READ_OVERRUN, we always
|
||||
// reallocate
|
||||
#endif
|
||||
const uint8_t *tmpbuf = buf;
|
||||
buf = (uint8_t *) allocate_padded_buffer(len);
|
||||
if(buf == NULL) return simdjson::MEMALLOC;
|
||||
memcpy((void*)buf,tmpbuf,len);
|
||||
const uint8_t *tmp_buf = buf;
|
||||
buf = (uint8_t *)allocate_padded_buffer(len);
|
||||
if (buf == NULL)
|
||||
return simdjson::MEMALLOC;
|
||||
memcpy((void *)buf, tmp_buf, len);
|
||||
reallocated = true;
|
||||
} // if (true) OR if ( (reinterpret_cast<uintptr_t>(buf + len - 1) % pagesize ) + SIMDJSON_PADDING < static_cast<uintptr_t>(pagesize) ) {
|
||||
} // if(reallocifneeded) {
|
||||
} // if (true) OR if ( (reinterpret_cast<uintptr_t>(buf + len - 1) %
|
||||
// page_size ) + SIMDJSON_PADDING < static_cast<uintptr_t>(page_size) ) {
|
||||
} // if(realloc_if_needed) {
|
||||
int stage1_is_ok = simdjson::find_structural_bits<T>(buf, len, pj);
|
||||
if(stage1_is_ok != simdjson::SUCCESS) {
|
||||
pj.errorcode = stage1_is_ok;
|
||||
return pj.errorcode;
|
||||
}
|
||||
if (stage1_is_ok != simdjson::SUCCESS) {
|
||||
pj.error_code = stage1_is_ok;
|
||||
return pj.error_code;
|
||||
}
|
||||
int res = unified_machine<T>(buf, len, pj);
|
||||
if(reallocated) { aligned_free((void*)buf);}
|
||||
if (reallocated) {
|
||||
aligned_free((void *)buf);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
// Parse a document found in buf.
|
||||
//
|
||||
// The content should be a valid JSON document encoded as UTF-8. If there is a UTF-8 BOM, the caller
|
||||
// is responsible for omitting it, UTF-8 BOM are discouraged.
|
||||
// Parse a document found in buf.
|
||||
//
|
||||
// You need to preallocate ParsedJson with a capacity of len (e.g., pj.allocateCapacity(len)).
|
||||
// The content should be a valid JSON document encoded as UTF-8. If there is a
|
||||
// UTF-8 BOM, the caller is responsible for omitting it; UTF-8 BOMs are
|
||||
// discouraged.
|
||||
//
|
||||
// The function returns simdjson::SUCCESS (an integer = 0) in case of a success or an error code from
|
||||
// simdjson/simdjson.h in case of failure such as simdjson::CAPACITY, simdjson::MEMALLOC,
|
||||
// simdjson::DEPTH_ERROR and so forth; the simdjson::errorMsg function converts these error codes
|
||||
// into a string).
|
||||
// You need to preallocate ParsedJson with a capacity of len (e.g.,
|
||||
// pj.allocate_capacity(len)).
|
||||
//
|
||||
// You can also check validity by calling pj.isValid(). The same ParsedJson can be reused for other documents.
|
||||
// The function returns simdjson::SUCCESS (an integer = 0) in case of a success
|
||||
// or an error code from simdjson/simdjson.h in case of failure such as
|
||||
// simdjson::CAPACITY, simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth;
|
||||
// the simdjson::error_message function converts these error codes into a
|
||||
// string).
|
||||
//
|
||||
// If reallocifneeded is true (default) then a temporary buffer is created when needed during processing
|
||||
// (a copy of the input string is made).
|
||||
// The input buf should be readable up to buf + len + SIMDJSON_PADDING if reallocifneeded is false,
|
||||
// all bytes at and after buf + len are ignored (can be garbage).
|
||||
// The ParsedJson object can be reused.
|
||||
// You can also check validity by calling pj.is_valid(). The same ParsedJson can
|
||||
// be reused for other documents.
|
||||
//
|
||||
// If realloc_if_needed is true (default) then a temporary buffer is created
|
||||
// when needed during processing (a copy of the input string is made). The input
|
||||
// buf should be readable up to buf + len + SIMDJSON_PADDING if
|
||||
// realloc_if_needed is false, all bytes at and after buf + len are ignored
|
||||
// (can be garbage). The ParsedJson object can be reused.
|
||||
|
||||
inline int json_parse(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded = true) {
|
||||
return json_parse_ptr(buf, len, pj, reallocifneeded);
|
||||
inline int json_parse(const uint8_t *buf, size_t len, ParsedJson &pj,
|
||||
bool realloc_if_needed = true) {
|
||||
return json_parse_ptr(buf, len, pj, realloc_if_needed);
|
||||
}
|
||||
|
||||
// Parse a document found in buf.
|
||||
//
|
||||
// The content should be a valid JSON document encoded as UTF-8. If there is a UTF-8 BOM, the caller
|
||||
// is responsible for omitting it, UTF-8 BOM are discouraged.
|
||||
//
|
||||
// You need to preallocate ParsedJson with a capacity of len (e.g., pj.allocateCapacity(len)).
|
||||
// The content should be a valid JSON document encoded as UTF-8. If there is a
|
||||
// UTF-8 BOM, the caller is responsible for omitting it; UTF-8 BOMs are
|
||||
// discouraged.
|
||||
//
|
||||
// The function returns simdjson::SUCCESS (an integer = 0) in case of a success or an error code from
|
||||
// simdjson/simdjson.h in case of failure such as simdjson::CAPACITY, simdjson::MEMALLOC,
|
||||
// simdjson::DEPTH_ERROR and so forth; the simdjson::errorMsg function converts these error codes
|
||||
// into a string).
|
||||
// You need to preallocate ParsedJson with a capacity of len (e.g.,
|
||||
// pj.allocate_capacity(len)).
|
||||
//
|
||||
// The function returns simdjson::SUCCESS (an integer = 0) in case of a success
|
||||
// or an error code from simdjson/simdjson.h in case of failure such as
|
||||
// simdjson::CAPACITY, simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth;
|
||||
// the simdjson::error_message function converts these error codes into a
|
||||
// string).
|
||||
//
|
||||
// You can also check validity
|
||||
// by calling pj.isValid(). The same ParsedJson can be reused for other documents.
|
||||
// by calling pj.is_valid(). The same ParsedJson can be reused for other
|
||||
// documents.
|
||||
//
|
||||
// If reallocifneeded is true (default) then a temporary buffer is created when needed during processing
|
||||
// (a copy of the input string is made).
|
||||
// The input buf should be readable up to buf + len + SIMDJSON_PADDING if reallocifneeded is false,
|
||||
// all bytes at and after buf + len are ignored (can be garbage).
|
||||
// The ParsedJson object can be reused.
|
||||
inline int json_parse(const char * buf, size_t len, ParsedJson &pj, bool reallocifneeded = true) {
|
||||
return json_parse_ptr(reinterpret_cast<const uint8_t *>(buf), len, pj, reallocifneeded);
|
||||
// If realloc_if_needed is true (default) then a temporary buffer is created
|
||||
// when needed during processing (a copy of the input string is made). The input
|
||||
// buf should be readable up to buf + len + SIMDJSON_PADDING if
|
||||
// realloc_if_needed is false, all bytes at and after buf + len are ignored
|
||||
// (can be garbage). The ParsedJson object can be reused.
|
||||
inline int json_parse(const char *buf, size_t len, ParsedJson &pj,
|
||||
bool realloc_if_needed = true) {
|
||||
return json_parse_ptr(reinterpret_cast<const uint8_t *>(buf), len, pj,
|
||||
realloc_if_needed);
|
||||
}
|
||||
|
||||
// We do not want to allow implicit conversion from C string to std::string.
|
||||
int json_parse(const char * buf, ParsedJson &pj) = delete;
|
||||
int json_parse(const char *buf, ParsedJson &pj) = delete;
|
||||
|
||||
// Parse a document found in string s.
|
||||
// You need to preallocate ParsedJson with a capacity of len (e.g., pj.allocateCapacity(len)).
|
||||
// You need to preallocate ParsedJson with a capacity of len (e.g.,
|
||||
// pj.allocate_capacity(len)).
|
||||
//
|
||||
// The function returns simdjson::SUCCESS (an integer = 0) in case of a success or an error code from
|
||||
// simdjson/simdjson.h in case of failure such as simdjson::CAPACITY, simdjson::MEMALLOC,
|
||||
// simdjson::DEPTH_ERROR and so forth; the simdjson::errorMsg function converts these error codes
|
||||
// into a string).
|
||||
// The function returns simdjson::SUCCESS (an integer = 0) in case of a success
|
||||
// or an error code from simdjson/simdjson.h in case of failure such as
|
||||
// simdjson::CAPACITY, simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth;
|
||||
// the simdjson::error_message function converts these error codes into a
|
||||
// string.
|
||||
//
|
||||
// A temporary buffer is created when needed during processing
|
||||
// (a copy of the input string is made).
|
||||
|
@ -139,72 +164,82 @@ inline int json_parse(const std::string &s, ParsedJson &pj) {
|
|||
}
|
||||
|
||||
// Parse a document found in string s.
|
||||
//
|
||||
// The content should be a valid JSON document encoded as UTF-8. If there is a UTF-8 BOM, the caller
|
||||
// is responsible for omitting it, UTF-8 BOM are discouraged.
|
||||
//
|
||||
// You need to preallocate ParsedJson with a capacity of len (e.g., pj.allocateCapacity(len)).
|
||||
// The content should be a valid JSON document encoded as UTF-8. If there is a
|
||||
// UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are
|
||||
// discouraged.
|
||||
//
|
||||
// The function returns simdjson::SUCCESS (an integer = 0) in case of a success or an error code from
|
||||
// simdjson/simdjson.h in case of failure such as simdjson::CAPACITY, simdjson::MEMALLOC,
|
||||
// simdjson::DEPTH_ERROR and so forth; the simdjson::errorMsg function converts these error codes
|
||||
// into a string).
|
||||
// You need to preallocate ParsedJson with a capacity of len (e.g.,
|
||||
// pj.allocate_capacity(len)).
|
||||
//
|
||||
// The function returns simdjson::SUCCESS (an integer = 0) in case of a success
|
||||
// or an error code from simdjson/simdjson.h in case of failure such as
|
||||
// simdjson::CAPACITY, simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth;
|
||||
// the simdjson::error_message function converts these error codes into a
|
||||
// string.
|
||||
//
|
||||
// You can also check validity
|
||||
// by calling pj.isValid(). The same ParsedJson can be reused for other documents.
|
||||
// by calling pj.is_valid(). The same ParsedJson can be reused for other
|
||||
// documents.
|
||||
inline int json_parse(const padded_string &s, ParsedJson &pj) {
|
||||
return json_parse(s.data(), s.length(), pj, false);
|
||||
}
|
||||
|
||||
|
||||
// Build a ParsedJson object. You can check validity
|
||||
// by calling pj.isValid(). This does the memory allocation needed for ParsedJson.
|
||||
// If reallocifneeded is true (default) then a temporary buffer is created when needed during processing
|
||||
// (a copy of the input string is made).
|
||||
// by calling pj.is_valid(). This does the memory allocation needed for
|
||||
// ParsedJson. If realloc_if_needed is true (default) then a temporary buffer is
|
||||
// created when needed during processing (a copy of the input string is made).
|
||||
//
|
||||
// The input buf should be readable up to buf + len + SIMDJSON_PADDING if reallocifneeded is false,
|
||||
// all bytes at and after buf + len are ignored (can be garbage).
|
||||
//
|
||||
// The content should be a valid JSON document encoded as UTF-8. If there is a UTF-8 BOM, the caller
|
||||
// is responsible for omitting it, UTF-8 BOM are discouraged.
|
||||
// The input buf should be readable up to buf + len + SIMDJSON_PADDING if
|
||||
// realloc_if_needed is false, all bytes at and after buf + len are ignored
|
||||
// (can be garbage).
|
||||
//
|
||||
// The content should be a valid JSON document encoded as UTF-8. If there is a
|
||||
// UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are
|
||||
// discouraged.
|
||||
//
|
||||
// This is a convenience function which calls json_parse.
|
||||
WARN_UNUSED
|
||||
ParsedJson build_parsed_json(const uint8_t *buf, size_t len, bool reallocifneeded = true);
|
||||
ParsedJson build_parsed_json(const uint8_t *buf, size_t len,
|
||||
bool realloc_if_needed = true);
|
||||
|
||||
WARN_UNUSED
|
||||
// Build a ParsedJson object. You can check validity
|
||||
// by calling pj.isValid(). This does the memory allocation needed for ParsedJson.
|
||||
// If reallocifneeded is true (default) then a temporary buffer is created when needed during processing
|
||||
// (a copy of the input string is made).
|
||||
// by calling pj.is_valid(). This does the memory allocation needed for
|
||||
// ParsedJson. If realloc_if_needed is true (default) then a temporary buffer is
|
||||
// created when needed during processing (a copy of the input string is made).
|
||||
//
|
||||
// The input buf should be readable up to buf + len + SIMDJSON_PADDING if reallocifneeded is false,
|
||||
// all bytes at and after buf + len are ignored (can be garbage).
|
||||
// The input buf should be readable up to buf + len + SIMDJSON_PADDING if
|
||||
// realloc_if_needed is false, all bytes at and after buf + len are ignored
|
||||
// (can be garbage).
|
||||
//
|
||||
//
|
||||
// The content should be a valid JSON document encoded as UTF-8. If there is a UTF-8 BOM, the caller
|
||||
// is responsible for omitting it, UTF-8 BOM are discouraged.
|
||||
//
|
||||
// The content should be a valid JSON document encoded as UTF-8. If there is a
|
||||
// UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are
|
||||
// discouraged.
|
||||
//
|
||||
// This is a convenience function which calls json_parse.
|
||||
inline ParsedJson build_parsed_json(const char * buf, size_t len, bool reallocifneeded = true) {
|
||||
return build_parsed_json(reinterpret_cast<const uint8_t *>(buf), len, reallocifneeded);
|
||||
inline ParsedJson build_parsed_json(const char *buf, size_t len,
|
||||
bool realloc_if_needed = true) {
|
||||
return build_parsed_json(reinterpret_cast<const uint8_t *>(buf), len,
|
||||
realloc_if_needed);
|
||||
}
|
||||
|
||||
|
||||
// We do not want to allow implicit conversion from C string to std::string.
|
||||
ParsedJson build_parsed_json(const char *buf) = delete;
|
||||
|
||||
|
||||
// Parse a document found in string s.
|
||||
// You need to preallocate ParsedJson with a capacity of len (e.g., pj.allocateCapacity(len)).
|
||||
// Return SUCCESS (an integer = 0) in case of a success. You can also check validity
|
||||
// by calling pj.isValid(). The same ParsedJson can be reused for other documents.
|
||||
// You need to preallocate ParsedJson with a capacity of len (e.g.,
|
||||
// pj.allocate_capacity(len)). Return SUCCESS (an integer = 0) in case of a
|
||||
// success. You can also check validity by calling pj.is_valid(). The same
|
||||
// ParsedJson can be reused for other documents.
|
||||
//
|
||||
// A temporary buffer is created when needed during processing
|
||||
// (a copy of the input string is made).
|
||||
//
|
||||
// The content should be a valid JSON document encoded as UTF-8. If there is a UTF-8 BOM, the caller
|
||||
// is responsible for omitting it, UTF-8 BOM are discouraged.
|
||||
//
|
||||
// The content should be a valid JSON document encoded as UTF-8. If there is a
|
||||
// UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are
|
||||
// discouraged.
|
||||
//
|
||||
// This is a convenience function which calls json_parse.
|
||||
WARN_UNUSED
|
||||
|
@ -212,19 +247,20 @@ inline ParsedJson build_parsed_json(const std::string &s) {
|
|||
return build_parsed_json(s.data(), s.length(), true);
|
||||
}
|
||||
|
||||
|
||||
// Parse a document found in string s.
|
||||
// You need to preallocate ParsedJson with a capacity of len (e.g., pj.allocateCapacity(len)).
|
||||
// Return SUCCESS (an integer = 0) in case of a success. You can also check validity
|
||||
// by calling pj.isValid(). The same ParsedJson can be reused for other documents.
|
||||
//
|
||||
// The content should be a valid JSON document encoded as UTF-8. If there is a UTF-8 BOM, the caller
|
||||
// is responsible for omitting it, UTF-8 BOM are discouraged.
|
||||
// You need to preallocate ParsedJson with a capacity of len (e.g.,
|
||||
// pj.allocate_capacity(len)). Return SUCCESS (an integer = 0) in case of a
|
||||
// success. You can also check validity by calling pj.is_valid(). The same
|
||||
// ParsedJson can be reused for other documents.
|
||||
//
|
||||
// The content should be a valid JSON document encoded as UTF-8. If there is a
|
||||
// UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are
|
||||
// discouraged.
|
||||
//
|
||||
// This is a convenience function which calls json_parse.
|
||||
WARN_UNUSED
|
||||
inline ParsedJson build_parsed_json(const padded_string &s) {
|
||||
return build_parsed_json(s.data(), s.length(), false);
|
||||
}
|
||||
}
|
||||
} // namespace simdjson
|
||||
#endif
|
||||
|
|
|
@ -7,16 +7,17 @@
|
|||
#include "simdjson/portability.h"
|
||||
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
void foundInvalidNumber(const uint8_t *buf);
|
||||
void foundInteger(int64_t result, const uint8_t *buf);
|
||||
void foundFloat(double result, const uint8_t *buf);
|
||||
void found_invalid_number(const uint8_t *buf);
|
||||
void found_integer(int64_t result, const uint8_t *buf);
|
||||
void found_float(double result, const uint8_t *buf);
|
||||
#endif
|
||||
|
||||
namespace simdjson {
|
||||
// Allowable floating-point values range from std::numeric_limits<double>::lowest()
|
||||
// to std::numeric_limits<double>::max(), so from
|
||||
// -1.7976e308 all the way to 1.7975e308 in binary64. The lowest non-zero
|
||||
// normal values is std::numeric_limits<double>::min() or about 2.225074e-308.
|
||||
// Allowable floating-point values range from
|
||||
// std::numeric_limits<double>::lowest() to std::numeric_limits<double>::max(),
|
||||
// so from -1.7976e308 all the way to 1.7976e308 in binary64. The lowest
|
||||
// non-zero normal value is std::numeric_limits<double>::min() or
|
||||
// about 2.225074e-308.
|
||||
static const double power_of_ten[] = {
|
||||
1e-308, 1e-307, 1e-306, 1e-305, 1e-304, 1e-303, 1e-302, 1e-301, 1e-300,
|
||||
1e-299, 1e-298, 1e-297, 1e-296, 1e-295, 1e-294, 1e-293, 1e-292, 1e-291,
|
||||
|
@ -113,7 +114,7 @@ really_inline bool
|
|||
is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) {
|
||||
return structural_or_whitespace_or_exponent_or_decimal_negated[c];
|
||||
}
|
||||
}// simdjson
|
||||
} // namespace simdjson
|
||||
#ifndef SIMDJSON_DISABLE_SWAR_NUMBER_PARSING
|
||||
#define SWAR_NUMBER_PARSING
|
||||
#endif
|
||||
|
@ -126,7 +127,7 @@ namespace simdjson {
|
|||
// http://0x80.pl/articles/swar-digits-validate.html
|
||||
static inline bool is_made_of_eight_digits_fast(const char *chars) {
|
||||
uint64_t val;
|
||||
// this can read up to 7 bytes beyond the buffer size, but we require
|
||||
// this can read up to 7 bytes beyond the buffer size, but we require
|
||||
// SIMDJSON_PADDING of padding
|
||||
static_assert(7 <= SIMDJSON_PADDING);
|
||||
memcpy(&val, chars, 8);
|
||||
|
@ -138,7 +139,7 @@ static inline bool is_made_of_eight_digits_fast(const char *chars) {
|
|||
(((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) ==
|
||||
0x3333333333333333);
|
||||
}
|
||||
}
|
||||
} // namespace simdjson
|
||||
#ifdef IS_X86_64
|
||||
TARGET_WESTMERE
|
||||
namespace simdjson {
|
||||
|
@ -150,7 +151,8 @@ static inline uint32_t parse_eight_digits_unrolled(const char *chars) {
|
|||
const __m128i mul_1_100 = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1);
|
||||
const __m128i mul_1_10000 =
|
||||
_mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1);
|
||||
const __m128i input = _mm_sub_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i *>(chars)), ascii0);
|
||||
const __m128i input = _mm_sub_epi8(
|
||||
_mm_loadu_si128(reinterpret_cast<const __m128i *>(chars)), ascii0);
|
||||
const __m128i t1 = _mm_maddubs_epi16(input, mul_1_10);
|
||||
const __m128i t2 = _mm_madd_epi16(t1, mul_1_100);
|
||||
const __m128i t3 = _mm_packus_epi32(t2, t2);
|
||||
|
@ -158,7 +160,7 @@ static inline uint32_t parse_eight_digits_unrolled(const char *chars) {
|
|||
return _mm_cvtsi128_si32(
|
||||
t4); // only captures the sum of the first 8 digits, drop the rest
|
||||
}
|
||||
}
|
||||
} // namespace simdjson
|
||||
UNTARGET_REGION
|
||||
#endif
|
||||
|
||||
|
@ -167,15 +169,14 @@ namespace simdjson {
|
|||
// we don't have SSE, so let us use a scalar function
|
||||
// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
|
||||
static inline uint32_t parse_eight_digits_unrolled(const char *chars) {
|
||||
uint64_t val;
|
||||
memcpy(&val, chars, sizeof(uint64_t));
|
||||
val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8;
|
||||
val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16;
|
||||
return (val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32;
|
||||
uint64_t val;
|
||||
memcpy(&val, chars, sizeof(uint64_t));
|
||||
val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8;
|
||||
val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16;
|
||||
return (val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
|
@ -183,10 +184,9 @@ static inline uint32_t parse_eight_digits_unrolled(const char *chars) {
|
|||
// It is only ever going to be used when negative_exponent is tiny.
|
||||
static double subnormal_power10(double base, int negative_exponent) {
|
||||
// this is probably not going to be fast
|
||||
return base * 1e-308 * pow(10, negative_exponent + 308);
|
||||
return base * 1e-308 * pow(10, negative_exponent + 308);
|
||||
}
|
||||
|
||||
|
||||
// called by parse_number when we know that the output is a float,
|
||||
// but where there might be some integer overflow. The trick here is to
|
||||
// parse using floats from the start.
|
||||
|
@ -197,10 +197,8 @@ static double subnormal_power10(double base, int negative_exponent) {
|
|||
//
|
||||
// Note: a redesign could avoid this function entirely.
|
||||
//
|
||||
static never_inline bool
|
||||
parse_float(const uint8_t *const buf,
|
||||
ParsedJson &pj, const uint32_t offset,
|
||||
bool found_minus) {
|
||||
static never_inline bool parse_float(const uint8_t *const buf, ParsedJson &pj,
|
||||
const uint32_t offset, bool found_minus) {
|
||||
const char *p = reinterpret_cast<const char *>(buf + offset);
|
||||
bool negative = false;
|
||||
if (found_minus) {
|
||||
|
@ -223,100 +221,102 @@ parse_float(const uint8_t *const buf,
|
|||
}
|
||||
if ('.' == *p) {
|
||||
++p;
|
||||
int fractionalweight = 308;
|
||||
if(is_integer(*p)) {
|
||||
int fractional_weight = 308;
|
||||
if (is_integer(*p)) {
|
||||
unsigned char digit = *p - '0';
|
||||
++p;
|
||||
|
||||
fractionalweight --;
|
||||
i = i + digit * (fractionalweight >= 0 ? power_of_ten[fractionalweight] : 0);
|
||||
fractional_weight--;
|
||||
i = i + digit * (fractional_weight >= 0 ? power_of_ten[fractional_weight]
|
||||
: 0);
|
||||
} else {
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
foundInvalidNumber(buf + offset);
|
||||
found_invalid_number(buf + offset);
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
while (is_integer(*p)) {
|
||||
unsigned char digit = *p - '0';
|
||||
++p;
|
||||
fractionalweight --;
|
||||
i = i + digit * (fractionalweight >= 0 ? power_of_ten[fractionalweight] : 0);
|
||||
fractional_weight--;
|
||||
i = i + digit * (fractional_weight >= 0 ? power_of_ten[fractional_weight]
|
||||
: 0);
|
||||
}
|
||||
}
|
||||
if (('e' == *p) || ('E' == *p)) {
|
||||
++p;
|
||||
bool negexp = false;
|
||||
bool neg_exp = false;
|
||||
if ('-' == *p) {
|
||||
negexp = true;
|
||||
neg_exp = true;
|
||||
++p;
|
||||
} else if ('+' == *p) {
|
||||
++p;
|
||||
}
|
||||
if (!is_integer(*p)) {
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
foundInvalidNumber(buf + offset);
|
||||
found_invalid_number(buf + offset);
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
unsigned char digit = *p - '0';
|
||||
int64_t expnumber = digit; // exponential part
|
||||
int64_t exp_number = digit; // exponential part
|
||||
p++;
|
||||
if (is_integer(*p)) {
|
||||
digit = *p - '0';
|
||||
expnumber = 10 * expnumber + digit;
|
||||
exp_number = 10 * exp_number + digit;
|
||||
++p;
|
||||
}
|
||||
if (is_integer(*p)) {
|
||||
digit = *p - '0';
|
||||
expnumber = 10 * expnumber + digit;
|
||||
exp_number = 10 * exp_number + digit;
|
||||
++p;
|
||||
}
|
||||
if (is_integer(*p)) {
|
||||
digit = *p - '0';
|
||||
expnumber = 10 * expnumber + digit;
|
||||
exp_number = 10 * exp_number + digit;
|
||||
++p;
|
||||
}
|
||||
while (is_integer(*p)) {
|
||||
if(expnumber > 0x100000000) {// we need to check for overflows
|
||||
if (exp_number > 0x100000000) { // we need to check for overflows
|
||||
// we refuse to parse this
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
foundInvalidNumber(buf + offset);
|
||||
found_invalid_number(buf + offset);
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
digit = *p - '0';
|
||||
expnumber = 10 * expnumber + digit;
|
||||
++p;
|
||||
exp_number = 10 * exp_number + digit;
|
||||
++p;
|
||||
}
|
||||
if (unlikely(expnumber > 308)) {
|
||||
if (unlikely(exp_number > 308)) {
|
||||
// this path is unlikely
|
||||
if(negexp) {
|
||||
// We either have zero or a subnormal.
|
||||
if (neg_exp) {
|
||||
// We either have zero or a subnormal.
|
||||
// We expect this to be uncommon so we go through a slow path.
|
||||
i = subnormal_power10(i, - expnumber);
|
||||
i = subnormal_power10(i, -exp_number);
|
||||
} else {
|
||||
// We know for sure that we have a number that is too large,
|
||||
// we refuse to parse this
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
foundInvalidNumber(buf + offset);
|
||||
found_invalid_number(buf + offset);
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
int exponent = (negexp ? -expnumber : expnumber);
|
||||
// we have that expnumber is [0,308] so that
|
||||
// exponent is [-308,308] so that
|
||||
int exponent = (neg_exp ? -exp_number : exp_number);
|
||||
// we have that exp_number is [0,308] so that
|
||||
// exponent is [-308,308] so that
|
||||
// 308 + exponent is in [0, 2 * 308]
|
||||
i *= power_of_ten[308 + exponent];
|
||||
}
|
||||
}
|
||||
}
|
||||
if(is_not_structural_or_whitespace(*p)) {
|
||||
if (is_not_structural_or_whitespace(*p)) {
|
||||
return false;
|
||||
}
|
||||
double d = negative ? -i : i;
|
||||
pj.write_tape_double(d);
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
foundFloat(d, buf + offset);
|
||||
found_float(d, buf + offset);
|
||||
#endif
|
||||
return is_structural_or_whitespace(*p);
|
||||
}
|
||||
|
@ -354,13 +354,13 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
|
|||
digit = *p - '0';
|
||||
if (mul_overflow(i, 10, &i)) {
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
foundInvalidNumber(buf + offset);
|
||||
found_invalid_number(buf + offset);
|
||||
#endif
|
||||
return false; // overflow
|
||||
}
|
||||
if (add_overflow(i, digit, &i)) {
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
foundInvalidNumber(buf + offset);
|
||||
found_invalid_number(buf + offset);
|
||||
#endif
|
||||
return false; // overflow
|
||||
}
|
||||
|
@ -371,7 +371,7 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
|
|||
if (i > 0x8000000000000000) {
|
||||
// overflows!
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
foundInvalidNumber(buf + offset);
|
||||
found_invalid_number(buf + offset);
|
||||
#endif
|
||||
return false; // overflow
|
||||
}
|
||||
|
@ -379,15 +379,16 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
|
|||
if (i >= 0x8000000000000000) {
|
||||
// overflows!
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
foundInvalidNumber(buf + offset);
|
||||
found_invalid_number(buf + offset);
|
||||
#endif
|
||||
return false; // overflow
|
||||
}
|
||||
}
|
||||
int64_t signed_answer = negative ? -static_cast<int64_t>(i) : static_cast<int64_t>(i);
|
||||
int64_t signed_answer =
|
||||
negative ? -static_cast<int64_t>(i) : static_cast<int64_t>(i);
|
||||
pj.write_tape_s64(signed_answer);
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
foundInteger(signed_answer, buf + offset);
|
||||
found_integer(signed_answer, buf + offset);
|
||||
#endif
|
||||
return is_structural_or_whitespace(*p);
|
||||
}
|
||||
|
@ -396,18 +397,18 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
|
|||
// define JSON_TEST_NUMBERS for unit testing
|
||||
//
|
||||
// It is assumed that the number is followed by a structural ({,},],[) character
|
||||
// or a white space character. If that is not the case (e.g., when the JSON document
|
||||
// is made of a single number), then it is necessary to copy the content and append
|
||||
// a space before calling this function.
|
||||
// or a white space character. If that is not the case (e.g., when the JSON
|
||||
// document is made of a single number), then it is necessary to copy the
|
||||
// content and append a space before calling this function.
|
||||
//
|
||||
// Our objective is accurate parsing (ULP of 0 or 1) at high speed.
|
||||
static really_inline bool parse_number(const uint8_t *const buf,
|
||||
ParsedJson &pj,
|
||||
static really_inline bool parse_number(const uint8_t *const buf, ParsedJson &pj,
|
||||
const uint32_t offset,
|
||||
bool found_minus) {
|
||||
#ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes useful to skip parsing
|
||||
pj.write_tape_s64(0); // always write zero
|
||||
return true; // always succeeds
|
||||
#ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes
|
||||
// useful to skip parsing
|
||||
pj.write_tape_s64(0); // always write zero
|
||||
return true; // always succeeds
|
||||
#else
|
||||
const char *p = reinterpret_cast<const char *>(buf + offset);
|
||||
bool negative = false;
|
||||
|
@ -415,28 +416,28 @@ static really_inline bool parse_number(const uint8_t *const buf,
|
|||
++p;
|
||||
negative = true;
|
||||
if (!is_integer(*p)) { // a negative sign must be followed by an integer
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
foundInvalidNumber(buf + offset);
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_invalid_number(buf + offset);
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
}
|
||||
const char *const startdigits = p;
|
||||
const char *const start_digits = p;
|
||||
|
||||
uint64_t i; // an unsigned int avoids signed overflows (which are bad)
|
||||
uint64_t i; // an unsigned int avoids signed overflows (which are bad)
|
||||
if (*p == '0') { // 0 cannot be followed by an integer
|
||||
++p;
|
||||
if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) {
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
foundInvalidNumber(buf + offset);
|
||||
found_invalid_number(buf + offset);
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
i = 0;
|
||||
} else {
|
||||
if (!(is_integer(*p))) { // must start with an integer
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
foundInvalidNumber(buf + offset);
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_invalid_number(buf + offset);
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
@ -447,7 +448,8 @@ static really_inline bool parse_number(const uint8_t *const buf,
|
|||
// we rarely see large integer parts like 123456789
|
||||
while (is_integer(*p)) {
|
||||
digit = *p - '0';
|
||||
// a multiplication by 10 is cheaper than an arbitrary integer multiplication
|
||||
// a multiplication by 10 is cheaper than an arbitrary integer
|
||||
// multiplication
|
||||
i = 10 * i + digit; // might overflow, we will handle the overflow later
|
||||
++p;
|
||||
}
|
||||
|
@ -461,17 +463,18 @@ static really_inline bool parse_number(const uint8_t *const buf,
|
|||
// z that fits in 53 bits, then we will be able to convert back the
|
||||
// integer into a float in a lossless manner.
|
||||
++p;
|
||||
const char *const firstafterperiod = p;
|
||||
if(is_integer(*p)) {
|
||||
const char *const first_after_period = p;
|
||||
if (is_integer(*p)) {
|
||||
unsigned char digit = *p - '0';
|
||||
++p;
|
||||
i = i * 10 + digit; // might overflow + multiplication by 10 is likely cheaper than arbitrary mult.
|
||||
i = i * 10 + digit; // might overflow + multiplication by 10 is likely
|
||||
// cheaper than arbitrary mult.
|
||||
// we will handle the overflow later
|
||||
} else {
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
foundInvalidNumber(buf + offset);
|
||||
found_invalid_number(buf + offset);
|
||||
#endif
|
||||
return false;
|
||||
return false;
|
||||
}
|
||||
#ifdef SWAR_NUMBER_PARSING
|
||||
// this helps if we have lots of decimals!
|
||||
|
@ -484,102 +487,100 @@ static really_inline bool parse_number(const uint8_t *const buf,
|
|||
while (is_integer(*p)) {
|
||||
unsigned char digit = *p - '0';
|
||||
++p;
|
||||
i = i * 10 + digit; // in rare cases, this will overflow, but that's ok because we have parse_highprecision_float later.
|
||||
i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
|
||||
// because we have parse_float later.
|
||||
}
|
||||
exponent = firstafterperiod - p;
|
||||
exponent = first_after_period - p;
|
||||
}
|
||||
int digitcount = p - startdigits - 1; // used later to guard against overflows
|
||||
int64_t expnumber = 0; // exponential part
|
||||
int digit_count =
|
||||
p - start_digits - 1; // used later to guard against overflows
|
||||
int64_t exp_number = 0; // exponential part
|
||||
if (('e' == *p) || ('E' == *p)) {
|
||||
is_float = true;
|
||||
++p;
|
||||
bool negexp = false;
|
||||
bool neg_exp = false;
|
||||
if ('-' == *p) {
|
||||
negexp = true;
|
||||
neg_exp = true;
|
||||
++p;
|
||||
} else if ('+' == *p) {
|
||||
++p;
|
||||
}
|
||||
if (!is_integer(*p)) {
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
foundInvalidNumber(buf + offset);
|
||||
found_invalid_number(buf + offset);
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
unsigned char digit = *p - '0';
|
||||
expnumber = digit;
|
||||
exp_number = digit;
|
||||
p++;
|
||||
if (is_integer(*p)) {
|
||||
digit = *p - '0';
|
||||
expnumber = 10 * expnumber + digit;
|
||||
exp_number = 10 * exp_number + digit;
|
||||
++p;
|
||||
}
|
||||
if (is_integer(*p)) {
|
||||
digit = *p - '0';
|
||||
expnumber = 10 * expnumber + digit;
|
||||
exp_number = 10 * exp_number + digit;
|
||||
++p;
|
||||
}
|
||||
while (is_integer(*p)) {
|
||||
if(expnumber > 0x100000000) {// we need to check for overflows
|
||||
// we refuse to parse this
|
||||
if (exp_number > 0x100000000) { // we need to check for overflows
|
||||
// we refuse to parse this
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
foundInvalidNumber(buf + offset);
|
||||
found_invalid_number(buf + offset);
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
digit = *p - '0';
|
||||
expnumber = 10 * expnumber + digit;
|
||||
++p;
|
||||
exp_number = 10 * exp_number + digit;
|
||||
++p;
|
||||
}
|
||||
exponent += (negexp ? -expnumber : expnumber);
|
||||
exponent += (neg_exp ? -exp_number : exp_number);
|
||||
}
|
||||
if (is_float) {
|
||||
uint64_t powerindex = 308 + exponent;
|
||||
if (unlikely((digitcount >= 19))) { // this is uncommon
|
||||
// It is possible that the integer had an overflow.
|
||||
uint64_t power_index = 308 + exponent;
|
||||
if (unlikely((digit_count >= 19))) { // this is uncommon
|
||||
// It is possible that the integer had an overflow.
|
||||
// We have to handle the case where we have 0.0000somenumber.
|
||||
const char * start = startdigits;
|
||||
while((*start == '0') || (*start == '.')) {
|
||||
start++;
|
||||
const char *start = start_digits;
|
||||
while ((*start == '0') || (*start == '.')) {
|
||||
start++;
|
||||
}
|
||||
digitcount -= (start - startdigits);
|
||||
if(digitcount >= 19) {
|
||||
digit_count -= (start - start_digits);
|
||||
if (digit_count >= 19) {
|
||||
// Ok, chances are good that we had an overflow!
|
||||
// this is almost never going to get called!!!
|
||||
// we start anew, going slowly!!!
|
||||
return parse_float(buf, pj, offset,
|
||||
found_minus);
|
||||
|
||||
}
|
||||
return parse_float(buf, pj, offset, found_minus);
|
||||
}
|
||||
}
|
||||
if (unlikely((powerindex > 2 * 308))) { // this is uncommon!!!
|
||||
if (unlikely((power_index > 2 * 308))) { // this is uncommon!!!
|
||||
// this is almost never going to get called!!!
|
||||
// we start anew, going slowly!!!
|
||||
return parse_float(buf, pj, offset,
|
||||
found_minus);
|
||||
return parse_float(buf, pj, offset, found_minus);
|
||||
}
|
||||
double factor = power_of_ten[powerindex];
|
||||
double factor = power_of_ten[power_index];
|
||||
factor = negative ? -factor : factor;
|
||||
double d = i * factor;
|
||||
pj.write_tape_double(d);
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
foundFloat(d, buf + offset);
|
||||
found_float(d, buf + offset);
|
||||
#endif
|
||||
} else {
|
||||
if (unlikely(digitcount >= 18)) { // this is uncommon!!!
|
||||
if (unlikely(digit_count >= 18)) { // this is uncommon!!!
|
||||
// there is a good chance that we had an overflow, so we need
|
||||
// to recover: we parse the whole thing again.
|
||||
return parse_large_integer(buf, pj, offset,
|
||||
found_minus);
|
||||
return parse_large_integer(buf, pj, offset, found_minus);
|
||||
}
|
||||
i = negative ? 0-i : i;
|
||||
i = negative ? 0 - i : i;
|
||||
pj.write_tape_s64(i);
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
foundInteger(i, buf + offset);
|
||||
found_integer(i, buf + offset);
|
||||
#endif
|
||||
}
|
||||
return is_structural_or_whitespace(*p);
|
||||
return is_structural_or_whitespace(*p);
|
||||
#endif // SIMDJSON_SKIPNUMBERPARSING
|
||||
}
|
||||
}//simdjson
|
||||
} // simdjson
|
||||
#endif
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
#ifndef SIMDJSON_PADDING_STRING_H
|
||||
#define SIMDJSON_PADDING_STRING_H
|
||||
#include "simdjson/portability.h"
|
||||
#include <memory>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
|
||||
namespace simdjson {
|
||||
// low-level function to allocate memory with padding so we can read past the
|
||||
|
@ -65,6 +65,6 @@ private:
|
|||
size_t viable_size;
|
||||
char *data_ptr;
|
||||
};
|
||||
}
|
||||
} // namespace simdjson
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,48 +1,49 @@
|
|||
#ifndef SIMDJSON_PARSEDJSON_H
|
||||
#define SIMDJSON_PARSEDJSON_H
|
||||
|
||||
#include "simdjson/common_defs.h"
|
||||
#include "simdjson/jsonformatutils.h"
|
||||
#include "simdjson/portability.h"
|
||||
#include "simdjson/simdjson.h"
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include "simdjson/simdjson.h"
|
||||
#include "simdjson/common_defs.h"
|
||||
#include "simdjson/jsonformatutils.h"
|
||||
#include "simdjson/portability.h"
|
||||
|
||||
#define JSONVALUEMASK 0xFFFFFFFFFFFFFF
|
||||
#define JSON_VALUE_MASK 0xFFFFFFFFFFFFFF
|
||||
|
||||
#define DEFAULTMAXDEPTH 1024// a JSON document with a depth exceeding 1024 is probably de facto invalid
|
||||
#define DEFAULT_MAX_DEPTH \
|
||||
1024 // a JSON document with a depth exceeding 1024 is probably de facto
|
||||
// invalid
|
||||
|
||||
namespace simdjson {
|
||||
/************
|
||||
* The JSON is parsed to a tape, see the accompanying tape.md file
|
||||
* for documentation.
|
||||
***********/
|
||||
struct ParsedJson {
|
||||
class ParsedJson {
|
||||
public:
|
||||
|
||||
// create a ParsedJson container with zero capacity, call allocateCapacity to
|
||||
// create a ParsedJson container with zero capacity, call allocate_capacity to
|
||||
// allocate memory
|
||||
ParsedJson();
|
||||
~ParsedJson();
|
||||
ParsedJson(ParsedJson && p);
|
||||
ParsedJson(ParsedJson &&p);
|
||||
|
||||
// if needed, allocate memory so that the object is able to process JSON
|
||||
// documents having up to len bytes and maxdepth "depth"
|
||||
// documents having up to len bytes and max_depth "depth"
|
||||
WARN_UNUSED
|
||||
bool allocateCapacity(size_t len, size_t maxdepth = DEFAULTMAXDEPTH);
|
||||
bool allocate_capacity(size_t len, size_t max_depth = DEFAULT_MAX_DEPTH);
|
||||
|
||||
// returns true if the document parsed was valid
|
||||
bool isValid() const;
|
||||
bool is_valid() const;
|
||||
|
||||
// return an error code corresponding to the last parsing attempt, see simdjson.h
|
||||
// will return simdjson::UNITIALIZED if no parsing was attempted
|
||||
int getErrorCode() const;
|
||||
// return an error code corresponding to the last parsing attempt, see
|
||||
// simdjson.h. Returns simdjson::UNITIALIZED if no parsing was attempted.
|
||||
int get_error_code() const;
|
||||
|
||||
// return the string equivalent of "getErrorCode"
|
||||
std::string getErrorMsg() const;
|
||||
// return the string equivalent of "get_error_code"
|
||||
std::string get_error_message() const;
|
||||
|
||||
// deallocate memory and set capacity to zero, called automatically by the
|
||||
// destructor
|
||||
|
@ -55,11 +56,10 @@ public:
|
|||
// return false if the tape is likely wrong (e.g., you did not parse a valid
|
||||
// JSON).
|
||||
WARN_UNUSED
|
||||
bool printjson(std::ostream &os);
|
||||
bool print_json(std::ostream &os);
|
||||
WARN_UNUSED
|
||||
bool dump_raw_tape(std::ostream &os);
|
||||
|
||||
|
||||
// all nodes are stored on the tape using a 64-bit word.
|
||||
//
|
||||
// strings, double and ints are stored as
|
||||
|
@ -76,43 +76,42 @@ public:
|
|||
|
||||
// this should be considered a private function
|
||||
really_inline void write_tape(uint64_t val, uint8_t c) {
|
||||
tape[current_loc++] = val | ((static_cast<uint64_t>(c)) << 56);
|
||||
tape[current_loc++] = val | ((static_cast<uint64_t>(c)) << 56);
|
||||
}
|
||||
|
||||
really_inline void write_tape_s64(int64_t i) {
|
||||
write_tape(0, 'l');
|
||||
tape[current_loc++] = *(reinterpret_cast<uint64_t *>(&i));
|
||||
write_tape(0, 'l');
|
||||
tape[current_loc++] = *(reinterpret_cast<uint64_t *>(&i));
|
||||
}
|
||||
|
||||
really_inline void write_tape_double(double d) {
|
||||
write_tape(0, 'd');
|
||||
static_assert(sizeof(d) == sizeof(tape[current_loc]), "mismatch size");
|
||||
memcpy(& tape[current_loc++], &d, sizeof(double));
|
||||
//tape[current_loc++] = *((uint64_t *)&d);
|
||||
memcpy(&tape[current_loc++], &d, sizeof(double));
|
||||
// tape[current_loc++] = *((uint64_t *)&d);
|
||||
}
|
||||
|
||||
really_inline uint32_t get_current_loc() { return current_loc; }
|
||||
|
||||
really_inline void annotate_previousloc(uint32_t saved_loc, uint64_t val) {
|
||||
tape[saved_loc] |= val;
|
||||
really_inline void annotate_previous_loc(uint32_t saved_loc, uint64_t val) {
|
||||
tape[saved_loc] |= val;
|
||||
}
|
||||
|
||||
struct InvalidJSON : public std::exception {
|
||||
const char * what () const throw () {
|
||||
return "JSON document is invalid";
|
||||
}
|
||||
class InvalidJSON : public std::exception {
|
||||
const char *what() const throw() { return "JSON document is invalid"; }
|
||||
};
|
||||
|
||||
struct iterator {
|
||||
class Iterator {
|
||||
// might throw InvalidJSON if ParsedJson is invalid
|
||||
explicit iterator(ParsedJson &pj_);
|
||||
~iterator();
|
||||
public:
|
||||
explicit Iterator(ParsedJson &pj_);
|
||||
~Iterator();
|
||||
|
||||
iterator(const iterator &o);
|
||||
Iterator(const Iterator &o);
|
||||
|
||||
iterator(iterator &&o);
|
||||
Iterator(Iterator &&o);
|
||||
|
||||
inline bool isOk() const;
|
||||
inline bool is_ok() const;
|
||||
|
||||
// useful for debugging purposes
|
||||
inline size_t get_tape_location() const;
|
||||
|
@ -120,11 +119,12 @@ public:
|
|||
// useful for debugging purposes
|
||||
inline size_t get_tape_length() const;
|
||||
|
||||
// returns the current depth (start at 1 with 0 reserved for the fictitious root node)
|
||||
// returns the current depth (start at 1 with 0 reserved for the fictitious
|
||||
// root node)
|
||||
inline size_t get_depth() const;
|
||||
|
||||
// A scope is a series of nodes at the same depth, typically it is either an object ({) or an array ([).
|
||||
// The root node has type 'r'.
|
||||
// A scope is a series of nodes at the same depth, typically it is either an
|
||||
// object ({) or an array ([). The root node has type 'r'.
|
||||
inline uint8_t get_scope_type() const;
|
||||
|
||||
// move forward in document order
|
||||
|
@ -132,81 +132,65 @@ public:
|
|||
|
||||
// retrieve the character code of what we're looking at:
|
||||
// [{"sltfn are the possibilities
|
||||
inline uint8_t get_type() const {
|
||||
return current_type; // short functions should be inlined!
|
||||
inline uint8_t get_type() const {
|
||||
return current_type; // short functions should be inlined!
|
||||
}
|
||||
|
||||
// get the int64_t value at this node; valid only if we're at "l"
|
||||
inline int64_t get_integer() const {
|
||||
if(location + 1 >= tape_length) {
|
||||
return 0;// default value in case of error
|
||||
}
|
||||
return static_cast<int64_t>(pj.tape[location + 1]);
|
||||
inline int64_t get_integer() const {
|
||||
if (location + 1 >= tape_length) {
|
||||
return 0; // default value in case of error
|
||||
}
|
||||
return static_cast<int64_t>(pj.tape[location + 1]);
|
||||
}
|
||||
|
||||
// get the string value at this node (NULL ended); valid only if we're at "
|
||||
// note that tabs, and line endings are escaped in the returned value (see print_with_escapes)
|
||||
// return value is valid UTF-8
|
||||
// It may contain NULL chars within the string: get_string_length determines the true
|
||||
// string length.
|
||||
inline const char * get_string() const {
|
||||
return reinterpret_cast<const char *>(pj.string_buf + (current_val & JSONVALUEMASK) + sizeof(uint32_t)) ;
|
||||
// note that tabs, and line endings are escaped in the returned value (see
|
||||
// print_with_escapes). The return value is valid UTF-8. It may contain NULL chars
|
||||
// within the string: get_string_length determines the true string length.
|
||||
inline const char *get_string() const {
|
||||
return reinterpret_cast<const char *>(
|
||||
pj.string_buf + (current_val & JSON_VALUE_MASK) + sizeof(uint32_t));
|
||||
}
|
||||
|
||||
// return the length of the string in bytes
|
||||
inline uint32_t get_string_length() const {
|
||||
uint32_t answer;
|
||||
memcpy(&answer, reinterpret_cast<const char *>(pj.string_buf + (current_val & JSONVALUEMASK)), sizeof(uint32_t));
|
||||
memcpy(&answer,
|
||||
reinterpret_cast<const char *>(pj.string_buf +
|
||||
(current_val & JSON_VALUE_MASK)),
|
||||
sizeof(uint32_t));
|
||||
return answer;
|
||||
}
|
||||
|
||||
// get the double value at this node; valid only if
|
||||
// we're at "d"
|
||||
inline double get_double() const {
|
||||
if(location + 1 >= tape_length) {
|
||||
return NAN;// default value in case of error
|
||||
inline double get_double() const {
|
||||
if (location + 1 >= tape_length) {
|
||||
return NAN; // default value in case of error
|
||||
}
|
||||
double answer;
|
||||
memcpy(&answer, & pj.tape[location + 1], sizeof(answer));
|
||||
memcpy(&answer, &pj.tape[location + 1], sizeof(answer));
|
||||
return answer;
|
||||
}
|
||||
|
||||
inline bool is_object_or_array() const { return is_object() || is_array(); }
|
||||
|
||||
inline bool is_object_or_array() const {
|
||||
return is_object() || is_array();
|
||||
}
|
||||
inline bool is_object() const { return get_type() == '{'; }
|
||||
|
||||
inline bool is_object() const {
|
||||
return get_type() == '{';
|
||||
}
|
||||
inline bool is_array() const { return get_type() == '['; }
|
||||
|
||||
inline bool is_array() const {
|
||||
return get_type() == '[';
|
||||
}
|
||||
inline bool is_string() const { return get_type() == '"'; }
|
||||
|
||||
inline bool is_string() const {
|
||||
return get_type() == '"';
|
||||
}
|
||||
inline bool is_integer() const { return get_type() == 'l'; }
|
||||
|
||||
inline bool is_integer() const {
|
||||
return get_type() == 'l';
|
||||
}
|
||||
inline bool is_double() const { return get_type() == 'd'; }
|
||||
|
||||
inline bool is_double() const {
|
||||
return get_type() == 'd';
|
||||
}
|
||||
inline bool is_true() const { return get_type() == 't'; }
|
||||
|
||||
inline bool is_true() const {
|
||||
return get_type() == 't';
|
||||
}
|
||||
inline bool is_false() const { return get_type() == 'f'; }
|
||||
|
||||
inline bool is_false() const {
|
||||
return get_type() == 'f';
|
||||
}
|
||||
|
||||
inline bool is_null() const {
|
||||
return get_type() == 'n';
|
||||
}
|
||||
inline bool is_null() const { return get_type() == 'n'; }
|
||||
|
||||
static bool is_object_or_array(uint8_t type) {
|
||||
return ((type == '[') || (type == '{'));
|
||||
|
@ -219,16 +203,17 @@ public:
|
|||
// We seek the key using C's strcmp so if your JSON strings contain
|
||||
// NULL chars, this would trigger a false positive: if you expect that
|
||||
// to be the case, take extra precautions.
|
||||
inline bool move_to_key(const char * key);
|
||||
inline bool move_to_key(const char *key);
|
||||
// when at {, go one level deep, looking for a given key
|
||||
// if successful, we are left pointing at the value,
|
||||
// if not, we are still pointing at the object ({)
|
||||
// (in case of repeated keys, this only finds the first one).
|
||||
// The string we search for can contain NULL values.
|
||||
inline bool move_to_key(const char * key, uint32_t length);
|
||||
|
||||
// when at a key location within an object, this moves to the accompanying value (located next to it).
|
||||
// this is equivalent but much faster than calling "next()".
|
||||
inline bool move_to_key(const char *key, uint32_t length);
|
||||
|
||||
// when at a key location within an object, this moves to the accompanying
|
||||
// value (located next to it). this is equivalent but much faster than
|
||||
// calling "next()".
|
||||
inline void move_to_value();
|
||||
|
||||
// when at [, go one level deep, and advance to the given index.
|
||||
|
@ -239,54 +224,55 @@ public:
|
|||
// Moves the iterator to the value corresponding to the json pointer.
|
||||
// Always search from the root of the document.
|
||||
// if successful, we are left pointing at the value,
|
||||
// if not, we are still pointing the same value we were pointing before the call.
|
||||
// The json pointer follows the rfc6901 standard's syntax: https://tools.ietf.org/html/rfc6901
|
||||
// However, the standard says "If a referenced member name is not unique in an object,
|
||||
// the member that is referenced is undefined, and evaluation fails".
|
||||
// Here we just return the first corresponding value.
|
||||
// The length parameter is the length of the jsonpointer string ('pointer').
|
||||
bool move_to(const char * pointer, uint32_t length);
|
||||
// if not, we are still pointing the same value we were pointing before the
|
||||
// call. The json pointer follows the rfc6901 standard's syntax:
|
||||
// https://tools.ietf.org/html/rfc6901 However, the standard says "If a
|
||||
// referenced member name is not unique in an object, the member that is
|
||||
// referenced is undefined, and evaluation fails". Here we just return the
|
||||
// first corresponding value. The length parameter is the length of the
|
||||
// jsonpointer string ('pointer').
|
||||
bool move_to(const char *pointer, uint32_t length);
|
||||
|
||||
// Moves the iterator to the value corresponding to the json pointer.
|
||||
// Always search from the root of the document.
|
||||
// if successful, we are left pointing at the value,
|
||||
// if not, we are still pointing the same value we were pointing before the call.
|
||||
// The json pointer implementation follows the rfc6901 standard's syntax: https://tools.ietf.org/html/rfc6901
|
||||
// However, the standard says "If a referenced member name is not unique in an object,
|
||||
// the member that is referenced is undefined, and evaluation fails".
|
||||
// Here we just return the first corresponding value.
|
||||
inline bool move_to(const std::string & pointer) {
|
||||
// if not, we are still pointing the same value we were pointing before the
|
||||
// call. The json pointer implementation follows the rfc6901 standard's
|
||||
// syntax: https://tools.ietf.org/html/rfc6901 However, the standard says
|
||||
// "If a referenced member name is not unique in an object, the member that
|
||||
// is referenced is undefined, and evaluation fails". Here we just return
|
||||
// the first corresponding value.
|
||||
inline bool move_to(const std::string &pointer) {
|
||||
return move_to(pointer.c_str(), pointer.length());
|
||||
}
|
||||
|
||||
|
||||
|
||||
private:
|
||||
// Almost the same as move_to(), except it searches from the current
|
||||
// position. The pointer's syntax is identical, though that case is not
|
||||
// handled by the rfc6901 standard. The '/' is still required at the
|
||||
// beginning. However, contrary to move_to(), the URI Fragment Identifier
|
||||
// Representation is not supported here. Also, in case of failure, we are
|
||||
// left pointing at the closest value it could reach. For these reasons it
|
||||
// is private. It exists because it is used by move_to().
|
||||
bool relative_move_to(const char *pointer, uint32_t length);
|
||||
|
||||
// Almost the same as move_to(), except it searchs from the current position.
|
||||
// The pointer's syntax is identical, though that case is not handled by the rfc6901 standard.
|
||||
// The '/' is still required at the beginning.
|
||||
// However, contrary to move_to(), the URI Fragment Identifier Representation is not supported here.
|
||||
// Also, in case of failure, we are left pointing at the closest value it could reach.
|
||||
// For these reasons it is private. It exists because it is used by move_to().
|
||||
bool relative_move_to(const char * pointer, uint32_t length);
|
||||
public:
|
||||
|
||||
// throughout return true if we can do the navigation, false
|
||||
// otherwise
|
||||
|
||||
// Within a given scope (series of nodes at the same depth within either an
|
||||
// array or an object), we move forward.
|
||||
// Thus, given [true, null, {"a":1}, [1,2]], we would visit true, null, { and [.
|
||||
// At the object ({) or at the array ([), you can issue a "down" to visit their content.
|
||||
// valid if we're not at the end of a scope (returns true).
|
||||
// Thus, given [true, null, {"a":1}, [1,2]], we would visit true, null, {
|
||||
// and [. At the object ({) or at the array ([), you can issue a "down" to
|
||||
// visit their content. valid if we're not at the end of a scope (returns
|
||||
// true).
|
||||
inline bool next();
|
||||
|
||||
// Within a given scope (series of nodes at the same depth within either an
|
||||
// array or an object), we move backward.
|
||||
// Thus, given [true, null, {"a":1}, [1,2]], we would visit ], }, null, true when starting at the end
|
||||
// of the scope.
|
||||
// At the object ({) or at the array ([), you can issue a "down" to visit their content.
|
||||
// Thus, given [true, null, {"a":1}, [1,2]], we would visit ], }, null, true
|
||||
// when starting at the end of the scope. At the object ({) or at the array
|
||||
// ([), you can issue a "down" to visit their content.
|
||||
inline bool prev();
|
||||
|
||||
// Moves back to either the containing array or object (type { or [) from
|
||||
|
@ -294,11 +280,9 @@ public:
|
|||
// Valid unless we are at the first level of the document
|
||||
inline bool up();
|
||||
|
||||
|
||||
// Valid if we're at a [ or { and it starts a non-empty scope; moves us to start of
|
||||
// that deeper scope if it not empty.
|
||||
// Thus, given [true, null, {"a":1}, [1,2]], if we are at the { node, we would move to the
|
||||
// "a" node.
|
||||
// Valid if we're at a [ or { and it starts a non-empty scope; moves us to
|
||||
// start of that deeper scope if it is not empty. Thus, given [true, null,
|
||||
// {"a":1}, [1,2]], if we are at the { node, we would move to the "a" node.
|
||||
inline bool down();
|
||||
|
||||
// move us to the start of our current scope,
|
||||
|
@ -306,7 +290,8 @@ public:
|
|||
inline void to_start_scope();
|
||||
|
||||
inline void rewind() {
|
||||
while(up());
|
||||
while (up())
|
||||
;
|
||||
}
|
||||
|
||||
// void to_end_scope(); // move us to
|
||||
|
@ -314,26 +299,28 @@ public:
|
|||
|
||||
// print the thing we're currently pointing at
|
||||
bool print(std::ostream &os, bool escape_strings = true) const;
|
||||
typedef struct {size_t start_of_scope; uint8_t scope_type;} scopeindex_t;
|
||||
typedef struct {
|
||||
size_t start_of_scope;
|
||||
uint8_t scope_type;
|
||||
} scopeindex_t;
|
||||
|
||||
private:
|
||||
|
||||
iterator& operator=(const iterator& other) = delete ;
|
||||
private:
|
||||
Iterator &operator=(const Iterator &other) = delete;
|
||||
|
||||
ParsedJson &pj;
|
||||
size_t depth;
|
||||
size_t location; // our current location on a tape
|
||||
size_t location; // our current location on a tape
|
||||
size_t tape_length;
|
||||
uint8_t current_type;
|
||||
uint64_t current_val;
|
||||
scopeindex_t *depthindex;
|
||||
scopeindex_t *depth_index;
|
||||
};
|
||||
|
||||
size_t bytecapacity{0}; // indicates how many bits are meant to be supported
|
||||
size_t byte_capacity{0}; // indicates how many bits are meant to be supported
|
||||
|
||||
size_t depthcapacity{0}; // how deep we can go
|
||||
size_t tapecapacity{0};
|
||||
size_t stringcapacity{0};
|
||||
size_t depth_capacity{0}; // how deep we can go
|
||||
size_t tape_capacity{0};
|
||||
size_t string_capacity{0};
|
||||
uint32_t current_loc{0};
|
||||
uint32_t n_structural_indexes{0};
|
||||
|
||||
|
@ -343,24 +330,23 @@ private:
|
|||
uint32_t *containing_scope_offset;
|
||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||
void **ret_address;
|
||||
#else
|
||||
#else
|
||||
char *ret_address;
|
||||
#endif
|
||||
|
||||
uint8_t *string_buf; // should be at least bytecapacity
|
||||
uint8_t *string_buf; // should be at least byte_capacity
|
||||
uint8_t *current_string_buf_loc;
|
||||
bool isvalid{false};
|
||||
int errorcode{simdjson::UNITIALIZED};
|
||||
bool valid{false};
|
||||
int error_code{simdjson::UNITIALIZED};
|
||||
|
||||
private :
|
||||
|
||||
// we don't want the default constructor to be called
|
||||
ParsedJson(const ParsedJson & p) = delete; // we don't want the default constructor to be called
|
||||
// we don't want the assignment to be called
|
||||
ParsedJson & operator=(const ParsedJson&o) = delete;
|
||||
private:
|
||||
// we don't want the copy constructor to be called
|
||||
ParsedJson(const ParsedJson &p) =
|
||||
delete; // we don't want the default constructor to be called
|
||||
// we don't want the assignment to be called
|
||||
ParsedJson &operator=(const ParsedJson &o) = delete;
|
||||
};
|
||||
|
||||
|
||||
// dump bits low to high
|
||||
inline void dumpbits_always(uint64_t v, const std::string &msg) {
|
||||
for (uint32_t i = 0; i < 64; i++) {
|
||||
|
@ -377,188 +363,180 @@ inline void dumpbits32_always(uint32_t v, const std::string &msg) {
|
|||
}
|
||||
|
||||
WARN_UNUSED
|
||||
bool ParsedJson::iterator::isOk() const {
|
||||
return location < tape_length;
|
||||
}
|
||||
bool ParsedJson::Iterator::is_ok() const { return location < tape_length; }
|
||||
|
||||
// useful for debugging purposes
|
||||
size_t ParsedJson::iterator::get_tape_location() const {
|
||||
return location;
|
||||
}
|
||||
size_t ParsedJson::Iterator::get_tape_location() const { return location; }
|
||||
|
||||
// useful for debugging purposes
|
||||
size_t ParsedJson::iterator::get_tape_length() const {
|
||||
return tape_length;
|
||||
size_t ParsedJson::Iterator::get_tape_length() const { return tape_length; }
|
||||
|
||||
// returns the current depth (start at 1 with 0 reserved for the fictitious root
|
||||
// node)
|
||||
size_t ParsedJson::Iterator::get_depth() const { return depth; }
|
||||
|
||||
// A scope is a series of nodes at the same depth, typically it is either an
|
||||
// object ({) or an array ([). The root node has type 'r'.
|
||||
uint8_t ParsedJson::Iterator::get_scope_type() const {
|
||||
return depth_index[depth].scope_type;
|
||||
}
|
||||
|
||||
// returns the current depth (start at 1 with 0 reserved for the fictitious root node)
|
||||
size_t ParsedJson::iterator::get_depth() const {
|
||||
return depth;
|
||||
}
|
||||
|
||||
// A scope is a series of nodes at the same depth, typically it is either an object ({) or an array ([).
|
||||
// The root node has type 'r'.
|
||||
uint8_t ParsedJson::iterator::get_scope_type() const {
|
||||
return depthindex[depth].scope_type;
|
||||
}
|
||||
|
||||
bool ParsedJson::iterator::move_forward() {
|
||||
if(location + 1 >= tape_length) {
|
||||
return false; // we are at the end!
|
||||
}
|
||||
|
||||
if ((current_type == '[') || (current_type == '{')){
|
||||
// We are entering a new scope
|
||||
depth++;
|
||||
depthindex[depth].start_of_scope = location;
|
||||
depthindex[depth].scope_type = current_type;
|
||||
} else if ((current_type == ']') || (current_type == '}')) {
|
||||
// Leaving a scope.
|
||||
depth--;
|
||||
} else if ((current_type == 'd') || (current_type == 'l')) {
|
||||
// d and l types use 2 locations on the tape, not just one.
|
||||
location += 1;
|
||||
}
|
||||
bool ParsedJson::Iterator::move_forward() {
|
||||
if (location + 1 >= tape_length) {
|
||||
return false; // we are at the end!
|
||||
}
|
||||
|
||||
if ((current_type == '[') || (current_type == '{')) {
|
||||
// We are entering a new scope
|
||||
depth++;
|
||||
depth_index[depth].start_of_scope = location;
|
||||
depth_index[depth].scope_type = current_type;
|
||||
} else if ((current_type == ']') || (current_type == '}')) {
|
||||
// Leaving a scope.
|
||||
depth--;
|
||||
} else if ((current_type == 'd') || (current_type == 'l')) {
|
||||
// d and l types use 2 locations on the tape, not just one.
|
||||
location += 1;
|
||||
current_val = pj.tape[location];
|
||||
current_type = (current_val >> 56);
|
||||
return true;
|
||||
}
|
||||
|
||||
location += 1;
|
||||
current_val = pj.tape[location];
|
||||
current_type = (current_val >> 56);
|
||||
return true;
|
||||
}
|
||||
|
||||
void ParsedJson::iterator::move_to_value() {
|
||||
// assume that we are on a key, so move by 1.
|
||||
location += 1;
|
||||
current_val = pj.tape[location];
|
||||
current_type = (current_val >> 56);
|
||||
void ParsedJson::Iterator::move_to_value() {
|
||||
// assume that we are on a key, so move by 1.
|
||||
location += 1;
|
||||
current_val = pj.tape[location];
|
||||
current_type = (current_val >> 56);
|
||||
}
|
||||
|
||||
|
||||
bool ParsedJson::iterator::move_to_key(const char * key) {
|
||||
if(down()) {
|
||||
do {
|
||||
assert(is_string());
|
||||
bool rightkey = (strcmp(get_string(),key)==0);// null chars would fool this
|
||||
move_to_value();
|
||||
if(rightkey) {
|
||||
return true;
|
||||
}
|
||||
} while(next());
|
||||
assert(up());// not found
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool ParsedJson::iterator::move_to_key(const char * key, uint32_t length) {
|
||||
if(down()) {
|
||||
do {
|
||||
assert(is_string());
|
||||
bool rightkey = ((get_string_length() == length) && (memcmp(get_string(),key,length)==0));
|
||||
move_to_value();
|
||||
if(rightkey) {
|
||||
return true;
|
||||
}
|
||||
} while(next());
|
||||
assert(up());// not found
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool ParsedJson::iterator::move_to_index(uint32_t index) {
|
||||
assert(is_array());
|
||||
if (down()) {
|
||||
uint32_t i = 0;
|
||||
for (; i < index; i++) {
|
||||
if (!next()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i == index) {
|
||||
bool ParsedJson::Iterator::move_to_key(const char *key) {
|
||||
if (down()) {
|
||||
do {
|
||||
assert(is_string());
|
||||
bool right_key =
|
||||
(strcmp(get_string(), key) == 0); // null chars would fool this
|
||||
move_to_value();
|
||||
if (right_key) {
|
||||
return true;
|
||||
}
|
||||
assert(up());
|
||||
}
|
||||
return false;
|
||||
} while (next());
|
||||
assert(up()); // not found
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool ParsedJson::iterator::prev() {
|
||||
if(location - 1 < depthindex[depth].start_of_scope) {
|
||||
return false;
|
||||
}
|
||||
location -= 1;
|
||||
current_val = pj.tape[location];
|
||||
current_type = (current_val >> 56);
|
||||
if ((current_type == ']') || (current_type == '}')){
|
||||
// we need to jump
|
||||
size_t new_location = ( current_val & JSONVALUEMASK);
|
||||
if(new_location < depthindex[depth].start_of_scope) {
|
||||
return false; // shoud never happen
|
||||
bool ParsedJson::Iterator::move_to_key(const char *key, uint32_t length) {
|
||||
if (down()) {
|
||||
do {
|
||||
assert(is_string());
|
||||
bool right_key = ((get_string_length() == length) &&
|
||||
(memcmp(get_string(), key, length) == 0));
|
||||
move_to_value();
|
||||
if (right_key) {
|
||||
return true;
|
||||
}
|
||||
location = new_location;
|
||||
current_val = pj.tape[location];
|
||||
current_type = (current_val >> 56);
|
||||
}
|
||||
return true;
|
||||
} while (next());
|
||||
assert(up()); // not found
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
bool ParsedJson::iterator::up() {
|
||||
if(depth == 1) {
|
||||
return false; // don't allow moving back to root
|
||||
}
|
||||
to_start_scope();
|
||||
// next we just move to the previous value
|
||||
depth--;
|
||||
location -= 1;
|
||||
current_val = pj.tape[location];
|
||||
current_type = (current_val >> 56);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool ParsedJson::iterator::down() {
|
||||
if(location + 1 >= tape_length) {
|
||||
return false;
|
||||
}
|
||||
if ((current_type == '[') || (current_type == '{')) {
|
||||
size_t npos = (current_val & JSONVALUEMASK);
|
||||
if(npos == location + 2) {
|
||||
return false; // we have an empty scope
|
||||
bool ParsedJson::Iterator::move_to_index(uint32_t index) {
|
||||
assert(is_array());
|
||||
if (down()) {
|
||||
uint32_t i = 0;
|
||||
for (; i < index; i++) {
|
||||
if (!next()) {
|
||||
break;
|
||||
}
|
||||
depth++;
|
||||
location = location + 1;
|
||||
depthindex[depth].start_of_scope = location;
|
||||
depthindex[depth].scope_type = current_type;
|
||||
current_val = pj.tape[location];
|
||||
current_type = (current_val >> 56);
|
||||
}
|
||||
if (i == index) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
assert(up());
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void ParsedJson::iterator::to_start_scope() {
|
||||
location = depthindex[depth].start_of_scope;
|
||||
bool ParsedJson::Iterator::prev() {
|
||||
if (location - 1 < depth_index[depth].start_of_scope) {
|
||||
return false;
|
||||
}
|
||||
location -= 1;
|
||||
current_val = pj.tape[location];
|
||||
current_type = (current_val >> 56);
|
||||
if ((current_type == ']') || (current_type == '}')) {
|
||||
// we need to jump
|
||||
size_t new_location = (current_val & JSON_VALUE_MASK);
|
||||
if (new_location < depth_index[depth].start_of_scope) {
|
||||
return false; // shoud never happen
|
||||
}
|
||||
location = new_location;
|
||||
current_val = pj.tape[location];
|
||||
current_type = (current_val >> 56);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ParsedJson::iterator::next() {
|
||||
size_t npos;
|
||||
if ((current_type == '[') || (current_type == '{')){
|
||||
// we need to jump
|
||||
npos = ( current_val & JSONVALUEMASK);
|
||||
} else {
|
||||
npos = location + ((current_type == 'd' || current_type == 'l') ? 2 : 1);
|
||||
// Leaves the current scope: rewinds to the start of the scope, then steps one
// tape word further back and decrements the depth. Refuses (returns false)
// when the depth is 1, so the iterator never moves back to the root.
bool ParsedJson::Iterator::up() {
  const bool at_first_level = (depth == 1);
  if (at_first_level) {
    return false; // don't allow moving back to root
  }

  to_start_scope();

  // the word just before the start of the scope is where we end up
  --depth;
  --location;
  const uint64_t word = pj.tape[location];
  current_val = word;
  current_type = static_cast<uint8_t>(word >> 56);
  return true;
}
|
||||
|
||||
bool ParsedJson::Iterator::down() {
|
||||
if (location + 1 >= tape_length) {
|
||||
return false;
|
||||
}
|
||||
if ((current_type == '[') || (current_type == '{')) {
|
||||
size_t npos = (current_val & JSON_VALUE_MASK);
|
||||
if (npos == location + 2) {
|
||||
return false; // we have an empty scope
|
||||
}
|
||||
uint64_t nextval = pj.tape[npos];
|
||||
uint8_t nexttype = (nextval >> 56);
|
||||
if((nexttype == ']') || (nexttype == '}')) {
|
||||
return false; // we reached the end of the scope
|
||||
}
|
||||
location = npos;
|
||||
current_val = nextval;
|
||||
current_type = nexttype;
|
||||
depth++;
|
||||
location = location + 1;
|
||||
depth_index[depth].start_of_scope = location;
|
||||
depth_index[depth].scope_type = current_type;
|
||||
current_val = pj.tape[location];
|
||||
current_type = (current_val >> 56);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Repositions the iterator on the recorded start of the current scope and
// reloads the cached value/type from the tape word found there.
void ParsedJson::Iterator::to_start_scope() {
  const size_t scope_start = depth_index[depth].start_of_scope;
  const uint64_t word = pj.tape[scope_start];

  location = scope_start;
  current_val = word;
  current_type = static_cast<uint8_t>(word >> 56); // type tag is the top byte
}
|
||||
|
||||
bool ParsedJson::Iterator::next() {
|
||||
size_t npos;
|
||||
if ((current_type == '[') || (current_type == '{')) {
|
||||
// we need to jump
|
||||
npos = (current_val & JSON_VALUE_MASK);
|
||||
} else {
|
||||
npos = location + ((current_type == 'd' || current_type == 'l') ? 2 : 1);
|
||||
}
|
||||
uint64_t next_val = pj.tape[npos];
|
||||
uint8_t next_type = (next_val >> 56);
|
||||
if ((next_type == ']') || (next_type == '}')) {
|
||||
return false; // we reached the end of the scope
|
||||
}
|
||||
location = npos;
|
||||
current_val = next_val;
|
||||
current_type = next_type;
|
||||
return true;
|
||||
}
|
||||
} // namespace simdjson
|
||||
#endif
|
||||
|
|
|
@ -2,33 +2,32 @@
|
|||
#define SIMDJSON_PORTABILITY_H
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_AMD64)
|
||||
# define IS_X86_64 1
|
||||
#define IS_X86_64 1
|
||||
#endif
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
# define IS_ARM64 1
|
||||
#define IS_ARM64 1
|
||||
#endif
|
||||
|
||||
// this is almost standard?
|
||||
#define STRINGIFY(a) #a
|
||||
|
||||
|
||||
|
||||
// we are going to use runtime dispatch
|
||||
#ifdef IS_X86_64
|
||||
#ifdef __clang__
|
||||
// clang does not have GCC push pop
|
||||
// warning: clang attribute push can't be used within a namespace in clang up til 8.0 so TARGET_REGION and
|
||||
// UNTARGET_REGION must be *outside* of a namespace.
|
||||
#define TARGET_REGION(T) _Pragma(STRINGIFY(clang attribute push(__attribute__((target(T))), apply_to=function)))
|
||||
// warning: clang attribute push can't be used within a namespace in clang up
|
||||
// til 8.0 so TARGET_REGION and UNTARGET_REGION must be *outside* of a
|
||||
// namespace.
|
||||
#define TARGET_REGION(T) \
|
||||
_Pragma(STRINGIFY( \
|
||||
clang attribute push(__attribute__((target(T))), apply_to = function)))
|
||||
#define UNTARGET_REGION _Pragma("clang attribute pop")
|
||||
#elif defined(__GNUC__)
|
||||
// GCC is easier
|
||||
#define TARGET_REGION(T) \
|
||||
_Pragma("GCC push_options") \
|
||||
_Pragma(STRINGIFY(GCC target(T)))
|
||||
#define UNTARGET_REGION \
|
||||
_Pragma("GCC pop_options")
|
||||
#else
|
||||
#define TARGET_REGION(T) \
|
||||
_Pragma("GCC push_options") _Pragma(STRINGIFY(GCC target(T)))
|
||||
#define UNTARGET_REGION _Pragma("GCC pop_options")
|
||||
#else
|
||||
#define TARGET_REGION(T)
|
||||
#define UNTARGET_REGION
|
||||
#endif // clang then gcc
|
||||
|
@ -39,49 +38,50 @@ _Pragma("GCC pop_options")
|
|||
|
||||
#endif // x86
|
||||
|
||||
|
||||
|
||||
#ifdef _MSC_VER
|
||||
# include <intrin.h>
|
||||
#include <intrin.h>
|
||||
#else
|
||||
# if IS_X86_64
|
||||
# include <x86intrin.h>
|
||||
# elif IS_ARM64
|
||||
# include <arm_neon.h>
|
||||
# endif
|
||||
#if IS_X86_64
|
||||
#include <x86intrin.h>
|
||||
#elif IS_ARM64
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
/* Microsoft C/C++-compatible compiler */
|
||||
#include <iso646.h>
|
||||
#include <cstdint>
|
||||
#include <iso646.h>
|
||||
|
||||
namespace simdjson {
|
||||
static inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
|
||||
return _addcarry_u64(0, value1, value2, reinterpret_cast<unsigned __int64 *>(result));
|
||||
static inline bool add_overflow(uint64_t value1, uint64_t value2,
|
||||
uint64_t *result) {
|
||||
return _addcarry_u64(0, value1, value2,
|
||||
reinterpret_cast<unsigned __int64 *>(result));
|
||||
}
|
||||
|
||||
# pragma intrinsic(_umul128)
|
||||
static inline bool mul_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
|
||||
uint64_t high;
|
||||
*result = _umul128(value1, value2, &high);
|
||||
return high;
|
||||
#pragma intrinsic(_umul128)
|
||||
static inline bool mul_overflow(uint64_t value1, uint64_t value2,
|
||||
uint64_t *result) {
|
||||
uint64_t high;
|
||||
*result = _umul128(value1, value2, &high);
|
||||
return high;
|
||||
}
|
||||
|
||||
static inline int trailingzeroes(uint64_t input_num) {
|
||||
return static_cast<int>(_tzcnt_u64(input_num));
|
||||
static inline int trailing_zeroes(uint64_t input_num) {
|
||||
return static_cast<int>(_tzcnt_u64(input_num));
|
||||
}
|
||||
|
||||
static inline int leadingzeroes(uint64_t input_num) {
|
||||
return static_cast<int>(_lzcnt_u64(input_num));
|
||||
static inline int leading_zeroes(uint64_t input_num) {
|
||||
return static_cast<int>(_lzcnt_u64(input_num));
|
||||
}
|
||||
|
||||
static inline int hamming(uint64_t input_num) {
|
||||
#ifdef _WIN64 // highly recommended!!!
|
||||
return (int)__popcnt64(input_num);
|
||||
#else // if we must support 32-bit Windows
|
||||
return (int)(__popcnt((uint32_t)input_num) +
|
||||
__popcnt((uint32_t)(input_num >> 32)));
|
||||
#ifdef _WIN64 // highly recommended!!!
|
||||
return (int)__popcnt64(input_num);
|
||||
#else // if we must support 32-bit Windows
|
||||
return (int)(__popcnt((uint32_t)input_num) +
|
||||
__popcnt((uint32_t)(input_num >> 32)));
|
||||
#endif
|
||||
}
|
||||
} // namespace simdjson
|
||||
|
@ -90,78 +90,83 @@ static inline int hamming(uint64_t input_num) {
|
|||
#include <cstdlib>
|
||||
|
||||
namespace simdjson {
|
||||
// Writes value1 + value2 (wrapped to 64 bits) to *result and returns true
// when the mathematical sum does not fit in a uint64_t.
//
// The type-generic builtin takes the uint64_t pointer directly, so no cast
// to `unsigned long long *` (a distinct type from uint64_t on LP64 targets)
// is required.
static inline bool add_overflow(uint64_t value1, uint64_t value2,
                                uint64_t *result) {
  return __builtin_add_overflow(value1, value2, result);
}
|
||||
// Writes value1 * value2 (wrapped to 64 bits) to *result and returns true
// when the mathematical product does not fit in a uint64_t.
//
// The type-generic builtin takes the uint64_t pointer directly, so no cast
// to `unsigned long long *` (a distinct type from uint64_t on LP64 targets)
// is required.
static inline bool mul_overflow(uint64_t value1, uint64_t value2,
                                uint64_t *result) {
  return __builtin_mul_overflow(value1, value2, result);
}
|
||||
|
||||
/* Counts the zero bits below the lowest set bit of input_num.
 * The result might be undefined when input_num is zero. */
static inline int trailing_zeroes(uint64_t input_num) {
#ifdef __BMI__ // tzcnt is BMI1
  const auto zero_count = _tzcnt_u64(input_num);
#else
  const auto zero_count = __builtin_ctzll(input_num);
#endif
  return static_cast<int>(zero_count);
}
|
||||
|
||||
/* Counts the zero bits above the highest set bit of input_num.
 * The result might be undefined when input_num is zero. */
static inline int leading_zeroes(uint64_t input_num) {
// _lzcnt_u64 is part of the LZCNT extension (feature macro __LZCNT__), not
// BMI2, so the intrinsic is only selected when LZCNT itself is enabled.
#ifdef __LZCNT__
  return static_cast<int>(_lzcnt_u64(input_num));
#else
  return __builtin_clzll(input_num);
#endif
}
|
||||
|
||||
/* Returns the number of set bits in input_num (0 when input_num is zero). */
static inline int hamming(uint64_t input_num) {
  // No feature-macro switch: the compilers' popcount macro is __POPCNT__,
  // so a guard on __POPCOUNT__ never selects the intrinsic. The builtin is
  // valid on every target and lets the compiler choose the instruction.
  return __builtin_popcountll(input_num);
}
|
||||
}
|
||||
} // namespace simdjson
|
||||
#endif // _MSC_VER
|
||||
|
||||
|
||||
namespace simdjson {
|
||||
// Portable version of posix_memalign. On the POSIX branch returns nullptr
// when posix_memalign reports an error; on the MSVC and MinGW branches the
// platform allocator's return value is passed through unchanged.
static inline void *aligned_malloc(size_t alignment, size_t size) {
#ifdef _MSC_VER
  return _aligned_malloc(size, alignment);
#elif defined(__MINGW32__) || defined(__MINGW64__)
  return __mingw_aligned_malloc(size, alignment);
#else
  // somehow, if this is used before including "x86intrin.h", it creates an
  // implicit defined warning.
  void *block = nullptr;
  const int status = posix_memalign(&block, alignment, size);
  return (status == 0) ? block : nullptr;
#endif
}
|
||||
|
||||
static inline char *aligned_malloc_char(size_t alignment, size_t size) {
|
||||
return (char*)aligned_malloc(alignment, size);
|
||||
return (char *)aligned_malloc(alignment, size);
|
||||
}
|
||||
|
||||
// Releases a block through the platform's matching deallocator; a null
// pointer is ignored.
static inline void aligned_free(void *mem_block) {
  if (mem_block != nullptr) {
#ifdef _MSC_VER
    _aligned_free(mem_block);
#elif defined(__MINGW32__) || defined(__MINGW64__)
    __mingw_aligned_free(mem_block);
#else
    free(mem_block);
#endif
  }
}
|
||||
|
||||
|
||||
|
||||
// Same as aligned_free, for a block held through a char pointer.
static inline void aligned_free_char(char *mem_block) {
  aligned_free(static_cast<void *>(mem_block));
}
|
||||
} // namespace simdjson
|
||||
#endif // SIMDJSON_PORTABILITY_H
|
||||
|
|
|
@ -5,38 +5,40 @@
|
|||
|
||||
namespace simdjson {
|
||||
// Represents the minimal architecture that would support an implementation
enum class Architecture {
  WESTMERE,
  HASWELL,
  ARM64,
  NONE,
// TODO remove 'native' in favor of runtime dispatch?
// NATIVE should point at a good default on the current machine. It is only
// declared when IS_X86_64 or IS_ARM64 is defined.
#ifdef IS_X86_64
  NATIVE = WESTMERE
#elif defined(IS_ARM64)
  NATIVE = ARM64
#endif
};
|
||||
|
||||
// Error codes; SUCCESS is zero and the remaining values follow in
// declaration order.
enum ErrorValues {
  SUCCESS = 0,
  // This ParsedJson can't support a document that big
  CAPACITY,
  // Error allocating memory, most likely out of memory
  MEMALLOC,
  // Something went wrong while writing to the tape (stage 2), this is a
  // generic error
  TAPE_ERROR,
  // Your document exceeds the user-specified depth limitation
  DEPTH_ERROR,
  // Problem while parsing a string
  STRING_ERROR,
  // Problem while parsing an atom starting with the letter 't'
  T_ATOM_ERROR,
  // Problem while parsing an atom starting with the letter 'f'
  F_ATOM_ERROR,
  // Problem while parsing an atom starting with the letter 'n'
  N_ATOM_ERROR,
  // Problem while parsing a number
  NUMBER_ERROR,
  // the input is not valid UTF-8
  UTF8_ERROR,
  // unknown error, or uninitialized document
  UNITIALIZED,
  // no structural document found
  EMPTY,
  // found unescaped characters in a string.
  UNESCAPED_CHARS,
  // missing quote at the end
  UNCLOSED_STRING,
  // indicative of a bug in simdjson
  UNEXPECTED_ERROR
};
|
||||
const std::string& errorMsg(const int);
|
||||
}
|
||||
const std::string &error_message(const int);
|
||||
} // namespace simdjson
|
||||
#endif
|
||||
|
|
|
@ -1,12 +1,13 @@
|
|||
// /include/simdjson/simdjson_version.h automatically generated by release.py, do not change by hand
|
||||
#ifndef SIMDJSON_INCLUDE_SIMDJSON_VERSION
|
||||
#define SIMDJSON_INCLUDE_SIMDJSON_VERSION
|
||||
#define SIMDJSON_VERSION 0.1.2
|
||||
// /include/simdjson/simdjson_version.h automatically generated by release.py,
|
||||
// do not change by hand
|
||||
#ifndef SIMDJSON_INCLUDE_SIMDJSON_VERSION
|
||||
#define SIMDJSON_INCLUDE_SIMDJSON_VERSION
|
||||
#define SIMDJSON_VERSION 0.1.2
|
||||
namespace simdjson {
|
||||
enum {
|
||||
SIMDJSON_VERSION_MAJOR = 0,
|
||||
SIMDJSON_VERSION_MINOR = 1,
|
||||
SIMDJSON_VERSION_REVISION = 2
|
||||
};
|
||||
enum {
|
||||
SIMDJSON_VERSION_MAJOR = 0,
|
||||
SIMDJSON_VERSION_MINOR = 1,
|
||||
SIMDJSON_VERSION_REVISION = 2
|
||||
};
|
||||
}
|
||||
#endif // SIMDJSON_INCLUDE_SIMDJSON_VERSION
|
||||
#endif // SIMDJSON_INCLUDE_SIMDJSON_VERSION
|
||||
|
|
|
@ -4,14 +4,15 @@
|
|||
#ifndef SIMDJSON_SIMDUTF8CHECK_ARM64_H
|
||||
#define SIMDJSON_SIMDUTF8CHECK_ARM64_H
|
||||
|
||||
#if defined(_ARM_NEON) || defined(__aarch64__) || (defined(_MSC_VER) && defined(_M_ARM64))
|
||||
#if defined(_ARM_NEON) || defined(__aarch64__) || \
|
||||
(defined(_MSC_VER) && defined(_M_ARM64))
|
||||
|
||||
#include <cstdio>
|
||||
#include <arm_neon.h>
|
||||
#include <cinttypes>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <cinttypes>
|
||||
#include <arm_neon.h>
|
||||
|
||||
/*
|
||||
* legal utf-8 byte sequence
|
||||
|
@ -32,47 +33,49 @@
|
|||
namespace simdjson {
|
||||
|
||||
// all byte values must be no larger than 0xF4
|
||||
static inline void checkSmallerThan0xF4(int8x16_t current_bytes,
|
||||
int8x16_t *has_error) {
|
||||
static inline void check_smaller_than_0xF4(int8x16_t current_bytes,
|
||||
int8x16_t *has_error) {
|
||||
// unsigned, saturates to 0 below max
|
||||
*has_error = vorrq_s8(*has_error,
|
||||
vreinterpretq_s8_u8(vqsubq_u8(vreinterpretq_u8_s8(current_bytes), vdupq_n_u8(0xF4))));
|
||||
*has_error = vorrq_s8(
|
||||
*has_error, vreinterpretq_s8_u8(vqsubq_u8(
|
||||
vreinterpretq_u8_s8(current_bytes), vdupq_n_u8(0xF4))));
|
||||
}
|
||||
|
||||
// 16-entry table looked up by a byte's high nibble.
static const int8_t _nibbles[] = {
    // high nibble 0x0-0x7: 0xxx (ASCII)
    1, 1, 1, 1,
    1, 1, 1, 1,
    // high nibble 0x8-0xB: 10xx (continuation)
    0, 0, 0, 0,
    // high nibble 0xC-0xD: 110x
    2, 2,
    // high nibble 0xE: 1110
    3,
    // high nibble 0xF: 1111, next should be 0 (not checked here)
    4,
};
|
||||
|
||||
// Looks up, for every lane, the _nibbles entry selected by that lane's
// high-nibble value.
static inline int8x16_t continuation_lengths(int8x16_t high_nibbles) {
  const int8x16_t length_table = vld1q_s8(_nibbles);
  const uint8x16_t lane_index = vreinterpretq_u8_s8(high_nibbles);
  return vqtbl1q_s8(length_table, lane_index);
}
|
||||
|
||||
// Adds to each lane's initial length the lengths carried over from the one
// and two preceding lanes (taken from previous_carries at the block start),
// each reduced by its distance with unsigned saturation.
static inline int8x16_t carry_continuations(int8x16_t initial_lengths,
                                            int8x16_t previous_carries) {
  // lengths shifted by one lane, minus 1 (saturating at 0)
  const int8x16_t shifted_by_1 =
      vextq_s8(previous_carries, initial_lengths, 16 - 1);
  const int8x16_t right1 = vreinterpretq_s8_u8(
      vqsubq_u8(vreinterpretq_u8_s8(shifted_by_1), vdupq_n_u8(1)));
  const int8x16_t sum = vaddq_s8(initial_lengths, right1);

  // partial sums shifted by two lanes, minus 2 (saturating at 0)
  const int8x16_t shifted_by_2 = vextq_s8(previous_carries, sum, 16 - 2);
  const int8x16_t right2 = vreinterpretq_s8_u8(
      vqsubq_u8(vreinterpretq_u8_s8(shifted_by_2), vdupq_n_u8(2)));
  return vaddq_s8(sum, right2);
}
|
||||
|
||||
static inline void checkContinuations(int8x16_t initial_lengths, int8x16_t carries,
|
||||
int8x16_t *has_error) {
|
||||
static inline void check_continuations(int8x16_t initial_lengths,
|
||||
int8x16_t carries,
|
||||
int8x16_t *has_error) {
|
||||
|
||||
// overlap || underlap
|
||||
// carry > length && length > 0 || !(carry > length) && !(length > 0)
|
||||
// (carries > length) == (lengths > 0)
|
||||
uint8x16_t overunder =
|
||||
vceqq_u8(vcgtq_s8(carries, initial_lengths),
|
||||
vcgtq_s8(initial_lengths, vdupq_n_s8(0)));
|
||||
uint8x16_t overunder = vceqq_u8(vcgtq_s8(carries, initial_lengths),
|
||||
vcgtq_s8(initial_lengths, vdupq_n_s8(0)));
|
||||
|
||||
*has_error = vorrq_s8(*has_error, vreinterpretq_s8_u8(overunder));
|
||||
}
|
||||
|
@ -80,9 +83,9 @@ static inline void checkContinuations(int8x16_t initial_lengths, int8x16_t carri
|
|||
// when 0xED is found, next byte must be no larger than 0x9F
|
||||
// when 0xF4 is found, next byte must be no larger than 0x8F
|
||||
// next byte must be continuation, ie sign bit is set, so signed < is ok
|
||||
static inline void checkFirstContinuationMax(int8x16_t current_bytes,
|
||||
int8x16_t off1_current_bytes,
|
||||
int8x16_t *has_error) {
|
||||
static inline void check_first_continuation_max(int8x16_t current_bytes,
|
||||
int8x16_t off1_current_bytes,
|
||||
int8x16_t *has_error) {
|
||||
uint8x16_t maskED = vceqq_s8(off1_current_bytes, vdupq_n_s8(0xED));
|
||||
uint8x16_t maskF4 = vceqq_s8(off1_current_bytes, vdupq_n_s8(0xF4));
|
||||
|
||||
|
@ -91,23 +94,24 @@ static inline void checkFirstContinuationMax(int8x16_t current_bytes,
|
|||
uint8x16_t badfollowF4 =
|
||||
vandq_u8(vcgtq_s8(current_bytes, vdupq_n_s8(0x8F)), maskF4);
|
||||
|
||||
*has_error = vorrq_s8(*has_error, vreinterpretq_s8_u8(vorrq_u8(badfollowED, badfollowF4)));
|
||||
*has_error = vorrq_s8(
|
||||
*has_error, vreinterpretq_s8_u8(vorrq_u8(badfollowED, badfollowF4)));
|
||||
}
|
||||
|
||||
// 16-entry table looked up by the previous byte's high nibble.
static const int8_t _initial_mins[] = {
    // high nibble 0x0-0xB (through 10xx) => false
    -128, -128, -128, -128,
    -128, -128, -128, -128,
    -128, -128, -128, -128,
    // high nibble 0xC-0xD: 110x
    (int8_t)0xC2, -128,
    // high nibble 0xE: 1110
    (int8_t)0xE1,
    // high nibble 0xF
    (int8_t)0xF1,
};
|
||||
|
||||
// 16-entry table looked up by the previous byte's high nibble.
static const int8_t _second_mins[] = {
    // high nibble 0x0-0xB (through 10xx) => false
    -128, -128, -128, -128,
    -128, -128, -128, -128,
    -128, -128, -128, -128,
    // high nibble 0xC-0xD: 110x => true
    127, 127,
    // high nibble 0xE: 1110
    (int8_t)0xA0,
    // high nibble 0xF
    (int8_t)0x90,
};
|
||||
|
||||
// map off1_hibits => error condition
|
||||
|
@ -116,58 +120,61 @@ static const int8_t _second_mins[] = {
|
|||
// E => < E1 && < A0
|
||||
// F => < F1 && < 90
|
||||
// else false && false
|
||||
static inline void checkOverlong(int8x16_t current_bytes,
|
||||
int8x16_t off1_current_bytes, int8x16_t hibits,
|
||||
int8x16_t previous_hibits, int8x16_t *has_error) {
|
||||
static inline void check_overlong(int8x16_t current_bytes,
|
||||
int8x16_t off1_current_bytes,
|
||||
int8x16_t hibits, int8x16_t previous_hibits,
|
||||
int8x16_t *has_error) {
|
||||
int8x16_t off1_hibits = vextq_s8(previous_hibits, hibits, 16 - 1);
|
||||
int8x16_t initial_mins = vqtbl1q_s8(vld1q_s8(_initial_mins), vreinterpretq_u8_s8(off1_hibits));
|
||||
int8x16_t initial_mins =
|
||||
vqtbl1q_s8(vld1q_s8(_initial_mins), vreinterpretq_u8_s8(off1_hibits));
|
||||
|
||||
uint8x16_t initial_under = vcgtq_s8(initial_mins, off1_current_bytes);
|
||||
|
||||
int8x16_t second_mins = vqtbl1q_s8(vld1q_s8(_second_mins), vreinterpretq_u8_s8(off1_hibits));
|
||||
int8x16_t second_mins =
|
||||
vqtbl1q_s8(vld1q_s8(_second_mins), vreinterpretq_u8_s8(off1_hibits));
|
||||
uint8x16_t second_under = vcgtq_s8(second_mins, current_bytes);
|
||||
*has_error =
|
||||
vorrq_s8(*has_error, vreinterpretq_s8_u8(vandq_u8(initial_under, second_under)));
|
||||
*has_error = vorrq_s8(
|
||||
*has_error, vreinterpretq_s8_u8(vandq_u8(initial_under, second_under)));
|
||||
}
|
||||
|
||||
struct processed_utf_bytes {
|
||||
int8x16_t rawbytes;
|
||||
int8x16_t raw_bytes;
|
||||
int8x16_t high_nibbles;
|
||||
int8x16_t carried_continuations;
|
||||
};
|
||||
|
||||
// Stores the raw bytes and each byte's high nibble (unsigned shift right by
// four) into *answer; carried_continuations is left untouched.
static inline void count_nibbles(int8x16_t bytes,
                                 struct processed_utf_bytes *answer) {
  const uint8x16_t shifted = vshrq_n_u8(vreinterpretq_u8_s8(bytes), 4);
  answer->raw_bytes = bytes;
  answer->high_nibbles = vreinterpretq_s8_u8(shifted);
}
|
||||
|
||||
// check whether the current bytes are valid UTF-8
|
||||
// at the end of the function, previous gets updated
|
||||
static inline struct processed_utf_bytes
|
||||
checkUTF8Bytes(int8x16_t current_bytes, struct processed_utf_bytes *previous,
|
||||
int8x16_t *has_error) {
|
||||
check_utf8_bytes(int8x16_t current_bytes, struct processed_utf_bytes *previous,
|
||||
int8x16_t *has_error) {
|
||||
struct processed_utf_bytes pb;
|
||||
count_nibbles(current_bytes, &pb);
|
||||
|
||||
checkSmallerThan0xF4(current_bytes, has_error);
|
||||
check_smaller_than_0xF4(current_bytes, has_error);
|
||||
|
||||
int8x16_t initial_lengths = continuationLengths(pb.high_nibbles);
|
||||
int8x16_t initial_lengths = continuation_lengths(pb.high_nibbles);
|
||||
|
||||
pb.carried_continuations =
|
||||
carryContinuations(initial_lengths, previous->carried_continuations);
|
||||
carry_continuations(initial_lengths, previous->carried_continuations);
|
||||
|
||||
checkContinuations(initial_lengths, pb.carried_continuations, has_error);
|
||||
check_continuations(initial_lengths, pb.carried_continuations, has_error);
|
||||
|
||||
int8x16_t off1_current_bytes =
|
||||
vextq_s8(previous->rawbytes, pb.rawbytes, 16 - 1);
|
||||
checkFirstContinuationMax(current_bytes, off1_current_bytes, has_error);
|
||||
vextq_s8(previous->raw_bytes, pb.raw_bytes, 16 - 1);
|
||||
check_first_continuation_max(current_bytes, off1_current_bytes, has_error);
|
||||
|
||||
checkOverlong(current_bytes, off1_current_bytes, pb.high_nibbles,
|
||||
previous->high_nibbles, has_error);
|
||||
check_overlong(current_bytes, off1_current_bytes, pb.high_nibbles,
|
||||
previous->high_nibbles, has_error);
|
||||
return pb;
|
||||
}
|
||||
}// simdjson
|
||||
} // namespace simdjson
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
@ -1,11 +1,10 @@
|
|||
#ifndef SIMDJSON_SIMDUTF8CHECK_HASWELL_H
|
||||
#define SIMDJSON_SIMDUTF8CHECK_HASWELL_H
|
||||
|
||||
|
||||
#include "simdjson/portability.h"
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include "simdjson/portability.h"
|
||||
|
||||
#ifdef IS_X86_64
|
||||
/*
|
||||
|
@ -38,14 +37,14 @@ static inline __m256i push_last_2bytes_of_a_to_b(__m256i a, __m256i b) {
|
|||
}
|
||||
|
||||
// all byte values must be no larger than 0xF4
|
||||
static inline void avxcheckSmallerThan0xF4(__m256i current_bytes,
|
||||
__m256i *has_error) {
|
||||
static inline void avx_check_smaller_than_0xF4(__m256i current_bytes,
|
||||
__m256i *has_error) {
|
||||
// unsigned, saturates to 0 below max
|
||||
*has_error = _mm256_or_si256(
|
||||
*has_error, _mm256_subs_epu8(current_bytes, _mm256_set1_epi8(0xF4)));
|
||||
}
|
||||
|
||||
static inline __m256i avxcontinuationLengths(__m256i high_nibbles) {
|
||||
static inline __m256i avx_continuation_lengths(__m256i high_nibbles) {
|
||||
return _mm256_shuffle_epi8(
|
||||
_mm256_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII)
|
||||
0, 0, 0, 0, // 10xx (continuation)
|
||||
|
@ -61,8 +60,8 @@ static inline __m256i avxcontinuationLengths(__m256i high_nibbles) {
|
|||
high_nibbles);
|
||||
}
|
||||
|
||||
static inline __m256i avxcarryContinuations(__m256i initial_lengths,
|
||||
__m256i previous_carries) {
|
||||
static inline __m256i avx_carry_continuations(__m256i initial_lengths,
|
||||
__m256i previous_carries) {
|
||||
|
||||
__m256i right1 = _mm256_subs_epu8(
|
||||
push_last_byte_of_a_to_b(previous_carries, initial_lengths),
|
||||
|
@ -74,8 +73,9 @@ static inline __m256i avxcarryContinuations(__m256i initial_lengths,
|
|||
return _mm256_add_epi8(sum, right2);
|
||||
}
|
||||
|
||||
static inline void avxcheckContinuations(__m256i initial_lengths,
|
||||
__m256i carries, __m256i *has_error) {
|
||||
static inline void avx_check_continuations(__m256i initial_lengths,
|
||||
__m256i carries,
|
||||
__m256i *has_error) {
|
||||
|
||||
// overlap || underlap
|
||||
// carry > length && length > 0 || !(carry > length) && !(length > 0)
|
||||
|
@ -90,9 +90,9 @@ static inline void avxcheckContinuations(__m256i initial_lengths,
|
|||
// when 0xED is found, next byte must be no larger than 0x9F
|
||||
// when 0xF4 is found, next byte must be no larger than 0x8F
|
||||
// next byte must be continuation, ie sign bit is set, so signed < is ok
|
||||
static inline void avxcheckFirstContinuationMax(__m256i current_bytes,
|
||||
__m256i off1_current_bytes,
|
||||
__m256i *has_error) {
|
||||
static inline void avx_check_first_continuation_max(__m256i current_bytes,
|
||||
__m256i off1_current_bytes,
|
||||
__m256i *has_error) {
|
||||
__m256i maskED =
|
||||
_mm256_cmpeq_epi8(off1_current_bytes, _mm256_set1_epi8(0xED));
|
||||
__m256i maskF4 =
|
||||
|
@ -113,37 +113,37 @@ static inline void avxcheckFirstContinuationMax(__m256i current_bytes,
|
|||
// E => < E1 && < A0
|
||||
// F => < F1 && < 90
|
||||
// else false && false
|
||||
static inline void avxcheckOverlong(__m256i current_bytes,
|
||||
__m256i off1_current_bytes, __m256i hibits,
|
||||
__m256i previous_hibits,
|
||||
__m256i *has_error) {
|
||||
static inline void avx_check_overlong(__m256i current_bytes,
|
||||
__m256i off1_current_bytes,
|
||||
__m256i hibits, __m256i previous_hibits,
|
||||
__m256i *has_error) {
|
||||
__m256i off1_hibits = push_last_byte_of_a_to_b(previous_hibits, hibits);
|
||||
__m256i initial_mins = _mm256_shuffle_epi8(
|
||||
_mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128,
|
||||
-128, -128, -128, -128, // 10xx => false
|
||||
0xC2, -128, // 110x
|
||||
0xE1, // 1110
|
||||
0xF1, // 1111
|
||||
-128, -128, -128, -128, -128, -128, -128, -128,
|
||||
-128, -128, -128, -128, // 10xx => false
|
||||
0xC2, -128, // 110x
|
||||
0xE1, // 1110
|
||||
0xF1), // 1111
|
||||
_mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128,
|
||||
-128, -128, -128, // 10xx => false
|
||||
0xC2, -128, // 110x
|
||||
0xE1, // 1110
|
||||
0xF1, // 1111
|
||||
-128, -128, -128, -128, -128, -128, -128, -128, -128,
|
||||
-128, -128, -128, // 10xx => false
|
||||
0xC2, -128, // 110x
|
||||
0xE1, // 1110
|
||||
0xF1), // 1111
|
||||
off1_hibits);
|
||||
|
||||
__m256i initial_under = _mm256_cmpgt_epi8(initial_mins, off1_current_bytes);
|
||||
|
||||
__m256i second_mins = _mm256_shuffle_epi8(
|
||||
_mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128,
|
||||
-128, -128, -128, -128, // 10xx => false
|
||||
127, 127, // 110x => true
|
||||
0xA0, // 1110
|
||||
0x90, // 1111
|
||||
-128, -128, -128, -128, -128, -128, -128, -128,
|
||||
-128, -128, -128, -128, // 10xx => false
|
||||
127, 127, // 110x => true
|
||||
0xA0, // 1110
|
||||
0x90), // 1111
|
||||
_mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128,
|
||||
-128, -128, -128, // 10xx => false
|
||||
127, 127, // 110x => true
|
||||
0xA0, // 1110
|
||||
0x90, // 1111
|
||||
-128, -128, -128, -128, -128, -128, -128, -128, -128,
|
||||
-128, -128, -128, // 10xx => false
|
||||
127, 127, // 110x => true
|
||||
0xA0, // 1110
|
||||
0x90), // 1111
|
||||
off1_hibits);
|
||||
__m256i second_under = _mm256_cmpgt_epi8(second_mins, current_bytes);
|
||||
*has_error = _mm256_or_si256(*has_error,
|
||||
|
@ -151,14 +151,14 @@ static inline void avxcheckOverlong(__m256i current_bytes,
|
|||
}
|
||||
|
||||
struct avx_processed_utf_bytes {
|
||||
__m256i rawbytes;
|
||||
__m256i raw_bytes;
|
||||
__m256i high_nibbles;
|
||||
__m256i carried_continuations;
|
||||
};
|
||||
|
||||
// Stores the raw bytes and each byte's high nibble into *answer;
// carried_continuations is left untouched.
static inline void avx_count_nibbles(__m256i bytes,
                                     struct avx_processed_utf_bytes *answer) {
  // the shift works on 16-bit lanes, so the 0x0F mask removes the bits that
  // crossed over from the neighbouring byte
  const __m256i shifted = _mm256_srli_epi16(bytes, 4);
  const __m256i low_nibble_mask = _mm256_set1_epi8(0x0F);
  answer->raw_bytes = bytes;
  answer->high_nibbles = _mm256_and_si256(shifted, low_nibble_mask);
}
|
||||
|
@ -166,33 +166,33 @@ static inline void avx_count_nibbles(__m256i bytes,
|
|||
// check whether the current bytes are valid UTF-8
|
||||
// at the end of the function, previous gets updated
|
||||
static inline struct avx_processed_utf_bytes
|
||||
avxcheckUTF8Bytes(__m256i current_bytes,
|
||||
struct avx_processed_utf_bytes *previous,
|
||||
__m256i *has_error) {
|
||||
struct avx_processed_utf_bytes pb{};
|
||||
avx_check_utf8_bytes(__m256i current_bytes,
|
||||
struct avx_processed_utf_bytes *previous,
|
||||
__m256i *has_error) {
|
||||
struct avx_processed_utf_bytes pb {};
|
||||
avx_count_nibbles(current_bytes, &pb);
|
||||
|
||||
avxcheckSmallerThan0xF4(current_bytes, has_error);
|
||||
avx_check_smaller_than_0xF4(current_bytes, has_error);
|
||||
|
||||
__m256i initial_lengths = avxcontinuationLengths(pb.high_nibbles);
|
||||
__m256i initial_lengths = avx_continuation_lengths(pb.high_nibbles);
|
||||
|
||||
pb.carried_continuations =
|
||||
avxcarryContinuations(initial_lengths, previous->carried_continuations);
|
||||
avx_carry_continuations(initial_lengths, previous->carried_continuations);
|
||||
|
||||
avxcheckContinuations(initial_lengths, pb.carried_continuations, has_error);
|
||||
avx_check_continuations(initial_lengths, pb.carried_continuations, has_error);
|
||||
|
||||
__m256i off1_current_bytes =
|
||||
push_last_byte_of_a_to_b(previous->rawbytes, pb.rawbytes);
|
||||
avxcheckFirstContinuationMax(current_bytes, off1_current_bytes, has_error);
|
||||
push_last_byte_of_a_to_b(previous->raw_bytes, pb.raw_bytes);
|
||||
avx_check_first_continuation_max(current_bytes, off1_current_bytes,
|
||||
has_error);
|
||||
|
||||
avxcheckOverlong(current_bytes, off1_current_bytes, pb.high_nibbles,
|
||||
previous->high_nibbles, has_error);
|
||||
avx_check_overlong(current_bytes, off1_current_bytes, pb.high_nibbles,
|
||||
previous->high_nibbles, has_error);
|
||||
return pb;
|
||||
}
|
||||
}// simdjson
|
||||
} // namespace simdjson
|
||||
UNTARGET_REGION // haswell
|
||||
|
||||
|
||||
#endif // IS_X86_64
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
#ifndef SIMDJSON_SIMDUTF8CHECK_WESTMERE_H
|
||||
#define SIMDJSON_SIMDUTF8CHECK_WESTMERE_H
|
||||
|
||||
#include "simdjson/portability.h"
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include "simdjson/portability.h"
|
||||
#ifdef IS_X86_64
|
||||
|
||||
/*
|
||||
|
@ -29,16 +29,16 @@
|
|||
/********** sse code **********/
|
||||
TARGET_WESTMERE
|
||||
|
||||
namespace simdjson{
|
||||
namespace simdjson {
|
||||
// all byte values must be no larger than 0xF4
|
||||
static inline void checkSmallerThan0xF4(__m128i current_bytes,
|
||||
__m128i *has_error) {
|
||||
static inline void check_smaller_than_0xF4(__m128i current_bytes,
|
||||
__m128i *has_error) {
|
||||
// unsigned, saturates to 0 below max
|
||||
*has_error = _mm_or_si128(*has_error,
|
||||
_mm_subs_epu8(current_bytes, _mm_set1_epi8(0xF4)));
|
||||
}
|
||||
|
||||
static inline __m128i continuationLengths(__m128i high_nibbles) {
|
||||
static inline __m128i continuation_lengths(__m128i high_nibbles) {
|
||||
return _mm_shuffle_epi8(
|
||||
_mm_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII)
|
||||
0, 0, 0, 0, // 10xx (continuation)
|
||||
|
@ -48,8 +48,8 @@ static inline __m128i continuationLengths(__m128i high_nibbles) {
|
|||
high_nibbles);
|
||||
}
|
||||
|
||||
static inline __m128i carryContinuations(__m128i initial_lengths,
|
||||
__m128i previous_carries) {
|
||||
static inline __m128i carry_continuations(__m128i initial_lengths,
|
||||
__m128i previous_carries) {
|
||||
|
||||
__m128i right1 =
|
||||
_mm_subs_epu8(_mm_alignr_epi8(initial_lengths, previous_carries, 16 - 1),
|
||||
|
@ -61,8 +61,8 @@ static inline __m128i carryContinuations(__m128i initial_lengths,
|
|||
return _mm_add_epi8(sum, right2);
|
||||
}
|
||||
|
||||
static inline void checkContinuations(__m128i initial_lengths, __m128i carries,
|
||||
__m128i *has_error) {
|
||||
static inline void check_continuations(__m128i initial_lengths, __m128i carries,
|
||||
__m128i *has_error) {
|
||||
|
||||
// overlap || underlap
|
||||
// carry > length && length > 0 || !(carry > length) && !(length > 0)
|
||||
|
@ -77,9 +77,9 @@ static inline void checkContinuations(__m128i initial_lengths, __m128i carries,
|
|||
// when 0xED is found, next byte must be no larger than 0x9F
|
||||
// when 0xF4 is found, next byte must be no larger than 0x8F
|
||||
// next byte must be continuation, ie sign bit is set, so signed < is ok
|
||||
static inline void checkFirstContinuationMax(__m128i current_bytes,
|
||||
__m128i off1_current_bytes,
|
||||
__m128i *has_error) {
|
||||
static inline void check_first_continuation_max(__m128i current_bytes,
|
||||
__m128i off1_current_bytes,
|
||||
__m128i *has_error) {
|
||||
__m128i maskED = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xED));
|
||||
__m128i maskF4 = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xF4));
|
||||
|
||||
|
@ -97,9 +97,9 @@ static inline void checkFirstContinuationMax(__m128i current_bytes,
|
|||
// E => < E1 && < A0
|
||||
// F => < F1 && < 90
|
||||
// else false && false
|
||||
static inline void checkOverlong(__m128i current_bytes,
|
||||
__m128i off1_current_bytes, __m128i hibits,
|
||||
__m128i previous_hibits, __m128i *has_error) {
|
||||
static inline void check_overlong(__m128i current_bytes,
|
||||
__m128i off1_current_bytes, __m128i hibits,
|
||||
__m128i previous_hibits, __m128i *has_error) {
|
||||
__m128i off1_hibits = _mm_alignr_epi8(hibits, previous_hibits, 16 - 1);
|
||||
__m128i initial_mins = _mm_shuffle_epi8(
|
||||
_mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
|
||||
|
@ -124,14 +124,14 @@ static inline void checkOverlong(__m128i current_bytes,
|
|||
}
|
||||
|
||||
struct processed_utf_bytes {
|
||||
__m128i rawbytes;
|
||||
__m128i raw_bytes;
|
||||
__m128i high_nibbles;
|
||||
__m128i carried_continuations;
|
||||
};
|
||||
|
||||
static inline void count_nibbles(__m128i bytes,
|
||||
struct processed_utf_bytes *answer) {
|
||||
answer->rawbytes = bytes;
|
||||
answer->raw_bytes = bytes;
|
||||
answer->high_nibbles =
|
||||
_mm_and_si128(_mm_srli_epi16(bytes, 4), _mm_set1_epi8(0x0F));
|
||||
}
|
||||
|
@ -139,32 +139,31 @@ static inline void count_nibbles(__m128i bytes,
|
|||
// check whether the current bytes are valid UTF-8
|
||||
// at the end of the function, previous gets updated
|
||||
static struct processed_utf_bytes
|
||||
checkUTF8Bytes(__m128i current_bytes, struct processed_utf_bytes *previous,
|
||||
__m128i *has_error) {
|
||||
check_utf8_bytes(__m128i current_bytes, struct processed_utf_bytes *previous,
|
||||
__m128i *has_error) {
|
||||
struct processed_utf_bytes pb;
|
||||
count_nibbles(current_bytes, &pb);
|
||||
|
||||
checkSmallerThan0xF4(current_bytes, has_error);
|
||||
check_smaller_than_0xF4(current_bytes, has_error);
|
||||
|
||||
__m128i initial_lengths = continuationLengths(pb.high_nibbles);
|
||||
__m128i initial_lengths = continuation_lengths(pb.high_nibbles);
|
||||
|
||||
pb.carried_continuations =
|
||||
carryContinuations(initial_lengths, previous->carried_continuations);
|
||||
carry_continuations(initial_lengths, previous->carried_continuations);
|
||||
|
||||
checkContinuations(initial_lengths, pb.carried_continuations, has_error);
|
||||
check_continuations(initial_lengths, pb.carried_continuations, has_error);
|
||||
|
||||
__m128i off1_current_bytes =
|
||||
_mm_alignr_epi8(pb.rawbytes, previous->rawbytes, 16 - 1);
|
||||
checkFirstContinuationMax(current_bytes, off1_current_bytes, has_error);
|
||||
_mm_alignr_epi8(pb.raw_bytes, previous->raw_bytes, 16 - 1);
|
||||
check_first_continuation_max(current_bytes, off1_current_bytes, has_error);
|
||||
|
||||
checkOverlong(current_bytes, off1_current_bytes, pb.high_nibbles,
|
||||
previous->high_nibbles, has_error);
|
||||
check_overlong(current_bytes, off1_current_bytes, pb.high_nibbles,
|
||||
previous->high_nibbles, has_error);
|
||||
return pb;
|
||||
}
|
||||
}//simdjson
|
||||
} // namespace simdjson
|
||||
UNTARGET_REGION // westmere
|
||||
|
||||
|
||||
#endif // IS_X86_64
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,67 +1,60 @@
|
|||
#ifndef SIMDJSON_STAGE1_FIND_MARKS_H
|
||||
#define SIMDJSON_STAGE1_FIND_MARKS_H
|
||||
|
||||
#include <cassert>
|
||||
#include "simdjson/common_defs.h"
|
||||
#include "simdjson/simdjson.h"
|
||||
#include "simdjson/parsedjson.h"
|
||||
#include "simdjson/portability.h"
|
||||
#include "simdjson/simdjson.h"
|
||||
#include <cassert>
|
||||
|
||||
namespace simdjson {
|
||||
|
||||
template<architecture>
|
||||
struct simd_input;
|
||||
template <Architecture> struct simd_input;
|
||||
|
||||
template<architecture T>
|
||||
uint64_t compute_quote_mask(uint64_t quote_bits);
|
||||
template <Architecture T> uint64_t compute_quote_mask(uint64_t quote_bits);
|
||||
|
||||
namespace {
|
||||
// for when clmul is unavailable
|
||||
[[maybe_unused]] uint64_t portable_compute_quote_mask(uint64_t quote_bits) {
|
||||
uint64_t quote_mask = quote_bits ^ (quote_bits << 1);
|
||||
quote_mask = quote_mask ^ (quote_mask << 2);
|
||||
quote_mask = quote_mask ^ (quote_mask << 4);
|
||||
quote_mask = quote_mask ^ (quote_mask << 8);
|
||||
quote_mask = quote_mask ^ (quote_mask << 16);
|
||||
quote_mask = quote_mask ^ (quote_mask << 32);
|
||||
return quote_mask;
|
||||
}
|
||||
// for when clmul is unavailable
|
||||
[[maybe_unused]] uint64_t portable_compute_quote_mask(uint64_t quote_bits) {
|
||||
uint64_t quote_mask = quote_bits ^ (quote_bits << 1);
|
||||
quote_mask = quote_mask ^ (quote_mask << 2);
|
||||
quote_mask = quote_mask ^ (quote_mask << 4);
|
||||
quote_mask = quote_mask ^ (quote_mask << 8);
|
||||
quote_mask = quote_mask ^ (quote_mask << 16);
|
||||
quote_mask = quote_mask ^ (quote_mask << 32);
|
||||
return quote_mask;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// Holds the state required to perform check_utf8().
|
||||
template<architecture>
|
||||
struct utf8_checking_state;
|
||||
template <Architecture> struct utf8_checking_state;
|
||||
|
||||
|
||||
template<architecture T>
|
||||
void check_utf8(simd_input<T> in, utf8_checking_state<T>& state);
|
||||
template <Architecture T>
|
||||
void check_utf8(simd_input<T> in, utf8_checking_state<T> &state);
|
||||
|
||||
// Checks if the utf8 validation has found any error.
|
||||
template<architecture T>
|
||||
errorValues check_utf8_errors(utf8_checking_state<T>& state);
|
||||
template <Architecture T>
|
||||
ErrorValues check_utf8_errors(utf8_checking_state<T> &state);
|
||||
|
||||
// a straightforward comparison of a mask against input.
|
||||
template<architecture T>
|
||||
// a straightforward comparison of a mask against input.
|
||||
template <Architecture T>
|
||||
uint64_t cmp_mask_against_input(simd_input<T> in, uint8_t m);
|
||||
|
||||
|
||||
template<architecture T>
|
||||
simd_input<T> fill_input(const uint8_t * ptr);
|
||||
template <Architecture T> simd_input<T> fill_input(const uint8_t *ptr);
|
||||
|
||||
|
||||
// find all values less than or equal to the content of maxval (using unsigned arithmetic)
|
||||
template<architecture T>
|
||||
// find all values less than or equal to the content of maxval (using unsigned
|
||||
// arithmetic)
|
||||
template <Architecture T>
|
||||
uint64_t unsigned_lteq_against_input(simd_input<T> in, uint8_t m);
|
||||
|
||||
template <Architecture T>
|
||||
really_inline uint64_t find_odd_backslash_sequences(
|
||||
simd_input<T> in, uint64_t &prev_iter_ends_odd_backslash);
|
||||
|
||||
template<architecture T> really_inline
|
||||
uint64_t find_odd_backslash_sequences(simd_input<T> in, uint64_t &prev_iter_ends_odd_backslash);
|
||||
|
||||
|
||||
template<architecture T> really_inline
|
||||
uint64_t find_quote_mask_and_bits(simd_input<T> in, uint64_t odd_ends,
|
||||
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits, uint64_t &error_mask);
|
||||
|
||||
template <Architecture T>
|
||||
really_inline uint64_t find_quote_mask_and_bits(
|
||||
simd_input<T> in, uint64_t odd_ends, uint64_t &prev_iter_inside_quote,
|
||||
uint64_t &quote_bits, uint64_t &error_mask);
|
||||
|
||||
// do a 'shufti' to detect structural JSON characters
|
||||
// they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c
|
||||
|
@ -70,9 +63,8 @@ uint64_t find_quote_mask_and_bits(simd_input<T> in, uint64_t odd_ends,
|
|||
// we are also interested in the four whitespace characters
|
||||
// space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
|
||||
// these go into the next 2 buckets of the comparison (8/16)
|
||||
template<architecture T>
|
||||
void find_whitespace_and_structurals(simd_input<T> in,
|
||||
uint64_t &whitespace,
|
||||
template <Architecture T>
|
||||
void find_whitespace_and_structurals(simd_input<T> in, uint64_t &whitespace,
|
||||
uint64_t &structurals);
|
||||
|
||||
// return an updated structural bit vector with quoted contents cleared out and
|
||||
|
@ -86,7 +78,7 @@ really_inline uint64_t finalize_structurals(
|
|||
uint64_t quote_bits, uint64_t &prev_iter_ends_pseudo_pred) {
|
||||
// mask off anything inside quotes
|
||||
structurals &= ~quote_mask;
|
||||
// add the real quote bits back into our bitmask as well, so we can
|
||||
// add the real quote bits back into our bit_mask as well, so we can
|
||||
// quickly traverse the strings we've spent all this trouble gathering
|
||||
structurals |= quote_bits;
|
||||
// Now, establish "pseudo-structural characters". These are non-whitespace
|
||||
|
@ -114,12 +106,14 @@ really_inline uint64_t finalize_structurals(
|
|||
return structurals;
|
||||
}
|
||||
|
||||
template<architecture T = architecture::native>
|
||||
int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &pj);
|
||||
template <Architecture T = Architecture::NATIVE>
|
||||
int find_structural_bits(const uint8_t *buf, size_t len,
|
||||
simdjson::ParsedJson &pj);
|
||||
|
||||
template<architecture T = architecture::native>
|
||||
int find_structural_bits(const char *buf, size_t len, simdjson::ParsedJson &pj) {
|
||||
return find_structural_bits((const uint8_t*)buf, len, pj);
|
||||
template <Architecture T = Architecture::NATIVE>
|
||||
int find_structural_bits(const char *buf, size_t len,
|
||||
simdjson::ParsedJson &pj) {
|
||||
return find_structural_bits((const uint8_t *)buf, len, pj);
|
||||
}
|
||||
|
||||
} // namespace simdjson
|
||||
|
|
|
@ -1,23 +1,24 @@
|
|||
#ifndef SIMDJSON_STAGE1_FIND_MARKS_ARM64_H
|
||||
#define SIMDJSON_STAGE1_FIND_MARKS_ARM64_H
|
||||
|
||||
#include "simdjson/stage1_find_marks.h"
|
||||
#include "simdjson/stage1_find_marks_macros.h"
|
||||
#include "simdjson/stage1_find_marks_flatten.h"
|
||||
#include "simdjson/simdutf8check_arm64.h"
|
||||
#include "simdjson/stage1_find_marks.h"
|
||||
#include "simdjson/stage1_find_marks_flatten.h"
|
||||
#include "simdjson/stage1_find_marks_macros.h"
|
||||
|
||||
#ifdef IS_ARM64
|
||||
namespace simdjson {
|
||||
template<> struct simd_input<architecture::arm64> {
|
||||
template <> struct simd_input<Architecture::ARM64> {
|
||||
uint8x16_t i0;
|
||||
uint8x16_t i1;
|
||||
uint8x16_t i2;
|
||||
uint8x16_t i3;
|
||||
};
|
||||
|
||||
template<> really_inline
|
||||
simd_input<architecture::arm64> fill_input<architecture::arm64>(const uint8_t * ptr) {
|
||||
struct simd_input<architecture::arm64> in;
|
||||
template <>
|
||||
really_inline simd_input<Architecture::ARM64>
|
||||
fill_input<Architecture::ARM64>(const uint8_t *ptr) {
|
||||
struct simd_input<Architecture::ARM64> in;
|
||||
in.i0 = vld1q_u8(ptr + 0);
|
||||
in.i1 = vld1q_u8(ptr + 16);
|
||||
in.i2 = vld1q_u8(ptr + 32);
|
||||
|
@ -25,26 +26,24 @@ simd_input<architecture::arm64> fill_input<architecture::arm64>(const uint8_t *
|
|||
return in;
|
||||
}
|
||||
|
||||
|
||||
really_inline
|
||||
uint16_t neonmovemask(uint8x16_t input) {
|
||||
const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
||||
really_inline uint16_t neon_movemask(uint8x16_t input) {
|
||||
const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
||||
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
|
||||
uint8x16_t minput = vandq_u8(input, bitmask);
|
||||
uint8x16_t minput = vandq_u8(input, bit_mask);
|
||||
uint8x16_t tmp = vpaddq_u8(minput, minput);
|
||||
tmp = vpaddq_u8(tmp, tmp);
|
||||
tmp = vpaddq_u8(tmp, tmp);
|
||||
return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
|
||||
}
|
||||
|
||||
really_inline
|
||||
uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16_t p3) {
|
||||
const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
||||
really_inline uint64_t neon_movemask_bulk(uint8x16_t p0, uint8x16_t p1,
|
||||
uint8x16_t p2, uint8x16_t p3) {
|
||||
const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
||||
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
|
||||
uint8x16_t t0 = vandq_u8(p0, bitmask);
|
||||
uint8x16_t t1 = vandq_u8(p1, bitmask);
|
||||
uint8x16_t t2 = vandq_u8(p2, bitmask);
|
||||
uint8x16_t t3 = vandq_u8(p3, bitmask);
|
||||
uint8x16_t t0 = vandq_u8(p0, bit_mask);
|
||||
uint8x16_t t1 = vandq_u8(p1, bit_mask);
|
||||
uint8x16_t t2 = vandq_u8(p2, bit_mask);
|
||||
uint8x16_t t3 = vandq_u8(p3, bit_mask);
|
||||
uint8x16_t sum0 = vpaddq_u8(t0, t1);
|
||||
uint8x16_t sum1 = vpaddq_u8(t2, t3);
|
||||
sum0 = vpaddq_u8(sum0, sum1);
|
||||
|
@ -52,108 +51,122 @@ uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16
|
|||
return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
|
||||
}
|
||||
|
||||
template<> really_inline
|
||||
uint64_t compute_quote_mask<architecture::arm64>(uint64_t quote_bits) {
|
||||
template <>
|
||||
really_inline uint64_t
|
||||
compute_quote_mask<Architecture::ARM64>(uint64_t quote_bits) {
|
||||
#ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension
|
||||
return vmull_p64( -1ULL, quote_bits);
|
||||
return vmull_p64(-1ULL, quote_bits);
|
||||
#else
|
||||
return portable_compute_quote_mask(quote_bits);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
template<>
|
||||
struct utf8_checking_state<architecture::arm64>
|
||||
{
|
||||
int8x16_t has_error {};
|
||||
processed_utf_bytes previous {};
|
||||
template <> struct utf8_checking_state<Architecture::ARM64> {
|
||||
int8x16_t has_error{};
|
||||
processed_utf_bytes previous{};
|
||||
};
|
||||
|
||||
// Checks that all bytes are ascii
|
||||
really_inline
|
||||
bool check_ascii_neon(simd_input<architecture::arm64> in) {
|
||||
really_inline bool check_ascii_neon(simd_input<Architecture::ARM64> in) {
|
||||
// checking if the most significant bit is always equal to 0.
|
||||
uint8x16_t highbit = vdupq_n_u8(0x80);
|
||||
uint8x16_t high_bit = vdupq_n_u8(0x80);
|
||||
uint8x16_t t0 = vorrq_u8(in.i0, in.i1);
|
||||
uint8x16_t t1 = vorrq_u8(in.i2, in.i3);
|
||||
uint8x16_t t3 = vorrq_u8(t0, t1);
|
||||
uint8x16_t t4 = vandq_u8(t3, highbit);
|
||||
uint8x16_t t4 = vandq_u8(t3, high_bit);
|
||||
uint64x2_t v64 = vreinterpretq_u64_u8(t4);
|
||||
uint32x2_t v32 = vqmovn_u64(v64);
|
||||
uint64x1_t result = vreinterpret_u64_u32(v32);
|
||||
return vget_lane_u64(result, 0) == 0;
|
||||
}
|
||||
|
||||
template<> really_inline
|
||||
void check_utf8<architecture::arm64>(simd_input<architecture::arm64> in,
|
||||
utf8_checking_state<architecture::arm64>& state) {
|
||||
template <>
|
||||
really_inline void check_utf8<Architecture::ARM64>(
|
||||
simd_input<Architecture::ARM64> in,
|
||||
utf8_checking_state<Architecture::ARM64> &state) {
|
||||
if (check_ascii_neon(in)) {
|
||||
// All bytes are ascii. Therefore the byte that was just before must be ascii too.
|
||||
// We only check the byte that was just before simd_input. Nines are arbitrary values.
|
||||
const int8x16_t verror = (int8x16_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1};
|
||||
// All bytes are ascii. Therefore the byte that was just before must be
|
||||
// ascii too. We only check the byte that was just before simd_input. Nines
|
||||
// are arbitrary values.
|
||||
const int8x16_t verror =
|
||||
(int8x16_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1};
|
||||
state.has_error =
|
||||
vorrq_s8(vreinterpretq_s8_u8(vcgtq_s8(state.previous.carried_continuations,
|
||||
verror)),
|
||||
state.has_error);
|
||||
vorrq_s8(vreinterpretq_s8_u8(
|
||||
vcgtq_s8(state.previous.carried_continuations, verror)),
|
||||
state.has_error);
|
||||
} else {
|
||||
// it is not ascii so we have to do heavy work
|
||||
state.previous = checkUTF8Bytes(vreinterpretq_s8_u8(in.i0), &(state.previous), &(state.has_error));
|
||||
state.previous = checkUTF8Bytes(vreinterpretq_s8_u8(in.i1), &(state.previous), &(state.has_error));
|
||||
state.previous = checkUTF8Bytes(vreinterpretq_s8_u8(in.i2), &(state.previous), &(state.has_error));
|
||||
state.previous = checkUTF8Bytes(vreinterpretq_s8_u8(in.i3), &(state.previous), &(state.has_error));
|
||||
state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i0),
|
||||
&(state.previous), &(state.has_error));
|
||||
state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i1),
|
||||
&(state.previous), &(state.has_error));
|
||||
state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i2),
|
||||
&(state.previous), &(state.has_error));
|
||||
state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i3),
|
||||
&(state.previous), &(state.has_error));
|
||||
}
|
||||
}
|
||||
|
||||
template<> really_inline
|
||||
errorValues check_utf8_errors<architecture::arm64>(utf8_checking_state<architecture::arm64>& state) {
|
||||
template <>
|
||||
really_inline ErrorValues check_utf8_errors<Architecture::ARM64>(
|
||||
utf8_checking_state<Architecture::ARM64> &state) {
|
||||
uint64x2_t v64 = vreinterpretq_u64_s8(state.has_error);
|
||||
uint32x2_t v32 = vqmovn_u64(v64);
|
||||
uint64x1_t result = vreinterpret_u64_u32(v32);
|
||||
return vget_lane_u64(result, 0) != 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
|
||||
return vget_lane_u64(result, 0) != 0 ? simdjson::UTF8_ERROR
|
||||
: simdjson::SUCCESS;
|
||||
}
|
||||
|
||||
template<> really_inline
|
||||
uint64_t cmp_mask_against_input<architecture::arm64>(simd_input<architecture::arm64> in, uint8_t m) {
|
||||
const uint8x16_t mask = vmovq_n_u8(m);
|
||||
uint8x16_t cmp_res_0 = vceqq_u8(in.i0, mask);
|
||||
uint8x16_t cmp_res_1 = vceqq_u8(in.i1, mask);
|
||||
uint8x16_t cmp_res_2 = vceqq_u8(in.i2, mask);
|
||||
uint8x16_t cmp_res_3 = vceqq_u8(in.i3, mask);
|
||||
return neonmovemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
|
||||
template <>
|
||||
really_inline uint64_t cmp_mask_against_input<Architecture::ARM64>(
|
||||
simd_input<Architecture::ARM64> in, uint8_t m) {
|
||||
const uint8x16_t mask = vmovq_n_u8(m);
|
||||
uint8x16_t cmp_res_0 = vceqq_u8(in.i0, mask);
|
||||
uint8x16_t cmp_res_1 = vceqq_u8(in.i1, mask);
|
||||
uint8x16_t cmp_res_2 = vceqq_u8(in.i2, mask);
|
||||
uint8x16_t cmp_res_3 = vceqq_u8(in.i3, mask);
|
||||
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
|
||||
}
|
||||
|
||||
template<> really_inline
|
||||
uint64_t unsigned_lteq_against_input<architecture::arm64>(simd_input<architecture::arm64> in, uint8_t m) {
|
||||
const uint8x16_t mask = vmovq_n_u8(m);
|
||||
uint8x16_t cmp_res_0 = vcleq_u8(in.i0, mask);
|
||||
uint8x16_t cmp_res_1 = vcleq_u8(in.i1, mask);
|
||||
uint8x16_t cmp_res_2 = vcleq_u8(in.i2, mask);
|
||||
uint8x16_t cmp_res_3 = vcleq_u8(in.i3, mask);
|
||||
return neonmovemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
|
||||
template <>
|
||||
really_inline uint64_t unsigned_lteq_against_input<Architecture::ARM64>(
|
||||
simd_input<Architecture::ARM64> in, uint8_t m) {
|
||||
const uint8x16_t mask = vmovq_n_u8(m);
|
||||
uint8x16_t cmp_res_0 = vcleq_u8(in.i0, mask);
|
||||
uint8x16_t cmp_res_1 = vcleq_u8(in.i1, mask);
|
||||
uint8x16_t cmp_res_2 = vcleq_u8(in.i2, mask);
|
||||
uint8x16_t cmp_res_3 = vcleq_u8(in.i3, mask);
|
||||
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
|
||||
}
|
||||
|
||||
template<> really_inline
|
||||
uint64_t find_odd_backslash_sequences<architecture::arm64>(simd_input<architecture::arm64> in, uint64_t &prev_iter_ends_odd_backslash) {
|
||||
FIND_ODD_BACKSLASH_SEQUENCES(architecture::arm64, in, prev_iter_ends_odd_backslash);
|
||||
template <>
|
||||
really_inline uint64_t find_odd_backslash_sequences<Architecture::ARM64>(
|
||||
simd_input<Architecture::ARM64> in,
|
||||
uint64_t &prev_iter_ends_odd_backslash) {
|
||||
FIND_ODD_BACKSLASH_SEQUENCES(Architecture::ARM64, in,
|
||||
prev_iter_ends_odd_backslash);
|
||||
}
|
||||
|
||||
template<> really_inline
|
||||
uint64_t find_quote_mask_and_bits<architecture::arm64>(simd_input<architecture::arm64> in, uint64_t odd_ends,
|
||||
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits, uint64_t &error_mask) {
|
||||
FIND_QUOTE_MASK_AND_BITS(architecture::arm64, in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask)
|
||||
template <>
|
||||
really_inline uint64_t find_quote_mask_and_bits<Architecture::ARM64>(
|
||||
simd_input<Architecture::ARM64> in, uint64_t odd_ends,
|
||||
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits,
|
||||
uint64_t &error_mask) {
|
||||
FIND_QUOTE_MASK_AND_BITS(Architecture::ARM64, in, odd_ends,
|
||||
prev_iter_inside_quote, quote_bits, error_mask)
|
||||
}
|
||||
|
||||
template<> really_inline
|
||||
void find_whitespace_and_structurals<architecture::arm64>(
|
||||
simd_input<architecture::arm64> in,
|
||||
uint64_t &whitespace,
|
||||
uint64_t &structurals) {
|
||||
const uint8x16_t low_nibble_mask = (uint8x16_t){
|
||||
16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
|
||||
const uint8x16_t high_nibble_mask = (uint8x16_t){
|
||||
8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0};
|
||||
const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
|
||||
const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
|
||||
const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf);
|
||||
template <>
|
||||
really_inline void find_whitespace_and_structurals<Architecture::ARM64>(
|
||||
simd_input<Architecture::ARM64> in, uint64_t &whitespace,
|
||||
uint64_t &structurals) {
|
||||
const uint8x16_t low_nibble_mask =
|
||||
(uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
|
||||
const uint8x16_t high_nibble_mask =
|
||||
(uint8x16_t){8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0};
|
||||
const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
|
||||
const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
|
||||
const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf);
|
||||
|
||||
uint8x16_t nib_0_lo = vandq_u8(in.i0, low_nib_and_mask);
|
||||
uint8x16_t nib_0_hi = vshrq_n_u8(in.i0, 4);
|
||||
|
@ -183,15 +196,15 @@ void find_whitespace_and_structurals<architecture::arm64>(
|
|||
uint8x16_t tmp_1 = vtstq_u8(v_1, structural_shufti_mask);
|
||||
uint8x16_t tmp_2 = vtstq_u8(v_2, structural_shufti_mask);
|
||||
uint8x16_t tmp_3 = vtstq_u8(v_3, structural_shufti_mask);
|
||||
structurals = neonmovemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
|
||||
structurals = neon_movemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
|
||||
|
||||
uint8x16_t tmp_ws_0 = vtstq_u8(v_0, whitespace_shufti_mask);
|
||||
uint8x16_t tmp_ws_1 = vtstq_u8(v_1, whitespace_shufti_mask);
|
||||
uint8x16_t tmp_ws_2 = vtstq_u8(v_2, whitespace_shufti_mask);
|
||||
uint8x16_t tmp_ws_3 = vtstq_u8(v_3, whitespace_shufti_mask);
|
||||
whitespace = neonmovemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
|
||||
whitespace = neon_movemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
|
||||
}
|
||||
}// simdjson namespace
|
||||
} // namespace simdjson
|
||||
|
||||
#endif // IS_ARM64
|
||||
#endif // SIMDJSON_STAGE1_FIND_MARKS_ARM64_H
|
|
@ -10,17 +10,17 @@ namespace simdjson {
|
|||
// again our optimized version.
|
||||
really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base,
|
||||
uint32_t idx, uint64_t bits) {
|
||||
uint32_t * out_ptr = base_ptr + base;
|
||||
uint32_t *out_ptr = base_ptr + base;
|
||||
idx -= 64;
|
||||
while(bits != 0) {
|
||||
out_ptr[0] = idx + trailingzeroes(bits);
|
||||
bits = bits & (bits - 1);
|
||||
out_ptr++;
|
||||
while (bits != 0) {
|
||||
out_ptr[0] = idx + trailing_zeroes(bits);
|
||||
bits = bits & (bits - 1);
|
||||
out_ptr++;
|
||||
}
|
||||
base = (out_ptr - base_ptr);
|
||||
}
|
||||
|
||||
#else
|
||||
#else
|
||||
// flatten out values in 'bits' assuming that they are to have values of idx
|
||||
// plus their position in the bitvector, and store these indexes at
|
||||
// base_ptr[base] incrementing base as we go
|
||||
|
@ -28,65 +28,66 @@ really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base,
|
|||
// needs to be large enough to handle this
|
||||
really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base,
|
||||
uint32_t idx, uint64_t bits) {
|
||||
// In some instances, the next branch is expensive because it is mispredicted.
|
||||
// In some instances, the next branch is expensive because it is mispredicted.
|
||||
// Unfortunately, in other cases,
|
||||
// it helps tremendously.
|
||||
if(bits == 0) return;
|
||||
if (bits == 0)
|
||||
return;
|
||||
uint32_t cnt = hamming(bits);
|
||||
uint32_t next_base = base + cnt;
|
||||
idx -= 64;
|
||||
base_ptr += base;
|
||||
{
|
||||
base_ptr[0] = idx + trailingzeroes(bits);
|
||||
{
|
||||
base_ptr[0] = idx + trailing_zeroes(bits);
|
||||
bits = bits & (bits - 1);
|
||||
base_ptr[1] = idx + trailingzeroes(bits);
|
||||
base_ptr[1] = idx + trailing_zeroes(bits);
|
||||
bits = bits & (bits - 1);
|
||||
base_ptr[2] = idx + trailingzeroes(bits);
|
||||
base_ptr[2] = idx + trailing_zeroes(bits);
|
||||
bits = bits & (bits - 1);
|
||||
base_ptr[3] = idx + trailingzeroes(bits);
|
||||
base_ptr[3] = idx + trailing_zeroes(bits);
|
||||
bits = bits & (bits - 1);
|
||||
base_ptr[4] = idx + trailingzeroes(bits);
|
||||
base_ptr[4] = idx + trailing_zeroes(bits);
|
||||
bits = bits & (bits - 1);
|
||||
base_ptr[5] = idx + trailingzeroes(bits);
|
||||
base_ptr[5] = idx + trailing_zeroes(bits);
|
||||
bits = bits & (bits - 1);
|
||||
base_ptr[6] = idx + trailingzeroes(bits);
|
||||
base_ptr[6] = idx + trailing_zeroes(bits);
|
||||
bits = bits & (bits - 1);
|
||||
base_ptr[7] = idx + trailingzeroes(bits);
|
||||
base_ptr[7] = idx + trailing_zeroes(bits);
|
||||
bits = bits & (bits - 1);
|
||||
base_ptr += 8;
|
||||
}
|
||||
// We hope that the next branch is easily predicted.
|
||||
if (cnt > 8) {
|
||||
base_ptr[0] = idx + trailingzeroes(bits);
|
||||
base_ptr[0] = idx + trailing_zeroes(bits);
|
||||
bits = bits & (bits - 1);
|
||||
base_ptr[1] = idx + trailingzeroes(bits);
|
||||
base_ptr[1] = idx + trailing_zeroes(bits);
|
||||
bits = bits & (bits - 1);
|
||||
base_ptr[2] = idx + trailingzeroes(bits);
|
||||
base_ptr[2] = idx + trailing_zeroes(bits);
|
||||
bits = bits & (bits - 1);
|
||||
base_ptr[3] = idx + trailingzeroes(bits);
|
||||
base_ptr[3] = idx + trailing_zeroes(bits);
|
||||
bits = bits & (bits - 1);
|
||||
base_ptr[4] = idx + trailingzeroes(bits);
|
||||
base_ptr[4] = idx + trailing_zeroes(bits);
|
||||
bits = bits & (bits - 1);
|
||||
base_ptr[5] = idx + trailingzeroes(bits);
|
||||
base_ptr[5] = idx + trailing_zeroes(bits);
|
||||
bits = bits & (bits - 1);
|
||||
base_ptr[6] = idx + trailingzeroes(bits);
|
||||
base_ptr[6] = idx + trailing_zeroes(bits);
|
||||
bits = bits & (bits - 1);
|
||||
base_ptr[7] = idx + trailingzeroes(bits);
|
||||
base_ptr[7] = idx + trailing_zeroes(bits);
|
||||
bits = bits & (bits - 1);
|
||||
base_ptr += 8;
|
||||
}
|
||||
if (cnt > 16) { // unlucky: we rarely get here
|
||||
// since it means having one structural or pseudo-structural element
|
||||
// since it means having one structural or pseudo-structural element
|
||||
// every 4 characters (possible with inputs like "","","",...).
|
||||
do {
|
||||
base_ptr[0] = idx + trailingzeroes(bits);
|
||||
base_ptr[0] = idx + trailing_zeroes(bits);
|
||||
bits = bits & (bits - 1);
|
||||
base_ptr++;
|
||||
} while(bits != 0);
|
||||
} while (bits != 0);
|
||||
}
|
||||
base = next_base;
|
||||
}
|
||||
#endif // SIMDJSON_NAIVE_FLATTEN
|
||||
}
|
||||
} // namespace simdjson
|
||||
|
||||
#endif // SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_H
|
|
@ -1,7 +1,7 @@
|
|||
#ifndef SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_HASWELL_H
|
||||
#define SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_HASWELL_H
|
||||
|
||||
// This file provides the same function as
|
||||
// This file provides the same function as
|
||||
// stage1_find_marks_flatten.h, but uses Intel intrinsics.
|
||||
// This should provide better performance on Visual Studio
|
||||
// and other compilers that do a conservative optimization.
|
||||
|
@ -20,15 +20,16 @@ namespace haswell {
|
|||
// needs to be large enough to handle this
|
||||
really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base,
|
||||
uint32_t idx, uint64_t bits) {
|
||||
// In some instances, the next branch is expensive because it is mispredicted.
|
||||
// In some instances, the next branch is expensive because it is mispredicted.
|
||||
// Unfortunately, in other cases,
|
||||
// it helps tremendously.
|
||||
if(bits == 0) return;
|
||||
if (bits == 0)
|
||||
return;
|
||||
uint32_t cnt = _mm_popcnt_u64(bits);
|
||||
uint32_t next_base = base + cnt;
|
||||
idx -= 64;
|
||||
base_ptr += base;
|
||||
{
|
||||
{
|
||||
base_ptr[0] = idx + _tzcnt_u64(bits);
|
||||
bits = _blsr_u64(bits);
|
||||
base_ptr[1] = idx + _tzcnt_u64(bits);
|
||||
|
@ -68,19 +69,18 @@ really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base,
|
|||
base_ptr += 8;
|
||||
}
|
||||
if (cnt > 16) { // unlucky: we rarely get here
|
||||
// since it means having one structural or pseudo-structural element
|
||||
// since it means having one structural or pseudo-structural element
|
||||
// every 4 characters (possible with inputs like "","","",...).
|
||||
do {
|
||||
base_ptr[0] = idx + _tzcnt_u64(bits);
|
||||
bits = _blsr_u64(bits);
|
||||
base_ptr++;
|
||||
} while(bits != 0);
|
||||
} while (bits != 0);
|
||||
}
|
||||
base = next_base;
|
||||
}
|
||||
} // haswell
|
||||
} // simdjson
|
||||
} // namespace haswell
|
||||
} // namespace simdjson
|
||||
UNTARGET_REGION
|
||||
|
||||
|
||||
#endif // SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_HASWELL_H
|
||||
|
|
|
@ -1,31 +1,32 @@
|
|||
#ifndef SIMDJSON_STAGE1_FIND_MARKS_HASWELL_H
|
||||
#define SIMDJSON_STAGE1_FIND_MARKS_HASWELL_H
|
||||
|
||||
#include "simdjson/stage1_find_marks.h"
|
||||
#include "simdjson/stage1_find_marks_macros.h"
|
||||
#include "simdjson/stage1_find_marks_flatten_haswell.h"
|
||||
#include "simdjson/simdutf8check_haswell.h"
|
||||
#include "simdjson/stage1_find_marks.h"
|
||||
#include "simdjson/stage1_find_marks_flatten_haswell.h"
|
||||
#include "simdjson/stage1_find_marks_macros.h"
|
||||
|
||||
#ifdef IS_X86_64
|
||||
|
||||
TARGET_HASWELL
|
||||
namespace simdjson {
|
||||
template<>
|
||||
struct simd_input<architecture::haswell> {
|
||||
template <> struct simd_input<Architecture::HASWELL> {
|
||||
__m256i lo;
|
||||
__m256i hi;
|
||||
};
|
||||
|
||||
template<> really_inline
|
||||
simd_input<architecture::haswell> fill_input<architecture::haswell>(const uint8_t * ptr) {
|
||||
struct simd_input<architecture::haswell> in;
|
||||
template <>
|
||||
really_inline simd_input<Architecture::HASWELL>
|
||||
fill_input<Architecture::HASWELL>(const uint8_t *ptr) {
|
||||
struct simd_input<Architecture::HASWELL> in;
|
||||
in.lo = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0));
|
||||
in.hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
|
||||
return in;
|
||||
}
|
||||
|
||||
template<> really_inline
|
||||
uint64_t compute_quote_mask<architecture::haswell>(uint64_t quote_bits) {
|
||||
template <>
|
||||
really_inline uint64_t
|
||||
compute_quote_mask<Architecture::HASWELL>(uint64_t quote_bits) {
|
||||
// There should be no such thing as a processor supporting avx2
|
||||
// but not clmul.
|
||||
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
|
||||
|
@ -33,45 +34,50 @@ uint64_t compute_quote_mask<architecture::haswell>(uint64_t quote_bits) {
|
|||
return quote_mask;
|
||||
}
|
||||
|
||||
template<>
|
||||
struct utf8_checking_state<architecture::haswell> {
|
||||
template <> struct utf8_checking_state<Architecture::HASWELL> {
|
||||
__m256i has_error;
|
||||
avx_processed_utf_bytes previous;
|
||||
utf8_checking_state() {
|
||||
has_error = _mm256_setzero_si256();
|
||||
previous.rawbytes = _mm256_setzero_si256();
|
||||
previous.raw_bytes = _mm256_setzero_si256();
|
||||
previous.high_nibbles = _mm256_setzero_si256();
|
||||
previous.carried_continuations =_mm256_setzero_si256();
|
||||
previous.carried_continuations = _mm256_setzero_si256();
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template<> really_inline
|
||||
void check_utf8<architecture::haswell>(simd_input<architecture::haswell> in,
|
||||
utf8_checking_state<architecture::haswell>& state) {
|
||||
__m256i highbit = _mm256_set1_epi8(0x80);
|
||||
if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), highbit)) == 1) {
|
||||
template <>
|
||||
really_inline void check_utf8<Architecture::HASWELL>(
|
||||
simd_input<Architecture::HASWELL> in,
|
||||
utf8_checking_state<Architecture::HASWELL> &state) {
|
||||
__m256i high_bit = _mm256_set1_epi8(0x80);
|
||||
if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), high_bit)) == 1) {
|
||||
// it is ascii, we just check continuation
|
||||
state.has_error = _mm256_or_si256(
|
||||
_mm256_cmpgt_epi8(
|
||||
state.previous.carried_continuations,
|
||||
_mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)),
|
||||
_mm256_cmpgt_epi8(state.previous.carried_continuations,
|
||||
_mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 9, 9, 1)),
|
||||
state.has_error);
|
||||
} else {
|
||||
// it is not ascii so we have to do heavy work
|
||||
state.previous = avxcheckUTF8Bytes(in.lo, &(state.previous), &(state.has_error));
|
||||
state.previous = avxcheckUTF8Bytes(in.hi, &(state.previous), &(state.has_error));
|
||||
state.previous =
|
||||
avx_check_utf8_bytes(in.lo, &(state.previous), &(state.has_error));
|
||||
state.previous =
|
||||
avx_check_utf8_bytes(in.hi, &(state.previous), &(state.has_error));
|
||||
}
|
||||
}
|
||||
|
||||
template<> really_inline
|
||||
errorValues check_utf8_errors<architecture::haswell>(utf8_checking_state<architecture::haswell>& state) {
|
||||
return _mm256_testz_si256(state.has_error, state.has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
|
||||
template <>
|
||||
really_inline ErrorValues check_utf8_errors<Architecture::HASWELL>(
|
||||
utf8_checking_state<Architecture::HASWELL> &state) {
|
||||
return _mm256_testz_si256(state.has_error, state.has_error) == 0
|
||||
? simdjson::UTF8_ERROR
|
||||
: simdjson::SUCCESS;
|
||||
}
|
||||
|
||||
template<> really_inline
|
||||
uint64_t cmp_mask_against_input<architecture::haswell>(simd_input<architecture::haswell> in, uint8_t m) {
|
||||
template <>
|
||||
really_inline uint64_t cmp_mask_against_input<Architecture::HASWELL>(
|
||||
simd_input<Architecture::HASWELL> in, uint8_t m) {
|
||||
const __m256i mask = _mm256_set1_epi8(m);
|
||||
__m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask);
|
||||
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
|
||||
|
@ -80,31 +86,38 @@ uint64_t cmp_mask_against_input<architecture::haswell>(simd_input<architecture::
|
|||
return res_0 | (res_1 << 32);
|
||||
}
|
||||
|
||||
template<> really_inline
|
||||
uint64_t unsigned_lteq_against_input<architecture::haswell>(simd_input<architecture::haswell> in, uint8_t m) {
|
||||
template <>
|
||||
really_inline uint64_t unsigned_lteq_against_input<Architecture::HASWELL>(
|
||||
simd_input<Architecture::HASWELL> in, uint8_t m) {
|
||||
const __m256i maxval = _mm256_set1_epi8(m);
|
||||
__m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval,in.lo),maxval);
|
||||
__m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.lo), maxval);
|
||||
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
|
||||
__m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval,in.hi),maxval);
|
||||
__m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.hi), maxval);
|
||||
uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
|
||||
return res_0 | (res_1 << 32);
|
||||
}
|
||||
|
||||
template<> really_inline
|
||||
uint64_t find_odd_backslash_sequences<architecture::haswell>(simd_input<architecture::haswell> in, uint64_t &prev_iter_ends_odd_backslash) {
|
||||
FIND_ODD_BACKSLASH_SEQUENCES(architecture::haswell, in, prev_iter_ends_odd_backslash);
|
||||
template <>
|
||||
really_inline uint64_t find_odd_backslash_sequences<Architecture::HASWELL>(
|
||||
simd_input<Architecture::HASWELL> in,
|
||||
uint64_t &prev_iter_ends_odd_backslash) {
|
||||
FIND_ODD_BACKSLASH_SEQUENCES(Architecture::HASWELL, in,
|
||||
prev_iter_ends_odd_backslash);
|
||||
}
|
||||
|
||||
template<> really_inline
|
||||
uint64_t find_quote_mask_and_bits<architecture::haswell>(simd_input<architecture::haswell> in, uint64_t odd_ends,
|
||||
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits, uint64_t &error_mask) {
|
||||
FIND_QUOTE_MASK_AND_BITS(architecture::haswell, in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask)
|
||||
template <>
|
||||
really_inline uint64_t find_quote_mask_and_bits<Architecture::HASWELL>(
|
||||
simd_input<Architecture::HASWELL> in, uint64_t odd_ends,
|
||||
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits,
|
||||
uint64_t &error_mask) {
|
||||
FIND_QUOTE_MASK_AND_BITS(Architecture::HASWELL, in, odd_ends,
|
||||
prev_iter_inside_quote, quote_bits, error_mask)
|
||||
}
|
||||
|
||||
template<> really_inline
|
||||
void find_whitespace_and_structurals<architecture::haswell>(simd_input<architecture::haswell> in,
|
||||
uint64_t &whitespace,
|
||||
uint64_t &structurals) {
|
||||
template <>
|
||||
really_inline void find_whitespace_and_structurals<Architecture::HASWELL>(
|
||||
simd_input<Architecture::HASWELL> in, uint64_t &whitespace,
|
||||
uint64_t &structurals) {
|
||||
#ifdef SIMDJSON_NAIVE_STRUCTURAL
|
||||
// You should never need this naive approach, but it can be useful
|
||||
// for research purposes
|
||||
|
@ -112,21 +125,28 @@ void find_whitespace_and_structurals<architecture::haswell>(simd_input<architect
|
|||
__m256i struct_lo = _mm256_cmpeq_epi8(in.lo, mask_open_brace);
|
||||
__m256i struct_hi = _mm256_cmpeq_epi8(in.hi, mask_open_brace);
|
||||
const __m256i mask_close_brace = _mm256_set1_epi8(0x7d);
|
||||
struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_close_brace));
|
||||
struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_close_brace));
|
||||
struct_lo =
|
||||
_mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_close_brace));
|
||||
struct_hi =
|
||||
_mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_close_brace));
|
||||
const __m256i mask_open_bracket = _mm256_set1_epi8(0x5b);
|
||||
struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_open_bracket));
|
||||
struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_open_bracket));
|
||||
struct_lo =
|
||||
_mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_open_bracket));
|
||||
struct_hi =
|
||||
_mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_open_bracket));
|
||||
const __m256i mask_close_bracket = _mm256_set1_epi8(0x5d);
|
||||
struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_close_bracket));
|
||||
struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_close_bracket));
|
||||
struct_lo =
|
||||
_mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_close_bracket));
|
||||
struct_hi =
|
||||
_mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_close_bracket));
|
||||
const __m256i mask_column = _mm256_set1_epi8(0x3a);
|
||||
struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_column));
|
||||
struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_column));
|
||||
struct_lo = _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_column));
|
||||
struct_hi = _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_column));
|
||||
const __m256i mask_comma = _mm256_set1_epi8(0x2c);
|
||||
struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_comma));
|
||||
struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_comma));
|
||||
uint64_t structural_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(struct_lo));
|
||||
struct_lo = _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_comma));
|
||||
struct_hi = _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_comma));
|
||||
uint64_t structural_res_0 =
|
||||
static_cast<uint32_t>(_mm256_movemask_epi8(struct_lo));
|
||||
uint64_t structural_res_1 = _mm256_movemask_epi8(struct_hi);
|
||||
structurals = (structural_res_0 | (structural_res_1 << 32));
|
||||
|
||||
|
@ -134,34 +154,34 @@ void find_whitespace_and_structurals<architecture::haswell>(simd_input<architect
|
|||
__m256i space_lo = _mm256_cmpeq_epi8(in.lo, mask_space);
|
||||
__m256i space_hi = _mm256_cmpeq_epi8(in.hi, mask_space);
|
||||
const __m256i mask_linefeed = _mm256_set1_epi8(0x0a);
|
||||
space_lo = _mm256_or_si256(space_lo,_mm256_cmpeq_epi8(in.lo, mask_linefeed));
|
||||
space_hi = _mm256_or_si256(space_hi,_mm256_cmpeq_epi8(in.hi, mask_linefeed));
|
||||
space_lo = _mm256_or_si256(space_lo, _mm256_cmpeq_epi8(in.lo, mask_linefeed));
|
||||
space_hi = _mm256_or_si256(space_hi, _mm256_cmpeq_epi8(in.hi, mask_linefeed));
|
||||
const __m256i mask_tab = _mm256_set1_epi8(0x09);
|
||||
space_lo = _mm256_or_si256(space_lo,_mm256_cmpeq_epi8(in.lo, mask_tab));
|
||||
space_hi = _mm256_or_si256(space_hi,_mm256_cmpeq_epi8(in.hi, mask_tab));
|
||||
space_lo = _mm256_or_si256(space_lo, _mm256_cmpeq_epi8(in.lo, mask_tab));
|
||||
space_hi = _mm256_or_si256(space_hi, _mm256_cmpeq_epi8(in.hi, mask_tab));
|
||||
const __m256i mask_carriage = _mm256_set1_epi8(0x0d);
|
||||
space_lo = _mm256_or_si256(space_lo,_mm256_cmpeq_epi8(in.lo, mask_carriage));
|
||||
space_hi = _mm256_or_si256(space_hi,_mm256_cmpeq_epi8(in.hi, mask_carriage));
|
||||
space_lo = _mm256_or_si256(space_lo, _mm256_cmpeq_epi8(in.lo, mask_carriage));
|
||||
space_hi = _mm256_or_si256(space_hi, _mm256_cmpeq_epi8(in.hi, mask_carriage));
|
||||
|
||||
uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(space_lo));
|
||||
uint64_t ws_res_1 = _mm256_movemask_epi8(space_hi);
|
||||
whitespace = (ws_res_0 | (ws_res_1 << 32));
|
||||
// end of naive approach
|
||||
|
||||
#else // SIMDJSON_NAIVE_STRUCTURAL
|
||||
const __m256i structural_table = _mm256_setr_epi8(
|
||||
44, 125, 0, 0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123,
|
||||
44, 125, 0, 0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
|
||||
#else // SIMDJSON_NAIVE_STRUCTURAL
|
||||
const __m256i structural_table =
|
||||
_mm256_setr_epi8(44, 125, 0, 0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123,
|
||||
44, 125, 0, 0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
|
||||
const __m256i white_table = _mm256_setr_epi8(
|
||||
32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100,
|
||||
32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100);
|
||||
32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100,
|
||||
32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100);
|
||||
const __m256i struct_offset = _mm256_set1_epi8(0xd4);
|
||||
const __m256i struct_mask = _mm256_set1_epi8(32);
|
||||
|
||||
__m256i lo_white = _mm256_cmpeq_epi8(in.lo,
|
||||
_mm256_shuffle_epi8(white_table, in.lo));
|
||||
__m256i hi_white = _mm256_cmpeq_epi8(in.hi,
|
||||
_mm256_shuffle_epi8(white_table, in.hi));
|
||||
__m256i lo_white =
|
||||
_mm256_cmpeq_epi8(in.lo, _mm256_shuffle_epi8(white_table, in.lo));
|
||||
__m256i hi_white =
|
||||
_mm256_cmpeq_epi8(in.hi, _mm256_shuffle_epi8(white_table, in.hi));
|
||||
uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(lo_white));
|
||||
uint64_t ws_res_1 = _mm256_movemask_epi8(hi_white);
|
||||
whitespace = (ws_res_0 | (ws_res_1 << 32));
|
||||
|
@ -173,7 +193,7 @@ void find_whitespace_and_structurals<architecture::haswell>(simd_input<architect
|
|||
__m256i hi_struct_r3 = _mm256_shuffle_epi8(structural_table, hi_struct_r1);
|
||||
__m256i lo_struct = _mm256_cmpeq_epi8(lo_struct_r2, lo_struct_r3);
|
||||
__m256i hi_struct = _mm256_cmpeq_epi8(hi_struct_r2, hi_struct_r3);
|
||||
|
||||
|
||||
uint64_t structural_res_0 =
|
||||
static_cast<uint32_t>(_mm256_movemask_epi8(lo_struct));
|
||||
uint64_t structural_res_1 = _mm256_movemask_epi8(hi_struct);
|
||||
|
@ -184,6 +204,5 @@ void find_whitespace_and_structurals<architecture::haswell>(simd_input<architect
|
|||
} // namespace simdjson
|
||||
UNTARGET_REGION
|
||||
|
||||
|
||||
#endif // IS_X86_64
|
||||
#endif // SIMDJSON_STAGE1_FIND_MARKS_HASWELL_H
|
|
@ -10,41 +10,44 @@
|
|||
// indicate whether we end an iteration on an odd-length sequence of
|
||||
// backslashes, which modifies our subsequent search for odd-length
|
||||
// sequences of backslashes in an obvious way.
|
||||
// We need to compile that code for multiple architectures. However, target attributes can be used
|
||||
// only once by function definition. Huge macro seemed better than huge code duplication.
|
||||
// uint64_t FIND_ODD_BACKSLASH_SEQUENCES(architecture T, simd_input<T> in, uint64_t &prev_iter_ends_odd_backslash)
|
||||
#define FIND_ODD_BACKSLASH_SEQUENCES(T, in, prev_iter_ends_odd_backslash) { \
|
||||
const uint64_t even_bits = 0x5555555555555555ULL; \
|
||||
const uint64_t odd_bits = ~even_bits; \
|
||||
uint64_t bs_bits = cmp_mask_against_input<T>(in, '\\'); \
|
||||
uint64_t start_edges = bs_bits & ~(bs_bits << 1); \
|
||||
/* flip lowest if we have an odd-length run at the end of the prior */ \
|
||||
/* iteration */ \
|
||||
uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash; \
|
||||
uint64_t even_starts = start_edges & even_start_mask; \
|
||||
uint64_t odd_starts = start_edges & ~even_start_mask; \
|
||||
uint64_t even_carries = bs_bits + even_starts; \
|
||||
\
|
||||
uint64_t odd_carries; \
|
||||
/* must record the carry-out of our odd-carries out of bit 63; this */ \
|
||||
/* indicates whether the sense of any edge going to the next iteration */ \
|
||||
/* should be flipped */ \
|
||||
bool iter_ends_odd_backslash = \
|
||||
add_overflow(bs_bits, odd_starts, &odd_carries); \
|
||||
\
|
||||
odd_carries |= \
|
||||
prev_iter_ends_odd_backslash; /* push in bit zero as a potential end */ \
|
||||
/* if we had an odd-numbered run at the */ \
|
||||
/* end of the previous iteration */ \
|
||||
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; \
|
||||
uint64_t even_carry_ends = even_carries & ~bs_bits; \
|
||||
uint64_t odd_carry_ends = odd_carries & ~bs_bits; \
|
||||
uint64_t even_start_odd_end = even_carry_ends & odd_bits; \
|
||||
uint64_t odd_start_even_end = odd_carry_ends & even_bits; \
|
||||
uint64_t odd_ends = even_start_odd_end | odd_start_even_end; \
|
||||
return odd_ends; \
|
||||
}
|
||||
|
||||
// We need to compile that code for multiple architectures. However, target
|
||||
// attributes can be used only once by function definition. Huge macro seemed
|
||||
// better than huge code duplication. uint64_t
|
||||
// FIND_ODD_BACKSLASH_SEQUENCES(Architecture T, simd_input<T> in, uint64_t
|
||||
// &prev_iter_ends_odd_backslash)
|
||||
#define FIND_ODD_BACKSLASH_SEQUENCES(T, in, prev_iter_ends_odd_backslash) \
|
||||
{ \
|
||||
const uint64_t even_bits = 0x5555555555555555ULL; \
|
||||
const uint64_t odd_bits = ~even_bits; \
|
||||
uint64_t bs_bits = cmp_mask_against_input<T>(in, '\\'); \
|
||||
uint64_t start_edges = bs_bits & ~(bs_bits << 1); \
|
||||
/* flip lowest if we have an odd-length run at the end of the prior \
|
||||
* iteration */ \
|
||||
uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash; \
|
||||
uint64_t even_starts = start_edges & even_start_mask; \
|
||||
uint64_t odd_starts = start_edges & ~even_start_mask; \
|
||||
uint64_t even_carries = bs_bits + even_starts; \
|
||||
\
|
||||
uint64_t odd_carries; \
|
||||
/* must record the carry-out of our odd-carries out of bit 63; this \
|
||||
* indicates whether the sense of any edge going to the next iteration \
|
||||
* should be flipped */ \
|
||||
bool iter_ends_odd_backslash = \
|
||||
add_overflow(bs_bits, odd_starts, &odd_carries); \
|
||||
\
|
||||
odd_carries |= prev_iter_ends_odd_backslash; /* push in bit zero as a \
|
||||
* potential end if we had an \
|
||||
* odd-numbered run at the \
|
||||
* end of the previous \
|
||||
* iteration */ \
|
||||
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; \
|
||||
uint64_t even_carry_ends = even_carries & ~bs_bits; \
|
||||
uint64_t odd_carry_ends = odd_carries & ~bs_bits; \
|
||||
uint64_t even_start_odd_end = even_carry_ends & odd_bits; \
|
||||
uint64_t odd_start_even_end = odd_carry_ends & even_bits; \
|
||||
uint64_t odd_ends = even_start_odd_end | odd_start_even_end; \
|
||||
return odd_ends; \
|
||||
}
|
||||
|
||||
// return both the quote mask (which is a half-open mask that covers the first
|
||||
// quote
|
||||
|
@ -58,34 +61,39 @@
|
|||
// Note that we don't do any error checking to see if we have backslash
|
||||
// sequences outside quotes; these
|
||||
// backslash sequences (of any length) will be detected elsewhere.
|
||||
// We need to compile that code for multiple architectures. However, target attributes can be used
|
||||
// only once by function definition. Huge macro seemed better than huge code duplication.
|
||||
// uint64_t FIND_QUOTE_MASK_AND_BITS(architecture T, simd_input<T> in, uint64_t odd_ends,
|
||||
// uint64_t &prev_iter_inside_quote, uint64_t &quote_bits, uint64_t &error_mask)
|
||||
#define FIND_QUOTE_MASK_AND_BITS(T, in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask) { \
|
||||
quote_bits = cmp_mask_against_input<T>(in, '"'); \
|
||||
quote_bits = quote_bits & ~odd_ends; \
|
||||
uint64_t quote_mask = compute_quote_mask<T>(quote_bits); \
|
||||
quote_mask ^= prev_iter_inside_quote; \
|
||||
/* All Unicode characters may be placed within the */ \
|
||||
/* quotation marks, except for the characters that MUST be escaped: */ \
|
||||
/* quotation mark, reverse solidus, and the control characters (U+0000 */ \
|
||||
/*through U+001F). */ \
|
||||
/* https://tools.ietf.org/html/rfc8259 */ \
|
||||
uint64_t unescaped = unsigned_lteq_against_input<T>(in, 0x1F); \
|
||||
error_mask |= quote_mask & unescaped; \
|
||||
/* right shift of a signed value expected to be well-defined and standard */ \
|
||||
/* compliant as of C++20, */ \
|
||||
/* John Regehr from Utah U. says this is fine code */ \
|
||||
prev_iter_inside_quote = \
|
||||
static_cast<uint64_t>(static_cast<int64_t>(quote_mask) >> 63); \
|
||||
return quote_mask; \
|
||||
} \
|
||||
// We need to compile that code for multiple architectures. However, target
|
||||
// attributes can be used only once by function definition. Huge macro seemed
|
||||
// better than huge code duplication. uint64_t
|
||||
// FIND_QUOTE_MASK_AND_BITS(Architecture T, simd_input<T> in, uint64_t odd_ends,
|
||||
// uint64_t &prev_iter_inside_quote, uint64_t &quote_bits, uint64_t
|
||||
// &error_mask)
|
||||
#define FIND_QUOTE_MASK_AND_BITS(T, in, odd_ends, prev_iter_inside_quote, \
|
||||
quote_bits, error_mask) \
|
||||
{ \
|
||||
quote_bits = cmp_mask_against_input<T>(in, '"'); \
|
||||
quote_bits = quote_bits & ~odd_ends; \
|
||||
uint64_t quote_mask = compute_quote_mask<T>(quote_bits); \
|
||||
quote_mask ^= prev_iter_inside_quote; \
|
||||
/* All Unicode characters may be placed within the \
|
||||
* quotation marks, except for the characters that MUST be escaped: \
|
||||
* quotation mark, reverse solidus, and the control characters (U+0000 \
|
||||
* through U+001F). \
|
||||
* https://tools.ietf.org/html/rfc8259 */ \
|
||||
uint64_t unescaped = unsigned_lteq_against_input<T>(in, 0x1F); \
|
||||
error_mask |= quote_mask & unescaped; \
|
||||
/* right shift of a signed value expected to be well-defined and standard \
|
||||
* compliant as of C++20, \
|
||||
* John Regehr from Utah U. says this is fine code */ \
|
||||
prev_iter_inside_quote = \
|
||||
static_cast<uint64_t>(static_cast<int64_t>(quote_mask) >> 63); \
|
||||
return quote_mask; \
|
||||
}
|
||||
|
||||
// Find structural bits in a 64-byte chunk.
|
||||
// We need to compile that code for multiple architectures. However, target attributes can be used
|
||||
// only once by function definition. Huge macro seemed better than huge code duplication.
|
||||
// void FIND_STRUCTURAL_BITS_64(architecture T,
|
||||
// We need to compile that code for multiple architectures. However, target
|
||||
// attributes can be used only once by function definition. Huge macro seemed
|
||||
// better than huge code duplication. void FIND_STRUCTURAL_BITS_64(
|
||||
// Architecture T,
|
||||
// const uint8_t *buf,
|
||||
// size_t idx,
|
||||
// uint32_t *base_ptr,
|
||||
|
@ -95,131 +103,137 @@
|
|||
// uint64_t &prev_iter_ends_pseudo_pred,
|
||||
// uint64_t &structurals,
|
||||
// uint64_t &error_mask,
|
||||
// utf8_checking_state<T> &utf8_state, flatten function)
|
||||
#define FIND_STRUCTURAL_BITS_64(T, \
|
||||
buf, \
|
||||
idx, \
|
||||
base_ptr, \
|
||||
base, \
|
||||
prev_iter_ends_odd_backslash, \
|
||||
prev_iter_inside_quote, \
|
||||
prev_iter_ends_pseudo_pred, \
|
||||
structurals, \
|
||||
error_mask, \
|
||||
utf8_state, \
|
||||
flat \
|
||||
) { \
|
||||
simd_input<T> in = fill_input<T>(buf); \
|
||||
check_utf8<T>(in, utf8_state); \
|
||||
/* detect odd sequences of backslashes */ \
|
||||
uint64_t odd_ends = find_odd_backslash_sequences<T>(in, prev_iter_ends_odd_backslash); \
|
||||
\
|
||||
/* detect insides of quote pairs ("quote_mask") and also our quote_bits */ \
|
||||
/* themselves */ \
|
||||
uint64_t quote_bits; \
|
||||
uint64_t quote_mask = find_quote_mask_and_bits<T>( \
|
||||
in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask); \
|
||||
\
|
||||
/* take the previous iterations structural bits, not our current iteration, */ \
|
||||
/* and flatten */ \
|
||||
flat(base_ptr, base, idx, structurals); \
|
||||
\
|
||||
uint64_t whitespace; \
|
||||
find_whitespace_and_structurals<T>(in, whitespace, structurals); \
|
||||
\
|
||||
/* fixup structurals to reflect quotes and add pseudo-structural characters */ \
|
||||
structurals = finalize_structurals(structurals, whitespace, quote_mask, \
|
||||
quote_bits, prev_iter_ends_pseudo_pred); \
|
||||
} \
|
||||
|
||||
|
||||
// We need to compile that code for multiple architectures. However, target attributes can be used
|
||||
// only once by function definition. Huge macro seemed better than huge code duplication.
|
||||
// errorValues FIND_STRUCTURAL_BITS(architecture T, const uint8_t *buf, size_t len, ParsedJson &pj, flatten function)
|
||||
#define FIND_STRUCTURAL_BITS(T, buf, len, pj, flat) { \
|
||||
if (len > pj.bytecapacity) { \
|
||||
std::cerr << "Your ParsedJson object only supports documents up to " \
|
||||
<< pj.bytecapacity << " bytes but you are trying to process " << len \
|
||||
<< " bytes" << std::endl; \
|
||||
return simdjson::CAPACITY; \
|
||||
} \
|
||||
uint32_t *base_ptr = pj.structural_indexes; \
|
||||
uint32_t base = 0; \
|
||||
utf8_checking_state<T> utf8_state; \
|
||||
\
|
||||
/* we have padded the input out to 64 byte multiple with the remainder being */ \
|
||||
/* zeros */ \
|
||||
\
|
||||
/* persistent state across loop */ \
|
||||
/* does the last iteration end with an odd-length sequence of backslashes? */ \
|
||||
/* either 0 or 1, but a 64-bit value */ \
|
||||
uint64_t prev_iter_ends_odd_backslash = 0ULL; \
|
||||
/* does the previous iteration end inside a double-quote pair? */ \
|
||||
uint64_t prev_iter_inside_quote = 0ULL; /* either all zeros or all ones */ \
|
||||
/* does the previous iteration end on something that is a predecessor of a */ \
|
||||
/* pseudo-structural character - i.e. whitespace or a structural character */ \
|
||||
/* effectively the very first char is considered to follow "whitespace" for */ \
|
||||
/* the */ \
|
||||
/* purposes of pseudo-structural character detection so we initialize to 1 */ \
|
||||
uint64_t prev_iter_ends_pseudo_pred = 1ULL; \
|
||||
\
|
||||
/* structurals are persistent state across loop as we flatten them on the */ \
|
||||
/* subsequent iteration into our array pointed to by base_ptr. */ \
|
||||
/* This is harmless on the first iteration as structurals==0 */ \
|
||||
/* and is done for performance reasons; we can hide some of the latency of the */ \
|
||||
/* expensive carryless multiply in the previous step with this work */ \
|
||||
uint64_t structurals = 0; \
|
||||
\
|
||||
size_t lenminus64 = len < 64 ? 0 : len - 64; \
|
||||
size_t idx = 0; \
|
||||
uint64_t error_mask = 0; /* for unescaped characters within strings (ASCII code points < 0x20) */ \
|
||||
\
|
||||
for (; idx < lenminus64; idx += 64) { \
|
||||
FIND_STRUCTURAL_BITS_64(T, &buf[idx], idx, base_ptr, base, prev_iter_ends_odd_backslash, \
|
||||
prev_iter_inside_quote, prev_iter_ends_pseudo_pred, structurals, \
|
||||
error_mask, utf8_state, flat); \
|
||||
} \
|
||||
/* If we have a final chunk of less than 64 bytes, pad it to 64 with spaces */ \
|
||||
/* before processing it (otherwise, we risk invalidating the UTF-8 checks). */ \
|
||||
if (idx < len) { \
|
||||
uint8_t tmpbuf[64]; \
|
||||
memset(tmpbuf, 0x20, 64); \
|
||||
memcpy(tmpbuf, buf + idx, len - idx); \
|
||||
FIND_STRUCTURAL_BITS_64(T, &tmpbuf[0], idx, base_ptr, base, prev_iter_ends_odd_backslash, \
|
||||
prev_iter_inside_quote, prev_iter_ends_pseudo_pred, structurals, \
|
||||
error_mask, utf8_state, flat); \
|
||||
idx += 64; \
|
||||
} \
|
||||
\
|
||||
/* is last string quote closed? */ \
|
||||
if (prev_iter_inside_quote) { \
|
||||
return simdjson::UNCLOSED_STRING; \
|
||||
} \
|
||||
\
|
||||
/* finally, flatten out the remaining structurals from the last iteration */ \
|
||||
flat(base_ptr, base, idx, structurals); \
|
||||
\
|
||||
pj.n_structural_indexes = base; \
|
||||
/* a valid JSON file cannot have zero structural indexes - we should have */ \
|
||||
/* found something */ \
|
||||
if (pj.n_structural_indexes == 0u) { \
|
||||
return simdjson::EMPTY; \
|
||||
} \
|
||||
if (base_ptr[pj.n_structural_indexes - 1] > len) { \
|
||||
return simdjson::UNEXPECTED_ERROR; \
|
||||
} \
|
||||
if (len != base_ptr[pj.n_structural_indexes - 1]) { \
|
||||
/* the string might not be NULL terminated, but we add a virtual NULL ending */ \
|
||||
/* character. */ \
|
||||
base_ptr[pj.n_structural_indexes++] = len; \
|
||||
} \
|
||||
/* make it safe to dereference one beyond this array */ \
|
||||
base_ptr[pj.n_structural_indexes] = 0; \
|
||||
if (error_mask) { \
|
||||
return simdjson::UNESCAPED_CHARS; \
|
||||
} \
|
||||
return check_utf8_errors<T>(utf8_state); \
|
||||
}
|
||||
// utf8_checking_state<T> &utf8_state, flatten
|
||||
// function)
|
||||
#define FIND_STRUCTURAL_BITS_64( \
|
||||
T, buf, idx, base_ptr, base, prev_iter_ends_odd_backslash, \
|
||||
prev_iter_inside_quote, prev_iter_ends_pseudo_pred, structurals, \
|
||||
error_mask, utf8_state, flat) \
|
||||
{ \
|
||||
simd_input<T> in = fill_input<T>(buf); \
|
||||
check_utf8<T>(in, utf8_state); \
|
||||
/* detect odd sequences of backslashes */ \
|
||||
uint64_t odd_ends = \
|
||||
find_odd_backslash_sequences<T>(in, prev_iter_ends_odd_backslash); \
|
||||
\
|
||||
/* detect insides of quote pairs ("quote_mask") and also our quote_bits \
|
||||
* themselves */ \
|
||||
uint64_t quote_bits; \
|
||||
uint64_t quote_mask = find_quote_mask_and_bits<T>( \
|
||||
in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask); \
|
||||
\
|
||||
/* take the previous iterations structural bits, not our current \
|
||||
* iteration, \
|
||||
* and flatten */ \
|
||||
flat(base_ptr, base, idx, structurals); \
|
||||
\
|
||||
uint64_t whitespace; \
|
||||
find_whitespace_and_structurals<T>(in, whitespace, structurals); \
|
||||
\
|
||||
/* fixup structurals to reflect quotes and add pseudo-structural \
|
||||
* characters */ \
|
||||
structurals = \
|
||||
finalize_structurals(structurals, whitespace, quote_mask, quote_bits, \
|
||||
prev_iter_ends_pseudo_pred); \
|
||||
}
|
||||
|
||||
// We need to compile that code for multiple architectures. However, target
|
||||
// attributes can be used only once by function definition. Huge macro seemed
|
||||
// better than huge code duplication. ErrorValues
|
||||
// FIND_STRUCTURAL_BITS(Architecture T, const uint8_t *buf, size_t len,
|
||||
// ParsedJson &pj, flatten function)
|
||||
#define FIND_STRUCTURAL_BITS(T, buf, len, pj, flat) \
|
||||
{ \
|
||||
if (len > pj.byte_capacity) { \
|
||||
std::cerr << "Your ParsedJson object only supports documents up to " \
|
||||
<< pj.byte_capacity << " bytes but you are trying to process " \
|
||||
<< len << " bytes" << std::endl; \
|
||||
return simdjson::CAPACITY; \
|
||||
} \
|
||||
uint32_t *base_ptr = pj.structural_indexes; \
|
||||
uint32_t base = 0; \
|
||||
utf8_checking_state<T> utf8_state; \
|
||||
\
|
||||
/* we have padded the input out to 64 byte multiple with the remainder \
|
||||
* being zeros. Persistent state across loop: does the last iteration end \
|
||||
* with an odd-length sequence of backslashes? */ \
|
||||
\
|
||||
/* either 0 or 1, but a 64-bit value */ \
|
||||
uint64_t prev_iter_ends_odd_backslash = 0ULL; \
|
||||
/* does the previous iteration end inside a double-quote pair? */ \
|
||||
uint64_t prev_iter_inside_quote = \
|
||||
0ULL; /* either all zeros or all ones. \
|
||||
* Does the previous iteration end on something that is a \
|
||||
* predecessor of a pseudo-structural character - i.e. \
|
||||
* whitespace or a structural character effectively the very \
|
||||
* first char is considered to follow "whitespace" for the \
|
||||
* purposes of pseudo-structural character detection so we \
|
||||
* initialize to 1 */ \
|
||||
uint64_t prev_iter_ends_pseudo_pred = 1ULL; \
|
||||
\
|
||||
/* structurals are persistent state across loop as we flatten them on the \
|
||||
* subsequent iteration into our array pointed to by base_ptr. \
|
||||
* This is harmless on the first iteration as structurals==0 \
|
||||
* and is done for performance reasons; we can hide some of the latency of \
|
||||
* the \
|
||||
* expensive carryless multiply in the previous step with this work */ \
|
||||
uint64_t structurals = 0; \
|
||||
\
|
||||
size_t lenminus64 = len < 64 ? 0 : len - 64; \
|
||||
size_t idx = 0; \
|
||||
uint64_t error_mask = 0; /* for unescaped characters within strings (ASCII \
|
||||
code points < 0x20) */ \
|
||||
\
|
||||
for (; idx < lenminus64; idx += 64) { \
|
||||
FIND_STRUCTURAL_BITS_64( \
|
||||
T, &buf[idx], idx, base_ptr, base, prev_iter_ends_odd_backslash, \
|
||||
prev_iter_inside_quote, prev_iter_ends_pseudo_pred, structurals, \
|
||||
error_mask, utf8_state, flat); \
|
||||
} \
|
||||
/* If we have a final chunk of less than 64 bytes, pad it to 64 with \
|
||||
* spaces before processing it (otherwise, we risk invalidating the UTF-8 \
|
||||
* checks). */ \
|
||||
if (idx < len) { \
|
||||
uint8_t tmp_buf[64]; \
|
||||
memset(tmp_buf, 0x20, 64); \
|
||||
memcpy(tmp_buf, buf + idx, len - idx); \
|
||||
FIND_STRUCTURAL_BITS_64( \
|
||||
T, &tmp_buf[0], idx, base_ptr, base, prev_iter_ends_odd_backslash, \
|
||||
prev_iter_inside_quote, prev_iter_ends_pseudo_pred, structurals, \
|
||||
error_mask, utf8_state, flat); \
|
||||
idx += 64; \
|
||||
} \
|
||||
\
|
||||
/* is last string quote closed? */ \
|
||||
if (prev_iter_inside_quote) { \
|
||||
return simdjson::UNCLOSED_STRING; \
|
||||
} \
|
||||
\
|
||||
/* finally, flatten out the remaining structurals from the last iteration \
|
||||
*/ \
|
||||
flat(base_ptr, base, idx, structurals); \
|
||||
\
|
||||
pj.n_structural_indexes = base; \
|
||||
/* a valid JSON file cannot have zero structural indexes - we should have \
|
||||
* found something */ \
|
||||
if (pj.n_structural_indexes == 0u) { \
|
||||
return simdjson::EMPTY; \
|
||||
} \
|
||||
if (base_ptr[pj.n_structural_indexes - 1] > len) { \
|
||||
return simdjson::UNEXPECTED_ERROR; \
|
||||
} \
|
||||
if (len != base_ptr[pj.n_structural_indexes - 1]) { \
|
||||
/* the string might not be NULL terminated, but we add a virtual NULL \
|
||||
* ending \
|
||||
* character. */ \
|
||||
base_ptr[pj.n_structural_indexes++] = len; \
|
||||
} \
|
||||
/* make it safe to dereference one beyond this array */ \
|
||||
base_ptr[pj.n_structural_indexes] = 0; \
|
||||
if (error_mask) { \
|
||||
return simdjson::UNESCAPED_CHARS; \
|
||||
} \
|
||||
return check_utf8_errors<T>(utf8_state); \
|
||||
}
|
||||
|
||||
#endif // SIMDJSON_STAGE1_FIND_MARKS_MACROS_H
|
|
@ -1,26 +1,26 @@
|
|||
#ifndef SIMDJSON_STAGE1_FIND_MARKS_WESTMERE_H
|
||||
#define SIMDJSON_STAGE1_FIND_MARKS_WESTMERE_H
|
||||
|
||||
#include "simdjson/stage1_find_marks.h"
|
||||
#include "simdjson/stage1_find_marks_macros.h"
|
||||
#include "simdjson/stage1_find_marks_flatten.h"
|
||||
#include "simdjson/simdutf8check_westmere.h"
|
||||
#include "simdjson/stage1_find_marks.h"
|
||||
#include "simdjson/stage1_find_marks_flatten.h"
|
||||
#include "simdjson/stage1_find_marks_macros.h"
|
||||
|
||||
#ifdef IS_X86_64
|
||||
|
||||
TARGET_WESTMERE
|
||||
namespace simdjson {
|
||||
template<>
|
||||
struct simd_input<architecture::westmere> {
|
||||
template <> struct simd_input<Architecture::WESTMERE> {
|
||||
__m128i v0;
|
||||
__m128i v1;
|
||||
__m128i v2;
|
||||
__m128i v3;
|
||||
};
|
||||
|
||||
template<> really_inline
|
||||
simd_input<architecture::westmere> fill_input<architecture::westmere>(const uint8_t * ptr) {
|
||||
struct simd_input<architecture::westmere> in;
|
||||
template <>
|
||||
really_inline simd_input<Architecture::WESTMERE>
|
||||
fill_input<Architecture::WESTMERE>(const uint8_t *ptr) {
|
||||
struct simd_input<Architecture::WESTMERE> in;
|
||||
in.v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 0));
|
||||
in.v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16));
|
||||
in.v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 32));
|
||||
|
@ -28,61 +28,69 @@ simd_input<architecture::westmere> fill_input<architecture::westmere>(const uint
|
|||
return in;
|
||||
}
|
||||
|
||||
template<> really_inline
|
||||
uint64_t compute_quote_mask<architecture::westmere>(uint64_t quote_bits) {
|
||||
template <>
|
||||
really_inline uint64_t
|
||||
compute_quote_mask<Architecture::WESTMERE>(uint64_t quote_bits) {
|
||||
return _mm_cvtsi128_si64(_mm_clmulepi64_si128(
|
||||
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
|
||||
}
|
||||
|
||||
template<>
|
||||
struct utf8_checking_state<architecture::westmere>
|
||||
{
|
||||
template <> struct utf8_checking_state<Architecture::WESTMERE> {
|
||||
__m128i has_error = _mm_setzero_si128();
|
||||
processed_utf_bytes previous {
|
||||
_mm_setzero_si128(), // rawbytes
|
||||
_mm_setzero_si128(), // high_nibbles
|
||||
_mm_setzero_si128() // carried_continuations
|
||||
processed_utf_bytes previous{
|
||||
_mm_setzero_si128(), // raw_bytes
|
||||
_mm_setzero_si128(), // high_nibbles
|
||||
_mm_setzero_si128() // carried_continuations
|
||||
};
|
||||
};
|
||||
|
||||
template<> really_inline
|
||||
void check_utf8<architecture::westmere>(simd_input<architecture::westmere> in,
|
||||
utf8_checking_state<architecture::westmere>& state) {
|
||||
__m128i highbit = _mm_set1_epi8(0x80);
|
||||
if ((_mm_testz_si128(_mm_or_si128(in.v0, in.v1), highbit)) == 1) {
|
||||
template <>
|
||||
really_inline void check_utf8<Architecture::WESTMERE>(
|
||||
simd_input<Architecture::WESTMERE> in,
|
||||
utf8_checking_state<Architecture::WESTMERE> &state) {
|
||||
__m128i high_bit = _mm_set1_epi8(0x80);
|
||||
if ((_mm_testz_si128(_mm_or_si128(in.v0, in.v1), high_bit)) == 1) {
|
||||
// it is ascii, we just check continuation
|
||||
state.has_error = _mm_or_si128(
|
||||
_mm_cmpgt_epi8(
|
||||
state.previous.carried_continuations,
|
||||
_mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)),
|
||||
state.has_error);
|
||||
state.has_error =
|
||||
_mm_or_si128(_mm_cmpgt_epi8(state.previous.carried_continuations,
|
||||
_mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 1)),
|
||||
state.has_error);
|
||||
} else {
|
||||
// it is not ascii so we have to do heavy work
|
||||
state.previous = checkUTF8Bytes(in.v0, &(state.previous), &(state.has_error));
|
||||
state.previous = checkUTF8Bytes(in.v1, &(state.previous), &(state.has_error));
|
||||
state.previous =
|
||||
check_utf8_bytes(in.v0, &(state.previous), &(state.has_error));
|
||||
state.previous =
|
||||
check_utf8_bytes(in.v1, &(state.previous), &(state.has_error));
|
||||
}
|
||||
|
||||
if ((_mm_testz_si128(_mm_or_si128(in.v2, in.v3), highbit)) == 1) {
|
||||
if ((_mm_testz_si128(_mm_or_si128(in.v2, in.v3), high_bit)) == 1) {
|
||||
// it is ascii, we just check continuation
|
||||
state.has_error = _mm_or_si128(
|
||||
_mm_cmpgt_epi8(
|
||||
state.previous.carried_continuations,
|
||||
_mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)),
|
||||
state.has_error);
|
||||
state.has_error =
|
||||
_mm_or_si128(_mm_cmpgt_epi8(state.previous.carried_continuations,
|
||||
_mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 1)),
|
||||
state.has_error);
|
||||
} else {
|
||||
// it is not ascii so we have to do heavy work
|
||||
state.previous = checkUTF8Bytes(in.v2, &(state.previous), &(state.has_error));
|
||||
state.previous = checkUTF8Bytes(in.v3, &(state.previous), &(state.has_error));
|
||||
state.previous =
|
||||
check_utf8_bytes(in.v2, &(state.previous), &(state.has_error));
|
||||
state.previous =
|
||||
check_utf8_bytes(in.v3, &(state.previous), &(state.has_error));
|
||||
}
|
||||
}
|
||||
|
||||
template<> really_inline
|
||||
errorValues check_utf8_errors<architecture::westmere>(utf8_checking_state<architecture::westmere>& state) {
|
||||
return _mm_testz_si128(state.has_error, state.has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
|
||||
template <>
|
||||
really_inline ErrorValues check_utf8_errors<Architecture::WESTMERE>(
|
||||
utf8_checking_state<Architecture::WESTMERE> &state) {
|
||||
return _mm_testz_si128(state.has_error, state.has_error) == 0
|
||||
? simdjson::UTF8_ERROR
|
||||
: simdjson::SUCCESS;
|
||||
}
|
||||
|
||||
template<> really_inline
|
||||
uint64_t cmp_mask_against_input<architecture::westmere>(simd_input<architecture::westmere> in, uint8_t m) {
|
||||
template <>
|
||||
really_inline uint64_t cmp_mask_against_input<Architecture::WESTMERE>(
|
||||
simd_input<Architecture::WESTMERE> in, uint8_t m) {
|
||||
const __m128i mask = _mm_set1_epi8(m);
|
||||
__m128i cmp_res_0 = _mm_cmpeq_epi8(in.v0, mask);
|
||||
uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
|
||||
|
@ -95,54 +103,60 @@ uint64_t cmp_mask_against_input<architecture::westmere>(simd_input<architecture:
|
|||
return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
|
||||
}
|
||||
|
||||
template<> really_inline
|
||||
uint64_t unsigned_lteq_against_input<architecture::westmere>(simd_input<architecture::westmere> in, uint8_t m) {
|
||||
template <>
|
||||
really_inline uint64_t unsigned_lteq_against_input<Architecture::WESTMERE>(
|
||||
simd_input<Architecture::WESTMERE> in, uint8_t m) {
|
||||
const __m128i maxval = _mm_set1_epi8(m);
|
||||
__m128i cmp_res_0 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v0),maxval);
|
||||
__m128i cmp_res_0 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v0), maxval);
|
||||
uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
|
||||
__m128i cmp_res_1 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v1),maxval);
|
||||
__m128i cmp_res_1 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v1), maxval);
|
||||
uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
|
||||
__m128i cmp_res_2 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v2),maxval);
|
||||
__m128i cmp_res_2 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v2), maxval);
|
||||
uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
|
||||
__m128i cmp_res_3 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v3),maxval);
|
||||
__m128i cmp_res_3 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v3), maxval);
|
||||
uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
|
||||
return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
|
||||
}
|
||||
|
||||
template<> really_inline
|
||||
uint64_t find_odd_backslash_sequences<architecture::westmere>(simd_input<architecture::westmere> in, uint64_t &prev_iter_ends_odd_backslash) {
|
||||
FIND_ODD_BACKSLASH_SEQUENCES(architecture::westmere, in, prev_iter_ends_odd_backslash);
|
||||
template <>
|
||||
really_inline uint64_t find_odd_backslash_sequences<Architecture::WESTMERE>(
|
||||
simd_input<Architecture::WESTMERE> in,
|
||||
uint64_t &prev_iter_ends_odd_backslash) {
|
||||
FIND_ODD_BACKSLASH_SEQUENCES(Architecture::WESTMERE, in,
|
||||
prev_iter_ends_odd_backslash);
|
||||
}
|
||||
|
||||
template<> really_inline
|
||||
uint64_t find_quote_mask_and_bits<architecture::westmere>(simd_input<architecture::westmere> in, uint64_t odd_ends,
|
||||
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits, uint64_t &error_mask) {
|
||||
FIND_QUOTE_MASK_AND_BITS(architecture::westmere, in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask)
|
||||
template <>
|
||||
really_inline uint64_t find_quote_mask_and_bits<Architecture::WESTMERE>(
|
||||
simd_input<Architecture::WESTMERE> in, uint64_t odd_ends,
|
||||
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits,
|
||||
uint64_t &error_mask) {
|
||||
FIND_QUOTE_MASK_AND_BITS(Architecture::WESTMERE, in, odd_ends,
|
||||
prev_iter_inside_quote, quote_bits, error_mask)
|
||||
}
|
||||
|
||||
template<> really_inline
|
||||
void find_whitespace_and_structurals<architecture::westmere>(simd_input<architecture::westmere> in,
|
||||
uint64_t &whitespace, uint64_t &structurals) {
|
||||
const __m128i structural_table = _mm_setr_epi8(44, 125, 0, 0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
|
||||
const __m128i white_table = _mm_setr_epi8(
|
||||
32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100);
|
||||
template <>
|
||||
really_inline void find_whitespace_and_structurals<Architecture::WESTMERE>(
|
||||
simd_input<Architecture::WESTMERE> in, uint64_t &whitespace,
|
||||
uint64_t &structurals) {
|
||||
const __m128i structural_table =
|
||||
_mm_setr_epi8(44, 125, 0, 0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
|
||||
const __m128i white_table = _mm_setr_epi8(32, 100, 100, 100, 17, 100, 113, 2,
|
||||
100, 9, 10, 112, 100, 13, 100, 100);
|
||||
const __m128i struct_offset = _mm_set1_epi8(0xd4);
|
||||
const __m128i struct_mask = _mm_set1_epi8(32);
|
||||
|
||||
__m128i white0 = _mm_cmpeq_epi8(in.v0,
|
||||
_mm_shuffle_epi8(white_table, in.v0));
|
||||
__m128i white1 = _mm_cmpeq_epi8(in.v1,
|
||||
_mm_shuffle_epi8(white_table, in.v1));
|
||||
__m128i white2 = _mm_cmpeq_epi8(in.v2,
|
||||
_mm_shuffle_epi8(white_table, in.v2));
|
||||
__m128i white3 = _mm_cmpeq_epi8(in.v3,
|
||||
_mm_shuffle_epi8(white_table, in.v3));
|
||||
__m128i white0 = _mm_cmpeq_epi8(in.v0, _mm_shuffle_epi8(white_table, in.v0));
|
||||
__m128i white1 = _mm_cmpeq_epi8(in.v1, _mm_shuffle_epi8(white_table, in.v1));
|
||||
__m128i white2 = _mm_cmpeq_epi8(in.v2, _mm_shuffle_epi8(white_table, in.v2));
|
||||
__m128i white3 = _mm_cmpeq_epi8(in.v3, _mm_shuffle_epi8(white_table, in.v3));
|
||||
uint64_t ws_res_0 = _mm_movemask_epi8(white0);
|
||||
uint64_t ws_res_1 = _mm_movemask_epi8(white1);
|
||||
uint64_t ws_res_2 = _mm_movemask_epi8(white2);
|
||||
uint64_t ws_res_3 = _mm_movemask_epi8(white3);
|
||||
|
||||
whitespace = (ws_res_0 | (ws_res_1 << 16) | (ws_res_2 << 32) | (ws_res_3 << 48));
|
||||
whitespace =
|
||||
(ws_res_0 | (ws_res_1 << 16) | (ws_res_2 << 32) | (ws_res_3 << 48));
|
||||
|
||||
__m128i struct1_r1 = _mm_add_epi8(struct_offset, in.v0);
|
||||
__m128i struct2_r1 = _mm_add_epi8(struct_offset, in.v1);
|
||||
|
@ -169,13 +183,12 @@ void find_whitespace_and_structurals<architecture::westmere>(simd_input<architec
|
|||
uint64_t structural_res_2 = _mm_movemask_epi8(struct3);
|
||||
uint64_t structural_res_3 = _mm_movemask_epi8(struct4);
|
||||
|
||||
structurals = (structural_res_0 | (structural_res_1 << 16) | (structural_res_2 << 32) | (structural_res_3 << 48));
|
||||
structurals = (structural_res_0 | (structural_res_1 << 16) |
|
||||
(structural_res_2 << 32) | (structural_res_3 << 48));
|
||||
}
|
||||
|
||||
|
||||
} // namespace simdjson
|
||||
UNTARGET_REGION
|
||||
|
||||
|
||||
#endif // IS_X86_64
|
||||
#endif // SIMDJSON_STAGE1_FIND_MARKS_WESTMERE_H
|
|
@ -9,8 +9,8 @@
|
|||
#include "simdjson/jsoncharutils.h"
|
||||
#include "simdjson/numberparsing.h"
|
||||
#include "simdjson/parsedjson.h"
|
||||
#include "simdjson/stringparsing.h"
|
||||
#include "simdjson/simdjson.h"
|
||||
#include "simdjson/stringparsing.h"
|
||||
|
||||
namespace simdjson {
|
||||
void init_state_machine();
|
||||
|
@ -20,7 +20,8 @@ really_inline bool is_valid_true_atom(const uint8_t *loc) {
|
|||
uint64_t tv = *reinterpret_cast<const uint64_t *>("true ");
|
||||
uint64_t mask4 = 0x00000000ffffffff;
|
||||
uint32_t error = 0;
|
||||
uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
|
||||
uint64_t
|
||||
locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
|
||||
// this can read up to 7 bytes beyond the buffer size, but we require
|
||||
// SIMDJSON_PADDING of padding
|
||||
static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
|
||||
|
@ -43,8 +44,9 @@ really_inline bool is_valid_false_atom(const uint8_t *loc) {
|
|||
// the last character of false (it being 5 byte long!) would be
|
||||
// ignored
|
||||
uint64_t error = 0;
|
||||
uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
|
||||
// this can read up to 7 bytes beyond the buffer size, but we require
|
||||
uint64_t
|
||||
locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
|
||||
// this can read up to 7 bytes beyond the buffer size, but we require
|
||||
// SIMDJSON_PADDING of padding
|
||||
static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
|
||||
std::memcpy(&locval, loc, sizeof(uint64_t));
|
||||
|
@ -58,8 +60,9 @@ really_inline bool is_valid_null_atom(const uint8_t *loc) {
|
|||
uint64_t nv = *reinterpret_cast<const uint64_t *>("null ");
|
||||
uint64_t mask4 = 0x00000000ffffffff;
|
||||
uint32_t error = 0;
|
||||
uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
|
||||
// this can read up to 7 bytes beyond the buffer size, but we require
|
||||
uint64_t
|
||||
locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
|
||||
// this can read up to 7 bytes beyond the buffer size, but we require
|
||||
// SIMDJSON_PADDING of padding
|
||||
static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
|
||||
std::memcpy(&locval, loc, sizeof(uint64_t));
|
||||
|
@ -68,15 +71,15 @@ really_inline bool is_valid_null_atom(const uint8_t *loc) {
|
|||
return error == 0;
|
||||
}
|
||||
|
||||
template<architecture T = architecture::native>
|
||||
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
|
||||
int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj);
|
||||
template <Architecture T = Architecture::NATIVE>
|
||||
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER int
|
||||
unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj);
|
||||
|
||||
template<architecture T = architecture::native>
|
||||
template <Architecture T = Architecture::NATIVE>
|
||||
int unified_machine(const char *buf, size_t len, ParsedJson &pj) {
|
||||
return unified_machine<T>(reinterpret_cast<const uint8_t*>(buf), len, pj);
|
||||
return unified_machine<T>(reinterpret_cast<const uint8_t *>(buf), len, pj);
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace simdjson
|
||||
|
||||
#endif
|
||||
|
|
|
@ -6,8 +6,9 @@
|
|||
#include "simdjson/parsedjson.h"
|
||||
|
||||
#ifdef JSON_TEST_STRINGS
|
||||
void foundString(const uint8_t *buf, const uint8_t *parsed_begin, const uint8_t *parsed_end);
|
||||
void foundBadString(const uint8_t *buf);
|
||||
void found_string(const uint8_t *buf, const uint8_t *parsed_begin,
|
||||
const uint8_t *parsed_end);
|
||||
void found_bad_string(const uint8_t *buf);
|
||||
#endif
|
||||
|
||||
namespace simdjson {
|
||||
|
@ -37,7 +38,6 @@ static const uint8_t escape_map[256] = {
|
|||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
};
|
||||
|
||||
|
||||
// handle a unicode codepoint
|
||||
// write appropriate values into dest
|
||||
// src will advance 6 bytes or 12 bytes
|
||||
|
@ -45,9 +45,10 @@ static const uint8_t escape_map[256] = {
|
|||
// return true if the unicode codepoint was valid
|
||||
// We work in little-endian then swap at write time
|
||||
WARN_UNUSED
|
||||
really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, uint8_t **dst_ptr) {
|
||||
really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr,
|
||||
uint8_t **dst_ptr) {
|
||||
// hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the
|
||||
// conversion isn't valid; we defer the check for this to inside the
|
||||
// conversion isn't valid; we defer the check for this to inside the
|
||||
// multilingual plane check
|
||||
uint32_t code_point = hex_to_u32_nocheck(*src_ptr + 2);
|
||||
*src_ptr += 6;
|
||||
|
@ -58,14 +59,14 @@ really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, uint8_t **d
|
|||
return false;
|
||||
}
|
||||
uint32_t code_point_2 = hex_to_u32_nocheck(*src_ptr + 2);
|
||||
|
||||
|
||||
// if the first code point is invalid we will get here, as we will go past
|
||||
// the check for being outside the Basic Multilingual plane. If we don't
|
||||
// find a \u immediately afterwards we fail out anyhow, but if we do,
|
||||
// find a \u immediately afterwards we fail out anyhow, but if we do,
|
||||
// this check catches both the case of the first code point being invalid
|
||||
// or the second code point being invalid.
|
||||
if ((code_point | code_point_2) >> 16) {
|
||||
return false;
|
||||
return false;
|
||||
}
|
||||
|
||||
code_point =
|
||||
|
@ -84,18 +85,17 @@ struct parse_string_helper {
|
|||
};
|
||||
|
||||
// Finds where the backslashes and quotes are located.
|
||||
template<architecture>
|
||||
parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst);
|
||||
template <Architecture>
|
||||
parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src,
|
||||
uint8_t *dst);
|
||||
|
||||
template <Architecture T>
|
||||
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
|
||||
really_inline bool
|
||||
parse_string(UNUSED const uint8_t *buf, UNUSED size_t len, ParsedJson &pj,
|
||||
UNUSED const uint32_t depth, UNUSED uint32_t offset);
|
||||
|
||||
|
||||
template<architecture T>
|
||||
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER really_inline
|
||||
bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
|
||||
ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset);
|
||||
|
||||
|
||||
}
|
||||
} // namespace simdjson
|
||||
|
||||
/// Now include the specializations:
|
||||
#include "simdjson/stringparsing_arm64.h"
|
||||
|
|
|
@ -6,46 +6,51 @@
|
|||
|
||||
#ifdef IS_ARM64
|
||||
namespace simdjson {
|
||||
template<> really_inline
|
||||
parse_string_helper find_bs_bits_and_quote_bits<architecture::arm64> (const uint8_t *src, uint8_t *dst) {
|
||||
// this can read up to 31 bytes beyond the buffer size, but we require
|
||||
// SIMDJSON_PADDING of padding
|
||||
static_assert(2 * sizeof(uint8x16_t) - 1 <= SIMDJSON_PADDING);
|
||||
uint8x16_t v0 = vld1q_u8(src);
|
||||
uint8x16_t v1 = vld1q_u8(src+16);
|
||||
vst1q_u8(dst, v0);
|
||||
vst1q_u8(dst+16, v1);
|
||||
|
||||
uint8x16_t bs_mask = vmovq_n_u8('\\');
|
||||
uint8x16_t qt_mask = vmovq_n_u8('"');
|
||||
const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
||||
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
|
||||
uint8x16_t cmp_bs_0 = vceqq_u8(v0, bs_mask);
|
||||
uint8x16_t cmp_bs_1 = vceqq_u8(v1, bs_mask);
|
||||
uint8x16_t cmp_qt_0 = vceqq_u8(v0, qt_mask);
|
||||
uint8x16_t cmp_qt_1 = vceqq_u8(v1, qt_mask);
|
||||
|
||||
cmp_bs_0 = vandq_u8(cmp_bs_0, bitmask);
|
||||
cmp_bs_1 = vandq_u8(cmp_bs_1, bitmask);
|
||||
cmp_qt_0 = vandq_u8(cmp_qt_0, bitmask);
|
||||
cmp_qt_1 = vandq_u8(cmp_qt_1, bitmask);
|
||||
template <>
|
||||
really_inline parse_string_helper
|
||||
find_bs_bits_and_quote_bits<Architecture::ARM64>(const uint8_t *src,
|
||||
uint8_t *dst) {
|
||||
// this can read up to 31 bytes beyond the buffer size, but we require
|
||||
// SIMDJSON_PADDING of padding
|
||||
static_assert(2 * sizeof(uint8x16_t) - 1 <= SIMDJSON_PADDING);
|
||||
uint8x16_t v0 = vld1q_u8(src);
|
||||
uint8x16_t v1 = vld1q_u8(src + 16);
|
||||
vst1q_u8(dst, v0);
|
||||
vst1q_u8(dst + 16, v1);
|
||||
|
||||
uint8x16_t sum0 = vpaddq_u8(cmp_bs_0, cmp_bs_1);
|
||||
uint8x16_t sum1 = vpaddq_u8(cmp_qt_0, cmp_qt_1);
|
||||
sum0 = vpaddq_u8(sum0, sum1);
|
||||
sum0 = vpaddq_u8(sum0, sum0);
|
||||
return {
|
||||
uint8x16_t bs_mask = vmovq_n_u8('\\');
|
||||
uint8x16_t qt_mask = vmovq_n_u8('"');
|
||||
const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
||||
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
|
||||
uint8x16_t cmp_bs_0 = vceqq_u8(v0, bs_mask);
|
||||
uint8x16_t cmp_bs_1 = vceqq_u8(v1, bs_mask);
|
||||
uint8x16_t cmp_qt_0 = vceqq_u8(v0, qt_mask);
|
||||
uint8x16_t cmp_qt_1 = vceqq_u8(v1, qt_mask);
|
||||
|
||||
cmp_bs_0 = vandq_u8(cmp_bs_0, bit_mask);
|
||||
cmp_bs_1 = vandq_u8(cmp_bs_1, bit_mask);
|
||||
cmp_qt_0 = vandq_u8(cmp_qt_0, bit_mask);
|
||||
cmp_qt_1 = vandq_u8(cmp_qt_1, bit_mask);
|
||||
|
||||
uint8x16_t sum0 = vpaddq_u8(cmp_bs_0, cmp_bs_1);
|
||||
uint8x16_t sum1 = vpaddq_u8(cmp_qt_0, cmp_qt_1);
|
||||
sum0 = vpaddq_u8(sum0, sum1);
|
||||
sum0 = vpaddq_u8(sum0, sum0);
|
||||
return {
|
||||
vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0), // bs_bits
|
||||
vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1) // quote_bits
|
||||
};
|
||||
vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1) // quote_bits
|
||||
};
|
||||
}
|
||||
|
||||
template<>
|
||||
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER really_inline
|
||||
bool parse_string<architecture::arm64>(UNUSED const uint8_t *buf, UNUSED size_t len,
|
||||
ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset) {
|
||||
PARSE_STRING(architecture::arm64, buf, len, pj, depth, offset);
|
||||
}
|
||||
template <>
|
||||
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
|
||||
really_inline bool
|
||||
parse_string<Architecture::ARM64>(UNUSED const uint8_t *buf,
|
||||
UNUSED size_t len, ParsedJson &pj,
|
||||
UNUSED const uint32_t depth,
|
||||
UNUSED uint32_t offset) {
|
||||
PARSE_STRING(Architecture::ARM64, buf, len, pj, depth, offset);
|
||||
}
|
||||
} // namespace simdjson
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
@ -4,34 +4,39 @@
|
|||
#include "simdjson/stringparsing.h"
|
||||
#include "simdjson/stringparsing_macros.h"
|
||||
|
||||
|
||||
#ifdef IS_X86_64
|
||||
TARGET_HASWELL
|
||||
namespace simdjson {
|
||||
template<> really_inline
|
||||
parse_string_helper find_bs_bits_and_quote_bits<architecture::haswell> (const uint8_t *src, uint8_t *dst) {
|
||||
// this can read up to 31 bytes beyond the buffer size, but we require
|
||||
// SIMDJSON_PADDING of padding
|
||||
static_assert(sizeof(__m256i) - 1 <= SIMDJSON_PADDING);
|
||||
__m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
|
||||
// store to dest unconditionally - we can overwrite the bits we don't like
|
||||
// later
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), v);
|
||||
auto quote_mask = _mm256_cmpeq_epi8(v, _mm256_set1_epi8('"'));
|
||||
return {
|
||||
static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\')))), // bs_bits
|
||||
template <>
|
||||
really_inline parse_string_helper
|
||||
find_bs_bits_and_quote_bits<Architecture::HASWELL>(const uint8_t *src,
|
||||
uint8_t *dst) {
|
||||
// this can read up to 31 bytes beyond the buffer size, but we require
|
||||
// SIMDJSON_PADDING of padding
|
||||
static_assert(sizeof(__m256i) - 1 <= SIMDJSON_PADDING);
|
||||
__m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
|
||||
// store to dest unconditionally - we can overwrite the bits we don't like
|
||||
// later
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), v);
|
||||
auto quote_mask = _mm256_cmpeq_epi8(v, _mm256_set1_epi8('"'));
|
||||
return {
|
||||
static_cast<uint32_t>(_mm256_movemask_epi8(
|
||||
_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\')))), // bs_bits
|
||||
static_cast<uint32_t>(_mm256_movemask_epi8(quote_mask)) // quote_bits
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
template<>
|
||||
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER really_inline
|
||||
bool parse_string<architecture::haswell>(UNUSED const uint8_t *buf, UNUSED size_t len,
|
||||
ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset) {
|
||||
PARSE_STRING(architecture::haswell, buf, len, pj, depth, offset);
|
||||
template <>
|
||||
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
|
||||
really_inline bool
|
||||
parse_string<Architecture::HASWELL>(UNUSED const uint8_t *buf,
|
||||
UNUSED size_t len, ParsedJson &pj,
|
||||
UNUSED const uint32_t depth,
|
||||
UNUSED uint32_t offset) {
|
||||
PARSE_STRING(Architecture::HASWELL, buf, len, pj, depth, offset);
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace simdjson
|
||||
UNTARGET_REGION
|
||||
#endif
|
||||
|
||||
|
|
|
@ -1,80 +1,88 @@
|
|||
#ifndef SIMDJSON_STRINGPARSING_MACROS_H
|
||||
#define SIMDJSON_STRINGPARSING_MACROS_H
|
||||
|
||||
// We need to compile that code for multiple architectures. However, target attributes can be used
|
||||
// only once by function definition. Huge macro seemed better than huge code duplication.
|
||||
// bool PARSE_STRING(architecture T, UNUSED const uint8_t *buf, UNUSED size_t len,
|
||||
// ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset)
|
||||
#define PARSE_STRING(T, buf, len, pj, depth, offset) { \
|
||||
pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"'); \
|
||||
const uint8_t *src = &buf[offset + 1]; /* we know that buf at offset is a " */ \
|
||||
uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t); \
|
||||
const uint8_t *const start_of_string = dst; \
|
||||
while (1) { \
|
||||
parse_string_helper helper = find_bs_bits_and_quote_bits<T>(src, dst); \
|
||||
if(((helper.bs_bits - 1) & helper.quote_bits) != 0 ) { \
|
||||
/* we encountered quotes first. Move dst to point to quotes and exit */ \
|
||||
\
|
||||
/* find out where the quote is... */ \
|
||||
uint32_t quote_dist = trailingzeroes(helper.quote_bits); \
|
||||
\
|
||||
/* NULL termination is still handy if you expect all your strings to be NULL terminated? */ \
|
||||
/* It comes at a small cost */ \
|
||||
dst[quote_dist] = 0; \
|
||||
\
|
||||
uint32_t str_length = (dst - start_of_string) + quote_dist; \
|
||||
memcpy(pj.current_string_buf_loc,&str_length, sizeof(uint32_t)); \
|
||||
/*///////////////////// */ \
|
||||
/* Above, check for overflow in case someone has a crazy string (>=4GB?) */ \
|
||||
/* But only add the overflow check when the document itself exceeds 4GB */ \
|
||||
/* Currently unneeded because we refuse to parse docs larger or equal to 4GB. */ \
|
||||
/*////////////////////// */ \
|
||||
\
|
||||
\
|
||||
/* we advance the point, accounting for the fact that we have a NULL termination */ \
|
||||
pj.current_string_buf_loc = dst + quote_dist + 1; \
|
||||
return true; \
|
||||
} \
|
||||
if(((helper.quote_bits - 1) & helper.bs_bits ) != 0 ) { \
|
||||
/* find out where the backspace is */ \
|
||||
uint32_t bs_dist = trailingzeroes(helper.bs_bits); \
|
||||
uint8_t escape_char = src[bs_dist + 1]; \
|
||||
/* we encountered backslash first. Handle backslash */ \
|
||||
if (escape_char == 'u') { \
|
||||
/* move src/dst up to the start; they will be further adjusted */ \
|
||||
/* within the unicode codepoint handling code. */ \
|
||||
src += bs_dist; \
|
||||
dst += bs_dist; \
|
||||
if (!handle_unicode_codepoint(&src, &dst)) { \
|
||||
return false; \
|
||||
} \
|
||||
} else { \
|
||||
/* simple 1:1 conversion. Will eat bs_dist+2 characters in input and */ \
|
||||
/* write bs_dist+1 characters to output */ \
|
||||
/* note this may reach beyond the part of the buffer we've actually */ \
|
||||
/* seen. I think this is ok */ \
|
||||
uint8_t escape_result = escape_map[escape_char]; \
|
||||
if (escape_result == 0u) { \
|
||||
return false; /* bogus escape value is an error */ \
|
||||
} \
|
||||
dst[bs_dist] = escape_result; \
|
||||
src += bs_dist + 2; \
|
||||
dst += bs_dist + 1; \
|
||||
} \
|
||||
} else { \
|
||||
/* they are the same. Since they can't co-occur, it means we encountered */ \
|
||||
/* neither. */ \
|
||||
if constexpr(T == architecture::westmere) { \
|
||||
src += 16; \
|
||||
dst += 16; \
|
||||
} else { \
|
||||
src += 32; \
|
||||
dst += 32; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
/* can't be reached */ \
|
||||
return true; \
|
||||
}
|
||||
// We need to compile that code for multiple architectures. However, target
|
||||
// attributes can be used only once by function definition. Huge macro seemed
|
||||
// better than huge code duplication.
|
||||
// bool PARSE_STRING(Architecture T, const uint8_t *buf, size_t len, ParsedJson
|
||||
// &pj,const uint32_t depth, uint32_t offset)
|
||||
#define PARSE_STRING(T, buf, len, pj, depth, offset) \
|
||||
{ \
|
||||
pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"'); \
|
||||
const uint8_t *src = \
|
||||
&buf[offset + 1]; /* we know that buf at offset is a " */ \
|
||||
uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t); \
|
||||
const uint8_t *const start_of_string = dst; \
|
||||
while (1) { \
|
||||
parse_string_helper helper = find_bs_bits_and_quote_bits<T>(src, dst); \
|
||||
if (((helper.bs_bits - 1) & helper.quote_bits) != 0) { \
|
||||
/* we encountered quotes first. Move dst to point to quotes and exit \
|
||||
*/ \
|
||||
\
|
||||
/* find out where the quote is... */ \
|
||||
uint32_t quote_dist = trailing_zeroes(helper.quote_bits); \
|
||||
\
|
||||
/* NULL termination is still handy if you expect all your strings to \
|
||||
* be NULL terminated? */ \
|
||||
/* It comes at a small cost */ \
|
||||
dst[quote_dist] = 0; \
|
||||
\
|
||||
uint32_t str_length = (dst - start_of_string) + quote_dist; \
|
||||
memcpy(pj.current_string_buf_loc, &str_length, sizeof(uint32_t)); \
|
||||
/***************************** \
|
||||
* Above, check for overflow in case someone has a crazy string \
|
||||
* (>=4GB?) \
|
||||
* But only add the overflow check when the document itself exceeds \
|
||||
* 4GB \
|
||||
* Currently unneeded because we refuse to parse docs larger or equal \
|
||||
* to 4GB. \
|
||||
****************************/ \
|
||||
\
|
||||
/* we advance the point, accounting for the fact that we have a NULL \
|
||||
* termination */ \
|
||||
pj.current_string_buf_loc = dst + quote_dist + 1; \
|
||||
return true; \
|
||||
} \
|
||||
if (((helper.quote_bits - 1) & helper.bs_bits) != 0) { \
|
||||
/* find out where the backslash is */ \
|
||||
uint32_t bs_dist = trailing_zeroes(helper.bs_bits); \
|
||||
uint8_t escape_char = src[bs_dist + 1]; \
|
||||
/* we encountered backslash first. Handle backslash */ \
|
||||
if (escape_char == 'u') { \
|
||||
/* move src/dst up to the start; they will be further adjusted \
|
||||
within the unicode codepoint handling code. */ \
|
||||
src += bs_dist; \
|
||||
dst += bs_dist; \
|
||||
if (!handle_unicode_codepoint(&src, &dst)) { \
|
||||
return false; \
|
||||
} \
|
||||
} else { \
|
||||
/* simple 1:1 conversion. Will eat bs_dist+2 characters in input and \
|
||||
* write bs_dist+1 characters to output \
|
||||
* note this may reach beyond the part of the buffer we've actually \
|
||||
* seen. I think this is ok */ \
|
||||
uint8_t escape_result = escape_map[escape_char]; \
|
||||
if (escape_result == 0u) { \
|
||||
return false; /* bogus escape value is an error */ \
|
||||
} \
|
||||
dst[bs_dist] = escape_result; \
|
||||
src += bs_dist + 2; \
|
||||
dst += bs_dist + 1; \
|
||||
} \
|
||||
} else { \
|
||||
/* they are the same. Since they can't co-occur, it means we \
|
||||
* encountered neither. */ \
|
||||
if constexpr (T == Architecture::WESTMERE) { \
|
||||
src += 16; \
|
||||
dst += 16; \
|
||||
} else { \
|
||||
src += 32; \
|
||||
dst += 32; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
/* can't be reached */ \
|
||||
return true; \
|
||||
}
|
||||
|
||||
#endif
|
|
@ -4,32 +4,37 @@
|
|||
#include "simdjson/stringparsing.h"
|
||||
#include "simdjson/stringparsing_macros.h"
|
||||
|
||||
|
||||
#ifdef IS_X86_64
|
||||
TARGET_WESTMERE
|
||||
namespace simdjson {
|
||||
template<> really_inline
|
||||
parse_string_helper find_bs_bits_and_quote_bits<architecture::westmere> (const uint8_t *src, uint8_t *dst) {
|
||||
// this can read up to 31 bytes beyond the buffer size, but we require
|
||||
// SIMDJSON_PADDING of padding
|
||||
__m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
|
||||
// store to dest unconditionally - we can overwrite the bits we don't like
|
||||
// later
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst), v);
|
||||
auto quote_mask = _mm_cmpeq_epi8(v, _mm_set1_epi8('"'));
|
||||
return {
|
||||
static_cast<uint32_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(v, _mm_set1_epi8('\\')))), // bs_bits
|
||||
template <>
|
||||
really_inline parse_string_helper
|
||||
find_bs_bits_and_quote_bits<Architecture::WESTMERE>(const uint8_t *src,
|
||||
uint8_t *dst) {
|
||||
// this can read up to 15 bytes beyond the buffer size, but we require
|
||||
// SIMDJSON_PADDING of padding
|
||||
__m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
|
||||
// store to dest unconditionally - we can overwrite the bits we don't like
|
||||
// later
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst), v);
|
||||
auto quote_mask = _mm_cmpeq_epi8(v, _mm_set1_epi8('"'));
|
||||
return {
|
||||
static_cast<uint32_t>(
|
||||
_mm_movemask_epi8(_mm_cmpeq_epi8(v, _mm_set1_epi8('\\')))), // bs_bits
|
||||
static_cast<uint32_t>(_mm_movemask_epi8(quote_mask)) // quote_bits
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
template<>
|
||||
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER really_inline
|
||||
bool parse_string<architecture::westmere>(UNUSED const uint8_t *buf, UNUSED size_t len,
|
||||
ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset) {
|
||||
PARSE_STRING(architecture::westmere, buf, len, pj, depth, offset);
|
||||
}
|
||||
template <>
|
||||
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
|
||||
really_inline bool
|
||||
parse_string<Architecture::WESTMERE>(UNUSED const uint8_t *buf,
|
||||
UNUSED size_t len, ParsedJson &pj,
|
||||
UNUSED const uint32_t depth,
|
||||
UNUSED uint32_t offset) {
|
||||
PARSE_STRING(Architecture::WESTMERE, buf, len, pj, depth, offset);
|
||||
}
|
||||
} // namespace simdjson
|
||||
UNTARGET_REGION
|
||||
#endif
|
||||
|
||||
|
|
|
@ -1,35 +1,35 @@
|
|||
#include "simdjson/jsonioutil.h"
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
|
||||
namespace simdjson {
|
||||
char * allocate_padded_buffer(size_t length) {
|
||||
// we could do a simple malloc
|
||||
//return (char *) malloc(length + SIMDJSON_PADDING);
|
||||
// However, we might as well align to cache lines...
|
||||
size_t totalpaddedlength = length + SIMDJSON_PADDING;
|
||||
char *padded_buffer = aligned_malloc_char(64, totalpaddedlength);
|
||||
return padded_buffer;
|
||||
char *allocate_padded_buffer(size_t length) {
|
||||
// we could do a simple malloc
|
||||
// return (char *) malloc(length + SIMDJSON_PADDING);
|
||||
// However, we might as well align to cache lines...
|
||||
size_t totalpaddedlength = length + SIMDJSON_PADDING;
|
||||
char *padded_buffer = aligned_malloc_char(64, totalpaddedlength);
|
||||
return padded_buffer;
|
||||
}
|
||||
|
||||
padded_string get_corpus(const std::string& filename) {
|
||||
padded_string get_corpus(const std::string &filename) {
|
||||
std::FILE *fp = std::fopen(filename.c_str(), "rb");
|
||||
if (fp != nullptr) {
|
||||
std::fseek(fp, 0, SEEK_END);
|
||||
size_t len = std::ftell(fp);
|
||||
padded_string s(len);
|
||||
if(s.data() == nullptr) {
|
||||
if (s.data() == nullptr) {
|
||||
std::fclose(fp);
|
||||
throw std::runtime_error("could not allocate memory");
|
||||
throw std::runtime_error("could not allocate memory");
|
||||
}
|
||||
std::rewind(fp);
|
||||
size_t readb = std::fread(s.data(), 1, len, fp);
|
||||
std::fclose(fp);
|
||||
if(readb != len) {
|
||||
throw std::runtime_error("could not read the data");
|
||||
if (readb != len) {
|
||||
throw std::runtime_error("could not read the data");
|
||||
}
|
||||
return s;
|
||||
}
|
||||
throw std::runtime_error("could not load corpus");
|
||||
}
|
||||
throw std::runtime_error("could not load corpus");
|
||||
}
|
||||
} // namespace simdjson
|
||||
|
|
|
@ -38,13 +38,13 @@ static uint8_t jump_table[256 * 3] = {
|
|||
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
|
||||
};
|
||||
|
||||
size_t jsonminify(const unsigned char *bytes, size_t howmany,
|
||||
unsigned char *out) {
|
||||
size_t json_minify(const unsigned char *bytes, size_t how_many,
|
||||
unsigned char *out) {
|
||||
size_t i = 0, pos = 0;
|
||||
uint8_t quote = 0;
|
||||
uint8_t nonescape = 1;
|
||||
|
||||
while (i < howmany) {
|
||||
while (i < how_many) {
|
||||
unsigned char c = bytes[i];
|
||||
uint8_t *meta = jump_table + 3 * c;
|
||||
|
||||
|
@ -64,7 +64,6 @@ size_t jsonminify(const unsigned char *bytes, size_t howmany,
|
|||
|
||||
namespace simdjson {
|
||||
|
||||
|
||||
// some intrinsics are missing under GCC?
|
||||
#ifndef __clang__
|
||||
#ifndef _MSC_VER
|
||||
|
@ -85,8 +84,6 @@ static inline void _mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo,
|
|||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
// a straightforward comparison of a mask against input.
|
||||
static uint64_t cmp_mask_against_input_mini(__m256i input_lo, __m256i input_hi,
|
||||
__m256i mask) {
|
||||
|
@ -98,8 +95,9 @@ static uint64_t cmp_mask_against_input_mini(__m256i input_lo, __m256i input_hi,
|
|||
}
|
||||
|
||||
// take input from buf and remove useless whitespace, input and output can be
|
||||
// the same, result is null terminated, return the string length (minus the null termination)
|
||||
size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) {
|
||||
// the same, result is null terminated, return the string length (minus the null
|
||||
// termination)
|
||||
size_t json_minify(const uint8_t *buf, size_t len, uint8_t *out) {
|
||||
// Useful constant masks
|
||||
const uint64_t even_bits = 0x5555555555555555ULL;
|
||||
const uint64_t odd_bits = ~even_bits;
|
||||
|
@ -109,11 +107,13 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) {
|
|||
uint64_t prev_iter_inside_quote = 0ULL; // either all zeros or all ones
|
||||
size_t idx = 0;
|
||||
if (len >= 64) {
|
||||
size_t avxlen = len - 63;
|
||||
size_t avx_len = len - 63;
|
||||
|
||||
for (; idx < avxlen; idx += 64) {
|
||||
__m256i input_lo = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 0));
|
||||
__m256i input_hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 32));
|
||||
for (; idx < avx_len; idx += 64) {
|
||||
__m256i input_lo =
|
||||
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 0));
|
||||
__m256i input_hi =
|
||||
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 32));
|
||||
uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi,
|
||||
_mm256_set1_epi8('\\'));
|
||||
uint64_t start_edges = bs_bits & ~(bs_bits << 1);
|
||||
|
@ -122,8 +122,8 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) {
|
|||
uint64_t odd_starts = start_edges & ~even_start_mask;
|
||||
uint64_t even_carries = bs_bits + even_starts;
|
||||
uint64_t odd_carries;
|
||||
bool iter_ends_odd_backslash = add_overflow(
|
||||
bs_bits, odd_starts, &odd_carries);
|
||||
bool iter_ends_odd_backslash =
|
||||
add_overflow(bs_bits, odd_starts, &odd_carries);
|
||||
odd_carries |= prev_iter_ends_odd_backslash;
|
||||
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
|
||||
uint64_t even_carry_ends = even_carries & ~bs_bits;
|
||||
|
@ -137,7 +137,10 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) {
|
|||
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
|
||||
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
|
||||
quote_mask ^= prev_iter_inside_quote;
|
||||
prev_iter_inside_quote = static_cast<uint64_t>(static_cast<int64_t>(quote_mask) >> 63);// might be undefined behavior, should be fully defined in C++20, ok according to John Regher from Utah University
|
||||
prev_iter_inside_quote = static_cast<uint64_t>(
|
||||
static_cast<int64_t>(quote_mask) >>
|
||||
63); // might be undefined behavior, should be fully defined in C++20,
|
||||
// ok according to John Regehr of the University of Utah
|
||||
const __m256i low_nibble_mask = _mm256_setr_epi8(
|
||||
// 0 9 a b c d
|
||||
16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0,
|
||||
|
@ -163,7 +166,8 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) {
|
|||
__m256i tmp_ws_hi = _mm256_cmpeq_epi8(
|
||||
_mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0));
|
||||
|
||||
uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
|
||||
uint64_t ws_res_0 =
|
||||
static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
|
||||
uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
|
||||
uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));
|
||||
whitespace &= ~quote_mask;
|
||||
|
@ -175,17 +179,18 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) {
|
|||
int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF));
|
||||
int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
|
||||
int pop4 = hamming((~whitespace));
|
||||
__m256i vmask1 =
|
||||
_mm256_loadu2_m128i(reinterpret_cast<const __m128i *>(mask128_epi8) + (mask2 & 0x7FFF),
|
||||
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask1 & 0x7FFF));
|
||||
__m256i vmask2 =
|
||||
_mm256_loadu2_m128i(reinterpret_cast<const __m128i *>(mask128_epi8) + (mask4 & 0x7FFF),
|
||||
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask3 & 0x7FFF));
|
||||
__m256i vmask1 = _mm256_loadu2_m128i(
|
||||
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask2 & 0x7FFF),
|
||||
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask1 & 0x7FFF));
|
||||
__m256i vmask2 = _mm256_loadu2_m128i(
|
||||
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask4 & 0x7FFF),
|
||||
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask3 & 0x7FFF));
|
||||
__m256i result1 = _mm256_shuffle_epi8(input_lo, vmask1);
|
||||
__m256i result2 = _mm256_shuffle_epi8(input_hi, vmask2);
|
||||
_mm256_storeu2_m128i(reinterpret_cast<__m128i *>(out + pop1), reinterpret_cast<__m128i *>(out), result1);
|
||||
_mm256_storeu2_m128i(reinterpret_cast<__m128i *>(out + pop3), reinterpret_cast<__m128i *>(out + pop2),
|
||||
result2);
|
||||
_mm256_storeu2_m128i(reinterpret_cast<__m128i *>(out + pop1),
|
||||
reinterpret_cast<__m128i *>(out), result1);
|
||||
_mm256_storeu2_m128i(reinterpret_cast<__m128i *>(out + pop3),
|
||||
reinterpret_cast<__m128i *>(out + pop2), result2);
|
||||
out += pop4;
|
||||
}
|
||||
}
|
||||
|
@ -195,8 +200,10 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) {
|
|||
uint8_t buffer[64];
|
||||
memset(buffer, 0, 64);
|
||||
memcpy(buffer, buf + idx, len - idx);
|
||||
__m256i input_lo = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer));
|
||||
__m256i input_hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer + 32));
|
||||
__m256i input_lo =
|
||||
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer));
|
||||
__m256i input_hi =
|
||||
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer + 32));
|
||||
uint64_t bs_bits =
|
||||
cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('\\'));
|
||||
uint64_t start_edges = bs_bits & ~(bs_bits << 1);
|
||||
|
@ -205,10 +212,11 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) {
|
|||
uint64_t odd_starts = start_edges & ~even_start_mask;
|
||||
uint64_t even_carries = bs_bits + even_starts;
|
||||
uint64_t odd_carries;
|
||||
//bool iter_ends_odd_backslash =
|
||||
add_overflow( bs_bits, odd_starts, &odd_carries);
|
||||
// bool iter_ends_odd_backslash =
|
||||
add_overflow(bs_bits, odd_starts, &odd_carries);
|
||||
odd_carries |= prev_iter_ends_odd_backslash;
|
||||
//prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; // we never use it
|
||||
// prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
|
||||
// // we never use it
|
||||
uint64_t even_carry_ends = even_carries & ~bs_bits;
|
||||
uint64_t odd_carry_ends = odd_carries & ~bs_bits;
|
||||
uint64_t even_start_odd_end = even_carry_ends & odd_bits;
|
||||
|
@ -220,7 +228,8 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) {
|
|||
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
|
||||
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
|
||||
quote_mask ^= prev_iter_inside_quote;
|
||||
// prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);// we don't need this anymore
|
||||
// prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);// we
|
||||
// don't need this anymore
|
||||
|
||||
__m256i mask_20 = _mm256_set1_epi8(0x20); // c==32
|
||||
__m256i mask_70 =
|
||||
|
@ -254,23 +263,23 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) {
|
|||
int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF));
|
||||
int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
|
||||
int pop4 = hamming((~whitespace));
|
||||
__m256i vmask1 =
|
||||
_mm256_loadu2_m128i(reinterpret_cast<const __m128i *>(mask128_epi8) + (mask2 & 0x7FFF),
|
||||
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask1 & 0x7FFF));
|
||||
__m256i vmask2 =
|
||||
_mm256_loadu2_m128i(reinterpret_cast<const __m128i *>(mask128_epi8) + (mask4 & 0x7FFF),
|
||||
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask3 & 0x7FFF));
|
||||
__m256i vmask1 = _mm256_loadu2_m128i(
|
||||
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask2 & 0x7FFF),
|
||||
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask1 & 0x7FFF));
|
||||
__m256i vmask2 = _mm256_loadu2_m128i(
|
||||
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask4 & 0x7FFF),
|
||||
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask3 & 0x7FFF));
|
||||
__m256i result1 = _mm256_shuffle_epi8(input_lo, vmask1);
|
||||
__m256i result2 = _mm256_shuffle_epi8(input_hi, vmask2);
|
||||
_mm256_storeu2_m128i(reinterpret_cast<__m128i *>(buffer + pop1), reinterpret_cast<__m128i *>(buffer),
|
||||
result1);
|
||||
_mm256_storeu2_m128i(reinterpret_cast<__m128i *>(buffer + pop3), reinterpret_cast<__m128i *>(buffer + pop2),
|
||||
result2);
|
||||
_mm256_storeu2_m128i(reinterpret_cast<__m128i *>(buffer + pop1),
|
||||
reinterpret_cast<__m128i *>(buffer), result1);
|
||||
_mm256_storeu2_m128i(reinterpret_cast<__m128i *>(buffer + pop3),
|
||||
reinterpret_cast<__m128i *>(buffer + pop2), result2);
|
||||
memcpy(out, buffer, pop4);
|
||||
out += pop4;
|
||||
}
|
||||
*out = '\0';// NULL termination
|
||||
*out = '\0'; // NULL termination
|
||||
return out - initout;
|
||||
}
|
||||
}
|
||||
} // namespace simdjson
|
||||
#endif
|
||||
|
|
|
@ -1,67 +1,68 @@
|
|||
#include "simdjson/jsonparser.h"
|
||||
#ifdef _MSC_VER
|
||||
#include <windows.h>
|
||||
#include <sysinfoapi.h>
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include "simdjson/simdjson.h"
|
||||
#include "simdjson/isadetection.h"
|
||||
#include "simdjson/jsonparser.h"
|
||||
#include "simdjson/portability.h"
|
||||
#include "simdjson/simdjson.h"
|
||||
|
||||
namespace simdjson {
|
||||
|
||||
architecture find_best_supported_implementation() {
|
||||
constexpr uint32_t haswell_flags = SIMDExtensions::AVX2 | SIMDExtensions::PCLMULQDQ
|
||||
| SIMDExtensions::BMI1 | SIMDExtensions::BMI2;
|
||||
constexpr uint32_t westmere_flags = SIMDExtensions::SSE42 | SIMDExtensions::PCLMULQDQ;
|
||||
Architecture find_best_supported_implementation() {
|
||||
constexpr uint32_t haswell_flags =
|
||||
instruction_set::AVX2 | instruction_set::PCLMULQDQ |
|
||||
instruction_set::BMI1 | instruction_set::BMI2;
|
||||
constexpr uint32_t westmere_flags =
|
||||
instruction_set::SSE42 | instruction_set::PCLMULQDQ;
|
||||
|
||||
uint32_t supports = detect_supported_architectures();
|
||||
// Order from best to worst (within architecture)
|
||||
if ((haswell_flags & supports) == haswell_flags) return architecture::haswell;
|
||||
if ((westmere_flags & supports) == westmere_flags) return architecture::westmere;
|
||||
if (SIMDExtensions::NEON) return architecture::arm64;
|
||||
if ((haswell_flags & supports) == haswell_flags)
|
||||
return Architecture::HASWELL;
|
||||
if ((westmere_flags & supports) == westmere_flags)
|
||||
return Architecture::WESTMERE;
|
||||
if (instruction_set::NEON)
|
||||
return Architecture::ARM64;
|
||||
|
||||
return architecture::none;
|
||||
return Architecture::NONE;
|
||||
}
|
||||
|
||||
// Responsible for selecting the best json_parse implementation
|
||||
int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded) {
|
||||
architecture best_implementation = find_best_supported_implementation();
|
||||
int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj,
|
||||
bool realloc_if_needed) {
|
||||
Architecture best_implementation = find_best_supported_implementation();
|
||||
// Selecting the best implementation
|
||||
switch (best_implementation) {
|
||||
#ifdef IS_X86_64
|
||||
case architecture::haswell:
|
||||
json_parse_ptr = &json_parse_implementation<architecture::haswell>;
|
||||
case Architecture::HASWELL:
|
||||
json_parse_ptr = &json_parse_implementation<Architecture::HASWELL>;
|
||||
break;
|
||||
case architecture::westmere:
|
||||
json_parse_ptr = &json_parse_implementation<architecture::westmere>;
|
||||
case Architecture::WESTMERE:
|
||||
json_parse_ptr = &json_parse_implementation<Architecture::WESTMERE>;
|
||||
break;
|
||||
#endif
|
||||
#ifdef IS_ARM64
|
||||
case architecture::arm64:
|
||||
json_parse_ptr = &json_parse_implementation<architecture::arm64>;
|
||||
case Architecture::ARM64:
|
||||
json_parse_ptr = &json_parse_implementation<Architecture::ARM64>;
|
||||
break;
|
||||
#endif
|
||||
default :
|
||||
default:
|
||||
std::cerr << "The processor is not supported by simdjson." << std::endl;
|
||||
return simdjson::UNEXPECTED_ERROR;
|
||||
}
|
||||
|
||||
return json_parse_ptr(buf, len, pj, reallocifneeded);
|
||||
return json_parse_ptr(buf, len, pj, realloc_if_needed);
|
||||
}
|
||||
|
||||
json_parse_functype *json_parse_ptr = &json_parse_dispatch;
|
||||
|
||||
WARN_UNUSED
|
||||
ParsedJson build_parsed_json(const uint8_t *buf, size_t len, bool reallocifneeded) {
|
||||
ParsedJson build_parsed_json(const uint8_t *buf, size_t len,
|
||||
bool realloc_if_needed) {
|
||||
ParsedJson pj;
|
||||
bool ok = pj.allocateCapacity(len);
|
||||
if(ok) {
|
||||
json_parse(buf, len, pj, reallocifneeded);
|
||||
bool ok = pj.allocate_capacity(len);
|
||||
if (ok) {
|
||||
json_parse(buf, len, pj, realloc_if_needed);
|
||||
} else {
|
||||
std::cerr << "failure during memory allocation " << std::endl;
|
||||
}
|
||||
return pj;
|
||||
}
|
||||
}
|
||||
} // namespace simdjson
|
||||
|
|
|
@ -1,324 +1,323 @@
|
|||
#include "simdjson/parsedjson.h"
|
||||
|
||||
namespace simdjson {
|
||||
ParsedJson::ParsedJson() :
|
||||
structural_indexes(nullptr), tape(nullptr), containing_scope_offset(nullptr),
|
||||
ret_address(nullptr), string_buf(nullptr), current_string_buf_loc(nullptr) {}
|
||||
ParsedJson::ParsedJson()
|
||||
: structural_indexes(nullptr), tape(nullptr),
|
||||
containing_scope_offset(nullptr), ret_address(nullptr),
|
||||
string_buf(nullptr), current_string_buf_loc(nullptr) {}
|
||||
|
||||
ParsedJson::~ParsedJson() {
|
||||
deallocate();
|
||||
ParsedJson::~ParsedJson() { deallocate(); }
|
||||
|
||||
ParsedJson::ParsedJson(ParsedJson &&p)
|
||||
: byte_capacity(p.byte_capacity), depth_capacity(p.depth_capacity),
|
||||
tape_capacity(p.tape_capacity), string_capacity(p.string_capacity),
|
||||
current_loc(p.current_loc), n_structural_indexes(p.n_structural_indexes),
|
||||
structural_indexes(p.structural_indexes), tape(p.tape),
|
||||
containing_scope_offset(p.containing_scope_offset),
|
||||
ret_address(p.ret_address), string_buf(p.string_buf),
|
||||
current_string_buf_loc(p.current_string_buf_loc), valid(p.valid) {
|
||||
p.structural_indexes = nullptr;
|
||||
p.tape = nullptr;
|
||||
p.containing_scope_offset = nullptr;
|
||||
p.ret_address = nullptr;
|
||||
p.string_buf = nullptr;
|
||||
p.current_string_buf_loc = nullptr;
|
||||
}
|
||||
|
||||
ParsedJson::ParsedJson(ParsedJson && p)
|
||||
: bytecapacity(p.bytecapacity),
|
||||
depthcapacity(p.depthcapacity),
|
||||
tapecapacity(p.tapecapacity),
|
||||
stringcapacity(p.stringcapacity),
|
||||
current_loc(p.current_loc),
|
||||
n_structural_indexes(p.n_structural_indexes),
|
||||
structural_indexes(p.structural_indexes),
|
||||
tape(p.tape),
|
||||
containing_scope_offset(p.containing_scope_offset),
|
||||
ret_address(p.ret_address),
|
||||
string_buf(p.string_buf),
|
||||
current_string_buf_loc(p.current_string_buf_loc),
|
||||
isvalid(p.isvalid) {
|
||||
p.structural_indexes=nullptr;
|
||||
p.tape=nullptr;
|
||||
p.containing_scope_offset=nullptr;
|
||||
p.ret_address=nullptr;
|
||||
p.string_buf=nullptr;
|
||||
p.current_string_buf_loc=nullptr;
|
||||
}
|
||||
|
||||
|
||||
|
||||
WARN_UNUSED
|
||||
bool ParsedJson::allocateCapacity(size_t len, size_t maxdepth) {
|
||||
if (maxdepth <= 0) {
|
||||
maxdepth = 1; // don't let the user allocate nothing
|
||||
}
|
||||
if (len <= 0) {
|
||||
len = 64; // allocating 0 bytes is wasteful.
|
||||
}
|
||||
if(len > SIMDJSON_MAXSIZE_BYTES) {
|
||||
return false;
|
||||
}
|
||||
if ((len <= bytecapacity) && (depthcapacity < maxdepth)) {
|
||||
return true;
|
||||
}
|
||||
deallocate();
|
||||
isvalid = false;
|
||||
bytecapacity = 0; // will only set it to len after allocations are a success
|
||||
n_structural_indexes = 0;
|
||||
uint32_t max_structures = ROUNDUP_N(len, 64) + 2 + 7;
|
||||
structural_indexes = new (std::nothrow) uint32_t[max_structures];
|
||||
// a pathological input like "[[[[..." would generate len tape elements, so need a capacity of len + 1
|
||||
size_t localtapecapacity = ROUNDUP_N(len + 1, 64);
|
||||
// a document with only zero-length strings... could have len/3 string
|
||||
// and we would need len/3 * 5 bytes on the string buffer
|
||||
size_t localstringcapacity = ROUNDUP_N(5 * len / 3 + 32, 64);
|
||||
string_buf = new (std::nothrow) uint8_t[localstringcapacity];
|
||||
tape = new (std::nothrow) uint64_t[localtapecapacity];
|
||||
containing_scope_offset = new (std::nothrow) uint32_t[maxdepth];
|
||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||
ret_address = new (std::nothrow) void *[maxdepth];
|
||||
#else
|
||||
ret_address = new (std::nothrow) char[maxdepth];
|
||||
#endif
|
||||
if ((string_buf == nullptr) || (tape == nullptr) ||
|
||||
(containing_scope_offset == nullptr) || (ret_address == nullptr) || (structural_indexes == nullptr)) {
|
||||
std::cerr << "Could not allocate memory" << std::endl;
|
||||
delete[] ret_address;
|
||||
delete[] containing_scope_offset;
|
||||
delete[] tape;
|
||||
delete[] string_buf;
|
||||
delete[] structural_indexes;
|
||||
|
||||
return false;
|
||||
}
|
||||
/*
|
||||
// We do not need to initialize this content for parsing, though we could
|
||||
// need to initialize it for safety.
|
||||
memset(string_buf, 0 , localstringcapacity);
|
||||
memset(structural_indexes, 0, max_structures * sizeof(uint32_t));
|
||||
memset(tape, 0, localtapecapacity * sizeof(uint64_t));
|
||||
*/
|
||||
bytecapacity = len;
|
||||
depthcapacity = maxdepth;
|
||||
tapecapacity = localtapecapacity;
|
||||
stringcapacity = localstringcapacity;
|
||||
bool ParsedJson::allocate_capacity(size_t len, size_t max_depth) {
|
||||
if (max_depth <= 0) {
|
||||
max_depth = 1; // don't let the user allocate nothing
|
||||
}
|
||||
if (len <= 0) {
|
||||
len = 64; // allocating 0 bytes is wasteful.
|
||||
}
|
||||
if (len > SIMDJSON_MAXSIZE_BYTES) {
|
||||
return false;
|
||||
}
|
||||
if ((len <= byte_capacity) && (depth_capacity < max_depth)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ParsedJson::isValid() const {
|
||||
return isvalid;
|
||||
}
|
||||
|
||||
int ParsedJson::getErrorCode() const {
|
||||
return errorcode;
|
||||
}
|
||||
|
||||
std::string ParsedJson::getErrorMsg() const {
|
||||
return errorMsg(errorcode);
|
||||
}
|
||||
|
||||
void ParsedJson::deallocate() {
|
||||
bytecapacity = 0;
|
||||
depthcapacity = 0;
|
||||
tapecapacity = 0;
|
||||
stringcapacity = 0;
|
||||
}
|
||||
deallocate();
|
||||
valid = false;
|
||||
byte_capacity = 0; // will only set it to len after allocations are a success
|
||||
n_structural_indexes = 0;
|
||||
uint32_t max_structures = ROUNDUP_N(len, 64) + 2 + 7;
|
||||
structural_indexes = new (std::nothrow) uint32_t[max_structures];
|
||||
// a pathological input like "[[[[..." would generate len tape elements, so
|
||||
// need a capacity of len + 1
|
||||
size_t local_tape_capacity = ROUNDUP_N(len + 1, 64);
|
||||
// a document with only zero-length strings... could have len/3 strings
|
||||
// and we would need len/3 * 5 bytes on the string buffer
|
||||
size_t local_string_capacity = ROUNDUP_N(5 * len / 3 + 32, 64);
|
||||
string_buf = new (std::nothrow) uint8_t[local_string_capacity];
|
||||
tape = new (std::nothrow) uint64_t[local_tape_capacity];
|
||||
containing_scope_offset = new (std::nothrow) uint32_t[max_depth];
|
||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||
ret_address = new (std::nothrow) void *[max_depth];
|
||||
#else
|
||||
ret_address = new (std::nothrow) char[max_depth];
|
||||
#endif
|
||||
if ((string_buf == nullptr) || (tape == nullptr) ||
|
||||
(containing_scope_offset == nullptr) || (ret_address == nullptr) ||
|
||||
(structural_indexes == nullptr)) {
|
||||
std::cerr << "Could not allocate memory" << std::endl;
|
||||
delete[] ret_address;
|
||||
delete[] containing_scope_offset;
|
||||
delete[] tape;
|
||||
delete[] string_buf;
|
||||
delete[] structural_indexes;
|
||||
isvalid = false;
|
||||
|
||||
return false;
|
||||
}
|
||||
/*
|
||||
// We do not need to initialize this content for parsing, though we could
|
||||
// need to initialize it for safety.
|
||||
memset(string_buf, 0 , local_string_capacity);
|
||||
memset(structural_indexes, 0, max_structures * sizeof(uint32_t));
|
||||
memset(tape, 0, local_tape_capacity * sizeof(uint64_t));
|
||||
*/
|
||||
byte_capacity = len;
|
||||
depth_capacity = max_depth;
|
||||
tape_capacity = local_tape_capacity;
|
||||
string_capacity = local_string_capacity;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ParsedJson::is_valid() const { return valid; }
|
||||
|
||||
int ParsedJson::get_error_code() const { return error_code; }
|
||||
|
||||
std::string ParsedJson::get_error_message() const {
|
||||
return error_message(error_code);
|
||||
}
|
||||
|
||||
void ParsedJson::deallocate() {
|
||||
byte_capacity = 0;
|
||||
depth_capacity = 0;
|
||||
tape_capacity = 0;
|
||||
string_capacity = 0;
|
||||
delete[] ret_address;
|
||||
delete[] containing_scope_offset;
|
||||
delete[] tape;
|
||||
delete[] string_buf;
|
||||
delete[] structural_indexes;
|
||||
valid = false;
|
||||
}
|
||||
|
||||
void ParsedJson::init() {
|
||||
current_string_buf_loc = string_buf;
|
||||
current_loc = 0;
|
||||
isvalid = false;
|
||||
current_string_buf_loc = string_buf;
|
||||
current_loc = 0;
|
||||
valid = false;
|
||||
}
|
||||
|
||||
WARN_UNUSED
|
||||
bool ParsedJson::printjson(std::ostream &os) {
|
||||
if(!isvalid) {
|
||||
return false;
|
||||
}
|
||||
uint32_t string_length;
|
||||
size_t tapeidx = 0;
|
||||
uint64_t tape_val = tape[tapeidx];
|
||||
uint8_t type = (tape_val >> 56);
|
||||
size_t howmany = 0;
|
||||
if (type == 'r') {
|
||||
howmany = tape_val & JSONVALUEMASK;
|
||||
} else {
|
||||
fprintf(stderr, "Error: no starting root node?");
|
||||
return false;
|
||||
}
|
||||
if (howmany > tapecapacity) {
|
||||
fprintf(stderr,
|
||||
"We may be exceeding the tape capacity. Is this a valid document?\n");
|
||||
return false;
|
||||
}
|
||||
tapeidx++;
|
||||
bool *inobject = new bool[depthcapacity];
|
||||
auto *inobjectidx = new size_t[depthcapacity];
|
||||
int depth = 1; // only root at level 0
|
||||
inobjectidx[depth] = 0;
|
||||
inobject[depth] = false;
|
||||
for (; tapeidx < howmany; tapeidx++) {
|
||||
tape_val = tape[tapeidx];
|
||||
uint64_t payload = tape_val & JSONVALUEMASK;
|
||||
type = (tape_val >> 56);
|
||||
if (!inobject[depth]) {
|
||||
if ((inobjectidx[depth] > 0) && (type != ']')) {
|
||||
os << ",";
|
||||
}
|
||||
inobjectidx[depth]++;
|
||||
} else { // if (inobject) {
|
||||
if ((inobjectidx[depth] > 0) && ((inobjectidx[depth] & 1) == 0) &&
|
||||
(type != '}')) {
|
||||
os << ",";
|
||||
}
|
||||
if (((inobjectidx[depth] & 1) == 1)) {
|
||||
os << ":";
|
||||
}
|
||||
inobjectidx[depth]++;
|
||||
bool ParsedJson::print_json(std::ostream &os) {
|
||||
if (!valid) {
|
||||
return false;
|
||||
}
|
||||
uint32_t string_length;
|
||||
size_t tape_idx = 0;
|
||||
uint64_t tape_val = tape[tape_idx];
|
||||
uint8_t type = (tape_val >> 56);
|
||||
size_t how_many = 0;
|
||||
if (type == 'r') {
|
||||
how_many = tape_val & JSON_VALUE_MASK;
|
||||
} else {
|
||||
fprintf(stderr, "Error: no starting root node?");
|
||||
return false;
|
||||
}
|
||||
if (how_many > tape_capacity) {
|
||||
fprintf(
|
||||
stderr,
|
||||
"We may be exceeding the tape capacity. Is this a valid document?\n");
|
||||
return false;
|
||||
}
|
||||
tape_idx++;
|
||||
bool *in_object = new bool[depth_capacity];
|
||||
auto *in_object_idx = new size_t[depth_capacity];
|
||||
int depth = 1; // only root at level 0
|
||||
in_object_idx[depth] = 0;
|
||||
in_object[depth] = false;
|
||||
for (; tape_idx < how_many; tape_idx++) {
|
||||
tape_val = tape[tape_idx];
|
||||
uint64_t payload = tape_val & JSON_VALUE_MASK;
|
||||
type = (tape_val >> 56);
|
||||
if (!in_object[depth]) {
|
||||
if ((in_object_idx[depth] > 0) && (type != ']')) {
|
||||
os << ",";
|
||||
}
|
||||
switch (type) {
|
||||
case '"': // we have a string
|
||||
os << '"';
|
||||
memcpy(&string_length,string_buf + payload, sizeof(uint32_t));
|
||||
print_with_escapes((const unsigned char *)(string_buf + payload + sizeof(uint32_t)), string_length);
|
||||
os << '"';
|
||||
break;
|
||||
case 'l': // we have a long int
|
||||
if (tapeidx + 1 >= howmany) {
|
||||
delete[] inobject;
|
||||
delete[] inobjectidx;
|
||||
return false;
|
||||
}
|
||||
os << static_cast<int64_t>(tape[++tapeidx]);
|
||||
break;
|
||||
case 'd': // we have a double
|
||||
if (tapeidx + 1 >= howmany){
|
||||
delete[] inobject;
|
||||
delete[] inobjectidx;
|
||||
return false;
|
||||
}
|
||||
double answer;
|
||||
memcpy(&answer, &tape[++tapeidx], sizeof(answer));
|
||||
os << answer;
|
||||
break;
|
||||
case 'n': // we have a null
|
||||
os << "null";
|
||||
break;
|
||||
case 't': // we have a true
|
||||
os << "true";
|
||||
break;
|
||||
case 'f': // we have a false
|
||||
os << "false";
|
||||
break;
|
||||
case '{': // we have an object
|
||||
os << '{';
|
||||
depth++;
|
||||
inobject[depth] = true;
|
||||
inobjectidx[depth] = 0;
|
||||
break;
|
||||
case '}': // we end an object
|
||||
depth--;
|
||||
os << '}';
|
||||
break;
|
||||
case '[': // we start an array
|
||||
os << '[';
|
||||
depth++;
|
||||
inobject[depth] = false;
|
||||
inobjectidx[depth] = 0;
|
||||
break;
|
||||
case ']': // we end an array
|
||||
depth--;
|
||||
os << ']';
|
||||
break;
|
||||
case 'r': // we start and end with the root node
|
||||
fprintf(stderr, "should we be hitting the root node?\n");
|
||||
delete[] inobject;
|
||||
delete[] inobjectidx;
|
||||
return false;
|
||||
default:
|
||||
fprintf(stderr, "bug %c\n", type);
|
||||
delete[] inobject;
|
||||
delete[] inobjectidx;
|
||||
in_object_idx[depth]++;
|
||||
} else { // if (in_object) {
|
||||
if ((in_object_idx[depth] > 0) && ((in_object_idx[depth] & 1) == 0) &&
|
||||
(type != '}')) {
|
||||
os << ",";
|
||||
}
|
||||
if (((in_object_idx[depth] & 1) == 1)) {
|
||||
os << ":";
|
||||
}
|
||||
in_object_idx[depth]++;
|
||||
}
|
||||
switch (type) {
|
||||
case '"': // we have a string
|
||||
os << '"';
|
||||
memcpy(&string_length, string_buf + payload, sizeof(uint32_t));
|
||||
print_with_escapes(
|
||||
(const unsigned char *)(string_buf + payload + sizeof(uint32_t)),
|
||||
string_length);
|
||||
os << '"';
|
||||
break;
|
||||
case 'l': // we have a long int
|
||||
if (tape_idx + 1 >= how_many) {
|
||||
delete[] in_object;
|
||||
delete[] in_object_idx;
|
||||
return false;
|
||||
}
|
||||
os << static_cast<int64_t>(tape[++tape_idx]);
|
||||
break;
|
||||
case 'd': // we have a double
|
||||
if (tape_idx + 1 >= how_many) {
|
||||
delete[] in_object;
|
||||
delete[] in_object_idx;
|
||||
return false;
|
||||
}
|
||||
double answer;
|
||||
memcpy(&answer, &tape[++tape_idx], sizeof(answer));
|
||||
os << answer;
|
||||
break;
|
||||
case 'n': // we have a null
|
||||
os << "null";
|
||||
break;
|
||||
case 't': // we have a true
|
||||
os << "true";
|
||||
break;
|
||||
case 'f': // we have a false
|
||||
os << "false";
|
||||
break;
|
||||
case '{': // we have an object
|
||||
os << '{';
|
||||
depth++;
|
||||
in_object[depth] = true;
|
||||
in_object_idx[depth] = 0;
|
||||
break;
|
||||
case '}': // we end an object
|
||||
depth--;
|
||||
os << '}';
|
||||
break;
|
||||
case '[': // we start an array
|
||||
os << '[';
|
||||
depth++;
|
||||
in_object[depth] = false;
|
||||
in_object_idx[depth] = 0;
|
||||
break;
|
||||
case ']': // we end an array
|
||||
depth--;
|
||||
os << ']';
|
||||
break;
|
||||
case 'r': // we start and end with the root node
|
||||
fprintf(stderr, "should we be hitting the root node?\n");
|
||||
delete[] in_object;
|
||||
delete[] in_object_idx;
|
||||
return false;
|
||||
default:
|
||||
fprintf(stderr, "bug %c\n", type);
|
||||
delete[] in_object;
|
||||
delete[] in_object_idx;
|
||||
return false;
|
||||
}
|
||||
delete[] inobject;
|
||||
delete[] inobjectidx;
|
||||
return true;
|
||||
}
|
||||
delete[] in_object;
|
||||
delete[] in_object_idx;
|
||||
return true;
|
||||
}
|
||||
|
||||
WARN_UNUSED
|
||||
bool ParsedJson::dump_raw_tape(std::ostream &os) {
|
||||
if(!isvalid) {
|
||||
return false;
|
||||
}
|
||||
uint32_t string_length;
|
||||
size_t tapeidx = 0;
|
||||
uint64_t tape_val = tape[tapeidx];
|
||||
uint8_t type = (tape_val >> 56);
|
||||
os << tapeidx << " : " << type;
|
||||
tapeidx++;
|
||||
size_t howmany = 0;
|
||||
if (type == 'r') {
|
||||
howmany = tape_val & JSONVALUEMASK;
|
||||
} else {
|
||||
fprintf(stderr, "Error: no starting root node?");
|
||||
return false;
|
||||
}
|
||||
os << "\t// pointing to " << howmany <<" (right after last node)\n";
|
||||
uint64_t payload;
|
||||
for (; tapeidx < howmany; tapeidx++) {
|
||||
os << tapeidx << " : ";
|
||||
tape_val = tape[tapeidx];
|
||||
payload = tape_val & JSONVALUEMASK;
|
||||
type = (tape_val >> 56);
|
||||
switch (type) {
|
||||
case '"': // we have a string
|
||||
os << "string \"";
|
||||
memcpy(&string_length,string_buf + payload, sizeof(uint32_t));
|
||||
print_with_escapes((const unsigned char *)(string_buf + payload + sizeof(uint32_t)), string_length);
|
||||
os << '"';
|
||||
os << '\n';
|
||||
break;
|
||||
case 'l': // we have a long int
|
||||
if (tapeidx + 1 >= howmany) {
|
||||
return false;
|
||||
}
|
||||
os << "integer " << static_cast<int64_t>(tape[++tapeidx]) << "\n";
|
||||
break;
|
||||
case 'd': // we have a double
|
||||
os << "float ";
|
||||
if (tapeidx + 1 >= howmany) {
|
||||
return false;
|
||||
}
|
||||
double answer;
|
||||
memcpy(&answer, &tape[++tapeidx], sizeof(answer));
|
||||
os << answer << '\n';
|
||||
break;
|
||||
case 'n': // we have a null
|
||||
os << "null\n";
|
||||
break;
|
||||
case 't': // we have a true
|
||||
os << "true\n";
|
||||
break;
|
||||
case 'f': // we have a false
|
||||
os << "false\n";
|
||||
break;
|
||||
case '{': // we have an object
|
||||
os << "{\t// pointing to next tape location " << payload << " (first node after the scope) \n";
|
||||
break;
|
||||
case '}': // we end an object
|
||||
os << "}\t// pointing to previous tape location " << payload << " (start of the scope) \n";
|
||||
break;
|
||||
case '[': // we start an array
|
||||
os << "[\t// pointing to next tape location " << payload << " (first node after the scope) \n";
|
||||
break;
|
||||
case ']': // we end an array
|
||||
os << "]\t// pointing to previous tape location " << payload << " (start of the scope) \n";
|
||||
break;
|
||||
case 'r': // we start and end with the root node
|
||||
printf("end of root\n");
|
||||
return false;
|
||||
default:
|
||||
if (!valid) {
|
||||
return false;
|
||||
}
|
||||
uint32_t string_length;
|
||||
size_t tape_idx = 0;
|
||||
uint64_t tape_val = tape[tape_idx];
|
||||
uint8_t type = (tape_val >> 56);
|
||||
os << tape_idx << " : " << type;
|
||||
tape_idx++;
|
||||
size_t how_many = 0;
|
||||
if (type == 'r') {
|
||||
how_many = tape_val & JSON_VALUE_MASK;
|
||||
} else {
|
||||
fprintf(stderr, "Error: no starting root node?");
|
||||
return false;
|
||||
}
|
||||
os << "\t// pointing to " << how_many << " (right after last node)\n";
|
||||
uint64_t payload;
|
||||
for (; tape_idx < how_many; tape_idx++) {
|
||||
os << tape_idx << " : ";
|
||||
tape_val = tape[tape_idx];
|
||||
payload = tape_val & JSON_VALUE_MASK;
|
||||
type = (tape_val >> 56);
|
||||
switch (type) {
|
||||
case '"': // we have a string
|
||||
os << "string \"";
|
||||
memcpy(&string_length, string_buf + payload, sizeof(uint32_t));
|
||||
print_with_escapes(
|
||||
(const unsigned char *)(string_buf + payload + sizeof(uint32_t)),
|
||||
string_length);
|
||||
os << '"';
|
||||
os << '\n';
|
||||
break;
|
||||
case 'l': // we have a long int
|
||||
if (tape_idx + 1 >= how_many) {
|
||||
return false;
|
||||
}
|
||||
os << "integer " << static_cast<int64_t>(tape[++tape_idx]) << "\n";
|
||||
break;
|
||||
case 'd': // we have a double
|
||||
os << "float ";
|
||||
if (tape_idx + 1 >= how_many) {
|
||||
return false;
|
||||
}
|
||||
double answer;
|
||||
memcpy(&answer, &tape[++tape_idx], sizeof(answer));
|
||||
os << answer << '\n';
|
||||
break;
|
||||
case 'n': // we have a null
|
||||
os << "null\n";
|
||||
break;
|
||||
case 't': // we have a true
|
||||
os << "true\n";
|
||||
break;
|
||||
case 'f': // we have a false
|
||||
os << "false\n";
|
||||
break;
|
||||
case '{': // we have an object
|
||||
os << "{\t// pointing to next tape location " << payload
|
||||
<< " (first node after the scope) \n";
|
||||
break;
|
||||
case '}': // we end an object
|
||||
os << "}\t// pointing to previous tape location " << payload
|
||||
<< " (start of the scope) \n";
|
||||
break;
|
||||
case '[': // we start an array
|
||||
os << "[\t// pointing to next tape location " << payload
|
||||
<< " (first node after the scope) \n";
|
||||
break;
|
||||
case ']': // we end an array
|
||||
os << "]\t// pointing to previous tape location " << payload
|
||||
<< " (start of the scope) \n";
|
||||
break;
|
||||
case 'r': // we start and end with the root node
|
||||
printf("end of root\n");
|
||||
return false;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
tape_val = tape[tapeidx];
|
||||
payload = tape_val & JSONVALUEMASK;
|
||||
type = (tape_val >> 56);
|
||||
os << tapeidx << " : "<< type <<"\t// pointing to " << payload <<" (start root)\n";
|
||||
return true;
|
||||
}
|
||||
}
|
||||
tape_val = tape[tape_idx];
|
||||
payload = tape_val & JSON_VALUE_MASK;
|
||||
type = (tape_val >> 56);
|
||||
os << tape_idx << " : " << type << "\t// pointing to " << payload
|
||||
<< " (start root)\n";
|
||||
return true;
|
||||
}
|
||||
} // namespace simdjson
|
||||
|
|
|
@ -1,264 +1,269 @@
|
|||
#include "simdjson/parsedjson.h"
|
||||
#include "simdjson/common_defs.h"
|
||||
#include "simdjson/parsedjson.h"
|
||||
#include <iterator>
|
||||
|
||||
namespace simdjson {
|
||||
/**
 * Builds an iterator over the tape of an already-parsed document.
 *
 * The first tape word must be a root word: its top byte (bits 56..63) is the
 * type tag 'r' and the bits selected by JSONVALUEMASK give the tape length.
 * When the tape holds at least one more word, the iterator steps onto it and
 * records it as the scope at depth 1; otherwise it stays on the root word.
 *
 * @param pj_ the parsed document to walk; it is held by reference.
 * @throws InvalidJSON when pj_ is not valid, or when the tape does not start
 *         with a root word.
 */
ParsedJson::iterator::iterator(ParsedJson &pj_)
    : pj(pj_), depth(0), location(0), tape_length(0), depthindex(nullptr) {
  if (!pj.isValid()) {
    throw InvalidJSON();
  }
  // new[] reports failure by throwing, so no null check is needed.
  depthindex = new scopeindex_t[pj.depthcapacity];
  depthindex[0].start_of_scope = location;
  current_val = pj.tape[location++];
  current_type = (current_val >> 56);
  depthindex[0].scope_type = current_type;
  if (current_type != 'r') {
    // should never happen
    // The destructor is not run for an object whose constructor throws, so
    // the scope array must be released here or it would leak.
    delete[] depthindex;
    depthindex = nullptr;
    throw InvalidJSON();
  }
  tape_length = current_val & JSONVALUEMASK;
  if (location < tape_length) {
    // Step onto the first word after the root and open a scope for it.
    current_val = pj.tape[location];
    current_type = (current_val >> 56);
    depth++;
    depthindex[depth].start_of_scope = location;
    depthindex[depth].scope_type = current_type;
  }
}
|
||||
|
||||
// Destructor: releases the scope array owned through depthindex.
// depthindex can be nullptr here (the move constructor resets the source's
// pointer); delete[] on a null pointer does nothing.
ParsedJson::iterator::~iterator() {
|
||||
delete[] depthindex;
|
||||
}
|
||||
|
||||
// Copy constructor. The copy refers to the same ParsedJson as `o`, starts at
// the same tape position, and gets its own duplicate of the scope array so
// that each iterator frees only the array it allocated.
ParsedJson::iterator::iterator(const iterator &o)
    : pj(o.pj), depth(o.depth), location(o.location),
      tape_length(o.tape_length), current_type(o.current_type),
      current_val(o.current_val), depthindex(nullptr) {
  // new[] reports failure by throwing, so there is no null check.
  const auto scope_count = pj.depthcapacity;
  depthindex = new scopeindex_t[scope_count];
  memcpy(depthindex, o.depthindex, scope_count * sizeof(scopeindex_t));
}
|
||||
|
||||
// Move constructor: copies the cursor fields from `o` and takes over its
// scope array instead of allocating a new one. `o.depthindex` is reset to
// nullptr so the array is not freed a second time when `o` is destroyed.
ParsedJson::iterator::iterator(iterator &&o):
|
||||
pj(o.pj), depth(o.depth), location(o.location),
|
||||
tape_length(o.tape_length), current_type(o.current_type),
|
||||
current_val(o.current_val), depthindex(o.depthindex) {
|
||||
o.depthindex = nullptr;// we take ownership
|
||||
}
|
||||
|
||||
bool ParsedJson::iterator::print(std::ostream &os, bool escape_strings) const {
|
||||
if(!isOk()) {
|
||||
return false;
|
||||
ParsedJson::Iterator::Iterator(ParsedJson &pj_)
|
||||
: pj(pj_), depth(0), location(0), tape_length(0), depth_index(nullptr) {
|
||||
if (!pj.is_valid()) {
|
||||
throw InvalidJSON();
|
||||
}
|
||||
depth_index = new scopeindex_t[pj.depth_capacity];
|
||||
// memory allocation would throw
|
||||
// if(depth_index == nullptr) {
|
||||
// return;
|
||||
//}
|
||||
depth_index[0].start_of_scope = location;
|
||||
current_val = pj.tape[location++];
|
||||
current_type = (current_val >> 56);
|
||||
depth_index[0].scope_type = current_type;
|
||||
if (current_type == 'r') {
|
||||
tape_length = current_val & JSON_VALUE_MASK;
|
||||
if (location < tape_length) {
|
||||
current_val = pj.tape[location];
|
||||
current_type = (current_val >> 56);
|
||||
depth++;
|
||||
depth_index[depth].start_of_scope = location;
|
||||
depth_index[depth].scope_type = current_type;
|
||||
}
|
||||
switch (current_type) {
|
||||
case '"': // we have a string
|
||||
} else {
|
||||
// should never happen
|
||||
throw InvalidJSON();
|
||||
}
|
||||
}
|
||||
|
||||
// Destructor: releases the scope array owned through depth_index.
// depth_index can be nullptr here (the move constructor resets the source's
// pointer); delete[] on a null pointer does nothing.
ParsedJson::Iterator::~Iterator() { delete[] depth_index; }
|
||||
|
||||
// Copy constructor. The copy refers to the same ParsedJson as `o`, starts at
// the same tape position, and gets its own duplicate of the scope array so
// that each iterator frees only the array it allocated.
ParsedJson::Iterator::Iterator(const Iterator &o)
    : pj(o.pj), depth(o.depth), location(o.location),
      tape_length(o.tape_length), current_type(o.current_type),
      current_val(o.current_val), depth_index(nullptr) {
  // new[] reports failure by throwing, so there is no null check.
  const auto scope_count = pj.depth_capacity;
  depth_index = new scopeindex_t[scope_count];
  memcpy(depth_index, o.depth_index, scope_count * sizeof(scopeindex_t));
}
|
||||
|
||||
// Move constructor: copies the cursor fields from `o` and takes over its
// scope array instead of allocating a new one. `o.depth_index` is reset to
// nullptr so the array is not freed a second time when `o` is destroyed.
ParsedJson::Iterator::Iterator(Iterator &&o)
|
||||
: pj(o.pj), depth(o.depth), location(o.location),
|
||||
tape_length(o.tape_length), current_type(o.current_type),
|
||||
current_val(o.current_val), depth_index(o.depth_index) {
|
||||
o.depth_index = nullptr; // we take ownership
|
||||
}
|
||||
|
||||
bool ParsedJson::Iterator::print(std::ostream &os, bool escape_strings) const {
|
||||
if (!is_ok()) {
|
||||
return false;
|
||||
}
|
||||
switch (current_type) {
|
||||
case '"': // we have a string
|
||||
os << '"';
|
||||
if(escape_strings) {
|
||||
print_with_escapes(get_string(), os, get_string_length());
|
||||
if (escape_strings) {
|
||||
print_with_escapes(get_string(), os, get_string_length());
|
||||
} else {
|
||||
// was: os << get_string();, but given that we can include null chars, we have to do something crazier:
|
||||
std::copy(get_string(), get_string() + get_string_length(), std::ostream_iterator<char>(os));
|
||||
// was: os << get_string();, but given that we can include null chars, we
|
||||
// have to do something crazier:
|
||||
std::copy(get_string(), get_string() + get_string_length(),
|
||||
std::ostream_iterator<char>(os));
|
||||
}
|
||||
os << '"';
|
||||
break;
|
||||
case 'l': // we have a long int
|
||||
case 'l': // we have a long int
|
||||
os << get_integer();
|
||||
break;
|
||||
case 'd':
|
||||
case 'd':
|
||||
os << get_double();
|
||||
break;
|
||||
case 'n': // we have a null
|
||||
case 'n': // we have a null
|
||||
os << "null";
|
||||
break;
|
||||
case 't': // we have a true
|
||||
case 't': // we have a true
|
||||
os << "true";
|
||||
break;
|
||||
case 'f': // we have a false
|
||||
case 'f': // we have a false
|
||||
os << "false";
|
||||
break;
|
||||
case '{': // we have an object
|
||||
case '}': // we end an object
|
||||
case '[': // we start an array
|
||||
case ']': // we end an array
|
||||
case '{': // we have an object
|
||||
case '}': // we end an object
|
||||
case '[': // we start an array
|
||||
case ']': // we end an array
|
||||
os << static_cast<char>(current_type);
|
||||
break;
|
||||
default:
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ParsedJson::iterator::move_to(const char * pointer, uint32_t length) {
|
||||
char* new_pointer = nullptr;
|
||||
if (pointer[0] == '#') {
|
||||
// Converting fragment representation to string representation
|
||||
new_pointer = new char[length];
|
||||
uint32_t new_length = 0;
|
||||
for (uint32_t i = 1; i < length; i++) {
|
||||
if (pointer[i] == '%' && pointer[i+1] == 'x') {
|
||||
try {
|
||||
int fragment = std::stoi(std::string(&pointer[i+2], 2), nullptr, 16);
|
||||
if (fragment == '\\' || fragment == '"' || (fragment <= 0x1F)) {
|
||||
// escaping the character
|
||||
new_pointer[new_length] = '\\';
|
||||
new_length++;
|
||||
}
|
||||
new_pointer[new_length] = fragment;
|
||||
i += 3;
|
||||
}
|
||||
catch(std::invalid_argument& e) {
|
||||
delete[] new_pointer;
|
||||
return false; // the fragment is invalid
|
||||
bool ParsedJson::Iterator::move_to(const char *pointer, uint32_t length) {
|
||||
char *new_pointer = nullptr;
|
||||
if (pointer[0] == '#') {
|
||||
// Converting fragment representation to string representation
|
||||
new_pointer = new char[length];
|
||||
uint32_t new_length = 0;
|
||||
for (uint32_t i = 1; i < length; i++) {
|
||||
if (pointer[i] == '%' && pointer[i + 1] == 'x') {
|
||||
try {
|
||||
int fragment =
|
||||
std::stoi(std::string(&pointer[i + 2], 2), nullptr, 16);
|
||||
if (fragment == '\\' || fragment == '"' || (fragment <= 0x1F)) {
|
||||
// escaping the character
|
||||
new_pointer[new_length] = '\\';
|
||||
new_length++;
|
||||
}
|
||||
new_pointer[new_length] = fragment;
|
||||
i += 3;
|
||||
} catch (std::invalid_argument &e) {
|
||||
delete[] new_pointer;
|
||||
return false; // the fragment is invalid
|
||||
}
|
||||
else {
|
||||
new_pointer[new_length] = pointer[i];
|
||||
}
|
||||
new_length++;
|
||||
} else {
|
||||
new_pointer[new_length] = pointer[i];
|
||||
}
|
||||
length = new_length;
|
||||
pointer = new_pointer;
|
||||
new_length++;
|
||||
}
|
||||
|
||||
// saving the current state
|
||||
size_t depth_s = depth;
|
||||
size_t location_s = location;
|
||||
uint8_t current_type_s = current_type;
|
||||
uint64_t current_val_s = current_val;
|
||||
scopeindex_t *depthindex_s = depthindex;
|
||||
|
||||
rewind(); // The json pointer is used from the root of the document.
|
||||
length = new_length;
|
||||
pointer = new_pointer;
|
||||
}
|
||||
|
||||
bool found = relative_move_to(pointer, length);
|
||||
delete[] new_pointer;
|
||||
// saving the current state
|
||||
size_t depth_s = depth;
|
||||
size_t location_s = location;
|
||||
uint8_t current_type_s = current_type;
|
||||
uint64_t current_val_s = current_val;
|
||||
scopeindex_t *depth_index_s = depth_index;
|
||||
|
||||
if (!found) {
|
||||
// since the pointer has found nothing, we get back to the original position.
|
||||
depth = depth_s;
|
||||
location = location_s;
|
||||
current_type = current_type_s;
|
||||
current_val = current_val_s;
|
||||
depthindex = depthindex_s;
|
||||
}
|
||||
rewind(); // The json pointer is used from the root of the document.
|
||||
|
||||
return found;
|
||||
bool found = relative_move_to(pointer, length);
|
||||
delete[] new_pointer;
|
||||
|
||||
if (!found) {
|
||||
// since the pointer has found nothing, we get back to the original
|
||||
// position.
|
||||
depth = depth_s;
|
||||
location = location_s;
|
||||
current_type = current_type_s;
|
||||
current_val = current_val_s;
|
||||
depth_index = depth_index_s;
|
||||
}
|
||||
|
||||
return found;
|
||||
}
|
||||
|
||||
bool ParsedJson::iterator::relative_move_to(const char * pointer, uint32_t length) {
|
||||
if (length == 0) {
|
||||
// returns the whole document
|
||||
return true;
|
||||
}
|
||||
bool ParsedJson::Iterator::relative_move_to(const char *pointer,
|
||||
uint32_t length) {
|
||||
if (length == 0) {
|
||||
// returns the whole document
|
||||
return true;
|
||||
}
|
||||
|
||||
if (pointer[0] != '/') {
|
||||
// '/' must be the first character
|
||||
if (pointer[0] != '/') {
|
||||
// '/' must be the first character
|
||||
return false;
|
||||
}
|
||||
|
||||
// finding the key in an object or the index in an array
|
||||
std::string key_or_index;
|
||||
uint32_t offset = 1;
|
||||
|
||||
// checking for the "-" case
|
||||
if (is_array() && pointer[1] == '-') {
|
||||
if (length != 2) {
|
||||
// the pointer must be exactly "/-"
|
||||
// there can't be anything more after '-' as an index
|
||||
return false;
|
||||
}
|
||||
key_or_index = '-';
|
||||
offset = length; // will skip the loop coming right after
|
||||
}
|
||||
|
||||
// finding the key in an object or the index in an array
|
||||
std::string key_or_index;
|
||||
uint32_t offset = 1;
|
||||
// We either transform the first reference token to a valid json key
|
||||
// or we make sure it is a valid index in an array.
|
||||
for (; offset < length; offset++) {
|
||||
if (pointer[offset] == '/') {
|
||||
// beginning of the next key or index
|
||||
break;
|
||||
}
|
||||
if (is_array() && (pointer[offset] < '0' || pointer[offset] > '9')) {
|
||||
// the index of an array must be an integer
|
||||
// we also make sure std::stoi won't discard whitespaces later
|
||||
return false;
|
||||
}
|
||||
if (pointer[offset] == '~') {
|
||||
// "~1" represents "/"
|
||||
if (pointer[offset + 1] == '1') {
|
||||
key_or_index += '/';
|
||||
offset++;
|
||||
continue;
|
||||
}
|
||||
// "~0" represents "~"
|
||||
if (pointer[offset + 1] == '0') {
|
||||
key_or_index += '~';
|
||||
offset++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (pointer[offset] == '\\') {
|
||||
if (pointer[offset + 1] == '\\' || pointer[offset + 1] == '"' ||
|
||||
(pointer[offset + 1] <= 0x1F)) {
|
||||
key_or_index += pointer[offset + 1];
|
||||
offset++;
|
||||
continue;
|
||||
}
|
||||
return false; // invalid escaped character
|
||||
}
|
||||
if (pointer[offset] == '\"') {
|
||||
// unescaped quote character. this is an invalid case.
|
||||
// lets do nothing and assume most pointers will be valid.
|
||||
// it won't find any corresponding json key anyway.
|
||||
// return false;
|
||||
}
|
||||
key_or_index += pointer[offset];
|
||||
}
|
||||
|
||||
// checking for the "-" case
|
||||
if (is_array() && pointer[1] == '-') {
|
||||
if (length != 2) {
|
||||
// the pointer must be exactly "/-"
|
||||
// there can't be anything more after '-' as an index
|
||||
bool found = false;
|
||||
if (is_object()) {
|
||||
if (move_to_key(key_or_index.c_str(), key_or_index.length())) {
|
||||
found = relative_move_to(pointer + offset, length - offset);
|
||||
}
|
||||
} else if (is_array()) {
|
||||
if (key_or_index == "-") { // handling "-" case first
|
||||
if (down()) {
|
||||
while (next())
|
||||
; // moving to the end of the array
|
||||
// moving to the nonexistent value right after...
|
||||
size_t npos;
|
||||
if ((current_type == '[') || (current_type == '{')) {
|
||||
// we need to jump
|
||||
npos = (current_val & JSON_VALUE_MASK);
|
||||
} else {
|
||||
npos =
|
||||
location + ((current_type == 'd' || current_type == 'l') ? 2 : 1);
|
||||
}
|
||||
location = npos;
|
||||
current_val = pj.tape[npos];
|
||||
current_type = (current_val >> 56);
|
||||
return true; // how could it fail ?
|
||||
}
|
||||
} else { // regular numeric index
|
||||
// The index can't have a leading '0'
|
||||
if (key_or_index[0] == '0' && key_or_index.length() > 1) {
|
||||
return false;
|
||||
}
|
||||
key_or_index = '-';
|
||||
offset = length; // will skip the loop coming right after
|
||||
}
|
||||
|
||||
// We either transform the first reference token to a valid json key
|
||||
// or we make sure it is a valid index in an array.
|
||||
for (; offset < length ; offset++) {
|
||||
if (pointer[offset] == '/') {
|
||||
// beginning of the next key or index
|
||||
break;
|
||||
}
|
||||
if (is_array() && (pointer[offset] < '0' || pointer[offset] > '9')) {
|
||||
// the index of an array must be an integer
|
||||
// we also make sure std::stoi won't discard whitespaces later
|
||||
// it cannot be empty
|
||||
if (key_or_index.length() == 0) {
|
||||
return false;
|
||||
}
|
||||
if (pointer[offset] == '~') {
|
||||
// "~1" represents "/"
|
||||
if (pointer[offset+1] == '1') {
|
||||
key_or_index += '/';
|
||||
offset++;
|
||||
continue;
|
||||
}
|
||||
// "~0" represents "~"
|
||||
if (pointer[offset+1] == '0') {
|
||||
key_or_index += '~';
|
||||
offset++;
|
||||
continue;
|
||||
}
|
||||
// we already checked the index contains only valid digits
|
||||
uint32_t index = std::stoi(key_or_index);
|
||||
if (move_to_index(index)) {
|
||||
found = relative_move_to(pointer + offset, length - offset);
|
||||
}
|
||||
if (pointer[offset] == '\\') {
|
||||
if (pointer[offset+1] == '\\' || pointer[offset+1] == '"' || (pointer[offset+1] <= 0x1F)) {
|
||||
key_or_index += pointer[offset+1];
|
||||
offset++;
|
||||
continue;
|
||||
}
|
||||
return false; // invalid escaped character
|
||||
}
|
||||
if (pointer[offset] == '\"') {
|
||||
// unescaped quote character. this is an invalid case.
|
||||
// lets do nothing and assume most pointers will be valid.
|
||||
// it won't find any corresponding json key anyway.
|
||||
// return false;
|
||||
}
|
||||
key_or_index += pointer[offset];
|
||||
}
|
||||
}
|
||||
|
||||
bool found = false;
|
||||
if (is_object()) {
|
||||
if (move_to_key(key_or_index.c_str(), key_or_index.length())) {
|
||||
found = relative_move_to(pointer+offset, length-offset);
|
||||
}
|
||||
}
|
||||
else if(is_array()) {
|
||||
if (key_or_index == "-") { // handling "-" case first
|
||||
if (down()) {
|
||||
while(next()); // moving to the end of the array
|
||||
// moving to the nonexistent value right after...
|
||||
size_t npos;
|
||||
if ((current_type == '[') || (current_type == '{')) {
|
||||
// we need to jump
|
||||
npos = ( current_val & JSONVALUEMASK);
|
||||
} else {
|
||||
npos = location + ((current_type == 'd' || current_type == 'l') ? 2 : 1);
|
||||
}
|
||||
location = npos;
|
||||
current_val = pj.tape[npos];
|
||||
current_type = (current_val >> 56);
|
||||
return true; // how could it fail ?
|
||||
}
|
||||
} else { // regular numeric index
|
||||
// The index can't have a leading '0'
|
||||
if (key_or_index[0] == '0' && key_or_index.length() > 1) {
|
||||
return false;
|
||||
}
|
||||
// it cannot be empty
|
||||
if (key_or_index.length() == 0) {
|
||||
return false;
|
||||
}
|
||||
// we already checked the index contains only valid digits
|
||||
uint32_t index = std::stoi(key_or_index);
|
||||
if (move_to_index(index)) {
|
||||
found = relative_move_to(pointer+offset, length-offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return found;
|
||||
}
|
||||
return found;
|
||||
}
|
||||
} // namespace simdjson
|
||||
|
|
|
@ -1,25 +1,30 @@
|
|||
#include <map>
|
||||
#include "simdjson/simdjson.h"
|
||||
#include <map>
|
||||
|
||||
namespace simdjson {
|
||||
const std::map<int, const std::string> errorStrings = {
|
||||
const std::map<int, const std::string> error_strings = {
|
||||
{SUCCESS, "No errors"},
|
||||
{CAPACITY, "This ParsedJson can't support a document that big"},
|
||||
{MEMALLOC, "Error allocating memory, we're most likely out of memory"},
|
||||
{TAPE_ERROR, "Something went wrong while writing to the tape"},
|
||||
{STRING_ERROR, "Problem while parsing a string"},
|
||||
{T_ATOM_ERROR, "Problem while parsing an atom starting with the letter 't'"},
|
||||
{F_ATOM_ERROR, "Problem while parsing an atom starting with the letter 'f'"},
|
||||
{N_ATOM_ERROR, "Problem while parsing an atom starting with the letter 'n'"},
|
||||
{T_ATOM_ERROR,
|
||||
"Problem while parsing an atom starting with the letter 't'"},
|
||||
{F_ATOM_ERROR,
|
||||
"Problem while parsing an atom starting with the letter 'f'"},
|
||||
{N_ATOM_ERROR,
|
||||
"Problem while parsing an atom starting with the letter 'n'"},
|
||||
{NUMBER_ERROR, "Problem while parsing a number"},
|
||||
{UTF8_ERROR, "The input is not valid UTF-8"},
|
||||
{UNITIALIZED, "Unitialized"},
|
||||
{EMPTY, "Empty"},
|
||||
{UNESCAPED_CHARS, "Within strings, some characters must be escapted, we found unescapted characters"},
|
||||
{UNEXPECTED_ERROR, "Unexpected error, consider reporting this problem as you may have found a bug in simdjson"},
|
||||
{UNESCAPED_CHARS, "Within strings, some characters must be escapted, we "
|
||||
"found unescapted characters"},
|
||||
{UNEXPECTED_ERROR, "Unexpected error, consider reporting this problem as "
|
||||
"you may have found a bug in simdjson"},
|
||||
};
|
||||
|
||||
// Returns the human-readable message registered for errorCode in the
// errorStrings table. The result is a reference to the string stored in that
// map, not a copy.
// std::map::at throws std::out_of_range when errorCode has no entry.
const std::string& errorMsg(const int errorCode) {
|
||||
return errorStrings.at(errorCode);
|
||||
}
|
||||
// Returns the human-readable message registered for error_code in the
// error_strings table. The result is a reference to the string stored in
// that map, not a copy.
// std::map::at throws std::out_of_range when error_code has no entry.
const std::string &error_message(const int error_code) {
|
||||
return error_strings.at(error_code);
|
||||
}
|
||||
} // namespace simdjson
|
||||
|
|
|
@ -1,37 +1,41 @@
|
|||
#include "simdjson/portability.h"
|
||||
|
||||
|
||||
#ifdef IS_X86_64
|
||||
|
||||
#include "simdjson/stage1_find_marks_haswell.h"
|
||||
#include "simdjson/stage1_find_marks_westmere.h"
|
||||
TARGET_HASWELL
|
||||
namespace simdjson {
|
||||
template<>
|
||||
int find_structural_bits<architecture::haswell>(const uint8_t *buf, size_t len, ParsedJson &pj) {
|
||||
FIND_STRUCTURAL_BITS(architecture::haswell, buf, len, pj, simdjson::haswell::flatten_bits);
|
||||
template <>
|
||||
int find_structural_bits<Architecture::HASWELL>(const uint8_t *buf, size_t len,
|
||||
ParsedJson &pj) {
|
||||
FIND_STRUCTURAL_BITS(Architecture::HASWELL, buf, len, pj,
|
||||
simdjson::haswell::flatten_bits);
|
||||
}
|
||||
} // simdjson
|
||||
} // namespace simdjson
|
||||
UNTARGET_REGION
|
||||
|
||||
TARGET_WESTMERE
|
||||
namespace simdjson {
|
||||
template<>
|
||||
int find_structural_bits<architecture::westmere>(const uint8_t *buf, size_t len, ParsedJson &pj) {
|
||||
FIND_STRUCTURAL_BITS(architecture::westmere, buf, len, pj, simdjson::flatten_bits);
|
||||
template <>
|
||||
int find_structural_bits<Architecture::WESTMERE>(const uint8_t *buf, size_t len,
|
||||
ParsedJson &pj) {
|
||||
FIND_STRUCTURAL_BITS(Architecture::WESTMERE, buf, len, pj,
|
||||
simdjson::flatten_bits);
|
||||
}
|
||||
} // simdjson
|
||||
} // namespace simdjson
|
||||
UNTARGET_REGION
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef IS_ARM64
|
||||
#include "simdjson/stage1_find_marks_arm64.h"
|
||||
namespace simdjson {
|
||||
template<>
|
||||
int find_structural_bits<architecture::arm64>(const uint8_t *buf, size_t len, ParsedJson &pj) {
|
||||
FIND_STRUCTURAL_BITS(architecture::arm64, buf, len, pj, simdjson::flatten_bits);
|
||||
}
|
||||
template <>
|
||||
int find_structural_bits<Architecture::ARM64>(const uint8_t *buf, size_t len,
|
||||
ParsedJson &pj) {
|
||||
FIND_STRUCTURAL_BITS(Architecture::ARM64, buf, len, pj,
|
||||
simdjson::flatten_bits);
|
||||
}
|
||||
} // namespace simdjson
|
||||
#endif
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -40,7 +40,7 @@ using namespace rapidjson;
|
|||
|
||||
int main(int argc, char *argv[]) {
|
||||
bool verbose = false;
|
||||
bool justfavorites = false;
|
||||
bool just_favorites = false;
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "vm")) != -1)
|
||||
switch (c) {
|
||||
|
@ -48,7 +48,7 @@ int main(int argc, char *argv[]) {
|
|||
verbose = true;
|
||||
break;
|
||||
case 'm':
|
||||
justfavorites = true;
|
||||
just_favorites = true;
|
||||
break;
|
||||
default:
|
||||
abort();
|
||||
|
@ -77,8 +77,8 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << std::endl;
|
||||
}
|
||||
simdjson::ParsedJson pj;
|
||||
size_t maxdepth = 1024 * 4;
|
||||
bool allocok = pj.allocateCapacity(p.size(), maxdepth);
|
||||
size_t max_depth = 1024 * 4;
|
||||
bool allocok = pj.allocate_capacity(p.size(), max_depth);
|
||||
if (!allocok) {
|
||||
std::cerr << "can't allocate memory" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
|
@ -98,7 +98,7 @@ int main(int argc, char *argv[]) {
|
|||
sajson::parse(sajson::dynamic_allocation(),
|
||||
sajson::mutable_string_view(p.size(), buffer))
|
||||
.is_valid();
|
||||
if (justfavorites) {
|
||||
if (just_favorites) {
|
||||
printf("our parser : %s \n",
|
||||
ours_correct ? "correct" : "invalid");
|
||||
printf("rapid (check encoding) : %s \n",
|
||||
|
@ -108,7 +108,7 @@ int main(int argc, char *argv[]) {
|
|||
if (oursreturn == simdjson::DEPTH_ERROR) {
|
||||
printf("simdjson encountered a DEPTH_ERROR, it was parametrized to "
|
||||
"reject documents with depth exceeding %zu.\n",
|
||||
maxdepth);
|
||||
max_depth);
|
||||
}
|
||||
if ((ours_correct != rapid_correct_checkencoding) ||
|
||||
(rapid_correct_checkencoding != sajson_correct) ||
|
||||
|
@ -157,12 +157,12 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
|
||||
Json::CharReaderBuilder b;
|
||||
Json::CharReader *jsoncppreader = b.newCharReader();
|
||||
Json::CharReader *json_cpp_reader = b.newCharReader();
|
||||
Json::Value root;
|
||||
Json::String errs;
|
||||
bool isjsoncppok =
|
||||
jsoncppreader->parse(buffer, buffer + p.size(), &root, &errs);
|
||||
delete jsoncppreader;
|
||||
bool is_json_cpp_ok =
|
||||
json_cpp_reader->parse(buffer, buffer + p.size(), &root, &errs);
|
||||
delete json_cpp_reader;
|
||||
|
||||
printf("our parser : %s \n",
|
||||
ours_correct ? "correct" : "invalid");
|
||||
|
@ -185,7 +185,7 @@ int main(int argc, char *argv[]) {
|
|||
printf("cjson : %s \n",
|
||||
cjson_correct ? "correct" : "invalid");
|
||||
printf("jsoncpp : %s \n",
|
||||
isjsoncppok ? "correct" : "invalid");
|
||||
is_json_cpp_ok ? "correct" : "invalid");
|
||||
|
||||
free(buffer);
|
||||
return EXIT_SUCCESS;
|
||||
|
|
|
@ -15,10 +15,10 @@ bool skyprophet_test() {
|
|||
std::vector<std::string> data;
|
||||
char buf[1024];
|
||||
for (size_t i = 0; i < n_records; ++i) {
|
||||
auto n =
|
||||
sprintf(buf, "{\"id\": %zu, \"name\": \"name%zu\", \"gender\": \"%s\", "
|
||||
auto n = sprintf(buf,
|
||||
"{\"id\": %zu, \"name\": \"name%zu\", \"gender\": \"%s\", "
|
||||
"\"school\": {\"id\": %zu, \"name\": \"school%zu\"}}",
|
||||
i, i, (i % 2) ? "male" : "female", i % 10, i % 10);
|
||||
i, i, (i % 2) ? "male" : "female", i % 10, i % 10);
|
||||
data.emplace_back(std::string(buf, n));
|
||||
}
|
||||
for (size_t i = 0; i < n_records; ++i) {
|
||||
|
@ -40,7 +40,7 @@ bool skyprophet_test() {
|
|||
maxsize = s.size();
|
||||
}
|
||||
simdjson::ParsedJson pj;
|
||||
if (!pj.allocateCapacity(maxsize)) {
|
||||
if (!pj.allocate_capacity(maxsize)) {
|
||||
printf("allocation failure in skyprophet_test\n");
|
||||
return false;
|
||||
}
|
||||
|
@ -52,12 +52,12 @@ bool skyprophet_test() {
|
|||
}
|
||||
counter++;
|
||||
auto ok1 = json_parse(rec.c_str(), rec.length(), pj);
|
||||
if (ok1 != 0 || !pj.isValid()) {
|
||||
if (ok1 != 0 || !pj.is_valid()) {
|
||||
printf("Something is wrong in skyprophet_test: %s.\n", rec.c_str());
|
||||
return false;
|
||||
}
|
||||
auto ok2 = json_parse(rec, pj);
|
||||
if (ok2 != 0 || !pj.isValid()) {
|
||||
if (ok2 != 0 || !pj.is_valid()) {
|
||||
printf("Something is wrong in skyprophet_test: %s.\n", rec.c_str());
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -17,14 +17,14 @@
|
|||
/**
|
||||
* Does the file filename ends with the given extension.
|
||||
*/
|
||||
static bool hasExtension(const char *filename, const char *extension) {
|
||||
static bool has_extension(const char *filename, const char *extension) {
|
||||
const char *ext = strrchr(filename, '.');
|
||||
return ((ext != nullptr) && (strcmp(ext, extension) == 0));
|
||||
}
|
||||
|
||||
bool startsWith(const char *pre, const char *str) {
|
||||
size_t lenpre = strlen(pre), lenstr = strlen(str);
|
||||
return lenstr < lenpre ? false : strncmp(pre, str, lenpre) == 0;
|
||||
bool starts_with(const char *pre, const char *str) {
|
||||
size_t len_pre = strlen(pre), len_str = strlen(str);
|
||||
return len_str < len_pre ? false : strncmp(pre, str, len_pre) == 0;
|
||||
}
|
||||
|
||||
bool contains(const char *pre, const char *str) {
|
||||
|
@ -32,7 +32,7 @@ bool contains(const char *pre, const char *str) {
|
|||
}
|
||||
|
||||
bool validate(const char *dirname) {
|
||||
bool everythingfine = true;
|
||||
bool everything_fine = true;
|
||||
const char *extension = ".json";
|
||||
size_t dirlen = strlen(dirname);
|
||||
struct dirent **entry_list;
|
||||
|
@ -45,15 +45,15 @@ bool validate(const char *dirname) {
|
|||
printf("nothing in dir %s \n", dirname);
|
||||
return false;
|
||||
}
|
||||
bool *isfileasexpected = new bool[c];
|
||||
bool *is_file_as_expected = new bool[c];
|
||||
for (int i = 0; i < c; i++) {
|
||||
isfileasexpected[i] = true;
|
||||
is_file_as_expected[i] = true;
|
||||
}
|
||||
size_t howmany = 0;
|
||||
size_t how_many = 0;
|
||||
bool needsep = (strlen(dirname) > 1) && (dirname[strlen(dirname) - 1] != '/');
|
||||
for (int i = 0; i < c; i++) {
|
||||
const char *name = entry_list[i]->d_name;
|
||||
if (hasExtension(name, extension)) {
|
||||
if (has_extension(name, extension)) {
|
||||
printf("validating: file %s ", name);
|
||||
fflush(nullptr);
|
||||
size_t filelen = strlen(name);
|
||||
|
@ -73,38 +73,38 @@ bool validate(const char *dirname) {
|
|||
return EXIT_FAILURE;
|
||||
}
|
||||
simdjson::ParsedJson pj;
|
||||
bool allocok = pj.allocateCapacity(p.size(), 1024);
|
||||
bool allocok = pj.allocate_capacity(p.size(), 1024);
|
||||
if (!allocok) {
|
||||
std::cerr << "can't allocate memory" << std::endl;
|
||||
return false;
|
||||
}
|
||||
++howmany;
|
||||
const int parseRes = json_parse(p, pj);
|
||||
printf("%s\n", parseRes == 0 ? "ok" : "invalid");
|
||||
++how_many;
|
||||
const int parse_res = json_parse(p, pj);
|
||||
printf("%s\n", parse_res == 0 ? "ok" : "invalid");
|
||||
if (contains("EXCLUDE", name)) {
|
||||
// skipping
|
||||
howmany--;
|
||||
} else if (startsWith("pass", name) && parseRes != 0) {
|
||||
isfileasexpected[i] = false;
|
||||
how_many--;
|
||||
} else if (starts_with("pass", name) && parse_res != 0) {
|
||||
is_file_as_expected[i] = false;
|
||||
printf("warning: file %s should pass but it fails. Error is: %s\n",
|
||||
name, simdjson::errorMsg(parseRes).data());
|
||||
everythingfine = false;
|
||||
} else if (startsWith("fail", name) && parseRes == 0) {
|
||||
isfileasexpected[i] = false;
|
||||
name, simdjson::error_message(parse_res).data());
|
||||
everything_fine = false;
|
||||
} else if (starts_with("fail", name) && parse_res == 0) {
|
||||
is_file_as_expected[i] = false;
|
||||
printf("warning: file %s should fail but it passes.\n", name);
|
||||
everythingfine = false;
|
||||
everything_fine = false;
|
||||
}
|
||||
free(fullpath);
|
||||
}
|
||||
}
|
||||
printf("%zu files checked.\n", howmany);
|
||||
if (everythingfine) {
|
||||
printf("%zu files checked.\n", how_many);
|
||||
if (everything_fine) {
|
||||
printf("All ok!\n");
|
||||
} else {
|
||||
fprintf(stderr,
|
||||
"There were problems! Consider reviewing the following files:\n");
|
||||
for (int i = 0; i < c; i++) {
|
||||
if (!isfileasexpected[i]) {
|
||||
if (!is_file_as_expected[i]) {
|
||||
fprintf(stderr, "%s \n", entry_list[i]->d_name);
|
||||
}
|
||||
}
|
||||
|
@ -113,8 +113,8 @@ bool validate(const char *dirname) {
|
|||
free(entry_list[i]);
|
||||
}
|
||||
free(entry_list);
|
||||
delete[] isfileasexpected;
|
||||
return everythingfine;
|
||||
delete[] is_file_as_expected;
|
||||
return everything_fine;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
|
|
@ -13,31 +13,30 @@
|
|||
|
||||
#include "simdjson/common_defs.h"
|
||||
|
||||
|
||||
// ulp distance
|
||||
// ulp distance
|
||||
// Marc B. Reynolds, 2016-2019
|
||||
// Public Domain under http://unlicense.org, see link for details.
|
||||
// adapted by D. Lemire
|
||||
inline uint32_t f32_ulp_dist(float a, float b) {
|
||||
uint32_t ua, ub;
|
||||
memcpy(&ua, &a, sizeof(ua));
|
||||
memcpy(&ub, &b, sizeof(ub));
|
||||
if ((int32_t)(ub^ua) >= 0)
|
||||
return (int32_t)(ua-ub) >= 0 ? (ua-ub) : (ub-ua);
|
||||
return ua+ub+0x80000000;
|
||||
memcpy(&ua, &a, sizeof(ua));
|
||||
memcpy(&ub, &b, sizeof(ub));
|
||||
if ((int32_t)(ub ^ ua) >= 0)
|
||||
return (int32_t)(ua - ub) >= 0 ? (ua - ub) : (ub - ua);
|
||||
return ua + ub + 0x80000000;
|
||||
}
|
||||
|
||||
// ulp distance
|
||||
// ulp distance
|
||||
// Marc B. Reynolds, 2016-2019
|
||||
// Public Domain under http://unlicense.org, see link for details.
|
||||
// adapted by D. Lemire
|
||||
inline uint64_t f64_ulp_dist(double a, double b) {
|
||||
uint64_t ua, ub;
|
||||
memcpy(&ua, &a, sizeof(ua));
|
||||
memcpy(&ub, &b, sizeof(ub));
|
||||
if ((int64_t)(ub^ua) >= 0)
|
||||
return (int64_t)(ua-ub) >= 0 ? (ua-ub) : (ub-ua);
|
||||
return ua+ub+0x80000000;
|
||||
memcpy(&ua, &a, sizeof(ua));
|
||||
memcpy(&ub, &b, sizeof(ub));
|
||||
if ((int64_t)(ub ^ ua) >= 0)
|
||||
return (int64_t)(ua - ub) >= 0 ? (ua - ub) : (ub - ua);
|
||||
return ua + ub + 0x80000000;
|
||||
}
|
||||
|
||||
int parse_error;
|
||||
|
@ -51,7 +50,7 @@ size_t invalid_count;
|
|||
// strings that start with these should not be parsed as numbers
|
||||
const char *really_bad[] = {"013}", "0x14", "0e]", "0e+]", "0e+-1]"};
|
||||
|
||||
bool startsWith(const char *pre, const char *str) {
|
||||
bool starts_with(const char *pre, const char *str) {
|
||||
size_t lenpre = strlen(pre);
|
||||
return strncmp(pre, str, lenpre) == 0;
|
||||
}
|
||||
|
@ -60,27 +59,27 @@ bool is_in_bad_list(const char *buf) {
|
|||
if (buf[0] != '0')
|
||||
return false;
|
||||
for (size_t i = 0; i < sizeof(really_bad) / sizeof(really_bad[0]); i++)
|
||||
if (startsWith(really_bad[i], buf))
|
||||
if (starts_with(really_bad[i], buf))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
void foundInvalidNumber(const uint8_t *buf) {
|
||||
void found_invalid_number(const uint8_t *buf) {
|
||||
invalid_count++;
|
||||
char *endptr;
|
||||
double expected = strtod((const char *)buf, &endptr);
|
||||
if (endptr != (const char *)buf) {
|
||||
if (!is_in_bad_list((const char *)buf)) {
|
||||
printf(
|
||||
"Warning: foundInvalidNumber %.32s whereas strtod parses it to %f, ",
|
||||
buf, expected);
|
||||
printf("Warning: found_invalid_number %.32s whereas strtod parses it to "
|
||||
"%f, ",
|
||||
buf, expected);
|
||||
printf(" while parsing %s \n", fullpath);
|
||||
parse_error |= PARSE_WARNING;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void foundInteger(int64_t result, const uint8_t *buf) {
|
||||
void found_integer(int64_t result, const uint8_t *buf) {
|
||||
int_count++;
|
||||
char *endptr;
|
||||
long long expected = strtoll((const char *)buf, &endptr, 10);
|
||||
|
@ -91,7 +90,7 @@ void foundInteger(int64_t result, const uint8_t *buf) {
|
|||
}
|
||||
}
|
||||
|
||||
void foundFloat(double result, const uint8_t *buf) {
|
||||
void found_float(double result, const uint8_t *buf) {
|
||||
char *endptr;
|
||||
float_count++;
|
||||
double expected = strtod((const char *)buf, &endptr);
|
||||
|
@ -111,8 +110,8 @@ void foundFloat(double result, const uint8_t *buf) {
|
|||
return;
|
||||
}
|
||||
// we want to get some reasonable relative accuracy
|
||||
uint64_t ULP = f64_ulp_dist(expected,result);
|
||||
if (f64_ulp_dist(expected,result) > 1) {
|
||||
uint64_t ULP = f64_ulp_dist(expected, result);
|
||||
if (f64_ulp_dist(expected, result) > 1) {
|
||||
fprintf(stderr, "parsed %.128e from \n", result);
|
||||
fprintf(stderr, " %.32s whereas strtod gives\n", buf);
|
||||
fprintf(stderr, " %.128e,", expected);
|
||||
|
@ -128,7 +127,7 @@ void foundFloat(double result, const uint8_t *buf) {
|
|||
/**
|
||||
* Does the file filename ends with the given extension.
|
||||
*/
|
||||
static bool hasExtension(const char *filename, const char *extension) {
|
||||
static bool has_extension(const char *filename, const char *extension) {
|
||||
const char *ext = strrchr(filename, '.');
|
||||
return (ext && !strcmp(ext, extension));
|
||||
}
|
||||
|
@ -151,7 +150,7 @@ bool validate(const char *dirname) {
|
|||
bool needsep = (strlen(dirname) > 1) && (dirname[strlen(dirname) - 1] != '/');
|
||||
for (int i = 0; i < c; i++) {
|
||||
const char *name = entry_list[i]->d_name;
|
||||
if (hasExtension(name, extension)) {
|
||||
if (has_extension(name, extension)) {
|
||||
size_t filelen = strlen(name);
|
||||
fullpath = (char *)malloc(dirlen + filelen + 1 + 1);
|
||||
strcpy(fullpath, dirname);
|
||||
|
@ -170,7 +169,7 @@ bool validate(const char *dirname) {
|
|||
}
|
||||
// terrible hack but just to get it working
|
||||
simdjson::ParsedJson pj;
|
||||
bool allocok = pj.allocateCapacity(p.size(), 1024);
|
||||
bool allocok = pj.allocate_capacity(p.size(), 1024);
|
||||
if (!allocok) {
|
||||
std::cerr << "can't allocate memory" << std::endl;
|
||||
return false;
|
||||
|
|
|
@ -4,34 +4,35 @@
|
|||
#include "simdjson/parsedjson.h"
|
||||
|
||||
int main() {
|
||||
// {"/~01abc": [0, {"\\\" 0": ["value0", "value1"]}]}"
|
||||
std::string json = "{\"/~01abc\": [0, {\"\\\\\\\" 0\": [\"value0\", \"value1\"]}]}";
|
||||
simdjson::ParsedJson pj;
|
||||
assert(pj.allocateCapacity(json.length()));
|
||||
simdjson::json_parse(json.c_str(), json.length(), pj);
|
||||
assert(pj.isValid());
|
||||
simdjson::ParsedJson::iterator it(pj);
|
||||
// {"/~01abc": [0, {"\\\" 0": ["value0", "value1"]}]}"
|
||||
std::string json =
|
||||
"{\"/~01abc\": [0, {\"\\\\\\\" 0\": [\"value0\", \"value1\"]}]}";
|
||||
simdjson::ParsedJson pj;
|
||||
assert(pj.allocate_capacity(json.length()));
|
||||
simdjson::json_parse(json.c_str(), json.length(), pj);
|
||||
assert(pj.is_valid());
|
||||
simdjson::ParsedJson::Iterator it(pj);
|
||||
|
||||
// valid JSON String Representation pointer
|
||||
std::string pointer1("/~1~001abc/1/\\\\\\\" 0/0");
|
||||
assert(it.move_to(pointer1.c_str(), pointer1.length()));
|
||||
assert(it.is_string());
|
||||
assert(it.get_string() == std::string("value0"));
|
||||
// valid JSON String Representation pointer
|
||||
std::string pointer1("/~1~001abc/1/\\\\\\\" 0/0");
|
||||
assert(it.move_to(pointer1.c_str(), pointer1.length()));
|
||||
assert(it.is_string());
|
||||
assert(it.get_string() == std::string("value0"));
|
||||
|
||||
// valid URI Fragment Identifier Representation pointer
|
||||
std::string pointer2("#/~1~001abc/1/%x5C%x22%x200/1");
|
||||
assert(it.move_to(pointer2.c_str(), pointer2.length()));
|
||||
assert(it.is_string());
|
||||
assert(it.get_string() == std::string("value1"));
|
||||
// valid URI Fragment Identifier Representation pointer
|
||||
std::string pointer2("#/~1~001abc/1/%x5C%x22%x200/1");
|
||||
assert(it.move_to(pointer2.c_str(), pointer2.length()));
|
||||
assert(it.is_string());
|
||||
assert(it.get_string() == std::string("value1"));
|
||||
|
||||
// invalid pointer with leading 0 in index
|
||||
std::string pointer3("#/~1~001abc/01");
|
||||
assert(!it.move_to(pointer3.c_str(), pointer3.length())); // failed
|
||||
assert(it.is_string()); // has probably not moved
|
||||
assert(it.get_string() == std::string("value1")); // has not move
|
||||
// invalid pointer with leading 0 in index
|
||||
std::string pointer3("#/~1~001abc/01");
|
||||
assert(!it.move_to(pointer3.c_str(), pointer3.length())); // failed
|
||||
assert(it.is_string()); // has probably not moved
|
||||
assert(it.get_string() == std::string("value1")); // has not move
|
||||
|
||||
// "the (nonexistent) member after the last array element"
|
||||
std::string pointer4("/~1~001abc/-");
|
||||
assert(it.move_to(pointer4.c_str(), pointer4.length()));
|
||||
assert(it.get_type() == ']');
|
||||
// "the (nonexistent) member after the last array element"
|
||||
std::string pointer4("/~1~001abc/-");
|
||||
assert(it.move_to(pointer4.c_str(), pointer4.length()));
|
||||
assert(it.get_type() == ']');
|
||||
}
|
||||
|
|
|
@ -7,15 +7,15 @@ int main() {
|
|||
const char *filename = JSON_TEST_PATH;
|
||||
padded_string p = get_corpus(filename);
|
||||
ParsedJson pj = build_parsed_json(p); // do the parsing
|
||||
if (!pj.isValid()) {
|
||||
if (!pj.is_valid()) {
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
if (!pj.allocateCapacity(p.size())) {
|
||||
if (!pj.allocate_capacity(p.size())) {
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
const int res = json_parse(p, pj);
|
||||
if (res) {
|
||||
std::cerr << errorMsg(res) << std::endl;
|
||||
std::cerr << error_message(res) << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
return EXIT_SUCCESS;
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
#include <assert.h>
|
||||
#include <climits>
|
||||
#include <cstring>
|
||||
#include <dirent.h>
|
||||
#include <inttypes.h>
|
||||
#include <climits>
|
||||
#include <iostream>
|
||||
#include <math.h>
|
||||
#include <stdbool.h>
|
||||
|
@ -72,7 +72,7 @@ static bool parse_string(const char *p, char *output, char **end) {
|
|||
|
||||
for (;;) {
|
||||
#if (CHAR_MIN < 0) || (!defined(CHAR_MIN)) // the '!defined' is just paranoia
|
||||
// in this path, char is *signed*
|
||||
// in this path, char is *signed*
|
||||
if ((*p >= 0 && *p < 0x20)) {
|
||||
return false; // unescaped
|
||||
}
|
||||
|
@ -209,12 +209,12 @@ static bool parse_string(const char *p, char *output, char **end) {
|
|||
}
|
||||
}
|
||||
// end of borrowed code
|
||||
char *bigbuffer; // global variable
|
||||
char *big_buffer; // global variable
|
||||
|
||||
void foundBadString(const uint8_t *buf) {
|
||||
void found_bad_string(const uint8_t *buf) {
|
||||
bad_string++;
|
||||
char *end;
|
||||
if (parse_string((const char *)buf, bigbuffer, &end)) {
|
||||
if (parse_string((const char *)buf, big_buffer, &end)) {
|
||||
printf("WARNING: Sajson-like parser seems to think that the string is "
|
||||
"valid %32s \n",
|
||||
buf);
|
||||
|
@ -234,18 +234,18 @@ void print_cmp_hex(const char *s1, const char *s2, size_t len) {
|
|||
}
|
||||
}
|
||||
|
||||
void foundString(const uint8_t *buf, const uint8_t *parsed_begin,
|
||||
const uint8_t *parsed_end) {
|
||||
size_t thislen = parsed_end - parsed_begin;
|
||||
total_string_length += thislen;
|
||||
void found_string(const uint8_t *buf, const uint8_t *parsed_begin,
|
||||
const uint8_t *parsed_end) {
|
||||
size_t this_len = parsed_end - parsed_begin;
|
||||
total_string_length += this_len;
|
||||
good_string++;
|
||||
char *end = NULL;
|
||||
if (!parse_string((const char *)buf, bigbuffer, &end)) {
|
||||
if (!parse_string((const char *)buf, big_buffer, &end)) {
|
||||
printf("WARNING: reference parser seems to think that the string is NOT "
|
||||
"valid %32s \n",
|
||||
buf);
|
||||
}
|
||||
if (end == bigbuffer) {
|
||||
if (end == big_buffer) {
|
||||
// we have a zero-length string
|
||||
if (parsed_begin != parsed_end) {
|
||||
printf("WARNING: We have a zero-length but gap is %zu \n",
|
||||
|
@ -255,35 +255,35 @@ void foundString(const uint8_t *buf, const uint8_t *parsed_begin,
|
|||
empty_string++;
|
||||
return;
|
||||
}
|
||||
size_t len = end - bigbuffer;
|
||||
if (len != thislen) {
|
||||
printf("WARNING: lengths on parsed strings disagree %zu %zu \n", thislen,
|
||||
size_t len = end - big_buffer;
|
||||
if (len != this_len) {
|
||||
printf("WARNING: lengths on parsed strings disagree %zu %zu \n", this_len,
|
||||
len);
|
||||
printf("\nour parsed string : '%*s'\n\n", (int)thislen,
|
||||
printf("\nour parsed string : '%*s'\n\n", (int)this_len,
|
||||
(const char *)parsed_begin);
|
||||
print_hex((const char *)parsed_begin, thislen);
|
||||
print_hex((const char *)parsed_begin, this_len);
|
||||
printf("\n");
|
||||
|
||||
printf("reference parsing :'%*s'\n\n", (int)len, bigbuffer);
|
||||
print_hex((const char *)bigbuffer, len);
|
||||
printf("reference parsing :'%*s'\n\n", (int)len, big_buffer);
|
||||
print_hex((const char *)big_buffer, len);
|
||||
printf("\n");
|
||||
|
||||
probable_bug = true;
|
||||
}
|
||||
if (memcmp(bigbuffer, parsed_begin, thislen) != 0) {
|
||||
if (memcmp(big_buffer, parsed_begin, this_len) != 0) {
|
||||
printf("WARNING: parsed strings disagree \n");
|
||||
printf("Lengths %zu %zu \n", thislen, len);
|
||||
printf("Lengths %zu %zu \n", this_len, len);
|
||||
|
||||
printf("\nour parsed string : '%*s'\n", (int)thislen,
|
||||
printf("\nour parsed string : '%*s'\n", (int)this_len,
|
||||
(const char *)parsed_begin);
|
||||
print_hex((const char *)parsed_begin, thislen);
|
||||
print_hex((const char *)parsed_begin, this_len);
|
||||
printf("\n");
|
||||
|
||||
printf("reference parsing :'%*s'\n", (int)len, bigbuffer);
|
||||
print_hex((const char *)bigbuffer, len);
|
||||
printf("reference parsing :'%*s'\n", (int)len, big_buffer);
|
||||
print_hex((const char *)big_buffer, len);
|
||||
printf("\n");
|
||||
|
||||
print_cmp_hex((const char *)parsed_begin, bigbuffer, thislen);
|
||||
print_cmp_hex((const char *)parsed_begin, big_buffer, this_len);
|
||||
|
||||
probable_bug = true;
|
||||
}
|
||||
|
@ -295,12 +295,12 @@ void foundString(const uint8_t *buf, const uint8_t *parsed_begin,
|
|||
/**
|
||||
* Does the file filename ends with the given extension.
|
||||
*/
|
||||
static bool hasExtension(const char *filename, const char *extension) {
|
||||
static bool has_extension(const char *filename, const char *extension) {
|
||||
const char *ext = strrchr(filename, '.');
|
||||
return (ext && !strcmp(ext, extension));
|
||||
}
|
||||
|
||||
bool startsWith(const char *pre, const char *str) {
|
||||
bool starts_with(const char *pre, const char *str) {
|
||||
size_t lenpre = strlen(pre), lenstr = strlen(str);
|
||||
return lenstr < lenpre ? false : strncmp(pre, str, lenpre) == 0;
|
||||
}
|
||||
|
@ -323,7 +323,7 @@ bool validate(const char *dirname) {
|
|||
bool needsep = (strlen(dirname) > 1) && (dirname[strlen(dirname) - 1] != '/');
|
||||
for (int i = 0; i < c; i++) {
|
||||
const char *name = entry_list[i]->d_name;
|
||||
if (hasExtension(name, extension)) {
|
||||
if (has_extension(name, extension)) {
|
||||
size_t filelen = strlen(name);
|
||||
fullpath = (char *)malloc(dirlen + filelen + 1 + 1);
|
||||
strcpy(fullpath, dirname);
|
||||
|
@ -341,13 +341,13 @@ bool validate(const char *dirname) {
|
|||
return EXIT_FAILURE;
|
||||
}
|
||||
simdjson::ParsedJson pj;
|
||||
bool allocok = pj.allocateCapacity(p.size(), 1024);
|
||||
bool allocok = pj.allocate_capacity(p.size(), 1024);
|
||||
if (!allocok) {
|
||||
std::cerr << "can't allocate memory" << std::endl;
|
||||
return false;
|
||||
}
|
||||
bigbuffer = (char *)malloc(p.size());
|
||||
if (bigbuffer == NULL) {
|
||||
big_buffer = (char *)malloc(p.size());
|
||||
if (big_buffer == NULL) {
|
||||
std::cerr << "can't allocate memory" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
@ -356,7 +356,7 @@ bool validate(const char *dirname) {
|
|||
total_string_length = 0;
|
||||
empty_string = 0;
|
||||
bool isok = json_parse(p, pj);
|
||||
free(bigbuffer);
|
||||
free(big_buffer);
|
||||
if (good_string > 0) {
|
||||
printf("File %40s %s --- bad strings: %10zu \tgood strings: %10zu\t "
|
||||
"empty strings: %10zu "
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
#include "simdjson/jsonioutil.h"
|
||||
#include "simdjson/jsonparser.h"
|
||||
|
||||
void compute_dump(simdjson::ParsedJson::iterator &pjh) {
|
||||
void compute_dump(simdjson::ParsedJson::Iterator &pjh) {
|
||||
if (pjh.is_object()) {
|
||||
std::cout << "{";
|
||||
if (pjh.down()) {
|
||||
|
@ -40,8 +40,8 @@ void compute_dump(simdjson::ParsedJson::iterator &pjh) {
|
|||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
bool rawdump = false;
|
||||
bool apidump = false;
|
||||
bool rawdump = false;
|
||||
bool apidump = false;
|
||||
|
||||
#ifndef _MSC_VER
|
||||
int c;
|
||||
|
@ -57,7 +57,7 @@ int main(int argc, char *argv[]) {
|
|||
default:
|
||||
abort();
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
int optind = 1;
|
||||
#endif
|
||||
|
@ -70,7 +70,8 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
const char *filename = argv[optind];
|
||||
if (optind + 1 < argc) {
|
||||
std::cerr << "warning: ignoring everything after " << argv[optind + 1] << std::endl;
|
||||
std::cerr << "warning: ignoring everything after " << argv[optind + 1]
|
||||
<< std::endl;
|
||||
}
|
||||
simdjson::padded_string p;
|
||||
try {
|
||||
|
@ -80,25 +81,28 @@ int main(int argc, char *argv[]) {
|
|||
return EXIT_FAILURE;
|
||||
}
|
||||
simdjson::ParsedJson pj;
|
||||
bool allocok = pj.allocateCapacity(p.size(), 1024);
|
||||
bool allocok = pj.allocate_capacity(p.size(), 1024);
|
||||
if (!allocok) {
|
||||
std::cerr << "failed to allocate memory" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
int res = simdjson::json_parse(p, pj); // do the parsing, return false on error
|
||||
int res =
|
||||
simdjson::json_parse(p, pj); // do the parsing, return false on error
|
||||
if (res != simdjson::SUCCESS) {
|
||||
std::cerr << " Parsing failed. Error is '" << simdjson::errorMsg(res) << "'." << std::endl;
|
||||
std::cerr << " Parsing failed. Error is '" << simdjson::error_message(res)
|
||||
<< "'." << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
if (apidump) {
|
||||
simdjson::ParsedJson::iterator pjh(pj);
|
||||
if (!pjh.isOk()) {
|
||||
simdjson::ParsedJson::Iterator pjh(pj);
|
||||
if (!pjh.is_ok()) {
|
||||
std::cerr << " Could not iterate parsed result. " << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
compute_dump(pjh);
|
||||
} else {
|
||||
const bool is_ok = rawdump ? pj.dump_raw_tape(std::cout) : pj.printjson(std::cout);
|
||||
const bool is_ok =
|
||||
rawdump ? pj.dump_raw_tape(std::cout) : pj.print_json(std::cout);
|
||||
if (!is_ok) {
|
||||
std::cerr << " Could not print out parsed result. " << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
|
|
|
@ -1,9 +1,8 @@
|
|||
#include <iostream>
|
||||
#include "simdjson/jsonioutil.h"
|
||||
#include "simdjson/jsonparser.h"
|
||||
#include <iostream>
|
||||
|
||||
|
||||
void compute_dump(simdjson::ParsedJson::iterator &pjh) {
|
||||
void compute_dump(simdjson::ParsedJson::Iterator &pjh) {
|
||||
if (pjh.is_object()) {
|
||||
std::cout << "{";
|
||||
if (pjh.down()) {
|
||||
|
@ -40,9 +39,16 @@ void compute_dump(simdjson::ParsedJson::iterator &pjh) {
|
|||
int main(int argc, char *argv[]) {
|
||||
if (argc < 3) {
|
||||
std::cerr << "Usage: " << argv[0] << " <jsonfile> <jsonpath>" << std::endl;
|
||||
std::cerr << "Follows the rfc6901 standard's syntax: https://tools.ietf.org/html/rfc6901" << std::endl;
|
||||
std::cerr << " Example: " << argv[0] << " jsonexamples/small/demo.json /Image/Width /Image/Height /Image/IDs/2 " << std::endl;
|
||||
std::cerr << "Multiple <jsonpath> can be issued in the same command, but at least one is needed." << std::endl;
|
||||
std::cerr << "Follows the rfc6901 standard's syntax: "
|
||||
"https://tools.ietf.org/html/rfc6901"
|
||||
<< std::endl;
|
||||
std::cerr << " Example: " << argv[0]
|
||||
<< " jsonexamples/small/demo.json /Image/Width /Image/Height "
|
||||
"/Image/IDs/2 "
|
||||
<< std::endl;
|
||||
std::cerr << "Multiple <jsonpath> can be issued in the same command, but "
|
||||
"at least one is needed."
|
||||
<< std::endl;
|
||||
exit(1);
|
||||
}
|
||||
const char *filename = argv[1];
|
||||
|
@ -54,31 +60,33 @@ int main(int argc, char *argv[]) {
|
|||
return EXIT_FAILURE;
|
||||
}
|
||||
simdjson::ParsedJson pj;
|
||||
bool allocok = pj.allocateCapacity(p.size(), 1024);
|
||||
bool allocok = pj.allocate_capacity(p.size(), 1024);
|
||||
if (!allocok) {
|
||||
std::cerr << "failed to allocate memory" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
int res = simdjson::json_parse(p, pj); // do the parsing, return false on error
|
||||
int res =
|
||||
simdjson::json_parse(p, pj); // do the parsing, return false on error
|
||||
if (res) {
|
||||
std::cerr << " Parsing failed with error " << simdjson::errorMsg(res) << std::endl;
|
||||
std::cerr << " Parsing failed with error " << simdjson::error_message(res)
|
||||
<< std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
std::cout << "[" << std::endl;
|
||||
for(int idx = 2; idx < argc; idx++) {
|
||||
const char * jsonpath = argv[idx];
|
||||
simdjson::ParsedJson::iterator it(pj);
|
||||
if(it.move_to(std::string(jsonpath))) {
|
||||
std::cout << "{\"jsonpath\": \"" << jsonpath << "\"," << std::endl;
|
||||
std::cout << "\"value\":";
|
||||
compute_dump(it);
|
||||
std::cout << "}" << std::endl;
|
||||
} else {
|
||||
std::cout << "null" << std::endl;
|
||||
}
|
||||
if(idx + 1 < argc) {
|
||||
std::cout << "," << std::endl;
|
||||
}
|
||||
for (int idx = 2; idx < argc; idx++) {
|
||||
const char *jsonpath = argv[idx];
|
||||
simdjson::ParsedJson::Iterator it(pj);
|
||||
if (it.move_to(std::string(jsonpath))) {
|
||||
std::cout << "{\"jsonpath\": \"" << jsonpath << "\"," << std::endl;
|
||||
std::cout << "\"value\":";
|
||||
compute_dump(it);
|
||||
std::cout << "}" << std::endl;
|
||||
} else {
|
||||
std::cout << "null" << std::endl;
|
||||
}
|
||||
if (idx + 1 < argc) {
|
||||
std::cout << "," << std::endl;
|
||||
}
|
||||
}
|
||||
std::cout << "]" << std::endl;
|
||||
return EXIT_SUCCESS;
|
||||
|
|
|
@ -3,30 +3,28 @@
|
|||
#include "simdjson/jsonioutil.h"
|
||||
#include "simdjson/jsonparser.h"
|
||||
|
||||
size_t count_nonasciibytes(const uint8_t* input, size_t length) {
|
||||
size_t count_nonasciibytes(const uint8_t *input, size_t length) {
|
||||
size_t count = 0;
|
||||
for(size_t i = 0; i < length; i++) {
|
||||
for (size_t i = 0; i < length; i++) {
|
||||
count += input[i] >> 7;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
|
||||
size_t count_backslash(const uint8_t* input, size_t length) {
|
||||
size_t count = 0;
|
||||
for(size_t i = 0; i < length; i++) {
|
||||
count += (input[i] == '\\') ? 1 : 0;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
size_t count_backslash(const uint8_t *input, size_t length) {
|
||||
size_t count = 0;
|
||||
for (size_t i = 0; i < length; i++) {
|
||||
count += (input[i] == '\\') ? 1 : 0;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
struct stat_s {
|
||||
size_t integer_count;
|
||||
size_t float_count;
|
||||
size_t string_count;
|
||||
size_t backslash_count;
|
||||
size_t nonasciibyte_count;
|
||||
size_t non_ascii_byte_count;
|
||||
size_t object_count;
|
||||
size_t array_count;
|
||||
size_t null_count;
|
||||
|
@ -39,18 +37,18 @@ struct stat_s {
|
|||
|
||||
using stat_t = struct stat_s;
|
||||
|
||||
|
||||
|
||||
stat_t simdjson_computestats(const simdjson::padded_string &p) {
|
||||
stat_t simdjson_compute_stats(const simdjson::padded_string &p) {
|
||||
stat_t answer;
|
||||
simdjson::ParsedJson pj = simdjson::build_parsed_json(p);
|
||||
answer.valid = pj.isValid();
|
||||
answer.valid = pj.is_valid();
|
||||
if (!answer.valid) {
|
||||
std::cerr << pj.getErrorMsg() << std::endl;
|
||||
std::cerr << pj.get_error_message() << std::endl;
|
||||
return answer;
|
||||
}
|
||||
answer.backslash_count = count_backslash(reinterpret_cast<const uint8_t*>(p.data()), p.size());
|
||||
answer.nonasciibyte_count = count_nonasciibytes(reinterpret_cast<const uint8_t*>(p.data()), p.size());
|
||||
answer.backslash_count =
|
||||
count_backslash(reinterpret_cast<const uint8_t *>(p.data()), p.size());
|
||||
answer.non_ascii_byte_count = count_nonasciibytes(
|
||||
reinterpret_cast<const uint8_t *>(p.data()), p.size());
|
||||
answer.byte_count = p.size();
|
||||
answer.integer_count = 0;
|
||||
answer.float_count = 0;
|
||||
|
@ -61,24 +59,24 @@ stat_t simdjson_computestats(const simdjson::padded_string &p) {
|
|||
answer.false_count = 0;
|
||||
answer.string_count = 0;
|
||||
answer.structural_indexes_count = pj.n_structural_indexes;
|
||||
size_t tapeidx = 0;
|
||||
uint64_t tape_val = pj.tape[tapeidx++];
|
||||
size_t tape_idx = 0;
|
||||
uint64_t tape_val = pj.tape[tape_idx++];
|
||||
uint8_t type = (tape_val >> 56);
|
||||
size_t howmany = 0;
|
||||
size_t how_many = 0;
|
||||
assert(type == 'r');
|
||||
howmany = tape_val & JSONVALUEMASK;
|
||||
for (; tapeidx < howmany; tapeidx++) {
|
||||
tape_val = pj.tape[tapeidx];
|
||||
// uint64_t payload = tape_val & JSONVALUEMASK;
|
||||
how_many = tape_val & JSON_VALUE_MASK;
|
||||
for (; tape_idx < how_many; tape_idx++) {
|
||||
tape_val = pj.tape[tape_idx];
|
||||
// uint64_t payload = tape_val & JSON_VALUE_MASK;
|
||||
type = (tape_val >> 56);
|
||||
switch (type) {
|
||||
case 'l': // we have a long int
|
||||
answer.integer_count++;
|
||||
tapeidx++; // skipping the integer
|
||||
tape_idx++; // skipping the integer
|
||||
break;
|
||||
case 'd': // we have a double
|
||||
answer.float_count++;
|
||||
tapeidx++; // skipping the double
|
||||
tape_idx++; // skipping the double
|
||||
break;
|
||||
case 'n': // we have a null
|
||||
answer.null_count++;
|
||||
|
@ -109,12 +107,6 @@ stat_t simdjson_computestats(const simdjson::padded_string &p) {
|
|||
return answer;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int myoptind = 1;
|
||||
if (myoptind >= argc) {
|
||||
|
@ -124,7 +116,8 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
const char *filename = argv[myoptind];
|
||||
if (myoptind + 1 < argc) {
|
||||
std::cerr << "warning: ignoring everything after " << argv[myoptind + 1] << std::endl;
|
||||
std::cerr << "warning: ignoring everything after " << argv[myoptind + 1]
|
||||
<< std::endl;
|
||||
}
|
||||
simdjson::padded_string p;
|
||||
try {
|
||||
|
@ -133,16 +126,18 @@ int main(int argc, char *argv[]) {
|
|||
std::cerr << "Could not load the file " << filename << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
stat_t s = simdjson_computestats(p);
|
||||
if(!s.valid) {
|
||||
stat_t s = simdjson_compute_stats(p);
|
||||
if (!s.valid) {
|
||||
std::cerr << "not a valid JSON" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
|
||||
printf("# integer_count float_count string_count backslash_count nonasciibyte_count object_count array_count null_count true_count false_count byte_count structural_indexes_count\n");
|
||||
printf("%zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu\n", s.integer_count, s.float_count,
|
||||
s.string_count, s.backslash_count, s.nonasciibyte_count, s.object_count, s.array_count,
|
||||
s.null_count, s.true_count, s.false_count, s.byte_count, s.structural_indexes_count);
|
||||
printf("# integer_count float_count string_count backslash_count "
|
||||
"non_ascii_byte_count object_count array_count null_count true_count "
|
||||
"false_count byte_count structural_indexes_count\n");
|
||||
printf("%zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu\n", s.integer_count,
|
||||
s.float_count, s.string_count, s.backslash_count,
|
||||
s.non_ascii_byte_count, s.object_count, s.array_count, s.null_count,
|
||||
s.true_count, s.false_count, s.byte_count, s.structural_indexes_count);
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
|
|
@ -10,12 +10,12 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
simdjson::padded_string p;
|
||||
std::string filename = argv[argc - 1];
|
||||
try{
|
||||
try {
|
||||
simdjson::get_corpus(filename).swap(p);
|
||||
} catch (const std::exception& e) {
|
||||
std::cout << "Could not load the file " << filename << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
} catch (const std::exception &e) {
|
||||
std::cout << "Could not load the file " << filename << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
simdjson::jsonminify(p, p.data());
|
||||
printf("%s",p.data());
|
||||
simdjson::json_minify(p, p.data());
|
||||
printf("%s", p.data());
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue