Style uniformization (#238)

* massive clang-format -style=LLVM

* naming harmonization

* adding commentary about sysinfoapi.h
This commit is contained in:
ioioioio 2019-07-30 17:18:10 -04:00 committed by Daniel Lemire
parent 065805d6e1
commit c2eea8abba
57 changed files with 3617 additions and 3389 deletions

View File

@ -67,7 +67,7 @@ Under Windows, we build some tools using the windows/dirent_portable.h file (whi
## Code usage and example
The main API involves populating a `ParsedJson` object which hosts a fully navigable document-object-model (DOM) view of the JSON document. The DOM can be accessed using [JSON Pointer](https://tools.ietf.org/html/rfc6901) paths, for example. The main function is `json_parse` which takes a string containing the JSON document as well as a reference to pre-allocated `ParsedJson` object (which can be reused multiple time). Once you have populated the `ParsedJson` object you can navigate through the DOM with an iterator (e.g., created by `ParsedJson::iterator pjh(pj)`, see 'Navigating the parsed document').
The main API involves populating a `ParsedJson` object which hosts a fully navigable document-object-model (DOM) view of the JSON document. The DOM can be accessed using [JSON Pointer](https://tools.ietf.org/html/rfc6901) paths, for example. The main function is `json_parse` which takes a string containing the JSON document as well as a reference to a pre-allocated `ParsedJson` object (which can be reused multiple times). Once you have populated the `ParsedJson` object you can navigate through the DOM with an iterator (e.g., created by `ParsedJson::Iterator pjh(pj)`, see 'Navigating the parsed document').
```C
#include "simdjson/jsonparser.h"
@ -80,12 +80,12 @@ const char * filename = ... //
// use whatever means you want to get a string (UTF-8) of your JSON document
padded_string p = get_corpus(filename);
ParsedJson pj;
pj.allocateCapacity(p.size()); // allocate memory for parsing up to p.size() bytes
pj.allocate_capacity(p.size()); // allocate memory for parsing up to p.size() bytes
const int res = json_parse(p, pj); // do the parsing, return 0 on success
// parsing is done!
if (res != 0) {
// You can use the "simdjson/simdjson.h" header to access the error message
std::cout << "Error parsing:" << simdjson::errorMsg(res) << std::endl;
std::cout << "Error parsing:" << simdjson::error_message(res) << std::endl;
}
// the ParsedJson document can be used here
// pj can be reused with other json_parse calls.
@ -103,9 +103,9 @@ using namespace simdjson;
const char * filename = ... //
padded_string p = get_corpus(filename);
ParsedJson pj = build_parsed_json(p); // do the parsing
if( ! pj.isValid() ) {
if( ! pj.is_valid() ) {
// something went wrong
std::cout << pj.getErrorMsg() << std::endl;
std::cout << pj.get_error_message() << std::endl;
}
```
@ -119,13 +119,13 @@ using namespace simdjson;
/...
std::string mystring = ... //
ParsedJson pj;
pj.allocateCapacity(mystring.size()); // allocate memory for parsing up to p.size() bytes
pj.allocate_capacity(mystring.size()); // allocate memory for parsing up to p.size() bytes
// std::string may not overallocate so a copy will be needed
const int res = json_parse(mystring, pj); // do the parsing, return 0 on success
// parsing is done!
if (res != 0) {
// You can use the "simdjson/simdjson.h" header to access the error message
std::cout << "Error parsing:" << simdjson::errorMsg(res) << std::endl;
std::cout << "Error parsing:" << simdjson::error_message(res) << std::endl;
}
// pj can be reused with other json_parse calls.
```
@ -141,9 +141,9 @@ using namespace simdjson;
std::string mystring = ... //
// std::string may not overallocate so a copy will be needed
ParsedJson pj = build_parsed_json(mystring); // do the parsing
if( ! pj.isValid() ) {
if( ! pj.is_valid() ) {
// something went wrong
std::cout << pj.getErrorMsg() << std::endl;
std::cout << pj.get_error_message() << std::endl;
}
```
@ -164,9 +164,9 @@ int main(int argc, char *argv[]) {
const char * filename = argv[1];
padded_string p = get_corpus(filename);
ParsedJson pj = build_parsed_json(p); // do the parsing
if( ! pj.isValid() ) {
if( ! pj.is_valid() ) {
std::cout << "not valid" << std::endl;
std::cout << pj.getErrorMsg() << std::endl;
std::cout << pj.get_error_message() << std::endl;
} else {
std::cout << "valid" << std::endl;
}
@ -370,8 +370,8 @@ In C++, given a `ParsedJson`, we can move to a node with the `move_to` method, p
Here is a code sample to dump back the parsed JSON to a string:
```c
ParsedJson::iterator pjh(pj);
if (!pjh.isOk()) {
ParsedJson::Iterator pjh(pj);
if (!pjh.is_ok()) {
std::cerr << " Could not iterate parsed result. " << std::endl;
return EXIT_FAILURE;
}
@ -379,7 +379,7 @@ Here is a code sample to dump back the parsed JSON to a string:
//
// where compute_dump is :
void compute_dump(ParsedJson::iterator &pjh) {
void compute_dump(ParsedJson::Iterator &pjh) {
if (pjh.is_object()) {
std::cout << "{";
if (pjh.down()) {
@ -417,12 +417,12 @@ void compute_dump(ParsedJson::iterator &pjh) {
The following function will find all user.id integers:
```C
void simdjson_scan(std::vector<int64_t> &answer, ParsedJson::iterator &i) {
void simdjson_scan(std::vector<int64_t> &answer, ParsedJson::Iterator &i) {
while(i.move_forward()) {
if(i.get_scope_type() == '{') {
bool founduser = (i.get_string_length() == 4) && (memcmp(i.get_string(), "user", 4) == 0);
bool found_user = (i.get_string_length() == 4) && (memcmp(i.get_string(), "user", 4) == 0);
i.move_to_value();
if(founduser) {
if(found_user) {
if(i.is_object() && i.move_to_key("id",2)) {
if (i.is_integer()) {
answer.push_back(i.get_integer());

View File

@ -117,7 +117,7 @@ int main(int argc, char *argv[]) {
const char * filename = argv[1];
simdjson::padded_string p = simdjson::get_corpus(filename);
simdjson::ParsedJson pj = simdjson::build_parsed_json(p); // do the parsing
if( ! pj.isValid() ) {
if( ! pj.is_valid() ) {
std::cout << "not valid" << std::endl;
} else {
std::cout << "valid" << std::endl;

View File

@ -18,7 +18,7 @@ const char *unitname = "cycles";
: \
: /* no read only */ \
"%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \
); \
); \
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
} while (0)
@ -32,7 +32,7 @@ const char *unitname = "cycles";
: "=r"(cyc_high), "=r"(cyc_low) \
: /* no read only registers */ \
: "%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \
); \
); \
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
} while (0)

View File

@ -30,49 +30,51 @@ void print_vec(const std::vector<int64_t> &v) {
std::cout << std::endl;
}
// Advances the iterator with move_forward() until it returns false. Whenever
// the current scope type is '{', the current string is compared against
// "user"; on a match, and if the associated value is an object in which
// move_to_key("id", 2) succeeds on an integer, that integer is appended to
// `answer`.
// This listing is a diff: the pre-change text of the function comes first,
// followed by the reformatted text using `Iterator` / `found_user`; the final
// closing brace is shared by both.
void simdjson_scan(std::vector<int64_t> &answer, simdjson::ParsedJson::iterator &i) {
while(i.move_forward()) {
if(i.get_scope_type() == '{') {
bool founduser = (i.get_string_length() == 4) && (memcmp(i.get_string(), "user", 4) == 0);
i.move_to_value();
if(founduser) {
if(i.is_object() && i.move_to_key("id",2)) {
if (i.is_integer()) {
answer.push_back(i.get_integer());
}
i.up();
}
}
}
}
void simdjson_scan(std::vector<int64_t> &answer,
simdjson::ParsedJson::Iterator &i) {
while (i.move_forward()) {
if (i.get_scope_type() == '{') {
// Length check first so memcmp only runs on exactly 4 bytes.
bool found_user = (i.get_string_length() == 4) &&
(memcmp(i.get_string(), "user", 4) == 0);
i.move_to_value();
if (found_user) {
if (i.is_object() && i.move_to_key("id", 2)) {
if (i.is_integer()) {
answer.push_back(i.get_integer());
}
// Only reached when move_to_key succeeded; presumably steps back out
// of the "user" object -- confirm against the Iterator API.
i.up();
}
}
}
}
}
// Traversal-only benchmark body: takes an already-built ParsedJson, creates an
// iterator over it, collects values with simdjson_scan and returns them after
// remove_duplicates(). No parsing happens here.
// The function is marked noinline; presumably so the timing macro measures a
// real call -- confirm.
// (Diff listing: old `simdjson_justdom` lines are followed by the renamed
// `simdjson_just_dom` lines.)
__attribute__ ((noinline))
std::vector<int64_t> simdjson_justdom(simdjson::ParsedJson &pj) {
__attribute__((noinline)) std::vector<int64_t>
simdjson_just_dom(simdjson::ParsedJson &pj) {
std::vector<int64_t> answer;
simdjson::ParsedJson::iterator i(pj);
simdjson_scan(answer,i);
simdjson::ParsedJson::Iterator i(pj);
simdjson_scan(answer, i);
remove_duplicates(answer);
return answer;
}
// Parse-and-traverse benchmark body: parses `p` with build_parsed_json(),
// returns an empty vector if the result is not valid, otherwise collects
// values with simdjson_scan and returns them after remove_duplicates().
// (Diff listing: old `simdjson_computestats` lines are followed by the renamed
// `simdjson_compute_stats` lines.)
__attribute__ ((noinline))
std::vector<int64_t> simdjson_computestats(const simdjson::padded_string &p) {
__attribute__((noinline)) std::vector<int64_t>
simdjson_compute_stats(const simdjson::padded_string &p) {
std::vector<int64_t> answer;
simdjson::ParsedJson pj = simdjson::build_parsed_json(p);
if (!pj.isValid()) {
if (!pj.is_valid()) {
// Parsing failed: report no ids rather than iterating an invalid document.
return answer;
}
simdjson::ParsedJson::iterator i(pj);
simdjson_scan(answer,i);
simdjson::ParsedJson::Iterator i(pj);
simdjson_scan(answer, i);
remove_duplicates(answer);
return answer;
}
// Parse-only benchmark body: parses `p` with build_parsed_json() and discards
// the document.
// Returns true when the parse result is NOT valid (note the negation), so a
// successful parse yields false; the BEST_TIME call sites in this listing
// compare the result against `false`.
// (Diff listing: old `simdjson_justparse` lines are followed by the renamed
// `simdjson_just_parse` lines.)
__attribute__ ((noinline))
bool simdjson_justparse(const simdjson::padded_string &p) {
__attribute__((noinline)) bool
simdjson_just_parse(const simdjson::padded_string &p) {
simdjson::ParsedJson pj = simdjson::build_parsed_json(p);
bool answer = !pj.isValid();
bool answer = !pj.is_valid();
return answer;
}
@ -88,25 +90,27 @@ void sajson_traverse(std::vector<int64_t> &answer, const sajson::value &node) {
}
case TYPE_OBJECT: {
auto length = node.get_length();
// sajson has O(log n) find_object_key, but we still visit each node anyhow because we
// need to visit all values.
// sajson has O(log n) find_object_key, but we still visit each node anyhow
// because we need to visit all values.
for (auto i = 0u; i < length; ++i) {
auto key = node.get_object_key(i); // expected: sajson::string
bool founduser = (key.length() == 4) && (memcmp(key.data(), "user", 4) == 0);
if (founduser) { // found a user!!!
auto uservalue = node.get_object_value(i); // get the value
if (uservalue.get_type() ==
bool found_user =
(key.length() == 4) && (memcmp(key.data(), "user", 4) == 0);
if (found_user) { // found a user!!!
auto user_value = node.get_object_value(i); // get the value
if (user_value.get_type() ==
TYPE_OBJECT) { // the value should be an object
// now we know that we only need one value
auto uservaluelength = uservalue.get_length();
auto rightindex = uservalue.find_object_key(sajson::string("id",2));
if(rightindex < uservaluelength) {
auto v = uservalue.get_object_value(rightindex);
if (v.get_type() == TYPE_INTEGER) { // check that it is an integer
answer.push_back(v.get_integer_value()); // record it!
} else if (v.get_type() == TYPE_DOUBLE) {
answer.push_back((int64_t)v.get_double_value()); // record it!
}
auto user_value_length = user_value.get_length();
auto right_index =
user_value.find_object_key(sajson::string("id", 2));
if (right_index < user_value_length) {
auto v = user_value.get_object_value(right_index);
if (v.get_type() == TYPE_INTEGER) { // check that it is an integer
answer.push_back(v.get_integer_value()); // record it!
} else if (v.get_type() == TYPE_DOUBLE) {
answer.push_back((int64_t)v.get_double_value()); // record it!
}
}
}
}
@ -126,16 +130,16 @@ void sajson_traverse(std::vector<int64_t> &answer, const sajson::value &node) {
}
}
// Traversal-only benchmark body for sajson: walks an already-parsed document
// from its root with sajson_traverse() and returns the collected values after
// remove_duplicates().
// NOTE(review): the function name is spelled `sasjon` while the library
// namespace is `sajson`; the call sites in this listing use the same spelling,
// so the name is left untouched here.
// (Diff listing: old `sasjon_justdom` lines are followed by the renamed
// `sasjon_just_dom` lines.)
__attribute__ ((noinline))
std::vector<int64_t> sasjon_justdom(sajson::document & d) {
__attribute__((noinline)) std::vector<int64_t>
sasjon_just_dom(sajson::document &d) {
std::vector<int64_t> answer;
sajson_traverse(answer, d.get_root());
remove_duplicates(answer);
return answer;
}
__attribute__ ((noinline))
std::vector<int64_t> sasjon_computestats(const simdjson::padded_string &p) {
__attribute__((noinline)) std::vector<int64_t>
sasjon_compute_stats(const simdjson::padded_string &p) {
std::vector<int64_t> answer;
char *buffer = (char *)malloc(p.size());
memcpy(buffer, p.data(), p.size());
@ -151,8 +155,8 @@ std::vector<int64_t> sasjon_computestats(const simdjson::padded_string &p) {
return answer;
}
__attribute__ ((noinline))
bool sasjon_justparse(const simdjson::padded_string &p) {
__attribute__((noinline)) bool
sasjon_just_parse(const simdjson::padded_string &p) {
char *buffer = (char *)malloc(p.size());
memcpy(buffer, p.data(), p.size());
auto d = sajson::parse(sajson::dynamic_allocation(),
@ -167,8 +171,9 @@ void rapid_traverse(std::vector<int64_t> &answer, const rapidjson::Value &v) {
case kObjectType:
for (Value::ConstMemberIterator m = v.MemberBegin(); m != v.MemberEnd();
++m) {
bool founduser = (m->name.GetStringLength() == 4) && (memcmp(m->name.GetString(), "user", 4) == 0);
if (founduser) {
bool found_user = (m->name.GetStringLength() == 4) &&
(memcmp(m->name.GetString(), "user", 4) == 0);
if (found_user) {
const rapidjson::Value &child = m->value;
if (child.GetType() == kObjectType) {
for (Value::ConstMemberIterator k = child.MemberBegin();
@ -201,16 +206,16 @@ void rapid_traverse(std::vector<int64_t> &answer, const rapidjson::Value &v) {
}
}
// Traversal-only benchmark body for RapidJSON: walks an already-parsed
// Document with rapid_traverse() and returns the collected values after
// remove_duplicates(). The Document itself is passed as the root value.
// (Diff listing: old `rapid_justdom` lines are followed by the renamed
// `rapid_just_dom` lines.)
__attribute__ ((noinline))
std::vector<int64_t> rapid_justdom(rapidjson::Document &d) {
__attribute__((noinline)) std::vector<int64_t>
rapid_just_dom(rapidjson::Document &d) {
std::vector<int64_t> answer;
rapid_traverse(answer, d);
remove_duplicates(answer);
return answer;
}
__attribute__ ((noinline))
std::vector<int64_t> rapid_computestats(const simdjson::padded_string &p) {
__attribute__((noinline)) std::vector<int64_t>
rapid_compute_stats(const simdjson::padded_string &p) {
std::vector<int64_t> answer;
char *buffer = (char *)malloc(p.size() + 1);
memcpy(buffer, p.data(), p.size());
@ -218,8 +223,8 @@ std::vector<int64_t> rapid_computestats(const simdjson::padded_string &p) {
rapidjson::Document d;
d.ParseInsitu<kParseValidateEncodingFlag>(buffer);
if (d.HasParseError()) {
free(buffer);
return answer;
free(buffer);
return answer;
}
rapid_traverse(answer, d);
free(buffer);
@ -227,8 +232,8 @@ std::vector<int64_t> rapid_computestats(const simdjson::padded_string &p) {
return answer;
}
__attribute__ ((noinline))
bool rapid_justparse(const simdjson::padded_string &p) {
__attribute__((noinline)) bool
rapid_just_parse(const simdjson::padded_string &p) {
char *buffer = (char *)malloc(p.size() + 1);
memcpy(buffer, p.data(), p.size());
buffer[p.size()] = '\0';
@ -239,16 +244,15 @@ bool rapid_justparse(const simdjson::padded_string &p) {
return answer;
}
int main(int argc, char *argv[]) {
bool verbose = false;
bool justdata = false;
bool just_data = false;
int c;
while ((c = getopt(argc, argv, "vt")) != -1)
switch (c) {
case 't':
justdata = true;
just_data = true;
break;
case 'v':
verbose = true;
@ -257,15 +261,18 @@ int main(int argc, char *argv[]) {
abort();
}
if (optind >= argc) {
std::cerr << "Using different parsers, we compute the content statistics of "
"JSON documents." << std::endl;
std::cerr
<< "Using different parsers, we compute the content statistics of "
"JSON documents."
<< std::endl;
std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
std::cerr << "Or " << argv[0] << " -v <jsonfile>" << std::endl;
exit(1);
}
const char *filename = argv[optind];
if (optind + 1 < argc) {
std::cerr << "warning: ignoring everything after " << argv[optind + 1] << std::endl;
std::cerr << "warning: ignoring everything after " << argv[optind + 1]
<< std::endl;
}
simdjson::padded_string p;
try {
@ -285,17 +292,17 @@ int main(int argc, char *argv[]) {
std::cout << p.size() << " B ";
std::cout << std::endl;
}
std::vector<int64_t> s1 = simdjson_computestats(p);
std::vector<int64_t> s1 = simdjson_compute_stats(p);
if (verbose) {
printf("simdjson: ");
print_vec(s1);
}
std::vector<int64_t> s2 = rapid_computestats(p);
std::vector<int64_t> s2 = rapid_compute_stats(p);
if (verbose) {
printf("rapid: ");
print_vec(s2);
}
std::vector<int64_t> s3 = sasjon_computestats(p);
std::vector<int64_t> s3 = sasjon_compute_stats(p);
if (verbose) {
printf("sasjon: ");
print_vec(s3);
@ -306,34 +313,35 @@ int main(int argc, char *argv[]) {
int repeat = 500;
int volume = p.size();
if(justdata) {
printf("name cycles_per_byte cycles_per_byte_err gb_per_s gb_per_s_err \n");
if (just_data) {
printf(
"name cycles_per_byte cycles_per_byte_err gb_per_s gb_per_s_err \n");
}
BEST_TIME("simdjson ", simdjson_computestats(p).size(), size, , repeat,
volume, !justdata);
BEST_TIME("rapid ", rapid_computestats(p).size(), size, , repeat, volume,
!justdata);
BEST_TIME("sasjon ", sasjon_computestats(p).size(), size, , repeat, volume,
!justdata);
BEST_TIME("simdjson (just parse) ", simdjson_justparse(p), false, , repeat,
volume, !justdata);
BEST_TIME("rapid (just parse) ", rapid_justparse(p), false, , repeat, volume,
!justdata);
BEST_TIME("sasjon (just parse) ", sasjon_justparse(p), false, , repeat, volume,
!justdata);
BEST_TIME("simdjson ", simdjson_compute_stats(p).size(), size, , repeat,
volume, !just_data);
BEST_TIME("rapid ", rapid_compute_stats(p).size(), size, , repeat, volume,
!just_data);
BEST_TIME("sasjon ", sasjon_compute_stats(p).size(), size, , repeat, volume,
!just_data);
BEST_TIME("simdjson (just parse) ", simdjson_just_parse(p), false, , repeat,
volume, !just_data);
BEST_TIME("rapid (just parse) ", rapid_just_parse(p), false, , repeat,
volume, !just_data);
BEST_TIME("sasjon (just parse) ", sasjon_just_parse(p), false, , repeat,
volume, !just_data);
simdjson::ParsedJson dsimdjson = simdjson::build_parsed_json(p);
BEST_TIME("simdjson (just dom) ", simdjson_justdom(dsimdjson).size(), size, , repeat,
volume, !justdata);
BEST_TIME("simdjson (just dom) ", simdjson_just_dom(dsimdjson).size(), size,
, repeat, volume, !just_data);
char *buffer = (char *)malloc(p.size());
memcpy(buffer, p.data(), p.size());
rapidjson::Document drapid;
drapid.ParseInsitu<kParseValidateEncodingFlag>(buffer);
BEST_TIME("rapid (just dom) ", rapid_justdom(drapid).size(), size, , repeat, volume,
!justdata);
BEST_TIME("rapid (just dom) ", rapid_just_dom(drapid).size(), size, , repeat,
volume, !just_data);
memcpy(buffer, p.data(), p.size());
auto dsasjon = sajson::parse(sajson::dynamic_allocation(),
sajson::mutable_string_view(p.size(), buffer));
BEST_TIME("sasjon (just dom) ", sasjon_justdom(dsasjon).size(), size, , repeat, volume,
!justdata);
sajson::mutable_string_view(p.size(), buffer));
BEST_TIME("sasjon (just dom) ", sasjon_just_dom(dsasjon).size(), size, ,
repeat, volume, !just_data);
free(buffer);
}

View File

@ -1,5 +1,5 @@
#include <unistd.h>
#include <iostream>
#include <unistd.h>
#include "benchmark.h"
#include "simdjson/jsonioutil.h"
@ -17,7 +17,7 @@
using namespace simdjson;
using namespace rapidjson;
std::string rapidstringmeInsitu(char *json) {
std::string rapid_stringme_insitu(char *json) {
Document d;
d.ParseInsitu(json);
if (d.HasParseError()) {
@ -30,7 +30,7 @@ std::string rapidstringmeInsitu(char *json) {
return buffer.GetString();
}
std::string rapidstringme(char *json) {
std::string rapid_stringme(char *json) {
Document d;
d.Parse(json);
if (d.HasParseError()) {
@ -46,29 +46,28 @@ std::string rapidstringme(char *json) {
int main(int argc, char *argv[]) {
int c;
bool verbose = false;
bool justdata = false;
bool just_data = false;
while ((c = getopt (argc, argv, "vt")) != -1)
switch (c)
{
case 't':
justdata = true;
break;
case 'v':
verbose = true;
break;
default:
abort ();
}
while ((c = getopt(argc, argv, "vt")) != -1)
switch (c) {
case 't':
just_data = true;
break;
case 'v':
verbose = true;
break;
default:
abort();
}
if (optind >= argc) {
std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
exit(1);
}
const char * filename = argv[optind];
const char *filename = argv[optind];
simdjson::padded_string p;
try {
simdjson::get_corpus(filename).swap(p);
} catch (const std::exception& e) { // caught by reference to base
} catch (const std::exception &e) { // caught by reference to base
std::cout << "Could not load the file " << filename << std::endl;
return EXIT_FAILURE;
}
@ -88,71 +87,95 @@ int main(int argc, char *argv[]) {
int repeat = 50;
int volume = p.size();
if(justdata) {
printf("name cycles_per_byte cycles_per_byte_err gb_per_s gb_per_s_err \n");
if (just_data) {
printf(
"name cycles_per_byte cycles_per_byte_err gb_per_s gb_per_s_err \n");
}
size_t strlength = rapidstringme((char *)p.data()).size();
size_t strlength = rapid_stringme((char *)p.data()).size();
if (verbose)
std::cout << "input length is " << p.size() << " stringified length is "
<< strlength << std::endl;
BEST_TIME_NOCHECK("despacing with RapidJSON", rapidstringme((char *)p.data()), , repeat, volume, !justdata);
BEST_TIME_NOCHECK("despacing with RapidJSON Insitu", rapidstringmeInsitu((char *)buffer),
memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
BEST_TIME_NOCHECK("despacing with RapidJSON",
rapid_stringme((char *)p.data()), , repeat, volume,
!just_data);
BEST_TIME_NOCHECK(
"despacing with RapidJSON Insitu", rapid_stringme_insitu((char *)buffer),
memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data);
memcpy(buffer, p.data(), p.size());
size_t outlength =
simdjson::jsonminify((const uint8_t *)buffer, p.size(), (uint8_t *)buffer);
size_t outlength = simdjson::json_minify((const uint8_t *)buffer, p.size(),
(uint8_t *)buffer);
if (verbose)
std::cout << "jsonminify length is " << outlength << std::endl;
std::cout << "json_minify length is " << outlength << std::endl;
uint8_t *cbuffer = (uint8_t *)buffer;
BEST_TIME("jsonminify", simdjson::jsonminify(cbuffer, p.size(), cbuffer), outlength,
memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
printf("minisize = %zu, original size = %zu (minified down to %.2f percent of original) \n", outlength, p.size(), outlength * 100.0 / p.size());
BEST_TIME("json_minify", simdjson::json_minify(cbuffer, p.size(), cbuffer),
outlength, memcpy(buffer, p.data(), p.size()), repeat, volume,
!just_data);
printf("minisize = %zu, original size = %zu (minified down to %.2f percent "
"of original) \n",
outlength, p.size(), outlength * 100.0 / p.size());
/***
* Is it worth it to minify before parsing?
***/
rapidjson::Document d;
BEST_TIME("RapidJSON Insitu orig", d.ParseInsitu(buffer).HasParseError(), false,
memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
BEST_TIME("RapidJSON Insitu orig", d.ParseInsitu(buffer).HasParseError(),
false, memcpy(buffer, p.data(), p.size()), repeat, volume,
!just_data);
char *minibuffer = simdjson::allocate_padded_buffer(p.size() + 1);
size_t minisize = simdjson::jsonminify((const uint8_t *)p.data(), p.size(), (uint8_t*) minibuffer);
minibuffer[minisize] = '\0';
char *mini_buffer = simdjson::allocate_padded_buffer(p.size() + 1);
size_t minisize = simdjson::json_minify((const uint8_t *)p.data(), p.size(),
(uint8_t *)mini_buffer);
mini_buffer[minisize] = '\0';
BEST_TIME("RapidJSON Insitu despaced", d.ParseInsitu(buffer).HasParseError(), false,
memcpy(buffer, minibuffer, p.size()),
repeat, volume, !justdata);
BEST_TIME("RapidJSON Insitu despaced", d.ParseInsitu(buffer).HasParseError(),
false, memcpy(buffer, mini_buffer, p.size()), repeat, volume,
!just_data);
size_t astbuffersize = p.size() * 2;
size_t * ast_buffer = (size_t *) malloc(astbuffersize * sizeof(size_t));
size_t ast_buffer_size = p.size() * 2;
size_t *ast_buffer = (size_t *)malloc(ast_buffer_size * sizeof(size_t));
BEST_TIME("sajson orig", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
BEST_TIME(
"sajson orig",
sajson::parse(sajson::bounded_allocation(ast_buffer, ast_buffer_size),
sajson::mutable_string_view(p.size(), buffer))
.is_valid(),
true, memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data);
BEST_TIME("sajson despaced", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(minisize, buffer)).is_valid(), true, memcpy(buffer, minibuffer, p.size()), repeat, volume, !justdata);
BEST_TIME(
"sajson despaced",
sajson::parse(sajson::bounded_allocation(ast_buffer, ast_buffer_size),
sajson::mutable_string_view(minisize, buffer))
.is_valid(),
true, memcpy(buffer, mini_buffer, p.size()), repeat, volume, !just_data);
simdjson::ParsedJson pj;
bool isallocok = pj.allocateCapacity(p.size(), 1024);
if(!isallocok) {
bool is_alloc_ok = pj.allocate_capacity(p.size(), 1024);
if (!is_alloc_ok) {
fprintf(stderr, "failed to allocate memory\n");
return EXIT_FAILURE;
}
bool automated_reallocation = false;
BEST_TIME("simdjson orig", simdjson::json_parse((const uint8_t*)buffer, p.size(), pj, automated_reallocation), true, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
}
bool automated_reallocation = false;
BEST_TIME("simdjson orig",
simdjson::json_parse((const uint8_t *)buffer, p.size(), pj,
automated_reallocation),
true, memcpy(buffer, p.data(), p.size()), repeat, volume,
!just_data);
simdjson::ParsedJson pj2;
bool isallocok2 = pj2.allocateCapacity(p.size(), 1024);
if(!isallocok2) {
bool is_alloc_ok2 = pj2.allocate_capacity(p.size(), 1024);
if (!is_alloc_ok2) {
fprintf(stderr, "failed to allocate memory\n");
return EXIT_FAILURE;
}
automated_reallocation = false;
BEST_TIME("simdjson despaced", simdjson::json_parse((const uint8_t*)buffer, minisize, pj2, automated_reallocation), true, memcpy(buffer, minibuffer, p.size()), repeat, volume, !justdata);
}
automated_reallocation = false;
BEST_TIME("simdjson despaced",
simdjson::json_parse((const uint8_t *)buffer, minisize, pj2,
automated_reallocation),
true, memcpy(buffer, mini_buffer, p.size()), repeat, volume,
!just_data);
free(buffer);
free(ast_buffer);
free(minibuffer);
free(mini_buffer);
}

View File

@ -28,57 +28,58 @@
#endif
//#define DEBUG
#include "simdjson/common_defs.h"
#include "simdjson/isadetection.h"
#include "simdjson/jsonioutil.h"
#include "simdjson/jsonparser.h"
#include "simdjson/parsedjson.h"
#include "simdjson/stage1_find_marks.h"
#include "simdjson/stage2_build_tape.h"
#include "simdjson/isadetection.h"
namespace simdjson {
// Picks the implementation to dispatch to, based on the bitmask returned by
// detect_supported_architectures(). A candidate is chosen only when every
// flag in its mask is present in `supports`; Haswell is tested before
// Westmere.
// (Diff listing: old lines using `architecture` / `SIMDExtensions` are
// interleaved with the renamed `Architecture` / `instruction_set` lines.)
architecture _find_best_supported_implementation() {
constexpr uint32_t haswell_flags = SIMDExtensions::AVX2 | SIMDExtensions::PCLMULQDQ
| SIMDExtensions::BMI1 | SIMDExtensions::BMI2;
constexpr uint32_t westmere_flags = SIMDExtensions::SSE42 | SIMDExtensions::PCLMULQDQ;
Architecture _find_best_supported_implementation() {
constexpr uint32_t haswell_flags =
instruction_set::AVX2 | instruction_set::PCLMULQDQ |
instruction_set::BMI1 | instruction_set::BMI2;
constexpr uint32_t westmere_flags =
instruction_set::SSE42 | instruction_set::PCLMULQDQ;
uint32_t supports = detect_supported_architectures();
// Order from best to worst (within architecture)
if ((haswell_flags & supports) == haswell_flags) {
return architecture::haswell;
return Architecture::HASWELL;
}
if ((westmere_flags & supports) == westmere_flags) {
return architecture::westmere;
return Architecture::WESTMERE;
}
// NOTE(review): unlike the two checks above, this condition does not consult
// `supports`; it tests the NEON flag constant itself. If that constant is
// non-zero the branch is always taken and the NONE return below is never
// reached. The intended test is presumably `supports & NEON` -- confirm.
if (SIMDExtensions::NEON) return architecture::arm64;
if (instruction_set::NEON)
return Architecture::ARM64;
return architecture::none;
return Architecture::NONE;
}
using unified_functype = int (const uint8_t *buf, size_t len, ParsedJson &pj);
using stage1_functype = int (const uint8_t *buf, size_t len, ParsedJson &pj);
using unified_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj);
using stage1_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj);
extern unified_functype *unified_ptr;
extern stage1_functype *stage1_ptr;
int unified_machine_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj) {
architecture best_implementation = _find_best_supported_implementation();
Architecture best_implementation = _find_best_supported_implementation();
// Selecting the best implementation
switch (best_implementation) {
#ifdef IS_X86_64
case architecture::haswell:
unified_ptr = &unified_machine<architecture::haswell>;
case Architecture::HASWELL:
unified_ptr = &unified_machine<Architecture::HASWELL>;
break;
case architecture::westmere:
unified_ptr = &unified_machine<architecture::westmere>;
case Architecture::WESTMERE:
unified_ptr = &unified_machine<Architecture::WESTMERE>;
break;
#endif
#ifdef IS_ARM64
case architecture::arm64:
unified_ptr = &unified_machine<architecture::arm64>;
case Architecture::ARM64:
unified_ptr = &unified_machine<Architecture::ARM64>;
break;
#endif
default :
default:
std::cerr << "The processor is not supported by simdjson." << std::endl;
return simdjson::UNEXPECTED_ERROR;
}
@ -87,24 +88,25 @@ int unified_machine_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj) {
}
// Responsible to select the best json_parse implementation
int find_structural_bits_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj) {
architecture best_implementation = _find_best_supported_implementation();
int find_structural_bits_dispatch(const uint8_t *buf, size_t len,
ParsedJson &pj) {
Architecture best_implementation = _find_best_supported_implementation();
// Selecting the best implementation
switch (best_implementation) {
#ifdef IS_X86_64
case architecture::haswell:
stage1_ptr = &find_structural_bits<architecture::haswell>;
case Architecture::HASWELL:
stage1_ptr = &find_structural_bits<Architecture::HASWELL>;
break;
case architecture::westmere:
stage1_ptr = &find_structural_bits<architecture::westmere>;
case Architecture::WESTMERE:
stage1_ptr = &find_structural_bits<Architecture::WESTMERE>;
break;
#endif
#ifdef IS_ARM64
case architecture::arm64:
stage1_ptr = &find_structural_bits<architecture::arm64>;
case Architecture::ARM64:
stage1_ptr = &find_structural_bits<Architecture::ARM64>;
break;
#endif
default :
default:
std::cerr << "The processor is not supported by simdjson." << std::endl;
return simdjson::UNEXPECTED_ERROR;
}
@ -114,23 +116,21 @@ int find_structural_bits_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj
stage1_functype *stage1_ptr = &find_structural_bits_dispatch;
unified_functype *unified_ptr = &unified_machine_dispatch;
}
} // namespace simdjson
int main(int argc, char *argv[]) {
bool verbose = false;
bool dump = false;
bool jsonoutput = false;
bool forceoneiteration = false;
bool justdata = false;
bool json_output = false;
bool force_one_iteration = false;
bool just_data = false;
#ifndef _MSC_VER
int c;
while ((c = getopt(argc, argv, "1vdt")) != -1) {
switch (c) {
case 't':
justdata = true;
just_data = true;
break;
case 'v':
verbose = true;
@ -139,15 +139,15 @@ int main(int argc, char *argv[]) {
dump = true;
break;
case 'j':
jsonoutput = true;
json_output = true;
break;
case '1':
forceoneiteration = true;
force_one_iteration = true;
break;
default:
abort();
}
}
}
#else
int optind = 1;
#endif
@ -157,7 +157,8 @@ int main(int argc, char *argv[]) {
}
const char *filename = argv[optind];
if (optind + 1 < argc) {
std::cerr << "warning: ignoring everything after " << argv[optind + 1] << std::endl;
std::cerr << "warning: ignoring everything after " << argv[optind + 1]
<< std::endl;
}
if (verbose) {
std::cout << "[verbose] loading " << filename << std::endl;
@ -170,30 +171,41 @@ int main(int argc, char *argv[]) {
return EXIT_FAILURE;
}
if (verbose) {
std::cout << "[verbose] loaded " << filename << " (" << p.size() << " bytes)"
<< std::endl;
}
std::cout << "[verbose] loaded " << filename << " (" << p.size()
<< " bytes)" << std::endl;
}
#if defined(DEBUG)
const uint32_t iterations = 1;
#else
const uint32_t iterations =
forceoneiteration ? 1 : (p.size() < 1 * 1000 * 1000 ? 1000 : 10);
force_one_iteration ? 1 : (p.size() < 1 * 1000 * 1000 ? 1000 : 10);
#endif
std::vector<double> res;
res.resize(iterations);
if(!justdata) printf("number of iterations %u \n", iterations);
if (!just_data)
printf("number of iterations %u \n", iterations);
#if !defined(__linux__)
#define SQUASH_COUNTERS
if (justdata) {
printf("justdata (-t) flag only works under linux.\n");
if (just_data) {
printf("just_data (-t) flag only works under linux.\n");
}
#endif
{// practice run
{ // practice run
simdjson::ParsedJson pj;
bool allocok = pj.allocateCapacity(p.size());
if(allocok) {
simdjson::stage1_ptr((const uint8_t*)p.data(), p.size(), pj);
simdjson::unified_ptr((const uint8_t*)(const uint8_t*)(const uint8_t*)(const uint8_t*)(const uint8_t*)(const uint8_t*)(const uint8_t*)(const uint8_t*)p.data(), p.size(), pj);
bool allocok = pj.allocate_capacity(p.size());
if (allocok) {
simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj);
simdjson::unified_ptr(
(const uint8_t
*)(const uint8_t
*)(const uint8_t
*)(const uint8_t
*)(const uint8_t
*)(const uint8_t
*)(const uint8_t
*)(const uint8_t *)
p.data(),
p.size(), pj);
}
}
#ifndef SQUASH_COUNTERS
@ -220,7 +232,7 @@ int main(int argc, char *argv[]) {
}
unified.start();
simdjson::ParsedJson pj;
bool allocok = pj.allocateCapacity(p.size());
bool allocok = pj.allocate_capacity(p.size());
if (!allocok) {
std::cerr << "failed to allocate memory" << std::endl;
return EXIT_FAILURE;
@ -235,7 +247,8 @@ int main(int argc, char *argv[]) {
std::cout << "[verbose] allocated memory for parsed JSON " << std::endl;
}
unified.start();
isok = (simdjson::stage1_ptr((const uint8_t*)p.data(), p.size(), pj) == simdjson::SUCCESS);
isok = (simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj) ==
simdjson::SUCCESS);
unified.end(results);
cy1 += results[0];
cl1 += results[1];
@ -247,7 +260,9 @@ int main(int argc, char *argv[]) {
break;
}
unified.start();
isok = isok && (simdjson::SUCCESS == simdjson::unified_ptr((const uint8_t*)p.data(), p.size(), pj));
isok = isok &&
(simdjson::SUCCESS ==
simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj));
unified.end(results);
cy2 += results[0];
cl2 += results[1];
@ -266,7 +281,7 @@ int main(int argc, char *argv[]) {
std::cout << "[verbose] iteration # " << i << std::endl;
}
simdjson::ParsedJson pj;
bool allocok = pj.allocateCapacity(p.size());
bool allocok = pj.allocate_capacity(p.size());
if (!allocok) {
std::cerr << "failed to allocate memory" << std::endl;
return EXIT_FAILURE;
@ -276,20 +291,24 @@ int main(int argc, char *argv[]) {
}
auto start = std::chrono::steady_clock::now();
isok = (simdjson::stage1_ptr((const uint8_t*)p.data(), p.size(), pj) == simdjson::SUCCESS);
isok = isok && (simdjson::SUCCESS == simdjson::unified_ptr((const uint8_t*)p.data(), p.size(), pj));
isok = (simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj) ==
simdjson::SUCCESS);
isok = isok &&
(simdjson::SUCCESS ==
simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj));
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> secs = end - start;
res[i] = secs.count();
if(! isok) {
std::cerr << pj.getErrorMsg() << std::endl;
if (!isok) {
std::cerr << pj.get_error_message() << std::endl;
std::cerr << "Could not parse. " << std::endl;
return EXIT_FAILURE;
}
}
simdjson::ParsedJson pj = build_parsed_json(p); // do the parsing again to get the stats
if (!pj.isValid()) {
std::cerr << pj.getErrorMsg() << std::endl;
}
simdjson::ParsedJson pj =
build_parsed_json(p); // do the parsing again to get the stats
if (!pj.is_valid()) {
std::cerr << pj.get_error_message() << std::endl;
std::cerr << "Could not parse. " << std::endl;
return EXIT_FAILURE;
}
@ -297,7 +316,7 @@ int main(int argc, char *argv[]) {
double speedinGBs = (p.size()) / (min_result * 1000000000.0);
#ifndef SQUASH_COUNTERS
unsigned long total = cy0 + cy1 + cy2;
if (justdata) {
if (just_data) {
float cpb0 = (double)cy0 / (iterations * p.size());
float cpb1 = (double)cy1 / (iterations * p.size());
float cpb2 = (double)cy2 / (iterations * p.size());
@ -315,8 +334,8 @@ int main(int argc, char *argv[]) {
break;
}
}
printf("\"%s\"\t%f\t%f\t%f\t%f\t%f\n", snewfile, cpb0, cpb1, cpb2,
cpbtotal, speedinGBs);
printf("\"%s\"\t%f\t%f\t%f\t%f\t%f\n", snewfile, cpb0, cpb1, cpb2, cpbtotal,
speedinGBs);
free(newfile);
} else {
printf("number of bytes %ld number of structural chars %u ratio %.3f\n",
@ -352,16 +371,16 @@ int main(int argc, char *argv[]) {
printf(" all stages: %.2f cycles per input byte.\n",
(double)total / (iterations * p.size()));
printf("Estimated average frequency: %.3f GHz.\n", (double)total / (iterations * min_result * 1000000000.0));
printf("Estimated average frequency: %.3f GHz.\n",
(double)total / (iterations * min_result * 1000000000.0));
}
#endif
if (!justdata) {
if (!just_data) {
std::cout << "Min: " << min_result << " bytes read: " << p.size()
<< " Gigabytes/second: " << speedinGBs
<< std::endl;
<< " Gigabytes/second: " << speedinGBs << std::endl;
}
if (jsonoutput) {
isok = isok && pj.printjson(std::cout);
if (json_output) {
isok = isok && pj.print_json(std::cout);
}
if (dump) {
isok = isok && pj.dump_raw_tape(std::cout);

View File

@ -43,11 +43,11 @@ void print_stat(const stat_t &s) {
s.true_count, s.false_count);
}
__attribute__ ((noinline))
stat_t simdjson_computestats(const simdjson::padded_string &p) {
__attribute__((noinline)) stat_t
simdjson_compute_stats(const simdjson::padded_string &p) {
stat_t answer;
simdjson::ParsedJson pj = build_parsed_json(p);
answer.valid = pj.isValid();
answer.valid = pj.is_valid();
if (!answer.valid) {
return answer;
}
@ -57,24 +57,24 @@ stat_t simdjson_computestats(const simdjson::padded_string &p) {
answer.null_count = 0;
answer.true_count = 0;
answer.false_count = 0;
size_t tapeidx = 0;
uint64_t tape_val = pj.tape[tapeidx++];
size_t tape_idx = 0;
uint64_t tape_val = pj.tape[tape_idx++];
uint8_t type = (tape_val >> 56);
size_t howmany = 0;
size_t how_many = 0;
assert(type == 'r');
howmany = tape_val & JSONVALUEMASK;
for (; tapeidx < howmany; tapeidx++) {
tape_val = pj.tape[tapeidx];
// uint64_t payload = tape_val & JSONVALUEMASK;
how_many = tape_val & JSON_VALUE_MASK;
for (; tape_idx < how_many; tape_idx++) {
tape_val = pj.tape[tape_idx];
// uint64_t payload = tape_val & JSON_VALUE_MASK;
type = (tape_val >> 56);
switch (type) {
case 'l': // we have a long int
answer.number_count++;
tapeidx++; // skipping the integer
tape_idx++; // skipping the integer
break;
case 'd': // we have a double
answer.number_count++;
tapeidx++; // skipping the double
tape_idx++; // skipping the double
break;
case 'n': // we have a null
answer.null_count++;
@ -145,8 +145,8 @@ void sajson_traverse(stat_t &stats, const sajson::value &node) {
}
}
__attribute__ ((noinline))
stat_t sasjon_computestats(const simdjson::padded_string &p) {
__attribute__((noinline)) stat_t
sasjon_compute_stats(const simdjson::padded_string &p) {
stat_t answer;
char *buffer = (char *)malloc(p.size());
memcpy(buffer, p.data(), p.size());
@ -203,8 +203,8 @@ void rapid_traverse(stat_t &stats, const rapidjson::Value &v) {
}
}
__attribute__ ((noinline))
stat_t rapid_computestats(const simdjson::padded_string &p) {
__attribute__((noinline)) stat_t
rapid_compute_stats(const simdjson::padded_string &p) {
stat_t answer;
char *buffer = (char *)malloc(p.size() + 1);
memcpy(buffer, p.data(), p.size());
@ -228,13 +228,13 @@ stat_t rapid_computestats(const simdjson::padded_string &p) {
int main(int argc, char *argv[]) {
bool verbose = false;
bool justdata = false;
bool just_data = false;
int c;
while ((c = getopt(argc, argv, "vt")) != -1)
switch (c) {
case 't':
justdata = true;
just_data = true;
break;
case 'v':
verbose = true;
@ -243,15 +243,18 @@ int main(int argc, char *argv[]) {
abort();
}
if (optind >= argc) {
std::cerr << "Using different parsers, we compute the content statistics of "
"JSON documents." << std::endl;
std::cerr
<< "Using different parsers, we compute the content statistics of "
"JSON documents."
<< std::endl;
std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
std::cerr << "Or " << argv[0] << " -v <jsonfile>" << std::endl;
exit(1);
}
const char *filename = argv[optind];
if (optind + 1 < argc) {
std::cerr << "warning: ignoring everything after " << argv[optind + 1] << std::endl;
std::cerr << "warning: ignoring everything after " << argv[optind + 1]
<< std::endl;
}
simdjson::padded_string p;
try {
@ -271,17 +274,17 @@ int main(int argc, char *argv[]) {
std::cout << p.size() << " B ";
std::cout << std::endl;
}
stat_t s1 = simdjson_computestats(p);
stat_t s1 = simdjson_compute_stats(p);
if (verbose) {
printf("simdjson: ");
print_stat(s1);
}
stat_t s2 = rapid_computestats(p);
stat_t s2 = rapid_compute_stats(p);
if (verbose) {
printf("rapid: ");
print_stat(s2);
}
stat_t s3 = sasjon_computestats(p);
stat_t s3 = sasjon_compute_stats(p);
if (verbose) {
printf("sasjon: ");
print_stat(s3);
@ -290,13 +293,13 @@ int main(int argc, char *argv[]) {
assert(stat_equal(s1, s3));
int repeat = 50;
int volume = p.size();
if(justdata) {
if (just_data) {
printf("name cycles_per_byte cycles_per_byte_err gb_per_s gb_per_s_err \n");
}
BEST_TIME("simdjson ", simdjson_computestats(p).valid, true, , repeat,
volume, !justdata);
BEST_TIME("RapidJSON ", rapid_computestats(p).valid, true, , repeat, volume,
!justdata);
BEST_TIME("sasjon ", sasjon_computestats(p).valid, true, , repeat, volume,
!justdata);
BEST_TIME("simdjson ", simdjson_compute_stats(p).valid, true, , repeat,
volume, !just_data);
BEST_TIME("RapidJSON ", rapid_compute_stats(p).valid, true, , repeat, volume,
!just_data);
BEST_TIME("sasjon ", sasjon_compute_stats(p).valid, true, , repeat, volume,
!just_data);
}

View File

@ -59,12 +59,12 @@ bool fastjson_parse(const char *input) {
int main(int argc, char *argv[]) {
bool verbose = false;
bool justdata = false;
bool just_data = false;
int c;
while ((c = getopt(argc, argv, "vt")) != -1)
switch (c) {
case 't':
justdata = true;
just_data = true;
break;
case 'v':
verbose = true;
@ -102,24 +102,24 @@ int main(int argc, char *argv[]) {
std::cout << std::endl;
}
simdjson::ParsedJson pj;
bool allocok = pj.allocateCapacity(p.size(), 1024);
bool allocok = pj.allocate_capacity(p.size(), 1024);
if (!allocok) {
std::cerr << "can't allocate memory" << std::endl;
return EXIT_FAILURE;
}
int repeat = (p.size() < 1 * 1000 * 1000 ? 1000 : 10);
int repeat = (p.size() < 1 * 1000 * 1000 ? 1000 : 10);
int volume = p.size();
if (justdata) {
if (just_data) {
printf("%-42s %20s %20s %20s %20s \n", "name", "cycles_per_byte",
"cycles_per_byte_err", "gb_per_s", "gb_per_s_err");
}
if (!justdata)
BEST_TIME("simdjson (dynamic mem) ", build_parsed_json(p).isValid(), true, ,
repeat, volume, !justdata);
if (!just_data)
BEST_TIME("simdjson (dynamic mem) ", build_parsed_json(p).is_valid(), true,
, repeat, volume, !just_data);
// (static alloc)
BEST_TIME("simdjson ", json_parse(p, pj), simdjson::SUCCESS, , repeat, volume,
!justdata);
!just_data);
rapidjson::Document d;
@ -127,56 +127,57 @@ int main(int argc, char *argv[]) {
memcpy(buffer, p.data(), p.size());
buffer[p.size()] = '\0';
#ifndef ALLPARSER
if (!justdata)
if (!just_data)
#endif
BEST_TIME(
"RapidJSON ", d.Parse<kParseValidateEncodingFlag>((const char *)buffer)
.HasParseError(),
false, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
BEST_TIME("RapidJSON ",
d.Parse<kParseValidateEncodingFlag>((const char *)buffer)
.HasParseError(),
false, memcpy(buffer, p.data(), p.size()), repeat, volume,
!just_data);
BEST_TIME("RapidJSON (insitu)",
d.ParseInsitu<kParseValidateEncodingFlag>(buffer).HasParseError(),
false,
memcpy(buffer, p.data(), p.size()) && (buffer[p.size()] = '\0'),
repeat, volume, !justdata);
repeat, volume, !just_data);
#ifndef ALLPARSER
if (!justdata)
if (!just_data)
#endif
BEST_TIME("sajson (dynamic mem)",
sajson::parse(sajson::dynamic_allocation(),
sajson::mutable_string_view(p.size(), buffer))
.is_valid(),
true, memcpy(buffer, p.data(), p.size()), repeat, volume,
!justdata);
!just_data);
size_t astbuffersize = p.size();
size_t *ast_buffer = (size_t *)malloc(astbuffersize * sizeof(size_t));
size_t ast_buffer_size = p.size();
size_t *ast_buffer = (size_t *)malloc(ast_buffer_size * sizeof(size_t));
// (static alloc, insitu)
BEST_TIME("sajson",
sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize),
sajson::mutable_string_view(p.size(), buffer))
.is_valid(),
true, memcpy(buffer, p.data(), p.size()), repeat, volume,
!justdata);
BEST_TIME(
"sajson",
sajson::parse(sajson::bounded_allocation(ast_buffer, ast_buffer_size),
sajson::mutable_string_view(p.size(), buffer))
.is_valid(),
true, memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data);
#ifdef ALLPARSER
std::string json11err;
BEST_TIME("dropbox (json11) ",
((json11::Json::parse(buffer, json11err).is_null()) ||
(!json11err.empty())),
false, memcpy(buffer, p.data(), p.size()), repeat, volume,
!justdata);
!just_data);
BEST_TIME("fastjson ", fastjson_parse(buffer), true,
memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data);
JsonValue value;
JsonAllocator allocator;
char *endptr;
BEST_TIME("gason ", jsonParse(buffer, &endptr, &value, allocator),
JSON_OK, memcpy(buffer, p.data(), p.size()), repeat, volume,
!justdata);
!just_data);
void *state;
BEST_TIME("ultrajson ",
(UJDecode(buffer, p.size(), NULL, &state) == NULL), false,
memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data);
{
std::unique_ptr<jsmntok_t[]> tokens =
@ -185,32 +186,33 @@ int main(int argc, char *argv[]) {
jsmn_init(&parser);
memcpy(buffer, p.data(), p.size());
buffer[p.size()] = '\0';
BEST_TIME("jsmn ", (jsmn_parse(&parser, buffer, p.size(),
tokens.get(), p.size()) > 0),
true, jsmn_init(&parser), repeat, volume, !justdata);
BEST_TIME(
"jsmn ",
(jsmn_parse(&parser, buffer, p.size(), tokens.get(), p.size()) > 0),
true, jsmn_init(&parser), repeat, volume, !just_data);
}
memcpy(buffer, p.data(), p.size());
buffer[p.size()] = '\0';
cJSON *tree = cJSON_Parse(buffer);
BEST_TIME("cJSON ", ((tree = cJSON_Parse(buffer)) != NULL), true,
cJSON_Delete(tree), repeat, volume, !justdata);
cJSON_Delete(tree), repeat, volume, !just_data);
cJSON_Delete(tree);
Json::CharReaderBuilder b;
Json::CharReader *jsoncppreader = b.newCharReader();
Json::CharReader *json_cpp_reader = b.newCharReader();
Json::Value root;
Json::String errs;
BEST_TIME("jsoncpp ",
jsoncppreader->parse(buffer, buffer + volume, &root, &errs), true, ,
repeat, volume, !justdata);
delete jsoncppreader;
json_cpp_reader->parse(buffer, buffer + volume, &root, &errs), true,
, repeat, volume, !just_data);
delete json_cpp_reader;
#endif
if (!justdata)
if (!just_data)
BEST_TIME("memcpy ",
(memcpy(buffer, p.data(), p.size()) == buffer), true, , repeat,
volume, !justdata);
volume, !just_data);
#ifdef __linux__
if (!justdata) {
if (!just_data) {
printf("\n \n <doing additional analysis with performance counters (Linux "
"only)>\n");
std::vector<int> evts;
@ -265,7 +267,7 @@ int main(int argc, char *argv[]) {
for (int i = 0; i < repeat; i++) {
memcpy(buffer, p.data(), p.size());
unified.start();
if (sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize),
if (sajson::parse(sajson::bounded_allocation(ast_buffer, ast_buffer_size),
sajson::mutable_string_view(p.size(), buffer))
.is_valid() != true)
printf("bug\n");

View File

@ -1,5 +1,5 @@
#include <iostream>
#ifndef _MSC_VER
#ifndef _MSC_VER
#include <unistd.h>
#endif
#include "simdjson/jsonioutil.h"
@ -29,7 +29,7 @@ struct stat_s {
size_t float_count;
size_t string_count;
size_t backslash_count;
size_t nonasciibyte_count;
size_t non_ascii_byte_count;
size_t object_count;
size_t array_count;
size_t null_count;
@ -42,16 +42,17 @@ struct stat_s {
using stat_t = struct stat_s;
stat_t simdjson_computestats(const simdjson::padded_string &p) {
stat_t simdjson_compute_stats(const simdjson::padded_string &p) {
stat_t answer;
simdjson::ParsedJson pj = simdjson::build_parsed_json(p);
answer.valid = pj.isValid();
answer.valid = pj.is_valid();
if (!answer.valid) {
return answer;
}
answer.backslash_count = count_backslash(reinterpret_cast<const uint8_t *>(p.data()), p.size());
answer.nonasciibyte_count =
count_nonasciibytes(reinterpret_cast<const uint8_t *>(p.data()), p.size());
answer.backslash_count =
count_backslash(reinterpret_cast<const uint8_t *>(p.data()), p.size());
answer.non_ascii_byte_count = count_nonasciibytes(
reinterpret_cast<const uint8_t *>(p.data()), p.size());
answer.byte_count = p.size();
answer.integer_count = 0;
answer.float_count = 0;
@ -62,24 +63,24 @@ stat_t simdjson_computestats(const simdjson::padded_string &p) {
answer.false_count = 0;
answer.string_count = 0;
answer.structural_indexes_count = pj.n_structural_indexes;
size_t tapeidx = 0;
uint64_t tape_val = pj.tape[tapeidx++];
size_t tape_idx = 0;
uint64_t tape_val = pj.tape[tape_idx++];
uint8_t type = (tape_val >> 56);
size_t howmany = 0;
size_t how_many = 0;
assert(type == 'r');
howmany = tape_val & JSONVALUEMASK;
for (; tapeidx < howmany; tapeidx++) {
tape_val = pj.tape[tapeidx];
// uint64_t payload = tape_val & JSONVALUEMASK;
how_many = tape_val & JSON_VALUE_MASK;
for (; tape_idx < how_many; tape_idx++) {
tape_val = pj.tape[tape_idx];
// uint64_t payload = tape_val & JSON_VALUE_MASK;
type = (tape_val >> 56);
switch (type) {
case 'l': // we have a long int
answer.integer_count++;
tapeidx++; // skipping the integer
tape_idx++; // skipping the integer
break;
case 'd': // we have a double
answer.float_count++;
tapeidx++; // skipping the double
tape_idx++; // skipping the double
break;
case 'n': // we have a null
answer.null_count++;
@ -112,14 +113,14 @@ stat_t simdjson_computestats(const simdjson::padded_string &p) {
int main(int argc, char *argv[]) {
#ifndef _MSC_VER
int c;
while ((c = getopt(argc, argv, "")) != -1) {
int c;
while ((c = getopt(argc, argv, "")) != -1) {
switch (c) {
default:
abort();
}
}
}
#else
int optind = 1;
#endif
@ -141,30 +142,30 @@ int main(int argc, char *argv[]) {
std::cerr << "Could not load the file " << filename << std::endl;
return EXIT_FAILURE;
}
stat_t s = simdjson_computestats(p);
stat_t s = simdjson_compute_stats(p);
if (!s.valid) {
std::cerr << "not a valid JSON" << std::endl;
return EXIT_FAILURE;
}
printf("# integer_count float_count string_count backslash_count "
"nonasciibyte_count object_count array_count null_count true_count "
"non_ascii_byte_count object_count array_count null_count true_count "
"false_count byte_count structural_indexes_count ");
#ifdef __linux__
printf(
" stage1_cycle_count stage1_instruction_count stage2_cycle_count "
" stage2_instruction_count stage3_cycle_count stage3_instruction_count ");
printf(" stage1_cycle_count stage1_instruction_count stage2_cycle_count "
" stage2_instruction_count stage3_cycle_count "
"stage3_instruction_count ");
#else
// Tell non-Linux users why the perf-counter columns are missing from the
// header line (typo "disaabled" fixed in the message).
printf("(you are not under linux, so perf counters are disabled)");
#endif
printf("\n");
printf("%zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu ", s.integer_count,
s.float_count, s.string_count, s.backslash_count, s.nonasciibyte_count,
s.object_count, s.array_count, s.null_count, s.true_count,
s.false_count, s.byte_count, s.structural_indexes_count);
s.float_count, s.string_count, s.backslash_count,
s.non_ascii_byte_count, s.object_count, s.array_count, s.null_count,
s.true_count, s.false_count, s.byte_count, s.structural_indexes_count);
#ifdef __linux__
simdjson::ParsedJson pj;
bool allocok = pj.allocateCapacity(p.size());
bool allocok = pj.allocate_capacity(p.size());
if (!allocok) {
std::cerr << "failed to allocate memory" << std::endl;
return EXIT_FAILURE;
@ -180,20 +181,22 @@ int main(int argc, char *argv[]) {
results.resize(evts.size());
for (uint32_t i = 0; i < iterations; i++) {
unified.start();
// The default template is simdjson::architecture::native.
bool isok = (simdjson::find_structural_bits<>(p.data(), p.size(), pj) == simdjson::SUCCESS);
// The default template is simdjson::Architecture::NATIVE.
bool isok = (simdjson::find_structural_bits<>(p.data(), p.size(), pj) ==
simdjson::SUCCESS);
unified.end(results);
cy1 += results[0];
cl1 += results[1];
unified.start();
isok = isok && (simdjson::SUCCESS == unified_machine(p.data(), p.size(), pj));
isok =
isok && (simdjson::SUCCESS == unified_machine(p.data(), p.size(), pj));
unified.end(results);
cy2 += results[0];
cl2 += results[1];
if(!isok) {
if (!isok) {
std::cerr << "failure?" << std::endl;
}
}

View File

@ -10,11 +10,11 @@
// the input buf should be readable up to buf + SIMDJSON_PADDING
#ifdef __AVX2__
#define SIMDJSON_PADDING sizeof(__m256i)
#define SIMDJSON_PADDING sizeof(__m256i)
#else
// this is a stopgap; there should be a better description of the
// main loop and its behavior that abstracts over this
#define SIMDJSON_PADDING 32
#define SIMDJSON_PADDING 32
#endif
#ifndef _MSC_VER
@ -23,7 +23,6 @@
#define SIMDJSON_USE_COMPUTED_GOTO
#endif
// Align to N-byte boundary
#define ROUNDUP_N(a, n) (((a) + ((n)-1)) & ~((n)-1))
#define ROUNDDOWN_N(a, n) ((a) & ~((n)-1))
@ -49,13 +48,13 @@
#else
// For non-Visual Studio compilers, we may assume that same-page buffer overrun is fine.
// However, it will make it difficult to be "valgrind clean".
// For non-Visual Studio compilers, we may assume that same-page buffer overrun
// is fine. However, it will make it difficult to be "valgrind clean".
//#ifndef ALLOW_SAME_PAGE_BUFFER_OVERRUN
//#define ALLOW_SAME_PAGE_BUFFER_OVERRUN true
//#else
#define ALLOW_SAME_PAGE_BUFFER_OVERRUN false
//#endif
//#endif
// The following is likely unnecessarily complex.
#ifdef __SANITIZE_ADDRESS__
@ -63,16 +62,18 @@
#define ALLOW_SAME_PAGE_BUFFER_OVERRUN false
#elif defined(__has_feature)
// we have CLANG?
// todo: if we're setting ALLOW_SAME_PAGE_BUFFER_OVERRUN to false, why do we have a non-empty qualifier?
# if (__has_feature(address_sanitizer))
#define ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER __attribute__((no_sanitize("address")))
# endif
#endif
// todo: if we're setting ALLOW_SAME_PAGE_BUFFER_OVERRUN to false, why do we
// have a non-empty qualifier?
#if (__has_feature(address_sanitizer))
#define ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER \
__attribute__((no_sanitize("address")))
#endif
#endif
#if defined(__has_feature)
# if (__has_feature(memory_sanitizer))
#if (__has_feature(memory_sanitizer))
#define LENIENT_MEM_SANITIZER __attribute__((no_sanitize("memory")))
# endif
#endif
#endif
#define really_inline inline __attribute__((always_inline, unused))
@ -88,7 +89,7 @@
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif
#endif // MSC_VER
#endif // MSC_VER
// if it does not apply, make it an empty macro
#ifndef ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER

View File

@ -1,5 +1,6 @@
/* From https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h
Highly modified.
/* From
https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h
Highly modified.
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
@ -7,9 +8,10 @@ Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio,
Johnny Mariethoz)
All rights reserved.
@ -23,8 +25,8 @@ modification, are permitted provided that the following conditions are met:
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
and IDIAP Research Institute nor the names of its contributors may be
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories
America and IDIAP Research Institute nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
@ -60,51 +62,48 @@ constexpr uint32_t cpuid_bmi2_bit = 1 << 8; // bit 8 of EBX for EAX=0x7
constexpr uint32_t cpuid_sse42_bit = 1 << 20; // bit 20 of ECX for EAX=0x1
constexpr uint32_t cpuid_pclmulqdq_bit = 1 << 1; // bit 1 of ECX for EAX=0x1
enum SIMDExtensions {
DEFAULT = 0x0,
NEON = 0x1,
AVX2 = 0x4,
SSE42 = 0x8,
enum instruction_set {
DEFAULT = 0x0,
NEON = 0x1,
AVX2 = 0x4,
SSE42 = 0x8,
PCLMULQDQ = 0x10,
BMI1 = 0x20,
BMI2 = 0x40
BMI1 = 0x20,
BMI2 = 0x40
};
#if defined(__arm__) || defined(__aarch64__) // incl. armel, armhf, arm64
#if defined(__NEON__)
#if defined(__NEON__)
static inline uint32_t detect_supported_architectures()
{
return SIMDExtensions::NEON;
// ARM build with NEON available (__NEON__ is defined): no runtime probing is
// done; the function simply reports the NEON flag of the instruction_set enum.
static inline uint32_t detect_supported_architectures() {
return instruction_set::NEON;
}
#else //ARM without NEON
#else // ARM without NEON
static inline uint32_t detect_supported_architectures()
{
return SIMDExtensions::DEFAULT;
// ARM build without NEON: no SIMD extension is reported, so the result is
// instruction_set::DEFAULT (0x0).
static inline uint32_t detect_supported_architectures() {
return instruction_set::DEFAULT;
}
#endif
#else // x86
static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
#endif
#else // x86
static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx,
uint32_t *edx) {
#if defined(_MSC_VER)
int cpuInfo[4];
__cpuid(cpuInfo, *eax);
*eax = cpuInfo[0];
*ebx = cpuInfo[1];
*ecx = cpuInfo[2];
*edx = cpuInfo[3];
int cpu_info[4];
__cpuid(cpu_info, *eax);
*eax = cpu_info[0];
*ebx = cpu_info[1];
*ecx = cpu_info[2];
*edx = cpu_info[3];
#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
uint32_t level = *eax;
__get_cpuid (level, eax, ebx, ecx, edx);
__get_cpuid(level, eax, ebx, ecx, edx);
#else
uint32_t a = *eax, b, c = *ecx, d;
asm volatile ( "cpuid\n\t"
: "+a"(a), "=b"(b), "+c"(c), "=d"(d) );
asm volatile("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d));
*eax = a;
*ebx = b;
*ecx = c;
@ -112,10 +111,9 @@ static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *
#endif
}
static inline uint32_t detect_supported_architectures()
{
static inline uint32_t detect_supported_architectures() {
uint32_t eax, ebx, ecx, edx;
uint32_t hostSimdExts = 0x0;
uint32_t host_isa = 0x0;
// EBX for EAX=0x7
eax = 0x7;
@ -123,15 +121,15 @@ static inline uint32_t detect_supported_architectures()
cpuid(&eax, &ebx, &ecx, &edx);
if (ebx & cpuid_avx2_bit) {
hostSimdExts |= SIMDExtensions::AVX2;
host_isa |= instruction_set::AVX2;
}
if (ebx & cpuid_bmi1_bit) {
hostSimdExts |= SIMDExtensions::BMI1;
host_isa |= instruction_set::BMI1;
}
if (ebx & cpuid_bmi2_bit) {
hostSimdExts |= SIMDExtensions::BMI2;
host_isa |= instruction_set::BMI2;
}
// ECX for EAX=0x1
@ -139,16 +137,16 @@ static inline uint32_t detect_supported_architectures()
cpuid(&eax, &ebx, &ecx, &edx);
if (ecx & cpuid_sse42_bit) {
hostSimdExts |= SIMDExtensions::SSE42;
host_isa |= instruction_set::SSE42;
}
if (ecx & cpuid_pclmulqdq_bit) {
hostSimdExts |= SIMDExtensions::PCLMULQDQ;
host_isa |= instruction_set::PCLMULQDQ;
}
return hostSimdExts;
return host_isa;
}
#endif // end SIMD extension detection code
}
} // namespace simdjson
#endif

View File

@ -35,7 +35,6 @@ really_inline uint32_t is_not_structural_or_whitespace_or_null(uint8_t c) {
return structural_or_whitespace_or_null_negated[c];
}
const uint32_t structural_or_whitespace_negated[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@ -76,7 +75,6 @@ really_inline uint32_t is_structural_or_whitespace_or_null(uint8_t c) {
return structural_or_whitespace_or_null[c];
}
const uint32_t structural_or_whitespace[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
@ -94,7 +92,7 @@ really_inline uint32_t is_structural_or_whitespace(uint8_t c) {
return structural_or_whitespace[c];
}
const uint32_t digittoval32[886] = {
const uint32_t digit_to_val32[886] = {
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
@ -103,7 +101,7 @@ const uint32_t digittoval32[886] = {
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
0x0, 0x1, 0x2, 0x3, 0x4, 0x5,
0x0, 0x1, 0x2, 0x3, 0x4, 0x5,
0x6, 0x7, 0x8, 0x9, 0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa,
0xb, 0xc, 0xd, 0xe, 0xf, 0xFFFFFFFF,
@ -138,7 +136,7 @@ const uint32_t digittoval32[886] = {
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
0x0, 0x10, 0x20, 0x30, 0x40, 0x50,
0x0, 0x10, 0x20, 0x30, 0x40, 0x50,
0x60, 0x70, 0x80, 0x90, 0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa0,
0xb0, 0xc0, 0xd0, 0xe0, 0xf0, 0xFFFFFFFF,
@ -173,7 +171,7 @@ const uint32_t digittoval32[886] = {
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
0x0, 0x100, 0x200, 0x300, 0x400, 0x500,
0x0, 0x100, 0x200, 0x300, 0x400, 0x500,
0x600, 0x700, 0x800, 0x900, 0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa00,
0xb00, 0xc00, 0xd00, 0xe00, 0xf00, 0xFFFFFFFF,
@ -208,7 +206,7 @@ const uint32_t digittoval32[886] = {
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
0x0, 0x1000, 0x2000, 0x3000, 0x4000, 0x5000,
0x0, 0x1000, 0x2000, 0x3000, 0x4000, 0x5000,
0x6000, 0x7000, 0x8000, 0x9000, 0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa000,
0xb000, 0xc000, 0xd000, 0xe000, 0xf000, 0xFFFFFFFF,
@ -244,15 +242,17 @@ const uint32_t digittoval32[886] = {
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF};
// returns a value with the high 16 bits set if not valid
// otherwise returns the conversion of the 4 hex digits at src into the bottom 16 bits of the 32-bit
// return register
// otherwise returns the conversion of the 4 hex digits at src into the bottom
// 16 bits of the 32-bit return register
//
// see https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/
static inline uint32_t hex_to_u32_nocheck(const uint8_t *src) {// strictly speaking, static inline is a C-ism
uint32_t v1 = digittoval32[630 + src[0]];
uint32_t v2 = digittoval32[420 + src[1]];
uint32_t v3 = digittoval32[210 + src[2]];
uint32_t v4 = digittoval32[0 + src[3]];
// see
// https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/
// Converts the four hexadecimal characters src[0..3] into a single value.
// Each character is looked up in its own section of digit_to_val32 (offsets
// 630, 420, 210 and 0 for the first through fourth character) and the four
// entries are OR-ed together. No validation happens here: the caller is
// expected to inspect the high bits of the result to detect bad input.
// (Strictly speaking, "static inline" is a C-ism.)
static inline uint32_t hex_to_u32_nocheck(const uint8_t *src) {
  return digit_to_val32[630 + src[0]] | digit_to_val32[420 + src[1]] |
         digit_to_val32[210 + src[2]] | digit_to_val32[0 + src[3]];
}
@ -272,19 +272,21 @@ inline size_t codepoint_to_utf8(uint32_t cp, uint8_t *c) {
if (cp <= 0x7F) {
c[0] = cp;
return 1; // ascii
} if (cp <= 0x7FF) {
}
if (cp <= 0x7FF) {
c[0] = (cp >> 6) + 192;
c[1] = (cp & 63) + 128;
return 2; // universal plane
// Surrogates are treated elsewhere...
//} //else if (0xd800 <= cp && cp <= 0xdfff) {
// return 0; // surrogates // could put assert here
// Surrogates are treated elsewhere...
//} //else if (0xd800 <= cp && cp <= 0xdfff) {
// return 0; // surrogates // could put assert here
} else if (cp <= 0xFFFF) {
c[0] = (cp >> 12) + 224;
c[1] = ((cp >> 6) & 63) + 128;
c[2] = (cp & 63) + 128;
return 3;
} else if (cp <= 0x10FFFF) { // if you know you have a valid code point, this is not needed
} else if (cp <= 0x10FFFF) { // if you know you have a valid code point, this
// is not needed
c[0] = (cp >> 18) + 240;
c[1] = ((cp >> 12) & 63) + 128;
c[2] = ((cp >> 6) & 63) + 128;
@ -294,6 +296,6 @@ inline size_t codepoint_to_utf8(uint32_t cp, uint8_t *c) {
// will return 0 when the code point was too large.
return 0; // bad r
}
}
} // namespace simdjson
#endif

View File

@ -10,40 +10,40 @@ namespace simdjson {
static inline void print_with_escapes(const unsigned char *src) {
while (*src) {
switch (*src) {
case '\b':
putchar('\\');
putchar('b');
break;
case '\f':
putchar('\\');
putchar('f');
break;
case '\n':
putchar('\\');
putchar('n');
break;
case '\r':
putchar('\\');
putchar('r');
break;
case '\"':
putchar('\\');
putchar('"');
break;
case '\t':
putchar('\\');
putchar('t');
break;
case '\\':
putchar('\\');
putchar('\\');
break;
default:
if (*src <= 0x1F) {
printf("\\u%04x", *src);
} else {
putchar(*src);
}
case '\b':
putchar('\\');
putchar('b');
break;
case '\f':
putchar('\\');
putchar('f');
break;
case '\n':
putchar('\\');
putchar('n');
break;
case '\r':
putchar('\\');
putchar('r');
break;
case '\"':
putchar('\\');
putchar('"');
break;
case '\t':
putchar('\\');
putchar('t');
break;
case '\\':
putchar('\\');
putchar('\\');
break;
default:
if (*src <= 0x1F) {
printf("\\u%04x", *src);
} else {
putchar(*src);
}
}
src++;
}
@ -54,43 +54,43 @@ static inline void print_with_escapes(const unsigned char *src,
std::ostream &os) {
while (*src) {
switch (*src) {
case '\b':
os << '\\';
os << 'b';
break;
case '\f':
os << '\\';
os << 'f';
break;
case '\n':
os << '\\';
os << 'n';
break;
case '\r':
os << '\\';
os << 'r';
break;
case '\"':
os << '\\';
os << '"';
break;
case '\t':
os << '\\';
os << 't';
break;
case '\\':
os << '\\';
os << '\\';
break;
default:
if (*src <= 0x1F) {
std::ios::fmtflags f(os.flags());
os << std::hex << std::setw(4) << std::setfill('0')
<< static_cast<int>(*src);
os.flags(f);
} else {
os << *src;
}
case '\b':
os << '\\';
os << 'b';
break;
case '\f':
os << '\\';
os << 'f';
break;
case '\n':
os << '\\';
os << 'n';
break;
case '\r':
os << '\\';
os << 'r';
break;
case '\"':
os << '\\';
os << '"';
break;
case '\t':
os << '\\';
os << 't';
break;
case '\\':
os << '\\';
os << '\\';
break;
default:
if (*src <= 0x1F) {
std::ios::fmtflags f(os.flags());
os << std::hex << std::setw(4) << std::setfill('0')
<< static_cast<int>(*src);
os.flags(f);
} else {
os << *src;
}
}
src++;
}
@ -101,40 +101,40 @@ static inline void print_with_escapes(const unsigned char *src, size_t len) {
const unsigned char *finalsrc = src + len;
while (src < finalsrc) {
switch (*src) {
case '\b':
putchar('\\');
putchar('b');
break;
case '\f':
putchar('\\');
putchar('f');
break;
case '\n':
putchar('\\');
putchar('n');
break;
case '\r':
putchar('\\');
putchar('r');
break;
case '\"':
putchar('\\');
putchar('"');
break;
case '\t':
putchar('\\');
putchar('t');
break;
case '\\':
putchar('\\');
putchar('\\');
break;
default:
if (*src <= 0x1F) {
printf("\\u%04x", *src);
} else {
putchar(*src);
}
case '\b':
putchar('\\');
putchar('b');
break;
case '\f':
putchar('\\');
putchar('f');
break;
case '\n':
putchar('\\');
putchar('n');
break;
case '\r':
putchar('\\');
putchar('r');
break;
case '\"':
putchar('\\');
putchar('"');
break;
case '\t':
putchar('\\');
putchar('t');
break;
case '\\':
putchar('\\');
putchar('\\');
break;
default:
if (*src <= 0x1F) {
printf("\\u%04x", *src);
} else {
putchar(*src);
}
}
src++;
}
@ -146,43 +146,43 @@ static inline void print_with_escapes(const unsigned char *src,
const unsigned char *finalsrc = src + len;
while (src < finalsrc) {
switch (*src) {
case '\b':
os << '\\';
os << 'b';
break;
case '\f':
os << '\\';
os << 'f';
break;
case '\n':
os << '\\';
os << 'n';
break;
case '\r':
os << '\\';
os << 'r';
break;
case '\"':
os << '\\';
os << '"';
break;
case '\t':
os << '\\';
os << 't';
break;
case '\\':
os << '\\';
os << '\\';
break;
default:
if (*src <= 0x1F) {
std::ios::fmtflags f(os.flags());
os << std::hex << std::setw(4) << std::setfill('0')
<< static_cast<int>(*src);
os.flags(f);
} else {
os << *src;
}
case '\b':
os << '\\';
os << 'b';
break;
case '\f':
os << '\\';
os << 'f';
break;
case '\n':
os << '\\';
os << 'n';
break;
case '\r':
os << '\\';
os << 'r';
break;
case '\"':
os << '\\';
os << '"';
break;
case '\t':
os << '\\';
os << 't';
break;
case '\\':
os << '\\';
os << '\\';
break;
default:
if (*src <= 0x1F) {
std::ios::fmtflags f(os.flags());
os << std::hex << std::setw(4) << std::setfill('0')
<< static_cast<int>(*src);
os.flags(f);
} else {
os << *src;
}
}
src++;
}
@ -196,7 +196,7 @@ static inline void print_with_escapes(const char *src, std::ostream &os,
size_t len) {
print_with_escapes(reinterpret_cast<const unsigned char *>(src), os, len);
}
}
} // namespace simdjson
#
#endif

View File

@ -8,10 +8,8 @@
#include <sstream>
#include <string>
#include "simdjson/padded_string.h"
namespace simdjson {
// load a file in memory...
@ -20,15 +18,15 @@ namespace simdjson {
// first element of the pair is a string (null terminated)
// whereas the second element is the length.
// caller is responsible to free (aligned_free((void*)result.data())))
//
//
// throws an exception if the file cannot be opened, use try/catch
// try {
// p = get_corpus(filename);
// } catch (const std::exception& e) {
// } catch (const std::exception& e) {
// aligned_free((void*)p.data());
// std::cout << "Could not load the file " << filename << std::endl;
// }
// Loads the content of the file named `filename` and returns it as a
// padded_string. Defined out of line.
padded_string get_corpus(const std::string &filename);
} // namespace simdjson
#endif

View File

@ -1,10 +1,10 @@
#ifndef SIMDJSON_JSONMINIFIER_H
#define SIMDJSON_JSONMINIFIER_H
#include "simdjson/padded_string.h"
#include <cstddef>
#include <cstdint>
#include <string_view>
#include "simdjson/padded_string.h"
namespace simdjson {
@ -12,20 +12,19 @@ namespace simdjson {
// out can be the same pointer. Result is null terminated,
// return the string length (minus the null termination).
// The accelerated version of this function only runs on AVX2 hardware.
size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out);
size_t json_minify(const uint8_t *buf, size_t len, uint8_t *out);
static inline size_t jsonminify(const char *buf, size_t len, char *out) {
return jsonminify(reinterpret_cast<const uint8_t *>(buf), len, reinterpret_cast<uint8_t *>(out));
static inline size_t json_minify(const char *buf, size_t len, char *out) {
return json_minify(reinterpret_cast<const uint8_t *>(buf), len,
reinterpret_cast<uint8_t *>(out));
}
static inline size_t jsonminify(const std::string_view & p, char *out) {
return jsonminify(p.data(), p.size(), out);
static inline size_t json_minify(const std::string_view &p, char *out) {
return json_minify(p.data(), p.size(), out);
}
static inline size_t jsonminify(const padded_string & p, char *out) {
return jsonminify(p.data(), p.size(), out);
}
static inline size_t json_minify(const padded_string &p, char *out) {
return json_minify(p.data(), p.size(), out);
}
} // namespace simdjson
#endif

View File

@ -1,136 +1,161 @@
#ifndef SIMDJSON_JSONPARSER_H
#define SIMDJSON_JSONPARSER_H
#include <string>
#include "simdjson/common_defs.h"
#include "simdjson/padded_string.h"
#include "simdjson/jsonioutil.h"
#include "simdjson/padded_string.h"
#include "simdjson/parsedjson.h"
#include "simdjson/simdjson.h"
#include "simdjson/stage1_find_marks.h"
#include "simdjson/stage2_build_tape.h"
#include "simdjson/simdjson.h"
#include <string>
#ifdef _MSC_VER
#include <windows.h>
#include <sysinfoapi.h>
#include <sysinfoapi.h> // must be included after windows.h
#else
#include <unistd.h>
#endif
namespace simdjson {
// The function that users are expected to call is json_parse.
// We have more than one such function because we want to support several
// We have more than one such function because we want to support several
// instruction sets.
// Function type (signature) shared by every json_parse implementation;
// json_parse_ptr below is a pointer to a function of this type.
using json_parse_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj,
                                bool realloc_if_needed);

// Pointer that holds the json_parse implementation corresponding to the
// available SIMD instruction set
extern json_parse_functype *json_parse_ptr;
// json_parse_implementation is the generic function, it is specialized for various
// architectures, e.g., as json_parse_implementation<architecture::haswell>
// or json_parse_implementation<architecture::arm64>
template<architecture T>
int json_parse_implementation(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded = true) {
if (pj.bytecapacity < len) {
// json_parse_implementation is the generic function, it is specialized for
// various architectures, e.g., as
// json_parse_implementation<Architecture::HASWELL> or
// json_parse_implementation<Architecture::ARM64>
template <Architecture T>
int json_parse_implementation(const uint8_t *buf, size_t len, ParsedJson &pj,
bool realloc_if_needed = true) {
if (pj.byte_capacity < len) {
return simdjson::CAPACITY;
}
bool reallocated = false;
if(reallocifneeded) {
if (realloc_if_needed) {
#if ALLOW_SAME_PAGE_BUFFER_OVERRUN
// realloc is needed if the end of the memory crosses a page
#ifdef _MSC_VER
SYSTEM_INFO sysInfo;
GetSystemInfo(&sysInfo);
long pagesize = sysInfo.dwPageSize;
SYSTEM_INFO sysInfo;
GetSystemInfo(&sysInfo);
long page_size = sysInfo.dwPageSize;
#else
long pagesize = sysconf (_SC_PAGESIZE);
long page_size = sysconf(_SC_PAGESIZE);
#endif
//////////////
// We want to check that buf + len - 1 and buf + len - 1 + SIMDJSON_PADDING
// are in the same page.
// That is, we want to check that
// (buf + len - 1) / pagesize == (buf + len - 1 + SIMDJSON_PADDING) / pagesize
// That's true if (buf + len - 1) % pagesize + SIMDJSON_PADDING < pagesize.
// That is, we want to check that
// (buf + len - 1) / page_size == (buf + len - 1 + SIMDJSON_PADDING) /
// page_size That's true if (buf + len - 1) % page_size + SIMDJSON_PADDING <
// page_size.
///////////
if ( (reinterpret_cast<uintptr_t>(buf + len - 1) % pagesize ) + SIMDJSON_PADDING < static_cast<uintptr_t>(pagesize) ) {
if ((reinterpret_cast<uintptr_t>(buf + len - 1) % page_size) +
SIMDJSON_PADDING <
static_cast<uintptr_t>(page_size)) {
#else // SIMDJSON_SAFE_SAME_PAGE_READ_OVERRUN
if(true) { // if not SIMDJSON_SAFE_SAME_PAGE_READ_OVERRUN, we always reallocate
if (true) { // if not SIMDJSON_SAFE_SAME_PAGE_READ_OVERRUN, we always
// reallocate
#endif
const uint8_t *tmpbuf = buf;
buf = (uint8_t *) allocate_padded_buffer(len);
if(buf == NULL) return simdjson::MEMALLOC;
memcpy((void*)buf,tmpbuf,len);
const uint8_t *tmp_buf = buf;
buf = (uint8_t *)allocate_padded_buffer(len);
if (buf == NULL)
return simdjson::MEMALLOC;
memcpy((void *)buf, tmp_buf, len);
reallocated = true;
} // if (true) OR if ( (reinterpret_cast<uintptr_t>(buf + len - 1) % pagesize ) + SIMDJSON_PADDING < static_cast<uintptr_t>(pagesize) ) {
} // if(reallocifneeded) {
} // if (true) OR if ( (reinterpret_cast<uintptr_t>(buf + len - 1) %
// page_size ) + SIMDJSON_PADDING < static_cast<uintptr_t>(page_size) ) {
} // if(realloc_if_needed) {
int stage1_is_ok = simdjson::find_structural_bits<T>(buf, len, pj);
if(stage1_is_ok != simdjson::SUCCESS) {
pj.errorcode = stage1_is_ok;
return pj.errorcode;
}
if (stage1_is_ok != simdjson::SUCCESS) {
pj.error_code = stage1_is_ok;
return pj.error_code;
}
int res = unified_machine<T>(buf, len, pj);
if(reallocated) { aligned_free((void*)buf);}
if (reallocated) {
aligned_free((void *)buf);
}
return res;
}
// Parse a document found in buf.
//
// The content should be a valid JSON document encoded as UTF-8. If there is a
// UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are
// discouraged.
//
// You need to preallocate ParsedJson with a capacity of len (e.g.,
// pj.allocate_capacity(len)).
//
// The function returns simdjson::SUCCESS (an integer = 0) in case of a success
// or an error code from simdjson/simdjson.h in case of failure such as
// simdjson::CAPACITY, simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth
// (the simdjson::error_message function converts these error codes into a
// string).
//
// You can also check validity by calling pj.is_valid(). The same ParsedJson can
// be reused for other documents.
//
// If realloc_if_needed is true (default) then a temporary buffer is created
// when needed during processing (a copy of the input string is made). If
// realloc_if_needed is false, the input buf should be readable up to
// buf + len + SIMDJSON_PADDING; all bytes at and after buf + len are ignored
// (can be garbage).
//
// The call is dispatched through json_parse_ptr.
inline int json_parse(const uint8_t *buf, size_t len, ParsedJson &pj,
                      bool realloc_if_needed = true) {
  return json_parse_ptr(buf, len, pj, realloc_if_needed);
}
// Parse a document found in buf (char overload; buf is reinterpreted as
// const uint8_t * and dispatched through json_parse_ptr).
//
// The content should be a valid JSON document encoded as UTF-8. If there is a
// UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are
// discouraged.
//
// You need to preallocate ParsedJson with a capacity of len (e.g.,
// pj.allocate_capacity(len)).
//
// The function returns simdjson::SUCCESS (an integer = 0) in case of a success
// or an error code from simdjson/simdjson.h in case of failure such as
// simdjson::CAPACITY, simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth
// (the simdjson::error_message function converts these error codes into a
// string).
//
// You can also check validity by calling pj.is_valid(). The same ParsedJson can
// be reused for other documents.
//
// If realloc_if_needed is true (default) then a temporary buffer is created
// when needed during processing (a copy of the input string is made). If
// realloc_if_needed is false, the input buf should be readable up to
// buf + len + SIMDJSON_PADDING; all bytes at and after buf + len are ignored
// (can be garbage).
inline int json_parse(const char *buf, size_t len, ParsedJson &pj,
                      bool realloc_if_needed = true) {
  return json_parse_ptr(reinterpret_cast<const uint8_t *>(buf), len, pj,
                        realloc_if_needed);
}
// We do not want to allow implicit conversion from C string to std::string:
// calling json_parse with a bare C string and no length is a compile error.
int json_parse(const char *buf, ParsedJson &pj) = delete;
// Parse a document found in string s.
// You need to preallocate ParsedJson with a capacity of len (e.g.,
// pj.allocate_capacity(len)).
//
// The function returns simdjson::SUCCESS (an integer = 0) in case of a success
// or an error code from simdjson/simdjson.h in case of failure such as
// simdjson::CAPACITY, simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth
// (the simdjson::error_message function converts these error codes into a
// string).
//
// A temporary buffer is created when needed during processing
// (a copy of the input string is made).
@ -139,72 +164,82 @@ inline int json_parse(const std::string &s, ParsedJson &pj) {
}
// Parse a document found in string s.
//
// The content should be a valid JSON document encoded as UTF-8. If there is a
// UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are
// discouraged.
//
// You need to preallocate ParsedJson with a capacity of len (e.g.,
// pj.allocate_capacity(len)).
//
// The function returns simdjson::SUCCESS (an integer = 0) in case of a success
// or an error code from simdjson/simdjson.h in case of failure such as
// simdjson::CAPACITY, simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth
// (the simdjson::error_message function converts these error codes into a
// string).
//
// You can also check validity by calling pj.is_valid(). The same ParsedJson can
// be reused for other documents.
//
// realloc_if_needed is passed as false, so no temporary copy of the input is
// requested (presumably because padded_string already carries the required
// padding; confirm against padded_string).
inline int json_parse(const padded_string &s, ParsedJson &pj) {
  return json_parse(s.data(), s.length(), pj, false);
}
// Build a ParsedJson object. You can check validity
// by calling pj.is_valid(). This does the memory allocation needed for
// ParsedJson. If realloc_if_needed is true (default) then a temporary buffer is
// created when needed during processing (a copy of the input string is made).
//
// If realloc_if_needed is false, the input buf should be readable up to
// buf + len + SIMDJSON_PADDING; all bytes at and after buf + len are ignored
// (can be garbage).
//
// The content should be a valid JSON document encoded as UTF-8. If there is a
// UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are
// discouraged.
//
// This is a convenience function which calls json_parse.
WARN_UNUSED
ParsedJson build_parsed_json(const uint8_t *buf, size_t len,
                             bool realloc_if_needed = true);
// Build a ParsedJson object (char overload; buf is reinterpreted as
// const uint8_t * and forwarded to the uint8_t overload). You can check
// validity by calling pj.is_valid(). This does the memory allocation needed for
// ParsedJson. If realloc_if_needed is true (default) then a temporary buffer is
// created when needed during processing (a copy of the input string is made).
//
// If realloc_if_needed is false, the input buf should be readable up to
// buf + len + SIMDJSON_PADDING; all bytes at and after buf + len are ignored
// (can be garbage).
//
// The content should be a valid JSON document encoded as UTF-8. If there is a
// UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are
// discouraged.
//
// This is a convenience function which calls json_parse.
WARN_UNUSED
inline ParsedJson build_parsed_json(const char *buf, size_t len,
                                    bool realloc_if_needed = true) {
  return build_parsed_json(reinterpret_cast<const uint8_t *>(buf), len,
                           realloc_if_needed);
}
// We do not want to allow implicit conversion from C string to std::string.
ParsedJson build_parsed_json(const char *buf) = delete;
// Parse a document found in string s.
// You need to preallocate ParsedJson with a capacity of len (e.g.,
// pj.allocate_capacity(len)). Return SUCCESS (an integer = 0) in case of a
// success. You can also check validity by calling pj.is_valid(). The same
// ParsedJson can be reused for other documents.
// NOTE(review): this overload returns a ParsedJson by value, so the
// preallocation / return-code sentences above look copied from json_parse;
// confirm and reword.
//
// A temporary buffer is created when needed during processing
// (a copy of the input string is made).
//
// The content should be a valid JSON document encoded as UTF-8. If there is a
// UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are
// discouraged.
//
// This is a convenience function which calls json_parse.
WARN_UNUSED
@ -212,19 +247,20 @@ inline ParsedJson build_parsed_json(const std::string &s) {
return build_parsed_json(s.data(), s.length(), true);
}
// Build a ParsedJson object from the document found in the padded_string s and
// return it by value. You can check validity by calling is_valid() on the
// returned object.
//
// realloc_if_needed is passed as false to the (buf, len, realloc_if_needed)
// overload, so no temporary copy of the input is requested (presumably because
// padded_string already carries the required padding; confirm against
// padded_string).
//
// The content should be a valid JSON document encoded as UTF-8. If there is a
// UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are
// discouraged.
//
// This is a convenience function which calls json_parse.
WARN_UNUSED
inline ParsedJson build_parsed_json(const padded_string &s) {
  return build_parsed_json(s.data(), s.length(), false);
}
}
} // namespace simdjson
#endif

View File

@ -7,16 +7,17 @@
#include "simdjson/portability.h"
#ifdef JSON_TEST_NUMBERS // for unit testing
// Unit-testing hooks, only declared when JSON_TEST_NUMBERS is defined. The
// number parser calls them with the start of the number text (buf) and, for
// the two success hooks, the value it produced.
void found_invalid_number(const uint8_t *buf);
void found_integer(int64_t result, const uint8_t *buf);
void found_float(double result, const uint8_t *buf);
#endif
namespace simdjson {
// Allowable floating-point values range from
// std::numeric_limits<double>::lowest() to std::numeric_limits<double>::max(),
// so from -1.7976e308 all the way to 1.7976e308 in binary64. The lowest
// non-zero normal value is std::numeric_limits<double>::min() or
// about 2.225074e-308.
static const double power_of_ten[] = {
1e-308, 1e-307, 1e-306, 1e-305, 1e-304, 1e-303, 1e-302, 1e-301, 1e-300,
1e-299, 1e-298, 1e-297, 1e-296, 1e-295, 1e-294, 1e-293, 1e-292, 1e-291,
@ -113,7 +114,7 @@ really_inline bool
is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) {
return structural_or_whitespace_or_exponent_or_decimal_negated[c];
}
}// simdjson
} // namespace simdjson
#ifndef SIMDJSON_DISABLE_SWAR_NUMBER_PARSING
#define SWAR_NUMBER_PARSING
#endif
@ -126,7 +127,7 @@ namespace simdjson {
// http://0x80.pl/articles/swar-digits-validate.html
static inline bool is_made_of_eight_digits_fast(const char *chars) {
uint64_t val;
// this can read up to 7 bytes beyond the buffer size, but we require
// this can read up to 7 bytes beyond the buffer size, but we require
// SIMDJSON_PADDING of padding
static_assert(7 <= SIMDJSON_PADDING);
memcpy(&val, chars, 8);
@ -138,7 +139,7 @@ static inline bool is_made_of_eight_digits_fast(const char *chars) {
(((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) ==
0x3333333333333333);
}
}
} // namespace simdjson
#ifdef IS_X86_64
TARGET_WESTMERE
namespace simdjson {
@ -150,7 +151,8 @@ static inline uint32_t parse_eight_digits_unrolled(const char *chars) {
const __m128i mul_1_100 = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1);
const __m128i mul_1_10000 =
_mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1);
const __m128i input = _mm_sub_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i *>(chars)), ascii0);
const __m128i input = _mm_sub_epi8(
_mm_loadu_si128(reinterpret_cast<const __m128i *>(chars)), ascii0);
const __m128i t1 = _mm_maddubs_epi16(input, mul_1_10);
const __m128i t2 = _mm_madd_epi16(t1, mul_1_100);
const __m128i t3 = _mm_packus_epi32(t2, t2);
@ -158,7 +160,7 @@ static inline uint32_t parse_eight_digits_unrolled(const char *chars) {
return _mm_cvtsi128_si32(
t4); // only captures the sum of the first 8 digits, drop the rest
}
}
} // namespace simdjson
UNTARGET_REGION
#endif
@ -167,15 +169,14 @@ namespace simdjson {
// we don't have SSE, so let us use a scalar function
// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
//
// Converts the eight ASCII digits chars[0..7] into their decimal value
// (chars[0] is the most significant digit). The input is not validated; the
// caller must already know that all eight bytes are digits.
//
// The eight bytes are loaded into one 64-bit word and pairs of adjacent
// lanes are merged three times (x10, x100, x10000) with a multiply + shift.
// This relies on chars[0] landing in the least-significant byte of the word,
// i.e. on a little-endian host.
static inline uint32_t parse_eight_digits_unrolled(const char *chars) {
  uint64_t val;
  memcpy(&val, chars, sizeof(uint64_t));
  // 2561 = 10 * 2^8 + 1: merges byte pairs into two-digit values
  val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8;
  // 6553601 = 100 * 2^16 + 1: merges 16-bit pairs into four-digit values
  val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16;
  // 42949672960001 = 10000 * 2^32 + 1: merges the two four-digit halves
  return static_cast<uint32_t>((val & 0x0000FFFF0000FFFF) * 42949672960001 >>
                               32);
}
#endif
#endif
//
@ -183,10 +184,9 @@ static inline uint32_t parse_eight_digits_unrolled(const char *chars) {
// It is only even going to be used when negative_exponent is tiny.
// Returns base * 10^negative_exponent, computed as
// base * 1e-308 * 10^(negative_exponent + 308).
// The scaling is split in two factors, presumably so that each one stays
// representable when negative_exponent is below -308 (TODO confirm intent).
static double subnormal_power10(double base, int negative_exponent) {
  // this is probably not going to be fast
  return base * 1e-308 * pow(10, negative_exponent + 308);
}
// called by parse_number when we know that the output is a float,
// but where there might be some integer overflow. The trick here is to
// parse using floats from the start.
@ -197,10 +197,8 @@ static double subnormal_power10(double base, int negative_exponent) {
//
// Note: a redesign could avoid this function entirely.
//
static never_inline bool
parse_float(const uint8_t *const buf,
ParsedJson &pj, const uint32_t offset,
bool found_minus) {
static never_inline bool parse_float(const uint8_t *const buf, ParsedJson &pj,
const uint32_t offset, bool found_minus) {
const char *p = reinterpret_cast<const char *>(buf + offset);
bool negative = false;
if (found_minus) {
@ -223,100 +221,102 @@ parse_float(const uint8_t *const buf,
}
if ('.' == *p) {
++p;
int fractionalweight = 308;
if(is_integer(*p)) {
int fractional_weight = 308;
if (is_integer(*p)) {
unsigned char digit = *p - '0';
++p;
fractionalweight --;
i = i + digit * (fractionalweight >= 0 ? power_of_ten[fractionalweight] : 0);
fractional_weight--;
i = i + digit * (fractional_weight >= 0 ? power_of_ten[fractional_weight]
: 0);
} else {
#ifdef JSON_TEST_NUMBERS // for unit testing
foundInvalidNumber(buf + offset);
found_invalid_number(buf + offset);
#endif
return false;
}
while (is_integer(*p)) {
unsigned char digit = *p - '0';
++p;
fractionalweight --;
i = i + digit * (fractionalweight >= 0 ? power_of_ten[fractionalweight] : 0);
fractional_weight--;
i = i + digit * (fractional_weight >= 0 ? power_of_ten[fractional_weight]
: 0);
}
}
if (('e' == *p) || ('E' == *p)) {
++p;
bool negexp = false;
bool neg_exp = false;
if ('-' == *p) {
negexp = true;
neg_exp = true;
++p;
} else if ('+' == *p) {
++p;
}
if (!is_integer(*p)) {
#ifdef JSON_TEST_NUMBERS // for unit testing
foundInvalidNumber(buf + offset);
found_invalid_number(buf + offset);
#endif
return false;
}
unsigned char digit = *p - '0';
int64_t expnumber = digit; // exponential part
int64_t exp_number = digit; // exponential part
p++;
if (is_integer(*p)) {
digit = *p - '0';
expnumber = 10 * expnumber + digit;
exp_number = 10 * exp_number + digit;
++p;
}
if (is_integer(*p)) {
digit = *p - '0';
expnumber = 10 * expnumber + digit;
exp_number = 10 * exp_number + digit;
++p;
}
if (is_integer(*p)) {
digit = *p - '0';
expnumber = 10 * expnumber + digit;
exp_number = 10 * exp_number + digit;
++p;
}
while (is_integer(*p)) {
if(expnumber > 0x100000000) {// we need to check for overflows
if (exp_number > 0x100000000) { // we need to check for overflows
// we refuse to parse this
#ifdef JSON_TEST_NUMBERS // for unit testing
foundInvalidNumber(buf + offset);
found_invalid_number(buf + offset);
#endif
return false;
}
digit = *p - '0';
expnumber = 10 * expnumber + digit;
++p;
exp_number = 10 * exp_number + digit;
++p;
}
if (unlikely(expnumber > 308)) {
if (unlikely(exp_number > 308)) {
// this path is unlikely
if(negexp) {
// We either have zero or a subnormal.
if (neg_exp) {
// We either have zero or a subnormal.
// We expect this to be uncommon so we go through a slow path.
i = subnormal_power10(i, - expnumber);
i = subnormal_power10(i, -exp_number);
} else {
// We know for sure that we have a number that is too large,
// we refuse to parse this
#ifdef JSON_TEST_NUMBERS // for unit testing
foundInvalidNumber(buf + offset);
found_invalid_number(buf + offset);
#endif
return false;
}
} else {
int exponent = (negexp ? -expnumber : expnumber);
// we have that expnumber is [0,308] so that
// exponent is [-308,308] so that
int exponent = (neg_exp ? -exp_number : exp_number);
// we have that exp_number is [0,308] so that
// exponent is [-308,308] so that
// 308 + exponent is in [0, 2 * 308]
i *= power_of_ten[308 + exponent];
}
}
}
if(is_not_structural_or_whitespace(*p)) {
if (is_not_structural_or_whitespace(*p)) {
return false;
}
double d = negative ? -i : i;
pj.write_tape_double(d);
#ifdef JSON_TEST_NUMBERS // for unit testing
foundFloat(d, buf + offset);
found_float(d, buf + offset);
#endif
return is_structural_or_whitespace(*p);
}
@ -354,13 +354,13 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
digit = *p - '0';
if (mul_overflow(i, 10, &i)) {
#ifdef JSON_TEST_NUMBERS // for unit testing
foundInvalidNumber(buf + offset);
found_invalid_number(buf + offset);
#endif
return false; // overflow
}
if (add_overflow(i, digit, &i)) {
#ifdef JSON_TEST_NUMBERS // for unit testing
foundInvalidNumber(buf + offset);
found_invalid_number(buf + offset);
#endif
return false; // overflow
}
@ -371,7 +371,7 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
if (i > 0x8000000000000000) {
// overflows!
#ifdef JSON_TEST_NUMBERS // for unit testing
foundInvalidNumber(buf + offset);
found_invalid_number(buf + offset);
#endif
return false; // overflow
}
@ -379,15 +379,16 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
if (i >= 0x8000000000000000) {
// overflows!
#ifdef JSON_TEST_NUMBERS // for unit testing
foundInvalidNumber(buf + offset);
found_invalid_number(buf + offset);
#endif
return false; // overflow
}
}
int64_t signed_answer = negative ? -static_cast<int64_t>(i) : static_cast<int64_t>(i);
int64_t signed_answer =
negative ? -static_cast<int64_t>(i) : static_cast<int64_t>(i);
pj.write_tape_s64(signed_answer);
#ifdef JSON_TEST_NUMBERS // for unit testing
foundInteger(signed_answer, buf + offset);
found_integer(signed_answer, buf + offset);
#endif
return is_structural_or_whitespace(*p);
}
@ -396,18 +397,18 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
// define JSON_TEST_NUMBERS for unit testing
//
// It is assumed that the number is followed by a structural ({,},],[) character
// or a white space character. If that is not the case (e.g., when the JSON document
// is made of a single number), then it is necessary to copy the content and append
// a space before calling this function.
// or a white space character. If that is not the case (e.g., when the JSON
// document is made of a single number), then it is necessary to copy the
// content and append a space before calling this function.
//
// Our objective is accurate parsing (ULP of 0 or 1) at high speed.
static really_inline bool parse_number(const uint8_t *const buf,
ParsedJson &pj,
static really_inline bool parse_number(const uint8_t *const buf, ParsedJson &pj,
const uint32_t offset,
bool found_minus) {
#ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes useful to skip parsing
pj.write_tape_s64(0); // always write zero
return true; // always succeeds
#ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes
// useful to skip parsing
pj.write_tape_s64(0); // always write zero
return true; // always succeeds
#else
const char *p = reinterpret_cast<const char *>(buf + offset);
bool negative = false;
@ -415,28 +416,28 @@ static really_inline bool parse_number(const uint8_t *const buf,
++p;
negative = true;
if (!is_integer(*p)) { // a negative sign must be followed by an integer
#ifdef JSON_TEST_NUMBERS // for unit testing
foundInvalidNumber(buf + offset);
#ifdef JSON_TEST_NUMBERS // for unit testing
found_invalid_number(buf + offset);
#endif
return false;
}
}
const char *const startdigits = p;
const char *const start_digits = p;
uint64_t i; // an unsigned int avoids signed overflows (which are bad)
uint64_t i; // an unsigned int avoids signed overflows (which are bad)
if (*p == '0') { // 0 cannot be followed by an integer
++p;
if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) {
#ifdef JSON_TEST_NUMBERS // for unit testing
foundInvalidNumber(buf + offset);
found_invalid_number(buf + offset);
#endif
return false;
}
i = 0;
} else {
if (!(is_integer(*p))) { // must start with an integer
#ifdef JSON_TEST_NUMBERS // for unit testing
foundInvalidNumber(buf + offset);
#ifdef JSON_TEST_NUMBERS // for unit testing
found_invalid_number(buf + offset);
#endif
return false;
}
@ -447,7 +448,8 @@ static really_inline bool parse_number(const uint8_t *const buf,
// we rarely see large integer parts like 123456789
while (is_integer(*p)) {
digit = *p - '0';
// a multiplication by 10 is cheaper than an arbitrary integer multiplication
// a multiplication by 10 is cheaper than an arbitrary integer
// multiplication
i = 10 * i + digit; // might overflow, we will handle the overflow later
++p;
}
@ -461,17 +463,18 @@ static really_inline bool parse_number(const uint8_t *const buf,
// z that fits in 53 bits, then we will be able to convert back
// the integer into a float in a lossless manner.
++p;
const char *const firstafterperiod = p;
if(is_integer(*p)) {
const char *const first_after_period = p;
if (is_integer(*p)) {
unsigned char digit = *p - '0';
++p;
i = i * 10 + digit; // might overflow + multiplication by 10 is likely cheaper than arbitrary mult.
i = i * 10 + digit; // might overflow + multiplication by 10 is likely
// cheaper than arbitrary mult.
// we will handle the overflow later
} else {
#ifdef JSON_TEST_NUMBERS // for unit testing
foundInvalidNumber(buf + offset);
found_invalid_number(buf + offset);
#endif
return false;
return false;
}
#ifdef SWAR_NUMBER_PARSING
// this helps if we have lots of decimals!
@ -484,102 +487,100 @@ static really_inline bool parse_number(const uint8_t *const buf,
while (is_integer(*p)) {
unsigned char digit = *p - '0';
++p;
i = i * 10 + digit; // in rare cases, this will overflow, but that's ok because we have parse_highprecision_float later.
i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
// because we have parse_highprecision_float later.
}
exponent = firstafterperiod - p;
exponent = first_after_period - p;
}
int digitcount = p - startdigits - 1; // used later to guard against overflows
int64_t expnumber = 0; // exponential part
int digit_count =
p - start_digits - 1; // used later to guard against overflows
int64_t exp_number = 0; // exponential part
if (('e' == *p) || ('E' == *p)) {
is_float = true;
++p;
bool negexp = false;
bool neg_exp = false;
if ('-' == *p) {
negexp = true;
neg_exp = true;
++p;
} else if ('+' == *p) {
++p;
}
if (!is_integer(*p)) {
#ifdef JSON_TEST_NUMBERS // for unit testing
foundInvalidNumber(buf + offset);
found_invalid_number(buf + offset);
#endif
return false;
}
unsigned char digit = *p - '0';
expnumber = digit;
exp_number = digit;
p++;
if (is_integer(*p)) {
digit = *p - '0';
expnumber = 10 * expnumber + digit;
exp_number = 10 * exp_number + digit;
++p;
}
if (is_integer(*p)) {
digit = *p - '0';
expnumber = 10 * expnumber + digit;
exp_number = 10 * exp_number + digit;
++p;
}
while (is_integer(*p)) {
if(expnumber > 0x100000000) {// we need to check for overflows
// we refuse to parse this
if (exp_number > 0x100000000) { // we need to check for overflows
// we refuse to parse this
#ifdef JSON_TEST_NUMBERS // for unit testing
foundInvalidNumber(buf + offset);
found_invalid_number(buf + offset);
#endif
return false;
}
digit = *p - '0';
expnumber = 10 * expnumber + digit;
++p;
exp_number = 10 * exp_number + digit;
++p;
}
exponent += (negexp ? -expnumber : expnumber);
exponent += (neg_exp ? -exp_number : exp_number);
}
if (is_float) {
uint64_t powerindex = 308 + exponent;
if (unlikely((digitcount >= 19))) { // this is uncommon
// It is possible that the integer had an overflow.
uint64_t power_index = 308 + exponent;
if (unlikely((digit_count >= 19))) { // this is uncommon
// It is possible that the integer had an overflow.
// We have to handle the case where we have 0.0000somenumber.
const char * start = startdigits;
while((*start == '0') || (*start == '.')) {
start++;
const char *start = start_digits;
while ((*start == '0') || (*start == '.')) {
start++;
}
digitcount -= (start - startdigits);
if(digitcount >= 19) {
digit_count -= (start - start_digits);
if (digit_count >= 19) {
// Ok, chances are good that we had an overflow!
// this is almost never going to get called!!!
// we start anew, going slowly!!!
return parse_float(buf, pj, offset,
found_minus);
}
return parse_float(buf, pj, offset, found_minus);
}
}
if (unlikely((powerindex > 2 * 308))) { // this is uncommon!!!
if (unlikely((power_index > 2 * 308))) { // this is uncommon!!!
// this is almost never going to get called!!!
// we start anew, going slowly!!!
return parse_float(buf, pj, offset,
found_minus);
return parse_float(buf, pj, offset, found_minus);
}
double factor = power_of_ten[powerindex];
double factor = power_of_ten[power_index];
factor = negative ? -factor : factor;
double d = i * factor;
pj.write_tape_double(d);
#ifdef JSON_TEST_NUMBERS // for unit testing
foundFloat(d, buf + offset);
found_float(d, buf + offset);
#endif
} else {
if (unlikely(digitcount >= 18)) { // this is uncommon!!!
if (unlikely(digit_count >= 18)) { // this is uncommon!!!
// there is a good chance that we had an overflow, so we
// need to recover: we parse the whole thing again.
return parse_large_integer(buf, pj, offset,
found_minus);
return parse_large_integer(buf, pj, offset, found_minus);
}
i = negative ? 0-i : i;
i = negative ? 0 - i : i;
pj.write_tape_s64(i);
#ifdef JSON_TEST_NUMBERS // for unit testing
foundInteger(i, buf + offset);
found_integer(i, buf + offset);
#endif
}
return is_structural_or_whitespace(*p);
return is_structural_or_whitespace(*p);
#endif // SIMDJSON_SKIPNUMBERPARSING
}
}//simdjson
} // simdjson
#endif

View File

@ -1,8 +1,8 @@
#ifndef SIMDJSON_PADDING_STRING_H
#define SIMDJSON_PADDING_STRING_H
#include "simdjson/portability.h"
#include <memory>
#include <cstring>
#include <memory>
namespace simdjson {
// low-level function to allocate memory with padding so we can read past the
@ -65,6 +65,6 @@ private:
size_t viable_size;
char *data_ptr;
};
}
} // namespace simdjson
#endif

View File

@ -1,48 +1,49 @@
#ifndef SIMDJSON_PARSEDJSON_H
#define SIMDJSON_PARSEDJSON_H
#include "simdjson/common_defs.h"
#include "simdjson/jsonformatutils.h"
#include "simdjson/portability.h"
#include "simdjson/simdjson.h"
#include <cinttypes>
#include <cmath>
#include <cstring>
#include <iomanip>
#include <iostream>
#include "simdjson/simdjson.h"
#include "simdjson/common_defs.h"
#include "simdjson/jsonformatutils.h"
#include "simdjson/portability.h"
#define JSONVALUEMASK 0xFFFFFFFFFFFFFF
#define JSON_VALUE_MASK 0xFFFFFFFFFFFFFF
#define DEFAULTMAXDEPTH 1024// a JSON document with a depth exceeding 1024 is probably de facto invalid
#define DEFAULT_MAX_DEPTH \
1024 // a JSON document with a depth exceeding 1024 is probably de facto
// invalid
namespace simdjson {
/************
* The JSON is parsed to a tape, see the accompanying tape.md file
* for documentation.
***********/
struct ParsedJson {
class ParsedJson {
public:
// create a ParsedJson container with zero capacity, call allocateCapacity to
// create a ParsedJson container with zero capacity, call allocate_capacity to
// allocate memory
ParsedJson();
~ParsedJson();
ParsedJson(ParsedJson && p);
ParsedJson(ParsedJson &&p);
// if needed, allocate memory so that the object is able to process JSON
// documents having up to len bytes and maxdepth "depth"
// documents having up to len bytes and max_depth "depth"
WARN_UNUSED
bool allocateCapacity(size_t len, size_t maxdepth = DEFAULTMAXDEPTH);
bool allocate_capacity(size_t len, size_t max_depth = DEFAULT_MAX_DEPTH);
// returns true if the document parsed was valid
bool isValid() const;
bool is_valid() const;
// return an error code corresponding to the last parsing attempt, see simdjson.h
// will return simdjson::UNITIALIZED if no parsing was attempted
int getErrorCode() const;
// return an error code corresponding to the last parsing attempt (see
// simdjson.h); returns simdjson::UNITIALIZED if no parsing was attempted
int get_error_code() const;
// return the string equivalent of "getErrorCode"
std::string getErrorMsg() const;
// return the string equivalent of "get_error_code"
std::string get_error_message() const;
// deallocate memory and set capacity to zero, called automatically by the
// destructor
@ -55,11 +56,10 @@ public:
// return false if the tape is likely wrong (e.g., you did not parse a valid
// JSON).
WARN_UNUSED
bool printjson(std::ostream &os);
bool print_json(std::ostream &os);
WARN_UNUSED
bool dump_raw_tape(std::ostream &os);
// all nodes are stored on the tape using a 64-bit word.
//
// strings, double and ints are stored as
@ -76,43 +76,42 @@ public:
// this should be considered a private function
really_inline void write_tape(uint64_t val, uint8_t c) {
tape[current_loc++] = val | ((static_cast<uint64_t>(c)) << 56);
tape[current_loc++] = val | ((static_cast<uint64_t>(c)) << 56);
}
really_inline void write_tape_s64(int64_t i) {
write_tape(0, 'l');
tape[current_loc++] = *(reinterpret_cast<uint64_t *>(&i));
write_tape(0, 'l');
tape[current_loc++] = *(reinterpret_cast<uint64_t *>(&i));
}
really_inline void write_tape_double(double d) {
write_tape(0, 'd');
static_assert(sizeof(d) == sizeof(tape[current_loc]), "mismatch size");
memcpy(& tape[current_loc++], &d, sizeof(double));
//tape[current_loc++] = *((uint64_t *)&d);
memcpy(&tape[current_loc++], &d, sizeof(double));
// tape[current_loc++] = *((uint64_t *)&d);
}
really_inline uint32_t get_current_loc() { return current_loc; }
really_inline void annotate_previousloc(uint32_t saved_loc, uint64_t val) {
tape[saved_loc] |= val;
really_inline void annotate_previous_loc(uint32_t saved_loc, uint64_t val) {
tape[saved_loc] |= val;
}
struct InvalidJSON : public std::exception {
const char * what () const throw () {
return "JSON document is invalid";
}
class InvalidJSON : public std::exception {
const char *what() const throw() { return "JSON document is invalid"; }
};
struct iterator {
class Iterator {
// might throw InvalidJSON if ParsedJson is invalid
explicit iterator(ParsedJson &pj_);
~iterator();
public:
explicit Iterator(ParsedJson &pj_);
~Iterator();
iterator(const iterator &o);
Iterator(const Iterator &o);
iterator(iterator &&o);
Iterator(Iterator &&o);
inline bool isOk() const;
inline bool is_ok() const;
// useful for debugging purposes
inline size_t get_tape_location() const;
@ -120,11 +119,12 @@ public:
// useful for debugging purposes
inline size_t get_tape_length() const;
// returns the current depth (start at 1 with 0 reserved for the fictitious root node)
// returns the current depth (start at 1 with 0 reserved for the fictitious
// root node)
inline size_t get_depth() const;
// A scope is a series of nodes at the same depth, typically it is either an object ({) or an array ([).
// The root node has type 'r'.
// A scope is a series of nodes at the same depth, typically it is either an
// object ({) or an array ([). The root node has type 'r'.
inline uint8_t get_scope_type() const;
// move forward in document order
@ -132,81 +132,65 @@ public:
// retrieve the character code of what we're looking at:
// [{"sltfn are the possibilities
inline uint8_t get_type() const {
return current_type; // short functions should be inlined!
inline uint8_t get_type() const {
return current_type; // short functions should be inlined!
}
// get the int64_t value at this node; valid only if we're at "l"
inline int64_t get_integer() const {
if(location + 1 >= tape_length) {
return 0;// default value in case of error
}
return static_cast<int64_t>(pj.tape[location + 1]);
inline int64_t get_integer() const {
if (location + 1 >= tape_length) {
return 0; // default value in case of error
}
return static_cast<int64_t>(pj.tape[location + 1]);
}
// get the string value at this node (NULL ended); valid only if we're at "
// note that tabs, and line endings are escaped in the returned value (see print_with_escapes)
// return value is valid UTF-8
// It may contain NULL chars within the string: get_string_length determines the true
// string length.
inline const char * get_string() const {
return reinterpret_cast<const char *>(pj.string_buf + (current_val & JSONVALUEMASK) + sizeof(uint32_t)) ;
// note that tabs, and line endings are escaped in the returned value (see
// print_with_escapes). The return value is valid UTF-8. It may contain NULL
// chars within the string: get_string_length determines the true string
// length.
inline const char *get_string() const {
return reinterpret_cast<const char *>(
pj.string_buf + (current_val & JSON_VALUE_MASK) + sizeof(uint32_t));
}
// return the length of the string in bytes
inline uint32_t get_string_length() const {
uint32_t answer;
memcpy(&answer, reinterpret_cast<const char *>(pj.string_buf + (current_val & JSONVALUEMASK)), sizeof(uint32_t));
memcpy(&answer,
reinterpret_cast<const char *>(pj.string_buf +
(current_val & JSON_VALUE_MASK)),
sizeof(uint32_t));
return answer;
}
// get the double value at this node; valid only if
// we're at "d"
inline double get_double() const {
if(location + 1 >= tape_length) {
return NAN;// default value in case of error
inline double get_double() const {
if (location + 1 >= tape_length) {
return NAN; // default value in case of error
}
double answer;
memcpy(&answer, & pj.tape[location + 1], sizeof(answer));
memcpy(&answer, &pj.tape[location + 1], sizeof(answer));
return answer;
}
inline bool is_object_or_array() const { return is_object() || is_array(); }
inline bool is_object_or_array() const {
return is_object() || is_array();
}
inline bool is_object() const { return get_type() == '{'; }
inline bool is_object() const {
return get_type() == '{';
}
inline bool is_array() const { return get_type() == '['; }
inline bool is_array() const {
return get_type() == '[';
}
inline bool is_string() const { return get_type() == '"'; }
inline bool is_string() const {
return get_type() == '"';
}
inline bool is_integer() const { return get_type() == 'l'; }
inline bool is_integer() const {
return get_type() == 'l';
}
inline bool is_double() const { return get_type() == 'd'; }
inline bool is_double() const {
return get_type() == 'd';
}
inline bool is_true() const { return get_type() == 't'; }
inline bool is_true() const {
return get_type() == 't';
}
inline bool is_false() const { return get_type() == 'f'; }
inline bool is_false() const {
return get_type() == 'f';
}
inline bool is_null() const {
return get_type() == 'n';
}
inline bool is_null() const { return get_type() == 'n'; }
static bool is_object_or_array(uint8_t type) {
return ((type == '[') || (type == '{'));
@ -219,16 +203,17 @@ public:
// We seek the key using C's strcmp so if your JSON strings contain
// NULL chars, this would trigger a false positive: if you expect that
// to be the case, take extra precautions.
inline bool move_to_key(const char * key);
inline bool move_to_key(const char *key);
// when at {, go one level deep, looking for a given key
// if successful, we are left pointing at the value,
// if not, we are still pointing at the object ({)
// (in case of repeated keys, this only finds the first one).
// The string we search for can contain NULL values.
inline bool move_to_key(const char * key, uint32_t length);
// when at a key location within an object, this moves to the accompanying value (located next to it).
// this is equivalent but much faster than calling "next()".
inline bool move_to_key(const char *key, uint32_t length);
// when at a key location within an object, this moves to the accompanying
// value (located next to it). This is equivalent to, but much faster than,
// calling "next()".
inline void move_to_value();
// when at [, go one level deep, and advance to the given index.
@ -239,54 +224,55 @@ public:
// Moves the iterator to the value corresponding to the json pointer.
// Always search from the root of the document.
// if successful, we are left pointing at the value,
// if not, we are still pointing the same value we were pointing before the call.
// The json pointer follows the rfc6901 standard's syntax: https://tools.ietf.org/html/rfc6901
// However, the standard says "If a referenced member name is not unique in an object,
// the member that is referenced is undefined, and evaluation fails".
// Here we just return the first corresponding value.
// The length parameter is the length of the jsonpointer string ('pointer').
bool move_to(const char * pointer, uint32_t length);
// if not, we are still pointing at the same value we were pointing at before
// the call.
// The json pointer follows the rfc6901 standard's syntax:
// https://tools.ietf.org/html/rfc6901
// However, the standard says "If a referenced member name is not unique in
// an object, the member that is referenced is undefined, and evaluation
// fails". Here we just return the first corresponding value.
// The length parameter is the length of the jsonpointer string ('pointer').
bool move_to(const char *pointer, uint32_t length);
// Moves the iterator to the value corresponding to the json pointer.
// Always search from the root of the document.
// if successful, we are left pointing at the value,
// if not, we are still pointing the same value we were pointing before the call.
// The json pointer implementation follows the rfc6901 standard's syntax: https://tools.ietf.org/html/rfc6901
// However, the standard says "If a referenced member name is not unique in an object,
// the member that is referenced is undefined, and evaluation fails".
// Here we just return the first corresponding value.
inline bool move_to(const std::string & pointer) {
// if not, we are still pointing at the same value we were pointing at before
// the call.
// The json pointer implementation follows the rfc6901 standard's syntax:
// https://tools.ietf.org/html/rfc6901
// However, the standard says "If a referenced member name is not unique in
// an object, the member that is referenced is undefined, and evaluation
// fails". Here we just return the first corresponding value.
inline bool move_to(const std::string &pointer) {
return move_to(pointer.c_str(), pointer.length());
}
private:
// Almost the same as move_to(), except it searches from the current
// position. The pointer's syntax is identical, though that case is not
// handled by the rfc6901 standard. The '/' is still required at the
// beginning. However, contrary to move_to(), the URI Fragment Identifier
// Representation is not supported here. Also, in case of failure, we are
// left pointing at the closest value it could reach. For these reasons it
// is private. It exists because it is used by move_to().
bool relative_move_to(const char *pointer, uint32_t length);
// Almost the same as move_to(), except it searchs from the current position.
// The pointer's syntax is identical, though that case is not handled by the rfc6901 standard.
// The '/' is still required at the beginning.
// However, contrary to move_to(), the URI Fragment Identifier Representation is not supported here.
// Also, in case of failure, we are left pointing at the closest value it could reach.
// For these reasons it is private. It exists because it is used by move_to().
bool relative_move_to(const char * pointer, uint32_t length);
public:
// throughout return true if we can do the navigation, false
// otherwise
// Within a given scope (series of nodes at the same depth within either an
// array or an object), we move forward.
// Thus, given [true, null, {"a":1}, [1,2]], we would visit true, null, { and [.
// At the object ({) or at the array ([), you can issue a "down" to visit their content.
// valid if we're not at the end of a scope (returns true).
// Thus, given [true, null, {"a":1}, [1,2]], we would visit true, null, {
// and [. At the object ({) or at the array ([), you can issue a "down" to
// visit their content. Valid if we're not at the end of a scope (returns
// true).
inline bool next();
// Within a given scope (series of nodes at the same depth within either an
// array or an object), we move backward.
// Thus, given [true, null, {"a":1}, [1,2]], we would visit ], }, null, true when starting at the end
// of the scope.
// At the object ({) or at the array ([), you can issue a "down" to visit their content.
// Thus, given [true, null, {"a":1}, [1,2]], we would visit ], }, null, true
// when starting at the end of the scope. At the object ({) or at the array
// ([), you can issue a "down" to visit their content.
inline bool prev();
// Moves back to either the containing array or object (type { or [) from
@ -294,11 +280,9 @@ public:
// Valid unless we are at the first level of the document
inline bool up();
// Valid if we're at a [ or { and it starts a non-empty scope; moves us to start of
// that deeper scope if it not empty.
// Thus, given [true, null, {"a":1}, [1,2]], if we are at the { node, we would move to the
// "a" node.
// Valid if we're at a [ or { and it starts a non-empty scope; moves us to
// start of that deeper scope if it is not empty. Thus, given [true, null,
// {"a":1}, [1,2]], if we are at the { node, we would move to the "a" node.
inline bool down();
// move us to the start of our current scope,
@ -306,7 +290,8 @@ public:
inline void to_start_scope();
inline void rewind() {
while(up());
while (up())
;
}
// void to_end_scope(); // move us to
@ -314,26 +299,28 @@ public:
// print the thing we're currently pointing at
bool print(std::ostream &os, bool escape_strings = true) const;
typedef struct {size_t start_of_scope; uint8_t scope_type;} scopeindex_t;
typedef struct {
size_t start_of_scope;
uint8_t scope_type;
} scopeindex_t;
private:
iterator& operator=(const iterator& other) = delete ;
private:
Iterator &operator=(const Iterator &other) = delete;
ParsedJson &pj;
size_t depth;
size_t location; // our current location on a tape
size_t location; // our current location on a tape
size_t tape_length;
uint8_t current_type;
uint64_t current_val;
scopeindex_t *depthindex;
scopeindex_t *depth_index;
};
size_t bytecapacity{0}; // indicates how many bits are meant to be supported
size_t byte_capacity{0}; // indicates how many bytes are meant to be supported
size_t depthcapacity{0}; // how deep we can go
size_t tapecapacity{0};
size_t stringcapacity{0};
size_t depth_capacity{0}; // how deep we can go
size_t tape_capacity{0};
size_t string_capacity{0};
uint32_t current_loc{0};
uint32_t n_structural_indexes{0};
@ -343,24 +330,23 @@ private:
uint32_t *containing_scope_offset;
#ifdef SIMDJSON_USE_COMPUTED_GOTO
void **ret_address;
#else
#else
char *ret_address;
#endif
uint8_t *string_buf; // should be at least bytecapacity
uint8_t *string_buf; // should be at least byte_capacity
uint8_t *current_string_buf_loc;
bool isvalid{false};
int errorcode{simdjson::UNITIALIZED};
bool valid{false};
int error_code{simdjson::UNITIALIZED};
private :
// we don't want the default constructor to be called
ParsedJson(const ParsedJson & p) = delete; // we don't want the default constructor to be called
// we don't want the assignment to be called
ParsedJson & operator=(const ParsedJson&o) = delete;
private:
// we don't want the copy constructor to be called
ParsedJson(const ParsedJson &p) =
delete; // we don't want the copy constructor to be called
// we don't want the assignment to be called
ParsedJson &operator=(const ParsedJson &o) = delete;
};
// dump bits low to high
inline void dumpbits_always(uint64_t v, const std::string &msg) {
for (uint32_t i = 0; i < 64; i++) {
@ -377,188 +363,180 @@ inline void dumpbits32_always(uint32_t v, const std::string &msg) {
}
WARN_UNUSED
bool ParsedJson::iterator::isOk() const {
return location < tape_length;
}
bool ParsedJson::Iterator::is_ok() const { return location < tape_length; }
// useful for debugging purposes
size_t ParsedJson::iterator::get_tape_location() const {
return location;
}
size_t ParsedJson::Iterator::get_tape_location() const { return location; }
// useful for debugging purposes
size_t ParsedJson::iterator::get_tape_length() const {
return tape_length;
size_t ParsedJson::Iterator::get_tape_length() const { return tape_length; }
// returns the current depth (start at 1 with 0 reserved for the fictitious root
// node)
size_t ParsedJson::Iterator::get_depth() const { return depth; }
// A scope is a series of nodes at the same depth, typically it is either an
// object ({) or an array ([). The root node has type 'r'.
uint8_t ParsedJson::Iterator::get_scope_type() const {
return depth_index[depth].scope_type;
}
// returns the current depth (start at 1 with 0 reserved for the fictitious root node)
size_t ParsedJson::iterator::get_depth() const {
return depth;
}
// A scope is a series of nodes at the same depth, typically it is either an object ({) or an array ([).
// The root node has type 'r'.
uint8_t ParsedJson::iterator::get_scope_type() const {
return depthindex[depth].scope_type;
}
bool ParsedJson::iterator::move_forward() {
if(location + 1 >= tape_length) {
return false; // we are at the end!
}
if ((current_type == '[') || (current_type == '{')){
// We are entering a new scope
depth++;
depthindex[depth].start_of_scope = location;
depthindex[depth].scope_type = current_type;
} else if ((current_type == ']') || (current_type == '}')) {
// Leaving a scope.
depth--;
} else if ((current_type == 'd') || (current_type == 'l')) {
// d and l types use 2 locations on the tape, not just one.
location += 1;
}
bool ParsedJson::Iterator::move_forward() {
if (location + 1 >= tape_length) {
return false; // we are at the end!
}
if ((current_type == '[') || (current_type == '{')) {
// We are entering a new scope
depth++;
depth_index[depth].start_of_scope = location;
depth_index[depth].scope_type = current_type;
} else if ((current_type == ']') || (current_type == '}')) {
// Leaving a scope.
depth--;
} else if ((current_type == 'd') || (current_type == 'l')) {
// d and l types use 2 locations on the tape, not just one.
location += 1;
current_val = pj.tape[location];
current_type = (current_val >> 56);
return true;
}
location += 1;
current_val = pj.tape[location];
current_type = (current_val >> 56);
return true;
}
void ParsedJson::iterator::move_to_value() {
// assume that we are on a key, so move by 1.
location += 1;
current_val = pj.tape[location];
current_type = (current_val >> 56);
void ParsedJson::Iterator::move_to_value() {
// assume that we are on a key, so move by 1.
location += 1;
current_val = pj.tape[location];
current_type = (current_val >> 56);
}
bool ParsedJson::iterator::move_to_key(const char * key) {
if(down()) {
do {
assert(is_string());
bool rightkey = (strcmp(get_string(),key)==0);// null chars would fool this
move_to_value();
if(rightkey) {
return true;
}
} while(next());
assert(up());// not found
}
return false;
}
bool ParsedJson::iterator::move_to_key(const char * key, uint32_t length) {
if(down()) {
do {
assert(is_string());
bool rightkey = ((get_string_length() == length) && (memcmp(get_string(),key,length)==0));
move_to_value();
if(rightkey) {
return true;
}
} while(next());
assert(up());// not found
}
return false;
}
bool ParsedJson::iterator::move_to_index(uint32_t index) {
assert(is_array());
if (down()) {
uint32_t i = 0;
for (; i < index; i++) {
if (!next()) {
break;
}
}
if (i == index) {
bool ParsedJson::Iterator::move_to_key(const char *key) {
if (down()) {
do {
assert(is_string());
bool right_key =
(strcmp(get_string(), key) == 0); // null chars would fool this
move_to_value();
if (right_key) {
return true;
}
assert(up());
}
return false;
} while (next());
assert(up()); // not found
}
return false;
}
bool ParsedJson::iterator::prev() {
if(location - 1 < depthindex[depth].start_of_scope) {
return false;
}
location -= 1;
current_val = pj.tape[location];
current_type = (current_val >> 56);
if ((current_type == ']') || (current_type == '}')){
// we need to jump
size_t new_location = ( current_val & JSONVALUEMASK);
if(new_location < depthindex[depth].start_of_scope) {
return false; // shoud never happen
bool ParsedJson::Iterator::move_to_key(const char *key, uint32_t length) {
if (down()) {
do {
assert(is_string());
bool right_key = ((get_string_length() == length) &&
(memcmp(get_string(), key, length) == 0));
move_to_value();
if (right_key) {
return true;
}
location = new_location;
current_val = pj.tape[location];
current_type = (current_val >> 56);
}
return true;
} while (next());
assert(up()); // not found
}
return false;
}
// Leaves the current scope and positions the iterator on the tape entry just
// before the scope's first element. Returns false, without moving, when the
// iterator is already at depth 1.
bool ParsedJson::iterator::up() {
  const bool at_outermost_depth = (depth == 1);
  if (at_outermost_depth) {
    return false; // don't allow moving back to root
  }
  to_start_scope();
  // next we just move to the previous value
  --depth;
  --location;
  const uint64_t word = pj.tape[location];
  current_val = word;
  current_type = (word >> 56);
  return true;
}
bool ParsedJson::iterator::down() {
if(location + 1 >= tape_length) {
return false;
}
if ((current_type == '[') || (current_type == '{')) {
size_t npos = (current_val & JSONVALUEMASK);
if(npos == location + 2) {
return false; // we have an empty scope
bool ParsedJson::Iterator::move_to_index(uint32_t index) {
assert(is_array());
if (down()) {
uint32_t i = 0;
for (; i < index; i++) {
if (!next()) {
break;
}
depth++;
location = location + 1;
depthindex[depth].start_of_scope = location;
depthindex[depth].scope_type = current_type;
current_val = pj.tape[location];
current_type = (current_val >> 56);
}
if (i == index) {
return true;
}
return false;
assert(up());
}
return false;
}
void ParsedJson::iterator::to_start_scope() {
location = depthindex[depth].start_of_scope;
// Moves one element backwards inside the current scope.
//
// Returns false, without moving, if the previous tape slot lies before the
// start of the current scope. When the previous slot is a closing ']' or '}',
// the payload of that entry is used as the new location (presumably the index
// of the matching opening entry). If that payload points before the start of
// the scope the function returns false, but note that by then the iterator has
// already stepped back by one slot.
bool ParsedJson::Iterator::prev() {
  const auto scope_start = depth_index[depth].start_of_scope;
  if (location - 1 < scope_start) {
    return false;
  }
  --location;
  current_val = pj.tape[location];
  current_type = (current_val >> 56);
  const bool on_scope_end = (current_type == ']') || (current_type == '}');
  if (!on_scope_end) {
    return true;
  }
  // we need to jump
  const size_t jump_target = (current_val & JSON_VALUE_MASK);
  if (jump_target < scope_start) {
    return false; // should never happen
  }
  location = jump_target;
  current_val = pj.tape[location];
  current_type = (current_val >> 56);
  return true;
}
bool ParsedJson::iterator::next() {
size_t npos;
if ((current_type == '[') || (current_type == '{')){
// we need to jump
npos = ( current_val & JSONVALUEMASK);
} else {
npos = location + ((current_type == 'd' || current_type == 'l') ? 2 : 1);
// Climbs out of the current scope: rewinds to the scope's first element,
// decrements the depth and steps back one tape slot, then reloads the cached
// word and type. Refuses (returns false, no state change) at depth 1.
bool ParsedJson::Iterator::up() {
  if (depth == 1) {
    return false; // don't allow moving back to root
  }
  to_start_scope();
  // next we just move to the previous value
  --depth;
  --location;
  const uint64_t word = pj.tape[location];
  current_val = word;
  current_type = (word >> 56);
  return true;
}
bool ParsedJson::Iterator::down() {
if (location + 1 >= tape_length) {
return false;
}
if ((current_type == '[') || (current_type == '{')) {
size_t npos = (current_val & JSON_VALUE_MASK);
if (npos == location + 2) {
return false; // we have an empty scope
}
uint64_t nextval = pj.tape[npos];
uint8_t nexttype = (nextval >> 56);
if((nexttype == ']') || (nexttype == '}')) {
return false; // we reached the end of the scope
}
location = npos;
current_val = nextval;
current_type = nexttype;
depth++;
location = location + 1;
depth_index[depth].start_of_scope = location;
depth_index[depth].scope_type = current_type;
current_val = pj.tape[location];
current_type = (current_val >> 56);
return true;
}
return false;
}
// Rewinds the iterator to the location recorded as the start of the scope at
// the current depth and reloads the cached word and type tag from the tape.
void ParsedJson::Iterator::to_start_scope() {
  location = depth_index[depth].start_of_scope;
  const uint64_t word = pj.tape[location];
  current_val = word;
  current_type = (word >> 56);
}
// Moves to the next element at the current level.
//
// The candidate position is computed from the current entry:
//   - on '[' or '{' the entry's payload (masked with JSON_VALUE_MASK) is used
//     as the next tape index, which skips the nested scope;
//   - on 'd' or 'l' the iterator advances by two slots (presumably because the
//     number's payload occupies the following tape word);
//   - otherwise it advances by one slot.
// If the candidate entry is a closing ']' or '}', the iterator is left where
// it was and false is returned; otherwise the iterator moves and true is
// returned.
bool ParsedJson::Iterator::next() {
  const bool on_scope_start = (current_type == '[') || (current_type == '{');
  size_t candidate;
  if (on_scope_start) {
    // we need to jump
    candidate = (current_val & JSON_VALUE_MASK);
  } else {
    const bool two_slots = (current_type == 'd') || (current_type == 'l');
    candidate = location + (two_slots ? 2 : 1);
  }
  const uint64_t candidate_val = pj.tape[candidate];
  const uint8_t candidate_type = (candidate_val >> 56);
  const bool hit_scope_end = (candidate_type == ']') || (candidate_type == '}');
  if (hit_scope_end) {
    return false; // we reached the end of the scope
  }
  location = candidate;
  current_val = candidate_val;
  current_type = candidate_type;
  return true;
}
} // namespace simdjson
#endif

View File

@ -2,33 +2,32 @@
#define SIMDJSON_PORTABILITY_H
#if defined(__x86_64__) || defined(_M_AMD64)
# define IS_X86_64 1
#define IS_X86_64 1
#endif
#if defined(__aarch64__) || defined(_M_ARM64)
# define IS_ARM64 1
#define IS_ARM64 1
#endif
// this is almost standard?
#define STRINGIFY(a) #a
// we are going to use runtime dispatch
#ifdef IS_X86_64
#ifdef __clang__
// clang does not have GCC push pop
// warning: clang attribute push can't be used within a namespace in clang up til 8.0 so TARGET_REGION and
// UNTARGET_REGION must be *outside* of a namespace.
#define TARGET_REGION(T) _Pragma(STRINGIFY(clang attribute push(__attribute__((target(T))), apply_to=function)))
// warning: clang attribute push can't be used within a namespace in clang up
// til 8.0 so TARGET_REGION and UNTARGET_REGION must be *outside* of a
// namespace.
#define TARGET_REGION(T) \
_Pragma(STRINGIFY( \
clang attribute push(__attribute__((target(T))), apply_to = function)))
#define UNTARGET_REGION _Pragma("clang attribute pop")
#elif defined(__GNUC__)
// GCC is easier
#define TARGET_REGION(T) \
_Pragma("GCC push_options") \
_Pragma(STRINGIFY(GCC target(T)))
#define UNTARGET_REGION \
_Pragma("GCC pop_options")
#else
#define TARGET_REGION(T) \
_Pragma("GCC push_options") _Pragma(STRINGIFY(GCC target(T)))
#define UNTARGET_REGION _Pragma("GCC pop_options")
#else
#define TARGET_REGION(T)
#define UNTARGET_REGION
#endif // clang then gcc
@ -39,49 +38,50 @@ _Pragma("GCC pop_options")
#endif // x86
#ifdef _MSC_VER
# include <intrin.h>
#include <intrin.h>
#else
# if IS_X86_64
# include <x86intrin.h>
# elif IS_ARM64
# include <arm_neon.h>
# endif
#if IS_X86_64
#include <x86intrin.h>
#elif IS_ARM64
#include <arm_neon.h>
#endif
#endif
#ifdef _MSC_VER
/* Microsoft C/C++-compatible compiler */
#include <iso646.h>
#include <cstdint>
#include <iso646.h>
namespace simdjson {
static inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
return _addcarry_u64(0, value1, value2, reinterpret_cast<unsigned __int64 *>(result));
static inline bool add_overflow(uint64_t value1, uint64_t value2,
uint64_t *result) {
return _addcarry_u64(0, value1, value2,
reinterpret_cast<unsigned __int64 *>(result));
}
# pragma intrinsic(_umul128)
static inline bool mul_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
uint64_t high;
*result = _umul128(value1, value2, &high);
return high;
#pragma intrinsic(_umul128)
// MSVC flavour: stores the low 64 bits of value1 * value2 in *result and
// returns true when the high 64 bits produced by _umul128 are non-zero.
static inline bool mul_overflow(uint64_t value1, uint64_t value2,
                                uint64_t *result) {
  uint64_t upper_half;
  *result = _umul128(value1, value2, &upper_half);
  return upper_half != 0;
}
static inline int trailingzeroes(uint64_t input_num) {
return static_cast<int>(_tzcnt_u64(input_num));
// MSVC flavour: returns the _tzcnt_u64 result for input_num, narrowed to int.
static inline int trailing_zeroes(uint64_t input_num) {
  const auto zero_count = _tzcnt_u64(input_num);
  return static_cast<int>(zero_count);
}
static inline int leadingzeroes(uint64_t input_num) {
return static_cast<int>(_lzcnt_u64(input_num));
// MSVC flavour: returns the _lzcnt_u64 result for input_num, narrowed to int.
static inline int leading_zeroes(uint64_t input_num) {
  const auto zero_count = _lzcnt_u64(input_num);
  return static_cast<int>(zero_count);
}
static inline int hamming(uint64_t input_num) {
#ifdef _WIN64 // highly recommended!!!
return (int)__popcnt64(input_num);
#else // if we must support 32-bit Windows
return (int)(__popcnt((uint32_t)input_num) +
__popcnt((uint32_t)(input_num >> 32)));
#ifdef _WIN64 // highly recommended!!!
return (int)__popcnt64(input_num);
#else // if we must support 32-bit Windows
return (int)(__popcnt((uint32_t)input_num) +
__popcnt((uint32_t)(input_num >> 32)));
#endif
}
} // namespace simdjson
@ -90,78 +90,83 @@ static inline int hamming(uint64_t input_num) {
#include <cstdlib>
namespace simdjson {
static inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
return __builtin_uaddll_overflow(value1, value2, (unsigned long long*)result);
// Stores value1 + value2 (wrapped modulo 2^64) in *result and returns true
// when the mathematical sum does not fit in 64 bits.
//
// The builtin writes through an `unsigned long long *`. uint64_t may be a
// different type (e.g. `unsigned long`), so the sum goes through a correctly
// typed temporary instead of casting the caller's pointer.
static inline bool add_overflow(uint64_t value1, uint64_t value2,
                                uint64_t *result) {
  unsigned long long sum = 0;
  const bool overflowed = __builtin_uaddll_overflow(value1, value2, &sum);
  *result = static_cast<uint64_t>(sum);
  return overflowed;
}
static inline bool mul_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
return __builtin_umulll_overflow(value1, value2, (unsigned long long *)result);
// Stores value1 * value2 (wrapped modulo 2^64) in *result and returns true
// when the mathematical product does not fit in 64 bits.
//
// The builtin writes through an `unsigned long long *`. uint64_t may be a
// different type (e.g. `unsigned long`), so the product goes through a
// correctly typed temporary instead of casting the caller's pointer.
static inline bool mul_overflow(uint64_t value1, uint64_t value2,
                                uint64_t *result) {
  unsigned long long product = 0;
  const bool overflowed = __builtin_umulll_overflow(value1, value2, &product);
  *result = static_cast<uint64_t>(product);
  return overflowed;
}
/* result might be undefined when input_num is zero */
static inline int trailingzeroes(uint64_t input_num) {
#ifdef __BMI__// tzcnt is BMI1
return _tzcnt_u64(input_num);
// Number of consecutive zero bits starting at the least significant bit.
// The result might be undefined when input_num is zero (the builtin gives no
// guarantee for a zero argument).
static inline int trailing_zeroes(uint64_t input_num) {
#ifdef __BMI__ // tzcnt is BMI1
  // explicit narrowing, matching the MSVC implementation of this function
  return static_cast<int>(_tzcnt_u64(input_num));
#else
  return __builtin_ctzll(input_num);
#endif
}
/* result might be undefined when input_num is zero */
static inline int leadingzeroes(uint64_t input_num) {
// Number of consecutive zero bits starting at the most significant bit.
// The result might be undefined when input_num is zero (the builtin gives no
// guarantee for a zero argument).
static inline int leading_zeroes(uint64_t input_num) {
#ifdef __LZCNT__ // lzcnt has its own feature flag; it is not part of BMI2
  return static_cast<int>(_lzcnt_u64(input_num));
#else
  return __builtin_clzll(input_num);
#endif
}
/* population count: number of bits set in input_num (0 when input_num is 0) */
static inline int hamming(uint64_t input_num) {
  // The previous `#ifdef __POPCOUNT__` branch could never be selected: GCC and
  // Clang signal popcnt support with __POPCNT__, not __POPCOUNT__. The builtin
  // is what was always compiled, so it is kept as the single implementation.
  return __builtin_popcountll(input_num);
}
}
} // namespace simdjson
#endif // _MSC_VER
namespace simdjson {
// portable version of posix_memalign
static inline void *aligned_malloc(size_t alignment, size_t size) {
void *p;
void *p;
#ifdef _MSC_VER
p = _aligned_malloc(size, alignment);
p = _aligned_malloc(size, alignment);
#elif defined(__MINGW32__) || defined(__MINGW64__)
p = __mingw_aligned_malloc(size, alignment);
p = __mingw_aligned_malloc(size, alignment);
#else
// somehow, if this is used before including "x86intrin.h", it creates an
// implicit defined warning.
if (posix_memalign(&p, alignment, size) != 0) { return nullptr; }
// somehow, if this is used before including "x86intrin.h", it creates an
// implicit defined warning.
if (posix_memalign(&p, alignment, size) != 0) {
return nullptr;
}
#endif
return p;
return p;
}
// Convenience wrapper: same allocation as aligned_malloc(alignment, size),
// with the returned pointer typed as char *.
static inline char *aligned_malloc_char(size_t alignment, size_t size) {
  return static_cast<char *>(aligned_malloc(alignment, size));
}
static inline void aligned_free(void *memblock) {
if(memblock == nullptr) { return; }
static inline void aligned_free(void *mem_block) {
if (mem_block == nullptr) {
return;
}
#ifdef _MSC_VER
_aligned_free(memblock);
_aligned_free(mem_block);
#elif defined(__MINGW32__) || defined(__MINGW64__)
__mingw_aligned_free(memblock);
__mingw_aligned_free(mem_block);
#else
free(memblock);
free(mem_block);
#endif
}
static inline void aligned_free_char(char *memblock) {
aligned_free((void*)memblock);
}
// Convenience wrapper: forwards a char * block to aligned_free.
static inline void aligned_free_char(char *mem_block) {
  void *const untyped_block = static_cast<void *>(mem_block);
  aligned_free(untyped_block);
}
} // namespace simdjson
#endif // SIMDJSON_PORTABILITY_H

View File

@ -5,38 +5,40 @@
namespace simdjson {
// Represents the minimal architecture that would support an implementation
enum class architecture {
westmere,
haswell,
arm64,
none,
// TODO remove 'native' in favor of runtime dispatch?
// the 'native' enum class value should point at a good default on the current machine
enum class Architecture {
WESTMERE,
HASWELL,
ARM64,
NONE,
// TODO remove 'native' in favor of runtime dispatch?
// the 'native' enum class value should point at a good default on the current
// machine
#ifdef IS_X86_64
native = westmere
NATIVE = WESTMERE
#elif defined(IS_ARM64)
native = arm64
NATIVE = ARM64
#endif
};
enum errorValues {
enum ErrorValues {
SUCCESS = 0,
CAPACITY, // This ParsedJson can't support a document that big
MEMALLOC, // Error allocating memory, most likely out of memory
TAPE_ERROR, // Something went wrong while writing to the tape (stage 2), this is a generic error
CAPACITY, // This ParsedJson can't support a document that big
MEMALLOC, // Error allocating memory, most likely out of memory
TAPE_ERROR, // Something went wrong while writing to the tape (stage 2), this
// is a generic error
DEPTH_ERROR, // Your document exceeds the user-specified depth limitation
STRING_ERROR, // Problem while parsing a string
T_ATOM_ERROR, // Problem while parsing an atom starting with the letter 't'
F_ATOM_ERROR, // Problem while parsing an atom starting with the letter 'f'
N_ATOM_ERROR, // Problem while parsing an atom starting with the letter 'n'
NUMBER_ERROR, // Problem while parsing a number
UTF8_ERROR, // the input is not valid UTF-8
UNITIALIZED, // unknown error, or uninitialized document
EMPTY, // no structural document found
STRING_ERROR, // Problem while parsing a string
T_ATOM_ERROR, // Problem while parsing an atom starting with the letter 't'
F_ATOM_ERROR, // Problem while parsing an atom starting with the letter 'f'
N_ATOM_ERROR, // Problem while parsing an atom starting with the letter 'n'
NUMBER_ERROR, // Problem while parsing a number
UTF8_ERROR, // the input is not valid UTF-8
UNITIALIZED, // unknown error, or uninitialized document
EMPTY, // no structural document found
UNESCAPED_CHARS, // found unescaped characters in a string.
UNCLOSED_STRING, // missing quote at the end
UNEXPECTED_ERROR // indicative of a bug in simdjson
};
const std::string& errorMsg(const int);
}
const std::string &error_message(const int);
} // namespace simdjson
#endif

View File

@ -1,12 +1,13 @@
// /include/simdjson/simdjson_version.h automatically generated by release.py, do not change by hand
#ifndef SIMDJSON_INCLUDE_SIMDJSON_VERSION
#define SIMDJSON_INCLUDE_SIMDJSON_VERSION
#define SIMDJSON_VERSION 0.1.2
// /include/simdjson/simdjson_version.h automatically generated by release.py,
// do not change by hand
#ifndef SIMDJSON_INCLUDE_SIMDJSON_VERSION
#define SIMDJSON_INCLUDE_SIMDJSON_VERSION
#define SIMDJSON_VERSION 0.1.2
namespace simdjson {
enum {
SIMDJSON_VERSION_MAJOR = 0,
SIMDJSON_VERSION_MINOR = 1,
SIMDJSON_VERSION_REVISION = 2
};
enum {
SIMDJSON_VERSION_MAJOR = 0,
SIMDJSON_VERSION_MINOR = 1,
SIMDJSON_VERSION_REVISION = 2
};
}
#endif // SIMDJSON_INCLUDE_SIMDJSON_VERSION
#endif // SIMDJSON_INCLUDE_SIMDJSON_VERSION

View File

@ -4,14 +4,15 @@
#ifndef SIMDJSON_SIMDUTF8CHECK_ARM64_H
#define SIMDJSON_SIMDUTF8CHECK_ARM64_H
#if defined(_ARM_NEON) || defined(__aarch64__) || (defined(_MSC_VER) && defined(_M_ARM64))
#if defined(_ARM_NEON) || defined(__aarch64__) || \
(defined(_MSC_VER) && defined(_M_ARM64))
#include <cstdio>
#include <arm_neon.h>
#include <cinttypes>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <cinttypes>
#include <arm_neon.h>
/*
* legal utf-8 byte sequence
@ -32,47 +33,49 @@
namespace simdjson {
// all byte values must be no larger than 0xF4
static inline void checkSmallerThan0xF4(int8x16_t current_bytes,
int8x16_t *has_error) {
static inline void check_smaller_than_0xF4(int8x16_t current_bytes,
int8x16_t *has_error) {
// unsigned, saturates to 0 below max
*has_error = vorrq_s8(*has_error,
vreinterpretq_s8_u8(vqsubq_u8(vreinterpretq_u8_s8(current_bytes), vdupq_n_u8(0xF4))));
*has_error = vorrq_s8(
*has_error, vreinterpretq_s8_u8(vqsubq_u8(
vreinterpretq_u8_s8(current_bytes), vdupq_n_u8(0xF4))));
}
static const int8_t _nibbles[] = {
1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII)
0, 0, 0, 0, // 10xx (continuation)
2, 2, // 110x
3, // 1110
4, // 1111, next should be 0 (not checked here)
1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII)
0, 0, 0, 0, // 10xx (continuation)
2, 2, // 110x
3, // 1110
4, // 1111, next should be 0 (not checked here)
};
static inline int8x16_t continuationLengths(int8x16_t high_nibbles) {
// Looks up each lane of high_nibbles in the _nibbles table and returns the
// table values, one per lane.
static inline int8x16_t continuation_lengths(int8x16_t high_nibbles) {
  const int8x16_t length_table = vld1q_s8(_nibbles);
  const uint8x16_t lookup_index = vreinterpretq_u8_s8(high_nibbles);
  return vqtbl1q_s8(length_table, lookup_index);
}
static inline int8x16_t carryContinuations(int8x16_t initial_lengths,
int8x16_t previous_carries) {
static inline int8x16_t carry_continuations(int8x16_t initial_lengths,
int8x16_t previous_carries) {
int8x16_t right1 =
vreinterpretq_s8_u8(vqsubq_u8(vreinterpretq_u8_s8(vextq_s8(previous_carries, initial_lengths, 16 - 1)),
vdupq_n_u8(1)));
int8x16_t right1 = vreinterpretq_s8_u8(vqsubq_u8(
vreinterpretq_u8_s8(vextq_s8(previous_carries, initial_lengths, 16 - 1)),
vdupq_n_u8(1)));
int8x16_t sum = vaddq_s8(initial_lengths, right1);
int8x16_t right2 = vreinterpretq_s8_u8(vqsubq_u8(vreinterpretq_u8_s8(vextq_s8(previous_carries, sum, 16 - 2)),
vdupq_n_u8(2)));
int8x16_t right2 = vreinterpretq_s8_u8(
vqsubq_u8(vreinterpretq_u8_s8(vextq_s8(previous_carries, sum, 16 - 2)),
vdupq_n_u8(2)));
return vaddq_s8(sum, right2);
}
static inline void checkContinuations(int8x16_t initial_lengths, int8x16_t carries,
int8x16_t *has_error) {
static inline void check_continuations(int8x16_t initial_lengths,
int8x16_t carries,
int8x16_t *has_error) {
// overlap || underlap
// carry > length && length > 0 || !(carry > length) && !(length > 0)
// (carries > length) == (lengths > 0)
uint8x16_t overunder =
vceqq_u8(vcgtq_s8(carries, initial_lengths),
vcgtq_s8(initial_lengths, vdupq_n_s8(0)));
uint8x16_t overunder = vceqq_u8(vcgtq_s8(carries, initial_lengths),
vcgtq_s8(initial_lengths, vdupq_n_s8(0)));
*has_error = vorrq_s8(*has_error, vreinterpretq_s8_u8(overunder));
}
@ -80,9 +83,9 @@ static inline void checkContinuations(int8x16_t initial_lengths, int8x16_t carri
// when 0xED is found, next byte must be no larger than 0x9F
// when 0xF4 is found, next byte must be no larger than 0x8F
// next byte must be continuation, ie sign bit is set, so signed < is ok
static inline void checkFirstContinuationMax(int8x16_t current_bytes,
int8x16_t off1_current_bytes,
int8x16_t *has_error) {
static inline void check_first_continuation_max(int8x16_t current_bytes,
int8x16_t off1_current_bytes,
int8x16_t *has_error) {
uint8x16_t maskED = vceqq_s8(off1_current_bytes, vdupq_n_s8(0xED));
uint8x16_t maskF4 = vceqq_s8(off1_current_bytes, vdupq_n_s8(0xF4));
@ -91,23 +94,24 @@ static inline void checkFirstContinuationMax(int8x16_t current_bytes,
uint8x16_t badfollowF4 =
vandq_u8(vcgtq_s8(current_bytes, vdupq_n_s8(0x8F)), maskF4);
*has_error = vorrq_s8(*has_error, vreinterpretq_s8_u8(vorrq_u8(badfollowED, badfollowF4)));
*has_error = vorrq_s8(
*has_error, vreinterpretq_s8_u8(vorrq_u8(badfollowED, badfollowF4)));
}
static const int8_t _initial_mins[] = {
-128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-128, -128, // 10xx => false
(int8_t) 0xC2, -128, // 110x
(int8_t) 0xE1, // 1110
(int8_t) 0xF1,
-128, -128, -128, -128, -128, -128,
-128, -128, -128, -128, -128, -128, // 10xx => false
(int8_t)0xC2, -128, // 110x
(int8_t)0xE1, // 1110
(int8_t)0xF1,
};
static const int8_t _second_mins[] = {
-128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-128, -128, // 10xx => false
127, 127, // 110x => true
(int8_t) 0xA0, // 1110
(int8_t) 0x90,
-128, -128, -128, -128, -128, -128,
-128, -128, -128, -128, -128, -128, // 10xx => false
127, 127, // 110x => true
(int8_t)0xA0, // 1110
(int8_t)0x90,
};
// map off1_hibits => error condition
@ -116,58 +120,61 @@ static const int8_t _second_mins[] = {
// E => < E1 && < A0
// F => < F1 && < 90
// else false && false
static inline void checkOverlong(int8x16_t current_bytes,
int8x16_t off1_current_bytes, int8x16_t hibits,
int8x16_t previous_hibits, int8x16_t *has_error) {
static inline void check_overlong(int8x16_t current_bytes,
int8x16_t off1_current_bytes,
int8x16_t hibits, int8x16_t previous_hibits,
int8x16_t *has_error) {
int8x16_t off1_hibits = vextq_s8(previous_hibits, hibits, 16 - 1);
int8x16_t initial_mins = vqtbl1q_s8(vld1q_s8(_initial_mins), vreinterpretq_u8_s8(off1_hibits));
int8x16_t initial_mins =
vqtbl1q_s8(vld1q_s8(_initial_mins), vreinterpretq_u8_s8(off1_hibits));
uint8x16_t initial_under = vcgtq_s8(initial_mins, off1_current_bytes);
int8x16_t second_mins = vqtbl1q_s8(vld1q_s8(_second_mins), vreinterpretq_u8_s8(off1_hibits));
int8x16_t second_mins =
vqtbl1q_s8(vld1q_s8(_second_mins), vreinterpretq_u8_s8(off1_hibits));
uint8x16_t second_under = vcgtq_s8(second_mins, current_bytes);
*has_error =
vorrq_s8(*has_error, vreinterpretq_s8_u8(vandq_u8(initial_under, second_under)));
*has_error = vorrq_s8(
*has_error, vreinterpretq_s8_u8(vandq_u8(initial_under, second_under)));
}
struct processed_utf_bytes {
int8x16_t rawbytes;
int8x16_t raw_bytes;
int8x16_t high_nibbles;
int8x16_t carried_continuations;
};
static inline void count_nibbles(int8x16_t bytes,
struct processed_utf_bytes *answer) {
answer->rawbytes = bytes;
answer->raw_bytes = bytes;
answer->high_nibbles =
vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(bytes), 4));
vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(bytes), 4));
}
// check whether the current bytes are valid UTF-8
// at the end of the function, previous gets updated
static inline struct processed_utf_bytes
checkUTF8Bytes(int8x16_t current_bytes, struct processed_utf_bytes *previous,
int8x16_t *has_error) {
check_utf8_bytes(int8x16_t current_bytes, struct processed_utf_bytes *previous,
int8x16_t *has_error) {
struct processed_utf_bytes pb;
count_nibbles(current_bytes, &pb);
checkSmallerThan0xF4(current_bytes, has_error);
check_smaller_than_0xF4(current_bytes, has_error);
int8x16_t initial_lengths = continuationLengths(pb.high_nibbles);
int8x16_t initial_lengths = continuation_lengths(pb.high_nibbles);
pb.carried_continuations =
carryContinuations(initial_lengths, previous->carried_continuations);
carry_continuations(initial_lengths, previous->carried_continuations);
checkContinuations(initial_lengths, pb.carried_continuations, has_error);
check_continuations(initial_lengths, pb.carried_continuations, has_error);
int8x16_t off1_current_bytes =
vextq_s8(previous->rawbytes, pb.rawbytes, 16 - 1);
checkFirstContinuationMax(current_bytes, off1_current_bytes, has_error);
vextq_s8(previous->raw_bytes, pb.raw_bytes, 16 - 1);
check_first_continuation_max(current_bytes, off1_current_bytes, has_error);
checkOverlong(current_bytes, off1_current_bytes, pb.high_nibbles,
previous->high_nibbles, has_error);
check_overlong(current_bytes, off1_current_bytes, pb.high_nibbles,
previous->high_nibbles, has_error);
return pb;
}
}// simdjson
} // namespace simdjson
#endif
#endif

View File

@ -1,11 +1,10 @@
#ifndef SIMDJSON_SIMDUTF8CHECK_HASWELL_H
#define SIMDJSON_SIMDUTF8CHECK_HASWELL_H
#include "simdjson/portability.h"
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include "simdjson/portability.h"
#ifdef IS_X86_64
/*
@ -38,14 +37,14 @@ static inline __m256i push_last_2bytes_of_a_to_b(__m256i a, __m256i b) {
}
// all byte values must be no larger than 0xF4
static inline void avxcheckSmallerThan0xF4(__m256i current_bytes,
__m256i *has_error) {
// all byte values must be no larger than 0xF4
// Any byte above 0xF4 leaves a non-zero remainder after the saturating
// subtraction, and that remainder is OR-ed into *has_error.
static inline void avx_check_smaller_than_0xF4(__m256i current_bytes,
                                               __m256i *has_error) {
  const __m256i upper_limit = _mm256_set1_epi8(0xF4);
  // unsigned, saturates to 0 below max
  const __m256i excess = _mm256_subs_epu8(current_bytes, upper_limit);
  *has_error = _mm256_or_si256(*has_error, excess);
}
static inline __m256i avxcontinuationLengths(__m256i high_nibbles) {
static inline __m256i avx_continuation_lengths(__m256i high_nibbles) {
return _mm256_shuffle_epi8(
_mm256_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII)
0, 0, 0, 0, // 10xx (continuation)
@ -61,8 +60,8 @@ static inline __m256i avxcontinuationLengths(__m256i high_nibbles) {
high_nibbles);
}
static inline __m256i avxcarryContinuations(__m256i initial_lengths,
__m256i previous_carries) {
static inline __m256i avx_carry_continuations(__m256i initial_lengths,
__m256i previous_carries) {
__m256i right1 = _mm256_subs_epu8(
push_last_byte_of_a_to_b(previous_carries, initial_lengths),
@ -74,8 +73,9 @@ static inline __m256i avxcarryContinuations(__m256i initial_lengths,
return _mm256_add_epi8(sum, right2);
}
static inline void avxcheckContinuations(__m256i initial_lengths,
__m256i carries, __m256i *has_error) {
static inline void avx_check_continuations(__m256i initial_lengths,
__m256i carries,
__m256i *has_error) {
// overlap || underlap
// carry > length && length > 0 || !(carry > length) && !(length > 0)
@ -90,9 +90,9 @@ static inline void avxcheckContinuations(__m256i initial_lengths,
// when 0xED is found, next byte must be no larger than 0x9F
// when 0xF4 is found, next byte must be no larger than 0x8F
// next byte must be continuation, ie sign bit is set, so signed < is ok
static inline void avxcheckFirstContinuationMax(__m256i current_bytes,
__m256i off1_current_bytes,
__m256i *has_error) {
static inline void avx_check_first_continuation_max(__m256i current_bytes,
__m256i off1_current_bytes,
__m256i *has_error) {
__m256i maskED =
_mm256_cmpeq_epi8(off1_current_bytes, _mm256_set1_epi8(0xED));
__m256i maskF4 =
@ -113,37 +113,37 @@ static inline void avxcheckFirstContinuationMax(__m256i current_bytes,
// E => < E1 && < A0
// F => < F1 && < 90
// else false && false
static inline void avxcheckOverlong(__m256i current_bytes,
__m256i off1_current_bytes, __m256i hibits,
__m256i previous_hibits,
__m256i *has_error) {
static inline void avx_check_overlong(__m256i current_bytes,
__m256i off1_current_bytes,
__m256i hibits, __m256i previous_hibits,
__m256i *has_error) {
__m256i off1_hibits = push_last_byte_of_a_to_b(previous_hibits, hibits);
__m256i initial_mins = _mm256_shuffle_epi8(
_mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128,
-128, -128, -128, -128, // 10xx => false
0xC2, -128, // 110x
0xE1, // 1110
0xF1, // 1111
-128, -128, -128, -128, -128, -128, -128, -128,
-128, -128, -128, -128, // 10xx => false
0xC2, -128, // 110x
0xE1, // 1110
0xF1), // 1111
_mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128,
-128, -128, -128, // 10xx => false
0xC2, -128, // 110x
0xE1, // 1110
0xF1, // 1111
-128, -128, -128, -128, -128, -128, -128, -128, -128,
-128, -128, -128, // 10xx => false
0xC2, -128, // 110x
0xE1, // 1110
0xF1), // 1111
off1_hibits);
__m256i initial_under = _mm256_cmpgt_epi8(initial_mins, off1_current_bytes);
__m256i second_mins = _mm256_shuffle_epi8(
_mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128,
-128, -128, -128, -128, // 10xx => false
127, 127, // 110x => true
0xA0, // 1110
0x90, // 1111
-128, -128, -128, -128, -128, -128, -128, -128,
-128, -128, -128, -128, // 10xx => false
127, 127, // 110x => true
0xA0, // 1110
0x90), // 1111
_mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128,
-128, -128, -128, // 10xx => false
127, 127, // 110x => true
0xA0, // 1110
0x90, // 1111
-128, -128, -128, -128, -128, -128, -128, -128, -128,
-128, -128, -128, // 10xx => false
127, 127, // 110x => true
0xA0, // 1110
0x90), // 1111
off1_hibits);
__m256i second_under = _mm256_cmpgt_epi8(second_mins, current_bytes);
*has_error = _mm256_or_si256(*has_error,
@ -151,14 +151,14 @@ static inline void avxcheckOverlong(__m256i current_bytes,
}
struct avx_processed_utf_bytes {
__m256i rawbytes;
__m256i raw_bytes;
__m256i high_nibbles;
__m256i carried_continuations;
};
static inline void avx_count_nibbles(__m256i bytes,
struct avx_processed_utf_bytes *answer) {
answer->rawbytes = bytes;
answer->raw_bytes = bytes;
answer->high_nibbles =
_mm256_and_si256(_mm256_srli_epi16(bytes, 4), _mm256_set1_epi8(0x0F));
}
@ -166,33 +166,33 @@ static inline void avx_count_nibbles(__m256i bytes,
// check whether the current bytes are valid UTF-8
// at the end of the function, previous gets updated
static inline struct avx_processed_utf_bytes
avxcheckUTF8Bytes(__m256i current_bytes,
struct avx_processed_utf_bytes *previous,
__m256i *has_error) {
struct avx_processed_utf_bytes pb{};
avx_check_utf8_bytes(__m256i current_bytes,
struct avx_processed_utf_bytes *previous,
__m256i *has_error) {
struct avx_processed_utf_bytes pb {};
avx_count_nibbles(current_bytes, &pb);
avxcheckSmallerThan0xF4(current_bytes, has_error);
avx_check_smaller_than_0xF4(current_bytes, has_error);
__m256i initial_lengths = avxcontinuationLengths(pb.high_nibbles);
__m256i initial_lengths = avx_continuation_lengths(pb.high_nibbles);
pb.carried_continuations =
avxcarryContinuations(initial_lengths, previous->carried_continuations);
avx_carry_continuations(initial_lengths, previous->carried_continuations);
avxcheckContinuations(initial_lengths, pb.carried_continuations, has_error);
avx_check_continuations(initial_lengths, pb.carried_continuations, has_error);
__m256i off1_current_bytes =
push_last_byte_of_a_to_b(previous->rawbytes, pb.rawbytes);
avxcheckFirstContinuationMax(current_bytes, off1_current_bytes, has_error);
push_last_byte_of_a_to_b(previous->raw_bytes, pb.raw_bytes);
avx_check_first_continuation_max(current_bytes, off1_current_bytes,
has_error);
avxcheckOverlong(current_bytes, off1_current_bytes, pb.high_nibbles,
previous->high_nibbles, has_error);
avx_check_overlong(current_bytes, off1_current_bytes, pb.high_nibbles,
previous->high_nibbles, has_error);
return pb;
}
}// simdjson
} // namespace simdjson
UNTARGET_REGION // haswell
#endif // IS_X86_64
#endif

View File

@ -1,10 +1,10 @@
#ifndef SIMDJSON_SIMDUTF8CHECK_WESTMERE_H
#define SIMDJSON_SIMDUTF8CHECK_WESTMERE_H
#include "simdjson/portability.h"
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include "simdjson/portability.h"
#ifdef IS_X86_64
/*
@ -29,16 +29,16 @@
/********** sse code **********/
TARGET_WESTMERE
namespace simdjson{
namespace simdjson {
// all byte values must be no larger than 0xF4
static inline void checkSmallerThan0xF4(__m128i current_bytes,
__m128i *has_error) {
// all byte values must be no larger than 0xF4
// Any byte above 0xF4 leaves a non-zero remainder after the saturating
// subtraction, and that remainder is OR-ed into *has_error.
static inline void check_smaller_than_0xF4(__m128i current_bytes,
                                           __m128i *has_error) {
  const __m128i upper_limit = _mm_set1_epi8(0xF4);
  // unsigned, saturates to 0 below max
  const __m128i excess = _mm_subs_epu8(current_bytes, upper_limit);
  *has_error = _mm_or_si128(*has_error, excess);
}
static inline __m128i continuationLengths(__m128i high_nibbles) {
static inline __m128i continuation_lengths(__m128i high_nibbles) {
return _mm_shuffle_epi8(
_mm_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII)
0, 0, 0, 0, // 10xx (continuation)
@ -48,8 +48,8 @@ static inline __m128i continuationLengths(__m128i high_nibbles) {
high_nibbles);
}
static inline __m128i carryContinuations(__m128i initial_lengths,
__m128i previous_carries) {
static inline __m128i carry_continuations(__m128i initial_lengths,
__m128i previous_carries) {
__m128i right1 =
_mm_subs_epu8(_mm_alignr_epi8(initial_lengths, previous_carries, 16 - 1),
@ -61,8 +61,8 @@ static inline __m128i carryContinuations(__m128i initial_lengths,
return _mm_add_epi8(sum, right2);
}
static inline void checkContinuations(__m128i initial_lengths, __m128i carries,
__m128i *has_error) {
static inline void check_continuations(__m128i initial_lengths, __m128i carries,
__m128i *has_error) {
// overlap || underlap
// carry > length && length > 0 || !(carry > length) && !(length > 0)
@ -77,9 +77,9 @@ static inline void checkContinuations(__m128i initial_lengths, __m128i carries,
// when 0xED is found, next byte must be no larger than 0x9F
// when 0xF4 is found, next byte must be no larger than 0x8F
// next byte must be continuation, ie sign bit is set, so signed < is ok
static inline void checkFirstContinuationMax(__m128i current_bytes,
__m128i off1_current_bytes,
__m128i *has_error) {
static inline void check_first_continuation_max(__m128i current_bytes,
__m128i off1_current_bytes,
__m128i *has_error) {
__m128i maskED = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xED));
__m128i maskF4 = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xF4));
@ -97,9 +97,9 @@ static inline void checkFirstContinuationMax(__m128i current_bytes,
// E => < E1 && < A0
// F => < F1 && < 90
// else false && false
static inline void checkOverlong(__m128i current_bytes,
__m128i off1_current_bytes, __m128i hibits,
__m128i previous_hibits, __m128i *has_error) {
static inline void check_overlong(__m128i current_bytes,
__m128i off1_current_bytes, __m128i hibits,
__m128i previous_hibits, __m128i *has_error) {
__m128i off1_hibits = _mm_alignr_epi8(hibits, previous_hibits, 16 - 1);
__m128i initial_mins = _mm_shuffle_epi8(
_mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
@ -124,14 +124,14 @@ static inline void checkOverlong(__m128i current_bytes,
}
struct processed_utf_bytes {
__m128i rawbytes;
__m128i raw_bytes;
__m128i high_nibbles;
__m128i carried_continuations;
};
// Stores `bytes` and the high nibble of each of its bytes into `answer`.
// `carried_continuations` is left untouched here.
// Diff view: the two assignments to rawbytes / raw_bytes are the removed and
// added spelling of the same statement.
static inline void count_nibbles(__m128i bytes,
struct processed_utf_bytes *answer) {
answer->rawbytes = bytes;
answer->raw_bytes = bytes;
// _mm_srli_epi16 shifts 16-bit lanes, so each byte's upper half receives bits
// from its neighbour; the 0x0F mask keeps only the wanted nibble.
answer->high_nibbles =
_mm_and_si128(_mm_srli_epi16(bytes, 4), _mm_set1_epi8(0x0F));
}
@ -139,32 +139,31 @@ static inline void count_nibbles(__m128i bytes,
// check whether the current bytes are valid UTF-8
// at the end of the function, previous gets updated
// NOTE(review): in the lines visible here `previous` is only read; the new
// state is returned by value, so the update presumably happens when the caller
// assigns the result back - confirm against the call sites.
//
// current_bytes: the 16 input bytes to validate.
// previous:      state computed for the preceding 16 bytes.
// has_error:     accumulator handed to every check helper below.
// Returns the processed_utf_bytes describing current_bytes.
// Diff view: each renamed call appears twice (removed spelling, then added).
static struct processed_utf_bytes
checkUTF8Bytes(__m128i current_bytes, struct processed_utf_bytes *previous,
__m128i *has_error) {
check_utf8_bytes(__m128i current_bytes, struct processed_utf_bytes *previous,
__m128i *has_error) {
struct processed_utf_bytes pb;
count_nibbles(current_bytes, &pb);
checkSmallerThan0xF4(current_bytes, has_error);
check_smaller_than_0xF4(current_bytes, has_error);
__m128i initial_lengths = continuationLengths(pb.high_nibbles);
__m128i initial_lengths = continuation_lengths(pb.high_nibbles);
// combine this block's lengths with what the previous block carried over
pb.carried_continuations =
carryContinuations(initial_lengths, previous->carried_continuations);
carry_continuations(initial_lengths, previous->carried_continuations);
checkContinuations(initial_lengths, pb.carried_continuations, has_error);
check_continuations(initial_lengths, pb.carried_continuations, has_error);
// alignr by 15 yields the input shifted by one position: lane 0 is the last
// byte of the previous block, lane i (i > 0) is byte i-1 of this block.
__m128i off1_current_bytes =
_mm_alignr_epi8(pb.rawbytes, previous->rawbytes, 16 - 1);
checkFirstContinuationMax(current_bytes, off1_current_bytes, has_error);
_mm_alignr_epi8(pb.raw_bytes, previous->raw_bytes, 16 - 1);
check_first_continuation_max(current_bytes, off1_current_bytes, has_error);
checkOverlong(current_bytes, off1_current_bytes, pb.high_nibbles,
previous->high_nibbles, has_error);
check_overlong(current_bytes, off1_current_bytes, pb.high_nibbles,
previous->high_nibbles, has_error);
return pb;
}
}//simdjson
} // namespace simdjson
UNTARGET_REGION // westmere
#endif // IS_X86_64
#endif

View File

@ -1,67 +1,60 @@
#ifndef SIMDJSON_STAGE1_FIND_MARKS_H
#define SIMDJSON_STAGE1_FIND_MARKS_H
#include <cassert>
#include "simdjson/common_defs.h"
#include "simdjson/simdjson.h"
#include "simdjson/parsedjson.h"
#include "simdjson/portability.h"
#include "simdjson/simdjson.h"
#include <cassert>
namespace simdjson {
// Forward declarations only; each architecture header supplies the explicit
// specializations (ARM64 and HASWELL ones appear further down in this diff).
// Diff view: old `architecture` spelling first, new `Architecture` second.
// simd_input: the registers holding one block of input bytes.
template<architecture>
struct simd_input;
template <Architecture> struct simd_input;
// compute_quote_mask: maps the quote bit positions to a mask; the portable
// fallback below shows the computation as a running XOR over the bits.
template<architecture T>
uint64_t compute_quote_mask(uint64_t quote_bits);
template <Architecture T> uint64_t compute_quote_mask(uint64_t quote_bits);
// Diff view: the function below is listed twice (removed copy, then added
// copy); the two differ only in indentation, which this rendering drops.
namespace {
// for when clmul is unavailable
// Computes a prefix XOR: bit i of the result is the XOR of bits 0..i of
// quote_bits. Each step doubles the span already folded into every bit
// (1, 2, 4, ... 32), so six shift/XOR steps cover all 64 positions.
[[maybe_unused]] uint64_t portable_compute_quote_mask(uint64_t quote_bits) {
uint64_t quote_mask = quote_bits ^ (quote_bits << 1);
quote_mask = quote_mask ^ (quote_mask << 2);
quote_mask = quote_mask ^ (quote_mask << 4);
quote_mask = quote_mask ^ (quote_mask << 8);
quote_mask = quote_mask ^ (quote_mask << 16);
quote_mask = quote_mask ^ (quote_mask << 32);
return quote_mask;
}
// for when clmul is unavailable
[[maybe_unused]] uint64_t portable_compute_quote_mask(uint64_t quote_bits) {
uint64_t quote_mask = quote_bits ^ (quote_bits << 1);
quote_mask = quote_mask ^ (quote_mask << 2);
quote_mask = quote_mask ^ (quote_mask << 4);
quote_mask = quote_mask ^ (quote_mask << 8);
quote_mask = quote_mask ^ (quote_mask << 16);
quote_mask = quote_mask ^ (quote_mask << 32);
return quote_mask;
}
} // namespace
// Holds the state required to perform check_utf8().
// (Declaration only; the specializations visible in this diff hold an error
// accumulator plus the previous block's processed bytes.)
// Diff view: each declaration appears in its removed and its added spelling.
template<architecture>
struct utf8_checking_state;
template <Architecture> struct utf8_checking_state;
// Validates one simd_input worth of bytes, accumulating into `state`.
template<architecture T>
void check_utf8(simd_input<T> in, utf8_checking_state<T>& state);
template <Architecture T>
void check_utf8(simd_input<T> in, utf8_checking_state<T> &state);
// Checks if the utf8 validation has found any error.
template<architecture T>
errorValues check_utf8_errors(utf8_checking_state<T>& state);
template <Architecture T>
ErrorValues check_utf8_errors(utf8_checking_state<T> &state);
// Declarations of the per-architecture primitives; bodies live in the
// architecture headers. Diff view: removed and added spellings of every
// declaration are interleaved below, so each one shows up twice.
//
// cmp_mask_against_input / unsigned_lteq_against_input return a 64-bit mask;
// in the specializations visible in this diff that is one bit per input byte.
// a straightforward comparison of a mask against input.
template<architecture T>
// a straightforward comparison of a mask against input.
template <Architecture T>
uint64_t cmp_mask_against_input(simd_input<T> in, uint8_t m);
template<architecture T>
simd_input<T> fill_input(const uint8_t * ptr);
template <Architecture T> simd_input<T> fill_input(const uint8_t *ptr);
// find all values less than or equal than the content of maxval (using unsigned arithmetic)
template<architecture T>
// find all values less than or equal than the content of maxval (using unsigned
// arithmetic)
template <Architecture T>
uint64_t unsigned_lteq_against_input(simd_input<T> in, uint8_t m);
// The two functions below carry state across calls through their reference
// parameters (prev_iter_ends_odd_backslash, prev_iter_inside_quote).
template <Architecture T>
really_inline uint64_t find_odd_backslash_sequences(
simd_input<T> in, uint64_t &prev_iter_ends_odd_backslash);
template<architecture T> really_inline
uint64_t find_odd_backslash_sequences(simd_input<T> in, uint64_t &prev_iter_ends_odd_backslash);
template<architecture T> really_inline
uint64_t find_quote_mask_and_bits(simd_input<T> in, uint64_t odd_ends,
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits, uint64_t &error_mask);
template <Architecture T>
really_inline uint64_t find_quote_mask_and_bits(
simd_input<T> in, uint64_t odd_ends, uint64_t &prev_iter_inside_quote,
uint64_t &quote_bits, uint64_t &error_mask);
// do a 'shufti' to detect structural JSON characters
// they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c
@ -70,9 +63,8 @@ uint64_t find_quote_mask_and_bits(simd_input<T> in, uint64_t odd_ends,
// we are also interested in the four whitespace characters
// space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
// these go into the next 2 buckets of the comparison (8/16)
// Declaration only. `whitespace` and `structurals` are output parameters: the
// specializations assign a 64-bit mask to each of them.
// Diff view: removed spelling (three parameter lines) then added spelling.
template<architecture T>
void find_whitespace_and_structurals(simd_input<T> in,
uint64_t &whitespace,
template <Architecture T>
void find_whitespace_and_structurals(simd_input<T> in, uint64_t &whitespace,
uint64_t &structurals);
// return an updated structural bit vector with quoted contents cleared out and
@ -86,7 +78,7 @@ really_inline uint64_t finalize_structurals(
uint64_t quote_bits, uint64_t &prev_iter_ends_pseudo_pred) {
// mask off anything inside quotes
structurals &= ~quote_mask;
// add the real quote bits back into our bitmask as well, so we can
// add the real quote bits back into our bit_mask as well, so we can
// quickly traverse the strings we've spent all this trouble gathering
structurals |= quote_bits;
// Now, establish "pseudo-structural characters". These are non-whitespace
@ -114,12 +106,14 @@ really_inline uint64_t finalize_structurals(
return structurals;
}
// Entry point of stage 1, declared for `const uint8_t *` input and defined
// elsewhere; T selects the architecture and defaults to the native one.
// Diff view: removed and added spellings of both overloads are interleaved.
template<architecture T = architecture::native>
int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &pj);
template <Architecture T = Architecture::NATIVE>
int find_structural_bits(const uint8_t *buf, size_t len,
simdjson::ParsedJson &pj);
// Convenience overload for `const char *` buffers: reinterprets the pointer
// and forwards to the uint8_t overload.
// NOTE(review): the forwarded call names no template argument, and T cannot
// be deduced from the arguments, so it resolves with the default architecture
// rather than with this overload's T - confirm that is intended.
template<architecture T = architecture::native>
int find_structural_bits(const char *buf, size_t len, simdjson::ParsedJson &pj) {
return find_structural_bits((const uint8_t*)buf, len, pj);
template <Architecture T = Architecture::NATIVE>
int find_structural_bits(const char *buf, size_t len,
simdjson::ParsedJson &pj) {
return find_structural_bits((const uint8_t *)buf, len, pj);
}
} // namespace simdjson

View File

@ -1,23 +1,24 @@
#ifndef SIMDJSON_STAGE1_FIND_MARKS_ARM64_H
#define SIMDJSON_STAGE1_FIND_MARKS_ARM64_H
#include "simdjson/stage1_find_marks.h"
#include "simdjson/stage1_find_marks_macros.h"
#include "simdjson/stage1_find_marks_flatten.h"
#include "simdjson/simdutf8check_arm64.h"
#include "simdjson/stage1_find_marks.h"
#include "simdjson/stage1_find_marks_flatten.h"
#include "simdjson/stage1_find_marks_macros.h"
#ifdef IS_ARM64
namespace simdjson {
// ARM64 input block: four 16-byte NEON registers, i.e. 64 input bytes, in
// ascending address order (fill_input loads i0 from ptr + 0, i1 from ptr + 16,
// i2 from ptr + 32).
// Diff view: the first line is the removed header, the second the added one.
template<> struct simd_input<architecture::arm64> {
template <> struct simd_input<Architecture::ARM64> {
uint8x16_t i0;
uint8x16_t i1;
uint8x16_t i2;
uint8x16_t i3;
};
template<> really_inline
simd_input<architecture::arm64> fill_input<architecture::arm64>(const uint8_t * ptr) {
struct simd_input<architecture::arm64> in;
template <>
really_inline simd_input<Architecture::ARM64>
fill_input<Architecture::ARM64>(const uint8_t *ptr) {
struct simd_input<Architecture::ARM64> in;
in.i0 = vld1q_u8(ptr + 0);
in.i1 = vld1q_u8(ptr + 16);
in.i2 = vld1q_u8(ptr + 32);
@ -25,26 +26,24 @@ simd_input<architecture::arm64> fill_input<architecture::arm64>(const uint8_t *
return in;
}
// Packs one bit per lane of `input` into a 16-bit mask: bit i of the result is
// set iff lane i has bit (i mod 8) set. For compare results, where every lane
// is all-ones or all-zeros, that is a per-lane "movemask".
// Diff view: removed (neonmovemask / bitmask) and added (neon_movemask /
// bit_mask) spellings are interleaved.
really_inline
uint16_t neonmovemask(uint8x16_t input) {
const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
really_inline uint16_t neon_movemask(uint8x16_t input) {
const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
// keep a distinct bit per lane within each 8-lane half
uint8x16_t minput = vandq_u8(input, bitmask);
uint8x16_t minput = vandq_u8(input, bit_mask);
// three rounds of pairwise adds fold lanes 0-7 into byte 0 and lanes 8-15
// into byte 1; the addends have disjoint bits, so the sums act as ORs
uint8x16_t tmp = vpaddq_u8(minput, minput);
tmp = vpaddq_u8(tmp, tmp);
tmp = vpaddq_u8(tmp, tmp);
// bytes 0 and 1 together form the 16-bit result
return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
}
really_inline
uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16_t p3) {
const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
really_inline uint64_t neon_movemask_bulk(uint8x16_t p0, uint8x16_t p1,
uint8x16_t p2, uint8x16_t p3) {
const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
uint8x16_t t0 = vandq_u8(p0, bitmask);
uint8x16_t t1 = vandq_u8(p1, bitmask);
uint8x16_t t2 = vandq_u8(p2, bitmask);
uint8x16_t t3 = vandq_u8(p3, bitmask);
uint8x16_t t0 = vandq_u8(p0, bit_mask);
uint8x16_t t1 = vandq_u8(p1, bit_mask);
uint8x16_t t2 = vandq_u8(p2, bit_mask);
uint8x16_t t3 = vandq_u8(p3, bit_mask);
uint8x16_t sum0 = vpaddq_u8(t0, t1);
uint8x16_t sum1 = vpaddq_u8(t2, t3);
sum0 = vpaddq_u8(sum0, sum1);
@ -52,108 +51,122 @@ uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16
return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
}
// ARM64 quote mask: uses the polynomial (carry-less) multiply vmull_p64 with
// an all-ones operand when the crypto extension is present, and otherwise the
// shift/XOR fallback portable_compute_quote_mask().
// Diff view: removed and added spellings are interleaved; the two consecutive
// #endif lines are presumably the removed and added copy of one directive
// (this rendering drops the indentation that distinguished them).
template<> really_inline
uint64_t compute_quote_mask<architecture::arm64>(uint64_t quote_bits) {
template <>
really_inline uint64_t
compute_quote_mask<Architecture::ARM64>(uint64_t quote_bits) {
#ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension
return vmull_p64( -1ULL, quote_bits);
return vmull_p64(-1ULL, quote_bits);
#else
return portable_compute_quote_mask(quote_bits);
#endif
#endif
}
// ARM64 UTF-8 validation state. Both members are value-initialized by their
// `{}` initializers.
// Diff view: removed layout first, added layout second.
template<>
struct utf8_checking_state<architecture::arm64>
{
int8x16_t has_error {};
processed_utf_bytes previous {};
template <> struct utf8_checking_state<Architecture::ARM64> {
int8x16_t has_error{}; // error accumulator, OR-ed into by check_utf8
processed_utf_bytes previous{}; // state of the last 16 bytes processed
};
// Checks that all bytes are ascii
// Returns true iff none of the 64 bytes in `in` has its top bit (0x80) set.
// Diff view: `highbit` is the removed name, `high_bit` the added one.
really_inline
bool check_ascii_neon(simd_input<architecture::arm64> in) {
really_inline bool check_ascii_neon(simd_input<Architecture::ARM64> in) {
// checking if the most significant bit is always equal to 0.
uint8x16_t highbit = vdupq_n_u8(0x80);
uint8x16_t high_bit = vdupq_n_u8(0x80);
// OR the four registers together so a single test covers all of them
uint8x16_t t0 = vorrq_u8(in.i0, in.i1);
uint8x16_t t1 = vorrq_u8(in.i2, in.i3);
uint8x16_t t3 = vorrq_u8(t0, t1);
uint8x16_t t4 = vandq_u8(t3, highbit);
uint8x16_t t4 = vandq_u8(t3, high_bit);
// saturating narrow 2x64 -> 2x32 keeps "non-zero" non-zero, so the whole
// 128-bit value can be tested through one 64-bit lane
uint64x2_t v64 = vreinterpretq_u64_u8(t4);
uint32x2_t v32 = vqmovn_u64(v64);
uint64x1_t result = vreinterpret_u64_u32(v32);
return vget_lane_u64(result, 0) == 0;
}
// ARM64 UTF-8 check for one 64-byte block.
// Fast path: if the block is pure ASCII, only the continuation count carried
// over from the previous block is checked. Slow path: each 16-byte register is
// validated in order, threading state.previous from one call to the next.
// Errors are accumulated into state.has_error; nothing is returned.
// Diff view: removed and added spellings of the same code are interleaved.
template<> really_inline
void check_utf8<architecture::arm64>(simd_input<architecture::arm64> in,
utf8_checking_state<architecture::arm64>& state) {
template <>
really_inline void check_utf8<Architecture::ARM64>(
simd_input<Architecture::ARM64> in,
utf8_checking_state<Architecture::ARM64> &state) {
if (check_ascii_neon(in)) {
// All bytes are ascii. Therefore the byte that was just before must be ascii too.
// We only check the byte that was just before simd_input. Nines are arbitrary values.
const int8x16_t verror = (int8x16_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1};
// All bytes are ascii. Therefore the byte that was just before must be
// ascii too. We only check the byte that was just before simd_input. Nines
// are arbitrary values.
const int8x16_t verror =
(int8x16_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1};
// signed lane-wise "carried > verror": the last lane has threshold 1, the
// others 9; any hit is OR-ed into the error accumulator
state.has_error =
vorrq_s8(vreinterpretq_s8_u8(vcgtq_s8(state.previous.carried_continuations,
verror)),
state.has_error);
vorrq_s8(vreinterpretq_s8_u8(
vcgtq_s8(state.previous.carried_continuations, verror)),
state.has_error);
} else {
// it is not ascii so we have to do heavy work
state.previous = checkUTF8Bytes(vreinterpretq_s8_u8(in.i0), &(state.previous), &(state.has_error));
state.previous = checkUTF8Bytes(vreinterpretq_s8_u8(in.i1), &(state.previous), &(state.has_error));
state.previous = checkUTF8Bytes(vreinterpretq_s8_u8(in.i2), &(state.previous), &(state.has_error));
state.previous = checkUTF8Bytes(vreinterpretq_s8_u8(in.i3), &(state.previous), &(state.has_error));
state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i0),
&(state.previous), &(state.has_error));
state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i1),
&(state.previous), &(state.has_error));
state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i2),
&(state.previous), &(state.has_error));
state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i3),
&(state.previous), &(state.has_error));
}
}
// Reports whether any error bit has been accumulated in state.has_error:
// returns UTF8_ERROR if the register is non-zero, SUCCESS otherwise.
// Diff view: removed and added spellings are interleaved.
template<> really_inline
errorValues check_utf8_errors<architecture::arm64>(utf8_checking_state<architecture::arm64>& state) {
template <>
really_inline ErrorValues check_utf8_errors<Architecture::ARM64>(
utf8_checking_state<Architecture::ARM64> &state) {
// saturating narrow 2x64 -> 2x32 preserves "non-zero", letting one 64-bit
// lane stand in for the whole 128-bit register
uint64x2_t v64 = vreinterpretq_u64_s8(state.has_error);
uint32x2_t v32 = vqmovn_u64(v64);
uint64x1_t result = vreinterpret_u64_u32(v32);
return vget_lane_u64(result, 0) != 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
return vget_lane_u64(result, 0) != 0 ? simdjson::UTF8_ERROR
: simdjson::SUCCESS;
}
// Compares every byte of the 64-byte input for equality with `m` and packs
// the four lane-wise results into a 64-bit mask via the bulk movemask helper.
// Diff view: removed copy of the body first, added copy second.
template<> really_inline
uint64_t cmp_mask_against_input<architecture::arm64>(simd_input<architecture::arm64> in, uint8_t m) {
const uint8x16_t mask = vmovq_n_u8(m);
uint8x16_t cmp_res_0 = vceqq_u8(in.i0, mask);
uint8x16_t cmp_res_1 = vceqq_u8(in.i1, mask);
uint8x16_t cmp_res_2 = vceqq_u8(in.i2, mask);
uint8x16_t cmp_res_3 = vceqq_u8(in.i3, mask);
return neonmovemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
template <>
really_inline uint64_t cmp_mask_against_input<Architecture::ARM64>(
simd_input<Architecture::ARM64> in, uint8_t m) {
const uint8x16_t mask = vmovq_n_u8(m); // broadcast m to all 16 lanes
uint8x16_t cmp_res_0 = vceqq_u8(in.i0, mask);
uint8x16_t cmp_res_1 = vceqq_u8(in.i1, mask);
uint8x16_t cmp_res_2 = vceqq_u8(in.i2, mask);
uint8x16_t cmp_res_3 = vceqq_u8(in.i3, mask);
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
}
// Marks every input byte that is <= m under unsigned comparison (vcleq_u8)
// and packs the four lane-wise results into a 64-bit mask.
// Diff view: removed copy of the body first, added copy second.
template<> really_inline
uint64_t unsigned_lteq_against_input<architecture::arm64>(simd_input<architecture::arm64> in, uint8_t m) {
const uint8x16_t mask = vmovq_n_u8(m);
uint8x16_t cmp_res_0 = vcleq_u8(in.i0, mask);
uint8x16_t cmp_res_1 = vcleq_u8(in.i1, mask);
uint8x16_t cmp_res_2 = vcleq_u8(in.i2, mask);
uint8x16_t cmp_res_3 = vcleq_u8(in.i3, mask);
return neonmovemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
template <>
really_inline uint64_t unsigned_lteq_against_input<Architecture::ARM64>(
simd_input<Architecture::ARM64> in, uint8_t m) {
const uint8x16_t mask = vmovq_n_u8(m); // broadcast m to all 16 lanes
uint8x16_t cmp_res_0 = vcleq_u8(in.i0, mask);
uint8x16_t cmp_res_1 = vcleq_u8(in.i1, mask);
uint8x16_t cmp_res_2 = vcleq_u8(in.i2, mask);
uint8x16_t cmp_res_3 = vcleq_u8(in.i3, mask);
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
}
// ARM64 specialization; the whole body is the FIND_ODD_BACKSLASH_SEQUENCES
// macro instantiated for this architecture.
// NOTE(review): no return statement is visible here, so the macro (presumably
// from stage1_find_marks_macros.h) is expected to supply it - confirm.
// Diff view: removed spelling first, added spelling second.
template<> really_inline
uint64_t find_odd_backslash_sequences<architecture::arm64>(simd_input<architecture::arm64> in, uint64_t &prev_iter_ends_odd_backslash) {
FIND_ODD_BACKSLASH_SEQUENCES(architecture::arm64, in, prev_iter_ends_odd_backslash);
template <>
really_inline uint64_t find_odd_backslash_sequences<Architecture::ARM64>(
simd_input<Architecture::ARM64> in,
uint64_t &prev_iter_ends_odd_backslash) {
FIND_ODD_BACKSLASH_SEQUENCES(Architecture::ARM64, in,
prev_iter_ends_odd_backslash);
}
// ARM64 specialization; the whole body is the FIND_QUOTE_MASK_AND_BITS macro
// instantiated for this architecture.
// NOTE(review): no return statement is visible here, so the macro (presumably
// from stage1_find_marks_macros.h) is expected to supply it - confirm.
// Diff view: removed spelling first, added spelling second.
template<> really_inline
uint64_t find_quote_mask_and_bits<architecture::arm64>(simd_input<architecture::arm64> in, uint64_t odd_ends,
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits, uint64_t &error_mask) {
FIND_QUOTE_MASK_AND_BITS(architecture::arm64, in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask)
template <>
really_inline uint64_t find_quote_mask_and_bits<Architecture::ARM64>(
simd_input<Architecture::ARM64> in, uint64_t odd_ends,
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits,
uint64_t &error_mask) {
FIND_QUOTE_MASK_AND_BITS(Architecture::ARM64, in, odd_ends,
prev_iter_inside_quote, quote_bits, error_mask)
}
template<> really_inline
void find_whitespace_and_structurals<architecture::arm64>(
simd_input<architecture::arm64> in,
uint64_t &whitespace,
uint64_t &structurals) {
const uint8x16_t low_nibble_mask = (uint8x16_t){
16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
const uint8x16_t high_nibble_mask = (uint8x16_t){
8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0};
const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf);
template <>
really_inline void find_whitespace_and_structurals<Architecture::ARM64>(
simd_input<Architecture::ARM64> in, uint64_t &whitespace,
uint64_t &structurals) {
const uint8x16_t low_nibble_mask =
(uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
const uint8x16_t high_nibble_mask =
(uint8x16_t){8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0};
const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf);
uint8x16_t nib_0_lo = vandq_u8(in.i0, low_nib_and_mask);
uint8x16_t nib_0_hi = vshrq_n_u8(in.i0, 4);
@ -183,15 +196,15 @@ void find_whitespace_and_structurals<architecture::arm64>(
uint8x16_t tmp_1 = vtstq_u8(v_1, structural_shufti_mask);
uint8x16_t tmp_2 = vtstq_u8(v_2, structural_shufti_mask);
uint8x16_t tmp_3 = vtstq_u8(v_3, structural_shufti_mask);
structurals = neonmovemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
structurals = neon_movemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
uint8x16_t tmp_ws_0 = vtstq_u8(v_0, whitespace_shufti_mask);
uint8x16_t tmp_ws_1 = vtstq_u8(v_1, whitespace_shufti_mask);
uint8x16_t tmp_ws_2 = vtstq_u8(v_2, whitespace_shufti_mask);
uint8x16_t tmp_ws_3 = vtstq_u8(v_3, whitespace_shufti_mask);
whitespace = neonmovemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
whitespace = neon_movemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
}
}// simdjson namespace
} // namespace simdjson
#endif // IS_ARM64
#endif // SIMDJSON_STAGE1_FIND_MARKS_ARM64_H

View File

@ -10,17 +10,17 @@ namespace simdjson {
// again our optimized version.
// Straightforward variant: for every set bit of `bits`, in ascending order,
// appends (idx - 64 + bit position) to base_ptr starting at base_ptr[base],
// then sets `base` to the new element count.
// Diff view: the removed loop body is followed by the added one; they share
// the single closing brace shown.
really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base,
uint32_t idx, uint64_t bits) {
uint32_t * out_ptr = base_ptr + base;
uint32_t *out_ptr = base_ptr + base;
// presumably idx already points past the 64-bit block being flattened, hence
// the adjustment - confirm against the caller
idx -= 64;
while(bits != 0) {
out_ptr[0] = idx + trailingzeroes(bits);
bits = bits & (bits - 1);
out_ptr++;
while (bits != 0) {
out_ptr[0] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1); // clear the lowest set bit
out_ptr++;
}
base = (out_ptr - base_ptr);
}
#else
#else
// flatten out values in 'bits' assuming that they are to have values of idx
// plus their position in the bitvector, and store these indexes at
// base_ptr[base] incrementing base as we go
@ -28,65 +28,66 @@ really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base,
// needs to be large enough to handle this
// Unrolled variant: stores (idx - 64 + bit position) for the set bits of
// `bits` at base_ptr[base...] and advances `base` by the population count.
// The first eight slots are always written, even when fewer bits are set;
// slots beyond the count receive throwaway values and are not committed,
// because `base` only moves forward by cnt. The output buffer therefore needs
// spare room past the committed entries.
// Diff view: every renamed call appears twice (trailingzeroes, the removed
// spelling, then trailing_zeroes, the added one).
really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base,
uint32_t idx, uint64_t bits) {
// In some instances, the next branch is expensive because it is mispredicted.
// In some instances, the next branch is expensive because it is mispredicted.
// Unfortunately, in other cases,
// it helps tremendously.
if(bits == 0) return;
if (bits == 0)
return;
uint32_t cnt = hamming(bits);
uint32_t next_base = base + cnt;
// presumably idx already points past the 64-bit block being flattened, hence
// the adjustment - confirm against the caller
idx -= 64;
base_ptr += base;
// unconditional first batch of eight; `bits & (bits - 1)` clears the lowest
// set bit after each store
{
base_ptr[0] = idx + trailingzeroes(bits);
{
base_ptr[0] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[1] = idx + trailingzeroes(bits);
base_ptr[1] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[2] = idx + trailingzeroes(bits);
base_ptr[2] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[3] = idx + trailingzeroes(bits);
base_ptr[3] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[4] = idx + trailingzeroes(bits);
base_ptr[4] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[5] = idx + trailingzeroes(bits);
base_ptr[5] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[6] = idx + trailingzeroes(bits);
base_ptr[6] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[7] = idx + trailingzeroes(bits);
base_ptr[7] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr += 8;
}
// We hope that the next branch is easily predicted.
// second batch of eight, taken only when more than eight bits were set
if (cnt > 8) {
base_ptr[0] = idx + trailingzeroes(bits);
base_ptr[0] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[1] = idx + trailingzeroes(bits);
base_ptr[1] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[2] = idx + trailingzeroes(bits);
base_ptr[2] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[3] = idx + trailingzeroes(bits);
base_ptr[3] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[4] = idx + trailingzeroes(bits);
base_ptr[4] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[5] = idx + trailingzeroes(bits);
base_ptr[5] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[6] = idx + trailingzeroes(bits);
base_ptr[6] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr[7] = idx + trailingzeroes(bits);
base_ptr[7] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr += 8;
}
if (cnt > 16) { // unluckly: we rarely get here
// since it means having one structural or pseudo-structral element
// since it means having one structural or pseudo-structral element
// every 4 characters (possible with inputs like "","","",...).
// cnt > 16 guarantees at least one bit is still set, so do/while is safe
do {
base_ptr[0] = idx + trailingzeroes(bits);
base_ptr[0] = idx + trailing_zeroes(bits);
bits = bits & (bits - 1);
base_ptr++;
} while(bits != 0);
} while (bits != 0);
}
// commit exactly cnt entries, regardless of how many slots were written
base = next_base;
}
#endif // SIMDJSON_NAIVE_FLATTEN
}
} // namespace simdjson
#endif // SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_H

View File

@ -1,7 +1,7 @@
#ifndef SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_HASWELL_H
#define SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_HASWELL_H
// This file provides the same function as
// This file provides the same function as
// stage1_find_marks_flatten.h, but uses Intel intrinsics.
// This should provide better performance on Visual Studio
// and other compilers that do a conservative optimization.
@ -20,15 +20,16 @@ namespace haswell {
// needs to be large enough to handle this
really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base,
uint32_t idx, uint64_t bits) {
// In some instances, the next branch is expensive because it is mispredicted.
// In some instances, the next branch is expensive because it is mispredicted.
// Unfortunately, in other cases,
// it helps tremendously.
if(bits == 0) return;
if (bits == 0)
return;
uint32_t cnt = _mm_popcnt_u64(bits);
uint32_t next_base = base + cnt;
idx -= 64;
base_ptr += base;
{
{
base_ptr[0] = idx + _tzcnt_u64(bits);
bits = _blsr_u64(bits);
base_ptr[1] = idx + _tzcnt_u64(bits);
@ -68,19 +69,18 @@ really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base,
base_ptr += 8;
}
if (cnt > 16) { // unluckly: we rarely get here
// since it means having one structural or pseudo-structral element
// since it means having one structural or pseudo-structral element
// every 4 characters (possible with inputs like "","","",...).
do {
base_ptr[0] = idx + _tzcnt_u64(bits);
bits = _blsr_u64(bits);
base_ptr++;
} while(bits != 0);
} while (bits != 0);
}
base = next_base;
}
} // haswell
} // simdjson
} // namespace haswell
} // namespace simdjson
UNTARGET_REGION
#endif // SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_H

View File

@ -1,31 +1,32 @@
#ifndef SIMDJSON_STAGE1_FIND_MARKS_HASWELL_H
#define SIMDJSON_STAGE1_FIND_MARKS_HASWELL_H
#include "simdjson/stage1_find_marks.h"
#include "simdjson/stage1_find_marks_macros.h"
#include "simdjson/stage1_find_marks_flatten_haswell.h"
#include "simdjson/simdutf8check_haswell.h"
#include "simdjson/stage1_find_marks.h"
#include "simdjson/stage1_find_marks_flatten_haswell.h"
#include "simdjson/stage1_find_marks_macros.h"
#ifdef IS_X86_64
TARGET_HASWELL
namespace simdjson {
// Haswell input block: two 256-bit registers, i.e. 64 input bytes. fill_input
// loads `lo` from ptr + 0 and `hi` from ptr + 32.
// Diff view: the first two lines are the removed header, the third the added.
template<>
struct simd_input<architecture::haswell> {
template <> struct simd_input<Architecture::HASWELL> {
__m256i lo;
__m256i hi;
};
// Loads 64 bytes starting at `ptr` into a Haswell simd_input using two
// unaligned 32-byte loads, so `ptr` needs no particular alignment.
// Diff view: removed spelling of the header first, added spelling second.
template<> really_inline
simd_input<architecture::haswell> fill_input<architecture::haswell>(const uint8_t * ptr) {
struct simd_input<architecture::haswell> in;
template <>
really_inline simd_input<Architecture::HASWELL>
fill_input<Architecture::HASWELL>(const uint8_t *ptr) {
struct simd_input<Architecture::HASWELL> in;
in.lo = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0));
in.hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
return in;
}
template<> really_inline
uint64_t compute_quote_mask<architecture::haswell>(uint64_t quote_bits) {
template <>
really_inline uint64_t
compute_quote_mask<Architecture::HASWELL>(uint64_t quote_bits) {
// There should be no such thing with a processing supporting avx2
// but not clmul.
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
@ -33,45 +34,50 @@ uint64_t compute_quote_mask<architecture::haswell>(uint64_t quote_bits) {
return quote_mask;
}
// Haswell UTF-8 validation state: an error accumulator plus the processed
// bytes of the previous 32-byte register. The constructor zeroes every field.
// Diff view: removed and added spellings are interleaved (rawbytes is the
// removed member name, raw_bytes the added one).
template<>
struct utf8_checking_state<architecture::haswell> {
template <> struct utf8_checking_state<Architecture::HASWELL> {
__m256i has_error;
avx_processed_utf_bytes previous;
utf8_checking_state() {
has_error = _mm256_setzero_si256();
previous.rawbytes = _mm256_setzero_si256();
previous.raw_bytes = _mm256_setzero_si256();
previous.high_nibbles = _mm256_setzero_si256();
previous.carried_continuations =_mm256_setzero_si256();
previous.carried_continuations = _mm256_setzero_si256();
}
};
// Haswell UTF-8 check for one 64-byte block.
// Fast path: if no byte has its top bit set, only the continuation count
// carried over from the previous block is checked. Slow path: both 32-byte
// registers are validated in order, threading state.previous through.
// Errors are accumulated into state.has_error; nothing is returned.
// Diff view: removed and added spellings of the same code are interleaved.
template<> really_inline
void check_utf8<architecture::haswell>(simd_input<architecture::haswell> in,
utf8_checking_state<architecture::haswell>& state) {
__m256i highbit = _mm256_set1_epi8(0x80);
if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), highbit)) == 1) {
template <>
really_inline void check_utf8<Architecture::HASWELL>(
simd_input<Architecture::HASWELL> in,
utf8_checking_state<Architecture::HASWELL> &state) {
__m256i high_bit = _mm256_set1_epi8(0x80);
// testz returns 1 when (lo | hi) & 0x80... is all zero, i.e. pure ASCII
if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), high_bit)) == 1) {
// it is ascii, we just check continuation
// signed lane-wise "carried > threshold": the last lane has threshold 1,
// all others 9; any hit is OR-ed into the error accumulator
state.has_error = _mm256_or_si256(
_mm256_cmpgt_epi8(
state.previous.carried_continuations,
_mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)),
_mm256_cmpgt_epi8(state.previous.carried_continuations,
_mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 1)),
state.has_error);
} else {
// it is not ascii so we have to do heavy work
state.previous = avxcheckUTF8Bytes(in.lo, &(state.previous), &(state.has_error));
state.previous = avxcheckUTF8Bytes(in.hi, &(state.previous), &(state.has_error));
state.previous =
avx_check_utf8_bytes(in.lo, &(state.previous), &(state.has_error));
state.previous =
avx_check_utf8_bytes(in.hi, &(state.previous), &(state.has_error));
}
}
// Reports whether any error bit has been accumulated: testz(x, x) returns 0
// exactly when x is non-zero, which maps to UTF8_ERROR; otherwise SUCCESS.
// Diff view: removed spelling first, added spelling second.
template<> really_inline
errorValues check_utf8_errors<architecture::haswell>(utf8_checking_state<architecture::haswell>& state) {
return _mm256_testz_si256(state.has_error, state.has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
template <>
really_inline ErrorValues check_utf8_errors<Architecture::HASWELL>(
utf8_checking_state<Architecture::HASWELL> &state) {
return _mm256_testz_si256(state.has_error, state.has_error) == 0
? simdjson::UTF8_ERROR
: simdjson::SUCCESS;
}
template<> really_inline
uint64_t cmp_mask_against_input<architecture::haswell>(simd_input<architecture::haswell> in, uint8_t m) {
template <>
really_inline uint64_t cmp_mask_against_input<Architecture::HASWELL>(
simd_input<Architecture::HASWELL> in, uint8_t m) {
const __m256i mask = _mm256_set1_epi8(m);
__m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask);
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
@ -80,31 +86,38 @@ uint64_t cmp_mask_against_input<architecture::haswell>(simd_input<architecture::
return res_0 | (res_1 << 32);
}
// Marks every input byte that is <= m under unsigned comparison and returns
// a 64-bit mask: bits 0-31 come from in.lo, bits 32-63 from in.hi.
// AVX2 has no unsigned byte compare, so "x <= m" is tested as
// max_epu8(m, x) == m.
// Diff view: removed and added spellings are interleaved.
template<> really_inline
uint64_t unsigned_lteq_against_input<architecture::haswell>(simd_input<architecture::haswell> in, uint8_t m) {
template <>
really_inline uint64_t unsigned_lteq_against_input<Architecture::HASWELL>(
simd_input<Architecture::HASWELL> in, uint8_t m) {
const __m256i maxval = _mm256_set1_epi8(m);
__m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval,in.lo),maxval);
__m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.lo), maxval);
// movemask returns int; going through uint32_t prevents sign extension from
// polluting the upper half of the 64-bit value
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
__m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval,in.hi),maxval);
__m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.hi), maxval);
// no cast needed here: any sign-extended bits are shifted out by << 32
uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
return res_0 | (res_1 << 32);
}
// Haswell specialization; the whole body is the FIND_ODD_BACKSLASH_SEQUENCES
// macro instantiated for this architecture.
// NOTE(review): no return statement is visible here, so the macro (presumably
// from stage1_find_marks_macros.h) is expected to supply it - confirm.
// Diff view: removed spelling first, added spelling second.
template<> really_inline
uint64_t find_odd_backslash_sequences<architecture::haswell>(simd_input<architecture::haswell> in, uint64_t &prev_iter_ends_odd_backslash) {
FIND_ODD_BACKSLASH_SEQUENCES(architecture::haswell, in, prev_iter_ends_odd_backslash);
template <>
really_inline uint64_t find_odd_backslash_sequences<Architecture::HASWELL>(
simd_input<Architecture::HASWELL> in,
uint64_t &prev_iter_ends_odd_backslash) {
FIND_ODD_BACKSLASH_SEQUENCES(Architecture::HASWELL, in,
prev_iter_ends_odd_backslash);
}
// Haswell specialization; the whole body is the FIND_QUOTE_MASK_AND_BITS
// macro instantiated for this architecture.
// NOTE(review): no return statement is visible here, so the macro (presumably
// from stage1_find_marks_macros.h) is expected to supply it - confirm.
// Diff view: removed spelling first, added spelling second.
template<> really_inline
uint64_t find_quote_mask_and_bits<architecture::haswell>(simd_input<architecture::haswell> in, uint64_t odd_ends,
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits, uint64_t &error_mask) {
FIND_QUOTE_MASK_AND_BITS(architecture::haswell, in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask)
template <>
really_inline uint64_t find_quote_mask_and_bits<Architecture::HASWELL>(
simd_input<Architecture::HASWELL> in, uint64_t odd_ends,
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits,
uint64_t &error_mask) {
FIND_QUOTE_MASK_AND_BITS(Architecture::HASWELL, in, odd_ends,
prev_iter_inside_quote, quote_bits, error_mask)
}
template<> really_inline
void find_whitespace_and_structurals<architecture::haswell>(simd_input<architecture::haswell> in,
uint64_t &whitespace,
uint64_t &structurals) {
template <>
really_inline void find_whitespace_and_structurals<Architecture::HASWELL>(
simd_input<Architecture::HASWELL> in, uint64_t &whitespace,
uint64_t &structurals) {
#ifdef SIMDJSON_NAIVE_STRUCTURAL
// You should never need this naive approach, but it can be useful
// for research purposes
@ -112,21 +125,28 @@ void find_whitespace_and_structurals<architecture::haswell>(simd_input<architect
__m256i struct_lo = _mm256_cmpeq_epi8(in.lo, mask_open_brace);
__m256i struct_hi = _mm256_cmpeq_epi8(in.hi, mask_open_brace);
const __m256i mask_close_brace = _mm256_set1_epi8(0x7d);
struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_close_brace));
struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_close_brace));
struct_lo =
_mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_close_brace));
struct_hi =
_mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_close_brace));
const __m256i mask_open_bracket = _mm256_set1_epi8(0x5b);
struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_open_bracket));
struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_open_bracket));
struct_lo =
_mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_open_bracket));
struct_hi =
_mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_open_bracket));
const __m256i mask_close_bracket = _mm256_set1_epi8(0x5d);
struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_close_bracket));
struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_close_bracket));
struct_lo =
_mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_close_bracket));
struct_hi =
_mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_close_bracket));
const __m256i mask_column = _mm256_set1_epi8(0x3a);
struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_column));
struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_column));
struct_lo = _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_column));
struct_hi = _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_column));
const __m256i mask_comma = _mm256_set1_epi8(0x2c);
struct_lo = _mm256_or_si256(struct_lo,_mm256_cmpeq_epi8(in.lo, mask_comma));
struct_hi = _mm256_or_si256(struct_hi,_mm256_cmpeq_epi8(in.hi, mask_comma));
uint64_t structural_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(struct_lo));
struct_lo = _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_comma));
struct_hi = _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_comma));
uint64_t structural_res_0 =
static_cast<uint32_t>(_mm256_movemask_epi8(struct_lo));
uint64_t structural_res_1 = _mm256_movemask_epi8(struct_hi);
structurals = (structural_res_0 | (structural_res_1 << 32));
@ -134,34 +154,34 @@ void find_whitespace_and_structurals<architecture::haswell>(simd_input<architect
__m256i space_lo = _mm256_cmpeq_epi8(in.lo, mask_space);
__m256i space_hi = _mm256_cmpeq_epi8(in.hi, mask_space);
const __m256i mask_linefeed = _mm256_set1_epi8(0x0a);
space_lo = _mm256_or_si256(space_lo,_mm256_cmpeq_epi8(in.lo, mask_linefeed));
space_hi = _mm256_or_si256(space_hi,_mm256_cmpeq_epi8(in.hi, mask_linefeed));
space_lo = _mm256_or_si256(space_lo, _mm256_cmpeq_epi8(in.lo, mask_linefeed));
space_hi = _mm256_or_si256(space_hi, _mm256_cmpeq_epi8(in.hi, mask_linefeed));
const __m256i mask_tab = _mm256_set1_epi8(0x09);
space_lo = _mm256_or_si256(space_lo,_mm256_cmpeq_epi8(in.lo, mask_tab));
space_hi = _mm256_or_si256(space_hi,_mm256_cmpeq_epi8(in.hi, mask_tab));
space_lo = _mm256_or_si256(space_lo, _mm256_cmpeq_epi8(in.lo, mask_tab));
space_hi = _mm256_or_si256(space_hi, _mm256_cmpeq_epi8(in.hi, mask_tab));
const __m256i mask_carriage = _mm256_set1_epi8(0x0d);
space_lo = _mm256_or_si256(space_lo,_mm256_cmpeq_epi8(in.lo, mask_carriage));
space_hi = _mm256_or_si256(space_hi,_mm256_cmpeq_epi8(in.hi, mask_carriage));
space_lo = _mm256_or_si256(space_lo, _mm256_cmpeq_epi8(in.lo, mask_carriage));
space_hi = _mm256_or_si256(space_hi, _mm256_cmpeq_epi8(in.hi, mask_carriage));
uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(space_lo));
uint64_t ws_res_1 = _mm256_movemask_epi8(space_hi);
whitespace = (ws_res_0 | (ws_res_1 << 32));
// end of naive approach
#else // SIMDJSON_NAIVE_STRUCTURAL
const __m256i structural_table = _mm256_setr_epi8(
44, 125, 0, 0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123,
44, 125, 0, 0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
#else // SIMDJSON_NAIVE_STRUCTURAL
const __m256i structural_table =
_mm256_setr_epi8(44, 125, 0, 0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123,
44, 125, 0, 0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
const __m256i white_table = _mm256_setr_epi8(
32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100,
32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100);
32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100,
32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100);
const __m256i struct_offset = _mm256_set1_epi8(0xd4);
const __m256i struct_mask = _mm256_set1_epi8(32);
__m256i lo_white = _mm256_cmpeq_epi8(in.lo,
_mm256_shuffle_epi8(white_table, in.lo));
__m256i hi_white = _mm256_cmpeq_epi8(in.hi,
_mm256_shuffle_epi8(white_table, in.hi));
__m256i lo_white =
_mm256_cmpeq_epi8(in.lo, _mm256_shuffle_epi8(white_table, in.lo));
__m256i hi_white =
_mm256_cmpeq_epi8(in.hi, _mm256_shuffle_epi8(white_table, in.hi));
uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(lo_white));
uint64_t ws_res_1 = _mm256_movemask_epi8(hi_white);
whitespace = (ws_res_0 | (ws_res_1 << 32));
@ -173,7 +193,7 @@ void find_whitespace_and_structurals<architecture::haswell>(simd_input<architect
__m256i hi_struct_r3 = _mm256_shuffle_epi8(structural_table, hi_struct_r1);
__m256i lo_struct = _mm256_cmpeq_epi8(lo_struct_r2, lo_struct_r3);
__m256i hi_struct = _mm256_cmpeq_epi8(hi_struct_r2, hi_struct_r3);
uint64_t structural_res_0 =
static_cast<uint32_t>(_mm256_movemask_epi8(lo_struct));
uint64_t structural_res_1 = _mm256_movemask_epi8(hi_struct);
@ -184,6 +204,5 @@ void find_whitespace_and_structurals<architecture::haswell>(simd_input<architect
} // namespace simdjson
UNTARGET_REGION
#endif // IS_X86_64
#endif // SIMDJSON_STAGE1_FIND_MARKS_HASWELL_H

View File

@ -10,41 +10,44 @@
// indicate whether we end an iteration on an odd-length sequence of
// backslashes, which modifies our subsequent search for odd-length
// sequences of backslashes in an obvious way.
// We need to compile that code for multiple architectures. However, target attributes can be used
// only once by function definition. Huge macro seemed better than huge code duplication.
// uint64_t FIND_ODD_BACKSLASH_SEQUENCES(architecture T, simd_input<T> in, uint64_t &prev_iter_ends_odd_backslash)
#define FIND_ODD_BACKSLASH_SEQUENCES(T, in, prev_iter_ends_odd_backslash) { \
const uint64_t even_bits = 0x5555555555555555ULL; \
const uint64_t odd_bits = ~even_bits; \
uint64_t bs_bits = cmp_mask_against_input<T>(in, '\\'); \
uint64_t start_edges = bs_bits & ~(bs_bits << 1); \
/* flip lowest if we have an odd-length run at the end of the prior */ \
/* iteration */ \
uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash; \
uint64_t even_starts = start_edges & even_start_mask; \
uint64_t odd_starts = start_edges & ~even_start_mask; \
uint64_t even_carries = bs_bits + even_starts; \
\
uint64_t odd_carries; \
/* must record the carry-out of our odd-carries out of bit 63; this */ \
/* indicates whether the sense of any edge going to the next iteration */ \
/* should be flipped */ \
bool iter_ends_odd_backslash = \
add_overflow(bs_bits, odd_starts, &odd_carries); \
\
odd_carries |= \
prev_iter_ends_odd_backslash; /* push in bit zero as a potential end */ \
/* if we had an odd-numbered run at the */ \
/* end of the previous iteration */ \
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; \
uint64_t even_carry_ends = even_carries & ~bs_bits; \
uint64_t odd_carry_ends = odd_carries & ~bs_bits; \
uint64_t even_start_odd_end = even_carry_ends & odd_bits; \
uint64_t odd_start_even_end = odd_carry_ends & even_bits; \
uint64_t odd_ends = even_start_odd_end | odd_start_even_end; \
return odd_ends; \
}
// We need to compile that code for multiple architectures. However, target
// attributes can be used only once by function definition. Huge macro seemed
// better than huge code duplication. uint64_t
// FIND_ODD_BACKSLASH_SEQUENCES(Architecture T, simd_input<T> in, uint64_t
// &prev_iter_ends_odd_backslash)
#define FIND_ODD_BACKSLASH_SEQUENCES(T, in, prev_iter_ends_odd_backslash) \
{ \
const uint64_t even_bits = 0x5555555555555555ULL; \
const uint64_t odd_bits = ~even_bits; \
uint64_t bs_bits = cmp_mask_against_input<T>(in, '\\'); \
uint64_t start_edges = bs_bits & ~(bs_bits << 1); \
/* flip lowest if we have an odd-length run at the end of the prior \
* iteration */ \
uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash; \
uint64_t even_starts = start_edges & even_start_mask; \
uint64_t odd_starts = start_edges & ~even_start_mask; \
uint64_t even_carries = bs_bits + even_starts; \
\
uint64_t odd_carries; \
/* must record the carry-out of our odd-carries out of bit 63; this \
* indicates whether the sense of any edge going to the next iteration \
* should be flipped */ \
bool iter_ends_odd_backslash = \
add_overflow(bs_bits, odd_starts, &odd_carries); \
\
odd_carries |= prev_iter_ends_odd_backslash; /* push in bit zero as a \
* potential end if we had an \
* odd-numbered run at the \
* end of the previous \
* iteration */ \
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; \
uint64_t even_carry_ends = even_carries & ~bs_bits; \
uint64_t odd_carry_ends = odd_carries & ~bs_bits; \
uint64_t even_start_odd_end = even_carry_ends & odd_bits; \
uint64_t odd_start_even_end = odd_carry_ends & even_bits; \
uint64_t odd_ends = even_start_odd_end | odd_start_even_end; \
return odd_ends; \
}
// return both the quote mask (which is a half-open mask that covers the first
// quote
@ -58,34 +61,39 @@
// Note that we don't do any error checking to see if we have backslash
// sequences outside quotes; these
// backslash sequences (of any length) will be detected elsewhere.
// We need to compile that code for multiple architectures. However, target attributes can be used
// only once by function definition. Huge macro seemed better than huge code duplication.
// uint64_t FIND_QUOTE_MASK_AND_BITS(architecture T, simd_input<T> in, uint64_t odd_ends,
// uint64_t &prev_iter_inside_quote, uint64_t &quote_bits, uint64_t &error_mask)
#define FIND_QUOTE_MASK_AND_BITS(T, in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask) { \
quote_bits = cmp_mask_against_input<T>(in, '"'); \
quote_bits = quote_bits & ~odd_ends; \
uint64_t quote_mask = compute_quote_mask<T>(quote_bits); \
quote_mask ^= prev_iter_inside_quote; \
/* All Unicode characters may be placed within the */ \
/* quotation marks, except for the characters that MUST be escaped: */ \
/* quotation mark, reverse solidus, and the control characters (U+0000 */ \
/*through U+001F). */ \
/* https://tools.ietf.org/html/rfc8259 */ \
uint64_t unescaped = unsigned_lteq_against_input<T>(in, 0x1F); \
error_mask |= quote_mask & unescaped; \
/* right shift of a signed value expected to be well-defined and standard */ \
/* compliant as of C++20, */ \
/* John Regher from Utah U. says this is fine code */ \
prev_iter_inside_quote = \
static_cast<uint64_t>(static_cast<int64_t>(quote_mask) >> 63); \
return quote_mask; \
} \
// We need to compile that code for multiple architectures. However, target
// attributes can be used only once by function definition. Huge macro seemed
// better than huge code duplication. uint64_t
// FIND_QUOTE_MASK_AND_BITS(Architecture T, simd_input<T> in, uint64_t odd_ends,
// uint64_t &prev_iter_inside_quote, uint64_t &quote_bits, uint64_t
// &error_mask)
#define FIND_QUOTE_MASK_AND_BITS(T, in, odd_ends, prev_iter_inside_quote, \
quote_bits, error_mask) \
{ \
quote_bits = cmp_mask_against_input<T>(in, '"'); \
quote_bits = quote_bits & ~odd_ends; \
uint64_t quote_mask = compute_quote_mask<T>(quote_bits); \
quote_mask ^= prev_iter_inside_quote; \
/* All Unicode characters may be placed within the \
* quotation marks, except for the characters that MUST be escaped: \
* quotation mark, reverse solidus, and the control characters (U+0000 \
* through U+001F). \
* https://tools.ietf.org/html/rfc8259 */ \
uint64_t unescaped = unsigned_lteq_against_input<T>(in, 0x1F); \
error_mask |= quote_mask & unescaped; \
/* right shift of a signed value expected to be well-defined and standard \
* compliant as of C++20, \
* John Regher from Utah U. says this is fine code */ \
prev_iter_inside_quote = \
static_cast<uint64_t>(static_cast<int64_t>(quote_mask) >> 63); \
return quote_mask; \
}
// Find structural bits in a 64-byte chunk.
// We need to compile that code for multiple architectures. However, target attributes can be used
// only once by function definition. Huge macro seemed better than huge code duplication.
// void FIND_STRUCTURAL_BITS_64(architecture T,
// We need to compile this code for multiple architectures. However, a target
// attribute can be applied only once per function definition, so a huge macro
// seemed better than huge code duplication.
// void FIND_STRUCTURAL_BITS_64(
// Architecture T,
// const uint8_t *buf,
// size_t idx,
// uint32_t *base_ptr,
@ -95,131 +103,137 @@
// uint64_t &prev_iter_ends_pseudo_pred,
// uint64_t &structurals,
// uint64_t &error_mask,
// utf8_checking_state<T> &utf8_state, flatten function)
#define FIND_STRUCTURAL_BITS_64(T, \
buf, \
idx, \
base_ptr, \
base, \
prev_iter_ends_odd_backslash, \
prev_iter_inside_quote, \
prev_iter_ends_pseudo_pred, \
structurals, \
error_mask, \
utf8_state, \
flat \
) { \
simd_input<T> in = fill_input<T>(buf); \
check_utf8<T>(in, utf8_state); \
/* detect odd sequences of backslashes */ \
uint64_t odd_ends = find_odd_backslash_sequences<T>(in, prev_iter_ends_odd_backslash); \
\
/* detect insides of quote pairs ("quote_mask") and also our quote_bits */ \
/* themselves */ \
uint64_t quote_bits; \
uint64_t quote_mask = find_quote_mask_and_bits<T>( \
in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask); \
\
/* take the previous iterations structural bits, not our current iteration, */ \
/* and flatten */ \
flat(base_ptr, base, idx, structurals); \
\
uint64_t whitespace; \
find_whitespace_and_structurals<T>(in, whitespace, structurals); \
\
/* fixup structurals to reflect quotes and add pseudo-structural characters */ \
structurals = finalize_structurals(structurals, whitespace, quote_mask, \
quote_bits, prev_iter_ends_pseudo_pred); \
} \
// We need to compile that code for multiple architectures. However, target attributes can be used
// only once by function definition. Huge macro seemed better than huge code duplication.
// errorValues FIND_STRUCTURAL_BITS(architecture T, const uint8_t *buf, size_t len, ParsedJson &pj, flatten functio )
#define FIND_STRUCTURAL_BITS(T, buf, len, pj, flat) { \
if (len > pj.bytecapacity) { \
std::cerr << "Your ParsedJson object only supports documents up to " \
<< pj.bytecapacity << " bytes but you are trying to process " << len \
<< " bytes" << std::endl; \
return simdjson::CAPACITY; \
} \
uint32_t *base_ptr = pj.structural_indexes; \
uint32_t base = 0; \
utf8_checking_state<T> utf8_state; \
\
/* we have padded the input out to 64 byte multiple with the remainder being */ \
/* zeros */ \
\
/* persistent state across loop */ \
/* does the last iteration end with an odd-length sequence of backslashes? */ \
/* either 0 or 1, but a 64-bit value */ \
uint64_t prev_iter_ends_odd_backslash = 0ULL; \
/* does the previous iteration end inside a double-quote pair? */ \
uint64_t prev_iter_inside_quote = 0ULL; /* either all zeros or all ones */ \
/* does the previous iteration end on something that is a predecessor of a */ \
/* pseudo-structural character - i.e. whitespace or a structural character */ \
/* effectively the very first char is considered to follow "whitespace" for */ \
/* the */ \
/* purposes of pseudo-structural character detection so we initialize to 1 */ \
uint64_t prev_iter_ends_pseudo_pred = 1ULL; \
\
/* structurals are persistent state across loop as we flatten them on the */ \
/* subsequent iteration into our array pointed to be base_ptr. */ \
/* This is harmless on the first iteration as structurals==0 */ \
/* and is done for performance reasons; we can hide some of the latency of the */ \
/* expensive carryless multiply in the previous step with this work */ \
uint64_t structurals = 0; \
\
size_t lenminus64 = len < 64 ? 0 : len - 64; \
size_t idx = 0; \
uint64_t error_mask = 0; /* for unescaped characters within strings (ASCII code points < 0x20) */ \
\
for (; idx < lenminus64; idx += 64) { \
FIND_STRUCTURAL_BITS_64(T, &buf[idx], idx, base_ptr, base, prev_iter_ends_odd_backslash, \
prev_iter_inside_quote, prev_iter_ends_pseudo_pred, structurals, \
error_mask, utf8_state, flat); \
} \
/* If we have a final chunk of less than 64 bytes, pad it to 64 with spaces */ \
/* before processing it (otherwise, we risk invalidating the UTF-8 checks). */ \
if (idx < len) { \
uint8_t tmpbuf[64]; \
memset(tmpbuf, 0x20, 64); \
memcpy(tmpbuf, buf + idx, len - idx); \
FIND_STRUCTURAL_BITS_64(T, &tmpbuf[0], idx, base_ptr, base, prev_iter_ends_odd_backslash, \
prev_iter_inside_quote, prev_iter_ends_pseudo_pred, structurals, \
error_mask, utf8_state, flat); \
idx += 64; \
} \
\
/* is last string quote closed? */ \
if (prev_iter_inside_quote) { \
return simdjson::UNCLOSED_STRING; \
} \
\
/* finally, flatten out the remaining structurals from the last iteration */ \
flat(base_ptr, base, idx, structurals); \
\
pj.n_structural_indexes = base; \
/* a valid JSON file cannot have zero structural indexes - we should have */ \
/* found something */ \
if (pj.n_structural_indexes == 0u) { \
return simdjson::EMPTY; \
} \
if (base_ptr[pj.n_structural_indexes - 1] > len) { \
return simdjson::UNEXPECTED_ERROR; \
} \
if (len != base_ptr[pj.n_structural_indexes - 1]) { \
/* the string might not be NULL terminated, but we add a virtual NULL ending */ \
/* character. */ \
base_ptr[pj.n_structural_indexes++] = len; \
} \
/* make it safe to dereference one beyond this array */ \
base_ptr[pj.n_structural_indexes] = 0; \
if (error_mask) { \
return simdjson::UNESCAPED_CHARS; \
} \
return check_utf8_errors<T>(utf8_state); \
}
// utf8_checking_state<T> &utf8_state,
// flatten function)
//
// Processes one 64-byte block read from buf: runs the UTF-8 check, computes
// the quote mask, flattens the structurals left over from the PREVIOUS block
// into base_ptr, and then replaces structurals with this block's bits.
// Expands to a braced statement block (no return value); every argument other
// than T, buf, idx and flat is expected to name a variable in the enclosing
// scope that persists across blocks.
#define FIND_STRUCTURAL_BITS_64( \
T, buf, idx, base_ptr, base, prev_iter_ends_odd_backslash, \
prev_iter_inside_quote, prev_iter_ends_pseudo_pred, structurals, \
error_mask, utf8_state, flat) \
{ \
simd_input<T> in = fill_input<T>(buf); \
check_utf8<T>(in, utf8_state); \
/* detect odd sequences of backslashes */ \
uint64_t odd_ends = \
find_odd_backslash_sequences<T>(in, prev_iter_ends_odd_backslash); \
\
/* detect insides of quote pairs ("quote_mask") and also our quote_bits \
 * themselves */ \
uint64_t quote_bits; \
uint64_t quote_mask = find_quote_mask_and_bits<T>( \
in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask); \
\
/* take the previous iteration's structural bits, not our current \
 * iteration's, and flatten them; this must happen before structurals is \
 * overwritten below */ \
flat(base_ptr, base, idx, structurals); \
\
uint64_t whitespace; \
find_whitespace_and_structurals<T>(in, whitespace, structurals); \
\
/* fixup structurals to reflect quotes and add pseudo-structural \
 * characters */ \
structurals = \
finalize_structurals(structurals, whitespace, quote_mask, quote_bits, \
prev_iter_ends_pseudo_pred); \
}
// We need to compile that code for multiple architectures. However, target
// attributes can be used only once by function definition. Huge macro seemed
// better than huge code duplication. ErrorValues
// FIND_STRUCTURAL_BITS(Architecture T, const uint8_t *buf, size_t len,
// ParsedJson &pj, flatten function)
#define FIND_STRUCTURAL_BITS(T, buf, len, pj, flat) \
{ \
if (len > pj.byte_capacity) { \
std::cerr << "Your ParsedJson object only supports documents up to " \
<< pj.byte_capacity << " bytes but you are trying to process " \
<< len << " bytes" << std::endl; \
return simdjson::CAPACITY; \
} \
uint32_t *base_ptr = pj.structural_indexes; \
uint32_t base = 0; \
utf8_checking_state<T> utf8_state; \
\
/* we have padded the input out to 64 byte multiple with the remainder \
* being zeros persistent state across loop does the last iteration end \
* with an odd-length sequence of backslashes? */ \
\
/* either 0 or 1, but a 64-bit value */ \
uint64_t prev_iter_ends_odd_backslash = 0ULL; \
/* does the previous iteration end inside a double-quote pair? */ \
uint64_t prev_iter_inside_quote = \
0ULL; /* either all zeros or all ones \
* does the previous iteration end on something that is a \
* predecessor of a pseudo-structural character - i.e. \
* whitespace or a structural character effectively the very \
* first char is considered to follow "whitespace" for the \
* purposes of pseudo-structural character detection so we \
* initialize to 1 */ \
uint64_t prev_iter_ends_pseudo_pred = 1ULL; \
\
/* structurals are persistent state across loop as we flatten them on the \
* subsequent iteration into our array pointed to be base_ptr. \
* This is harmless on the first iteration as structurals==0 \
* and is done for performance reasons; we can hide some of the latency of \
* the \
* expensive carryless multiply in the previous step with this work */ \
uint64_t structurals = 0; \
\
size_t lenminus64 = len < 64 ? 0 : len - 64; \
size_t idx = 0; \
uint64_t error_mask = 0; /* for unescaped characters within strings (ASCII \
code points < 0x20) */ \
\
for (; idx < lenminus64; idx += 64) { \
FIND_STRUCTURAL_BITS_64( \
T, &buf[idx], idx, base_ptr, base, prev_iter_ends_odd_backslash, \
prev_iter_inside_quote, prev_iter_ends_pseudo_pred, structurals, \
error_mask, utf8_state, flat); \
} \
/* If we have a final chunk of less than 64 bytes, pad it to 64 with \
* spaces before processing it (otherwise, we risk invalidating the UTF-8 \
* checks). */ \
if (idx < len) { \
uint8_t tmp_buf[64]; \
memset(tmp_buf, 0x20, 64); \
memcpy(tmp_buf, buf + idx, len - idx); \
FIND_STRUCTURAL_BITS_64( \
T, &tmp_buf[0], idx, base_ptr, base, prev_iter_ends_odd_backslash, \
prev_iter_inside_quote, prev_iter_ends_pseudo_pred, structurals, \
error_mask, utf8_state, flat); \
idx += 64; \
} \
\
/* is last string quote closed? */ \
if (prev_iter_inside_quote) { \
return simdjson::UNCLOSED_STRING; \
} \
\
/* finally, flatten out the remaining structurals from the last iteration \
*/ \
flat(base_ptr, base, idx, structurals); \
\
pj.n_structural_indexes = base; \
/* a valid JSON file cannot have zero structural indexes - we should have \
* found something */ \
if (pj.n_structural_indexes == 0u) { \
return simdjson::EMPTY; \
} \
if (base_ptr[pj.n_structural_indexes - 1] > len) { \
return simdjson::UNEXPECTED_ERROR; \
} \
if (len != base_ptr[pj.n_structural_indexes - 1]) { \
/* the string might not be NULL terminated, but we add a virtual NULL \
* ending \
* character. */ \
base_ptr[pj.n_structural_indexes++] = len; \
} \
/* make it safe to dereference one beyond this array */ \
base_ptr[pj.n_structural_indexes] = 0; \
if (error_mask) { \
return simdjson::UNESCAPED_CHARS; \
} \
return check_utf8_errors<T>(utf8_state); \
}
#endif // SIMDJSON_STAGE1_FIND_MARKS_MACROS_H

View File

@ -1,26 +1,26 @@
#ifndef SIMDJSON_STAGE1_FIND_MARKS_WESTMERE_H
#define SIMDJSON_STAGE1_FIND_MARKS_WESTMERE_H
#include "simdjson/stage1_find_marks.h"
#include "simdjson/stage1_find_marks_macros.h"
#include "simdjson/stage1_find_marks_flatten.h"
#include "simdjson/simdutf8check_westmere.h"
#include "simdjson/stage1_find_marks.h"
#include "simdjson/stage1_find_marks_flatten.h"
#include "simdjson/stage1_find_marks_macros.h"
#ifdef IS_X86_64
TARGET_WESTMERE
namespace simdjson {
template<>
struct simd_input<architecture::westmere> {
// Westmere input block, held as four 128-bit registers v0..v3.
// fill_input<Architecture::WESTMERE> loads v0, v1 and v2 from ptr + 0,
// ptr + 16 and ptr + 32; v3 presumably comes from ptr + 48 (confirm in
// fill_input).
template <> struct simd_input<Architecture::WESTMERE> {
__m128i v0;
__m128i v1;
__m128i v2;
__m128i v3;
};
template<> really_inline
simd_input<architecture::westmere> fill_input<architecture::westmere>(const uint8_t * ptr) {
struct simd_input<architecture::westmere> in;
template <>
really_inline simd_input<Architecture::WESTMERE>
fill_input<Architecture::WESTMERE>(const uint8_t *ptr) {
struct simd_input<Architecture::WESTMERE> in;
in.v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 0));
in.v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16));
in.v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 32));
@ -28,61 +28,69 @@ simd_input<architecture::westmere> fill_input<architecture::westmere>(const uint
return in;
}
template<> really_inline
uint64_t compute_quote_mask<architecture::westmere>(uint64_t quote_bits) {
// Westmere specialization: carry-less multiplication of quote_bits by an
// all-ones operand; the low 64 bits of the product are returned.
template <>
really_inline uint64_t
compute_quote_mask<Architecture::WESTMERE>(uint64_t quote_bits) {
  // quote_bits goes in the low 64-bit lane, the high lane is zero
  const __m128i quotes = _mm_set_epi64x(0ULL, quote_bits);
  const __m128i all_ones = _mm_set1_epi8(0xFF);
  // selector 0 multiplies the low lanes of both operands
  const __m128i product = _mm_clmulepi64_si128(quotes, all_ones, 0);
  return _mm_cvtsi128_si64(product);
}
template<>
struct utf8_checking_state<architecture::westmere>
{
template <> struct utf8_checking_state<Architecture::WESTMERE> {
__m128i has_error = _mm_setzero_si128();
processed_utf_bytes previous {
_mm_setzero_si128(), // rawbytes
_mm_setzero_si128(), // high_nibbles
_mm_setzero_si128() // carried_continuations
processed_utf_bytes previous{
_mm_setzero_si128(), // raw_bytes
_mm_setzero_si128(), // high_nibbles
_mm_setzero_si128() // carried_continuations
};
};
template<> really_inline
void check_utf8<architecture::westmere>(simd_input<architecture::westmere> in,
utf8_checking_state<architecture::westmere>& state) {
__m128i highbit = _mm_set1_epi8(0x80);
if ((_mm_testz_si128(_mm_or_si128(in.v0, in.v1), highbit)) == 1) {
template <>
really_inline void check_utf8<Architecture::WESTMERE>(
simd_input<Architecture::WESTMERE> in,
utf8_checking_state<Architecture::WESTMERE> &state) {
__m128i high_bit = _mm_set1_epi8(0x80);
if ((_mm_testz_si128(_mm_or_si128(in.v0, in.v1), high_bit)) == 1) {
// it is ascii, we just check continuation
state.has_error = _mm_or_si128(
_mm_cmpgt_epi8(
state.previous.carried_continuations,
_mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)),
state.has_error);
state.has_error =
_mm_or_si128(_mm_cmpgt_epi8(state.previous.carried_continuations,
_mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 1)),
state.has_error);
} else {
// it is not ascii so we have to do heavy work
state.previous = checkUTF8Bytes(in.v0, &(state.previous), &(state.has_error));
state.previous = checkUTF8Bytes(in.v1, &(state.previous), &(state.has_error));
state.previous =
check_utf8_bytes(in.v0, &(state.previous), &(state.has_error));
state.previous =
check_utf8_bytes(in.v1, &(state.previous), &(state.has_error));
}
if ((_mm_testz_si128(_mm_or_si128(in.v2, in.v3), highbit)) == 1) {
if ((_mm_testz_si128(_mm_or_si128(in.v2, in.v3), high_bit)) == 1) {
// it is ascii, we just check continuation
state.has_error = _mm_or_si128(
_mm_cmpgt_epi8(
state.previous.carried_continuations,
_mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)),
state.has_error);
state.has_error =
_mm_or_si128(_mm_cmpgt_epi8(state.previous.carried_continuations,
_mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 1)),
state.has_error);
} else {
// it is not ascii so we have to do heavy work
state.previous = checkUTF8Bytes(in.v2, &(state.previous), &(state.has_error));
state.previous = checkUTF8Bytes(in.v3, &(state.previous), &(state.has_error));
state.previous =
check_utf8_bytes(in.v2, &(state.previous), &(state.has_error));
state.previous =
check_utf8_bytes(in.v3, &(state.previous), &(state.has_error));
}
}
template<> really_inline
errorValues check_utf8_errors<architecture::westmere>(utf8_checking_state<architecture::westmere>& state) {
return _mm_testz_si128(state.has_error, state.has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
// Westmere specialization: reports UTF8_ERROR when any bit of the accumulated
// error register is set, SUCCESS otherwise.
template <>
really_inline ErrorValues check_utf8_errors<Architecture::WESTMERE>(
    utf8_checking_state<Architecture::WESTMERE> &state) {
  // _mm_testz_si128(a, a) yields 1 exactly when a is all zeros
  const int error_free = _mm_testz_si128(state.has_error, state.has_error);
  if (error_free != 0) {
    return simdjson::SUCCESS;
  }
  return simdjson::UTF8_ERROR;
}
template<> really_inline
uint64_t cmp_mask_against_input<architecture::westmere>(simd_input<architecture::westmere> in, uint8_t m) {
template <>
really_inline uint64_t cmp_mask_against_input<Architecture::WESTMERE>(
simd_input<Architecture::WESTMERE> in, uint8_t m) {
const __m128i mask = _mm_set1_epi8(m);
__m128i cmp_res_0 = _mm_cmpeq_epi8(in.v0, mask);
uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
@ -95,54 +103,60 @@ uint64_t cmp_mask_against_input<architecture::westmere>(simd_input<architecture:
return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
}
template<> really_inline
uint64_t unsigned_lteq_against_input<architecture::westmere>(simd_input<architecture::westmere> in, uint8_t m) {
template <>
really_inline uint64_t unsigned_lteq_against_input<Architecture::WESTMERE>(
simd_input<Architecture::WESTMERE> in, uint8_t m) {
const __m128i maxval = _mm_set1_epi8(m);
__m128i cmp_res_0 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v0),maxval);
__m128i cmp_res_0 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v0), maxval);
uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
__m128i cmp_res_1 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v1),maxval);
__m128i cmp_res_1 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v1), maxval);
uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
__m128i cmp_res_2 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v2),maxval);
__m128i cmp_res_2 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v2), maxval);
uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
__m128i cmp_res_3 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v3),maxval);
__m128i cmp_res_3 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v3), maxval);
uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
}
template<> really_inline
uint64_t find_odd_backslash_sequences<architecture::westmere>(simd_input<architecture::westmere> in, uint64_t &prev_iter_ends_odd_backslash) {
FIND_ODD_BACKSLASH_SEQUENCES(architecture::westmere, in, prev_iter_ends_odd_backslash);
// Westmere specialization of find_odd_backslash_sequences.
// The whole body comes from the FIND_ODD_BACKSLASH_SEQUENCES macro, which
// expands to a braced block ending in `return odd_ends;`, so there is no
// explicit return here. prev_iter_ends_odd_backslash is both read and
// overwritten by the macro body.
template <>
really_inline uint64_t find_odd_backslash_sequences<Architecture::WESTMERE>(
simd_input<Architecture::WESTMERE> in,
uint64_t &prev_iter_ends_odd_backslash) {
FIND_ODD_BACKSLASH_SEQUENCES(Architecture::WESTMERE, in,
prev_iter_ends_odd_backslash);
}
template<> really_inline
uint64_t find_quote_mask_and_bits<architecture::westmere>(simd_input<architecture::westmere> in, uint64_t odd_ends,
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits, uint64_t &error_mask) {
FIND_QUOTE_MASK_AND_BITS(architecture::westmere, in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask)
// Westmere specialization of find_quote_mask_and_bits.
// The whole body comes from the FIND_QUOTE_MASK_AND_BITS macro, which expands
// to a braced block ending in `return quote_mask;` -- that is why the macro
// invocation carries no trailing semicolon and there is no explicit return.
// quote_bits and prev_iter_inside_quote are overwritten; error_mask is OR-ed
// into (see the macro definition).
template <>
really_inline uint64_t find_quote_mask_and_bits<Architecture::WESTMERE>(
simd_input<Architecture::WESTMERE> in, uint64_t odd_ends,
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits,
uint64_t &error_mask) {
FIND_QUOTE_MASK_AND_BITS(Architecture::WESTMERE, in, odd_ends,
prev_iter_inside_quote, quote_bits, error_mask)
}
template<> really_inline
void find_whitespace_and_structurals<architecture::westmere>(simd_input<architecture::westmere> in,
uint64_t &whitespace, uint64_t &structurals) {
const __m128i structural_table = _mm_setr_epi8(44, 125, 0, 0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
const __m128i white_table = _mm_setr_epi8(
32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100);
template <>
really_inline void find_whitespace_and_structurals<Architecture::WESTMERE>(
simd_input<Architecture::WESTMERE> in, uint64_t &whitespace,
uint64_t &structurals) {
const __m128i structural_table =
_mm_setr_epi8(44, 125, 0, 0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
const __m128i white_table = _mm_setr_epi8(32, 100, 100, 100, 17, 100, 113, 2,
100, 9, 10, 112, 100, 13, 100, 100);
const __m128i struct_offset = _mm_set1_epi8(0xd4);
const __m128i struct_mask = _mm_set1_epi8(32);
__m128i white0 = _mm_cmpeq_epi8(in.v0,
_mm_shuffle_epi8(white_table, in.v0));
__m128i white1 = _mm_cmpeq_epi8(in.v1,
_mm_shuffle_epi8(white_table, in.v1));
__m128i white2 = _mm_cmpeq_epi8(in.v2,
_mm_shuffle_epi8(white_table, in.v2));
__m128i white3 = _mm_cmpeq_epi8(in.v3,
_mm_shuffle_epi8(white_table, in.v3));
__m128i white0 = _mm_cmpeq_epi8(in.v0, _mm_shuffle_epi8(white_table, in.v0));
__m128i white1 = _mm_cmpeq_epi8(in.v1, _mm_shuffle_epi8(white_table, in.v1));
__m128i white2 = _mm_cmpeq_epi8(in.v2, _mm_shuffle_epi8(white_table, in.v2));
__m128i white3 = _mm_cmpeq_epi8(in.v3, _mm_shuffle_epi8(white_table, in.v3));
uint64_t ws_res_0 = _mm_movemask_epi8(white0);
uint64_t ws_res_1 = _mm_movemask_epi8(white1);
uint64_t ws_res_2 = _mm_movemask_epi8(white2);
uint64_t ws_res_3 = _mm_movemask_epi8(white3);
whitespace = (ws_res_0 | (ws_res_1 << 16) | (ws_res_2 << 32) | (ws_res_3 << 48));
whitespace =
(ws_res_0 | (ws_res_1 << 16) | (ws_res_2 << 32) | (ws_res_3 << 48));
__m128i struct1_r1 = _mm_add_epi8(struct_offset, in.v0);
__m128i struct2_r1 = _mm_add_epi8(struct_offset, in.v1);
@ -169,13 +183,12 @@ void find_whitespace_and_structurals<architecture::westmere>(simd_input<architec
uint64_t structural_res_2 = _mm_movemask_epi8(struct3);
uint64_t structural_res_3 = _mm_movemask_epi8(struct4);
structurals = (structural_res_0 | (structural_res_1 << 16) | (structural_res_2 << 32) | (structural_res_3 << 48));
structurals = (structural_res_0 | (structural_res_1 << 16) |
(structural_res_2 << 32) | (structural_res_3 << 48));
}
} // namespace simdjson
UNTARGET_REGION
#endif // IS_X86_64
#endif // SIMDJSON_STAGE1_FIND_MARKS_WESTMERE_H

View File

@ -9,8 +9,8 @@
#include "simdjson/jsoncharutils.h"
#include "simdjson/numberparsing.h"
#include "simdjson/parsedjson.h"
#include "simdjson/stringparsing.h"
#include "simdjson/simdjson.h"
#include "simdjson/stringparsing.h"
namespace simdjson {
void init_state_machine();
@ -20,7 +20,8 @@ really_inline bool is_valid_true_atom(const uint8_t *loc) {
uint64_t tv = *reinterpret_cast<const uint64_t *>("true ");
uint64_t mask4 = 0x00000000ffffffff;
uint32_t error = 0;
uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
uint64_t
locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
// this can read up to 7 bytes beyond the buffer size, but we require
// SIMDJSON_PADDING of padding
static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
@ -43,8 +44,9 @@ really_inline bool is_valid_false_atom(const uint8_t *loc) {
// the last character of false (it being 5 byte long!) would be
// ignored
uint64_t error = 0;
uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
// this can read up to 7 bytes beyond the buffer size, but we require
uint64_t
locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
// this can read up to 7 bytes beyond the buffer size, but we require
// SIMDJSON_PADDING of padding
static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
std::memcpy(&locval, loc, sizeof(uint64_t));
@ -58,8 +60,9 @@ really_inline bool is_valid_null_atom(const uint8_t *loc) {
uint64_t nv = *reinterpret_cast<const uint64_t *>("null ");
uint64_t mask4 = 0x00000000ffffffff;
uint32_t error = 0;
uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
// this can read up to 7 bytes beyond the buffer size, but we require
uint64_t
locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
// this can read up to 7 bytes beyond the buffer size, but we require
// SIMDJSON_PADDING of padding
static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING);
std::memcpy(&locval, loc, sizeof(uint64_t));
@ -68,15 +71,15 @@ really_inline bool is_valid_null_atom(const uint8_t *loc) {
return error == 0;
}
template<architecture T = architecture::native>
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj);
template <Architecture T = Architecture::NATIVE>
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER int
unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj);
template<architecture T = architecture::native>
template <Architecture T = Architecture::NATIVE>
int unified_machine(const char *buf, size_t len, ParsedJson &pj) {
return unified_machine<T>(reinterpret_cast<const uint8_t*>(buf), len, pj);
return unified_machine<T>(reinterpret_cast<const uint8_t *>(buf), len, pj);
}
}
} // namespace simdjson
#endif

View File

@ -6,8 +6,9 @@
#include "simdjson/parsedjson.h"
#ifdef JSON_TEST_STRINGS
void foundString(const uint8_t *buf, const uint8_t *parsed_begin, const uint8_t *parsed_end);
void foundBadString(const uint8_t *buf);
// Test hooks, declared only when JSON_TEST_STRINGS is defined. Their
// definitions are not part of this header.
// NOTE(review): presumably found_string reports a successfully parsed string
// ([parsed_begin, parsed_end)) and found_bad_string a rejected one -- confirm
// against the test harness that defines them.
void found_string(const uint8_t *buf, const uint8_t *parsed_begin,
const uint8_t *parsed_end);
void found_bad_string(const uint8_t *buf);
#endif
namespace simdjson {
@ -37,7 +38,6 @@ static const uint8_t escape_map[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
// handle a unicode codepoint
// write appropriate values into dest
// src will advance 6 bytes or 12 bytes
@ -45,9 +45,10 @@ static const uint8_t escape_map[256] = {
// return true if the unicode codepoint was valid
// We work in little-endian then swap at write time
WARN_UNUSED
really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, uint8_t **dst_ptr) {
really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr,
uint8_t **dst_ptr) {
// hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the
// conversion isn't valid; we defer the check for this to inside the
// conversion isn't valid; we defer the check for this to inside the
// multilingual plane check
uint32_t code_point = hex_to_u32_nocheck(*src_ptr + 2);
*src_ptr += 6;
@ -58,14 +59,14 @@ really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, uint8_t **d
return false;
}
uint32_t code_point_2 = hex_to_u32_nocheck(*src_ptr + 2);
// if the first code point is invalid we will get here, as we will go past
// the check for being outside the Basic Multilingual plane. If we don't
// find a \u immediately afterwards we fail out anyhow, but if we do,
// find a \u immediately afterwards we fail out anyhow, but if we do,
// this check catches both the case of the first code point being invalid
// or the second code point being invalid.
if ((code_point | code_point_2) >> 16) {
return false;
return false;
}
code_point =
@ -84,18 +85,17 @@ struct parse_string_helper {
};
// Finds where the backslashes and quotes are located.
template<architecture>
parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst);
// Primary template; it is specialized once per Architecture in the
// per-architecture headers (e.g. stringparsing_arm64.h). Each specialization
// loads one SIMD-sized block from src, copies it unchanged to dst, and returns
// a parse_string_helper whose bs_bits / quote_bits masks flag the backslash
// and double-quote bytes of that block.
template <Architecture>
parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src,
uint8_t *dst);
// Primary template of the string parser; the per-architecture specializations
// expand the PARSE_STRING macro as their body. buf[offset] is expected to be
// the opening quote. Returns true once the closing quote has been found and
// the string stored through pj, false on an invalid escape sequence.
// len and depth are not referenced by the macro, hence UNUSED.
template <Architecture T>
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
really_inline bool
parse_string(UNUSED const uint8_t *buf, UNUSED size_t len, ParsedJson &pj,
UNUSED const uint32_t depth, UNUSED uint32_t offset);
template<architecture T>
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER really_inline
bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset);
}
} // namespace simdjson
/// Now include the specializations:
#include "simdjson/stringparsing_arm64.h"

View File

@ -6,46 +6,51 @@
#ifdef IS_ARM64
namespace simdjson {
// ARM64 (NEON) specialization: copies the 32 bytes at src to dst and reports
// where backslashes and double quotes occur in that block.
// Returns {bs_bits, quote_bits}; with the usual little-endian lane layout,
// bit i of each 32-bit mask corresponds to byte src[i].
template <>
really_inline parse_string_helper
find_bs_bits_and_quote_bits<Architecture::ARM64>(const uint8_t *src,
                                                 uint8_t *dst) {
  // Two 16-byte loads: this can read up to 31 bytes beyond the buffer size,
  // which is why SIMDJSON_PADDING bytes of padding are required.
  static_assert(2 * sizeof(uint8x16_t) - 1 <= SIMDJSON_PADDING);
  const uint8x16_t lo = vld1q_u8(src);
  const uint8x16_t hi = vld1q_u8(src + 16);
  // Copy the block unconditionally; the caller overwrites what it does not
  // want to keep.
  vst1q_u8(dst, lo);
  vst1q_u8(dst + 16, hi);

  const uint8x16_t backslash = vmovq_n_u8('\\');
  const uint8x16_t quote = vmovq_n_u8('"');
  // One distinct bit per byte within each 8-byte half, so that summing eight
  // neighbouring bytes yields an 8-bit mask.
  const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                               0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};

  // Matching bytes become 0xFF; keep only each byte's positional bit.
  const uint8x16_t bs_lo = vandq_u8(vceqq_u8(lo, backslash), bit_mask);
  const uint8x16_t bs_hi = vandq_u8(vceqq_u8(hi, backslash), bit_mask);
  const uint8x16_t qt_lo = vandq_u8(vceqq_u8(lo, quote), bit_mask);
  const uint8x16_t qt_hi = vandq_u8(vceqq_u8(hi, quote), bit_mask);

  // Three rounds of pairwise addition fold 8 bytes into 1: afterwards bytes
  // 0-3 hold the 32 backslash bits and bytes 4-7 the 32 quote bits.
  const uint8x16_t bs_pairs = vpaddq_u8(bs_lo, bs_hi);
  const uint8x16_t qt_pairs = vpaddq_u8(qt_lo, qt_hi);
  const uint8x16_t quads = vpaddq_u8(bs_pairs, qt_pairs);
  const uint8x16_t packed = vpaddq_u8(quads, quads);

  const uint32x4_t lanes = vreinterpretq_u32_u8(packed);
  return {
      vgetq_lane_u32(lanes, 0), // bs_bits
      vgetq_lane_u32(lanes, 1)  // quote_bits
  };
}
template<>
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER really_inline
bool parse_string<architecture::arm64>(UNUSED const uint8_t *buf, UNUSED size_t len,
ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset) {
PARSE_STRING(architecture::arm64, buf, len, pj, depth, offset);
}
// ARM64 specialization of parse_string. The whole body, including every
// return statement, comes from the PARSE_STRING macro instantiated for
// Architecture::ARM64: it copies the string that starts right after the quote
// at buf[offset] into pj's string buffer, returning true at the closing quote
// and false on an invalid escape. len and depth are not used by the macro,
// hence UNUSED.
template <>
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
really_inline bool
parse_string<Architecture::ARM64>(UNUSED const uint8_t *buf,
UNUSED size_t len, ParsedJson &pj,
UNUSED const uint32_t depth,
UNUSED uint32_t offset) {
// The macro expands to a braced block that returns on every path.
PARSE_STRING(Architecture::ARM64, buf, len, pj, depth, offset);
}
} // namespace simdjson
#endif
#endif

View File

@ -4,34 +4,39 @@
#include "simdjson/stringparsing.h"
#include "simdjson/stringparsing_macros.h"
#ifdef IS_X86_64
TARGET_HASWELL
namespace simdjson {
// Haswell (AVX2) specialization: copies the 32 bytes at src to dst and reports
// where backslashes and double quotes occur in that block.
// Returns {bs_bits, quote_bits}; bit i of each mask corresponds to byte
// src[i].
template <>
really_inline parse_string_helper
find_bs_bits_and_quote_bits<Architecture::HASWELL>(const uint8_t *src,
                                                   uint8_t *dst) {
  // A single 32-byte load can read up to 31 bytes beyond the buffer size,
  // which is why SIMDJSON_PADDING bytes of padding are required.
  static_assert(sizeof(__m256i) - 1 <= SIMDJSON_PADDING);
  const __m256i block =
      _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
  // Store to dst unconditionally - the caller can overwrite the bytes it does
  // not like later.
  _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), block);

  const __m256i is_quote = _mm256_cmpeq_epi8(block, _mm256_set1_epi8('"'));
  const __m256i is_backslash =
      _mm256_cmpeq_epi8(block, _mm256_set1_epi8('\\'));
  return {
      static_cast<uint32_t>(_mm256_movemask_epi8(is_backslash)), // bs_bits
      static_cast<uint32_t>(_mm256_movemask_epi8(is_quote))      // quote_bits
  };
}
template<>
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER really_inline
bool parse_string<architecture::haswell>(UNUSED const uint8_t *buf, UNUSED size_t len,
ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset) {
PARSE_STRING(architecture::haswell, buf, len, pj, depth, offset);
// Haswell specialization of parse_string. The whole body, including every
// return statement, comes from the PARSE_STRING macro instantiated for
// Architecture::HASWELL: it copies the string that starts right after the
// quote at buf[offset] into pj's string buffer, returning true at the closing
// quote and false on an invalid escape. len and depth are not used by the
// macro, hence UNUSED.
template <>
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
really_inline bool
parse_string<Architecture::HASWELL>(UNUSED const uint8_t *buf,
UNUSED size_t len, ParsedJson &pj,
UNUSED const uint32_t depth,
UNUSED uint32_t offset) {
// The macro expands to a braced block that returns on every path.
PARSE_STRING(Architecture::HASWELL, buf, len, pj, depth, offset);
}
}
} // namespace simdjson
UNTARGET_REGION
#endif

View File

@ -1,80 +1,88 @@
#ifndef SIMDJSON_STRINGPARSING_MACROS_H
#define SIMDJSON_STRINGPARSING_MACROS_H
// We need to compile that code for multiple architectures. However, target attributes can be used
// only once by function definition. Huge macro seemed better than huge code duplication.
// bool PARSE_STRING(architecture T, UNUSED const uint8_t *buf, UNUSED size_t len,
// ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset)
#define PARSE_STRING(T, buf, len, pj, depth, offset) { \
pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"'); \
const uint8_t *src = &buf[offset + 1]; /* we know that buf at offset is a " */ \
uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t); \
const uint8_t *const start_of_string = dst; \
while (1) { \
parse_string_helper helper = find_bs_bits_and_quote_bits<T>(src, dst); \
if(((helper.bs_bits - 1) & helper.quote_bits) != 0 ) { \
/* we encountered quotes first. Move dst to point to quotes and exit */ \
\
/* find out where the quote is... */ \
uint32_t quote_dist = trailingzeroes(helper.quote_bits); \
\
/* NULL termination is still handy if you expect all your strings to be NULL terminated? */ \
/* It comes at a small cost */ \
dst[quote_dist] = 0; \
\
uint32_t str_length = (dst - start_of_string) + quote_dist; \
memcpy(pj.current_string_buf_loc,&str_length, sizeof(uint32_t)); \
/*///////////////////// */ \
/* Above, check for overflow in case someone has a crazy string (>=4GB?) */ \
/* But only add the overflow check when the document itself exceeds 4GB */ \
/* Currently unneeded because we refuse to parse docs larger or equal to 4GB. */ \
/*////////////////////// */ \
\
\
/* we advance the point, accounting for the fact that we have a NULL termination */ \
pj.current_string_buf_loc = dst + quote_dist + 1; \
return true; \
} \
if(((helper.quote_bits - 1) & helper.bs_bits ) != 0 ) { \
/* find out where the backspace is */ \
uint32_t bs_dist = trailingzeroes(helper.bs_bits); \
uint8_t escape_char = src[bs_dist + 1]; \
/* we encountered backslash first. Handle backslash */ \
if (escape_char == 'u') { \
/* move src/dst up to the start; they will be further adjusted */ \
/* within the unicode codepoint handling code. */ \
src += bs_dist; \
dst += bs_dist; \
if (!handle_unicode_codepoint(&src, &dst)) { \
return false; \
} \
} else { \
/* simple 1:1 conversion. Will eat bs_dist+2 characters in input and */ \
/* write bs_dist+1 characters to output */ \
/* note this may reach beyond the part of the buffer we've actually */ \
/* seen. I think this is ok */ \
uint8_t escape_result = escape_map[escape_char]; \
if (escape_result == 0u) { \
return false; /* bogus escape value is an error */ \
} \
dst[bs_dist] = escape_result; \
src += bs_dist + 2; \
dst += bs_dist + 1; \
} \
} else { \
/* they are the same. Since they can't co-occur, it means we encountered */ \
/* neither. */ \
if constexpr(T == architecture::westmere) { \
src += 16; \
dst += 16; \
} else { \
src += 32; \
dst += 32; \
} \
} \
} \
/* can't be reached */ \
return true; \
}
// We need to compile this code for multiple architectures. However, target
// attributes can be applied only once per function definition, so a huge
// macro seemed better than huge code duplication.
//
// Conceptual signature of the expansion:
//   bool PARSE_STRING(Architecture T, const uint8_t *buf, size_t len,
//                     ParsedJson &pj, const uint32_t depth, uint32_t offset)
//
// The macro expands to a braced block containing return statements, so it must
// be the entire body of a function returning bool. It:
//  - writes a '"' tape entry holding the current offset into pj.string_buf;
//  - reserves sizeof(uint32_t) bytes at pj.current_string_buf_loc for the
//    string length and copies the string after them, one SIMD block at a time
//    via find_bs_bits_and_quote_bits<T>;
//  - on reaching the closing quote: NULL-terminates the copy, stores the
//    length in the reserved bytes, advances pj.current_string_buf_loc past the
//    terminator and returns true;
//  - on reaching a backslash first: decodes the escape (\u through
//    handle_unicode_codepoint, everything else through escape_map) and returns
//    false if the escape is invalid;
//  - otherwise advances by the block size: 16 bytes for Architecture::WESTMERE,
//    32 bytes for every other architecture.
// len and depth are accepted but never referenced.
#define PARSE_STRING(T, buf, len, pj, depth, offset) \
{ \
pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"'); \
const uint8_t *src = \
&buf[offset + 1]; /* we know that buf at offset is a " */ \
uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t); \
const uint8_t *const start_of_string = dst; \
while (1) { \
parse_string_helper helper = find_bs_bits_and_quote_bits<T>(src, dst); \
if (((helper.bs_bits - 1) & helper.quote_bits) != 0) { \
/* we encountered quotes first. Move dst to point to quotes and exit \
*/ \
\
/* find out where the quote is... */ \
uint32_t quote_dist = trailing_zeroes(helper.quote_bits); \
\
/* NULL termination is still handy if you expect all your strings to \
* be NULL terminated? */ \
/* It comes at a small cost */ \
dst[quote_dist] = 0; \
\
uint32_t str_length = (dst - start_of_string) + quote_dist; \
memcpy(pj.current_string_buf_loc, &str_length, sizeof(uint32_t)); \
/***************************** \
* Above, check for overflow in case someone has a crazy string \
* (>=4GB?) \
* But only add the overflow check when the document itself exceeds \
* 4GB \
* Currently unneeded because we refuse to parse docs larger or equal \
* to 4GB. \
****************************/ \
\
/* we advance the pointer, accounting for the fact that we have a NULL \
* termination */ \
pj.current_string_buf_loc = dst + quote_dist + 1; \
return true; \
} \
if (((helper.quote_bits - 1) & helper.bs_bits) != 0) { \
/* find out where the backslash is */ \
uint32_t bs_dist = trailing_zeroes(helper.bs_bits); \
uint8_t escape_char = src[bs_dist + 1]; \
/* we encountered backslash first. Handle backslash */ \
if (escape_char == 'u') { \
/* move src/dst up to the start; they will be further adjusted \
within the unicode codepoint handling code. */ \
src += bs_dist; \
dst += bs_dist; \
if (!handle_unicode_codepoint(&src, &dst)) { \
return false; \
} \
} else { \
/* simple 1:1 conversion. Will eat bs_dist+2 characters in input and \
* write bs_dist+1 characters to output \
* note this may reach beyond the part of the buffer we've actually \
* seen. I think this is ok */ \
uint8_t escape_result = escape_map[escape_char]; \
if (escape_result == 0u) { \
return false; /* bogus escape value is an error */ \
} \
dst[bs_dist] = escape_result; \
src += bs_dist + 2; \
dst += bs_dist + 1; \
} \
} else { \
/* they are the same. Since they can't co-occur, it means we \
* encountered neither. */ \
if constexpr (T == Architecture::WESTMERE) { \
src += 16; \
dst += 16; \
} else { \
src += 32; \
dst += 32; \
} \
} \
} \
/* can't be reached */ \
return true; \
}
#endif

View File

@ -4,32 +4,37 @@
#include "simdjson/stringparsing.h"
#include "simdjson/stringparsing_macros.h"
#ifdef IS_X86_64
TARGET_WESTMERE
namespace simdjson {
// Westmere (SSE) specialization: copies the 16 bytes at src to dst and reports
// where backslashes and double quotes occur in that block.
// Returns {bs_bits, quote_bits}; bit i of each mask corresponds to byte
// src[i]. Only the low 16 bits can be set, since one 128-bit register is
// examined per call.
template <>
really_inline parse_string_helper
find_bs_bits_and_quote_bits<Architecture::WESTMERE>(const uint8_t *src,
                                                    uint8_t *dst) {
  // A single 16-byte load can read up to 15 bytes beyond the buffer size, but
  // we require SIMDJSON_PADDING of padding. Checked at compile time, like the
  // other architectures do.
  static_assert(sizeof(__m128i) - 1 <= SIMDJSON_PADDING);
  __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
  // store to dest unconditionally - we can overwrite the bits we don't like
  // later
  _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), v);
  auto quote_mask = _mm_cmpeq_epi8(v, _mm_set1_epi8('"'));
  return {
      static_cast<uint32_t>(
          _mm_movemask_epi8(_mm_cmpeq_epi8(v, _mm_set1_epi8('\\')))), // bs_bits
      static_cast<uint32_t>(_mm_movemask_epi8(quote_mask)) // quote_bits
  };
}
template<>
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER really_inline
bool parse_string<architecture::westmere>(UNUSED const uint8_t *buf, UNUSED size_t len,
ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset) {
PARSE_STRING(architecture::westmere, buf, len, pj, depth, offset);
}
// Westmere specialization of parse_string. The whole body, including every
// return statement, comes from the PARSE_STRING macro instantiated for
// Architecture::WESTMERE (the only architecture for which the macro advances
// by 16 bytes per block instead of 32). It copies the string that starts right
// after the quote at buf[offset] into pj's string buffer, returning true at
// the closing quote and false on an invalid escape. len and depth are not used
// by the macro, hence UNUSED.
template <>
WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER
really_inline bool
parse_string<Architecture::WESTMERE>(UNUSED const uint8_t *buf,
UNUSED size_t len, ParsedJson &pj,
UNUSED const uint32_t depth,
UNUSED uint32_t offset) {
// The macro expands to a braced block that returns on every path.
PARSE_STRING(Architecture::WESTMERE, buf, len, pj, depth, offset);
}
} // namespace simdjson
UNTARGET_REGION
#endif

View File

@ -1,35 +1,35 @@
#include "simdjson/jsonioutil.h"
#include <cstring>
#include <cstdlib>
#include <cstring>
namespace simdjson {
// Allocates a buffer able to hold length bytes followed by SIMDJSON_PADDING
// bytes of padding. The allocation goes through aligned_malloc_char with a
// 64-byte alignment; the result of that call is returned unchanged.
char *allocate_padded_buffer(size_t length) {
  // A plain malloc(length + SIMDJSON_PADDING) would also do, but we might as
  // well align to cache lines.
  return aligned_malloc_char(64, length + SIMDJSON_PADDING);
}
// Loads the entire content of the file `filename` into a padded_string.
//
// Throws std::runtime_error when the file cannot be opened, when its size
// cannot be determined (seek/tell failure), when the buffer cannot be
// allocated, or when fewer bytes than expected could be read. The file handle
// is closed on every one of these paths.
padded_string get_corpus(const std::string &filename) {
  std::FILE *fp = std::fopen(filename.c_str(), "rb");
  if (fp == nullptr) {
    throw std::runtime_error("could not load corpus");
  }
  // Determine the file size by seeking to the end. Both calls can fail (e.g.
  // on a non-seekable stream), and ftell reports failure as -1L, which must
  // not be converted to size_t and used as an allocation size.
  if (std::fseek(fp, 0, SEEK_END) != 0) {
    std::fclose(fp);
    throw std::runtime_error("could not seek in the file");
  }
  const long file_size = std::ftell(fp);
  if (file_size < 0) {
    std::fclose(fp);
    throw std::runtime_error("could not determine the file size");
  }
  const size_t len = static_cast<size_t>(file_size);
  padded_string s(len);
  if (s.data() == nullptr) {
    std::fclose(fp);
    throw std::runtime_error("could not allocate memory");
  }
  std::rewind(fp);
  const size_t readb = std::fread(s.data(), 1, len, fp);
  std::fclose(fp);
  if (readb != len) {
    throw std::runtime_error("could not read the data");
  }
  return s;
}
} // namespace simdjson

View File

@ -38,13 +38,13 @@ static uint8_t jump_table[256 * 3] = {
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
};
size_t jsonminify(const unsigned char *bytes, size_t howmany,
unsigned char *out) {
size_t json_minify(const unsigned char *bytes, size_t how_many,
unsigned char *out) {
size_t i = 0, pos = 0;
uint8_t quote = 0;
uint8_t nonescape = 1;
while (i < howmany) {
while (i < how_many) {
unsigned char c = bytes[i];
uint8_t *meta = jump_table + 3 * c;
@ -64,7 +64,6 @@ size_t jsonminify(const unsigned char *bytes, size_t howmany,
namespace simdjson {
// some intrinsics are missing under GCC?
#ifndef __clang__
#ifndef _MSC_VER
@ -85,8 +84,6 @@ static inline void _mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo,
#endif
#endif
// a straightforward comparison of a mask against input.
static uint64_t cmp_mask_against_input_mini(__m256i input_lo, __m256i input_hi,
__m256i mask) {
@ -98,8 +95,9 @@ static uint64_t cmp_mask_against_input_mini(__m256i input_lo, __m256i input_hi,
}
// take input from buf and remove useless whitespace, input and output can be
// the same, result is null terminated, return the string length (minus the null termination)
size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) {
// the same, result is null terminated, return the string length (minus the null
// termination)
size_t json_minify(const uint8_t *buf, size_t len, uint8_t *out) {
// Useful constant masks
const uint64_t even_bits = 0x5555555555555555ULL;
const uint64_t odd_bits = ~even_bits;
@ -109,11 +107,13 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) {
uint64_t prev_iter_inside_quote = 0ULL; // either all zeros or all ones
size_t idx = 0;
if (len >= 64) {
size_t avxlen = len - 63;
size_t avx_len = len - 63;
for (; idx < avxlen; idx += 64) {
__m256i input_lo = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 0));
__m256i input_hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 32));
for (; idx < avx_len; idx += 64) {
__m256i input_lo =
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 0));
__m256i input_hi =
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 32));
uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi,
_mm256_set1_epi8('\\'));
uint64_t start_edges = bs_bits & ~(bs_bits << 1);
@ -122,8 +122,8 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) {
uint64_t odd_starts = start_edges & ~even_start_mask;
uint64_t even_carries = bs_bits + even_starts;
uint64_t odd_carries;
bool iter_ends_odd_backslash = add_overflow(
bs_bits, odd_starts, &odd_carries);
bool iter_ends_odd_backslash =
add_overflow(bs_bits, odd_starts, &odd_carries);
odd_carries |= prev_iter_ends_odd_backslash;
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
uint64_t even_carry_ends = even_carries & ~bs_bits;
@ -137,7 +137,10 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) {
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
quote_mask ^= prev_iter_inside_quote;
prev_iter_inside_quote = static_cast<uint64_t>(static_cast<int64_t>(quote_mask) >> 63);// might be undefined behavior, should be fully defined in C++20, ok according to John Regher from Utah University
prev_iter_inside_quote = static_cast<uint64_t>(
static_cast<int64_t>(quote_mask) >>
63); // might be undefined behavior, should be fully defined in C++20,
// ok according to John Regher from Utah University
const __m256i low_nibble_mask = _mm256_setr_epi8(
// 0 9 a b c d
16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0,
@ -163,7 +166,8 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) {
__m256i tmp_ws_hi = _mm256_cmpeq_epi8(
_mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0));
uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
uint64_t ws_res_0 =
static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));
whitespace &= ~quote_mask;
@ -175,17 +179,18 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) {
int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF));
int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
int pop4 = hamming((~whitespace));
__m256i vmask1 =
_mm256_loadu2_m128i(reinterpret_cast<const __m128i *>(mask128_epi8) + (mask2 & 0x7FFF),
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask1 & 0x7FFF));
__m256i vmask2 =
_mm256_loadu2_m128i(reinterpret_cast<const __m128i *>(mask128_epi8) + (mask4 & 0x7FFF),
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask3 & 0x7FFF));
__m256i vmask1 = _mm256_loadu2_m128i(
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask2 & 0x7FFF),
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask1 & 0x7FFF));
__m256i vmask2 = _mm256_loadu2_m128i(
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask4 & 0x7FFF),
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask3 & 0x7FFF));
__m256i result1 = _mm256_shuffle_epi8(input_lo, vmask1);
__m256i result2 = _mm256_shuffle_epi8(input_hi, vmask2);
_mm256_storeu2_m128i(reinterpret_cast<__m128i *>(out + pop1), reinterpret_cast<__m128i *>(out), result1);
_mm256_storeu2_m128i(reinterpret_cast<__m128i *>(out + pop3), reinterpret_cast<__m128i *>(out + pop2),
result2);
_mm256_storeu2_m128i(reinterpret_cast<__m128i *>(out + pop1),
reinterpret_cast<__m128i *>(out), result1);
_mm256_storeu2_m128i(reinterpret_cast<__m128i *>(out + pop3),
reinterpret_cast<__m128i *>(out + pop2), result2);
out += pop4;
}
}
@ -195,8 +200,10 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) {
uint8_t buffer[64];
memset(buffer, 0, 64);
memcpy(buffer, buf + idx, len - idx);
__m256i input_lo = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer));
__m256i input_hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer + 32));
__m256i input_lo =
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer));
__m256i input_hi =
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer + 32));
uint64_t bs_bits =
cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('\\'));
uint64_t start_edges = bs_bits & ~(bs_bits << 1);
@ -205,10 +212,11 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) {
uint64_t odd_starts = start_edges & ~even_start_mask;
uint64_t even_carries = bs_bits + even_starts;
uint64_t odd_carries;
//bool iter_ends_odd_backslash =
add_overflow( bs_bits, odd_starts, &odd_carries);
// bool iter_ends_odd_backslash =
add_overflow(bs_bits, odd_starts, &odd_carries);
odd_carries |= prev_iter_ends_odd_backslash;
//prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; // we never use it
// prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
// // we never use it
uint64_t even_carry_ends = even_carries & ~bs_bits;
uint64_t odd_carry_ends = odd_carries & ~bs_bits;
uint64_t even_start_odd_end = even_carry_ends & odd_bits;
@ -220,7 +228,8 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) {
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
quote_mask ^= prev_iter_inside_quote;
// prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);// we don't need this anymore
// prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);// we
// don't need this anymore
__m256i mask_20 = _mm256_set1_epi8(0x20); // c==32
__m256i mask_70 =
@ -254,23 +263,23 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) {
int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF));
int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
int pop4 = hamming((~whitespace));
__m256i vmask1 =
_mm256_loadu2_m128i(reinterpret_cast<const __m128i *>(mask128_epi8) + (mask2 & 0x7FFF),
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask1 & 0x7FFF));
__m256i vmask2 =
_mm256_loadu2_m128i(reinterpret_cast<const __m128i *>(mask128_epi8) + (mask4 & 0x7FFF),
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask3 & 0x7FFF));
__m256i vmask1 = _mm256_loadu2_m128i(
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask2 & 0x7FFF),
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask1 & 0x7FFF));
__m256i vmask2 = _mm256_loadu2_m128i(
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask4 & 0x7FFF),
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask3 & 0x7FFF));
__m256i result1 = _mm256_shuffle_epi8(input_lo, vmask1);
__m256i result2 = _mm256_shuffle_epi8(input_hi, vmask2);
_mm256_storeu2_m128i(reinterpret_cast<__m128i *>(buffer + pop1), reinterpret_cast<__m128i *>(buffer),
result1);
_mm256_storeu2_m128i(reinterpret_cast<__m128i *>(buffer + pop3), reinterpret_cast<__m128i *>(buffer + pop2),
result2);
_mm256_storeu2_m128i(reinterpret_cast<__m128i *>(buffer + pop1),
reinterpret_cast<__m128i *>(buffer), result1);
_mm256_storeu2_m128i(reinterpret_cast<__m128i *>(buffer + pop3),
reinterpret_cast<__m128i *>(buffer + pop2), result2);
memcpy(out, buffer, pop4);
out += pop4;
}
*out = '\0';// NULL termination
*out = '\0'; // NULL termination
return out - initout;
}
}
} // namespace simdjson
#endif

View File

@ -1,67 +1,68 @@
#include "simdjson/jsonparser.h"
#ifdef _MSC_VER
#include <windows.h>
#include <sysinfoapi.h>
#else
#include <unistd.h>
#endif
#include "simdjson/simdjson.h"
#include "simdjson/isadetection.h"
#include "simdjson/jsonparser.h"
#include "simdjson/portability.h"
#include "simdjson/simdjson.h"
namespace simdjson {
architecture find_best_supported_implementation() {
constexpr uint32_t haswell_flags = SIMDExtensions::AVX2 | SIMDExtensions::PCLMULQDQ
| SIMDExtensions::BMI1 | SIMDExtensions::BMI2;
constexpr uint32_t westmere_flags = SIMDExtensions::SSE42 | SIMDExtensions::PCLMULQDQ;
Architecture find_best_supported_implementation() {
constexpr uint32_t haswell_flags =
instruction_set::AVX2 | instruction_set::PCLMULQDQ |
instruction_set::BMI1 | instruction_set::BMI2;
constexpr uint32_t westmere_flags =
instruction_set::SSE42 | instruction_set::PCLMULQDQ;
uint32_t supports = detect_supported_architectures();
// Order from best to worst (within architecture)
if ((haswell_flags & supports) == haswell_flags) return architecture::haswell;
if ((westmere_flags & supports) == westmere_flags) return architecture::westmere;
if (SIMDExtensions::NEON) return architecture::arm64;
if ((haswell_flags & supports) == haswell_flags)
return Architecture::HASWELL;
if ((westmere_flags & supports) == westmere_flags)
return Architecture::WESTMERE;
if (instruction_set::NEON)
return Architecture::ARM64;
return architecture::none;
return Architecture::NONE;
}
// Responsible to select the best json_parse implementation
int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded) {
architecture best_implementation = find_best_supported_implementation();
int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj,
bool realloc_if_needed) {
Architecture best_implementation = find_best_supported_implementation();
// Selecting the best implementation
switch (best_implementation) {
#ifdef IS_X86_64
case architecture::haswell:
json_parse_ptr = &json_parse_implementation<architecture::haswell>;
case Architecture::HASWELL:
json_parse_ptr = &json_parse_implementation<Architecture::HASWELL>;
break;
case architecture::westmere:
json_parse_ptr = &json_parse_implementation<architecture::westmere>;
case Architecture::WESTMERE:
json_parse_ptr = &json_parse_implementation<Architecture::WESTMERE>;
break;
#endif
#ifdef IS_ARM64
case architecture::arm64:
json_parse_ptr = &json_parse_implementation<architecture::arm64>;
case Architecture::ARM64:
json_parse_ptr = &json_parse_implementation<Architecture::ARM64>;
break;
#endif
default :
default:
std::cerr << "The processor is not supported by simdjson." << std::endl;
return simdjson::UNEXPECTED_ERROR;
}
return json_parse_ptr(buf, len, pj, reallocifneeded);
return json_parse_ptr(buf, len, pj, realloc_if_needed);
}
// Function pointer that initially refers to json_parse_dispatch; the
// dispatcher overwrites it with an architecture-specific
// json_parse_implementation<> the first time it runs successfully.
json_parse_functype *json_parse_ptr = &json_parse_dispatch;
// Convenience wrapper: creates a ParsedJson, sizes it for `len` bytes of
// input and runs json_parse() on `buf`.
//
// The ParsedJson is returned in every case. When the capacity allocation
// fails, a message is written to std::cerr and json_parse() is not called.
WARN_UNUSED
ParsedJson build_parsed_json(const uint8_t *buf, size_t len,
                             bool realloc_if_needed) {
  ParsedJson pj;
  bool ok = pj.allocate_capacity(len);
  if (ok) {
    json_parse(buf, len, pj, realloc_if_needed);
  } else {
    std::cerr << "failure during memory allocation " << std::endl;
  }
  return pj;
}
}
} // namespace simdjson

View File

@ -1,324 +1,323 @@
#include "simdjson/parsedjson.h"
namespace simdjson {
// Default constructor: every buffer pointer starts out null; no memory is
// allocated here.
ParsedJson::ParsedJson()
    : structural_indexes(nullptr), tape(nullptr),
      containing_scope_offset(nullptr), ret_address(nullptr),
      string_buf(nullptr), current_string_buf_loc(nullptr) {}
// Destructor: releases the buffers through deallocate().
ParsedJson::~ParsedJson() { deallocate(); }
// Move constructor: takes over every buffer owned by `p` together with its
// capacities, cursor positions, validity flag and error code.
//
// `p` is left as an empty object: null buffers, zero capacities and
// valid == false, so its bookkeeping no longer claims storage it has given
// away.
ParsedJson::ParsedJson(ParsedJson &&p)
    : byte_capacity(p.byte_capacity), depth_capacity(p.depth_capacity),
      tape_capacity(p.tape_capacity), string_capacity(p.string_capacity),
      current_loc(p.current_loc), n_structural_indexes(p.n_structural_indexes),
      structural_indexes(p.structural_indexes), tape(p.tape),
      containing_scope_offset(p.containing_scope_offset),
      ret_address(p.ret_address), string_buf(p.string_buf),
      current_string_buf_loc(p.current_string_buf_loc), valid(p.valid) {
  // carry the diagnostic over so get_error_code() reports the same thing
  error_code = p.error_code;

  p.structural_indexes = nullptr;
  p.tape = nullptr;
  p.containing_scope_offset = nullptr;
  p.ret_address = nullptr;
  p.string_buf = nullptr;
  p.current_string_buf_loc = nullptr;

  // the source no longer owns any storage
  p.byte_capacity = 0;
  p.depth_capacity = 0;
  p.tape_capacity = 0;
  p.string_capacity = 0;
  p.valid = false;
}
// Makes sure this object can hold a document of up to `len` bytes nested up
// to `max_depth` levels.
//
// A `len` or `max_depth` of 0 is bumped to 64 and 1 respectively. Returns
// false when `len` exceeds SIMDJSON_MAXSIZE_BYTES or when any allocation
// fails; in the latter case every buffer pointer is reset to null. When the
// current buffers are already large enough in both dimensions they are kept
// and true is returned immediately. Otherwise the old buffers are released
// and new ones are allocated.
WARN_UNUSED
bool ParsedJson::allocate_capacity(size_t len, size_t max_depth) {
  if (max_depth == 0) {
    max_depth = 1; // don't let the user allocate nothing
  }
  if (len == 0) {
    len = 64; // allocating 0 bytes is wasteful.
  }
  if (len > SIMDJSON_MAXSIZE_BYTES) {
    return false;
  }
  // Reuse is only safe when BOTH capacities cover the request and the
  // buffers are actually there (a moved-from object has null buffers).
  const bool buffers_present =
      (structural_indexes != nullptr) && (tape != nullptr) &&
      (containing_scope_offset != nullptr) && (ret_address != nullptr) &&
      (string_buf != nullptr);
  if (buffers_present && (len <= byte_capacity) &&
      (max_depth <= depth_capacity)) {
    return true;
  }
  deallocate();
  valid = false;
  byte_capacity = 0; // will only set it to len after allocations are a success
  n_structural_indexes = 0;
  uint32_t max_structures = ROUNDUP_N(len, 64) + 2 + 7;
  structural_indexes = new (std::nothrow) uint32_t[max_structures];
  // a pathological input like "[[[[..." would generate len tape elements, so
  // need a capacity of len + 1
  size_t local_tape_capacity = ROUNDUP_N(len + 1, 64);
  // a document with only zero-length strings... could have len/3 string
  // and we would need len/3 * 5 bytes on the string buffer
  size_t local_string_capacity = ROUNDUP_N(5 * len / 3 + 32, 64);
  string_buf = new (std::nothrow) uint8_t[local_string_capacity];
  tape = new (std::nothrow) uint64_t[local_tape_capacity];
  containing_scope_offset = new (std::nothrow) uint32_t[max_depth];
#ifdef SIMDJSON_USE_COMPUTED_GOTO
  ret_address = new (std::nothrow) void *[max_depth];
#else
  ret_address = new (std::nothrow) char[max_depth];
#endif
  if ((string_buf == nullptr) || (tape == nullptr) ||
      (containing_scope_offset == nullptr) || (ret_address == nullptr) ||
      (structural_indexes == nullptr)) {
    std::cerr << "Could not allocate memory" << std::endl;
    delete[] ret_address;
    delete[] containing_scope_offset;
    delete[] tape;
    delete[] string_buf;
    delete[] structural_indexes;
    // reset the pointers so a later deallocate() (e.g. from the destructor)
    // does not free the same memory twice
    ret_address = nullptr;
    containing_scope_offset = nullptr;
    tape = nullptr;
    string_buf = nullptr;
    structural_indexes = nullptr;
    current_string_buf_loc = nullptr;
    return false;
  }
  /*
  // We do not need to initialize this content for parsing, though we could
  // need to initialize it for safety.
  memset(string_buf, 0 , local_string_capacity);
  memset(structural_indexes, 0, max_structures * sizeof(uint32_t));
  memset(tape, 0, local_tape_capacity * sizeof(uint64_t));
  */
  byte_capacity = len;
  depth_capacity = max_depth;
  tape_capacity = local_tape_capacity;
  string_capacity = local_string_capacity;
  return true;
}
// Reports the current value of the `valid` flag.
bool ParsedJson::is_valid() const {
  const bool document_ok = valid;
  return document_ok;
}
int ParsedJson::get_error_code() const { return error_code; }
// Returns a copy of the text that error_message() associates with the stored
// error code.
std::string ParsedJson::get_error_message() const {
  std::string text = error_message(error_code);
  return text;
}
void ParsedJson::deallocate() {
byte_capacity = 0;
depth_capacity = 0;
tape_capacity = 0;
string_capacity = 0;
delete[] ret_address;
delete[] containing_scope_offset;
delete[] tape;
delete[] string_buf;
delete[] structural_indexes;
valid = false;
}
// Resets the write cursors before a new parse: the string cursor goes back to
// the start of string_buf, the tape location to 0, and the document is marked
// invalid. No memory is allocated or released.
void ParsedJson::init() {
  current_string_buf_loc = string_buf;
  current_loc = 0;
  valid = false;
}
// Writes the parsed document to `os` as JSON text without extra whitespace.
//
// The tape is walked from the word after the root up to the index stored in
// the root word. Per nesting level the function tracks whether it is inside
// an object and how many items were emitted, which decides where ',' and ':'
// are inserted.
//
// Returns false when the document is not valid, when the tape does not start
// with a root ('r') word, when the root word points beyond tape_capacity,
// when a number word has no following payload word, or when an unexpected
// tape type is encountered. Diagnostics for some of these go to stderr.
WARN_UNUSED
bool ParsedJson::print_json(std::ostream &os) {
  if (!valid) {
    return false;
  }
  uint32_t string_length;
  size_t tape_idx = 0;
  uint64_t tape_val = tape[tape_idx];
  uint8_t type = (tape_val >> 56);
  size_t how_many = 0;
  if (type == 'r') {
    how_many = tape_val & JSON_VALUE_MASK;
  } else {
    fprintf(stderr, "Error: no starting root node?");
    return false;
  }
  if (how_many > tape_capacity) {
    fprintf(
        stderr,
        "We may be exceeding the tape capacity. Is this a valid document?\n");
    return false;
  }
  tape_idx++;
  bool *in_object = new bool[depth_capacity];
  auto *in_object_idx = new size_t[depth_capacity];
  int depth = 1; // only root at level 0
  in_object_idx[depth] = 0;
  in_object[depth] = false;
  for (; tape_idx < how_many; tape_idx++) {
    tape_val = tape[tape_idx];
    uint64_t payload = tape_val & JSON_VALUE_MASK;
    type = (tape_val >> 56);
    if (!in_object[depth]) {
      if ((in_object_idx[depth] > 0) && (type != ']')) {
        os << ",";
      }
      in_object_idx[depth]++;
    } else { // if (in_object) {
      // inside an object, items alternate key, value, key, value, ...
      if ((in_object_idx[depth] > 0) && ((in_object_idx[depth] & 1) == 0) &&
          (type != '}')) {
        os << ",";
      }
      if (((in_object_idx[depth] & 1) == 1)) {
        os << ":";
      }
      in_object_idx[depth]++;
    }
    switch (type) {
    case '"': // we have a string
      os << '"';
      // the string buffer stores a 32-bit length followed by the bytes
      memcpy(&string_length, string_buf + payload, sizeof(uint32_t));
      // write the payload to `os` like everything else in this function
      print_with_escapes(
          (const unsigned char *)(string_buf + payload + sizeof(uint32_t)), os,
          string_length);
      os << '"';
      break;
    case 'l': // we have a long int
      if (tape_idx + 1 >= how_many) {
        delete[] in_object;
        delete[] in_object_idx;
        return false;
      }
      os << static_cast<int64_t>(tape[++tape_idx]);
      break;
    case 'd': // we have a double
      if (tape_idx + 1 >= how_many) {
        delete[] in_object;
        delete[] in_object_idx;
        return false;
      }
      double answer;
      memcpy(&answer, &tape[++tape_idx], sizeof(answer));
      os << answer;
      break;
    case 'n': // we have a null
      os << "null";
      break;
    case 't': // we have a true
      os << "true";
      break;
    case 'f': // we have a false
      os << "false";
      break;
    case '{': // we have an object
      os << '{';
      depth++;
      in_object[depth] = true;
      in_object_idx[depth] = 0;
      break;
    case '}': // we end an object
      depth--;
      os << '}';
      break;
    case '[': // we start an array
      os << '[';
      depth++;
      in_object[depth] = false;
      in_object_idx[depth] = 0;
      break;
    case ']': // we end an array
      depth--;
      os << ']';
      break;
    case 'r': // we start and end with the root node
      fprintf(stderr, "should we be hitting the root node?\n");
      delete[] in_object;
      delete[] in_object_idx;
      return false;
    default:
      fprintf(stderr, "bug %c\n", type);
      delete[] in_object;
      delete[] in_object_idx;
      return false;
    }
  }
  delete[] in_object;
  delete[] in_object_idx;
  return true;
}
// Writes a line-per-entry listing of the tape to `os`: each line starts with
// the tape index, followed by the decoded entry and, for scope and root
// words, the tape location stored in the payload.
//
// Returns false when the document is not valid, when the tape does not start
// with a root ('r') word, when a number word has no following payload word,
// or when a root word or an unknown type is met inside the document.
WARN_UNUSED
bool ParsedJson::dump_raw_tape(std::ostream &os) {
  if (!valid) {
    return false;
  }
  uint32_t string_length;
  size_t tape_idx = 0;
  uint64_t tape_val = tape[tape_idx];
  uint8_t type = (tape_val >> 56);
  os << tape_idx << " : " << type;
  tape_idx++;
  size_t how_many = 0;
  if (type == 'r') {
    how_many = tape_val & JSON_VALUE_MASK;
  } else {
    fprintf(stderr, "Error: no starting root node?");
    return false;
  }
  os << "\t// pointing to " << how_many << " (right after last node)\n";
  uint64_t payload;
  for (; tape_idx < how_many; tape_idx++) {
    os << tape_idx << " : ";
    tape_val = tape[tape_idx];
    payload = tape_val & JSON_VALUE_MASK;
    type = (tape_val >> 56);
    switch (type) {
    case '"': // we have a string
      os << "string \"";
      // the string buffer stores a 32-bit length followed by the bytes
      memcpy(&string_length, string_buf + payload, sizeof(uint32_t));
      // write the payload to `os` like everything else in this function
      print_with_escapes(
          (const unsigned char *)(string_buf + payload + sizeof(uint32_t)), os,
          string_length);
      os << '"';
      os << '\n';
      break;
    case 'l': // we have a long int
      if (tape_idx + 1 >= how_many) {
        return false;
      }
      os << "integer " << static_cast<int64_t>(tape[++tape_idx]) << "\n";
      break;
    case 'd': // we have a double
      os << "float ";
      if (tape_idx + 1 >= how_many) {
        return false;
      }
      double answer;
      memcpy(&answer, &tape[++tape_idx], sizeof(answer));
      os << answer << '\n';
      break;
    case 'n': // we have a null
      os << "null\n";
      break;
    case 't': // we have a true
      os << "true\n";
      break;
    case 'f': // we have a false
      os << "false\n";
      break;
    case '{': // we have an object
      os << "{\t// pointing to next tape location " << payload
         << " (first node after the scope) \n";
      break;
    case '}': // we end an object
      os << "}\t// pointing to previous tape location " << payload
         << " (start of the scope) \n";
      break;
    case '[': // we start an array
      os << "[\t// pointing to next tape location " << payload
         << " (first node after the scope) \n";
      break;
    case ']': // we end an array
      os << "]\t// pointing to previous tape location " << payload
         << " (start of the scope) \n";
      break;
    case 'r': // we start and end with the root node
      printf("end of root\n");
      return false;
    default:
      return false;
    }
  }
  // the entry at index how_many is printed as the closing root word
  tape_val = tape[tape_idx];
  payload = tape_val & JSON_VALUE_MASK;
  type = (tape_val >> 56);
  os << tape_idx << " : " << type << "\t// pointing to " << payload
     << " (start root)\n";
  return true;
}
} // namespace simdjson

View File

@ -1,264 +1,269 @@
#include "simdjson/parsedjson.h"
#include "simdjson/common_defs.h"
#include "simdjson/parsedjson.h"
#include <iterator>
namespace simdjson {
// Creates an iterator over `pj_`, positioned on the first tape word after the
// root word when there is one.
//
// Throws InvalidJSON when `pj_` is not valid or when its tape does not begin
// with a root ('r') word. Allocates one scopeindex_t per level of
// pj.depth_capacity; that allocation may throw as well.
ParsedJson::Iterator::Iterator(ParsedJson &pj_)
    : pj(pj_), depth(0), location(0), tape_length(0), depth_index(nullptr) {
  if (!pj.is_valid()) {
    throw InvalidJSON();
  }
  depth_index = new scopeindex_t[pj.depth_capacity];
  // memory allocation would throw
  depth_index[0].start_of_scope = location;
  current_val = pj.tape[location++];
  current_type = (current_val >> 56);
  depth_index[0].scope_type = current_type;
  if (current_type != 'r') {
    // should never happen
    // The destructor does not run when a constructor throws, so the array
    // has to be released here.
    delete[] depth_index;
    depth_index = nullptr;
    throw InvalidJSON();
  }
  tape_length = current_val & JSON_VALUE_MASK;
  if (location < tape_length) {
    current_val = pj.tape[location];
    current_type = (current_val >> 56);
    depth++;
    depth_index[depth].start_of_scope = location;
    depth_index[depth].scope_type = current_type;
  }
}
// Frees the per-depth scope array owned by this iterator.
ParsedJson::Iterator::~Iterator() {
  delete[] depth_index;
}
// Copy constructor: refers to the same ParsedJson as `o`, copies its position
// and gets its own copy of the per-depth scope array (pj.depth_capacity
// entries).
ParsedJson::Iterator::Iterator(const Iterator &o)
    : pj(o.pj), depth(o.depth), location(o.location), tape_length(0),
      current_type(o.current_type), current_val(o.current_val),
      depth_index(nullptr) {
  // allocation might throw
  scopeindex_t *scopes = new scopeindex_t[pj.depth_capacity];
  memcpy(scopes, o.depth_index, sizeof(scopes[0]) * pj.depth_capacity);
  depth_index = scopes;
  tape_length = o.tape_length;
}
// Move constructor: copies the position of `o` and takes over its scope
// array, leaving `o` with a null depth_index.
ParsedJson::Iterator::Iterator(Iterator &&o)
    : pj(o.pj), depth(o.depth), location(o.location),
      tape_length(o.tape_length), current_type(o.current_type),
      current_val(o.current_val), depth_index(nullptr) {
  // we take ownership
  depth_index = o.depth_index;
  o.depth_index = nullptr;
}
// Writes the value under the iterator to `os`.
//
// Strings are written between double quotes, escaped through
// print_with_escapes() when `escape_strings` is true and copied byte for byte
// otherwise. Scope markers ('{', '}', '[', ']') are written as that single
// character. Returns false when is_ok() is false or the current type is not
// one of the handled ones.
bool ParsedJson::Iterator::print(std::ostream &os, bool escape_strings) const {
  if (!is_ok()) {
    return false;
  }
  switch (current_type) {
  case '"': // we have a string
    os << '"';
    if (escape_strings) {
      print_with_escapes(get_string(), os, get_string_length());
    } else {
      // was: os << get_string();, but given that we can include null chars, we
      // have to do something crazier:
      std::copy(get_string(), get_string() + get_string_length(),
                std::ostream_iterator<char>(os));
    }
    os << '"';
    break;
  case 'l': // we have a long int
    os << get_integer();
    break;
  case 'd':
    os << get_double();
    break;
  case 'n': // we have a null
    os << "null";
    break;
  case 't': // we have a true
    os << "true";
    break;
  case 'f': // we have a false
    os << "false";
    break;
  case '{': // we have an object
  case '}': // we end an object
  case '[': // we start an array
  case ']': // we end an array
    os << static_cast<char>(current_type);
    break;
  default:
    return false;
  }
  return true;
}
bool ParsedJson::iterator::move_to(const char * pointer, uint32_t length) {
char* new_pointer = nullptr;
if (pointer[0] == '#') {
// Converting fragment representation to string representation
new_pointer = new char[length];
uint32_t new_length = 0;
for (uint32_t i = 1; i < length; i++) {
if (pointer[i] == '%' && pointer[i+1] == 'x') {
try {
int fragment = std::stoi(std::string(&pointer[i+2], 2), nullptr, 16);
if (fragment == '\\' || fragment == '"' || (fragment <= 0x1F)) {
// escaping the character
new_pointer[new_length] = '\\';
new_length++;
}
new_pointer[new_length] = fragment;
i += 3;
}
catch(std::invalid_argument& e) {
delete[] new_pointer;
return false; // the fragment is invalid
bool ParsedJson::Iterator::move_to(const char *pointer, uint32_t length) {
char *new_pointer = nullptr;
if (pointer[0] == '#') {
// Converting fragment representation to string representation
new_pointer = new char[length];
uint32_t new_length = 0;
for (uint32_t i = 1; i < length; i++) {
if (pointer[i] == '%' && pointer[i + 1] == 'x') {
try {
int fragment =
std::stoi(std::string(&pointer[i + 2], 2), nullptr, 16);
if (fragment == '\\' || fragment == '"' || (fragment <= 0x1F)) {
// escaping the character
new_pointer[new_length] = '\\';
new_length++;
}
new_pointer[new_length] = fragment;
i += 3;
} catch (std::invalid_argument &e) {
delete[] new_pointer;
return false; // the fragment is invalid
}
else {
new_pointer[new_length] = pointer[i];
}
new_length++;
} else {
new_pointer[new_length] = pointer[i];
}
length = new_length;
pointer = new_pointer;
new_length++;
}
// saving the current state
size_t depth_s = depth;
size_t location_s = location;
uint8_t current_type_s = current_type;
uint64_t current_val_s = current_val;
scopeindex_t *depthindex_s = depthindex;
rewind(); // The json pointer is used from the root of the document.
length = new_length;
pointer = new_pointer;
}
bool found = relative_move_to(pointer, length);
delete[] new_pointer;
// saving the current state
size_t depth_s = depth;
size_t location_s = location;
uint8_t current_type_s = current_type;
uint64_t current_val_s = current_val;
scopeindex_t *depth_index_s = depth_index;
if (!found) {
// since the pointer has found nothing, we get back to the original position.
depth = depth_s;
location = location_s;
current_type = current_type_s;
current_val = current_val_s;
depthindex = depthindex_s;
}
rewind(); // The json pointer is used from the root of the document.
return found;
bool found = relative_move_to(pointer, length);
delete[] new_pointer;
if (!found) {
// since the pointer has found nothing, we get back to the original
// position.
depth = depth_s;
location = location_s;
current_type = current_type_s;
current_val = current_val_s;
depth_index = depth_index_s;
}
return found;
}
bool ParsedJson::iterator::relative_move_to(const char * pointer, uint32_t length) {
if (length == 0) {
// returns the whole document
return true;
}
bool ParsedJson::Iterator::relative_move_to(const char *pointer,
uint32_t length) {
if (length == 0) {
// returns the whole document
return true;
}
if (pointer[0] != '/') {
// '/' must be the first character
if (pointer[0] != '/') {
// '/' must be the first character
return false;
}
// finding the key in an object or the index in an array
std::string key_or_index;
uint32_t offset = 1;
// checking for the "-" case
if (is_array() && pointer[1] == '-') {
if (length != 2) {
// the pointer must be exactly "/-"
// there can't be anything more after '-' as an index
return false;
}
key_or_index = '-';
offset = length; // will skip the loop coming right after
}
// finding the key in an object or the index in an array
std::string key_or_index;
uint32_t offset = 1;
// We either transform the first reference token to a valid json key
// or we make sure it is a valid index in an array.
for (; offset < length; offset++) {
if (pointer[offset] == '/') {
// beginning of the next key or index
break;
}
if (is_array() && (pointer[offset] < '0' || pointer[offset] > '9')) {
// the index of an array must be an integer
// we also make sure std::stoi won't discard whitespaces later
return false;
}
if (pointer[offset] == '~') {
// "~1" represents "/"
if (pointer[offset + 1] == '1') {
key_or_index += '/';
offset++;
continue;
}
// "~0" represents "~"
if (pointer[offset + 1] == '0') {
key_or_index += '~';
offset++;
continue;
}
}
if (pointer[offset] == '\\') {
if (pointer[offset + 1] == '\\' || pointer[offset + 1] == '"' ||
(pointer[offset + 1] <= 0x1F)) {
key_or_index += pointer[offset + 1];
offset++;
continue;
}
return false; // invalid escaped character
}
if (pointer[offset] == '\"') {
// unescaped quote character. this is an invalid case.
// lets do nothing and assume most pointers will be valid.
// it won't find any corresponding json key anyway.
// return false;
}
key_or_index += pointer[offset];
}
// checking for the "-" case
if (is_array() && pointer[1] == '-') {
if (length != 2) {
// the pointer must be exactly "/-"
// there can't be anything more after '-' as an index
bool found = false;
if (is_object()) {
if (move_to_key(key_or_index.c_str(), key_or_index.length())) {
found = relative_move_to(pointer + offset, length - offset);
}
} else if (is_array()) {
if (key_or_index == "-") { // handling "-" case first
if (down()) {
while (next())
; // moving to the end of the array
// moving to the nonexistent value right after...
size_t npos;
if ((current_type == '[') || (current_type == '{')) {
// we need to jump
npos = (current_val & JSON_VALUE_MASK);
} else {
npos =
location + ((current_type == 'd' || current_type == 'l') ? 2 : 1);
}
location = npos;
current_val = pj.tape[npos];
current_type = (current_val >> 56);
return true; // how could it fail ?
}
} else { // regular numeric index
// The index can't have a leading '0'
if (key_or_index[0] == '0' && key_or_index.length() > 1) {
return false;
}
key_or_index = '-';
offset = length; // will skip the loop coming right after
}
// We either transform the first reference token to a valid json key
// or we make sure it is a valid index in an array.
for (; offset < length ; offset++) {
if (pointer[offset] == '/') {
// beginning of the next key or index
break;
}
if (is_array() && (pointer[offset] < '0' || pointer[offset] > '9')) {
// the index of an array must be an integer
// we also make sure std::stoi won't discard whitespaces later
// it cannot be empty
if (key_or_index.length() == 0) {
return false;
}
if (pointer[offset] == '~') {
// "~1" represents "/"
if (pointer[offset+1] == '1') {
key_or_index += '/';
offset++;
continue;
}
// "~0" represents "~"
if (pointer[offset+1] == '0') {
key_or_index += '~';
offset++;
continue;
}
// we already checked the index contains only valid digits
uint32_t index = std::stoi(key_or_index);
if (move_to_index(index)) {
found = relative_move_to(pointer + offset, length - offset);
}
if (pointer[offset] == '\\') {
if (pointer[offset+1] == '\\' || pointer[offset+1] == '"' || (pointer[offset+1] <= 0x1F)) {
key_or_index += pointer[offset+1];
offset++;
continue;
}
return false; // invalid escaped character
}
if (pointer[offset] == '\"') {
// unescaped quote character. this is an invalid case.
// lets do nothing and assume most pointers will be valid.
// it won't find any corresponding json key anyway.
// return false;
}
key_or_index += pointer[offset];
}
}
bool found = false;
if (is_object()) {
if (move_to_key(key_or_index.c_str(), key_or_index.length())) {
found = relative_move_to(pointer+offset, length-offset);
}
}
else if(is_array()) {
if (key_or_index == "-") { // handling "-" case first
if (down()) {
while(next()); // moving to the end of the array
// moving to the nonexistent value right after...
size_t npos;
if ((current_type == '[') || (current_type == '{')) {
// we need to jump
npos = ( current_val & JSONVALUEMASK);
} else {
npos = location + ((current_type == 'd' || current_type == 'l') ? 2 : 1);
}
location = npos;
current_val = pj.tape[npos];
current_type = (current_val >> 56);
return true; // how could it fail ?
}
} else { // regular numeric index
// The index can't have a leading '0'
if (key_or_index[0] == '0' && key_or_index.length() > 1) {
return false;
}
// it cannot be empty
if (key_or_index.length() == 0) {
return false;
}
// we already checked the index contains only valid digits
uint32_t index = std::stoi(key_or_index);
if (move_to_index(index)) {
found = relative_move_to(pointer+offset, length-offset);
}
}
}
return found;
}
return found;
}
} // namespace simdjson

View File

@ -1,25 +1,30 @@
#include <map>
#include "simdjson/simdjson.h"
#include <map>
namespace simdjson {
const std::map<int, const std::string> errorStrings = {
const std::map<int, const std::string> error_strings = {
{SUCCESS, "No errors"},
{CAPACITY, "This ParsedJson can't support a document that big"},
{MEMALLOC, "Error allocating memory, we're most likely out of memory"},
{TAPE_ERROR, "Something went wrong while writing to the tape"},
{STRING_ERROR, "Problem while parsing a string"},
{T_ATOM_ERROR, "Problem while parsing an atom starting with the letter 't'"},
{F_ATOM_ERROR, "Problem while parsing an atom starting with the letter 'f'"},
{N_ATOM_ERROR, "Problem while parsing an atom starting with the letter 'n'"},
{T_ATOM_ERROR,
"Problem while parsing an atom starting with the letter 't'"},
{F_ATOM_ERROR,
"Problem while parsing an atom starting with the letter 'f'"},
{N_ATOM_ERROR,
"Problem while parsing an atom starting with the letter 'n'"},
{NUMBER_ERROR, "Problem while parsing a number"},
{UTF8_ERROR, "The input is not valid UTF-8"},
{UNITIALIZED, "Unitialized"},
{EMPTY, "Empty"},
{UNESCAPED_CHARS, "Within strings, some characters must be escapted, we found unescapted characters"},
{UNEXPECTED_ERROR, "Unexpected error, consider reporting this problem as you may have found a bug in simdjson"},
{UNESCAPED_CHARS, "Within strings, some characters must be escapted, we "
"found unescapted characters"},
{UNEXPECTED_ERROR, "Unexpected error, consider reporting this problem as "
"you may have found a bug in simdjson"},
};
const std::string& errorMsg(const int errorCode) {
return errorStrings.at(errorCode);
}
// Returns the human-readable description stored in error_strings for
// error_code.
//
// Codes that have no entry in the table yield a generic fallback message
// rather than the std::out_of_range exception std::map::at() would throw, so
// that reporting an error can never itself abort the caller.
const std::string &error_message(const int error_code) {
  // A reference is returned, so the fallback text needs static storage.
  static const std::string unknown_error = "Unknown error code";
  const auto entry = error_strings.find(error_code);
  return entry != error_strings.end() ? entry->second : unknown_error;
}
} // namespace simdjson

View File

@ -1,37 +1,41 @@
#include "simdjson/portability.h"
#ifdef IS_X86_64
#include "simdjson/stage1_find_marks_haswell.h"
#include "simdjson/stage1_find_marks_westmere.h"
TARGET_HASWELL
namespace simdjson {
template<>
int find_structural_bits<architecture::haswell>(const uint8_t *buf, size_t len, ParsedJson &pj) {
FIND_STRUCTURAL_BITS(architecture::haswell, buf, len, pj, simdjson::haswell::flatten_bits);
// Explicit specialization of find_structural_bits for Architecture::HASWELL.
// The entire body comes from the FIND_STRUCTURAL_BITS macro, which is handed
// the input buffer, its length, the ParsedJson and the haswell flavour of
// flatten_bits.
// NOTE(review): there is no explicit return statement here; the macro is
// presumably expected to return the int result -- confirm against its
// definition.
template <>
int find_structural_bits<Architecture::HASWELL>(const uint8_t *buf, size_t len,
ParsedJson &pj) {
FIND_STRUCTURAL_BITS(Architecture::HASWELL, buf, len, pj,
simdjson::haswell::flatten_bits);
}
} // simdjson
} // namespace simdjson
UNTARGET_REGION
TARGET_WESTMERE
namespace simdjson {
template<>
int find_structural_bits<architecture::westmere>(const uint8_t *buf, size_t len, ParsedJson &pj) {
FIND_STRUCTURAL_BITS(architecture::westmere, buf, len, pj, simdjson::flatten_bits);
// Explicit specialization of find_structural_bits for Architecture::WESTMERE.
// The entire body comes from the FIND_STRUCTURAL_BITS macro, which is handed
// the input buffer, its length, the ParsedJson and simdjson::flatten_bits.
// NOTE(review): there is no explicit return statement here; the macro is
// presumably expected to return the int result -- confirm against its
// definition.
template <>
int find_structural_bits<Architecture::WESTMERE>(const uint8_t *buf, size_t len,
ParsedJson &pj) {
FIND_STRUCTURAL_BITS(Architecture::WESTMERE, buf, len, pj,
simdjson::flatten_bits);
}
} // simdjson
} // namespace simdjson
UNTARGET_REGION
#endif
#ifdef IS_ARM64
#include "simdjson/stage1_find_marks_arm64.h"
namespace simdjson {
template<>
int find_structural_bits<architecture::arm64>(const uint8_t *buf, size_t len, ParsedJson &pj) {
FIND_STRUCTURAL_BITS(architecture::arm64, buf, len, pj, simdjson::flatten_bits);
}
// Explicit specialization of find_structural_bits for Architecture::ARM64.
// The entire body comes from the FIND_STRUCTURAL_BITS macro, which is handed
// the input buffer, its length, the ParsedJson and simdjson::flatten_bits.
// NOTE(review): there is no explicit return statement here; the macro is
// presumably expected to return the int result -- confirm against its
// definition.
template <>
int find_structural_bits<Architecture::ARM64>(const uint8_t *buf, size_t len,
ParsedJson &pj) {
FIND_STRUCTURAL_BITS(Architecture::ARM64, buf, len, pj,
simdjson::flatten_bits);
}
} // namespace simdjson
#endif

File diff suppressed because it is too large Load Diff

View File

@ -40,7 +40,7 @@ using namespace rapidjson;
int main(int argc, char *argv[]) {
bool verbose = false;
bool justfavorites = false;
bool just_favorites = false;
int c;
while ((c = getopt(argc, argv, "vm")) != -1)
switch (c) {
@ -48,7 +48,7 @@ int main(int argc, char *argv[]) {
verbose = true;
break;
case 'm':
justfavorites = true;
just_favorites = true;
break;
default:
abort();
@ -77,8 +77,8 @@ int main(int argc, char *argv[]) {
std::cout << std::endl;
}
simdjson::ParsedJson pj;
size_t maxdepth = 1024 * 4;
bool allocok = pj.allocateCapacity(p.size(), maxdepth);
size_t max_depth = 1024 * 4;
bool allocok = pj.allocate_capacity(p.size(), max_depth);
if (!allocok) {
std::cerr << "can't allocate memory" << std::endl;
return EXIT_FAILURE;
@ -98,7 +98,7 @@ int main(int argc, char *argv[]) {
sajson::parse(sajson::dynamic_allocation(),
sajson::mutable_string_view(p.size(), buffer))
.is_valid();
if (justfavorites) {
if (just_favorites) {
printf("our parser : %s \n",
ours_correct ? "correct" : "invalid");
printf("rapid (check encoding) : %s \n",
@ -108,7 +108,7 @@ int main(int argc, char *argv[]) {
if (oursreturn == simdjson::DEPTH_ERROR) {
printf("simdjson encountered a DEPTH_ERROR, it was parametrized to "
"reject documents with depth exceeding %zu.\n",
maxdepth);
max_depth);
}
if ((ours_correct != rapid_correct_checkencoding) ||
(rapid_correct_checkencoding != sajson_correct) ||
@ -157,12 +157,12 @@ int main(int argc, char *argv[]) {
}
Json::CharReaderBuilder b;
Json::CharReader *jsoncppreader = b.newCharReader();
Json::CharReader *json_cpp_reader = b.newCharReader();
Json::Value root;
Json::String errs;
bool isjsoncppok =
jsoncppreader->parse(buffer, buffer + p.size(), &root, &errs);
delete jsoncppreader;
bool is_json_cpp_ok =
json_cpp_reader->parse(buffer, buffer + p.size(), &root, &errs);
delete json_cpp_reader;
printf("our parser : %s \n",
ours_correct ? "correct" : "invalid");
@ -185,7 +185,7 @@ int main(int argc, char *argv[]) {
printf("cjson : %s \n",
cjson_correct ? "correct" : "invalid");
printf("jsoncpp : %s \n",
isjsoncppok ? "correct" : "invalid");
is_json_cpp_ok ? "correct" : "invalid");
free(buffer);
return EXIT_SUCCESS;

View File

@ -15,10 +15,10 @@ bool skyprophet_test() {
std::vector<std::string> data;
char buf[1024];
for (size_t i = 0; i < n_records; ++i) {
auto n =
sprintf(buf, "{\"id\": %zu, \"name\": \"name%zu\", \"gender\": \"%s\", "
auto n = sprintf(buf,
"{\"id\": %zu, \"name\": \"name%zu\", \"gender\": \"%s\", "
"\"school\": {\"id\": %zu, \"name\": \"school%zu\"}}",
i, i, (i % 2) ? "male" : "female", i % 10, i % 10);
i, i, (i % 2) ? "male" : "female", i % 10, i % 10);
data.emplace_back(std::string(buf, n));
}
for (size_t i = 0; i < n_records; ++i) {
@ -40,7 +40,7 @@ bool skyprophet_test() {
maxsize = s.size();
}
simdjson::ParsedJson pj;
if (!pj.allocateCapacity(maxsize)) {
if (!pj.allocate_capacity(maxsize)) {
printf("allocation failure in skyprophet_test\n");
return false;
}
@ -52,12 +52,12 @@ bool skyprophet_test() {
}
counter++;
auto ok1 = json_parse(rec.c_str(), rec.length(), pj);
if (ok1 != 0 || !pj.isValid()) {
if (ok1 != 0 || !pj.is_valid()) {
printf("Something is wrong in skyprophet_test: %s.\n", rec.c_str());
return false;
}
auto ok2 = json_parse(rec, pj);
if (ok2 != 0 || !pj.isValid()) {
if (ok2 != 0 || !pj.is_valid()) {
printf("Something is wrong in skyprophet_test: %s.\n", rec.c_str());
return false;
}

View File

@ -17,14 +17,14 @@
/**
* Does the file name end with the given extension?
*/
static bool hasExtension(const char *filename, const char *extension) {
// Reports whether the text starting at the last '.' in filename is exactly
// `extension` (the extension argument includes the dot, e.g. ".json").
// A filename without any '.' never matches.
static bool has_extension(const char *filename, const char *extension) {
  const char *last_dot = strrchr(filename, '.');
  if (last_dot == nullptr) {
    return false;
  }
  return strcmp(last_dot, extension) == 0;
}
bool startsWith(const char *pre, const char *str) {
size_t lenpre = strlen(pre), lenstr = strlen(str);
return lenstr < lenpre ? false : strncmp(pre, str, lenpre) == 0;
// Reports whether str begins with the prefix pre.
// A string shorter than the prefix can never match.
bool starts_with(const char *pre, const char *str) {
  const size_t prefix_length = strlen(pre);
  if (strlen(str) < prefix_length) {
    return false;
  }
  return strncmp(pre, str, prefix_length) == 0;
}
bool contains(const char *pre, const char *str) {
@ -32,7 +32,7 @@ bool contains(const char *pre, const char *str) {
}
bool validate(const char *dirname) {
bool everythingfine = true;
bool everything_fine = true;
const char *extension = ".json";
size_t dirlen = strlen(dirname);
struct dirent **entry_list;
@ -45,15 +45,15 @@ bool validate(const char *dirname) {
printf("nothing in dir %s \n", dirname);
return false;
}
bool *isfileasexpected = new bool[c];
bool *is_file_as_expected = new bool[c];
for (int i = 0; i < c; i++) {
isfileasexpected[i] = true;
is_file_as_expected[i] = true;
}
size_t howmany = 0;
size_t how_many = 0;
bool needsep = (strlen(dirname) > 1) && (dirname[strlen(dirname) - 1] != '/');
for (int i = 0; i < c; i++) {
const char *name = entry_list[i]->d_name;
if (hasExtension(name, extension)) {
if (has_extension(name, extension)) {
printf("validating: file %s ", name);
fflush(nullptr);
size_t filelen = strlen(name);
@ -73,38 +73,38 @@ bool validate(const char *dirname) {
return EXIT_FAILURE;
}
simdjson::ParsedJson pj;
bool allocok = pj.allocateCapacity(p.size(), 1024);
bool allocok = pj.allocate_capacity(p.size(), 1024);
if (!allocok) {
std::cerr << "can't allocate memory" << std::endl;
return false;
}
++howmany;
const int parseRes = json_parse(p, pj);
printf("%s\n", parseRes == 0 ? "ok" : "invalid");
++how_many;
const int parse_res = json_parse(p, pj);
printf("%s\n", parse_res == 0 ? "ok" : "invalid");
if (contains("EXCLUDE", name)) {
// skipping
howmany--;
} else if (startsWith("pass", name) && parseRes != 0) {
isfileasexpected[i] = false;
how_many--;
} else if (starts_with("pass", name) && parse_res != 0) {
is_file_as_expected[i] = false;
printf("warning: file %s should pass but it fails. Error is: %s\n",
name, simdjson::errorMsg(parseRes).data());
everythingfine = false;
} else if (startsWith("fail", name) && parseRes == 0) {
isfileasexpected[i] = false;
name, simdjson::error_message(parse_res).data());
everything_fine = false;
} else if (starts_with("fail", name) && parse_res == 0) {
is_file_as_expected[i] = false;
printf("warning: file %s should fail but it passes.\n", name);
everythingfine = false;
everything_fine = false;
}
free(fullpath);
}
}
printf("%zu files checked.\n", howmany);
if (everythingfine) {
printf("%zu files checked.\n", how_many);
if (everything_fine) {
printf("All ok!\n");
} else {
fprintf(stderr,
"There were problems! Consider reviewing the following files:\n");
for (int i = 0; i < c; i++) {
if (!isfileasexpected[i]) {
if (!is_file_as_expected[i]) {
fprintf(stderr, "%s \n", entry_list[i]->d_name);
}
}
@ -113,8 +113,8 @@ bool validate(const char *dirname) {
free(entry_list[i]);
}
free(entry_list);
delete[] isfileasexpected;
return everythingfine;
delete[] is_file_as_expected;
return everything_fine;
}
int main(int argc, char *argv[]) {

View File

@ -13,31 +13,30 @@
#include "simdjson/common_defs.h"
// ulp distance
// ulp distance
// Marc B. Reynolds, 2016-2019
// Public Domain under http://unlicense.org, see link for details.
// adapted by D. Lemire
// Distance between two floats measured on their raw 32-bit patterns.
// Same-sign inputs yield the absolute difference of the patterns; for
// opposite-sign inputs the two magnitudes are summed (the added 0x80000000
// cancels the one sign bit that is set, modulo 2^32).
inline uint32_t f32_ulp_dist(float a, float b) {
  uint32_t bits_a;
  uint32_t bits_b;
  memcpy(&bits_a, &a, sizeof(bits_a));
  memcpy(&bits_b, &b, sizeof(bits_b));
  // The top bit of the xor is set exactly when the sign bits differ.
  const bool signs_differ = static_cast<int32_t>(bits_b ^ bits_a) < 0;
  if (signs_differ) {
    return bits_a + bits_b + 0x80000000;
  }
  const uint32_t forward = bits_a - bits_b;
  if (static_cast<int32_t>(forward) >= 0) {
    return forward;
  }
  return bits_b - bits_a;
}
// ulp distance
// ulp distance
// Marc B. Reynolds, 2016-2019
// Public Domain under http://unlicense.org, see link for details.
// adapted by D. Lemire
// Distance, in units in the last place, between two doubles, computed on
// their raw 64-bit patterns.
//
// Same-sign inputs: the absolute difference of the two bit patterns.
// Opposite-sign inputs: the sum of the two magnitudes. Exactly one pattern has
// the sign bit (bit 63) set, so adding 2^63 cancels it modulo 2^64. The
// constant must therefore be the 64-bit sign bit, not the 32-bit 0x80000000
// used by the float version.
inline uint64_t f64_ulp_dist(double a, double b) {
  uint64_t ua, ub;
  memcpy(&ua, &a, sizeof(ua));
  memcpy(&ub, &b, sizeof(ub));
  // The top bit of the xor is clear exactly when both signs agree.
  if (static_cast<int64_t>(ub ^ ua) >= 0) {
    return static_cast<int64_t>(ua - ub) >= 0 ? (ua - ub) : (ub - ua);
  }
  return ua + ub + (uint64_t{1} << 63);
}
int parse_error;
@ -51,7 +50,7 @@ size_t invalid_count;
// strings that start with these should not be parsed as numbers
const char *really_bad[] = {"013}", "0x14", "0e]", "0e+]", "0e+-1]"};
bool startsWith(const char *pre, const char *str) {
// Reports whether str begins with the prefix pre: the first strlen(pre)
// characters of both strings are compared.
bool starts_with(const char *pre, const char *str) {
  return strncmp(pre, str, strlen(pre)) == 0;
}
@ -60,27 +59,27 @@ bool is_in_bad_list(const char *buf) {
if (buf[0] != '0')
return false;
for (size_t i = 0; i < sizeof(really_bad) / sizeof(really_bad[0]); i++)
if (startsWith(really_bad[i], buf))
if (starts_with(really_bad[i], buf))
return true;
return false;
}
void foundInvalidNumber(const uint8_t *buf) {
void found_invalid_number(const uint8_t *buf) {
invalid_count++;
char *endptr;
double expected = strtod((const char *)buf, &endptr);
if (endptr != (const char *)buf) {
if (!is_in_bad_list((const char *)buf)) {
printf(
"Warning: foundInvalidNumber %.32s whereas strtod parses it to %f, ",
buf, expected);
printf("Warning: found_invalid_number %.32s whereas strtod parses it to "
"%f, ",
buf, expected);
printf(" while parsing %s \n", fullpath);
parse_error |= PARSE_WARNING;
}
}
}
void foundInteger(int64_t result, const uint8_t *buf) {
void found_integer(int64_t result, const uint8_t *buf) {
int_count++;
char *endptr;
long long expected = strtoll((const char *)buf, &endptr, 10);
@ -91,7 +90,7 @@ void foundInteger(int64_t result, const uint8_t *buf) {
}
}
void foundFloat(double result, const uint8_t *buf) {
void found_float(double result, const uint8_t *buf) {
char *endptr;
float_count++;
double expected = strtod((const char *)buf, &endptr);
@ -111,8 +110,8 @@ void foundFloat(double result, const uint8_t *buf) {
return;
}
// we want to get some reasonable relative accuracy
uint64_t ULP = f64_ulp_dist(expected,result);
if (f64_ulp_dist(expected,result) > 1) {
uint64_t ULP = f64_ulp_dist(expected, result);
if (f64_ulp_dist(expected, result) > 1) {
fprintf(stderr, "parsed %.128e from \n", result);
fprintf(stderr, " %.32s whereas strtod gives\n", buf);
fprintf(stderr, " %.128e,", expected);
@ -128,7 +127,7 @@ void foundFloat(double result, const uint8_t *buf) {
/**
* Does the file name end with the given extension?
*/
static bool hasExtension(const char *filename, const char *extension) {
// Reports whether the text starting at the last '.' in filename is exactly
// `extension` (dot included). Names containing no '.' never match.
static bool has_extension(const char *filename, const char *extension) {
  const char *suffix = strrchr(filename, '.');
  if (!suffix) {
    return false;
  }
  return strcmp(suffix, extension) == 0;
}
@ -151,7 +150,7 @@ bool validate(const char *dirname) {
bool needsep = (strlen(dirname) > 1) && (dirname[strlen(dirname) - 1] != '/');
for (int i = 0; i < c; i++) {
const char *name = entry_list[i]->d_name;
if (hasExtension(name, extension)) {
if (has_extension(name, extension)) {
size_t filelen = strlen(name);
fullpath = (char *)malloc(dirlen + filelen + 1 + 1);
strcpy(fullpath, dirname);
@ -170,7 +169,7 @@ bool validate(const char *dirname) {
}
// terrible hack but just to get it working
simdjson::ParsedJson pj;
bool allocok = pj.allocateCapacity(p.size(), 1024);
bool allocok = pj.allocate_capacity(p.size(), 1024);
if (!allocok) {
std::cerr << "can't allocate memory" << std::endl;
return false;

View File

@ -4,34 +4,35 @@
#include "simdjson/parsedjson.h"
int main() {
// {"/~01abc": [0, {"\\\" 0": ["value0", "value1"]}]}"
std::string json = "{\"/~01abc\": [0, {\"\\\\\\\" 0\": [\"value0\", \"value1\"]}]}";
simdjson::ParsedJson pj;
assert(pj.allocateCapacity(json.length()));
simdjson::json_parse(json.c_str(), json.length(), pj);
assert(pj.isValid());
simdjson::ParsedJson::iterator it(pj);
// {"/~01abc": [0, {"\\\" 0": ["value0", "value1"]}]}"
std::string json =
"{\"/~01abc\": [0, {\"\\\\\\\" 0\": [\"value0\", \"value1\"]}]}";
simdjson::ParsedJson pj;
assert(pj.allocate_capacity(json.length()));
simdjson::json_parse(json.c_str(), json.length(), pj);
assert(pj.is_valid());
simdjson::ParsedJson::Iterator it(pj);
// valid JSON String Representation pointer
std::string pointer1("/~1~001abc/1/\\\\\\\" 0/0");
assert(it.move_to(pointer1.c_str(), pointer1.length()));
assert(it.is_string());
assert(it.get_string() == std::string("value0"));
// valid JSON String Representation pointer
std::string pointer1("/~1~001abc/1/\\\\\\\" 0/0");
assert(it.move_to(pointer1.c_str(), pointer1.length()));
assert(it.is_string());
assert(it.get_string() == std::string("value0"));
// valid URI Fragment Identifier Representation pointer
std::string pointer2("#/~1~001abc/1/%x5C%x22%x200/1");
assert(it.move_to(pointer2.c_str(), pointer2.length()));
assert(it.is_string());
assert(it.get_string() == std::string("value1"));
// valid URI Fragment Identifier Representation pointer
std::string pointer2("#/~1~001abc/1/%x5C%x22%x200/1");
assert(it.move_to(pointer2.c_str(), pointer2.length()));
assert(it.is_string());
assert(it.get_string() == std::string("value1"));
// invalid pointer with leading 0 in index
std::string pointer3("#/~1~001abc/01");
assert(!it.move_to(pointer3.c_str(), pointer3.length())); // failed
assert(it.is_string()); // has probably not moved
assert(it.get_string() == std::string("value1")); // has not move
// invalid pointer with leading 0 in index
std::string pointer3("#/~1~001abc/01");
assert(!it.move_to(pointer3.c_str(), pointer3.length())); // failed
assert(it.is_string()); // has probably not moved
assert(it.get_string() == std::string("value1")); // has not move
// "the (nonexistent) member after the last array element"
std::string pointer4("/~1~001abc/-");
assert(it.move_to(pointer4.c_str(), pointer4.length()));
assert(it.get_type() == ']');
// "the (nonexistent) member after the last array element"
std::string pointer4("/~1~001abc/-");
assert(it.move_to(pointer4.c_str(), pointer4.length()));
assert(it.get_type() == ']');
}

View File

@ -7,15 +7,15 @@ int main() {
const char *filename = JSON_TEST_PATH;
padded_string p = get_corpus(filename);
ParsedJson pj = build_parsed_json(p); // do the parsing
if (!pj.isValid()) {
if (!pj.is_valid()) {
return EXIT_FAILURE;
}
if (!pj.allocateCapacity(p.size())) {
if (!pj.allocate_capacity(p.size())) {
return EXIT_FAILURE;
}
const int res = json_parse(p, pj);
if (res) {
std::cerr << errorMsg(res) << std::endl;
std::cerr << error_message(res) << std::endl;
return EXIT_FAILURE;
}
return EXIT_SUCCESS;

View File

@ -1,8 +1,8 @@
#include <assert.h>
#include <climits>
#include <cstring>
#include <dirent.h>
#include <inttypes.h>
#include <climits>
#include <iostream>
#include <math.h>
#include <stdbool.h>
@ -72,7 +72,7 @@ static bool parse_string(const char *p, char *output, char **end) {
for (;;) {
#if (CHAR_MIN < 0) || (!defined(CHAR_MIN)) // the '!defined' is just paranoia
// in this path, char is *signed*
// in this path, char is *signed*
if ((*p >= 0 && *p < 0x20)) {
return false; // unescaped
}
@ -209,12 +209,12 @@ static bool parse_string(const char *p, char *output, char **end) {
}
}
// end of borrowed code
char *bigbuffer; // global variable
char *big_buffer; // global variable
void foundBadString(const uint8_t *buf) {
void found_bad_string(const uint8_t *buf) {
bad_string++;
char *end;
if (parse_string((const char *)buf, bigbuffer, &end)) {
if (parse_string((const char *)buf, big_buffer, &end)) {
printf("WARNING: Sajson-like parser seems to think that the string is "
"valid %32s \n",
buf);
@ -234,18 +234,18 @@ void print_cmp_hex(const char *s1, const char *s2, size_t len) {
}
}
void foundString(const uint8_t *buf, const uint8_t *parsed_begin,
const uint8_t *parsed_end) {
size_t thislen = parsed_end - parsed_begin;
total_string_length += thislen;
void found_string(const uint8_t *buf, const uint8_t *parsed_begin,
const uint8_t *parsed_end) {
size_t this_len = parsed_end - parsed_begin;
total_string_length += this_len;
good_string++;
char *end = NULL;
if (!parse_string((const char *)buf, bigbuffer, &end)) {
if (!parse_string((const char *)buf, big_buffer, &end)) {
printf("WARNING: reference parser seems to think that the string is NOT "
"valid %32s \n",
buf);
}
if (end == bigbuffer) {
if (end == big_buffer) {
// we have a zero-length string
if (parsed_begin != parsed_end) {
printf("WARNING: We have a zero-length but gap is %zu \n",
@ -255,35 +255,35 @@ void foundString(const uint8_t *buf, const uint8_t *parsed_begin,
empty_string++;
return;
}
size_t len = end - bigbuffer;
if (len != thislen) {
printf("WARNING: lengths on parsed strings disagree %zu %zu \n", thislen,
size_t len = end - big_buffer;
if (len != this_len) {
printf("WARNING: lengths on parsed strings disagree %zu %zu \n", this_len,
len);
printf("\nour parsed string : '%*s'\n\n", (int)thislen,
printf("\nour parsed string : '%*s'\n\n", (int)this_len,
(const char *)parsed_begin);
print_hex((const char *)parsed_begin, thislen);
print_hex((const char *)parsed_begin, this_len);
printf("\n");
printf("reference parsing :'%*s'\n\n", (int)len, bigbuffer);
print_hex((const char *)bigbuffer, len);
printf("reference parsing :'%*s'\n\n", (int)len, big_buffer);
print_hex((const char *)big_buffer, len);
printf("\n");
probable_bug = true;
}
if (memcmp(bigbuffer, parsed_begin, thislen) != 0) {
if (memcmp(big_buffer, parsed_begin, this_len) != 0) {
printf("WARNING: parsed strings disagree \n");
printf("Lengths %zu %zu \n", thislen, len);
printf("Lengths %zu %zu \n", this_len, len);
printf("\nour parsed string : '%*s'\n", (int)thislen,
printf("\nour parsed string : '%*s'\n", (int)this_len,
(const char *)parsed_begin);
print_hex((const char *)parsed_begin, thislen);
print_hex((const char *)parsed_begin, this_len);
printf("\n");
printf("reference parsing :'%*s'\n", (int)len, bigbuffer);
print_hex((const char *)bigbuffer, len);
printf("reference parsing :'%*s'\n", (int)len, big_buffer);
print_hex((const char *)big_buffer, len);
printf("\n");
print_cmp_hex((const char *)parsed_begin, bigbuffer, thislen);
print_cmp_hex((const char *)parsed_begin, big_buffer, this_len);
probable_bug = true;
}
@ -295,12 +295,12 @@ void foundString(const uint8_t *buf, const uint8_t *parsed_begin,
/**
* Does the file name end with the given extension?
*/
static bool hasExtension(const char *filename, const char *extension) {
// True when filename contains a '.' and everything from its last '.' onward
// equals `extension` (which is given with its leading dot).
static bool has_extension(const char *filename, const char *extension) {
  const char *tail = strrchr(filename, '.');
  const bool has_dot = (tail != nullptr);
  return has_dot && strcmp(tail, extension) == 0;
}
bool startsWith(const char *pre, const char *str) {
// Reports whether str begins with the prefix pre; a str shorter than pre is
// rejected up front.
bool starts_with(const char *pre, const char *str) {
  const size_t wanted = strlen(pre);
  const size_t available = strlen(str);
  if (available < wanted) {
    return false;
  }
  return strncmp(pre, str, wanted) == 0;
}
@ -323,7 +323,7 @@ bool validate(const char *dirname) {
bool needsep = (strlen(dirname) > 1) && (dirname[strlen(dirname) - 1] != '/');
for (int i = 0; i < c; i++) {
const char *name = entry_list[i]->d_name;
if (hasExtension(name, extension)) {
if (has_extension(name, extension)) {
size_t filelen = strlen(name);
fullpath = (char *)malloc(dirlen + filelen + 1 + 1);
strcpy(fullpath, dirname);
@ -341,13 +341,13 @@ bool validate(const char *dirname) {
return EXIT_FAILURE;
}
simdjson::ParsedJson pj;
bool allocok = pj.allocateCapacity(p.size(), 1024);
bool allocok = pj.allocate_capacity(p.size(), 1024);
if (!allocok) {
std::cerr << "can't allocate memory" << std::endl;
return false;
}
bigbuffer = (char *)malloc(p.size());
if (bigbuffer == NULL) {
big_buffer = (char *)malloc(p.size());
if (big_buffer == NULL) {
std::cerr << "can't allocate memory" << std::endl;
return false;
}
@ -356,7 +356,7 @@ bool validate(const char *dirname) {
total_string_length = 0;
empty_string = 0;
bool isok = json_parse(p, pj);
free(bigbuffer);
free(big_buffer);
if (good_string > 0) {
printf("File %40s %s --- bad strings: %10zu \tgood strings: %10zu\t "
"empty strings: %10zu "

View File

@ -5,7 +5,7 @@
#include "simdjson/jsonioutil.h"
#include "simdjson/jsonparser.h"
void compute_dump(simdjson::ParsedJson::iterator &pjh) {
void compute_dump(simdjson::ParsedJson::Iterator &pjh) {
if (pjh.is_object()) {
std::cout << "{";
if (pjh.down()) {
@ -40,8 +40,8 @@ void compute_dump(simdjson::ParsedJson::iterator &pjh) {
}
int main(int argc, char *argv[]) {
bool rawdump = false;
bool apidump = false;
bool rawdump = false;
bool apidump = false;
#ifndef _MSC_VER
int c;
@ -57,7 +57,7 @@ int main(int argc, char *argv[]) {
default:
abort();
}
}
}
#else
int optind = 1;
#endif
@ -70,7 +70,8 @@ int main(int argc, char *argv[]) {
}
const char *filename = argv[optind];
if (optind + 1 < argc) {
std::cerr << "warning: ignoring everything after " << argv[optind + 1] << std::endl;
std::cerr << "warning: ignoring everything after " << argv[optind + 1]
<< std::endl;
}
simdjson::padded_string p;
try {
@ -80,25 +81,28 @@ int main(int argc, char *argv[]) {
return EXIT_FAILURE;
}
simdjson::ParsedJson pj;
bool allocok = pj.allocateCapacity(p.size(), 1024);
bool allocok = pj.allocate_capacity(p.size(), 1024);
if (!allocok) {
std::cerr << "failed to allocate memory" << std::endl;
return EXIT_FAILURE;
}
int res = simdjson::json_parse(p, pj); // do the parsing, return false on error
int res =
simdjson::json_parse(p, pj); // do the parsing, return false on error
if (res != simdjson::SUCCESS) {
std::cerr << " Parsing failed. Error is '" << simdjson::errorMsg(res) << "'." << std::endl;
std::cerr << " Parsing failed. Error is '" << simdjson::error_message(res)
<< "'." << std::endl;
return EXIT_FAILURE;
}
if (apidump) {
simdjson::ParsedJson::iterator pjh(pj);
if (!pjh.isOk()) {
simdjson::ParsedJson::Iterator pjh(pj);
if (!pjh.is_ok()) {
std::cerr << " Could not iterate parsed result. " << std::endl;
return EXIT_FAILURE;
}
compute_dump(pjh);
} else {
const bool is_ok = rawdump ? pj.dump_raw_tape(std::cout) : pj.printjson(std::cout);
const bool is_ok =
rawdump ? pj.dump_raw_tape(std::cout) : pj.print_json(std::cout);
if (!is_ok) {
std::cerr << " Could not print out parsed result. " << std::endl;
return EXIT_FAILURE;

View File

@ -1,9 +1,8 @@
#include <iostream>
#include "simdjson/jsonioutil.h"
#include "simdjson/jsonparser.h"
#include <iostream>
void compute_dump(simdjson::ParsedJson::iterator &pjh) {
void compute_dump(simdjson::ParsedJson::Iterator &pjh) {
if (pjh.is_object()) {
std::cout << "{";
if (pjh.down()) {
@ -40,9 +39,16 @@ void compute_dump(simdjson::ParsedJson::iterator &pjh) {
int main(int argc, char *argv[]) {
if (argc < 3) {
std::cerr << "Usage: " << argv[0] << " <jsonfile> <jsonpath>" << std::endl;
std::cerr << "Follows the rfc6901 standard's syntax: https://tools.ietf.org/html/rfc6901" << std::endl;
std::cerr << " Example: " << argv[0] << " jsonexamples/small/demo.json /Image/Width /Image/Height /Image/IDs/2 " << std::endl;
std::cerr << "Multiple <jsonpath> can be issued in the same command, but at least one is needed." << std::endl;
std::cerr << "Follows the rfc6901 standard's syntax: "
"https://tools.ietf.org/html/rfc6901"
<< std::endl;
std::cerr << " Example: " << argv[0]
<< " jsonexamples/small/demo.json /Image/Width /Image/Height "
"/Image/IDs/2 "
<< std::endl;
std::cerr << "Multiple <jsonpath> can be issued in the same command, but "
"at least one is needed."
<< std::endl;
exit(1);
}
const char *filename = argv[1];
@ -54,31 +60,33 @@ int main(int argc, char *argv[]) {
return EXIT_FAILURE;
}
simdjson::ParsedJson pj;
bool allocok = pj.allocateCapacity(p.size(), 1024);
bool allocok = pj.allocate_capacity(p.size(), 1024);
if (!allocok) {
std::cerr << "failed to allocate memory" << std::endl;
return EXIT_FAILURE;
}
int res = simdjson::json_parse(p, pj); // do the parsing, return false on error
int res =
simdjson::json_parse(p, pj); // do the parsing, return false on error
if (res) {
std::cerr << " Parsing failed with error " << simdjson::errorMsg(res) << std::endl;
std::cerr << " Parsing failed with error " << simdjson::error_message(res)
<< std::endl;
return EXIT_FAILURE;
}
std::cout << "[" << std::endl;
for(int idx = 2; idx < argc; idx++) {
const char * jsonpath = argv[idx];
simdjson::ParsedJson::iterator it(pj);
if(it.move_to(std::string(jsonpath))) {
std::cout << "{\"jsonpath\": \"" << jsonpath << "\"," << std::endl;
std::cout << "\"value\":";
compute_dump(it);
std::cout << "}" << std::endl;
} else {
std::cout << "null" << std::endl;
}
if(idx + 1 < argc) {
std::cout << "," << std::endl;
}
for (int idx = 2; idx < argc; idx++) {
const char *jsonpath = argv[idx];
simdjson::ParsedJson::Iterator it(pj);
if (it.move_to(std::string(jsonpath))) {
std::cout << "{\"jsonpath\": \"" << jsonpath << "\"," << std::endl;
std::cout << "\"value\":";
compute_dump(it);
std::cout << "}" << std::endl;
} else {
std::cout << "null" << std::endl;
}
if (idx + 1 < argc) {
std::cout << "," << std::endl;
}
}
std::cout << "]" << std::endl;
return EXIT_SUCCESS;

View File

@ -3,30 +3,28 @@
#include "simdjson/jsonioutil.h"
#include "simdjson/jsonparser.h"
size_t count_nonasciibytes(const uint8_t* input, size_t length) {
size_t count_nonasciibytes(const uint8_t *input, size_t length) {
size_t count = 0;
for(size_t i = 0; i < length; i++) {
for (size_t i = 0; i < length; i++) {
count += input[i] >> 7;
}
return count;
}
size_t count_backslash(const uint8_t* input, size_t length) {
size_t count = 0;
for(size_t i = 0; i < length; i++) {
count += (input[i] == '\\') ? 1 : 0;
}
return count;
}
// Counts how many of the first `length` bytes of `input` are the backslash
// character.
size_t count_backslash(const uint8_t *input, size_t length) {
  size_t backslashes = 0;
  const uint8_t *const end = input + length;
  for (const uint8_t *cursor = input; cursor != end; ++cursor) {
    if (*cursor == '\\') {
      ++backslashes;
    }
  }
  return backslashes;
}
struct stat_s {
size_t integer_count;
size_t float_count;
size_t string_count;
size_t backslash_count;
size_t nonasciibyte_count;
size_t non_ascii_byte_count;
size_t object_count;
size_t array_count;
size_t null_count;
@ -39,18 +37,18 @@ struct stat_s {
using stat_t = struct stat_s;
stat_t simdjson_computestats(const simdjson::padded_string &p) {
stat_t simdjson_compute_stats(const simdjson::padded_string &p) {
stat_t answer;
simdjson::ParsedJson pj = simdjson::build_parsed_json(p);
answer.valid = pj.isValid();
answer.valid = pj.is_valid();
if (!answer.valid) {
std::cerr << pj.getErrorMsg() << std::endl;
std::cerr << pj.get_error_message() << std::endl;
return answer;
}
answer.backslash_count = count_backslash(reinterpret_cast<const uint8_t*>(p.data()), p.size());
answer.nonasciibyte_count = count_nonasciibytes(reinterpret_cast<const uint8_t*>(p.data()), p.size());
answer.backslash_count =
count_backslash(reinterpret_cast<const uint8_t *>(p.data()), p.size());
answer.non_ascii_byte_count = count_nonasciibytes(
reinterpret_cast<const uint8_t *>(p.data()), p.size());
answer.byte_count = p.size();
answer.integer_count = 0;
answer.float_count = 0;
@ -61,24 +59,24 @@ stat_t simdjson_computestats(const simdjson::padded_string &p) {
answer.false_count = 0;
answer.string_count = 0;
answer.structural_indexes_count = pj.n_structural_indexes;
size_t tapeidx = 0;
uint64_t tape_val = pj.tape[tapeidx++];
size_t tape_idx = 0;
uint64_t tape_val = pj.tape[tape_idx++];
uint8_t type = (tape_val >> 56);
size_t howmany = 0;
size_t how_many = 0;
assert(type == 'r');
howmany = tape_val & JSONVALUEMASK;
for (; tapeidx < howmany; tapeidx++) {
tape_val = pj.tape[tapeidx];
// uint64_t payload = tape_val & JSONVALUEMASK;
how_many = tape_val & JSON_VALUE_MASK;
for (; tape_idx < how_many; tape_idx++) {
tape_val = pj.tape[tape_idx];
// uint64_t payload = tape_val & JSON_VALUE_MASK;
type = (tape_val >> 56);
switch (type) {
case 'l': // we have a long int
answer.integer_count++;
tapeidx++; // skipping the integer
tape_idx++; // skipping the integer
break;
case 'd': // we have a double
answer.float_count++;
tapeidx++; // skipping the double
tape_idx++; // skipping the double
break;
case 'n': // we have a null
answer.null_count++;
@ -109,12 +107,6 @@ stat_t simdjson_computestats(const simdjson::padded_string &p) {
return answer;
}
int main(int argc, char *argv[]) {
int myoptind = 1;
if (myoptind >= argc) {
@ -124,7 +116,8 @@ int main(int argc, char *argv[]) {
}
const char *filename = argv[myoptind];
if (myoptind + 1 < argc) {
std::cerr << "warning: ignoring everything after " << argv[myoptind + 1] << std::endl;
std::cerr << "warning: ignoring everything after " << argv[myoptind + 1]
<< std::endl;
}
simdjson::padded_string p;
try {
@ -133,16 +126,18 @@ int main(int argc, char *argv[]) {
std::cerr << "Could not load the file " << filename << std::endl;
return EXIT_FAILURE;
}
stat_t s = simdjson_computestats(p);
if(!s.valid) {
stat_t s = simdjson_compute_stats(p);
if (!s.valid) {
std::cerr << "not a valid JSON" << std::endl;
return EXIT_FAILURE;
}
printf("# integer_count float_count string_count backslash_count nonasciibyte_count object_count array_count null_count true_count false_count byte_count structural_indexes_count\n");
printf("%zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu\n", s.integer_count, s.float_count,
s.string_count, s.backslash_count, s.nonasciibyte_count, s.object_count, s.array_count,
s.null_count, s.true_count, s.false_count, s.byte_count, s.structural_indexes_count);
printf("# integer_count float_count string_count backslash_count "
"non_ascii_byte_count object_count array_count null_count true_count "
"false_count byte_count structural_indexes_count\n");
printf("%zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu\n", s.integer_count,
s.float_count, s.string_count, s.backslash_count,
s.non_ascii_byte_count, s.object_count, s.array_count, s.null_count,
s.true_count, s.false_count, s.byte_count, s.structural_indexes_count);
return EXIT_SUCCESS;
}

View File

@ -10,12 +10,12 @@ int main(int argc, char *argv[]) {
}
simdjson::padded_string p;
std::string filename = argv[argc - 1];
try{
try {
simdjson::get_corpus(filename).swap(p);
} catch (const std::exception& e) {
std::cout << "Could not load the file " << filename << std::endl;
return EXIT_FAILURE;
} catch (const std::exception &e) {
std::cout << "Could not load the file " << filename << std::endl;
return EXIT_FAILURE;
}
simdjson::jsonminify(p, p.data());
printf("%s",p.data());
simdjson::json_minify(p, p.data());
printf("%s", p.data());
}