diff --git a/README.md b/README.md index caf8e554..383f94aa 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ Under Windows, we build some tools using the windows/dirent_portable.h file (whi ## Code usage and example -The main API involves populating a `ParsedJson` object which hosts a fully navigable document-object-model (DOM) view of the JSON document. The DOM can be accessed using [JSON Pointer](https://tools.ietf.org/html/rfc6901) paths, for example. The main function is `json_parse` which takes a string containing the JSON document as well as a reference to pre-allocated `ParsedJson` object (which can be reused multiple time). Once you have populated the `ParsedJson` object you can navigate through the DOM with an iterator (e.g., created by `ParsedJson::iterator pjh(pj)`, see 'Navigating the parsed document'). +The main API involves populating a `ParsedJson` object which hosts a fully navigable document-object-model (DOM) view of the JSON document. The DOM can be accessed using [JSON Pointer](https://tools.ietf.org/html/rfc6901) paths, for example. The main function is `json_parse` which takes a string containing the JSON document as well as a reference to pre-allocated `ParsedJson` object (which can be reused multiple time). Once you have populated the `ParsedJson` object you can navigate through the DOM with an iterator (e.g., created by `ParsedJson::Iterator pjh(pj)`, see 'Navigating the parsed document'). ```C #include "simdjson/jsonparser.h" @@ -80,12 +80,12 @@ const char * filename = ... // // use whatever means you want to get a string (UTF-8) of your JSON document padded_string p = get_corpus(filename); ParsedJson pj; -pj.allocateCapacity(p.size()); // allocate memory for parsing up to p.size() bytes +pj.allocate_capacity(p.size()); // allocate memory for parsing up to p.size() bytes const int res = json_parse(p, pj); // do the parsing, return 0 on success // parsing is done! if (res != 0) { // You can use the "simdjson/simdjson.h" header to access the error message - std::cout << "Error parsing:" << simdjson::errorMsg(res) << std::endl; + std::cout << "Error parsing:" << simdjson::error_message(res) << std::endl; } // the ParsedJson document can be used here // pj can be reused with other json_parse calls. @@ -103,9 +103,9 @@ using namespace simdjson; const char * filename = ... // padded_string p = get_corpus(filename); ParsedJson pj = build_parsed_json(p); // do the parsing -if( ! pj.isValid() ) { +if( ! pj.is_valid() ) { // something went wrong - std::cout << pj.getErrorMsg() << std::endl; + std::cout << pj.get_error_message() << std::endl; } ``` @@ -119,13 +119,13 @@ using namespace simdjson; /... std::string mystring = ... // ParsedJson pj; -pj.allocateCapacity(mystring.size()); // allocate memory for parsing up to p.size() bytes +pj.allocate_capacity(mystring.size()); // allocate memory for parsing up to p.size() bytes // std::string may not overallocate so a copy will be needed const int res = json_parse(mystring, pj); // do the parsing, return 0 on success // parsing is done! if (res != 0) { // You can use the "simdjson/simdjson.h" header to access the error message - std::cout << "Error parsing:" << simdjson::errorMsg(res) << std::endl; + std::cout << "Error parsing:" << simdjson::error_message(res) << std::endl; } // pj can be reused with other json_parse calls. ``` @@ -141,9 +141,9 @@ using namespace simdjson; std::string mystring = ... // // std::string may not overallocate so a copy will be needed ParsedJson pj = build_parsed_json(mystring); // do the parsing -if( ! pj.isValid() ) { +if( ! pj.is_valid() ) { // something went wrong - std::cout << pj.getErrorMsg() << std::endl; + std::cout << pj.get_error_message() << std::endl; } ``` @@ -164,9 +164,9 @@ int main(int argc, char *argv[]) { const char * filename = argv[1]; padded_string p = get_corpus(filename); ParsedJson pj = build_parsed_json(p); // do the parsing - if( ! pj.isValid() ) { + if( ! pj.is_valid() ) { std::cout << "not valid" << std::endl; - std::cout << pj.getErrorMsg() << std::endl; + std::cout << pj.get_error_message() << std::endl; } else { std::cout << "valid" << std::endl; } @@ -370,8 +370,8 @@ In C++, given a `ParsedJson`, we can move to a node with the `move_to` method, p Here is a code sample to dump back the parsed JSON to a string: ```c - ParsedJson::iterator pjh(pj); - if (!pjh.isOk()) { + ParsedJson::Iterator pjh(pj); + if (!pjh.is_ok()) { std::cerr << " Could not iterate parsed result. " << std::endl; return EXIT_FAILURE; } @@ -379,7 +379,7 @@ Here is a code sample to dump back the parsed JSON to a string: // // where compute_dump is : -void compute_dump(ParsedJson::iterator &pjh) { +void compute_dump(ParsedJson::Iterator &pjh) { if (pjh.is_object()) { std::cout << "{"; if (pjh.down()) { @@ -417,12 +417,12 @@ void compute_dump(ParsedJson::iterator &pjh) { The following function will find all user.id integers: ```C -void simdjson_scan(std::vector &answer, ParsedJson::iterator &i) { +void simdjson_scan(std::vector &answer, ParsedJson::Iterator &i) { while(i.move_forward()) { if(i.get_scope_type() == '{') { - bool founduser = (i.get_string_length() == 4) && (memcmp(i.get_string(), "user", 4) == 0); + bool found_user = (i.get_string_length() == 4) && (memcmp(i.get_string(), "user", 4) == 0); i.move_to_value(); - if(founduser) { + if(found_user) { if(i.is_object() && i.move_to_key("id",2)) { if (i.is_integer()) { answer.push_back(i.get_integer()); diff --git a/amalgamation.sh b/amalgamation.sh index 035ce01b..fc517f4c 100755 --- a/amalgamation.sh +++ b/amalgamation.sh @@ -117,7 +117,7 @@ int main(int argc, char *argv[]) { const char * filename = argv[1]; simdjson::padded_string p = simdjson::get_corpus(filename); simdjson::ParsedJson pj = simdjson::build_parsed_json(p); // do the parsing - if( ! pj.isValid() ) { + if( ! pj.is_valid() ) { std::cout << "not valid" << std::endl; } else { std::cout << "valid" << std::endl; diff --git a/benchmark/benchmark.h b/benchmark/benchmark.h index 96775ca0..244b25e0 100644 --- a/benchmark/benchmark.h +++ b/benchmark/benchmark.h @@ -18,7 +18,7 @@ const char *unitname = "cycles"; : \ : /* no read only */ \ "%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \ - ); \ + ); \ (cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \ } while (0) @@ -32,7 +32,7 @@ const char *unitname = "cycles"; : "=r"(cyc_high), "=r"(cyc_low) \ : /* no read only registers */ \ : "%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \ - ); \ + ); \ (cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \ } while (0) diff --git a/benchmark/distinctuseridcompetition.cpp b/benchmark/distinctuseridcompetition.cpp index d1e1bd0a..f8a67ad3 100644 --- a/benchmark/distinctuseridcompetition.cpp +++ b/benchmark/distinctuseridcompetition.cpp @@ -30,49 +30,51 @@ void print_vec(const std::vector &v) { std::cout << std::endl; } -void simdjson_scan(std::vector &answer, simdjson::ParsedJson::iterator &i) { - while(i.move_forward()) { - if(i.get_scope_type() == '{') { - bool founduser = (i.get_string_length() == 4) && (memcmp(i.get_string(), "user", 4) == 0); - i.move_to_value(); - if(founduser) { - if(i.is_object() && i.move_to_key("id",2)) { - if (i.is_integer()) { - answer.push_back(i.get_integer()); - } - i.up(); - } - } - } - } +void simdjson_scan(std::vector &answer, + simdjson::ParsedJson::Iterator &i) { + while (i.move_forward()) { + if (i.get_scope_type() == '{') { + bool found_user = (i.get_string_length() == 4) && + (memcmp(i.get_string(), "user", 4) == 0); + i.move_to_value(); + if (found_user) { + if (i.is_object() && i.move_to_key("id", 2)) { + if (i.is_integer()) { + answer.push_back(i.get_integer()); + } + i.up(); + } + } + } + } } -__attribute__ ((noinline)) -std::vector simdjson_justdom(simdjson::ParsedJson &pj) { +__attribute__((noinline)) std::vector +simdjson_just_dom(simdjson::ParsedJson &pj) { std::vector answer; - simdjson::ParsedJson::iterator i(pj); - simdjson_scan(answer,i); + simdjson::ParsedJson::Iterator i(pj); + simdjson_scan(answer, i); remove_duplicates(answer); return answer; } -__attribute__ ((noinline)) -std::vector simdjson_computestats(const simdjson::padded_string &p) { +__attribute__((noinline)) std::vector +simdjson_compute_stats(const simdjson::padded_string &p) { std::vector answer; simdjson::ParsedJson pj = simdjson::build_parsed_json(p); - if (!pj.isValid()) { + if (!pj.is_valid()) { return answer; } - simdjson::ParsedJson::iterator i(pj); - simdjson_scan(answer,i); + simdjson::ParsedJson::Iterator i(pj); + simdjson_scan(answer, i); remove_duplicates(answer); return answer; } -__attribute__ ((noinline)) -bool simdjson_justparse(const simdjson::padded_string &p) { +__attribute__((noinline)) bool +simdjson_just_parse(const simdjson::padded_string &p) { simdjson::ParsedJson pj = simdjson::build_parsed_json(p); - bool answer = !pj.isValid(); + bool answer = !pj.is_valid(); return answer; } @@ -88,25 +90,27 @@ void sajson_traverse(std::vector &answer, const sajson::value &node) { } case TYPE_OBJECT: { auto length = node.get_length(); - // sajson has O(log n) find_object_key, but we still visit each node anyhow because we - // need to visit all values. + // sajson has O(log n) find_object_key, but we still visit each node anyhow + // because we need to visit all values. for (auto i = 0u; i < length; ++i) { auto key = node.get_object_key(i); // expected: sajson::string - bool founduser = (key.length() == 4) && (memcmp(key.data(), "user", 4) == 0); - if (founduser) { // found a user!!! - auto uservalue = node.get_object_value(i); // get the value - if (uservalue.get_type() == + bool found_user = + (key.length() == 4) && (memcmp(key.data(), "user", 4) == 0); + if (found_user) { // found a user!!! + auto user_value = node.get_object_value(i); // get the value + if (user_value.get_type() == TYPE_OBJECT) { // the value should be an object // now we know that we only need one value - auto uservaluelength = uservalue.get_length(); - auto rightindex = uservalue.find_object_key(sajson::string("id",2)); - if(rightindex < uservaluelength) { - auto v = uservalue.get_object_value(rightindex); - if (v.get_type() == TYPE_INTEGER) { // check that it is an integer - answer.push_back(v.get_integer_value()); // record it! - } else if (v.get_type() == TYPE_DOUBLE) { - answer.push_back((int64_t)v.get_double_value()); // record it! - } + auto user_value_length = user_value.get_length(); + auto right_index = + user_value.find_object_key(sajson::string("id", 2)); + if (right_index < user_value_length) { + auto v = user_value.get_object_value(right_index); + if (v.get_type() == TYPE_INTEGER) { // check that it is an integer + answer.push_back(v.get_integer_value()); // record it! + } else if (v.get_type() == TYPE_DOUBLE) { + answer.push_back((int64_t)v.get_double_value()); // record it! + } } } } @@ -126,16 +130,16 @@ void sajson_traverse(std::vector &answer, const sajson::value &node) { } } -__attribute__ ((noinline)) -std::vector sasjon_justdom(sajson::document & d) { +__attribute__((noinline)) std::vector +sasjon_just_dom(sajson::document &d) { std::vector answer; sajson_traverse(answer, d.get_root()); remove_duplicates(answer); return answer; } -__attribute__ ((noinline)) -std::vector sasjon_computestats(const simdjson::padded_string &p) { +__attribute__((noinline)) std::vector +sasjon_compute_stats(const simdjson::padded_string &p) { std::vector answer; char *buffer = (char *)malloc(p.size()); memcpy(buffer, p.data(), p.size()); @@ -151,8 +155,8 @@ std::vector sasjon_computestats(const simdjson::padded_string &p) { return answer; } -__attribute__ ((noinline)) -bool sasjon_justparse(const simdjson::padded_string &p) { +__attribute__((noinline)) bool +sasjon_just_parse(const simdjson::padded_string &p) { char *buffer = (char *)malloc(p.size()); memcpy(buffer, p.data(), p.size()); auto d = sajson::parse(sajson::dynamic_allocation(), @@ -167,8 +171,9 @@ void rapid_traverse(std::vector &answer, const rapidjson::Value &v) { case kObjectType: for (Value::ConstMemberIterator m = v.MemberBegin(); m != v.MemberEnd(); ++m) { - bool founduser = (m->name.GetStringLength() == 4) && (memcmp(m->name.GetString(), "user", 4) == 0); - if (founduser) { + bool found_user = (m->name.GetStringLength() == 4) && + (memcmp(m->name.GetString(), "user", 4) == 0); + if (found_user) { const rapidjson::Value &child = m->value; if (child.GetType() == kObjectType) { for (Value::ConstMemberIterator k = child.MemberBegin(); @@ -201,16 +206,16 @@ void rapid_traverse(std::vector &answer, const rapidjson::Value &v) { } } -__attribute__ ((noinline)) -std::vector rapid_justdom(rapidjson::Document &d) { +__attribute__((noinline)) std::vector +rapid_just_dom(rapidjson::Document &d) { std::vector answer; rapid_traverse(answer, d); remove_duplicates(answer); return answer; } -__attribute__ ((noinline)) -std::vector rapid_computestats(const simdjson::padded_string &p) { +__attribute__((noinline)) std::vector +rapid_compute_stats(const simdjson::padded_string &p) { std::vector answer; char *buffer = (char *)malloc(p.size() + 1); memcpy(buffer, p.data(), p.size()); @@ -218,8 +223,8 @@ std::vector rapid_computestats(const simdjson::padded_string &p) { rapidjson::Document d; d.ParseInsitu(buffer); if (d.HasParseError()) { - free(buffer); - return answer; + free(buffer); + return answer; } rapid_traverse(answer, d); free(buffer); @@ -227,8 +232,8 @@ std::vector rapid_computestats(const simdjson::padded_string &p) { return answer; } -__attribute__ ((noinline)) -bool rapid_justparse(const simdjson::padded_string &p) { +__attribute__((noinline)) bool +rapid_just_parse(const simdjson::padded_string &p) { char *buffer = (char *)malloc(p.size() + 1); memcpy(buffer, p.data(), p.size()); buffer[p.size()] = '\0'; @@ -239,16 +244,15 @@ bool rapid_justparse(const simdjson::padded_string &p) { return answer; } - int main(int argc, char *argv[]) { bool verbose = false; - bool justdata = false; + bool just_data = false; int c; while ((c = getopt(argc, argv, "vt")) != -1) switch (c) { case 't': - justdata = true; + just_data = true; break; case 'v': verbose = true; @@ -257,15 +261,18 @@ int main(int argc, char *argv[]) { abort(); } if (optind >= argc) { - std::cerr << "Using different parsers, we compute the content statistics of " - "JSON documents." << std::endl; + std::cerr + << "Using different parsers, we compute the content statistics of " + "JSON documents." + << std::endl; std::cerr << "Usage: " << argv[0] << " " << std::endl; std::cerr << "Or " << argv[0] << " -v " << std::endl; exit(1); } const char *filename = argv[optind]; if (optind + 1 < argc) { - std::cerr << "warning: ignoring everything after " << argv[optind + 1] << std::endl; + std::cerr << "warning: ignoring everything after " << argv[optind + 1] + << std::endl; } simdjson::padded_string p; try { @@ -285,17 +292,17 @@ int main(int argc, char *argv[]) { std::cout << p.size() << " B "; std::cout << std::endl; } - std::vector s1 = simdjson_computestats(p); + std::vector s1 = simdjson_compute_stats(p); if (verbose) { printf("simdjson: "); print_vec(s1); } - std::vector s2 = rapid_computestats(p); + std::vector s2 = rapid_compute_stats(p); if (verbose) { printf("rapid: "); print_vec(s2); } - std::vector s3 = sasjon_computestats(p); + std::vector s3 = sasjon_compute_stats(p); if (verbose) { printf("sasjon: "); print_vec(s3); @@ -306,34 +313,35 @@ int main(int argc, char *argv[]) { int repeat = 500; int volume = p.size(); - if(justdata) { - printf("name cycles_per_byte cycles_per_byte_err gb_per_s gb_per_s_err \n"); + if (just_data) { + printf( + "name cycles_per_byte cycles_per_byte_err gb_per_s gb_per_s_err \n"); } - BEST_TIME("simdjson ", simdjson_computestats(p).size(), size, , repeat, - volume, !justdata); - BEST_TIME("rapid ", rapid_computestats(p).size(), size, , repeat, volume, - !justdata); - BEST_TIME("sasjon ", sasjon_computestats(p).size(), size, , repeat, volume, - !justdata); - BEST_TIME("simdjson (just parse) ", simdjson_justparse(p), false, , repeat, - volume, !justdata); - BEST_TIME("rapid (just parse) ", rapid_justparse(p), false, , repeat, volume, - !justdata); - BEST_TIME("sasjon (just parse) ", sasjon_justparse(p), false, , repeat, volume, - !justdata); + BEST_TIME("simdjson ", simdjson_compute_stats(p).size(), size, , repeat, + volume, !just_data); + BEST_TIME("rapid ", rapid_compute_stats(p).size(), size, , repeat, volume, + !just_data); + BEST_TIME("sasjon ", sasjon_compute_stats(p).size(), size, , repeat, volume, + !just_data); + BEST_TIME("simdjson (just parse) ", simdjson_just_parse(p), false, , repeat, + volume, !just_data); + BEST_TIME("rapid (just parse) ", rapid_just_parse(p), false, , repeat, + volume, !just_data); + BEST_TIME("sasjon (just parse) ", sasjon_just_parse(p), false, , repeat, + volume, !just_data); simdjson::ParsedJson dsimdjson = simdjson::build_parsed_json(p); - BEST_TIME("simdjson (just dom) ", simdjson_justdom(dsimdjson).size(), size, , repeat, - volume, !justdata); + BEST_TIME("simdjson (just dom) ", simdjson_just_dom(dsimdjson).size(), size, + , repeat, volume, !just_data); char *buffer = (char *)malloc(p.size()); memcpy(buffer, p.data(), p.size()); rapidjson::Document drapid; drapid.ParseInsitu(buffer); - BEST_TIME("rapid (just dom) ", rapid_justdom(drapid).size(), size, , repeat, volume, - !justdata); + BEST_TIME("rapid (just dom) ", rapid_just_dom(drapid).size(), size, , repeat, + volume, !just_data); memcpy(buffer, p.data(), p.size()); auto dsasjon = sajson::parse(sajson::dynamic_allocation(), - sajson::mutable_string_view(p.size(), buffer)); - BEST_TIME("sasjon (just dom) ", sasjon_justdom(dsasjon).size(), size, , repeat, volume, - !justdata); + sajson::mutable_string_view(p.size(), buffer)); + BEST_TIME("sasjon (just dom) ", sasjon_just_dom(dsasjon).size(), size, , + repeat, volume, !just_data); free(buffer); } diff --git a/benchmark/minifiercompetition.cpp b/benchmark/minifiercompetition.cpp index c662a220..a09e8db9 100644 --- a/benchmark/minifiercompetition.cpp +++ b/benchmark/minifiercompetition.cpp @@ -1,5 +1,5 @@ -#include #include +#include #include "benchmark.h" #include "simdjson/jsonioutil.h" @@ -17,7 +17,7 @@ using namespace simdjson; using namespace rapidjson; -std::string rapidstringmeInsitu(char *json) { +std::string rapid_stringme_insitu(char *json) { Document d; d.ParseInsitu(json); if (d.HasParseError()) { @@ -30,7 +30,7 @@ std::string rapidstringmeInsitu(char *json) { return buffer.GetString(); } -std::string rapidstringme(char *json) { +std::string rapid_stringme(char *json) { Document d; d.Parse(json); if (d.HasParseError()) { @@ -46,29 +46,28 @@ std::string rapidstringme(char *json) { int main(int argc, char *argv[]) { int c; bool verbose = false; - bool justdata = false; + bool just_data = false; - while ((c = getopt (argc, argv, "vt")) != -1) - switch (c) - { - case 't': - justdata = true; - break; - case 'v': - verbose = true; - break; - default: - abort (); - } + while ((c = getopt(argc, argv, "vt")) != -1) + switch (c) { + case 't': + just_data = true; + break; + case 'v': + verbose = true; + break; + default: + abort(); + } if (optind >= argc) { std::cerr << "Usage: " << argv[0] << " " << std::endl; exit(1); } - const char * filename = argv[optind]; + const char *filename = argv[optind]; simdjson::padded_string p; try { simdjson::get_corpus(filename).swap(p); - } catch (const std::exception& e) { // caught by reference to base + } catch (const std::exception &e) { // caught by reference to base std::cout << "Could not load the file " << filename << std::endl; return EXIT_FAILURE; } @@ -88,71 +87,95 @@ int main(int argc, char *argv[]) { int repeat = 50; int volume = p.size(); - if(justdata) { - printf("name cycles_per_byte cycles_per_byte_err gb_per_s gb_per_s_err \n"); + if (just_data) { + printf( + "name cycles_per_byte cycles_per_byte_err gb_per_s gb_per_s_err \n"); } - size_t strlength = rapidstringme((char *)p.data()).size(); + size_t strlength = rapid_stringme((char *)p.data()).size(); if (verbose) std::cout << "input length is " << p.size() << " stringified length is " << strlength << std::endl; - BEST_TIME_NOCHECK("despacing with RapidJSON", rapidstringme((char *)p.data()), , repeat, volume, !justdata); - BEST_TIME_NOCHECK("despacing with RapidJSON Insitu", rapidstringmeInsitu((char *)buffer), - memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata); + BEST_TIME_NOCHECK("despacing with RapidJSON", + rapid_stringme((char *)p.data()), , repeat, volume, + !just_data); + BEST_TIME_NOCHECK( + "despacing with RapidJSON Insitu", rapid_stringme_insitu((char *)buffer), + memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data); memcpy(buffer, p.data(), p.size()); - size_t outlength = - simdjson::jsonminify((const uint8_t *)buffer, p.size(), (uint8_t *)buffer); + size_t outlength = simdjson::json_minify((const uint8_t *)buffer, p.size(), + (uint8_t *)buffer); if (verbose) - std::cout << "jsonminify length is " << outlength << std::endl; + std::cout << "json_minify length is " << outlength << std::endl; uint8_t *cbuffer = (uint8_t *)buffer; - BEST_TIME("jsonminify", simdjson::jsonminify(cbuffer, p.size(), cbuffer), outlength, - memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata); - printf("minisize = %zu, original size = %zu (minified down to %.2f percent of original) \n", outlength, p.size(), outlength * 100.0 / p.size()); + BEST_TIME("json_minify", simdjson::json_minify(cbuffer, p.size(), cbuffer), + outlength, memcpy(buffer, p.data(), p.size()), repeat, volume, + !just_data); + printf("minisize = %zu, original size = %zu (minified down to %.2f percent " + "of original) \n", + outlength, p.size(), outlength * 100.0 / p.size()); /*** * Is it worth it to minify before parsing? ***/ rapidjson::Document d; - BEST_TIME("RapidJSON Insitu orig", d.ParseInsitu(buffer).HasParseError(), false, - memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata); + BEST_TIME("RapidJSON Insitu orig", d.ParseInsitu(buffer).HasParseError(), + false, memcpy(buffer, p.data(), p.size()), repeat, volume, + !just_data); - char *minibuffer = simdjson::allocate_padded_buffer(p.size() + 1); - size_t minisize = simdjson::jsonminify((const uint8_t *)p.data(), p.size(), (uint8_t*) minibuffer); - minibuffer[minisize] = '\0'; + char *mini_buffer = simdjson::allocate_padded_buffer(p.size() + 1); + size_t minisize = simdjson::json_minify((const uint8_t *)p.data(), p.size(), + (uint8_t *)mini_buffer); + mini_buffer[minisize] = '\0'; - BEST_TIME("RapidJSON Insitu despaced", d.ParseInsitu(buffer).HasParseError(), false, - memcpy(buffer, minibuffer, p.size()), - repeat, volume, !justdata); + BEST_TIME("RapidJSON Insitu despaced", d.ParseInsitu(buffer).HasParseError(), + false, memcpy(buffer, mini_buffer, p.size()), repeat, volume, + !just_data); - size_t astbuffersize = p.size() * 2; - size_t * ast_buffer = (size_t *) malloc(astbuffersize * sizeof(size_t)); + size_t ast_buffer_size = p.size() * 2; + size_t *ast_buffer = (size_t *)malloc(ast_buffer_size * sizeof(size_t)); - BEST_TIME("sajson orig", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata); + BEST_TIME( + "sajson orig", + sajson::parse(sajson::bounded_allocation(ast_buffer, ast_buffer_size), + sajson::mutable_string_view(p.size(), buffer)) + .is_valid(), + true, memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data); - - BEST_TIME("sajson despaced", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(minisize, buffer)).is_valid(), true, memcpy(buffer, minibuffer, p.size()), repeat, volume, !justdata); + BEST_TIME( + "sajson despaced", + sajson::parse(sajson::bounded_allocation(ast_buffer, ast_buffer_size), + sajson::mutable_string_view(minisize, buffer)) + .is_valid(), + true, memcpy(buffer, mini_buffer, p.size()), repeat, volume, !just_data); simdjson::ParsedJson pj; - bool isallocok = pj.allocateCapacity(p.size(), 1024); - if(!isallocok) { + bool is_alloc_ok = pj.allocate_capacity(p.size(), 1024); + if (!is_alloc_ok) { fprintf(stderr, "failed to allocate memory\n"); return EXIT_FAILURE; - } - bool automated_reallocation = false; - BEST_TIME("simdjson orig", simdjson::json_parse((const uint8_t*)buffer, p.size(), pj, automated_reallocation), true, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata); - + } + bool automated_reallocation = false; + BEST_TIME("simdjson orig", + simdjson::json_parse((const uint8_t *)buffer, p.size(), pj, + automated_reallocation), + true, memcpy(buffer, p.data(), p.size()), repeat, volume, + !just_data); + simdjson::ParsedJson pj2; - bool isallocok2 = pj2.allocateCapacity(p.size(), 1024); - if(!isallocok2) { + bool is_alloc_ok2 = pj2.allocate_capacity(p.size(), 1024); + if (!is_alloc_ok2) { fprintf(stderr, "failed to allocate memory\n"); return EXIT_FAILURE; - } - automated_reallocation = false; - BEST_TIME("simdjson despaced", simdjson::json_parse((const uint8_t*)buffer, minisize, pj2, automated_reallocation), true, memcpy(buffer, minibuffer, p.size()), repeat, volume, !justdata); + } + automated_reallocation = false; + BEST_TIME("simdjson despaced", + simdjson::json_parse((const uint8_t *)buffer, minisize, pj2, + automated_reallocation), + true, memcpy(buffer, mini_buffer, p.size()), repeat, volume, + !just_data); free(buffer); free(ast_buffer); - free(minibuffer); - - + free(mini_buffer); } diff --git a/benchmark/parse.cpp b/benchmark/parse.cpp index 44e20d81..bcce04c7 100644 --- a/benchmark/parse.cpp +++ b/benchmark/parse.cpp @@ -28,57 +28,58 @@ #endif //#define DEBUG #include "simdjson/common_defs.h" +#include "simdjson/isadetection.h" #include "simdjson/jsonioutil.h" #include "simdjson/jsonparser.h" #include "simdjson/parsedjson.h" #include "simdjson/stage1_find_marks.h" #include "simdjson/stage2_build_tape.h" -#include "simdjson/isadetection.h" namespace simdjson { -architecture _find_best_supported_implementation() { - constexpr uint32_t haswell_flags = SIMDExtensions::AVX2 | SIMDExtensions::PCLMULQDQ - | SIMDExtensions::BMI1 | SIMDExtensions::BMI2; - constexpr uint32_t westmere_flags = SIMDExtensions::SSE42 | SIMDExtensions::PCLMULQDQ; +Architecture _find_best_supported_implementation() { + constexpr uint32_t haswell_flags = + instruction_set::AVX2 | instruction_set::PCLMULQDQ | + instruction_set::BMI1 | instruction_set::BMI2; + constexpr uint32_t westmere_flags = + instruction_set::SSE42 | instruction_set::PCLMULQDQ; uint32_t supports = detect_supported_architectures(); // Order from best to worst (within architecture) if ((haswell_flags & supports) == haswell_flags) { - return architecture::haswell; + return Architecture::HASWELL; } if ((westmere_flags & supports) == westmere_flags) { - return architecture::westmere; + return Architecture::WESTMERE; } - if (SIMDExtensions::NEON) return architecture::arm64; + if (instruction_set::NEON) + return Architecture::ARM64; - return architecture::none; + return Architecture::NONE; } - -using unified_functype = int (const uint8_t *buf, size_t len, ParsedJson &pj); -using stage1_functype = int (const uint8_t *buf, size_t len, ParsedJson &pj); - +using unified_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj); +using stage1_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj); extern unified_functype *unified_ptr; extern stage1_functype *stage1_ptr; int unified_machine_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj) { - architecture best_implementation = _find_best_supported_implementation(); + Architecture best_implementation = _find_best_supported_implementation(); // Selecting the best implementation switch (best_implementation) { #ifdef IS_X86_64 - case architecture::haswell: - unified_ptr = &unified_machine; + case Architecture::HASWELL: + unified_ptr = &unified_machine; break; - case architecture::westmere: - unified_ptr = &unified_machine; + case Architecture::WESTMERE: + unified_ptr = &unified_machine; break; #endif #ifdef IS_ARM64 - case architecture::arm64: - unified_ptr = &unified_machine; + case Architecture::ARM64: + unified_ptr = &unified_machine; break; #endif - default : + default: std::cerr << "The processor is not supported by simdjson." << std::endl; return simdjson::UNEXPECTED_ERROR; } @@ -87,24 +88,25 @@ int unified_machine_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj) { } // Responsible to select the best json_parse implementation -int find_structural_bits_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj) { - architecture best_implementation = _find_best_supported_implementation(); +int find_structural_bits_dispatch(const uint8_t *buf, size_t len, + ParsedJson &pj) { + Architecture best_implementation = _find_best_supported_implementation(); // Selecting the best implementation switch (best_implementation) { #ifdef IS_X86_64 - case architecture::haswell: - stage1_ptr = &find_structural_bits; + case Architecture::HASWELL: + stage1_ptr = &find_structural_bits; break; - case architecture::westmere: - stage1_ptr = &find_structural_bits; + case Architecture::WESTMERE: + stage1_ptr = &find_structural_bits; break; #endif #ifdef IS_ARM64 - case architecture::arm64: - stage1_ptr = &find_structural_bits; + case Architecture::ARM64: + stage1_ptr = &find_structural_bits; break; #endif - default : + default: std::cerr << "The processor is not supported by simdjson." << std::endl; return simdjson::UNEXPECTED_ERROR; } @@ -114,23 +116,21 @@ int find_structural_bits_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj stage1_functype *stage1_ptr = &find_structural_bits_dispatch; unified_functype *unified_ptr = &unified_machine_dispatch; -} - - +} // namespace simdjson int main(int argc, char *argv[]) { bool verbose = false; bool dump = false; - bool jsonoutput = false; - bool forceoneiteration = false; - bool justdata = false; + bool json_output = false; + bool force_one_iteration = false; + bool just_data = false; #ifndef _MSC_VER int c; while ((c = getopt(argc, argv, "1vdt")) != -1) { switch (c) { case 't': - justdata = true; + just_data = true; break; case 'v': verbose = true; @@ -139,15 +139,15 @@ int main(int argc, char *argv[]) { dump = true; break; case 'j': - jsonoutput = true; + json_output = true; break; case '1': - forceoneiteration = true; + force_one_iteration = true; break; default: abort(); } -} + } #else int optind = 1; #endif @@ -157,7 +157,8 @@ int main(int argc, char *argv[]) { } const char *filename = argv[optind]; if (optind + 1 < argc) { - std::cerr << "warning: ignoring everything after " << argv[optind + 1] << std::endl; + std::cerr << "warning: ignoring everything after " << argv[optind + 1] + << std::endl; } if (verbose) { std::cout << "[verbose] loading " << filename << std::endl; @@ -170,30 +171,41 @@ int main(int argc, char *argv[]) { return EXIT_FAILURE; } if (verbose) { - std::cout << "[verbose] loaded " << filename << " (" << p.size() << " bytes)" - << std::endl; -} + std::cout << "[verbose] loaded " << filename << " (" << p.size() + << " bytes)" << std::endl; + } #if defined(DEBUG) const uint32_t iterations = 1; #else const uint32_t iterations = - forceoneiteration ? 1 : (p.size() < 1 * 1000 * 1000 ? 1000 : 10); + force_one_iteration ? 1 : (p.size() < 1 * 1000 * 1000 ? 1000 : 10); #endif std::vector res; res.resize(iterations); - if(!justdata) printf("number of iterations %u \n", iterations); + if (!just_data) + printf("number of iterations %u \n", iterations); #if !defined(__linux__) #define SQUASH_COUNTERS - if (justdata) { - printf("justdata (-t) flag only works under linux.\n"); + if (just_data) { + printf("just_data (-t) flag only works under linux.\n"); } #endif - {// practice run + { // practice run simdjson::ParsedJson pj; - bool allocok = pj.allocateCapacity(p.size()); - if(allocok) { - simdjson::stage1_ptr((const uint8_t*)p.data(), p.size(), pj); - simdjson::unified_ptr((const uint8_t*)(const uint8_t*)(const uint8_t*)(const uint8_t*)(const uint8_t*)(const uint8_t*)(const uint8_t*)(const uint8_t*)p.data(), p.size(), pj); + bool allocok = pj.allocate_capacity(p.size()); + if (allocok) { + simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj); + simdjson::unified_ptr( + (const uint8_t + *)(const uint8_t + *)(const uint8_t + *)(const uint8_t + *)(const uint8_t + *)(const uint8_t + *)(const uint8_t + *)(const uint8_t *) + p.data(), + p.size(), pj); } } #ifndef SQUASH_COUNTERS @@ -220,7 +232,7 @@ int main(int argc, char *argv[]) { } unified.start(); simdjson::ParsedJson pj; - bool allocok = pj.allocateCapacity(p.size()); + bool allocok = pj.allocate_capacity(p.size()); if (!allocok) { std::cerr << "failed to allocate memory" << std::endl; return EXIT_FAILURE; @@ -235,7 +247,8 @@ int main(int argc, char *argv[]) { std::cout << "[verbose] allocated memory for parsed JSON " << std::endl; } unified.start(); - isok = (simdjson::stage1_ptr((const uint8_t*)p.data(), p.size(), pj) == simdjson::SUCCESS); + isok = (simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj) == + simdjson::SUCCESS); unified.end(results); cy1 += results[0]; cl1 += results[1]; @@ -247,7 +260,9 @@ int main(int argc, char *argv[]) { break; } unified.start(); - isok = isok && (simdjson::SUCCESS == simdjson::unified_ptr((const uint8_t*)p.data(), p.size(), pj)); + isok = isok && + (simdjson::SUCCESS == + simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj)); unified.end(results); cy2 += results[0]; cl2 += results[1]; @@ -266,7 +281,7 @@ int main(int argc, char *argv[]) { std::cout << "[verbose] iteration # " << i << std::endl; } simdjson::ParsedJson pj; - bool allocok = pj.allocateCapacity(p.size()); + bool allocok = pj.allocate_capacity(p.size()); if (!allocok) { std::cerr << "failed to allocate memory" << std::endl; return EXIT_FAILURE; @@ -276,20 +291,24 @@ int main(int argc, char *argv[]) { } auto start = std::chrono::steady_clock::now(); - isok = (simdjson::stage1_ptr((const uint8_t*)p.data(), p.size(), pj) == simdjson::SUCCESS); - isok = isok && (simdjson::SUCCESS == simdjson::unified_ptr((const uint8_t*)p.data(), p.size(), pj)); + isok = (simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj) == + simdjson::SUCCESS); + isok = isok && + (simdjson::SUCCESS == + simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj)); auto end = std::chrono::steady_clock::now(); std::chrono::duration secs = end - start; res[i] = secs.count(); - if(! isok) { - std::cerr << pj.getErrorMsg() << std::endl; + if (!isok) { + std::cerr << pj.get_error_message() << std::endl; std::cerr << "Could not parse. " << std::endl; return EXIT_FAILURE; } - } - simdjson::ParsedJson pj = build_parsed_json(p); // do the parsing again to get the stats - if (!pj.isValid()) { - std::cerr << pj.getErrorMsg() << std::endl; + } + simdjson::ParsedJson pj = + build_parsed_json(p); // do the parsing again to get the stats + if (!pj.is_valid()) { + std::cerr << pj.get_error_message() << std::endl; std::cerr << "Could not parse. " << std::endl; return EXIT_FAILURE; } @@ -297,7 +316,7 @@ int main(int argc, char *argv[]) { double speedinGBs = (p.size()) / (min_result * 1000000000.0); #ifndef SQUASH_COUNTERS unsigned long total = cy0 + cy1 + cy2; - if (justdata) { + if (just_data) { float cpb0 = (double)cy0 / (iterations * p.size()); float cpb1 = (double)cy1 / (iterations * p.size()); float cpb2 = (double)cy2 / (iterations * p.size()); @@ -315,8 +334,8 @@ int main(int argc, char *argv[]) { break; } } - printf("\"%s\"\t%f\t%f\t%f\t%f\t%f\n", snewfile, cpb0, cpb1, cpb2, - cpbtotal, speedinGBs); + printf("\"%s\"\t%f\t%f\t%f\t%f\t%f\n", snewfile, cpb0, cpb1, cpb2, cpbtotal, + speedinGBs); free(newfile); } else { printf("number of bytes %ld number of structural chars %u ratio %.3f\n", @@ -352,16 +371,16 @@ int main(int argc, char *argv[]) { printf(" all stages: %.2f cycles per input byte.\n", (double)total / (iterations * p.size())); - printf("Estimated average frequency: %.3f GHz.\n", (double)total / (iterations * min_result * 1000000000.0)); + printf("Estimated average frequency: %.3f GHz.\n", + (double)total / (iterations * min_result * 1000000000.0)); } #endif - if (!justdata) { + if (!just_data) { std::cout << "Min: " << min_result << " bytes read: " << p.size() - << " Gigabytes/second: " << speedinGBs - << std::endl; + << " Gigabytes/second: " << speedinGBs << std::endl; } - if (jsonoutput) { - isok = isok && pj.printjson(std::cout); + if (json_output) { + isok = isok && pj.print_json(std::cout); } if (dump) { isok = isok && pj.dump_raw_tape(std::cout); diff --git a/benchmark/parseandstatcompetition.cpp b/benchmark/parseandstatcompetition.cpp index 4ea12132..8adfcd07 100644 --- a/benchmark/parseandstatcompetition.cpp +++ b/benchmark/parseandstatcompetition.cpp @@ -43,11 +43,11 @@ void print_stat(const stat_t &s) { s.true_count, s.false_count); } -__attribute__ ((noinline)) -stat_t simdjson_computestats(const simdjson::padded_string &p) { +__attribute__((noinline)) stat_t +simdjson_compute_stats(const simdjson::padded_string &p) { stat_t answer; simdjson::ParsedJson pj = build_parsed_json(p); - answer.valid = pj.isValid(); + answer.valid = pj.is_valid(); if (!answer.valid) { return answer; } @@ -57,24 +57,24 @@ stat_t simdjson_computestats(const simdjson::padded_string &p) { answer.null_count = 0; answer.true_count = 0; answer.false_count = 0; - size_t tapeidx = 0; - uint64_t tape_val = pj.tape[tapeidx++]; + size_t tape_idx = 0; + uint64_t tape_val = pj.tape[tape_idx++]; uint8_t type = (tape_val >> 56); - size_t howmany = 0; + size_t how_many = 0; assert(type == 'r'); - howmany = tape_val & JSONVALUEMASK; - for (; tapeidx < howmany; tapeidx++) { - tape_val = pj.tape[tapeidx]; - // uint64_t payload = tape_val & JSONVALUEMASK; + how_many = tape_val & JSON_VALUE_MASK; + for (; tape_idx < how_many; tape_idx++) { + tape_val = pj.tape[tape_idx]; + // uint64_t payload = tape_val & JSON_VALUE_MASK; type = (tape_val >> 56); switch (type) { case 'l': // we have a long int answer.number_count++; - tapeidx++; // skipping the integer + tape_idx++; // skipping the integer break; case 'd': // we have a double answer.number_count++; - tapeidx++; // skipping the double + tape_idx++; // skipping the double break; case 'n': // we have a null answer.null_count++; @@ -145,8 +145,8 @@ void sajson_traverse(stat_t &stats, const sajson::value &node) { } } -__attribute__ ((noinline)) -stat_t sasjon_computestats(const simdjson::padded_string &p) { +__attribute__((noinline)) stat_t +sasjon_compute_stats(const simdjson::padded_string &p) { stat_t answer; char *buffer = (char *)malloc(p.size()); memcpy(buffer, p.data(), p.size()); @@ -203,8 +203,8 @@ void rapid_traverse(stat_t &stats, const rapidjson::Value &v) { } } -__attribute__ ((noinline)) -stat_t rapid_computestats(const simdjson::padded_string &p) { +__attribute__((noinline)) stat_t +rapid_compute_stats(const simdjson::padded_string &p) { stat_t answer; char *buffer = (char *)malloc(p.size() + 1); memcpy(buffer, p.data(), p.size()); @@ -228,13 +228,13 @@ stat_t rapid_computestats(const simdjson::padded_string &p) { int main(int argc, char *argv[]) { bool verbose = false; - bool justdata = false; + bool just_data = false; int c; while ((c = getopt(argc, argv, "vt")) != -1) switch (c) { case 't': - justdata = true; + just_data = true; break; case 'v': verbose = true; @@ -243,15 +243,18 @@ int main(int argc, char *argv[]) { abort(); } if (optind >= argc) { - std::cerr << "Using different parsers, we compute the content statistics of " - "JSON documents." << std::endl; + std::cerr + << "Using different parsers, we compute the content statistics of " + "JSON documents." + << std::endl; std::cerr << "Usage: " << argv[0] << " " << std::endl; std::cerr << "Or " << argv[0] << " -v " << std::endl; exit(1); } const char *filename = argv[optind]; if (optind + 1 < argc) { - std::cerr << "warning: ignoring everything after " << argv[optind + 1] << std::endl; + std::cerr << "warning: ignoring everything after " << argv[optind + 1] + << std::endl; } simdjson::padded_string p; try { @@ -271,17 +274,17 @@ int main(int argc, char *argv[]) { std::cout << p.size() << " B "; std::cout << std::endl; } - stat_t s1 = simdjson_computestats(p); + stat_t s1 = simdjson_compute_stats(p); if (verbose) { printf("simdjson: "); print_stat(s1); } - stat_t s2 = rapid_computestats(p); + stat_t s2 = rapid_compute_stats(p); if (verbose) { printf("rapid: "); print_stat(s2); } - stat_t s3 = sasjon_computestats(p); + stat_t s3 = sasjon_compute_stats(p); if (verbose) { printf("sasjon: "); print_stat(s3); @@ -290,13 +293,13 @@ int main(int argc, char *argv[]) { assert(stat_equal(s1, s3)); int repeat = 50; int volume = p.size(); - if(justdata) { + if (just_data) { printf("name cycles_per_byte cycles_per_byte_err gb_per_s gb_per_s_err \n"); } - BEST_TIME("simdjson ", simdjson_computestats(p).valid, true, , repeat, - volume, !justdata); - BEST_TIME("RapidJSON ", rapid_computestats(p).valid, true, , repeat, volume, - !justdata); - BEST_TIME("sasjon ", sasjon_computestats(p).valid, true, , repeat, volume, - !justdata); + BEST_TIME("simdjson ", simdjson_compute_stats(p).valid, true, , repeat, + volume, !just_data); + BEST_TIME("RapidJSON ", rapid_compute_stats(p).valid, true, , repeat, volume, + !just_data); + BEST_TIME("sasjon ", sasjon_compute_stats(p).valid, true, , repeat, volume, + !just_data); } diff --git a/benchmark/parsingcompetition.cpp b/benchmark/parsingcompetition.cpp index 17297232..5930c4db 100644 --- a/benchmark/parsingcompetition.cpp +++ b/benchmark/parsingcompetition.cpp @@ -59,12 +59,12 @@ bool fastjson_parse(const char *input) { int main(int argc, char *argv[]) { bool verbose = false; - bool justdata = false; + bool just_data = false; int c; while ((c = getopt(argc, argv, "vt")) != -1) switch (c) { case 't': - justdata = true; + just_data = true; break; case 'v': verbose = true; @@ -102,24 +102,24 @@ int main(int argc, char *argv[]) { std::cout << std::endl; } simdjson::ParsedJson pj; - bool allocok = pj.allocateCapacity(p.size(), 1024); + bool allocok = pj.allocate_capacity(p.size(), 1024); if (!allocok) { std::cerr << "can't allocate memory" << std::endl; return EXIT_FAILURE; } - int repeat = (p.size() < 1 * 1000 * 1000 ? 1000 : 10); + int repeat = (p.size() < 1 * 1000 * 1000 ? 1000 : 10); int volume = p.size(); - if (justdata) { + if (just_data) { printf("%-42s %20s %20s %20s %20s \n", "name", "cycles_per_byte", "cycles_per_byte_err", "gb_per_s", "gb_per_s_err"); } - if (!justdata) - BEST_TIME("simdjson (dynamic mem) ", build_parsed_json(p).isValid(), true, , - repeat, volume, !justdata); + if (!just_data) + BEST_TIME("simdjson (dynamic mem) ", build_parsed_json(p).is_valid(), true, + , repeat, volume, !just_data); // (static alloc) BEST_TIME("simdjson ", json_parse(p, pj), simdjson::SUCCESS, , repeat, volume, - !justdata); + !just_data); rapidjson::Document d; @@ -127,56 +127,57 @@ int main(int argc, char *argv[]) { memcpy(buffer, p.data(), p.size()); buffer[p.size()] = '\0'; #ifndef ALLPARSER - if (!justdata) + if (!just_data) #endif - BEST_TIME( - "RapidJSON ", d.Parse((const char *)buffer) - .HasParseError(), - false, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata); + BEST_TIME("RapidJSON ", + d.Parse((const char *)buffer) + .HasParseError(), + false, memcpy(buffer, p.data(), p.size()), repeat, volume, + !just_data); BEST_TIME("RapidJSON (insitu)", d.ParseInsitu(buffer).HasParseError(), false, memcpy(buffer, p.data(), p.size()) && (buffer[p.size()] = '\0'), - repeat, volume, !justdata); + repeat, volume, !just_data); #ifndef ALLPARSER - if (!justdata) + if (!just_data) #endif BEST_TIME("sajson (dynamic mem)", sajson::parse(sajson::dynamic_allocation(), sajson::mutable_string_view(p.size(), buffer)) .is_valid(), true, memcpy(buffer, p.data(), p.size()), repeat, volume, - !justdata); + !just_data); - size_t astbuffersize = p.size(); - size_t *ast_buffer = (size_t *)malloc(astbuffersize * sizeof(size_t)); + size_t ast_buffer_size = p.size(); + size_t *ast_buffer = (size_t *)malloc(ast_buffer_size * sizeof(size_t)); // (static alloc, insitu) - BEST_TIME("sajson", - sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), - sajson::mutable_string_view(p.size(), buffer)) - .is_valid(), - true, memcpy(buffer, p.data(), p.size()), repeat, volume, - !justdata); + BEST_TIME( + "sajson", + sajson::parse(sajson::bounded_allocation(ast_buffer, ast_buffer_size), + sajson::mutable_string_view(p.size(), buffer)) + .is_valid(), + true, memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data); #ifdef ALLPARSER std::string json11err; BEST_TIME("dropbox (json11) ", ((json11::Json::parse(buffer, json11err).is_null()) || (!json11err.empty())), false, memcpy(buffer, p.data(), p.size()), repeat, volume, - !justdata); + !just_data); BEST_TIME("fastjson ", fastjson_parse(buffer), true, - memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata); + memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data); JsonValue value; JsonAllocator allocator; char *endptr; BEST_TIME("gason ", jsonParse(buffer, &endptr, &value, allocator), JSON_OK, memcpy(buffer, p.data(), p.size()), repeat, volume, - !justdata); + !just_data); void *state; BEST_TIME("ultrajson ", (UJDecode(buffer, p.size(), NULL, &state) == NULL), false, - memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata); + memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data); { std::unique_ptr tokens = @@ -185,32 +186,33 @@ int main(int argc, char *argv[]) { jsmn_init(&parser); memcpy(buffer, p.data(), p.size()); buffer[p.size()] = '\0'; - BEST_TIME("jsmn ", (jsmn_parse(&parser, buffer, p.size(), - tokens.get(), p.size()) > 0), - true, jsmn_init(&parser), repeat, volume, !justdata); + BEST_TIME( + "jsmn ", + (jsmn_parse(&parser, buffer, p.size(), tokens.get(), p.size()) > 0), + true, jsmn_init(&parser), repeat, volume, !just_data); } memcpy(buffer, p.data(), p.size()); buffer[p.size()] = '\0'; cJSON *tree = cJSON_Parse(buffer); BEST_TIME("cJSON ", ((tree = cJSON_Parse(buffer)) != NULL), true, - cJSON_Delete(tree), repeat, volume, !justdata); + cJSON_Delete(tree), repeat, volume, !just_data); cJSON_Delete(tree); Json::CharReaderBuilder b; - Json::CharReader *jsoncppreader = b.newCharReader(); + Json::CharReader *json_cpp_reader = b.newCharReader(); Json::Value root; Json::String errs; BEST_TIME("jsoncpp ", - jsoncppreader->parse(buffer, buffer + volume, &root, &errs), true, , - repeat, volume, !justdata); - delete jsoncppreader; + json_cpp_reader->parse(buffer, buffer + volume, &root, &errs), true, + , repeat, volume, !just_data); + delete json_cpp_reader; #endif - if (!justdata) + if (!just_data) BEST_TIME("memcpy ", (memcpy(buffer, p.data(), p.size()) == buffer), true, , repeat, - volume, !justdata); + volume, !just_data); #ifdef __linux__ - if (!justdata) { + if (!just_data) { printf("\n \n \n"); std::vector evts; @@ -265,7 +267,7 @@ int main(int argc, char *argv[]) { for (int i = 0; i < repeat; i++) { memcpy(buffer, p.data(), p.size()); unified.start(); - if (sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), + if (sajson::parse(sajson::bounded_allocation(ast_buffer, ast_buffer_size), sajson::mutable_string_view(p.size(), buffer)) .is_valid() != true) printf("bug\n"); diff --git a/benchmark/statisticalmodel.cpp b/benchmark/statisticalmodel.cpp index d79c853e..a43dd9ae 100644 --- a/benchmark/statisticalmodel.cpp +++ b/benchmark/statisticalmodel.cpp @@ -1,5 +1,5 @@ #include -#ifndef _MSC_VER +#ifndef _MSC_VER #include #endif #include "simdjson/jsonioutil.h" @@ -29,7 +29,7 @@ struct stat_s { size_t float_count; size_t string_count; size_t backslash_count; - size_t nonasciibyte_count; + size_t non_ascii_byte_count; size_t object_count; size_t array_count; size_t null_count; @@ -42,16 +42,17 @@ struct stat_s { using stat_t = struct stat_s; -stat_t simdjson_computestats(const simdjson::padded_string &p) { +stat_t simdjson_compute_stats(const simdjson::padded_string &p) { stat_t answer; simdjson::ParsedJson pj = simdjson::build_parsed_json(p); - answer.valid = pj.isValid(); + answer.valid = pj.is_valid(); if (!answer.valid) { return answer; } - answer.backslash_count = count_backslash(reinterpret_cast(p.data()), p.size()); - answer.nonasciibyte_count = - count_nonasciibytes(reinterpret_cast(p.data()), p.size()); + answer.backslash_count = + count_backslash(reinterpret_cast(p.data()), p.size()); + answer.non_ascii_byte_count = count_nonasciibytes( + reinterpret_cast(p.data()), p.size()); answer.byte_count = p.size(); answer.integer_count = 0; answer.float_count = 0; @@ -62,24 +63,24 @@ stat_t simdjson_computestats(const simdjson::padded_string &p) { answer.false_count = 0; answer.string_count = 0; answer.structural_indexes_count = pj.n_structural_indexes; - size_t tapeidx = 0; - uint64_t tape_val = pj.tape[tapeidx++]; + size_t tape_idx = 0; + uint64_t tape_val = pj.tape[tape_idx++]; uint8_t type = (tape_val >> 56); - size_t howmany = 0; + size_t how_many = 0; assert(type == 'r'); - howmany = tape_val & JSONVALUEMASK; - for (; tapeidx < howmany; tapeidx++) { - tape_val = pj.tape[tapeidx]; - // uint64_t payload = tape_val & JSONVALUEMASK; + how_many = tape_val & JSON_VALUE_MASK; + for (; tape_idx < how_many; tape_idx++) { + tape_val = pj.tape[tape_idx]; + // uint64_t payload = tape_val & JSON_VALUE_MASK; type = (tape_val >> 56); switch (type) { case 'l': // we have a long int answer.integer_count++; - tapeidx++; // skipping the integer + tape_idx++; // skipping the integer break; case 'd': // we have a double answer.float_count++; - tapeidx++; // skipping the double + tape_idx++; // skipping the double break; case 'n': // we have a null answer.null_count++; @@ -112,14 +113,14 @@ stat_t simdjson_computestats(const simdjson::padded_string &p) { int main(int argc, char *argv[]) { #ifndef _MSC_VER - int c; - while ((c = getopt(argc, argv, "")) != -1) { + int c; + while ((c = getopt(argc, argv, "")) != -1) { switch (c) { default: abort(); } -} + } #else int optind = 1; #endif @@ -141,30 +142,30 @@ int main(int argc, char *argv[]) { std::cerr << "Could not load the file " << filename << std::endl; return EXIT_FAILURE; } - stat_t s = simdjson_computestats(p); + stat_t s = simdjson_compute_stats(p); if (!s.valid) { std::cerr << "not a valid JSON" << std::endl; return EXIT_FAILURE; } printf("# integer_count float_count string_count backslash_count " - "nonasciibyte_count object_count array_count null_count true_count " + "non_ascii_byte_count object_count array_count null_count true_count " "false_count byte_count structural_indexes_count "); #ifdef __linux__ - printf( - " stage1_cycle_count stage1_instruction_count stage2_cycle_count " - " stage2_instruction_count stage3_cycle_count stage3_instruction_count "); + printf(" stage1_cycle_count stage1_instruction_count stage2_cycle_count " + " stage2_instruction_count stage3_cycle_count " + "stage3_instruction_count "); #else printf("(you are not under linux, so perf counters are disaabled)"); #endif printf("\n"); printf("%zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu ", s.integer_count, - s.float_count, s.string_count, s.backslash_count, s.nonasciibyte_count, - s.object_count, s.array_count, s.null_count, s.true_count, - s.false_count, s.byte_count, s.structural_indexes_count); + s.float_count, s.string_count, s.backslash_count, + s.non_ascii_byte_count, s.object_count, s.array_count, s.null_count, + s.true_count, s.false_count, s.byte_count, s.structural_indexes_count); #ifdef __linux__ simdjson::ParsedJson pj; - bool allocok = pj.allocateCapacity(p.size()); + bool allocok = pj.allocate_capacity(p.size()); if (!allocok) { std::cerr << "failed to allocate memory" << std::endl; return EXIT_FAILURE; @@ -180,20 +181,22 @@ int main(int argc, char *argv[]) { results.resize(evts.size()); for (uint32_t i = 0; i < iterations; i++) { unified.start(); - // The default template is simdjson::architecture::native. - bool isok = (simdjson::find_structural_bits<>(p.data(), p.size(), pj) == simdjson::SUCCESS); + // The default template is simdjson::Architecture::NATIVE. + bool isok = (simdjson::find_structural_bits<>(p.data(), p.size(), pj) == + simdjson::SUCCESS); unified.end(results); - + cy1 += results[0]; cl1 += results[1]; - + unified.start(); - isok = isok && (simdjson::SUCCESS == unified_machine(p.data(), p.size(), pj)); + isok = + isok && (simdjson::SUCCESS == unified_machine(p.data(), p.size(), pj)); unified.end(results); - + cy2 += results[0]; cl2 += results[1]; - if(!isok) { + if (!isok) { std::cerr << "failure?" << std::endl; } } diff --git a/include/simdjson/common_defs.h b/include/simdjson/common_defs.h index eb936984..81a442c3 100644 --- a/include/simdjson/common_defs.h +++ b/include/simdjson/common_defs.h @@ -10,11 +10,11 @@ // the input buf should be readable up to buf + SIMDJSON_PADDING #ifdef __AVX2__ -#define SIMDJSON_PADDING sizeof(__m256i) +#define SIMDJSON_PADDING sizeof(__m256i) #else // this is a stopgap; there should be a better description of the // main loop and its behavior that abstracts over this -#define SIMDJSON_PADDING 32 +#define SIMDJSON_PADDING 32 #endif #ifndef _MSC_VER @@ -23,7 +23,6 @@ #define SIMDJSON_USE_COMPUTED_GOTO #endif - // Align to N-byte boundary #define ROUNDUP_N(a, n) (((a) + ((n)-1)) & ~((n)-1)) #define ROUNDDOWN_N(a, n) ((a) & ~((n)-1)) @@ -49,13 +48,13 @@ #else -// For non-Visual Studio compilers, we may assume that same-page buffer overrun is fine. -// However, it will make it difficult to be "valgrind clean". +// For non-Visual Studio compilers, we may assume that same-page buffer overrun +// is fine. However, it will make it difficult to be "valgrind clean". //#ifndef ALLOW_SAME_PAGE_BUFFER_OVERRUN //#define ALLOW_SAME_PAGE_BUFFER_OVERRUN true //#else #define ALLOW_SAME_PAGE_BUFFER_OVERRUN false -//#endif +//#endif // The following is likely unnecessarily complex. #ifdef __SANITIZE_ADDRESS__ @@ -63,16 +62,18 @@ #define ALLOW_SAME_PAGE_BUFFER_OVERRUN false #elif defined(__has_feature) // we have CLANG? -// todo: if we're setting ALLOW_SAME_PAGE_BUFFER_OVERRUN to false, why do we have a non-empty qualifier? -# if (__has_feature(address_sanitizer)) -#define ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER __attribute__((no_sanitize("address"))) -# endif -#endif +// todo: if we're setting ALLOW_SAME_PAGE_BUFFER_OVERRUN to false, why do we +// have a non-empty qualifier? +#if (__has_feature(address_sanitizer)) +#define ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER \ + __attribute__((no_sanitize("address"))) +#endif +#endif #if defined(__has_feature) -# if (__has_feature(memory_sanitizer)) +#if (__has_feature(memory_sanitizer)) #define LENIENT_MEM_SANITIZER __attribute__((no_sanitize("memory"))) -# endif +#endif #endif #define really_inline inline __attribute__((always_inline, unused)) @@ -88,7 +89,7 @@ #define unlikely(x) __builtin_expect(!!(x), 0) #endif -#endif // MSC_VER +#endif // MSC_VER // if it does not apply, make it an empty macro #ifndef ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER diff --git a/include/simdjson/isadetection.h b/include/simdjson/isadetection.h index 903c8118..8f4bba4c 100644 --- a/include/simdjson/isadetection.h +++ b/include/simdjson/isadetection.h @@ -1,5 +1,6 @@ -/* From https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h -Highly modified. +/* From +https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h +Highly modified. Copyright (c) 2016- Facebook, Inc (Adam Paszke) Copyright (c) 2014- Facebook, Inc (Soumith Chintala) @@ -7,9 +8,10 @@ Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) Copyright (c) 2011-2013 NYU (Clement Farabet) -Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) -Copyright (c) 2006 Idiap Research Institute (Samy Bengio) -Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) +Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, +Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute +(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, +Samy Bengio, Johnny Mariethoz) All rights reserved. @@ -23,8 +25,8 @@ modification, are permitted provided that the following conditions are met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America - and IDIAP Research Institute nor the names of its contributors may be +3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories +America and IDIAP Research Institute nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. @@ -60,51 +62,48 @@ constexpr uint32_t cpuid_bmi2_bit = 1 << 8; // bit 8 of EBX for EAX=0x7 constexpr uint32_t cpuid_sse42_bit = 1 << 20; // bit 20 of ECX for EAX=0x1 constexpr uint32_t cpuid_pclmulqdq_bit = 1 << 1; // bit 1 of ECX for EAX=0x1 -enum SIMDExtensions { - DEFAULT = 0x0, - NEON = 0x1, - AVX2 = 0x4, - SSE42 = 0x8, +enum instruction_set { + DEFAULT = 0x0, + NEON = 0x1, + AVX2 = 0x4, + SSE42 = 0x8, PCLMULQDQ = 0x10, - BMI1 = 0x20, - BMI2 = 0x40 + BMI1 = 0x20, + BMI2 = 0x40 }; #if defined(__arm__) || defined(__aarch64__) // incl. armel, armhf, arm64 - #if defined(__NEON__) +#if defined(__NEON__) -static inline uint32_t detect_supported_architectures() -{ - return SIMDExtensions::NEON; +static inline uint32_t detect_supported_architectures() { + return instruction_set::NEON; } - #else //ARM without NEON +#else // ARM without NEON -static inline uint32_t detect_supported_architectures() -{ - return SIMDExtensions::DEFAULT; +static inline uint32_t detect_supported_architectures() { + return instruction_set::DEFAULT; } - #endif - -#else // x86 -static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) -{ +#endif + +#else // x86 +static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, + uint32_t *edx) { #if defined(_MSC_VER) - int cpuInfo[4]; - __cpuid(cpuInfo, *eax); - *eax = cpuInfo[0]; - *ebx = cpuInfo[1]; - *ecx = cpuInfo[2]; - *edx = cpuInfo[3]; + int cpu_info[4]; + __cpuid(cpu_info, *eax); + *eax = cpu_info[0]; + *ebx = cpu_info[1]; + *ecx = cpu_info[2]; + *edx = cpu_info[3]; #elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) uint32_t level = *eax; - __get_cpuid (level, eax, ebx, ecx, edx); + __get_cpuid(level, eax, ebx, ecx, edx); #else uint32_t a = *eax, b, c = *ecx, d; - asm volatile ( "cpuid\n\t" - : "+a"(a), "=b"(b), "+c"(c), "=d"(d) ); + asm volatile("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d)); *eax = a; *ebx = b; *ecx = c; @@ -112,10 +111,9 @@ static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t * #endif } -static inline uint32_t detect_supported_architectures() -{ +static inline uint32_t detect_supported_architectures() { uint32_t eax, ebx, ecx, edx; - uint32_t hostSimdExts = 0x0; + uint32_t host_isa = 0x0; // ECX for EAX=0x7 eax = 0x7; @@ -123,15 +121,15 @@ static inline uint32_t detect_supported_architectures() cpuid(&eax, &ebx, &ecx, &edx); if (ebx & cpuid_avx2_bit) { - hostSimdExts |= SIMDExtensions::AVX2; + host_isa |= instruction_set::AVX2; } if (ebx & cpuid_bmi1_bit) { - hostSimdExts |= SIMDExtensions::BMI1; + host_isa |= instruction_set::BMI1; } if (ebx & cpuid_bmi2_bit) { - hostSimdExts |= SIMDExtensions::BMI2; + host_isa |= instruction_set::BMI2; } // EBX for EAX=0x1 @@ -139,16 +137,16 @@ static inline uint32_t detect_supported_architectures() cpuid(&eax, &ebx, &ecx, &edx); if (ecx & cpuid_sse42_bit) { - hostSimdExts |= SIMDExtensions::SSE42; + host_isa |= instruction_set::SSE42; } if (ecx & cpuid_pclmulqdq_bit) { - hostSimdExts |= SIMDExtensions::PCLMULQDQ; + host_isa |= instruction_set::PCLMULQDQ; } - return hostSimdExts; + return host_isa; } #endif // end SIMD extension detection code -} +} // namespace simdjson #endif diff --git a/include/simdjson/jsoncharutils.h b/include/simdjson/jsoncharutils.h index 261f0e9e..361ff989 100644 --- a/include/simdjson/jsoncharutils.h +++ b/include/simdjson/jsoncharutils.h @@ -35,7 +35,6 @@ really_inline uint32_t is_not_structural_or_whitespace_or_null(uint8_t c) { return structural_or_whitespace_or_null_negated[c]; } - const uint32_t structural_or_whitespace_negated[256] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -76,7 +75,6 @@ really_inline uint32_t is_structural_or_whitespace_or_null(uint8_t c) { return structural_or_whitespace_or_null[c]; } - const uint32_t structural_or_whitespace[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, @@ -94,7 +92,7 @@ really_inline uint32_t is_structural_or_whitespace(uint8_t c) { return structural_or_whitespace[c]; } -const uint32_t digittoval32[886] = { +const uint32_t digit_to_val32[886] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, @@ -103,7 +101,7 @@ const uint32_t digittoval32[886] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFFFFFFFF, @@ -138,7 +136,7 @@ const uint32_t digittoval32[886] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0, 0xFFFFFFFF, @@ -173,7 +171,7 @@ const uint32_t digittoval32[886] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0x0, 0x100, 0x200, 0x300, 0x400, 0x500, + 0x0, 0x100, 0x200, 0x300, 0x400, 0x500, 0x600, 0x700, 0x800, 0x900, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa00, 0xb00, 0xc00, 0xd00, 0xe00, 0xf00, 0xFFFFFFFF, @@ -208,7 +206,7 @@ const uint32_t digittoval32[886] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, - 0x0, 0x1000, 0x2000, 0x3000, 0x4000, 0x5000, + 0x0, 0x1000, 0x2000, 0x3000, 0x4000, 0x5000, 0x6000, 0x7000, 0x8000, 0x9000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa000, 0xb000, 0xc000, 0xd000, 0xe000, 0xf000, 0xFFFFFFFF, @@ -244,15 +242,17 @@ const uint32_t digittoval32[886] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}; // returns a value with the high 16 bits set if not valid -// otherwise returns the conversion of the 4 hex digits at src into the bottom 16 bits of the 32-bit -// return register +// otherwise returns the conversion of the 4 hex digits at src into the bottom +// 16 bits of the 32-bit return register // -// see https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/ -static inline uint32_t hex_to_u32_nocheck(const uint8_t *src) {// strictly speaking, static inline is a C-ism - uint32_t v1 = digittoval32[630 + src[0]]; - uint32_t v2 = digittoval32[420 + src[1]]; - uint32_t v3 = digittoval32[210 + src[2]]; - uint32_t v4 = digittoval32[0 + src[3]]; +// see +// https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/ +static inline uint32_t hex_to_u32_nocheck( + const uint8_t *src) { // strictly speaking, static inline is a C-ism + uint32_t v1 = digit_to_val32[630 + src[0]]; + uint32_t v2 = digit_to_val32[420 + src[1]]; + uint32_t v3 = digit_to_val32[210 + src[2]]; + uint32_t v4 = digit_to_val32[0 + src[3]]; return v1 | v2 | v3 | v4; } @@ -272,19 +272,21 @@ inline size_t codepoint_to_utf8(uint32_t cp, uint8_t *c) { if (cp <= 0x7F) { c[0] = cp; return 1; // ascii - } if (cp <= 0x7FF) { + } + if (cp <= 0x7FF) { c[0] = (cp >> 6) + 192; c[1] = (cp & 63) + 128; return 2; // universal plane - // Surrogates are treated elsewhere... - //} //else if (0xd800 <= cp && cp <= 0xdfff) { - // return 0; // surrogates // could put assert here + // Surrogates are treated elsewhere... + //} //else if (0xd800 <= cp && cp <= 0xdfff) { + // return 0; // surrogates // could put assert here } else if (cp <= 0xFFFF) { c[0] = (cp >> 12) + 224; c[1] = ((cp >> 6) & 63) + 128; c[2] = (cp & 63) + 128; return 3; - } else if (cp <= 0x10FFFF) { // if you know you have a valid code point, this is not needed + } else if (cp <= 0x10FFFF) { // if you know you have a valid code point, this + // is not needed c[0] = (cp >> 18) + 240; c[1] = ((cp >> 12) & 63) + 128; c[2] = ((cp >> 6) & 63) + 128; @@ -294,6 +296,6 @@ inline size_t codepoint_to_utf8(uint32_t cp, uint8_t *c) { // will return 0 when the code point was too large. return 0; // bad r } -} +} // namespace simdjson #endif diff --git a/include/simdjson/jsonformatutils.h b/include/simdjson/jsonformatutils.h index 8b86ec1c..f4eeea52 100644 --- a/include/simdjson/jsonformatutils.h +++ b/include/simdjson/jsonformatutils.h @@ -10,40 +10,40 @@ namespace simdjson { static inline void print_with_escapes(const unsigned char *src) { while (*src) { switch (*src) { - case '\b': - putchar('\\'); - putchar('b'); - break; - case '\f': - putchar('\\'); - putchar('f'); - break; - case '\n': - putchar('\\'); - putchar('n'); - break; - case '\r': - putchar('\\'); - putchar('r'); - break; - case '\"': - putchar('\\'); - putchar('"'); - break; - case '\t': - putchar('\\'); - putchar('t'); - break; - case '\\': - putchar('\\'); - putchar('\\'); - break; - default: - if (*src <= 0x1F) { - printf("\\u%04x", *src); - } else { - putchar(*src); - } + case '\b': + putchar('\\'); + putchar('b'); + break; + case '\f': + putchar('\\'); + putchar('f'); + break; + case '\n': + putchar('\\'); + putchar('n'); + break; + case '\r': + putchar('\\'); + putchar('r'); + break; + case '\"': + putchar('\\'); + putchar('"'); + break; + case '\t': + putchar('\\'); + putchar('t'); + break; + case '\\': + putchar('\\'); + putchar('\\'); + break; + default: + if (*src <= 0x1F) { + printf("\\u%04x", *src); + } else { + putchar(*src); + } } src++; } @@ -54,43 +54,43 @@ static inline void print_with_escapes(const unsigned char *src, std::ostream &os) { while (*src) { switch (*src) { - case '\b': - os << '\\'; - os << 'b'; - break; - case '\f': - os << '\\'; - os << 'f'; - break; - case '\n': - os << '\\'; - os << 'n'; - break; - case '\r': - os << '\\'; - os << 'r'; - break; - case '\"': - os << '\\'; - os << '"'; - break; - case '\t': - os << '\\'; - os << 't'; - break; - case '\\': - os << '\\'; - os << '\\'; - break; - default: - if (*src <= 0x1F) { - std::ios::fmtflags f(os.flags()); - os << std::hex << std::setw(4) << std::setfill('0') - << static_cast(*src); - os.flags(f); - } else { - os << *src; - } + case '\b': + os << '\\'; + os << 'b'; + break; + case '\f': + os << '\\'; + os << 'f'; + break; + case '\n': + os << '\\'; + os << 'n'; + break; + case '\r': + os << '\\'; + os << 'r'; + break; + case '\"': + os << '\\'; + os << '"'; + break; + case '\t': + os << '\\'; + os << 't'; + break; + case '\\': + os << '\\'; + os << '\\'; + break; + default: + if (*src <= 0x1F) { + std::ios::fmtflags f(os.flags()); + os << std::hex << std::setw(4) << std::setfill('0') + << static_cast(*src); + os.flags(f); + } else { + os << *src; + } } src++; } @@ -101,40 +101,40 @@ static inline void print_with_escapes(const unsigned char *src, size_t len) { const unsigned char *finalsrc = src + len; while (src < finalsrc) { switch (*src) { - case '\b': - putchar('\\'); - putchar('b'); - break; - case '\f': - putchar('\\'); - putchar('f'); - break; - case '\n': - putchar('\\'); - putchar('n'); - break; - case '\r': - putchar('\\'); - putchar('r'); - break; - case '\"': - putchar('\\'); - putchar('"'); - break; - case '\t': - putchar('\\'); - putchar('t'); - break; - case '\\': - putchar('\\'); - putchar('\\'); - break; - default: - if (*src <= 0x1F) { - printf("\\u%04x", *src); - } else { - putchar(*src); - } + case '\b': + putchar('\\'); + putchar('b'); + break; + case '\f': + putchar('\\'); + putchar('f'); + break; + case '\n': + putchar('\\'); + putchar('n'); + break; + case '\r': + putchar('\\'); + putchar('r'); + break; + case '\"': + putchar('\\'); + putchar('"'); + break; + case '\t': + putchar('\\'); + putchar('t'); + break; + case '\\': + putchar('\\'); + putchar('\\'); + break; + default: + if (*src <= 0x1F) { + printf("\\u%04x", *src); + } else { + putchar(*src); + } } src++; } @@ -146,43 +146,43 @@ static inline void print_with_escapes(const unsigned char *src, const unsigned char *finalsrc = src + len; while (src < finalsrc) { switch (*src) { - case '\b': - os << '\\'; - os << 'b'; - break; - case '\f': - os << '\\'; - os << 'f'; - break; - case '\n': - os << '\\'; - os << 'n'; - break; - case '\r': - os << '\\'; - os << 'r'; - break; - case '\"': - os << '\\'; - os << '"'; - break; - case '\t': - os << '\\'; - os << 't'; - break; - case '\\': - os << '\\'; - os << '\\'; - break; - default: - if (*src <= 0x1F) { - std::ios::fmtflags f(os.flags()); - os << std::hex << std::setw(4) << std::setfill('0') - << static_cast(*src); - os.flags(f); - } else { - os << *src; - } + case '\b': + os << '\\'; + os << 'b'; + break; + case '\f': + os << '\\'; + os << 'f'; + break; + case '\n': + os << '\\'; + os << 'n'; + break; + case '\r': + os << '\\'; + os << 'r'; + break; + case '\"': + os << '\\'; + os << '"'; + break; + case '\t': + os << '\\'; + os << 't'; + break; + case '\\': + os << '\\'; + os << '\\'; + break; + default: + if (*src <= 0x1F) { + std::ios::fmtflags f(os.flags()); + os << std::hex << std::setw(4) << std::setfill('0') + << static_cast(*src); + os.flags(f); + } else { + os << *src; + } } src++; } @@ -196,7 +196,7 @@ static inline void print_with_escapes(const char *src, std::ostream &os, size_t len) { print_with_escapes(reinterpret_cast(src), os, len); } -} +} // namespace simdjson # #endif diff --git a/include/simdjson/jsonioutil.h b/include/simdjson/jsonioutil.h index 407718d0..07b163d2 100644 --- a/include/simdjson/jsonioutil.h +++ b/include/simdjson/jsonioutil.h @@ -8,10 +8,8 @@ #include #include - #include "simdjson/padded_string.h" - namespace simdjson { // load a file in memory... @@ -20,15 +18,15 @@ namespace simdjson { // first element of the pair is a string (null terminated) // whereas the second element is the length. // caller is responsible to free (aligned_free((void*)result.data()))) -// +// // throws an exception if the file cannot be opened, use try/catch // try { // p = get_corpus(filename); -// } catch (const std::exception& e) { +// } catch (const std::exception& e) { // aligned_free((void*)p.data()); // std::cout << "Could not load the file " << filename << std::endl; // } -padded_string get_corpus(const std::string& filename); -} +padded_string get_corpus(const std::string &filename); +} // namespace simdjson #endif diff --git a/include/simdjson/jsonminifier.h b/include/simdjson/jsonminifier.h index 0ff9f1ce..c510d0f4 100644 --- a/include/simdjson/jsonminifier.h +++ b/include/simdjson/jsonminifier.h @@ -1,10 +1,10 @@ #ifndef SIMDJSON_JSONMINIFIER_H #define SIMDJSON_JSONMINIFIER_H +#include "simdjson/padded_string.h" #include #include #include -#include "simdjson/padded_string.h" namespace simdjson { @@ -12,20 +12,19 @@ namespace simdjson { // out can be the same pointer. Result is null terminated, // return the string length (minus the null termination). // The accelerated version of this function only runs on AVX2 hardware. -size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out); +size_t json_minify(const uint8_t *buf, size_t len, uint8_t *out); - -static inline size_t jsonminify(const char *buf, size_t len, char *out) { - return jsonminify(reinterpret_cast(buf), len, reinterpret_cast(out)); +static inline size_t json_minify(const char *buf, size_t len, char *out) { + return json_minify(reinterpret_cast(buf), len, + reinterpret_cast(out)); } - -static inline size_t jsonminify(const std::string_view & p, char *out) { - return jsonminify(p.data(), p.size(), out); +static inline size_t json_minify(const std::string_view &p, char *out) { + return json_minify(p.data(), p.size(), out); } -static inline size_t jsonminify(const padded_string & p, char *out) { - return jsonminify(p.data(), p.size(), out); -} +static inline size_t json_minify(const padded_string &p, char *out) { + return json_minify(p.data(), p.size(), out); } +} // namespace simdjson #endif diff --git a/include/simdjson/jsonparser.h b/include/simdjson/jsonparser.h index ddd4dc61..283a0488 100644 --- a/include/simdjson/jsonparser.h +++ b/include/simdjson/jsonparser.h @@ -1,136 +1,161 @@ #ifndef SIMDJSON_JSONPARSER_H #define SIMDJSON_JSONPARSER_H -#include #include "simdjson/common_defs.h" -#include "simdjson/padded_string.h" #include "simdjson/jsonioutil.h" +#include "simdjson/padded_string.h" #include "simdjson/parsedjson.h" +#include "simdjson/simdjson.h" #include "simdjson/stage1_find_marks.h" #include "simdjson/stage2_build_tape.h" -#include "simdjson/simdjson.h" +#include #ifdef _MSC_VER #include -#include +#include // must be included after windows.h #else #include #endif namespace simdjson { // The function that users are expected to call is json_parse. -// We have more than one such function because we want to support several +// We have more than one such function because we want to support several // instruction sets. // function pointer type for json_parse -using json_parse_functype = int (const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded); +using json_parse_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj, + bool realloc_if_needed); -// Pointer that holds the json_parse implementation corresponding to the available SIMD instruction set +// Pointer that holds the json_parse implementation corresponding to the +// available SIMD instruction set extern json_parse_functype *json_parse_ptr; -// json_parse_implementation is the generic function, it is specialized for various -// architectures, e.g., as json_parse_implementation -// or json_parse_implementation -template -int json_parse_implementation(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded = true) { - if (pj.bytecapacity < len) { +// json_parse_implementation is the generic function, it is specialized for +// various architectures, e.g., as +// json_parse_implementation or +// json_parse_implementation +template +int json_parse_implementation(const uint8_t *buf, size_t len, ParsedJson &pj, + bool realloc_if_needed = true) { + if (pj.byte_capacity < len) { return simdjson::CAPACITY; } bool reallocated = false; - if(reallocifneeded) { + if (realloc_if_needed) { #if ALLOW_SAME_PAGE_BUFFER_OVERRUN // realloc is needed if the end of the memory crosses a page #ifdef _MSC_VER - SYSTEM_INFO sysInfo; - GetSystemInfo(&sysInfo); - long pagesize = sysInfo.dwPageSize; + SYSTEM_INFO sysInfo; + GetSystemInfo(&sysInfo); + long page_size = sysInfo.dwPageSize; #else - long pagesize = sysconf (_SC_PAGESIZE); + long page_size = sysconf(_SC_PAGESIZE); #endif ////////////// // We want to check that buf + len - 1 and buf + len - 1 + SIMDJSON_PADDING // are in the same page. - // That is, we want to check that - // (buf + len - 1) / pagesize == (buf + len - 1 + SIMDJSON_PADDING) / pagesize - // That's true if (buf + len - 1) % pagesize + SIMDJSON_PADDING < pagesize. + // That is, we want to check that + // (buf + len - 1) / page_size == (buf + len - 1 + SIMDJSON_PADDING) / + // page_size That's true if (buf + len - 1) % page_size + SIMDJSON_PADDING < + // page_size. /////////// - if ( (reinterpret_cast(buf + len - 1) % pagesize ) + SIMDJSON_PADDING < static_cast(pagesize) ) { + if ((reinterpret_cast(buf + len - 1) % page_size) + + SIMDJSON_PADDING < + static_cast(page_size)) { #else // SIMDJSON_SAFE_SAME_PAGE_READ_OVERRUN - if(true) { // if not SIMDJSON_SAFE_SAME_PAGE_READ_OVERRUN, we always reallocate + if (true) { // if not SIMDJSON_SAFE_SAME_PAGE_READ_OVERRUN, we always + // reallocate #endif - const uint8_t *tmpbuf = buf; - buf = (uint8_t *) allocate_padded_buffer(len); - if(buf == NULL) return simdjson::MEMALLOC; - memcpy((void*)buf,tmpbuf,len); + const uint8_t *tmp_buf = buf; + buf = (uint8_t *)allocate_padded_buffer(len); + if (buf == NULL) + return simdjson::MEMALLOC; + memcpy((void *)buf, tmp_buf, len); reallocated = true; - } // if (true) OR if ( (reinterpret_cast(buf + len - 1) % pagesize ) + SIMDJSON_PADDING < static_cast(pagesize) ) { - } // if(reallocifneeded) { + } // if (true) OR if ( (reinterpret_cast(buf + len - 1) % + // page_size ) + SIMDJSON_PADDING < static_cast(page_size) ) { + } // if(realloc_if_needed) { int stage1_is_ok = simdjson::find_structural_bits(buf, len, pj); - if(stage1_is_ok != simdjson::SUCCESS) { - pj.errorcode = stage1_is_ok; - return pj.errorcode; - } + if (stage1_is_ok != simdjson::SUCCESS) { + pj.error_code = stage1_is_ok; + return pj.error_code; + } int res = unified_machine(buf, len, pj); - if(reallocated) { aligned_free((void*)buf);} + if (reallocated) { + aligned_free((void *)buf); + } return res; } -// Parse a document found in buf. -// -// The content should be a valid JSON document encoded as UTF-8. If there is a UTF-8 BOM, the caller -// is responsible for omitting it, UTF-8 BOM are discouraged. +// Parse a document found in buf. // -// You need to preallocate ParsedJson with a capacity of len (e.g., pj.allocateCapacity(len)). +// The content should be a valid JSON document encoded as UTF-8. If there is a +// UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are +// discouraged. // -// The function returns simdjson::SUCCESS (an integer = 0) in case of a success or an error code from -// simdjson/simdjson.h in case of failure such as simdjson::CAPACITY, simdjson::MEMALLOC, -// simdjson::DEPTH_ERROR and so forth; the simdjson::errorMsg function converts these error codes -// into a string). +// You need to preallocate ParsedJson with a capacity of len (e.g., +// pj.allocate_capacity(len)). // -// You can also check validity by calling pj.isValid(). The same ParsedJson can be reused for other documents. +// The function returns simdjson::SUCCESS (an integer = 0) in case of a success +// or an error code from simdjson/simdjson.h in case of failure such as +// simdjson::CAPACITY, simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth; +// the simdjson::error_message function converts these error codes into a +// string). // -// If reallocifneeded is true (default) then a temporary buffer is created when needed during processing -// (a copy of the input string is made). -// The input buf should be readable up to buf + len + SIMDJSON_PADDING if reallocifneeded is false, -// all bytes at and after buf + len are ignored (can be garbage). -// The ParsedJson object can be reused. +// You can also check validity by calling pj.is_valid(). The same ParsedJson can +// be reused for other documents. +// +// If realloc_if_needed is true (default) then a temporary buffer is created +// when needed during processing (a copy of the input string is made). The input +// buf should be readable up to buf + len + SIMDJSON_PADDING if +// realloc_if_needed is false, all bytes at and after buf + len are ignored +// (can be garbage). The ParsedJson object can be reused. -inline int json_parse(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded = true) { - return json_parse_ptr(buf, len, pj, reallocifneeded); +inline int json_parse(const uint8_t *buf, size_t len, ParsedJson &pj, + bool realloc_if_needed = true) { + return json_parse_ptr(buf, len, pj, realloc_if_needed); } // Parse a document found in buf. -// -// The content should be a valid JSON document encoded as UTF-8. If there is a UTF-8 BOM, the caller -// is responsible for omitting it, UTF-8 BOM are discouraged. // -// You need to preallocate ParsedJson with a capacity of len (e.g., pj.allocateCapacity(len)). +// The content should be a valid JSON document encoded as UTF-8. If there is a +// UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are +// discouraged. // -// The function returns simdjson::SUCCESS (an integer = 0) in case of a success or an error code from -// simdjson/simdjson.h in case of failure such as simdjson::CAPACITY, simdjson::MEMALLOC, -// simdjson::DEPTH_ERROR and so forth; the simdjson::errorMsg function converts these error codes -// into a string). +// You need to preallocate ParsedJson with a capacity of len (e.g., +// pj.allocate_capacity(len)). +// +// The function returns simdjson::SUCCESS (an integer = 0) in case of a success +// or an error code from simdjson/simdjson.h in case of failure such as +// simdjson::CAPACITY, simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth; +// the simdjson::error_message function converts these error codes into a +// string). // // You can also check validity -// by calling pj.isValid(). The same ParsedJson can be reused for other documents. +// by calling pj.is_valid(). The same ParsedJson can be reused for other +// documents. // -// If reallocifneeded is true (default) then a temporary buffer is created when needed during processing -// (a copy of the input string is made). -// The input buf should be readable up to buf + len + SIMDJSON_PADDING if reallocifneeded is false, -// all bytes at and after buf + len are ignored (can be garbage). -// The ParsedJson object can be reused. -inline int json_parse(const char * buf, size_t len, ParsedJson &pj, bool reallocifneeded = true) { - return json_parse_ptr(reinterpret_cast(buf), len, pj, reallocifneeded); +// If realloc_if_needed is true (default) then a temporary buffer is created +// when needed during processing (a copy of the input string is made). The input +// buf should be readable up to buf + len + SIMDJSON_PADDING if +// realloc_if_needed is false, all bytes at and after buf + len are ignored +// (can be garbage). The ParsedJson object can be reused. +inline int json_parse(const char *buf, size_t len, ParsedJson &pj, + bool realloc_if_needed = true) { + return json_parse_ptr(reinterpret_cast(buf), len, pj, + realloc_if_needed); } // We do not want to allow implicit conversion from C string to std::string. -int json_parse(const char * buf, ParsedJson &pj) = delete; +int json_parse(const char *buf, ParsedJson &pj) = delete; // Parse a document found in in string s. -// You need to preallocate ParsedJson with a capacity of len (e.g., pj.allocateCapacity(len)). +// You need to preallocate ParsedJson with a capacity of len (e.g., +// pj.allocate_capacity(len)). // -// The function returns simdjson::SUCCESS (an integer = 0) in case of a success or an error code from -// simdjson/simdjson.h in case of failure such as simdjson::CAPACITY, simdjson::MEMALLOC, -// simdjson::DEPTH_ERROR and so forth; the simdjson::errorMsg function converts these error codes -// into a string). +// The function returns simdjson::SUCCESS (an integer = 0) in case of a success +// or an error code from simdjson/simdjson.h in case of failure such as +// simdjson::CAPACITY, simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth; +// the simdjson::error_message function converts these error codes into a +// string). // // A temporary buffer is created when needed during processing // (a copy of the input string is made). @@ -139,72 +164,82 @@ inline int json_parse(const std::string &s, ParsedJson &pj) { } // Parse a document found in in string s. -// -// The content should be a valid JSON document encoded as UTF-8. If there is a UTF-8 BOM, the caller -// is responsible for omitting it, UTF-8 BOM are discouraged. // -// You need to preallocate ParsedJson with a capacity of len (e.g., pj.allocateCapacity(len)). +// The content should be a valid JSON document encoded as UTF-8. If there is a +// UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are +// discouraged. // -// The function returns simdjson::SUCCESS (an integer = 0) in case of a success or an error code from -// simdjson/simdjson.h in case of failure such as simdjson::CAPACITY, simdjson::MEMALLOC, -// simdjson::DEPTH_ERROR and so forth; the simdjson::errorMsg function converts these error codes -// into a string). +// You need to preallocate ParsedJson with a capacity of len (e.g., +// pj.allocate_capacity(len)). +// +// The function returns simdjson::SUCCESS (an integer = 0) in case of a success +// or an error code from simdjson/simdjson.h in case of failure such as +// simdjson::CAPACITY, simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth; +// the simdjson::error_message function converts these error codes into a +// string). // // You can also check validity -// by calling pj.isValid(). The same ParsedJson can be reused for other documents. +// by calling pj.is_valid(). The same ParsedJson can be reused for other +// documents. inline int json_parse(const padded_string &s, ParsedJson &pj) { return json_parse(s.data(), s.length(), pj, false); } - // Build a ParsedJson object. You can check validity -// by calling pj.isValid(). This does the memory allocation needed for ParsedJson. -// If reallocifneeded is true (default) then a temporary buffer is created when needed during processing -// (a copy of the input string is made). +// by calling pj.is_valid(). This does the memory allocation needed for +// ParsedJson. If realloc_if_needed is true (default) then a temporary buffer is +// created when needed during processing (a copy of the input string is made). // -// The input buf should be readable up to buf + len + SIMDJSON_PADDING if reallocifneeded is false, -// all bytes at and after buf + len are ignored (can be garbage). -// -// The content should be a valid JSON document encoded as UTF-8. If there is a UTF-8 BOM, the caller -// is responsible for omitting it, UTF-8 BOM are discouraged. +// The input buf should be readable up to buf + len + SIMDJSON_PADDING if +// realloc_if_needed is false, all bytes at and after buf + len are ignored +// (can be garbage). +// +// The content should be a valid JSON document encoded as UTF-8. If there is a +// UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are +// discouraged. // // This is a convenience function which calls json_parse. WARN_UNUSED -ParsedJson build_parsed_json(const uint8_t *buf, size_t len, bool reallocifneeded = true); +ParsedJson build_parsed_json(const uint8_t *buf, size_t len, + bool realloc_if_needed = true); WARN_UNUSED // Build a ParsedJson object. You can check validity -// by calling pj.isValid(). This does the memory allocation needed for ParsedJson. -// If reallocifneeded is true (default) then a temporary buffer is created when needed during processing -// (a copy of the input string is made). +// by calling pj.is_valid(). This does the memory allocation needed for +// ParsedJson. If realloc_if_needed is true (default) then a temporary buffer is +// created when needed during processing (a copy of the input string is made). // -// The input buf should be readable up to buf + len + SIMDJSON_PADDING if reallocifneeded is false, -// all bytes at and after buf + len are ignored (can be garbage). +// The input buf should be readable up to buf + len + SIMDJSON_PADDING if +// realloc_if_needed is false, all bytes at and after buf + len are ignored +// (can be garbage). // -// -// The content should be a valid JSON document encoded as UTF-8. If there is a UTF-8 BOM, the caller -// is responsible for omitting it, UTF-8 BOM are discouraged. +// +// The content should be a valid JSON document encoded as UTF-8. If there is a +// UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are +// discouraged. // // This is a convenience function which calls json_parse. -inline ParsedJson build_parsed_json(const char * buf, size_t len, bool reallocifneeded = true) { - return build_parsed_json(reinterpret_cast(buf), len, reallocifneeded); +inline ParsedJson build_parsed_json(const char *buf, size_t len, + bool realloc_if_needed = true) { + return build_parsed_json(reinterpret_cast(buf), len, + realloc_if_needed); } - // We do not want to allow implicit conversion from C string to std::string. ParsedJson build_parsed_json(const char *buf) = delete; - // Parse a document found in in string s. -// You need to preallocate ParsedJson with a capacity of len (e.g., pj.allocateCapacity(len)). -// Return SUCCESS (an integer = 0) in case of a success. You can also check validity -// by calling pj.isValid(). The same ParsedJson can be reused for other documents. +// You need to preallocate ParsedJson with a capacity of len (e.g., +// pj.allocate_capacity(len)). Return SUCCESS (an integer = 0) in case of a +// success. You can also check validity by calling pj.is_valid(). The same +// ParsedJson can be reused for other documents. // // A temporary buffer is created when needed during processing // (a copy of the input string is made). -// -// The content should be a valid JSON document encoded as UTF-8. If there is a UTF-8 BOM, the caller -// is responsible for omitting it, UTF-8 BOM are discouraged. +// +// The content should be a valid JSON document encoded as UTF-8. If there is a +// UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are +// discouraged. // // This is a convenience function which calls json_parse. WARN_UNUSED @@ -212,19 +247,20 @@ inline ParsedJson build_parsed_json(const std::string &s) { return build_parsed_json(s.data(), s.length(), true); } - // Parse a document found in in string s. -// You need to preallocate ParsedJson with a capacity of len (e.g., pj.allocateCapacity(len)). -// Return SUCCESS (an integer = 0) in case of a success. You can also check validity -// by calling pj.isValid(). The same ParsedJson can be reused for other documents. -// -// The content should be a valid JSON document encoded as UTF-8. If there is a UTF-8 BOM, the caller -// is responsible for omitting it, UTF-8 BOM are discouraged. +// You need to preallocate ParsedJson with a capacity of len (e.g., +// pj.allocate_capacity(len)). Return SUCCESS (an integer = 0) in case of a +// success. You can also check validity by calling pj.is_valid(). The same +// ParsedJson can be reused for other documents. +// +// The content should be a valid JSON document encoded as UTF-8. If there is a +// UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are +// discouraged. // // This is a convenience function which calls json_parse. WARN_UNUSED inline ParsedJson build_parsed_json(const padded_string &s) { return build_parsed_json(s.data(), s.length(), false); } -} +} // namespace simdjson #endif diff --git a/include/simdjson/numberparsing.h b/include/simdjson/numberparsing.h index eae709eb..4878a362 100644 --- a/include/simdjson/numberparsing.h +++ b/include/simdjson/numberparsing.h @@ -7,16 +7,17 @@ #include "simdjson/portability.h" #ifdef JSON_TEST_NUMBERS // for unit testing -void foundInvalidNumber(const uint8_t *buf); -void foundInteger(int64_t result, const uint8_t *buf); -void foundFloat(double result, const uint8_t *buf); +void found_invalid_number(const uint8_t *buf); +void found_integer(int64_t result, const uint8_t *buf); +void found_float(double result, const uint8_t *buf); #endif namespace simdjson { -// Allowable floating-point values range from std::numeric_limits::lowest() -// to std::numeric_limits::max(), so from -// -1.7976e308 all the way to 1.7975e308 in binary64. The lowest non-zero -// normal values is std::numeric_limits::min() or about 2.225074e-308. +// Allowable floating-point values range from +// std::numeric_limits::lowest() to std::numeric_limits::max(), +// so from -1.7976e308 all the way to 1.7975e308 in binary64. The lowest +// non-zero normal values is std::numeric_limits::min() or +// about 2.225074e-308. static const double power_of_ten[] = { 1e-308, 1e-307, 1e-306, 1e-305, 1e-304, 1e-303, 1e-302, 1e-301, 1e-300, 1e-299, 1e-298, 1e-297, 1e-296, 1e-295, 1e-294, 1e-293, 1e-292, 1e-291, @@ -113,7 +114,7 @@ really_inline bool is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) { return structural_or_whitespace_or_exponent_or_decimal_negated[c]; } -}// simdjson +} // namespace simdjson #ifndef SIMDJSON_DISABLE_SWAR_NUMBER_PARSING #define SWAR_NUMBER_PARSING #endif @@ -126,7 +127,7 @@ namespace simdjson { // http://0x80.pl/articles/swar-digits-validate.html static inline bool is_made_of_eight_digits_fast(const char *chars) { uint64_t val; - // this can read up to 7 bytes beyond the buffer size, but we require + // this can read up to 7 bytes beyond the buffer size, but we require // SIMDJSON_PADDING of padding static_assert(7 <= SIMDJSON_PADDING); memcpy(&val, chars, 8); @@ -138,7 +139,7 @@ static inline bool is_made_of_eight_digits_fast(const char *chars) { (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) == 0x3333333333333333); } -} +} // namespace simdjson #ifdef IS_X86_64 TARGET_WESTMERE namespace simdjson { @@ -150,7 +151,8 @@ static inline uint32_t parse_eight_digits_unrolled(const char *chars) { const __m128i mul_1_100 = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1); const __m128i mul_1_10000 = _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1); - const __m128i input = _mm_sub_epi8(_mm_loadu_si128(reinterpret_cast(chars)), ascii0); + const __m128i input = _mm_sub_epi8( + _mm_loadu_si128(reinterpret_cast(chars)), ascii0); const __m128i t1 = _mm_maddubs_epi16(input, mul_1_10); const __m128i t2 = _mm_madd_epi16(t1, mul_1_100); const __m128i t3 = _mm_packus_epi32(t2, t2); @@ -158,7 +160,7 @@ static inline uint32_t parse_eight_digits_unrolled(const char *chars) { return _mm_cvtsi128_si32( t4); // only captures the sum of the first 8 digits, drop the rest } -} +} // namespace simdjson UNTARGET_REGION #endif @@ -167,15 +169,14 @@ namespace simdjson { // we don't have SSE, so let us use a scalar function // credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/ static inline uint32_t parse_eight_digits_unrolled(const char *chars) { - uint64_t val; - memcpy(&val, chars, sizeof(uint64_t)); - val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8; - val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16; - return (val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32; + uint64_t val; + memcpy(&val, chars, sizeof(uint64_t)); + val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8; + val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16; + return (val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32; } #endif - #endif // @@ -183,10 +184,9 @@ static inline uint32_t parse_eight_digits_unrolled(const char *chars) { // It is only even going to be used when negative_exponent is tiny. static double subnormal_power10(double base, int negative_exponent) { // this is probably not going to be fast - return base * 1e-308 * pow(10, negative_exponent + 308); + return base * 1e-308 * pow(10, negative_exponent + 308); } - // called by parse_number when we know that the output is a float, // but where there might be some integer overflow. The trick here is to // parse using floats from the start. @@ -197,10 +197,8 @@ static double subnormal_power10(double base, int negative_exponent) { // // Note: a redesign could avoid this function entirely. // -static never_inline bool -parse_float(const uint8_t *const buf, - ParsedJson &pj, const uint32_t offset, - bool found_minus) { +static never_inline bool parse_float(const uint8_t *const buf, ParsedJson &pj, + const uint32_t offset, bool found_minus) { const char *p = reinterpret_cast(buf + offset); bool negative = false; if (found_minus) { @@ -223,100 +221,102 @@ parse_float(const uint8_t *const buf, } if ('.' == *p) { ++p; - int fractionalweight = 308; - if(is_integer(*p)) { + int fractional_weight = 308; + if (is_integer(*p)) { unsigned char digit = *p - '0'; ++p; - fractionalweight --; - i = i + digit * (fractionalweight >= 0 ? power_of_ten[fractionalweight] : 0); + fractional_weight--; + i = i + digit * (fractional_weight >= 0 ? power_of_ten[fractional_weight] + : 0); } else { #ifdef JSON_TEST_NUMBERS // for unit testing - foundInvalidNumber(buf + offset); + found_invalid_number(buf + offset); #endif return false; } while (is_integer(*p)) { unsigned char digit = *p - '0'; ++p; - fractionalweight --; - i = i + digit * (fractionalweight >= 0 ? power_of_ten[fractionalweight] : 0); + fractional_weight--; + i = i + digit * (fractional_weight >= 0 ? power_of_ten[fractional_weight] + : 0); } } if (('e' == *p) || ('E' == *p)) { ++p; - bool negexp = false; + bool neg_exp = false; if ('-' == *p) { - negexp = true; + neg_exp = true; ++p; } else if ('+' == *p) { ++p; } if (!is_integer(*p)) { #ifdef JSON_TEST_NUMBERS // for unit testing - foundInvalidNumber(buf + offset); + found_invalid_number(buf + offset); #endif return false; } unsigned char digit = *p - '0'; - int64_t expnumber = digit; // exponential part + int64_t exp_number = digit; // exponential part p++; if (is_integer(*p)) { digit = *p - '0'; - expnumber = 10 * expnumber + digit; + exp_number = 10 * exp_number + digit; ++p; } if (is_integer(*p)) { digit = *p - '0'; - expnumber = 10 * expnumber + digit; + exp_number = 10 * exp_number + digit; ++p; } if (is_integer(*p)) { digit = *p - '0'; - expnumber = 10 * expnumber + digit; + exp_number = 10 * exp_number + digit; ++p; } while (is_integer(*p)) { - if(expnumber > 0x100000000) {// we need to check for overflows + if (exp_number > 0x100000000) { // we need to check for overflows // we refuse to parse this #ifdef JSON_TEST_NUMBERS // for unit testing - foundInvalidNumber(buf + offset); + found_invalid_number(buf + offset); #endif return false; } digit = *p - '0'; - expnumber = 10 * expnumber + digit; - ++p; + exp_number = 10 * exp_number + digit; + ++p; } - if (unlikely(expnumber > 308)) { + if (unlikely(exp_number > 308)) { // this path is unlikely - if(negexp) { - // We either have zero or a subnormal. + if (neg_exp) { + // We either have zero or a subnormal. // We expect this to be uncommon so we go through a slow path. - i = subnormal_power10(i, - expnumber); + i = subnormal_power10(i, -exp_number); } else { // We know for sure that we have a number that is too large, // we refuse to parse this #ifdef JSON_TEST_NUMBERS // for unit testing - foundInvalidNumber(buf + offset); + found_invalid_number(buf + offset); #endif return false; } } else { - int exponent = (negexp ? -expnumber : expnumber); - // we have that expnumber is [0,308] so that - // exponent is [-308,308] so that + int exponent = (neg_exp ? -exp_number : exp_number); + // we have that exp_number is [0,308] so that + // exponent is [-308,308] so that // 308 + exponent is in [0, 2 * 308] i *= power_of_ten[308 + exponent]; - } + } } - if(is_not_structural_or_whitespace(*p)) { + if (is_not_structural_or_whitespace(*p)) { return false; } double d = negative ? -i : i; pj.write_tape_double(d); #ifdef JSON_TEST_NUMBERS // for unit testing - foundFloat(d, buf + offset); + found_float(d, buf + offset); #endif return is_structural_or_whitespace(*p); } @@ -354,13 +354,13 @@ static never_inline bool parse_large_integer(const uint8_t *const buf, digit = *p - '0'; if (mul_overflow(i, 10, &i)) { #ifdef JSON_TEST_NUMBERS // for unit testing - foundInvalidNumber(buf + offset); + found_invalid_number(buf + offset); #endif return false; // overflow } if (add_overflow(i, digit, &i)) { #ifdef JSON_TEST_NUMBERS // for unit testing - foundInvalidNumber(buf + offset); + found_invalid_number(buf + offset); #endif return false; // overflow } @@ -371,7 +371,7 @@ static never_inline bool parse_large_integer(const uint8_t *const buf, if (i > 0x8000000000000000) { // overflows! #ifdef JSON_TEST_NUMBERS // for unit testing - foundInvalidNumber(buf + offset); + found_invalid_number(buf + offset); #endif return false; // overflow } @@ -379,15 +379,16 @@ static never_inline bool parse_large_integer(const uint8_t *const buf, if (i >= 0x8000000000000000) { // overflows! #ifdef JSON_TEST_NUMBERS // for unit testing - foundInvalidNumber(buf + offset); + found_invalid_number(buf + offset); #endif return false; // overflow } } - int64_t signed_answer = negative ? -static_cast(i) : static_cast(i); + int64_t signed_answer = + negative ? -static_cast(i) : static_cast(i); pj.write_tape_s64(signed_answer); #ifdef JSON_TEST_NUMBERS // for unit testing - foundInteger(signed_answer, buf + offset); + found_integer(signed_answer, buf + offset); #endif return is_structural_or_whitespace(*p); } @@ -396,18 +397,18 @@ static never_inline bool parse_large_integer(const uint8_t *const buf, // define JSON_TEST_NUMBERS for unit testing // // It is assumed that the number is followed by a structural ({,},],[) character -// or a white space character. If that is not the case (e.g., when the JSON document -// is made of a single number), then it is necessary to copy the content and append -// a space before calling this function. +// or a white space character. If that is not the case (e.g., when the JSON +// document is made of a single number), then it is necessary to copy the +// content and append a space before calling this function. // // Our objective is accurate parsing (ULP of 0 or 1) at high speed. -static really_inline bool parse_number(const uint8_t *const buf, - ParsedJson &pj, +static really_inline bool parse_number(const uint8_t *const buf, ParsedJson &pj, const uint32_t offset, bool found_minus) { -#ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes useful to skip parsing - pj.write_tape_s64(0); // always write zero - return true; // always succeeds +#ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes + // useful to skip parsing + pj.write_tape_s64(0); // always write zero + return true; // always succeeds #else const char *p = reinterpret_cast(buf + offset); bool negative = false; @@ -415,28 +416,28 @@ static really_inline bool parse_number(const uint8_t *const buf, ++p; negative = true; if (!is_integer(*p)) { // a negative sign must be followed by an integer -#ifdef JSON_TEST_NUMBERS // for unit testing - foundInvalidNumber(buf + offset); +#ifdef JSON_TEST_NUMBERS // for unit testing + found_invalid_number(buf + offset); #endif return false; } } - const char *const startdigits = p; + const char *const start_digits = p; - uint64_t i; // an unsigned int avoids signed overflows (which are bad) + uint64_t i; // an unsigned int avoids signed overflows (which are bad) if (*p == '0') { // 0 cannot be followed by an integer ++p; if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) { #ifdef JSON_TEST_NUMBERS // for unit testing - foundInvalidNumber(buf + offset); + found_invalid_number(buf + offset); #endif return false; } i = 0; } else { if (!(is_integer(*p))) { // must start with an integer -#ifdef JSON_TEST_NUMBERS // for unit testing - foundInvalidNumber(buf + offset); +#ifdef JSON_TEST_NUMBERS // for unit testing + found_invalid_number(buf + offset); #endif return false; } @@ -447,7 +448,8 @@ static really_inline bool parse_number(const uint8_t *const buf, // we rarely see large integer parts like 123456789 while (is_integer(*p)) { digit = *p - '0'; - // a multiplication by 10 is cheaper than an arbitrary integer multiplication + // a multiplication by 10 is cheaper than an arbitrary integer + // multiplication i = 10 * i + digit; // might overflow, we will handle the overflow later ++p; } @@ -461,17 +463,18 @@ static really_inline bool parse_number(const uint8_t *const buf, // z that fits in 53 bits, then we will be able to convert back the // the integer into a float in a lossless manner. ++p; - const char *const firstafterperiod = p; - if(is_integer(*p)) { + const char *const first_after_period = p; + if (is_integer(*p)) { unsigned char digit = *p - '0'; ++p; - i = i * 10 + digit; // might overflow + multiplication by 10 is likely cheaper than arbitrary mult. + i = i * 10 + digit; // might overflow + multiplication by 10 is likely + // cheaper than arbitrary mult. // we will handle the overflow later } else { #ifdef JSON_TEST_NUMBERS // for unit testing - foundInvalidNumber(buf + offset); + found_invalid_number(buf + offset); #endif - return false; + return false; } #ifdef SWAR_NUMBER_PARSING // this helps if we have lots of decimals! @@ -484,102 +487,100 @@ static really_inline bool parse_number(const uint8_t *const buf, while (is_integer(*p)) { unsigned char digit = *p - '0'; ++p; - i = i * 10 + digit; // in rare cases, this will overflow, but that's ok because we have parse_highprecision_float later. + i = i * 10 + digit; // in rare cases, this will overflow, but that's ok + // because we have parse_highprecision_float later. } - exponent = firstafterperiod - p; + exponent = first_after_period - p; } - int digitcount = p - startdigits - 1; // used later to guard against overflows - int64_t expnumber = 0; // exponential part + int digit_count = + p - start_digits - 1; // used later to guard against overflows + int64_t exp_number = 0; // exponential part if (('e' == *p) || ('E' == *p)) { is_float = true; ++p; - bool negexp = false; + bool neg_exp = false; if ('-' == *p) { - negexp = true; + neg_exp = true; ++p; } else if ('+' == *p) { ++p; } if (!is_integer(*p)) { #ifdef JSON_TEST_NUMBERS // for unit testing - foundInvalidNumber(buf + offset); + found_invalid_number(buf + offset); #endif return false; } unsigned char digit = *p - '0'; - expnumber = digit; + exp_number = digit; p++; if (is_integer(*p)) { digit = *p - '0'; - expnumber = 10 * expnumber + digit; + exp_number = 10 * exp_number + digit; ++p; } if (is_integer(*p)) { digit = *p - '0'; - expnumber = 10 * expnumber + digit; + exp_number = 10 * exp_number + digit; ++p; } while (is_integer(*p)) { - if(expnumber > 0x100000000) {// we need to check for overflows -// we refuse to parse this + if (exp_number > 0x100000000) { // we need to check for overflows + // we refuse to parse this #ifdef JSON_TEST_NUMBERS // for unit testing - foundInvalidNumber(buf + offset); + found_invalid_number(buf + offset); #endif return false; } digit = *p - '0'; - expnumber = 10 * expnumber + digit; - ++p; + exp_number = 10 * exp_number + digit; + ++p; } - exponent += (negexp ? -expnumber : expnumber); + exponent += (neg_exp ? -exp_number : exp_number); } if (is_float) { - uint64_t powerindex = 308 + exponent; - if (unlikely((digitcount >= 19))) { // this is uncommon - // It is possible that the integer had an overflow. + uint64_t power_index = 308 + exponent; + if (unlikely((digit_count >= 19))) { // this is uncommon + // It is possible that the integer had an overflow. // We have to handle the case where we have 0.0000somenumber. - const char * start = startdigits; - while((*start == '0') || (*start == '.')) { - start++; + const char *start = start_digits; + while ((*start == '0') || (*start == '.')) { + start++; } - digitcount -= (start - startdigits); - if(digitcount >= 19) { + digit_count -= (start - start_digits); + if (digit_count >= 19) { // Ok, chances are good that we had an overflow! // this is almost never going to get called!!! // we start anew, going slowly!!! - return parse_float(buf, pj, offset, - found_minus); - - } + return parse_float(buf, pj, offset, found_minus); + } } - if (unlikely((powerindex > 2 * 308))) { // this is uncommon!!! + if (unlikely((power_index > 2 * 308))) { // this is uncommon!!! // this is almost never going to get called!!! // we start anew, going slowly!!! - return parse_float(buf, pj, offset, - found_minus); + return parse_float(buf, pj, offset, found_minus); } - double factor = power_of_ten[powerindex]; + double factor = power_of_ten[power_index]; factor = negative ? -factor : factor; double d = i * factor; pj.write_tape_double(d); #ifdef JSON_TEST_NUMBERS // for unit testing - foundFloat(d, buf + offset); + found_float(d, buf + offset); #endif } else { - if (unlikely(digitcount >= 18)) { // this is uncommon!!! + if (unlikely(digit_count >= 18)) { // this is uncommon!!! // there is a good chance that we had an overflow, so we need // need to recover: we parse the whole thing again. - return parse_large_integer(buf, pj, offset, - found_minus); + return parse_large_integer(buf, pj, offset, found_minus); } - i = negative ? 0-i : i; + i = negative ? 0 - i : i; pj.write_tape_s64(i); #ifdef JSON_TEST_NUMBERS // for unit testing - foundInteger(i, buf + offset); + found_integer(i, buf + offset); #endif } - return is_structural_or_whitespace(*p); + return is_structural_or_whitespace(*p); #endif // SIMDJSON_SKIPNUMBERPARSING } -}//simdjson +} // simdjson #endif diff --git a/include/simdjson/padded_string.h b/include/simdjson/padded_string.h index 6c7e7e63..e30aca14 100644 --- a/include/simdjson/padded_string.h +++ b/include/simdjson/padded_string.h @@ -1,8 +1,8 @@ #ifndef SIMDJSON_PADDING_STRING_H #define SIMDJSON_PADDING_STRING_H #include "simdjson/portability.h" -#include #include +#include namespace simdjson { // low-level function to allocate memory with padding so we can read passed the @@ -65,6 +65,6 @@ private: size_t viable_size; char *data_ptr; }; -} +} // namespace simdjson #endif diff --git a/include/simdjson/parsedjson.h b/include/simdjson/parsedjson.h index a3ee5cf6..a4b9427a 100644 --- a/include/simdjson/parsedjson.h +++ b/include/simdjson/parsedjson.h @@ -1,48 +1,49 @@ #ifndef SIMDJSON_PARSEDJSON_H #define SIMDJSON_PARSEDJSON_H +#include "simdjson/common_defs.h" +#include "simdjson/jsonformatutils.h" +#include "simdjson/portability.h" +#include "simdjson/simdjson.h" #include #include #include #include #include -#include "simdjson/simdjson.h" -#include "simdjson/common_defs.h" -#include "simdjson/jsonformatutils.h" -#include "simdjson/portability.h" -#define JSONVALUEMASK 0xFFFFFFFFFFFFFF +#define JSON_VALUE_MASK 0xFFFFFFFFFFFFFF -#define DEFAULTMAXDEPTH 1024// a JSON document with a depth exceeding 1024 is probably de facto invalid +#define DEFAULT_MAX_DEPTH \ + 1024 // a JSON document with a depth exceeding 1024 is probably de facto + // invalid namespace simdjson { /************ * The JSON is parsed to a tape, see the accompanying tape.md file * for documentation. ***********/ -struct ParsedJson { +class ParsedJson { public: - - // create a ParsedJson container with zero capacity, call allocateCapacity to + // create a ParsedJson container with zero capacity, call allocate_capacity to // allocate memory ParsedJson(); ~ParsedJson(); - ParsedJson(ParsedJson && p); + ParsedJson(ParsedJson &&p); // if needed, allocate memory so that the object is able to process JSON - // documents having up to len bytes and maxdepth "depth" + // documents having up to len bytes and max_depth "depth" WARN_UNUSED - bool allocateCapacity(size_t len, size_t maxdepth = DEFAULTMAXDEPTH); + bool allocate_capacity(size_t len, size_t max_depth = DEFAULT_MAX_DEPTH); // returns true if the document parsed was valid - bool isValid() const; + bool is_valid() const; - // return an error code corresponding to the last parsing attempt, see simdjson.h - // will return simdjson::UNITIALIZED if no parsing was attempted - int getErrorCode() const; + // return an error code corresponding to the last parsing attempt, see + // simdjson.h will return simdjson::UNITIALIZED if no parsing was attempted + int get_error_code() const; - // return the string equivalent of "getErrorCode" - std::string getErrorMsg() const; + // return the string equivalent of "get_error_code" + std::string get_error_message() const; // deallocate memory and set capacity to zero, called automatically by the // destructor @@ -55,11 +56,10 @@ public: // return false if the tape is likely wrong (e.g., you did not parse a valid // JSON). WARN_UNUSED - bool printjson(std::ostream &os); + bool print_json(std::ostream &os); WARN_UNUSED bool dump_raw_tape(std::ostream &os); - // all nodes are stored on the tape using a 64-bit word. // // strings, double and ints are stored as @@ -76,43 +76,42 @@ public: // this should be considered a private function really_inline void write_tape(uint64_t val, uint8_t c) { - tape[current_loc++] = val | ((static_cast(c)) << 56); + tape[current_loc++] = val | ((static_cast(c)) << 56); } really_inline void write_tape_s64(int64_t i) { - write_tape(0, 'l'); - tape[current_loc++] = *(reinterpret_cast(&i)); + write_tape(0, 'l'); + tape[current_loc++] = *(reinterpret_cast(&i)); } really_inline void write_tape_double(double d) { write_tape(0, 'd'); static_assert(sizeof(d) == sizeof(tape[current_loc]), "mismatch size"); - memcpy(& tape[current_loc++], &d, sizeof(double)); - //tape[current_loc++] = *((uint64_t *)&d); + memcpy(&tape[current_loc++], &d, sizeof(double)); + // tape[current_loc++] = *((uint64_t *)&d); } really_inline uint32_t get_current_loc() { return current_loc; } - really_inline void annotate_previousloc(uint32_t saved_loc, uint64_t val) { - tape[saved_loc] |= val; + really_inline void annotate_previous_loc(uint32_t saved_loc, uint64_t val) { + tape[saved_loc] |= val; } - struct InvalidJSON : public std::exception { - const char * what () const throw () { - return "JSON document is invalid"; - } + class InvalidJSON : public std::exception { + const char *what() const throw() { return "JSON document is invalid"; } }; - struct iterator { + class Iterator { // might throw InvalidJSON if ParsedJson is invalid - explicit iterator(ParsedJson &pj_); - ~iterator(); + public: + explicit Iterator(ParsedJson &pj_); + ~Iterator(); - iterator(const iterator &o); + Iterator(const Iterator &o); - iterator(iterator &&o); + Iterator(Iterator &&o); - inline bool isOk() const; + inline bool is_ok() const; // useful for debuging purposes inline size_t get_tape_location() const; @@ -120,11 +119,12 @@ public: // useful for debuging purposes inline size_t get_tape_length() const; - // returns the current depth (start at 1 with 0 reserved for the fictitious root node) + // returns the current depth (start at 1 with 0 reserved for the fictitious + // root node) inline size_t get_depth() const; - // A scope is a series of nodes at the same depth, typically it is either an object ({) or an array ([). - // The root node has type 'r'. + // A scope is a series of nodes at the same depth, typically it is either an + // object ({) or an array ([). The root node has type 'r'. inline uint8_t get_scope_type() const; // move forward in document order @@ -132,81 +132,65 @@ public: // retrieve the character code of what we're looking at: // [{"sltfn are the possibilities - inline uint8_t get_type() const { - return current_type; // short functions should be inlined! + inline uint8_t get_type() const { + return current_type; // short functions should be inlined! } // get the int64_t value at this node; valid only if we're at "l" - inline int64_t get_integer() const { - if(location + 1 >= tape_length) { - return 0;// default value in case of error - } - return static_cast(pj.tape[location + 1]); + inline int64_t get_integer() const { + if (location + 1 >= tape_length) { + return 0; // default value in case of error + } + return static_cast(pj.tape[location + 1]); } // get the string value at this node (NULL ended); valid only if we're at " - // note that tabs, and line endings are escaped in the returned value (see print_with_escapes) - // return value is valid UTF-8 - // It may contain NULL chars within the string: get_string_length determines the true - // string length. - inline const char * get_string() const { - return reinterpret_cast(pj.string_buf + (current_val & JSONVALUEMASK) + sizeof(uint32_t)) ; + // note that tabs, and line endings are escaped in the returned value (see + // print_with_escapes) return value is valid UTF-8 It may contain NULL chars + // within the string: get_string_length determines the true string length. + inline const char *get_string() const { + return reinterpret_cast( + pj.string_buf + (current_val & JSON_VALUE_MASK) + sizeof(uint32_t)); } // return the length of the string in bytes inline uint32_t get_string_length() const { uint32_t answer; - memcpy(&answer, reinterpret_cast(pj.string_buf + (current_val & JSONVALUEMASK)), sizeof(uint32_t)); + memcpy(&answer, + reinterpret_cast(pj.string_buf + + (current_val & JSON_VALUE_MASK)), + sizeof(uint32_t)); return answer; } // get the double value at this node; valid only if // we're at "d" - inline double get_double() const { - if(location + 1 >= tape_length) { - return NAN;// default value in case of error + inline double get_double() const { + if (location + 1 >= tape_length) { + return NAN; // default value in case of error } double answer; - memcpy(&answer, & pj.tape[location + 1], sizeof(answer)); + memcpy(&answer, &pj.tape[location + 1], sizeof(answer)); return answer; } + inline bool is_object_or_array() const { return is_object() || is_array(); } - inline bool is_object_or_array() const { - return is_object() || is_array(); - } + inline bool is_object() const { return get_type() == '{'; } - inline bool is_object() const { - return get_type() == '{'; - } + inline bool is_array() const { return get_type() == '['; } - inline bool is_array() const { - return get_type() == '['; - } + inline bool is_string() const { return get_type() == '"'; } - inline bool is_string() const { - return get_type() == '"'; - } + inline bool is_integer() const { return get_type() == 'l'; } - inline bool is_integer() const { - return get_type() == 'l'; - } + inline bool is_double() const { return get_type() == 'd'; } - inline bool is_double() const { - return get_type() == 'd'; - } + inline bool is_true() const { return get_type() == 't'; } - inline bool is_true() const { - return get_type() == 't'; - } + inline bool is_false() const { return get_type() == 'f'; } - inline bool is_false() const { - return get_type() == 'f'; - } - - inline bool is_null() const { - return get_type() == 'n'; - } + inline bool is_null() const { return get_type() == 'n'; } static bool is_object_or_array(uint8_t type) { return ((type == '[') || (type == '{')); @@ -219,16 +203,17 @@ public: // We seek the key using C's strcmp so if your JSON strings contain // NULL chars, this would trigger a false positive: if you expect that // to be the case, take extra precautions. - inline bool move_to_key(const char * key); + inline bool move_to_key(const char *key); // when at {, go one level deep, looking for a given key // if successful, we are left pointing at the value, // if not, we are still pointing at the object ({) // (in case of repeated keys, this only finds the first one). // The string we search for can contain NULL values. - inline bool move_to_key(const char * key, uint32_t length); - - // when at a key location within an object, this moves to the accompanying value (located next to it). - // this is equivalent but much faster than calling "next()". + inline bool move_to_key(const char *key, uint32_t length); + + // when at a key location within an object, this moves to the accompanying + // value (located next to it). this is equivalent but much faster than + // calling "next()". inline void move_to_value(); // when at [, go one level deep, and advance to the given index. @@ -239,54 +224,55 @@ public: // Moves the iterator to the value correspoding to the json pointer. // Always search from the root of the document. // if successful, we are left pointing at the value, - // if not, we are still pointing the same value we were pointing before the call. - // The json pointer follows the rfc6901 standard's syntax: https://tools.ietf.org/html/rfc6901 - // However, the standard says "If a referenced member name is not unique in an object, - // the member that is referenced is undefined, and evaluation fails". - // Here we just return the first corresponding value. - // The length parameter is the length of the jsonpointer string ('pointer'). - bool move_to(const char * pointer, uint32_t length); + // if not, we are still pointing the same value we were pointing before the + // call. The json pointer follows the rfc6901 standard's syntax: + // https://tools.ietf.org/html/rfc6901 However, the standard says "If a + // referenced member name is not unique in an object, the member that is + // referenced is undefined, and evaluation fails". Here we just return the + // first corresponding value. The length parameter is the length of the + // jsonpointer string ('pointer'). + bool move_to(const char *pointer, uint32_t length); // Moves the iterator to the value correspoding to the json pointer. // Always search from the root of the document. // if successful, we are left pointing at the value, - // if not, we are still pointing the same value we were pointing before the call. - // The json pointer implementation follows the rfc6901 standard's syntax: https://tools.ietf.org/html/rfc6901 - // However, the standard says "If a referenced member name is not unique in an object, - // the member that is referenced is undefined, and evaluation fails". - // Here we just return the first corresponding value. - inline bool move_to(const std::string & pointer) { + // if not, we are still pointing the same value we were pointing before the + // call. The json pointer implementation follows the rfc6901 standard's + // syntax: https://tools.ietf.org/html/rfc6901 However, the standard says + // "If a referenced member name is not unique in an object, the member that + // is referenced is undefined, and evaluation fails". Here we just return + // the first corresponding value. + inline bool move_to(const std::string &pointer) { return move_to(pointer.c_str(), pointer.length()); } - - private: + // Almost the same as move_to(), except it searchs from the current + // position. The pointer's syntax is identical, though that case is not + // handled by the rfc6901 standard. The '/' is still required at the + // beginning. However, contrary to move_to(), the URI Fragment Identifier + // Representation is not supported here. Also, in case of failure, we are + // left pointing at the closest value it could reach. For these reasons it + // is private. It exists because it is used by move_to(). + bool relative_move_to(const char *pointer, uint32_t length); - // Almost the same as move_to(), except it searchs from the current position. - // The pointer's syntax is identical, though that case is not handled by the rfc6901 standard. - // The '/' is still required at the beginning. - // However, contrary to move_to(), the URI Fragment Identifier Representation is not supported here. - // Also, in case of failure, we are left pointing at the closest value it could reach. - // For these reasons it is private. It exists because it is used by move_to(). - bool relative_move_to(const char * pointer, uint32_t length); public: - // throughout return true if we can do the navigation, false // otherwise // Withing a given scope (series of nodes at the same depth within either an // array or an object), we move forward. - // Thus, given [true, null, {"a":1}, [1,2]], we would visit true, null, { and [. - // At the object ({) or at the array ([), you can issue a "down" to visit their content. - // valid if we're not at the end of a scope (returns true). + // Thus, given [true, null, {"a":1}, [1,2]], we would visit true, null, { + // and [. At the object ({) or at the array ([), you can issue a "down" to + // visit their content. valid if we're not at the end of a scope (returns + // true). inline bool next(); // Withing a given scope (series of nodes at the same depth within either an // array or an object), we move backward. - // Thus, given [true, null, {"a":1}, [1,2]], we would visit ], }, null, true when starting at the end - // of the scope. - // At the object ({) or at the array ([), you can issue a "down" to visit their content. + // Thus, given [true, null, {"a":1}, [1,2]], we would visit ], }, null, true + // when starting at the end of the scope. At the object ({) or at the array + // ([), you can issue a "down" to visit their content. inline bool prev(); // Moves back to either the containing array or object (type { or [) from @@ -294,11 +280,9 @@ public: // Valid unless we are at the first level of the document inline bool up(); - - // Valid if we're at a [ or { and it starts a non-empty scope; moves us to start of - // that deeper scope if it not empty. - // Thus, given [true, null, {"a":1}, [1,2]], if we are at the { node, we would move to the - // "a" node. + // Valid if we're at a [ or { and it starts a non-empty scope; moves us to + // start of that deeper scope if it not empty. Thus, given [true, null, + // {"a":1}, [1,2]], if we are at the { node, we would move to the "a" node. inline bool down(); // move us to the start of our current scope, @@ -306,7 +290,8 @@ public: inline void to_start_scope(); inline void rewind() { - while(up()); + while (up()) + ; } // void to_end_scope(); // move us to @@ -314,26 +299,28 @@ public: // print the thing we're currently pointing at bool print(std::ostream &os, bool escape_strings = true) const; - typedef struct {size_t start_of_scope; uint8_t scope_type;} scopeindex_t; + typedef struct { + size_t start_of_scope; + uint8_t scope_type; + } scopeindex_t; -private: - - iterator& operator=(const iterator& other) = delete ; + private: + Iterator &operator=(const Iterator &other) = delete; ParsedJson &pj; size_t depth; - size_t location; // our current location on a tape + size_t location; // our current location on a tape size_t tape_length; uint8_t current_type; uint64_t current_val; - scopeindex_t *depthindex; + scopeindex_t *depth_index; }; - size_t bytecapacity{0}; // indicates how many bits are meant to be supported + size_t byte_capacity{0}; // indicates how many bits are meant to be supported - size_t depthcapacity{0}; // how deep we can go - size_t tapecapacity{0}; - size_t stringcapacity{0}; + size_t depth_capacity{0}; // how deep we can go + size_t tape_capacity{0}; + size_t string_capacity{0}; uint32_t current_loc{0}; uint32_t n_structural_indexes{0}; @@ -343,24 +330,23 @@ private: uint32_t *containing_scope_offset; #ifdef SIMDJSON_USE_COMPUTED_GOTO void **ret_address; -#else +#else char *ret_address; #endif - uint8_t *string_buf; // should be at least bytecapacity + uint8_t *string_buf; // should be at least byte_capacity uint8_t *current_string_buf_loc; - bool isvalid{false}; - int errorcode{simdjson::UNITIALIZED}; + bool valid{false}; + int error_code{simdjson::UNITIALIZED}; -private : - - // we don't want the default constructor to be called - ParsedJson(const ParsedJson & p) = delete; // we don't want the default constructor to be called - // we don't want the assignment to be called - ParsedJson & operator=(const ParsedJson&o) = delete; +private: + // we don't want the default constructor to be called + ParsedJson(const ParsedJson &p) = + delete; // we don't want the default constructor to be called + // we don't want the assignment to be called + ParsedJson &operator=(const ParsedJson &o) = delete; }; - // dump bits low to high inline void dumpbits_always(uint64_t v, const std::string &msg) { for (uint32_t i = 0; i < 64; i++) { @@ -377,188 +363,180 @@ inline void dumpbits32_always(uint32_t v, const std::string &msg) { } WARN_UNUSED -bool ParsedJson::iterator::isOk() const { - return location < tape_length; -} +bool ParsedJson::Iterator::is_ok() const { return location < tape_length; } // useful for debuging purposes -size_t ParsedJson::iterator::get_tape_location() const { - return location; -} +size_t ParsedJson::Iterator::get_tape_location() const { return location; } // useful for debuging purposes -size_t ParsedJson::iterator::get_tape_length() const { - return tape_length; +size_t ParsedJson::Iterator::get_tape_length() const { return tape_length; } + +// returns the current depth (start at 1 with 0 reserved for the fictitious root +// node) +size_t ParsedJson::Iterator::get_depth() const { return depth; } + +// A scope is a series of nodes at the same depth, typically it is either an +// object ({) or an array ([). The root node has type 'r'. +uint8_t ParsedJson::Iterator::get_scope_type() const { + return depth_index[depth].scope_type; } -// returns the current depth (start at 1 with 0 reserved for the fictitious root node) -size_t ParsedJson::iterator::get_depth() const { - return depth; -} - -// A scope is a series of nodes at the same depth, typically it is either an object ({) or an array ([). -// The root node has type 'r'. -uint8_t ParsedJson::iterator::get_scope_type() const { - return depthindex[depth].scope_type; -} - -bool ParsedJson::iterator::move_forward() { - if(location + 1 >= tape_length) { - return false; // we are at the end! - } - - if ((current_type == '[') || (current_type == '{')){ - // We are entering a new scope - depth++; - depthindex[depth].start_of_scope = location; - depthindex[depth].scope_type = current_type; - } else if ((current_type == ']') || (current_type == '}')) { - // Leaving a scope. - depth--; - } else if ((current_type == 'd') || (current_type == 'l')) { - // d and l types use 2 locations on the tape, not just one. - location += 1; - } +bool ParsedJson::Iterator::move_forward() { + if (location + 1 >= tape_length) { + return false; // we are at the end! + } + if ((current_type == '[') || (current_type == '{')) { + // We are entering a new scope + depth++; + depth_index[depth].start_of_scope = location; + depth_index[depth].scope_type = current_type; + } else if ((current_type == ']') || (current_type == '}')) { + // Leaving a scope. + depth--; + } else if ((current_type == 'd') || (current_type == 'l')) { + // d and l types use 2 locations on the tape, not just one. location += 1; - current_val = pj.tape[location]; - current_type = (current_val >> 56); - return true; + } + + location += 1; + current_val = pj.tape[location]; + current_type = (current_val >> 56); + return true; } -void ParsedJson::iterator::move_to_value() { - // assume that we are on a key, so move by 1. - location += 1; - current_val = pj.tape[location]; - current_type = (current_val >> 56); +void ParsedJson::Iterator::move_to_value() { + // assume that we are on a key, so move by 1. + location += 1; + current_val = pj.tape[location]; + current_type = (current_val >> 56); } - -bool ParsedJson::iterator::move_to_key(const char * key) { - if(down()) { - do { - assert(is_string()); - bool rightkey = (strcmp(get_string(),key)==0);// null chars would fool this - move_to_value(); - if(rightkey) { - return true; - } - } while(next()); - assert(up());// not found - } - return false; -} - -bool ParsedJson::iterator::move_to_key(const char * key, uint32_t length) { - if(down()) { - do { - assert(is_string()); - bool rightkey = ((get_string_length() == length) && (memcmp(get_string(),key,length)==0)); - move_to_value(); - if(rightkey) { - return true; - } - } while(next()); - assert(up());// not found - } - return false; -} - -bool ParsedJson::iterator::move_to_index(uint32_t index) { - assert(is_array()); - if (down()) { - uint32_t i = 0; - for (; i < index; i++) { - if (!next()) { - break; - } - } - if (i == index) { +bool ParsedJson::Iterator::move_to_key(const char *key) { + if (down()) { + do { + assert(is_string()); + bool right_key = + (strcmp(get_string(), key) == 0); // null chars would fool this + move_to_value(); + if (right_key) { return true; } - assert(up()); - } - return false; + } while (next()); + assert(up()); // not found + } + return false; } -bool ParsedJson::iterator::prev() { - if(location - 1 < depthindex[depth].start_of_scope) { - return false; - } - location -= 1; - current_val = pj.tape[location]; - current_type = (current_val >> 56); - if ((current_type == ']') || (current_type == '}')){ - // we need to jump - size_t new_location = ( current_val & JSONVALUEMASK); - if(new_location < depthindex[depth].start_of_scope) { - return false; // shoud never happen +bool ParsedJson::Iterator::move_to_key(const char *key, uint32_t length) { + if (down()) { + do { + assert(is_string()); + bool right_key = ((get_string_length() == length) && + (memcmp(get_string(), key, length) == 0)); + move_to_value(); + if (right_key) { + return true; } - location = new_location; - current_val = pj.tape[location]; - current_type = (current_val >> 56); - } - return true; + } while (next()); + assert(up()); // not found + } + return false; } - - bool ParsedJson::iterator::up() { - if(depth == 1) { - return false; // don't allow moving back to root - } - to_start_scope(); - // next we just move to the previous value - depth--; - location -= 1; - current_val = pj.tape[location]; - current_type = (current_val >> 56); - return true; -} - - - bool ParsedJson::iterator::down() { - if(location + 1 >= tape_length) { - return false; - } - if ((current_type == '[') || (current_type == '{')) { - size_t npos = (current_val & JSONVALUEMASK); - if(npos == location + 2) { - return false; // we have an empty scope +bool ParsedJson::Iterator::move_to_index(uint32_t index) { + assert(is_array()); + if (down()) { + uint32_t i = 0; + for (; i < index; i++) { + if (!next()) { + break; } - depth++; - location = location + 1; - depthindex[depth].start_of_scope = location; - depthindex[depth].scope_type = current_type; - current_val = pj.tape[location]; - current_type = (current_val >> 56); + } + if (i == index) { return true; } - return false; + assert(up()); + } + return false; } -void ParsedJson::iterator::to_start_scope() { - location = depthindex[depth].start_of_scope; +bool ParsedJson::Iterator::prev() { + if (location - 1 < depth_index[depth].start_of_scope) { + return false; + } + location -= 1; + current_val = pj.tape[location]; + current_type = (current_val >> 56); + if ((current_type == ']') || (current_type == '}')) { + // we need to jump + size_t new_location = (current_val & JSON_VALUE_MASK); + if (new_location < depth_index[depth].start_of_scope) { + return false; // shoud never happen + } + location = new_location; current_val = pj.tape[location]; current_type = (current_val >> 56); + } + return true; } -bool ParsedJson::iterator::next() { - size_t npos; - if ((current_type == '[') || (current_type == '{')){ - // we need to jump - npos = ( current_val & JSONVALUEMASK); - } else { - npos = location + ((current_type == 'd' || current_type == 'l') ? 2 : 1); +bool ParsedJson::Iterator::up() { + if (depth == 1) { + return false; // don't allow moving back to root + } + to_start_scope(); + // next we just move to the previous value + depth--; + location -= 1; + current_val = pj.tape[location]; + current_type = (current_val >> 56); + return true; +} + +bool ParsedJson::Iterator::down() { + if (location + 1 >= tape_length) { + return false; + } + if ((current_type == '[') || (current_type == '{')) { + size_t npos = (current_val & JSON_VALUE_MASK); + if (npos == location + 2) { + return false; // we have an empty scope } - uint64_t nextval = pj.tape[npos]; - uint8_t nexttype = (nextval >> 56); - if((nexttype == ']') || (nexttype == '}')) { - return false; // we reached the end of the scope - } - location = npos; - current_val = nextval; - current_type = nexttype; + depth++; + location = location + 1; + depth_index[depth].start_of_scope = location; + depth_index[depth].scope_type = current_type; + current_val = pj.tape[location]; + current_type = (current_val >> 56); return true; + } + return false; } + +void ParsedJson::Iterator::to_start_scope() { + location = depth_index[depth].start_of_scope; + current_val = pj.tape[location]; + current_type = (current_val >> 56); } + +bool ParsedJson::Iterator::next() { + size_t npos; + if ((current_type == '[') || (current_type == '{')) { + // we need to jump + npos = (current_val & JSON_VALUE_MASK); + } else { + npos = location + ((current_type == 'd' || current_type == 'l') ? 2 : 1); + } + uint64_t next_val = pj.tape[npos]; + uint8_t next_type = (next_val >> 56); + if ((next_type == ']') || (next_type == '}')) { + return false; // we reached the end of the scope + } + location = npos; + current_val = next_val; + current_type = next_type; + return true; +} +} // namespace simdjson #endif diff --git a/include/simdjson/portability.h b/include/simdjson/portability.h index 37d1532a..f2f22bc9 100644 --- a/include/simdjson/portability.h +++ b/include/simdjson/portability.h @@ -2,33 +2,32 @@ #define SIMDJSON_PORTABILITY_H #if defined(__x86_64__) || defined(_M_AMD64) -# define IS_X86_64 1 +#define IS_X86_64 1 #endif #if defined(__aarch64__) || defined(_M_ARM64) -# define IS_ARM64 1 +#define IS_ARM64 1 #endif // this is almost standard? #define STRINGIFY(a) #a - - // we are going to use runtime dispatch #ifdef IS_X86_64 #ifdef __clang__ // clang does not have GCC push pop -// warning: clang attribute push can't be used within a namespace in clang up til 8.0 so TARGET_REGION and -// UNTARGET_REGION must be *outside* of a namespace. -#define TARGET_REGION(T) _Pragma(STRINGIFY(clang attribute push(__attribute__((target(T))), apply_to=function))) +// warning: clang attribute push can't be used within a namespace in clang up +// til 8.0 so TARGET_REGION and UNTARGET_REGION must be *outside* of a +// namespace. +#define TARGET_REGION(T) \ + _Pragma(STRINGIFY( \ + clang attribute push(__attribute__((target(T))), apply_to = function))) #define UNTARGET_REGION _Pragma("clang attribute pop") #elif defined(__GNUC__) // GCC is easier -#define TARGET_REGION(T) \ -_Pragma("GCC push_options") \ -_Pragma(STRINGIFY(GCC target(T))) -#define UNTARGET_REGION \ -_Pragma("GCC pop_options") -#else +#define TARGET_REGION(T) \ + _Pragma("GCC push_options") _Pragma(STRINGIFY(GCC target(T))) +#define UNTARGET_REGION _Pragma("GCC pop_options") +#else #define TARGET_REGION(T) #define UNTARGET_REGION #endif // clang then gcc @@ -39,49 +38,50 @@ _Pragma("GCC pop_options") #endif // x86 - - #ifdef _MSC_VER -# include +#include #else -# if IS_X86_64 -# include -# elif IS_ARM64 -# include -# endif +#if IS_X86_64 +#include +#elif IS_ARM64 +#include +#endif #endif #ifdef _MSC_VER /* Microsoft C/C++-compatible compiler */ -#include #include +#include namespace simdjson { -static inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) { - return _addcarry_u64(0, value1, value2, reinterpret_cast(result)); +static inline bool add_overflow(uint64_t value1, uint64_t value2, + uint64_t *result) { + return _addcarry_u64(0, value1, value2, + reinterpret_cast(result)); } -# pragma intrinsic(_umul128) -static inline bool mul_overflow(uint64_t value1, uint64_t value2, uint64_t *result) { - uint64_t high; - *result = _umul128(value1, value2, &high); - return high; +#pragma intrinsic(_umul128) +static inline bool mul_overflow(uint64_t value1, uint64_t value2, + uint64_t *result) { + uint64_t high; + *result = _umul128(value1, value2, &high); + return high; } -static inline int trailingzeroes(uint64_t input_num) { - return static_cast(_tzcnt_u64(input_num)); +static inline int trailing_zeroes(uint64_t input_num) { + return static_cast(_tzcnt_u64(input_num)); } -static inline int leadingzeroes(uint64_t input_num) { - return static_cast(_lzcnt_u64(input_num)); +static inline int leading_zeroes(uint64_t input_num) { + return static_cast(_lzcnt_u64(input_num)); } static inline int hamming(uint64_t input_num) { -#ifdef _WIN64 // highly recommended!!! - return (int)__popcnt64(input_num); -#else // if we must support 32-bit Windows - return (int)(__popcnt((uint32_t)input_num) + - __popcnt((uint32_t)(input_num >> 32))); +#ifdef _WIN64 // highly recommended!!! + return (int)__popcnt64(input_num); +#else // if we must support 32-bit Windows + return (int)(__popcnt((uint32_t)input_num) + + __popcnt((uint32_t)(input_num >> 32))); #endif } } // namespace simdjson @@ -90,78 +90,83 @@ static inline int hamming(uint64_t input_num) { #include namespace simdjson { -static inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) { - return __builtin_uaddll_overflow(value1, value2, (unsigned long long*)result); +static inline bool add_overflow(uint64_t value1, uint64_t value2, + uint64_t *result) { + return __builtin_uaddll_overflow(value1, value2, + (unsigned long long *)result); } -static inline bool mul_overflow(uint64_t value1, uint64_t value2, uint64_t *result) { - return __builtin_umulll_overflow(value1, value2, (unsigned long long *)result); +static inline bool mul_overflow(uint64_t value1, uint64_t value2, + uint64_t *result) { + return __builtin_umulll_overflow(value1, value2, + (unsigned long long *)result); } /* result might be undefined when input_num is zero */ -static inline int trailingzeroes(uint64_t input_num) { -#ifdef __BMI__// tzcnt is BMI1 - return _tzcnt_u64(input_num); +static inline int trailing_zeroes(uint64_t input_num) { +#ifdef __BMI__ // tzcnt is BMI1 + return _tzcnt_u64(input_num); #else - return __builtin_ctzll(input_num); + return __builtin_ctzll(input_num); #endif } /* result might be undefined when input_num is zero */ -static inline int leadingzeroes(uint64_t input_num) { +static inline int leading_zeroes(uint64_t input_num) { #ifdef __BMI2__ - return _lzcnt_u64(input_num); + return _lzcnt_u64(input_num); #else - return __builtin_clzll(input_num); + return __builtin_clzll(input_num); #endif } /* result might be undefined when input_num is zero */ static inline int hamming(uint64_t input_num) { #ifdef __POPCOUNT__ - return _popcnt64(input_num); + return _popcnt64(input_num); #else - return __builtin_popcountll(input_num); + return __builtin_popcountll(input_num); #endif } -} +} // namespace simdjson #endif // _MSC_VER - namespace simdjson { // portable version of posix_memalign static inline void *aligned_malloc(size_t alignment, size_t size) { - void *p; + void *p; #ifdef _MSC_VER - p = _aligned_malloc(size, alignment); + p = _aligned_malloc(size, alignment); #elif defined(__MINGW32__) || defined(__MINGW64__) - p = __mingw_aligned_malloc(size, alignment); + p = __mingw_aligned_malloc(size, alignment); #else - // somehow, if this is used before including "x86intrin.h", it creates an - // implicit defined warning. - if (posix_memalign(&p, alignment, size) != 0) { return nullptr; } + // somehow, if this is used before including "x86intrin.h", it creates an + // implicit defined warning. + if (posix_memalign(&p, alignment, size) != 0) { + return nullptr; + } #endif - return p; + return p; } static inline char *aligned_malloc_char(size_t alignment, size_t size) { - return (char*)aligned_malloc(alignment, size); + return (char *)aligned_malloc(alignment, size); } -static inline void aligned_free(void *memblock) { - if(memblock == nullptr) { return; } +static inline void aligned_free(void *mem_block) { + if (mem_block == nullptr) { + return; + } #ifdef _MSC_VER - _aligned_free(memblock); + _aligned_free(mem_block); #elif defined(__MINGW32__) || defined(__MINGW64__) - __mingw_aligned_free(memblock); + __mingw_aligned_free(mem_block); #else - free(memblock); + free(mem_block); #endif } - - -static inline void aligned_free_char(char *memblock) { - aligned_free((void*)memblock); -} +static inline void aligned_free_char(char *mem_block) { + aligned_free((void *)mem_block); } +} // namespace simdjson #endif // SIMDJSON_PORTABILITY_H diff --git a/include/simdjson/simdjson.h b/include/simdjson/simdjson.h index ff2aaa3b..ffb0c717 100644 --- a/include/simdjson/simdjson.h +++ b/include/simdjson/simdjson.h @@ -5,38 +5,40 @@ namespace simdjson { // Represents the minimal architecture that would support an implementation -enum class architecture { - westmere, - haswell, - arm64, - none, - // TODO remove 'native' in favor of runtime dispatch? - // the 'native' enum class value should point at a good default on the current machine +enum class Architecture { + WESTMERE, + HASWELL, + ARM64, + NONE, +// TODO remove 'native' in favor of runtime dispatch? +// the 'native' enum class value should point at a good default on the current +// machine #ifdef IS_X86_64 - native = westmere + NATIVE = WESTMERE #elif defined(IS_ARM64) - native = arm64 + NATIVE = ARM64 #endif }; -enum errorValues { +enum ErrorValues { SUCCESS = 0, - CAPACITY, // This ParsedJson can't support a document that big - MEMALLOC, // Error allocating memory, most likely out of memory - TAPE_ERROR, // Something went wrong while writing to the tape (stage 2), this is a generic error + CAPACITY, // This ParsedJson can't support a document that big + MEMALLOC, // Error allocating memory, most likely out of memory + TAPE_ERROR, // Something went wrong while writing to the tape (stage 2), this + // is a generic error DEPTH_ERROR, // Your document exceeds the user-specified depth limitation - STRING_ERROR, // Problem while parsing a string - T_ATOM_ERROR, // Problem while parsing an atom starting with the letter 't' - F_ATOM_ERROR, // Problem while parsing an atom starting with the letter 'f' - N_ATOM_ERROR, // Problem while parsing an atom starting with the letter 'n' - NUMBER_ERROR, // Problem while parsing a number - UTF8_ERROR, // the input is not valid UTF-8 - UNITIALIZED, // unknown error, or uninitialized document - EMPTY, // no structural document found + STRING_ERROR, // Problem while parsing a string + T_ATOM_ERROR, // Problem while parsing an atom starting with the letter 't' + F_ATOM_ERROR, // Problem while parsing an atom starting with the letter 'f' + N_ATOM_ERROR, // Problem while parsing an atom starting with the letter 'n' + NUMBER_ERROR, // Problem while parsing a number + UTF8_ERROR, // the input is not valid UTF-8 + UNITIALIZED, // unknown error, or uninitialized document + EMPTY, // no structural document found UNESCAPED_CHARS, // found unescaped characters in a string. UNCLOSED_STRING, // missing quote at the end UNEXPECTED_ERROR // indicative of a bug in simdjson }; -const std::string& errorMsg(const int); -} +const std::string &error_message(const int); +} // namespace simdjson #endif diff --git a/include/simdjson/simdjson_version.h b/include/simdjson/simdjson_version.h index fd072b30..dc55781e 100644 --- a/include/simdjson/simdjson_version.h +++ b/include/simdjson/simdjson_version.h @@ -1,12 +1,13 @@ -// /include/simdjson/simdjson_version.h automatically generated by release.py, do not change by hand -#ifndef SIMDJSON_INCLUDE_SIMDJSON_VERSION -#define SIMDJSON_INCLUDE_SIMDJSON_VERSION -#define SIMDJSON_VERSION 0.1.2 +// /include/simdjson/simdjson_version.h automatically generated by release.py, +// do not change by hand +#ifndef SIMDJSON_INCLUDE_SIMDJSON_VERSION +#define SIMDJSON_INCLUDE_SIMDJSON_VERSION +#define SIMDJSON_VERSION 0.1.2 namespace simdjson { -enum { - SIMDJSON_VERSION_MAJOR = 0, - SIMDJSON_VERSION_MINOR = 1, - SIMDJSON_VERSION_REVISION = 2 -}; +enum { + SIMDJSON_VERSION_MAJOR = 0, + SIMDJSON_VERSION_MINOR = 1, + SIMDJSON_VERSION_REVISION = 2 +}; } -#endif // SIMDJSON_INCLUDE_SIMDJSON_VERSION +#endif // SIMDJSON_INCLUDE_SIMDJSON_VERSION diff --git a/include/simdjson/simdutf8check_arm64.h b/include/simdjson/simdutf8check_arm64.h index 198fc799..6360b012 100644 --- a/include/simdjson/simdutf8check_arm64.h +++ b/include/simdjson/simdutf8check_arm64.h @@ -4,14 +4,15 @@ #ifndef SIMDJSON_SIMDUTF8CHECK_ARM64_H #define SIMDJSON_SIMDUTF8CHECK_ARM64_H -#if defined(_ARM_NEON) || defined(__aarch64__) || (defined(_MSC_VER) && defined(_M_ARM64)) +#if defined(_ARM_NEON) || defined(__aarch64__) || \ + (defined(_MSC_VER) && defined(_M_ARM64)) -#include +#include +#include #include #include +#include #include -#include -#include /* * legal utf-8 byte sequence @@ -32,47 +33,49 @@ namespace simdjson { // all byte values must be no larger than 0xF4 -static inline void checkSmallerThan0xF4(int8x16_t current_bytes, - int8x16_t *has_error) { +static inline void check_smaller_than_0xF4(int8x16_t current_bytes, + int8x16_t *has_error) { // unsigned, saturates to 0 below max - *has_error = vorrq_s8(*has_error, - vreinterpretq_s8_u8(vqsubq_u8(vreinterpretq_u8_s8(current_bytes), vdupq_n_u8(0xF4)))); + *has_error = vorrq_s8( + *has_error, vreinterpretq_s8_u8(vqsubq_u8( + vreinterpretq_u8_s8(current_bytes), vdupq_n_u8(0xF4)))); } static const int8_t _nibbles[] = { - 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) - 0, 0, 0, 0, // 10xx (continuation) - 2, 2, // 110x - 3, // 1110 - 4, // 1111, next should be 0 (not checked here) + 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) + 0, 0, 0, 0, // 10xx (continuation) + 2, 2, // 110x + 3, // 1110 + 4, // 1111, next should be 0 (not checked here) }; -static inline int8x16_t continuationLengths(int8x16_t high_nibbles) { +static inline int8x16_t continuation_lengths(int8x16_t high_nibbles) { return vqtbl1q_s8(vld1q_s8(_nibbles), vreinterpretq_u8_s8(high_nibbles)); } -static inline int8x16_t carryContinuations(int8x16_t initial_lengths, - int8x16_t previous_carries) { +static inline int8x16_t carry_continuations(int8x16_t initial_lengths, + int8x16_t previous_carries) { - int8x16_t right1 = - vreinterpretq_s8_u8(vqsubq_u8(vreinterpretq_u8_s8(vextq_s8(previous_carries, initial_lengths, 16 - 1)), - vdupq_n_u8(1))); + int8x16_t right1 = vreinterpretq_s8_u8(vqsubq_u8( + vreinterpretq_u8_s8(vextq_s8(previous_carries, initial_lengths, 16 - 1)), + vdupq_n_u8(1))); int8x16_t sum = vaddq_s8(initial_lengths, right1); - int8x16_t right2 = vreinterpretq_s8_u8(vqsubq_u8(vreinterpretq_u8_s8(vextq_s8(previous_carries, sum, 16 - 2)), - vdupq_n_u8(2))); + int8x16_t right2 = vreinterpretq_s8_u8( + vqsubq_u8(vreinterpretq_u8_s8(vextq_s8(previous_carries, sum, 16 - 2)), + vdupq_n_u8(2))); return vaddq_s8(sum, right2); } -static inline void checkContinuations(int8x16_t initial_lengths, int8x16_t carries, - int8x16_t *has_error) { +static inline void check_continuations(int8x16_t initial_lengths, + int8x16_t carries, + int8x16_t *has_error) { // overlap || underlap // carry > length && length > 0 || !(carry > length) && !(length > 0) // (carries > length) == (lengths > 0) - uint8x16_t overunder = - vceqq_u8(vcgtq_s8(carries, initial_lengths), - vcgtq_s8(initial_lengths, vdupq_n_s8(0))); + uint8x16_t overunder = vceqq_u8(vcgtq_s8(carries, initial_lengths), + vcgtq_s8(initial_lengths, vdupq_n_s8(0))); *has_error = vorrq_s8(*has_error, vreinterpretq_s8_u8(overunder)); } @@ -80,9 +83,9 @@ static inline void checkContinuations(int8x16_t initial_lengths, int8x16_t carri // when 0xED is found, next byte must be no larger than 0x9F // when 0xF4 is found, next byte must be no larger than 0x8F // next byte must be continuation, ie sign bit is set, so signed < is ok -static inline void checkFirstContinuationMax(int8x16_t current_bytes, - int8x16_t off1_current_bytes, - int8x16_t *has_error) { +static inline void check_first_continuation_max(int8x16_t current_bytes, + int8x16_t off1_current_bytes, + int8x16_t *has_error) { uint8x16_t maskED = vceqq_s8(off1_current_bytes, vdupq_n_s8(0xED)); uint8x16_t maskF4 = vceqq_s8(off1_current_bytes, vdupq_n_s8(0xF4)); @@ -91,23 +94,24 @@ static inline void checkFirstContinuationMax(int8x16_t current_bytes, uint8x16_t badfollowF4 = vandq_u8(vcgtq_s8(current_bytes, vdupq_n_s8(0x8F)), maskF4); - *has_error = vorrq_s8(*has_error, vreinterpretq_s8_u8(vorrq_u8(badfollowED, badfollowF4))); + *has_error = vorrq_s8( + *has_error, vreinterpretq_s8_u8(vorrq_u8(badfollowED, badfollowF4))); } static const int8_t _initial_mins[] = { - -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, - -128, -128, // 10xx => false - (int8_t) 0xC2, -128, // 110x - (int8_t) 0xE1, // 1110 - (int8_t) 0xF1, + -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, // 10xx => false + (int8_t)0xC2, -128, // 110x + (int8_t)0xE1, // 1110 + (int8_t)0xF1, }; static const int8_t _second_mins[] = { - -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, - -128, -128, // 10xx => false - 127, 127, // 110x => true - (int8_t) 0xA0, // 1110 - (int8_t) 0x90, + -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, // 10xx => false + 127, 127, // 110x => true + (int8_t)0xA0, // 1110 + (int8_t)0x90, }; // map off1_hibits => error condition @@ -116,58 +120,61 @@ static const int8_t _second_mins[] = { // E => < E1 && < A0 // F => < F1 && < 90 // else false && false -static inline void checkOverlong(int8x16_t current_bytes, - int8x16_t off1_current_bytes, int8x16_t hibits, - int8x16_t previous_hibits, int8x16_t *has_error) { +static inline void check_overlong(int8x16_t current_bytes, + int8x16_t off1_current_bytes, + int8x16_t hibits, int8x16_t previous_hibits, + int8x16_t *has_error) { int8x16_t off1_hibits = vextq_s8(previous_hibits, hibits, 16 - 1); - int8x16_t initial_mins = vqtbl1q_s8(vld1q_s8(_initial_mins), vreinterpretq_u8_s8(off1_hibits)); + int8x16_t initial_mins = + vqtbl1q_s8(vld1q_s8(_initial_mins), vreinterpretq_u8_s8(off1_hibits)); uint8x16_t initial_under = vcgtq_s8(initial_mins, off1_current_bytes); - int8x16_t second_mins = vqtbl1q_s8(vld1q_s8(_second_mins), vreinterpretq_u8_s8(off1_hibits)); + int8x16_t second_mins = + vqtbl1q_s8(vld1q_s8(_second_mins), vreinterpretq_u8_s8(off1_hibits)); uint8x16_t second_under = vcgtq_s8(second_mins, current_bytes); - *has_error = - vorrq_s8(*has_error, vreinterpretq_s8_u8(vandq_u8(initial_under, second_under))); + *has_error = vorrq_s8( + *has_error, vreinterpretq_s8_u8(vandq_u8(initial_under, second_under))); } struct processed_utf_bytes { - int8x16_t rawbytes; + int8x16_t raw_bytes; int8x16_t high_nibbles; int8x16_t carried_continuations; }; static inline void count_nibbles(int8x16_t bytes, struct processed_utf_bytes *answer) { - answer->rawbytes = bytes; + answer->raw_bytes = bytes; answer->high_nibbles = - vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(bytes), 4)); + vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(bytes), 4)); } // check whether the current bytes are valid UTF-8 // at the end of the function, previous gets updated static inline struct processed_utf_bytes -checkUTF8Bytes(int8x16_t current_bytes, struct processed_utf_bytes *previous, - int8x16_t *has_error) { +check_utf8_bytes(int8x16_t current_bytes, struct processed_utf_bytes *previous, + int8x16_t *has_error) { struct processed_utf_bytes pb; count_nibbles(current_bytes, &pb); - checkSmallerThan0xF4(current_bytes, has_error); + check_smaller_than_0xF4(current_bytes, has_error); - int8x16_t initial_lengths = continuationLengths(pb.high_nibbles); + int8x16_t initial_lengths = continuation_lengths(pb.high_nibbles); pb.carried_continuations = - carryContinuations(initial_lengths, previous->carried_continuations); + carry_continuations(initial_lengths, previous->carried_continuations); - checkContinuations(initial_lengths, pb.carried_continuations, has_error); + check_continuations(initial_lengths, pb.carried_continuations, has_error); int8x16_t off1_current_bytes = - vextq_s8(previous->rawbytes, pb.rawbytes, 16 - 1); - checkFirstContinuationMax(current_bytes, off1_current_bytes, has_error); + vextq_s8(previous->raw_bytes, pb.raw_bytes, 16 - 1); + check_first_continuation_max(current_bytes, off1_current_bytes, has_error); - checkOverlong(current_bytes, off1_current_bytes, pb.high_nibbles, - previous->high_nibbles, has_error); + check_overlong(current_bytes, off1_current_bytes, pb.high_nibbles, + previous->high_nibbles, has_error); return pb; } -}// simdjson +} // namespace simdjson #endif #endif diff --git a/include/simdjson/simdutf8check_haswell.h b/include/simdjson/simdutf8check_haswell.h index f896a3dc..72dcf938 100644 --- a/include/simdjson/simdutf8check_haswell.h +++ b/include/simdjson/simdutf8check_haswell.h @@ -1,11 +1,10 @@ #ifndef SIMDJSON_SIMDUTF8CHECK_HASWELL_H #define SIMDJSON_SIMDUTF8CHECK_HASWELL_H - +#include "simdjson/portability.h" #include #include #include -#include "simdjson/portability.h" #ifdef IS_X86_64 /* @@ -38,14 +37,14 @@ static inline __m256i push_last_2bytes_of_a_to_b(__m256i a, __m256i b) { } // all byte values must be no larger than 0xF4 -static inline void avxcheckSmallerThan0xF4(__m256i current_bytes, - __m256i *has_error) { +static inline void avx_check_smaller_than_0xF4(__m256i current_bytes, + __m256i *has_error) { // unsigned, saturates to 0 below max *has_error = _mm256_or_si256( *has_error, _mm256_subs_epu8(current_bytes, _mm256_set1_epi8(0xF4))); } -static inline __m256i avxcontinuationLengths(__m256i high_nibbles) { +static inline __m256i avx_continuation_lengths(__m256i high_nibbles) { return _mm256_shuffle_epi8( _mm256_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) 0, 0, 0, 0, // 10xx (continuation) @@ -61,8 +60,8 @@ static inline __m256i avxcontinuationLengths(__m256i high_nibbles) { high_nibbles); } -static inline __m256i avxcarryContinuations(__m256i initial_lengths, - __m256i previous_carries) { +static inline __m256i avx_carry_continuations(__m256i initial_lengths, + __m256i previous_carries) { __m256i right1 = _mm256_subs_epu8( push_last_byte_of_a_to_b(previous_carries, initial_lengths), @@ -74,8 +73,9 @@ static inline __m256i avxcarryContinuations(__m256i initial_lengths, return _mm256_add_epi8(sum, right2); } -static inline void avxcheckContinuations(__m256i initial_lengths, - __m256i carries, __m256i *has_error) { +static inline void avx_check_continuations(__m256i initial_lengths, + __m256i carries, + __m256i *has_error) { // overlap || underlap // carry > length && length > 0 || !(carry > length) && !(length > 0) @@ -90,9 +90,9 @@ static inline void avxcheckContinuations(__m256i initial_lengths, // when 0xED is found, next byte must be no larger than 0x9F // when 0xF4 is found, next byte must be no larger than 0x8F // next byte must be continuation, ie sign bit is set, so signed < is ok -static inline void avxcheckFirstContinuationMax(__m256i current_bytes, - __m256i off1_current_bytes, - __m256i *has_error) { +static inline void avx_check_first_continuation_max(__m256i current_bytes, + __m256i off1_current_bytes, + __m256i *has_error) { __m256i maskED = _mm256_cmpeq_epi8(off1_current_bytes, _mm256_set1_epi8(0xED)); __m256i maskF4 = @@ -113,37 +113,37 @@ static inline void avxcheckFirstContinuationMax(__m256i current_bytes, // E => < E1 && < A0 // F => < F1 && < 90 // else false && false -static inline void avxcheckOverlong(__m256i current_bytes, - __m256i off1_current_bytes, __m256i hibits, - __m256i previous_hibits, - __m256i *has_error) { +static inline void avx_check_overlong(__m256i current_bytes, + __m256i off1_current_bytes, + __m256i hibits, __m256i previous_hibits, + __m256i *has_error) { __m256i off1_hibits = push_last_byte_of_a_to_b(previous_hibits, hibits); __m256i initial_mins = _mm256_shuffle_epi8( - _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, - -128, -128, -128, -128, // 10xx => false - 0xC2, -128, // 110x - 0xE1, // 1110 - 0xF1, // 1111 - -128, -128, -128, -128, -128, -128, -128, -128, - -128, -128, -128, -128, // 10xx => false - 0xC2, -128, // 110x - 0xE1, // 1110 - 0xF1), // 1111 + _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, // 10xx => false + 0xC2, -128, // 110x + 0xE1, // 1110 + 0xF1, // 1111 + -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, // 10xx => false + 0xC2, -128, // 110x + 0xE1, // 1110 + 0xF1), // 1111 off1_hibits); __m256i initial_under = _mm256_cmpgt_epi8(initial_mins, off1_current_bytes); __m256i second_mins = _mm256_shuffle_epi8( - _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, - -128, -128, -128, -128, // 10xx => false - 127, 127, // 110x => true - 0xA0, // 1110 - 0x90, // 1111 - -128, -128, -128, -128, -128, -128, -128, -128, - -128, -128, -128, -128, // 10xx => false - 127, 127, // 110x => true - 0xA0, // 1110 - 0x90), // 1111 + _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, // 10xx => false + 127, 127, // 110x => true + 0xA0, // 1110 + 0x90, // 1111 + -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, // 10xx => false + 127, 127, // 110x => true + 0xA0, // 1110 + 0x90), // 1111 off1_hibits); __m256i second_under = _mm256_cmpgt_epi8(second_mins, current_bytes); *has_error = _mm256_or_si256(*has_error, @@ -151,14 +151,14 @@ static inline void avxcheckOverlong(__m256i current_bytes, } struct avx_processed_utf_bytes { - __m256i rawbytes; + __m256i raw_bytes; __m256i high_nibbles; __m256i carried_continuations; }; static inline void avx_count_nibbles(__m256i bytes, struct avx_processed_utf_bytes *answer) { - answer->rawbytes = bytes; + answer->raw_bytes = bytes; answer->high_nibbles = _mm256_and_si256(_mm256_srli_epi16(bytes, 4), _mm256_set1_epi8(0x0F)); } @@ -166,33 +166,33 @@ static inline void avx_count_nibbles(__m256i bytes, // check whether the current bytes are valid UTF-8 // at the end of the function, previous gets updated static inline struct avx_processed_utf_bytes -avxcheckUTF8Bytes(__m256i current_bytes, - struct avx_processed_utf_bytes *previous, - __m256i *has_error) { - struct avx_processed_utf_bytes pb{}; +avx_check_utf8_bytes(__m256i current_bytes, + struct avx_processed_utf_bytes *previous, + __m256i *has_error) { + struct avx_processed_utf_bytes pb {}; avx_count_nibbles(current_bytes, &pb); - avxcheckSmallerThan0xF4(current_bytes, has_error); + avx_check_smaller_than_0xF4(current_bytes, has_error); - __m256i initial_lengths = avxcontinuationLengths(pb.high_nibbles); + __m256i initial_lengths = avx_continuation_lengths(pb.high_nibbles); pb.carried_continuations = - avxcarryContinuations(initial_lengths, previous->carried_continuations); + avx_carry_continuations(initial_lengths, previous->carried_continuations); - avxcheckContinuations(initial_lengths, pb.carried_continuations, has_error); + avx_check_continuations(initial_lengths, pb.carried_continuations, has_error); __m256i off1_current_bytes = - push_last_byte_of_a_to_b(previous->rawbytes, pb.rawbytes); - avxcheckFirstContinuationMax(current_bytes, off1_current_bytes, has_error); + push_last_byte_of_a_to_b(previous->raw_bytes, pb.raw_bytes); + avx_check_first_continuation_max(current_bytes, off1_current_bytes, + has_error); - avxcheckOverlong(current_bytes, off1_current_bytes, pb.high_nibbles, - previous->high_nibbles, has_error); + avx_check_overlong(current_bytes, off1_current_bytes, pb.high_nibbles, + previous->high_nibbles, has_error); return pb; } -}// simdjson +} // namespace simdjson UNTARGET_REGION // haswell - #endif // IS_X86_64 #endif diff --git a/include/simdjson/simdutf8check_westmere.h b/include/simdjson/simdutf8check_westmere.h index 73beab9d..5cfdd59b 100644 --- a/include/simdjson/simdutf8check_westmere.h +++ b/include/simdjson/simdutf8check_westmere.h @@ -1,10 +1,10 @@ #ifndef SIMDJSON_SIMDUTF8CHECK_WESTMERE_H #define SIMDJSON_SIMDUTF8CHECK_WESTMERE_H +#include "simdjson/portability.h" #include #include #include -#include "simdjson/portability.h" #ifdef IS_X86_64 /* @@ -29,16 +29,16 @@ /********** sse code **********/ TARGET_WESTMERE -namespace simdjson{ +namespace simdjson { // all byte values must be no larger than 0xF4 -static inline void checkSmallerThan0xF4(__m128i current_bytes, - __m128i *has_error) { +static inline void check_smaller_than_0xF4(__m128i current_bytes, + __m128i *has_error) { // unsigned, saturates to 0 below max *has_error = _mm_or_si128(*has_error, _mm_subs_epu8(current_bytes, _mm_set1_epi8(0xF4))); } -static inline __m128i continuationLengths(__m128i high_nibbles) { +static inline __m128i continuation_lengths(__m128i high_nibbles) { return _mm_shuffle_epi8( _mm_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) 0, 0, 0, 0, // 10xx (continuation) @@ -48,8 +48,8 @@ static inline __m128i continuationLengths(__m128i high_nibbles) { high_nibbles); } -static inline __m128i carryContinuations(__m128i initial_lengths, - __m128i previous_carries) { +static inline __m128i carry_continuations(__m128i initial_lengths, + __m128i previous_carries) { __m128i right1 = _mm_subs_epu8(_mm_alignr_epi8(initial_lengths, previous_carries, 16 - 1), @@ -61,8 +61,8 @@ static inline __m128i carryContinuations(__m128i initial_lengths, return _mm_add_epi8(sum, right2); } -static inline void checkContinuations(__m128i initial_lengths, __m128i carries, - __m128i *has_error) { +static inline void check_continuations(__m128i initial_lengths, __m128i carries, + __m128i *has_error) { // overlap || underlap // carry > length && length > 0 || !(carry > length) && !(length > 0) @@ -77,9 +77,9 @@ static inline void checkContinuations(__m128i initial_lengths, __m128i carries, // when 0xED is found, next byte must be no larger than 0x9F // when 0xF4 is found, next byte must be no larger than 0x8F // next byte must be continuation, ie sign bit is set, so signed < is ok -static inline void checkFirstContinuationMax(__m128i current_bytes, - __m128i off1_current_bytes, - __m128i *has_error) { +static inline void check_first_continuation_max(__m128i current_bytes, + __m128i off1_current_bytes, + __m128i *has_error) { __m128i maskED = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xED)); __m128i maskF4 = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xF4)); @@ -97,9 +97,9 @@ static inline void checkFirstContinuationMax(__m128i current_bytes, // E => < E1 && < A0 // F => < F1 && < 90 // else false && false -static inline void checkOverlong(__m128i current_bytes, - __m128i off1_current_bytes, __m128i hibits, - __m128i previous_hibits, __m128i *has_error) { +static inline void check_overlong(__m128i current_bytes, + __m128i off1_current_bytes, __m128i hibits, + __m128i previous_hibits, __m128i *has_error) { __m128i off1_hibits = _mm_alignr_epi8(hibits, previous_hibits, 16 - 1); __m128i initial_mins = _mm_shuffle_epi8( _mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, @@ -124,14 +124,14 @@ static inline void checkOverlong(__m128i current_bytes, } struct processed_utf_bytes { - __m128i rawbytes; + __m128i raw_bytes; __m128i high_nibbles; __m128i carried_continuations; }; static inline void count_nibbles(__m128i bytes, struct processed_utf_bytes *answer) { - answer->rawbytes = bytes; + answer->raw_bytes = bytes; answer->high_nibbles = _mm_and_si128(_mm_srli_epi16(bytes, 4), _mm_set1_epi8(0x0F)); } @@ -139,32 +139,31 @@ static inline void count_nibbles(__m128i bytes, // check whether the current bytes are valid UTF-8 // at the end of the function, previous gets updated static struct processed_utf_bytes -checkUTF8Bytes(__m128i current_bytes, struct processed_utf_bytes *previous, - __m128i *has_error) { +check_utf8_bytes(__m128i current_bytes, struct processed_utf_bytes *previous, + __m128i *has_error) { struct processed_utf_bytes pb; count_nibbles(current_bytes, &pb); - checkSmallerThan0xF4(current_bytes, has_error); + check_smaller_than_0xF4(current_bytes, has_error); - __m128i initial_lengths = continuationLengths(pb.high_nibbles); + __m128i initial_lengths = continuation_lengths(pb.high_nibbles); pb.carried_continuations = - carryContinuations(initial_lengths, previous->carried_continuations); + carry_continuations(initial_lengths, previous->carried_continuations); - checkContinuations(initial_lengths, pb.carried_continuations, has_error); + check_continuations(initial_lengths, pb.carried_continuations, has_error); __m128i off1_current_bytes = - _mm_alignr_epi8(pb.rawbytes, previous->rawbytes, 16 - 1); - checkFirstContinuationMax(current_bytes, off1_current_bytes, has_error); + _mm_alignr_epi8(pb.raw_bytes, previous->raw_bytes, 16 - 1); + check_first_continuation_max(current_bytes, off1_current_bytes, has_error); - checkOverlong(current_bytes, off1_current_bytes, pb.high_nibbles, - previous->high_nibbles, has_error); + check_overlong(current_bytes, off1_current_bytes, pb.high_nibbles, + previous->high_nibbles, has_error); return pb; } -}//simdjson +} // namespace simdjson UNTARGET_REGION // westmere - #endif // IS_X86_64 #endif diff --git a/include/simdjson/stage1_find_marks.h b/include/simdjson/stage1_find_marks.h index 158f3729..0eefb878 100644 --- a/include/simdjson/stage1_find_marks.h +++ b/include/simdjson/stage1_find_marks.h @@ -1,67 +1,60 @@ #ifndef SIMDJSON_STAGE1_FIND_MARKS_H #define SIMDJSON_STAGE1_FIND_MARKS_H -#include #include "simdjson/common_defs.h" -#include "simdjson/simdjson.h" #include "simdjson/parsedjson.h" #include "simdjson/portability.h" +#include "simdjson/simdjson.h" +#include namespace simdjson { -template -struct simd_input; +template struct simd_input; -template -uint64_t compute_quote_mask(uint64_t quote_bits); +template uint64_t compute_quote_mask(uint64_t quote_bits); namespace { - // for when clmul is unavailable - [[maybe_unused]] uint64_t portable_compute_quote_mask(uint64_t quote_bits) { - uint64_t quote_mask = quote_bits ^ (quote_bits << 1); - quote_mask = quote_mask ^ (quote_mask << 2); - quote_mask = quote_mask ^ (quote_mask << 4); - quote_mask = quote_mask ^ (quote_mask << 8); - quote_mask = quote_mask ^ (quote_mask << 16); - quote_mask = quote_mask ^ (quote_mask << 32); - return quote_mask; - } +// for when clmul is unavailable +[[maybe_unused]] uint64_t portable_compute_quote_mask(uint64_t quote_bits) { + uint64_t quote_mask = quote_bits ^ (quote_bits << 1); + quote_mask = quote_mask ^ (quote_mask << 2); + quote_mask = quote_mask ^ (quote_mask << 4); + quote_mask = quote_mask ^ (quote_mask << 8); + quote_mask = quote_mask ^ (quote_mask << 16); + quote_mask = quote_mask ^ (quote_mask << 32); + return quote_mask; } +} // namespace // Holds the state required to perform check_utf8(). -template -struct utf8_checking_state; +template struct utf8_checking_state; - -template -void check_utf8(simd_input in, utf8_checking_state& state); +template +void check_utf8(simd_input in, utf8_checking_state &state); // Checks if the utf8 validation has found any error. -template -errorValues check_utf8_errors(utf8_checking_state& state); +template +ErrorValues check_utf8_errors(utf8_checking_state &state); -// a straightforward comparison of a mask against input. -template +// a straightforward comparison of a mask against input. +template uint64_t cmp_mask_against_input(simd_input in, uint8_t m); - -template -simd_input fill_input(const uint8_t * ptr); +template simd_input fill_input(const uint8_t *ptr); - -// find all values less than or equal than the content of maxval (using unsigned arithmetic) -template +// find all values less than or equal than the content of maxval (using unsigned +// arithmetic) +template uint64_t unsigned_lteq_against_input(simd_input in, uint8_t m); +template +really_inline uint64_t find_odd_backslash_sequences( + simd_input in, uint64_t &prev_iter_ends_odd_backslash); -template really_inline -uint64_t find_odd_backslash_sequences(simd_input in, uint64_t &prev_iter_ends_odd_backslash); - - -template really_inline -uint64_t find_quote_mask_and_bits(simd_input in, uint64_t odd_ends, - uint64_t &prev_iter_inside_quote, uint64_t "e_bits, uint64_t &error_mask); - +template +really_inline uint64_t find_quote_mask_and_bits( + simd_input in, uint64_t odd_ends, uint64_t &prev_iter_inside_quote, + uint64_t "e_bits, uint64_t &error_mask); // do a 'shufti' to detect structural JSON characters // they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c @@ -70,9 +63,8 @@ uint64_t find_quote_mask_and_bits(simd_input in, uint64_t odd_ends, // we are also interested in the four whitespace characters // space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d // these go into the next 2 buckets of the comparison (8/16) -template -void find_whitespace_and_structurals(simd_input in, - uint64_t &whitespace, +template +void find_whitespace_and_structurals(simd_input in, uint64_t &whitespace, uint64_t &structurals); // return a updated structural bit vector with quoted contents cleared out and @@ -86,7 +78,7 @@ really_inline uint64_t finalize_structurals( uint64_t quote_bits, uint64_t &prev_iter_ends_pseudo_pred) { // mask off anything inside quotes structurals &= ~quote_mask; - // add the real quote bits back into our bitmask as well, so we can + // add the real quote bits back into our bit_mask as well, so we can // quickly traverse the strings we've spent all this trouble gathering structurals |= quote_bits; // Now, establish "pseudo-structural characters". These are non-whitespace @@ -114,12 +106,14 @@ really_inline uint64_t finalize_structurals( return structurals; } -template -int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &pj); +template +int find_structural_bits(const uint8_t *buf, size_t len, + simdjson::ParsedJson &pj); -template -int find_structural_bits(const char *buf, size_t len, simdjson::ParsedJson &pj) { - return find_structural_bits((const uint8_t*)buf, len, pj); +template +int find_structural_bits(const char *buf, size_t len, + simdjson::ParsedJson &pj) { + return find_structural_bits((const uint8_t *)buf, len, pj); } } // namespace simdjson diff --git a/include/simdjson/stage1_find_marks_arm64.h b/include/simdjson/stage1_find_marks_arm64.h index 69cb069c..5840ad30 100644 --- a/include/simdjson/stage1_find_marks_arm64.h +++ b/include/simdjson/stage1_find_marks_arm64.h @@ -1,23 +1,24 @@ #ifndef SIMDJSON_STAGE1_FIND_MARKS_ARM64_H #define SIMDJSON_STAGE1_FIND_MARKS_ARM64_H -#include "simdjson/stage1_find_marks.h" -#include "simdjson/stage1_find_marks_macros.h" -#include "simdjson/stage1_find_marks_flatten.h" #include "simdjson/simdutf8check_arm64.h" +#include "simdjson/stage1_find_marks.h" +#include "simdjson/stage1_find_marks_flatten.h" +#include "simdjson/stage1_find_marks_macros.h" #ifdef IS_ARM64 namespace simdjson { -template<> struct simd_input { +template <> struct simd_input { uint8x16_t i0; uint8x16_t i1; uint8x16_t i2; uint8x16_t i3; }; -template<> really_inline -simd_input fill_input(const uint8_t * ptr) { - struct simd_input in; +template <> +really_inline simd_input +fill_input(const uint8_t *ptr) { + struct simd_input in; in.i0 = vld1q_u8(ptr + 0); in.i1 = vld1q_u8(ptr + 16); in.i2 = vld1q_u8(ptr + 32); @@ -25,26 +26,24 @@ simd_input fill_input(const uint8_t * return in; } - -really_inline -uint16_t neonmovemask(uint8x16_t input) { - const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, +really_inline uint16_t neon_movemask(uint8x16_t input) { + const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; - uint8x16_t minput = vandq_u8(input, bitmask); + uint8x16_t minput = vandq_u8(input, bit_mask); uint8x16_t tmp = vpaddq_u8(minput, minput); tmp = vpaddq_u8(tmp, tmp); tmp = vpaddq_u8(tmp, tmp); return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0); } -really_inline -uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16_t p3) { - const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, +really_inline uint64_t neon_movemask_bulk(uint8x16_t p0, uint8x16_t p1, + uint8x16_t p2, uint8x16_t p3) { + const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; - uint8x16_t t0 = vandq_u8(p0, bitmask); - uint8x16_t t1 = vandq_u8(p1, bitmask); - uint8x16_t t2 = vandq_u8(p2, bitmask); - uint8x16_t t3 = vandq_u8(p3, bitmask); + uint8x16_t t0 = vandq_u8(p0, bit_mask); + uint8x16_t t1 = vandq_u8(p1, bit_mask); + uint8x16_t t2 = vandq_u8(p2, bit_mask); + uint8x16_t t3 = vandq_u8(p3, bit_mask); uint8x16_t sum0 = vpaddq_u8(t0, t1); uint8x16_t sum1 = vpaddq_u8(t2, t3); sum0 = vpaddq_u8(sum0, sum1); @@ -52,108 +51,122 @@ uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16 return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); } -template<> really_inline -uint64_t compute_quote_mask(uint64_t quote_bits) { +template <> +really_inline uint64_t +compute_quote_mask(uint64_t quote_bits) { #ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension - return vmull_p64( -1ULL, quote_bits); + return vmull_p64(-1ULL, quote_bits); #else return portable_compute_quote_mask(quote_bits); -#endif +#endif } -template<> -struct utf8_checking_state -{ - int8x16_t has_error {}; - processed_utf_bytes previous {}; +template <> struct utf8_checking_state { + int8x16_t has_error{}; + processed_utf_bytes previous{}; }; // Checks that all bytes are ascii -really_inline -bool check_ascii_neon(simd_input in) { +really_inline bool check_ascii_neon(simd_input in) { // checking if the most significant bit is always equal to 0. - uint8x16_t highbit = vdupq_n_u8(0x80); + uint8x16_t high_bit = vdupq_n_u8(0x80); uint8x16_t t0 = vorrq_u8(in.i0, in.i1); uint8x16_t t1 = vorrq_u8(in.i2, in.i3); uint8x16_t t3 = vorrq_u8(t0, t1); - uint8x16_t t4 = vandq_u8(t3, highbit); + uint8x16_t t4 = vandq_u8(t3, high_bit); uint64x2_t v64 = vreinterpretq_u64_u8(t4); uint32x2_t v32 = vqmovn_u64(v64); uint64x1_t result = vreinterpret_u64_u32(v32); return vget_lane_u64(result, 0) == 0; } -template<> really_inline -void check_utf8(simd_input in, - utf8_checking_state& state) { +template <> +really_inline void check_utf8( + simd_input in, + utf8_checking_state &state) { if (check_ascii_neon(in)) { - // All bytes are ascii. Therefore the byte that was just before must be ascii too. - // We only check the byte that was just before simd_input. Nines are arbitrary values. - const int8x16_t verror = (int8x16_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1}; + // All bytes are ascii. Therefore the byte that was just before must be + // ascii too. We only check the byte that was just before simd_input. Nines + // are arbitrary values. + const int8x16_t verror = + (int8x16_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1}; state.has_error = - vorrq_s8(vreinterpretq_s8_u8(vcgtq_s8(state.previous.carried_continuations, - verror)), - state.has_error); + vorrq_s8(vreinterpretq_s8_u8( + vcgtq_s8(state.previous.carried_continuations, verror)), + state.has_error); } else { // it is not ascii so we have to do heavy work - state.previous = checkUTF8Bytes(vreinterpretq_s8_u8(in.i0), &(state.previous), &(state.has_error)); - state.previous = checkUTF8Bytes(vreinterpretq_s8_u8(in.i1), &(state.previous), &(state.has_error)); - state.previous = checkUTF8Bytes(vreinterpretq_s8_u8(in.i2), &(state.previous), &(state.has_error)); - state.previous = checkUTF8Bytes(vreinterpretq_s8_u8(in.i3), &(state.previous), &(state.has_error)); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i0), + &(state.previous), &(state.has_error)); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i1), + &(state.previous), &(state.has_error)); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i2), + &(state.previous), &(state.has_error)); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i3), + &(state.previous), &(state.has_error)); } } -template<> really_inline -errorValues check_utf8_errors(utf8_checking_state& state) { +template <> +really_inline ErrorValues check_utf8_errors( + utf8_checking_state &state) { uint64x2_t v64 = vreinterpretq_u64_s8(state.has_error); uint32x2_t v32 = vqmovn_u64(v64); uint64x1_t result = vreinterpret_u64_u32(v32); - return vget_lane_u64(result, 0) != 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS; + return vget_lane_u64(result, 0) != 0 ? simdjson::UTF8_ERROR + : simdjson::SUCCESS; } -template<> really_inline -uint64_t cmp_mask_against_input(simd_input in, uint8_t m) { - const uint8x16_t mask = vmovq_n_u8(m); - uint8x16_t cmp_res_0 = vceqq_u8(in.i0, mask); - uint8x16_t cmp_res_1 = vceqq_u8(in.i1, mask); - uint8x16_t cmp_res_2 = vceqq_u8(in.i2, mask); - uint8x16_t cmp_res_3 = vceqq_u8(in.i3, mask); - return neonmovemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3); +template <> +really_inline uint64_t cmp_mask_against_input( + simd_input in, uint8_t m) { + const uint8x16_t mask = vmovq_n_u8(m); + uint8x16_t cmp_res_0 = vceqq_u8(in.i0, mask); + uint8x16_t cmp_res_1 = vceqq_u8(in.i1, mask); + uint8x16_t cmp_res_2 = vceqq_u8(in.i2, mask); + uint8x16_t cmp_res_3 = vceqq_u8(in.i3, mask); + return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3); } -template<> really_inline -uint64_t unsigned_lteq_against_input(simd_input in, uint8_t m) { - const uint8x16_t mask = vmovq_n_u8(m); - uint8x16_t cmp_res_0 = vcleq_u8(in.i0, mask); - uint8x16_t cmp_res_1 = vcleq_u8(in.i1, mask); - uint8x16_t cmp_res_2 = vcleq_u8(in.i2, mask); - uint8x16_t cmp_res_3 = vcleq_u8(in.i3, mask); - return neonmovemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3); +template <> +really_inline uint64_t unsigned_lteq_against_input( + simd_input in, uint8_t m) { + const uint8x16_t mask = vmovq_n_u8(m); + uint8x16_t cmp_res_0 = vcleq_u8(in.i0, mask); + uint8x16_t cmp_res_1 = vcleq_u8(in.i1, mask); + uint8x16_t cmp_res_2 = vcleq_u8(in.i2, mask); + uint8x16_t cmp_res_3 = vcleq_u8(in.i3, mask); + return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3); } -template<> really_inline -uint64_t find_odd_backslash_sequences(simd_input in, uint64_t &prev_iter_ends_odd_backslash) { - FIND_ODD_BACKSLASH_SEQUENCES(architecture::arm64, in, prev_iter_ends_odd_backslash); +template <> +really_inline uint64_t find_odd_backslash_sequences( + simd_input in, + uint64_t &prev_iter_ends_odd_backslash) { + FIND_ODD_BACKSLASH_SEQUENCES(Architecture::ARM64, in, + prev_iter_ends_odd_backslash); } -template<> really_inline -uint64_t find_quote_mask_and_bits(simd_input in, uint64_t odd_ends, - uint64_t &prev_iter_inside_quote, uint64_t "e_bits, uint64_t &error_mask) { - FIND_QUOTE_MASK_AND_BITS(architecture::arm64, in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask) +template <> +really_inline uint64_t find_quote_mask_and_bits( + simd_input in, uint64_t odd_ends, + uint64_t &prev_iter_inside_quote, uint64_t "e_bits, + uint64_t &error_mask) { + FIND_QUOTE_MASK_AND_BITS(Architecture::ARM64, in, odd_ends, + prev_iter_inside_quote, quote_bits, error_mask) } -template<> really_inline -void find_whitespace_and_structurals( - simd_input in, - uint64_t &whitespace, - uint64_t &structurals) { - const uint8x16_t low_nibble_mask = (uint8x16_t){ - 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0}; - const uint8x16_t high_nibble_mask = (uint8x16_t){ - 8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0}; - const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7); - const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18); - const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf); +template <> +really_inline void find_whitespace_and_structurals( + simd_input in, uint64_t &whitespace, + uint64_t &structurals) { + const uint8x16_t low_nibble_mask = + (uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0}; + const uint8x16_t high_nibble_mask = + (uint8x16_t){8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0}; + const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7); + const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18); + const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf); uint8x16_t nib_0_lo = vandq_u8(in.i0, low_nib_and_mask); uint8x16_t nib_0_hi = vshrq_n_u8(in.i0, 4); @@ -183,15 +196,15 @@ void find_whitespace_and_structurals( uint8x16_t tmp_1 = vtstq_u8(v_1, structural_shufti_mask); uint8x16_t tmp_2 = vtstq_u8(v_2, structural_shufti_mask); uint8x16_t tmp_3 = vtstq_u8(v_3, structural_shufti_mask); - structurals = neonmovemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3); + structurals = neon_movemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3); uint8x16_t tmp_ws_0 = vtstq_u8(v_0, whitespace_shufti_mask); uint8x16_t tmp_ws_1 = vtstq_u8(v_1, whitespace_shufti_mask); uint8x16_t tmp_ws_2 = vtstq_u8(v_2, whitespace_shufti_mask); uint8x16_t tmp_ws_3 = vtstq_u8(v_3, whitespace_shufti_mask); - whitespace = neonmovemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3); + whitespace = neon_movemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3); } -}// simdjson namespace +} // namespace simdjson #endif // IS_ARM64 #endif // SIMDJSON_STAGE1_FIND_MARKS_ARM64_H \ No newline at end of file diff --git a/include/simdjson/stage1_find_marks_flatten.h b/include/simdjson/stage1_find_marks_flatten.h index cb76f7c0..521b6265 100644 --- a/include/simdjson/stage1_find_marks_flatten.h +++ b/include/simdjson/stage1_find_marks_flatten.h @@ -10,17 +10,17 @@ namespace simdjson { // again our optimized version. really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint32_t idx, uint64_t bits) { - uint32_t * out_ptr = base_ptr + base; + uint32_t *out_ptr = base_ptr + base; idx -= 64; - while(bits != 0) { - out_ptr[0] = idx + trailingzeroes(bits); - bits = bits & (bits - 1); - out_ptr++; + while (bits != 0) { + out_ptr[0] = idx + trailing_zeroes(bits); + bits = bits & (bits - 1); + out_ptr++; } base = (out_ptr - base_ptr); } -#else +#else // flatten out values in 'bits' assuming that they are are to have values of idx // plus their position in the bitvector, and store these indexes at // base_ptr[base] incrementing base as we go @@ -28,65 +28,66 @@ really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, // needs to be large enough to handle this really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint32_t idx, uint64_t bits) { - // In some instances, the next branch is expensive because it is mispredicted. + // In some instances, the next branch is expensive because it is mispredicted. // Unfortunately, in other cases, // it helps tremendously. - if(bits == 0) return; + if (bits == 0) + return; uint32_t cnt = hamming(bits); uint32_t next_base = base + cnt; idx -= 64; base_ptr += base; - { - base_ptr[0] = idx + trailingzeroes(bits); + { + base_ptr[0] = idx + trailing_zeroes(bits); bits = bits & (bits - 1); - base_ptr[1] = idx + trailingzeroes(bits); + base_ptr[1] = idx + trailing_zeroes(bits); bits = bits & (bits - 1); - base_ptr[2] = idx + trailingzeroes(bits); + base_ptr[2] = idx + trailing_zeroes(bits); bits = bits & (bits - 1); - base_ptr[3] = idx + trailingzeroes(bits); + base_ptr[3] = idx + trailing_zeroes(bits); bits = bits & (bits - 1); - base_ptr[4] = idx + trailingzeroes(bits); + base_ptr[4] = idx + trailing_zeroes(bits); bits = bits & (bits - 1); - base_ptr[5] = idx + trailingzeroes(bits); + base_ptr[5] = idx + trailing_zeroes(bits); bits = bits & (bits - 1); - base_ptr[6] = idx + trailingzeroes(bits); + base_ptr[6] = idx + trailing_zeroes(bits); bits = bits & (bits - 1); - base_ptr[7] = idx + trailingzeroes(bits); + base_ptr[7] = idx + trailing_zeroes(bits); bits = bits & (bits - 1); base_ptr += 8; } // We hope that the next branch is easily predicted. if (cnt > 8) { - base_ptr[0] = idx + trailingzeroes(bits); + base_ptr[0] = idx + trailing_zeroes(bits); bits = bits & (bits - 1); - base_ptr[1] = idx + trailingzeroes(bits); + base_ptr[1] = idx + trailing_zeroes(bits); bits = bits & (bits - 1); - base_ptr[2] = idx + trailingzeroes(bits); + base_ptr[2] = idx + trailing_zeroes(bits); bits = bits & (bits - 1); - base_ptr[3] = idx + trailingzeroes(bits); + base_ptr[3] = idx + trailing_zeroes(bits); bits = bits & (bits - 1); - base_ptr[4] = idx + trailingzeroes(bits); + base_ptr[4] = idx + trailing_zeroes(bits); bits = bits & (bits - 1); - base_ptr[5] = idx + trailingzeroes(bits); + base_ptr[5] = idx + trailing_zeroes(bits); bits = bits & (bits - 1); - base_ptr[6] = idx + trailingzeroes(bits); + base_ptr[6] = idx + trailing_zeroes(bits); bits = bits & (bits - 1); - base_ptr[7] = idx + trailingzeroes(bits); + base_ptr[7] = idx + trailing_zeroes(bits); bits = bits & (bits - 1); base_ptr += 8; } if (cnt > 16) { // unluckly: we rarely get here - // since it means having one structural or pseudo-structral element + // since it means having one structural or pseudo-structral element // every 4 characters (possible with inputs like "","","",...). do { - base_ptr[0] = idx + trailingzeroes(bits); + base_ptr[0] = idx + trailing_zeroes(bits); bits = bits & (bits - 1); base_ptr++; - } while(bits != 0); + } while (bits != 0); } base = next_base; } #endif // SIMDJSON_NAIVE_FLATTEN -} +} // namespace simdjson #endif // SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_H \ No newline at end of file diff --git a/include/simdjson/stage1_find_marks_flatten_haswell.h b/include/simdjson/stage1_find_marks_flatten_haswell.h index a54eb712..9eb9efa2 100644 --- a/include/simdjson/stage1_find_marks_flatten_haswell.h +++ b/include/simdjson/stage1_find_marks_flatten_haswell.h @@ -1,7 +1,7 @@ #ifndef SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_HASWELL_H #define SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_HASWELL_H -// This file provides the same function as +// This file provides the same function as // stage1_find_marks_flatten.h, but uses Intel intrinsics. // This should provide better performance on Visual Studio // and other compilers that do a conservative optimization. @@ -20,15 +20,16 @@ namespace haswell { // needs to be large enough to handle this really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint32_t idx, uint64_t bits) { - // In some instances, the next branch is expensive because it is mispredicted. + // In some instances, the next branch is expensive because it is mispredicted. // Unfortunately, in other cases, // it helps tremendously. - if(bits == 0) return; + if (bits == 0) + return; uint32_t cnt = _mm_popcnt_u64(bits); uint32_t next_base = base + cnt; idx -= 64; base_ptr += base; - { + { base_ptr[0] = idx + _tzcnt_u64(bits); bits = _blsr_u64(bits); base_ptr[1] = idx + _tzcnt_u64(bits); @@ -68,19 +69,18 @@ really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, base_ptr += 8; } if (cnt > 16) { // unluckly: we rarely get here - // since it means having one structural or pseudo-structral element + // since it means having one structural or pseudo-structral element // every 4 characters (possible with inputs like "","","",...). do { base_ptr[0] = idx + _tzcnt_u64(bits); bits = _blsr_u64(bits); base_ptr++; - } while(bits != 0); + } while (bits != 0); } base = next_base; } -} // haswell -} // simdjson +} // namespace haswell +} // namespace simdjson UNTARGET_REGION - #endif // SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_H diff --git a/include/simdjson/stage1_find_marks_haswell.h b/include/simdjson/stage1_find_marks_haswell.h index 6cb97916..f8ed26e0 100644 --- a/include/simdjson/stage1_find_marks_haswell.h +++ b/include/simdjson/stage1_find_marks_haswell.h @@ -1,31 +1,32 @@ #ifndef SIMDJSON_STAGE1_FIND_MARKS_HASWELL_H #define SIMDJSON_STAGE1_FIND_MARKS_HASWELL_H -#include "simdjson/stage1_find_marks.h" -#include "simdjson/stage1_find_marks_macros.h" -#include "simdjson/stage1_find_marks_flatten_haswell.h" #include "simdjson/simdutf8check_haswell.h" +#include "simdjson/stage1_find_marks.h" +#include "simdjson/stage1_find_marks_flatten_haswell.h" +#include "simdjson/stage1_find_marks_macros.h" #ifdef IS_X86_64 TARGET_HASWELL namespace simdjson { -template<> -struct simd_input { +template <> struct simd_input { __m256i lo; __m256i hi; }; -template<> really_inline -simd_input fill_input(const uint8_t * ptr) { - struct simd_input in; +template <> +really_inline simd_input +fill_input(const uint8_t *ptr) { + struct simd_input in; in.lo = _mm256_loadu_si256(reinterpret_cast(ptr + 0)); in.hi = _mm256_loadu_si256(reinterpret_cast(ptr + 32)); return in; } -template<> really_inline -uint64_t compute_quote_mask(uint64_t quote_bits) { +template <> +really_inline uint64_t +compute_quote_mask(uint64_t quote_bits) { // There should be no such thing with a processing supporting avx2 // but not clmul. uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128( @@ -33,45 +34,50 @@ uint64_t compute_quote_mask(uint64_t quote_bits) { return quote_mask; } -template<> -struct utf8_checking_state { +template <> struct utf8_checking_state { __m256i has_error; avx_processed_utf_bytes previous; utf8_checking_state() { has_error = _mm256_setzero_si256(); - previous.rawbytes = _mm256_setzero_si256(); + previous.raw_bytes = _mm256_setzero_si256(); previous.high_nibbles = _mm256_setzero_si256(); - previous.carried_continuations =_mm256_setzero_si256(); + previous.carried_continuations = _mm256_setzero_si256(); } }; - -template<> really_inline -void check_utf8(simd_input in, - utf8_checking_state& state) { - __m256i highbit = _mm256_set1_epi8(0x80); - if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), highbit)) == 1) { +template <> +really_inline void check_utf8( + simd_input in, + utf8_checking_state &state) { + __m256i high_bit = _mm256_set1_epi8(0x80); + if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), high_bit)) == 1) { // it is ascii, we just check continuation state.has_error = _mm256_or_si256( - _mm256_cmpgt_epi8( - state.previous.carried_continuations, - _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)), + _mm256_cmpgt_epi8(state.previous.carried_continuations, + _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 1)), state.has_error); } else { // it is not ascii so we have to do heavy work - state.previous = avxcheckUTF8Bytes(in.lo, &(state.previous), &(state.has_error)); - state.previous = avxcheckUTF8Bytes(in.hi, &(state.previous), &(state.has_error)); + state.previous = + avx_check_utf8_bytes(in.lo, &(state.previous), &(state.has_error)); + state.previous = + avx_check_utf8_bytes(in.hi, &(state.previous), &(state.has_error)); } } -template<> really_inline -errorValues check_utf8_errors(utf8_checking_state& state) { - return _mm256_testz_si256(state.has_error, state.has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS; +template <> +really_inline ErrorValues check_utf8_errors( + utf8_checking_state &state) { + return _mm256_testz_si256(state.has_error, state.has_error) == 0 + ? simdjson::UTF8_ERROR + : simdjson::SUCCESS; } -template<> really_inline -uint64_t cmp_mask_against_input(simd_input in, uint8_t m) { +template <> +really_inline uint64_t cmp_mask_against_input( + simd_input in, uint8_t m) { const __m256i mask = _mm256_set1_epi8(m); __m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask); uint64_t res_0 = static_cast(_mm256_movemask_epi8(cmp_res_0)); @@ -80,31 +86,38 @@ uint64_t cmp_mask_against_input(simd_input really_inline -uint64_t unsigned_lteq_against_input(simd_input in, uint8_t m) { +template <> +really_inline uint64_t unsigned_lteq_against_input( + simd_input in, uint8_t m) { const __m256i maxval = _mm256_set1_epi8(m); - __m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval,in.lo),maxval); + __m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.lo), maxval); uint64_t res_0 = static_cast(_mm256_movemask_epi8(cmp_res_0)); - __m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval,in.hi),maxval); + __m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.hi), maxval); uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1); return res_0 | (res_1 << 32); } -template<> really_inline -uint64_t find_odd_backslash_sequences(simd_input in, uint64_t &prev_iter_ends_odd_backslash) { - FIND_ODD_BACKSLASH_SEQUENCES(architecture::haswell, in, prev_iter_ends_odd_backslash); +template <> +really_inline uint64_t find_odd_backslash_sequences( + simd_input in, + uint64_t &prev_iter_ends_odd_backslash) { + FIND_ODD_BACKSLASH_SEQUENCES(Architecture::HASWELL, in, + prev_iter_ends_odd_backslash); } -template<> really_inline -uint64_t find_quote_mask_and_bits(simd_input in, uint64_t odd_ends, - uint64_t &prev_iter_inside_quote, uint64_t "e_bits, uint64_t &error_mask) { - FIND_QUOTE_MASK_AND_BITS(architecture::haswell, in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask) +template <> +really_inline uint64_t find_quote_mask_and_bits( + simd_input in, uint64_t odd_ends, + uint64_t &prev_iter_inside_quote, uint64_t "e_bits, + uint64_t &error_mask) { + FIND_QUOTE_MASK_AND_BITS(Architecture::HASWELL, in, odd_ends, + prev_iter_inside_quote, quote_bits, error_mask) } -template<> really_inline -void find_whitespace_and_structurals(simd_input in, - uint64_t &whitespace, - uint64_t &structurals) { +template <> +really_inline void find_whitespace_and_structurals( + simd_input in, uint64_t &whitespace, + uint64_t &structurals) { #ifdef SIMDJSON_NAIVE_STRUCTURAL // You should never need this naive approach, but it can be useful // for research purposes @@ -112,21 +125,28 @@ void find_whitespace_and_structurals(simd_input(_mm256_movemask_epi8(struct_lo)); + struct_lo = _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_comma)); + struct_hi = _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_comma)); + uint64_t structural_res_0 = + static_cast(_mm256_movemask_epi8(struct_lo)); uint64_t structural_res_1 = _mm256_movemask_epi8(struct_hi); structurals = (structural_res_0 | (structural_res_1 << 32)); @@ -134,34 +154,34 @@ void find_whitespace_and_structurals(simd_input(_mm256_movemask_epi8(space_lo)); uint64_t ws_res_1 = _mm256_movemask_epi8(space_hi); whitespace = (ws_res_0 | (ws_res_1 << 32)); // end of naive approach -#else // SIMDJSON_NAIVE_STRUCTURAL - const __m256i structural_table = _mm256_setr_epi8( - 44, 125, 0, 0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123, - 44, 125, 0, 0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123); +#else // SIMDJSON_NAIVE_STRUCTURAL + const __m256i structural_table = + _mm256_setr_epi8(44, 125, 0, 0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123, + 44, 125, 0, 0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123); const __m256i white_table = _mm256_setr_epi8( - 32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100, - 32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100); + 32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100, + 32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100); const __m256i struct_offset = _mm256_set1_epi8(0xd4); const __m256i struct_mask = _mm256_set1_epi8(32); - __m256i lo_white = _mm256_cmpeq_epi8(in.lo, - _mm256_shuffle_epi8(white_table, in.lo)); - __m256i hi_white = _mm256_cmpeq_epi8(in.hi, - _mm256_shuffle_epi8(white_table, in.hi)); + __m256i lo_white = + _mm256_cmpeq_epi8(in.lo, _mm256_shuffle_epi8(white_table, in.lo)); + __m256i hi_white = + _mm256_cmpeq_epi8(in.hi, _mm256_shuffle_epi8(white_table, in.hi)); uint64_t ws_res_0 = static_cast(_mm256_movemask_epi8(lo_white)); uint64_t ws_res_1 = _mm256_movemask_epi8(hi_white); whitespace = (ws_res_0 | (ws_res_1 << 32)); @@ -173,7 +193,7 @@ void find_whitespace_and_structurals(simd_input(_mm256_movemask_epi8(lo_struct)); uint64_t structural_res_1 = _mm256_movemask_epi8(hi_struct); @@ -184,6 +204,5 @@ void find_whitespace_and_structurals(simd_input in, uint64_t &prev_iter_ends_odd_backslash) -#define FIND_ODD_BACKSLASH_SEQUENCES(T, in, prev_iter_ends_odd_backslash) { \ - const uint64_t even_bits = 0x5555555555555555ULL; \ - const uint64_t odd_bits = ~even_bits; \ - uint64_t bs_bits = cmp_mask_against_input(in, '\\'); \ - uint64_t start_edges = bs_bits & ~(bs_bits << 1); \ - /* flip lowest if we have an odd-length run at the end of the prior */ \ - /* iteration */ \ - uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash; \ - uint64_t even_starts = start_edges & even_start_mask; \ - uint64_t odd_starts = start_edges & ~even_start_mask; \ - uint64_t even_carries = bs_bits + even_starts; \ - \ - uint64_t odd_carries; \ - /* must record the carry-out of our odd-carries out of bit 63; this */ \ - /* indicates whether the sense of any edge going to the next iteration */ \ - /* should be flipped */ \ - bool iter_ends_odd_backslash = \ - add_overflow(bs_bits, odd_starts, &odd_carries); \ - \ - odd_carries |= \ - prev_iter_ends_odd_backslash; /* push in bit zero as a potential end */ \ - /* if we had an odd-numbered run at the */ \ - /* end of the previous iteration */ \ - prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; \ - uint64_t even_carry_ends = even_carries & ~bs_bits; \ - uint64_t odd_carry_ends = odd_carries & ~bs_bits; \ - uint64_t even_start_odd_end = even_carry_ends & odd_bits; \ - uint64_t odd_start_even_end = odd_carry_ends & even_bits; \ - uint64_t odd_ends = even_start_odd_end | odd_start_even_end; \ - return odd_ends; \ -} - +// We need to compile that code for multiple architectures. However, target +// attributes can be used only once by function definition. Huge macro seemed +// better than huge code duplication. uint64_t +// FIND_ODD_BACKSLASH_SEQUENCES(Architecture T, simd_input in, uint64_t +// &prev_iter_ends_odd_backslash) +#define FIND_ODD_BACKSLASH_SEQUENCES(T, in, prev_iter_ends_odd_backslash) \ + { \ + const uint64_t even_bits = 0x5555555555555555ULL; \ + const uint64_t odd_bits = ~even_bits; \ + uint64_t bs_bits = cmp_mask_against_input(in, '\\'); \ + uint64_t start_edges = bs_bits & ~(bs_bits << 1); \ + /* flip lowest if we have an odd-length run at the end of the prior \ + * iteration */ \ + uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash; \ + uint64_t even_starts = start_edges & even_start_mask; \ + uint64_t odd_starts = start_edges & ~even_start_mask; \ + uint64_t even_carries = bs_bits + even_starts; \ + \ + uint64_t odd_carries; \ + /* must record the carry-out of our odd-carries out of bit 63; this \ + * indicates whether the sense of any edge going to the next iteration \ + * should be flipped */ \ + bool iter_ends_odd_backslash = \ + add_overflow(bs_bits, odd_starts, &odd_carries); \ + \ + odd_carries |= prev_iter_ends_odd_backslash; /* push in bit zero as a \ + * potential end if we had an \ + * odd-numbered run at the \ + * end of the previous \ + * iteration */ \ + prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; \ + uint64_t even_carry_ends = even_carries & ~bs_bits; \ + uint64_t odd_carry_ends = odd_carries & ~bs_bits; \ + uint64_t even_start_odd_end = even_carry_ends & odd_bits; \ + uint64_t odd_start_even_end = odd_carry_ends & even_bits; \ + uint64_t odd_ends = even_start_odd_end | odd_start_even_end; \ + return odd_ends; \ + } // return both the quote mask (which is a half-open mask that covers the first // quote @@ -58,34 +61,39 @@ // Note that we don't do any error checking to see if we have backslash // sequences outside quotes; these // backslash sequences (of any length) will be detected elsewhere. -// We need to compile that code for multiple architectures. However, target attributes can be used -// only once by function definition. Huge macro seemed better than huge code duplication. -// uint64_t FIND_QUOTE_MASK_AND_BITS(architecture T, simd_input in, uint64_t odd_ends, -// uint64_t &prev_iter_inside_quote, uint64_t "e_bits, uint64_t &error_mask) -#define FIND_QUOTE_MASK_AND_BITS(T, in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask) { \ - quote_bits = cmp_mask_against_input(in, '"'); \ - quote_bits = quote_bits & ~odd_ends; \ - uint64_t quote_mask = compute_quote_mask(quote_bits); \ - quote_mask ^= prev_iter_inside_quote; \ - /* All Unicode characters may be placed within the */ \ - /* quotation marks, except for the characters that MUST be escaped: */ \ - /* quotation mark, reverse solidus, and the control characters (U+0000 */ \ - /*through U+001F). */ \ - /* https://tools.ietf.org/html/rfc8259 */ \ - uint64_t unescaped = unsigned_lteq_against_input(in, 0x1F); \ - error_mask |= quote_mask & unescaped; \ - /* right shift of a signed value expected to be well-defined and standard */ \ - /* compliant as of C++20, */ \ - /* John Regher from Utah U. says this is fine code */ \ - prev_iter_inside_quote = \ - static_cast(static_cast(quote_mask) >> 63); \ - return quote_mask; \ -} \ +// We need to compile that code for multiple architectures. However, target +// attributes can be used only once by function definition. Huge macro seemed +// better than huge code duplication. uint64_t +// FIND_QUOTE_MASK_AND_BITS(Architecture T, simd_input in, uint64_t odd_ends, +// uint64_t &prev_iter_inside_quote, uint64_t "e_bits, uint64_t +// &error_mask) +#define FIND_QUOTE_MASK_AND_BITS(T, in, odd_ends, prev_iter_inside_quote, \ + quote_bits, error_mask) \ + { \ + quote_bits = cmp_mask_against_input(in, '"'); \ + quote_bits = quote_bits & ~odd_ends; \ + uint64_t quote_mask = compute_quote_mask(quote_bits); \ + quote_mask ^= prev_iter_inside_quote; \ + /* All Unicode characters may be placed within the \ + * quotation marks, except for the characters that MUST be escaped: \ + * quotation mark, reverse solidus, and the control characters (U+0000 \ + * through U+001F). \ + * https://tools.ietf.org/html/rfc8259 */ \ + uint64_t unescaped = unsigned_lteq_against_input(in, 0x1F); \ + error_mask |= quote_mask & unescaped; \ + /* right shift of a signed value expected to be well-defined and standard \ + * compliant as of C++20, \ + * John Regher from Utah U. says this is fine code */ \ + prev_iter_inside_quote = \ + static_cast(static_cast(quote_mask) >> 63); \ + return quote_mask; \ + } // Find structural bits in a 64-byte chunk. -// We need to compile that code for multiple architectures. However, target attributes can be used -// only once by function definition. Huge macro seemed better than huge code duplication. -// void FIND_STRUCTURAL_BITS_64(architecture T, +// We need to compile that code for multiple architectures. However, target +// attributes can be used only once by function definition. Huge macro seemed +// better than huge code duplication. void FIND_STRUCTURAL_BITS_64( +// Architecture T, // const uint8_t *buf, // size_t idx, // uint32_t *base_ptr, @@ -95,131 +103,137 @@ // uint64_t &prev_iter_ends_pseudo_pred, // uint64_t &structurals, // uint64_t &error_mask, -// utf8_checking_state &utf8_state, flatten function) -#define FIND_STRUCTURAL_BITS_64(T, \ - buf, \ - idx, \ - base_ptr, \ - base, \ - prev_iter_ends_odd_backslash, \ - prev_iter_inside_quote, \ - prev_iter_ends_pseudo_pred, \ - structurals, \ - error_mask, \ - utf8_state, \ - flat \ -) { \ - simd_input in = fill_input(buf); \ - check_utf8(in, utf8_state); \ - /* detect odd sequences of backslashes */ \ - uint64_t odd_ends = find_odd_backslash_sequences(in, prev_iter_ends_odd_backslash); \ - \ - /* detect insides of quote pairs ("quote_mask") and also our quote_bits */ \ - /* themselves */ \ - uint64_t quote_bits; \ - uint64_t quote_mask = find_quote_mask_and_bits( \ - in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask); \ - \ - /* take the previous iterations structural bits, not our current iteration, */ \ - /* and flatten */ \ - flat(base_ptr, base, idx, structurals); \ - \ - uint64_t whitespace; \ - find_whitespace_and_structurals(in, whitespace, structurals); \ - \ - /* fixup structurals to reflect quotes and add pseudo-structural characters */ \ - structurals = finalize_structurals(structurals, whitespace, quote_mask, \ - quote_bits, prev_iter_ends_pseudo_pred); \ -} \ - - -// We need to compile that code for multiple architectures. However, target attributes can be used -// only once by function definition. Huge macro seemed better than huge code duplication. -// errorValues FIND_STRUCTURAL_BITS(architecture T, const uint8_t *buf, size_t len, ParsedJson &pj, flatten functio ) -#define FIND_STRUCTURAL_BITS(T, buf, len, pj, flat) { \ - if (len > pj.bytecapacity) { \ - std::cerr << "Your ParsedJson object only supports documents up to " \ - << pj.bytecapacity << " bytes but you are trying to process " << len \ - << " bytes" << std::endl; \ - return simdjson::CAPACITY; \ - } \ - uint32_t *base_ptr = pj.structural_indexes; \ - uint32_t base = 0; \ - utf8_checking_state utf8_state; \ - \ - /* we have padded the input out to 64 byte multiple with the remainder being */ \ - /* zeros */ \ - \ - /* persistent state across loop */ \ - /* does the last iteration end with an odd-length sequence of backslashes? */ \ - /* either 0 or 1, but a 64-bit value */ \ - uint64_t prev_iter_ends_odd_backslash = 0ULL; \ - /* does the previous iteration end inside a double-quote pair? */ \ - uint64_t prev_iter_inside_quote = 0ULL; /* either all zeros or all ones */ \ - /* does the previous iteration end on something that is a predecessor of a */ \ - /* pseudo-structural character - i.e. whitespace or a structural character */ \ - /* effectively the very first char is considered to follow "whitespace" for */ \ - /* the */ \ - /* purposes of pseudo-structural character detection so we initialize to 1 */ \ - uint64_t prev_iter_ends_pseudo_pred = 1ULL; \ - \ - /* structurals are persistent state across loop as we flatten them on the */ \ - /* subsequent iteration into our array pointed to be base_ptr. */ \ - /* This is harmless on the first iteration as structurals==0 */ \ - /* and is done for performance reasons; we can hide some of the latency of the */ \ - /* expensive carryless multiply in the previous step with this work */ \ - uint64_t structurals = 0; \ - \ - size_t lenminus64 = len < 64 ? 0 : len - 64; \ - size_t idx = 0; \ - uint64_t error_mask = 0; /* for unescaped characters within strings (ASCII code points < 0x20) */ \ - \ - for (; idx < lenminus64; idx += 64) { \ - FIND_STRUCTURAL_BITS_64(T, &buf[idx], idx, base_ptr, base, prev_iter_ends_odd_backslash, \ - prev_iter_inside_quote, prev_iter_ends_pseudo_pred, structurals, \ - error_mask, utf8_state, flat); \ - } \ - /* If we have a final chunk of less than 64 bytes, pad it to 64 with spaces */ \ - /* before processing it (otherwise, we risk invalidating the UTF-8 checks). */ \ - if (idx < len) { \ - uint8_t tmpbuf[64]; \ - memset(tmpbuf, 0x20, 64); \ - memcpy(tmpbuf, buf + idx, len - idx); \ - FIND_STRUCTURAL_BITS_64(T, &tmpbuf[0], idx, base_ptr, base, prev_iter_ends_odd_backslash, \ - prev_iter_inside_quote, prev_iter_ends_pseudo_pred, structurals, \ - error_mask, utf8_state, flat); \ - idx += 64; \ - } \ - \ - /* is last string quote closed? */ \ - if (prev_iter_inside_quote) { \ - return simdjson::UNCLOSED_STRING; \ - } \ - \ - /* finally, flatten out the remaining structurals from the last iteration */ \ - flat(base_ptr, base, idx, structurals); \ - \ - pj.n_structural_indexes = base; \ - /* a valid JSON file cannot have zero structural indexes - we should have */ \ - /* found something */ \ - if (pj.n_structural_indexes == 0u) { \ - return simdjson::EMPTY; \ - } \ - if (base_ptr[pj.n_structural_indexes - 1] > len) { \ - return simdjson::UNEXPECTED_ERROR; \ - } \ - if (len != base_ptr[pj.n_structural_indexes - 1]) { \ - /* the string might not be NULL terminated, but we add a virtual NULL ending */ \ - /* character. */ \ - base_ptr[pj.n_structural_indexes++] = len; \ - } \ - /* make it safe to dereference one beyond this array */ \ - base_ptr[pj.n_structural_indexes] = 0; \ - if (error_mask) { \ - return simdjson::UNESCAPED_CHARS; \ - } \ - return check_utf8_errors(utf8_state); \ -} +// utf8_checking_state &utf8_state, flatten +// function) +#define FIND_STRUCTURAL_BITS_64( \ + T, buf, idx, base_ptr, base, prev_iter_ends_odd_backslash, \ + prev_iter_inside_quote, prev_iter_ends_pseudo_pred, structurals, \ + error_mask, utf8_state, flat) \ + { \ + simd_input in = fill_input(buf); \ + check_utf8(in, utf8_state); \ + /* detect odd sequences of backslashes */ \ + uint64_t odd_ends = \ + find_odd_backslash_sequences(in, prev_iter_ends_odd_backslash); \ + \ + /* detect insides of quote pairs ("quote_mask") and also our quote_bits \ + * themselves */ \ + uint64_t quote_bits; \ + uint64_t quote_mask = find_quote_mask_and_bits( \ + in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask); \ + \ + /* take the previous iterations structural bits, not our current \ + * iteration, \ + * and flatten */ \ + flat(base_ptr, base, idx, structurals); \ + \ + uint64_t whitespace; \ + find_whitespace_and_structurals(in, whitespace, structurals); \ + \ + /* fixup structurals to reflect quotes and add pseudo-structural \ + * characters */ \ + structurals = \ + finalize_structurals(structurals, whitespace, quote_mask, quote_bits, \ + prev_iter_ends_pseudo_pred); \ + } +// We need to compile that code for multiple architectures. However, target +// attributes can be used only once by function definition. Huge macro seemed +// better than huge code duplication. ErrorValues +// FIND_STRUCTURAL_BITS(Architecture T, const uint8_t *buf, size_t len, +// ParsedJson &pj, flatten function) +#define FIND_STRUCTURAL_BITS(T, buf, len, pj, flat) \ + { \ + if (len > pj.byte_capacity) { \ + std::cerr << "Your ParsedJson object only supports documents up to " \ + << pj.byte_capacity << " bytes but you are trying to process " \ + << len << " bytes" << std::endl; \ + return simdjson::CAPACITY; \ + } \ + uint32_t *base_ptr = pj.structural_indexes; \ + uint32_t base = 0; \ + utf8_checking_state utf8_state; \ + \ + /* we have padded the input out to 64 byte multiple with the remainder \ + * being zeros persistent state across loop does the last iteration end \ + * with an odd-length sequence of backslashes? */ \ + \ + /* either 0 or 1, but a 64-bit value */ \ + uint64_t prev_iter_ends_odd_backslash = 0ULL; \ + /* does the previous iteration end inside a double-quote pair? */ \ + uint64_t prev_iter_inside_quote = \ + 0ULL; /* either all zeros or all ones \ + * does the previous iteration end on something that is a \ + * predecessor of a pseudo-structural character - i.e. \ + * whitespace or a structural character effectively the very \ + * first char is considered to follow "whitespace" for the \ + * purposes of pseudo-structural character detection so we \ + * initialize to 1 */ \ + uint64_t prev_iter_ends_pseudo_pred = 1ULL; \ + \ + /* structurals are persistent state across loop as we flatten them on the \ + * subsequent iteration into our array pointed to be base_ptr. \ + * This is harmless on the first iteration as structurals==0 \ + * and is done for performance reasons; we can hide some of the latency of \ + * the \ + * expensive carryless multiply in the previous step with this work */ \ + uint64_t structurals = 0; \ + \ + size_t lenminus64 = len < 64 ? 0 : len - 64; \ + size_t idx = 0; \ + uint64_t error_mask = 0; /* for unescaped characters within strings (ASCII \ + code points < 0x20) */ \ + \ + for (; idx < lenminus64; idx += 64) { \ + FIND_STRUCTURAL_BITS_64( \ + T, &buf[idx], idx, base_ptr, base, prev_iter_ends_odd_backslash, \ + prev_iter_inside_quote, prev_iter_ends_pseudo_pred, structurals, \ + error_mask, utf8_state, flat); \ + } \ + /* If we have a final chunk of less than 64 bytes, pad it to 64 with \ + * spaces before processing it (otherwise, we risk invalidating the UTF-8 \ + * checks). */ \ + if (idx < len) { \ + uint8_t tmp_buf[64]; \ + memset(tmp_buf, 0x20, 64); \ + memcpy(tmp_buf, buf + idx, len - idx); \ + FIND_STRUCTURAL_BITS_64( \ + T, &tmp_buf[0], idx, base_ptr, base, prev_iter_ends_odd_backslash, \ + prev_iter_inside_quote, prev_iter_ends_pseudo_pred, structurals, \ + error_mask, utf8_state, flat); \ + idx += 64; \ + } \ + \ + /* is last string quote closed? */ \ + if (prev_iter_inside_quote) { \ + return simdjson::UNCLOSED_STRING; \ + } \ + \ + /* finally, flatten out the remaining structurals from the last iteration \ + */ \ + flat(base_ptr, base, idx, structurals); \ + \ + pj.n_structural_indexes = base; \ + /* a valid JSON file cannot have zero structural indexes - we should have \ + * found something */ \ + if (pj.n_structural_indexes == 0u) { \ + return simdjson::EMPTY; \ + } \ + if (base_ptr[pj.n_structural_indexes - 1] > len) { \ + return simdjson::UNEXPECTED_ERROR; \ + } \ + if (len != base_ptr[pj.n_structural_indexes - 1]) { \ + /* the string might not be NULL terminated, but we add a virtual NULL \ + * ending \ + * character. */ \ + base_ptr[pj.n_structural_indexes++] = len; \ + } \ + /* make it safe to dereference one beyond this array */ \ + base_ptr[pj.n_structural_indexes] = 0; \ + if (error_mask) { \ + return simdjson::UNESCAPED_CHARS; \ + } \ + return check_utf8_errors(utf8_state); \ + } #endif // SIMDJSON_STAGE1_FIND_MARKS_MACROS_H \ No newline at end of file diff --git a/include/simdjson/stage1_find_marks_westmere.h b/include/simdjson/stage1_find_marks_westmere.h index 376bce36..cb85d69c 100644 --- a/include/simdjson/stage1_find_marks_westmere.h +++ b/include/simdjson/stage1_find_marks_westmere.h @@ -1,26 +1,26 @@ #ifndef SIMDJSON_STAGE1_FIND_MARKS_WESTMERE_H #define SIMDJSON_STAGE1_FIND_MARKS_WESTMERE_H -#include "simdjson/stage1_find_marks.h" -#include "simdjson/stage1_find_marks_macros.h" -#include "simdjson/stage1_find_marks_flatten.h" #include "simdjson/simdutf8check_westmere.h" +#include "simdjson/stage1_find_marks.h" +#include "simdjson/stage1_find_marks_flatten.h" +#include "simdjson/stage1_find_marks_macros.h" #ifdef IS_X86_64 TARGET_WESTMERE namespace simdjson { -template<> -struct simd_input { +template <> struct simd_input { __m128i v0; __m128i v1; __m128i v2; __m128i v3; }; -template<> really_inline -simd_input fill_input(const uint8_t * ptr) { - struct simd_input in; +template <> +really_inline simd_input +fill_input(const uint8_t *ptr) { + struct simd_input in; in.v0 = _mm_loadu_si128(reinterpret_cast(ptr + 0)); in.v1 = _mm_loadu_si128(reinterpret_cast(ptr + 16)); in.v2 = _mm_loadu_si128(reinterpret_cast(ptr + 32)); @@ -28,61 +28,69 @@ simd_input fill_input(const uint return in; } -template<> really_inline -uint64_t compute_quote_mask(uint64_t quote_bits) { +template <> +really_inline uint64_t +compute_quote_mask(uint64_t quote_bits) { return _mm_cvtsi128_si64(_mm_clmulepi64_si128( _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0)); } -template<> -struct utf8_checking_state -{ +template <> struct utf8_checking_state { __m128i has_error = _mm_setzero_si128(); - processed_utf_bytes previous { - _mm_setzero_si128(), // rawbytes - _mm_setzero_si128(), // high_nibbles - _mm_setzero_si128() // carried_continuations + processed_utf_bytes previous{ + _mm_setzero_si128(), // raw_bytes + _mm_setzero_si128(), // high_nibbles + _mm_setzero_si128() // carried_continuations }; }; -template<> really_inline -void check_utf8(simd_input in, - utf8_checking_state& state) { - __m128i highbit = _mm_set1_epi8(0x80); - if ((_mm_testz_si128(_mm_or_si128(in.v0, in.v1), highbit)) == 1) { +template <> +really_inline void check_utf8( + simd_input in, + utf8_checking_state &state) { + __m128i high_bit = _mm_set1_epi8(0x80); + if ((_mm_testz_si128(_mm_or_si128(in.v0, in.v1), high_bit)) == 1) { // it is ascii, we just check continuation - state.has_error = _mm_or_si128( - _mm_cmpgt_epi8( - state.previous.carried_continuations, - _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)), - state.has_error); + state.has_error = + _mm_or_si128(_mm_cmpgt_epi8(state.previous.carried_continuations, + _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 1)), + state.has_error); } else { // it is not ascii so we have to do heavy work - state.previous = checkUTF8Bytes(in.v0, &(state.previous), &(state.has_error)); - state.previous = checkUTF8Bytes(in.v1, &(state.previous), &(state.has_error)); + state.previous = + check_utf8_bytes(in.v0, &(state.previous), &(state.has_error)); + state.previous = + check_utf8_bytes(in.v1, &(state.previous), &(state.has_error)); } - if ((_mm_testz_si128(_mm_or_si128(in.v2, in.v3), highbit)) == 1) { + if ((_mm_testz_si128(_mm_or_si128(in.v2, in.v3), high_bit)) == 1) { // it is ascii, we just check continuation - state.has_error = _mm_or_si128( - _mm_cmpgt_epi8( - state.previous.carried_continuations, - _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)), - state.has_error); + state.has_error = + _mm_or_si128(_mm_cmpgt_epi8(state.previous.carried_continuations, + _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 1)), + state.has_error); } else { // it is not ascii so we have to do heavy work - state.previous = checkUTF8Bytes(in.v2, &(state.previous), &(state.has_error)); - state.previous = checkUTF8Bytes(in.v3, &(state.previous), &(state.has_error)); + state.previous = + check_utf8_bytes(in.v2, &(state.previous), &(state.has_error)); + state.previous = + check_utf8_bytes(in.v3, &(state.previous), &(state.has_error)); } } -template<> really_inline -errorValues check_utf8_errors(utf8_checking_state& state) { - return _mm_testz_si128(state.has_error, state.has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS; +template <> +really_inline ErrorValues check_utf8_errors( + utf8_checking_state &state) { + return _mm_testz_si128(state.has_error, state.has_error) == 0 + ? simdjson::UTF8_ERROR + : simdjson::SUCCESS; } -template<> really_inline -uint64_t cmp_mask_against_input(simd_input in, uint8_t m) { +template <> +really_inline uint64_t cmp_mask_against_input( + simd_input in, uint8_t m) { const __m128i mask = _mm_set1_epi8(m); __m128i cmp_res_0 = _mm_cmpeq_epi8(in.v0, mask); uint64_t res_0 = _mm_movemask_epi8(cmp_res_0); @@ -95,54 +103,60 @@ uint64_t cmp_mask_against_input(simd_input really_inline -uint64_t unsigned_lteq_against_input(simd_input in, uint8_t m) { +template <> +really_inline uint64_t unsigned_lteq_against_input( + simd_input in, uint8_t m) { const __m128i maxval = _mm_set1_epi8(m); - __m128i cmp_res_0 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v0),maxval); + __m128i cmp_res_0 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v0), maxval); uint64_t res_0 = _mm_movemask_epi8(cmp_res_0); - __m128i cmp_res_1 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v1),maxval); + __m128i cmp_res_1 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v1), maxval); uint64_t res_1 = _mm_movemask_epi8(cmp_res_1); - __m128i cmp_res_2 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v2),maxval); + __m128i cmp_res_2 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v2), maxval); uint64_t res_2 = _mm_movemask_epi8(cmp_res_2); - __m128i cmp_res_3 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v3),maxval); + __m128i cmp_res_3 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v3), maxval); uint64_t res_3 = _mm_movemask_epi8(cmp_res_3); return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48); } -template<> really_inline -uint64_t find_odd_backslash_sequences(simd_input in, uint64_t &prev_iter_ends_odd_backslash) { - FIND_ODD_BACKSLASH_SEQUENCES(architecture::westmere, in, prev_iter_ends_odd_backslash); +template <> +really_inline uint64_t find_odd_backslash_sequences( + simd_input in, + uint64_t &prev_iter_ends_odd_backslash) { + FIND_ODD_BACKSLASH_SEQUENCES(Architecture::WESTMERE, in, + prev_iter_ends_odd_backslash); } -template<> really_inline -uint64_t find_quote_mask_and_bits(simd_input in, uint64_t odd_ends, - uint64_t &prev_iter_inside_quote, uint64_t "e_bits, uint64_t &error_mask) { - FIND_QUOTE_MASK_AND_BITS(architecture::westmere, in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask) +template <> +really_inline uint64_t find_quote_mask_and_bits( + simd_input in, uint64_t odd_ends, + uint64_t &prev_iter_inside_quote, uint64_t "e_bits, + uint64_t &error_mask) { + FIND_QUOTE_MASK_AND_BITS(Architecture::WESTMERE, in, odd_ends, + prev_iter_inside_quote, quote_bits, error_mask) } -template<> really_inline -void find_whitespace_and_structurals(simd_input in, - uint64_t &whitespace, uint64_t &structurals) { - const __m128i structural_table = _mm_setr_epi8(44, 125, 0, 0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123); - const __m128i white_table = _mm_setr_epi8( - 32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100); +template <> +really_inline void find_whitespace_and_structurals( + simd_input in, uint64_t &whitespace, + uint64_t &structurals) { + const __m128i structural_table = + _mm_setr_epi8(44, 125, 0, 0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123); + const __m128i white_table = _mm_setr_epi8(32, 100, 100, 100, 17, 100, 113, 2, + 100, 9, 10, 112, 100, 13, 100, 100); const __m128i struct_offset = _mm_set1_epi8(0xd4); const __m128i struct_mask = _mm_set1_epi8(32); - __m128i white0 = _mm_cmpeq_epi8(in.v0, - _mm_shuffle_epi8(white_table, in.v0)); - __m128i white1 = _mm_cmpeq_epi8(in.v1, - _mm_shuffle_epi8(white_table, in.v1)); - __m128i white2 = _mm_cmpeq_epi8(in.v2, - _mm_shuffle_epi8(white_table, in.v2)); - __m128i white3 = _mm_cmpeq_epi8(in.v3, - _mm_shuffle_epi8(white_table, in.v3)); + __m128i white0 = _mm_cmpeq_epi8(in.v0, _mm_shuffle_epi8(white_table, in.v0)); + __m128i white1 = _mm_cmpeq_epi8(in.v1, _mm_shuffle_epi8(white_table, in.v1)); + __m128i white2 = _mm_cmpeq_epi8(in.v2, _mm_shuffle_epi8(white_table, in.v2)); + __m128i white3 = _mm_cmpeq_epi8(in.v3, _mm_shuffle_epi8(white_table, in.v3)); uint64_t ws_res_0 = _mm_movemask_epi8(white0); uint64_t ws_res_1 = _mm_movemask_epi8(white1); uint64_t ws_res_2 = _mm_movemask_epi8(white2); uint64_t ws_res_3 = _mm_movemask_epi8(white3); - whitespace = (ws_res_0 | (ws_res_1 << 16) | (ws_res_2 << 32) | (ws_res_3 << 48)); + whitespace = + (ws_res_0 | (ws_res_1 << 16) | (ws_res_2 << 32) | (ws_res_3 << 48)); __m128i struct1_r1 = _mm_add_epi8(struct_offset, in.v0); __m128i struct2_r1 = _mm_add_epi8(struct_offset, in.v1); @@ -169,13 +183,12 @@ void find_whitespace_and_structurals(simd_input("true "); uint64_t mask4 = 0x00000000ffffffff; uint32_t error = 0; - uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++) + uint64_t + locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++) // this can read up to 7 bytes beyond the buffer size, but we require // SIMDJSON_PADDING of padding static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING); @@ -43,8 +44,9 @@ really_inline bool is_valid_false_atom(const uint8_t *loc) { // the last character of false (it being 5 byte long!) would be // ignored uint64_t error = 0; - uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++) - // this can read up to 7 bytes beyond the buffer size, but we require + uint64_t + locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++) + // this can read up to 7 bytes beyond the buffer size, but we require // SIMDJSON_PADDING of padding static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING); std::memcpy(&locval, loc, sizeof(uint64_t)); @@ -58,8 +60,9 @@ really_inline bool is_valid_null_atom(const uint8_t *loc) { uint64_t nv = *reinterpret_cast("null "); uint64_t mask4 = 0x00000000ffffffff; uint32_t error = 0; - uint64_t locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++) - // this can read up to 7 bytes beyond the buffer size, but we require + uint64_t + locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++) + // this can read up to 7 bytes beyond the buffer size, but we require // SIMDJSON_PADDING of padding static_assert(sizeof(uint64_t) - 1 <= SIMDJSON_PADDING); std::memcpy(&locval, loc, sizeof(uint64_t)); @@ -68,15 +71,15 @@ really_inline bool is_valid_null_atom(const uint8_t *loc) { return error == 0; } -template -WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER -int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj); +template +WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER int +unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj); -template +template int unified_machine(const char *buf, size_t len, ParsedJson &pj) { - return unified_machine(reinterpret_cast(buf), len, pj); + return unified_machine(reinterpret_cast(buf), len, pj); } -} +} // namespace simdjson #endif diff --git a/include/simdjson/stringparsing.h b/include/simdjson/stringparsing.h index ca097eb1..d1054265 100644 --- a/include/simdjson/stringparsing.h +++ b/include/simdjson/stringparsing.h @@ -6,8 +6,9 @@ #include "simdjson/parsedjson.h" #ifdef JSON_TEST_STRINGS -void foundString(const uint8_t *buf, const uint8_t *parsed_begin, const uint8_t *parsed_end); -void foundBadString(const uint8_t *buf); +void found_string(const uint8_t *buf, const uint8_t *parsed_begin, + const uint8_t *parsed_end); +void found_bad_string(const uint8_t *buf); #endif namespace simdjson { @@ -37,7 +38,6 @@ static const uint8_t escape_map[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; - // handle a unicode codepoint // write appropriate values into dest // src will advance 6 bytes or 12 bytes @@ -45,9 +45,10 @@ static const uint8_t escape_map[256] = { // return true if the unicode codepoint was valid // We work in little-endian then swap at write time WARN_UNUSED -really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, uint8_t **dst_ptr) { +really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, + uint8_t **dst_ptr) { // hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the - // conversion isn't valid; we defer the check for this to inside the + // conversion isn't valid; we defer the check for this to inside the // multilingual plane check uint32_t code_point = hex_to_u32_nocheck(*src_ptr + 2); *src_ptr += 6; @@ -58,14 +59,14 @@ really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, uint8_t **d return false; } uint32_t code_point_2 = hex_to_u32_nocheck(*src_ptr + 2); - + // if the first code point is invalid we will get here, as we will go past // the check for being outside the Basic Multilingual plane. If we don't - // find a \u immediately afterwards we fail out anyhow, but if we do, + // find a \u immediately afterwards we fail out anyhow, but if we do, // this check catches both the case of the first code point being invalid // or the second code point being invalid. if ((code_point | code_point_2) >> 16) { - return false; + return false; } code_point = @@ -84,18 +85,17 @@ struct parse_string_helper { }; // Finds where the backslashes and quotes are located. -template -parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, uint8_t *dst); +template +parse_string_helper find_bs_bits_and_quote_bits(const uint8_t *src, + uint8_t *dst); +template +WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER + really_inline bool + parse_string(UNUSED const uint8_t *buf, UNUSED size_t len, ParsedJson &pj, + UNUSED const uint32_t depth, UNUSED uint32_t offset); - -template -WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER really_inline -bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len, - ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset); - - -} +} // namespace simdjson /// Now include the specializations: #include "simdjson/stringparsing_arm64.h" diff --git a/include/simdjson/stringparsing_arm64.h b/include/simdjson/stringparsing_arm64.h index 11eb6b88..076ea71a 100644 --- a/include/simdjson/stringparsing_arm64.h +++ b/include/simdjson/stringparsing_arm64.h @@ -6,46 +6,51 @@ #ifdef IS_ARM64 namespace simdjson { -template<> really_inline -parse_string_helper find_bs_bits_and_quote_bits (const uint8_t *src, uint8_t *dst) { - // this can read up to 31 bytes beyond the buffer size, but we require - // SIMDJSON_PADDING of padding - static_assert(2 * sizeof(uint8x16_t) - 1 <= SIMDJSON_PADDING); - uint8x16_t v0 = vld1q_u8(src); - uint8x16_t v1 = vld1q_u8(src+16); - vst1q_u8(dst, v0); - vst1q_u8(dst+16, v1); - - uint8x16_t bs_mask = vmovq_n_u8('\\'); - uint8x16_t qt_mask = vmovq_n_u8('"'); - const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; - uint8x16_t cmp_bs_0 = vceqq_u8(v0, bs_mask); - uint8x16_t cmp_bs_1 = vceqq_u8(v1, bs_mask); - uint8x16_t cmp_qt_0 = vceqq_u8(v0, qt_mask); - uint8x16_t cmp_qt_1 = vceqq_u8(v1, qt_mask); - - cmp_bs_0 = vandq_u8(cmp_bs_0, bitmask); - cmp_bs_1 = vandq_u8(cmp_bs_1, bitmask); - cmp_qt_0 = vandq_u8(cmp_qt_0, bitmask); - cmp_qt_1 = vandq_u8(cmp_qt_1, bitmask); +template <> +really_inline parse_string_helper +find_bs_bits_and_quote_bits(const uint8_t *src, + uint8_t *dst) { + // this can read up to 31 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(2 * sizeof(uint8x16_t) - 1 <= SIMDJSON_PADDING); + uint8x16_t v0 = vld1q_u8(src); + uint8x16_t v1 = vld1q_u8(src + 16); + vst1q_u8(dst, v0); + vst1q_u8(dst + 16, v1); - uint8x16_t sum0 = vpaddq_u8(cmp_bs_0, cmp_bs_1); - uint8x16_t sum1 = vpaddq_u8(cmp_qt_0, cmp_qt_1); - sum0 = vpaddq_u8(sum0, sum1); - sum0 = vpaddq_u8(sum0, sum0); - return { + uint8x16_t bs_mask = vmovq_n_u8('\\'); + uint8x16_t qt_mask = vmovq_n_u8('"'); + const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; + uint8x16_t cmp_bs_0 = vceqq_u8(v0, bs_mask); + uint8x16_t cmp_bs_1 = vceqq_u8(v1, bs_mask); + uint8x16_t cmp_qt_0 = vceqq_u8(v0, qt_mask); + uint8x16_t cmp_qt_1 = vceqq_u8(v1, qt_mask); + + cmp_bs_0 = vandq_u8(cmp_bs_0, bit_mask); + cmp_bs_1 = vandq_u8(cmp_bs_1, bit_mask); + cmp_qt_0 = vandq_u8(cmp_qt_0, bit_mask); + cmp_qt_1 = vandq_u8(cmp_qt_1, bit_mask); + + uint8x16_t sum0 = vpaddq_u8(cmp_bs_0, cmp_bs_1); + uint8x16_t sum1 = vpaddq_u8(cmp_qt_0, cmp_qt_1); + sum0 = vpaddq_u8(sum0, sum1); + sum0 = vpaddq_u8(sum0, sum0); + return { vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0), // bs_bits - vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1) // quote_bits - }; + vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1) // quote_bits + }; } -template<> -WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER really_inline -bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len, - ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset) { - PARSE_STRING(architecture::arm64, buf, len, pj, depth, offset); -} +template <> +WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER + really_inline bool + parse_string(UNUSED const uint8_t *buf, + UNUSED size_t len, ParsedJson &pj, + UNUSED const uint32_t depth, + UNUSED uint32_t offset) { + PARSE_STRING(Architecture::ARM64, buf, len, pj, depth, offset); } +} // namespace simdjson #endif #endif diff --git a/include/simdjson/stringparsing_haswell.h b/include/simdjson/stringparsing_haswell.h index d49daacf..4aa806f9 100644 --- a/include/simdjson/stringparsing_haswell.h +++ b/include/simdjson/stringparsing_haswell.h @@ -4,34 +4,39 @@ #include "simdjson/stringparsing.h" #include "simdjson/stringparsing_macros.h" - #ifdef IS_X86_64 TARGET_HASWELL namespace simdjson { -template<> really_inline -parse_string_helper find_bs_bits_and_quote_bits (const uint8_t *src, uint8_t *dst) { - // this can read up to 31 bytes beyond the buffer size, but we require - // SIMDJSON_PADDING of padding - static_assert(sizeof(__m256i) - 1 <= SIMDJSON_PADDING); - __m256i v = _mm256_loadu_si256(reinterpret_cast(src)); - // store to dest unconditionally - we can overwrite the bits we don't like - // later - _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), v); - auto quote_mask = _mm256_cmpeq_epi8(v, _mm256_set1_epi8('"')); - return { - static_cast(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\')))), // bs_bits +template <> +really_inline parse_string_helper +find_bs_bits_and_quote_bits(const uint8_t *src, + uint8_t *dst) { + // this can read up to 31 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(sizeof(__m256i) - 1 <= SIMDJSON_PADDING); + __m256i v = _mm256_loadu_si256(reinterpret_cast(src)); + // store to dest unconditionally - we can overwrite the bits we don't like + // later + _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), v); + auto quote_mask = _mm256_cmpeq_epi8(v, _mm256_set1_epi8('"')); + return { + static_cast(_mm256_movemask_epi8( + _mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\')))), // bs_bits static_cast(_mm256_movemask_epi8(quote_mask)) // quote_bits - }; + }; } -template<> -WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER really_inline -bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len, - ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset) { - PARSE_STRING(architecture::haswell, buf, len, pj, depth, offset); +template <> +WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER + really_inline bool + parse_string(UNUSED const uint8_t *buf, + UNUSED size_t len, ParsedJson &pj, + UNUSED const uint32_t depth, + UNUSED uint32_t offset) { + PARSE_STRING(Architecture::HASWELL, buf, len, pj, depth, offset); } -} +} // namespace simdjson UNTARGET_REGION #endif diff --git a/include/simdjson/stringparsing_macros.h b/include/simdjson/stringparsing_macros.h index ae6c2e3e..ec699c81 100644 --- a/include/simdjson/stringparsing_macros.h +++ b/include/simdjson/stringparsing_macros.h @@ -1,80 +1,88 @@ #ifndef SIMDJSON_STRINGPARSING_MACROS_H #define SIMDJSON_STRINGPARSING_MACROS_H -// We need to compile that code for multiple architectures. However, target attributes can be used -// only once by function definition. Huge macro seemed better than huge code duplication. -// bool PARSE_STRING(architecture T, UNUSED const uint8_t *buf, UNUSED size_t len, -// ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset) -#define PARSE_STRING(T, buf, len, pj, depth, offset) { \ - pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"'); \ - const uint8_t *src = &buf[offset + 1]; /* we know that buf at offset is a " */ \ - uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t); \ - const uint8_t *const start_of_string = dst; \ - while (1) { \ - parse_string_helper helper = find_bs_bits_and_quote_bits(src, dst); \ - if(((helper.bs_bits - 1) & helper.quote_bits) != 0 ) { \ - /* we encountered quotes first. Move dst to point to quotes and exit */ \ - \ - /* find out where the quote is... */ \ - uint32_t quote_dist = trailingzeroes(helper.quote_bits); \ - \ - /* NULL termination is still handy if you expect all your strings to be NULL terminated? */ \ - /* It comes at a small cost */ \ - dst[quote_dist] = 0; \ - \ - uint32_t str_length = (dst - start_of_string) + quote_dist; \ - memcpy(pj.current_string_buf_loc,&str_length, sizeof(uint32_t)); \ - /*///////////////////// */ \ - /* Above, check for overflow in case someone has a crazy string (>=4GB?) */ \ - /* But only add the overflow check when the document itself exceeds 4GB */ \ - /* Currently unneeded because we refuse to parse docs larger or equal to 4GB. */ \ - /*////////////////////// */ \ - \ - \ - /* we advance the point, accounting for the fact that we have a NULL termination */ \ - pj.current_string_buf_loc = dst + quote_dist + 1; \ - return true; \ - } \ - if(((helper.quote_bits - 1) & helper.bs_bits ) != 0 ) { \ - /* find out where the backspace is */ \ - uint32_t bs_dist = trailingzeroes(helper.bs_bits); \ - uint8_t escape_char = src[bs_dist + 1]; \ - /* we encountered backslash first. Handle backslash */ \ - if (escape_char == 'u') { \ - /* move src/dst up to the start; they will be further adjusted */ \ - /* within the unicode codepoint handling code. */ \ - src += bs_dist; \ - dst += bs_dist; \ - if (!handle_unicode_codepoint(&src, &dst)) { \ - return false; \ - } \ - } else { \ - /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and */ \ - /* write bs_dist+1 characters to output */ \ - /* note this may reach beyond the part of the buffer we've actually */ \ - /* seen. I think this is ok */ \ - uint8_t escape_result = escape_map[escape_char]; \ - if (escape_result == 0u) { \ - return false; /* bogus escape value is an error */ \ - } \ - dst[bs_dist] = escape_result; \ - src += bs_dist + 2; \ - dst += bs_dist + 1; \ - } \ - } else { \ - /* they are the same. Since they can't co-occur, it means we encountered */ \ - /* neither. */ \ - if constexpr(T == architecture::westmere) { \ - src += 16; \ - dst += 16; \ - } else { \ - src += 32; \ - dst += 32; \ - } \ - } \ - } \ - /* can't be reached */ \ - return true; \ -} +// We need to compile that code for multiple architectures. However, target +// attributes can be used only once by function definition. Huge macro seemed +// better than huge code duplication.ç +// bool PARSE_STRING(Architecture T, const uint8_t *buf, size_t len, ParsedJson +// &pj,const uint32_t depth, uint32_t offset) +#define PARSE_STRING(T, buf, len, pj, depth, offset) \ + { \ + pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"'); \ + const uint8_t *src = \ + &buf[offset + 1]; /* we know that buf at offset is a " */ \ + uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t); \ + const uint8_t *const start_of_string = dst; \ + while (1) { \ + parse_string_helper helper = find_bs_bits_and_quote_bits(src, dst); \ + if (((helper.bs_bits - 1) & helper.quote_bits) != 0) { \ + /* we encountered quotes first. Move dst to point to quotes and exit \ + */ \ + \ + /* find out where the quote is... */ \ + uint32_t quote_dist = trailing_zeroes(helper.quote_bits); \ + \ + /* NULL termination is still handy if you expect all your strings to \ + * be NULL terminated? */ \ + /* It comes at a small cost */ \ + dst[quote_dist] = 0; \ + \ + uint32_t str_length = (dst - start_of_string) + quote_dist; \ + memcpy(pj.current_string_buf_loc, &str_length, sizeof(uint32_t)); \ + /***************************** \ + * Above, check for overflow in case someone has a crazy string \ + * (>=4GB?) _ \ + * But only add the overflow check when the document itself exceeds \ + * 4GB \ + * Currently unneeded because we refuse to parse docs larger or equal \ + * to 4GB. \ + ****************************/ \ + \ + /* we advance the point, accounting for the fact that we have a NULL \ + * termination */ \ + pj.current_string_buf_loc = dst + quote_dist + 1; \ + return true; \ + } \ + if (((helper.quote_bits - 1) & helper.bs_bits) != 0) { \ + /* find out where the backspace is */ \ + uint32_t bs_dist = trailing_zeroes(helper.bs_bits); \ + uint8_t escape_char = src[bs_dist + 1]; \ + /* we encountered backslash first. Handle backslash */ \ + if (escape_char == 'u') { \ + /* move src/dst up to the start; they will be further adjusted \ + within the unicode codepoint handling code. */ \ + src += bs_dist; \ + dst += bs_dist; \ + if (!handle_unicode_codepoint(&src, &dst)) { \ + return false; \ + } \ + } else { \ + /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and \ + * write bs_dist+1 characters to output \ + * note this may reach beyond the part of the buffer we've actually \ + * seen. I think this is ok */ \ + uint8_t escape_result = escape_map[escape_char]; \ + if (escape_result == 0u) { \ + return false; /* bogus escape value is an error */ \ + } \ + dst[bs_dist] = escape_result; \ + src += bs_dist + 2; \ + dst += bs_dist + 1; \ + } \ + } else { \ + /* they are the same. Since they can't co-occur, it means we \ + * encountered neither. */ \ + if constexpr (T == Architecture::WESTMERE) { \ + src += 16; \ + dst += 16; \ + } else { \ + src += 32; \ + dst += 32; \ + } \ + } \ + } \ + /* can't be reached */ \ + return true; \ + } #endif \ No newline at end of file diff --git a/include/simdjson/stringparsing_westmere.h b/include/simdjson/stringparsing_westmere.h index b7bb69ad..0e70a6eb 100644 --- a/include/simdjson/stringparsing_westmere.h +++ b/include/simdjson/stringparsing_westmere.h @@ -4,32 +4,37 @@ #include "simdjson/stringparsing.h" #include "simdjson/stringparsing_macros.h" - #ifdef IS_X86_64 TARGET_WESTMERE namespace simdjson { -template<> really_inline -parse_string_helper find_bs_bits_and_quote_bits (const uint8_t *src, uint8_t *dst) { - // this can read up to 31 bytes beyond the buffer size, but we require - // SIMDJSON_PADDING of padding - __m128i v = _mm_loadu_si128(reinterpret_cast(src)); - // store to dest unconditionally - we can overwrite the bits we don't like - // later - _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), v); - auto quote_mask = _mm_cmpeq_epi8(v, _mm_set1_epi8('"')); - return { - static_cast(_mm_movemask_epi8(_mm_cmpeq_epi8(v, _mm_set1_epi8('\\')))), // bs_bits +template <> +really_inline parse_string_helper +find_bs_bits_and_quote_bits(const uint8_t *src, + uint8_t *dst) { + // this can read up to 31 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + __m128i v = _mm_loadu_si128(reinterpret_cast(src)); + // store to dest unconditionally - we can overwrite the bits we don't like + // later + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), v); + auto quote_mask = _mm_cmpeq_epi8(v, _mm_set1_epi8('"')); + return { + static_cast( + _mm_movemask_epi8(_mm_cmpeq_epi8(v, _mm_set1_epi8('\\')))), // bs_bits static_cast(_mm_movemask_epi8(quote_mask)) // quote_bits - }; + }; } -template<> -WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER really_inline -bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len, - ParsedJson &pj, UNUSED const uint32_t depth, UNUSED uint32_t offset) { - PARSE_STRING(architecture::westmere, buf, len, pj, depth, offset); -} +template <> +WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER + really_inline bool + parse_string(UNUSED const uint8_t *buf, + UNUSED size_t len, ParsedJson &pj, + UNUSED const uint32_t depth, + UNUSED uint32_t offset) { + PARSE_STRING(Architecture::WESTMERE, buf, len, pj, depth, offset); } +} // namespace simdjson UNTARGET_REGION #endif diff --git a/src/jsonioutil.cpp b/src/jsonioutil.cpp index 2e99e6e1..1ae2fa07 100644 --- a/src/jsonioutil.cpp +++ b/src/jsonioutil.cpp @@ -1,35 +1,35 @@ #include "simdjson/jsonioutil.h" -#include #include +#include namespace simdjson { -char * allocate_padded_buffer(size_t length) { - // we could do a simple malloc - //return (char *) malloc(length + SIMDJSON_PADDING); - // However, we might as well align to cache lines... - size_t totalpaddedlength = length + SIMDJSON_PADDING; - char *padded_buffer = aligned_malloc_char(64, totalpaddedlength); - return padded_buffer; +char *allocate_padded_buffer(size_t length) { + // we could do a simple malloc + // return (char *) malloc(length + SIMDJSON_PADDING); + // However, we might as well align to cache lines... + size_t totalpaddedlength = length + SIMDJSON_PADDING; + char *padded_buffer = aligned_malloc_char(64, totalpaddedlength); + return padded_buffer; } -padded_string get_corpus(const std::string& filename) { +padded_string get_corpus(const std::string &filename) { std::FILE *fp = std::fopen(filename.c_str(), "rb"); if (fp != nullptr) { std::fseek(fp, 0, SEEK_END); size_t len = std::ftell(fp); padded_string s(len); - if(s.data() == nullptr) { + if (s.data() == nullptr) { std::fclose(fp); - throw std::runtime_error("could not allocate memory"); + throw std::runtime_error("could not allocate memory"); } std::rewind(fp); size_t readb = std::fread(s.data(), 1, len, fp); std::fclose(fp); - if(readb != len) { - throw std::runtime_error("could not read the data"); + if (readb != len) { + throw std::runtime_error("could not read the data"); } return s; } - throw std::runtime_error("could not load corpus"); -} + throw std::runtime_error("could not load corpus"); } +} // namespace simdjson diff --git a/src/jsonminifier.cpp b/src/jsonminifier.cpp index f8baf5f2..f74c7a80 100644 --- a/src/jsonminifier.cpp +++ b/src/jsonminifier.cpp @@ -38,13 +38,13 @@ static uint8_t jump_table[256 * 3] = { 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, }; -size_t jsonminify(const unsigned char *bytes, size_t howmany, - unsigned char *out) { +size_t json_minify(const unsigned char *bytes, size_t how_many, + unsigned char *out) { size_t i = 0, pos = 0; uint8_t quote = 0; uint8_t nonescape = 1; - while (i < howmany) { + while (i < how_many) { unsigned char c = bytes[i]; uint8_t *meta = jump_table + 3 * c; @@ -64,7 +64,6 @@ size_t jsonminify(const unsigned char *bytes, size_t howmany, namespace simdjson { - // some intrinsics are missing under GCC? #ifndef __clang__ #ifndef _MSC_VER @@ -85,8 +84,6 @@ static inline void _mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, #endif #endif - - // a straightforward comparison of a mask against input. static uint64_t cmp_mask_against_input_mini(__m256i input_lo, __m256i input_hi, __m256i mask) { @@ -98,8 +95,9 @@ static uint64_t cmp_mask_against_input_mini(__m256i input_lo, __m256i input_hi, } // take input from buf and remove useless whitespace, input and output can be -// the same, result is null terminated, return the string length (minus the null termination) -size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) { +// the same, result is null terminated, return the string length (minus the null +// termination) +size_t json_minify(const uint8_t *buf, size_t len, uint8_t *out) { // Useful constant masks const uint64_t even_bits = 0x5555555555555555ULL; const uint64_t odd_bits = ~even_bits; @@ -109,11 +107,13 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) { uint64_t prev_iter_inside_quote = 0ULL; // either all zeros or all ones size_t idx = 0; if (len >= 64) { - size_t avxlen = len - 63; + size_t avx_len = len - 63; - for (; idx < avxlen; idx += 64) { - __m256i input_lo = _mm256_loadu_si256(reinterpret_cast(buf + idx + 0)); - __m256i input_hi = _mm256_loadu_si256(reinterpret_cast(buf + idx + 32)); + for (; idx < avx_len; idx += 64) { + __m256i input_lo = + _mm256_loadu_si256(reinterpret_cast(buf + idx + 0)); + __m256i input_hi = + _mm256_loadu_si256(reinterpret_cast(buf + idx + 32)); uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('\\')); uint64_t start_edges = bs_bits & ~(bs_bits << 1); @@ -122,8 +122,8 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) { uint64_t odd_starts = start_edges & ~even_start_mask; uint64_t even_carries = bs_bits + even_starts; uint64_t odd_carries; - bool iter_ends_odd_backslash = add_overflow( - bs_bits, odd_starts, &odd_carries); + bool iter_ends_odd_backslash = + add_overflow(bs_bits, odd_starts, &odd_carries); odd_carries |= prev_iter_ends_odd_backslash; prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; uint64_t even_carry_ends = even_carries & ~bs_bits; @@ -137,7 +137,10 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) { uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128( _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0)); quote_mask ^= prev_iter_inside_quote; - prev_iter_inside_quote = static_cast(static_cast(quote_mask) >> 63);// might be undefined behavior, should be fully defined in C++20, ok according to John Regher from Utah University + prev_iter_inside_quote = static_cast( + static_cast(quote_mask) >> + 63); // might be undefined behavior, should be fully defined in C++20, + // ok according to John Regher from Utah University const __m256i low_nibble_mask = _mm256_setr_epi8( // 0 9 a b c d 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0, @@ -163,7 +166,8 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) { __m256i tmp_ws_hi = _mm256_cmpeq_epi8( _mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0)); - uint64_t ws_res_0 = static_cast(_mm256_movemask_epi8(tmp_ws_lo)); + uint64_t ws_res_0 = + static_cast(_mm256_movemask_epi8(tmp_ws_lo)); uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi); uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32)); whitespace &= ~quote_mask; @@ -175,17 +179,18 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) { int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF)); int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF)); int pop4 = hamming((~whitespace)); - __m256i vmask1 = - _mm256_loadu2_m128i(reinterpret_cast(mask128_epi8) + (mask2 & 0x7FFF), - reinterpret_cast(mask128_epi8) + (mask1 & 0x7FFF)); - __m256i vmask2 = - _mm256_loadu2_m128i(reinterpret_cast(mask128_epi8) + (mask4 & 0x7FFF), - reinterpret_cast(mask128_epi8) + (mask3 & 0x7FFF)); + __m256i vmask1 = _mm256_loadu2_m128i( + reinterpret_cast(mask128_epi8) + (mask2 & 0x7FFF), + reinterpret_cast(mask128_epi8) + (mask1 & 0x7FFF)); + __m256i vmask2 = _mm256_loadu2_m128i( + reinterpret_cast(mask128_epi8) + (mask4 & 0x7FFF), + reinterpret_cast(mask128_epi8) + (mask3 & 0x7FFF)); __m256i result1 = _mm256_shuffle_epi8(input_lo, vmask1); __m256i result2 = _mm256_shuffle_epi8(input_hi, vmask2); - _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(out + pop1), reinterpret_cast<__m128i *>(out), result1); - _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(out + pop3), reinterpret_cast<__m128i *>(out + pop2), - result2); + _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(out + pop1), + reinterpret_cast<__m128i *>(out), result1); + _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(out + pop3), + reinterpret_cast<__m128i *>(out + pop2), result2); out += pop4; } } @@ -195,8 +200,10 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) { uint8_t buffer[64]; memset(buffer, 0, 64); memcpy(buffer, buf + idx, len - idx); - __m256i input_lo = _mm256_loadu_si256(reinterpret_cast(buffer)); - __m256i input_hi = _mm256_loadu_si256(reinterpret_cast(buffer + 32)); + __m256i input_lo = + _mm256_loadu_si256(reinterpret_cast(buffer)); + __m256i input_hi = + _mm256_loadu_si256(reinterpret_cast(buffer + 32)); uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('\\')); uint64_t start_edges = bs_bits & ~(bs_bits << 1); @@ -205,10 +212,11 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) { uint64_t odd_starts = start_edges & ~even_start_mask; uint64_t even_carries = bs_bits + even_starts; uint64_t odd_carries; - //bool iter_ends_odd_backslash = - add_overflow( bs_bits, odd_starts, &odd_carries); + // bool iter_ends_odd_backslash = + add_overflow(bs_bits, odd_starts, &odd_carries); odd_carries |= prev_iter_ends_odd_backslash; - //prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; // we never use it + // prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; + // // we never use it uint64_t even_carry_ends = even_carries & ~bs_bits; uint64_t odd_carry_ends = odd_carries & ~bs_bits; uint64_t even_start_odd_end = even_carry_ends & odd_bits; @@ -220,7 +228,8 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) { uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128( _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0)); quote_mask ^= prev_iter_inside_quote; - // prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);// we don't need this anymore + // prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);// we + // don't need this anymore __m256i mask_20 = _mm256_set1_epi8(0x20); // c==32 __m256i mask_70 = @@ -254,23 +263,23 @@ size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out) { int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF)); int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF)); int pop4 = hamming((~whitespace)); - __m256i vmask1 = - _mm256_loadu2_m128i(reinterpret_cast(mask128_epi8) + (mask2 & 0x7FFF), - reinterpret_cast(mask128_epi8) + (mask1 & 0x7FFF)); - __m256i vmask2 = - _mm256_loadu2_m128i(reinterpret_cast(mask128_epi8) + (mask4 & 0x7FFF), - reinterpret_cast(mask128_epi8) + (mask3 & 0x7FFF)); + __m256i vmask1 = _mm256_loadu2_m128i( + reinterpret_cast(mask128_epi8) + (mask2 & 0x7FFF), + reinterpret_cast(mask128_epi8) + (mask1 & 0x7FFF)); + __m256i vmask2 = _mm256_loadu2_m128i( + reinterpret_cast(mask128_epi8) + (mask4 & 0x7FFF), + reinterpret_cast(mask128_epi8) + (mask3 & 0x7FFF)); __m256i result1 = _mm256_shuffle_epi8(input_lo, vmask1); __m256i result2 = _mm256_shuffle_epi8(input_hi, vmask2); - _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(buffer + pop1), reinterpret_cast<__m128i *>(buffer), - result1); - _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(buffer + pop3), reinterpret_cast<__m128i *>(buffer + pop2), - result2); + _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(buffer + pop1), + reinterpret_cast<__m128i *>(buffer), result1); + _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(buffer + pop3), + reinterpret_cast<__m128i *>(buffer + pop2), result2); memcpy(out, buffer, pop4); out += pop4; } - *out = '\0';// NULL termination + *out = '\0'; // NULL termination return out - initout; } -} +} // namespace simdjson #endif diff --git a/src/jsonparser.cpp b/src/jsonparser.cpp index d3b0b60d..4e1bd071 100644 --- a/src/jsonparser.cpp +++ b/src/jsonparser.cpp @@ -1,67 +1,68 @@ -#include "simdjson/jsonparser.h" -#ifdef _MSC_VER -#include -#include -#else -#include -#endif -#include "simdjson/simdjson.h" #include "simdjson/isadetection.h" +#include "simdjson/jsonparser.h" #include "simdjson/portability.h" +#include "simdjson/simdjson.h" namespace simdjson { -architecture find_best_supported_implementation() { - constexpr uint32_t haswell_flags = SIMDExtensions::AVX2 | SIMDExtensions::PCLMULQDQ - | SIMDExtensions::BMI1 | SIMDExtensions::BMI2; - constexpr uint32_t westmere_flags = SIMDExtensions::SSE42 | SIMDExtensions::PCLMULQDQ; +Architecture find_best_supported_implementation() { + constexpr uint32_t haswell_flags = + instruction_set::AVX2 | instruction_set::PCLMULQDQ | + instruction_set::BMI1 | instruction_set::BMI2; + constexpr uint32_t westmere_flags = + instruction_set::SSE42 | instruction_set::PCLMULQDQ; uint32_t supports = detect_supported_architectures(); // Order from best to worst (within architecture) - if ((haswell_flags & supports) == haswell_flags) return architecture::haswell; - if ((westmere_flags & supports) == westmere_flags) return architecture::westmere; - if (SIMDExtensions::NEON) return architecture::arm64; + if ((haswell_flags & supports) == haswell_flags) + return Architecture::HASWELL; + if ((westmere_flags & supports) == westmere_flags) + return Architecture::WESTMERE; + if (instruction_set::NEON) + return Architecture::ARM64; - return architecture::none; + return Architecture::NONE; } // Responsible to select the best json_parse implementation -int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded) { - architecture best_implementation = find_best_supported_implementation(); +int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, + bool realloc_if_needed) { + Architecture best_implementation = find_best_supported_implementation(); // Selecting the best implementation switch (best_implementation) { #ifdef IS_X86_64 - case architecture::haswell: - json_parse_ptr = &json_parse_implementation; + case Architecture::HASWELL: + json_parse_ptr = &json_parse_implementation; break; - case architecture::westmere: - json_parse_ptr = &json_parse_implementation; + case Architecture::WESTMERE: + json_parse_ptr = &json_parse_implementation; break; #endif #ifdef IS_ARM64 - case architecture::arm64: - json_parse_ptr = &json_parse_implementation; + case Architecture::ARM64: + json_parse_ptr = &json_parse_implementation; break; #endif - default : + default: std::cerr << "The processor is not supported by simdjson." << std::endl; return simdjson::UNEXPECTED_ERROR; } - return json_parse_ptr(buf, len, pj, reallocifneeded); + return json_parse_ptr(buf, len, pj, realloc_if_needed); } json_parse_functype *json_parse_ptr = &json_parse_dispatch; WARN_UNUSED -ParsedJson build_parsed_json(const uint8_t *buf, size_t len, bool reallocifneeded) { +ParsedJson build_parsed_json(const uint8_t *buf, size_t len, + bool realloc_if_needed) { ParsedJson pj; - bool ok = pj.allocateCapacity(len); - if(ok) { - json_parse(buf, len, pj, reallocifneeded); + bool ok = pj.allocate_capacity(len); + if (ok) { + json_parse(buf, len, pj, realloc_if_needed); } else { std::cerr << "failure during memory allocation " << std::endl; } return pj; } -} +} // namespace simdjson diff --git a/src/parsedjson.cpp b/src/parsedjson.cpp index efbde503..e431f1c7 100644 --- a/src/parsedjson.cpp +++ b/src/parsedjson.cpp @@ -1,324 +1,323 @@ #include "simdjson/parsedjson.h" namespace simdjson { -ParsedJson::ParsedJson() : - structural_indexes(nullptr), tape(nullptr), containing_scope_offset(nullptr), - ret_address(nullptr), string_buf(nullptr), current_string_buf_loc(nullptr) {} +ParsedJson::ParsedJson() + : structural_indexes(nullptr), tape(nullptr), + containing_scope_offset(nullptr), ret_address(nullptr), + string_buf(nullptr), current_string_buf_loc(nullptr) {} -ParsedJson::~ParsedJson() { - deallocate(); +ParsedJson::~ParsedJson() { deallocate(); } + +ParsedJson::ParsedJson(ParsedJson &&p) + : byte_capacity(p.byte_capacity), depth_capacity(p.depth_capacity), + tape_capacity(p.tape_capacity), string_capacity(p.string_capacity), + current_loc(p.current_loc), n_structural_indexes(p.n_structural_indexes), + structural_indexes(p.structural_indexes), tape(p.tape), + containing_scope_offset(p.containing_scope_offset), + ret_address(p.ret_address), string_buf(p.string_buf), + current_string_buf_loc(p.current_string_buf_loc), valid(p.valid) { + p.structural_indexes = nullptr; + p.tape = nullptr; + p.containing_scope_offset = nullptr; + p.ret_address = nullptr; + p.string_buf = nullptr; + p.current_string_buf_loc = nullptr; } -ParsedJson::ParsedJson(ParsedJson && p) - : bytecapacity(p.bytecapacity), - depthcapacity(p.depthcapacity), - tapecapacity(p.tapecapacity), - stringcapacity(p.stringcapacity), - current_loc(p.current_loc), - n_structural_indexes(p.n_structural_indexes), - structural_indexes(p.structural_indexes), - tape(p.tape), - containing_scope_offset(p.containing_scope_offset), - ret_address(p.ret_address), - string_buf(p.string_buf), - current_string_buf_loc(p.current_string_buf_loc), - isvalid(p.isvalid) { - p.structural_indexes=nullptr; - p.tape=nullptr; - p.containing_scope_offset=nullptr; - p.ret_address=nullptr; - p.string_buf=nullptr; - p.current_string_buf_loc=nullptr; - } - - - WARN_UNUSED -bool ParsedJson::allocateCapacity(size_t len, size_t maxdepth) { - if (maxdepth <= 0) { - maxdepth = 1; // don't let the user allocate nothing - } - if (len <= 0) { - len = 64; // allocating 0 bytes is wasteful. - } - if(len > SIMDJSON_MAXSIZE_BYTES) { - return false; - } - if ((len <= bytecapacity) && (depthcapacity < maxdepth)) { - return true; - } - deallocate(); - isvalid = false; - bytecapacity = 0; // will only set it to len after allocations are a success - n_structural_indexes = 0; - uint32_t max_structures = ROUNDUP_N(len, 64) + 2 + 7; - structural_indexes = new (std::nothrow) uint32_t[max_structures]; - // a pathological input like "[[[[..." would generate len tape elements, so need a capacity of len + 1 - size_t localtapecapacity = ROUNDUP_N(len + 1, 64); - // a document with only zero-length strings... could have len/3 string - // and we would need len/3 * 5 bytes on the string buffer - size_t localstringcapacity = ROUNDUP_N(5 * len / 3 + 32, 64); - string_buf = new (std::nothrow) uint8_t[localstringcapacity]; - tape = new (std::nothrow) uint64_t[localtapecapacity]; - containing_scope_offset = new (std::nothrow) uint32_t[maxdepth]; -#ifdef SIMDJSON_USE_COMPUTED_GOTO - ret_address = new (std::nothrow) void *[maxdepth]; -#else - ret_address = new (std::nothrow) char[maxdepth]; -#endif - if ((string_buf == nullptr) || (tape == nullptr) || - (containing_scope_offset == nullptr) || (ret_address == nullptr) || (structural_indexes == nullptr)) { - std::cerr << "Could not allocate memory" << std::endl; - delete[] ret_address; - delete[] containing_scope_offset; - delete[] tape; - delete[] string_buf; - delete[] structural_indexes; - - return false; - } - /* - // We do not need to initialize this content for parsing, though we could - // need to initialize it for safety. - memset(string_buf, 0 , localstringcapacity); - memset(structural_indexes, 0, max_structures * sizeof(uint32_t)); - memset(tape, 0, localtapecapacity * sizeof(uint64_t)); - */ - bytecapacity = len; - depthcapacity = maxdepth; - tapecapacity = localtapecapacity; - stringcapacity = localstringcapacity; +bool ParsedJson::allocate_capacity(size_t len, size_t max_depth) { + if (max_depth <= 0) { + max_depth = 1; // don't let the user allocate nothing + } + if (len <= 0) { + len = 64; // allocating 0 bytes is wasteful. + } + if (len > SIMDJSON_MAXSIZE_BYTES) { + return false; + } + if ((len <= byte_capacity) && (depth_capacity < max_depth)) { return true; -} - -bool ParsedJson::isValid() const { - return isvalid; -} - -int ParsedJson::getErrorCode() const { - return errorcode; -} - -std::string ParsedJson::getErrorMsg() const { - return errorMsg(errorcode); -} - -void ParsedJson::deallocate() { - bytecapacity = 0; - depthcapacity = 0; - tapecapacity = 0; - stringcapacity = 0; + } + deallocate(); + valid = false; + byte_capacity = 0; // will only set it to len after allocations are a success + n_structural_indexes = 0; + uint32_t max_structures = ROUNDUP_N(len, 64) + 2 + 7; + structural_indexes = new (std::nothrow) uint32_t[max_structures]; + // a pathological input like "[[[[..." would generate len tape elements, so + // need a capacity of len + 1 + size_t local_tape_capacity = ROUNDUP_N(len + 1, 64); + // a document with only zero-length strings... could have len/3 string + // and we would need len/3 * 5 bytes on the string buffer + size_t local_string_capacity = ROUNDUP_N(5 * len / 3 + 32, 64); + string_buf = new (std::nothrow) uint8_t[local_string_capacity]; + tape = new (std::nothrow) uint64_t[local_tape_capacity]; + containing_scope_offset = new (std::nothrow) uint32_t[max_depth]; +#ifdef SIMDJSON_USE_COMPUTED_GOTO + ret_address = new (std::nothrow) void *[max_depth]; +#else + ret_address = new (std::nothrow) char[max_depth]; +#endif + if ((string_buf == nullptr) || (tape == nullptr) || + (containing_scope_offset == nullptr) || (ret_address == nullptr) || + (structural_indexes == nullptr)) { + std::cerr << "Could not allocate memory" << std::endl; delete[] ret_address; delete[] containing_scope_offset; delete[] tape; delete[] string_buf; delete[] structural_indexes; - isvalid = false; + + return false; + } + /* + // We do not need to initialize this content for parsing, though we could + // need to initialize it for safety. + memset(string_buf, 0 , local_string_capacity); + memset(structural_indexes, 0, max_structures * sizeof(uint32_t)); + memset(tape, 0, local_tape_capacity * sizeof(uint64_t)); + */ + byte_capacity = len; + depth_capacity = max_depth; + tape_capacity = local_tape_capacity; + string_capacity = local_string_capacity; + return true; +} + +bool ParsedJson::is_valid() const { return valid; } + +int ParsedJson::get_error_code() const { return error_code; } + +std::string ParsedJson::get_error_message() const { + return error_message(error_code); +} + +void ParsedJson::deallocate() { + byte_capacity = 0; + depth_capacity = 0; + tape_capacity = 0; + string_capacity = 0; + delete[] ret_address; + delete[] containing_scope_offset; + delete[] tape; + delete[] string_buf; + delete[] structural_indexes; + valid = false; } void ParsedJson::init() { - current_string_buf_loc = string_buf; - current_loc = 0; - isvalid = false; + current_string_buf_loc = string_buf; + current_loc = 0; + valid = false; } WARN_UNUSED -bool ParsedJson::printjson(std::ostream &os) { - if(!isvalid) { - return false; - } - uint32_t string_length; - size_t tapeidx = 0; - uint64_t tape_val = tape[tapeidx]; - uint8_t type = (tape_val >> 56); - size_t howmany = 0; - if (type == 'r') { - howmany = tape_val & JSONVALUEMASK; - } else { - fprintf(stderr, "Error: no starting root node?"); - return false; - } - if (howmany > tapecapacity) { - fprintf(stderr, - "We may be exceeding the tape capacity. Is this a valid document?\n"); - return false; - } - tapeidx++; - bool *inobject = new bool[depthcapacity]; - auto *inobjectidx = new size_t[depthcapacity]; - int depth = 1; // only root at level 0 - inobjectidx[depth] = 0; - inobject[depth] = false; - for (; tapeidx < howmany; tapeidx++) { - tape_val = tape[tapeidx]; - uint64_t payload = tape_val & JSONVALUEMASK; - type = (tape_val >> 56); - if (!inobject[depth]) { - if ((inobjectidx[depth] > 0) && (type != ']')) { - os << ","; - } - inobjectidx[depth]++; - } else { // if (inobject) { - if ((inobjectidx[depth] > 0) && ((inobjectidx[depth] & 1) == 0) && - (type != '}')) { - os << ","; - } - if (((inobjectidx[depth] & 1) == 1)) { - os << ":"; - } - inobjectidx[depth]++; +bool ParsedJson::print_json(std::ostream &os) { + if (!valid) { + return false; + } + uint32_t string_length; + size_t tape_idx = 0; + uint64_t tape_val = tape[tape_idx]; + uint8_t type = (tape_val >> 56); + size_t how_many = 0; + if (type == 'r') { + how_many = tape_val & JSON_VALUE_MASK; + } else { + fprintf(stderr, "Error: no starting root node?"); + return false; + } + if (how_many > tape_capacity) { + fprintf( + stderr, + "We may be exceeding the tape capacity. Is this a valid document?\n"); + return false; + } + tape_idx++; + bool *in_object = new bool[depth_capacity]; + auto *in_object_idx = new size_t[depth_capacity]; + int depth = 1; // only root at level 0 + in_object_idx[depth] = 0; + in_object[depth] = false; + for (; tape_idx < how_many; tape_idx++) { + tape_val = tape[tape_idx]; + uint64_t payload = tape_val & JSON_VALUE_MASK; + type = (tape_val >> 56); + if (!in_object[depth]) { + if ((in_object_idx[depth] > 0) && (type != ']')) { + os << ","; } - switch (type) { - case '"': // we have a string - os << '"'; - memcpy(&string_length,string_buf + payload, sizeof(uint32_t)); - print_with_escapes((const unsigned char *)(string_buf + payload + sizeof(uint32_t)), string_length); - os << '"'; - break; - case 'l': // we have a long int - if (tapeidx + 1 >= howmany) { - delete[] inobject; - delete[] inobjectidx; - return false; - } - os << static_cast(tape[++tapeidx]); - break; - case 'd': // we have a double - if (tapeidx + 1 >= howmany){ - delete[] inobject; - delete[] inobjectidx; - return false; - } - double answer; - memcpy(&answer, &tape[++tapeidx], sizeof(answer)); - os << answer; - break; - case 'n': // we have a null - os << "null"; - break; - case 't': // we have a true - os << "true"; - break; - case 'f': // we have a false - os << "false"; - break; - case '{': // we have an object - os << '{'; - depth++; - inobject[depth] = true; - inobjectidx[depth] = 0; - break; - case '}': // we end an object - depth--; - os << '}'; - break; - case '[': // we start an array - os << '['; - depth++; - inobject[depth] = false; - inobjectidx[depth] = 0; - break; - case ']': // we end an array - depth--; - os << ']'; - break; - case 'r': // we start and end with the root node - fprintf(stderr, "should we be hitting the root node?\n"); - delete[] inobject; - delete[] inobjectidx; - return false; - default: - fprintf(stderr, "bug %c\n", type); - delete[] inobject; - delete[] inobjectidx; + in_object_idx[depth]++; + } else { // if (in_object) { + if ((in_object_idx[depth] > 0) && ((in_object_idx[depth] & 1) == 0) && + (type != '}')) { + os << ","; + } + if (((in_object_idx[depth] & 1) == 1)) { + os << ":"; + } + in_object_idx[depth]++; + } + switch (type) { + case '"': // we have a string + os << '"'; + memcpy(&string_length, string_buf + payload, sizeof(uint32_t)); + print_with_escapes( + (const unsigned char *)(string_buf + payload + sizeof(uint32_t)), + string_length); + os << '"'; + break; + case 'l': // we have a long int + if (tape_idx + 1 >= how_many) { + delete[] in_object; + delete[] in_object_idx; return false; } + os << static_cast(tape[++tape_idx]); + break; + case 'd': // we have a double + if (tape_idx + 1 >= how_many) { + delete[] in_object; + delete[] in_object_idx; + return false; + } + double answer; + memcpy(&answer, &tape[++tape_idx], sizeof(answer)); + os << answer; + break; + case 'n': // we have a null + os << "null"; + break; + case 't': // we have a true + os << "true"; + break; + case 'f': // we have a false + os << "false"; + break; + case '{': // we have an object + os << '{'; + depth++; + in_object[depth] = true; + in_object_idx[depth] = 0; + break; + case '}': // we end an object + depth--; + os << '}'; + break; + case '[': // we start an array + os << '['; + depth++; + in_object[depth] = false; + in_object_idx[depth] = 0; + break; + case ']': // we end an array + depth--; + os << ']'; + break; + case 'r': // we start and end with the root node + fprintf(stderr, "should we be hitting the root node?\n"); + delete[] in_object; + delete[] in_object_idx; + return false; + default: + fprintf(stderr, "bug %c\n", type); + delete[] in_object; + delete[] in_object_idx; + return false; } - delete[] inobject; - delete[] inobjectidx; - return true; + } + delete[] in_object; + delete[] in_object_idx; + return true; } WARN_UNUSED bool ParsedJson::dump_raw_tape(std::ostream &os) { - if(!isvalid) { - return false; - } - uint32_t string_length; - size_t tapeidx = 0; - uint64_t tape_val = tape[tapeidx]; - uint8_t type = (tape_val >> 56); - os << tapeidx << " : " << type; - tapeidx++; - size_t howmany = 0; - if (type == 'r') { - howmany = tape_val & JSONVALUEMASK; - } else { - fprintf(stderr, "Error: no starting root node?"); - return false; - } - os << "\t// pointing to " << howmany <<" (right after last node)\n"; - uint64_t payload; - for (; tapeidx < howmany; tapeidx++) { - os << tapeidx << " : "; - tape_val = tape[tapeidx]; - payload = tape_val & JSONVALUEMASK; - type = (tape_val >> 56); - switch (type) { - case '"': // we have a string - os << "string \""; - memcpy(&string_length,string_buf + payload, sizeof(uint32_t)); - print_with_escapes((const unsigned char *)(string_buf + payload + sizeof(uint32_t)), string_length); - os << '"'; - os << '\n'; - break; - case 'l': // we have a long int - if (tapeidx + 1 >= howmany) { - return false; - } - os << "integer " << static_cast(tape[++tapeidx]) << "\n"; - break; - case 'd': // we have a double - os << "float "; - if (tapeidx + 1 >= howmany) { - return false; - } - double answer; - memcpy(&answer, &tape[++tapeidx], sizeof(answer)); - os << answer << '\n'; - break; - case 'n': // we have a null - os << "null\n"; - break; - case 't': // we have a true - os << "true\n"; - break; - case 'f': // we have a false - os << "false\n"; - break; - case '{': // we have an object - os << "{\t// pointing to next tape location " << payload << " (first node after the scope) \n"; - break; - case '}': // we end an object - os << "}\t// pointing to previous tape location " << payload << " (start of the scope) \n"; - break; - case '[': // we start an array - os << "[\t// pointing to next tape location " << payload << " (first node after the scope) \n"; - break; - case ']': // we end an array - os << "]\t// pointing to previous tape location " << payload << " (start of the scope) \n"; - break; - case 'r': // we start and end with the root node - printf("end of root\n"); - return false; - default: + if (!valid) { + return false; + } + uint32_t string_length; + size_t tape_idx = 0; + uint64_t tape_val = tape[tape_idx]; + uint8_t type = (tape_val >> 56); + os << tape_idx << " : " << type; + tape_idx++; + size_t how_many = 0; + if (type == 'r') { + how_many = tape_val & JSON_VALUE_MASK; + } else { + fprintf(stderr, "Error: no starting root node?"); + return false; + } + os << "\t// pointing to " << how_many << " (right after last node)\n"; + uint64_t payload; + for (; tape_idx < how_many; tape_idx++) { + os << tape_idx << " : "; + tape_val = tape[tape_idx]; + payload = tape_val & JSON_VALUE_MASK; + type = (tape_val >> 56); + switch (type) { + case '"': // we have a string + os << "string \""; + memcpy(&string_length, string_buf + payload, sizeof(uint32_t)); + print_with_escapes( + (const unsigned char *)(string_buf + payload + sizeof(uint32_t)), + string_length); + os << '"'; + os << '\n'; + break; + case 'l': // we have a long int + if (tape_idx + 1 >= how_many) { return false; } + os << "integer " << static_cast(tape[++tape_idx]) << "\n"; + break; + case 'd': // we have a double + os << "float "; + if (tape_idx + 1 >= how_many) { + return false; + } + double answer; + memcpy(&answer, &tape[++tape_idx], sizeof(answer)); + os << answer << '\n'; + break; + case 'n': // we have a null + os << "null\n"; + break; + case 't': // we have a true + os << "true\n"; + break; + case 'f': // we have a false + os << "false\n"; + break; + case '{': // we have an object + os << "{\t// pointing to next tape location " << payload + << " (first node after the scope) \n"; + break; + case '}': // we end an object + os << "}\t// pointing to previous tape location " << payload + << " (start of the scope) \n"; + break; + case '[': // we start an array + os << "[\t// pointing to next tape location " << payload + << " (first node after the scope) \n"; + break; + case ']': // we end an array + os << "]\t// pointing to previous tape location " << payload + << " (start of the scope) \n"; + break; + case 'r': // we start and end with the root node + printf("end of root\n"); + return false; + default: + return false; } - tape_val = tape[tapeidx]; - payload = tape_val & JSONVALUEMASK; - type = (tape_val >> 56); - os << tapeidx << " : "<< type <<"\t// pointing to " << payload <<" (start root)\n"; - return true; -} + } + tape_val = tape[tape_idx]; + payload = tape_val & JSON_VALUE_MASK; + type = (tape_val >> 56); + os << tape_idx << " : " << type << "\t// pointing to " << payload + << " (start root)\n"; + return true; } +} // namespace simdjson diff --git a/src/parsedjsoniterator.cpp b/src/parsedjsoniterator.cpp index ad808ab3..96dc7a17 100644 --- a/src/parsedjsoniterator.cpp +++ b/src/parsedjsoniterator.cpp @@ -1,264 +1,269 @@ -#include "simdjson/parsedjson.h" #include "simdjson/common_defs.h" +#include "simdjson/parsedjson.h" #include namespace simdjson { -ParsedJson::iterator::iterator(ParsedJson &pj_) : pj(pj_), depth(0), location(0), tape_length(0), depthindex(nullptr) { - if(!pj.isValid()) { - throw InvalidJSON(); - } - depthindex = new scopeindex_t[pj.depthcapacity]; - // memory allocation would throw - //if(depthindex == nullptr) { - // return; - //} - depthindex[0].start_of_scope = location; - current_val = pj.tape[location++]; - current_type = (current_val >> 56); - depthindex[0].scope_type = current_type; - if (current_type == 'r') { - tape_length = current_val & JSONVALUEMASK; - if(location < tape_length) { - current_val = pj.tape[location]; - current_type = (current_val >> 56); - depth++; - depthindex[depth].start_of_scope = location; - depthindex[depth].scope_type = current_type; - } - } else { - // should never happen - throw InvalidJSON(); - } -} - -ParsedJson::iterator::~iterator() { - delete[] depthindex; -} - -ParsedJson::iterator::iterator(const iterator &o): - pj(o.pj), depth(o.depth), location(o.location), - tape_length(0), current_type(o.current_type), - current_val(o.current_val), depthindex(nullptr) { - depthindex = new scopeindex_t[pj.depthcapacity]; - // allocation might throw - memcpy(depthindex, o.depthindex, pj.depthcapacity * sizeof(depthindex[0])); - tape_length = o.tape_length; -} - -ParsedJson::iterator::iterator(iterator &&o): - pj(o.pj), depth(o.depth), location(o.location), - tape_length(o.tape_length), current_type(o.current_type), - current_val(o.current_val), depthindex(o.depthindex) { - o.depthindex = nullptr;// we take ownership -} - -bool ParsedJson::iterator::print(std::ostream &os, bool escape_strings) const { - if(!isOk()) { - return false; +ParsedJson::Iterator::Iterator(ParsedJson &pj_) + : pj(pj_), depth(0), location(0), tape_length(0), depth_index(nullptr) { + if (!pj.is_valid()) { + throw InvalidJSON(); + } + depth_index = new scopeindex_t[pj.depth_capacity]; + // memory allocation would throw + // if(depth_index == nullptr) { + // return; + //} + depth_index[0].start_of_scope = location; + current_val = pj.tape[location++]; + current_type = (current_val >> 56); + depth_index[0].scope_type = current_type; + if (current_type == 'r') { + tape_length = current_val & JSON_VALUE_MASK; + if (location < tape_length) { + current_val = pj.tape[location]; + current_type = (current_val >> 56); + depth++; + depth_index[depth].start_of_scope = location; + depth_index[depth].scope_type = current_type; } - switch (current_type) { - case '"': // we have a string + } else { + // should never happen + throw InvalidJSON(); + } +} + +ParsedJson::Iterator::~Iterator() { delete[] depth_index; } + +ParsedJson::Iterator::Iterator(const Iterator &o) + : pj(o.pj), depth(o.depth), location(o.location), tape_length(0), + current_type(o.current_type), current_val(o.current_val), + depth_index(nullptr) { + depth_index = new scopeindex_t[pj.depth_capacity]; + // allocation might throw + memcpy(depth_index, o.depth_index, + pj.depth_capacity * sizeof(depth_index[0])); + tape_length = o.tape_length; +} + +ParsedJson::Iterator::Iterator(Iterator &&o) + : pj(o.pj), depth(o.depth), location(o.location), + tape_length(o.tape_length), current_type(o.current_type), + current_val(o.current_val), depth_index(o.depth_index) { + o.depth_index = nullptr; // we take ownership +} + +bool ParsedJson::Iterator::print(std::ostream &os, bool escape_strings) const { + if (!is_ok()) { + return false; + } + switch (current_type) { + case '"': // we have a string os << '"'; - if(escape_strings) { - print_with_escapes(get_string(), os, get_string_length()); + if (escape_strings) { + print_with_escapes(get_string(), os, get_string_length()); } else { - // was: os << get_string();, but given that we can include null chars, we have to do something crazier: - std::copy(get_string(), get_string() + get_string_length(), std::ostream_iterator(os)); + // was: os << get_string();, but given that we can include null chars, we + // have to do something crazier: + std::copy(get_string(), get_string() + get_string_length(), + std::ostream_iterator(os)); } os << '"'; break; - case 'l': // we have a long int + case 'l': // we have a long int os << get_integer(); break; - case 'd': + case 'd': os << get_double(); break; - case 'n': // we have a null + case 'n': // we have a null os << "null"; break; - case 't': // we have a true + case 't': // we have a true os << "true"; break; - case 'f': // we have a false + case 'f': // we have a false os << "false"; break; - case '{': // we have an object - case '}': // we end an object - case '[': // we start an array - case ']': // we end an array + case '{': // we have an object + case '}': // we end an object + case '[': // we start an array + case ']': // we end an array os << static_cast(current_type); break; - default: + default: return false; - } - return true; + } + return true; } -bool ParsedJson::iterator::move_to(const char * pointer, uint32_t length) { - char* new_pointer = nullptr; - if (pointer[0] == '#') { - // Converting fragment representation to string representation - new_pointer = new char[length]; - uint32_t new_length = 0; - for (uint32_t i = 1; i < length; i++) { - if (pointer[i] == '%' && pointer[i+1] == 'x') { - try { - int fragment = std::stoi(std::string(&pointer[i+2], 2), nullptr, 16); - if (fragment == '\\' || fragment == '"' || (fragment <= 0x1F)) { - // escaping the character - new_pointer[new_length] = '\\'; - new_length++; - } - new_pointer[new_length] = fragment; - i += 3; - } - catch(std::invalid_argument& e) { - delete[] new_pointer; - return false; // the fragment is invalid +bool ParsedJson::Iterator::move_to(const char *pointer, uint32_t length) { + char *new_pointer = nullptr; + if (pointer[0] == '#') { + // Converting fragment representation to string representation + new_pointer = new char[length]; + uint32_t new_length = 0; + for (uint32_t i = 1; i < length; i++) { + if (pointer[i] == '%' && pointer[i + 1] == 'x') { + try { + int fragment = + std::stoi(std::string(&pointer[i + 2], 2), nullptr, 16); + if (fragment == '\\' || fragment == '"' || (fragment <= 0x1F)) { + // escaping the character + new_pointer[new_length] = '\\'; + new_length++; } + new_pointer[new_length] = fragment; + i += 3; + } catch (std::invalid_argument &e) { + delete[] new_pointer; + return false; // the fragment is invalid } - else { - new_pointer[new_length] = pointer[i]; - } - new_length++; + } else { + new_pointer[new_length] = pointer[i]; } - length = new_length; - pointer = new_pointer; + new_length++; } - - // saving the current state - size_t depth_s = depth; - size_t location_s = location; - uint8_t current_type_s = current_type; - uint64_t current_val_s = current_val; - scopeindex_t *depthindex_s = depthindex; - - rewind(); // The json pointer is used from the root of the document. + length = new_length; + pointer = new_pointer; + } - bool found = relative_move_to(pointer, length); - delete[] new_pointer; + // saving the current state + size_t depth_s = depth; + size_t location_s = location; + uint8_t current_type_s = current_type; + uint64_t current_val_s = current_val; + scopeindex_t *depth_index_s = depth_index; - if (!found) { - // since the pointer has found nothing, we get back to the original position. - depth = depth_s; - location = location_s; - current_type = current_type_s; - current_val = current_val_s; - depthindex = depthindex_s; - } + rewind(); // The json pointer is used from the root of the document. - return found; + bool found = relative_move_to(pointer, length); + delete[] new_pointer; + + if (!found) { + // since the pointer has found nothing, we get back to the original + // position. + depth = depth_s; + location = location_s; + current_type = current_type_s; + current_val = current_val_s; + depth_index = depth_index_s; + } + + return found; } -bool ParsedJson::iterator::relative_move_to(const char * pointer, uint32_t length) { - if (length == 0) { - // returns the whole document - return true; - } +bool ParsedJson::Iterator::relative_move_to(const char *pointer, + uint32_t length) { + if (length == 0) { + // returns the whole document + return true; + } - if (pointer[0] != '/') { - // '/' must be the first character + if (pointer[0] != '/') { + // '/' must be the first character + return false; + } + + // finding the key in an object or the index in an array + std::string key_or_index; + uint32_t offset = 1; + + // checking for the "-" case + if (is_array() && pointer[1] == '-') { + if (length != 2) { + // the pointer must be exactly "/-" + // there can't be anything more after '-' as an index return false; } + key_or_index = '-'; + offset = length; // will skip the loop coming right after + } - // finding the key in an object or the index in an array - std::string key_or_index; - uint32_t offset = 1; + // We either transform the first reference token to a valid json key + // or we make sure it is a valid index in an array. + for (; offset < length; offset++) { + if (pointer[offset] == '/') { + // beginning of the next key or index + break; + } + if (is_array() && (pointer[offset] < '0' || pointer[offset] > '9')) { + // the index of an array must be an integer + // we also make sure std::stoi won't discard whitespaces later + return false; + } + if (pointer[offset] == '~') { + // "~1" represents "/" + if (pointer[offset + 1] == '1') { + key_or_index += '/'; + offset++; + continue; + } + // "~0" represents "~" + if (pointer[offset + 1] == '0') { + key_or_index += '~'; + offset++; + continue; + } + } + if (pointer[offset] == '\\') { + if (pointer[offset + 1] == '\\' || pointer[offset + 1] == '"' || + (pointer[offset + 1] <= 0x1F)) { + key_or_index += pointer[offset + 1]; + offset++; + continue; + } + return false; // invalid escaped character + } + if (pointer[offset] == '\"') { + // unescaped quote character. this is an invalid case. + // lets do nothing and assume most pointers will be valid. + // it won't find any corresponding json key anyway. + // return false; + } + key_or_index += pointer[offset]; + } - // checking for the "-" case - if (is_array() && pointer[1] == '-') { - if (length != 2) { - // the pointer must be exactly "/-" - // there can't be anything more after '-' as an index + bool found = false; + if (is_object()) { + if (move_to_key(key_or_index.c_str(), key_or_index.length())) { + found = relative_move_to(pointer + offset, length - offset); + } + } else if (is_array()) { + if (key_or_index == "-") { // handling "-" case first + if (down()) { + while (next()) + ; // moving to the end of the array + // moving to the nonexistent value right after... + size_t npos; + if ((current_type == '[') || (current_type == '{')) { + // we need to jump + npos = (current_val & JSON_VALUE_MASK); + } else { + npos = + location + ((current_type == 'd' || current_type == 'l') ? 2 : 1); + } + location = npos; + current_val = pj.tape[npos]; + current_type = (current_val >> 56); + return true; // how could it fail ? + } + } else { // regular numeric index + // The index can't have a leading '0' + if (key_or_index[0] == '0' && key_or_index.length() > 1) { return false; } - key_or_index = '-'; - offset = length; // will skip the loop coming right after - } - - // We either transform the first reference token to a valid json key - // or we make sure it is a valid index in an array. - for (; offset < length ; offset++) { - if (pointer[offset] == '/') { - // beginning of the next key or index - break; - } - if (is_array() && (pointer[offset] < '0' || pointer[offset] > '9')) { - // the index of an array must be an integer - // we also make sure std::stoi won't discard whitespaces later + // it cannot be empty + if (key_or_index.length() == 0) { return false; } - if (pointer[offset] == '~') { - // "~1" represents "/" - if (pointer[offset+1] == '1') { - key_or_index += '/'; - offset++; - continue; - } - // "~0" represents "~" - if (pointer[offset+1] == '0') { - key_or_index += '~'; - offset++; - continue; - } + // we already checked the index contains only valid digits + uint32_t index = std::stoi(key_or_index); + if (move_to_index(index)) { + found = relative_move_to(pointer + offset, length - offset); } - if (pointer[offset] == '\\') { - if (pointer[offset+1] == '\\' || pointer[offset+1] == '"' || (pointer[offset+1] <= 0x1F)) { - key_or_index += pointer[offset+1]; - offset++; - continue; - } - return false; // invalid escaped character - } - if (pointer[offset] == '\"') { - // unescaped quote character. this is an invalid case. - // lets do nothing and assume most pointers will be valid. - // it won't find any corresponding json key anyway. - // return false; - } - key_or_index += pointer[offset]; } + } - bool found = false; - if (is_object()) { - if (move_to_key(key_or_index.c_str(), key_or_index.length())) { - found = relative_move_to(pointer+offset, length-offset); - } - } - else if(is_array()) { - if (key_or_index == "-") { // handling "-" case first - if (down()) { - while(next()); // moving to the end of the array - // moving to the nonexistent value right after... - size_t npos; - if ((current_type == '[') || (current_type == '{')) { - // we need to jump - npos = ( current_val & JSONVALUEMASK); - } else { - npos = location + ((current_type == 'd' || current_type == 'l') ? 2 : 1); - } - location = npos; - current_val = pj.tape[npos]; - current_type = (current_val >> 56); - return true; // how could it fail ? - } - } else { // regular numeric index - // The index can't have a leading '0' - if (key_or_index[0] == '0' && key_or_index.length() > 1) { - return false; - } - // it cannot be empty - if (key_or_index.length() == 0) { - return false; - } - // we already checked the index contains only valid digits - uint32_t index = std::stoi(key_or_index); - if (move_to_index(index)) { - found = relative_move_to(pointer+offset, length-offset); - } - } - } - - return found; -} + return found; } +} // namespace simdjson diff --git a/src/simdjson.cpp b/src/simdjson.cpp index d77c3a5b..fd7fbdb4 100644 --- a/src/simdjson.cpp +++ b/src/simdjson.cpp @@ -1,25 +1,30 @@ -#include #include "simdjson/simdjson.h" +#include namespace simdjson { -const std::map errorStrings = { +const std::map error_strings = { {SUCCESS, "No errors"}, {CAPACITY, "This ParsedJson can't support a document that big"}, {MEMALLOC, "Error allocating memory, we're most likely out of memory"}, {TAPE_ERROR, "Something went wrong while writing to the tape"}, {STRING_ERROR, "Problem while parsing a string"}, - {T_ATOM_ERROR, "Problem while parsing an atom starting with the letter 't'"}, - {F_ATOM_ERROR, "Problem while parsing an atom starting with the letter 'f'"}, - {N_ATOM_ERROR, "Problem while parsing an atom starting with the letter 'n'"}, + {T_ATOM_ERROR, + "Problem while parsing an atom starting with the letter 't'"}, + {F_ATOM_ERROR, + "Problem while parsing an atom starting with the letter 'f'"}, + {N_ATOM_ERROR, + "Problem while parsing an atom starting with the letter 'n'"}, {NUMBER_ERROR, "Problem while parsing a number"}, {UTF8_ERROR, "The input is not valid UTF-8"}, {UNITIALIZED, "Unitialized"}, {EMPTY, "Empty"}, - {UNESCAPED_CHARS, "Within strings, some characters must be escapted, we found unescapted characters"}, - {UNEXPECTED_ERROR, "Unexpected error, consider reporting this problem as you may have found a bug in simdjson"}, + {UNESCAPED_CHARS, "Within strings, some characters must be escapted, we " + "found unescapted characters"}, + {UNEXPECTED_ERROR, "Unexpected error, consider reporting this problem as " + "you may have found a bug in simdjson"}, }; -const std::string& errorMsg(const int errorCode) { - return errorStrings.at(errorCode); -} +const std::string &error_message(const int error_code) { + return error_strings.at(error_code); } +} // namespace simdjson diff --git a/src/stage1_find_marks.cpp b/src/stage1_find_marks.cpp index 605dbc97..fbf561c4 100644 --- a/src/stage1_find_marks.cpp +++ b/src/stage1_find_marks.cpp @@ -1,37 +1,41 @@ #include "simdjson/portability.h" - #ifdef IS_X86_64 #include "simdjson/stage1_find_marks_haswell.h" #include "simdjson/stage1_find_marks_westmere.h" TARGET_HASWELL namespace simdjson { -template<> -int find_structural_bits(const uint8_t *buf, size_t len, ParsedJson &pj) { - FIND_STRUCTURAL_BITS(architecture::haswell, buf, len, pj, simdjson::haswell::flatten_bits); +template <> +int find_structural_bits(const uint8_t *buf, size_t len, + ParsedJson &pj) { + FIND_STRUCTURAL_BITS(Architecture::HASWELL, buf, len, pj, + simdjson::haswell::flatten_bits); } -} // simdjson +} // namespace simdjson UNTARGET_REGION TARGET_WESTMERE namespace simdjson { -template<> -int find_structural_bits(const uint8_t *buf, size_t len, ParsedJson &pj) { - FIND_STRUCTURAL_BITS(architecture::westmere, buf, len, pj, simdjson::flatten_bits); +template <> +int find_structural_bits(const uint8_t *buf, size_t len, + ParsedJson &pj) { + FIND_STRUCTURAL_BITS(Architecture::WESTMERE, buf, len, pj, + simdjson::flatten_bits); } -} // simdjson +} // namespace simdjson UNTARGET_REGION #endif - #ifdef IS_ARM64 #include "simdjson/stage1_find_marks_arm64.h" namespace simdjson { -template<> -int find_structural_bits(const uint8_t *buf, size_t len, ParsedJson &pj) { - FIND_STRUCTURAL_BITS(architecture::arm64, buf, len, pj, simdjson::flatten_bits); -} +template <> +int find_structural_bits(const uint8_t *buf, size_t len, + ParsedJson &pj) { + FIND_STRUCTURAL_BITS(Architecture::ARM64, buf, len, pj, + simdjson::flatten_bits); } +} // namespace simdjson #endif diff --git a/src/stage2_build_tape.cpp b/src/stage2_build_tape.cpp index 2c333d70..423147d5 100644 --- a/src/stage2_build_tape.cpp +++ b/src/stage2_build_tape.cpp @@ -4,13 +4,13 @@ namespace simdjson { // this macro reads the next structural character, updating idx, i and c. #define UPDATE_CHAR() \ -{ \ - idx = pj.structural_indexes[i++]; \ - c = buf[idx]; \ -} + { \ + idx = pj.structural_indexes[i++]; \ + c = buf[idx]; \ + } #ifdef SIMDJSON_USE_COMPUTED_GOTO -#define SET_GOTO_ARRAY_CONTINUE() pj.ret_address[depth] = &&array_continue; +#define SET_GOTO_ARRAY_CONTINUE() pj.ret_address[depth] = &&array_continue; #define SET_GOTO_OBJECT_CONTINUE() pj.ret_address[depth] = &&object_continue; #define SET_GOTO_START_CONTINUE() pj.ret_address[depth] = &&start_continue; #define GOTO_CONTINUE() goto *pj.ret_address[depth]; @@ -18,517 +18,550 @@ namespace simdjson { #define SET_GOTO_ARRAY_CONTINUE() pj.ret_address[depth] = 'a'; #define SET_GOTO_OBJECT_CONTINUE() pj.ret_address[depth] = 'o'; #define SET_GOTO_START_CONTINUE() pj.ret_address[depth] = 's'; -#define GOTO_CONTINUE() { \ - if(pj.ret_address[depth] == 'a') { \ - goto array_continue; \ - } else if (pj.ret_address[depth] == 'o') { \ - goto object_continue; \ - } else { \ - goto start_continue; \ - } \ -} -#endif +#define GOTO_CONTINUE() \ + { \ + if (pj.ret_address[depth] == 'a') { \ + goto array_continue; \ + } else if (pj.ret_address[depth] == 'o') { \ + goto object_continue; \ + } else { \ + goto start_continue; \ + } \ + } +#endif /************ * The JSON is parsed to a tape, see the accompanying tape.md file * for documentation. ***********/ -// We need to compile that code for multiple architectures. However, target attributes can be used -// only once by function definition. Huge macro seemed better than huge code duplication. -// int UNIFIED_MACHINE(const uint8_t *buf, size_t len, ParsedJson &pj) -#define UNIFIED_MACHINE(T, buf, len, pj) { \ - if (ALLOW_SAME_PAGE_BUFFER_OVERRUN) { \ - memset((uint8_t*)buf + len, 0, SIMDJSON_PADDING); /* to please valgrind */ \ - } \ - uint32_t i = 0; /* index of the structural character (0,1,2,3...) */ \ - uint32_t idx; /* location of the structural character in the input (buf) */ \ - uint8_t c; /* used to track the (structural) character we are looking at, updated */ \ - /* by UPDATE_CHAR macro */ \ - uint32_t depth = 0; /* could have an arbitrary starting depth */ \ - pj.init(); /* sets isvalid to false */ \ - if(pj.bytecapacity < len) { \ - pj.errorcode = simdjson::CAPACITY; \ - return pj.errorcode; \ - } \ - \ - /*//////////////////////////// START STATE ///////////////////////////// */ \ - SET_GOTO_START_CONTINUE() \ - pj.containing_scope_offset[depth] = pj.get_current_loc(); \ - pj.write_tape(0, 'r'); /* r for root, 0 is going to get overwritten */ \ - /* the root is used, if nothing else, to capture the size of the tape */ \ - depth++; /* everything starts at depth = 1, depth = 0 is just for the root, the root may contain an object, an array or something else. */ \ - if (depth >= pj.depthcapacity) { \ - goto fail; \ - } \ - \ - UPDATE_CHAR(); \ - switch (c) { \ - case '{': \ - pj.containing_scope_offset[depth] = pj.get_current_loc(); \ - SET_GOTO_START_CONTINUE(); \ - depth++; \ - if (depth >= pj.depthcapacity) { \ - goto fail; \ - } \ - pj.write_tape(0, c); /* strangely, moving this to object_begin slows things down */ \ - goto object_begin; \ - case '[': \ - pj.containing_scope_offset[depth] = pj.get_current_loc(); \ - SET_GOTO_START_CONTINUE(); \ - depth++; \ - if (depth >= pj.depthcapacity) { \ - goto fail; \ - } \ - pj.write_tape(0, c); \ - goto array_begin; \ -/* #define SIMDJSON_ALLOWANYTHINGINROOT */ \ - /* A JSON text is a serialized value. Note that certain previous */ \ - /* specifications of JSON constrained a JSON text to be an object or an */ \ - /* array. Implementations that generate only objects or arrays where a */ \ - /* JSON text is called for will be interoperable in the sense that all */ \ - /* implementations will accept these as conforming JSON texts. */ \ - /* https://tools.ietf.org/html/rfc8259 */ \ -/* #ifdef SIMDJSON_ALLOWANYTHINGINROOT */ \ - case '"': { \ - if (!parse_string(buf, len, pj, depth, idx)) { \ - goto fail; \ - } \ - break; \ - } \ - case 't': { \ - /* we need to make a copy to make sure that the string is space terminated. */ \ - /* this only applies to the JSON document made solely of the true value. */ \ - /* this will almost never be called in practice */ \ - char * copy = static_cast(malloc(len + SIMDJSON_PADDING)); \ - if(copy == nullptr) { \ - goto fail; \ - } \ - memcpy(copy, buf, len); \ - copy[len] = ' '; \ - if (!is_valid_true_atom(reinterpret_cast(copy) + idx)) { \ - free(copy); \ - goto fail; \ - } \ - free(copy); \ - pj.write_tape(0, c); \ - break; \ - } \ - case 'f': { \ - /* we need to make a copy to make sure that the string is space terminated. */ \ - /* this only applies to the JSON document made solely of the false value. */ \ - /* this will almost never be called in practice */ \ - char * copy = static_cast(malloc(len + SIMDJSON_PADDING)); \ - if(copy == nullptr) { \ - goto fail; \ - } \ - memcpy(copy, buf, len); \ - copy[len] = ' '; \ - if (!is_valid_false_atom(reinterpret_cast(copy) + idx)) { \ - free(copy); \ - goto fail; \ - } \ - free(copy); \ - pj.write_tape(0, c); \ - break; \ - } \ - case 'n': { \ - /* we need to make a copy to make sure that the string is space terminated. */ \ - /* this only applies to the JSON document made solely of the null value. */ \ - /* this will almost never be called in practice */ \ - char * copy = static_cast(malloc(len + SIMDJSON_PADDING)); \ - if(copy == nullptr) { \ - goto fail; \ - } \ - memcpy(copy, buf, len); \ - copy[len] = ' '; \ - if (!is_valid_null_atom(reinterpret_cast(copy) + idx)) { \ - free(copy); \ - goto fail; \ - } \ - free(copy); \ - pj.write_tape(0, c); \ - break; \ - } \ - case '0': \ - case '1': \ - case '2': \ - case '3': \ - case '4': \ - case '5': \ - case '6': \ - case '7': \ - case '8': \ - case '9': { \ - /* we need to make a copy to make sure that the string is space terminated. */ \ - /* this is done only for JSON documents made of a sole number */ \ - /* this will almost never be called in practice. We terminate with a space */ \ - /* because we do not want to allow NULLs in the middle of a number (whereas a */ \ - /* space in the middle of a number would be identified in stage 1). */ \ - char * copy = static_cast(malloc(len + SIMDJSON_PADDING)); \ - if(copy == nullptr) { \ - goto fail; \ - } \ - memcpy(copy, buf, len); \ - copy[len] = ' '; \ - if (!parse_number(reinterpret_cast(copy), pj, idx, false)) { \ - free(copy); \ - goto fail; \ - } \ - free(copy); \ - break; \ - } \ - case '-': { \ - /* we need to make a copy to make sure that the string is NULL terminated. */ \ - /* this is done only for JSON documents made of a sole number */ \ - /* this will almost never be called in practice */ \ - char * copy = static_cast(malloc(len + SIMDJSON_PADDING)); \ - if(copy == nullptr) { \ - goto fail; \ - } \ - memcpy(copy, buf, len); \ - copy[len] = ' '; \ - if (!parse_number(reinterpret_cast(copy), pj, idx, true)) { \ - free(copy); \ - goto fail; \ - } \ - free(copy); \ - break; \ - } \ -/* #endif // ALLOWANYTHINGINROOT */ \ - default: \ - goto fail; \ - } \ -start_continue: \ - /* the string might not be NULL terminated. */ \ - if(i + 1 == pj.n_structural_indexes) { \ - goto succeed; \ - } else { \ - goto fail; \ - } \ - /*//////////////////////////// OBJECT STATES ///////////////////////////// */ \ - \ -object_begin: \ - UPDATE_CHAR(); \ - switch (c) { \ - case '"': { \ - if (!parse_string(buf, len, pj, depth, idx)) { \ - goto fail; \ - } \ - goto object_key_state; \ - } \ - case '}': \ - goto scope_end; /* could also go to object_continue */ \ - default: \ - goto fail; \ - } \ - \ -object_key_state: \ - UPDATE_CHAR(); \ - if (c != ':') { \ - goto fail; \ - } \ - UPDATE_CHAR(); \ - switch (c) { \ - case '"': { \ - if (!parse_string(buf, len, pj, depth, idx)) { \ - goto fail; \ - } \ - break; \ - } \ - case 't': \ - if (!is_valid_true_atom(buf + idx)) { \ - goto fail; \ - } \ - pj.write_tape(0, c); \ - break; \ - case 'f': \ - if (!is_valid_false_atom(buf + idx)) { \ - goto fail; \ - } \ - pj.write_tape(0, c); \ - break; \ - case 'n': \ - if (!is_valid_null_atom(buf + idx)) { \ - goto fail; \ - } \ - pj.write_tape(0, c); \ - break; \ - case '0': \ - case '1': \ - case '2': \ - case '3': \ - case '4': \ - case '5': \ - case '6': \ - case '7': \ - case '8': \ - case '9': { \ - if (!parse_number(buf, pj, idx, false)) { \ - goto fail; \ - } \ - break; \ - } \ - case '-': { \ - if (!parse_number(buf, pj, idx, true)) { \ - goto fail; \ - } \ - break; \ - } \ - case '{': { \ - pj.containing_scope_offset[depth] = pj.get_current_loc(); \ - pj.write_tape(0, c); /* here the compilers knows what c is so this gets optimized */ \ - /* we have not yet encountered } so we need to come back for it */ \ - SET_GOTO_OBJECT_CONTINUE() \ - /* we found an object inside an object, so we need to increment the depth */ \ - depth++; \ - if (depth >= pj.depthcapacity) { \ - goto fail; \ - } \ - \ - goto object_begin; \ - } \ - case '[': { \ - pj.containing_scope_offset[depth] = pj.get_current_loc(); \ - pj.write_tape(0, c); /* here the compilers knows what c is so this gets optimized */ \ - /* we have not yet encountered } so we need to come back for it */ \ - SET_GOTO_OBJECT_CONTINUE() \ - /* we found an array inside an object, so we need to increment the depth */ \ - depth++; \ - if (depth >= pj.depthcapacity) { \ - goto fail; \ - } \ - goto array_begin; \ - } \ - default: \ - goto fail; \ - } \ - \ -object_continue: \ - UPDATE_CHAR(); \ - switch (c) { \ - case ',': \ - UPDATE_CHAR(); \ - if (c != '"') { \ - goto fail; \ - } else { \ - if (!parse_string(buf, len, pj, depth, idx)) { \ - goto fail; \ - } \ - goto object_key_state; \ - } \ - case '}': \ - goto scope_end; \ - default: \ - goto fail; \ - } \ - \ - /*//////////////////////////// COMMON STATE ///////////////////////////// */ \ - \ -scope_end: \ - /* write our tape location to the header scope */ \ - depth--; \ - pj.write_tape(pj.containing_scope_offset[depth], c); \ - pj.annotate_previousloc(pj.containing_scope_offset[depth], \ - pj.get_current_loc()); \ - /* goto saved_state */ \ - GOTO_CONTINUE() \ - \ - /*//////////////////////////// ARRAY STATES ///////////////////////////// */ \ -array_begin: \ - UPDATE_CHAR(); \ - if (c == ']') { \ - goto scope_end; /* could also go to array_continue */ \ - } \ - \ -main_array_switch: \ - /* we call update char on all paths in, so we can peek at c on the */ \ - /* on paths that can accept a close square brace (post-, and at start) */ \ - switch (c) { \ - case '"': { \ - if (!parse_string(buf, len, pj, depth, idx)) { \ - goto fail; \ - } \ - break; \ - } \ - case 't': \ - if (!is_valid_true_atom(buf + idx)) { \ - goto fail; \ - } \ - pj.write_tape(0, c); \ - break; \ - case 'f': \ - if (!is_valid_false_atom(buf + idx)) { \ - goto fail; \ - } \ - pj.write_tape(0, c); \ - break; \ - case 'n': \ - if (!is_valid_null_atom(buf + idx)) { \ - goto fail; \ - } \ - pj.write_tape(0, c); \ - break; /* goto array_continue; */ \ - \ - case '0': \ - case '1': \ - case '2': \ - case '3': \ - case '4': \ - case '5': \ - case '6': \ - case '7': \ - case '8': \ - case '9': { \ - if (!parse_number(buf, pj, idx, false)) { \ - goto fail; \ - } \ - break; /* goto array_continue; */ \ - } \ - case '-': { \ - if (!parse_number(buf, pj, idx, true)) { \ - goto fail; \ - } \ - break; /* goto array_continue; */ \ - } \ - case '{': { \ - /* we have not yet encountered ] so we need to come back for it */ \ - pj.containing_scope_offset[depth] = pj.get_current_loc(); \ - pj.write_tape(0, c); /* here the compilers knows what c is so this gets optimized */ \ - SET_GOTO_ARRAY_CONTINUE() \ - /* we found an object inside an array, so we need to increment the depth */ \ - depth++; \ - if (depth >= pj.depthcapacity) { \ - goto fail; \ - } \ - \ - goto object_begin; \ - } \ - case '[': { \ - /* we have not yet encountered ] so we need to come back for it */ \ - pj.containing_scope_offset[depth] = pj.get_current_loc(); \ - pj.write_tape(0, c); /* here the compilers knows what c is so this gets optimized */ \ - SET_GOTO_ARRAY_CONTINUE() \ - /* we found an array inside an array, so we need to increment the depth */ \ - depth++; \ - if (depth >= pj.depthcapacity) { \ - goto fail; \ - } \ - goto array_begin; \ - } \ - default: \ - goto fail; \ - } \ - \ -array_continue: \ - UPDATE_CHAR(); \ - switch (c) { \ - case ',': \ - UPDATE_CHAR(); \ - goto main_array_switch; \ - case ']': \ - goto scope_end; \ - default: \ - goto fail; \ - } \ - \ - /*//////////////////////////// FINAL STATES ///////////////////////////// */ \ - \ -succeed: \ - depth --; \ - if(depth != 0) { \ - fprintf(stderr, "internal bug\n"); \ - abort(); \ - } \ - if(pj.containing_scope_offset[depth] != 0) { \ - fprintf(stderr, "internal bug\n"); \ - abort(); \ - } \ - pj.annotate_previousloc(pj.containing_scope_offset[depth], \ - pj.get_current_loc()); \ - pj.write_tape(pj.containing_scope_offset[depth], 'r'); /* r is root */ \ - \ - pj.isvalid = true; \ - pj.errorcode = simdjson::SUCCESS; \ - return pj.errorcode; \ -fail: \ - /* we do not need the next line because this is done by pj.init(), pessimistically. */ \ - /* pj.isvalid = false; */ \ - /* At this point in the code, we have all the time in the world. */ \ - /* Note that we know exactly where we are in the document so we could, */ \ - /* without any overhead on the processing code, report a specific location. */ \ - /* We could even trigger special code paths to assess what happened carefully, */ \ - /* all without any added cost. */ \ - if (depth >= pj.depthcapacity) { \ - pj.errorcode = simdjson::DEPTH_ERROR; \ - return pj.errorcode; \ - } \ - switch(c) { \ - case '"': \ - pj.errorcode = simdjson::STRING_ERROR; \ - return pj.errorcode; \ - case '0': \ - case '1': \ - case '2': \ - case '3': \ - case '4': \ - case '5': \ - case '6': \ - case '7': \ - case '8': \ - case '9': \ - case '-': \ - pj.errorcode = simdjson::NUMBER_ERROR; \ - return pj.errorcode; \ - case 't': \ - pj.errorcode = simdjson::T_ATOM_ERROR; \ - return pj.errorcode; \ - case 'n': \ - pj.errorcode = simdjson::N_ATOM_ERROR; \ - return pj.errorcode; \ - case 'f': \ - pj.errorcode = simdjson::F_ATOM_ERROR; \ - return pj.errorcode; \ - default: \ - break; \ - } \ - pj.errorcode = simdjson::TAPE_ERROR; \ - return pj.errorcode; \ -} \ - - -} +// We need to compile that code for multiple architectures. However, target +// attributes can be used only once by function definition. Huge macro seemed +// better than huge code duplication. int UNIFIED_MACHINE(const uint8_t *buf, +// size_t len, ParsedJson &pj) +#define UNIFIED_MACHINE(T, buf, len, pj) \ + { \ + if (ALLOW_SAME_PAGE_BUFFER_OVERRUN) { \ + memset((uint8_t *)buf + len, 0, \ + SIMDJSON_PADDING); /* to please valgrind */ \ + } \ + uint32_t i = 0; /* index of the structural character (0,1,2,3...) */ \ + uint32_t \ + idx; /* location of the structural character in the input (buf) */ \ + uint8_t c; /* used to track the (structural) character we are looking at, \ + updated */ \ + /* by UPDATE_CHAR macro */ \ + uint32_t depth = 0; /* could have an arbitrary starting depth */ \ + pj.init(); /* sets is_valid to false */ \ + if (pj.byte_capacity < len) { \ + pj.error_code = simdjson::CAPACITY; \ + return pj.error_code; \ + } \ + \ + /*//////////////////////////// START STATE ///////////////////////////// \ + */ \ + SET_GOTO_START_CONTINUE() \ + pj.containing_scope_offset[depth] = pj.get_current_loc(); \ + pj.write_tape(0, 'r'); /* r for root, 0 is going to get overwritten */ \ + /* the root is used, if nothing else, to capture the size of the tape */ \ + depth++; /* everything starts at depth = 1, depth = 0 is just for the \ + root, the root may contain an object, an array or something \ + else. */ \ + if (depth >= pj.depth_capacity) { \ + goto fail; \ + } \ + \ + UPDATE_CHAR(); \ + switch (c) { \ + case '{': \ + pj.containing_scope_offset[depth] = pj.get_current_loc(); \ + SET_GOTO_START_CONTINUE(); \ + depth++; \ + if (depth >= pj.depth_capacity) { \ + goto fail; \ + } \ + pj.write_tape( \ + 0, \ + c); /* strangely, moving this to object_begin slows things down */ \ + goto object_begin; \ + case '[': \ + pj.containing_scope_offset[depth] = pj.get_current_loc(); \ + SET_GOTO_START_CONTINUE(); \ + depth++; \ + if (depth >= pj.depth_capacity) { \ + goto fail; \ + } \ + pj.write_tape(0, c); \ + goto array_begin; \ + /* #define SIMDJSON_ALLOWANYTHINGINROOT \ + * A JSON text is a serialized value. Note that certain previous \ + * specifications of JSON constrained a JSON text to be an object or an \ + * array. Implementations that generate only objects or arrays where a \ + * JSON text is called for will be interoperable in the sense that all \ + * implementations will accept these as conforming JSON texts. \ + * https://tools.ietf.org/html/rfc8259 \ + * #ifdef SIMDJSON_ALLOWANYTHINGINROOT */ \ + case '"': { \ + if (!parse_string(buf, len, pj, depth, idx)) { \ + goto fail; \ + } \ + break; \ + } \ + case 't': { \ + /* we need to make a copy to make sure that the string is space \ + * terminated. \ + * this only applies to the JSON document made solely of the true value. \ + * this will almost never be called in practice */ \ + char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); \ + if (copy == nullptr) { \ + goto fail; \ + } \ + memcpy(copy, buf, len); \ + copy[len] = ' '; \ + if (!is_valid_true_atom(reinterpret_cast(copy) + \ + idx)) { \ + free(copy); \ + goto fail; \ + } \ + free(copy); \ + pj.write_tape(0, c); \ + break; \ + } \ + case 'f': { \ + /* we need to make a copy to make sure that the string is space \ + * terminated. \ + * this only applies to the JSON document made solely of the false \ + * value. \ + * this will almost never be called in practice */ \ + char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); \ + if (copy == nullptr) { \ + goto fail; \ + } \ + memcpy(copy, buf, len); \ + copy[len] = ' '; \ + if (!is_valid_false_atom(reinterpret_cast(copy) + \ + idx)) { \ + free(copy); \ + goto fail; \ + } \ + free(copy); \ + pj.write_tape(0, c); \ + break; \ + } \ + case 'n': { \ + /* we need to make a copy to make sure that the string is space \ + * terminated. \ + * this only applies to the JSON document made solely of the null value. \ + * this will almost never be called in practice */ \ + char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); \ + if (copy == nullptr) { \ + goto fail; \ + } \ + memcpy(copy, buf, len); \ + copy[len] = ' '; \ + if (!is_valid_null_atom(reinterpret_cast(copy) + \ + idx)) { \ + free(copy); \ + goto fail; \ + } \ + free(copy); \ + pj.write_tape(0, c); \ + break; \ + } \ + case '0': \ + case '1': \ + case '2': \ + case '3': \ + case '4': \ + case '5': \ + case '6': \ + case '7': \ + case '8': \ + case '9': { \ + /* we need to make a copy to make sure that the string is space \ + * terminated. \ + * this is done only for JSON documents made of a sole number \ + * this will almost never be called in practice. We terminate with a \ + * space \ + * because we do not want to allow NULLs in the middle of a number \ + * (whereas a \ + * space in the middle of a number would be identified in stage 1). */ \ + char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); \ + if (copy == nullptr) { \ + goto fail; \ + } \ + memcpy(copy, buf, len); \ + copy[len] = ' '; \ + if (!parse_number(reinterpret_cast(copy), pj, idx, \ + false)) { \ + free(copy); \ + goto fail; \ + } \ + free(copy); \ + break; \ + } \ + case '-': { \ + /* we need to make a copy to make sure that the string is NULL \ + * terminated. \ + * this is done only for JSON documents made of a sole number \ + * this will almost never be called in practice */ \ + char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); \ + if (copy == nullptr) { \ + goto fail; \ + } \ + memcpy(copy, buf, len); \ + copy[len] = ' '; \ + if (!parse_number(reinterpret_cast(copy), pj, idx, \ + true)) { \ + free(copy); \ + goto fail; \ + } \ + free(copy); \ + break; \ + } \ + default: \ + goto fail; \ + } \ + start_continue: \ + /* the string might not be NULL terminated. */ \ + if (i + 1 == pj.n_structural_indexes) { \ + goto succeed; \ + } else { \ + goto fail; \ + } \ + /*//////////////////////////// OBJECT STATES ///////////////////////////*/ \ + \ + object_begin: \ + UPDATE_CHAR(); \ + switch (c) { \ + case '"': { \ + if (!parse_string(buf, len, pj, depth, idx)) { \ + goto fail; \ + } \ + goto object_key_state; \ + } \ + case '}': \ + goto scope_end; /* could also go to object_continue */ \ + default: \ + goto fail; \ + } \ + \ + object_key_state: \ + UPDATE_CHAR(); \ + if (c != ':') { \ + goto fail; \ + } \ + UPDATE_CHAR(); \ + switch (c) { \ + case '"': { \ + if (!parse_string(buf, len, pj, depth, idx)) { \ + goto fail; \ + } \ + break; \ + } \ + case 't': \ + if (!is_valid_true_atom(buf + idx)) { \ + goto fail; \ + } \ + pj.write_tape(0, c); \ + break; \ + case 'f': \ + if (!is_valid_false_atom(buf + idx)) { \ + goto fail; \ + } \ + pj.write_tape(0, c); \ + break; \ + case 'n': \ + if (!is_valid_null_atom(buf + idx)) { \ + goto fail; \ + } \ + pj.write_tape(0, c); \ + break; \ + case '0': \ + case '1': \ + case '2': \ + case '3': \ + case '4': \ + case '5': \ + case '6': \ + case '7': \ + case '8': \ + case '9': { \ + if (!parse_number(buf, pj, idx, false)) { \ + goto fail; \ + } \ + break; \ + } \ + case '-': { \ + if (!parse_number(buf, pj, idx, true)) { \ + goto fail; \ + } \ + break; \ + } \ + case '{': { \ + pj.containing_scope_offset[depth] = pj.get_current_loc(); \ + pj.write_tape(0, c); /* here the compilers knows what c is so this gets \ + optimized */ \ + /* we have not yet encountered } so we need to come back for it */ \ + SET_GOTO_OBJECT_CONTINUE() \ + /* we found an object inside an object, so we need to increment the \ + * depth */ \ + depth++; \ + if (depth >= pj.depth_capacity) { \ + goto fail; \ + } \ + \ + goto object_begin; \ + } \ + case '[': { \ + pj.containing_scope_offset[depth] = pj.get_current_loc(); \ + pj.write_tape(0, c); /* here the compilers knows what c is so this gets \ + optimized */ \ + /* we have not yet encountered } so we need to come back for it */ \ + SET_GOTO_OBJECT_CONTINUE() \ + /* we found an array inside an object, so we need to increment the depth \ + */ \ + depth++; \ + if (depth >= pj.depth_capacity) { \ + goto fail; \ + } \ + goto array_begin; \ + } \ + default: \ + goto fail; \ + } \ + \ + object_continue: \ + UPDATE_CHAR(); \ + switch (c) { \ + case ',': \ + UPDATE_CHAR(); \ + if (c != '"') { \ + goto fail; \ + } else { \ + if (!parse_string(buf, len, pj, depth, idx)) { \ + goto fail; \ + } \ + goto object_key_state; \ + } \ + case '}': \ + goto scope_end; \ + default: \ + goto fail; \ + } \ + \ + /*//////////////////////////// COMMON STATE ///////////////////////////*/ \ + \ + scope_end: \ + /* write our tape location to the header scope */ \ + depth--; \ + pj.write_tape(pj.containing_scope_offset[depth], c); \ + pj.annotate_previous_loc(pj.containing_scope_offset[depth], \ + pj.get_current_loc()); \ + /* goto saved_state */ \ + GOTO_CONTINUE() \ + \ + /*//////////////////////////// ARRAY STATES ///////////////////////////*/ \ + array_begin: \ + UPDATE_CHAR(); \ + if (c == ']') { \ + goto scope_end; /* could also go to array_continue */ \ + } \ + \ + main_array_switch: \ + /* we call update char on all paths in, so we can peek at c on the \ + * on paths that can accept a close square brace (post-, and at start) */ \ + switch (c) { \ + case '"': { \ + if (!parse_string(buf, len, pj, depth, idx)) { \ + goto fail; \ + } \ + break; \ + } \ + case 't': \ + if (!is_valid_true_atom(buf + idx)) { \ + goto fail; \ + } \ + pj.write_tape(0, c); \ + break; \ + case 'f': \ + if (!is_valid_false_atom(buf + idx)) { \ + goto fail; \ + } \ + pj.write_tape(0, c); \ + break; \ + case 'n': \ + if (!is_valid_null_atom(buf + idx)) { \ + goto fail; \ + } \ + pj.write_tape(0, c); \ + break; /* goto array_continue; */ \ + \ + case '0': \ + case '1': \ + case '2': \ + case '3': \ + case '4': \ + case '5': \ + case '6': \ + case '7': \ + case '8': \ + case '9': { \ + if (!parse_number(buf, pj, idx, false)) { \ + goto fail; \ + } \ + break; /* goto array_continue; */ \ + } \ + case '-': { \ + if (!parse_number(buf, pj, idx, true)) { \ + goto fail; \ + } \ + break; /* goto array_continue; */ \ + } \ + case '{': { \ + /* we have not yet encountered ] so we need to come back for it */ \ + pj.containing_scope_offset[depth] = pj.get_current_loc(); \ + pj.write_tape(0, c); /* here the compilers knows what c is so this gets \ + optimized */ \ + SET_GOTO_ARRAY_CONTINUE() \ + /* we found an object inside an array, so we need to increment the depth \ + */ \ + depth++; \ + if (depth >= pj.depth_capacity) { \ + goto fail; \ + } \ + \ + goto object_begin; \ + } \ + case '[': { \ + /* we have not yet encountered ] so we need to come back for it */ \ + pj.containing_scope_offset[depth] = pj.get_current_loc(); \ + pj.write_tape(0, c); /* here the compilers knows what c is so this gets \ + optimized */ \ + SET_GOTO_ARRAY_CONTINUE() \ + /* we found an array inside an array, so we need to increment the depth \ + */ \ + depth++; \ + if (depth >= pj.depth_capacity) { \ + goto fail; \ + } \ + goto array_begin; \ + } \ + default: \ + goto fail; \ + } \ + \ + array_continue: \ + UPDATE_CHAR(); \ + switch (c) { \ + case ',': \ + UPDATE_CHAR(); \ + goto main_array_switch; \ + case ']': \ + goto scope_end; \ + default: \ + goto fail; \ + } \ + \ + /*//////////////////////////// FINAL STATES ///////////////////////////*/ \ + \ + succeed: \ + depth--; \ + if (depth != 0) { \ + fprintf(stderr, "internal bug\n"); \ + abort(); \ + } \ + if (pj.containing_scope_offset[depth] != 0) { \ + fprintf(stderr, "internal bug\n"); \ + abort(); \ + } \ + pj.annotate_previous_loc(pj.containing_scope_offset[depth], \ + pj.get_current_loc()); \ + pj.write_tape(pj.containing_scope_offset[depth], 'r'); /* r is root */ \ + \ + pj.valid = true; \ + pj.error_code = simdjson::SUCCESS; \ + return pj.error_code; \ + fail: \ + /* we do not need the next line because this is done by pj.init(), \ + * pessimistically. \ + * pj.is_valid = false; \ + * At this point in the code, we have all the time in the world. \ + * Note that we know exactly where we are in the document so we could, \ + * without any overhead on the processing code, report a specific \ + * location. \ + * We could even trigger special code paths to assess what happened \ + * carefully, \ + * all without any added cost. */ \ + if (depth >= pj.depth_capacity) { \ + pj.error_code = simdjson::DEPTH_ERROR; \ + return pj.error_code; \ + } \ + switch (c) { \ + case '"': \ + pj.error_code = simdjson::STRING_ERROR; \ + return pj.error_code; \ + case '0': \ + case '1': \ + case '2': \ + case '3': \ + case '4': \ + case '5': \ + case '6': \ + case '7': \ + case '8': \ + case '9': \ + case '-': \ + pj.error_code = simdjson::NUMBER_ERROR; \ + return pj.error_code; \ + case 't': \ + pj.error_code = simdjson::T_ATOM_ERROR; \ + return pj.error_code; \ + case 'n': \ + pj.error_code = simdjson::N_ATOM_ERROR; \ + return pj.error_code; \ + case 'f': \ + pj.error_code = simdjson::F_ATOM_ERROR; \ + return pj.error_code; \ + default: \ + break; \ + } \ + pj.error_code = simdjson::TAPE_ERROR; \ + return pj.error_code; \ + } +} // namespace simdjson #ifdef IS_X86_64 TARGET_HASWELL namespace simdjson { -template<> -WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER -int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) { - UNIFIED_MACHINE(architecture::haswell, buf, len, pj); -} +template <> +WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER int +unified_machine(const uint8_t *buf, size_t len, + ParsedJson &pj) { + UNIFIED_MACHINE(Architecture::HASWELL, buf, len, pj); } +} // namespace simdjson UNTARGET_REGION TARGET_WESTMERE namespace simdjson { -template<> -WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER -int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) { - UNIFIED_MACHINE(architecture::westmere, buf, len, pj); -} +template <> +WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER int +unified_machine(const uint8_t *buf, size_t len, + ParsedJson &pj) { + UNIFIED_MACHINE(Architecture::WESTMERE, buf, len, pj); } +} // namespace simdjson UNTARGET_REGION #endif // IS_X86_64 #ifdef IS_ARM64 namespace simdjson { -template<> -WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER -int unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj) { - UNIFIED_MACHINE(architecture::arm64, buf, len, pj); -} +template <> +WARN_UNUSED ALLOW_SAME_PAGE_BUFFER_OVERRUN_QUALIFIER LENIENT_MEM_SANITIZER int +unified_machine(const uint8_t *buf, size_t len, + ParsedJson &pj) { + UNIFIED_MACHINE(Architecture::ARM64, buf, len, pj); } +} // namespace simdjson #endif - - diff --git a/tests/allparserscheckfile.cpp b/tests/allparserscheckfile.cpp index d67cb96f..4da58679 100644 --- a/tests/allparserscheckfile.cpp +++ b/tests/allparserscheckfile.cpp @@ -40,7 +40,7 @@ using namespace rapidjson; int main(int argc, char *argv[]) { bool verbose = false; - bool justfavorites = false; + bool just_favorites = false; int c; while ((c = getopt(argc, argv, "vm")) != -1) switch (c) { @@ -48,7 +48,7 @@ int main(int argc, char *argv[]) { verbose = true; break; case 'm': - justfavorites = true; + just_favorites = true; break; default: abort(); @@ -77,8 +77,8 @@ int main(int argc, char *argv[]) { std::cout << std::endl; } simdjson::ParsedJson pj; - size_t maxdepth = 1024 * 4; - bool allocok = pj.allocateCapacity(p.size(), maxdepth); + size_t max_depth = 1024 * 4; + bool allocok = pj.allocate_capacity(p.size(), max_depth); if (!allocok) { std::cerr << "can't allocate memory" << std::endl; return EXIT_FAILURE; @@ -98,7 +98,7 @@ int main(int argc, char *argv[]) { sajson::parse(sajson::dynamic_allocation(), sajson::mutable_string_view(p.size(), buffer)) .is_valid(); - if (justfavorites) { + if (just_favorites) { printf("our parser : %s \n", ours_correct ? "correct" : "invalid"); printf("rapid (check encoding) : %s \n", @@ -108,7 +108,7 @@ int main(int argc, char *argv[]) { if (oursreturn == simdjson::DEPTH_ERROR) { printf("simdjson encountered a DEPTH_ERROR, it was parametrized to " "reject documents with depth exceeding %zu.\n", - maxdepth); + max_depth); } if ((ours_correct != rapid_correct_checkencoding) || (rapid_correct_checkencoding != sajson_correct) || @@ -157,12 +157,12 @@ int main(int argc, char *argv[]) { } Json::CharReaderBuilder b; - Json::CharReader *jsoncppreader = b.newCharReader(); + Json::CharReader *json_cpp_reader = b.newCharReader(); Json::Value root; Json::String errs; - bool isjsoncppok = - jsoncppreader->parse(buffer, buffer + p.size(), &root, &errs); - delete jsoncppreader; + bool is_json_cpp_ok = + json_cpp_reader->parse(buffer, buffer + p.size(), &root, &errs); + delete json_cpp_reader; printf("our parser : %s \n", ours_correct ? "correct" : "invalid"); @@ -185,7 +185,7 @@ int main(int argc, char *argv[]) { printf("cjson : %s \n", cjson_correct ? "correct" : "invalid"); printf("jsoncpp : %s \n", - isjsoncppok ? "correct" : "invalid"); + is_json_cpp_ok ? "correct" : "invalid"); free(buffer); return EXIT_SUCCESS; diff --git a/tests/basictests.cpp b/tests/basictests.cpp index 10bd0732..06ddc214 100644 --- a/tests/basictests.cpp +++ b/tests/basictests.cpp @@ -15,10 +15,10 @@ bool skyprophet_test() { std::vector data; char buf[1024]; for (size_t i = 0; i < n_records; ++i) { - auto n = - sprintf(buf, "{\"id\": %zu, \"name\": \"name%zu\", \"gender\": \"%s\", " + auto n = sprintf(buf, + "{\"id\": %zu, \"name\": \"name%zu\", \"gender\": \"%s\", " "\"school\": {\"id\": %zu, \"name\": \"school%zu\"}}", - i, i, (i % 2) ? "male" : "female", i % 10, i % 10); + i, i, (i % 2) ? "male" : "female", i % 10, i % 10); data.emplace_back(std::string(buf, n)); } for (size_t i = 0; i < n_records; ++i) { @@ -40,7 +40,7 @@ bool skyprophet_test() { maxsize = s.size(); } simdjson::ParsedJson pj; - if (!pj.allocateCapacity(maxsize)) { + if (!pj.allocate_capacity(maxsize)) { printf("allocation failure in skyprophet_test\n"); return false; } @@ -52,12 +52,12 @@ bool skyprophet_test() { } counter++; auto ok1 = json_parse(rec.c_str(), rec.length(), pj); - if (ok1 != 0 || !pj.isValid()) { + if (ok1 != 0 || !pj.is_valid()) { printf("Something is wrong in skyprophet_test: %s.\n", rec.c_str()); return false; } auto ok2 = json_parse(rec, pj); - if (ok2 != 0 || !pj.isValid()) { + if (ok2 != 0 || !pj.is_valid()) { printf("Something is wrong in skyprophet_test: %s.\n", rec.c_str()); return false; } diff --git a/tests/jsoncheck.cpp b/tests/jsoncheck.cpp index 8646a5fc..4c861bf0 100644 --- a/tests/jsoncheck.cpp +++ b/tests/jsoncheck.cpp @@ -17,14 +17,14 @@ /** * Does the file filename ends with the given extension. */ -static bool hasExtension(const char *filename, const char *extension) { +static bool has_extension(const char *filename, const char *extension) { const char *ext = strrchr(filename, '.'); return ((ext != nullptr) && (strcmp(ext, extension) == 0)); } -bool startsWith(const char *pre, const char *str) { - size_t lenpre = strlen(pre), lenstr = strlen(str); - return lenstr < lenpre ? false : strncmp(pre, str, lenpre) == 0; +bool starts_with(const char *pre, const char *str) { + size_t len_pre = strlen(pre), len_str = strlen(str); + return len_str < len_pre ? false : strncmp(pre, str, len_pre) == 0; } bool contains(const char *pre, const char *str) { @@ -32,7 +32,7 @@ bool contains(const char *pre, const char *str) { } bool validate(const char *dirname) { - bool everythingfine = true; + bool everything_fine = true; const char *extension = ".json"; size_t dirlen = strlen(dirname); struct dirent **entry_list; @@ -45,15 +45,15 @@ bool validate(const char *dirname) { printf("nothing in dir %s \n", dirname); return false; } - bool *isfileasexpected = new bool[c]; + bool *is_file_as_expected = new bool[c]; for (int i = 0; i < c; i++) { - isfileasexpected[i] = true; + is_file_as_expected[i] = true; } - size_t howmany = 0; + size_t how_many = 0; bool needsep = (strlen(dirname) > 1) && (dirname[strlen(dirname) - 1] != '/'); for (int i = 0; i < c; i++) { const char *name = entry_list[i]->d_name; - if (hasExtension(name, extension)) { + if (has_extension(name, extension)) { printf("validating: file %s ", name); fflush(nullptr); size_t filelen = strlen(name); @@ -73,38 +73,38 @@ bool validate(const char *dirname) { return EXIT_FAILURE; } simdjson::ParsedJson pj; - bool allocok = pj.allocateCapacity(p.size(), 1024); + bool allocok = pj.allocate_capacity(p.size(), 1024); if (!allocok) { std::cerr << "can't allocate memory" << std::endl; return false; } - ++howmany; - const int parseRes = json_parse(p, pj); - printf("%s\n", parseRes == 0 ? "ok" : "invalid"); + ++how_many; + const int parse_res = json_parse(p, pj); + printf("%s\n", parse_res == 0 ? "ok" : "invalid"); if (contains("EXCLUDE", name)) { // skipping - howmany--; - } else if (startsWith("pass", name) && parseRes != 0) { - isfileasexpected[i] = false; + how_many--; + } else if (starts_with("pass", name) && parse_res != 0) { + is_file_as_expected[i] = false; printf("warning: file %s should pass but it fails. Error is: %s\n", - name, simdjson::errorMsg(parseRes).data()); - everythingfine = false; - } else if (startsWith("fail", name) && parseRes == 0) { - isfileasexpected[i] = false; + name, simdjson::error_message(parse_res).data()); + everything_fine = false; + } else if (starts_with("fail", name) && parse_res == 0) { + is_file_as_expected[i] = false; printf("warning: file %s should fail but it passes.\n", name); - everythingfine = false; + everything_fine = false; } free(fullpath); } } - printf("%zu files checked.\n", howmany); - if (everythingfine) { + printf("%zu files checked.\n", how_many); + if (everything_fine) { printf("All ok!\n"); } else { fprintf(stderr, "There were problems! Consider reviewing the following files:\n"); for (int i = 0; i < c; i++) { - if (!isfileasexpected[i]) { + if (!is_file_as_expected[i]) { fprintf(stderr, "%s \n", entry_list[i]->d_name); } } @@ -113,8 +113,8 @@ bool validate(const char *dirname) { free(entry_list[i]); } free(entry_list); - delete[] isfileasexpected; - return everythingfine; + delete[] is_file_as_expected; + return everything_fine; } int main(int argc, char *argv[]) { diff --git a/tests/numberparsingcheck.cpp b/tests/numberparsingcheck.cpp index 94089daf..14f92413 100644 --- a/tests/numberparsingcheck.cpp +++ b/tests/numberparsingcheck.cpp @@ -13,31 +13,30 @@ #include "simdjson/common_defs.h" - -// ulp distance +// ulp distance // Marc B. Reynolds, 2016-2019 // Public Domain under http://unlicense.org, see link for details. // adapted by D. Lemire inline uint32_t f32_ulp_dist(float a, float b) { uint32_t ua, ub; - memcpy(&ua, &a, sizeof(ua)); - memcpy(&ub, &b, sizeof(ub)); - if ((int32_t)(ub^ua) >= 0) - return (int32_t)(ua-ub) >= 0 ? (ua-ub) : (ub-ua); - return ua+ub+0x80000000; + memcpy(&ua, &a, sizeof(ua)); + memcpy(&ub, &b, sizeof(ub)); + if ((int32_t)(ub ^ ua) >= 0) + return (int32_t)(ua - ub) >= 0 ? (ua - ub) : (ub - ua); + return ua + ub + 0x80000000; } -// ulp distance +// ulp distance // Marc B. Reynolds, 2016-2019 // Public Domain under http://unlicense.org, see link for details. // adapted by D. Lemire inline uint64_t f64_ulp_dist(double a, double b) { uint64_t ua, ub; - memcpy(&ua, &a, sizeof(ua)); - memcpy(&ub, &b, sizeof(ub)); - if ((int64_t)(ub^ua) >= 0) - return (int64_t)(ua-ub) >= 0 ? (ua-ub) : (ub-ua); - return ua+ub+0x80000000; + memcpy(&ua, &a, sizeof(ua)); + memcpy(&ub, &b, sizeof(ub)); + if ((int64_t)(ub ^ ua) >= 0) + return (int64_t)(ua - ub) >= 0 ? (ua - ub) : (ub - ua); + return ua + ub + 0x80000000; } int parse_error; @@ -51,7 +50,7 @@ size_t invalid_count; // strings that start with these should not be parsed as numbers const char *really_bad[] = {"013}", "0x14", "0e]", "0e+]", "0e+-1]"}; -bool startsWith(const char *pre, const char *str) { +bool starts_with(const char *pre, const char *str) { size_t lenpre = strlen(pre); return strncmp(pre, str, lenpre) == 0; } @@ -60,27 +59,27 @@ bool is_in_bad_list(const char *buf) { if (buf[0] != '0') return false; for (size_t i = 0; i < sizeof(really_bad) / sizeof(really_bad[0]); i++) - if (startsWith(really_bad[i], buf)) + if (starts_with(really_bad[i], buf)) return true; return false; } -void foundInvalidNumber(const uint8_t *buf) { +void found_invalid_number(const uint8_t *buf) { invalid_count++; char *endptr; double expected = strtod((const char *)buf, &endptr); if (endptr != (const char *)buf) { if (!is_in_bad_list((const char *)buf)) { - printf( - "Warning: foundInvalidNumber %.32s whereas strtod parses it to %f, ", - buf, expected); + printf("Warning: found_invalid_number %.32s whereas strtod parses it to " + "%f, ", + buf, expected); printf(" while parsing %s \n", fullpath); parse_error |= PARSE_WARNING; } } } -void foundInteger(int64_t result, const uint8_t *buf) { +void found_integer(int64_t result, const uint8_t *buf) { int_count++; char *endptr; long long expected = strtoll((const char *)buf, &endptr, 10); @@ -91,7 +90,7 @@ void foundInteger(int64_t result, const uint8_t *buf) { } } -void foundFloat(double result, const uint8_t *buf) { +void found_float(double result, const uint8_t *buf) { char *endptr; float_count++; double expected = strtod((const char *)buf, &endptr); @@ -111,8 +110,8 @@ void foundFloat(double result, const uint8_t *buf) { return; } // we want to get some reasonable relative accuracy - uint64_t ULP = f64_ulp_dist(expected,result); - if (f64_ulp_dist(expected,result) > 1) { + uint64_t ULP = f64_ulp_dist(expected, result); + if (f64_ulp_dist(expected, result) > 1) { fprintf(stderr, "parsed %.128e from \n", result); fprintf(stderr, " %.32s whereas strtod gives\n", buf); fprintf(stderr, " %.128e,", expected); @@ -128,7 +127,7 @@ void foundFloat(double result, const uint8_t *buf) { /** * Does the file filename ends with the given extension. */ -static bool hasExtension(const char *filename, const char *extension) { +static bool has_extension(const char *filename, const char *extension) { const char *ext = strrchr(filename, '.'); return (ext && !strcmp(ext, extension)); } @@ -151,7 +150,7 @@ bool validate(const char *dirname) { bool needsep = (strlen(dirname) > 1) && (dirname[strlen(dirname) - 1] != '/'); for (int i = 0; i < c; i++) { const char *name = entry_list[i]->d_name; - if (hasExtension(name, extension)) { + if (has_extension(name, extension)) { size_t filelen = strlen(name); fullpath = (char *)malloc(dirlen + filelen + 1 + 1); strcpy(fullpath, dirname); @@ -170,7 +169,7 @@ bool validate(const char *dirname) { } // terrible hack but just to get it working simdjson::ParsedJson pj; - bool allocok = pj.allocateCapacity(p.size(), 1024); + bool allocok = pj.allocate_capacity(p.size(), 1024); if (!allocok) { std::cerr << "can't allocate memory" << std::endl; return false; diff --git a/tests/pointercheck.cpp b/tests/pointercheck.cpp index 28b7e6c5..764fecd9 100644 --- a/tests/pointercheck.cpp +++ b/tests/pointercheck.cpp @@ -4,34 +4,35 @@ #include "simdjson/parsedjson.h" int main() { - // {"/~01abc": [0, {"\\\" 0": ["value0", "value1"]}]}" - std::string json = "{\"/~01abc\": [0, {\"\\\\\\\" 0\": [\"value0\", \"value1\"]}]}"; - simdjson::ParsedJson pj; - assert(pj.allocateCapacity(json.length())); - simdjson::json_parse(json.c_str(), json.length(), pj); - assert(pj.isValid()); - simdjson::ParsedJson::iterator it(pj); + // {"/~01abc": [0, {"\\\" 0": ["value0", "value1"]}]}" + std::string json = + "{\"/~01abc\": [0, {\"\\\\\\\" 0\": [\"value0\", \"value1\"]}]}"; + simdjson::ParsedJson pj; + assert(pj.allocate_capacity(json.length())); + simdjson::json_parse(json.c_str(), json.length(), pj); + assert(pj.is_valid()); + simdjson::ParsedJson::Iterator it(pj); - // valid JSON String Representation pointer - std::string pointer1("/~1~001abc/1/\\\\\\\" 0/0"); - assert(it.move_to(pointer1.c_str(), pointer1.length())); - assert(it.is_string()); - assert(it.get_string() == std::string("value0")); + // valid JSON String Representation pointer + std::string pointer1("/~1~001abc/1/\\\\\\\" 0/0"); + assert(it.move_to(pointer1.c_str(), pointer1.length())); + assert(it.is_string()); + assert(it.get_string() == std::string("value0")); - // valid URI Fragment Identifier Representation pointer - std::string pointer2("#/~1~001abc/1/%x5C%x22%x200/1"); - assert(it.move_to(pointer2.c_str(), pointer2.length())); - assert(it.is_string()); - assert(it.get_string() == std::string("value1")); + // valid URI Fragment Identifier Representation pointer + std::string pointer2("#/~1~001abc/1/%x5C%x22%x200/1"); + assert(it.move_to(pointer2.c_str(), pointer2.length())); + assert(it.is_string()); + assert(it.get_string() == std::string("value1")); - // invalid pointer with leading 0 in index - std::string pointer3("#/~1~001abc/01"); - assert(!it.move_to(pointer3.c_str(), pointer3.length())); // failed - assert(it.is_string()); // has probably not moved - assert(it.get_string() == std::string("value1")); // has not move + // invalid pointer with leading 0 in index + std::string pointer3("#/~1~001abc/01"); + assert(!it.move_to(pointer3.c_str(), pointer3.length())); // failed + assert(it.is_string()); // has probably not moved + assert(it.get_string() == std::string("value1")); // has not move - // "the (nonexistent) member after the last array element" - std::string pointer4("/~1~001abc/-"); - assert(it.move_to(pointer4.c_str(), pointer4.length())); - assert(it.get_type() == ']'); + // "the (nonexistent) member after the last array element" + std::string pointer4("/~1~001abc/-"); + assert(it.move_to(pointer4.c_str(), pointer4.length())); + assert(it.get_type() == ']'); } diff --git a/tests/singleheadertest.cpp b/tests/singleheadertest.cpp index b75ec8ae..8a3eb837 100644 --- a/tests/singleheadertest.cpp +++ b/tests/singleheadertest.cpp @@ -7,15 +7,15 @@ int main() { const char *filename = JSON_TEST_PATH; padded_string p = get_corpus(filename); ParsedJson pj = build_parsed_json(p); // do the parsing - if (!pj.isValid()) { + if (!pj.is_valid()) { return EXIT_FAILURE; } - if (!pj.allocateCapacity(p.size())) { + if (!pj.allocate_capacity(p.size())) { return EXIT_FAILURE; } const int res = json_parse(p, pj); if (res) { - std::cerr << errorMsg(res) << std::endl; + std::cerr << error_message(res) << std::endl; return EXIT_FAILURE; } return EXIT_SUCCESS; diff --git a/tests/stringparsingcheck.cpp b/tests/stringparsingcheck.cpp index f78aea4c..9424c4c2 100644 --- a/tests/stringparsingcheck.cpp +++ b/tests/stringparsingcheck.cpp @@ -1,8 +1,8 @@ #include +#include #include #include #include -#include #include #include #include @@ -72,7 +72,7 @@ static bool parse_string(const char *p, char *output, char **end) { for (;;) { #if (CHAR_MIN < 0) || (!defined(CHAR_MIN)) // the '!defined' is just paranoia - // in this path, char is *signed* + // in this path, char is *signed* if ((*p >= 0 && *p < 0x20)) { return false; // unescaped } @@ -209,12 +209,12 @@ static bool parse_string(const char *p, char *output, char **end) { } } // end of borrowed code -char *bigbuffer; // global variable +char *big_buffer; // global variable -void foundBadString(const uint8_t *buf) { +void found_bad_string(const uint8_t *buf) { bad_string++; char *end; - if (parse_string((const char *)buf, bigbuffer, &end)) { + if (parse_string((const char *)buf, big_buffer, &end)) { printf("WARNING: Sajson-like parser seems to think that the string is " "valid %32s \n", buf); @@ -234,18 +234,18 @@ void print_cmp_hex(const char *s1, const char *s2, size_t len) { } } -void foundString(const uint8_t *buf, const uint8_t *parsed_begin, - const uint8_t *parsed_end) { - size_t thislen = parsed_end - parsed_begin; - total_string_length += thislen; +void found_string(const uint8_t *buf, const uint8_t *parsed_begin, + const uint8_t *parsed_end) { + size_t this_len = parsed_end - parsed_begin; + total_string_length += this_len; good_string++; char *end = NULL; - if (!parse_string((const char *)buf, bigbuffer, &end)) { + if (!parse_string((const char *)buf, big_buffer, &end)) { printf("WARNING: reference parser seems to think that the string is NOT " "valid %32s \n", buf); } - if (end == bigbuffer) { + if (end == big_buffer) { // we have a zero-length string if (parsed_begin != parsed_end) { printf("WARNING: We have a zero-length but gap is %zu \n", @@ -255,35 +255,35 @@ void foundString(const uint8_t *buf, const uint8_t *parsed_begin, empty_string++; return; } - size_t len = end - bigbuffer; - if (len != thislen) { - printf("WARNING: lengths on parsed strings disagree %zu %zu \n", thislen, + size_t len = end - big_buffer; + if (len != this_len) { + printf("WARNING: lengths on parsed strings disagree %zu %zu \n", this_len, len); - printf("\nour parsed string : '%*s'\n\n", (int)thislen, + printf("\nour parsed string : '%*s'\n\n", (int)this_len, (const char *)parsed_begin); - print_hex((const char *)parsed_begin, thislen); + print_hex((const char *)parsed_begin, this_len); printf("\n"); - printf("reference parsing :'%*s'\n\n", (int)len, bigbuffer); - print_hex((const char *)bigbuffer, len); + printf("reference parsing :'%*s'\n\n", (int)len, big_buffer); + print_hex((const char *)big_buffer, len); printf("\n"); probable_bug = true; } - if (memcmp(bigbuffer, parsed_begin, thislen) != 0) { + if (memcmp(big_buffer, parsed_begin, this_len) != 0) { printf("WARNING: parsed strings disagree \n"); - printf("Lengths %zu %zu \n", thislen, len); + printf("Lengths %zu %zu \n", this_len, len); - printf("\nour parsed string : '%*s'\n", (int)thislen, + printf("\nour parsed string : '%*s'\n", (int)this_len, (const char *)parsed_begin); - print_hex((const char *)parsed_begin, thislen); + print_hex((const char *)parsed_begin, this_len); printf("\n"); - printf("reference parsing :'%*s'\n", (int)len, bigbuffer); - print_hex((const char *)bigbuffer, len); + printf("reference parsing :'%*s'\n", (int)len, big_buffer); + print_hex((const char *)big_buffer, len); printf("\n"); - print_cmp_hex((const char *)parsed_begin, bigbuffer, thislen); + print_cmp_hex((const char *)parsed_begin, big_buffer, this_len); probable_bug = true; } @@ -295,12 +295,12 @@ void foundString(const uint8_t *buf, const uint8_t *parsed_begin, /** * Does the file filename ends with the given extension. */ -static bool hasExtension(const char *filename, const char *extension) { +static bool has_extension(const char *filename, const char *extension) { const char *ext = strrchr(filename, '.'); return (ext && !strcmp(ext, extension)); } -bool startsWith(const char *pre, const char *str) { +bool starts_with(const char *pre, const char *str) { size_t lenpre = strlen(pre), lenstr = strlen(str); return lenstr < lenpre ? false : strncmp(pre, str, lenpre) == 0; } @@ -323,7 +323,7 @@ bool validate(const char *dirname) { bool needsep = (strlen(dirname) > 1) && (dirname[strlen(dirname) - 1] != '/'); for (int i = 0; i < c; i++) { const char *name = entry_list[i]->d_name; - if (hasExtension(name, extension)) { + if (has_extension(name, extension)) { size_t filelen = strlen(name); fullpath = (char *)malloc(dirlen + filelen + 1 + 1); strcpy(fullpath, dirname); @@ -341,13 +341,13 @@ bool validate(const char *dirname) { return EXIT_FAILURE; } simdjson::ParsedJson pj; - bool allocok = pj.allocateCapacity(p.size(), 1024); + bool allocok = pj.allocate_capacity(p.size(), 1024); if (!allocok) { std::cerr << "can't allocate memory" << std::endl; return false; } - bigbuffer = (char *)malloc(p.size()); - if (bigbuffer == NULL) { + big_buffer = (char *)malloc(p.size()); + if (big_buffer == NULL) { std::cerr << "can't allocate memory" << std::endl; return false; } @@ -356,7 +356,7 @@ bool validate(const char *dirname) { total_string_length = 0; empty_string = 0; bool isok = json_parse(p, pj); - free(bigbuffer); + free(big_buffer); if (good_string > 0) { printf("File %40s %s --- bad strings: %10zu \tgood strings: %10zu\t " "empty strings: %10zu " diff --git a/tools/json2json.cpp b/tools/json2json.cpp index 88de7a73..f9910335 100644 --- a/tools/json2json.cpp +++ b/tools/json2json.cpp @@ -5,7 +5,7 @@ #include "simdjson/jsonioutil.h" #include "simdjson/jsonparser.h" -void compute_dump(simdjson::ParsedJson::iterator &pjh) { +void compute_dump(simdjson::ParsedJson::Iterator &pjh) { if (pjh.is_object()) { std::cout << "{"; if (pjh.down()) { @@ -40,8 +40,8 @@ void compute_dump(simdjson::ParsedJson::iterator &pjh) { } int main(int argc, char *argv[]) { - bool rawdump = false; - bool apidump = false; + bool rawdump = false; + bool apidump = false; #ifndef _MSC_VER int c; @@ -57,7 +57,7 @@ int main(int argc, char *argv[]) { default: abort(); } -} + } #else int optind = 1; #endif @@ -70,7 +70,8 @@ int main(int argc, char *argv[]) { } const char *filename = argv[optind]; if (optind + 1 < argc) { - std::cerr << "warning: ignoring everything after " << argv[optind + 1] << std::endl; + std::cerr << "warning: ignoring everything after " << argv[optind + 1] + << std::endl; } simdjson::padded_string p; try { @@ -80,25 +81,28 @@ int main(int argc, char *argv[]) { return EXIT_FAILURE; } simdjson::ParsedJson pj; - bool allocok = pj.allocateCapacity(p.size(), 1024); + bool allocok = pj.allocate_capacity(p.size(), 1024); if (!allocok) { std::cerr << "failed to allocate memory" << std::endl; return EXIT_FAILURE; } - int res = simdjson::json_parse(p, pj); // do the parsing, return false on error + int res = + simdjson::json_parse(p, pj); // do the parsing, return false on error if (res != simdjson::SUCCESS) { - std::cerr << " Parsing failed. Error is '" << simdjson::errorMsg(res) << "'." << std::endl; + std::cerr << " Parsing failed. Error is '" << simdjson::error_message(res) + << "'." << std::endl; return EXIT_FAILURE; } if (apidump) { - simdjson::ParsedJson::iterator pjh(pj); - if (!pjh.isOk()) { + simdjson::ParsedJson::Iterator pjh(pj); + if (!pjh.is_ok()) { std::cerr << " Could not iterate parsed result. " << std::endl; return EXIT_FAILURE; } compute_dump(pjh); } else { - const bool is_ok = rawdump ? pj.dump_raw_tape(std::cout) : pj.printjson(std::cout); + const bool is_ok = + rawdump ? pj.dump_raw_tape(std::cout) : pj.print_json(std::cout); if (!is_ok) { std::cerr << " Could not print out parsed result. " << std::endl; return EXIT_FAILURE; diff --git a/tools/jsonpointer.cpp b/tools/jsonpointer.cpp index 954d840c..e3a1eec0 100644 --- a/tools/jsonpointer.cpp +++ b/tools/jsonpointer.cpp @@ -1,9 +1,8 @@ -#include #include "simdjson/jsonioutil.h" #include "simdjson/jsonparser.h" +#include - -void compute_dump(simdjson::ParsedJson::iterator &pjh) { +void compute_dump(simdjson::ParsedJson::Iterator &pjh) { if (pjh.is_object()) { std::cout << "{"; if (pjh.down()) { @@ -40,9 +39,16 @@ void compute_dump(simdjson::ParsedJson::iterator &pjh) { int main(int argc, char *argv[]) { if (argc < 3) { std::cerr << "Usage: " << argv[0] << " " << std::endl; - std::cerr << "Follows the rfc6901 standard's syntax: https://tools.ietf.org/html/rfc6901" << std::endl; - std::cerr << " Example: " << argv[0] << " jsonexamples/small/demo.json /Image/Width /Image/Height /Image/IDs/2 " << std::endl; - std::cerr << "Multiple can be issued in the same command, but at least one is needed." << std::endl; + std::cerr << "Follows the rfc6901 standard's syntax: " + "https://tools.ietf.org/html/rfc6901" + << std::endl; + std::cerr << " Example: " << argv[0] + << " jsonexamples/small/demo.json /Image/Width /Image/Height " + "/Image/IDs/2 " + << std::endl; + std::cerr << "Multiple can be issued in the same command, but " + "at least one is needed." + << std::endl; exit(1); } const char *filename = argv[1]; @@ -54,31 +60,33 @@ int main(int argc, char *argv[]) { return EXIT_FAILURE; } simdjson::ParsedJson pj; - bool allocok = pj.allocateCapacity(p.size(), 1024); + bool allocok = pj.allocate_capacity(p.size(), 1024); if (!allocok) { std::cerr << "failed to allocate memory" << std::endl; return EXIT_FAILURE; } - int res = simdjson::json_parse(p, pj); // do the parsing, return false on error + int res = + simdjson::json_parse(p, pj); // do the parsing, return false on error if (res) { - std::cerr << " Parsing failed with error " << simdjson::errorMsg(res) << std::endl; + std::cerr << " Parsing failed with error " << simdjson::error_message(res) + << std::endl; return EXIT_FAILURE; } std::cout << "[" << std::endl; - for(int idx = 2; idx < argc; idx++) { - const char * jsonpath = argv[idx]; - simdjson::ParsedJson::iterator it(pj); - if(it.move_to(std::string(jsonpath))) { - std::cout << "{\"jsonpath\": \"" << jsonpath << "\"," << std::endl; - std::cout << "\"value\":"; - compute_dump(it); - std::cout << "}" << std::endl; - } else { - std::cout << "null" << std::endl; - } - if(idx + 1 < argc) { - std::cout << "," << std::endl; - } + for (int idx = 2; idx < argc; idx++) { + const char *jsonpath = argv[idx]; + simdjson::ParsedJson::Iterator it(pj); + if (it.move_to(std::string(jsonpath))) { + std::cout << "{\"jsonpath\": \"" << jsonpath << "\"," << std::endl; + std::cout << "\"value\":"; + compute_dump(it); + std::cout << "}" << std::endl; + } else { + std::cout << "null" << std::endl; + } + if (idx + 1 < argc) { + std::cout << "," << std::endl; + } } std::cout << "]" << std::endl; return EXIT_SUCCESS; diff --git a/tools/jsonstats.cpp b/tools/jsonstats.cpp index 973c30ce..9fbb0e67 100644 --- a/tools/jsonstats.cpp +++ b/tools/jsonstats.cpp @@ -3,30 +3,28 @@ #include "simdjson/jsonioutil.h" #include "simdjson/jsonparser.h" -size_t count_nonasciibytes(const uint8_t* input, size_t length) { +size_t count_nonasciibytes(const uint8_t *input, size_t length) { size_t count = 0; - for(size_t i = 0; i < length; i++) { + for (size_t i = 0; i < length; i++) { count += input[i] >> 7; } return count; -} - - -size_t count_backslash(const uint8_t* input, size_t length) { - size_t count = 0; - for(size_t i = 0; i < length; i++) { - count += (input[i] == '\\') ? 1 : 0; - } - return count; } +size_t count_backslash(const uint8_t *input, size_t length) { + size_t count = 0; + for (size_t i = 0; i < length; i++) { + count += (input[i] == '\\') ? 1 : 0; + } + return count; +} struct stat_s { size_t integer_count; size_t float_count; size_t string_count; size_t backslash_count; - size_t nonasciibyte_count; + size_t non_ascii_byte_count; size_t object_count; size_t array_count; size_t null_count; @@ -39,18 +37,18 @@ struct stat_s { using stat_t = struct stat_s; - - -stat_t simdjson_computestats(const simdjson::padded_string &p) { +stat_t simdjson_compute_stats(const simdjson::padded_string &p) { stat_t answer; simdjson::ParsedJson pj = simdjson::build_parsed_json(p); - answer.valid = pj.isValid(); + answer.valid = pj.is_valid(); if (!answer.valid) { - std::cerr << pj.getErrorMsg() << std::endl; + std::cerr << pj.get_error_message() << std::endl; return answer; } - answer.backslash_count = count_backslash(reinterpret_cast(p.data()), p.size()); - answer.nonasciibyte_count = count_nonasciibytes(reinterpret_cast(p.data()), p.size()); + answer.backslash_count = + count_backslash(reinterpret_cast(p.data()), p.size()); + answer.non_ascii_byte_count = count_nonasciibytes( + reinterpret_cast(p.data()), p.size()); answer.byte_count = p.size(); answer.integer_count = 0; answer.float_count = 0; @@ -61,24 +59,24 @@ stat_t simdjson_computestats(const simdjson::padded_string &p) { answer.false_count = 0; answer.string_count = 0; answer.structural_indexes_count = pj.n_structural_indexes; - size_t tapeidx = 0; - uint64_t tape_val = pj.tape[tapeidx++]; + size_t tape_idx = 0; + uint64_t tape_val = pj.tape[tape_idx++]; uint8_t type = (tape_val >> 56); - size_t howmany = 0; + size_t how_many = 0; assert(type == 'r'); - howmany = tape_val & JSONVALUEMASK; - for (; tapeidx < howmany; tapeidx++) { - tape_val = pj.tape[tapeidx]; - // uint64_t payload = tape_val & JSONVALUEMASK; + how_many = tape_val & JSON_VALUE_MASK; + for (; tape_idx < how_many; tape_idx++) { + tape_val = pj.tape[tape_idx]; + // uint64_t payload = tape_val & JSON_VALUE_MASK; type = (tape_val >> 56); switch (type) { case 'l': // we have a long int answer.integer_count++; - tapeidx++; // skipping the integer + tape_idx++; // skipping the integer break; case 'd': // we have a double answer.float_count++; - tapeidx++; // skipping the double + tape_idx++; // skipping the double break; case 'n': // we have a null answer.null_count++; @@ -109,12 +107,6 @@ stat_t simdjson_computestats(const simdjson::padded_string &p) { return answer; } - - - - - - int main(int argc, char *argv[]) { int myoptind = 1; if (myoptind >= argc) { @@ -124,7 +116,8 @@ int main(int argc, char *argv[]) { } const char *filename = argv[myoptind]; if (myoptind + 1 < argc) { - std::cerr << "warning: ignoring everything after " << argv[myoptind + 1] << std::endl; + std::cerr << "warning: ignoring everything after " << argv[myoptind + 1] + << std::endl; } simdjson::padded_string p; try { @@ -133,16 +126,18 @@ int main(int argc, char *argv[]) { std::cerr << "Could not load the file " << filename << std::endl; return EXIT_FAILURE; } - stat_t s = simdjson_computestats(p); - if(!s.valid) { + stat_t s = simdjson_compute_stats(p); + if (!s.valid) { std::cerr << "not a valid JSON" << std::endl; return EXIT_FAILURE; } - - printf("# integer_count float_count string_count backslash_count nonasciibyte_count object_count array_count null_count true_count false_count byte_count structural_indexes_count\n"); - printf("%zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu\n", s.integer_count, s.float_count, - s.string_count, s.backslash_count, s.nonasciibyte_count, s.object_count, s.array_count, - s.null_count, s.true_count, s.false_count, s.byte_count, s.structural_indexes_count); + printf("# integer_count float_count string_count backslash_count " + "non_ascii_byte_count object_count array_count null_count true_count " + "false_count byte_count structural_indexes_count\n"); + printf("%zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu\n", s.integer_count, + s.float_count, s.string_count, s.backslash_count, + s.non_ascii_byte_count, s.object_count, s.array_count, s.null_count, + s.true_count, s.false_count, s.byte_count, s.structural_indexes_count); return EXIT_SUCCESS; } diff --git a/tools/minify.cpp b/tools/minify.cpp index b5940482..4d7fa248 100644 --- a/tools/minify.cpp +++ b/tools/minify.cpp @@ -10,12 +10,12 @@ int main(int argc, char *argv[]) { } simdjson::padded_string p; std::string filename = argv[argc - 1]; - try{ + try { simdjson::get_corpus(filename).swap(p); - } catch (const std::exception& e) { - std::cout << "Could not load the file " << filename << std::endl; - return EXIT_FAILURE; + } catch (const std::exception &e) { + std::cout << "Could not load the file " << filename << std::endl; + return EXIT_FAILURE; } - simdjson::jsonminify(p, p.data()); - printf("%s",p.data()); + simdjson::json_minify(p, p.data()); + printf("%s", p.data()); }