Improve documentation on padding

- Improves and clarifies the documentation on padding.
 - Use std:: prefix for memcpy, strlen etc.

Related to issues #1175 and #1178
This commit is contained in:
Daniel Lemire 2020-09-23 03:07:14 -04:00 committed by GitHub
parent 19cb5d57db
commit f410213003
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
25 changed files with 129 additions and 58 deletions

View File

@ -8,7 +8,7 @@
#include <unistd.h> // for syscall #include <unistd.h> // for syscall
#include <cerrno> // for errno #include <cerrno> // for errno
#include <cstring> // for memset #include <cstring> // for std::memset
#include <stdexcept> #include <stdexcept>
#include <iostream> #include <iostream>
@ -24,7 +24,7 @@ template <int TYPE = PERF_TYPE_HARDWARE> class LinuxEvents {
public: public:
explicit LinuxEvents(std::vector<int> config_vec) : fd(0), working(true) { explicit LinuxEvents(std::vector<int> config_vec) : fd(0), working(true) {
memset(&attribs, 0, sizeof(attribs)); std::memset(&attribs, 0, sizeof(attribs));
attribs.type = TYPE; attribs.type = TYPE;
attribs.size = sizeof(attribs); attribs.size = sizeof(attribs);
attribs.disabled = 1; attribs.disabled = 1;

View File

@ -296,8 +296,8 @@ simdjson_really_inline void sax_tweet_reader_visitor::field_lookup::neg(const ch
} }
sax_tweet_reader_visitor::field_lookup::field_lookup() { sax_tweet_reader_visitor::field_lookup::field_lookup() {
add("\"statuses\"", strlen("\"statuses\""), containers::top_object, field_type::array, 0); // { "statuses": [...] add("\"statuses\"", std::strlen("\"statuses\""), containers::top_object, field_type::array, 0); // { "statuses": [...]
#define TWEET_FIELD(KEY, TYPE) add("\"" #KEY "\"", strlen("\"" #KEY "\""), containers::tweet, TYPE, offsetof(tweet, KEY)); #define TWEET_FIELD(KEY, TYPE) add("\"" #KEY "\"", std::strlen("\"" #KEY "\""), containers::tweet, TYPE, offsetof(tweet, KEY));
TWEET_FIELD(id, field_type::unsigned_integer); TWEET_FIELD(id, field_type::unsigned_integer);
TWEET_FIELD(in_reply_to_status_id, field_type::nullable_unsigned_integer); TWEET_FIELD(in_reply_to_status_id, field_type::nullable_unsigned_integer);
TWEET_FIELD(retweet_count, field_type::unsigned_integer); TWEET_FIELD(retweet_count, field_type::unsigned_integer);
@ -306,7 +306,7 @@ sax_tweet_reader_visitor::field_lookup::field_lookup() {
TWEET_FIELD(created_at, field_type::string); TWEET_FIELD(created_at, field_type::string);
TWEET_FIELD(user, field_type::object) TWEET_FIELD(user, field_type::object)
#undef TWEET_FIELD #undef TWEET_FIELD
#define USER_FIELD(KEY, TYPE) add("\"" #KEY "\"", strlen("\"" #KEY "\""), containers::user, TYPE, offsetof(tweet, user)+offsetof(twitter_user, KEY)); #define USER_FIELD(KEY, TYPE) add("\"" #KEY "\"", std::strlen("\"" #KEY "\""), containers::user, TYPE, offsetof(tweet, user)+offsetof(twitter_user, KEY));
USER_FIELD(id, field_type::unsigned_integer); USER_FIELD(id, field_type::unsigned_integer);
USER_FIELD(screen_name, field_type::string); USER_FIELD(screen_name, field_type::string);
#undef USER_FIELD #undef USER_FIELD

View File

@ -278,7 +278,7 @@ In some cases, you may have valid JSON strings that you do not wish to parse but
// Starts with a valid JSON document as a string. // Starts with a valid JSON document as a string.
// It does not have to be null-terminated. // It does not have to be null-terminated.
const char * some_string = "[ 1, 2, 3, 4] "; const char * some_string = "[ 1, 2, 3, 4] ";
size_t length = strlen(some_string); size_t length = std::strlen(some_string);
// Create a buffer to receive the minified string. Make sure that there is enough room (length bytes). // Create a buffer to receive the minified string. Make sure that there is enough room (length bytes).
std::unique_ptr<char[]> buffer{new char[length]}; std::unique_ptr<char[]> buffer{new char[length]};
size_t new_length{}; // It will receive the minified length. size_t new_length{}; // It will receive the minified length.
@ -296,7 +296,7 @@ The simdjson library has fast functions to validate UTF-8 strings. They are many
```C++ ```C++
const char * some_string = "[ 1, 2, 3, 4] "; const char * some_string = "[ 1, 2, 3, 4] ";
size_t length = strlen(some_string); size_t length = std::strlen(some_string);
bool is_ok = simdjson::validate_utf8(some_string, length); bool is_ok = simdjson::validate_utf8(some_string, length);
``` ```

View File

@ -260,7 +260,7 @@ In some cases, you may have valid JSON strings that you do not wish to parse but
// Starts with a valid JSON document as a string. // Starts with a valid JSON document as a string.
// It does not have to be null-terminated. // It does not have to be null-terminated.
const char * some_string = "[ 1, 2, 3, 4] "; const char * some_string = "[ 1, 2, 3, 4] ";
size_t length = strlen(some_string); size_t length = std::strlen(some_string);
// Create a buffer to receive the minified string. Make sure that there is enough room (length bytes). // Create a buffer to receive the minified string. Make sure that there is enough room (length bytes).
std::unique_ptr<char[]> buffer{new char[length]}; std::unique_ptr<char[]> buffer{new char[length]};
size_t new_length{}; // It will receive the minified length. size_t new_length{}; // It will receive the minified length.
@ -278,7 +278,7 @@ The simdjson library has fast functions to validate UTF-8 strings. They are many
``` ```
const char * some_string = "[ 1, 2, 3, 4] "; const char * some_string = "[ 1, 2, 3, 4] ";
size_t length = strlen(some_string); size_t length = std::strlen(some_string);
bool is_ok = simdjson::validate_utf8(some_string, length); bool is_ok = simdjson::validate_utf8(some_string, length);
``` ```

View File

@ -12,6 +12,8 @@ are still some scenarios where tuning can enhance performance.
* [Visual Studio](#visual-studio) * [Visual Studio](#visual-studio)
* [Downclocking](#downclocking) * [Downclocking](#downclocking)
* [Best Use of the DOM API](#best-use-of-the-dom-api) * [Best Use of the DOM API](#best-use-of-the-dom-api)
* [Padding and Temporary Copies](#padding-and-temporary-copies)
Reusing the parser for maximum efficiency Reusing the parser for maximum efficiency
----------------------------------------- -----------------------------------------
@ -174,3 +176,25 @@ Best Use of the DOM API
The simdjson API provides access to the JSON DOM (document-object-model) content as a tree of `dom::element` instances, each representing an object, an array or an atomic type (null, true, false, number). These `dom::element` instances are lightweight objects (e.g., spanning 16 bytes) and it might be advantageous to pass them by value, as opposed to passing them by reference or by pointer. The simdjson API provides access to the JSON DOM (document-object-model) content as a tree of `dom::element` instances, each representing an object, an array or an atomic type (null, true, false, number). These `dom::element` instances are lightweight objects (e.g., spanning 16 bytes) and it might be advantageous to pass them by value, as opposed to passing them by reference or by pointer.
Padding and Temporary Copies
--------------
The simdjson function `parser.parse` reads data from a padded buffer, containing SIMDJSON_PADDING extra bytes added at the end.
If you are passing a `padded_string` to `parser.parse` or loading the JSON directly from
disk (`parser.load`), padding is automatically handled.
When calling `parser.parse` on a pointer (e.g., `parser.parse(mystring, mylength)`) a temporary copy is made by default with adequate padding and you, again, do not need to be concerned with padding.
Some users may not be able to use our `padded_string` class or to load the data directly from disk (`parser.load`). They may need to pass data pointers to the library. If these users wish to avoid temporary copies and corresponding temporary memory allocations, they may want to call `parser.parse` with the `realloc_if_needed` parameter set to false (e.g., `parser.parse(mystring, mylength, false)`). In such cases, they need to ensure that there are at least SIMDJSON_PADDING extra bytes at the end that can be safely accessed and read. They do not need to initialize the padded bytes to any value in particular. The following example is safe:
```C++
const char *json = R"({"key":"value"})";
const size_t json_len = std::strlen(json);
std::unique_ptr<char[]> padded_json_copy{new char[json_len + SIMDJSON_PADDING]};
std::memcpy(padded_json_copy.get(), json, json_len);
std::memset(padded_json_copy.get() + json_len, 0, SIMDJSON_PADDING);
simdjson::dom::parser parser;
simdjson::dom::element element = parser.parse(padded_json_copy.get(), json_len, false);
```
Setting the `realloc_if_needed` parameter to false in this manner may lead to better performance, but it requires that the user take on more responsibility: the simdjson library cannot verify that the input buffer was padded.

View File

@ -66,7 +66,7 @@ inline bool document::dump_raw_tape(std::ostream &os) const noexcept {
switch (type) { switch (type) {
case '"': // we have a string case '"': // we have a string
os << "string \""; os << "string \"";
memcpy(&string_length, string_buf.get() + payload, sizeof(uint32_t)); std::memcpy(&string_length, string_buf.get() + payload, sizeof(uint32_t));
os << internal::escape_json_string(std::string_view( os << internal::escape_json_string(std::string_view(
(const char *)(string_buf.get() + payload + sizeof(uint32_t)), (const char *)(string_buf.get() + payload + sizeof(uint32_t)),
string_length string_length
@ -92,7 +92,7 @@ inline bool document::dump_raw_tape(std::ostream &os) const noexcept {
return false; return false;
} }
double answer; double answer;
memcpy(&answer, &tape[++tape_idx], sizeof(answer)); std::memcpy(&answer, &tape[++tape_idx], sizeof(answer));
os << answer << '\n'; os << answer << '\n';
break; break;
case 'n': // we have a null case 'n': // we have a null

View File

@ -252,7 +252,7 @@ dom::parser::Iterator::Iterator(
current_val(o.current_val) current_val(o.current_val)
{ {
depth_index = new scopeindex_t[max_depth+1]; depth_index = new scopeindex_t[max_depth+1];
memcpy(depth_index, o.depth_index, (depth + 1) * sizeof(depth_index[0])); std::memcpy(depth_index, o.depth_index, (depth + 1) * sizeof(depth_index[0]));
} }
dom::parser::Iterator::~Iterator() noexcept { dom::parser::Iterator::~Iterator() noexcept {

View File

@ -78,7 +78,7 @@ public:
// return the length of the string in bytes // return the length of the string in bytes
inline uint32_t get_string_length() const { inline uint32_t get_string_length() const {
uint32_t answer; uint32_t answer;
memcpy(&answer, std::memcpy(&answer,
reinterpret_cast<const char *>(doc.string_buf.get() + reinterpret_cast<const char *>(doc.string_buf.get() +
(current_val & internal::JSON_VALUE_MASK)), (current_val & internal::JSON_VALUE_MASK)),
sizeof(uint32_t)); sizeof(uint32_t));
@ -93,7 +93,7 @@ public:
// case of error // case of error
} }
double answer; double answer;
memcpy(&answer, &doc.tape[location + 1], sizeof(answer)); std::memcpy(&answer, &doc.tape[location + 1], sizeof(answer));
return answer; return answer;
} }

View File

@ -98,7 +98,7 @@ inline simdjson_result<element> parser::parse(const uint8_t *buf, size_t len, bo
if (realloc_if_needed) { if (realloc_if_needed) {
tmp_buf.reset((uint8_t *)internal::allocate_padded_buffer(len)); tmp_buf.reset((uint8_t *)internal::allocate_padded_buffer(len));
if (tmp_buf.get() == nullptr) { return MEMALLOC; } if (tmp_buf.get() == nullptr) { return MEMALLOC; }
memcpy((void *)tmp_buf.get(), buf, len); std::memcpy((void *)tmp_buf.get(), buf, len);
} }
_error = implementation->parse(realloc_if_needed ? tmp_buf.get() : buf, len, doc); _error = implementation->parse(realloc_if_needed ? tmp_buf.get() : buf, len, doc);
if (_error) { return _error; } if (_error) { return _error; }

View File

@ -114,8 +114,30 @@ public:
* The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
* those bytes are initialized to, as long as they are allocated. * those bytes are initialized to, as long as they are allocated.
* *
* If realloc_if_needed is true, it is assumed that the buffer does *not* have enough padding, * If realloc_if_needed is true (the default), it is assumed that the buffer does *not* have enough padding,
* and it is copied into an enlarged temporary buffer before parsing. * and it is copied into an enlarged temporary buffer before parsing. Thus the following is safe:
*
* const char *json = R"({"key":"value"})";
* const size_t json_len = std::strlen(json);
* simdjson::dom::parser parser;
* simdjson::dom::element element = parser.parse(json, json_len);
*
* If you set realloc_if_needed to false (e.g., parser.parse(json, json_len, false)),
* you must provide a buffer with at least SIMDJSON_PADDING extra bytes at the end.
* The benefit of setting realloc_if_needed to false is that you avoid a temporary
* memory allocation and a copy.
*
* The padded bytes may be read. It is not important how you initialize
* these bytes though we recommend a sensible default like null character values or spaces.
* For example, the following low-level code is safe:
*
* const char *json = R"({"key":"value"})";
* const size_t json_len = std::strlen(json);
* std::unique_ptr<char[]> padded_json_copy{new char[json_len + SIMDJSON_PADDING]};
* std::memcpy(padded_json_copy.get(), json, json_len);
* std::memset(padded_json_copy.get() + json_len, '\0', SIMDJSON_PADDING);
* simdjson::dom::parser parser;
* simdjson::dom::element element = parser.parse(padded_json_copy.get(), json_len, false);
* *
* ### Parser Capacity * ### Parser Capacity
* *

View File

@ -43,7 +43,7 @@ enum error_code {
* *
* dom::parser parser; * dom::parser parser;
* dom::element doc; * dom::element doc;
* auto error = parser.parse("foo").get(doc); * auto error = parser.parse("foo",3).get(doc);
* if (error) { printf("Error: %s\n", error_message(error)); } * if (error) { printf("Error: %s\n", error_message(error)); }
* *
* @return The error message. * @return The error message.

View File

@ -81,14 +81,14 @@ simdjson_really_inline T tape_ref::next_tape_value() const noexcept {
// It is not generally safe. It is safer, and often faster to rely // It is not generally safe. It is safer, and often faster to rely
// on memcpy. Yes, it is uglier, but it is also encapsulated. // on memcpy. Yes, it is uglier, but it is also encapsulated.
T x; T x;
memcpy(&x,&doc->tape[json_index + 1],sizeof(uint64_t)); std::memcpy(&x,&doc->tape[json_index + 1],sizeof(uint64_t));
return x; return x;
} }
simdjson_really_inline uint32_t internal::tape_ref::get_string_length() const noexcept { simdjson_really_inline uint32_t internal::tape_ref::get_string_length() const noexcept {
size_t string_buf_index = size_t(tape_value()); size_t string_buf_index = size_t(tape_value());
uint32_t len; uint32_t len;
memcpy(&len, &doc->string_buf[string_buf_index], sizeof(len)); std::memcpy(&len, &doc->string_buf[string_buf_index], sizeof(len));
return len; return len;
} }

View File

@ -27,7 +27,7 @@ inline char *allocate_padded_buffer(size_t length) noexcept {
// We write zeroes in the padded region to avoid having uninitialized // We write zeroes in the padded region to avoid having uninitialized
// garbage. If nothing else, garbage getting read might trigger a // garbage. If nothing else, garbage getting read might trigger a
// warning in a memory checker. // warning in a memory checker.
memset(padded_buffer + length, 0, totalpaddedlength - length); std::memset(padded_buffer + length, 0, totalpaddedlength - length);
return padded_buffer; return padded_buffer;
} // allocate_padded_buffer() } // allocate_padded_buffer()
@ -43,7 +43,7 @@ inline padded_string::padded_string(size_t length) noexcept
inline padded_string::padded_string(const char *data, size_t length) noexcept inline padded_string::padded_string(const char *data, size_t length) noexcept
: viable_size(length), data_ptr(internal::allocate_padded_buffer(length)) { : viable_size(length), data_ptr(internal::allocate_padded_buffer(length)) {
if ((data != nullptr) and (data_ptr != nullptr)) { if ((data != nullptr) and (data_ptr != nullptr)) {
memcpy(data_ptr, data, length); std::memcpy(data_ptr, data, length);
data_ptr[length] = '\0'; // easier when you need a c_str data_ptr[length] = '\0'; // easier when you need a c_str
} }
} }
@ -51,7 +51,7 @@ inline padded_string::padded_string(const char *data, size_t length) noexcept
inline padded_string::padded_string(const std::string & str_ ) noexcept inline padded_string::padded_string(const std::string & str_ ) noexcept
: viable_size(str_.size()), data_ptr(internal::allocate_padded_buffer(str_.size())) { : viable_size(str_.size()), data_ptr(internal::allocate_padded_buffer(str_.size())) {
if (data_ptr != nullptr) { if (data_ptr != nullptr) {
memcpy(data_ptr, str_.data(), str_.size()); std::memcpy(data_ptr, str_.data(), str_.size());
data_ptr[str_.size()] = '\0'; // easier when you need a c_str data_ptr[str_.size()] = '\0'; // easier when you need a c_str
} }
} }
@ -59,7 +59,7 @@ inline padded_string::padded_string(const std::string & str_ ) noexcept
inline padded_string::padded_string(std::string_view sv_) noexcept inline padded_string::padded_string(std::string_view sv_) noexcept
: viable_size(sv_.size()), data_ptr(internal::allocate_padded_buffer(sv_.size())) { : viable_size(sv_.size()), data_ptr(internal::allocate_padded_buffer(sv_.size())) {
if (data_ptr != nullptr) { if (data_ptr != nullptr) {
memcpy(data_ptr, sv_.data(), sv_.size()); std::memcpy(data_ptr, sv_.data(), sv_.size());
data_ptr[sv_.size()] = '\0'; // easier when you need a c_str data_ptr[sv_.size()] = '\0'; // easier when you need a c_str
} }
} }

View File

@ -76,8 +76,8 @@ simdjson_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block()
template<size_t STEP_SIZE> template<size_t STEP_SIZE>
simdjson_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const { simdjson_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
memset(dst, 0x20, STEP_SIZE); // memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
memcpy(dst, buf + idx, len - idx); std::memcpy(dst, buf + idx, len - idx);
return len - idx; return len - idx;
} }

View File

@ -179,8 +179,8 @@ SIMDJSON_WARN_UNUSED simdjson_really_inline error_code tape_builder::visit_root_
// //
uint8_t *copy = static_cast<uint8_t *>(malloc(iter.remaining_len() + SIMDJSON_PADDING)); uint8_t *copy = static_cast<uint8_t *>(malloc(iter.remaining_len() + SIMDJSON_PADDING));
if (copy == nullptr) { return MEMALLOC; } if (copy == nullptr) { return MEMALLOC; }
memcpy(copy, value, iter.remaining_len()); std::memcpy(copy, value, iter.remaining_len());
memset(copy + iter.remaining_len(), ' ', SIMDJSON_PADDING); std::memset(copy + iter.remaining_len(), ' ', SIMDJSON_PADDING);
error_code error = visit_number(iter, copy); error_code error = visit_number(iter, copy);
free(copy); free(copy);
return error; return error;

View File

@ -255,10 +255,10 @@ namespace parse_api_tests {
uint64_t count = 0; uint64_t count = 0;
constexpr const int BATCH_SIZE = 128; constexpr const int BATCH_SIZE = 128;
uint8_t empty_batches_ndjson[BATCH_SIZE*16+SIMDJSON_PADDING]; uint8_t empty_batches_ndjson[BATCH_SIZE*16+SIMDJSON_PADDING];
memset(&empty_batches_ndjson[0], ' ', BATCH_SIZE*16+SIMDJSON_PADDING); std::memset(&empty_batches_ndjson[0], ' ', BATCH_SIZE*16+SIMDJSON_PADDING);
memcpy(&empty_batches_ndjson[BATCH_SIZE*3+2], "1", 1); std::memcpy(&empty_batches_ndjson[BATCH_SIZE*3+2], "1", 1);
memcpy(&empty_batches_ndjson[BATCH_SIZE*10+4], "2", 1); std::memcpy(&empty_batches_ndjson[BATCH_SIZE*10+4], "2", 1);
memcpy(&empty_batches_ndjson[BATCH_SIZE*11+6], "3", 1); std::memcpy(&empty_batches_ndjson[BATCH_SIZE*11+6], "3", 1);
simdjson::dom::document_stream stream; simdjson::dom::document_stream stream;
ASSERT_SUCCESS( parser.parse_many(empty_batches_ndjson, BATCH_SIZE*16).get(stream) ); ASSERT_SUCCESS( parser.parse_many(empty_batches_ndjson, BATCH_SIZE*16).get(stream) );
for (auto doc : stream) { for (auto doc : stream) {

View File

@ -150,7 +150,7 @@ namespace adversarial {
bool number_overrun_at_root() { bool number_overrun_at_root() {
TEST_START(); TEST_START();
constexpr const char *json = "1" PADDING_FILLED_WITH_NUMBERS ","; constexpr const char *json = "1" PADDING_FILLED_WITH_NUMBERS ",";
constexpr size_t len = 1; // strlen("1"); constexpr size_t len = 1; // std::strlen("1");
dom::parser parser; dom::parser parser;
uint64_t foo; uint64_t foo;
@ -161,7 +161,7 @@ namespace adversarial {
bool number_overrun_in_array() { bool number_overrun_in_array() {
TEST_START(); TEST_START();
constexpr const char *json = "[1" PADDING_FILLED_WITH_NUMBERS "]"; constexpr const char *json = "[1" PADDING_FILLED_WITH_NUMBERS "]";
constexpr size_t len = 2; // strlen("[1"); constexpr size_t len = 2; // std::strlen("[1");
dom::parser parser; dom::parser parser;
uint64_t foo; uint64_t foo;
@ -171,7 +171,7 @@ namespace adversarial {
bool number_overrun_in_object() { bool number_overrun_in_object() {
TEST_START(); TEST_START();
constexpr const char *json = "{\"key\":1" PADDING_FILLED_WITH_NUMBERS "}"; constexpr const char *json = "{\"key\":1" PADDING_FILLED_WITH_NUMBERS "}";
constexpr size_t len = 8; // strlen("{\"key\":1"); constexpr size_t len = 8; // std::strlen("{\"key\":1");
dom::parser parser; dom::parser parser;
uint64_t foo; uint64_t foo;
@ -179,7 +179,7 @@ namespace adversarial {
TEST_SUCCEED(); TEST_SUCCEED();
} }
bool run() { bool run() {
static_assert(33 > SIMDJSON_PADDING, "corruption test doesn't have enough padding"); // 33 = strlen(PADDING_FILLED_WITH_NUMBERS) static_assert(33 > SIMDJSON_PADDING, "corruption test doesn't have enough padding"); // 33 = std::strlen(PADDING_FILLED_WITH_NUMBERS)
return true return true
&& number_overrun_at_root() && number_overrun_at_root()
&& number_overrun_in_array() && number_overrun_in_array()

View File

@ -23,7 +23,7 @@ static bool has_extension(const char *filename, const char *extension) {
} }
bool starts_with(const char *pre, const char *str) { bool starts_with(const char *pre, const char *str) {
size_t len_pre = strlen(pre), len_str = strlen(str); size_t len_pre = std::strlen(pre), len_str = std::strlen(str);
return len_str < len_pre ? false : strncmp(pre, str, len_pre) == 0; return len_str < len_pre ? false : strncmp(pre, str, len_pre) == 0;
} }
@ -34,7 +34,7 @@ bool contains(const char *pre, const char *str) {
bool validate(const char *dirname) { bool validate(const char *dirname) {
bool everything_fine = true; bool everything_fine = true;
const char *extension = ".json"; const char *extension = ".json";
size_t dirlen = strlen(dirname); size_t dirlen = std::strlen(dirname);
struct dirent **entry_list; struct dirent **entry_list;
int c = scandir(dirname, &entry_list, nullptr, alphasort); int c = scandir(dirname, &entry_list, nullptr, alphasort);
if (c < 0) { if (c < 0) {
@ -56,7 +56,7 @@ bool validate(const char *dirname) {
if (has_extension(name, extension)) { if (has_extension(name, extension)) {
printf("validating: file %s ", name); printf("validating: file %s ", name);
fflush(nullptr); fflush(nullptr);
size_t namelen = strlen(name); size_t namelen = std::strlen(name);
size_t fullpathlen = dirlen + 1 + namelen + 1; size_t fullpathlen = dirlen + 1 + namelen + 1;
char *fullpath = static_cast<char *>(malloc(fullpathlen)); char *fullpath = static_cast<char *>(malloc(fullpathlen));
snprintf(fullpath, fullpathlen, "%s%s%s", dirname, needsep ? "/" : "", name); snprintf(fullpath, fullpathlen, "%s%s%s", dirname, needsep ? "/" : "", name);

View File

@ -21,7 +21,7 @@ static bool has_extension(const char *filename, const char *extension) {
} }
bool starts_with(const char *pre, const char *str) { bool starts_with(const char *pre, const char *str) {
size_t len_pre = strlen(pre), len_str = strlen(str); size_t len_pre = std::strlen(pre), len_str = std::strlen(str);
return len_str < len_pre ? false : strncmp(pre, str, len_pre) == 0; return len_str < len_pre ? false : strncmp(pre, str, len_pre) == 0;
} }
@ -32,7 +32,7 @@ bool contains(const char *pre, const char *str) {
bool validate_minefield(const char *dirname) { bool validate_minefield(const char *dirname) {
bool everything_fine = true; bool everything_fine = true;
const char *extension = ".json"; const char *extension = ".json";
size_t dirlen = strlen(dirname); size_t dirlen = std::strlen(dirname);
struct dirent **entry_list; struct dirent **entry_list;
int c = scandir(dirname, &entry_list, nullptr, alphasort); int c = scandir(dirname, &entry_list, nullptr, alphasort);
if (c < 0) { if (c < 0) {
@ -54,7 +54,7 @@ bool validate_minefield(const char *dirname) {
if (has_extension(name, extension)) { if (has_extension(name, extension)) {
printf("validating: file %s ", name); printf("validating: file %s ", name);
fflush(nullptr); fflush(nullptr);
size_t namelen = strlen(name); size_t namelen = std::strlen(name);
size_t fullpathlen = dirlen + 1 + namelen + 1; size_t fullpathlen = dirlen + 1 + namelen + 1;
char *fullpath = static_cast<char *>(malloc(fullpathlen)); char *fullpath = static_cast<char *>(malloc(fullpathlen));
snprintf(fullpath, fullpathlen, "%s%s%s", dirname, needsep ? "/" : "", name); snprintf(fullpath, fullpathlen, "%s%s%s", dirname, needsep ? "/" : "", name);

View File

@ -54,7 +54,7 @@ size_t invalid_count;
const char *really_bad[] = {"013}", "0x14", "0e]", "0e+]", "0e+-1]"}; const char *really_bad[] = {"013}", "0x14", "0e]", "0e+]", "0e+-1]"};
bool starts_with(const char *pre, const char *str) { bool starts_with(const char *pre, const char *str) {
size_t lenpre = strlen(pre); size_t lenpre = std::strlen(pre);
return strncmp(pre, str, lenpre) == 0; return strncmp(pre, str, lenpre) == 0;
} }
@ -168,7 +168,7 @@ bool validate(const char *dirname) {
parse_error = 0; parse_error = 0;
size_t total_count = 0; size_t total_count = 0;
const char *extension = ".json"; const char *extension = ".json";
size_t dirlen = strlen(dirname); size_t dirlen = std::strlen(dirname);
struct dirent **entry_list; struct dirent **entry_list;
int c = scandir(dirname, &entry_list, 0, alphasort); int c = scandir(dirname, &entry_list, 0, alphasort);
if (c < 0) { if (c < 0) {
@ -183,7 +183,7 @@ bool validate(const char *dirname) {
for (int i = 0; i < c; i++) { for (int i = 0; i < c; i++) {
const char *name = entry_list[i]->d_name; const char *name = entry_list[i]->d_name;
if (has_extension(name, extension)) { if (has_extension(name, extension)) {
size_t filelen = strlen(name); size_t filelen = std::strlen(name);
fullpath = (char *)malloc(dirlen + filelen + 1 + 1); fullpath = (char *)malloc(dirlen + filelen + 1 + 1);
strcpy(fullpath, dirname); strcpy(fullpath, dirname);
if (needsep) { if (needsep) {

View File

@ -22,7 +22,7 @@ static bool has_extension(const char *filename, const char *extension) {
} }
bool starts_with(const char *pre, const char *str) { bool starts_with(const char *pre, const char *str) {
size_t len_pre = strlen(pre), len_str = strlen(str); size_t len_pre = std::strlen(pre), len_str = std::strlen(str);
return len_str < len_pre ? false : strncmp(pre, str, len_pre) == 0; return len_str < len_pre ? false : strncmp(pre, str, len_pre) == 0;
} }
@ -36,7 +36,7 @@ bool validate(const char *dirname) {
const char *extension2 = ".jsonl"; const char *extension2 = ".jsonl";
const char *extension3 = ".json"; // bad json files shoud fail const char *extension3 = ".json"; // bad json files shoud fail
size_t dirlen = strlen(dirname); size_t dirlen = std::strlen(dirname);
struct dirent **entry_list; struct dirent **entry_list;
int c = scandir(dirname, &entry_list, nullptr, alphasort); int c = scandir(dirname, &entry_list, nullptr, alphasort);
if (c < 0) { if (c < 0) {
@ -63,7 +63,7 @@ bool validate(const char *dirname) {
/* Finding the file path */ /* Finding the file path */
printf("validating: file %s ", name); printf("validating: file %s ", name);
fflush(nullptr); fflush(nullptr);
size_t namelen = strlen(name); size_t namelen = std::strlen(name);
size_t fullpathlen = dirlen + 1 + namelen + 1; size_t fullpathlen = dirlen + 1 + namelen + 1;
char *fullpath = static_cast<char *>(malloc(fullpathlen)); char *fullpath = static_cast<char *>(malloc(fullpathlen));
snprintf(fullpath, fullpathlen, "%s%s%s", dirname, needsep ? "/" : "", name); snprintf(fullpath, fullpathlen, "%s%s%s", dirname, needsep ? "/" : "", name);

View File

@ -179,7 +179,7 @@ bool issue1142() {
ASSERT_EQUAL(std::string(R"([])"), simdjson::minify(example3)); ASSERT_EQUAL(std::string(R"([])"), simdjson::minify(example3));
const char * input_array = "[]"; const char * input_array = "[]";
size_t input_length = strlen(input_array); size_t input_length = std::strlen(input_array);
auto element4 = parser.parse(input_array, input_length).at_pointer("");; auto element4 = parser.parse(input_array, input_length).at_pointer("");;
ASSERT_EQUAL(std::string(R"([])"), simdjson::minify(element4)); ASSERT_EQUAL(std::string(R"([])"), simdjson::minify(element4));

View File

@ -261,7 +261,7 @@ SIMDJSON_POP_DISABLE_WARNINGS
void minify() { void minify() {
const char * some_string = "[ 1, 2, 3, 4] "; const char * some_string = "[ 1, 2, 3, 4] ";
size_t length = strlen(some_string); size_t length = std::strlen(some_string);
std::unique_ptr<char[]> buffer{new char[length]}; std::unique_ptr<char[]> buffer{new char[length]};
size_t new_length{}; size_t new_length{};
auto error = simdjson::minify(some_string, length, buffer.get(), new_length); auto error = simdjson::minify(some_string, length, buffer.get(), new_length);
@ -270,7 +270,7 @@ void minify() {
abort(); abort();
} else { } else {
const char * expected_string = "[1,2,3,4]"; const char * expected_string = "[1,2,3,4]";
size_t expected_length = strlen(expected_string); size_t expected_length = std::strlen(expected_string);
if(expected_length != new_length) { if(expected_length != new_length) {
std::cerr << "mismatched length (error) " << std::endl; std::cerr << "mismatched length (error) " << std::endl;
abort(); abort();
@ -286,14 +286,14 @@ void minify() {
bool is_correct() { bool is_correct() {
const char * some_string = "[ 1, 2, 3, 4] "; const char * some_string = "[ 1, 2, 3, 4] ";
size_t length = strlen(some_string); size_t length = std::strlen(some_string);
bool is_ok = simdjson::validate_utf8(some_string, length); bool is_ok = simdjson::validate_utf8(some_string, length);
return is_ok; return is_ok;
} }
bool is_correct_string_view() { bool is_correct_string_view() {
const char * some_string = "[ 1, 2, 3, 4] "; const char * some_string = "[ 1, 2, 3, 4] ";
size_t length = strlen(some_string); size_t length = std::strlen(some_string);
std::string_view v(some_string, length); std::string_view v(some_string, length);
bool is_ok = simdjson::validate_utf8(v); bool is_ok = simdjson::validate_utf8(v);
return is_ok; return is_ok;
@ -305,6 +305,31 @@ bool is_correct_string() {
return is_ok; return is_ok;
} }
// Mirrors the default-path snippet from the parser.parse documentation:
// the JSON text is handed over as a plain (pointer, length) pair with no
// third argument, so realloc_if_needed is left at its default and the
// caller supplies no padding of its own.
void parse_documentation() {
const char *json = R"({"key":"value"})";
const size_t json_len = std::strlen(json);
simdjson::dom::parser parser;
simdjson::dom::element element = parser.parse(json, json_len);
// Next line is to avoid unused warning.
(void)element;
}
// Mirrors the low-level snippet from the parser.parse documentation: the
// caller builds its own buffer with SIMDJSON_PADDING extra bytes after the
// JSON text and passes `false` as the third argument to parse.
void parse_documentation_lowlevel() {
// Such low-level code is not generally recommended. Please
// see parse_documentation() instead.
// Motivation: https://github.com/simdjson/simdjson/issues/1175
const char *json = R"({"key":"value"})";
const size_t json_len = std::strlen(json);
// Room for the JSON text plus SIMDJSON_PADDING trailing bytes.
std::unique_ptr<char[]> padded_json_copy{new char[json_len + SIMDJSON_PADDING]};
std::memcpy(padded_json_copy.get(), json, json_len);
// Fill the padding region that follows the JSON text with null characters.
std::memset(padded_json_copy.get() + json_len, '\0', SIMDJSON_PADDING);
simdjson::dom::parser parser;
// The trailing `false` is the realloc_if_needed argument described in the
// parse documentation, so the buffer above must already carry the padding.
simdjson::dom::element element = parser.parse(padded_json_copy.get(), json_len, false);
// Next line is to avoid unused warning.
(void)element;
}
int main() { int main() {
basics_dom_1(); basics_dom_1();
basics_dom_2(); basics_dom_2();

View File

@ -305,7 +305,7 @@ static bool has_extension(const char *filename, const char *extension) {
} }
bool starts_with(const char *pre, const char *str) { bool starts_with(const char *pre, const char *str) {
size_t lenpre = strlen(pre), lenstr = strlen(str); size_t lenpre = std::strlen(pre), lenstr = std::strlen(str);
return lenstr < lenpre ? false : strncmp(pre, str, lenpre) == 0; return lenstr < lenpre ? false : strncmp(pre, str, lenpre) == 0;
} }
@ -313,7 +313,7 @@ bool validate(const char *dirname) {
size_t total_strings = 0; size_t total_strings = 0;
probable_bug = false; probable_bug = false;
const char *extension = ".json"; const char *extension = ".json";
size_t dirlen = strlen(dirname); size_t dirlen = std::strlen(dirname);
struct dirent **entry_list; struct dirent **entry_list;
int c = scandir(dirname, &entry_list, 0, alphasort); int c = scandir(dirname, &entry_list, 0, alphasort);
if (c < 0) { if (c < 0) {
@ -328,7 +328,7 @@ bool validate(const char *dirname) {
for (int i = 0; i < c; i++) { for (int i = 0; i < c; i++) {
const char *name = entry_list[i]->d_name; const char *name = entry_list[i]->d_name;
if (has_extension(name, extension)) { if (has_extension(name, extension)) {
size_t filelen = strlen(name); size_t filelen = std::strlen(name);
fullpath = (char *)malloc(dirlen + filelen + 1 + 1); fullpath = (char *)malloc(dirlen + filelen + 1 + 1);
strcpy(fullpath, dirname); strcpy(fullpath, dirname);
if (needsep) { if (needsep) {

View File

@ -224,14 +224,14 @@ void test() {
"\x91\x85\x95\x9e", "\x91\x85\x95\x9e",
"\x6c\x02\x8e\x18"}; "\x6c\x02\x8e\x18"};
for (size_t i = 0; i < 8; i++) { for (size_t i = 0; i < 8; i++) {
size_t len = strlen(goodsequences[i]); size_t len = std::strlen(goodsequences[i]);
if (!simdjson::validate_utf8(goodsequences[i], len)) { if (!simdjson::validate_utf8(goodsequences[i], len)) {
printf("bug goodsequences[%zu]\n", i); printf("bug goodsequences[%zu]\n", i);
abort(); abort();
} }
} }
for (size_t i = 0; i < 26; i++) { for (size_t i = 0; i < 26; i++) {
size_t len = strlen(badsequences[i]); size_t len = std::strlen(badsequences[i]);
if (simdjson::validate_utf8(badsequences[i], len)) { if (simdjson::validate_utf8(badsequences[i], len)) {
printf("bug lookup2 badsequences[%zu]\n", i); printf("bug lookup2 badsequences[%zu]\n", i);
abort(); abort();