This updates the minifier. (#446)
This commit is contained in:
parent
2dc61fbdc4
commit
f611b65bc0
|
@ -1,4 +1,4 @@
|
||||||
/* auto-generated on Wed Dec 18 14:39:04 UTC 2019. Do not edit! */
|
/* auto-generated on Wed Jan 15 13:09:01 EST 2020. Do not edit! */
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include "simdjson.h"
|
#include "simdjson.h"
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,4 +1,4 @@
|
||||||
/* auto-generated on Wed Dec 18 14:39:04 UTC 2019. Do not edit! */
|
/* auto-generated on Wed Jan 15 13:09:01 EST 2020. Do not edit! */
|
||||||
/* begin file include/simdjson/simdjson_version.h */
|
/* begin file include/simdjson/simdjson_version.h */
|
||||||
// /include/simdjson/simdjson_version.h automatically generated by release.py,
|
// /include/simdjson/simdjson_version.h automatically generated by release.py,
|
||||||
// do not change by hand
|
// do not change by hand
|
||||||
|
@ -18,10 +18,10 @@ enum {
|
||||||
#ifndef SIMDJSON_PORTABILITY_H
|
#ifndef SIMDJSON_PORTABILITY_H
|
||||||
#define SIMDJSON_PORTABILITY_H
|
#define SIMDJSON_PORTABILITY_H
|
||||||
|
|
||||||
#include <cstdint>
|
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
#include <cstdlib>
|
#include <cstdint>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
#include <cstdlib>
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
#include <iso646.h>
|
#include <iso646.h>
|
||||||
#endif
|
#endif
|
||||||
|
@ -34,7 +34,10 @@ enum {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// this is almost standard?
|
// this is almost standard?
|
||||||
#define STRINGIFY(a) #a
|
#undef STRINGIFY_IMPLEMENTATION_
|
||||||
|
#undef STRINGIFY
|
||||||
|
#define STRINGIFY_IMPLEMENTATION_(a) #a
|
||||||
|
#define STRINGIFY(a) STRINGIFY_IMPLEMENTATION_(a)
|
||||||
|
|
||||||
// we are going to use runtime dispatch
|
// we are going to use runtime dispatch
|
||||||
#ifdef IS_X86_64
|
#ifdef IS_X86_64
|
||||||
|
@ -54,7 +57,7 @@ enum {
|
||||||
#define UNTARGET_REGION _Pragma("GCC pop_options")
|
#define UNTARGET_REGION _Pragma("GCC pop_options")
|
||||||
#endif // clang then gcc
|
#endif // clang then gcc
|
||||||
|
|
||||||
#endif // x86
|
#endif // x86
|
||||||
|
|
||||||
// Default target region macros don't do anything.
|
// Default target region macros don't do anything.
|
||||||
#ifndef TARGET_REGION
|
#ifndef TARGET_REGION
|
||||||
|
@ -67,9 +70,11 @@ enum {
|
||||||
#define TARGET_WESTMERE TARGET_REGION("sse4.2,pclmul")
|
#define TARGET_WESTMERE TARGET_REGION("sse4.2,pclmul")
|
||||||
#define TARGET_ARM64
|
#define TARGET_ARM64
|
||||||
|
|
||||||
|
// Threading is disabled
|
||||||
|
#undef SIMDJSON_THREADS_ENABLED
|
||||||
// Is threading enabled?
|
// Is threading enabled?
|
||||||
#if defined(BOOST_HAS_THREADS) || defined(_REENTRANT) || defined(_MT)
|
#if defined(BOOST_HAS_THREADS) || defined(_REENTRANT) || defined(_MT)
|
||||||
#define SIMDJSON_THREADS_ENABLED 1
|
#define SIMDJSON_THREADS_ENABLED
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__clang__)
|
#if defined(__clang__)
|
||||||
|
@ -84,7 +89,6 @@ enum {
|
||||||
#include <intrin.h> // visual studio
|
#include <intrin.h> // visual studio
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
#define simdjson_strcasecmp _stricmp
|
#define simdjson_strcasecmp _stricmp
|
||||||
#else
|
#else
|
||||||
|
@ -493,6 +497,22 @@ static inline void print_with_escapes(const char *src, std::ostream &os,
|
||||||
#ifndef SIMDJSON_SIMDJSON_H
|
#ifndef SIMDJSON_SIMDJSON_H
|
||||||
#define SIMDJSON_SIMDJSON_H
|
#define SIMDJSON_SIMDJSON_H
|
||||||
|
|
||||||
|
#ifndef __cplusplus
|
||||||
|
#error simdjson requires a C++ compiler
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef SIMDJSON_CPLUSPLUS
|
||||||
|
#if defined(_MSVC_LANG) && !defined(__clang__)
|
||||||
|
#define SIMDJSON_CPLUSPLUS (_MSC_VER == 1900 ? 201103L : _MSVC_LANG)
|
||||||
|
#else
|
||||||
|
#define SIMDJSON_CPLUSPLUS __cplusplus
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if (SIMDJSON_CPLUSPLUS < 201703L)
|
||||||
|
#error simdjson requires a compiler compliant with the C++17 standard
|
||||||
|
#endif
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
namespace simdjson {
|
namespace simdjson {
|
||||||
|
@ -529,8 +549,8 @@ enum ErrorValues {
|
||||||
N_ATOM_ERROR, // Problem while parsing an atom starting with the letter 'n'
|
N_ATOM_ERROR, // Problem while parsing an atom starting with the letter 'n'
|
||||||
NUMBER_ERROR, // Problem while parsing a number
|
NUMBER_ERROR, // Problem while parsing a number
|
||||||
UTF8_ERROR, // the input is not valid UTF-8
|
UTF8_ERROR, // the input is not valid UTF-8
|
||||||
UNITIALIZED, // unknown error, or uninitialized document
|
UNINITIALIZED, // unknown error, or uninitialized document
|
||||||
EMPTY, // no structural document found
|
EMPTY, // no structural element found
|
||||||
UNESCAPED_CHARS, // found unescaped characters in a string.
|
UNESCAPED_CHARS, // found unescaped characters in a string.
|
||||||
UNCLOSED_STRING, // missing quote at the end
|
UNCLOSED_STRING, // missing quote at the end
|
||||||
UNEXPECTED_ERROR // indicative of a bug in simdjson
|
UNEXPECTED_ERROR // indicative of a bug in simdjson
|
||||||
|
@ -623,39 +643,68 @@ const std::string &error_message(const int);
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
|
|
||||||
namespace simdjson {
|
namespace simdjson {
|
||||||
// low-level function to allocate memory with padding so we can read past the
|
// low-level function to allocate memory with padding so we can read past the
|
||||||
// "length" bytes safely. if you must provide a pointer to some data, create it
|
// "length" bytes safely. if you must provide a pointer to some data, create it
|
||||||
// with this function: length is the max. size in bytes of the string caller is
|
// with this function: length is the max. size in bytes of the string caller is
|
||||||
// responsible to free the memory (free(...))
|
// responsible to free the memory (free(...))
|
||||||
char *allocate_padded_buffer(size_t length);
|
inline char *allocate_padded_buffer(size_t length) noexcept {
|
||||||
|
// we could do a simple malloc
|
||||||
|
// return (char *) malloc(length + SIMDJSON_PADDING);
|
||||||
|
// However, we might as well align to cache lines...
|
||||||
|
size_t totalpaddedlength = length + SIMDJSON_PADDING;
|
||||||
|
char *padded_buffer = aligned_malloc_char(64, totalpaddedlength);
|
||||||
|
#ifndef NDEBUG
|
||||||
|
if (padded_buffer == nullptr) {
|
||||||
|
errno = EINVAL;
|
||||||
|
perror("simdjson::allocate_padded_buffer() aligned_malloc_char() failed");
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
#endif // NDEBUG
|
||||||
|
memset(padded_buffer + length, 0, totalpaddedlength - length);
|
||||||
|
return padded_buffer;
|
||||||
|
} // allocate_padded_buffer
|
||||||
|
|
||||||
// Simple string with padded allocation.
|
// Simple string with padded allocation.
|
||||||
// We deliberately forbid copies, users should rely on swap or move
|
// We deliberately forbid copies, users should rely on swap or move
|
||||||
// constructors.
|
// constructors.
|
||||||
class padded_string {
|
struct padded_string final {
|
||||||
public:
|
|
||||||
explicit padded_string() noexcept : viable_size(0), data_ptr(nullptr) {}
|
explicit padded_string() noexcept : viable_size(0), data_ptr(nullptr) {}
|
||||||
|
|
||||||
explicit padded_string(size_t length) noexcept
|
explicit padded_string(size_t length) noexcept
|
||||||
: viable_size(length), data_ptr(allocate_padded_buffer(length)) {
|
: viable_size(length), data_ptr(allocate_padded_buffer(length)) {
|
||||||
|
|
||||||
if (data_ptr != nullptr)
|
if (data_ptr != nullptr)
|
||||||
data_ptr[length] = '\0'; // easier when you need a c_str
|
data_ptr[length] = '\0'; // easier when you need a c_str
|
||||||
}
|
}
|
||||||
|
|
||||||
explicit padded_string(char *data, size_t length) noexcept
|
explicit padded_string(char *data, size_t length) noexcept
|
||||||
: viable_size(length), data_ptr(allocate_padded_buffer(length)) {
|
: viable_size(length), data_ptr(allocate_padded_buffer(length)) {
|
||||||
if (data_ptr != nullptr) {
|
if ((data != nullptr) and (data_ptr != nullptr)) {
|
||||||
memcpy(data_ptr, data, length);
|
memcpy(data_ptr, data, length);
|
||||||
data_ptr[length] = '\0'; // easier when you need a c_str
|
data_ptr[length] = '\0'; // easier when you need a c_str
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
padded_string(std::string s) noexcept
|
|
||||||
: viable_size(s.size()), data_ptr(allocate_padded_buffer(s.size())) {
|
// note: do not pass std::string arguments by value
|
||||||
|
padded_string(const std::string & str_ ) noexcept
|
||||||
|
: viable_size(str_.size()), data_ptr(allocate_padded_buffer(str_.size())) {
|
||||||
if (data_ptr != nullptr) {
|
if (data_ptr != nullptr) {
|
||||||
memcpy(data_ptr, s.data(), s.size());
|
memcpy(data_ptr, str_.data(), str_.size());
|
||||||
data_ptr[s.size()] = '\0'; // easier when you need a c_str
|
data_ptr[str_.size()] = '\0'; // easier when you need a c_str
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// note: do pass std::string_view arguments by value
|
||||||
|
padded_string(std::string_view sv_) noexcept
|
||||||
|
: viable_size(sv_.size()), data_ptr(allocate_padded_buffer(sv_.size())) {
|
||||||
|
if (data_ptr != nullptr) {
|
||||||
|
memcpy(data_ptr, sv_.data(), sv_.size());
|
||||||
|
data_ptr[sv_.size()] = '\0'; // easier when you need a c_str
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
padded_string(padded_string &&o) noexcept
|
padded_string(padded_string &&o) noexcept
|
||||||
: viable_size(o.viable_size), data_ptr(o.data_ptr) {
|
: viable_size(o.viable_size), data_ptr(o.data_ptr) {
|
||||||
o.data_ptr = nullptr; // we take ownership
|
o.data_ptr = nullptr; // we take ownership
|
||||||
|
@ -678,21 +727,25 @@ public:
|
||||||
o.viable_size = tmp_viable_size;
|
o.viable_size = tmp_viable_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
~padded_string() { aligned_free_char(data_ptr); }
|
~padded_string() {
|
||||||
|
aligned_free_char(data_ptr);
|
||||||
|
}
|
||||||
|
|
||||||
size_t size() const { return viable_size; }
|
size_t size() const { return viable_size; }
|
||||||
|
|
||||||
size_t length() const { return viable_size; }
|
size_t length() const { return viable_size; }
|
||||||
|
|
||||||
char *data() const { return data_ptr; }
|
char *data() const { return data_ptr; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
padded_string &operator=(const padded_string &o) = delete;
|
padded_string &operator=(const padded_string &o) = delete;
|
||||||
padded_string(const padded_string &o) = delete;
|
padded_string(const padded_string &o) = delete;
|
||||||
|
|
||||||
size_t viable_size;
|
size_t viable_size;
|
||||||
char *data_ptr;
|
char *data_ptr{nullptr};
|
||||||
};
|
|
||||||
|
}; // padded_string
|
||||||
|
|
||||||
} // namespace simdjson
|
} // namespace simdjson
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -766,6 +819,7 @@ static inline size_t json_minify(const padded_string &p, char *out) {
|
||||||
|
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
#include <memory>
|
||||||
|
|
||||||
#define JSON_VALUE_MASK 0xFFFFFFFFFFFFFF
|
#define JSON_VALUE_MASK 0xFFFFFFFFFFFFFF
|
||||||
|
|
||||||
|
@ -782,10 +836,14 @@ class ParsedJson {
|
||||||
public:
|
public:
|
||||||
// create a ParsedJson container with zero capacity, call allocate_capacity to
|
// create a ParsedJson container with zero capacity, call allocate_capacity to
|
||||||
// allocate memory
|
// allocate memory
|
||||||
ParsedJson();
|
ParsedJson()=default;
|
||||||
~ParsedJson();
|
~ParsedJson()=default;
|
||||||
ParsedJson(ParsedJson &&p);
|
|
||||||
ParsedJson &operator=(ParsedJson &&o);
|
// this is a move only class
|
||||||
|
ParsedJson(ParsedJson &&p) = default;
|
||||||
|
ParsedJson(const ParsedJson &p) = delete;
|
||||||
|
ParsedJson &operator=(ParsedJson &&o) = default;
|
||||||
|
ParsedJson &operator=(const ParsedJson &o) = delete;
|
||||||
|
|
||||||
// if needed, allocate memory so that the object is able to process JSON
|
// if needed, allocate memory so that the object is able to process JSON
|
||||||
// documents having up to len bytes and max_depth "depth"
|
// documents having up to len bytes and max_depth "depth"
|
||||||
|
@ -838,7 +896,8 @@ public:
|
||||||
|
|
||||||
really_inline void write_tape_s64(int64_t i) {
|
really_inline void write_tape_s64(int64_t i) {
|
||||||
write_tape(0, 'l');
|
write_tape(0, 'l');
|
||||||
tape[current_loc++] = *(reinterpret_cast<uint64_t *>(&i));
|
std::memcpy(&tape[current_loc], &i, sizeof(i));
|
||||||
|
++current_loc;
|
||||||
}
|
}
|
||||||
|
|
||||||
really_inline void write_tape_u64(uint64_t i) {
|
really_inline void write_tape_u64(uint64_t i) {
|
||||||
|
@ -874,27 +933,22 @@ public:
|
||||||
uint32_t current_loc{0};
|
uint32_t current_loc{0};
|
||||||
uint32_t n_structural_indexes{0};
|
uint32_t n_structural_indexes{0};
|
||||||
|
|
||||||
uint32_t *structural_indexes;
|
std::unique_ptr<uint32_t[]> structural_indexes;
|
||||||
|
|
||||||
|
std::unique_ptr<uint64_t[]> tape;
|
||||||
|
std::unique_ptr<uint32_t[]> containing_scope_offset;
|
||||||
|
|
||||||
uint64_t *tape;
|
|
||||||
uint32_t *containing_scope_offset;
|
|
||||||
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
#ifdef SIMDJSON_USE_COMPUTED_GOTO
|
||||||
void **ret_address;
|
std::unique_ptr<void*[]> ret_address;
|
||||||
#else
|
#else
|
||||||
char *ret_address;
|
std::unique_ptr<char[]> ret_address;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
uint8_t *string_buf; // should be at least byte_capacity
|
std::unique_ptr<uint8_t[]> string_buf;// should be at least byte_capacity
|
||||||
uint8_t *current_string_buf_loc;
|
uint8_t *current_string_buf_loc;
|
||||||
bool valid{false};
|
bool valid{false};
|
||||||
int error_code{simdjson::UNITIALIZED};
|
int error_code{simdjson::UNINITIALIZED};
|
||||||
|
|
||||||
private:
|
|
||||||
// we don't want the default constructor to be called
|
|
||||||
ParsedJson(const ParsedJson &p) =
|
|
||||||
delete; // we don't want the default constructor to be called
|
|
||||||
// we don't want the assignment to be called
|
|
||||||
ParsedJson &operator=(const ParsedJson &o) = delete;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// dump bits low to high
|
// dump bits low to high
|
||||||
|
@ -979,14 +1033,14 @@ public:
|
||||||
// within the string: get_string_length determines the true string length.
|
// within the string: get_string_length determines the true string length.
|
||||||
inline const char *get_string() const {
|
inline const char *get_string() const {
|
||||||
return reinterpret_cast<const char *>(
|
return reinterpret_cast<const char *>(
|
||||||
pj->string_buf + (current_val & JSON_VALUE_MASK) + sizeof(uint32_t));
|
pj->string_buf.get() + (current_val & JSON_VALUE_MASK) + sizeof(uint32_t));
|
||||||
}
|
}
|
||||||
|
|
||||||
// return the length of the string in bytes
|
// return the length of the string in bytes
|
||||||
inline uint32_t get_string_length() const {
|
inline uint32_t get_string_length() const {
|
||||||
uint32_t answer;
|
uint32_t answer;
|
||||||
memcpy(&answer,
|
memcpy(&answer,
|
||||||
reinterpret_cast<const char *>(pj->string_buf +
|
reinterpret_cast<const char *>(pj->string_buf.get() +
|
||||||
(current_val & JSON_VALUE_MASK)),
|
(current_val & JSON_VALUE_MASK)),
|
||||||
sizeof(uint32_t));
|
sizeof(uint32_t));
|
||||||
return answer;
|
return answer;
|
||||||
|
@ -1665,22 +1719,32 @@ bool ParsedJson::BasicIterator<max_depth>::relative_move_to(const char *pointer,
|
||||||
|
|
||||||
namespace simdjson {
|
namespace simdjson {
|
||||||
|
|
||||||
|
// Setting the streaming parameter to true allows the find_structural_bits to tolerate unclosed strings.
|
||||||
|
// The caller should still ensure that the input is valid UTF-8. If you are processing substrings,
|
||||||
|
// you may want to call on a function like trimmed_length_safe_utf8.
|
||||||
|
// A function like find_last_json_buf_idx may also prove useful.
|
||||||
template <Architecture T = Architecture::NATIVE>
|
template <Architecture T = Architecture::NATIVE>
|
||||||
int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &pj, bool streaming);
|
int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &pj, bool streaming);
|
||||||
|
|
||||||
|
// Setting the streaming parameter to true allows the find_structural_bits to tolerate unclosed strings.
|
||||||
|
// The caller should still ensure that the input is valid UTF-8. If you are processing substrings,
|
||||||
|
// you may want to call on a function like trimmed_length_safe_utf8.
|
||||||
|
// A function like find_last_json_buf_idx may also prove useful.
|
||||||
template <Architecture T = Architecture::NATIVE>
|
template <Architecture T = Architecture::NATIVE>
|
||||||
int find_structural_bits(const char *buf, size_t len, simdjson::ParsedJson &pj, bool streaming) {
|
int find_structural_bits(const char *buf, size_t len, simdjson::ParsedJson &pj, bool streaming) {
|
||||||
return find_structural_bits<T>((const uint8_t *)buf, len, pj, streaming);
|
return find_structural_bits<T>((const uint8_t *)buf, len, pj, streaming);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
template <Architecture T = Architecture::NATIVE>
|
template <Architecture T = Architecture::NATIVE>
|
||||||
int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &pj){
|
int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &pj) {
|
||||||
return find_structural_bits<T>((const uint8_t *)buf, len, pj, false);
|
return find_structural_bits<T>(buf, len, pj, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <Architecture T = Architecture::NATIVE>
|
template <Architecture T = Architecture::NATIVE>
|
||||||
int find_structural_bits(const char *buf, size_t len, simdjson::ParsedJson &pj) {
|
int find_structural_bits(const char *buf, size_t len, simdjson::ParsedJson &pj) {
|
||||||
return find_structural_bits<T>((const uint8_t *)buf, len, pj, false);
|
return find_structural_bits<T>((const uint8_t *)buf, len, pj);
|
||||||
}
|
}
|
||||||
|
|
||||||
}; // namespace simdjson
|
}; // namespace simdjson
|
||||||
|
@ -1701,7 +1765,8 @@ WARN_UNUSED int
|
||||||
unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj);
|
unified_machine(const uint8_t *buf, size_t len, ParsedJson &pj);
|
||||||
|
|
||||||
template <Architecture T = Architecture::NATIVE>
|
template <Architecture T = Architecture::NATIVE>
|
||||||
int unified_machine(const char *buf, size_t len, ParsedJson &pj) {
|
WARN_UNUSED int
|
||||||
|
unified_machine(const char *buf, size_t len, ParsedJson &pj) {
|
||||||
return unified_machine<T>(reinterpret_cast<const uint8_t *>(buf), len, pj);
|
return unified_machine<T>(reinterpret_cast<const uint8_t *>(buf), len, pj);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2003,6 +2068,8 @@ namespace simdjson {
|
||||||
* */
|
* */
|
||||||
JsonStream(const std::string &s, size_t batch_size = 1000000) : JsonStream(s.data(), s.size(), batch_size) {};
|
JsonStream(const std::string &s, size_t batch_size = 1000000) : JsonStream(s.data(), s.size(), batch_size) {};
|
||||||
|
|
||||||
|
~JsonStream();
|
||||||
|
|
||||||
/* Parse the next document found in the buffer previously given to JsonStream.
|
/* Parse the next document found in the buffer previously given to JsonStream.
|
||||||
|
|
||||||
* The content should be a valid JSON document encoded as UTF-8. If there is a
|
* The content should be a valid JSON document encoded as UTF-8. If there is a
|
||||||
|
@ -2034,12 +2101,14 @@ namespace simdjson {
|
||||||
/* Sets a new buffer for this JsonStream. Will also reinitialize all the variables,
|
/* Sets a new buffer for this JsonStream. Will also reinitialize all the variables,
|
||||||
* which acts as a reset. A new JsonStream without initializing again.
|
* which acts as a reset. A new JsonStream without initializing again.
|
||||||
* */
|
* */
|
||||||
void set_new_buffer(const char *buf, size_t len);
|
// todo: implement and test this function, note that _batch_size is mutable
|
||||||
|
// void set_new_buffer(const char *buf, size_t len);
|
||||||
|
|
||||||
/* Sets a new buffer for this JsonStream. Will also reinitialize all the variables,
|
/* Sets a new buffer for this JsonStream. Will also reinitialize all the variables,
|
||||||
* which is basically a reset. A new JsonStream without initializing again.
|
* which is basically a reset. A new JsonStream without initializing again.
|
||||||
* */
|
* */
|
||||||
void set_new_buffer(const std::string &s) { set_new_buffer(s.data(), s.size()); }
|
// todo: implement and test this function, note that _batch_size is mutable
|
||||||
|
// void set_new_buffer(const std::string &s) { set_new_buffer(s.data(), s.size()); }
|
||||||
|
|
||||||
/* Returns the location (index) of where the next document should be in the buffer.
|
/* Returns the location (index) of where the next document should be in the buffer.
|
||||||
* Can be used for debugging, it tells the user the position of the end of the last
|
* Can be used for debugging, it tells the user the position of the end of the last
|
||||||
|
@ -2059,43 +2128,89 @@ namespace simdjson {
|
||||||
size_t _len;
|
size_t _len;
|
||||||
size_t _batch_size;
|
size_t _batch_size;
|
||||||
size_t next_json{0};
|
size_t next_json{0};
|
||||||
bool error_on_last_attempt{false};
|
|
||||||
bool load_next_batch{true};
|
bool load_next_batch{true};
|
||||||
size_t current_buffer_loc{0};
|
size_t current_buffer_loc{0};
|
||||||
size_t last_json_buffer_loc{0};
|
size_t last_json_buffer_loc{0};
|
||||||
size_t n_parsed_docs{0};
|
size_t n_parsed_docs{0};
|
||||||
size_t n_bytes_parsed{0};
|
size_t n_bytes_parsed{0};
|
||||||
|
#ifdef SIMDJSON_THREADS_ENABLED
|
||||||
|
int stage1_is_ok_thread{0};
|
||||||
std::thread stage_1_thread;
|
std::thread stage_1_thread;
|
||||||
simdjson::ParsedJson pj_thread;
|
simdjson::ParsedJson pj_thread;
|
||||||
|
|
||||||
#ifdef SIMDJSON_THREADS_ENABLED
|
|
||||||
/* This algorithm is used to quickly identify the buffer position of
|
|
||||||
* the last JSON document inside the current batch.
|
|
||||||
*
|
|
||||||
* It does it's work by finding the last pair of structural characters
|
|
||||||
* that represent the end followed by the start of a document.
|
|
||||||
*
|
|
||||||
* Simply put, we iterate over the structural characters, starting from
|
|
||||||
* the end. We consider that we found the end of a JSON document when the
|
|
||||||
* first element of the pair is NOT one of these characters: '{' '[' ';' ','
|
|
||||||
* and when the second element is NOT one of these characters: '}' '}' ';' ','.
|
|
||||||
*
|
|
||||||
* This simple comparison works most of the time, but it does not cover cases
|
|
||||||
* where the batch's structural indexes contain a perfect amount of documents.
|
|
||||||
* In such a case, we do not have access to the structural index which follows
|
|
||||||
* the last document, therefore, we do not have access to the second element in
|
|
||||||
* the pair, and means that we cannot identify the last document. To fix this
|
|
||||||
* issue, we keep a count of the open and closed curly/square braces we found
|
|
||||||
* while searching for the pair. When we find a pair AND the count of open and
|
|
||||||
* closed curly/square braces is the same, we know that we just passed a complete
|
|
||||||
* document, therefore the last json buffer location is the end of the batch
|
|
||||||
* */
|
|
||||||
size_t find_last_json_buf_loc(const ParsedJson &pj);
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/* This algorithm is used to quickly identify the buffer position of
|
||||||
|
* the last JSON document inside the current batch.
|
||||||
|
*
|
||||||
|
* It does its work by finding the last pair of structural characters
|
||||||
|
* that represent the end followed by the start of a document.
|
||||||
|
*
|
||||||
|
* Simply put, we iterate over the structural characters, starting from
|
||||||
|
* the end. We consider that we found the end of a JSON document when the
|
||||||
|
* first element of the pair is NOT one of these characters: '{' '[' ':' ','
|
||||||
|
* and when the second element is NOT one of these characters: '}' ']' ':' ','.
|
||||||
|
*
|
||||||
|
* This simple comparison works most of the time, but it does not cover cases
|
||||||
|
* where the batch's structural indexes contain a perfect amount of documents.
|
||||||
|
* In such a case, we do not have access to the structural index which follows
|
||||||
|
* the last document, therefore, we do not have access to the second element in
|
||||||
|
* the pair, which means that we cannot identify the last document. To fix this
|
||||||
|
* issue, we keep a count of the open and closed curly/square braces we found
|
||||||
|
* while searching for the pair. When we find a pair AND the count of open and
|
||||||
|
* closed curly/square braces is the same, we know that we just passed a complete
|
||||||
|
* document, therefore the last json buffer location is the end of the batch
|
||||||
|
* */
|
||||||
|
inline size_t find_last_json_buf_idx(const char * buf, size_t size, const ParsedJson &pj) {
|
||||||
|
// this function can be generally useful
|
||||||
|
if(pj.n_structural_indexes == 0) return 0;
|
||||||
|
auto last_i = pj.n_structural_indexes - 1;
|
||||||
|
if (pj.structural_indexes[last_i] == size) {
|
||||||
|
if(last_i == 0) return 0;
|
||||||
|
last_i = pj.n_structural_indexes - 2;
|
||||||
|
}
|
||||||
|
auto arr_cnt = 0;
|
||||||
|
auto obj_cnt = 0;
|
||||||
|
for (auto i = last_i; i > 0; i--) {
|
||||||
|
auto idxb = pj.structural_indexes[i];
|
||||||
|
switch (buf[idxb]) {
|
||||||
|
case ':':
|
||||||
|
case ',':
|
||||||
|
continue;
|
||||||
|
case '}':
|
||||||
|
obj_cnt--;
|
||||||
|
continue;
|
||||||
|
case ']':
|
||||||
|
arr_cnt--;
|
||||||
|
continue;
|
||||||
|
case '{':
|
||||||
|
obj_cnt++;
|
||||||
|
break;
|
||||||
|
case '[':
|
||||||
|
arr_cnt++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
auto idxa = pj.structural_indexes[i - 1];
|
||||||
|
switch (buf[idxa]) {
|
||||||
|
case '{':
|
||||||
|
case '[':
|
||||||
|
case ':':
|
||||||
|
case ',':
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (!arr_cnt && !obj_cnt) {
|
||||||
|
return last_i+1;
|
||||||
|
}
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
#endif //SIMDJSON_JSONSTREAM_H
|
#endif //SIMDJSON_JSONSTREAM_H
|
||||||
/* end file include/simdjson/jsonstream.h */
|
/* end file include/simdjson/jsonstream.h */
|
||||||
|
|
|
@ -59,8 +59,14 @@ size_t json_minify(const unsigned char *bytes, size_t how_many,
|
||||||
}
|
}
|
||||||
} // namespace simdjson
|
} // namespace simdjson
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
//
|
||||||
|
// This fast code is disabled in the context of runtime dispatching.
|
||||||
|
// See issue https://github.com/lemire/simdjson/issues/384
|
||||||
|
//
|
||||||
#include "simdprune_tables.h"
|
#include "simdprune_tables.h"
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
#include <x86intrin.h> // currently, there is no runtime dispatch for the minifier
|
||||||
|
|
||||||
namespace simdjson {
|
namespace simdjson {
|
||||||
|
|
||||||
|
@ -363,18 +369,18 @@ size_t oldjson_minify(const uint8_t *buf, size_t len, uint8_t *out) {
|
||||||
int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF));
|
int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF));
|
||||||
int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
|
int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
|
||||||
int pop4 = hamming((~whitespace));
|
int pop4 = hamming((~whitespace));
|
||||||
__m256i vmask1 = _mm256_loadu2_m128i(
|
__m128i x1 = _mm256_extracti128_si256(input_lo, 0);
|
||||||
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask2 & 0x7FFF),
|
__m128i x2 = _mm256_extracti128_si256(input_lo, 1);
|
||||||
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask1 & 0x7FFF));
|
__m128i x3 = _mm256_extracti128_si256(input_hi, 0);
|
||||||
__m256i vmask2 = _mm256_loadu2_m128i(
|
__m128i x4 = _mm256_extracti128_si256(input_hi, 1);
|
||||||
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask4 & 0x7FFF),
|
x1 = skinnycleanm128(x1, mask1);
|
||||||
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask3 & 0x7FFF));
|
x2 = skinnycleanm128(x2, mask2);
|
||||||
__m256i result1 = _mm256_shuffle_epi8(input_lo, vmask1);
|
x3 = skinnycleanm128(x3, mask3);
|
||||||
__m256i result2 = _mm256_shuffle_epi8(input_hi, vmask2);
|
x4 = skinnycleanm128(x4, mask4);
|
||||||
_mm256_storeu2_m128i(reinterpret_cast<__m128i *>(out + pop1),
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(out), x1);
|
||||||
reinterpret_cast<__m128i *>(out), result1);
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop1), x2);
|
||||||
_mm256_storeu2_m128i(reinterpret_cast<__m128i *>(out + pop3),
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop2), x3);
|
||||||
reinterpret_cast<__m128i *>(out + pop2), result2);
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop3), x4);
|
||||||
out += pop4;
|
out += pop4;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -447,23 +453,24 @@ size_t oldjson_minify(const uint8_t *buf, size_t len, uint8_t *out) {
|
||||||
int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF));
|
int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF));
|
||||||
int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
|
int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
|
||||||
int pop4 = hamming((~whitespace));
|
int pop4 = hamming((~whitespace));
|
||||||
__m256i vmask1 = _mm256_loadu2_m128i(
|
__m128i x1 = _mm256_extracti128_si256(input_lo, 0);
|
||||||
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask2 & 0x7FFF),
|
__m128i x2 = _mm256_extracti128_si256(input_lo, 1);
|
||||||
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask1 & 0x7FFF));
|
__m128i x3 = _mm256_extracti128_si256(input_hi, 0);
|
||||||
__m256i vmask2 = _mm256_loadu2_m128i(
|
__m128i x4 = _mm256_extracti128_si256(input_hi, 1);
|
||||||
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask4 & 0x7FFF),
|
x1 = skinnycleanm128(x1, mask1);
|
||||||
reinterpret_cast<const __m128i *>(mask128_epi8) + (mask3 & 0x7FFF));
|
x2 = skinnycleanm128(x2, mask2);
|
||||||
__m256i result1 = _mm256_shuffle_epi8(input_lo, vmask1);
|
x3 = skinnycleanm128(x3, mask3);
|
||||||
__m256i result2 = _mm256_shuffle_epi8(input_hi, vmask2);
|
x4 = skinnycleanm128(x4, mask4);
|
||||||
_mm256_storeu2_m128i(reinterpret_cast<__m128i *>(buffer + pop1),
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(buffer), x1);
|
||||||
reinterpret_cast<__m128i *>(buffer), result1);
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + pop1), x2);
|
||||||
_mm256_storeu2_m128i(reinterpret_cast<__m128i *>(buffer + pop3),
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + pop2), x3);
|
||||||
reinterpret_cast<__m128i *>(buffer + pop2), result2);
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + pop3), x4);
|
||||||
memcpy(out, buffer, pop4);
|
memcpy(out, buffer, pop4);
|
||||||
out += pop4;
|
out += pop4;
|
||||||
}
|
}
|
||||||
*out = '\0'; // NULL termination
|
*out = '\0'; // NULL termination
|
||||||
return out - initout;
|
return out - initout;
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace simdjson
|
} // namespace simdjson
|
||||||
#endif
|
#endif
|
||||||
|
|
Loading…
Reference in New Issue