Exposing the string minifier.
This commit is contained in:
parent
b4837f2e2f
commit
a6e4933d93
|
@ -126,8 +126,7 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
char *mini_buffer = simdjson::internal::allocate_padded_buffer(p.size() + 1);
|
||||
size_t minisize;
|
||||
auto minierror = simdjson::active_implementation->minify((const uint8_t *)p.data(), p.size(),
|
||||
(uint8_t *)mini_buffer, minisize);
|
||||
auto minierror = minify_string(p.data(), p.size(),mini_buffer, minisize);
|
||||
if (!minierror) { std::cerr << minierror << std::endl; exit(1); }
|
||||
mini_buffer[minisize] = '\0';
|
||||
|
||||
|
|
|
@ -168,6 +168,24 @@ And another one:
|
|||
cout << "number: " << v << endl;
|
||||
```
|
||||
|
||||
Minifying JSON strings without parsing
|
||||
----------------------
|
||||
|
||||
In some cases, you may have valid JSON strings that you do not wish to parse but that you wish to minify. That is, you wish to remove all unnecessary whitespace. We have a fast function for this purpose (`minify_string`). This function neither validates nor parses your content; it assumes that your string is valid UTF-8. It is much faster than parsing the string and re-serializing it in minified form. Usage is relatively simple. You must pass an input pointer with a length parameter, as well as an output pointer and an output length parameter (by reference). The output length parameter is not read, but written to. The output pointer should point to a valid memory region that is slightly overallocated compared to the original string length: it must hold at least the input length plus `simdjson::SIMDJSON_PADDING` bytes. The input pointer and input length are read, but not written to.
|
||||
|
||||
```C++
|
||||
// Starts with a valid JSON document as a string.
|
||||
// It does not have to be null-terminated.
|
||||
const char * some_string = "[ 1, 2, 3, 4] ";
|
||||
size_t length = strlen(some_string);
|
||||
// Create a buffer to receive the minified string. Make sure that there is enough room,
|
||||
// including some padding (simdjson::SIMDJSON_PADDING).
|
||||
std::unique_ptr<char> buffer{new(std::nothrow) char[length + simdjson::SIMDJSON_PADDING]};
|
||||
size_t new_length{}; // It will receive the minified length.
|
||||
auto error = simdjson::minify_string(some_string, length, buffer.get(), new_length);
|
||||
// The buffer variable now has "[1,2,3,4]" and new_length has value 9.
|
||||
```
|
||||
|
||||
|
||||
C++17 Support
|
||||
-------------
|
||||
|
|
|
@ -72,11 +72,11 @@ public:
|
|||
/**
|
||||
* @private For internal implementation use
|
||||
*
|
||||
* Run a full document parse (ensure_capacity, stage1 and stage2).
|
||||
* Minify the input string, assuming that it represents a JSON document; it does not parse or validate the input.
|
||||
*
|
||||
* Overridden by each implementation.
|
||||
*
|
||||
* @param buf the json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes.
|
||||
* @param buf the json document to minify.
|
||||
* @param len the length of the json document.
|
||||
* @param dst the buffer to write the minified document to. *MUST* be allocated up to len + SIMDJSON_PADDING bytes.
|
||||
* @param dst_len the number of bytes written. Output only.
|
||||
|
|
|
@ -9,6 +9,24 @@
|
|||
|
||||
namespace simdjson {
|
||||
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* Minify the input string, assuming that it represents a JSON document; it does not parse or validate the input.
|
||||
* This function is much faster than parsing a JSON string and then writing a minified version of it.
|
||||
* However, it does not validate the input.
|
||||
*
|
||||
*
|
||||
* @param buf the json document to minify.
|
||||
* @param len the length of the json document.
|
||||
* @param dst the buffer to write the minified document to. *MUST* be allocated up to len + SIMDJSON_PADDING bytes.
|
||||
* @param dst_len the number of bytes written. Output only.
|
||||
* @return the error code, or SUCCESS if there was no error.
|
||||
*/
|
||||
WARN_UNUSED error_code minify_string(const char *buf, size_t len, char *dst, size_t &dst_len) noexcept;
|
||||
|
||||
|
||||
/**
|
||||
* Minifies a JSON element or document, printing the smallest possible valid JSON.
|
||||
*
|
||||
|
|
|
@ -134,4 +134,9 @@ const implementation *detect_best_supported_implementation_on_first_use::set_bes
|
|||
SIMDJSON_DLLIMPORTEXPORT const internal::available_implementation_list available_implementations{};
|
||||
SIMDJSON_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> active_implementation{&internal::detect_best_supported_implementation_on_first_use_singleton};
|
||||
|
||||
/**
 * Minify `len` bytes starting at `buf` into `dst` by forwarding the request to
 * the currently active implementation.
 *
 * @param buf the input bytes; forwarded as a const uint8_t pointer.
 * @param len the number of input bytes.
 * @param dst the output buffer; forwarded as a uint8_t pointer.
 * @param dst_len forwarded by reference to the implementation's minify().
 * @return the error code returned by the active implementation's minify().
 */
WARN_UNUSED error_code minify_string(const char *buf, size_t len, char *dst, size_t &dst_len) noexcept {
  // The implementation interface works on uint8_t, so reinterpret the char pointers.
  const uint8_t *input_bytes = reinterpret_cast<const uint8_t *>(buf);
  uint8_t *output_bytes = reinterpret_cast<uint8_t *>(dst);
  return active_implementation->minify(input_bytes, len, output_bytes, dst_len);
}
|
||||
|
||||
|
||||
} // namespace simdjson
|
||||
|
|
|
@ -1712,6 +1712,62 @@ namespace type_tests {
|
|||
|
||||
}
|
||||
|
||||
|
||||
|
||||
namespace minify_string_tests {
|
||||
|
||||
bool check_minification(const char * input, size_t length, const char * expected, size_t expected_length) {
|
||||
std::unique_ptr<char> buffer{new(std::nothrow) char[length + simdjson::SIMDJSON_PADDING]};
|
||||
if(buffer.get() == nullptr) {
|
||||
std::cerr << "cannot alloc " << std::endl;
|
||||
return false;
|
||||
}
|
||||
size_t newlength{};
|
||||
auto error = simdjson::minify_string(input, length, buffer.get(), newlength);
|
||||
if(error != simdjson::SUCCESS) {
|
||||
std::cerr << "error " << error << std::endl;
|
||||
return false;
|
||||
}
|
||||
// memcmp
|
||||
if(newlength != expected_length) {
|
||||
std::cerr << "lengths do not match " << std::endl;
|
||||
return false;
|
||||
}
|
||||
for(size_t i = 0; i < newlength; i++) {
|
||||
if(buffer.get()[i] != expected[i]) {
|
||||
std::cerr << "Inputs do not match (but same length) " << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool test_minify_string() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
const std::string test = R"({ "foo" : 1, "bar" : [ 1, 2, 3 ], "baz": { "a": 1, "b": 2, "c": 3 } })";
|
||||
const std::string minified(R"({"foo":1,"bar":[1,2,3],"baz":{"a":1,"b":2,"c":3}})");
|
||||
return check_minification(test.c_str(), test.size(), minified.c_str(), minified.size());
|
||||
}
|
||||
bool test_minify_string_array() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
std::string test("[ 1, 2, 3]");
|
||||
std::string minified("[1,2,3]");
|
||||
return check_minification(test.c_str(), test.size(), minified.c_str(), minified.size());
|
||||
}
|
||||
bool test_minify_string_object() {
|
||||
std::cout << "Running " << __func__ << std::endl;
|
||||
std::string test(R"({ "foo " : 1, "b ar" : [ 1, 2, 3 ], "baz": { "a": 1, "b": 2, "c": 3 } })");
|
||||
std::string minified(R"({"foo ":1,"b ar":[1,2,3],"baz":{"a":1,"b":2,"c":3}})");
|
||||
return check_minification(test.c_str(), test.size(), minified.c_str(), minified.size());
|
||||
}
|
||||
bool run() {
|
||||
return test_minify_string() &&
|
||||
test_minify_string_array() &&
|
||||
test_minify_string_object();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
namespace format_tests {
|
||||
using namespace simdjson;
|
||||
using namespace simdjson::dom;
|
||||
|
@ -1968,7 +2024,8 @@ int main(int argc, char *argv[]) {
|
|||
printf("unsupported CPU\n");
|
||||
}
|
||||
std::cout << "Running basic tests." << std::endl;
|
||||
if (parse_api_tests::run() &&
|
||||
if (minify_string_tests::run() &&
|
||||
parse_api_tests::run() &&
|
||||
dom_api_tests::run() &&
|
||||
type_tests::run() &&
|
||||
format_tests::run() &&
|
||||
|
|
|
@ -239,10 +239,36 @@ void performance_3() {
|
|||
SIMDJSON_POP_DISABLE_WARNINGS
|
||||
#endif
|
||||
|
||||
void minify_string() {
|
||||
const char * some_string = "[ 1, 2, 3, 4] ";
|
||||
size_t length = strlen(some_string);
|
||||
std::unique_ptr<char> buffer{new(std::nothrow) char[length + simdjson::SIMDJSON_PADDING]};
|
||||
size_t new_length{};
|
||||
auto error = simdjson::minify_string(some_string, length, buffer.get(), new_length);
|
||||
if(error != simdjson::SUCCESS) {
|
||||
std::cerr << "error " << error << std::endl;
|
||||
abort();
|
||||
} else {
|
||||
const char * expected_string = "[1,2,3,4]";
|
||||
size_t expected_length = strlen(expected_string);
|
||||
if(expected_length != new_length) {
|
||||
std::cerr << "mismatched length (error) " << std::endl;
|
||||
abort();
|
||||
}
|
||||
for(size_t i = 0; i < new_length; i++) {
|
||||
if(expected_string[i] != buffer.get()[i]) {
|
||||
std::cerr << "mismatched content (error) " << std::endl;
|
||||
abort();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Entry point: calls each example function once, in order, then returns 0.
 */
int main() {
  // DOM examples.
  basics_dom_1();
  basics_dom_2();
  basics_dom_3();
  basics_dom_4();

  // String minification example.
  minify_string();

  return 0;
}
|
||||
|
|
Loading…
Reference in New Issue