diff --git a/benchmark/minifiercompetition.cpp b/benchmark/minifiercompetition.cpp index c2b178dd..6028c0d4 100644 --- a/benchmark/minifiercompetition.cpp +++ b/benchmark/minifiercompetition.cpp @@ -126,8 +126,7 @@ int main(int argc, char *argv[]) { char *mini_buffer = simdjson::internal::allocate_padded_buffer(p.size() + 1); size_t minisize; - auto minierror = simdjson::active_implementation->minify((const uint8_t *)p.data(), p.size(), - (uint8_t *)mini_buffer, minisize); + auto minierror = minify_string(p.data(), p.size(),mini_buffer, minisize); if (!minierror) { std::cerr << minierror << std::endl; exit(1); } mini_buffer[minisize] = '\0'; diff --git a/doc/basics.md b/doc/basics.md index 33bf377b..b67f7e1f 100644 --- a/doc/basics.md +++ b/doc/basics.md @@ -168,6 +168,24 @@ And another one: cout << "number: " << v << endl; ``` +Minifying JSON strings without parsing +---------------------- + +In some cases, you may have valid JSON strings that you do not wish to parse but that you wish to minify. That is, you wish to remove all unnecessary spaces. We have a fast function for this purpose (`minify_string`). This function does not validate your content, and it does not parse it. Instead, it assumes that your string is valid UTF-8. It is much faster than parsing the string and re-serializing it in minified form. Usage is relatively simple. You must pass an input pointer with a length parameter, as well as an output pointer and an output length parameter (by reference). The output length parameter is never read, only written to. The output pointer must point to a valid memory region that is over-allocated by `simdjson::SIMDJSON_PADDING` bytes compared to the original string length (that is, at least `length + simdjson::SIMDJSON_PADDING` bytes). The input pointer and input length are read, but not written to. + +```C++ + // Start with a valid JSON document as a string. + // It does not have to be null-terminated.
+ const char * some_string = "[ 1, 2, 3, 4] "; + size_t length = strlen(some_string); + // Create a buffer to receive the minified string. Make sure that there is enough room, + // including some padding (simdjson::SIMDJSON_PADDING). + std::unique_ptr<char[]> buffer{new(std::nothrow) char[length + simdjson::SIMDJSON_PADDING]}; + size_t new_length{}; // It will receive the minified length. + auto error = simdjson::minify_string(some_string, length, buffer.get(), new_length); + // The buffer variable now has "[1,2,3,4]" and new_length has value 9. +``` + C++17 Support ------------- diff --git a/include/simdjson/implementation.h b/include/simdjson/implementation.h index ee08252d..c828dc97 100644 --- a/include/simdjson/implementation.h +++ b/include/simdjson/implementation.h @@ -72,11 +72,11 @@ public: /** * @private For internal implementation use * - * Run a full document parse (ensure_capacity, stage1 and stage2). + * Minify the input string, assuming that it represents a JSON document. The input is neither parsed nor validated. * * Overridden by each implementation. * - * @param buf the json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes. + * @param buf the json document to minify. * @param len the length of the json document. * @param dst the buffer to write the minified document to. *MUST* be allocated up to len + SIMDJSON_PADDING bytes. * @param dst_len the number of bytes written. Output only. diff --git a/include/simdjson/minify.h b/include/simdjson/minify.h index 55fa698a..7392e979 100644 --- a/include/simdjson/minify.h +++ b/include/simdjson/minify.h @@ -9,6 +9,24 @@ namespace simdjson { + + +/** + * + * Minify the input string, assuming that it represents a JSON document. The input is neither parsed nor validated. + * This function is much faster than parsing a JSON string and then writing a minified version of it. + * However, it does not validate the input. + * + * + * @param buf the json document to minify. + * @param len the length of the json document.
+ * @param dst the buffer to write the minified document to. *MUST* be allocated up to len + SIMDJSON_PADDING bytes. + * @param dst_len the number of bytes written. Output only. + * @return the error code, or SUCCESS if there was no error. + */ +WARN_UNUSED error_code minify_string(const char *buf, size_t len, char *dst, size_t &dst_len) noexcept; + + /** * Minifies a JSON element or document, printing the smallest possible valid JSON. * diff --git a/src/implementation.cpp b/src/implementation.cpp index 5bd1b133..eb457481 100644 --- a/src/implementation.cpp +++ b/src/implementation.cpp @@ -134,4 +134,9 @@ const implementation *detect_best_supported_implementation_on_first_use::set_bes SIMDJSON_DLLIMPORTEXPORT const internal::available_implementation_list available_implementations{}; SIMDJSON_DLLIMPORTEXPORT internal::atomic_ptr active_implementation{&internal::detect_best_supported_implementation_on_first_use_singleton}; +WARN_UNUSED error_code minify_string(const char *buf, size_t len, char *dst, size_t &dst_len) noexcept { + return active_implementation->minify((const uint8_t *)buf, len, (uint8_t *)dst, dst_len); +} + + } // namespace simdjson diff --git a/tests/basictests.cpp b/tests/basictests.cpp index 537ead0d..c04467b9 100644 --- a/tests/basictests.cpp +++ b/tests/basictests.cpp @@ -1712,6 +1712,62 @@ namespace type_tests { } + + +namespace minify_string_tests { + + bool check_minification(const char * input, size_t length, const char * expected, size_t expected_length) { + std::unique_ptr<char[]> buffer{new(std::nothrow) char[length + simdjson::SIMDJSON_PADDING]}; + if(buffer.get() == nullptr) { + std::cerr << "cannot alloc " << std::endl; + return false; + } + size_t newlength{}; + auto error = simdjson::minify_string(input, length, buffer.get(), newlength); + if(error != simdjson::SUCCESS) { + std::cerr << "error " << error << std::endl; + return false; + } + // memcmp + if(newlength != expected_length) { + std::cerr << "lengths do not match " << std::endl; +
return false; + } + for(size_t i = 0; i < newlength; i++) { + if(buffer.get()[i] != expected[i]) { + std::cerr << "Inputs do not match (but same length) " << std::endl; + return false; + } + } + return true; + } + + bool test_minify_string() { + std::cout << "Running " << __func__ << std::endl; + const std::string test = R"({ "foo" : 1, "bar" : [ 1, 2, 3 ], "baz": { "a": 1, "b": 2, "c": 3 } })"; + const std::string minified(R"({"foo":1,"bar":[1,2,3],"baz":{"a":1,"b":2,"c":3}})"); + return check_minification(test.c_str(), test.size(), minified.c_str(), minified.size()); + } + bool test_minify_string_array() { + std::cout << "Running " << __func__ << std::endl; + std::string test("[ 1, 2, 3]"); + std::string minified("[1,2,3]"); + return check_minification(test.c_str(), test.size(), minified.c_str(), minified.size()); + } + bool test_minify_string_object() { + std::cout << "Running " << __func__ << std::endl; + std::string test(R"({ "foo " : 1, "b ar" : [ 1, 2, 3 ], "baz": { "a": 1, "b": 2, "c": 3 } })"); + std::string minified(R"({"foo ":1,"b ar":[1,2,3],"baz":{"a":1,"b":2,"c":3}})"); + return check_minification(test.c_str(), test.size(), minified.c_str(), minified.size()); + } + bool run() { + return test_minify_string() && + test_minify_string_array() && + test_minify_string_object(); + } +} + + namespace format_tests { using namespace simdjson; using namespace simdjson::dom; @@ -1968,7 +2024,8 @@ int main(int argc, char *argv[]) { printf("unsupported CPU\n"); } std::cout << "Running basic tests." 
<< std::endl; - if (parse_api_tests::run() && + if (minify_string_tests::run() && + parse_api_tests::run() && dom_api_tests::run() && type_tests::run() && format_tests::run() && diff --git a/tests/readme_examples.cpp b/tests/readme_examples.cpp index 5418d54d..e61c984b 100644 --- a/tests/readme_examples.cpp +++ b/tests/readme_examples.cpp @@ -239,10 +239,36 @@ void performance_3() { SIMDJSON_POP_DISABLE_WARNINGS #endif +void minify_string() { + const char * some_string = "[ 1, 2, 3, 4] "; + size_t length = strlen(some_string); + std::unique_ptr<char[]> buffer{new(std::nothrow) char[length + simdjson::SIMDJSON_PADDING]}; + size_t new_length{}; + auto error = simdjson::minify_string(some_string, length, buffer.get(), new_length); + if(error != simdjson::SUCCESS) { + std::cerr << "error " << error << std::endl; + abort(); + } else { + const char * expected_string = "[1,2,3,4]"; + size_t expected_length = strlen(expected_string); + if(expected_length != new_length) { + std::cerr << "mismatched length (error) " << std::endl; + abort(); + } + for(size_t i = 0; i < new_length; i++) { + if(expected_string[i] != buffer.get()[i]) { + std::cerr << "mismatched content (error) " << std::endl; + abort(); + } + } + } +} + int main() { basics_dom_1(); basics_dom_2(); basics_dom_3(); basics_dom_4(); + minify_string(); return 0; }