Faster and more correct serialization (#1168)

* Adding new files.

* Better.

* Fixing minifier and adding tests.

* Adding benchmarks.

* Including the array header.

* Replacing old stream-based code by the new code.

* Doubling up the itoa.

* Hidden away to_chars in internal namespace.

* Removing the repetitions.

* Documented the itoa functions.

* Tuning the escape sequences.

* Moving the operators off the main namespace.

* Added more tests.

* Tweaking the implementation so that it works with and without exp.

* The string_builder template and mini_formatter class
 are not part of our public API and are subject to change
 at any time!

* Adding a benchmark and some optimization.

* Cleaning.

* Strictly speaking, this header is needed.
This commit is contained in:
Daniel Lemire 2020-09-23 10:00:39 -04:00 committed by GitHub
parent f410213003
commit 60c139a844
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
21 changed files with 1986 additions and 384 deletions

View File

@ -63,16 +63,136 @@ static void serialize_twitter(State& state) {
bytes += serial.size();
benchmark::DoNotOptimize(serial);
}
// we validate the result
{
auto serial = simdjson::minify(doc);
dom::element doc2; // we parse the minified output
if ((error = parser.parse(serial).get(doc2))) { throw std::runtime_error("serialization error"); }
auto serial2 = simdjson::minify(doc2); // we minify a second time
if(serial != serial2) { throw std::runtime_error("serialization mismatch"); }
}
// Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
state.counters["Gigabytes"] = benchmark::Counter(
double(bytes), benchmark::Counter::kIsRate,
benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024
state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate);
}
BENCHMARK(serialize_twitter)->Repetitions(10)->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
BENCHMARK(serialize_twitter)->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
return *(std::max_element(std::begin(v), std::end(v)));
})->DisplayAggregatesOnly(true);
/**
 * Benchmark: serialize, with simdjson::to_string, a document made of a single
 * long JSON string (100000 digit characters between double quotes).
 *
 * Two rate counters are published: "Gigabytes" (total serialized bytes,
 * 1000-based units) and "docs" (number of iterations).
 *
 * @param state the benchmark state driving the measurement loop.
 */
static void serialize_big_string_to_string(State& state) {
  dom::parser parser;
  std::vector<char> content;
  content.push_back('\"');
  for(size_t i = 0 ; i < 100000; i ++) {
    // we add what looks like a long list of digits; the explicit cast avoids an
    // implicit int-to-char narrowing in push_back
    content.push_back(char('0' + (i % 10)));
  }
  content.push_back('\"');
  dom::element doc;
  simdjson::error_code error;
  if ((error = parser.parse(content.data(), content.size()).get(doc))) {
    cerr << "could not parse big string" << error << endl;
    // Flag the run as failed before returning: leaving the function without
    // ever entering the measurement loop must be reported to the library.
    state.SkipWithError("could not parse big string");
    return;
  }
  size_t bytes = 0;
  for (SIMDJSON_UNUSED auto _ : state) {
    auto serial = simdjson::to_string(doc);
    bytes += serial.size();
    benchmark::DoNotOptimize(serial);
  }
  // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
  state.counters["Gigabytes"] = benchmark::Counter(
      double(bytes), benchmark::Counter::kIsRate,
      benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024
  state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate);
}
// Registers the benchmark with an extra "max" statistic (the largest of the
// values handed to the statistic callback) and shows aggregate rows only.
BENCHMARK(serialize_big_string_to_string)
  ->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
    return *std::max_element(v.begin(), v.end());
  })
  ->DisplayAggregatesOnly(true);
/**
 * Benchmark: serialize the parsed twitter.json document with simdjson::to_string.
 *
 * After the measurement loop, the output is validated: it is parsed again and
 * serialized a second time, and both serializations must be identical.
 *
 * Two rate counters are published: "Gigabytes" (total serialized bytes,
 * 1000-based units) and "docs" (number of iterations).
 *
 * @param state the benchmark state driving the measurement loop.
 * @throw std::runtime_error if the validation step fails.
 */
static void serialize_twitter_to_string(State& state) {
  dom::parser parser;
  padded_string docdata;
  auto error = padded_string::load(TWITTER_JSON).get(docdata);
  if(error) {
    // This step loads the file; it does not parse it yet.
    cerr << "could not load twitter.json" << error << endl;
    // Flag the run as failed before returning without running the loop.
    state.SkipWithError("could not load twitter.json");
    return;
  }
  // we do not want mem. alloc. in the loop.
  if((error = parser.allocate(docdata.size()))) {
    cerr << error << endl;
    state.SkipWithError("could not allocate the parser");
    return;
  }
  dom::element doc;
  if ((error = parser.parse(docdata).get(doc))) {
    cerr << "could not parse twitter.json" << error << endl;
    state.SkipWithError("could not parse twitter.json");
    return;
  }
  size_t bytes = 0;
  for (SIMDJSON_UNUSED auto _ : state) {
    auto serial = simdjson::to_string(doc);
    bytes += serial.size();
    benchmark::DoNotOptimize(serial);
  }
  // we validate the result
  {
    auto serial = simdjson::to_string(doc);
    dom::element doc2; // we parse the stringify output
    if ((error = parser.parse(serial).get(doc2))) { throw std::runtime_error("serialization error"); }
    auto serial2 = simdjson::to_string(doc2); // we stringify again
    if(serial != serial2) { throw std::runtime_error("serialization mismatch"); }
  }
  // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
  state.counters["Gigabytes"] = benchmark::Counter(
      double(bytes), benchmark::Counter::kIsRate,
      benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024
  state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate);
}
// Registers the benchmark with an extra "max" statistic (the largest of the
// values handed to the statistic callback) and shows aggregate rows only.
BENCHMARK(serialize_twitter_to_string)
  ->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
    return *std::max_element(v.begin(), v.end());
  })
  ->DisplayAggregatesOnly(true);
/**
 * Benchmark: serialize the parsed twitter.json document with a reused
 * simdjson::internal::string_builder (cleared at the start of each iteration).
 *
 * Two rate counters are published: "Gigabytes" (total serialized bytes,
 * 1000-based units) and "docs" (number of iterations).
 *
 * @param state the benchmark state driving the measurement loop.
 */
static void serialize_twitter_string_builder(State& state) {
  dom::parser parser;
  padded_string docdata;
  auto error = padded_string::load(TWITTER_JSON).get(docdata);
  if(error) {
    // This step loads the file; it does not parse it yet.
    cerr << "could not load twitter.json" << error << endl;
    // Flag the run as failed before returning without running the loop.
    state.SkipWithError("could not load twitter.json");
    return;
  }
  // we do not want mem. alloc. in the loop.
  if((error = parser.allocate(docdata.size()))) {
    cerr << error << endl;
    state.SkipWithError("could not allocate the parser");
    return;
  }
  dom::element doc;
  if ((error = parser.parse(docdata).get(doc))) {
    cerr << "could not parse twitter.json" << error << endl;
    state.SkipWithError("could not parse twitter.json");
    return;
  }
  size_t bytes = 0;
  simdjson::internal::string_builder<> sb;// not part of our public API, for internal use
  for (SIMDJSON_UNUSED auto _ : state) {
    sb.clear();
    sb.append(doc);
    std::string_view serial = sb.str();
    bytes += serial.size();
    benchmark::DoNotOptimize(serial);
  }
  // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
  state.counters["Gigabytes"] = benchmark::Counter(
      double(bytes), benchmark::Counter::kIsRate,
      benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024
  state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate);
}
// Registers the benchmark with an extra "max" statistic (the largest of the
// values handed to the statistic callback) and shows aggregate rows only.
BENCHMARK(serialize_twitter_string_builder)
  ->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
    return *std::max_element(v.begin(), v.end());
  })
  ->DisplayAggregatesOnly(true);
static void numbers_scan(State& state) {
// Prints the number of results in twitter.json
dom::parser parser;

View File

@ -43,6 +43,7 @@ SIMDJSON_DISABLE_UNDESIRED_WARNINGS
// Public API
#include "simdjson/simdjson_version.h"
#include "simdjson/error.h"
#include "simdjson/minify.h"
#include "simdjson/padded_string.h"
#include "simdjson/implementation.h"
#include "simdjson/dom/array.h"
@ -51,6 +52,7 @@ SIMDJSON_DISABLE_UNDESIRED_WARNINGS
#include "simdjson/dom/element.h"
#include "simdjson/dom/object.h"
#include "simdjson/dom/parser.h"
#include "simdjson/dom/serialization.h"
// Deprecated API
#include "simdjson/dom/jsonparser.h"
@ -68,6 +70,7 @@ SIMDJSON_DISABLE_UNDESIRED_WARNINGS
#include "simdjson/dom/parsedjson_iterator-inl.h"
#include "simdjson/dom/parser-inl.h"
#include "simdjson/internal/tape_ref-inl.h"
#include "simdjson/dom/serialization-inl.h"
SIMDJSON_POP_DISABLE_WARNINGS

View File

@ -6,6 +6,15 @@
namespace simdjson {
namespace internal {
/**
* @private
* Our own implementation of the C++17 to_chars function, for double values.
* Defined in src/to_chars
*
* @param first start of the output buffer; the caller must provide the storage.
* @param last nominal end of the output buffer.
*        NOTE(review): the serializer in simdjson/dom/serialization-inl.h passes
*        nullptr here and states that the implementation does not check this
*        argument -- confirm before relying on it for bounds checking.
* @param value the number to convert to text.
* @return a pointer used by the serializer as the end of the produced text
*         (one past the last character written).
*/
char *to_chars(char *first, const char *last, double value);
}
#ifndef SIMDJSON_EXCEPTIONS
#if __cpp_exceptions
#define SIMDJSON_EXCEPTIONS 1

View File

@ -144,39 +144,9 @@ inline bool array::iterator::operator>=(const array::iterator& other) const noex
// Iterators are ordered by their tape position: this one is "greater" when its
// json_index is strictly larger than the other's.
inline bool array::iterator::operator>(const array::iterator& other) const noexcept {
return tape.json_index > other.tape.json_index;
}
// Streams the array to `out` by delegating to the minify<array> wrapper.
// Returns the stream so that insertions can be chained.
inline std::ostream& operator<<(std::ostream& out, const array &value) {
return out << minify<array>(value);
}
} // namespace dom
/**
 * Writes the wrapped array to the stream as '[', the elements separated by
 * "," (each element printed through minify<dom::element>), then ']'.
 * Returns the stream.
 */
template<>
inline std::ostream& minifier<dom::array>::print(std::ostream& out) {
  out << '[';
  auto current = value.begin();
  auto stop = value.end();
  bool needs_comma = false;
  while (current != stop) {
    // every element but the first is preceded by a separator
    if (needs_comma) { out << ","; }
    out << minify<dom::element>(*current);
    needs_comma = true;
    ++current;
  }
  return out << ']';
}
#if SIMDJSON_EXCEPTIONS
// Only available when exceptions are enabled: printing a result that carries
// an error throws simdjson_error; otherwise the contained array is printed.
template<>
inline std::ostream& minifier<simdjson_result<dom::array>>::print(std::ostream& out) {
if (value.error()) { throw simdjson_error(value.error()); }
return out << minify<dom::array>(value.first);
}
// Streams a simdjson_result<dom::array>; may throw (see print above).
inline std::ostream& operator<<(std::ostream& out, const simdjson_result<dom::array> &value) noexcept(false) {
return out << minify<simdjson_result<dom::array>>(value);
}
#endif
} // namespace simdjson

View File

@ -4,10 +4,13 @@
#include "simdjson/common_defs.h"
#include "simdjson/error.h"
#include "simdjson/internal/tape_ref.h"
#include "simdjson/minify.h"
#include <ostream>
namespace simdjson {
namespace internal {
template<typename T>
class string_builder;
}
namespace dom {
class document;
@ -125,19 +128,9 @@ private:
friend class element;
friend struct simdjson_result<element>;
template<typename T>
friend class simdjson::minifier;
friend class simdjson::internal::string_builder;
};
/**
* Print JSON to an output stream.
*
* By default, the value will be printed minified.
*
* @param out The output stream.
* @param value The value to print.
* @throw if there is an error with the underlying output stream. simdjson itself will not throw.
*/
inline std::ostream& operator<<(std::ostream& out, const array &value);
} // namespace dom
@ -159,20 +152,7 @@ public:
#endif // SIMDJSON_EXCEPTIONS
};
#if SIMDJSON_EXCEPTIONS
/**
* Print JSON to an output stream.
*
* By default, the value will be printed minified.
*
* @param out The output stream.
* @param value The value to print.
* @throw simdjson_error if the result being printed has an error. If there is an error with the
* underlying output stream, that error will be propagated (simdjson_error will not be
* thrown).
*/
inline std::ostream& operator<<(std::ostream& out, const simdjson_result<dom::array> &value) noexcept(false);
#endif
} // namespace simdjson

View File

@ -2,7 +2,6 @@
#define SIMDJSON_DOM_DOCUMENT_H
#include "simdjson/common_defs.h"
#include "simdjson/minify.h"
#include <memory>
#include <ostream>
@ -67,8 +66,6 @@ public:
private:
inline error_code allocate(size_t len) noexcept;
template<typename T>
friend class simdjson::minifier;
friend class parser;
}; // class document

View File

@ -387,9 +387,6 @@ inline bool element::dump_raw_tape(std::ostream &out) const noexcept {
return tape.doc->dump_raw_tape(out);
}
// Streams the element to `out` by delegating to the minify<element> wrapper.
// Returns the stream so that insertions can be chained.
inline std::ostream& operator<<(std::ostream& out, const element &value) {
return out << minify<element>(value);
}
inline std::ostream& operator<<(std::ostream& out, element_type type) {
switch (type) {
@ -416,143 +413,6 @@ inline std::ostream& operator<<(std::ostream& out, element_type type) {
} // namespace dom
/**
 * Writes the wrapped element to the stream by walking the tape iteratively.
 *
 * The nesting state (array vs. object) is tracked in a fixed-size stack of
 * MAX_DEPTH entries; containers nested deeper than that are delegated to the
 * dom::array / dom::object printers. The loop ends when the
 * depth returns to zero, i.e. when the value we started on has been closed.
 *
 * @param out the output stream.
 * @return the output stream.
 */
template<>
inline std::ostream& minifier<dom::element>::print(std::ostream& out) {
using tape_type=internal::tape_type;
size_t depth = 0;
constexpr size_t MAX_DEPTH = 16;
// is_object[d] tells whether the container open at depth d is an object
bool is_object[MAX_DEPTH];
is_object[0] = false;
bool after_value = false;
internal::tape_ref iter(value.tape);
do {
// print commas after each value
if (after_value) {
out << ",";
}
// If we are in an object, print the next key and :, and skip to the next value.
if (is_object[depth]) {
out << '"' << internal::escape_json_string(iter.get_string_view()) << "\":";
iter.json_index++;
}
switch (iter.tape_ref_type()) {
// Arrays
case tape_type::START_ARRAY: {
// If we're too deep, we need to recurse to go deeper.
depth++;
if (simdjson_unlikely(depth >= MAX_DEPTH)) {
out << minify<dom::array>(dom::array(iter));
iter.json_index = iter.matching_brace_index() - 1; // Jump to the ]
depth--;
break;
}
// Output start [
out << '[';
iter.json_index++;
// Handle empty [] (we don't want to come back around and print commas)
if (iter.tape_ref_type() == tape_type::END_ARRAY) {
out << ']';
depth--;
break;
}
is_object[depth] = false;
after_value = false;
// `continue` skips the post-value bookkeeping below: nothing has been printed yet
continue;
}
// Objects
case tape_type::START_OBJECT: {
// If we're too deep, we need to recurse to go deeper.
depth++;
if (simdjson_unlikely(depth >= MAX_DEPTH)) {
out << minify<dom::object>(dom::object(iter));
iter.json_index = iter.matching_brace_index() - 1; // Jump to the }
depth--;
break;
}
// Output start {
out << '{';
iter.json_index++;
// Handle empty {} (we don't want to come back around and print commas)
if (iter.tape_ref_type() == tape_type::END_OBJECT) {
out << '}';
depth--;
break;
}
is_object[depth] = true;
after_value = false;
// `continue` skips the post-value bookkeeping below: nothing has been printed yet
continue;
}
// Scalars
case tape_type::STRING:
out << '"' << internal::escape_json_string(iter.get_string_view()) << '"';
break;
case tape_type::INT64:
out << iter.next_tape_value<int64_t>();
iter.json_index++; // numbers take up 2 spots, so we need to increment extra
break;
case tape_type::UINT64:
out << iter.next_tape_value<uint64_t>();
iter.json_index++; // numbers take up 2 spots, so we need to increment extra
break;
case tape_type::DOUBLE:
// the double is formatted with the stream's current floating-point settings
out << iter.next_tape_value<double>();
iter.json_index++; // numbers take up 2 spots, so we need to increment extra
break;
case tape_type::TRUE_VALUE:
out << "true";
break;
case tape_type::FALSE_VALUE:
out << "false";
break;
case tape_type::NULL_VALUE:
out << "null";
break;
// These are impossible
case tape_type::END_ARRAY:
case tape_type::END_OBJECT:
case tape_type::ROOT:
out << "unexpected content!!!"; // abort() usage is forbidden in the library
}
// Move past the value we just handled
iter.json_index++;
after_value = true;
// Handle multiple ends in a row
while (depth != 0 && (iter.tape_ref_type() == tape_type::END_ARRAY || iter.tape_ref_type() == tape_type::END_OBJECT)) {
// the tape type is streamed as a char, i.e. the closing bracket itself
out << char(iter.tape_ref_type());
depth--;
iter.json_index++;
}
// Stop when we're at depth 0
} while (depth != 0);
return out;
}
#if SIMDJSON_EXCEPTIONS
// Only available when exceptions are enabled: printing a result that carries
// an error throws simdjson_error; otherwise the contained element is printed.
template<>
simdjson_really_inline std::ostream& minifier<simdjson_result<dom::element>>::print(std::ostream& out) {
if (value.error()) { throw simdjson_error(value.error()); }
return out << minify<dom::element>(value.first);
}
// Streams a simdjson_result<dom::element>; may throw (see print above).
simdjson_really_inline std::ostream& operator<<(std::ostream& out, const simdjson_result<dom::element> &value) noexcept(false) {
return out << minify<simdjson_result<dom::element>>(value);
}
#endif
} // namespace simdjson
#endif // SIMDJSON_INLINE_ELEMENT_H

View File

@ -4,12 +4,14 @@
#include "simdjson/common_defs.h"
#include "simdjson/error.h"
#include "simdjson/internal/tape_ref.h"
#include "simdjson/minify.h"
#include <ostream>
namespace simdjson {
namespace internal {
template<typename T>
class string_builder;
}
namespace dom {
class array;
class document;
class object;
@ -473,29 +475,10 @@ private:
friend class array;
friend struct simdjson_result<element>;
template<typename T>
friend class simdjson::minifier;
friend class simdjson::internal::string_builder;
};
/**
* Print JSON to an output stream.
*
* By default, the value will be printed minified.
*
* @param out The output stream.
* @param value The value to print.
* @throw if there is an error with the underlying output stream. simdjson itself will not throw.
*/
inline std::ostream& operator<<(std::ostream& out, const element &value);
/**
* Print element type to an output stream.
*
* @param out The output stream.
* @param value The value to print.
* @throw if there is an error with the underlying output stream. simdjson itself will not throw.
*/
inline std::ostream& operator<<(std::ostream& out, element_type type);
} // namespace dom
/** The result of a JSON navigation that may fail. */
@ -557,20 +540,6 @@ public:
#endif // SIMDJSON_EXCEPTIONS
};
#if SIMDJSON_EXCEPTIONS
/**
* Print JSON to an output stream.
*
* By default, the value will be printed minified.
*
* @param out The output stream.
* @param value The value to print.
* @throw simdjson_error if the result being printed has an error. If there is an error with the
* underlying output stream, that error will be propagated (simdjson_error will not be
* thrown).
*/
simdjson_really_inline std::ostream& operator<<(std::ostream& out, const simdjson_result<dom::element> &value) noexcept(false);
#endif
} // namespace simdjson

View File

@ -236,47 +236,8 @@ inline bool object::iterator::key_equals_case_insensitive(std::string_view o) co
// Builds a pair from a key (stored as a string_view, so no copy of the
// characters is made) and its associated element.
inline key_value_pair::key_value_pair(std::string_view _key, element _value) noexcept :
key(_key), value(_value) {}
// Streams the object to `out` by delegating to the minify<object> wrapper.
// Returns the stream so that insertions can be chained.
inline std::ostream& operator<<(std::ostream& out, const object &value) {
return out << minify<object>(value);
}
// Streams the key/value pair to `out` by delegating to the
// minify<key_value_pair> wrapper. Returns the stream.
inline std::ostream& operator<<(std::ostream& out, const key_value_pair &value) {
return out << minify<key_value_pair>(value);
}
} // namespace dom
/**
 * Writes the wrapped object to the stream as '{', the key/value pairs
 * separated by "," (each printed through minify<dom::key_value_pair>),
 * then '}'. Returns the stream.
 */
template<>
inline std::ostream& minifier<dom::object>::print(std::ostream& out) {
  out << '{';
  auto field = value.begin();
  auto stop = value.end();
  bool needs_comma = false;
  while (field != stop) {
    // every pair but the first is preceded by a separator
    if (needs_comma) { out << ","; }
    out << minify<dom::key_value_pair>(*field);
    needs_comma = true;
    ++field;
  }
  return out << '}';
}
// Prints a pair as "key":value -- the key goes through
// internal::escape_json_string and the value is streamed as an element.
template<>
inline std::ostream& minifier<dom::key_value_pair>::print(std::ostream& out) {
return out << '"' << internal::escape_json_string(value.key) << "\":" << value.value;
}
#if SIMDJSON_EXCEPTIONS
// Only available when exceptions are enabled: printing a result that carries
// an error throws simdjson_error; otherwise the contained object is printed.
template<>
inline std::ostream& minifier<simdjson_result<dom::object>>::print(std::ostream& out) {
if (value.error()) { throw simdjson_error(value.error()); }
return out << minify<dom::object>(value.first);
}
// Streams a simdjson_result<dom::object>; may throw (see print above).
inline std::ostream& operator<<(std::ostream& out, const simdjson_result<dom::object> &value) noexcept(false) {
return out << minify<simdjson_result<dom::object>>(value);
}
#endif // SIMDJSON_EXCEPTIONS
} // namespace simdjson
#if defined(__cpp_lib_ranges)

View File

@ -4,10 +4,12 @@
#include "simdjson/common_defs.h"
#include "simdjson/error.h"
#include "simdjson/internal/tape_ref.h"
#include "simdjson/minify.h"
#include <ostream>
namespace simdjson {
namespace internal {
template<typename T>
class string_builder;
}
namespace dom {
class document;
@ -211,7 +213,7 @@ private:
friend class element;
friend struct simdjson_result<element>;
template<typename T>
friend class simdjson::minifier;
friend class simdjson::internal::string_builder;
};
/**
@ -229,27 +231,6 @@ private:
friend class object;
};
/**
* Print JSON to an output stream.
*
* By default, the value will be printed minified.
*
* @param out The output stream.
* @param value The value to print.
* @throw if there is an error with the underlying output stream. simdjson itself will not throw.
*/
inline std::ostream& operator<<(std::ostream& out, const object &value);
/**
* Print JSON to an output stream.
*
* By default, the value will be printed minified.
*
* @param out The output stream.
* @param value The value to print.
* @throw if there is an error with the underlying output stream. simdjson itself will not throw.
*/
inline std::ostream& operator<<(std::ostream& out, const key_value_pair &value);
} // namespace dom
/** The result of a JSON conversion that may fail. */
@ -273,21 +254,6 @@ public:
#endif // SIMDJSON_EXCEPTIONS
};
#if SIMDJSON_EXCEPTIONS
/**
* Print JSON to an output stream.
*
* By default, the value will be printed minified.
*
* @param out The output stream.
* @param value The value to print.
* @throw simdjson_error if the result being printed has an error. If there is an error with the
* underlying output stream, that error will be propagated (simdjson_error will not be
* thrown).
*/
inline std::ostream& operator<<(std::ostream& out, const simdjson_result<dom::object> &value) noexcept(false);
#endif // SIMDJSON_EXCEPTIONS
} // namespace simdjson
#if defined(__cpp_lib_ranges)

View File

@ -25,11 +25,7 @@ simdjson_really_inline parser &parser::operator=(parser &&other) noexcept = defa
// Returns the parser's `valid` flag.
inline bool parser::is_valid() const noexcept { return valid; }
// Returns the stored `error` member, converted to int.
inline int parser::get_error_code() const noexcept { return error; }
// Returns the text that error_message() produces for the stored `error`.
inline std::string parser::get_error_message() const noexcept { return error_message(error); }
/**
 * Streams the root of the parser's document to `os`.
 *
 * @return false (nothing is written) when the parser is not in a valid
 *         state, true otherwise.
 */
inline bool parser::print_json(std::ostream &os) const noexcept {
  if (valid) {
    os << doc.root();
    return true;
  }
  return false;
}
/**
 * Forwards to the document's dump_raw_tape when the parser is valid.
 *
 * @return false when the parser is not in a valid state, otherwise whatever
 *         the document's dump_raw_tape returns.
 */
inline bool parser::dump_raw_tape(std::ostream &os) const noexcept {
  if (!valid) { return false; }
  return doc.dump_raw_tape(os);
}

View File

@ -6,7 +6,6 @@
#include "simdjson/error.h"
#include "simdjson/internal/dom_parser_implementation.h"
#include "simdjson/internal/tape_ref.h"
#include "simdjson/minify.h"
#include "simdjson/padded_string.h"
#include "simdjson/portability.h"
#include <memory>

View File

@ -0,0 +1,421 @@
#ifndef SIMDJSON_SERIALIZATION_INL_H
#define SIMDJSON_SERIALIZATION_INL_H
#include "simdjson/dom/serialization.h"
#include <cinttypes>
#include <type_traits>
namespace simdjson {
namespace dom {
inline bool parser::print_json(std::ostream &os) const noexcept {
if (!valid) { return false; }
simdjson::internal::string_builder<> sb;
sb.append(doc.root());
std::string_view answer = sb.str();
os << answer;
return true;
}
}
/***
* Number utility functions
**/
namespace {
/**@private
* Escape sequence like \b or \u0001
* `length` is the number of characters of `string` that make up the sequence
* (the serializer copies exactly `length` characters from `string`).
* We expect that most compilers will use 8 bytes for this data structure.
**/
struct escape_sequence {
uint8_t length;
// The longest sequence (\uXXXX) has 6 characters; 7 bytes leave room for the
// terminating null of the initializing literal and bring the struct to 8 bytes.
const char string[7]; // technically, we only ever need 6 characters, we pad to 8
};
/**@private
 * This converts a signed integer into a character sequence.
 * The caller is responsible for providing enough memory (at least
 * 20 characters: a sign and up to 19 digits). No null terminator is written.
 * Though various runtime libraries provide itoa functions,
 * it is not part of the C++ standard. The C++17 standard
 * adds the to_chars functions which would do as well, but
 * we want to support C++11.
 *
 * @param output where the characters are written.
 * @param value the integer to convert; every int64_t value is supported,
 *        including the smallest one.
 * @return a pointer just past the last character written.
 */
char *fast_itoa(char *output, int64_t value) noexcept {
  // We work on the magnitude as an unsigned integer: negating the smallest
  // int64_t as a signed value overflows (undefined behavior), whereas the
  // unsigned computation is well defined and yields 2^63 as expected.
  uint64_t magnitude = static_cast<uint64_t>(value);
  if (value < 0) {
    *output++ = '-';
    magnitude = 0 - magnitude;
  }
  // We first write the digits in reverse order (least significant first)...
  char *write_pointer = output;
  do {
    *write_pointer++ = char('0' + (magnitude % 10));
    magnitude /= 10;
  } while (magnitude != 0);
  char *const answer = write_pointer;
  // ...and then we reverse them in place (the sign, if any, stays put).
  for (char *left = output, *right = write_pointer - 1; left < right; ++left, --right) {
    const char swapped = *left;
    *left = *right;
    *right = swapped;
  }
  return answer;
}
/**@private
 * This converts an unsigned integer into a character sequence.
 * The caller is responsible for providing enough memory (at least
 * 20 characters: the largest uint64_t value, 18446744073709551615,
 * has 20 digits). No null terminator is written.
 * Though various runtime libraries provide itoa functions,
 * it is not part of the C++ standard. The C++17 standard
 * adds the to_chars functions which would do as well, but
 * we want to support C++11.
 *
 * @param output where the characters are written.
 * @param value the integer to convert.
 * @return a pointer just past the last character written.
 */
char *fast_itoa(char *output, uint64_t value) noexcept {
  // We first write the digits in reverse order (least significant first)...
  char *write_pointer = output;
  do {
    *write_pointer++ = char('0' + (value % 10));
    value /= 10;
  } while (value != 0);
  char *const answer = write_pointer;
  // ...and then we reverse them in place.
  for (char *left = output, *right = write_pointer - 1; left < right; ++left, --right) {
    const char swapped = *left;
    *left = *right;
    *right = swapped;
  }
  return answer;
}
} // anonymous namespace
namespace internal {
/***
* Minifier/formatter code.
**/
/** Appends the decimal representation of an unsigned integer to the buffer. */
simdjson_really_inline void mini_formatter::number(uint64_t x) {
  char digits[24];
  // fast_itoa returns the end of the text it wrote into the scratch array
  char *const digits_end = fast_itoa(digits, x);
  buffer.insert(buffer.end(), digits, digits_end);
}
/** Appends the decimal representation of a signed integer to the buffer. */
simdjson_really_inline void mini_formatter::number(int64_t x) {
  char digits[24];
  // fast_itoa returns the end of the text it wrote into the scratch array
  char *const digits_end = fast_itoa(digits, x);
  buffer.insert(buffer.end(), digits, digits_end);
}
/**
 * Appends the textual representation of a double, as produced by
 * internal::to_chars, to the buffer.
 */
simdjson_really_inline void mini_formatter::number(double x) {
  // NOTE(review): 24 bytes presumably cover the longest text to_chars can
  // produce (e.g. -2.2250738585072014e-308 is 24 characters) -- confirm
  // against the implementation in src/to_chars.
  char number_buffer[24];
  // We pass the true end of the scratch array rather than nullptr: the
  // current implementation is documented as ignoring its second argument,
  // but handing it a valid [first, last) range keeps this call correct
  // should the implementation ever start checking it.
  char *newp = internal::to_chars(number_buffer, number_buffer + sizeof(number_buffer), x);
  buffer.insert(buffer.end(), number_buffer, newp);
}
// Structural characters: each of these appends exactly one character to the
// output through one_char(), with no surrounding whitespace.
simdjson_really_inline void mini_formatter::start_array() { one_char('['); }
simdjson_really_inline void mini_formatter::end_array() { one_char(']'); }
simdjson_really_inline void mini_formatter::start_object() { one_char('{'); }
simdjson_really_inline void mini_formatter::end_object() { one_char('}'); }
simdjson_really_inline void mini_formatter::comma() { one_char(','); }
/** Appends the four characters of the literal true to the buffer. */
simdjson_really_inline void mini_formatter::true_atom() {
  const char *const text = "true";
  buffer.insert(buffer.end(), text, text + 4);
}
/** Appends the five characters of the literal false to the buffer. */
simdjson_really_inline void mini_formatter::false_atom() {
  const char *const text = "false";
  buffer.insert(buffer.end(), text, text + 5);
}
/** Appends the four characters of the literal null to the buffer. */
simdjson_really_inline void mini_formatter::null_atom() {
  const char *const text = "null";
  buffer.insert(buffer.end(), text, text + 4);
}
// Appends a single character to the output buffer.
simdjson_really_inline void mini_formatter::one_char(char c) { buffer.push_back(c); }
// Appends an object key: the quoted and escaped string (see string()),
// immediately followed by ':'.
simdjson_really_inline void mini_formatter::key(std::string_view unescaped) {
string(unescaped);
one_char(':');
}
/**
 * Appends a JSON string to the buffer: an opening quote, the content with
 * the double quote, the backslash and the characters up to 0x1F escaped, and a
 * closing quote. All other bytes are copied unchanged.
 *
 * The work is done in two phases: a scan that finds the longest prefix that
 * needs no escaping (copied in one go), then a character-by-character loop
 * over the remainder.
 */
simdjson_really_inline void mini_formatter::string(std::string_view unescaped) {
one_char('\"');
size_t i = 0;
// Fast path for the case where we have no control character, no ", and no backslash.
// This should include most keys.
// The table is indexed by the byte value: entries 0x00-0x1F, '"' (0x22) and '\\' (0x5C) are set.
constexpr static bool needs_escaping[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// First we skip over whole groups of 8 characters that need no escaping...
for(;i + 8 <= unescaped.length(); i += 8) {
// Poor man's vectorization. This could get much faster if we used SIMD.
if(needs_escaping[uint8_t(unescaped[i])] | needs_escaping[uint8_t(unescaped[i+1])]
| needs_escaping[uint8_t(unescaped[i+2])] | needs_escaping[uint8_t(unescaped[i+3])]
| needs_escaping[uint8_t(unescaped[i+4])] | needs_escaping[uint8_t(unescaped[i+5])]
| needs_escaping[uint8_t(unescaped[i+6])] | needs_escaping[uint8_t(unescaped[i+7])]
) { break; }
}
// ...then we advance one character at a time up to the first character that
// needs escaping (or the end of the string).
for(;i < unescaped.length(); i++) {
if(needs_escaping[uint8_t(unescaped[i])]) { break; }
}
// The following is also possible and omits a 256-byte table, but it is slower:
// for (; (i < unescaped.length()) && (uint8_t(unescaped[i]) > 0x1F)
// && (unescaped[i] != '\"') && (unescaped[i] != '\\'); i++) {}
// At least for long strings, the following should be fast. We could
// do better by integrating the checks and the insertion.
buffer.insert(buffer.end(), unescaped.data(), unescaped.data() + i);
// We caught a character that requires escaping if we enter this loop (slow).
// Note that we do not restart from the beginning, but rather we continue
// from the point where we encountered something that requires escaping.
for (; i < unescaped.length(); i++) {
switch (unescaped[i]) {
case '\"':
{
const char * s = "\\\"";
buffer.insert(buffer.end(), s, s + 2);
}
break;
case '\\':
{
const char * s = "\\\\";
buffer.insert(buffer.end(), s, s + 2);
}
break;
default:
if (uint8_t(unescaped[i]) <= 0x1F) {
// If packed, this uses 8 * 32 bytes.
// Note that we expect most compilers to embed this code in the data
// section.
// The table is indexed by the character value (0x00 to 0x1F).
constexpr static escape_sequence escaped[32] = {
{6, "\\u0000"}, {6, "\\u0001"}, {6, "\\u0002"}, {6, "\\u0003"},
{6, "\\u0004"}, {6, "\\u0005"}, {6, "\\u0006"}, {6, "\\u0007"},
{2, "\\b"}, {2, "\\t"}, {2, "\\n"}, {6, "\\u000b"},
{2, "\\f"}, {2, "\\r"}, {6, "\\u000e"}, {6, "\\u000f"},
{6, "\\u0010"}, {6, "\\u0011"}, {6, "\\u0012"}, {6, "\\u0013"},
{6, "\\u0014"}, {6, "\\u0015"}, {6, "\\u0016"}, {6, "\\u0017"},
{6, "\\u0018"}, {6, "\\u0019"}, {6, "\\u001a"}, {6, "\\u001b"},
{6, "\\u001c"}, {6, "\\u001d"}, {6, "\\u001e"}, {6, "\\u001f"}};
auto u = escaped[uint8_t(unescaped[i])];
buffer.insert(buffer.end(), u.string, u.string + u.length);
} else {
// any other character is copied as is
one_char(unescaped[i]);
}
} // switch
} // for
one_char('\"');
}
// Empties the output buffer so that the formatter can be reused.
inline void mini_formatter::clear() {
buffer.clear();
}
// Returns a view over the characters accumulated so far. The view points
// into the formatter's own buffer: it does not own the characters.
simdjson_really_inline std::string_view mini_formatter::str() const {
return std::string_view(buffer.data(), buffer.size());
}
/***
* String building code.
**/
/**
 * Appends an element to the builder by walking the tape iteratively and
 * forwarding every token to the formatter (`format`).
 *
 * The nesting state (array vs. object) is tracked in a fixed-size stack of
 * MAX_DEPTH entries; containers nested deeper than that are delegated to the
 * append(array) / append(object) overloads. The loop ends
 * when the depth returns to zero, i.e. when the value we started on has been
 * closed.
 */
template <class serializer>
inline void string_builder<serializer>::append(simdjson::dom::element value) {
// using tape_type = simdjson::internal::tape_type;
size_t depth = 0;
constexpr size_t MAX_DEPTH = 16;
// is_object[d] tells whether the container open at depth d is an object
bool is_object[MAX_DEPTH];
is_object[0] = false;
bool after_value = false;
internal::tape_ref iter(value.tape);
do {
// print commas after each value
if (after_value) {
format.comma();
}
// If we are in an object, print the next key and :, and skip to the next
// value.
if (is_object[depth]) {
format.key(iter.get_string_view());
iter.json_index++;
}
switch (iter.tape_ref_type()) {
// Arrays
case tape_type::START_ARRAY: {
// If we're too deep, we need to recurse to go deeper.
depth++;
if (simdjson_unlikely(depth >= MAX_DEPTH)) {
append(simdjson::dom::array(iter));
iter.json_index = iter.matching_brace_index() - 1; // Jump to the ]
depth--;
break;
}
// Output start [
format.start_array();
iter.json_index++;
// Handle empty [] (we don't want to come back around and print commas)
if (iter.tape_ref_type() == tape_type::END_ARRAY) {
format.end_array();
depth--;
break;
}
is_object[depth] = false;
after_value = false;
// `continue` skips the post-value bookkeeping below: no value was emitted yet
continue;
}
// Objects
case tape_type::START_OBJECT: {
// If we're too deep, we need to recurse to go deeper.
depth++;
if (simdjson_unlikely(depth >= MAX_DEPTH)) {
append(simdjson::dom::object(iter));
iter.json_index = iter.matching_brace_index() - 1; // Jump to the }
depth--;
break;
}
// Output start {
format.start_object();
iter.json_index++;
// Handle empty {} (we don't want to come back around and print commas)
if (iter.tape_ref_type() == tape_type::END_OBJECT) {
format.end_object();
depth--;
break;
}
is_object[depth] = true;
after_value = false;
// `continue` skips the post-value bookkeeping below: no value was emitted yet
continue;
}
// Scalars
case tape_type::STRING:
format.string(iter.get_string_view());
break;
case tape_type::INT64:
format.number(iter.next_tape_value<int64_t>());
iter.json_index++; // numbers take up 2 spots, so we need to increment
// extra
break;
case tape_type::UINT64:
format.number(iter.next_tape_value<uint64_t>());
iter.json_index++; // numbers take up 2 spots, so we need to increment
// extra
break;
case tape_type::DOUBLE:
format.number(iter.next_tape_value<double>());
iter.json_index++; // numbers take up 2 spots, so we need to increment
// extra
break;
case tape_type::TRUE_VALUE:
format.true_atom();
break;
case tape_type::FALSE_VALUE:
format.false_atom();
break;
case tape_type::NULL_VALUE:
format.null_atom();
break;
// These are impossible
case tape_type::END_ARRAY:
case tape_type::END_OBJECT:
case tape_type::ROOT:
SIMDJSON_UNREACHABLE();
}
// Move past the value we just handled
iter.json_index++;
after_value = true;
// Handle multiple ends in a row
while (depth != 0 && (iter.tape_ref_type() == tape_type::END_ARRAY ||
iter.tape_ref_type() == tape_type::END_OBJECT)) {
if (iter.tape_ref_type() == tape_type::END_ARRAY) {
format.end_array();
} else {
format.end_object();
}
depth--;
iter.json_index++;
}
// Stop when we're at depth 0
} while (depth != 0);
}
/**
 * Appends an object to the builder: start-of-object, the key/value pairs
 * separated by commas (each appended through the key_value_pair overload),
 * then end-of-object.
 */
template <class serializer>
inline void string_builder<serializer>::append(simdjson::dom::object value) {
  format.start_object();
  auto field = value.begin();
  auto stop = value.end();
  bool needs_comma = false;
  while (field != stop) {
    // every pair but the first is preceded by a separator
    if (needs_comma) { format.comma(); }
    append(*field);
    needs_comma = true;
    ++field;
  }
  format.end_object();
}
/**
 * Appends an array to the builder: start-of-array, the elements separated
 * by commas (each appended through the element overload), then end-of-array.
 */
template <class serializer>
inline void string_builder<serializer>::append(simdjson::dom::array value) {
  format.start_array();
  auto current = value.begin();
  auto stop = value.end();
  bool needs_comma = false;
  while (current != stop) {
    // every element but the first is preceded by a separator
    if (needs_comma) { format.comma(); }
    append(*current);
    needs_comma = true;
    ++current;
  }
  format.end_array();
}
// Appends a key/value pair: the key is emitted through the formatter's key()
// and the value through the element overload of append().
template <class serializer>
simdjson_really_inline void string_builder<serializer>::append(simdjson::dom::key_value_pair kv) {
format.key(kv.key);
append(kv.value);
}
template <class serializer>
simdjson_really_inline void string_builder<serializer>::clear() {
format.clear();
}
/** Returns the formatter's view of the text produced so far. */
template <class serializer>
simdjson_really_inline std::string_view string_builder<serializer>::str() const {
return format.str();
}
} // namespace internal
} // namespace simdjson
#endif

View File

@ -0,0 +1,219 @@
#ifndef SIMDJSON_SERIALIZATION_H
#define SIMDJSON_SERIALIZATION_H
#include "simdjson/common_defs.h"
#include "simdjson/dom/document.h"
#include "simdjson/error.h"
#include "simdjson/internal/dom_parser_implementation.h"
#include "simdjson/internal/tape_ref.h"
#include "simdjson/padded_string.h"
#include "simdjson/portability.h"
#include <vector>
namespace simdjson {
/**
* The string_builder template and mini_formatter class
* are not part of our public API and are subject to change
* at any time!
*/
namespace internal {
class mini_formatter;
/**
* @private The string_builder template allows us to construct
* a string from a document element. It is parametrized
* by a "formatter" which handles the details. Thus
* the string_builder template could support both minification
* and prettification, and various other tradeoffs.
*/
template <class formatter = mini_formatter>
class string_builder {
public:
/** Construct an initially empty builder, would print the empty string **/
string_builder() = default;
/** Append an element to the builder (to be printed) **/
inline void append(simdjson::dom::element value);
/** Append an array to the builder (to be printed) **/
inline void append(simdjson::dom::array value);
/** Append an object to the builder (to be printed) **/
inline void append(simdjson::dom::object value);
/** Reset the builder (so that it would print the empty string) **/
simdjson_really_inline void clear();
/**
* Get access to the string. The returned string_view refers to memory
* owned by the builder and it is invalid to use it after the
* string_builder has been destroyed.
* However you can copy the content of the string_view to memory that you
* own.
* NOTE(review): presumably the view is also invalidated by later
* append()/clear() calls -- confirm against the formatter in use.
*/
simdjson_really_inline std::string_view str() const;
/** Append a key_value_pair to the builder (to be printed) **/
simdjson_really_inline void append(simdjson::dom::key_value_pair value);
private:
/** The formatter instance that every append()/clear()/str() call is forwarded to **/
formatter format{};
};
/**
* @private This is the class that we expect to use with the string_builder
* template. It tries to produce a compact version of the JSON element
* as quickly as possible.
*/
class mini_formatter {
public:
mini_formatter() = default;
/** Add a comma **/
simdjson_really_inline void comma();
/** Start an array, prints [ **/
simdjson_really_inline void start_array();
/** End an array, prints ] **/
simdjson_really_inline void end_array();
/** Start an object, prints { **/
simdjson_really_inline void start_object();
/** End an object, prints } **/
simdjson_really_inline void end_object();
/** Prints a true **/
simdjson_really_inline void true_atom();
/** Prints a false **/
simdjson_really_inline void false_atom();
/** Prints a null **/
simdjson_really_inline void null_atom();
/** Prints a signed 64-bit integer **/
simdjson_really_inline void number(int64_t x);
/** Prints an unsigned 64-bit integer **/
simdjson_really_inline void number(uint64_t x);
/** Prints a floating-point number **/
simdjson_really_inline void number(double x);
/** Prints a key (string + colon) **/
simdjson_really_inline void key(std::string_view unescaped);
/** Prints a string. The string is escaped as needed. **/
simdjson_really_inline void string(std::string_view unescaped);
/** Clears out the content. **/
simdjson_really_inline void clear();
/**
* Get access to the buffer, it is owned by the instance, but
* the user can make a copy.
**/
simdjson_really_inline std::string_view str() const;
private:
// implementation details (subject to change)
/** Prints one character **/
simdjson_really_inline void one_char(char c);
/** Backing buffer **/
std::vector<char> buffer{}; // not ideal!
};
} // internal
namespace dom {
/**
* Print JSON to an output stream.
*
* @param out The output stream.
* @param value The element.
* @throw if there is an error with the underlying output stream. simdjson itself will not throw.
*/
inline std::ostream& operator<<(std::ostream& out, simdjson::dom::element value) {
simdjson::internal::string_builder<> sb;
sb.append(value);
return (out << sb.str());
}
#if SIMDJSON_EXCEPTIONS
inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result<simdjson::dom::element> x) {
if (x.error()) { throw simdjson::simdjson_error(x.error()); }
return (out << x.value());
}
#endif
/**
* Print JSON to an output stream.
*
* @param out The output stream.
* @param value The array.
* @throw if there is an error with the underlying output stream. simdjson itself will not throw.
*/
inline std::ostream& operator<<(std::ostream& out, simdjson::dom::array value) {
simdjson::internal::string_builder<> sb;
sb.append(value);
return (out << sb.str());
}
#if SIMDJSON_EXCEPTIONS
inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result<simdjson::dom::array> x) {
if (x.error()) { throw simdjson::simdjson_error(x.error()); }
return (out << x.value());
}
#endif
/**
* Print JSON to an output stream.
*
* @param out The output stream.
* @param value The object.
* @throw if there is an error with the underlying output stream. simdjson itself will not throw.
*/
inline std::ostream& operator<<(std::ostream& out, simdjson::dom::object value) {
simdjson::internal::string_builder<> sb;
sb.append(value);
return (out << sb.str());
}
#if SIMDJSON_EXCEPTIONS
inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result<simdjson::dom::object> x) {
if (x.error()) { throw simdjson::simdjson_error(x.error()); }
return (out << x.value());
}
#endif
} // namespace dom
/**
* Converts JSON to a string.
*
* dom::parser parser;
* element doc = parser.parse(" [ 1 , 2 , 3 ] "_padded);
* cout << to_string(doc) << endl; // prints [1,2,3]
*
*/
template <class T>
std::string to_string(T x) {
// in C++, to_string is standard: http://www.cplusplus.com/reference/string/to_string/
// Currently minify and to_string are identical but in the future, they may
// differ.
simdjson::internal::string_builder<> sb;
sb.append(x);
std::string_view answer = sb.str();
return std::string(answer.data(), answer.size());
}
#if SIMDJSON_EXCEPTIONS
/**
 * Converts the JSON value held by a result to a string.
 *
 * @param x The result; its value is serialized when it holds no error.
 * @throw simdjson_error if x holds an error.
 */
template <class T>
std::string to_string(simdjson_result<T> x) {
  if (x.error()) {
    throw simdjson_error(x.error());
  }
  return to_string(x.value());
}
#endif
/**
* Minifies a JSON element or document, printing the smallest possible valid JSON.
*
* dom::parser parser;
* element doc = parser.parse(" [ 1 , 2 , 3 ] "_padded);
* cout << minify(doc) << endl; // prints [1,2,3]
*
*/
template <class T>
std::string minify(T x) {
// Plain forwarder: the minified text is whatever to_string() produces for x.
return to_string(x);
}
#if SIMDJSON_EXCEPTIONS
/**
 * Minifies the JSON value held by a result.
 *
 * @param x The result; its value is serialized when it holds no error.
 * @throw simdjson_error if x holds an error.
 */
template <class T>
std::string minify(simdjson_result<T> x) {
  if (x.error()) {
    throw simdjson_error(x.error());
  }
  // NOTE(review): this forwards to to_string() directly rather than to
  // minify(); the two are equivalent as long as minify(T) itself just calls
  // to_string().
  return to_string(x.value());
}
#endif
} // namespace simdjson
#endif

View File

@ -27,50 +27,6 @@ namespace simdjson {
*/
SIMDJSON_WARN_UNUSED error_code minify(const char *buf, size_t len, char *dst, size_t &dst_len) noexcept;
/**
* Minifies a JSON element or document, printing the smallest possible valid JSON.
*
* dom::parser parser;
* element doc = parser.parse(" [ 1 , 2 , 3 ] "_padded);
* cout << minify(doc) << endl; // prints [1,2,3]
*
*/
template<typename T>
class minifier {
public:
/**
* Create a new minifier.
*
* Only a reference to _value is stored (no copy is made), so _value must
* outlive this minifier.
*
* @param _value The document or element to minify.
*/
inline minifier(const T &_value) noexcept : value{_value} {}
/**
* Minify JSON to a string.
*
* Streams *this into a temporary std::stringstream and returns its content.
*/
inline operator std::string() const noexcept { std::stringstream s; s << *this; return s.str(); }
/**
* Minify JSON to an output stream.
*
* @param out The output stream.
* @return The stream that was passed in (declared here, defined elsewhere).
*/
inline std::ostream& print(std::ostream& out);
private:
// Borrowed reference to the value being minified; not owned.
const T &value;
};
// Convenience factory: wraps value in a minifier<T> so that T is deduced
// from the argument.
template<typename T>
inline minifier<T> minify(const T &value) noexcept { return minifier<T>(value); }
/**
* Minify JSON to an output stream.
*
* @param out The output stream.
* @param formatter The minifier.
* @throw if there is an error with the underlying output stream. simdjson itself will not throw.
*/
// The minifier is taken by value and its print() does the actual work.
template<typename T>
inline std::ostream& operator<<(std::ostream& out, minifier<T> formatter) { return formatter.print(out); }
} // namespace simdjson
#endif // SIMDJSON_MINIFY_H

View File

@ -2,7 +2,7 @@
SIMDJSON_PUSH_DISABLE_WARNINGS
SIMDJSON_DISABLE_UNDESIRED_WARNINGS
#include "to_chars.cpp"
#include "error.cpp"
#include "implementation.cpp"

946
src/to_chars.cpp Normal file
View File

@ -0,0 +1,946 @@
#include <array>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
#include <type_traits>
namespace simdjson {
namespace internal {
/*!
implements the Grisu2 algorithm for binary to decimal floating-point
conversion.
Adapted from JSON for Modern C++.

This implementation is a slightly modified version of the reference
implementation which may be obtained from
http://florian.loitsch.com/publications (bench.tar.gz).
The code is distributed under the MIT license, Copyright (c) 2009 Florian
Loitsch.

For a detailed description of the algorithm see:
[1] Loitsch, "Printing Floating-Point Numbers Quickly and Accurately with
    Integers", Proceedings of the ACM SIGPLAN 2010 Conference on Programming
    Language Design and Implementation, PLDI 2010
[2] Burger, Dybvig, "Printing Floating-Point Numbers Quickly and
    Accurately", Proceedings of the ACM SIGPLAN 1996 Conference on
    Programming Language Design and Implementation, PLDI 1996
*/
namespace dtoa_impl {
/*!
@brief copies the object representation of source into a value of type Target
@tparam Target destination type; must have the same size as Source
@tparam Source type of the value whose bytes are reinterpreted
@return a Target holding exactly the bytes of source
*/
template <typename Target, typename Source>
Target reinterpret_bits(const Source source) {
  static_assert(sizeof(Target) == sizeof(Source), "size mismatch");
  // memcpy is the portable way to reinterpret the bytes of an object.
  Target result;
  std::memcpy(&result, &source, sizeof(Target));
  return result;
}
struct diyfp // f * 2^e
{
  static constexpr int kPrecision = 64; // = q, width of the significand

  std::uint64_t f = 0; // significand
  int e = 0;           // binary exponent

  constexpr diyfp(std::uint64_t f_, int e_) noexcept : f(f_), e(e_) {}

  /*!
  @brief returns x - y
  @pre x.e == y.e and x.f >= y.f
  */
  static diyfp sub(const diyfp &x, const diyfp &y) noexcept {
    const std::uint64_t difference = x.f - y.f;
    return diyfp(difference, x.e);
  }

  /*!
  @brief returns x * y
  @note The result is rounded. (Only the upper q bits are returned.)
  */
  static diyfp mul(const diyfp &x, const diyfp &y) noexcept {
    static_assert(kPrecision == 64, "internal error");
    // Computes:
    //  f = round((x.f * y.f) / 2^q)
    //  e = x.e + y.e + q
    //
    // The 64-bit * 64-bit product is emulated with 32-bit halves:
    //
    //  p = x * y
    //    = (x_lo + 2^32 x_hi) (y_lo + 2^32 y_hi)
    //    = (x_lo y_lo) + 2^32 ((x_lo y_hi) + (x_hi y_lo)) + 2^64 (x_hi y_hi)
    //    = lo_lo + 2^32 (lo_hi + hi_lo) + 2^64 hi_hi
    //
    // Splitting every partial product into 32-bit halves again and
    // collecting the terms of weight 2^32 into `middle` gives
    //
    //  p = low32(lo_lo) + 2^32 middle
    //      + 2^64 (high32(lo_hi) + high32(hi_lo) + hi_hi)
    //
    // where `middle` may exceed 2^32 - 1, so its own upper half is carried
    // into the 2^64 term.
    constexpr std::uint64_t kLowMask = 0xFFFFFFFFu;
    const std::uint64_t x_lo = x.f & kLowMask;
    const std::uint64_t x_hi = x.f >> 32u;
    const std::uint64_t y_lo = y.f & kLowMask;
    const std::uint64_t y_hi = y.f >> 32u;

    const std::uint64_t lo_lo = x_lo * y_lo;
    const std::uint64_t lo_hi = x_lo * y_hi;
    const std::uint64_t hi_lo = x_hi * y_lo;
    const std::uint64_t hi_hi = x_hi * y_hi;

    std::uint64_t middle =
        (lo_lo >> 32u) + (lo_hi & kLowMask) + (hi_lo & kLowMask);
    // The full lower 64 bits are not needed: only their highest bit decides
    // the rounding. Adding 2^31 to `middle` propagates that bit into the
    // carry (round, ties up).
    middle += std::uint64_t{1} << 31u;

    const std::uint64_t upper =
        hi_hi + (hi_lo >> 32u) + (lo_hi >> 32u) + (middle >> 32u);
    return diyfp(upper, x.e + y.e + 64);
  }

  /*!
  @brief normalize x such that the significand is >= 2^(q-1)
  @pre x.f != 0
  */
  static diyfp normalize(diyfp x) noexcept {
    constexpr std::uint64_t kTopBit = std::uint64_t{1} << 63u;
    // Shift left until the most significant bit is set, compensating each
    // shift with a decrement of the exponent.
    for (; (x.f & kTopBit) == 0; x.f <<= 1u) {
      --x.e;
    }
    return x;
  }

  /*!
  @brief normalize x such that the result has the exponent target_exponent
  @pre x.e >= target_exponent and the upper x.e - target_exponent bits of
       x.f must be zero.
  */
  static diyfp normalize_to(const diyfp &x,
                            const int target_exponent) noexcept {
    const int shift = x.e - target_exponent;
    return diyfp(x.f << shift, target_exponent);
  }
};
// A value together with the two boundaries of its rounding interval, as
// produced by compute_boundaries().
struct boundaries {
diyfp w;     // the (normalized) value itself
diyfp minus; // lower boundary m-
diyfp plus;  // upper boundary m+
};
/*!
Compute the (normalized) diyfp representing the input number 'value' and its
boundaries.
@tparam FloatType an IEEE-754 (IEC 559) floating-point type; enforced by a
static_assert
@param value the number to decompose
@return w is the normalized value; plus is the normalized upper boundary m+
and minus is the lower boundary m- rescaled to the exponent of plus
@pre value must be finite and positive
*/
template <typename FloatType> boundaries compute_boundaries(FloatType value) {
// Convert the IEEE representation into a diyfp.
//
// If v is denormal:
// value = 0.F * 2^(1 - bias) = ( F) * 2^(1 - bias - (p-1))
// If v is normalized:
// value = 1.F * 2^(E - bias) = (2^(p-1) + F) * 2^(E - bias - (p-1))
static_assert(std::numeric_limits<FloatType>::is_iec559,
"internal error: dtoa_short requires an IEEE-754 "
"floating-point implementation");
constexpr int kPrecision =
std::numeric_limits<FloatType>::digits; // = p (includes the hidden bit)
// Note: kBias already includes the (p-1) shift, so that exponents apply to
// the integer significand (see the formulas above).
constexpr int kBias =
std::numeric_limits<FloatType>::max_exponent - 1 + (kPrecision - 1);
constexpr int kMinExp = 1 - kBias;
constexpr std::uint64_t kHiddenBit = std::uint64_t{1}
<< (kPrecision - 1); // = 2^(p-1)
// Unsigned integer type with the same size as FloatType (32 bits when
// p == 24, 64 bits otherwise).
using bits_type = typename std::conditional<kPrecision == 24, std::uint32_t,
std::uint64_t>::type;
const std::uint64_t bits = reinterpret_bits<bits_type>(value);
// Everything above the fraction field ends up in E. This includes the sign
// bit, which is why value is required to be positive.
const std::uint64_t E = bits >> (kPrecision - 1);
const std::uint64_t F = bits & (kHiddenBit - 1);
const bool is_denormal = E == 0;
const diyfp v = is_denormal
? diyfp(F, kMinExp)
: diyfp(F + kHiddenBit, static_cast<int>(E) - kBias);
// Compute the boundaries m- and m+ of the floating-point value
// v = f * 2^e.
//
// Determine v- and v+, the floating-point predecessor and successor of v,
// respectively.
//
// v- = v - 2^e if f != 2^(p-1) or e == e_min (A)
// = v - 2^(e-1) if f == 2^(p-1) and e > e_min (B)
//
// v+ = v + 2^e
//
// Let m- = (v- + v) / 2 and m+ = (v + v+) / 2. All real numbers _strictly_
// between m- and m+ round to v, regardless of how the input rounding
// algorithm breaks ties.
//
// ---+-------------+-------------+-------------+-------------+--- (A)
// v- m- v m+ v+
//
// -----------------+------+------+-------------+-------------+--- (B)
// v- m- v m+ v+
const bool lower_boundary_is_closer = F == 0 && E > 1;
// The boundaries lie half an ulp away from v, so they are expressed with one
// extra bit of precision (two extra bits for m- in case (B)).
const diyfp m_plus = diyfp(2 * v.f + 1, v.e - 1);
const diyfp m_minus = lower_boundary_is_closer
? diyfp(4 * v.f - 1, v.e - 2) // (B)
: diyfp(2 * v.f - 1, v.e - 1); // (A)
// Determine the normalized w+ = m+.
const diyfp w_plus = diyfp::normalize(m_plus);
// Determine w- = m- such that e_(w-) = e_(w+).
const diyfp w_minus = diyfp::normalize_to(m_minus, w_plus.e);
return {diyfp::normalize(v), w_minus, w_plus};
}
// Given normalized diyfp w, Grisu needs to find a (normalized) cached
// power-of-ten c, such that the exponent of the product c * w = f * 2^e lies
// within a certain range [alpha, gamma] (Definition 3.2 from [1])
//
// alpha <= e = e_c + e_w + q <= gamma
//
// or
//
// f_c * f_w * 2^alpha <= f_c 2^(e_c) * f_w 2^(e_w) * 2^q
// <= f_c * f_w * 2^gamma
//
// Since c and w are normalized, i.e. 2^(q-1) <= f < 2^q, this implies
//
// 2^(q-1) * 2^(q-1) * 2^alpha <= c * w * 2^q < 2^q * 2^q * 2^gamma
//
// or
//
// 2^(q - 2 + alpha) <= c * w < 2^(q + gamma)
//
// The choice of (alpha,gamma) determines the size of the table and the form of
// the digit generation procedure. Using (alpha,gamma)=(-60,-32) works out well
// in practice:
//
// The idea is to cut the number c * w = f * 2^e into two parts, which can be
// processed independently: An integral part p1, and a fractional part p2:
//
// f * 2^e = ( (f div 2^-e) * 2^-e + (f mod 2^-e) ) * 2^e
// = (f div 2^-e) + (f mod 2^-e) * 2^e
// = p1 + p2 * 2^e
//
// The conversion of p1 into decimal form requires a series of divisions and
// modulos by (a power of) 10. These operations are faster for 32-bit than for
// 64-bit integers, so p1 should ideally fit into a 32-bit integer. This can be
// achieved by choosing
//
// -e >= 32 or e <= -32 := gamma
//
// In order to convert the fractional part
//
// p2 * 2^e = p2 / 2^-e = d[-1] / 10^1 + d[-2] / 10^2 + ...
//
// into decimal form, the fraction is repeatedly multiplied by 10 and the digits
// d[-i] are extracted in order:
//
// (10 * p2) div 2^-e = d[-1]
// (10 * p2) mod 2^-e = d[-2] / 10^1 + ...
//
// The multiplication by 10 must not overflow. It is sufficient to choose
//
// 10 * p2 < 16 * p2 = 2^4 * p2 <= 2^64.
//
// Since p2 = f mod 2^-e < 2^-e,
//
// -e <= 60 or e >= -60 := alpha
// Smallest binary exponent (alpha) allowed for the scaled product c * w.
constexpr int kAlpha = -60;
// Largest binary exponent (gamma) allowed for the scaled product c * w.
constexpr int kGamma = -32;
struct cached_power // c = f * 2^e ~= 10^k
{
std::uint64_t f; // significand of the approximation
int e;           // binary exponent of the approximation
int k;           // decimal exponent of the power of ten being approximated
};
/*!
For a normalized diyfp w = f * 2^e, this function returns a (normalized) cached
power-of-ten c = f_c * 2^e_c, such that the exponent of the product w * c
satisfies (Definition 3.2 from [1])
alpha <= e_c + e + q <= gamma.
@param e binary exponent of the normalized diyfp
@return the selected table entry (significand, binary exponent, decimal
exponent)
*/
inline cached_power get_cached_power_for_binary_exponent(int e) {
// Now
//
// alpha <= e_c + e + q <= gamma (1)
// ==> f_c * 2^alpha <= c * 2^e * 2^q
//
// and since the c's are normalized, 2^(q-1) <= f_c,
//
// ==> 2^(q - 1 + alpha) <= c * 2^(e + q)
// ==> 2^(alpha - e - 1) <= c
//
// If c were an exact power of ten, i.e. c = 10^k, one may determine k as
//
// k = ceil( log_10( 2^(alpha - e - 1) ) )
// = ceil( (alpha - e - 1) * log_10(2) )
//
// From the paper:
// "In theory the result of the procedure could be wrong since c is rounded,
// and the computation itself is approximated [...]. In practice, however,
// this simple function is sufficient."
//
// For IEEE double precision floating-point numbers converted into
// normalized diyfp's w = f * 2^e, with q = 64,
//
// e >= -1022 (min IEEE exponent)
// -52 (p - 1)
// -52 (p - 1, possibly normalize denormal IEEE numbers)
// -11 (normalize the diyfp)
// = -1137
//
// and
//
// e <= +1023 (max IEEE exponent)
// -52 (p - 1)
// -11 (normalize the diyfp)
// = 960
//
// This binary exponent range [-1137,960] results in a decimal exponent
// range [-307,324]. One does not need to store a cached power for each
// k in this range. For each such k it suffices to find a cached power
// such that the exponent of the product lies in [alpha,gamma].
// This implies that the difference of the decimal exponents of adjacent
// table entries must be less than or equal to
//
// floor( (gamma - alpha) * log_10(2) ) = 8.
//
// (A smaller distance gamma-alpha would require a larger table.)
// NB:
// Actually this function returns c, such that -60 <= e_c + e + 64 <= -34.
constexpr int kCachedPowersMinDecExp = -300;
constexpr int kCachedPowersDecStep = 8;
// Entries are {f, e, k}; consecutive entries are kCachedPowersDecStep decimal
// exponents apart, starting at kCachedPowersMinDecExp.
static constexpr std::array<cached_power, 79> kCachedPowers = {{
{0xAB70FE17C79AC6CA, -1060, -300}, {0xFF77B1FCBEBCDC4F, -1034, -292},
{0xBE5691EF416BD60C, -1007, -284}, {0x8DD01FAD907FFC3C, -980, -276},
{0xD3515C2831559A83, -954, -268}, {0x9D71AC8FADA6C9B5, -927, -260},
{0xEA9C227723EE8BCB, -901, -252}, {0xAECC49914078536D, -874, -244},
{0x823C12795DB6CE57, -847, -236}, {0xC21094364DFB5637, -821, -228},
{0x9096EA6F3848984F, -794, -220}, {0xD77485CB25823AC7, -768, -212},
{0xA086CFCD97BF97F4, -741, -204}, {0xEF340A98172AACE5, -715, -196},
{0xB23867FB2A35B28E, -688, -188}, {0x84C8D4DFD2C63F3B, -661, -180},
{0xC5DD44271AD3CDBA, -635, -172}, {0x936B9FCEBB25C996, -608, -164},
{0xDBAC6C247D62A584, -582, -156}, {0xA3AB66580D5FDAF6, -555, -148},
{0xF3E2F893DEC3F126, -529, -140}, {0xB5B5ADA8AAFF80B8, -502, -132},
{0x87625F056C7C4A8B, -475, -124}, {0xC9BCFF6034C13053, -449, -116},
{0x964E858C91BA2655, -422, -108}, {0xDFF9772470297EBD, -396, -100},
{0xA6DFBD9FB8E5B88F, -369, -92}, {0xF8A95FCF88747D94, -343, -84},
{0xB94470938FA89BCF, -316, -76}, {0x8A08F0F8BF0F156B, -289, -68},
{0xCDB02555653131B6, -263, -60}, {0x993FE2C6D07B7FAC, -236, -52},
{0xE45C10C42A2B3B06, -210, -44}, {0xAA242499697392D3, -183, -36},
{0xFD87B5F28300CA0E, -157, -28}, {0xBCE5086492111AEB, -130, -20},
{0x8CBCCC096F5088CC, -103, -12}, {0xD1B71758E219652C, -77, -4},
{0x9C40000000000000, -50, 4}, {0xE8D4A51000000000, -24, 12},
{0xAD78EBC5AC620000, 3, 20}, {0x813F3978F8940984, 30, 28},
{0xC097CE7BC90715B3, 56, 36}, {0x8F7E32CE7BEA5C70, 83, 44},
{0xD5D238A4ABE98068, 109, 52}, {0x9F4F2726179A2245, 136, 60},
{0xED63A231D4C4FB27, 162, 68}, {0xB0DE65388CC8ADA8, 189, 76},
{0x83C7088E1AAB65DB, 216, 84}, {0xC45D1DF942711D9A, 242, 92},
{0x924D692CA61BE758, 269, 100}, {0xDA01EE641A708DEA, 295, 108},
{0xA26DA3999AEF774A, 322, 116}, {0xF209787BB47D6B85, 348, 124},
{0xB454E4A179DD1877, 375, 132}, {0x865B86925B9BC5C2, 402, 140},
{0xC83553C5C8965D3D, 428, 148}, {0x952AB45CFA97A0B3, 455, 156},
{0xDE469FBD99A05FE3, 481, 164}, {0xA59BC234DB398C25, 508, 172},
{0xF6C69A72A3989F5C, 534, 180}, {0xB7DCBF5354E9BECE, 561, 188},
{0x88FCF317F22241E2, 588, 196}, {0xCC20CE9BD35C78A5, 614, 204},
{0x98165AF37B2153DF, 641, 212}, {0xE2A0B5DC971F303A, 667, 220},
{0xA8D9D1535CE3B396, 694, 228}, {0xFB9B7CD9A4A7443C, 720, 236},
{0xBB764C4CA7A44410, 747, 244}, {0x8BAB8EEFB6409C1A, 774, 252},
{0xD01FEF10A657842C, 800, 260}, {0x9B10A4E5E9913129, 827, 268},
{0xE7109BFBA19C0C9D, 853, 276}, {0xAC2820D9623BF429, 880, 284},
{0x80444B5E7AA7CF85, 907, 292}, {0xBF21E44003ACDD2D, 933, 300},
{0x8E679C2F5E44FF8F, 960, 308}, {0xD433179D9C8CB841, 986, 316},
{0x9E19DB92B4E31BA9, 1013, 324},
}};
// This computation gives exactly the same results for k as
// k = ceil((kAlpha - e - 1) * 0.30102999566398114)
// for |e| <= 1500, but doesn't require floating-point operations.
// NB: log_10(2) ~= 78913 / 2^18
// Integer division truncates toward zero, hence the +1 correction for
// positive f to obtain the ceiling.
const int f = kAlpha - e - 1;
const int k = (f * 78913) / (1 << 18) + static_cast<int>(f > 0);
// Round k up to the next tabulated decimal exponent.
// NOTE(review): index is not range-checked here; this relies on e staying
// within the range derived above -- confirm at the call sites.
const int index = (-kCachedPowersMinDecExp + k + (kCachedPowersDecStep - 1)) /
kCachedPowersDecStep;
const cached_power cached = kCachedPowers[static_cast<std::size_t>(index)];
return cached;
}
/*!
For n != 0, returns k, such that pow10 := 10^(k-1) <= n < 10^k.
For n == 0, returns 1 and sets pow10 := 1.
@param n     value whose number of decimal digits is wanted
@param pow10 out: the largest power of ten that is <= n (1 when n == 0)
@return the number of decimal digits of n (between 1 and 10)
*/
inline int find_largest_pow10(const std::uint32_t n, std::uint32_t &pow10) {
  // Powers of ten from 10^9 down to 10^1, tried from the largest to the
  // smallest; 10^0 is the fall-through case below.
  static constexpr std::uint32_t kDescendingPowers[] = {
      1000000000, 100000000, 10000000, 1000000, 100000,
      10000,      1000,      100,      10};
  int digits = 10;
  for (const std::uint32_t candidate : kDescendingPowers) {
    if (n >= candidate) {
      pow10 = candidate;
      return digits;
    }
    --digits;
  }
  pow10 = 1;
  return 1;
}
/*!
Moves the decimal number stored in buf closer to w by decrementing its last
digit, as long as the result stays inside the rounding interval.
@param buf   digit buffer; only buf[len - 1] is modified
@param len   number of digits in buf
@param dist  distance M+ - w
@param delta width of the interval, M+ - M-
@param rest  distance M+ - V, where V is the number currently in buf
@param ten_k one unit in the last place of buf, in the same units as
             dist, delta and rest
*/
inline void grisu2_round(char *buf, int len, std::uint64_t dist,
                         std::uint64_t delta, std::uint64_t rest,
                         std::uint64_t ten_k) {
  //               <--------------------------- delta ---->
  //                                  <---- dist --------->
  // --------------[------------------+-------------------]--------------
  //               M-                 w                   M+
  //
  //                                                ten_k
  //                                              <------>
  //                                       <---- rest ---->
  // --------------[------------------+----+--------------]--------------
  //                                  w    V
  //                                       = buf * 10^k
  //
  // The checks are made in this order to avoid overflow in unsigned integer
  // arithmetic.
  for (;;) {
    // V is already at or below w: nothing to gain.
    if (rest >= dist) {
      break;
    }
    // Decrementing would leave the interval [M-, M+].
    if (delta - rest < ten_k) {
      break;
    }
    const std::uint64_t lowered = rest + ten_k;
    // The decremented value would pass w without ending up closer to it.
    if (lowered >= dist && dist - rest <= lowered - dist) {
      break;
    }
    buf[len - 1]--;
    rest = lowered;
  }
}
/*!
Generates V = buffer * 10^decimal_exponent, such that M- <= V <= M+.
M- and M+ must be normalized and share the same exponent -60 <= e <= -32.
@param buffer receives the decimal digits as ASCII characters, starting at
buffer[length]
@param length in/out: index of the next free slot in buffer; incremented once
per digit written
@param decimal_exponent in/out: increased by the number of integral digits
that were not emitted, or decreased by the number of fractional digits that
were emitted
@param M_minus lower bound of the rounding interval
@param w the scaled value the generated number should be closest to
@param M_plus upper bound of the rounding interval; its digits are the ones
being generated
*/
inline void grisu2_digit_gen(char *buffer, int &length, int &decimal_exponent,
diyfp M_minus, diyfp w, diyfp M_plus) {
static_assert(kAlpha >= -60, "internal error");
static_assert(kGamma <= -32, "internal error");
// Generates the digits (and the exponent) of a decimal floating-point
// number V = buffer * 10^decimal_exponent in the range [M-, M+]. The diyfp's
// w, M- and M+ share the same exponent e, which satisfies alpha <= e <=
// gamma.
//
// <--------------------------- delta ---->
// <---- dist --------->
// --------------[------------------+-------------------]--------------
// M- w M+
//
// Grisu2 generates the digits of M+ from left to right and stops as soon as
// V is in [M-,M+].
std::uint64_t delta =
diyfp::sub(M_plus, M_minus)
.f; // (significand of (M+ - M-), implicit exponent is e)
std::uint64_t dist =
diyfp::sub(M_plus, w)
.f; // (significand of (M+ - w ), implicit exponent is e)
// Split M+ = f * 2^e into two parts p1 and p2 (note: e < 0):
//
// M+ = f * 2^e
// = ((f div 2^-e) * 2^-e + (f mod 2^-e)) * 2^e
// = ((p1 ) * 2^-e + (p2 )) * 2^e
// = p1 + p2 * 2^e
// 'one' is the value 1 expressed with exponent e, i.e. one.f = 2^-e.
const diyfp one(std::uint64_t{1} << -M_plus.e, M_plus.e);
auto p1 = static_cast<std::uint32_t>(
M_plus.f >>
-one.e); // p1 = f div 2^-e (Since -e >= 32, p1 fits into a 32-bit int.)
std::uint64_t p2 = M_plus.f & (one.f - 1); // p2 = f mod 2^-e
// 1)
//
// Generate the digits of the integral part p1 = d[n-1]...d[1]d[0]
std::uint32_t pow10;
const int k = find_largest_pow10(p1, pow10);
// 10^(k-1) <= p1 < 10^k, pow10 = 10^(k-1)
//
// p1 = (p1 div 10^(k-1)) * 10^(k-1) + (p1 mod 10^(k-1))
// = (d[k-1] ) * 10^(k-1) + (p1 mod 10^(k-1))
//
// M+ = p1 + p2 * 2^e
// = d[k-1] * 10^(k-1) + (p1 mod 10^(k-1)) + p2 * 2^e
// = d[k-1] * 10^(k-1) + ((p1 mod 10^(k-1)) * 2^-e + p2) * 2^e
// = d[k-1] * 10^(k-1) + ( rest) * 2^e
//
// Now generate the digits d[n] of p1 from left to right (n = k-1,...,0)
//
// p1 = d[k-1]...d[n] * 10^n + d[n-1]...d[0]
//
// but stop as soon as
//
// rest * 2^e = (d[n-1]...d[0] * 2^-e + p2) * 2^e <= delta * 2^e
int n = k;
while (n > 0) {
// Invariants:
// M+ = buffer * 10^n + (p1 + p2 * 2^e) (buffer = 0 for n = k)
// pow10 = 10^(n-1) <= p1 < 10^n
//
const std::uint32_t d = p1 / pow10; // d = p1 div 10^(n-1)
const std::uint32_t r = p1 % pow10; // r = p1 mod 10^(n-1)
//
// M+ = buffer * 10^n + (d * 10^(n-1) + r) + p2 * 2^e
// = (buffer * 10 + d) * 10^(n-1) + (r + p2 * 2^e)
//
buffer[length++] = static_cast<char>('0' + d); // buffer := buffer * 10 + d
//
// M+ = buffer * 10^(n-1) + (r + p2 * 2^e)
//
p1 = r;
n--;
//
// M+ = buffer * 10^n + (p1 + p2 * 2^e)
// pow10 = 10^n
//
// Now check if enough digits have been generated.
// Compute
//
// p1 + p2 * 2^e = (p1 * 2^-e + p2) * 2^e = rest * 2^e
//
// Note:
// Since rest and delta share the same exponent e, it suffices to
// compare the significands.
const std::uint64_t rest = (std::uint64_t{p1} << -one.e) + p2;
if (rest <= delta) {
// V = buffer * 10^n, with M- <= V <= M+.
decimal_exponent += n;
// We may now just stop. But instead look if the buffer could be
// decremented to bring V closer to w.
//
// pow10 = 10^n is now 1 ulp in the decimal representation V.
// The rounding procedure works with diyfp's with an implicit
// exponent of e.
//
// 10^n = (10^n * 2^-e) * 2^e = ulp * 2^e
//
const std::uint64_t ten_n = std::uint64_t{pow10} << -one.e;
grisu2_round(buffer, length, dist, delta, rest, ten_n);
return;
}
pow10 /= 10;
//
// pow10 = 10^(n-1) <= p1 < 10^n
// Invariants restored.
}
// 2)
//
// The digits of the integral part have been generated:
//
// M+ = d[k-1]...d[1]d[0] + p2 * 2^e
// = buffer + p2 * 2^e
//
// Now generate the digits of the fractional part p2 * 2^e.
//
// Note:
// No decimal point is generated: the exponent is adjusted instead.
//
// p2 actually represents the fraction
//
// p2 * 2^e
// = p2 / 2^-e
// = d[-1] / 10^1 + d[-2] / 10^2 + ...
//
// Now generate the digits d[-m] of p2 from left to right (m = 1,2,...)
//
// p2 * 2^e = d[-1]d[-2]...d[-m] * 10^-m
// + 10^-m * (d[-m-1] / 10^1 + d[-m-2] / 10^2 + ...)
//
// using
//
// 10^m * p2 = ((10^m * p2) div 2^-e) * 2^-e + ((10^m * p2) mod 2^-e)
// = ( d) * 2^-e + ( r)
//
// or
// 10^m * p2 * 2^e = d + r * 2^e
//
// i.e.
//
// M+ = buffer + p2 * 2^e
// = buffer + 10^-m * (d + r * 2^e)
// = (buffer * 10^m + d) * 10^-m + 10^-m * r * 2^e
//
// and stop as soon as 10^-m * r * 2^e <= delta * 2^e
int m = 0;
for (;;) {
// Invariant:
// M+ = buffer * 10^-m + 10^-m * (d[-m-1] / 10 + d[-m-2] / 10^2 + ...) * 2^e
// = buffer * 10^-m + 10^-m * (p2) * 2^e
// = buffer * 10^-m + 10^-m * (1/10 * (10 * p2)) * 2^e
// = buffer * 10^-m + 10^-m * (1/10 * ((10*p2 div 2^-e) * 2^-e +
// (10*p2 mod 2^-e))) * 2^e
//
p2 *= 10;
const std::uint64_t d = p2 >> -one.e; // d = (10 * p2) div 2^-e
const std::uint64_t r = p2 & (one.f - 1); // r = (10 * p2) mod 2^-e
//
// M+ = buffer * 10^-m + 10^-m * (1/10 * (d * 2^-e + r) * 2^e
// = buffer * 10^-m + 10^-m * (1/10 * (d + r * 2^e))
// = (buffer * 10 + d) * 10^(-m-1) + 10^(-m-1) * r * 2^e
//
buffer[length++] = static_cast<char>('0' + d); // buffer := buffer * 10 + d
//
// M+ = buffer * 10^(-m-1) + 10^(-m-1) * r * 2^e
//
p2 = r;
m++;
//
// M+ = buffer * 10^-m + 10^-m * p2 * 2^e
// Invariant restored.
// Check if enough digits have been generated.
//
// 10^-m * p2 * 2^e <= delta * 2^e
// p2 * 2^e <= 10^m * delta * 2^e
// p2 <= 10^m * delta
// Instead of dividing p2 by 10^m, delta and dist are scaled up by 10 on
// every iteration.
delta *= 10;
dist *= 10;
if (p2 <= delta) {
break;
}
}
// V = buffer * 10^-m, with M- <= V <= M+.
decimal_exponent -= m;
// 1 ulp in the decimal representation is now 10^-m.
// Since delta and dist are now scaled by 10^m, we need to do the
// same with ulp in order to keep the units in sync.
//
// 10^m * 10^-m = 1 = 2^-e * 2^e = ten_m * 2^e
//
const std::uint64_t ten_m = one.f;
grisu2_round(buffer, length, dist, delta, p2, ten_m);
// By construction this algorithm generates the shortest possible decimal
// number (Loitsch, Theorem 6.2) which rounds back to w.
// For an input number of precision p, at least
//
// N = 1 + ceil(p * log_10(2))
//
// decimal digits are sufficient to identify all binary floating-point
// numbers (Matula, "In-and-Out conversions").
// This implies that the algorithm does not produce more than N decimal
// digits.
//
// N = 17 for p = 53 (IEEE double precision)
// N = 9 for p = 24 (IEEE single precision)
}
/*!
v = buf * 10^decimal_exponent
len is the length of the buffer (number of decimal digits)
The buffer must be large enough, i.e. >= max_digits10.
*/
inline void grisu2(char *buf, int &len, int &decimal_exponent, diyfp m_minus,
diyfp v, diyfp m_plus) {
// --------(-----------------------+-----------------------)-------- (A)
// m- v m+
//
// --------------------(-----------+-----------------------)-------- (B)
// m- v m+
//
// First scale v (and m- and m+) such that the exponent is in the range
// [alpha, gamma].
const cached_power cached = get_cached_power_for_binary_exponent(m_plus.e);
const diyfp c_minus_k(cached.f, cached.e); // = c ~= 10^-k
// The exponent of the products is = v.e + c_minus_k.e + q and is in the range
// [alpha,gamma]
const diyfp w = diyfp::mul(v, c_minus_k);
const diyfp w_minus = diyfp::mul(m_minus, c_minus_k);
const diyfp w_plus = diyfp::mul(m_plus, c_minus_k);
// ----(---+---)---------------(---+---)---------------(---+---)----
// w- w w+
// = c*m- = c*v = c*m+
//
// diyfp::mul rounds its result and c_minus_k is approximated too. w, w- and
// w+ are now off by a small amount.
// In fact:
//
// w - v * 10^k < 1 ulp
//
// To account for this inaccuracy, add resp. subtract 1 ulp.
//
// --------+---[---------------(---+---)---------------]---+--------
// w- M- w M+ w+
//
// Now any number in [M-, M+] (bounds included) will round to w when input,
// regardless of how the input rounding algorithm breaks ties.
//
// And digit_gen generates the shortest possible such number in [M-, M+].
// Note that this does not mean that Grisu2 always generates the shortest
// possible number in the interval (m-, m+).
const diyfp M_minus(w_minus.f + 1, w_minus.e);
const diyfp M_plus(w_plus.f - 1, w_plus.e);
decimal_exponent = -cached.k; // = -(-k) = k
grisu2_digit_gen(buf, len, decimal_exponent, M_minus, w, M_plus);
}
/*!
v = buf * 10^decimal_exponent
len is the length of the buffer (number of decimal digits)
The buffer must be large enough, i.e. >= max_digits10.
*/
template <typename FloatType>
void grisu2(char *buf, int &len, int &decimal_exponent, FloatType value) {
static_assert(diyfp::kPrecision >= std::numeric_limits<FloatType>::digits + 3,
"internal error: not enough precision");
// If the neighbors (and boundaries) of 'value' are always computed for
// double-precision numbers, all float's can be recovered using strtod (and
// strtof). However, the resulting decimal representations are not exactly
// "short".
//
// The documentation for 'std::to_chars'
// (https://en.cppreference.com/w/cpp/utility/to_chars) says "value is
// converted to a string as if by std::sprintf in the default ("C") locale"
// and since sprintf promotes float's to double's, I think this is exactly
// what 'std::to_chars' does. On the other hand, the documentation for
// 'std::to_chars' requires that "parsing the representation using the
// corresponding std::from_chars function recovers value exactly". That
// indicates that single precision floating-point numbers should be recovered
// using 'std::strtof'.
//
// NB: If the neighbors are computed for single-precision numbers, there is a
// single float
// (7.0385307e-26f) which can't be recovered using strtod. The resulting
// double precision value is off by 1 ulp.
#if 0
const boundaries w = compute_boundaries(static_cast<double>(value));
#else
const boundaries w = compute_boundaries(value);
#endif
grisu2(buf, len, decimal_exponent, w.minus, w.w, w.plus);
}
/*!
@brief writes a signed decimal exponent e at buf
The sign character ('+' or '-') is always emitted, followed by at least two
digits (for compatibility with printf("%g")) and at most three.
@return a pointer to the element following the exponent.
@pre -1000 < e < 1000
*/
inline char *append_exponent(char *buf, int e) {
  const bool negative = e < 0;
  *buf++ = negative ? '-' : '+';

  auto magnitude = static_cast<std::uint32_t>(negative ? -e : e);

  // The hundreds digit is only written when it is needed.
  if (magnitude >= 100) {
    *buf++ = static_cast<char>('0' + magnitude / 100);
    magnitude %= 100;
  }

  // Tens and units are always written, so single-digit exponents get a
  // leading '0'.
  *buf++ = static_cast<char>('0' + magnitude / 10);
  *buf++ = static_cast<char>('0' + magnitude % 10);
  return buf;
}
/*!
@brief rewrites the digit string of v = buf * 10^decimal_exponent in place
into its final textual form
If v is in the range [10^min_exp, 10^max_exp) it will be printed in fixed-point
notation. Otherwise it will be printed in exponential notation.
@return a pointer one past the last character written.
@pre min_exp < 0
@pre max_exp > 0
*/
inline char *format_buffer(char *buf, int len, int decimal_exponent,
                           int min_exp, int max_exp) {
  // v = buf * 10^(point - len)
  // len is the number of decimal digits in the buffer;
  // point is the position of the decimal point relative to the buffer start.
  const int point = len + decimal_exponent;
  const auto digit_count = static_cast<size_t>(len);

  if (point <= max_exp) {
    if (len <= point) {
      // digits[000]
      // output length <= max_exp + 2
      const auto int_width = static_cast<size_t>(point);
      std::memset(buf + len, '0', int_width - digit_count);
      // Make it look like a floating-point number (#362, #378)
      buf[point] = '.';
      buf[point + 1] = '0';
      return buf + (int_width + 2);
    }
    if (point > 0) {
      // dig.its
      // output length <= max_digits10 + 1
      const auto int_width = static_cast<size_t>(point);
      // Shift the fractional digits right by one to make room for the '.'.
      std::memmove(buf + (int_width + 1), buf + point,
                   digit_count - int_width);
      buf[point] = '.';
      return buf + (digit_count + 1U);
    }
  }

  if (point <= 0 && min_exp < point) {
    // 0.[000]digits
    // output length <= 2 + (-min_exp - 1) + max_digits10
    const auto leading_zeros = static_cast<size_t>(-point);
    // Move the digits out of the way first, then fill in the "0.000" prefix.
    std::memmove(buf + (2 + leading_zeros), buf, digit_count);
    buf[0] = '0';
    buf[1] = '.';
    std::memset(buf + 2, '0', leading_zeros);
    return buf + (2U + leading_zeros + digit_count);
  }

  // Exponential notation.
  // Single digit:  dE+123        (output length <= 1 + 5)
  // Otherwise:     d.igitsE+123  (output length <= max_digits10 + 1 + 5)
  char *out = buf + 1;
  if (len != 1) {
    std::memmove(buf + 2, buf + 1, digit_count - 1);
    buf[1] = '.';
    out = buf + (1 + digit_count);
  }
  *out++ = 'e';
  return append_exponent(out, point - 1);
}
} // namespace dtoa_impl
/*!
Writes a decimal representation of value starting at first, in a format
similar to printf's %g. Returns an iterator pointing past-the-end of the
decimal representation.
@note The input number must be finite, i.e. NaN's and Inf's are not supported.
@note The buffer must be large enough; last is not consulted.
@note The result is NOT null-terminated.
*/
char *to_chars(char *first, const char *last, double value) {
  static_cast<void>(last); // maybe unused - fix warning

  char *out = first;

  // signbit(value) is used instead of (value < 0) because it also works for -0.
  const bool negative = std::signbit(value);
  if (negative) {
    *out++ = '-';
    value = -value;
  }

  if (value == 0) { // +-0
    // Make it look like a floating-point number (#362, #378)
    out[0] = '0';
    out[1] = '.';
    out[2] = '0';
    return out + 3;
  }

  // Compute v = buffer * 10^exponent10.
  // The decimal digits land in the output buffer and are to be read as an
  // unsigned decimal integer; digit_count is how many of them there are.
  int digit_count = 0;
  int exponent10 = 0;
  dtoa_impl::grisu2(out, digit_count, exponent10, value);

  // Format the buffer like printf("%.*g", prec, value)
  constexpr int kMinExp = -4;
  constexpr int kMaxExp = std::numeric_limits<double>::digits10;
  return dtoa_impl::format_buffer(out, digit_count, exponent10, kMinExp,
                                  kMaxExp);
}
} // namespace internal
} // namespace simdjson

View File

@ -52,6 +52,7 @@ target_compile_definitions(stringparsingcheck PRIVATE NOMINMAX)
# All remaining tests link with simdjson proper
link_libraries(simdjson)
add_cpp_test(basictests LABELS acceptance per_implementation)
add_cpp_test(minify_tests LABELS acceptance per_implementation)
add_cpp_test(document_stream_tests LABELS acceptance per_implementation)
add_cpp_test(document_tests LABELS acceptance per_implementation)
add_cpp_test(errortests LABELS acceptance per_implementation)

View File

@ -1365,8 +1365,8 @@ namespace minify_tests {
bool test_minify() {
std::cout << "Running " << __func__ << std::endl;
const std::string test = R"({ "foo" : 1, "bar" : [ 1, 2, 3 ], "baz": { "a": 1, "b": 2, "c": 3 } })";
const std::string minified(R"({"foo":1,"bar":[1,2,3],"baz":{"a":1,"b":2,"c":3}})");
const std::string test = R"({ "foo" : 1, "bar" : [ 1, 2, 0.11111111111111113 ], "baz": { "a": 3.1415926535897936, "b": 2, "c": 3.141592653589794 } })";
const std::string minified(R"({"foo":1,"bar":[1,2,0.11111111111111113],"baz":{"a":3.1415926535897936,"b":2,"c":3.141592653589794}})");
return check_minification(test.c_str(), test.size(), minified.c_str(), minified.size());
}
bool test_minify_array() {
@ -1394,8 +1394,8 @@ namespace format_tests {
using namespace simdjson;
using namespace simdjson::dom;
using namespace std;
const padded_string DOCUMENT = R"({ "foo" : 1, "bar" : [ 1, 2, 3 ], "baz": { "a": 1, "b": 2, "c": 3 } })"_padded;
const string MINIFIED(R"({"foo":1,"bar":[1,2,3],"baz":{"a":1,"b":2,"c":3}})");
const padded_string DOCUMENT = R"({ "foo" : 1, "bar" : [ 1, 2, 0.11111111111111113 ], "baz": { "a": 3.1415926535897936, "b": 2, "c": 3.141592653589794 } })"_padded;
const string MINIFIED(R"({"foo":1,"bar":[1,2,0.11111111111111113],"baz":{"a":3.1415926535897936,"b":2,"c":3.141592653589794}})");
bool assert_minified(ostringstream &actual, const std::string &expected=MINIFIED) {
if (actual.str() != expected) {
cerr << "Failed to correctly minify " << DOCUMENT << endl;
@ -1451,7 +1451,7 @@ namespace format_tests {
ASSERT_SUCCESS( parser.parse(DOCUMENT)["bar"].get(array) );
ostringstream s;
s << array;
return assert_minified(s, "[1,2,3]");
return assert_minified(s, "[1,2,0.11111111111111113]");
}
bool print_minify_array() {
std::cout << "Running " << __func__ << std::endl;
@ -1460,7 +1460,7 @@ namespace format_tests {
ASSERT_SUCCESS( parser.parse(DOCUMENT)["bar"].get(array) );
ostringstream s;
s << minify(array);
return assert_minified(s, "[1,2,3]");
return assert_minified(s, "[1,2,0.11111111111111113]");
}
bool print_object() {
@ -1470,7 +1470,7 @@ namespace format_tests {
ASSERT_SUCCESS( parser.parse(DOCUMENT)["baz"].get(object) );
ostringstream s;
s << object;
return assert_minified(s, R"({"a":1,"b":2,"c":3})");
return assert_minified(s, R"({"a":3.1415926535897936,"b":2,"c":3.141592653589794})");
}
bool print_minify_object() {
std::cout << "Running " << __func__ << std::endl;
@ -1479,7 +1479,7 @@ namespace format_tests {
ASSERT_SUCCESS( parser.parse(DOCUMENT)["baz"].get(object) );
ostringstream s;
s << minify(object);
return assert_minified(s, R"({"a":1,"b":2,"c":3})");
return assert_minified(s, R"({"a":3.1415926535897936,"b":2,"c":3.141592653589794})");
}
#if SIMDJSON_EXCEPTIONS
@ -1536,14 +1536,14 @@ namespace format_tests {
dom::parser parser;
ostringstream s;
s << parser.parse(DOCUMENT)["bar"].get<dom::array>();
return assert_minified(s, "[1,2,3]");
return assert_minified(s, "[1,2,0.11111111111111113]");
}
bool print_minify_array_result_exception() {
std::cout << "Running " << __func__ << std::endl;
dom::parser parser;
ostringstream s;
s << minify(parser.parse(DOCUMENT)["bar"].get<dom::array>());
return assert_minified(s, "[1,2,3]");
return assert_minified(s, "[1,2,0.11111111111111113]");
}
bool print_object_result_exception() {
@ -1551,14 +1551,14 @@ namespace format_tests {
dom::parser parser;
ostringstream s;
s << parser.parse(DOCUMENT)["baz"].get<dom::object>();
return assert_minified(s, R"({"a":1,"b":2,"c":3})");
return assert_minified(s, R"({"a":3.1415926535897936,"b":2,"c":3.141592653589794})");
}
bool print_minify_object_result_exception() {
std::cout << "Running " << __func__ << std::endl;
dom::parser parser;
ostringstream s;
s << minify(parser.parse(DOCUMENT)["baz"].get<dom::object>());
return assert_minified(s, R"({"a":1,"b":2,"c":3})");
return assert_minified(s, R"({"a":3.1415926535897936,"b":2,"c":3.141592653589794})");
}
bool print_array_exception() {
@ -1567,7 +1567,7 @@ namespace format_tests {
dom::array array = parser.parse(DOCUMENT)["bar"];
ostringstream s;
s << array;
return assert_minified(s, "[1,2,3]");
return assert_minified(s, "[1,2,0.11111111111111113]");
}
bool print_minify_array_exception() {
std::cout << "Running " << __func__ << std::endl;
@ -1575,7 +1575,7 @@ namespace format_tests {
dom::array array = parser.parse(DOCUMENT)["bar"];
ostringstream s;
s << minify(array);
return assert_minified(s, "[1,2,3]");
return assert_minified(s, "[1,2,0.11111111111111113]");
}
bool print_object_exception() {
@ -1584,7 +1584,7 @@ namespace format_tests {
dom::object object = parser.parse(DOCUMENT)["baz"];
ostringstream s;
s << object;
return assert_minified(s, R"({"a":1,"b":2,"c":3})");
return assert_minified(s, R"({"a":3.1415926535897936,"b":2,"c":3.141592653589794})");
}
bool print_minify_object_exception() {
std::cout << "Running " << __func__ << std::endl;
@ -1592,7 +1592,7 @@ namespace format_tests {
dom::object object = parser.parse(DOCUMENT)["baz"];
ostringstream s;
s << minify(object);
return assert_minified(s, R"({"a":1,"b":2,"c":3})");
return assert_minified(s, R"({"a":3.1415926535897936,"b":2,"c":3.141592653589794})");
}
#endif // SIMDJSON_EXCEPTIONS
@ -1615,6 +1615,149 @@ namespace format_tests {
}
namespace to_string_tests {
using namespace simdjson;
using namespace simdjson::dom;
using namespace std;
const padded_string DOCUMENT = R"({ "foo" : 1, "bar" : [ 1, 2, 0.11111111111111113 ], "baz": { "a": 3.1415926535897936, "b": 2, "c": 3.141592653589794 } })"_padded;
const string MINIFIED(R"({"foo":1,"bar":[1,2,0.11111111111111113],"baz":{"a":3.1415926535897936,"b":2,"c":3.141592653589794}})");
bool assert_minified(ostringstream &actual, const std::string &expected=MINIFIED) {
if (actual.str() != expected) {
cerr << "Failed to correctly to_string " << DOCUMENT << endl;
cerr << "Expected: " << expected << endl;
cerr << "Actual: " << actual.str() << endl;
return false;
}
return true;
}
bool print_to_string_parser_parse() {
std::cout << "Running " << __func__ << std::endl;
dom::parser parser;
dom::element doc;
ASSERT_SUCCESS( parser.parse(DOCUMENT).get(doc) );
ostringstream s;
s << to_string(doc);
return assert_minified(s);
}
bool print_to_string_element() {
std::cout << "Running " << __func__ << std::endl;
dom::parser parser;
dom::element value;
ASSERT_SUCCESS( parser.parse(DOCUMENT)["foo"].get(value) );
ostringstream s;
s << to_string(value);
return assert_minified(s, "1");
}
bool print_to_string_array() {
std::cout << "Running " << __func__ << std::endl;
dom::parser parser;
dom::array array;
ASSERT_SUCCESS( parser.parse(DOCUMENT)["bar"].get(array) );
ostringstream s;
s << to_string(array);
return assert_minified(s, "[1,2,0.11111111111111113]");
}
bool print_to_string_object() {
std::cout << "Running " << __func__ << std::endl;
dom::parser parser;
dom::object object;
ASSERT_SUCCESS( parser.parse(DOCUMENT)["baz"].get(object) );
ostringstream s;
s << to_string(object);
return assert_minified(s, R"({"a":3.1415926535897936,"b":2,"c":3.141592653589794})");
}
#if SIMDJSON_EXCEPTIONS
bool print_to_string_parser_parse_exception() {
std::cout << "Running " << __func__ << std::endl;
dom::parser parser;
ostringstream s;
s << to_string(parser.parse(DOCUMENT));
return assert_minified(s);
}
bool print_to_string_element_result_exception() {
std::cout << "Running " << __func__ << std::endl;
dom::parser parser;
ostringstream s;
s << to_string(parser.parse(DOCUMENT)["foo"]);
return assert_minified(s, "1");
}
bool print_to_string_element_exception() {
std::cout << "Running " << __func__ << std::endl;
dom::parser parser;
element value = parser.parse(DOCUMENT)["foo"];
ostringstream s;
s << to_string(value);
return assert_minified(s, "1");
}
bool print_to_string_array_result_exception() {
std::cout << "Running " << __func__ << std::endl;
dom::parser parser;
ostringstream s;
s << to_string(parser.parse(DOCUMENT)["bar"].get<dom::array>());
return assert_minified(s, "[1,2,0.11111111111111113]");
}
bool print_to_string_object_result_exception() {
std::cout << "Running " << __func__ << std::endl;
dom::parser parser;
ostringstream s;
s << to_string(parser.parse(DOCUMENT)["baz"].get<dom::object>());
return assert_minified(s, R"({"a":3.1415926535897936,"b":2,"c":3.141592653589794})");
}
bool print_to_string_array_exception() {
std::cout << "Running " << __func__ << std::endl;
dom::parser parser;
dom::array array = parser.parse(DOCUMENT)["bar"];
ostringstream s;
s << to_string(array);
return assert_minified(s, "[1,2,0.11111111111111113]");
}
bool print_to_string_object_exception() {
std::cout << "Running " << __func__ << std::endl;
dom::parser parser;
dom::object object = parser.parse(DOCUMENT)["baz"];
ostringstream s;
s << to_string(object);
return assert_minified(s, R"({"a":3.1415926535897936,"b":2,"c":3.141592653589794})");
}
#endif // SIMDJSON_EXCEPTIONS
bool run() {
return print_to_string_parser_parse() &&
print_to_string_element() &&
print_to_string_array() &&
print_to_string_object() &&
#if SIMDJSON_EXCEPTIONS
print_to_string_parser_parse_exception() &&
print_to_string_element_result_exception() &&
print_to_string_array_result_exception() &&
print_to_string_object_result_exception() &&
print_to_string_element_exception() &&
print_to_string_array_exception() &&
print_to_string_object_exception() &&
#endif
true;
}
}
int main(int argc, char *argv[]) {
std::cout << std::unitbuf;
int c;
@ -1646,7 +1789,8 @@ int main(int argc, char *argv[]) {
std::cout << "------------------------------------------------------------" << std::endl;
std::cout << "Running basic tests." << std::endl;
if (validate_tests::run() &&
if (to_string_tests::run() &&
validate_tests::run() &&
minify_tests::run() &&
parse_api_tests::run() &&
dom_api_tests::run() &&

80
tests/minify_tests.cpp Normal file
View File

@ -0,0 +1,80 @@
#include <cinttypes>
#include <ciso646>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <set>
#include <sstream>
#include <string>
#include <unistd.h>
#include <utility>
#include <vector>
#include "cast_tester.h"
#include "simdjson.h"
#include "test_macros.h"
// Paths of the JSON documents that minify_test() feeds to load_to_string()
// and load_minify(). The identifiers are not defined in this file; presumably
// they come from test_macros.h -- confirm.
const char *test_files[] = {
TWITTER_JSON, TWITTER_TIMELINE_JSON, REPEAT_JSON, CANADA_JSON,
MESH_JSON, APACHE_JSON, GSOC_JSON};
/**
 * The general idea of these tests is that if you take a JSON file,
 * load it, then convert it into a string, then parse that, and
 * convert it again into a second string, then the two strings should
 * be identical. If not, then something was lost or added in the
 * process.
 *
 * This variant serializes with simdjson::to_string.
 *
 * @param filename path of the JSON file to load
 * @return true when both serializations are equal; false when loading or
 *         re-parsing fails, or when the two serializations differ. Every
 *         failure is reported on std::cerr.
 */
bool load_to_string(const char *filename) {
  std::cout << "Loading " << filename << std::endl;
  simdjson::dom::parser parser;
  simdjson::dom::element doc;
  auto error = parser.load(filename).get(doc);
  if (error) { std::cerr << error << std::endl; return false; }
  auto serial1 = simdjson::to_string(doc);
  // Parse our own output and serialize it a second time.
  error = parser.parse(serial1).get(doc);
  if (error) { std::cerr << error << std::endl; return false; }
  auto serial2 = simdjson::to_string(doc);
  const bool match = (serial1 == serial2);
  if (match) {
    std::cout << "Parsing to_string and calling to_string again results in the "
                 "same content."
              << std::endl;
  } else {
    // Without this message a mismatch would fail the test silently.
    std::cerr << "Parsing to_string and calling to_string again results in "
                 "different content for "
              << filename << std::endl;
  }
  return match;
}
/**
 * Round-trip check using simdjson::minify: load the file, minify it, parse
 * the minified text, minify again, and require both minified strings to be
 * equal.
 *
 * @param filename path of the JSON file to load
 * @return true when both minified strings are equal; false when loading or
 *         re-parsing fails, or when the two strings differ. Every failure is
 *         reported on std::cerr.
 */
bool load_minify(const char *filename) {
  std::cout << "Loading " << filename << std::endl;
  simdjson::dom::parser parser;
  simdjson::dom::element doc;
  auto error = parser.load(filename).get(doc);
  if (error) { std::cerr << error << std::endl; return false; }
  auto serial1 = simdjson::minify(doc);
  // Parse our own output and minify it a second time.
  error = parser.parse(serial1).get(doc);
  if (error) { std::cerr << error << std::endl; return false; }
  auto serial2 = simdjson::minify(doc);
  const bool match = (serial1 == serial2);
  if (match) {
    std::cout << "Parsing minify and calling minify again results in the same "
                 "content."
              << std::endl;
  } else {
    // Without this message a mismatch would fail the test silently.
    std::cerr << "Parsing minify and calling minify again results in "
                 "different content for "
              << filename << std::endl;
  }
  return match;
}
// Runs the to_string round trip, then the minify round trip, on every entry
// of test_files. Stops and returns false at the first check that fails.
bool minify_test() {
  std::cout << "Running " << __func__ << std::endl;
  for (const char *file : test_files) {
    if (!load_to_string(file)) {
      return false;
    }
    if (!load_minify(file)) {
      return false;
    }
  }
  return true;
}
// Entry point: the process exit status reflects the outcome of minify_test().
int main() {
  const bool passed = minify_test();
  return passed ? EXIT_SUCCESS : EXIT_FAILURE;
}

View File

@ -7,6 +7,11 @@
const char *TWITTER_JSON = SIMDJSON_BENCHMARK_DATA_DIR "twitter.json";
const char *TWITTER_TIMELINE_JSON = SIMDJSON_BENCHMARK_DATA_DIR "twitter_timeline.json";
const char *REPEAT_JSON = SIMDJSON_BENCHMARK_DATA_DIR "repeat.json";
const char *CANADA_JSON = SIMDJSON_BENCHMARK_DATA_DIR "canada.json";
const char *MESH_JSON = SIMDJSON_BENCHMARK_DATA_DIR "mesh.json";
const char *APACHE_JSON = SIMDJSON_BENCHMARK_DATA_DIR "apache_builds.json";
const char *GSOC_JSON = SIMDJSON_BENCHMARK_DATA_DIR "gsoc-2018.json";
const char *AMAZON_CELLPHONES_NDJSON = SIMDJSON_BENCHMARK_DATA_DIR "amazon_cellphones.ndjson";
#define SIMDJSON_BENCHMARK_SMALLDATA_DIR SIMDJSON_BENCHMARK_DATA_DIR "small/"