new number parsing (#1222)

* Remove our dependency on strtod_l by bundling our own slow path.

* Ok. Let us drop strtod entirely.

* Trimming down the powers to -342.

* Removing useless line.

* Many more comments.

* Adding some DLL exports.

* Let the gods help those who rely on windows+gcc.

* Marking the subnormals as unlikely. This is pretty much "performance neutral", but it might help just a bit with twitter.json.
This commit is contained in:
Daniel Lemire 2020-10-10 12:47:49 -04:00 committed by GitHub
parent 1d9926698e
commit 37e6d1e9c7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 1735 additions and 863 deletions

View File

@ -13,6 +13,12 @@ namespace internal {
* Defined in src/to_chars
*/
char *to_chars(char *first, const char *last, double value);
/**
* @private
* A number parsing routine.
* Defined in src/from_chars
*/
double from_chars(const char *first) noexcept;
}
#ifndef SIMDJSON_EXCEPTIONS
@ -209,48 +215,4 @@ namespace std {
#define SIMDJSON_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } }
/**
* We may fall back on the system's number parsing, and we want
* to be able to call a locale-insensitive number parser. It unfortunately
* means that we need to load up locale headers.
* The locale.h header is generally available:
*/
#include <locale.h>
/**
* Determining whether we should import xlocale.h or not is
* a bit of a nightmare. Visual Studio and recent recent GLIBC (GCC) do not need it.
* However, FreeBSD and Apple platforms will need it.
* And we would want to cover as many platforms as possible.
*/
#ifdef __has_include
// This is the easy case: we have __has_include and can check whether
// xlocale is available. If so, we load it up.
#if __has_include(<xlocale.h>)
#include <xlocale.h>
#endif // __has_include
#else // We do not have __has_include
// Here we do not have __has_include
// We first check for __GLIBC__
#ifdef __GLIBC__ // If we have __GLIBC__ then we should have features.h which should help.
// Note that having __GLIBC__ does not imply that we are compiling against glibc. But
// we hope that any platform that defines __GLIBC__ will mimick glibc.
#include <features.h>
// Check whether we have an old GLIBC.
#if !((__GLIBC__ > 2) || ((__GLIBC__ == 2) && (__GLIBC_MINOR__ > 25)))
#include <xlocale.h> // Old glibc needs xlocale, otherwise xlocale is unavailable.
#endif // !((__GLIBC__ > 2) || ((__GLIBC__ == 2) && (__GLIBC_MINOR__ > 25)))
#else // __GLIBC__
// Ok. So we do not have __GLIBC__
// We assume that everything that is not GLIBC and not on old freebsd or windows
// needs xlocale.
// It is likely that recent FreeBSD and Apple platforms load xlocale.h next:
#if !(defined(_WIN32) || (__FreeBSD_version < 1000010))
#include <xlocale.h> // Will always happen under apple.
#endif //
#endif // __GLIBC__
#endif // __has_include
/**
* End of the crazy locale headers.
*/
#endif // SIMDJSON_COMMON_DEFS_H

View File

@ -8,12 +8,7 @@ namespace {
/// @private
namespace numberparsing {
using internal::FASTFLOAT_LARGEST_POWER;
using internal::FASTFLOAT_SMALLEST_POWER;
using internal::value128;
using internal::power_of_ten;
using internal::mantissa_64;
using internal::mantissa_128;
#ifdef JSON_TEST_NUMBERS
#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), NUMBER_ERROR)
@ -27,12 +22,25 @@ using internal::mantissa_128;
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (WRITER).append_double((VALUE))
#endif
namespace {
// Convert a mantissa, an exponent and a sign bit into an ieee64 double.
// The real_exponent needs to be in [0, 2046] (technically real_exponent = 2047 would be acceptable).
// The mantissa should be in [0,1<<53). The bit at index 52 (mask 1ULL << 52) will be zeroed.
simdjson_really_inline double to_double(uint64_t mantissa, uint64_t real_exponent, bool negative) {
double d;
mantissa &= ~(1ULL << 52);
mantissa |= real_exponent << 52;
mantissa |= (((uint64_t)negative) << 63);
memcpy(&d, &mantissa, sizeof(d));
return d;
}
}
// Attempts to compute i * 10^(power) exactly; and if "negative" is
// true, negate the result.
// This function will only work in some cases, when it does not work, success is
// set to false. This should work *most of the time* (like 99% of the time).
// We assume that power is in the [FASTFLOAT_SMALLEST_POWER,
// FASTFLOAT_LARGEST_POWER] interval: the caller is responsible for this check.
// We assume that power is in the [smallest_power,
// largest_power] interval: the caller is responsible for this check.
simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, double &d) {
// we start with a fast path
// It was described in
@ -61,9 +69,9 @@ simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool neg
// and s / p will produce correctly rounded values.
//
if (power < 0) {
d = d / power_of_ten[-power];
d = d / simdjson::internal::power_of_ten[-power];
} else {
d = d * power_of_ten[power];
d = d * simdjson::internal::power_of_ten[power];
}
if (negative) {
d = -d;
@ -97,16 +105,8 @@ simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool neg
return true;
}
// We are going to need to do some 64-bit arithmetic to get a more precise product.
// We use a table lookup approach.
// It is safe because
// power >= FASTFLOAT_SMALLEST_POWER
// and power <= FASTFLOAT_LARGEST_POWER
// We recover the mantissa of the power, it has a leading 1. It is always
// rounded down.
uint64_t factor_mantissa = mantissa_64[power - FASTFLOAT_SMALLEST_POWER];
// The exponent is 1024 + 63 + power
// The exponent is 1024 + 63 + power
// + floor(log(5**power)/log(2)).
// The 1024 comes from the ieee64 standard.
// The 63 comes from the fact that we use a 64-bit word.
@ -119,61 +119,89 @@ simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool neg
// is equal to
// floor(log(5**power)/log(2)) + power
//
// The 65536 is (1<<16) and corresponds to
// The 65536 is (1<<16) and corresponds to
// (65536 * power) >> 16 ---> power
//
// ((152170 * power ) >> 16) is equal to
// floor(log(5**power)/log(2))
// ((152170 * power ) >> 16) is equal to
// floor(log(5**power)/log(2))
//
// Note that this is not magic: 152170/(1<<16) is
// Note that this is not magic: 152170/(1<<16) is
// approximatively equal to log(5)/log(2).
// The 1<<16 value is a power of two; we could use a
// The 1<<16 value is a power of two; we could use a
// larger power of 2 if we wanted to.
//
int64_t exponent = (((152170 + 65536) * power) >> 16) + 1024 + 63;
// We want the most significant bit of i to be 1. Shift if needed.
int lz = leading_zeroes(i);
i <<= lz;
// We are going to need to do some 64-bit arithmetic to get a precise product.
// We use a table lookup approach.
// It is safe because
// power >= smallest_power
// and power <= largest_power
// We recover the mantissa of the power, it has a leading 1. It is always
// rounded down.
//
// We want the most significant 64 bits of the product. We know
// this will be non-zero because the most significant bit of i is
// 1.
value128 product = jsoncharutils::full_multiplication(i, factor_mantissa);
uint64_t lower = product.low;
uint64_t upper = product.high;
const uint32_t index = 2 * uint32_t(power - simdjson::internal::smallest_power);
// Optimization: It may be that materializing the index as a variable might confuse some compilers and prevent effective complex-addressing loads. (Done for code clarity.)
//
// The full_multiplication function computes the 128-bit product of two 64-bit words
// with a returned value of type value128 with a "low component" corresponding to the
// 64-bit least significant bits of the product and with a "high component" corresponding
// to the 64-bit most significant bits of the product.
simdjson::internal::value128 firstproduct = jsoncharutils::full_multiplication(i, simdjson::internal::power_of_five_128[index]);
// Both i and power_of_five_128[index] have their most significant bit set to 1 which
// implies that either the most or the second most significant bit of the product
// is 1. We pack values in this manner for efficiency reasons: it maximizes the use
// we make of the product. It also makes it easy to reason about the product: there
// is either 0 or 1 leading zero in the product.
// We know that upper has at most one leading zero because
// both i and factor_mantissa have a leading one. This means
// that the result is at least as large as ((1<<63)*(1<<63))/(1<<64).
// As long as the first 9 bits of "upper" are not "1", then we
// know that we have an exact computed value for the leading
// 55 bits because any imprecision would play out as a +1, in
// the worst case.
if (simdjson_unlikely((upper & 0x1FF) == 0x1FF) && (lower + i < lower)) {
uint64_t factor_mantissa_low =
mantissa_128[power - FASTFLOAT_SMALLEST_POWER];
// next, we compute the 64-bit x 128-bit multiplication, getting a 192-bit
// result (three 64-bit values)
product = jsoncharutils::full_multiplication(i, factor_mantissa_low);
uint64_t product_low = product.low;
uint64_t product_middle2 = product.high;
uint64_t product_middle1 = lower;
uint64_t product_high = upper;
uint64_t product_middle = product_middle1 + product_middle2;
if (product_middle < product_middle1) {
product_high++; // overflow carry
}
// We want to check whether mantissa *i + i would affect our result.
// This does happen, e.g. with 7.3177701707893310e+15.
if (((product_middle + 1 == 0) && ((product_high & 0x1FF) == 0x1FF) &&
(product_low + i < product_low))) { // let us be prudent and bail out.
// Unless the least significant 9 bits of the high (64-bit) part of the full
// product are all 1s, then we know that the most significant 55 bits are
// exact and no further work is needed. Having 55 bits is necessary because
// we need 53 bits for the mantissa but we have to have one rounding bit and
// we can waste a bit if the most significant bit of the product is zero.
if((firstproduct.high & 0x1FF) == 0x1FF) {
// We want to compute i * 5^q, but only care about the top 55 bits at most.
// Consider the scenario where q>=0. Then 5^q may not fit in 64-bits. Doing
// the full computation is wasteful. So we do what is called a "truncated
// multiplication".
// We take the most significant 64-bits, and we put them in
// power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
// to the desired approximation using one multiplication. Sometimes it does not suffice.
// Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
// then we get a better approximation to i * 5^q. In very rare cases, even that
// will not suffice, though it is seemingly very hard to find such a scenario.
//
// That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
// more complicated.
//
// There is an extra layer of complexity in that we need more than 55 bits of
// accuracy in the round-to-even scenario.
//
// The full_multiplication function computes the 128-bit product of two 64-bit words
// with a returned value of type value128 with a "low component" corresponding to the
// 64-bit least significant bits of the product and with a "high component" corresponding
// to the 64-bit most significant bits of the product.
simdjson::internal::value128 secondproduct = jsoncharutils::full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
firstproduct.low += secondproduct.high;
if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
// At this point, we might need to add at most one to firstproduct, but this
// can only change the value of firstproduct.high if firstproduct.low is maximal.
if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) {
// This is very unlikely, but if so, we need to do much more work!
return false;
}
upper = product_high;
lower = product_middle;
}
uint64_t lower = firstproduct.low;
uint64_t upper = firstproduct.high;
// The final mantissa should be 53 bits with a leading 1.
// We shift it so that it occupies 54 bits with a leading 1.
///////
@ -182,32 +210,56 @@ simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool neg
lz += int(1 ^ upperbit);
// Here we have mantissa < (1<<54).
int64_t real_exponent = exponent - lz;
if (simdjson_unlikely(real_exponent <= 0)) { // we have a subnormal?
// Here we have that real_exponent <= 0 so -real_exponent >= 0
if(-real_exponent + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure.
d = 0.0;
return true;
}
// next line is safe because 0 < -real_exponent + 1 < 64
mantissa >>= -real_exponent + 1;
// Thankfully, we can't have both "round-to-even" and subnormals because
// "round-to-even" only occurs for powers close to 0.
mantissa += (mantissa & 1); // round up
mantissa >>= 1;
// There is a weird scenario where we don't have a subnormal but just.
// Suppose we start with 2.2250738585072013e-308, we end up
// with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal
// whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round
// up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer
// subnormal, but we can only know this after rounding.
// So we only declare a subnormal if we are smaller than the threshold.
real_exponent = (mantissa < (uint64_t(1) << 52)) ? 0 : 1;
d = to_double(mantissa, real_exponent, negative);
return true;
}
// We have to round to even. The "to even" part
// is only a problem when we are right in between two floats
// which we guard against.
// If we have lots of trailing zeros, we may fall right between two
// floating-point values.
if (simdjson_unlikely((lower == 0) && ((upper & 0x1FF) == 0) &&
((mantissa & 3) == 1))) {
// if mantissa & 1 == 1 we might need to round up.
//
// Scenarios:
// 1. We are not in the middle. Then we should round up.
//
// 2. We are right in the middle. Whether we round up depends
// on the last significant bit: if it is "one" then we round
// up (round to even) otherwise, we do not.
//
// So if the last significant bit is 1, we can safely round up.
// Hence we only need to bail out if (mantissa & 3) == 1.
// Otherwise we may need more accuracy or analysis to determine whether
// we are exactly between two floating-point numbers.
// It can be triggered with 1e23.
// Note: because the factor_mantissa and factor_mantissa_low are
// almost always rounded down (except for small positive powers),
// almost always should round up.
return false;
//
// The round-to-even cases take the form of a number 2m+1 which is in (2^53,2^54]
// times a power of two. That is, it is right between a number with binary significand
// m and another number with binary significand m+1; and it must be the case
// that it cannot be represented by a float itself.
//
// We must have that w * 10 ^q == (2m+1) * 2^p for some power of two 2^p.
// Recall that 10^q = 5^q * 2^q.
// When q >= 0, we must have that (2m+1) is divisible by 5^q, so 5^q <= 2^54. We have that
// 5^23 <= 2^54 and it is the last power of five to qualify, so q <= 23.
// When q<0, we have w >= (2m+1) x 5^{-q}. We must have that w<2^{64} so
// (2m+1) x 5^{-q} < 2^{64}. We have that 2m+1>2^{53}. Hence, we must have
// 2^{53} x 5^{-q} < 2^{64}.
// Hence we have 5^{-q} < 2^{11} or q>= -4.
//
// We require lower <= 1 and not lower == 0 because we could not prove
// that lower == 0 is implied; but we could prove that lower <= 1 is a necessary and sufficient test.
if (simdjson_unlikely((lower <= 1) && (power >= -4) && (power <= 23) && ((mantissa & 3) == 1))) {
if((mantissa << (upperbit + 64 - 53 - 2)) == upper) {
mantissa &= ~1; // flip it so that we do not round up
}
}
mantissa += mantissa & 1;
@ -219,53 +271,29 @@ simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool neg
// This will happen when parsing values such as 7.2057594037927933e+16
////////
mantissa = (1ULL << 52);
lz--; // undo previous addition
real_exponent++;
}
mantissa &= ~(1ULL << 52);
uint64_t real_exponent = exponent - lz;
// we have to check that real_exponent is in range, otherwise we bail out
if (simdjson_unlikely((real_exponent < 1) || (real_exponent > 2046))) {
if (simdjson_unlikely(real_exponent > 2046)) {
// We have an infinite value!!! We could actually throw an error here if we could.
return false;
}
mantissa |= real_exponent << 52;
mantissa |= (((uint64_t)negative) << 63);
memcpy(&d, &mantissa, sizeof(d));
d = to_double(mantissa, real_exponent, negative);
return true;
}
static bool parse_float_strtod(const uint8_t *ptr, double *outDouble) {
char *endptr;
// We want to call strtod with the C (default) locale to avoid
// potential issues in case someone has a different locale.
// Unfortunately, Visual Studio has a different syntax.
#ifdef _WIN32
static _locale_t c_locale = _create_locale(LC_ALL, "C");
*outDouble = _strtod_l((const char *)ptr, &endptr, c_locale);
#else
static locale_t c_locale = newlocale(LC_ALL_MASK, "C", NULL);
*outDouble = strtod_l((const char *)ptr, &endptr, c_locale);
#endif
// Some libraries will set errno = ERANGE when the value is subnormal,
// yet we may want to be able to parse subnormal values.
// However, we do not want to tolerate NAN or infinite values.
//
// Values like infinity or NaN are not allowed in the JSON specification.
// If you consume a large value and you map it to "infinity", you will no
// longer be able to serialize back a standard-compliant JSON. And there is
// no realistic application where you might need values so large than they
// can't fit in binary64. The maximal value is about 1.7976931348623157 x
// 10^308 It is an unimaginable large number. There will never be any piece of
// engineering involving as many as 10^308 parts. It is estimated that there
// are about 10^80 atoms in the universe. The estimate for the total number
// of electrons is similar. Using a double-precision floating-point value, we
// can represent easily the number of atoms in the universe. We could also
// represent the number of ways you can pick any three individual atoms at
// random in the universe. If you ever encounter a number much larger than
// 10^308, you know that you have a bug. RapidJSON will reject a document with
// a float that does not fit in binary64. JSON for Modern C++ (nlohmann/json)
// will flat out throw an exception.
//
if ((endptr == (const char *)ptr) || (!std::isfinite(*outDouble))) {
// We call a fallback floating-point parser that might be slow. Note
// it will accept JSON numbers, but the JSON spec. is more restrictive so
// before you call parse_float_fallback, you need to have validated the input
// string with the JSON grammar.
// It will return an error (false) if the parsed number is infinite.
// The string parsing itself always succeeds. We know that there is at least
// one digit.
static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) {
*outDouble = simdjson::internal::from_chars((const char *)ptr);
// We do not accept infinite values.
if (!std::isfinite(*outDouble)) {
return false;
}
return true;
@ -292,7 +320,7 @@ simdjson_really_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) {
template<typename W>
error_code slow_float_parsing(SIMDJSON_UNUSED const uint8_t * src, W writer) {
double d;
if (parse_float_strtod(src, &d)) {
if (parse_float_fallback(src, &d)) {
writer.append_double(d);
return SUCCESS;
}
@ -346,14 +374,14 @@ simdjson_really_inline error_code parse_exponent(SIMDJSON_UNUSED const uint8_t *
auto start_exp = p;
int64_t exp_number = 0;
while (parse_digit(*p, exp_number)) { ++p; }
// It is possible for parse_digit to overflow.
// It is possible for parse_digit to overflow.
// In particular, it could overflow to INT64_MIN, and we cannot do - INT64_MIN.
// Thus we *must* check for possible overflow before we negate exp_number.
// Performance notes: it may seem like combining the two "simdjson_unlikely checks" below into
// a single simdjson_unlikely path would be faster. The reasoning is sound, but the compiler may
// not oblige and may, in fact, generate two distinct paths in any case. It might be
// possible to do uint64_t(p - start_exp - 1) >= 18 but it could end up trading off
// possible to do uint64_t(p - start_exp - 1) >= 18 but it could end up trading off
// instructions for a simdjson_likely branch, an unconclusive gain.
// If there were no digits, it's an error.
@ -363,7 +391,7 @@ simdjson_really_inline error_code parse_exponent(SIMDJSON_UNUSED const uint8_t *
// We have a valid positive exponent in exp_number at this point, except that
// it may have overflowed.
// If there were more than 18 digits, we may have overflowed the integer. We have to do
// If there were more than 18 digits, we may have overflowed the integer. We have to do
// something!!!!
if (simdjson_unlikely(p > start_exp+18)) {
// Skip leading zeroes: 1e000000000000000000001 is technically valid and doesn't overflow
@ -375,12 +403,12 @@ simdjson_really_inline error_code parse_exponent(SIMDJSON_UNUSED const uint8_t *
// Note that 999999999999999999 is assuredly too large. The maximal ieee64 value before
// infinity is ~1.8e308. The smallest subnormal is ~5e-324. So, actually, we could
// truncate at 324.
// Note that there is no reason to fail per se at this point in time.
// Note that there is no reason to fail per se at this point in time.
// E.g., 0e999999999999999999999 is a fine number.
if (p > start_exp+18) { exp_number = 999999999999999999; }
}
// At this point, we know that exp_number is a sane, positive, signed integer.
// It is <= 999,999,999,999,999,999. As long as 'exponent' is in
// It is <= 999,999,999,999,999,999. As long as 'exponent' is in
// [-8223372036854775808, 8223372036854775808], we won't overflow. Because 'exponent'
// is bounded in magnitude by the size of the JSON input, we are fine in this universe.
// To sum it up: the next line should never overflow.
@ -404,10 +432,11 @@ simdjson_really_inline error_code write_float(const uint8_t *const src, bool neg
// If we frequently had to deal with long strings of digits,
// we could extend our code by using a 128-bit integer instead
// of a 64-bit integer. However, this is uncommon in practice.
// digit count is off by 1 because of the decimal (assuming there was one).
//
// 9999999999999999999 < 2**64 so we can accomodate 19 digits.
if (simdjson_unlikely(digit_count-1 > 19 && significant_digits(start_digits, digit_count) > 19)) {
// If we have a decimal separator, then digit_count - 1 is the number of digits, but we
// may not have a decimal separator!
if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) {
// Ok, chances are good that we had an overflow!
// this is almost never going to get called!!!
// we start anew, going slowly!!!
@ -427,22 +456,25 @@ simdjson_really_inline error_code write_float(const uint8_t *const src, bool neg
// NOTE: it's weird that the simdjson_unlikely() only wraps half the if, but it seems to get slower any other
// way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331
// To future reader: we'd love if someone found a better way, or at least could explain this result!
if (simdjson_unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || (exponent > FASTFLOAT_LARGEST_POWER)) {
// this is almost never going to get called!!!
// we start anew, going slowly!!!
// NOTE: This makes a *copy* of the writer and passes it to slow_float_parsing. This happens
// because slow_float_parsing is a non-inlined function. If we passed our writer reference to
// it, it would force it to be stored in memory, preventing the compiler from picking it apart
// and putting into registers. i.e. if we pass it as reference, it gets slow.
// This is what forces the skip_double, as well.
error_code error = slow_float_parsing(src, writer);
writer.skip_double();
return error;
if (simdjson_unlikely(exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power)) {
//
// Important: smallest_power is such that it leads to a zero value.
// Observe that 18446744073709551615e-343 == 0, i.e. (2**64 - 1) e -343 is zero
// so something x 10^-343 goes to zero, but not so with something x 10^-342.
static_assert(simdjson::internal::smallest_power <= -342, "smallest_power is not small enough");
//
if((exponent < simdjson::internal::smallest_power) || (i == 0)) {
WRITE_DOUBLE(0, src, writer);
return SUCCESS;
} else { // (exponent > largest_power) and (i != 0)
// We have, for sure, an infinite value and simdjson refuses to parse infinite values.
return INVALID_NUMBER(src);
}
}
double d;
if (!compute_float_64(exponent, i, negative, d)) {
// we are almost never going to get here.
if (!parse_float_strtod(src, &d)) { return INVALID_NUMBER(src); }
if (!parse_float_fallback(src, &d)) { return INVALID_NUMBER(src); }
}
WRITE_DOUBLE(d, src, writer);
return SUCCESS;
@ -757,7 +789,7 @@ SIMDJSON_UNUSED simdjson_really_inline simdjson_result<double> parse_double(cons
if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; }
exponent += exp_neg ? 0-exp : exp;
overflow = overflow || exponent < FASTFLOAT_SMALLEST_POWER || exponent > FASTFLOAT_LARGEST_POWER;
overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;
}
if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; }
@ -769,7 +801,7 @@ SIMDJSON_UNUSED simdjson_really_inline simdjson_result<double> parse_double(cons
if (simdjson_likely(!overflow)) {
if (compute_float_64(exponent, i, negative, d)) { return d; }
}
if (!parse_float_strtod(src-negative, &d)) {
if (!parse_float_fallback(src-negative, &d)) {
return NUMBER_ERROR;
}
return d;

View File

@ -11,15 +11,6 @@ void found_bad_string(const uint8_t *buf);
namespace simdjson {
namespace internal {
constexpr int FASTFLOAT_SMALLEST_POWER = -325;
constexpr int FASTFLOAT_LARGEST_POWER = 308;
struct value128 {
uint64_t low;
uint64_t high;
};
// structural chars here are
// they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c (and NULL)
// we are also interested in the four whitespace characters

View File

@ -5,22 +5,54 @@
namespace simdjson {
namespace internal {
/**
* The smallest non-zero float (binary64) is 2^-1074.
* We take as input numbers of the form w x 10^q where w < 2^64.
* We have that w * 10^-343 < 2^(64-344) 5^-343 < 2^-1076.
* However, we have that
* (2^64-1) * 10^-342 = (2^64-1) * 2^-342 * 5^-342 > 2^-1074.
* Thus it is possible for a number of the form w * 10^-342 where
* w is a 64-bit value to be a non-zero floating-point number.
*********
* Any number of form w * 10^309 where w>= 1 is going to be
* infinite in binary64 so we never need to worry about powers
* of 5 greater than 308.
*/
constexpr int smallest_power = -342;
constexpr int largest_power = 308;
/**
* Represents a 128-bit value.
* low: least significant 64 bits.
* high: most significant 64 bits.
*/
struct value128 {
uint64_t low;
uint64_t high;
};
// Precomputed powers of ten from 10^0 to 10^22. These
// can be represented exactly using the double type.
extern SIMDJSON_DLLIMPORTEXPORT const double power_of_ten[];
// The mantissas of powers of ten from -308 to 308, extended out to sixty four
// bits. The array contains the powers of ten approximated
// as a 64-bit mantissa. It goes from 10^FASTFLOAT_SMALLEST_POWER to
// 10^FASTFLOAT_LARGEST_POWER (inclusively).
// The mantissa is truncated, and
// never rounded up. Uses about 5KB.
extern SIMDJSON_DLLIMPORTEXPORT const uint64_t mantissa_64[];
// A complement to mantissa_64
// complete to a 128-bit mantissa.
// Uses about 5KB but is rarely accessed.
extern SIMDJSON_DLLIMPORTEXPORT const uint64_t mantissa_128[];
/**
* When mapping numbers from decimal to binary,
* we go from w * 10^q to m * 2^p but we have
* 10^q = 5^q * 2^q, so effectively
* we are trying to match
* w * 2^q * 5^q to m * 2^p. Thus the powers of two
* are not a concern since they can be represented
* exactly using the binary notation, only the powers of five
* affect the binary significand.
*/
// The truncated powers of five from 5^-342 all the way to 5^308.
// Each power is stored as a 128-bit mantissa (two consecutive 64-bit words,
// most significant word first), truncated and never rounded up. Uses about 10KB.
extern SIMDJSON_DLLIMPORTEXPORT const uint64_t power_of_five_128[];
} // namespace internal
} // namespace simdjson

486
src/from_chars.cpp Normal file
View File

@ -0,0 +1,486 @@
#include <cmath>
#include <limits>
namespace simdjson {
namespace internal {
/**
* The code in the internal::from_chars function is meant to handle the floating-point number parsing
* when we have more than 19 digits in the decimal mantissa. This should only be seen
* in adversarial scenarios: we do not expect production systems to even produce
* such floating-point numbers.
*
* The parser is based on work by Nigel Tao (at https://github.com/google/wuffs/)
* who credits Ken Thompson for the design (via a reference to the Go source
* code). See
* https://github.com/google/wuffs/blob/aa46859ea40c72516deffa1b146121952d6dfd3b/internal/cgen/base/floatconv-submodule-data.c
* https://github.com/google/wuffs/blob/46cd8105f47ca07ae2ba8e6a7818ef9c0df6c152/internal/cgen/base/floatconv-submodule-code.c
* It is probably not very fast but it is a fallback that should almost never be
* called in real life. Google Wuffs is published under the Apache License 2.0.
**/
namespace {
// Capacity of the decimal::digits buffer; the parser stops storing digits
// (and flags the value as truncated) once this limit is approached.
constexpr uint32_t max_digits{768};
// NOTE(review): presumably the largest decimal-point magnitude the slow path
// is willing to handle; it is not referenced in this part of the file - confirm.
constexpr int32_t decimal_point_range{2047};
} // namespace
// A (mantissa, power-of-two) pair that starts out as all zeros.
// NOTE(review): presumably the binary significand/exponent handed back by the
// slow path before it is packed into a double - confirm against the users.
struct adjusted_mantissa {
  uint64_t mantissa{0};
  int power2{0};
  // Kept user-provided so the type stays a non-aggregate, as before.
  adjusted_mantissa() {}
};
// Decimal representation of a number as produced by parse_decimal: a sign, a
// buffer of digit values and the position of the decimal point.
struct decimal {
// Number of entries of `digits` that are in use.
uint32_t num_digits;
// Position of the decimal point relative to the first stored digit:
// parse_decimal yields 0.d0d1d2... x 10^decimal_point for inputs whose
// digits all fit in the buffer.
int32_t decimal_point;
// True when the input started with '-'.
bool negative;
// True when some input digits were dropped because the buffer was full.
bool truncated;
// Digit values (0-9, not ASCII characters), most significant first.
uint8_t digits[max_digits];
};
// Compile-time description of a binary floating-point layout. Only the
// declarations live in the primary template; each supported type supplies the
// values through explicit specializations of the member functions.
template <typename T> struct binary_format {
  static constexpr int mantissa_explicit_bits();
  static constexpr int minimum_exponent();
  static constexpr int infinite_power();
  static constexpr int sign_index();
};

// binary64 stores 52 mantissa bits explicitly.
template <>
constexpr int binary_format<double>::mantissa_explicit_bits() {
  return 52;
}

// NOTE(review): presumably the negated exponent bias of binary64 - confirm
// against the code that consumes it.
template <>
constexpr int binary_format<double>::minimum_exponent() {
  return -1023;
}

// 0x7FF: the all-ones 11-bit exponent field.
template <>
constexpr int binary_format<double>::infinite_power() {
  return 2047;
}

// Bit index of the sign bit within the 64-bit word.
template <>
constexpr int binary_format<double>::sign_index() {
  return 63;
}
// Returns true exactly when c is one of the ASCII digits '0'..'9'.
bool is_integer(char c) noexcept {
  // A single unsigned comparison: anything below '0' wraps around to a large
  // value, anything above '9' is greater than 9.
  return static_cast<unsigned>(c - '0') <= 9u;
}
// Converts the textual number starting at p into a `decimal`: a sign, a buffer
// of digit values and a decimal-point position such that the value reads
// 0.d0d1d2... x 10^decimal_point.
//
// p is advanced past everything that was consumed (sign, digits, optional
// fraction, optional exponent).
//
// This should always succeed since it follows a call to parse_number.
//
// At most max_digits - 1 digits are stored; further digits are dropped and
// `truncated` is set. The decimal point is computed from the number of
// significant digits *seen*, not the number stored, so that dropping digits
// does not change the magnitude of the parsed value.
decimal parse_decimal(const char *&p) noexcept {
  decimal answer;
  answer.num_digits = 0;
  answer.decimal_point = 0;
  answer.truncated = false;
  answer.negative = (*p == '-');
  if ((*p == '-') || (*p == '+')) {
    ++p;
  }

  // Leading zeros of the integer part carry no information.
  while (*p == '0') {
    ++p;
  }

  // Every significant digit encountered, whether it was stored or dropped.
  uint32_t total_digits = 0;
  // Stores the digit if there is room, otherwise records the truncation.
  auto consume_digit = [&answer, &total_digits](char c) {
    if (answer.num_digits + 1 < max_digits) {
      answer.digits[answer.num_digits++] = uint8_t(c - '0');
    } else {
      answer.truncated = true;
    }
    ++total_digits;
  };

  while (is_integer(*p)) {
    consume_digit(*p);
    ++p;
  }

  if (*p == '.') {
    ++p;
    const char *const first_after_period = p;
    // If no significant digit has been seen yet, zeros right after the period
    // are still leading zeros: skip them (they only move the decimal point).
    if (total_digits == 0) {
      while (*p == '0') {
        ++p;
      }
    }
    while (is_integer(*p)) {
      consume_digit(*p);
      ++p;
    }
    // Minus the number of characters in the fractional part.
    answer.decimal_point = int32_t(first_after_period - p);
  }

  if (('e' == *p) || ('E' == *p)) {
    ++p;
    bool neg_exp = false;
    if ('-' == *p) {
      neg_exp = true;
      ++p;
    } else if ('+' == *p) {
      ++p;
    }
    int32_t exp_number = 0; // exponential part
    while (is_integer(*p)) {
      uint8_t digit = uint8_t(*p - '0');
      // Stop accumulating once the exponent is large enough; this keeps
      // exp_number from overflowing on absurdly long exponents.
      if (exp_number < 0x10000) {
        exp_number = 10 * exp_number + digit;
      }
      ++p;
    }
    answer.decimal_point += (neg_exp ? -exp_number : exp_number);
  }

  // Use the count of digits seen (not stored): otherwise every dropped digit
  // would shift the decimal point by one position.
  answer.decimal_point += total_digits;
  return answer;
}
namespace {
// Drops the trailing zero digits of h by shrinking its digit count; the digit
// buffer itself is left untouched.
inline void trim(decimal &h) {
  uint32_t remaining = h.num_digits;
  while (remaining != 0 && h.digits[remaining - 1] == 0) {
    --remaining;
  }
  h.num_digits = remaining;
}
// Returns how many digits decimal_left_shift must add in front of `h` when
// shifting it left by `shift` bits. `h` is only read. Only the low 6 bits of
// `shift` are used.
//
// The answer is one of two consecutive values; which one depends on how the
// leading digits of `h` compare with the digits stored for this shift in the
// powers-of-5 table.
uint32_t number_of_digits_decimal_left_shift(decimal &h, uint32_t shift) {
shift &= 63;
// Each entry packs two fields: the upper bits (x >> 11) hold the candidate
// number of new digits, the lower 11 bits hold an offset into the powers-of-5
// digit table. The digits for a given shift span the offsets of entries
// `shift` and `shift + 1`.
const static uint16_t number_of_digits_decimal_left_shift_table[65] = {
0x0000, 0x0800, 0x0801, 0x0803, 0x1006, 0x1009, 0x100D, 0x1812, 0x1817,
0x181D, 0x2024, 0x202B, 0x2033, 0x203C, 0x2846, 0x2850, 0x285B, 0x3067,
0x3073, 0x3080, 0x388E, 0x389C, 0x38AB, 0x38BB, 0x40CC, 0x40DD, 0x40EF,
0x4902, 0x4915, 0x4929, 0x513E, 0x5153, 0x5169, 0x5180, 0x5998, 0x59B0,
0x59C9, 0x61E3, 0x61FD, 0x6218, 0x6A34, 0x6A50, 0x6A6D, 0x6A8B, 0x72AA,
0x72C9, 0x72E9, 0x7B0A, 0x7B2B, 0x7B4D, 0x8370, 0x8393, 0x83B7, 0x83DC,
0x8C02, 0x8C28, 0x8C4F, 0x9477, 0x949F, 0x94C8, 0x9CF2, 0x051C, 0x051C,
0x051C, 0x051C,
};
uint32_t x_a = number_of_digits_decimal_left_shift_table[shift];
uint32_t x_b = number_of_digits_decimal_left_shift_table[shift + 1];
uint32_t num_new_digits = x_a >> 11;
uint32_t pow5_a = 0x7FF & x_a;
uint32_t pow5_b = 0x7FF & x_b;
// Decimal digits of the successive powers of 5 (5, 25, 125, 625, ...),
// concatenated, one digit per byte.
const static uint8_t
number_of_digits_decimal_left_shift_table_powers_of_5[0x051C] = {
5, 2, 5, 1, 2, 5, 6, 2, 5, 3, 1, 2, 5, 1, 5, 6, 2, 5, 7, 8, 1, 2, 5,
3, 9, 0, 6, 2, 5, 1, 9, 5, 3, 1, 2, 5, 9, 7, 6, 5, 6, 2, 5, 4, 8, 8,
2, 8, 1, 2, 5, 2, 4, 4, 1, 4, 0, 6, 2, 5, 1, 2, 2, 0, 7, 0, 3, 1, 2,
5, 6, 1, 0, 3, 5, 1, 5, 6, 2, 5, 3, 0, 5, 1, 7, 5, 7, 8, 1, 2, 5, 1,
5, 2, 5, 8, 7, 8, 9, 0, 6, 2, 5, 7, 6, 2, 9, 3, 9, 4, 5, 3, 1, 2, 5,
3, 8, 1, 4, 6, 9, 7, 2, 6, 5, 6, 2, 5, 1, 9, 0, 7, 3, 4, 8, 6, 3, 2,
8, 1, 2, 5, 9, 5, 3, 6, 7, 4, 3, 1, 6, 4, 0, 6, 2, 5, 4, 7, 6, 8, 3,
7, 1, 5, 8, 2, 0, 3, 1, 2, 5, 2, 3, 8, 4, 1, 8, 5, 7, 9, 1, 0, 1, 5,
6, 2, 5, 1, 1, 9, 2, 0, 9, 2, 8, 9, 5, 5, 0, 7, 8, 1, 2, 5, 5, 9, 6,
0, 4, 6, 4, 4, 7, 7, 5, 3, 9, 0, 6, 2, 5, 2, 9, 8, 0, 2, 3, 2, 2, 3,
8, 7, 6, 9, 5, 3, 1, 2, 5, 1, 4, 9, 0, 1, 1, 6, 1, 1, 9, 3, 8, 4, 7,
6, 5, 6, 2, 5, 7, 4, 5, 0, 5, 8, 0, 5, 9, 6, 9, 2, 3, 8, 2, 8, 1, 2,
5, 3, 7, 2, 5, 2, 9, 0, 2, 9, 8, 4, 6, 1, 9, 1, 4, 0, 6, 2, 5, 1, 8,
6, 2, 6, 4, 5, 1, 4, 9, 2, 3, 0, 9, 5, 7, 0, 3, 1, 2, 5, 9, 3, 1, 3,
2, 2, 5, 7, 4, 6, 1, 5, 4, 7, 8, 5, 1, 5, 6, 2, 5, 4, 6, 5, 6, 6, 1,
2, 8, 7, 3, 0, 7, 7, 3, 9, 2, 5, 7, 8, 1, 2, 5, 2, 3, 2, 8, 3, 0, 6,
4, 3, 6, 5, 3, 8, 6, 9, 6, 2, 8, 9, 0, 6, 2, 5, 1, 1, 6, 4, 1, 5, 3,
2, 1, 8, 2, 6, 9, 3, 4, 8, 1, 4, 4, 5, 3, 1, 2, 5, 5, 8, 2, 0, 7, 6,
6, 0, 9, 1, 3, 4, 6, 7, 4, 0, 7, 2, 2, 6, 5, 6, 2, 5, 2, 9, 1, 0, 3,
8, 3, 0, 4, 5, 6, 7, 3, 3, 7, 0, 3, 6, 1, 3, 2, 8, 1, 2, 5, 1, 4, 5,
5, 1, 9, 1, 5, 2, 2, 8, 3, 6, 6, 8, 5, 1, 8, 0, 6, 6, 4, 0, 6, 2, 5,
7, 2, 7, 5, 9, 5, 7, 6, 1, 4, 1, 8, 3, 4, 2, 5, 9, 0, 3, 3, 2, 0, 3,
1, 2, 5, 3, 6, 3, 7, 9, 7, 8, 8, 0, 7, 0, 9, 1, 7, 1, 2, 9, 5, 1, 6,
6, 0, 1, 5, 6, 2, 5, 1, 8, 1, 8, 9, 8, 9, 4, 0, 3, 5, 4, 5, 8, 5, 6,
4, 7, 5, 8, 3, 0, 0, 7, 8, 1, 2, 5, 9, 0, 9, 4, 9, 4, 7, 0, 1, 7, 7,
2, 9, 2, 8, 2, 3, 7, 9, 1, 5, 0, 3, 9, 0, 6, 2, 5, 4, 5, 4, 7, 4, 7,
3, 5, 0, 8, 8, 6, 4, 6, 4, 1, 1, 8, 9, 5, 7, 5, 1, 9, 5, 3, 1, 2, 5,
2, 2, 7, 3, 7, 3, 6, 7, 5, 4, 4, 3, 2, 3, 2, 0, 5, 9, 4, 7, 8, 7, 5,
9, 7, 6, 5, 6, 2, 5, 1, 1, 3, 6, 8, 6, 8, 3, 7, 7, 2, 1, 6, 1, 6, 0,
2, 9, 7, 3, 9, 3, 7, 9, 8, 8, 2, 8, 1, 2, 5, 5, 6, 8, 4, 3, 4, 1, 8,
8, 6, 0, 8, 0, 8, 0, 1, 4, 8, 6, 9, 6, 8, 9, 9, 4, 1, 4, 0, 6, 2, 5,
2, 8, 4, 2, 1, 7, 0, 9, 4, 3, 0, 4, 0, 4, 0, 0, 7, 4, 3, 4, 8, 4, 4,
9, 7, 0, 7, 0, 3, 1, 2, 5, 1, 4, 2, 1, 0, 8, 5, 4, 7, 1, 5, 2, 0, 2,
0, 0, 3, 7, 1, 7, 4, 2, 2, 4, 8, 5, 3, 5, 1, 5, 6, 2, 5, 7, 1, 0, 5,
4, 2, 7, 3, 5, 7, 6, 0, 1, 0, 0, 1, 8, 5, 8, 7, 1, 1, 2, 4, 2, 6, 7,
5, 7, 8, 1, 2, 5, 3, 5, 5, 2, 7, 1, 3, 6, 7, 8, 8, 0, 0, 5, 0, 0, 9,
2, 9, 3, 5, 5, 6, 2, 1, 3, 3, 7, 8, 9, 0, 6, 2, 5, 1, 7, 7, 6, 3, 5,
6, 8, 3, 9, 4, 0, 0, 2, 5, 0, 4, 6, 4, 6, 7, 7, 8, 1, 0, 6, 6, 8, 9,
4, 5, 3, 1, 2, 5, 8, 8, 8, 1, 7, 8, 4, 1, 9, 7, 0, 0, 1, 2, 5, 2, 3,
2, 3, 3, 8, 9, 0, 5, 3, 3, 4, 4, 7, 2, 6, 5, 6, 2, 5, 4, 4, 4, 0, 8,
9, 2, 0, 9, 8, 5, 0, 0, 6, 2, 6, 1, 6, 1, 6, 9, 4, 5, 2, 6, 6, 7, 2,
3, 6, 3, 2, 8, 1, 2, 5, 2, 2, 2, 0, 4, 4, 6, 0, 4, 9, 2, 5, 0, 3, 1,
3, 0, 8, 0, 8, 4, 7, 2, 6, 3, 3, 3, 6, 1, 8, 1, 6, 4, 0, 6, 2, 5, 1,
1, 1, 0, 2, 2, 3, 0, 2, 4, 6, 2, 5, 1, 5, 6, 5, 4, 0, 4, 2, 3, 6, 3,
1, 6, 6, 8, 0, 9, 0, 8, 2, 0, 3, 1, 2, 5, 5, 5, 5, 1, 1, 1, 5, 1, 2,
3, 1, 2, 5, 7, 8, 2, 7, 0, 2, 1, 1, 8, 1, 5, 8, 3, 4, 0, 4, 5, 4, 1,
0, 1, 5, 6, 2, 5, 2, 7, 7, 5, 5, 5, 7, 5, 6, 1, 5, 6, 2, 8, 9, 1, 3,
5, 1, 0, 5, 9, 0, 7, 9, 1, 7, 0, 2, 2, 7, 0, 5, 0, 7, 8, 1, 2, 5, 1,
3, 8, 7, 7, 7, 8, 7, 8, 0, 7, 8, 1, 4, 4, 5, 6, 7, 5, 5, 2, 9, 5, 3,
9, 5, 8, 5, 1, 1, 3, 5, 2, 5, 3, 9, 0, 6, 2, 5, 6, 9, 3, 8, 8, 9, 3,
9, 0, 3, 9, 0, 7, 2, 2, 8, 3, 7, 7, 6, 4, 7, 6, 9, 7, 9, 2, 5, 5, 6,
7, 6, 2, 6, 9, 5, 3, 1, 2, 5, 3, 4, 6, 9, 4, 4, 6, 9, 5, 1, 9, 5, 3,
6, 1, 4, 1, 8, 8, 8, 2, 3, 8, 4, 8, 9, 6, 2, 7, 8, 3, 8, 1, 3, 4, 7,
6, 5, 6, 2, 5, 1, 7, 3, 4, 7, 2, 3, 4, 7, 5, 9, 7, 6, 8, 0, 7, 0, 9,
4, 4, 1, 1, 9, 2, 4, 4, 8, 1, 3, 9, 1, 9, 0, 6, 7, 3, 8, 2, 8, 1, 2,
5, 8, 6, 7, 3, 6, 1, 7, 3, 7, 9, 8, 8, 4, 0, 3, 5, 4, 7, 2, 0, 5, 9,
6, 2, 2, 4, 0, 6, 9, 5, 9, 5, 3, 3, 6, 9, 1, 4, 0, 6, 2, 5,
};
const uint8_t *pow5 =
&number_of_digits_decimal_left_shift_table_powers_of_5[pow5_a];
uint32_t i = 0;
uint32_t n = pow5_b - pow5_a;
// Compare the digits of h with the table digits, most significant first.
for (; i < n; i++) {
if (i >= h.num_digits) {
// h ran out of digits before a difference was found.
return num_new_digits - 1;
} else if (h.digits[i] == pow5[i]) {
continue;
} else if (h.digits[i] < pow5[i]) {
return num_new_digits - 1;
} else {
return num_new_digits;
}
}
return num_new_digits;
}
} // end of anonymous namespace
// Converts `h` to an unsigned integer, rounding to the closest value.
//
// Returns 0 when there are no digits or the decimal point is negative, and
// UINT64_MAX when the decimal point is beyond 18. An exact half (a lone final
// digit 5 right after the integer part) rounds up when digits were truncated
// earlier, otherwise it rounds towards the even neighbour.
uint64_t round(decimal &h) {
  if (h.num_digits == 0) {
    return 0;
  }
  if (h.decimal_point < 0) {
    return 0;
  }
  if (h.decimal_point > 18) {
    return UINT64_MAX;
  }
  // From here on the decimal point is within [0, 18].
  const uint32_t integer_digits = uint32_t(h.decimal_point);
  uint64_t value = 0;
  for (uint32_t pos = 0; pos < integer_digits; ++pos) {
    value *= 10;
    if (pos < h.num_digits) {
      value += h.digits[pos]; // positions past the stored digits count as 0
    }
  }
  if (integer_digits >= h.num_digits) {
    // No fractional digit: nothing to round.
    return value;
  }
  const bool exactly_half = (h.digits[integer_digits] == 5) &&
                            (integer_digits + 1 == h.num_digits);
  bool bump;
  if (exactly_half) {
    // Tie: truncated digits push us above one half, else round to even.
    bump = h.truncated ||
           ((integer_digits > 0) && ((h.digits[integer_digits - 1] & 1) != 0));
  } else {
    bump = h.digits[integer_digits] >= 5;
  }
  return bump ? value + 1 : value;
}
// Computes h * 2^shift in place (each digit is multiplied by 2^shift and the
// carries are propagated). Does nothing when h has no digits.
void decimal_left_shift(decimal &h, uint32_t shift) {
if (h.num_digits == 0) {
return;
}
// Number of digits the product gains in front of the existing ones.
uint32_t num_new_digits = number_of_digits_decimal_left_shift(h, shift);
// Walk from the least significant digit to the most significant one, writing
// the result num_new_digits positions further right so nothing is overwritten
// before it has been read.
int32_t read_index = int32_t(h.num_digits - 1);
uint32_t write_index = h.num_digits - 1 + num_new_digits;
uint64_t n = 0; // running carry
while (read_index >= 0) {
n += uint64_t(h.digits[read_index]) << shift;
uint64_t quotient = n / 10;
uint64_t remainder = n - (10 * quotient);
if (write_index < max_digits) {
h.digits[write_index] = uint8_t(remainder);
} else if (remainder > 0) {
// The digit does not fit in the buffer: remember that precision was lost.
h.truncated = true;
}
n = quotient;
write_index--;
read_index--;
}
// Flush what is left of the carry into the new leading digits.
while (n > 0) {
uint64_t quotient = n / 10;
uint64_t remainder = n - (10 * quotient);
if (write_index < max_digits) {
h.digits[write_index] = uint8_t(remainder);
} else if (remainder > 0) {
h.truncated = true;
}
n = quotient;
write_index--;
}
h.num_digits += num_new_digits;
if (h.num_digits > max_digits) {
h.num_digits = max_digits;
}
// The new leading digits move the decimal point to the right.
h.decimal_point += int32_t(num_new_digits);
trim(h);
}
// Computes h * 2^-shift in place, i.e. divides h by 2^shift using a
// digit-by-digit long division.
void decimal_right_shift(decimal &h, uint32_t shift) {
uint32_t read_index = 0;
uint32_t write_index = 0;
uint64_t n = 0;
// Accumulate leading digits until the running value reaches 2^shift, that is
// until the division yields a non-zero quotient digit.
while ((n >> shift) == 0) {
if (read_index < h.num_digits) {
n = (10 * n) + h.digits[read_index++];
} else if (n == 0) {
// No digits, or only zeros: nothing to do.
return;
} else {
// Out of digits: keep going as if h were followed by zeros.
while ((n >> shift) == 0) {
n = 10 * n;
read_index++;
}
break;
}
}
// Every digit consumed beyond the first moves the decimal point to the left.
h.decimal_point -= int32_t(read_index - 1);
if (h.decimal_point < -decimal_point_range) { // it is zero
h.num_digits = 0;
h.decimal_point = 0;
h.negative = false;
h.truncated = false;
return;
}
// mask keeps the remainder of the division by 2^shift.
uint64_t mask = (uint64_t(1) << shift) - 1;
// Emit one quotient digit per remaining input digit.
while (read_index < h.num_digits) {
uint8_t new_digit = uint8_t(n >> shift);
n = (10 * (n & mask)) + h.digits[read_index++];
h.digits[write_index++] = new_digit;
}
// Input exhausted: keep dividing the remainder to produce the trailing digits.
while (n > 0) {
uint8_t new_digit = uint8_t(n >> shift);
n = 10 * (n & mask);
if (write_index < max_digits) {
h.digits[write_index++] = new_digit;
} else if (new_digit > 0) {
// The digit does not fit in the buffer: remember that precision was lost.
h.truncated = true;
}
}
h.num_digits = write_index;
trim(h);
}
// Converts the decimal `d` into the mantissa and biased exponent of a binary
// floating-point value whose layout is described by `binary`.
//
// `d` is used as scratch space and is modified. The returned
// adjusted_mantissa holds the explicit mantissa bits and the biased exponent;
// zero is returned as (0, 0) and infinity as (0, binary::infinite_power()).
//
// The method repeatedly multiplies or divides the decimal by powers of two
// until it lies in [1/2 ... 1], keeping track of the binary exponent, then
// scales it to the mantissa width and rounds it to an integer.
template <typename binary> adjusted_mantissa compute_float(decimal &d) {
  adjusted_mantissa answer;
  if (d.num_digits == 0) {
    // should be zero
    answer.power2 = 0;
    answer.mantissa = 0;
    return answer;
  }
  // At this point, going further, we can assume that d.num_digits > 0.
  // We want to guard against excessive decimal point values because
  // they can result in long running times. Indeed, we do
  // shifts by at most 60 bits. We have that log(10**400)/log(2**60) ~= 22
  // which is fine, but log(10**299995)/log(2**60) ~= 16609 which is not
  // fine (runs for a long time).
  //
  if (d.decimal_point < -324) {
    // We have something smaller than 1e-324 which is always zero
    // in binary64 and binary32.
    // It should be zero.
    answer.power2 = 0;
    answer.mantissa = 0;
    return answer;
  } else if (d.decimal_point >= 310) {
    // We have something at least as large as 0.1e310 which is
    // always infinite.
    answer.power2 = binary::infinite_power();
    answer.mantissa = 0;
    return answer;
  }
  static const uint32_t max_shift = 60;
  static const uint32_t num_powers = 19;
  // powers[n] is the binary shift applied when the decimal point is n
  // positions away from zero.
  static const uint8_t powers[19] = {
      0,  3,  6,  9,  13, 16, 19, 23, 26, 29, //
      33, 36, 39, 43, 46, 49, 53, 56, 59,     //
  };
  int32_t exp2 = 0;
  // The value is 1 or more: shift right until the decimal point is at or
  // before the first digit.
  while (d.decimal_point > 0) {
    uint32_t n = uint32_t(d.decimal_point);
    uint32_t shift = (n < num_powers) ? powers[n] : max_shift;
    decimal_right_shift(d, shift);
    if (d.decimal_point < -decimal_point_range) {
      // should be zero
      answer.power2 = 0;
      answer.mantissa = 0;
      return answer;
    }
    exp2 += int32_t(shift);
  }
  // We shift left toward [1/2 ... 1].
  while (d.decimal_point <= 0) {
    uint32_t shift;
    if (d.decimal_point == 0) {
      if (d.digits[0] >= 5) {
        // Already at least 0.5: we are done.
        break;
      }
      shift = (d.digits[0] < 2) ? 2 : 1;
    } else {
      uint32_t n = uint32_t(-d.decimal_point);
      shift = (n < num_powers) ? powers[n] : max_shift;
    }
    decimal_left_shift(d, shift);
    if (d.decimal_point > decimal_point_range) {
      // we want to get infinity: use the format's own infinite exponent
      // (a hard-coded 0xFF would be a finite value for wider formats).
      answer.power2 = binary::infinite_power();
      answer.mantissa = 0;
      return answer;
    }
    exp2 -= int32_t(shift);
  }
  // We are now in the range [1/2 ... 1] but the binary format uses [1 ... 2].
  exp2--;
  constexpr int32_t minimum_exponent = binary::minimum_exponent();
  // If the exponent is below the smallest one we can store, shift right until
  // it is not (this is the subnormal case: the mantissa loses leading bits).
  while ((minimum_exponent + 1) > exp2) {
    uint32_t n = uint32_t((minimum_exponent + 1) - exp2);
    if (n > max_shift) {
      n = max_shift;
    }
    decimal_right_shift(d, n);
    exp2 += int32_t(n);
  }
  if ((exp2 - minimum_exponent) >= binary::infinite_power()) {
    answer.power2 = binary::infinite_power();
    answer.mantissa = 0;
    return answer;
  }
  // Scale to the full mantissa width (explicit bits plus the leading bit)
  // and round to an integer.
  const int mantissa_size_in_bits = binary::mantissa_explicit_bits() + 1;
  decimal_left_shift(d, mantissa_size_in_bits);
  uint64_t mantissa = round(d);
  // It is possible that we have an overflow, in which case we need
  // to shift back.
  if (mantissa >= (uint64_t(1) << mantissa_size_in_bits)) {
    decimal_right_shift(d, 1);
    exp2 += 1;
    mantissa = round(d);
    if ((exp2 - minimum_exponent) >= binary::infinite_power()) {
      answer.power2 = binary::infinite_power();
      answer.mantissa = 0;
      return answer;
    }
  }
  answer.power2 = exp2 - binary::minimum_exponent();
  // When the leading bit is not set, the value is subnormal and is stored
  // with a biased exponent that is one lower.
  if (mantissa < (uint64_t(1) << binary::mantissa_explicit_bits())) {
    answer.power2--;
  }
  // Only the explicit bits are kept.
  answer.mantissa =
      mantissa & ((uint64_t(1) << binary::mantissa_explicit_bits()) - 1);
  return answer;
}
// Slow-path conversion: reads the number at `first` as a decimal, then turns
// that decimal into the mantissa/exponent pair of the `binary` format.
template <typename binary>
adjusted_mantissa parse_long_mantissa(const char *first) {
  const char *cursor = first;
  decimal parsed_digits = parse_decimal(cursor);
  return compute_float<binary>(parsed_digits);
}
double from_chars(const char *first) noexcept {
bool negative = first[0] == '-';
if (negative) {
first++;
}
adjusted_mantissa am = parse_long_mantissa<binary_format<double>>(first);
uint64_t word = am.mantissa;
word |= uint64_t(am.power2)
<< binary_format<double>::mantissa_explicit_bits();
word = negative ? word | (uint64_t(1) << binary_format<double>::sign_index())
: word;
double value;
std::memcpy(&value, &word, sizeof(double));
return value;
}
} // internal
} // simdjson

File diff suppressed because it is too large Load Diff

View File

@ -4,6 +4,7 @@ SIMDJSON_PUSH_DISABLE_WARNINGS
SIMDJSON_DISABLE_UNDESIRED_WARNINGS
#include "to_chars.cpp"
#include "from_chars.cpp"
#include "internal/error_tables.cpp"
#include "internal/jsoncharutils_tables.cpp"
#include "internal/numberparsing_tables.cpp"

View File

@ -54,6 +54,7 @@ target_compile_definitions(stringparsingcheck PRIVATE NOMINMAX)
# All remaining tests link with simdjson proper
link_libraries(simdjson)
add_cpp_test(random_string_number_tests LABELS acceptance per_implementation)
add_cpp_test(basictests LABELS acceptance per_implementation)
add_cpp_test(minify_tests LABELS acceptance per_implementation)
add_cpp_test(document_stream_tests LABELS acceptance per_implementation)

View File

@ -16,6 +16,17 @@
#include "cast_tester.h"
#include "test_macros.h"
/**
* Some systems have bad floating-point parsing. We want to exclude them.
*/
#if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined (__linux__) || defined (__APPLE__) || defined(__FreeBSD__)
// Finally, we want to exclude legacy 32-bit systems.
#ifndef SIMDJSON_IS_32BITS
// So we only run some of the floating-point tests under 64-bit linux, apple, regular visual studio, freebsd.
#define TEST_FLOATS
#endif
#endif
const size_t AMAZON_CELLPHONES_NDJSON_DOC_COUNT = 793;
#define SIMDJSON_SHOW_DEFINE(x) printf("%s=%s\n", #x, STRINGIFY(x))
@ -34,6 +45,34 @@ namespace number_tests {
return ua + ub + 0x80000000;
}
// Parses a fixed list of number strings and checks that each one yields
// exactly the expected double (written as a hexadecimal floating literal).
// Returns false on the first mismatch.
bool ground_truth() {
  std::cout << __func__ << std::endl;
  const std::pair<std::string, double> cases[] = {
      {"2.2250738585072013e-308", 0x1p-1022},
      {"-92666518056446206563E3", -0x1.39f764644154dp+76},
      {"-92666518056446206563E3", -0x1.39f764644154dp+76},
      {"-42823146028335318693e-128", -0x1.0176daa6cdaafp-360},
      {"90054602635948575728E72", 0x1.61ab4ea9cb6c3p+305},
      {"1.00000000000000188558920870223463870174566020691753515394643550663070558368373221972569761144603605635692374830246134201063722058e-309", 0x0.0b8157268fdafp-1022},
      {"0e9999999999999999999999999999", 0x0p+0},
      {"-2402844368454405395.2", -0x1.0ac4f1c7422e7p+61}};
  simdjson::dom::parser parser;
  for (const auto &entry : cases) {
    const std::string &text = entry.first;
    const double expected = entry.second;
    std::cout << "parsing the string '" << text << "'" << std::endl;
    std::cout << "I am expecting the floating-point value '" << expected << "'" << std::endl;
    double result;
    ASSERT_SUCCESS(parser.parse(text).get(result));
    std::cout << "Resulting float is '" << result << "'" << std::endl;
    if (result == expected) {
      continue;
    }
    // Mismatch: show both values bit-exactly along with the input.
    std::cerr << std::hexfloat << result << " vs " << expected << std::endl;
    std::cerr << text << std::endl;
    return false;
  }
  return true;
}
bool small_integers() {
std::cout << __func__ << std::endl;
@ -56,6 +95,7 @@ namespace number_tests {
std::cout << __func__ << std::endl;
simdjson::dom::parser parser;
std::vector<std::pair<std::string, double>> testing = {
{"9999999999999999999e0",9999999999999999999.0},
{"9999999999999999999.0",9999999999999999999.0},
{"9999999999999999999",9999999999999999999.},
{"999999999999999999.9",999999999999999999.9},
@ -99,7 +139,6 @@ namespace number_tests {
double expected = pow(2, i);
size_t n = snprintf(buf, sizeof(buf), "%.*e", std::numeric_limits<double>::max_digits10 - 1, expected);
if (n >= sizeof(buf)) { abort(); }
fflush(NULL);
double actual;
auto error = parser.parse(buf, n).get(actual);
if (error) { std::cerr << error << std::endl; return false; }
@ -194,14 +233,13 @@ namespace number_tests {
simdjson::dom::parser parser;
bool is_pow_correct{1e-308 == std::pow(10,-308)};
int start_point = is_pow_correct ? -10000 : -307;
int start_point = is_pow_correct ? -1000 : -307;
if(!is_pow_correct) {
std::cout << "On your system, the pow function is busted. Sorry about that. " << std::endl;
}
for (int i = start_point; i <= 308; ++i) {// large negative values should be zero.
size_t n = snprintf(buf, sizeof(buf), "1e%d", i);
if (n >= sizeof(buf)) { abort(); }
fflush(NULL);
double actual;
auto error = parser.parse(buf, n).get(actual);
if (error) { std::cerr << error << std::endl; return false; }
@ -217,8 +255,69 @@ namespace number_tests {
printf("Powers of 10 can be parsed.\n");
return true;
}
// Parses `vals` with simdjson and checks that the result is exactly `val`.
//
// The string is also parsed with the runtime library's strtod (in the "C"
// locale). If simdjson disagrees with `val` but agrees with a successful
// strtod parse, the answer is accepted, since `val` itself was produced by the
// compiler's number parsing and may be the inaccurate one.
//
// Returns true when the value is accepted, false on a parse error or a
// mismatch.
bool basic_test_64bit(std::string vals, double val) {
  std::cout << " parsing " << vals << std::endl;
  double std_answer;
  char *endptr;
  // We want to call strtod with the C (default) locale to avoid
  // potential issues in case someone has a different locale.
  // Unfortunately, Visual Studio has a different syntax.
  const char *cval = vals.c_str();
#ifdef _WIN32
  static _locale_t c_locale = _create_locale(LC_ALL, "C");
  std_answer = _strtod_l(cval, &endptr, c_locale);
#else
  static locale_t c_locale = newlocale(LC_ALL_MASK, "C", NULL);
  std_answer = strtod_l(cval, &endptr, c_locale);
#endif
  // When nothing was consumed, std_answer is meaningless.
  const bool runtime_parsed = (endptr != cval);
  if (!runtime_parsed) {
    std::cerr << "Your runtime library failed to parse " << vals << std::endl;
  }
  double actual;
  simdjson::dom::parser parser;
  auto error = parser.parse(vals).get(actual);
  if (error) {
    std::cerr << error << std::endl;
    return false;
  }
  if (actual != val) {
    std::cerr << std::hexfloat << actual << " but I was expecting " << val
              << std::endl;
    // hexfloat is sticky: restore the default floating-point format on the
    // stream we changed (std::dec only affects integers).
    std::cerr << std::defaultfloat;
    std::cerr << "string: " << vals << std::endl;
    // Only trust the runtime library when it actually parsed the string.
    if (runtime_parsed && (std_answer == actual)) {
      std::cerr << "simdjson agrees with your runtime library, so we will accept the answer." << std::endl;
      return true;
    }
    return false;
  }
  std::cout << std::hexfloat << actual << " == " << val << std::endl;
  std::cout << std::defaultfloat;
  return true;
}
bool specific_tests() {
std::cout << __func__ << std::endl;
return basic_test_64bit("-2402844368454405395.2",-2402844368454405395.2) &&
basic_test_64bit("4503599627370496.5", 4503599627370496.5) &&
basic_test_64bit("4503599627475352.5", 4503599627475352.5) &&
basic_test_64bit("4503599627475353.5", 4503599627475353.5) &&
basic_test_64bit("2251799813685248.25", 2251799813685248.25) &&
basic_test_64bit("1125899906842624.125", 1125899906842624.125) &&
basic_test_64bit("1125899906842901.875", 1125899906842901.875) &&
basic_test_64bit("2251799813685803.75", 2251799813685803.75) &&
basic_test_64bit("4503599627370497.5", 4503599627370497.5) &&
basic_test_64bit("45035996.273704995", 45035996.273704995) &&
basic_test_64bit("45035996.273704985", 45035996.273704985) &&
basic_test_64bit
basic_test_64bit
}
bool run() {
return small_integers() &&
return specific_tests() &&
ground_truth() &&
small_integers() &&
powers_of_two() &&
powers_of_ten() &&
nines();
@ -1189,7 +1288,14 @@ namespace type_tests {
&& (expected_value >= 0 ?
test_cast<uint64_t>(result, expected_value) :
test_cast_error<uint64_t>(result, NUMBER_OUT_OF_RANGE))
#ifdef TEST_FLOATS
// We trust the underlying system to be accurate.
&& test_cast<double>(result, static_cast<double>(expected_value))
#else
// We don't trust the underlying system so we only run the test_cast
// exact test when the expected_value is within the 53-bit range.
&& ((expected_value<-9007199254740992) || (expected_value>9007199254740992) || test_cast<double>(result, static_cast<double>(expected_value)))
#endif
&& test_cast_error<bool>(result, INCORRECT_TYPE)
&& test_is_null(result, false);
}
@ -1209,6 +1315,14 @@ namespace type_tests {
&& test_cast_error<int64_t>(result, NUMBER_OUT_OF_RANGE)
&& test_cast<uint64_t>(result, expected_value)
&& test_cast<double>(result, static_cast<double>(expected_value))
#ifdef TEST_FLOATS
// We trust the underlying system to be accurate.
&& test_cast<double>(result, static_cast<double>(expected_value))
#else
// We don't trust the underlying system so we only run the test_cast
// exact test when the expected_value is within the 53-bit range.
&& ((expected_value>9007199254740992) || test_cast<double>(result, static_cast<double>(expected_value)))
#endif
&& test_cast_error<bool>(result, INCORRECT_TYPE)
&& test_is_null(result, false);
}
@ -1409,7 +1523,7 @@ namespace minify_tests {
auto e = simdjson::minify(bogus_json.get(), i, output_json.get(), newlength);
if(e) {
std::cerr << "got an error (unexpected) : " << e << std::endl;
return false;
return false;
}
}
return true;
@ -1427,7 +1541,7 @@ namespace minify_tests {
auto e = simdjson::minify(bogus_json.get(), i, output_json.get(), newlength);
if(e) {
std::cerr << "got an error (unexpected) : " << e << std::endl;
return false;
return false;
}
}
return true;
@ -1899,7 +2013,7 @@ int main(int argc, char *argv[]) {
dom_api_tests::run() &&
type_tests::run() &&
format_tests::run() &&
number_tests::run()
number_tests::run()
) {
std::cout << "Basic tests are ok." << std::endl;
return EXIT_SUCCESS;

View File

@ -1,9 +1,10 @@
#include <cstring>
#include <inttypes.h>
#include <math.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#ifndef JSON_TEST_NUMBERS
#define JSON_TEST_NUMBERS
@ -22,6 +23,29 @@ void found_unsigned_integer(uint64_t result, const uint8_t *buf);
#include "simdjson.h"
/**
* Some systems have bad floating-point parsing. We want to exclude them.
*/
#if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined (__linux__) || defined (__APPLE__) || defined(__FreeBSD__)
// Under Visual Studio, Linux, Apple and FreeBSD systems, we have a good chance of having a decent
// enough strtod. This is not certain, but it is probably a good enough heuristic. We exclude systems
// like msys2 or cygwin.
//
// Finally, we want to exclude legacy 32-bit systems.
#ifndef SIMDJSON_IS_32BITS
// So we only run some of the floating-point tests under 64-bit linux, apple, regular visual studio, freebsd.
#define TEST_FLOATS
// Apple and freebsd need a special header, typically.
#if defined __APPLE__ || defined(__FreeBSD__)
# include <xlocale.h>
#endif
#endif
#endif
// ulp distance
// Marc B. Reynolds, 2016-2019
// Public Domain under http://unlicense.org, see link for details.
@ -73,6 +97,10 @@ bool is_in_bad_list(const char *buf) {
return false;
}
#ifndef TEST_FLOATS
// We do not recognize the system, so we do not verify our results.
void found_invalid_number(const uint8_t *) {}
#else
void found_invalid_number(const uint8_t *buf) {
invalid_count++;
char *endptr;
@ -82,7 +110,7 @@ void found_invalid_number(const uint8_t *buf) {
#else
static locale_t c_locale = newlocale(LC_ALL_MASK, "C", NULL);
double expected = strtod_l((const char *)buf, &endptr, c_locale);
#endif
#endif
if (endptr != (const char *)buf) {
if (!is_in_bad_list((const char *)buf)) {
printf("Warning: found_invalid_number %.32s whereas strtod parses it to "
@ -93,6 +121,7 @@ void found_invalid_number(const uint8_t *buf) {
}
}
}
#endif
void found_integer(int64_t result, const uint8_t *buf) {
int_count++;
@ -101,7 +130,7 @@ void found_integer(int64_t result, const uint8_t *buf) {
if ((endptr == (const char *)buf) || (expected != result)) {
#if (!(__MINGW32__) && !(__MINGW64__))
fprintf(stderr, "Error: parsed %" PRId64 " out of %.32s, ", result, buf);
#else // mingw is busted since we include #include <inttypes.h>
#else // mingw is busted since we include #include <inttypes.h> and it will still not provide PRId64
fprintf(stderr, "Error: parsed %lld out of %.32s, ", (long long)result, buf);
#endif
fprintf(stderr, " while parsing %s \n", fullpath);
@ -124,6 +153,10 @@ void found_unsigned_integer(uint64_t result, const uint8_t *buf) {
}
}
#ifndef TEST_FLOATS
// We do not recognize the system, so we do not verify our results.
void found_float(double , const uint8_t *) {}
#else
void found_float(double result, const uint8_t *buf) {
char *endptr;
float_count++;
@ -133,7 +166,7 @@ void found_float(double result, const uint8_t *buf) {
#else
static locale_t c_locale = newlocale(LC_ALL_MASK, "C", NULL);
double expected = strtod_l((const char *)buf, &endptr, c_locale);
#endif
#endif
if (endptr == (const char *)buf) {
fprintf(stderr,
"parsed %f from %.32s whereas strtod refuses to parse a float, ",
@ -141,7 +174,7 @@ void found_float(double result, const uint8_t *buf) {
fprintf(stderr, " while parsing %s \n", fullpath);
parse_error |= PARSE_ERROR;
}
if (fpclassify(expected) != fpclassify(result)) {
if (std::fpclassify(expected) != std::fpclassify(result)) {
fprintf(stderr,
"floats not in the same category expected: %f observed: %f \n",
expected, result);
@ -158,6 +191,7 @@ void found_float(double result, const uint8_t *buf) {
parse_error |= PARSE_ERROR;
}
}
#endif
#include "simdjson.h"
#include "simdjson.cpp"

View File

@ -0,0 +1,197 @@
#include <cstring>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstdint>
#include <random>
#include <climits>
#include <unistd.h>
#include "simdjson.h"
/**
* Some systems have bad floating-point parsing. We want to exclude them.
*/
#if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined (__linux__) || defined (__APPLE__) || defined(__FreeBSD__)
// Under Visual Studio, Linux, Apple and FreeBSD systems, we have a good chance of having a decent
// enough strtod. This is not certain, but it is probably a good enough heuristic. We exclude systems
// like msys2 or cygwin.
//
// Finally, we want to exclude legacy 32-bit systems.
#ifndef SIMDJSON_IS_32BITS
// So we only run some of the floating-point tests under 64-bit linux, apple, regular visual studio, freebsd.
#define TEST_FLOATS
// Apple and freebsd need a special header, typically.
#if defined __APPLE__ || defined(__FreeBSD__)
# include <xlocale.h>
#endif
#endif
#endif
/**
 * A seeded Mersenne Twister bundled with the integer distributions needed to
 * build random number strings: booleans, digits (0-9), non-zero digits (1-9),
 * mantissa digit counts (1-40) and exponent digit counts (1-3).
 * A seed is mandatory, so that runs are reproducible.
 */
struct RandomEngine {
  RandomEngine() = delete;
  RandomEngine(uint32_t seed) : generator(seed) {}
  std::uniform_int_distribution<int> one_zero_generator{0, 1};
  std::uniform_int_distribution<int> digit_generator{0, 9};
  std::uniform_int_distribution<int> nonzero_digit_generator{1, 9};
  std::uniform_int_distribution<int> digit_count_generator{1, 40};
  std::uniform_int_distribution<int> exp_count_generator{1, 3};
  /** Fair coin flip. */
  bool next_bool() { return one_zero_generator(generator) != 0; }
  /** Uniform digit in [0, 9]. */
  int next_digit() { return digit_generator(generator); }
  /** Uniform digit in [1, 9]. */
  int next_nonzero_digit() { return nonzero_digit_generator(generator); }
  /** Uniform mantissa length in [1, 40]. */
  int next_digit_count() { return digit_count_generator(generator); }
  /** Uniform exponent length in [1, 3]. */
  int next_exp_count() { return exp_count_generator(generator); }
  std::mt19937 generator;
};
/**
 * Writes a random, null-terminated number string into `buffer` and returns
 * its length (terminator excluded).
 *
 * The string is made of an optional '-', 1 to 40 digits with an optional '.'
 * somewhere after the first digit, and an optional exponent ('e' or 'E', an
 * optional sign, then 1 to 3 digits starting with a non-zero digit). The
 * first mantissa digit is non-zero unless the '.' comes right after it.
 * The caller must provide a buffer large enough for the longest string.
 */
size_t build_random_string(RandomEngine &rand, char *buffer) {
  char *out = buffer;
  if (rand.next_bool()) {
    *out++ = '-';
  }
  const size_t mantissa_digits = size_t(rand.next_digit_count());
  // A separator position equal to the digit count means "no separator".
  std::uniform_int_distribution<int> separator_picker(1, int(mantissa_digits));
  const size_t separator_at = size_t(separator_picker(rand.generator));
  for (size_t idx = 0; idx != mantissa_digits; ++idx) {
    if (idx == separator_at) {
      *out++ = '.';
    }
    // Avoid a leading zero, except for strings of the form "0.xyz".
    const bool must_be_nonzero = (idx == 0) && (separator_at != 1);
    const int digit = must_be_nonzero ? rand.next_nonzero_digit() : rand.next_digit();
    *out++ = char(digit + '0');
  }
  if (rand.next_bool()) {
    *out++ = rand.next_bool() ? 'e' : 'E';
    if (rand.next_bool()) {
      *out++ = '-';
    } else if (rand.next_bool()) {
      *out++ = '+';
    }
    const size_t exponent_digits = size_t(rand.next_exp_count());
    for (size_t idx = 0; idx < exponent_digits; ++idx) {
      // The exponent never starts with a zero.
      const int digit = (idx == 0) ? rand.next_nonzero_digit() : rand.next_digit();
      *out++ = char(digit + '0');
    }
  }
  *out = '\0'; // null termination
  return size_t(out - buffer);
}
#ifdef TEST_FLOATS
// Compares `result` with what the runtime library's strtod (in the "C"
// locale) produces for `buf`. Returns true when both agree exactly; otherwise
// reports the discrepancy on stderr and returns false.
bool check_float(double result, const char *buf) {
  char *parse_end;
#ifdef _WIN32
  static _locale_t c_locale = _create_locale(LC_ALL, "C");
  const double expected = _strtod_l(buf, &parse_end, c_locale);
#else
  static locale_t c_locale = newlocale(LC_ALL_MASK, "C", NULL);
  const double expected = strtod_l(buf, &parse_end, c_locale);
#endif
  if (parse_end == buf) {
    // strtod consumed nothing.
    fprintf(stderr,
            "parsed %f from %.32s whereas strtod refuses to parse a float, ",
            result, buf);
    return false;
  }
  if (expected == result) {
    return true;
  }
  fprintf(stderr, "parsed %.128e from \n", result);
  fprintf(stderr, " %.32s whereas strtod gives\n", buf);
  fprintf(stderr, " %.128e,", expected);
  return false;
}
#else
// We do not recognize the system, so we do not verify our results.
bool check_float(double, const char *) { return true; }
#endif
/**
 * Generates `volume` random number strings from the given seed, parses each
 * one, and checks every successfully parsed value with check_float.
 * Strings that fail to parse are skipped. Prints a dot every 100000 strings.
 * Returns false as soon as a check fails.
 */
bool tester(int seed, size_t volume) {
  char buffer[1024]; // large buffer (can't overflow)
  simdjson::dom::parser parser;
  RandomEngine rand(seed);
  double result;
  for (size_t count = 0; count < volume; ++count) {
    if ((count % 100000) == 0) {
      std::cout << "." << std::flush;
    }
    const size_t length = build_random_string(rand, buffer);
    auto error = parser.parse(buffer, length).get(result);
    if (error) {
      continue;
    }
    // When we parse a (finite) number, it better match strtod.
    if (!check_float(result, buffer)) {
      return false;
    }
  }
  return true;
}
// Entry point. Options:
//   -a <name>  select the implementation (architecture) to test
//   -m <count> number of random strings to test (1,000,000 by default)
//   -h         print the list of options
int main(int argc, char *argv[]) {
  size_t howmany = 1000000;
  while (true) {
    const int option = getopt(argc, argv, "a:m:h");
    if (option == -1) {
      break;
    }
    if (option == 'a') {
      const simdjson::implementation *impl = simdjson::available_implementations[optarg];
      if (!impl) {
        fprintf(stderr, "Unsupported architecture value -a %s\n", optarg);
        return EXIT_FAILURE;
      }
      if (!impl->supported_by_runtime_system()) {
        fprintf(stderr, "The selected implementation does not match your current CPU: -a %s\n", optarg);
        return EXIT_FAILURE;
      }
      simdjson::active_implementation = impl;
    } else if (option == 'h') {
      std::cout << "-a to select an architecture" << std::endl;
      std::cout << "-m to select a number of tests" << std::endl;
      return EXIT_SUCCESS;
    } else if (option == 'm') {
      const long long requested_howmany = atoll(optarg);
      if (requested_howmany <= 0) {
        fprintf(stderr, "Please provide a positive number of tests -m %s no larger than %lld \n", optarg, LLONG_MAX);
        return EXIT_FAILURE;
      }
      howmany = size_t(requested_howmany);
    } else {
      fprintf(stderr, "Unexpected argument %c\n", option);
      return EXIT_FAILURE;
    }
  }
  // The seed is fixed so that failures can be reproduced.
  const bool all_passed = tester(1234344, howmany);
  std::cout << (all_passed ? "All tests ok." : "Failure.") << std::endl;
  return all_passed ? EXIT_SUCCESS : EXIT_FAILURE;
}