new number parsing (#1222)
* Remove our dependency on strtod_l by bundling our own slow path. * Ok. Let us drop strtod entirely. * Trimming down the powers to -342. * Removing useless line. * Many more comments. * Adding some DLL exports. * Let the gods help those who rely on windows+gcc. * Marking the subnormals as unlikely. This is pretty much "performance neutral", but it might help just a bit with twitter.json.
This commit is contained in:
parent
1d9926698e
commit
37e6d1e9c7
|
@ -13,6 +13,12 @@ namespace internal {
|
|||
* Defined in src/to_chars
|
||||
*/
|
||||
char *to_chars(char *first, const char *last, double value);
|
||||
/**
|
||||
* @private
|
||||
* A number parsing routine.
|
||||
* Defined in src/from_chars
|
||||
*/
|
||||
double from_chars(const char *first) noexcept;
|
||||
}
|
||||
|
||||
#ifndef SIMDJSON_EXCEPTIONS
|
||||
|
@ -209,48 +215,4 @@ namespace std {
|
|||
#define SIMDJSON_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } }
|
||||
|
||||
|
||||
/**
|
||||
* We may fall back on the system's number parsing, and we want
|
||||
* to be able to call a locale-insensitive number parser. It unfortunately
|
||||
* means that we need to load up locale headers.
|
||||
* The locale.h header is generally available:
|
||||
*/
|
||||
#include <locale.h>
|
||||
/**
|
||||
* Determining whether we should import xlocale.h or not is
|
||||
* a bit of a nightmare. Visual Studio and recent GLIBC (GCC) do not need it.
|
||||
* However, FreeBSD and Apple platforms will need it.
|
||||
* And we would want to cover as many platforms as possible.
|
||||
*/
|
||||
#ifdef __has_include
|
||||
// This is the easy case: we have __has_include and can check whether
|
||||
// xlocale is available. If so, we load it up.
|
||||
#if __has_include(<xlocale.h>)
|
||||
#include <xlocale.h>
|
||||
#endif // __has_include
|
||||
#else // We do not have __has_include
|
||||
// Here we do not have __has_include
|
||||
// We first check for __GLIBC__
|
||||
#ifdef __GLIBC__ // If we have __GLIBC__ then we should have features.h which should help.
|
||||
// Note that having __GLIBC__ does not imply that we are compiling against glibc. But
|
||||
// we hope that any platform that defines __GLIBC__ will mimic glibc.
|
||||
#include <features.h>
|
||||
// Check whether we have an old GLIBC.
|
||||
#if !((__GLIBC__ > 2) || ((__GLIBC__ == 2) && (__GLIBC_MINOR__ > 25)))
|
||||
#include <xlocale.h> // Old glibc needs xlocale, otherwise xlocale is unavailable.
|
||||
#endif // !((__GLIBC__ > 2) || ((__GLIBC__ == 2) && (__GLIBC_MINOR__ > 25)))
|
||||
#else // __GLIBC__
|
||||
// Ok. So we do not have __GLIBC__
|
||||
// We assume that everything that is not GLIBC and not on old freebsd or windows
|
||||
// needs xlocale.
|
||||
// It is likely that recent FreeBSD and Apple platforms load xlocale.h next:
|
||||
#if !(defined(_WIN32) || (__FreeBSD_version < 1000010))
|
||||
#include <xlocale.h> // Will always happen under apple.
|
||||
#endif //
|
||||
#endif // __GLIBC__
|
||||
#endif // __has_include
|
||||
/**
|
||||
* End of the crazy locale headers.
|
||||
*/
|
||||
|
||||
#endif // SIMDJSON_COMMON_DEFS_H
|
||||
|
|
|
@ -8,12 +8,7 @@ namespace {
|
|||
/// @private
|
||||
namespace numberparsing {
|
||||
|
||||
using internal::FASTFLOAT_LARGEST_POWER;
|
||||
using internal::FASTFLOAT_SMALLEST_POWER;
|
||||
using internal::value128;
|
||||
using internal::power_of_ten;
|
||||
using internal::mantissa_64;
|
||||
using internal::mantissa_128;
|
||||
|
||||
|
||||
#ifdef JSON_TEST_NUMBERS
|
||||
#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), NUMBER_ERROR)
|
||||
|
@ -27,12 +22,25 @@ using internal::mantissa_128;
|
|||
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (WRITER).append_double((VALUE))
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
// Convert a mantissa, an exponent and a sign bit into an ieee64 double.
|
||||
// The real_exponent needs to be in [0, 2046] (technically real_exponent = 2047 would be acceptable).
|
||||
// The mantissa should be in [0,1<<53). The bit at index (1ULL << 52) will be zeroed.
|
||||
// Packs a sign, an exponent field and a significand into a 64-bit word and
// reinterprets that word as a double.
//   mantissa:      significand bits; bit 52 is cleared before packing.
//   real_exponent: value shifted into place starting at bit 52; it is not
//                  range-checked here, so the caller must pass a valid field value.
//   negative:      when true, bit 63 (the sign bit) is set.
// Returns the double whose bit pattern is the packed word.
simdjson_really_inline double to_double(uint64_t mantissa, uint64_t real_exponent, bool negative) {
|
||||
double d;
|
||||
mantissa &= ~(1ULL << 52); // clear bit 52 so it cannot collide with the exponent field
|
||||
mantissa |= real_exponent << 52; // place the exponent just above the low 52 bits
|
||||
mantissa |= (((uint64_t)negative) << 63); // sign goes in the top bit
|
||||
memcpy(&d, &mantissa, sizeof(d)); // copy the raw bits into the double
|
||||
return d;
|
||||
}
|
||||
}
|
||||
// Attempts to compute i * 10^(power) exactly; and if "negative" is
|
||||
// true, negate the result.
|
||||
// This function will only work in some cases, when it does not work, success is
|
||||
// set to false. This should work *most of the time* (like 99% of the time).
|
||||
// We assume that power is in the [FASTFLOAT_SMALLEST_POWER,
|
||||
// FASTFLOAT_LARGEST_POWER] interval: the caller is responsible for this check.
|
||||
// We assume that power is in the [smallest_power,
|
||||
// largest_power] interval: the caller is responsible for this check.
|
||||
simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, double &d) {
|
||||
// we start with a fast path
|
||||
// It was described in
|
||||
|
@ -61,9 +69,9 @@ simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool neg
|
|||
// and s / p will produce correctly rounded values.
|
||||
//
|
||||
if (power < 0) {
|
||||
d = d / power_of_ten[-power];
|
||||
d = d / simdjson::internal::power_of_ten[-power];
|
||||
} else {
|
||||
d = d * power_of_ten[power];
|
||||
d = d * simdjson::internal::power_of_ten[power];
|
||||
}
|
||||
if (negative) {
|
||||
d = -d;
|
||||
|
@ -97,16 +105,8 @@ simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool neg
|
|||
return true;
|
||||
}
|
||||
|
||||
// We are going to need to do some 64-bit arithmetic to get a more precise product.
|
||||
// We use a table lookup approach.
|
||||
// It is safe because
|
||||
// power >= FASTFLOAT_SMALLEST_POWER
|
||||
// and power <= FASTFLOAT_LARGEST_POWER
|
||||
// We recover the mantissa of the power, it has a leading 1. It is always
|
||||
// rounded down.
|
||||
uint64_t factor_mantissa = mantissa_64[power - FASTFLOAT_SMALLEST_POWER];
|
||||
|
||||
// The exponent is 1024 + 63 + power
|
||||
|
||||
// The exponent is 1024 + 63 + power
|
||||
// + floor(log(5**power)/log(2)).
|
||||
// The 1024 comes from the ieee64 standard.
|
||||
// The 63 comes from the fact that we use a 64-bit word.
|
||||
|
@ -119,61 +119,89 @@ simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool neg
|
|||
// is equal to
|
||||
// floor(log(5**power)/log(2)) + power
|
||||
//
|
||||
// The 65536 is (1<<16) and corresponds to
|
||||
// The 65536 is (1<<16) and corresponds to
|
||||
// (65536 * power) >> 16 ---> power
|
||||
//
|
||||
// ((152170 * power ) >> 16) is equal to
|
||||
// floor(log(5**power)/log(2))
|
||||
// ((152170 * power ) >> 16) is equal to
|
||||
// floor(log(5**power)/log(2))
|
||||
//
|
||||
// Note that this is not magic: 152170/(1<<16) is
|
||||
// Note that this is not magic: 152170/(1<<16) is
|
||||
// approximately equal to log(5)/log(2).
|
||||
// The 1<<16 value is a power of two; we could use a
|
||||
// The 1<<16 value is a power of two; we could use a
|
||||
// larger power of 2 if we wanted to.
|
||||
//
|
||||
int64_t exponent = (((152170 + 65536) * power) >> 16) + 1024 + 63;
|
||||
|
||||
|
||||
|
||||
// We want the most significant bit of i to be 1. Shift if needed.
|
||||
int lz = leading_zeroes(i);
|
||||
i <<= lz;
|
||||
|
||||
|
||||
// We are going to need to do some 64-bit arithmetic to get a precise product.
|
||||
// We use a table lookup approach.
|
||||
// It is safe because
|
||||
// power >= smallest_power
|
||||
// and power <= largest_power
|
||||
// We recover the mantissa of the power, it has a leading 1. It is always
|
||||
// rounded down.
|
||||
//
|
||||
// We want the most significant 64 bits of the product. We know
|
||||
// this will be non-zero because the most significant bit of i is
|
||||
// 1.
|
||||
value128 product = jsoncharutils::full_multiplication(i, factor_mantissa);
|
||||
uint64_t lower = product.low;
|
||||
uint64_t upper = product.high;
|
||||
const uint32_t index = 2 * uint32_t(power - simdjson::internal::smallest_power);
|
||||
// Optimization: It may be that materializing the index as a variable might confuse some compilers and prevent effective complex-addressing loads. (Done for code clarity.)
|
||||
//
|
||||
// The full_multiplication function computes the 128-bit product of two 64-bit words
|
||||
// with a returned value of type value128 with a "low component" corresponding to the
|
||||
// 64-bit least significant bits of the product and with a "high component" corresponding
|
||||
// to the 64-bit most significant bits of the product.
|
||||
simdjson::internal::value128 firstproduct = jsoncharutils::full_multiplication(i, simdjson::internal::power_of_five_128[index]);
|
||||
// Both i and power_of_five_128[index] have their most significant bit set to 1 which
|
||||
// implies that either the most or the second most significant bit of the product
|
||||
// is 1. We pack values in this manner for efficiency reasons: it maximizes the use
|
||||
// we make of the product. It also makes it easy to reason about the product: there is
|
||||
// 0 or 1 leading zero in the product.
|
||||
|
||||
// We know that upper has at most one leading zero because
|
||||
// both i and factor_mantissa have a leading one. This means
|
||||
// that the result is at least as large as ((1<<63)*(1<<63))/(1<<64).
|
||||
|
||||
// As long as the first 9 bits of "upper" are not "1", then we
|
||||
// know that we have an exact computed value for the leading
|
||||
// 55 bits because any imprecision would play out as a +1, in
|
||||
// the worst case.
|
||||
if (simdjson_unlikely((upper & 0x1FF) == 0x1FF) && (lower + i < lower)) {
|
||||
uint64_t factor_mantissa_low =
|
||||
mantissa_128[power - FASTFLOAT_SMALLEST_POWER];
|
||||
// next, we compute the 64-bit x 128-bit multiplication, getting a 192-bit
|
||||
// result (three 64-bit values)
|
||||
product = jsoncharutils::full_multiplication(i, factor_mantissa_low);
|
||||
uint64_t product_low = product.low;
|
||||
uint64_t product_middle2 = product.high;
|
||||
uint64_t product_middle1 = lower;
|
||||
uint64_t product_high = upper;
|
||||
uint64_t product_middle = product_middle1 + product_middle2;
|
||||
if (product_middle < product_middle1) {
|
||||
product_high++; // overflow carry
|
||||
}
|
||||
// We want to check whether mantissa *i + i would affect our result.
|
||||
// This does happen, e.g. with 7.3177701707893310e+15.
|
||||
if (((product_middle + 1 == 0) && ((product_high & 0x1FF) == 0x1FF) &&
|
||||
(product_low + i < product_low))) { // let us be prudent and bail out.
|
||||
// Unless the least significant 9 bits of the high (64-bit) part of the full
|
||||
// product are all 1s, then we know that the most significant 55 bits are
|
||||
// exact and no further work is needed. Having 55 bits is necessary because
|
||||
// we need 53 bits for the mantissa but we have to have one rounding bit and
|
||||
// we can waste a bit if the most significant bit of the product is zero.
|
||||
if((firstproduct.high & 0x1FF) == 0x1FF) {
|
||||
// We want to compute i * 5^q, but only care about the top 55 bits at most.
|
||||
// Consider the scenario where q>=0. Then 5^q may not fit in 64-bits. Doing
|
||||
// the full computation is wasteful. So we do what is called a "truncated
|
||||
// multiplication".
|
||||
// We take the most significant 64-bits, and we put them in
|
||||
// power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
|
||||
// to the desired approximation using one multiplication. Sometimes it does not suffice.
|
||||
// Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
|
||||
// then we get a better approximation to i * 5^q. In very rare cases, even that
|
||||
// will not suffice, though it is seemingly very hard to find such a scenario.
|
||||
//
|
||||
// That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
|
||||
// more complicated.
|
||||
//
|
||||
// There is an extra layer of complexity in that we need more than 55 bits of
|
||||
// accuracy in the round-to-even scenario.
|
||||
//
|
||||
// The full_multiplication function computes the 128-bit product of two 64-bit words
|
||||
// with a returned value of type value128 with a "low component" corresponding to the
|
||||
// 64-bit least significant bits of the product and with a "high component" corresponding
|
||||
// to the 64-bit most significant bits of the product.
|
||||
simdjson::internal::value128 secondproduct = jsoncharutils::full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
|
||||
firstproduct.low += secondproduct.high;
|
||||
if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
|
||||
// At this point, we might need to add at most one to firstproduct, but this
|
||||
// can only change the value of firstproduct.high if firstproduct.low is maximal.
|
||||
if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) {
|
||||
// This is very unlikely, but if so, we need to do much more work!
|
||||
return false;
|
||||
}
|
||||
upper = product_high;
|
||||
lower = product_middle;
|
||||
}
|
||||
uint64_t lower = firstproduct.low;
|
||||
uint64_t upper = firstproduct.high;
|
||||
// The final mantissa should be 53 bits with a leading 1.
|
||||
// We shift it so that it occupies 54 bits with a leading 1.
|
||||
///////
|
||||
|
@ -182,32 +210,56 @@ simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool neg
|
|||
lz += int(1 ^ upperbit);
|
||||
|
||||
// Here we have mantissa < (1<<54).
|
||||
|
||||
int64_t real_exponent = exponent - lz;
|
||||
if (simdjson_unlikely(real_exponent <= 0)) { // we have a subnormal?
|
||||
// Here we have that real_exponent <= 0 so -real_exponent >= 0
|
||||
if(-real_exponent + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure.
|
||||
d = 0.0;
|
||||
return true;
|
||||
}
|
||||
// next line is safe because -real_exponent + 1 < 64
|
||||
mantissa >>= -real_exponent + 1;
|
||||
// Thankfully, we can't have both "round-to-even" and subnormals because
|
||||
// "round-to-even" only occurs for powers close to 0.
|
||||
mantissa += (mantissa & 1); // round up
|
||||
mantissa >>= 1;
|
||||
// There is a weird scenario where we only just avoid having a subnormal.
|
||||
// Suppose we start with 2.2250738585072013e-308, we end up
|
||||
// with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal
|
||||
// whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round
|
||||
// up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer
|
||||
// subnormal, but we can only know this after rounding.
|
||||
// So we only declare a subnormal if we are smaller than the threshold.
|
||||
real_exponent = (mantissa < (uint64_t(1) << 52)) ? 0 : 1;
|
||||
d = to_double(mantissa, real_exponent, negative);
|
||||
return true;
|
||||
}
|
||||
// We have to round to even. The "to even" part
|
||||
// is only a problem when we are right in between two floats
|
||||
// which we guard against.
|
||||
// If we have lots of trailing zeros, we may fall right between two
|
||||
// floating-point values.
|
||||
if (simdjson_unlikely((lower == 0) && ((upper & 0x1FF) == 0) &&
|
||||
((mantissa & 3) == 1))) {
|
||||
// if mantissa & 1 == 1 we might need to round up.
|
||||
//
|
||||
// Scenarios:
|
||||
// 1. We are not in the middle. Then we should round up.
|
||||
//
|
||||
// 2. We are right in the middle. Whether we round up depends
|
||||
// on the last significant bit: if it is "one" then we round
|
||||
// up (round to even) otherwise, we do not.
|
||||
//
|
||||
// So if the last significant bit is 1, we can safely round up.
|
||||
// Hence we only need to bail out if (mantissa & 3) == 1.
|
||||
// Otherwise we may need more accuracy or analysis to determine whether
|
||||
// we are exactly between two floating-point numbers.
|
||||
// It can be triggered with 1e23.
|
||||
// Note: because the factor_mantissa and factor_mantissa_low are
|
||||
// almost always rounded down (except for small positive powers),
|
||||
// almost always should round up.
|
||||
return false;
|
||||
//
|
||||
// The round-to-even cases take the form of a number 2m+1 which is in (2^53,2^54]
|
||||
// times a power of two. That is, it is right between a number with binary significand
|
||||
// m and another number with binary significand m+1; and it must be the case
|
||||
// that it cannot be represented by a float itself.
|
||||
//
|
||||
// We must have that w * 10 ^q == (2m+1) * 2^p for some power of two 2^p.
|
||||
// Recall that 10^q = 5^q * 2^q.
|
||||
// When q >= 0, we must have that (2m+1) is divisible by 5^q, so 5^q <= 2^54. We have that
|
||||
// 5^23 <= 2^54 and it is the last power of five to qualify, so q <= 23.
|
||||
// When q<0, we have w >= (2m+1) x 5^{-q}. We must have that w<2^{64} so
|
||||
// (2m+1) x 5^{-q} < 2^{64}. We have that 2m+1>2^{53}. Hence, we must have
|
||||
// 2^{53} x 5^{-q} < 2^{64}.
|
||||
// Hence we have 5^{-q} < 2^{11} or q >= -4.
|
||||
//
|
||||
// We require lower <= 1 and not lower == 0 because we could not prove that
|
||||
// lower == 0 is implied; but we could prove that lower <= 1 is a necessary and sufficient test.
|
||||
if (simdjson_unlikely((lower <= 1) && (power >= -4) && (power <= 23) && ((mantissa & 3) == 1))) {
|
||||
if((mantissa << (upperbit + 64 - 53 - 2)) == upper) {
|
||||
mantissa &= ~1; // flip it so that we do not round up
|
||||
}
|
||||
}
|
||||
|
||||
mantissa += mantissa & 1;
|
||||
|
@ -219,53 +271,29 @@ simdjson_really_inline bool compute_float_64(int64_t power, uint64_t i, bool neg
|
|||
// This will happen when parsing values such as 7.2057594037927933e+16
|
||||
////////
|
||||
mantissa = (1ULL << 52);
|
||||
lz--; // undo previous addition
|
||||
real_exponent++;
|
||||
}
|
||||
mantissa &= ~(1ULL << 52);
|
||||
uint64_t real_exponent = exponent - lz;
|
||||
// we have to check that real_exponent is in range, otherwise we bail out
|
||||
if (simdjson_unlikely((real_exponent < 1) || (real_exponent > 2046))) {
|
||||
if (simdjson_unlikely(real_exponent > 2046)) {
|
||||
// We have an infinite value!!! We could actually throw an error here if we could.
|
||||
return false;
|
||||
}
|
||||
mantissa |= real_exponent << 52;
|
||||
mantissa |= (((uint64_t)negative) << 63);
|
||||
memcpy(&d, &mantissa, sizeof(d));
|
||||
d = to_double(mantissa, real_exponent, negative);
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool parse_float_strtod(const uint8_t *ptr, double *outDouble) {
|
||||
char *endptr;
|
||||
// We want to call strtod with the C (default) locale to avoid
|
||||
// potential issues in case someone has a different locale.
|
||||
// Unfortunately, Visual Studio has a different syntax.
|
||||
#ifdef _WIN32
|
||||
static _locale_t c_locale = _create_locale(LC_ALL, "C");
|
||||
*outDouble = _strtod_l((const char *)ptr, &endptr, c_locale);
|
||||
#else
|
||||
static locale_t c_locale = newlocale(LC_ALL_MASK, "C", NULL);
|
||||
*outDouble = strtod_l((const char *)ptr, &endptr, c_locale);
|
||||
#endif
|
||||
// Some libraries will set errno = ERANGE when the value is subnormal,
|
||||
// yet we may want to be able to parse subnormal values.
|
||||
// However, we do not want to tolerate NAN or infinite values.
|
||||
//
|
||||
// Values like infinity or NaN are not allowed in the JSON specification.
|
||||
// If you consume a large value and you map it to "infinity", you will no
|
||||
// longer be able to serialize back a standard-compliant JSON. And there is
|
||||
// no realistic application where you might need values so large that they
|
||||
// can't fit in binary64. The maximal value is about 1.7976931348623157 x
|
||||
// 10^308. It is an unimaginably large number. There will never be any piece of
|
||||
// engineering involving as many as 10^308 parts. It is estimated that there
|
||||
// are about 10^80 atoms in the universe. The estimate for the total number
|
||||
// of electrons is similar. Using a double-precision floating-point value, we
|
||||
// can represent easily the number of atoms in the universe. We could also
|
||||
// represent the number of ways you can pick any three individual atoms at
|
||||
// random in the universe. If you ever encounter a number much larger than
|
||||
// 10^308, you know that you have a bug. RapidJSON will reject a document with
|
||||
// a float that does not fit in binary64. JSON for Modern C++ (nlohmann/json)
|
||||
// will flat out throw an exception.
|
||||
//
|
||||
if ((endptr == (const char *)ptr) || (!std::isfinite(*outDouble))) {
|
||||
// We call a fallback floating-point parser that might be slow. Note
|
||||
// it will accept JSON numbers, but the JSON spec. is more restrictive so
|
||||
// before you call parse_float_fallback, you need to have validated the input
|
||||
// string with the JSON grammar.
|
||||
// It will return an error (false) if the parsed number is infinite.
|
||||
// The string parsing itself always succeeds. We know that there is at least
|
||||
// one digit.
|
||||
static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) {
|
||||
*outDouble = simdjson::internal::from_chars((const char *)ptr);
|
||||
// We do not accept infinite values.
|
||||
if (!std::isfinite(*outDouble)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
@ -292,7 +320,7 @@ simdjson_really_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) {
|
|||
template<typename W>
|
||||
error_code slow_float_parsing(SIMDJSON_UNUSED const uint8_t * src, W writer) {
|
||||
double d;
|
||||
if (parse_float_strtod(src, &d)) {
|
||||
if (parse_float_fallback(src, &d)) {
|
||||
writer.append_double(d);
|
||||
return SUCCESS;
|
||||
}
|
||||
|
@ -346,14 +374,14 @@ simdjson_really_inline error_code parse_exponent(SIMDJSON_UNUSED const uint8_t *
|
|||
auto start_exp = p;
|
||||
int64_t exp_number = 0;
|
||||
while (parse_digit(*p, exp_number)) { ++p; }
|
||||
// It is possible for parse_digit to overflow.
|
||||
// It is possible for parse_digit to overflow.
|
||||
// In particular, it could overflow to INT64_MIN, and we cannot do - INT64_MIN.
|
||||
// Thus we *must* check for possible overflow before we negate exp_number.
|
||||
|
||||
// Performance notes: it may seem like combining the two "simdjson_unlikely checks" below into
|
||||
// a single simdjson_unlikely path would be faster. The reasoning is sound, but the compiler may
|
||||
// not oblige and may, in fact, generate two distinct paths in any case. It might be
|
||||
// possible to do uint64_t(p - start_exp - 1) >= 18 but it could end up trading off
|
||||
// possible to do uint64_t(p - start_exp - 1) >= 18 but it could end up trading off
|
||||
// instructions for a simdjson_likely branch, an inconclusive gain.
|
||||
|
||||
// If there were no digits, it's an error.
|
||||
|
@ -363,7 +391,7 @@ simdjson_really_inline error_code parse_exponent(SIMDJSON_UNUSED const uint8_t *
|
|||
// We have a valid positive exponent in exp_number at this point, except that
|
||||
// it may have overflowed.
|
||||
|
||||
// If there were more than 18 digits, we may have overflowed the integer. We have to do
|
||||
// If there were more than 18 digits, we may have overflowed the integer. We have to do
|
||||
// something!!!!
|
||||
if (simdjson_unlikely(p > start_exp+18)) {
|
||||
// Skip leading zeroes: 1e000000000000000000001 is technically valid and doesn't overflow
|
||||
|
@ -375,12 +403,12 @@ simdjson_really_inline error_code parse_exponent(SIMDJSON_UNUSED const uint8_t *
|
|||
// Note that 999999999999999999 is assuredly too large. The maximal ieee64 value before
|
||||
// infinity is ~1.8e308. The smallest subnormal is ~5e-324. So, actually, we could
|
||||
// truncate at 324.
|
||||
// Note that there is no reason to fail per se at this point in time.
|
||||
// Note that there is no reason to fail per se at this point in time.
|
||||
// E.g., 0e999999999999999999999 is a fine number.
|
||||
if (p > start_exp+18) { exp_number = 999999999999999999; }
|
||||
}
|
||||
// At this point, we know that exp_number is a sane, positive, signed integer.
|
||||
// It is <= 999,999,999,999,999,999. As long as 'exponent' is in
|
||||
// It is <= 999,999,999,999,999,999. As long as 'exponent' is in
|
||||
// [-8223372036854775808, 8223372036854775808], we won't overflow. Because 'exponent'
|
||||
// is bounded in magnitude by the size of the JSON input, we are fine in this universe.
|
||||
// To sum it up: the next line should never overflow.
|
||||
|
@ -404,10 +432,11 @@ simdjson_really_inline error_code write_float(const uint8_t *const src, bool neg
|
|||
// If we frequently had to deal with long strings of digits,
|
||||
// we could extend our code by using a 128-bit integer instead
|
||||
// of a 64-bit integer. However, this is uncommon in practice.
|
||||
// digit count is off by 1 because of the decimal (assuming there was one).
|
||||
//
|
||||
// 9999999999999999999 < 2**64 so we can accommodate 19 digits.
|
||||
if (simdjson_unlikely(digit_count-1 > 19 && significant_digits(start_digits, digit_count) > 19)) {
|
||||
// If we have a decimal separator, then digit_count - 1 is the number of digits, but we
|
||||
// may not have a decimal separator!
|
||||
if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) {
|
||||
// Ok, chances are good that we had an overflow!
|
||||
// this is almost never going to get called!!!
|
||||
// we start anew, going slowly!!!
|
||||
|
@ -427,22 +456,25 @@ simdjson_really_inline error_code write_float(const uint8_t *const src, bool neg
|
|||
// NOTE: it's weird that the simdjson_unlikely() only wraps half the if, but it seems to get slower any other
|
||||
// way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331
|
||||
// To future reader: we'd love if someone found a better way, or at least could explain this result!
|
||||
if (simdjson_unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || (exponent > FASTFLOAT_LARGEST_POWER)) {
|
||||
// this is almost never going to get called!!!
|
||||
// we start anew, going slowly!!!
|
||||
// NOTE: This makes a *copy* of the writer and passes it to slow_float_parsing. This happens
|
||||
// because slow_float_parsing is a non-inlined function. If we passed our writer reference to
|
||||
// it, it would force it to be stored in memory, preventing the compiler from picking it apart
|
||||
// and putting into registers. i.e. if we pass it as reference, it gets slow.
|
||||
// This is what forces the skip_double, as well.
|
||||
error_code error = slow_float_parsing(src, writer);
|
||||
writer.skip_double();
|
||||
return error;
|
||||
if (simdjson_unlikely(exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power)) {
|
||||
//
|
||||
// Important: smallest_power is such that it leads to a zero value.
|
||||
// Observe that 18446744073709551615e-343 == 0, i.e. (2**64 - 1) e -343 is zero
|
||||
// so something x 10^-343 goes to zero, but not so with something x 10^-342.
|
||||
static_assert(simdjson::internal::smallest_power <= -342, "smallest_power is not small enough");
|
||||
//
|
||||
if((exponent < simdjson::internal::smallest_power) || (i == 0)) {
|
||||
WRITE_DOUBLE(0, src, writer);
|
||||
return SUCCESS;
|
||||
} else { // (exponent > largest_power) and (i != 0)
|
||||
// We have, for sure, an infinite value and simdjson refuses to parse infinite values.
|
||||
return INVALID_NUMBER(src);
|
||||
}
|
||||
}
|
||||
double d;
|
||||
if (!compute_float_64(exponent, i, negative, d)) {
|
||||
// we are almost never going to get here.
|
||||
if (!parse_float_strtod(src, &d)) { return INVALID_NUMBER(src); }
|
||||
if (!parse_float_fallback(src, &d)) { return INVALID_NUMBER(src); }
|
||||
}
|
||||
WRITE_DOUBLE(d, src, writer);
|
||||
return SUCCESS;
|
||||
|
@ -757,7 +789,7 @@ SIMDJSON_UNUSED simdjson_really_inline simdjson_result<double> parse_double(cons
|
|||
if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; }
|
||||
|
||||
exponent += exp_neg ? 0-exp : exp;
|
||||
overflow = overflow || exponent < FASTFLOAT_SMALLEST_POWER || exponent > FASTFLOAT_LARGEST_POWER;
|
||||
overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;
|
||||
}
|
||||
|
||||
if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; }
|
||||
|
@ -769,7 +801,7 @@ SIMDJSON_UNUSED simdjson_really_inline simdjson_result<double> parse_double(cons
|
|||
if (simdjson_likely(!overflow)) {
|
||||
if (compute_float_64(exponent, i, negative, d)) { return d; }
|
||||
}
|
||||
if (!parse_float_strtod(src-negative, &d)) {
|
||||
if (!parse_float_fallback(src-negative, &d)) {
|
||||
return NUMBER_ERROR;
|
||||
}
|
||||
return d;
|
||||
|
|
|
@ -11,15 +11,6 @@ void found_bad_string(const uint8_t *buf);
|
|||
|
||||
namespace simdjson {
|
||||
namespace internal {
|
||||
|
||||
constexpr int FASTFLOAT_SMALLEST_POWER = -325;
|
||||
constexpr int FASTFLOAT_LARGEST_POWER = 308;
|
||||
|
||||
struct value128 {
|
||||
uint64_t low;
|
||||
uint64_t high;
|
||||
};
|
||||
|
||||
// structural chars here are
|
||||
// they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c (and NULL)
|
||||
// we are also interested in the four whitespace characters
|
||||
|
|
|
@ -5,22 +5,54 @@
|
|||
|
||||
namespace simdjson {
|
||||
namespace internal {
|
||||
/**
|
||||
* The smallest non-zero float (binary64) is 2^−1074.
|
||||
* We take as input numbers of the form w x 10^q where w < 2^64.
|
||||
* We have that w * 10^-343 < 2^(64-344) 5^-343 < 2^-1076.
|
||||
* However, we have that
|
||||
* (2^64-1) * 10^-342 = (2^64-1) * 2^-342 * 5^-342 > 2^−1074.
|
||||
* Thus it is possible for a number of the form w * 10^-342 where
|
||||
* w is a 64-bit value to be a non-zero floating-point number.
|
||||
*********
|
||||
* Any number of form w * 10^309 where w>= 1 is going to be
|
||||
* infinite in binary64 so we never need to worry about powers
|
||||
* of 5 greater than 308.
|
||||
*/
|
||||
constexpr int smallest_power = -342;
|
||||
constexpr int largest_power = 308;
|
||||
|
||||
/**
|
||||
* Represents a 128-bit value.
|
||||
* low: least significant 64 bits.
|
||||
* high: most significant 64 bits.
|
||||
*/
|
||||
struct value128 {
|
||||
uint64_t low;
|
||||
uint64_t high;
|
||||
};
|
||||
|
||||
|
||||
// Precomputed powers of ten from 10^0 to 10^22. These
|
||||
// can be represented exactly using the double type.
|
||||
extern SIMDJSON_DLLIMPORTEXPORT const double power_of_ten[];
|
||||
// The mantissas of powers of ten from -308 to 308, extended out to sixty four
|
||||
// bits. The array contains the powers of ten approximated
|
||||
// as a 64-bit mantissa. It goes from 10^FASTFLOAT_SMALLEST_POWER to
|
||||
// 10^FASTFLOAT_LARGEST_POWER (inclusively).
|
||||
// The mantissa is truncated, and
|
||||
// never rounded up. Uses about 5KB.
|
||||
extern SIMDJSON_DLLIMPORTEXPORT const uint64_t mantissa_64[];
|
||||
// A complement to mantissa_64
|
||||
// complete to a 128-bit mantissa.
|
||||
// Uses about 5KB but is rarely accessed.
|
||||
extern SIMDJSON_DLLIMPORTEXPORT const uint64_t mantissa_128[];
|
||||
|
||||
|
||||
/**
|
||||
* When mapping numbers from decimal to binary,
|
||||
* we go from w * 10^q to m * 2^p but we have
|
||||
* 10^q = 5^q * 2^q, so effectively
|
||||
* we are trying to match
|
||||
* w * 2^q * 5^q to m * 2^p. Thus the powers of two
|
||||
* are not a concern since they can be represented
|
||||
* exactly using the binary notation, only the powers of five
|
||||
* affect the binary significand.
|
||||
*/
|
||||
|
||||
|
||||
// The truncated powers of five from 5^-342 all the way to 5^308
|
||||
// The mantissa is truncated to 128 bits, and
|
||||
// never rounded up. Uses about 5KB.
|
||||
extern SIMDJSON_DLLIMPORTEXPORT const uint64_t power_of_five_128[];
|
||||
} // namespace internal
|
||||
} // namespace simdjson
|
||||
|
||||
|
|
|
@ -0,0 +1,486 @@
|
|||
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
|
||||
namespace simdjson {
|
||||
namespace internal {
|
||||
|
||||
/**
|
||||
* The code in the internal::from_chars function is meant to handle the floating-point number parsing
|
||||
* when we have more than 19 digits in the decimal mantissa. This should only be seen
|
||||
* in adversarial scenarios: we do not expect production systems to even produce
|
||||
* such floating-point numbers.
|
||||
*
|
||||
* The parser is based on work by Nigel Tao (at https://github.com/google/wuffs/)
|
||||
* who credits Ken Thompson for the design (via a reference to the Go source
|
||||
* code). See
|
||||
* https://github.com/google/wuffs/blob/aa46859ea40c72516deffa1b146121952d6dfd3b/internal/cgen/base/floatconv-submodule-data.c
|
||||
* https://github.com/google/wuffs/blob/46cd8105f47ca07ae2ba8e6a7818ef9c0df6c152/internal/cgen/base/floatconv-submodule-code.c
|
||||
* It is probably not very fast but it is a fallback that should almost never be
|
||||
 * called in real life. Google Wuffs is published under the Apache License 2.0.
|
||||
**/
|
||||
|
||||
namespace {
// Capacity of decimal::digits: digits that do not fit are dropped by the
// parsing and shifting routines.
constexpr uint32_t max_digits{768};
// Bound on the magnitude of decimal::decimal_point; the shifting routines
// treat anything beyond it as an underflow/overflow.
constexpr int32_t decimal_point_range{2047};
} // namespace
|
||||
|
||||
// A binary floating-point value in decomposed form: the explicit mantissa
// bits and the biased power-of-two exponent. Both fields start at zero.
struct adjusted_mantissa {
  uint64_t mantissa = 0;
  int power2 = 0;
  // The constructor stays user-provided so that the type keeps the same
  // (non-aggregate) category it had before.
  adjusted_mantissa() {}
};
|
||||
|
||||
struct decimal {
|
||||
uint32_t num_digits;
|
||||
int32_t decimal_point;
|
||||
bool negative;
|
||||
bool truncated;
|
||||
uint8_t digits[max_digits];
|
||||
};
|
||||
|
||||
// Compile-time description of a binary floating-point layout. Only the
// specializations below provide definitions; the primary template merely
// declares the interface.
template <typename T> struct binary_format {
  // Number of mantissa bits that are physically stored.
  static constexpr int mantissa_explicit_bits();
  // Exponent bias, negated.
  static constexpr int minimum_exponent();
  // Biased exponent value that encodes infinity.
  static constexpr int infinite_power();
  // Bit position of the sign bit.
  static constexpr int sign_index();
};

// binary64 (double): 52 stored mantissa bits.
template <>
constexpr int binary_format<double>::mantissa_explicit_bits() {
  return 52;
}

// binary64 (double): the exponent bias is 1023.
template <>
constexpr int binary_format<double>::minimum_exponent() {
  return -1023;
}

// binary64 (double): all eleven exponent bits set.
template <>
constexpr int binary_format<double>::infinite_power() {
  return 0x7FF;
}

// binary64 (double): the sign is the most significant of the 64 bits.
template <>
constexpr int binary_format<double>::sign_index() {
  return 63;
}
|
||||
|
||||
// Tells whether `c` is one of the characters '0' through '9'.
bool is_integer(char c) noexcept {
  return !(c < '0' || c > '9');
}
|
||||
|
||||
// This should always succeed since it follows a call to parse_number.
|
||||
decimal parse_decimal(const char *&p) noexcept {
|
||||
decimal answer;
|
||||
answer.num_digits = 0;
|
||||
answer.decimal_point = 0;
|
||||
answer.negative = false;
|
||||
answer.truncated = false;
|
||||
answer.negative = (*p == '-');
|
||||
if ((*p == '-') || (*p == '+')) {
|
||||
++p;
|
||||
}
|
||||
|
||||
while (*p == '0') {
|
||||
++p;
|
||||
}
|
||||
while (is_integer(*p)) {
|
||||
if (answer.num_digits + 1 < max_digits) {
|
||||
answer.digits[answer.num_digits++] = uint8_t(*p - '0');
|
||||
} else {
|
||||
answer.truncated = true;
|
||||
}
|
||||
++p;
|
||||
}
|
||||
const char *first_after_period{};
|
||||
if (*p == '.') {
|
||||
++p;
|
||||
first_after_period = p;
|
||||
// if we have not yet encountered a zero, we have to skip it as well
|
||||
if (answer.num_digits == 0) {
|
||||
// skip zeros
|
||||
while (*p == '0') {
|
||||
++p;
|
||||
}
|
||||
}
|
||||
while (is_integer(*p)) {
|
||||
if (answer.num_digits + 1 < max_digits) {
|
||||
answer.digits[answer.num_digits++] = uint8_t(*p - '0');
|
||||
} else {
|
||||
answer.truncated = true;
|
||||
}
|
||||
++p;
|
||||
}
|
||||
answer.decimal_point = int32_t(first_after_period - p);
|
||||
}
|
||||
|
||||
if (('e' == *p) || ('E' == *p)) {
|
||||
++p;
|
||||
bool neg_exp = false;
|
||||
if ('-' == *p) {
|
||||
neg_exp = true;
|
||||
++p;
|
||||
} else if ('+' == *p) {
|
||||
++p;
|
||||
}
|
||||
int32_t exp_number = 0; // exponential part
|
||||
while (is_integer(*p)) {
|
||||
uint8_t digit = uint8_t(*p - '0');
|
||||
if (exp_number < 0x10000) {
|
||||
exp_number = 10 * exp_number + digit;
|
||||
}
|
||||
++p;
|
||||
}
|
||||
answer.decimal_point += (neg_exp ? -exp_number : exp_number);
|
||||
}
|
||||
answer.decimal_point += answer.num_digits;
|
||||
return answer;
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
// remove all final zeroes
|
||||
inline void trim(decimal &h) {
|
||||
while ((h.num_digits > 0) && (h.digits[h.num_digits - 1] == 0)) {
|
||||
h.num_digits--;
|
||||
}
|
||||
}
|
||||
|
||||
// Returns how many digits decimal_left_shift() must add in front of h when
// shifting by `shift` (only the low six bits of `shift` are used).
//
// Each entry of the first table packs a digit count in its upper five bits
// and, in its lower eleven bits, an offset into the second table, which
// holds the decimal digits of the successive powers of five (5, 25, 125,
// ...). The leading digits of h are compared with the digits stored for the
// requested shift: when h compares lower (or runs out of digits first), one
// digit fewer is needed.
uint32_t number_of_digits_decimal_left_shift(decimal &h, uint32_t shift) {
  shift &= 63;
  const static uint16_t number_of_digits_decimal_left_shift_table[65] = {
      0x0000, 0x0800, 0x0801, 0x0803, 0x1006, 0x1009, 0x100D, 0x1812, 0x1817,
      0x181D, 0x2024, 0x202B, 0x2033, 0x203C, 0x2846, 0x2850, 0x285B, 0x3067,
      0x3073, 0x3080, 0x388E, 0x389C, 0x38AB, 0x38BB, 0x40CC, 0x40DD, 0x40EF,
      0x4902, 0x4915, 0x4929, 0x513E, 0x5153, 0x5169, 0x5180, 0x5998, 0x59B0,
      0x59C9, 0x61E3, 0x61FD, 0x6218, 0x6A34, 0x6A50, 0x6A6D, 0x6A8B, 0x72AA,
      0x72C9, 0x72E9, 0x7B0A, 0x7B2B, 0x7B4D, 0x8370, 0x8393, 0x83B7, 0x83DC,
      0x8C02, 0x8C28, 0x8C4F, 0x9477, 0x949F, 0x94C8, 0x9CF2, 0x051C, 0x051C,
      0x051C, 0x051C,
  };
  const static uint8_t
      number_of_digits_decimal_left_shift_table_powers_of_5[0x051C] = {
          5, 2, 5, 1, 2, 5, 6, 2, 5, 3, 1, 2, 5, 1, 5, 6, 2, 5, 7, 8, 1, 2, 5,
          3, 9, 0, 6, 2, 5, 1, 9, 5, 3, 1, 2, 5, 9, 7, 6, 5, 6, 2, 5, 4, 8, 8,
          2, 8, 1, 2, 5, 2, 4, 4, 1, 4, 0, 6, 2, 5, 1, 2, 2, 0, 7, 0, 3, 1, 2,
          5, 6, 1, 0, 3, 5, 1, 5, 6, 2, 5, 3, 0, 5, 1, 7, 5, 7, 8, 1, 2, 5, 1,
          5, 2, 5, 8, 7, 8, 9, 0, 6, 2, 5, 7, 6, 2, 9, 3, 9, 4, 5, 3, 1, 2, 5,
          3, 8, 1, 4, 6, 9, 7, 2, 6, 5, 6, 2, 5, 1, 9, 0, 7, 3, 4, 8, 6, 3, 2,
          8, 1, 2, 5, 9, 5, 3, 6, 7, 4, 3, 1, 6, 4, 0, 6, 2, 5, 4, 7, 6, 8, 3,
          7, 1, 5, 8, 2, 0, 3, 1, 2, 5, 2, 3, 8, 4, 1, 8, 5, 7, 9, 1, 0, 1, 5,
          6, 2, 5, 1, 1, 9, 2, 0, 9, 2, 8, 9, 5, 5, 0, 7, 8, 1, 2, 5, 5, 9, 6,
          0, 4, 6, 4, 4, 7, 7, 5, 3, 9, 0, 6, 2, 5, 2, 9, 8, 0, 2, 3, 2, 2, 3,
          8, 7, 6, 9, 5, 3, 1, 2, 5, 1, 4, 9, 0, 1, 1, 6, 1, 1, 9, 3, 8, 4, 7,
          6, 5, 6, 2, 5, 7, 4, 5, 0, 5, 8, 0, 5, 9, 6, 9, 2, 3, 8, 2, 8, 1, 2,
          5, 3, 7, 2, 5, 2, 9, 0, 2, 9, 8, 4, 6, 1, 9, 1, 4, 0, 6, 2, 5, 1, 8,
          6, 2, 6, 4, 5, 1, 4, 9, 2, 3, 0, 9, 5, 7, 0, 3, 1, 2, 5, 9, 3, 1, 3,
          2, 2, 5, 7, 4, 6, 1, 5, 4, 7, 8, 5, 1, 5, 6, 2, 5, 4, 6, 5, 6, 6, 1,
          2, 8, 7, 3, 0, 7, 7, 3, 9, 2, 5, 7, 8, 1, 2, 5, 2, 3, 2, 8, 3, 0, 6,
          4, 3, 6, 5, 3, 8, 6, 9, 6, 2, 8, 9, 0, 6, 2, 5, 1, 1, 6, 4, 1, 5, 3,
          2, 1, 8, 2, 6, 9, 3, 4, 8, 1, 4, 4, 5, 3, 1, 2, 5, 5, 8, 2, 0, 7, 6,
          6, 0, 9, 1, 3, 4, 6, 7, 4, 0, 7, 2, 2, 6, 5, 6, 2, 5, 2, 9, 1, 0, 3,
          8, 3, 0, 4, 5, 6, 7, 3, 3, 7, 0, 3, 6, 1, 3, 2, 8, 1, 2, 5, 1, 4, 5,
          5, 1, 9, 1, 5, 2, 2, 8, 3, 6, 6, 8, 5, 1, 8, 0, 6, 6, 4, 0, 6, 2, 5,
          7, 2, 7, 5, 9, 5, 7, 6, 1, 4, 1, 8, 3, 4, 2, 5, 9, 0, 3, 3, 2, 0, 3,
          1, 2, 5, 3, 6, 3, 7, 9, 7, 8, 8, 0, 7, 0, 9, 1, 7, 1, 2, 9, 5, 1, 6,
          6, 0, 1, 5, 6, 2, 5, 1, 8, 1, 8, 9, 8, 9, 4, 0, 3, 5, 4, 5, 8, 5, 6,
          4, 7, 5, 8, 3, 0, 0, 7, 8, 1, 2, 5, 9, 0, 9, 4, 9, 4, 7, 0, 1, 7, 7,
          2, 9, 2, 8, 2, 3, 7, 9, 1, 5, 0, 3, 9, 0, 6, 2, 5, 4, 5, 4, 7, 4, 7,
          3, 5, 0, 8, 8, 6, 4, 6, 4, 1, 1, 8, 9, 5, 7, 5, 1, 9, 5, 3, 1, 2, 5,
          2, 2, 7, 3, 7, 3, 6, 7, 5, 4, 4, 3, 2, 3, 2, 0, 5, 9, 4, 7, 8, 7, 5,
          9, 7, 6, 5, 6, 2, 5, 1, 1, 3, 6, 8, 6, 8, 3, 7, 7, 2, 1, 6, 1, 6, 0,
          2, 9, 7, 3, 9, 3, 7, 9, 8, 8, 2, 8, 1, 2, 5, 5, 6, 8, 4, 3, 4, 1, 8,
          8, 6, 0, 8, 0, 8, 0, 1, 4, 8, 6, 9, 6, 8, 9, 9, 4, 1, 4, 0, 6, 2, 5,
          2, 8, 4, 2, 1, 7, 0, 9, 4, 3, 0, 4, 0, 4, 0, 0, 7, 4, 3, 4, 8, 4, 4,
          9, 7, 0, 7, 0, 3, 1, 2, 5, 1, 4, 2, 1, 0, 8, 5, 4, 7, 1, 5, 2, 0, 2,
          0, 0, 3, 7, 1, 7, 4, 2, 2, 4, 8, 5, 3, 5, 1, 5, 6, 2, 5, 7, 1, 0, 5,
          4, 2, 7, 3, 5, 7, 6, 0, 1, 0, 0, 1, 8, 5, 8, 7, 1, 1, 2, 4, 2, 6, 7,
          5, 7, 8, 1, 2, 5, 3, 5, 5, 2, 7, 1, 3, 6, 7, 8, 8, 0, 0, 5, 0, 0, 9,
          2, 9, 3, 5, 5, 6, 2, 1, 3, 3, 7, 8, 9, 0, 6, 2, 5, 1, 7, 7, 6, 3, 5,
          6, 8, 3, 9, 4, 0, 0, 2, 5, 0, 4, 6, 4, 6, 7, 7, 8, 1, 0, 6, 6, 8, 9,
          4, 5, 3, 1, 2, 5, 8, 8, 8, 1, 7, 8, 4, 1, 9, 7, 0, 0, 1, 2, 5, 2, 3,
          2, 3, 3, 8, 9, 0, 5, 3, 3, 4, 4, 7, 2, 6, 5, 6, 2, 5, 4, 4, 4, 0, 8,
          9, 2, 0, 9, 8, 5, 0, 0, 6, 2, 6, 1, 6, 1, 6, 9, 4, 5, 2, 6, 6, 7, 2,
          3, 6, 3, 2, 8, 1, 2, 5, 2, 2, 2, 0, 4, 4, 6, 0, 4, 9, 2, 5, 0, 3, 1,
          3, 0, 8, 0, 8, 4, 7, 2, 6, 3, 3, 3, 6, 1, 8, 1, 6, 4, 0, 6, 2, 5, 1,
          1, 1, 0, 2, 2, 3, 0, 2, 4, 6, 2, 5, 1, 5, 6, 5, 4, 0, 4, 2, 3, 6, 3,
          1, 6, 6, 8, 0, 9, 0, 8, 2, 0, 3, 1, 2, 5, 5, 5, 5, 1, 1, 1, 5, 1, 2,
          3, 1, 2, 5, 7, 8, 2, 7, 0, 2, 1, 1, 8, 1, 5, 8, 3, 4, 0, 4, 5, 4, 1,
          0, 1, 5, 6, 2, 5, 2, 7, 7, 5, 5, 5, 7, 5, 6, 1, 5, 6, 2, 8, 9, 1, 3,
          5, 1, 0, 5, 9, 0, 7, 9, 1, 7, 0, 2, 2, 7, 0, 5, 0, 7, 8, 1, 2, 5, 1,
          3, 8, 7, 7, 7, 8, 7, 8, 0, 7, 8, 1, 4, 4, 5, 6, 7, 5, 5, 2, 9, 5, 3,
          9, 5, 8, 5, 1, 1, 3, 5, 2, 5, 3, 9, 0, 6, 2, 5, 6, 9, 3, 8, 8, 9, 3,
          9, 0, 3, 9, 0, 7, 2, 2, 8, 3, 7, 7, 6, 4, 7, 6, 9, 7, 9, 2, 5, 5, 6,
          7, 6, 2, 6, 9, 5, 3, 1, 2, 5, 3, 4, 6, 9, 4, 4, 6, 9, 5, 1, 9, 5, 3,
          6, 1, 4, 1, 8, 8, 8, 2, 3, 8, 4, 8, 9, 6, 2, 7, 8, 3, 8, 1, 3, 4, 7,
          6, 5, 6, 2, 5, 1, 7, 3, 4, 7, 2, 3, 4, 7, 5, 9, 7, 6, 8, 0, 7, 0, 9,
          4, 4, 1, 1, 9, 2, 4, 4, 8, 1, 3, 9, 1, 9, 0, 6, 7, 3, 8, 2, 8, 1, 2,
          5, 8, 6, 7, 3, 6, 1, 7, 3, 7, 9, 8, 8, 4, 0, 3, 5, 4, 7, 2, 0, 5, 9,
          6, 2, 2, 4, 0, 6, 9, 5, 9, 5, 3, 3, 6, 9, 1, 4, 0, 6, 2, 5,
      };

  const uint32_t entry = number_of_digits_decimal_left_shift_table[shift];
  const uint32_t next_entry =
      number_of_digits_decimal_left_shift_table[shift + 1];
  const uint32_t digits_added = entry >> 11;
  const uint32_t begin = entry & 0x7FF;
  const uint32_t end = next_entry & 0x7FF;
  const uint8_t *const reference =
      number_of_digits_decimal_left_shift_table_powers_of_5 + begin;
  const uint32_t reference_length = end - begin;

  for (uint32_t k = 0; k < reference_length; ++k) {
    if (k >= h.num_digits) {
      // h ran out of digits first: it compares lower.
      return digits_added - 1;
    }
    const uint8_t have = h.digits[k];
    const uint8_t want = reference[k];
    if (have != want) {
      return (have < want) ? (digits_added - 1) : digits_added;
    }
  }
  return digits_added;
}
|
||||
|
||||
} // end of anonymous namespace
|
||||
|
||||
// Rounds h to the nearest integer, ties going to the even neighbor unless
// h.truncated says that digits were dropped (then the tie is rounded up).
// Returns 0 when h has no digits or a negative decimal point, and
// UINT64_MAX when the integer part has more than 18 digits.
uint64_t round(decimal &h) {
  if (h.num_digits == 0) {
    return 0;
  }
  if (h.decimal_point < 0) {
    return 0;
  }
  if (h.decimal_point > 18) {
    return UINT64_MAX;
  }
  // From here on h.decimal_point is within [0, 18].
  const uint32_t integer_digits = uint32_t(h.decimal_point);
  uint64_t value = 0;
  for (uint32_t k = 0; k != integer_digits; ++k) {
    value *= 10;
    if (k < h.num_digits) {
      value += h.digits[k];
    }
  }
  if (integer_digits >= h.num_digits) {
    // No fractional digit: nothing to round.
    return value;
  }
  const uint8_t first_fraction_digit = h.digits[integer_digits];
  const bool exactly_half =
      (first_fraction_digit == 5) && (integer_digits + 1 == h.num_digits);
  bool bump;
  if (exactly_half) {
    // Tie: round to even, unless dropped digits put us above the half.
    const bool last_is_odd =
        (integer_digits > 0) && ((h.digits[integer_digits - 1] & 1) != 0);
    bump = h.truncated || last_is_odd;
  } else {
    bump = first_fraction_digit >= 5;
  }
  return bump ? (value + 1) : value;
}
|
||||
|
||||
// computes h * 2^-shift
|
||||
void decimal_left_shift(decimal &h, uint32_t shift) {
|
||||
if (h.num_digits == 0) {
|
||||
return;
|
||||
}
|
||||
uint32_t num_new_digits = number_of_digits_decimal_left_shift(h, shift);
|
||||
int32_t read_index = int32_t(h.num_digits - 1);
|
||||
uint32_t write_index = h.num_digits - 1 + num_new_digits;
|
||||
uint64_t n = 0;
|
||||
|
||||
while (read_index >= 0) {
|
||||
n += uint64_t(h.digits[read_index]) << shift;
|
||||
uint64_t quotient = n / 10;
|
||||
uint64_t remainder = n - (10 * quotient);
|
||||
if (write_index < max_digits) {
|
||||
h.digits[write_index] = uint8_t(remainder);
|
||||
} else if (remainder > 0) {
|
||||
h.truncated = true;
|
||||
}
|
||||
n = quotient;
|
||||
write_index--;
|
||||
read_index--;
|
||||
}
|
||||
while (n > 0) {
|
||||
uint64_t quotient = n / 10;
|
||||
uint64_t remainder = n - (10 * quotient);
|
||||
if (write_index < max_digits) {
|
||||
h.digits[write_index] = uint8_t(remainder);
|
||||
} else if (remainder > 0) {
|
||||
h.truncated = true;
|
||||
}
|
||||
n = quotient;
|
||||
write_index--;
|
||||
}
|
||||
h.num_digits += num_new_digits;
|
||||
if (h.num_digits > max_digits) {
|
||||
h.num_digits = max_digits;
|
||||
}
|
||||
h.decimal_point += int32_t(num_new_digits);
|
||||
trim(h);
|
||||
}
|
||||
|
||||
// computes h * 2^shift
|
||||
void decimal_right_shift(decimal &h, uint32_t shift) {
|
||||
uint32_t read_index = 0;
|
||||
uint32_t write_index = 0;
|
||||
|
||||
uint64_t n = 0;
|
||||
|
||||
while ((n >> shift) == 0) {
|
||||
if (read_index < h.num_digits) {
|
||||
n = (10 * n) + h.digits[read_index++];
|
||||
} else if (n == 0) {
|
||||
return;
|
||||
} else {
|
||||
while ((n >> shift) == 0) {
|
||||
n = 10 * n;
|
||||
read_index++;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
h.decimal_point -= int32_t(read_index - 1);
|
||||
if (h.decimal_point < -decimal_point_range) { // it is zero
|
||||
h.num_digits = 0;
|
||||
h.decimal_point = 0;
|
||||
h.negative = false;
|
||||
h.truncated = false;
|
||||
return;
|
||||
}
|
||||
uint64_t mask = (uint64_t(1) << shift) - 1;
|
||||
while (read_index < h.num_digits) {
|
||||
uint8_t new_digit = uint8_t(n >> shift);
|
||||
n = (10 * (n & mask)) + h.digits[read_index++];
|
||||
h.digits[write_index++] = new_digit;
|
||||
}
|
||||
while (n > 0) {
|
||||
uint8_t new_digit = uint8_t(n >> shift);
|
||||
n = 10 * (n & mask);
|
||||
if (write_index < max_digits) {
|
||||
h.digits[write_index++] = new_digit;
|
||||
} else if (new_digit > 0) {
|
||||
h.truncated = true;
|
||||
}
|
||||
}
|
||||
h.num_digits = write_index;
|
||||
trim(h);
|
||||
}
|
||||
|
||||
// Converts the decimal d to the binary floating-point format described by
// `binary`. d is used as scratch space and is modified.
//
// The result holds the explicit mantissa bits and the biased exponent,
// ready to be packed into a word. Zero is returned as (0, 0); infinity as
// (0, binary::infinite_power()).
template <typename binary> adjusted_mantissa compute_float(decimal &d) {
  adjusted_mantissa answer;
  if (d.num_digits == 0) {
    // should be zero
    answer.power2 = 0;
    answer.mantissa = 0;
    return answer;
  }
  // At this point, going further, we can assume that d.num_digits > 0.
  // We want to guard against excessive decimal point values because
  // they can result in long running times. Indeed, we do
  // shifts by at most 60 bits. We have that log(10**400)/log(2**60) ~= 22
  // which is fine, but log(10**299995)/log(2**60) ~= 16609 which is not
  // fine (runs for a long time).
  //
  if(d.decimal_point < -324) {
    // We have something smaller than 1e-324 which is always zero
    // in binary64 and binary32.
    // It should be zero.
    answer.power2 = 0;
    answer.mantissa = 0;
    return answer;
  } else if(d.decimal_point >= 310) {
    // We have something at least as large as 0.1e310 which is
    // always infinite.
    answer.power2 = binary::infinite_power();
    answer.mantissa = 0;
    return answer;
  }

  static const uint32_t max_shift = 60;
  static const uint32_t num_powers = 19;
  // powers[n] is the largest k such that 2^k <= 10^n.
  static const uint8_t powers[19] = {
      0,  3,  6,  9,  13, 16, 19, 23, 26, 29, //
      33, 36, 39, 43, 46, 49, 53, 56, 59,     //
  };
  // exp2 tracks the power of two that has been factored out of d so far.
  int32_t exp2 = 0;
  // While d >= 1, divide by powers of two.
  while (d.decimal_point > 0) {
    uint32_t n = uint32_t(d.decimal_point);
    uint32_t shift = (n < num_powers) ? powers[n] : max_shift;
    decimal_right_shift(d, shift);
    if (d.decimal_point < -decimal_point_range) {
      // should be zero
      answer.power2 = 0;
      answer.mantissa = 0;
      return answer;
    }
    exp2 += int32_t(shift);
  }
  // While d < 1/2, multiply by powers of two: we end up in [1/2 ... 1].
  while (d.decimal_point <= 0) {
    uint32_t shift;
    if (d.decimal_point == 0) {
      if (d.digits[0] >= 5) {
        break;
      }
      shift = (d.digits[0] < 2) ? 2 : 1;
    } else {
      uint32_t n = uint32_t(-d.decimal_point);
      shift = (n < num_powers) ? powers[n] : max_shift;
    }
    decimal_left_shift(d, shift);
    if (d.decimal_point > decimal_point_range) {
      // we want to get infinity: use the exponent of the target format
      // (a hard-coded 0xFF would only be right for binary32)
      answer.power2 = binary::infinite_power();
      answer.mantissa = 0;
      return answer;
    }
    exp2 -= int32_t(shift);
  }
  // We are now in the range [1/2 ... 1] but the binary format uses [1 ... 2].
  exp2--;
  constexpr int32_t minimum_exponent = binary::minimum_exponent();
  // Subnormal range: shift right until the exponent reaches the smallest
  // one that the format supports.
  while ((minimum_exponent + 1) > exp2) {
    uint32_t n = uint32_t((minimum_exponent + 1) - exp2);
    if (n > max_shift) {
      n = max_shift;
    }
    decimal_right_shift(d, n);
    exp2 += int32_t(n);
  }
  if ((exp2 - minimum_exponent) >= binary::infinite_power()) {
    answer.power2 = binary::infinite_power();
    answer.mantissa = 0;
    return answer;
  }

  // Scale so that the integer part of d holds the full mantissa (explicit
  // bits plus the leading bit), then round it to an integer.
  const int mantissa_size_in_bits = binary::mantissa_explicit_bits() + 1;
  decimal_left_shift(d, mantissa_size_in_bits);

  uint64_t mantissa = round(d);
  // It is possible that we have an overflow, in which case we need
  // to shift back.
  if (mantissa >= (uint64_t(1) << mantissa_size_in_bits)) {
    decimal_right_shift(d, 1);
    exp2 += 1;
    mantissa = round(d);
    if ((exp2 - minimum_exponent) >= binary::infinite_power()) {
      answer.power2 = binary::infinite_power();
      answer.mantissa = 0;
      return answer;
    }
  }
  answer.power2 = exp2 - binary::minimum_exponent();
  // No leading bit: the value is subnormal and its biased exponent is one
  // lower.
  if (mantissa < (uint64_t(1) << binary::mantissa_explicit_bits())) {
    answer.power2--;
  }
  // Keep the explicit bits only (the leading bit is implicit).
  answer.mantissa =
      mantissa & ((uint64_t(1) << binary::mantissa_explicit_bits()) - 1);
  return answer;
}
|
||||
|
||||
template <typename binary>
|
||||
adjusted_mantissa parse_long_mantissa(const char *first) {
|
||||
decimal d = parse_decimal(first);
|
||||
return compute_float<binary>(d);
|
||||
}
|
||||
|
||||
double from_chars(const char *first) noexcept {
|
||||
bool negative = first[0] == '-';
|
||||
if (negative) {
|
||||
first++;
|
||||
}
|
||||
adjusted_mantissa am = parse_long_mantissa<binary_format<double>>(first);
|
||||
uint64_t word = am.mantissa;
|
||||
word |= uint64_t(am.power2)
|
||||
<< binary_format<double>::mantissa_explicit_bits();
|
||||
word = negative ? word | (uint64_t(1) << binary_format<double>::sign_index())
|
||||
: word;
|
||||
double value;
|
||||
std::memcpy(&value, &word, sizeof(double));
|
||||
return value;
|
||||
}
|
||||
|
||||
} // internal
|
||||
} // simdjson
|
File diff suppressed because it is too large
Load Diff
|
@ -4,6 +4,7 @@ SIMDJSON_PUSH_DISABLE_WARNINGS
|
|||
SIMDJSON_DISABLE_UNDESIRED_WARNINGS
|
||||
|
||||
#include "to_chars.cpp"
|
||||
#include "from_chars.cpp"
|
||||
#include "internal/error_tables.cpp"
|
||||
#include "internal/jsoncharutils_tables.cpp"
|
||||
#include "internal/numberparsing_tables.cpp"
|
||||
|
|
|
@ -54,6 +54,7 @@ target_compile_definitions(stringparsingcheck PRIVATE NOMINMAX)
|
|||
|
||||
# All remaining tests link with simdjson proper
|
||||
link_libraries(simdjson)
|
||||
add_cpp_test(random_string_number_tests LABELS acceptance per_implementation)
|
||||
add_cpp_test(basictests LABELS acceptance per_implementation)
|
||||
add_cpp_test(minify_tests LABELS acceptance per_implementation)
|
||||
add_cpp_test(document_stream_tests LABELS acceptance per_implementation)
|
||||
|
|
|
@ -16,6 +16,17 @@
|
|||
#include "cast_tester.h"
|
||||
#include "test_macros.h"
|
||||
|
||||
/**
|
||||
* Some systems have bad floating-point parsing. We want to exclude them.
|
||||
*/
|
||||
#if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined (__linux__) || defined (__APPLE__) || defined(__FreeBSD__)
|
||||
// Finally, we want to exclude legacy 32-bit systems.
|
||||
#ifndef SIMDJSON_IS_32BITS
|
||||
// So we only run some of the floating-point tests under 64-bit linux, apple, regular visual studio, freebsd.
|
||||
#define TEST_FLOATS
|
||||
#endif
|
||||
#endif
|
||||
|
||||
const size_t AMAZON_CELLPHONES_NDJSON_DOC_COUNT = 793;
|
||||
#define SIMDJSON_SHOW_DEFINE(x) printf("%s=%s\n", #x, STRINGIFY(x))
|
||||
|
||||
|
@ -34,6 +45,34 @@ namespace number_tests {
|
|||
return ua + ub + 0x80000000;
|
||||
}
|
||||
|
||||
bool ground_truth() {
|
||||
std::cout << __func__ << std::endl;
|
||||
std::pair<std::string,double> ground_truth[] = {
|
||||
{"2.2250738585072013e-308",0x1p-1022},
|
||||
{"-92666518056446206563E3", -0x1.39f764644154dp+76},
|
||||
{"-92666518056446206563E3", -0x1.39f764644154dp+76},
|
||||
{"-42823146028335318693e-128", -0x1.0176daa6cdaafp-360},
|
||||
{"90054602635948575728E72", 0x1.61ab4ea9cb6c3p+305},
|
||||
{"1.00000000000000188558920870223463870174566020691753515394643550663070558368373221972569761144603605635692374830246134201063722058e-309", 0x0.0b8157268fdafp-1022},
|
||||
{"0e9999999999999999999999999999", 0x0p+0},
|
||||
{"-2402844368454405395.2", -0x1.0ac4f1c7422e7p+61}
|
||||
};
|
||||
simdjson::dom::parser parser;
|
||||
for(auto string_double : ground_truth) {
|
||||
std::cout << "parsing the string '" << string_double.first << "'" << std::endl;
|
||||
std::cout << "I am expecting the floating-point value '" << string_double.second << "'" << std::endl;
|
||||
double result;
|
||||
ASSERT_SUCCESS(parser.parse(string_double.first).get(result));
|
||||
std::cout << "Resulting float is '" << result << "'" << std::endl;
|
||||
if(result != string_double.second) {
|
||||
std::cerr << std::hexfloat << result << " vs " << string_double.second << std::endl;
|
||||
std::cerr << string_double.first << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool small_integers() {
|
||||
std::cout << __func__ << std::endl;
|
||||
|
@ -56,6 +95,7 @@ namespace number_tests {
|
|||
std::cout << __func__ << std::endl;
|
||||
simdjson::dom::parser parser;
|
||||
std::vector<std::pair<std::string, double>> testing = {
|
||||
{"9999999999999999999e0",9999999999999999999.0},
|
||||
{"9999999999999999999.0",9999999999999999999.0},
|
||||
{"9999999999999999999",9999999999999999999.},
|
||||
{"999999999999999999.9",999999999999999999.9},
|
||||
|
@ -99,7 +139,6 @@ namespace number_tests {
|
|||
double expected = pow(2, i);
|
||||
size_t n = snprintf(buf, sizeof(buf), "%.*e", std::numeric_limits<double>::max_digits10 - 1, expected);
|
||||
if (n >= sizeof(buf)) { abort(); }
|
||||
fflush(NULL);
|
||||
double actual;
|
||||
auto error = parser.parse(buf, n).get(actual);
|
||||
if (error) { std::cerr << error << std::endl; return false; }
|
||||
|
@ -194,14 +233,13 @@ namespace number_tests {
|
|||
simdjson::dom::parser parser;
|
||||
|
||||
bool is_pow_correct{1e-308 == std::pow(10,-308)};
|
||||
int start_point = is_pow_correct ? -10000 : -307;
|
||||
int start_point = is_pow_correct ? -1000 : -307;
|
||||
if(!is_pow_correct) {
|
||||
std::cout << "On your system, the pow function is busted. Sorry about that. " << std::endl;
|
||||
}
|
||||
for (int i = start_point; i <= 308; ++i) {// large negative values should be zero.
|
||||
size_t n = snprintf(buf, sizeof(buf), "1e%d", i);
|
||||
if (n >= sizeof(buf)) { abort(); }
|
||||
fflush(NULL);
|
||||
double actual;
|
||||
auto error = parser.parse(buf, n).get(actual);
|
||||
if (error) { std::cerr << error << std::endl; return false; }
|
||||
|
@ -217,8 +255,69 @@ namespace number_tests {
|
|||
printf("Powers of 10 can be parsed.\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
bool basic_test_64bit(std::string vals, double val) {
|
||||
std::cout << " parsing " << vals << std::endl;
|
||||
double std_answer;
|
||||
char *endptr;
|
||||
// We want to call strtod with the C (default) locale to avoid
|
||||
// potential issues in case someone has a different locale.
|
||||
// Unfortunately, Visual Studio has a different syntax.
|
||||
const char * cval = vals.c_str();
|
||||
#ifdef _WIN32
|
||||
static _locale_t c_locale = _create_locale(LC_ALL, "C");
|
||||
std_answer = _strtod_l(cval, &endptr, c_locale);
|
||||
#else
|
||||
static locale_t c_locale = newlocale(LC_ALL_MASK, "C", NULL);
|
||||
std_answer = strtod_l(cval, &endptr, c_locale);
|
||||
#endif
|
||||
if(endptr == cval) {
|
||||
std::cerr << "Your runtime library failed to parse " << vals << std::endl;
|
||||
}
|
||||
double actual;
|
||||
simdjson::dom::parser parser;
|
||||
auto error = parser.parse(vals).get(actual);
|
||||
if(error) {
|
||||
std::cerr << error << std::endl;
|
||||
return false;
|
||||
}
|
||||
if (actual != val) {
|
||||
std::cerr << std::hexfloat << actual << " but I was expecting " << val
|
||||
<< std::endl;
|
||||
std::cerr << "string: " << vals << std::endl;
|
||||
std::cout << std::dec;
|
||||
if(std_answer == actual) {
|
||||
std::cerr << "simdjson agrees with your runtime library, so we will accept the answer." << std::endl;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
std::cout << std::hexfloat << actual << " == " << val << std::endl;
|
||||
std::cout << std::dec;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool specific_tests() {
|
||||
std::cout << __func__ << std::endl;
|
||||
return basic_test_64bit("-2402844368454405395.2",-2402844368454405395.2) &&
|
||||
basic_test_64bit("4503599627370496.5", 4503599627370496.5) &&
|
||||
basic_test_64bit("4503599627475352.5", 4503599627475352.5) &&
|
||||
basic_test_64bit("4503599627475353.5", 4503599627475353.5) &&
|
||||
basic_test_64bit("2251799813685248.25", 2251799813685248.25) &&
|
||||
basic_test_64bit("1125899906842624.125", 1125899906842624.125) &&
|
||||
basic_test_64bit("1125899906842901.875", 1125899906842901.875) &&
|
||||
basic_test_64bit("2251799813685803.75", 2251799813685803.75) &&
|
||||
basic_test_64bit("4503599627370497.5", 4503599627370497.5) &&
|
||||
basic_test_64bit("45035996.273704995", 45035996.273704995) &&
|
||||
basic_test_64bit("45035996.273704985", 45035996.273704985) &&
|
||||
basic_test_64bit("0.000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000044501477170144022721148195934182639518696390927032912960468522194496444440421538910330590478162701758282983178260792422137401728773891892910553144148156412434867599762821265346585071045737627442980259622449029037796981144446145705102663115100318287949527959668236039986479250965780342141637013812613333119898765515451440315261253813266652951306000184917766328660755595837392240989947807556594098101021612198814605258742579179000071675999344145086087205681577915435923018910334964869420614052182892431445797605163650903606514140377217442262561590244668525767372446430075513332450079650686719491377688478005309963967709758965844137894433796621993967316936280457084866613206797017728916080020698679408551343728867675409720757232455434770912461317493580281734466552734375", 
0.000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000044501477170144022721148195934182639518696390927032912960468522194496444440421538910330590478162701758282983178260792422137401728773891892910553144148156412434867599762821265346585071045737627442980259622449029037796981144446145705102663115100318287949527959668236039986479250965780342141637013812613333119898765515451440315261253813266652951306000184917766328660755595837392240989947807556594098101021612198814605258742579179000071675999344145086087205681577915435923018910334964869420614052182892431445797605163650903606514140377217442262561590244668525767372446430075513332450079650686719491377688478005309963967709758965844137894433796621993967316936280457084866613206797017728916080020698679408551343728867675409720757232455434770912461317493580281734466552734375) &&
|
||||
basic_test_64bit("0.000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000022250738585072008890245868760858598876504231122409594654935248025624400092282356951787758888037591552642309780950434312085877387158357291821993020294379224223559819827501242041788969571311791082261043971979604000454897391938079198936081525613113376149842043271751033627391549782731594143828136275113838604094249464942286316695429105080201815926642134996606517803095075913058719846423906068637102005108723282784678843631944515866135041223479014792369585208321597621066375401613736583044193603714778355306682834535634005074073040135602968046375918583163124224521599262546494300836851861719422417646455137135420132217031370496583210154654068035397417906022589503023501937519773030945763173210852507299305089761582519159720757232455434770912461317493580281734466552734375", 
0.000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000022250738585072008890245868760858598876504231122409594654935248025624400092282356951787758888037591552642309780950434312085877387158357291821993020294379224223559819827501242041788969571311791082261043971979604000454897391938079198936081525613113376149842043271751033627391549782731594143828136275113838604094249464942286316695429105080201815926642134996606517803095075913058719846423906068637102005108723282784678843631944515866135041223479014792369585208321597621066375401613736583044193603714778355306682834535634005074073040135602968046375918583163124224521599262546494300836851861719422417646455137135420132217031370496583210154654068035397417906022589503023501937519773030945763173210852507299305089761582519159720757232455434770912461317493580281734466552734375);
|
||||
}
|
||||
|
||||
bool run() {
|
||||
return small_integers() &&
|
||||
return specific_tests() &&
|
||||
ground_truth() &&
|
||||
small_integers() &&
|
||||
powers_of_two() &&
|
||||
powers_of_ten() &&
|
||||
nines();
|
||||
|
@ -1189,7 +1288,14 @@ namespace type_tests {
|
|||
&& (expected_value >= 0 ?
|
||||
test_cast<uint64_t>(result, expected_value) :
|
||||
test_cast_error<uint64_t>(result, NUMBER_OUT_OF_RANGE))
|
||||
#ifdef TEST_FLOATS
|
||||
// We trust the underlying system to be accurate.
|
||||
&& test_cast<double>(result, static_cast<double>(expected_value))
|
||||
#else
|
||||
// We don't trust the underlying system so we only run the test_cast
|
||||
// exact test when the expected_value is within the 53-bit range.
|
||||
&& ((expected_value<-9007199254740992) || (expected_value>9007199254740992) || test_cast<double>(result, static_cast<double>(expected_value)))
|
||||
#endif
|
||||
&& test_cast_error<bool>(result, INCORRECT_TYPE)
|
||||
&& test_is_null(result, false);
|
||||
}
|
||||
|
@ -1209,6 +1315,14 @@ namespace type_tests {
|
|||
&& test_cast_error<int64_t>(result, NUMBER_OUT_OF_RANGE)
|
||||
&& test_cast<uint64_t>(result, expected_value)
|
||||
&& test_cast<double>(result, static_cast<double>(expected_value))
|
||||
#ifdef TEST_FLOATS
|
||||
// We trust the underlying system to be accurate.
|
||||
&& test_cast<double>(result, static_cast<double>(expected_value))
|
||||
#else
|
||||
// We don't trust the underlying system so we only run the test_cast
|
||||
// exact test when the expected_value is within the 53-bit range.
|
||||
&& ((expected_value>9007199254740992) || test_cast<double>(result, static_cast<double>(expected_value)))
|
||||
#endif
|
||||
&& test_cast_error<bool>(result, INCORRECT_TYPE)
|
||||
&& test_is_null(result, false);
|
||||
}
|
||||
|
@ -1409,7 +1523,7 @@ namespace minify_tests {
|
|||
auto e = simdjson::minify(bogus_json.get(), i, output_json.get(), newlength);
|
||||
if(e) {
|
||||
std::cerr << "got an error (unexpected) : " << e << std::endl;
|
||||
return false;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
|
@ -1427,7 +1541,7 @@ namespace minify_tests {
|
|||
auto e = simdjson::minify(bogus_json.get(), i, output_json.get(), newlength);
|
||||
if(e) {
|
||||
std::cerr << "got an error (unexpected) : " << e << std::endl;
|
||||
return false;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
|
@ -1899,7 +2013,7 @@ int main(int argc, char *argv[]) {
|
|||
dom_api_tests::run() &&
|
||||
type_tests::run() &&
|
||||
format_tests::run() &&
|
||||
number_tests::run()
|
||||
number_tests::run()
|
||||
) {
|
||||
std::cout << "Basic tests are ok." << std::endl;
|
||||
return EXIT_SUCCESS;
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
#include <cstring>
|
||||
#include <inttypes.h>
|
||||
#include <math.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
||||
|
||||
|
||||
#ifndef JSON_TEST_NUMBERS
|
||||
#define JSON_TEST_NUMBERS
|
||||
|
@ -22,6 +23,29 @@ void found_unsigned_integer(uint64_t result, const uint8_t *buf);
|
|||
|
||||
#include "simdjson.h"
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Some systems have bad floating-point parsing. We want to exclude them.
|
||||
*/
|
||||
#if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined (__linux__) || defined (__APPLE__) || defined(__FreeBSD__)
|
||||
// Ok. So under Visual Studio, linux, apple and freebsd systems, we have a good chance of having a decent
|
||||
// enough strtod. It is not certain, but it is probably a good enough heuristic. We exclude systems like msys2
|
||||
// or cygwin.
|
||||
//
|
||||
// Finally, we want to exclude legacy 32-bit systems.
|
||||
#ifndef SIMDJSON_IS_32BITS
|
||||
// So we only run some of the floating-point tests under 64-bit linux, apple, regular visual studio, freebsd.
|
||||
#define TEST_FLOATS
|
||||
// Apple and freebsd need a special header, typically.
|
||||
#if defined __APPLE__ || defined(__FreeBSD__)
|
||||
# include <xlocale.h>
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
// ulp distance
|
||||
// Marc B. Reynolds, 2016-2019
|
||||
// Public Domain under http://unlicense.org, see link for details.
|
||||
|
@ -73,6 +97,10 @@ bool is_in_bad_list(const char *buf) {
|
|||
return false;
|
||||
}
|
||||
|
||||
#ifndef TEST_FLOATS
|
||||
// We do not recognize the system, so we do not verify our results.
|
||||
void found_invalid_number(const uint8_t *) {}
|
||||
#else
|
||||
void found_invalid_number(const uint8_t *buf) {
|
||||
invalid_count++;
|
||||
char *endptr;
|
||||
|
@ -82,7 +110,7 @@ void found_invalid_number(const uint8_t *buf) {
|
|||
#else
|
||||
static locale_t c_locale = newlocale(LC_ALL_MASK, "C", NULL);
|
||||
double expected = strtod_l((const char *)buf, &endptr, c_locale);
|
||||
#endif
|
||||
#endif
|
||||
if (endptr != (const char *)buf) {
|
||||
if (!is_in_bad_list((const char *)buf)) {
|
||||
printf("Warning: found_invalid_number %.32s whereas strtod parses it to "
|
||||
|
@ -93,6 +121,7 @@ void found_invalid_number(const uint8_t *buf) {
|
|||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void found_integer(int64_t result, const uint8_t *buf) {
|
||||
int_count++;
|
||||
|
@ -101,7 +130,7 @@ void found_integer(int64_t result, const uint8_t *buf) {
|
|||
if ((endptr == (const char *)buf) || (expected != result)) {
|
||||
#if (!(__MINGW32__) && !(__MINGW64__))
|
||||
fprintf(stderr, "Error: parsed %" PRId64 " out of %.32s, ", result, buf);
|
||||
#else // mingw is busted since we include #include <inttypes.h>
|
||||
#else // mingw is busted since we include #include <inttypes.h> and it will still not provide PRId64
|
||||
fprintf(stderr, "Error: parsed %lld out of %.32s, ", (long long)result, buf);
|
||||
#endif
|
||||
fprintf(stderr, " while parsing %s \n", fullpath);
|
||||
|
@ -124,6 +153,10 @@ void found_unsigned_integer(uint64_t result, const uint8_t *buf) {
|
|||
}
|
||||
}
|
||||
|
||||
#ifndef TEST_FLOATS
|
||||
// We do not recognize the system, so we do not verify our results.
|
||||
void found_float(double , const uint8_t *) {}
|
||||
#else
|
||||
void found_float(double result, const uint8_t *buf) {
|
||||
char *endptr;
|
||||
float_count++;
|
||||
|
@ -133,7 +166,7 @@ void found_float(double result, const uint8_t *buf) {
|
|||
#else
|
||||
static locale_t c_locale = newlocale(LC_ALL_MASK, "C", NULL);
|
||||
double expected = strtod_l((const char *)buf, &endptr, c_locale);
|
||||
#endif
|
||||
#endif
|
||||
if (endptr == (const char *)buf) {
|
||||
fprintf(stderr,
|
||||
"parsed %f from %.32s whereas strtod refuses to parse a float, ",
|
||||
|
@ -141,7 +174,7 @@ void found_float(double result, const uint8_t *buf) {
|
|||
fprintf(stderr, " while parsing %s \n", fullpath);
|
||||
parse_error |= PARSE_ERROR;
|
||||
}
|
||||
if (fpclassify(expected) != fpclassify(result)) {
|
||||
if (std::fpclassify(expected) != std::fpclassify(result)) {
|
||||
fprintf(stderr,
|
||||
"floats not in the same category expected: %f observed: %f \n",
|
||||
expected, result);
|
||||
|
@ -158,6 +191,7 @@ void found_float(double result, const uint8_t *buf) {
|
|||
parse_error |= PARSE_ERROR;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#include "simdjson.h"
|
||||
#include "simdjson.cpp"
|
||||
|
|
|
@ -0,0 +1,197 @@
|
|||
#include <cstring>
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstdint>
|
||||
#include <random>
|
||||
#include <climits>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "simdjson.h"
|
||||
|
||||
|
||||
/**
|
||||
* Some systems have bad floating-point parsing. We want to exclude them.
|
||||
*/
|
||||
#if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined (__linux__) || defined (__APPLE__) || defined(__FreeBSD__)
|
||||
// Ok. So under Visual Studio, linux, apple and freebsd systems, we have a good chance of having a decent
|
||||
// enough strtod. It is not certain, but it is probably a good enough heuristic. We exclude systems like msys2
|
||||
// or cygwin.
|
||||
//
|
||||
// Finally, we want to exclude legacy 32-bit systems.
|
||||
#ifndef SIMDJSON_IS_32BITS
|
||||
// So we only run some of the floating-point tests under 64-bit linux, apple, regular visual studio, freebsd.
|
||||
#define TEST_FLOATS
|
||||
// Apple and freebsd need a special header, typically.
|
||||
#if defined __APPLE__ || defined(__FreeBSD__)
|
||||
# include <xlocale.h>
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/**
 * A seeded pseudo-random source bundled with the uniform integer
 * distributions used to assemble random number strings.
 *
 * All draws share the single `generator`, so two engines built from the
 * same seed and queried in the same order yield the same sequence.
 */
struct RandomEngine {
  RandomEngine() = delete;
  RandomEngine(uint32_t seed)
      : one_zero_generator(0, 1),
        digit_generator(0, 9),
        nonzero_digit_generator(1, 9),
        digit_count_generator(1, 40),
        exp_count_generator(1, 3),
        generator(seed) {}

  // Coin flip: yields 0 or 1.
  std::uniform_int_distribution<int> one_zero_generator;
  // Any decimal digit, 0 through 9.
  std::uniform_int_distribution<int> digit_generator;
  // A decimal digit that is never zero, 1 through 9.
  std::uniform_int_distribution<int> nonzero_digit_generator;

  // Number of mantissa digits, 1 through 40.
  std::uniform_int_distribution<int> digit_count_generator;
  // Number of exponent digits, 1 through 3.
  std::uniform_int_distribution<int> exp_count_generator;

  /** Draws a boolean (true when the coin flip yields 1). */
  bool next_bool() { return one_zero_generator(generator) != 0; }
  /** Draws a digit in [0, 9]. */
  int next_digit() { return digit_generator(generator); }
  /** Draws a digit in [1, 9]. */
  int next_nonzero_digit() { return nonzero_digit_generator(generator); }
  /** Draws a mantissa length in [1, 40]. */
  int next_digit_count() { return digit_count_generator(generator); }
  /** Draws an exponent length in [1, 3]. */
  int next_exp_count() { return exp_count_generator(generator); }

  // The underlying engine; exposed so callers can drive their own distributions.
  std::mt19937 generator;
};
|
||||
|
||||
/**
 * Writes a random decimal number, as a null-terminated string, into `buffer`.
 *
 * The text is made of an optional leading '-', between 1 and 40 digits with
 * at most one '.' placed between two digits, and an optional exponent
 * ('e' or 'E', an optional sign, then 1 to 3 digits whose first is nonzero).
 * The first mantissa digit may only be zero when exactly one digit precedes
 * the decimal-point position.
 *
 * @param rand   source of randomness; its state is advanced.
 * @param buffer destination; at most 47 characters plus the terminating
 *               '\0' are written, so it must hold at least 48 bytes.
 * @return the number of characters written, excluding the terminating '\0'.
 */
size_t build_random_string(RandomEngine &rand, char *buffer) {
  char *out = buffer;
  if (rand.next_bool()) {
    *out++ = '-';
  }

  // Mantissa: pick how many digits, then where the decimal point would go.
  const size_t digit_total = size_t(rand.next_digit_count());
  std::uniform_int_distribution<int> decimal_generator(1, int(digit_total));
  const size_t dot_index = size_t(decimal_generator(rand.generator));
  // A dot index equal to digit_total is never reached by the loop below,
  // in which case no decimal point is emitted at all.
  const bool leading_zero_allowed = (dot_index == 1);
  for (size_t k = 0; k < digit_total; k++) {
    if (k == dot_index) {
      *out++ = '.';
    }
    const int digit = ((k == 0) && !leading_zero_allowed)
                          ? rand.next_nonzero_digit()
                          : rand.next_digit();
    *out++ = char(digit + '0');
  }

  // Optional exponent.
  if (rand.next_bool()) {
    *out++ = rand.next_bool() ? 'e' : 'E';
    if (rand.next_bool()) {
      *out++ = '-';
    } else if (rand.next_bool()) {
      *out++ = '+';
    }
    const size_t exponent_digits = size_t(rand.next_exp_count());
    for (size_t k = 0; k < exponent_digits; k++) {
      // The exponent never starts with a zero.
      const int digit = (k == 0) ? rand.next_nonzero_digit() : rand.next_digit();
      *out++ = char(digit + '0');
    }
  }

  *out = '\0'; // null termination
  return size_t(out - buffer);
}
|
||||
|
||||
|
||||
#ifndef TEST_FLOATS
// We do not recognize the system, so we do not verify our results.
bool check_float(double, const char *) {
  return true;
}
#else
/**
 * Compares a parsed value against the system's locale-independent strtod.
 *
 * @param result the value produced by the parser under test.
 * @param buf    the null-terminated text that was parsed.
 * @return true when strtod parses `buf` and yields exactly `result`;
 *         false otherwise, after describing the mismatch on stderr.
 */
bool check_float(double result, const char *buf) {
  char *endptr;
  // The "C" locale is created once and reused so that the decimal separator
  // is always '.', whatever the process locale is.
#ifdef _WIN32
  static _locale_t c_locale = _create_locale(LC_ALL, "C");
  const double expected = _strtod_l(buf, &endptr, c_locale);
#else
  static locale_t c_locale = newlocale(LC_ALL_MASK, "C", NULL);
  const double expected = strtod_l(buf, &endptr, c_locale);
#endif
  if (endptr == buf) {
    // Print the whole input: truncating it would make the failure
    // impossible to reproduce from the log.
    fprintf(stderr,
            "parsed %f from %s whereas strtod refuses to parse a float\n",
            result, buf);
    return false;
  }
  if (expected != result) {
    fprintf(stderr, "parsed %.128e from \n", result);
    fprintf(stderr, "       %s whereas strtod gives\n", buf);
    fprintf(stderr, "       %.128e\n", expected);
    return false;
  }
  return true;
}
#endif
|
||||
|
||||
|
||||
/**
|
||||
* We generate random strings and we try to parse them,
|
||||
* and we verify that we get the same answer.
|
||||
*/
|
||||
bool tester(int seed, size_t volume) {
|
||||
char buffer[1024]; // large buffer (can't overflow)
|
||||
simdjson::dom::parser parser;
|
||||
RandomEngine rand(seed);
|
||||
double result;
|
||||
for (size_t i = 0; i < volume; i++) {
|
||||
if((i%100000) == 0) { std::cout << "."; std::cout.flush(); }
|
||||
size_t length = build_random_string(rand, buffer);
|
||||
auto error = parser.parse(buffer, length).get(result);
|
||||
// When we parse a (finite) number, it better match strtod.
|
||||
if ((!error) && (!check_float(result, buffer))) { return false; }
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
// We test 1,000,000 random strings by default.
|
||||
// You can specify more tests with the '-m' flag if you want.
|
||||
size_t howmany = 1000000;
|
||||
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "a:m:h")) != -1) {
|
||||
switch (c) {
|
||||
case 'a': {
|
||||
const simdjson::implementation *impl = simdjson::available_implementations[optarg];
|
||||
if (!impl) {
|
||||
fprintf(stderr, "Unsupported architecture value -a %s\n", optarg);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
if(!impl->supported_by_runtime_system()) {
|
||||
fprintf(stderr, "The selected implementation does not match your current CPU: -a %s\n", optarg);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
simdjson::active_implementation = impl;
|
||||
break;
|
||||
}
|
||||
case 'h': {
|
||||
std::cout << "-a to select an architecture" << std::endl;
|
||||
std::cout << "-m to select a number of tests" << std::endl;
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
case 'm': {
|
||||
long long requested_howmany = atoll(optarg);
|
||||
if(requested_howmany <= 0) {
|
||||
fprintf(stderr, "Please provide a positive number of tests -m %s no larger than %lld \n", optarg, LLONG_MAX);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
howmany = size_t(requested_howmany);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
fprintf(stderr, "Unexpected argument %c\n", c);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
if (tester(1234344, howmany)) {
|
||||
std::cout << "All tests ok." << std::endl;
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
std::cout << "Failure." << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
Loading…
Reference in New Issue