From e926b4b3c9b4af11bc126c49b4fefb2567cdbc9c Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Mon, 15 Jul 2019 22:17:49 -0400 Subject: [PATCH] More accurate number parsing (#217) * This drastically improves the accuracy (down to a ULP of 1) * More comments and documentation. --- README.md | 1 + include/simdjson/numberparsing.h | 88 ++++++++++++++++++++------------ tests/numberparsingcheck.cpp | 33 +++++++++++- 3 files changed, 88 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index b7e902b4..e7472343 100644 --- a/README.md +++ b/README.md @@ -333,6 +333,7 @@ _We do not aim to provide a general-purpose JSON library._ A library like RapidJ - The input string is unmodified. (Parsers like sajson and RapidJSON use the input string as a buffer.) - We parse integers and floating-point numbers as separate types which allows us to support large 64-bit integers in [-9223372036854775808,9223372036854775808), like a Java `long` or a C/C++ `long long`. Among the parsers that differentiate between integers and floating-point numbers, not all support 64-bit integers. (For example, sajson rejects JSON files with integers larger than or equal to 2147483648. RapidJSON will parse a file containing an overly long integer like 18446744073709551616 as a floating-point number.) When we cannot represent exactly an integer as a signed 64-bit value, we reject the JSON document. - We support the full range of 64-bit floating-point numbers (binary64). The values range from ` std::numeric_limits<double>::lowest()` to `std::numeric_limits<double>::max()`, so from -1.7976e308 all the way to 1.7975e308. Extreme values (less or equal to -1e308, greater or equal to 1e308) are rejected: we refuse to parse the input document. +- We aim for accurate float parsing with a bound on the [unit of least precision (ULP)](https://en.wikipedia.org/wiki/Unit_in_the_last_place) of one. - We do full UTF-8 validation as part of the parsing. 
(Parsers like fastjson, gason and dropbox json11 do not do UTF-8 validation.) - We fully validate the numbers. (Parsers like gason and ultranjson will accept `[0e+]` as valid JSON.) - We validate string content for unescaped characters. (Parsers like fastjson and ultrajson accept unescaped line breaks and tabs in strings.) diff --git a/include/simdjson/numberparsing.h b/include/simdjson/numberparsing.h index 280988e6..b29497fd 100644 --- a/include/simdjson/numberparsing.h +++ b/include/simdjson/numberparsing.h @@ -201,7 +201,7 @@ parse_float(const uint8_t *const buf, ++p; negative = true; } - double i; + long double i; if (*p == '0') { // 0 cannot be followed by an integer ++p; i = 0; @@ -217,12 +217,13 @@ parse_float(const uint8_t *const buf, } if ('.' == *p) { ++p; - double fractionalweight = 1; + int fractionalweight = 308; if(is_integer(*p)) { unsigned char digit = *p - '0'; ++p; - fractionalweight *= 0.1; - i = i + digit * fractionalweight; + + fractionalweight --; + i = i + digit * (fractionalweight >= 0 ? power_of_ten[fractionalweight] : 0); } else { #ifdef JSON_TEST_NUMBERS // for unit testing foundInvalidNumber(buf + offset); @@ -232,8 +233,8 @@ parse_float(const uint8_t *const buf, while (is_integer(*p)) { unsigned char digit = *p - '0'; ++p; - fractionalweight *= 0.1; - i = i + digit * fractionalweight; + fractionalweight --; + i = i + digit * (fractionalweight >= 0 ? power_of_ten[fractionalweight] : 0); } } if (('e' == *p) || ('E' == *p)) { @@ -388,6 +389,7 @@ static never_inline bool parse_large_integer(const uint8_t *const buf, // is made of a single number), then it is necessary to copy the content and append // a space before calling this function. // +// Our objective is accurate parsing (ULP of 0 or 1) at high speed. 
static really_inline bool parse_number(const uint8_t *const buf, ParsedJson &pj, const uint32_t offset, @@ -434,20 +436,26 @@ static really_inline bool parse_number(const uint8_t *const buf, // we rarely see large integer parts like 123456789 while (is_integer(*p)) { digit = *p - '0'; - i = 10 * i + digit; // might overflow + // a multiplication by 10 is cheaper than an arbitrary integer multiplication + i = 10 * i + digit; // might overflow, we will handle the overflow later ++p; } } int64_t exponent = 0; bool is_float = false; if ('.' == *p) { - is_float = true; + is_float = true; // At this point we know that we have a float + // we continue with the fiction that we have an integer. If the + // floating point number is representable as x * 10^z for some integer + // z that fits in 53 bits, then we will be able to convert back the + // the integer into a float in a lossless manner. ++p; const char *const firstafterperiod = p; if(is_integer(*p)) { unsigned char digit = *p - '0'; ++p; - i = i * 10 + digit; + i = i * 10 + digit; // might overflow + multiplication by 10 is likely cheaper than arbitrary mult. + // we will handle the overflow later } else { #ifdef JSON_TEST_NUMBERS // for unit testing foundInvalidNumber(buf + offset); @@ -469,7 +477,7 @@ static really_inline bool parse_number(const uint8_t *const buf, } exponent = firstafterperiod - p; } - int digitcount = p - startdigits - 1; + int digitcount = p - startdigits - 1; // used later to guard against overflows int64_t expnumber = 0; // exponential part if (('e' == *p) || ('E' == *p)) { is_float = true; @@ -510,39 +518,55 @@ static really_inline bool parse_number(const uint8_t *const buf, exponent += (negexp ? -expnumber : expnumber); } if (is_float) { - if (unlikely(digitcount >= 19)) { // this is uncommon!!! + uint64_t powerindex = 308 + exponent; + if (unlikely((digitcount >= 19))) { // this is uncommon + // It is possible that the integer had an overflow. 
+ // We have to handle the case where we have 0.0000somenumber. + const char * start = startdigits; + while((*start == '0') || (*start == '.')) { + start++; + } + digitcount -= (start - startdigits); + if(digitcount >= 19) { + // Ok, chances are good that we had an overflow! + // this is almost never going to get called!!! + // we start anew, going slowly!!! + return parse_float(buf, pj, offset, + found_minus); + + } + } + if (unlikely((powerindex > 2 * 308))) { // this is uncommon!!! // this is almost never going to get called!!! // we start anew, going slowly!!! return parse_float(buf, pj, offset, found_minus); } - /////////// - // We want 0.1e1 to be a float. - ////////// - if (i == 0) { - pj.write_tape_double(0.0); -#ifdef JSON_TEST_NUMBERS // for unit testing - foundFloat(0.0, buf + offset); -#endif - } else { - double d = i; - d = negative ? -d : d; - uint64_t powerindex = 308 + exponent; - if(likely(powerindex <= 2 * 308)) { - // common case - d *= power_of_ten[powerindex]; - } else { - // this is uncommon so let us move this special case out - // of the main loop - return parse_float(buf, pj, offset,found_minus); - } + double factor = power_of_ten[powerindex]; + factor = negative ? -factor : factor; + if(i <= UINT64_C(0x1fffffffffffff)) { + // we can convert i to a double safely (losslessly) so the + // following should have good performance. + double d = i * factor; pj.write_tape_double(d); #ifdef JSON_TEST_NUMBERS // for unit testing foundFloat(d, buf + offset); #endif - } + } else {//if(i=< UINT64_C(0x1fffffffffffff)) + // we cannot convert the number in a lossless manner. + // we have to do it in two steps. + double d1 = (double)(uint32_t)i; + double d2 = (double)(uint32_t)(i>>32); + double d = d1 * factor + d2 * factor * 4294967296; + pj.write_tape_double(d); +#ifdef JSON_TEST_NUMBERS // for unit testing + foundFloat(d, buf + offset); +#endif + }//if(i=< UINT64_C(0x1fffffffffffff)) } else { if (unlikely(digitcount >= 18)) { // this is uncommon!!! 
+ // there is a good chance that we had an overflow, so we + // need to recover: we parse the whole thing again. return parse_large_integer(buf, pj, offset, found_minus); } diff --git a/tests/numberparsingcheck.cpp b/tests/numberparsingcheck.cpp index 04ae513d..94089daf 100644 --- a/tests/numberparsingcheck.cpp +++ b/tests/numberparsingcheck.cpp @@ -13,6 +13,33 @@ #include "simdjson/common_defs.h" + +// ULP distance between two floats (binary32), computed on their bit patterns; +0.0f and -0.0f are at distance 0 +// Marc B. Reynolds, 2016-2019 +// Public Domain under http://unlicense.org, see link for details. +// adapted by D. Lemire +inline uint32_t f32_ulp_dist(float a, float b) { + uint32_t ua, ub; + memcpy(&ua, &a, sizeof(ua)); + memcpy(&ub, &b, sizeof(ub)); + if ((int32_t)(ub^ua) >= 0) + return (int32_t)(ua-ub) >= 0 ? (ua-ub) : (ub-ua); + return ua+ub+0x80000000; +} + +// ULP distance between two doubles (binary64), computed on their bit patterns; +0.0 and -0.0 are at distance 0 +// Marc B. Reynolds, 2016-2019 +// Public Domain under http://unlicense.org, see link for details. +// adapted by D. Lemire (opposite-sign case must cancel the 64-bit sign bit, not the 32-bit one) +inline uint64_t f64_ulp_dist(double a, double b) { + uint64_t ua, ub; + memcpy(&ua, &a, sizeof(ua)); + memcpy(&ub, &b, sizeof(ub)); + if ((int64_t)(ub^ua) >= 0) + return (int64_t)(ua-ub) >= 0 ? (ua-ub) : (ub-ua); + return ua+ub+UINT64_C(0x8000000000000000); +} + int parse_error; char *fullpath; enum { PARSE_WARNING, PARSE_ERROR }; @@ -81,14 +108,16 @@ void foundFloat(double result, const uint8_t *buf) { expected, result); fprintf(stderr, "%.32s\n", buf); parse_error |= PARSE_ERROR; + return; } // we want to get some reasonable relative accuracy - else if (fabs(expected - result) > - 1e-14 * fmin(fabs(expected), fabs(result))) { + uint64_t ULP = f64_ulp_dist(expected,result); + if (ULP > 1) { fprintf(stderr, "parsed %.128e from \n", result); fprintf(stderr, " %.32s whereas strtod gives\n", buf); fprintf(stderr, " %.128e,", expected); fprintf(stderr, " while parsing %s \n", fullpath); + fprintf(stderr, " =========== ULP: %llu,", (unsigned long long)ULP); parse_error |= PARSE_ERROR; } }