More accurate number parsing (#217)

* This drastically improves the accuracy (down to a ULP of 1)

* More comments and documentation.
This commit is contained in:
Daniel Lemire 2019-07-15 22:17:49 -04:00 committed by GitHub
parent 6c168f046d
commit e926b4b3c9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 88 additions and 34 deletions

View File

@ -333,6 +333,7 @@ _We do not aim to provide a general-purpose JSON library._ A library like RapidJ
- The input string is unmodified. (Parsers like sajson and RapidJSON use the input string as a buffer.)
- We parse integers and floating-point numbers as separate types which allows us to support large 64-bit integers in [-9223372036854775808,9223372036854775808), like a Java `long` or a C/C++ `long long`. Among the parsers that differentiate between integers and floating-point numbers, not all support 64-bit integers. (For example, sajson rejects JSON files with integers larger than or equal to 2147483648. RapidJSON will parse a file containing an overly long integer like 18446744073709551616 as a floating-point number.) When we cannot represent exactly an integer as a signed 64-bit value, we reject the JSON document.
- We support the full range of 64-bit floating-point numbers (binary64). The values range from `std::numeric_limits<double>::lowest()` to `std::numeric_limits<double>::max()`, so from -1.7976e308 all the way to 1.7976e308. Extreme values (less than or equal to -1e308, greater than or equal to 1e308) are rejected: we refuse to parse the input document.
- We aim for accurate float parsing with a bound on the [unit of least precision (ULP)](https://en.wikipedia.org/wiki/Unit_in_the_last_place) of one.
- We do full UTF-8 validation as part of the parsing. (Parsers like fastjson, gason and dropbox json11 do not do UTF-8 validation.)
- We fully validate the numbers. (Parsers like gason and ultrajson will accept `[0e+]` as valid JSON.)
- We validate string content for unescaped characters. (Parsers like fastjson and ultrajson accept unescaped line breaks and tabs in strings.)

View File

@ -201,7 +201,7 @@ parse_float(const uint8_t *const buf,
++p;
negative = true;
}
double i;
long double i;
if (*p == '0') { // 0 cannot be followed by an integer
++p;
i = 0;
@ -217,12 +217,13 @@ parse_float(const uint8_t *const buf,
}
if ('.' == *p) {
++p;
double fractionalweight = 1;
int fractionalweight = 308;
if(is_integer(*p)) {
unsigned char digit = *p - '0';
++p;
fractionalweight *= 0.1;
i = i + digit * fractionalweight;
fractionalweight --;
i = i + digit * (fractionalweight >= 0 ? power_of_ten[fractionalweight] : 0);
} else {
#ifdef JSON_TEST_NUMBERS // for unit testing
foundInvalidNumber(buf + offset);
@ -232,8 +233,8 @@ parse_float(const uint8_t *const buf,
while (is_integer(*p)) {
unsigned char digit = *p - '0';
++p;
fractionalweight *= 0.1;
i = i + digit * fractionalweight;
fractionalweight --;
i = i + digit * (fractionalweight >= 0 ? power_of_ten[fractionalweight] : 0);
}
}
if (('e' == *p) || ('E' == *p)) {
@ -388,6 +389,7 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
// is made of a single number), then it is necessary to copy the content and append
// a space before calling this function.
//
// Our objective is accurate parsing (ULP of 0 or 1) at high speed.
static really_inline bool parse_number(const uint8_t *const buf,
ParsedJson &pj,
const uint32_t offset,
@ -434,20 +436,26 @@ static really_inline bool parse_number(const uint8_t *const buf,
// we rarely see large integer parts like 123456789
while (is_integer(*p)) {
digit = *p - '0';
i = 10 * i + digit; // might overflow
// a multiplication by 10 is cheaper than an arbitrary integer multiplication
i = 10 * i + digit; // might overflow, we will handle the overflow later
++p;
}
}
int64_t exponent = 0;
bool is_float = false;
if ('.' == *p) {
is_float = true;
is_float = true; // At this point we know that we have a float
// we continue with the fiction that we have an integer. If the
// floating point number is representable as x * 10^z for some integers
// x and z, where x fits in 53 bits, then we will be able to convert
// the integer back into a float in a lossless manner.
++p;
const char *const firstafterperiod = p;
if(is_integer(*p)) {
unsigned char digit = *p - '0';
++p;
i = i * 10 + digit;
i = i * 10 + digit; // might overflow + multiplication by 10 is likely cheaper than arbitrary mult.
// we will handle the overflow later
} else {
#ifdef JSON_TEST_NUMBERS // for unit testing
foundInvalidNumber(buf + offset);
@ -469,7 +477,7 @@ static really_inline bool parse_number(const uint8_t *const buf,
}
exponent = firstafterperiod - p;
}
int digitcount = p - startdigits - 1;
int digitcount = p - startdigits - 1; // used later to guard against overflows
int64_t expnumber = 0; // exponential part
if (('e' == *p) || ('E' == *p)) {
is_float = true;
@ -510,39 +518,55 @@ static really_inline bool parse_number(const uint8_t *const buf,
exponent += (negexp ? -expnumber : expnumber);
}
if (is_float) {
if (unlikely(digitcount >= 19)) { // this is uncommon!!!
uint64_t powerindex = 308 + exponent;
if (unlikely((digitcount >= 19))) { // this is uncommon
// It is possible that the integer had an overflow.
// We have to handle the case where we have 0.0000somenumber.
const char * start = startdigits;
while((*start == '0') || (*start == '.')) {
start++;
}
digitcount -= (start - startdigits);
if(digitcount >= 19) {
// Ok, chances are good that we had an overflow!
// this is almost never going to get called!!!
// we start anew, going slowly!!!
return parse_float(buf, pj, offset,
found_minus);
}
}
if (unlikely((powerindex > 2 * 308))) { // this is uncommon!!!
// this is almost never going to get called!!!
// we start anew, going slowly!!!
return parse_float(buf, pj, offset,
found_minus);
}
///////////
// We want 0.1e1 to be a float.
//////////
if (i == 0) {
pj.write_tape_double(0.0);
#ifdef JSON_TEST_NUMBERS // for unit testing
foundFloat(0.0, buf + offset);
#endif
} else {
double d = i;
d = negative ? -d : d;
uint64_t powerindex = 308 + exponent;
if(likely(powerindex <= 2 * 308)) {
// common case
d *= power_of_ten[powerindex];
} else {
// this is uncommon so let us move this special case out
// of the main loop
return parse_float(buf, pj, offset,found_minus);
}
double factor = power_of_ten[powerindex];
factor = negative ? -factor : factor;
if(i <= UINT64_C(0x1fffffffffffff)) {
// we can convert i to a double safely (losslessly) so the
// following should have good performance.
double d = i * factor;
pj.write_tape_double(d);
#ifdef JSON_TEST_NUMBERS // for unit testing
foundFloat(d, buf + offset);
#endif
}
} else {//if(i=< UINT64_C(0x1fffffffffffff))
// we cannot convert the number in a lossless manner.
// we have to do it in two steps.
double d1 = (double)(uint32_t)i;
double d2 = (double)(uint32_t)(i>>32);
double d = d1 * factor + d2 * factor * 4294967296;
pj.write_tape_double(d);
#ifdef JSON_TEST_NUMBERS // for unit testing
foundFloat(d, buf + offset);
#endif
}//if(i=< UINT64_C(0x1fffffffffffff))
} else {
if (unlikely(digitcount >= 18)) { // this is uncommon!!!
// there is a good chance that we had an overflow, so we
// need to recover: we parse the whole thing again.
return parse_large_integer(buf, pj, offset,
found_minus);
}

View File

@ -13,6 +13,33 @@
#include "simdjson/common_defs.h"
// ULP distance between two single-precision floats.
// Marc B. Reynolds, 2016-2019
// Public Domain under http://unlicense.org, see link for details.
// adapted by D. Lemire
//
// Both arguments are reinterpreted as their raw 32-bit patterns. When the
// sign bits agree, the result is the absolute difference of the patterns.
// When they differ, the result is the sum of the two magnitudes (the
// pattern with its sign bit cleared), computed modulo 2^32.
inline uint32_t f32_ulp_dist(float a, float b) {
  const uint32_t sign_bit = UINT32_C(0x80000000);
  uint32_t bits_a;
  uint32_t bits_b;
  memcpy(&bits_a, &a, sizeof(bits_a));
  memcpy(&bits_b, &b, sizeof(bits_b));

  if (((bits_a ^ bits_b) & sign_bit) != 0) {
    // Opposite signs: adding the sign bit cancels (mod 2^32) the single
    // sign bit present in the sum, leaving magnitude(a) + magnitude(b).
    return bits_a + bits_b + sign_bit;
  }

  // Same sign: the patterns are ordered like the magnitudes, so the
  // distance is simply the larger pattern minus the smaller one.
  if (bits_a >= bits_b) {
    return bits_a - bits_b;
  }
  return bits_b - bits_a;
}
// ULP distance between two double-precision floats.
// Marc B. Reynolds, 2016-2019
// Public Domain under http://unlicense.org, see link for details.
// adapted by D. Lemire
//
// Both arguments are reinterpreted as their raw 64-bit patterns.
//  - Same sign: returns the absolute difference of the two patterns.
//  - Opposite signs: returns the sum of the two magnitudes (each pattern
//    with its sign bit cleared), computed modulo 2^64. In particular the
//    distance between +0.0 and -0.0 is 0.
// No special handling of NaN or infinity is performed; they are treated
// like any other bit pattern.
static inline uint64_t f64_ulp_dist(double a, double b) {
  uint64_t ua, ub;
  memcpy(&ua, &a, sizeof(ua));
  memcpy(&ub, &b, sizeof(ub));
  // The XOR has its top bit clear exactly when both signs agree.
  if ((int64_t)(ub ^ ua) >= 0)
    return (int64_t)(ua - ub) >= 0 ? (ua - ub) : (ub - ua);
  // Exactly one operand carries the sign bit (2^63). Adding 2^63 again
  // makes it wrap away modulo 2^64, leaving magnitude(a) + magnitude(b).
  // The constant must be the 64-bit sign bit, not the 32-bit 0x80000000.
  return ua + ub + UINT64_C(0x8000000000000000);
}
int parse_error;
char *fullpath;
enum { PARSE_WARNING, PARSE_ERROR };
@ -81,14 +108,16 @@ void foundFloat(double result, const uint8_t *buf) {
expected, result);
fprintf(stderr, "%.32s\n", buf);
parse_error |= PARSE_ERROR;
return;
}
// we want to get some reasonable relative accuracy
else if (fabs(expected - result) >
1e-14 * fmin(fabs(expected), fabs(result))) {
uint64_t ULP = f64_ulp_dist(expected,result);
if (f64_ulp_dist(expected,result) > 1) {
fprintf(stderr, "parsed %.128e from \n", result);
fprintf(stderr, " %.32s whereas strtod gives\n", buf);
fprintf(stderr, " %.128e,", expected);
fprintf(stderr, " while parsing %s \n", fullpath);
fprintf(stderr, " =========== ULP: %u,", (unsigned int)ULP);
parse_error |= PARSE_ERROR;
}
}