More accurate number parsing (#217)

* This drastically improves the accuracy (down to a ULP of 1)

* More comments and documentation.
This commit is contained in:
Daniel Lemire 2019-07-15 22:17:49 -04:00 committed by GitHub
parent 6c168f046d
commit e926b4b3c9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 88 additions and 34 deletions

View File

@ -333,6 +333,7 @@ _We do not aim to provide a general-purpose JSON library._ A library like RapidJ
- The input string is unmodified. (Parsers like sajson and RapidJSON use the input string as a buffer.) - The input string is unmodified. (Parsers like sajson and RapidJSON use the input string as a buffer.)
- We parse integers and floating-point numbers as separate types which allows us to support large 64-bit integers in [-9223372036854775808,9223372036854775808), like a Java `long` or a C/C++ `long long`. Among the parsers that differentiate between integers and floating-point numbers, not all support 64-bit integers. (For example, sajson rejects JSON files with integers larger than or equal to 2147483648. RapidJSON will parse a file containing an overly long integer like 18446744073709551616 as a floating-point number.) When we cannot represent exactly an integer as a signed 64-bit value, we reject the JSON document. - We parse integers and floating-point numbers as separate types which allows us to support large 64-bit integers in [-9223372036854775808,9223372036854775808), like a Java `long` or a C/C++ `long long`. Among the parsers that differentiate between integers and floating-point numbers, not all support 64-bit integers. (For example, sajson rejects JSON files with integers larger than or equal to 2147483648. RapidJSON will parse a file containing an overly long integer like 18446744073709551616 as a floating-point number.) When we cannot represent exactly an integer as a signed 64-bit value, we reject the JSON document.
- We support the full range of 64-bit floating-point numbers (binary64). The values range from `std::numeric_limits<double>::lowest()` to `std::numeric_limits<double>::max()`, so from -1.7976e308 all the way to 1.7976e308. Extreme values (less or equal to -1e308, greater or equal to 1e308) are rejected: we refuse to parse the input document. - We support the full range of 64-bit floating-point numbers (binary64). The values range from `std::numeric_limits<double>::lowest()` to `std::numeric_limits<double>::max()`, so from -1.7976e308 all the way to 1.7976e308. Extreme values (less or equal to -1e308, greater or equal to 1e308) are rejected: we refuse to parse the input document.
- We aim for accurate float parsing with a bound on the [unit of least precision (ULP)](https://en.wikipedia.org/wiki/Unit_in_the_last_place) of one.
- We do full UTF-8 validation as part of the parsing. (Parsers like fastjson, gason and dropbox json11 do not do UTF-8 validation.) - We do full UTF-8 validation as part of the parsing. (Parsers like fastjson, gason and dropbox json11 do not do UTF-8 validation.)
- We fully validate the numbers. (Parsers like gason and ultrajson will accept `[0e+]` as valid JSON.) - We fully validate the numbers. (Parsers like gason and ultrajson will accept `[0e+]` as valid JSON.)
- We validate string content for unescaped characters. (Parsers like fastjson and ultrajson accept unescaped line breaks and tabs in strings.) - We validate string content for unescaped characters. (Parsers like fastjson and ultrajson accept unescaped line breaks and tabs in strings.)

View File

@ -201,7 +201,7 @@ parse_float(const uint8_t *const buf,
++p; ++p;
negative = true; negative = true;
} }
double i; long double i;
if (*p == '0') { // 0 cannot be followed by an integer if (*p == '0') { // 0 cannot be followed by an integer
++p; ++p;
i = 0; i = 0;
@ -217,12 +217,13 @@ parse_float(const uint8_t *const buf,
} }
if ('.' == *p) { if ('.' == *p) {
++p; ++p;
double fractionalweight = 1; int fractionalweight = 308;
if(is_integer(*p)) { if(is_integer(*p)) {
unsigned char digit = *p - '0'; unsigned char digit = *p - '0';
++p; ++p;
fractionalweight *= 0.1;
i = i + digit * fractionalweight; fractionalweight --;
i = i + digit * (fractionalweight >= 0 ? power_of_ten[fractionalweight] : 0);
} else { } else {
#ifdef JSON_TEST_NUMBERS // for unit testing #ifdef JSON_TEST_NUMBERS // for unit testing
foundInvalidNumber(buf + offset); foundInvalidNumber(buf + offset);
@ -232,8 +233,8 @@ parse_float(const uint8_t *const buf,
while (is_integer(*p)) { while (is_integer(*p)) {
unsigned char digit = *p - '0'; unsigned char digit = *p - '0';
++p; ++p;
fractionalweight *= 0.1; fractionalweight --;
i = i + digit * fractionalweight; i = i + digit * (fractionalweight >= 0 ? power_of_ten[fractionalweight] : 0);
} }
} }
if (('e' == *p) || ('E' == *p)) { if (('e' == *p) || ('E' == *p)) {
@ -388,6 +389,7 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
// is made of a single number), then it is necessary to copy the content and append // is made of a single number), then it is necessary to copy the content and append
// a space before calling this function. // a space before calling this function.
// //
// Our objective is accurate parsing (ULP of 0 or 1) at high speed.
static really_inline bool parse_number(const uint8_t *const buf, static really_inline bool parse_number(const uint8_t *const buf,
ParsedJson &pj, ParsedJson &pj,
const uint32_t offset, const uint32_t offset,
@ -434,20 +436,26 @@ static really_inline bool parse_number(const uint8_t *const buf,
// we rarely see large integer parts like 123456789 // we rarely see large integer parts like 123456789
while (is_integer(*p)) { while (is_integer(*p)) {
digit = *p - '0'; digit = *p - '0';
i = 10 * i + digit; // might overflow // a multiplication by 10 is cheaper than an arbitrary integer multiplication
i = 10 * i + digit; // might overflow, we will handle the overflow later
++p; ++p;
} }
} }
int64_t exponent = 0; int64_t exponent = 0;
bool is_float = false; bool is_float = false;
if ('.' == *p) { if ('.' == *p) {
is_float = true; is_float = true; // At this point we know that we have a float
// we continue with the fiction that we have an integer. If the
// floating point number is representable as x * 10^z for some integer
// x that fits in 53 bits, then we will be able to convert the
// integer back into a float in a lossless manner.
++p; ++p;
const char *const firstafterperiod = p; const char *const firstafterperiod = p;
if(is_integer(*p)) { if(is_integer(*p)) {
unsigned char digit = *p - '0'; unsigned char digit = *p - '0';
++p; ++p;
i = i * 10 + digit; i = i * 10 + digit; // might overflow + multiplication by 10 is likely cheaper than arbitrary mult.
// we will handle the overflow later
} else { } else {
#ifdef JSON_TEST_NUMBERS // for unit testing #ifdef JSON_TEST_NUMBERS // for unit testing
foundInvalidNumber(buf + offset); foundInvalidNumber(buf + offset);
@ -469,7 +477,7 @@ static really_inline bool parse_number(const uint8_t *const buf,
} }
exponent = firstafterperiod - p; exponent = firstafterperiod - p;
} }
int digitcount = p - startdigits - 1; int digitcount = p - startdigits - 1; // used later to guard against overflows
int64_t expnumber = 0; // exponential part int64_t expnumber = 0; // exponential part
if (('e' == *p) || ('E' == *p)) { if (('e' == *p) || ('E' == *p)) {
is_float = true; is_float = true;
@ -510,39 +518,55 @@ static really_inline bool parse_number(const uint8_t *const buf,
exponent += (negexp ? -expnumber : expnumber); exponent += (negexp ? -expnumber : expnumber);
} }
if (is_float) { if (is_float) {
if (unlikely(digitcount >= 19)) { // this is uncommon!!! uint64_t powerindex = 308 + exponent;
if (unlikely((digitcount >= 19))) { // this is uncommon
// It is possible that the integer had an overflow.
// We have to handle the case where we have 0.0000somenumber.
const char * start = startdigits;
while((*start == '0') || (*start == '.')) {
start++;
}
digitcount -= (start - startdigits);
if(digitcount >= 19) {
// Ok, chances are good that we had an overflow!
// this is almost never going to get called!!!
// we start anew, going slowly!!!
return parse_float(buf, pj, offset,
found_minus);
}
}
if (unlikely((powerindex > 2 * 308))) { // this is uncommon!!!
// this is almost never going to get called!!! // this is almost never going to get called!!!
// we start anew, going slowly!!! // we start anew, going slowly!!!
return parse_float(buf, pj, offset, return parse_float(buf, pj, offset,
found_minus); found_minus);
} }
/////////// double factor = power_of_ten[powerindex];
// We want 0.1e1 to be a float. factor = negative ? -factor : factor;
////////// if(i <= UINT64_C(0x1fffffffffffff)) {
if (i == 0) { // we can convert i to a double safely (losslessly) so the
pj.write_tape_double(0.0); // following should have good performance.
#ifdef JSON_TEST_NUMBERS // for unit testing double d = i * factor;
foundFloat(0.0, buf + offset);
#endif
} else {
double d = i;
d = negative ? -d : d;
uint64_t powerindex = 308 + exponent;
if(likely(powerindex <= 2 * 308)) {
// common case
d *= power_of_ten[powerindex];
} else {
// this is uncommon so let us move this special case out
// of the main loop
return parse_float(buf, pj, offset,found_minus);
}
pj.write_tape_double(d); pj.write_tape_double(d);
#ifdef JSON_TEST_NUMBERS // for unit testing #ifdef JSON_TEST_NUMBERS // for unit testing
foundFloat(d, buf + offset); foundFloat(d, buf + offset);
#endif #endif
} } else {//if(i=< UINT64_C(0x1fffffffffffff))
// we cannot convert the number in a lossless manner.
// we have to do it in two steps.
double d1 = (double)(uint32_t)i;
double d2 = (double)(uint32_t)(i>>32);
double d = d1 * factor + d2 * factor * 4294967296;
pj.write_tape_double(d);
#ifdef JSON_TEST_NUMBERS // for unit testing
foundFloat(d, buf + offset);
#endif
}//if(i=< UINT64_C(0x1fffffffffffff))
} else { } else {
if (unlikely(digitcount >= 18)) { // this is uncommon!!! if (unlikely(digitcount >= 18)) { // this is uncommon!!!
// there is a good chance that we had an overflow, so we need
// to recover: we parse the whole thing again.
return parse_large_integer(buf, pj, offset, return parse_large_integer(buf, pj, offset,
found_minus); found_minus);
} }

View File

@ -13,6 +13,33 @@
#include "simdjson/common_defs.h" #include "simdjson/common_defs.h"
// ULP distance between two binary32 values.
// Marc B. Reynolds, 2016-2019
// Public Domain under http://unlicense.org, see link for details.
// adapted by D. Lemire
//
// Returns how many representable floats separate `a` from `b`, computed
// from their raw bit patterns (copied out with memcpy to avoid aliasing
// issues). +0.0f and -0.0f are at distance 0 from each other.
// No special handling of NaN or infinity is performed: their bit patterns
// are treated like any other value.
static inline uint32_t f32_ulp_dist(float a, float b) {
  const uint32_t sign_bit = (uint32_t)1 << 31;
  uint32_t ua, ub;
  memcpy(&ua, &a, sizeof(ua));
  memcpy(&ub, &b, sizeof(ub));
  if (((ua ^ ub) & sign_bit) == 0) {
    // Same sign: the bit patterns are ordered like the magnitudes, so the
    // distance is simply the absolute difference of the two patterns.
    return (ua >= ub) ? (ua - ub) : (ub - ua);
  }
  // Opposite signs: the distance is the sum of both magnitudes. Exactly one
  // operand carries the sign bit; adding the sign bit once more clears it
  // through unsigned wrap-around, leaving magnitude(a) + magnitude(b).
  return ua + ub + sign_bit;
}
// ULP distance between two binary64 values.
// Marc B. Reynolds, 2016-2019
// Public Domain under http://unlicense.org, see link for details.
// adapted by D. Lemire
//
// Returns how many representable doubles separate `a` from `b`, computed
// from their raw bit patterns (copied out with memcpy to avoid aliasing
// issues). +0.0 and -0.0 are at distance 0 from each other.
// No special handling of NaN or infinity is performed: their bit patterns
// are treated like any other value.
static inline uint64_t f64_ulp_dist(double a, double b) {
  // The sign bit of a 64-bit pattern is bit 63; the 32-bit constant
  // 0x80000000 must not be used here.
  const uint64_t sign_bit = (uint64_t)1 << 63;
  uint64_t ua, ub;
  memcpy(&ua, &a, sizeof(ua));
  memcpy(&ub, &b, sizeof(ub));
  if (((ua ^ ub) & sign_bit) == 0) {
    // Same sign: the bit patterns are ordered like the magnitudes, so the
    // distance is simply the absolute difference of the two patterns.
    return (ua >= ub) ? (ua - ub) : (ub - ua);
  }
  // Opposite signs: the distance is the sum of both magnitudes. Exactly one
  // operand carries the sign bit; adding the sign bit once more clears it
  // through unsigned wrap-around, leaving magnitude(a) + magnitude(b).
  return ua + ub + sign_bit;
}
int parse_error; int parse_error;
char *fullpath; char *fullpath;
enum { PARSE_WARNING, PARSE_ERROR }; enum { PARSE_WARNING, PARSE_ERROR };
@ -81,14 +108,16 @@ void foundFloat(double result, const uint8_t *buf) {
expected, result); expected, result);
fprintf(stderr, "%.32s\n", buf); fprintf(stderr, "%.32s\n", buf);
parse_error |= PARSE_ERROR; parse_error |= PARSE_ERROR;
return;
} }
// we want to get some reasonable relative accuracy // we want to get some reasonable relative accuracy
else if (fabs(expected - result) > uint64_t ULP = f64_ulp_dist(expected,result);
1e-14 * fmin(fabs(expected), fabs(result))) { if (f64_ulp_dist(expected,result) > 1) {
fprintf(stderr, "parsed %.128e from \n", result); fprintf(stderr, "parsed %.128e from \n", result);
fprintf(stderr, " %.32s whereas strtod gives\n", buf); fprintf(stderr, " %.32s whereas strtod gives\n", buf);
fprintf(stderr, " %.128e,", expected); fprintf(stderr, " %.128e,", expected);
fprintf(stderr, " while parsing %s \n", fullpath); fprintf(stderr, " while parsing %s \n", fullpath);
fprintf(stderr, " =========== ULP: %u,", (unsigned int)ULP);
parse_error |= PARSE_ERROR; parse_error |= PARSE_ERROR;
} }
} }