More accurate number parsing (#217)
* This drastically improves the accuracy (down to a ULP of 1) * More comments and documentation.
This commit is contained in:
parent
6c168f046d
commit
e926b4b3c9
|
@ -333,6 +333,7 @@ _We do not aim to provide a general-purpose JSON library._ A library like RapidJ
|
||||||
- The input string is unmodified. (Parsers like sajson and RapidJSON use the input string as a buffer.)
|
- The input string is unmodified. (Parsers like sajson and RapidJSON use the input string as a buffer.)
|
||||||
- We parse integers and floating-point numbers as separate types which allows us to support large 64-bit integers in [-9223372036854775808,9223372036854775808), like a Java `long` or a C/C++ `long long`. Among the parsers that differentiate between integers and floating-point numbers, not all support 64-bit integers. (For example, sajson rejects JSON files with integers larger than or equal to 2147483648. RapidJSON will parse a file containing an overly long integer like 18446744073709551616 as a floating-point number.) When we cannot represent exactly an integer as a signed 64-bit value, we reject the JSON document.
|
- We parse integers and floating-point numbers as separate types which allows us to support large 64-bit integers in [-9223372036854775808,9223372036854775808), like a Java `long` or a C/C++ `long long`. Among the parsers that differentiate between integers and floating-point numbers, not all support 64-bit integers. (For example, sajson rejects JSON files with integers larger than or equal to 2147483648. RapidJSON will parse a file containing an overly long integer like 18446744073709551616 as a floating-point number.) When we cannot represent exactly an integer as a signed 64-bit value, we reject the JSON document.
|
||||||
- We support the full range of 64-bit floating-point numbers (binary64). The values range from `std::numeric_limits<double>::lowest()` to `std::numeric_limits<double>::max()`, so from -1.7976e308 all the way to 1.7976e308. Extreme values (less than or equal to -1e308, greater than or equal to 1e308) are rejected: we refuse to parse the input document.
|
- We support the full range of 64-bit floating-point numbers (binary64). The values range from `std::numeric_limits<double>::lowest()` to `std::numeric_limits<double>::max()`, so from -1.7976e308 all the way to 1.7976e308. Extreme values (less than or equal to -1e308, greater than or equal to 1e308) are rejected: we refuse to parse the input document.
|
||||||
|
- We aim for accurate float parsing with a bound on the [unit of least precision (ULP)](https://en.wikipedia.org/wiki/Unit_in_the_last_place) of one.
|
||||||
- We do full UTF-8 validation as part of the parsing. (Parsers like fastjson, gason and dropbox json11 do not do UTF-8 validation.)
|
- We do full UTF-8 validation as part of the parsing. (Parsers like fastjson, gason and dropbox json11 do not do UTF-8 validation.)
|
||||||
- We fully validate the numbers. (Parsers like gason and ultrajson will accept `[0e+]` as valid JSON.)
|
- We fully validate the numbers. (Parsers like gason and ultrajson will accept `[0e+]` as valid JSON.)
|
||||||
- We validate string content for unescaped characters. (Parsers like fastjson and ultrajson accept unescaped line breaks and tabs in strings.)
|
- We validate string content for unescaped characters. (Parsers like fastjson and ultrajson accept unescaped line breaks and tabs in strings.)
|
||||||
|
|
|
@ -201,7 +201,7 @@ parse_float(const uint8_t *const buf,
|
||||||
++p;
|
++p;
|
||||||
negative = true;
|
negative = true;
|
||||||
}
|
}
|
||||||
double i;
|
long double i;
|
||||||
if (*p == '0') { // 0 cannot be followed by an integer
|
if (*p == '0') { // 0 cannot be followed by an integer
|
||||||
++p;
|
++p;
|
||||||
i = 0;
|
i = 0;
|
||||||
|
@ -217,12 +217,13 @@ parse_float(const uint8_t *const buf,
|
||||||
}
|
}
|
||||||
if ('.' == *p) {
|
if ('.' == *p) {
|
||||||
++p;
|
++p;
|
||||||
double fractionalweight = 1;
|
int fractionalweight = 308;
|
||||||
if(is_integer(*p)) {
|
if(is_integer(*p)) {
|
||||||
unsigned char digit = *p - '0';
|
unsigned char digit = *p - '0';
|
||||||
++p;
|
++p;
|
||||||
fractionalweight *= 0.1;
|
|
||||||
i = i + digit * fractionalweight;
|
fractionalweight --;
|
||||||
|
i = i + digit * (fractionalweight >= 0 ? power_of_ten[fractionalweight] : 0);
|
||||||
} else {
|
} else {
|
||||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||||
foundInvalidNumber(buf + offset);
|
foundInvalidNumber(buf + offset);
|
||||||
|
@ -232,8 +233,8 @@ parse_float(const uint8_t *const buf,
|
||||||
while (is_integer(*p)) {
|
while (is_integer(*p)) {
|
||||||
unsigned char digit = *p - '0';
|
unsigned char digit = *p - '0';
|
||||||
++p;
|
++p;
|
||||||
fractionalweight *= 0.1;
|
fractionalweight --;
|
||||||
i = i + digit * fractionalweight;
|
i = i + digit * (fractionalweight >= 0 ? power_of_ten[fractionalweight] : 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (('e' == *p) || ('E' == *p)) {
|
if (('e' == *p) || ('E' == *p)) {
|
||||||
|
@ -388,6 +389,7 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
|
||||||
// is made of a single number), then it is necessary to copy the content and append
|
// is made of a single number), then it is necessary to copy the content and append
|
||||||
// a space before calling this function.
|
// a space before calling this function.
|
||||||
//
|
//
|
||||||
|
// Our objective is accurate parsing (ULP of 0 or 1) at high speed.
|
||||||
static really_inline bool parse_number(const uint8_t *const buf,
|
static really_inline bool parse_number(const uint8_t *const buf,
|
||||||
ParsedJson &pj,
|
ParsedJson &pj,
|
||||||
const uint32_t offset,
|
const uint32_t offset,
|
||||||
|
@ -434,20 +436,26 @@ static really_inline bool parse_number(const uint8_t *const buf,
|
||||||
// we rarely see large integer parts like 123456789
|
// we rarely see large integer parts like 123456789
|
||||||
while (is_integer(*p)) {
|
while (is_integer(*p)) {
|
||||||
digit = *p - '0';
|
digit = *p - '0';
|
||||||
i = 10 * i + digit; // might overflow
|
// a multiplication by 10 is cheaper than an arbitrary integer multiplication
|
||||||
|
i = 10 * i + digit; // might overflow, we will handle the overflow later
|
||||||
++p;
|
++p;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
int64_t exponent = 0;
|
int64_t exponent = 0;
|
||||||
bool is_float = false;
|
bool is_float = false;
|
||||||
if ('.' == *p) {
|
if ('.' == *p) {
|
||||||
is_float = true;
|
is_float = true; // At this point we know that we have a float
|
||||||
|
// we continue with the fiction that we have an integer. If the
|
||||||
|
// floating point number is representable as x * 10^z for some integer
|
||||||
|
// x that fits in 53 bits, then we will be able to convert back the
|
||||||
|
// integer into a float in a lossless manner.
|
||||||
++p;
|
++p;
|
||||||
const char *const firstafterperiod = p;
|
const char *const firstafterperiod = p;
|
||||||
if(is_integer(*p)) {
|
if(is_integer(*p)) {
|
||||||
unsigned char digit = *p - '0';
|
unsigned char digit = *p - '0';
|
||||||
++p;
|
++p;
|
||||||
i = i * 10 + digit;
|
i = i * 10 + digit; // might overflow + multiplication by 10 is likely cheaper than arbitrary mult.
|
||||||
|
// we will handle the overflow later
|
||||||
} else {
|
} else {
|
||||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||||
foundInvalidNumber(buf + offset);
|
foundInvalidNumber(buf + offset);
|
||||||
|
@ -469,7 +477,7 @@ static really_inline bool parse_number(const uint8_t *const buf,
|
||||||
}
|
}
|
||||||
exponent = firstafterperiod - p;
|
exponent = firstafterperiod - p;
|
||||||
}
|
}
|
||||||
int digitcount = p - startdigits - 1;
|
int digitcount = p - startdigits - 1; // used later to guard against overflows
|
||||||
int64_t expnumber = 0; // exponential part
|
int64_t expnumber = 0; // exponential part
|
||||||
if (('e' == *p) || ('E' == *p)) {
|
if (('e' == *p) || ('E' == *p)) {
|
||||||
is_float = true;
|
is_float = true;
|
||||||
|
@ -510,39 +518,55 @@ static really_inline bool parse_number(const uint8_t *const buf,
|
||||||
exponent += (negexp ? -expnumber : expnumber);
|
exponent += (negexp ? -expnumber : expnumber);
|
||||||
}
|
}
|
||||||
if (is_float) {
|
if (is_float) {
|
||||||
if (unlikely(digitcount >= 19)) { // this is uncommon!!!
|
uint64_t powerindex = 308 + exponent;
|
||||||
|
if (unlikely((digitcount >= 19))) { // this is uncommon
|
||||||
|
// It is possible that the integer had an overflow.
|
||||||
|
// We have to handle the case where we have 0.0000somenumber.
|
||||||
|
const char * start = startdigits;
|
||||||
|
while((*start == '0') || (*start == '.')) {
|
||||||
|
start++;
|
||||||
|
}
|
||||||
|
digitcount -= (start - startdigits);
|
||||||
|
if(digitcount >= 19) {
|
||||||
|
// Ok, chances are good that we had an overflow!
|
||||||
|
// this is almost never going to get called!!!
|
||||||
|
// we start anew, going slowly!!!
|
||||||
|
return parse_float(buf, pj, offset,
|
||||||
|
found_minus);
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (unlikely((powerindex > 2 * 308))) { // this is uncommon!!!
|
||||||
// this is almost never going to get called!!!
|
// this is almost never going to get called!!!
|
||||||
// we start anew, going slowly!!!
|
// we start anew, going slowly!!!
|
||||||
return parse_float(buf, pj, offset,
|
return parse_float(buf, pj, offset,
|
||||||
found_minus);
|
found_minus);
|
||||||
}
|
}
|
||||||
///////////
|
double factor = power_of_ten[powerindex];
|
||||||
// We want 0.1e1 to be a float.
|
factor = negative ? -factor : factor;
|
||||||
//////////
|
if(i <= UINT64_C(0x1fffffffffffff)) {
|
||||||
if (i == 0) {
|
// we can convert i to a double safely (losslessly) so the
|
||||||
pj.write_tape_double(0.0);
|
// following should have good performance.
|
||||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
double d = i * factor;
|
||||||
foundFloat(0.0, buf + offset);
|
|
||||||
#endif
|
|
||||||
} else {
|
|
||||||
double d = i;
|
|
||||||
d = negative ? -d : d;
|
|
||||||
uint64_t powerindex = 308 + exponent;
|
|
||||||
if(likely(powerindex <= 2 * 308)) {
|
|
||||||
// common case
|
|
||||||
d *= power_of_ten[powerindex];
|
|
||||||
} else {
|
|
||||||
// this is uncommon so let us move this special case out
|
|
||||||
// of the main loop
|
|
||||||
return parse_float(buf, pj, offset,found_minus);
|
|
||||||
}
|
|
||||||
pj.write_tape_double(d);
|
pj.write_tape_double(d);
|
||||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||||
foundFloat(d, buf + offset);
|
foundFloat(d, buf + offset);
|
||||||
#endif
|
#endif
|
||||||
}
|
} else {//if(i=< UINT64_C(0x1fffffffffffff))
|
||||||
|
// we cannot convert the number in a lossless manner.
|
||||||
|
// we have to do it in two steps.
|
||||||
|
double d1 = (double)(uint32_t)i;
|
||||||
|
double d2 = (double)(uint32_t)(i>>32);
|
||||||
|
double d = d1 * factor + d2 * factor * 4294967296;
|
||||||
|
pj.write_tape_double(d);
|
||||||
|
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||||
|
foundFloat(d, buf + offset);
|
||||||
|
#endif
|
||||||
|
}//if(i=< UINT64_C(0x1fffffffffffff))
|
||||||
} else {
|
} else {
|
||||||
if (unlikely(digitcount >= 18)) { // this is uncommon!!!
|
if (unlikely(digitcount >= 18)) { // this is uncommon!!!
|
||||||
|
// there is a good chance that we had an overflow, so we need
|
||||||
|
// to recover: we parse the whole thing again.
|
||||||
return parse_large_integer(buf, pj, offset,
|
return parse_large_integer(buf, pj, offset,
|
||||||
found_minus);
|
found_minus);
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,6 +13,33 @@
|
||||||
|
|
||||||
#include "simdjson/common_defs.h"
|
#include "simdjson/common_defs.h"
|
||||||
|
|
||||||
|
|
||||||
|
// ulp distance
|
||||||
|
// Marc B. Reynolds, 2016-2019
|
||||||
|
// Public Domain under http://unlicense.org, see link for details.
|
||||||
|
// adapted by D. Lemire
|
||||||
|
// Returns the distance between two floats expressed in units in the last
// place, computed on their raw 32-bit patterns.
//   a, b : the two values to compare (order does not matter).
// There is no special handling of NaN or infinity in this routine.
// NOTE(review): presumably relies on float being IEEE-754 binary32 - confirm.
inline uint32_t f32_ulp_dist(float a, float b) {
  uint32_t bits_a, bits_b;
  // copy the bit patterns out through memcpy rather than a pointer cast
  memcpy(&bits_a, &a, sizeof(bits_a));
  memcpy(&bits_b, &b, sizeof(bits_b));
  const uint32_t sign_diff = bits_b ^ bits_a;
  if ((int32_t)sign_diff < 0) {
    // The top bits differ (opposite signs): add both patterns and cancel the
    // single set top bit, all modulo 2^32.
    return bits_a + bits_b + 0x80000000;
  }
  // Same top bit: the distance is the absolute difference of the patterns.
  const uint32_t forward = bits_a - bits_b;
  if ((int32_t)forward >= 0) {
    return forward;
  }
  return bits_b - bits_a;
}
|
||||||
|
|
||||||
|
// ulp distance
|
||||||
|
// Marc B. Reynolds, 2016-2019
|
||||||
|
// Public Domain under http://unlicense.org, see link for details.
|
||||||
|
// adapted by D. Lemire
|
||||||
|
// Returns the distance between two doubles expressed in units in the last
// place (ULP), computed on their raw 64-bit patterns.
//   a, b : the two values to compare (order does not matter).
// Identical values give 0 and so do +0.0 and -0.0; adjacent representable
// values give 1.
// There is no special handling of NaN or infinity in this routine.
// NOTE(review): presumably relies on double being IEEE-754 binary64 - confirm.
// Declared static so the helper stays local to this translation unit.
static inline uint64_t f64_ulp_dist(double a, double b) {
  uint64_t ua, ub;
  // copy the bit patterns out through memcpy rather than a pointer cast
  memcpy(&ua, &a, sizeof(ua));
  memcpy(&ub, &b, sizeof(ub));
  if ((int64_t)(ub ^ ua) >= 0) {
    // Same top bit: the distance is the absolute difference of the patterns.
    return (int64_t)(ua - ub) >= 0 ? (ua - ub) : (ub - ua);
  }
  // Opposite signs: exactly one pattern has bit 63 set. Adding 2^63 cancels
  // it modulo 2^64 and leaves the sum of the two magnitudes. The constant
  // must be the 64-bit sign bit, not the 32-bit 0x80000000.
  return ua + ub + ((uint64_t)1 << 63);
}
|
||||||
|
|
||||||
int parse_error;
|
int parse_error;
|
||||||
char *fullpath;
|
char *fullpath;
|
||||||
enum { PARSE_WARNING, PARSE_ERROR };
|
enum { PARSE_WARNING, PARSE_ERROR };
|
||||||
|
@ -81,14 +108,16 @@ void foundFloat(double result, const uint8_t *buf) {
|
||||||
expected, result);
|
expected, result);
|
||||||
fprintf(stderr, "%.32s\n", buf);
|
fprintf(stderr, "%.32s\n", buf);
|
||||||
parse_error |= PARSE_ERROR;
|
parse_error |= PARSE_ERROR;
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
// we want to get some reasonable relative accuracy
|
// we want to get some reasonable relative accuracy
|
||||||
else if (fabs(expected - result) >
|
uint64_t ULP = f64_ulp_dist(expected,result);
|
||||||
1e-14 * fmin(fabs(expected), fabs(result))) {
|
if (f64_ulp_dist(expected,result) > 1) {
|
||||||
fprintf(stderr, "parsed %.128e from \n", result);
|
fprintf(stderr, "parsed %.128e from \n", result);
|
||||||
fprintf(stderr, " %.32s whereas strtod gives\n", buf);
|
fprintf(stderr, " %.32s whereas strtod gives\n", buf);
|
||||||
fprintf(stderr, " %.128e,", expected);
|
fprintf(stderr, " %.128e,", expected);
|
||||||
fprintf(stderr, " while parsing %s \n", fullpath);
|
fprintf(stderr, " while parsing %s \n", fullpath);
|
||||||
|
fprintf(stderr, " =========== ULP: %u,", (unsigned int)ULP);
|
||||||
parse_error |= PARSE_ERROR;
|
parse_error |= PARSE_ERROR;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue