More accurate number parsing (#217)

* This drastically improves the accuracy (down to a ULP of 1) * More comments and documentation.
2019-07-15 22:17:49 -04:00 · 2019-07-15 22:17:49 -04:00 · e926b4b3c9
parent 6c168f046d
commit e926b4b3c9
3 changed files with 88 additions and 34 deletions
--- a/README.md
+++ b/README.md
@ -333,6 +333,7 @@ _We do not aim to provide a general-purpose JSON library._ A library like RapidJ
 - The input string is unmodified. (Parsers like sajson and RapidJSON use the input string as a buffer.)
 - We parse integers and floating-point numbers as separate types which allows us to support large 64-bit integers in [-9223372036854775808,9223372036854775808), like a Java `long` or a C/C++ `long long`. Among the parsers that differentiate between integers and floating-point numbers, not all support 64-bit integers. (For example, sajson rejects JSON files with integers larger than or equal to 2147483648. RapidJSON will parse a file containing an overly long integer like 18446744073709551616 as a floating-point number.) When we cannot represent exactly an integer as a signed 64-bit value, we reject the JSON document.
 - We support the full range of 64-bit floating-point numbers (binary64). The values range from ` std::numeric_limits<double>::lowest()`  to `std::numeric_limits<double>::max()`, so from -1.7976e308 all the way to 1.7976e308. Extreme values (less or equal to -1e308, greater or equal to 1e308) are rejected: we refuse to parse the input document.
+- We aim for accurate float parsing, with an error of at most one [unit in the last place (ULP)](https://en.wikipedia.org/wiki/Unit_in_the_last_place).
 - We do full UTF-8 validation as part of the parsing. (Parsers like fastjson, gason and dropbox json11 do not do UTF-8 validation.)
 - We fully validate the numbers. (Parsers like gason and ultranjson will accept `[0e+]` as valid JSON.)
 - We validate string content for unescaped characters. (Parsers like fastjson and ultrajson accept unescaped line breaks and tabs in strings.)
--- a/include/simdjson/numberparsing.h
+++ b/include/simdjson/numberparsing.h
@ -201,7 +201,7 @@ parse_float(const uint8_t *const buf,
    ++p;
    negative = true;
  }
-  double i;
+  long double i;
  if (*p == '0') { // 0 cannot be followed by an integer
    ++p;
    i = 0;
@ -217,12 +217,13 @@ parse_float(const uint8_t *const buf,
  }
  if ('.' == *p) {
    ++p;
-    double fractionalweight = 1;
+    int fractionalweight = 308;
    if(is_integer(*p)) {
      unsigned char digit = *p - '0';
      ++p;
-      fractionalweight *= 0.1;
-      i = i + digit * fractionalweight;
+
+      fractionalweight --;
+      i = i + digit * (fractionalweight >= 0 ? power_of_ten[fractionalweight] : 0);
    } else {
 #ifdef JSON_TEST_NUMBERS // for unit testing
      foundInvalidNumber(buf + offset);
@ -232,8 +233,8 @@ parse_float(const uint8_t *const buf,
    while (is_integer(*p)) {
      unsigned char digit = *p - '0';
      ++p;
-      fractionalweight *= 0.1;
-      i = i + digit * fractionalweight;
+      fractionalweight --;
+      i = i + digit * (fractionalweight >= 0 ? power_of_ten[fractionalweight] : 0);
    }
  }
  if (('e' == *p) || ('E' == *p)) {
@ -388,6 +389,7 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
 // is made of a single number), then it is necessary to copy the content and append
 // a space before calling this function.
 //
+// Our objective is accurate parsing (ULP of 0 or 1) at high speed.
 static really_inline bool parse_number(const uint8_t *const buf,
                                       ParsedJson &pj,
                                       const uint32_t offset,
@ -434,20 +436,26 @@ static really_inline bool parse_number(const uint8_t *const buf,
    // we rarely see large integer parts like 123456789
    while (is_integer(*p)) {
      digit = *p - '0';
-      i = 10 * i + digit; // might overflow
+      // a multiplication by 10 is cheaper than an arbitrary integer multiplication
+      i = 10 * i + digit; // might overflow, we will handle the overflow later
      ++p;
    }
  }
  int64_t exponent = 0;
  bool is_float = false;
  if ('.' == *p) {
-    is_float = true;
+    is_float = true; // At this point we know that we have a float
+    // we continue with the fiction that we have an integer. If the
+    // floating point number is representable as x * 10^z for some integer
+    // x that fits in 53 bits, then we will be able to convert the
+    // integer back into a float in a lossless manner.
    ++p;
    const char *const firstafterperiod = p;
    if(is_integer(*p)) {
      unsigned char digit = *p - '0';
      ++p;
-      i = i * 10 + digit;
+      i = i * 10 + digit; // might overflow + multiplication by 10 is likely cheaper than arbitrary mult.
+      // we will handle the overflow later
    } else {
 #ifdef JSON_TEST_NUMBERS // for unit testing
      foundInvalidNumber(buf + offset);
@ -469,7 +477,7 @@ static really_inline bool parse_number(const uint8_t *const buf,
    }
    exponent = firstafterperiod - p;
  }
-  int digitcount = p - startdigits - 1;
+  int digitcount = p - startdigits - 1; // used later to guard against overflows
  int64_t expnumber = 0; // exponential part
  if (('e' == *p) || ('E' == *p)) {
    is_float = true;
@ -510,39 +518,55 @@ static really_inline bool parse_number(const uint8_t *const buf,
    exponent += (negexp ? -expnumber : expnumber);
  }
  if (is_float) {
-    if (unlikely(digitcount >= 19)) { // this is uncommon!!!
+    uint64_t powerindex = 308 + exponent;
+    if (unlikely((digitcount >= 19))) { // this is uncommon
+      // It is possible that the integer had an overflow. 
+      // We have to handle the case where we have 0.0000somenumber.
+      const char * start = startdigits;
+      while((*start == '0') || (*start == '.')) {
+         start++;
+      }
+      digitcount -= (start - startdigits);
+      if(digitcount >= 19) {
+        // Ok, chances are good that we had an overflow!
+        // this is almost never going to get called!!!
+        // we start anew, going slowly!!!
+        return parse_float(buf, pj, offset,
+                                       found_minus);
+        
+      } 
+    }
+    if (unlikely((powerindex > 2 * 308))) { // this is uncommon!!!
      // this is almost never going to get called!!!
      // we start anew, going slowly!!!
      return parse_float(buf, pj, offset,
                                       found_minus);
    }
-    ///////////
-    // We want 0.1e1 to be a float.
-    //////////
-    if (i == 0) {
-      pj.write_tape_double(0.0);
-#ifdef JSON_TEST_NUMBERS // for unit testing
-      foundFloat(0.0, buf + offset);
-#endif
-    } else {
-      double d = i;
-      d = negative ? -d : d;
-      uint64_t powerindex = 308 + exponent;
-      if(likely(powerindex <= 2 * 308)) {
-        // common case
-        d *= power_of_ten[powerindex];
-      } else {
-        // this is uncommon so let us move this special case out
-        // of the main loop
-        return parse_float(buf, pj, offset,found_minus);
-      }
+    double factor = power_of_ten[powerindex];
+    factor = negative ? -factor : factor;
+    if(i <= UINT64_C(0x1fffffffffffff)) {
+      // we can convert i to a double safely (losslessly) so the 
+      // following should have good performance.
+      double d = i * factor;
      pj.write_tape_double(d);
 #ifdef JSON_TEST_NUMBERS // for unit testing
      foundFloat(d, buf + offset);
 #endif
-    }
+    } else {//if(i=< UINT64_C(0x1fffffffffffff))
+      // we cannot convert the number in a lossless manner.
+      // we have to do it in two steps.
+      double d1 = (double)(uint32_t)i;
+      double d2 = (double)(uint32_t)(i>>32);
+      double d = d1 * factor + d2 * factor * 4294967296;
+      pj.write_tape_double(d);
+#ifdef JSON_TEST_NUMBERS // for unit testing
+      foundFloat(d, buf + offset);
+#endif
+    }//if(i=< UINT64_C(0x1fffffffffffff))
  } else {
    if (unlikely(digitcount >= 18)) { // this is uncommon!!!
+      // there is a good chance that we had an overflow, so we need
+      // to recover: we parse the whole thing again.
      return parse_large_integer(buf, pj, offset,
                                 found_minus);
    }
--- a/tests/numberparsingcheck.cpp
+++ b/tests/numberparsingcheck.cpp
@ -13,6 +13,33 @@

 #include "simdjson/common_defs.h"

+
+// ulp distance 
+// Marc B. Reynolds, 2016-2019
+// Public Domain under http://unlicense.org, see link for details.
+// adapted by D. Lemire
+// Distance between two binary32 values, counted in representable steps
+// (ULPs). The raw bit patterns are compared as integers; memcpy is used
+// for the reinterpretation so that no aliasing rule is violated.
+inline uint32_t f32_ulp_dist(float a, float b) {
+  uint32_t bits_a;
+  uint32_t bits_b;
+  memcpy(&bits_a, &a, sizeof(bits_a));
+  memcpy(&bits_b, &b, sizeof(bits_b));
+  const uint32_t sign_mismatch = bits_b ^ bits_a;
+  if ((int32_t)sign_mismatch < 0) {
+    // Opposite signs: add both magnitudes; the extra 0x80000000 cancels
+    // the single sign bit present in the wrapped sum.
+    return bits_a + bits_b + 0x80000000;
+  }
+  // Same sign: the distance is the absolute difference of the patterns.
+  const uint32_t forward = bits_a - bits_b;
+  if ((int32_t)forward >= 0) {
+    return forward;
+  }
+  return bits_b - bits_a;
+}
+
+// ulp distance 
+// Marc B. Reynolds, 2016-2019
+// Public Domain under http://unlicense.org, see link for details.
+// adapted by D. Lemire
+// Returns the distance between a and b counted in representable binary64
+// steps (ULPs). The bit patterns are read through memcpy and compared as
+// 64-bit integers.
+inline uint64_t f64_ulp_dist(double a, double b) {
+  uint64_t ua, ub;
+  memcpy(&ua, &a, sizeof(ua)); 
+  memcpy(&ub, &b, sizeof(ub)); 
+  // Same sign: the distance is the absolute difference of the patterns.
+  if ((int64_t)(ub^ua) >= 0)
+    return (int64_t)(ua-ub) >= 0 ? (ua-ub) : (ub-ua);
+  // Opposite signs: exactly one operand carries the sign bit (bit 63), so
+  // adding 2^63 cancels it (mod 2^64) and leaves the sum of the magnitudes.
+  // The constant must be the 64-bit sign bit, not the 32-bit one.
+  return ua+ub+UINT64_C(0x8000000000000000);
+}
+
 int parse_error;
 char *fullpath;
 enum { PARSE_WARNING, PARSE_ERROR };
@ -81,14 +108,16 @@ void foundFloat(double result, const uint8_t *buf) {
            expected, result);
    fprintf(stderr, "%.32s\n", buf);
    parse_error |= PARSE_ERROR;
+    return;
  }
  // we want to get some reasonable relative accuracy
-  else if (fabs(expected - result) >
-           1e-14 * fmin(fabs(expected), fabs(result))) {
+  uint64_t ULP = f64_ulp_dist(expected,result);
+  if (f64_ulp_dist(expected,result) > 1) {
    fprintf(stderr, "parsed %.128e from \n", result);
    fprintf(stderr, "       %.32s whereas strtod gives\n", buf);
    fprintf(stderr, "       %.128e,", expected);
    fprintf(stderr, " while parsing %s \n", fullpath);
+    fprintf(stderr, " ===========  ULP:  %u,", (unsigned int)ULP);
    parse_error |= PARSE_ERROR;
  }
 }