Issue182: fixed (#183)

* Verifying issue 182. * Fixing the corresponding bug.
2019-06-05 18:51:29 -04:00 · 2019-06-05 18:51:29 -04:00 · 59194dcf4d
parent b32c72f1fc
commit 59194dcf4d
3 changed files with 42 additions and 14 deletions
--- a/README.md
+++ b/README.md
@ -314,6 +314,7 @@ _We do not aim to provide a general-purpose JSON library._ A library like RapidJ

 - The input string is unmodified. (Parsers like sajson and RapidJSON use the input string as a buffer.)
 - We parse integers and floating-point numbers as separate types which allows us to support large 64-bit integers in [-9223372036854775808,9223372036854775808), like a Java `long` or a C/C++ `long long`. Among the parsers that differentiate between integers and floating-point numbers, not all support 64-bit integers. (For example, sajson rejects JSON files with integers larger than or equal to 2147483648. RapidJSON will parse a file containing an overly long integer like 18446744073709551616 as a floating-point number.) When we cannot represent exactly an integer as a signed 64-bit value, we reject the JSON document.
+- We support the full range of 64-bit floating-point numbers (binary64). The values range from `std::numeric_limits<double>::lowest()` to `std::numeric_limits<double>::max()`, so from -1.7976e308 all the way to 1.7976e308. Extreme values (less than or equal to -1e308, greater than or equal to 1e308) are rejected: we refuse to parse the input document.
 - We do full UTF-8 validation as part of the parsing. (Parsers like fastjson, gason and dropbox json11 do not do UTF-8 validation.)
 - We fully validate the numbers. (Parsers like gason and ultranjson will accept `[0e+]` as valid JSON.)
 - We validate string content for unescaped characters. (Parsers like fastjson and ultrajson accept unescaped line breaks and tabs in strings.)
--- a/include/simdjson/numberparsing.h
+++ b/include/simdjson/numberparsing.h
@ -6,6 +6,10 @@
 #include "simdjson/parsedjson.h"
 #include "simdjson/portability.h"

+// Allowable floating-point values range from std::numeric_limits<double>::lowest()
+// to std::numeric_limits<double>::max(), so from
+// -1.7976e308 all the way to 1.7976e308 in binary64. The smallest positive
+// normal value is std::numeric_limits<double>::min() or about 2.225074e-308.
 static const double power_of_ten[] = {
    1e-308, 1e-307, 1e-306, 1e-305, 1e-304, 1e-303, 1e-302, 1e-301, 1e-300,
    1e-299, 1e-298, 1e-297, 1e-296, 1e-295, 1e-294, 1e-293, 1e-292, 1e-291,
@ -163,6 +167,15 @@ static inline uint32_t parse_eight_digits_unrolled(const char *chars) {

 #endif

+//
+// This function computes base * 10 ^ negative_exponent (the argument is itself negative).
+// It is only ever going to be used when negative_exponent is below -308.
+static double subnormal_power10(double base, int negative_exponent) {
+  // scale by 1e-308 first, then by the remaining 10^(negative_exponent + 308); pow makes this slow
+  return base * 1e-308 * pow(10, negative_exponent + 308); 
+}
+
+
 // called by parse_number when we know that the output is a float,
 // but where there might be some integer overflow. The trick here is to
 // parse using floats from the start.
@ -258,15 +271,27 @@ parse_float(const uint8_t *const buf,
 #endif
      return false;
    }
-    if (expnumber > 308) {
+    if (unlikely(expnumber > 308)) {
+      // this path is unlikely
+      if(negexp) { 
+        // We either have zero or a subnormal. 
+        // We expect this to be uncommon so we go through a slow path.
+        i = subnormal_power10(i, - expnumber);
+      } else {
+// We know for sure that we have a number that is too large,
 // we refuse to parse this
 #ifdef JSON_TEST_NUMBERS // for unit testing
-      foundInvalidNumber(buf + offset);
+        foundInvalidNumber(buf + offset);
 #endif
-      return false;
-    }
-    int exponent = (negexp ? -expnumber : expnumber);
-    i *= power_of_ten[308 + exponent];
+        return false;
+      }
+    } else {
+      int exponent = (negexp ? -expnumber : expnumber);
+      // we have that expnumber is in [0,308] so that
+      // exponent is in [-308,308] so that
+      // 308 + exponent is in [0, 2 * 308]
+      i *= power_of_ten[308 + exponent];
+   }
  }
  if(is_not_structural_or_whitespace(*p)) {
    return false;
@ -474,13 +499,6 @@ static really_inline bool parse_number(const uint8_t *const buf,
 #endif
      return false;
    }
-    if(expnumber > 308) {
-// we refuse to parse this
-#ifdef JSON_TEST_NUMBERS // for unit testing
-        foundInvalidNumber(buf + offset);
-#endif
-        return false;       
-    }
    exponent += (negexp ? -expnumber : expnumber);
  }
  if ((exponent != 0) || (expnumber != 0)) {
@ -501,7 +519,15 @@ static really_inline bool parse_number(const uint8_t *const buf,
    } else {
      double d = i;
      d = negative ? -d : d;
-      d *= power_of_ten[308 + exponent];
+      uint64_t powerindex = 308 + exponent;
+      if(likely(powerindex <= 2 * 308)) {
+        // common case
+        d *= power_of_ten[powerindex];
+      } else {
+        // this is uncommon so let us move this special case out
+        // of the main loop
+        return parse_float(buf, pj, offset,found_minus);
+      }
      pj.write_tape_double(d);
 #ifdef JSON_TEST_NUMBERS // for unit testing
      foundFloat(d, buf + offset);
--- a/jsonchecker/pass17.json
+++ b/jsonchecker/pass17.json
@ -0,0 +1 @@
+[1.0e-308,0.1e-308,0.01e-307,1.79769e+308,2.22507e-308,-1.79769e+308,-2.22507e-308]
				`@ -0,0 +1 @@`
				`[1.0e-308,0.1e-308,0.01e-307,1.79769e+308,2.22507e-308,-1.79769e+308,-2.22507e-308]`