Merge pull request #990 from simdjson/jkeiser/fast-large-integer
Don't reparse large integers
This commit is contained in:
commit
82fb45aa2a
|
@ -1,14 +1,25 @@
|
|||
namespace stage2 {
|
||||
namespace numberparsing {
|
||||
|
||||
#ifdef JSON_TEST_NUMBERS
|
||||
#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), false)
|
||||
#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), writer.append_s64((VALUE)))
|
||||
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), writer.append_u64((VALUE)))
|
||||
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), writer.append_double((VALUE)))
|
||||
#else
|
||||
#define INVALID_NUMBER(SRC) (false)
|
||||
#define WRITE_INTEGER(VALUE, SRC, WRITER) writer.append_s64((VALUE))
|
||||
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) writer.append_u64((VALUE))
|
||||
#define WRITE_DOUBLE(VALUE, SRC, WRITER) writer.append_double((VALUE))
|
||||
#endif
|
||||
|
||||
// Attempts to compute i * 10^(power) exactly; and if "negative" is
|
||||
// true, negate the result.
|
||||
// This function will only work in some cases, when it does not work, success is
|
||||
// set to false. This should work *most of the time* (like 99% of the time).
|
||||
// We assume that power is in the [FASTFLOAT_SMALLEST_POWER,
|
||||
// FASTFLOAT_LARGEST_POWER] interval: the caller is responsible for this check.
|
||||
really_inline double compute_float_64(int64_t power, uint64_t i, bool negative,
|
||||
bool *success) {
|
||||
really_inline double compute_float_64(int64_t power, uint64_t i, bool negative, bool *success) {
|
||||
// we start with a fast path
|
||||
// It was described in
|
||||
// Clinger WD. How to read floating point numbers accurately.
|
||||
|
@ -241,109 +252,132 @@ really_inline bool is_made_of_eight_digits_fast(const char *chars) {
|
|||
0x3333333333333333);
|
||||
}
|
||||
|
||||
// called by parse_number when we know that the output is an integer,
|
||||
// but where there might be some integer overflow.
|
||||
// we want to catch overflows!
|
||||
// Do not call this function directly as it skips some of the checks from
|
||||
// parse_number
|
||||
//
|
||||
// This function will almost never be called!!!
|
||||
//
|
||||
template<typename W>
|
||||
never_inline bool parse_large_integer(const uint8_t *const src,
|
||||
W writer,
|
||||
bool found_minus) {
|
||||
const char *p = reinterpret_cast<const char *>(src);
|
||||
|
||||
bool negative = false;
|
||||
if (found_minus) {
|
||||
++p;
|
||||
negative = true;
|
||||
}
|
||||
uint64_t i;
|
||||
if (*p == '0') { // 0 cannot be followed by an integer
|
||||
++p;
|
||||
i = 0;
|
||||
} else {
|
||||
unsigned char digit = static_cast<unsigned char>(*p - '0');
|
||||
i = digit;
|
||||
p++;
|
||||
// the is_made_of_eight_digits_fast routine is unlikely to help here because
|
||||
// we rarely see large integer parts like 123456789
|
||||
while (is_integer(*p)) {
|
||||
digit = static_cast<unsigned char>(*p - '0');
|
||||
if (mul_overflow(i, 10, &i)) {
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_invalid_number(src);
|
||||
#endif
|
||||
return false; // overflow
|
||||
}
|
||||
if (add_overflow(i, digit, &i)) {
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_invalid_number(src);
|
||||
#endif
|
||||
return false; // overflow
|
||||
}
|
||||
++p;
|
||||
}
|
||||
}
|
||||
if (negative) {
|
||||
if (i > 0x8000000000000000) {
|
||||
// overflows!
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_invalid_number(src);
|
||||
#endif
|
||||
return false; // overflow
|
||||
} else if (i == 0x8000000000000000) {
|
||||
// In two's complement, we cannot represent 0x8000000000000000
|
||||
// as a positive signed integer, but the negative version is
|
||||
// possible.
|
||||
constexpr int64_t signed_answer = INT64_MIN;
|
||||
writer.append_s64(signed_answer);
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_integer(signed_answer, src);
|
||||
#endif
|
||||
} else {
|
||||
// we can negate safely
|
||||
int64_t signed_answer = -static_cast<int64_t>(i);
|
||||
writer.append_s64(signed_answer);
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_integer(signed_answer, src);
|
||||
#endif
|
||||
}
|
||||
} else {
|
||||
// we have a positive integer, the contract is that
|
||||
// we try to represent it as a signed integer and only
|
||||
// fallback on unsigned integers if absolutely necessary.
|
||||
if (i < 0x8000000000000000) {
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_integer(i, src);
|
||||
#endif
|
||||
writer.append_s64(i);
|
||||
} else {
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_unsigned_integer(i, src);
|
||||
#endif
|
||||
writer.append_u64(i);
|
||||
}
|
||||
}
|
||||
return is_structural_or_whitespace(*p);
|
||||
}
|
||||
|
||||
template<typename W>
|
||||
bool slow_float_parsing(UNUSED const char * src, W writer) {
|
||||
double d;
|
||||
if (parse_float_strtod(src, &d)) {
|
||||
writer.append_double(d);
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_float(d, (const uint8_t *)src);
|
||||
#endif
|
||||
WRITE_DOUBLE(d, (const uint8_t *)src, writer);
|
||||
return true;
|
||||
}
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_invalid_number((const uint8_t *)src);
|
||||
return INVALID_NUMBER((const uint8_t *)src);
|
||||
}
|
||||
|
||||
really_inline bool parse_decimal(UNUSED const uint8_t *const src, const char *&p, uint64_t &i, int64_t &exponent) {
|
||||
// we continue with the fiction that we have an integer. If the
|
||||
// floating point number is representable as x * 10^z for some integer
|
||||
// z that fits in 53 bits, then we will be able to convert back the
|
||||
// the integer into a float in a lossless manner.
|
||||
const char *const first_after_period = p;
|
||||
if (!is_integer(*p)) { return INVALID_NUMBER(src); } // There must be at least one digit after the .
|
||||
|
||||
unsigned char digit = static_cast<unsigned char>(*p - '0');
|
||||
++p;
|
||||
i = i * 10 + digit; // might overflow + multiplication by 10 is likely
|
||||
// cheaper than arbitrary mult.
|
||||
// we will handle the overflow later
|
||||
#ifdef SWAR_NUMBER_PARSING
|
||||
// this helps if we have lots of decimals!
|
||||
// this turns out to be frequent enough.
|
||||
if (is_made_of_eight_digits_fast(p)) {
|
||||
i = i * 100000000 + parse_eight_digits_unrolled(p);
|
||||
p += 8;
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
while (is_integer(*p)) {
|
||||
digit = static_cast<unsigned char>(*p - '0');
|
||||
++p;
|
||||
i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
|
||||
// because we have parse_highprecision_float later.
|
||||
}
|
||||
exponent = first_after_period - p;
|
||||
return true;
|
||||
}
|
||||
|
||||
really_inline bool parse_exponent(UNUSED const uint8_t *const src, const char *&p, int64_t &exponent) {
|
||||
bool neg_exp = false;
|
||||
if ('-' == *p) {
|
||||
neg_exp = true;
|
||||
++p;
|
||||
} else if ('+' == *p) {
|
||||
++p;
|
||||
}
|
||||
|
||||
// e[+-] must be followed by a number
|
||||
if (!is_integer(*p)) { return INVALID_NUMBER(src); }
|
||||
unsigned char digit = static_cast<unsigned char>(*p - '0');
|
||||
int64_t exp_number = digit;
|
||||
p++;
|
||||
if (is_integer(*p)) {
|
||||
digit = static_cast<unsigned char>(*p - '0');
|
||||
exp_number = 10 * exp_number + digit;
|
||||
++p;
|
||||
}
|
||||
if (is_integer(*p)) {
|
||||
digit = static_cast<unsigned char>(*p - '0');
|
||||
exp_number = 10 * exp_number + digit;
|
||||
++p;
|
||||
}
|
||||
while (is_integer(*p)) {
|
||||
// we need to check for overflows; we refuse to parse this
|
||||
if (exp_number > 0x100000000) { return INVALID_NUMBER(src); }
|
||||
digit = static_cast<unsigned char>(*p - '0');
|
||||
exp_number = 10 * exp_number + digit;
|
||||
++p;
|
||||
}
|
||||
exponent += (neg_exp ? -exp_number : exp_number);
|
||||
return true;
|
||||
}
|
||||
|
||||
template<typename W>
|
||||
really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t i, const char * start_digits, int digit_count, int64_t exponent, W &writer) {
|
||||
// If we frequently had to deal with long strings of digits,
|
||||
// we could extend our code by using a 128-bit integer instead
|
||||
// of a 64-bit integer. However, this is uncommon in practice.
|
||||
// digit count is off by 1 because of the decimal (assuming there was one).
|
||||
if (unlikely((digit_count-1 >= 19))) { // this is uncommon
|
||||
// It is possible that the integer had an overflow.
|
||||
// We have to handle the case where we have 0.0000somenumber.
|
||||
const char *start = start_digits;
|
||||
while ((*start == '0') || (*start == '.')) {
|
||||
start++;
|
||||
}
|
||||
// we over-decrement by one when there is a '.'
|
||||
digit_count -= int(start - start_digits);
|
||||
if (digit_count >= 19) {
|
||||
// Ok, chances are good that we had an overflow!
|
||||
// this is almost never going to get called!!!
|
||||
// we start anew, going slowly!!!
|
||||
// This will happen in the following examples:
|
||||
// 10000000000000000000000000000000000000000000e+308
|
||||
// 3.1415926535897932384626433832795028841971693993751
|
||||
//
|
||||
bool success = slow_float_parsing((const char *) src, writer);
|
||||
// The number was already written, but we made a copy of the writer
|
||||
// when we passed it to the parse_large_integer() function, so
|
||||
writer.skip_double();
|
||||
return success;
|
||||
}
|
||||
}
|
||||
// NOTE: it's weird that the unlikely() only wraps half the if, but it seems to get slower any other
|
||||
// way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331
|
||||
// To future reader: we'd love if someone found a better way, or at least could explain this result!
|
||||
if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || (exponent > FASTFLOAT_LARGEST_POWER)) {
|
||||
// this is almost never going to get called!!!
|
||||
// we start anew, going slowly!!!
|
||||
bool success = slow_float_parsing((const char *) src, writer);
|
||||
// The number was already written, but we made a copy of the writer when we passed it to the
|
||||
// slow_float_parsing() function, so we have to skip those tape spots now that we've returned
|
||||
writer.skip_double();
|
||||
return success;
|
||||
}
|
||||
bool success = true;
|
||||
double d = compute_float_64(exponent, i, negative, &success);
|
||||
if (!success) {
|
||||
// we are almost never going to get here.
|
||||
if (!parse_float_strtod((const char *)src, &d)) { return INVALID_NUMBER(src); }
|
||||
}
|
||||
WRITE_DOUBLE(d, src, writer);
|
||||
return true;
|
||||
}
|
||||
|
||||
// parse the number at src
|
||||
|
@ -369,32 +403,22 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
|
|||
if (found_minus) {
|
||||
++p;
|
||||
negative = true;
|
||||
if (!is_integer(*p)) { // a negative sign must be followed by an integer
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_invalid_number(src);
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
// a negative sign must be followed by an integer
|
||||
if (!is_integer(*p)) { return INVALID_NUMBER(src); }
|
||||
}
|
||||
const char *const start_digits = p;
|
||||
|
||||
uint64_t i; // an unsigned int avoids signed overflows (which are bad)
|
||||
if (*p == '0') { // 0 cannot be followed by an integer
|
||||
if (*p == '0') {
|
||||
++p;
|
||||
if (is_integer(*p)) {
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_invalid_number(src);
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
if (is_integer(*p)) { return INVALID_NUMBER(src); } // 0 cannot be followed by an integer
|
||||
i = 0;
|
||||
} else {
|
||||
if (!(is_integer(*p))) { // must start with an integer
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_invalid_number(src);
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
// NOTE: This is a redundant check--either we're negative, in which case we checked whether this
|
||||
// is a digit above, or the caller already determined we start with a digit. But removing this
|
||||
// check seems to make things slower: https://github.com/simdjson/simdjson/pull/990#discussion_r448512448
|
||||
// Please do try yourself, or think of ways to explain it--we'd love to understand :)
|
||||
if (!is_integer(*p)) { return INVALID_NUMBER(src); } // must start with an integer
|
||||
unsigned char digit = static_cast<unsigned char>(*p - '0');
|
||||
i = digit;
|
||||
p++;
|
||||
|
@ -408,163 +432,67 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
|
|||
++p;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Handle floats if there is a . or e (or both)
|
||||
//
|
||||
int64_t exponent = 0;
|
||||
bool is_float = false;
|
||||
if ('.' == *p) {
|
||||
is_float = true; // At this point we know that we have a float
|
||||
// we continue with the fiction that we have an integer. If the
|
||||
// floating point number is representable as x * 10^z for some integer
|
||||
// z that fits in 53 bits, then we will be able to convert back the
|
||||
// the integer into a float in a lossless manner.
|
||||
is_float = true;
|
||||
++p;
|
||||
const char *const first_after_period = p;
|
||||
if (is_integer(*p)) {
|
||||
unsigned char digit = static_cast<unsigned char>(*p - '0');
|
||||
++p;
|
||||
i = i * 10 + digit; // might overflow + multiplication by 10 is likely
|
||||
// cheaper than arbitrary mult.
|
||||
// we will handle the overflow later
|
||||
} else {
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_invalid_number(src);
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
#ifdef SWAR_NUMBER_PARSING
|
||||
// this helps if we have lots of decimals!
|
||||
// this turns out to be frequent enough.
|
||||
if (is_made_of_eight_digits_fast(p)) {
|
||||
i = i * 100000000 + parse_eight_digits_unrolled(p);
|
||||
p += 8;
|
||||
}
|
||||
#endif
|
||||
while (is_integer(*p)) {
|
||||
unsigned char digit = static_cast<unsigned char>(*p - '0');
|
||||
++p;
|
||||
i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
|
||||
// because we have parse_highprecision_float later.
|
||||
}
|
||||
exponent = first_after_period - p;
|
||||
if (!parse_decimal(src, p, i, exponent)) { return false; }
|
||||
}
|
||||
int digit_count =
|
||||
int(p - start_digits) - 1; // used later to guard against overflows
|
||||
int64_t exp_number = 0; // exponential part
|
||||
int digit_count = int(p - start_digits); // used later to guard against overflows
|
||||
if (('e' == *p) || ('E' == *p)) {
|
||||
is_float = true;
|
||||
++p;
|
||||
bool neg_exp = false;
|
||||
if ('-' == *p) {
|
||||
neg_exp = true;
|
||||
++p;
|
||||
} else if ('+' == *p) {
|
||||
++p;
|
||||
}
|
||||
if (!is_integer(*p)) {
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_invalid_number(src);
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
unsigned char digit = static_cast<unsigned char>(*p - '0');
|
||||
exp_number = digit;
|
||||
p++;
|
||||
if (is_integer(*p)) {
|
||||
digit = static_cast<unsigned char>(*p - '0');
|
||||
exp_number = 10 * exp_number + digit;
|
||||
++p;
|
||||
}
|
||||
if (is_integer(*p)) {
|
||||
digit = static_cast<unsigned char>(*p - '0');
|
||||
exp_number = 10 * exp_number + digit;
|
||||
++p;
|
||||
}
|
||||
while (is_integer(*p)) {
|
||||
if (exp_number > 0x100000000) { // we need to check for overflows
|
||||
// we refuse to parse this
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_invalid_number(src);
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
digit = static_cast<unsigned char>(*p - '0');
|
||||
exp_number = 10 * exp_number + digit;
|
||||
++p;
|
||||
}
|
||||
exponent += (neg_exp ? -exp_number : exp_number);
|
||||
if (!parse_exponent(src, p, exponent)) { return false; }
|
||||
}
|
||||
if (is_float) {
|
||||
// If we frequently had to deal with long strings of digits,
|
||||
// we could extend our code by using a 128-bit integer instead
|
||||
// of a 64-bit integer. However, this is uncommon in practice.
|
||||
if (unlikely((digit_count >= 19))) { // this is uncommon
|
||||
// It is possible that the integer had an overflow.
|
||||
// We have to handle the case where we have 0.0000somenumber.
|
||||
const char *start = start_digits;
|
||||
while ((*start == '0') || (*start == '.')) {
|
||||
start++;
|
||||
}
|
||||
// we over-decrement by one when there is a '.'
|
||||
digit_count -= int(start - start_digits);
|
||||
if (digit_count >= 19) {
|
||||
// Ok, chances are good that we had an overflow!
|
||||
// this is almost never going to get called!!!
|
||||
// we start anew, going slowly!!!
|
||||
// This will happen in the following examples:
|
||||
// 10000000000000000000000000000000000000000000e+308
|
||||
// 3.1415926535897932384626433832795028841971693993751
|
||||
//
|
||||
bool success = slow_float_parsing((const char *) src, writer);
|
||||
// The number was already written, but we made a copy of the writer
|
||||
// when we passed it to the parse_large_integer() function, so
|
||||
writer.skip_double();
|
||||
return success;
|
||||
}
|
||||
}
|
||||
if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) ||
|
||||
(exponent > FASTFLOAT_LARGEST_POWER)) { // this is uncommon!!!
|
||||
// this is almost never going to get called!!!
|
||||
// we start anew, going slowly!!!
|
||||
bool success = slow_float_parsing((const char *) src, writer);
|
||||
// The number was already written, but we made a copy of the writer when we passed it to the
|
||||
// slow_float_parsing() function, so we have to skip those tape spots now that we've returned
|
||||
writer.skip_double();
|
||||
return success;
|
||||
}
|
||||
bool success = true;
|
||||
double d = compute_float_64(exponent, i, negative, &success);
|
||||
if (!success) {
|
||||
// we are almost never going to get here.
|
||||
success = parse_float_strtod((const char *)src, &d);
|
||||
}
|
||||
if (success) {
|
||||
writer.append_double(d);
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_float(d, src);
|
||||
#endif
|
||||
return true;
|
||||
} else {
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_invalid_number(src);
|
||||
#endif
|
||||
return false;
|
||||
return write_float(src, negative, i, start_digits, digit_count, exponent, writer);
|
||||
}
|
||||
|
||||
// The longest negative 64-bit number is 19 digits.
|
||||
// The longest positive 64-bit number is 20 digits.
|
||||
// We do it this way so we don't trigger this branch unless we must.
|
||||
int longest_digit_count = negative ? 19 : 20;
|
||||
if (digit_count > longest_digit_count) { return INVALID_NUMBER(src); }
|
||||
if (digit_count == longest_digit_count) {
|
||||
// Anything negative above INT64_MAX is either invalid or INT64_MIN.
|
||||
if (negative && i > uint64_t(INT64_MAX)) {
|
||||
// If the number is negative and can't fit in a signed integer, it's invalid.
|
||||
if (i > uint64_t(INT64_MAX)+1) { return INVALID_NUMBER(src); }
|
||||
|
||||
// If it's negative, it has to be INT64_MAX+1 now (or INT64_MIN).
|
||||
// C++ can't reliably negate uint64_t INT64_MIN, it seems. Special case it.
|
||||
WRITE_INTEGER(INT64_MIN, src, writer);
|
||||
return is_structural_or_whitespace(*p);
|
||||
}
|
||||
|
||||
// Positive overflow check:
|
||||
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
||||
// biggest uint64_t.
|
||||
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
||||
// If we got here, it's a 20 digit number starting with the digit "1".
|
||||
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
||||
// than 1,553,255,926,290,448,384.
|
||||
// - That is smaller than the smallest possible 20-digit number the user could write:
|
||||
// 10,000,000,000,000,000,000.
|
||||
// - Therefore, if the number is positive and lower than that, it's overflow.
|
||||
// - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX).
|
||||
//
|
||||
if (!negative && (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX))) { return INVALID_NUMBER(src); }
|
||||
}
|
||||
|
||||
// Write unsigned if it doesn't fit in a signed integer.
|
||||
if (i > uint64_t(INT64_MAX)) {
|
||||
WRITE_UNSIGNED(i, src, writer);
|
||||
} else {
|
||||
if (unlikely(digit_count >= 18)) { // this is uncommon!!!
|
||||
// there is a good chance that we had an overflow, so we need
|
||||
// need to recover: we parse the whole thing again.
|
||||
bool success = parse_large_integer(src, writer, found_minus);
|
||||
// The number was already written, but we made a copy of the writer
|
||||
// when we passed it to the parse_large_integer() function, so
|
||||
writer.skip_large_integer();
|
||||
return success;
|
||||
}
|
||||
i = negative ? 0 - i : i;
|
||||
writer.append_s64(i);
|
||||
#ifdef JSON_TEST_NUMBERS // for unit testing
|
||||
found_integer(i, src);
|
||||
#endif
|
||||
WRITE_INTEGER(negative ? 0 - i : i, src, writer);
|
||||
}
|
||||
return is_structural_or_whitespace(*p);
|
||||
|
||||
#endif // SIMDJSON_SKIPNUMBERPARSING
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue