Check overflow without reparsing integers

This commit is contained in:
John Keiser 2020-06-26 16:16:00 -07:00
parent e0f3060527
commit d3c089130d
1 changed files with 154 additions and 220 deletions

View File

@ -252,69 +252,6 @@ really_inline bool is_made_of_eight_digits_fast(const char *chars) {
0x3333333333333333); 0x3333333333333333);
} }
// called by parse_number when we know that the output is an integer,
// but where there might be some integer overflow.
// we want to catch overflows!
// Do not call this function directly as it skips some of the checks from
// parse_number
//
// This function will almost never be called!!!
//
template<typename W>
never_inline bool parse_large_integer(const uint8_t *const src,
W writer,
bool found_minus) {
const char *p = reinterpret_cast<const char *>(src);
bool negative = false;
if (found_minus) {
++p;
negative = true;
}
uint64_t i;
if (*p == '0') { // 0 cannot be followed by an integer
++p;
i = 0;
} else {
unsigned char digit = static_cast<unsigned char>(*p - '0');
i = digit;
p++;
// the is_made_of_eight_digits_fast routine is unlikely to help here because
// we rarely see large integer parts like 123456789
while (is_integer(*p)) {
digit = static_cast<unsigned char>(*p - '0');
// This is i = 10 * i + digit, but with overflow checks.
if (mul_overflow(i, 10, &i)) { return INVALID_NUMBER(src); }
if (add_overflow(i, digit, &i)) { return INVALID_NUMBER(src); }
++p;
}
}
if (negative) {
if (i > 0x8000000000000000) { return INVALID_NUMBER(src); } // overflow: -i won't fit in INT32_MIN
if (i == 0x8000000000000000) {
// In two's complement, we cannot represent 0x8000000000000000
// as a positive signed integer, but the negative version is
// possible.
constexpr int64_t signed_answer = INT64_MIN;
WRITE_INTEGER(signed_answer, src, writer);
} else {
// we can negate safely
int64_t signed_answer = -static_cast<int64_t>(i);
WRITE_INTEGER(signed_answer, src, writer);
}
} else {
// we have a positive integer, the contract is that
// we try to represent it as a signed integer and only
// fallback on unsigned integers if absolutely necessary.
if (i < 0x8000000000000000) {
WRITE_INTEGER(i, src, writer);
} else {
WRITE_UNSIGNED(i, src, writer);
}
}
return is_structural_or_whitespace(*p);
}
template<typename W> template<typename W>
bool slow_float_parsing(UNUSED const char * src, W writer) { bool slow_float_parsing(UNUSED const char * src, W writer) {
double d; double d;
@ -325,60 +262,121 @@ bool slow_float_parsing(UNUSED const char * src, W writer) {
return INVALID_NUMBER((const uint8_t *)src); return INVALID_NUMBER((const uint8_t *)src);
} }
template<typename W> really_inline bool parse_decimal(UNUSED const uint8_t *const src, const char *&p, uint64_t &i, int64_t &exponent) {
really_inline bool write_negative_integer(const uint8_t * const src, int digit_count, uint64_t i, W &writer) { // we continue with the fiction that we have an integer. If the
// // floating point number is representable as x * 10^z for some integer
// Handle large numbers // z that fits in 53 bits, then we will be able to convert back the
// // the integer into a float in a lossless manner.
if (unlikely(digit_count >= 18)) { // this is uncommon!!! const char *const first_after_period = p;
// 19 digits or more is an overflow. if (!is_integer(*p)) { return INVALID_NUMBER(src); } // There must be at least one digit after the .
if (digit_count > 18) { return invalid_number(src); }
constexpr const uint64_t int64_min_magnitude = uint64_t(INT64_MAX)+1;
// If it's 18 digits, check if it fits in a negative 64-bit integer.
if (i > int64_min_magnitude) { return invalid_number(src); }
// C++ can't reliably negate uint64_t INT64_MIN, it seems. Special case it. unsigned char digit = static_cast<unsigned char>(*p - '0');
if (i == int64_min_magnitude) { ++p;
return write_signed_integer(INT64_MIN, src, writer); i = i * 10 + digit; // might overflow + multiplication by 10 is likely
} // cheaper than arbitrary mult.
// we will handle the overflow later
#ifdef SWAR_NUMBER_PARSING
// this helps if we have lots of decimals!
// this turns out to be frequent enough.
if (is_made_of_eight_digits_fast(p)) {
i = i * 100000000 + parse_eight_digits_unrolled(p);
p += 8;
}
#endif
while (is_integer(*p)) {
digit = static_cast<unsigned char>(*p - '0');
++p;
i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
// because we have parse_highprecision_float later.
}
exponent = first_after_period - p;
return true;
}
really_inline bool parse_exponent(UNUSED const uint8_t *const src, const char *&p, int64_t &exponent) {
bool neg_exp = false;
if ('-' == *p) {
neg_exp = true;
++p;
} else if ('+' == *p) {
++p;
} }
// Otherwise, just negate and return // e[+-] must be followed by a number
return write_signed_integer(0 - i, src, writer); if (!is_integer(*p)) { return INVALID_NUMBER(src); }
unsigned char digit = static_cast<unsigned char>(*p - '0');
int64_t exp_number = digit;
p++;
if (is_integer(*p)) {
digit = static_cast<unsigned char>(*p - '0');
exp_number = 10 * exp_number + digit;
++p;
}
if (is_integer(*p)) {
digit = static_cast<unsigned char>(*p - '0');
exp_number = 10 * exp_number + digit;
++p;
}
while (is_integer(*p)) {
// we need to check for overflows; we refuse to parse this
if (exp_number > 0x100000000) { return INVALID_NUMBER(src); }
digit = static_cast<unsigned char>(*p - '0');
exp_number = 10 * exp_number + digit;
++p;
}
exponent += (neg_exp ? -exp_number : exp_number);
return true;
} }
template<typename W> template<typename W>
really_inline bool write_positive_integer(const uint8_t * const src, int digit_count, uint64_t i, W &writer) { really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t i, const char * start_digits, int digit_count, int64_t exponent, W &writer) {
// // If we frequently had to deal with long strings of digits,
// Check for overflow // we could extend our code by using a 128-bit integer instead
// // of a 64-bit integer. However, this is uncommon in practice.
if (unlikely(digit_count >= 19)) { // this is uncommon! // digit count is off by 1 because of the decimal (assuming there was one).
// 20 or more digits is overflow. if (unlikely((digit_count-1 >= 19))) { // this is uncommon
if (digit_count > 19) { return invalid_number(src); } // It is possible that the integer had an overflow.
// We have to handle the case where we have 0.0000somenumber.
// - It is 19 digits. const char *start = start_digits;
// - 18,446,744,073,709,551,615 is the biggest uint64_t. while ((*start == '0') || (*start == '.')) {
// start++;
// A leading 2-9 is therefore overflow. (0 cannot be followed by other digits anyway.) }
if (src[0] != uint8_t('1')) { return invalid_number(src); } // we over-decrement by one when there is a '.'
digit_count -= int(start - start_digits);
// - It is 19 digits. if (digit_count >= 19) {
// - There is a leading 1. // Ok, chances are good that we had an overflow!
// - 19,999,999,999,999,999,999 is the biggest number the user could have written. // this is almost never going to get called!!!
// - 18,446,744,073,709,551,615 is the biggest uint64_t. // we start anew, going slowly!!!
// - 1,553,255,926,290,448,383 is the overflow of the biggest number we could store. // This will happen in the following examples:
// - 10,000,000,000,000,000,000 is the smallest number the user could have written. // 10000000000000000000000000000000000000000000e+308
// - The user could not have written an overflow. // 3.1415926535897932384626433832795028841971693993751
// Therefore, any number the user could not have written is overflow. //
if (i < 10000000000000000000ULL) { return invalid_number(src); } bool success = slow_float_parsing((const char *) src, writer);
// The number was already written, but we made a copy of the writer
// when we passed it to the parse_large_integer() function, so
writer.skip_double();
return success;
}
} }
// TODO unlikely wraps the wrong thing here
// Write an unsigned integer if it doesn't fit in int64_t if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) ||
if (i > uint64_t(INT64_MAX)) { (exponent > FASTFLOAT_LARGEST_POWER)) { // this is uncommon!!!
return write_unsigned_integer(i, src, writer); // this is almost never going to get called!!!
// we start anew, going slowly!!!
bool success = slow_float_parsing((const char *) src, writer);
// The number was already written, but we made a copy of the writer when we passed it to the
// slow_float_parsing() function, so we have to skip those tape spots now that we've returned
writer.skip_double();
return success;
} }
// Write a signed integer if it does bool success = true;
return write_signed_integer(i, src, writer); double d = compute_float_64(exponent, i, negative, &success);
if (!success) {
// we are almost never going to get here.
if (!parse_float_strtod((const char *)src, &d)) { return INVALID_NUMBER(src); }
}
WRITE_DOUBLE(d, src, writer);
return true;
} }
// parse the number at src // parse the number at src
@ -430,131 +428,67 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
++p; ++p;
} }
} }
//
// Handle floats if there is a . or e (or both)
//
int64_t exponent = 0; int64_t exponent = 0;
bool is_float = false; bool is_float = false;
if ('.' == *p) { if ('.' == *p) {
is_float = true; // At this point we know that we have a float is_float = true;
// we continue with the fiction that we have an integer. If the
// floating point number is representable as x * 10^z for some integer
// z that fits in 53 bits, then we will be able to convert back the
// the integer into a float in a lossless manner.
++p; ++p;
const char *const first_after_period = p; if (!parse_decimal(src, p, i, exponent)) { return false; }
if (!is_integer(*p)) { return INVALID_NUMBER(src); } // There must be at least one digit after the .
unsigned char digit = static_cast<unsigned char>(*p - '0');
++p;
i = i * 10 + digit; // might overflow + multiplication by 10 is likely
// cheaper than arbitrary mult.
// we will handle the overflow later
#ifdef SWAR_NUMBER_PARSING
// this helps if we have lots of decimals!
// this turns out to be frequent enough.
if (is_made_of_eight_digits_fast(p)) {
i = i * 100000000 + parse_eight_digits_unrolled(p);
p += 8;
}
#endif
while (is_integer(*p)) {
digit = static_cast<unsigned char>(*p - '0');
++p;
i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
// because we have parse_highprecision_float later.
}
exponent = first_after_period - p;
} }
int digit_count = int(p - start_digits) - 1; // used later to guard against overflows int digit_count = int(p - start_digits); // used later to guard against overflows
int64_t exp_number = 0; // exponential part
if (('e' == *p) || ('E' == *p)) { if (('e' == *p) || ('E' == *p)) {
is_float = true; is_float = true;
++p; ++p;
bool neg_exp = false; if (!parse_exponent(src, p, exponent)) { return false; }
if ('-' == *p) {
neg_exp = true;
++p;
} else if ('+' == *p) {
++p;
}
// e[+-] must be followed by a number
if (!is_integer(*p)) { return INVALID_NUMBER(src); }
unsigned char digit = static_cast<unsigned char>(*p - '0');
exp_number = digit;
p++;
if (is_integer(*p)) {
digit = static_cast<unsigned char>(*p - '0');
exp_number = 10 * exp_number + digit;
++p;
}
if (is_integer(*p)) {
digit = static_cast<unsigned char>(*p - '0');
exp_number = 10 * exp_number + digit;
++p;
}
while (is_integer(*p)) {
// we need to check for overflows; we refuse to parse this
if (exp_number > 0x100000000) { return INVALID_NUMBER(src); }
digit = static_cast<unsigned char>(*p - '0');
exp_number = 10 * exp_number + digit;
++p;
}
exponent += (neg_exp ? -exp_number : exp_number);
} }
if (is_float) { if (is_float) {
// If we frequently had to deal with long strings of digits, return write_float(src, negative, i, start_digits, digit_count, exponent, writer);
// we could extend our code by using a 128-bit integer instead }
// of a 64-bit integer. However, this is uncommon in practice.
if (unlikely((digit_count >= 19))) { // this is uncommon // The longest negative 64-bit number is 19 digits.
// It is possible that the integer had an overflow. // The longest positive 64-bit number is 20 digits.
// We have to handle the case where we have 0.0000somenumber. // We do it this way so we don't trigger this branch unless we must.
const char *start = start_digits; int longest_digit_count = negative ? 19 : 20;
while ((*start == '0') || (*start == '.')) { if (digit_count > longest_digit_count) { return INVALID_NUMBER(src); }
start++; if (digit_count == longest_digit_count) {
} // Anything negative above INT64_MAX is either invalid or INT64_MIN.
// we over-decrement by one when there is a '.' if (negative && i > uint64_t(INT64_MAX)) {
digit_count -= int(start - start_digits); // If the number is negative and can't fit in a signed integer, it's invalid.
if (digit_count >= 19) { if (i > uint64_t(INT64_MAX)+1) { return INVALID_NUMBER(src); }
// Ok, chances are good that we had an overflow!
// this is almost never going to get called!!! // If it's negative, it has to be INT64_MAX+1 now (or INT64_MIN).
// we start anew, going slowly!!! // C++ can't reliably negate uint64_t INT64_MIN, it seems. Special case it.
// This will happen in the following examples: WRITE_INTEGER(INT64_MIN, src, writer);
// 10000000000000000000000000000000000000000000e+308 return is_structural_or_whitespace(*p);
// 3.1415926535897932384626433832795028841971693993751
//
bool success = slow_float_parsing((const char *) src, writer);
// The number was already written, but we made a copy of the writer
// when we passed it to the parse_large_integer() function, so
writer.skip_double();
return success;
}
} }
// TODO unlikely wraps the wrong thing here
if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || // Positive overflow check:
(exponent > FASTFLOAT_LARGEST_POWER)) { // this is uncommon!!! // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
// this is almost never going to get called!!! // biggest uint64_t.
// we start anew, going slowly!!! // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
bool success = slow_float_parsing((const char *) src, writer); // If we got here, it's a 20 digit number starting with the digit "1".
// The number was already written, but we made a copy of the writer when we passed it to the // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
// slow_float_parsing() function, so we have to skip those tape spots now that we've returned // than 1,553,255,926,290,448,384.
writer.skip_double(); // - That is smaller than the smallest possible 20-digit number the user could write:
return success; // 10,000,000,000,000,000,000.
} // - Therefore, if the number is positive and lower than that, it's overflow.
bool success = true; // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX).
double d = compute_float_64(exponent, i, negative, &success); //
if (!success) { if (!negative && (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX))) { return INVALID_NUMBER(src); }
// we are almost never going to get here. }
if (!parse_float_strtod((const char *)src, &d)) { return INVALID_NUMBER(src); }
} // Write unsigned if it doesn't fit in a signed integer.
WRITE_DOUBLE(d, src, writer); if (i > uint64_t(INT64_MAX)) {
return true; WRITE_UNSIGNED(i, src, writer);
} else { } else {
if (negative) { WRITE_INTEGER(negative ? 0 - i : i, src, writer);
if (!write_negative_integer(src, digit_count, i, writer)) { return false; }
} else {
if (!write_positive_integer(src, digit_count, i, writer)) { return false; }
}
} }
return is_structural_or_whitespace(*p); return is_structural_or_whitespace(*p);
#endif // SIMDJSON_SKIPNUMBERPARSING #endif // SIMDJSON_SKIPNUMBERPARSING
} }