Merge pull request #1018 from simdjson/jkeiser/simplify-integer-parse

Remove some branches from number parsing
This commit is contained in:
John Keiser 2020-07-16 12:21:43 -07:00 committed by GitHub
commit 90cc1411da
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 100 additions and 126 deletions

View File

@ -21,7 +21,7 @@ namespace arm64 {
// we don't have SSE, so let us use a scalar function // we don't have SSE, so let us use a scalar function
// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/ // credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
static inline uint32_t parse_eight_digits_unrolled(const char *chars) { static really_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) {
uint64_t val; uint64_t val;
memcpy(&val, chars, sizeof(uint64_t)); memcpy(&val, chars, sizeof(uint64_t));
val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8; val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8;

View File

@ -16,13 +16,16 @@ void found_float(double result, const uint8_t *buf);
namespace simdjson { namespace simdjson {
namespace fallback { namespace fallback {
static inline uint32_t parse_eight_digits_unrolled(const char *chars) { static really_inline uint32_t parse_eight_digits_unrolled(const char *chars) {
uint32_t result = 0; uint32_t result = 0;
for (int i=0;i<8;i++) { for (int i=0;i<8;i++) {
result = result*10 + (chars[i] - '0'); result = result*10 + (chars[i] - '0');
} }
return result; return result;
} }
static really_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) {
return parse_eight_digits_unrolled((const char *)chars);
}
#define SWAR_NUMBER_PARSING #define SWAR_NUMBER_PARSING

View File

@ -199,9 +199,9 @@ really_inline double compute_float_64(int64_t power, uint64_t i, bool negative,
return d; return d;
} }
static bool parse_float_strtod(const char *ptr, double *outDouble) { static bool parse_float_strtod(const uint8_t *ptr, double *outDouble) {
char *endptr; char *endptr;
*outDouble = strtod(ptr, &endptr); *outDouble = strtod((const char *)ptr, &endptr);
// Some libraries will set errno = ERANGE when the value is subnormal, // Some libraries will set errno = ERANGE when the value is subnormal,
// yet we may want to be able to parse subnormal values. // yet we may want to be able to parse subnormal values.
// However, we do not want to tolerate NAN or infinite values. // However, we do not want to tolerate NAN or infinite values.
@ -222,22 +222,16 @@ static bool parse_float_strtod(const char *ptr, double *outDouble) {
// a float that does not fit in binary64. JSON for Modern C++ (nlohmann/json) // a float that does not fit in binary64. JSON for Modern C++ (nlohmann/json)
// will flat out throw an exception. // will flat out throw an exception.
// //
if ((endptr == ptr) || (!std::isfinite(*outDouble))) { if ((endptr == (const char *)ptr) || (!std::isfinite(*outDouble))) {
return false; return false;
} }
return true; return true;
} }
really_inline bool is_integer(char c) {
return (c >= '0' && c <= '9');
// this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers
}
// check quickly whether the next 8 chars are made of digits // check quickly whether the next 8 chars are made of digits
// at a glance, it looks better than Mula's // at a glance, it looks better than Mula's
// http://0x80.pl/articles/swar-digits-validate.html // http://0x80.pl/articles/swar-digits-validate.html
really_inline bool is_made_of_eight_digits_fast(const char *chars) { really_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) {
uint64_t val; uint64_t val;
// this can read up to 7 bytes beyond the buffer size, but we require // this can read up to 7 bytes beyond the buffer size, but we require
// SIMDJSON_PADDING of padding // SIMDJSON_PADDING of padding
@ -253,28 +247,34 @@ really_inline bool is_made_of_eight_digits_fast(const char *chars) {
} }
template<typename W> template<typename W>
bool slow_float_parsing(UNUSED const char * src, W writer) { bool slow_float_parsing(UNUSED const uint8_t * src, W writer) {
double d; double d;
if (parse_float_strtod(src, &d)) { if (parse_float_strtod(src, &d)) {
WRITE_DOUBLE(d, (const uint8_t *)src, writer); WRITE_DOUBLE(d, src, writer);
return true; return true;
} }
return INVALID_NUMBER((const uint8_t *)src); return INVALID_NUMBER(src);
} }
really_inline bool parse_decimal(UNUSED const uint8_t *const src, const char *&p, uint64_t &i, int64_t &exponent) { template<typename I>
NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later
really_inline bool parse_digit(const uint8_t c, I &i) {
const uint8_t digit = static_cast<uint8_t>(c - '0');
if (digit > 9) {
return false;
}
// PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication
i = 10 * i + digit; // might overflow, we will handle the overflow later
return true;
}
really_inline bool parse_decimal(UNUSED const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) {
// we continue with the fiction that we have an integer. If the // we continue with the fiction that we have an integer. If the
// floating point number is representable as x * 10^z for some integer // floating point number is representable as x * 10^z for some integer
// z that fits in 53 bits, then we will be able to convert back the // z that fits in 53 bits, then we will be able to convert back the
// integer into a float in a lossless manner. // integer into a float in a lossless manner.
const char *const first_after_period = p; const uint8_t *const first_after_period = p;
if (!is_integer(*p)) { return INVALID_NUMBER(src); } // There must be at least one digit after the .
unsigned char digit = static_cast<unsigned char>(*p - '0');
++p;
i = i * 10 + digit; // might overflow + multiplication by 10 is likely
// cheaper than arbitrary mult.
// we will handle the overflow later
#ifdef SWAR_NUMBER_PARSING #ifdef SWAR_NUMBER_PARSING
// this helps if we have lots of decimals! // this helps if we have lots of decimals!
// this turns out to be frequent enough. // this turns out to be frequent enough.
@ -283,53 +283,43 @@ really_inline bool parse_decimal(UNUSED const uint8_t *const src, const char *&p
p += 8; p += 8;
} }
#endif #endif
while (is_integer(*p)) { // Unrolling the first digit makes a small difference on some implementations (e.g. westmere)
digit = static_cast<unsigned char>(*p - '0'); if (parse_digit(*p, i)) { ++p; }
++p; while (parse_digit(*p, i)) { p++; }
i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
// because we have parse_highprecision_float later.
}
exponent = first_after_period - p; exponent = first_after_period - p;
// Decimal without digits (123.) is illegal
if (exponent == 0) {
return INVALID_NUMBER(src);
}
return true; return true;
} }
really_inline bool parse_exponent(UNUSED const uint8_t *const src, const char *&p, int64_t &exponent) { really_inline bool parse_exponent(UNUSED const uint8_t *const src, const uint8_t *&p, int64_t &exponent) {
bool neg_exp = false; // Exp Sign: -123.456e[-]78
if ('-' == *p) { bool neg_exp = ('-' == *p);
neg_exp = true; if (neg_exp || '+' == *p) { p++; } // Skip + as well
++p;
} else if ('+' == *p) {
++p;
}
// e[+-] must be followed by a number // Exponent: -123.456e-[78]
if (!is_integer(*p)) { return INVALID_NUMBER(src); } auto start_exp = p;
unsigned char digit = static_cast<unsigned char>(*p - '0'); int64_t exp_number = 0;
int64_t exp_number = digit; while (parse_digit(*p, exp_number)) { ++p; }
p++;
if (is_integer(*p)) {
digit = static_cast<unsigned char>(*p - '0');
exp_number = 10 * exp_number + digit;
++p;
}
if (is_integer(*p)) {
digit = static_cast<unsigned char>(*p - '0');
exp_number = 10 * exp_number + digit;
++p;
}
while (is_integer(*p)) {
// we need to check for overflows; we refuse to parse this
if (exp_number > 0x100000000) { return INVALID_NUMBER(src); }
digit = static_cast<unsigned char>(*p - '0');
exp_number = 10 * exp_number + digit;
++p;
}
exponent += (neg_exp ? -exp_number : exp_number); exponent += (neg_exp ? -exp_number : exp_number);
// If there were no digits, it's an error.
// If there were more than 18 digits, we may have overflowed the integer.
if (unlikely(p == start_exp || p > start_exp+18)) {
// Skip leading zeroes: 1e000000000000000000001 is technically valid and doesn't overflow
while (*start_exp == '0') { start_exp++; }
// 19 digits could overflow int64_t and is kind of absurd anyway. We don't
// support exponents smaller than -999,999,999,999,999,999 or bigger
// than 999,999,999,999,999,999 (at most 18 significant digits).
if (p == start_exp || p > start_exp+18) { return INVALID_NUMBER(src); }
}
return true; return true;
} }
template<typename W> template<typename W>
really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t i, const char * start_digits, int digit_count, int64_t exponent, W &writer) { really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, int digit_count, int64_t exponent, W &writer) {
// If we frequently had to deal with long strings of digits, // If we frequently had to deal with long strings of digits,
// we could extend our code by using a 128-bit integer instead // we could extend our code by using a 128-bit integer instead
// of a 64-bit integer. However, this is uncommon in practice. // of a 64-bit integer. However, this is uncommon in practice.
@ -337,7 +327,7 @@ really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t
if (unlikely((digit_count-1 >= 19))) { // this is uncommon if (unlikely((digit_count-1 >= 19))) { // this is uncommon
// It is possible that the integer had an overflow. // It is possible that the integer had an overflow.
// We have to handle the case where we have 0.0000somenumber. // We have to handle the case where we have 0.0000somenumber.
const char *start = start_digits; const uint8_t *start = start_digits;
while ((*start == '0') || (*start == '.')) { while ((*start == '0') || (*start == '.')) {
start++; start++;
} }
@ -351,7 +341,7 @@ really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t
// 10000000000000000000000000000000000000000000e+308 // 10000000000000000000000000000000000000000000e+308
// 3.1415926535897932384626433832795028841971693993751 // 3.1415926535897932384626433832795028841971693993751
// //
bool success = slow_float_parsing((const char *) src, writer); bool success = slow_float_parsing(src, writer);
// The number was already written, but we made a copy of the writer // The number was already written, but we made a copy of the writer
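// when we passed it to the slow_float_parsing() function, so we have to skip those tape spots now that we've returned // when we passed it to the slow_float_parsing() function, so we have to skip those tape spots now that we've returned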
writer.skip_double(); writer.skip_double();
@ -364,7 +354,7 @@ really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t
if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || (exponent > FASTFLOAT_LARGEST_POWER)) { if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || (exponent > FASTFLOAT_LARGEST_POWER)) {
// this is almost never going to get called!!! // this is almost never going to get called!!!
// we start anew, going slowly!!! // we start anew, going slowly!!!
bool success = slow_float_parsing((const char *) src, writer); bool success = slow_float_parsing(src, writer);
// The number was already written, but we made a copy of the writer when we passed it to the // The number was already written, but we made a copy of the writer when we passed it to the
// slow_float_parsing() function, so we have to skip those tape spots now that we've returned // slow_float_parsing() function, so we have to skip those tape spots now that we've returned
writer.skip_double(); writer.skip_double();
@ -374,12 +364,23 @@ really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t
double d = compute_float_64(exponent, i, negative, &success); double d = compute_float_64(exponent, i, negative, &success);
if (!success) { if (!success) {
// we are almost never going to get here. // we are almost never going to get here.
if (!parse_float_strtod((const char *)src, &d)) { return INVALID_NUMBER(src); } if (!parse_float_strtod(src, &d)) { return INVALID_NUMBER(src); }
} }
WRITE_DOUBLE(d, src, writer); WRITE_DOUBLE(d, src, writer);
return true; return true;
} }
// for performance analysis, it is sometimes useful to skip parsing
#ifdef SIMDJSON_SKIPNUMBERPARSING
template<typename W>
really_inline bool parse_number(const uint8_t *const, W &writer) {
writer.append_s64(0); // always write zero
return true; // always succeeds
}
#else
// parse the number at src // parse the number at src
// define JSON_TEST_NUMBERS for unit testing // define JSON_TEST_NUMBERS for unit testing
// //
@ -390,48 +391,25 @@ really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t
// //
// Our objective is accurate parsing (ULP of 0) at high speed. // Our objective is accurate parsing (ULP of 0) at high speed.
template<typename W> template<typename W>
really_inline bool parse_number(UNUSED const uint8_t *const src, really_inline bool parse_number(const uint8_t *const src, W &writer) {
UNUSED bool found_minus,
W &writer) {
#ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes
// useful to skip parsing
writer.append_s64(0); // always write zero
return true; // always succeeds
#else
const char *p = reinterpret_cast<const char *>(src);
bool negative = false;
if (found_minus) {
++p;
negative = true;
// a negative sign must be followed by an integer
if (!is_integer(*p)) { return INVALID_NUMBER(src); }
}
const char *const start_digits = p;
uint64_t i; // an unsigned int avoids signed overflows (which are bad) //
if (*p == '0') { // Check for minus sign
++p; //
if (is_integer(*p)) { return INVALID_NUMBER(src); } // 0 cannot be followed by an integer bool negative = (*src == '-');
i = 0; const uint8_t *p = src + negative;
} else {
// NOTE: This is a redundant check--either we're negative, in which case we checked whether this //
// is a digit above, or the caller already determined we start with a digit. But removing this // Parse the integer part.
// check seems to make things slower: https://github.com/simdjson/simdjson/pull/990#discussion_r448512448 //
// Please do try yourself, or think of ways to explain it--we'd love to understand :) // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
if (!is_integer(*p)) { return INVALID_NUMBER(src); } // must start with an integer const uint8_t *const start_digits = p;
unsigned char digit = static_cast<unsigned char>(*p - '0'); uint64_t i = 0;
i = digit; while (parse_digit(*p, i)) { p++; }
p++;
// the is_made_of_eight_digits_fast routine is unlikely to help here because // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
// we rarely see large integer parts like 123456789 int digit_count = int(p - start_digits);
while (is_integer(*p)) { if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return INVALID_NUMBER(src); }
digit = static_cast<unsigned char>(*p - '0');
// a multiplication by 10 is cheaper than an arbitrary integer
// multiplication
i = 10 * i + digit; // might overflow, we will handle the overflow later
++p;
}
}
// //
// Handle floats if there is a . or e (or both) // Handle floats if there is a . or e (or both)
@ -442,8 +420,8 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
is_float = true; is_float = true;
++p; ++p;
if (!parse_decimal(src, p, i, exponent)) { return false; } if (!parse_decimal(src, p, i, exponent)) { return false; }
digit_count = int(p - start_digits); // used later to guard against overflows
} }
int digit_count = int(p - start_digits); // used later to guard against overflows
if (('e' == *p) || ('E' == *p)) { if (('e' == *p) || ('E' == *p)) {
is_float = true; is_float = true;
++p; ++p;
@ -492,9 +470,9 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
WRITE_INTEGER(negative ? 0 - i : i, src, writer); WRITE_INTEGER(negative ? 0 - i : i, src, writer);
} }
return is_structural_or_whitespace(*p); return is_structural_or_whitespace(*p);
}
#endif // SIMDJSON_SKIPNUMBERPARSING #endif // SIMDJSON_SKIPNUMBERPARSING
}
} // namespace numberparsing } // namespace numberparsing
} // namespace stage2 } // namespace stage2

View File

@ -159,17 +159,17 @@ struct structural_parser : structural_iterator {
return false; return false;
} }
WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) { WARN_UNUSED really_inline bool parse_number(const uint8_t *src) {
log_value("number"); log_value("number");
bool succeeded = numberparsing::parse_number(src, found_minus, tape); bool succeeded = numberparsing::parse_number(src, tape);
if (!succeeded) { log_error("Invalid number"); } if (!succeeded) { log_error("Invalid number"); }
return !succeeded; return !succeeded;
} }
WARN_UNUSED really_inline bool parse_number(bool found_minus) { WARN_UNUSED really_inline bool parse_number() {
return parse_number(current(), found_minus); return parse_number(current());
} }
really_inline bool parse_number_with_space_terminated_copy(const bool is_negative) { really_inline bool parse_number_with_space_terminated_copy() {
/** /**
* We need to make a copy to make sure that the string is space terminated. * We need to make a copy to make sure that the string is space terminated.
* This is not about padding the input, which should already be padded up * This is not about padding the input, which should already be padded up
@ -190,7 +190,7 @@ struct structural_parser : structural_iterator {
memcpy(copy, buf, parser.len); memcpy(copy, buf, parser.len);
memset(copy + parser.len, ' ', SIMDJSON_PADDING); memset(copy + parser.len, ' ', SIMDJSON_PADDING);
size_t idx = *current_structural; size_t idx = *current_structural;
bool result = parse_number(&copy[idx], is_negative); // parse_number does not throw bool result = parse_number(&copy[idx]); // parse_number does not throw
free(copy); free(copy);
return result; return result;
} }
@ -214,12 +214,10 @@ struct structural_parser : structural_iterator {
FAIL_IF( !atomparsing::is_valid_null_atom(current()) ); FAIL_IF( !atomparsing::is_valid_null_atom(current()) );
tape.append(0, internal::tape_type::NULL_VALUE); tape.append(0, internal::tape_type::NULL_VALUE);
return continue_state; return continue_state;
case '-':
case '0': case '1': case '2': case '3': case '4': case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9': case '5': case '6': case '7': case '8': case '9':
FAIL_IF( parse_number(false) ); FAIL_IF( parse_number() );
return continue_state;
case '-':
FAIL_IF( parse_number(true) );
return continue_state; return continue_state;
case '{': case '{':
FAIL_IF( start_object(continue_state) ); FAIL_IF( start_object(continue_state) );
@ -375,18 +373,13 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p
FAIL_IF( !atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len()) ); FAIL_IF( !atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len()) );
parser.tape.append(0, internal::tape_type::NULL_VALUE); parser.tape.append(0, internal::tape_type::NULL_VALUE);
goto finish; goto finish;
case '-':
case '0': case '1': case '2': case '3': case '4': case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9': case '5': case '6': case '7': case '8': case '9':
// Next line used to be an interesting functional programming exercise with // Next line used to be an interesting functional programming exercise with
// a lambda that gets passed to another function via a closure. This would confuse the // a lambda that gets passed to another function via a closure. This would confuse the
// clangcl compiler under Visual Studio 2019 (recent release). // clangcl compiler under Visual Studio 2019 (recent release).
{ if(parser.parse_number_with_space_terminated_copy(false)) { goto error; }} FAIL_IF(parser.parse_number_with_space_terminated_copy());
goto finish;
case '-':
// Next line used to be an interesting functional programming exercise with
// a lambda that gets passed to another function via a closure. This would confuse the
// clangcl compiler under Visual Studio 2019 (recent release).
{ if(parser.parse_number_with_space_terminated_copy(true)) { goto error; }}
goto finish; goto finish;
default: default:
parser.log_error("Document starts with a non-value character"); parser.log_error("Document starts with a non-value character");

View File

@ -19,7 +19,7 @@ void found_float(double result, const uint8_t *buf);
TARGET_HASWELL TARGET_HASWELL
namespace simdjson { namespace simdjson {
namespace haswell { namespace haswell {
static inline uint32_t parse_eight_digits_unrolled(const char *chars) { static really_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) {
// this actually computes *16* values so we are being wasteful. // this actually computes *16* values so we are being wasteful.
const __m128i ascii0 = _mm_set1_epi8('0'); const __m128i ascii0 = _mm_set1_epi8('0');
const __m128i mul_1_10 = const __m128i mul_1_10 =

View File

@ -20,7 +20,7 @@ void found_float(double result, const uint8_t *buf);
TARGET_WESTMERE TARGET_WESTMERE
namespace simdjson { namespace simdjson {
namespace westmere { namespace westmere {
static inline uint32_t parse_eight_digits_unrolled(const char *chars) { static really_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) {
// this actually computes *16* values so we are being wasteful. // this actually computes *16* values so we are being wasteful.
const __m128i ascii0 = _mm_set1_epi8('0'); const __m128i ascii0 = _mm_set1_epi8('0');
const __m128i mul_1_10 = const __m128i mul_1_10 =