Use common parse_digit() function in int parsing

This commit is contained in:
John Keiser 2020-07-01 12:15:17 -07:00
parent 62a39639c2
commit fc0102b079
1 changed files with 43 additions and 45 deletions

View File

@ -268,9 +268,9 @@ really_inline bool parse_decimal(UNUSED const uint8_t *const src, const char *&p
// z that fits in 53 bits, then we will be able to convert back the
// integer into a float in a lossless manner.
const char *const first_after_period = p;
if (!is_integer(*p)) { return INVALID_NUMBER(src); } // There must be at least one digit after the .
unsigned char digit = static_cast<unsigned char>(*p - '0');
if (digit > 9) { return INVALID_NUMBER(src); } // There must be at least one digit after the .
++p;
i = i * 10 + digit; // might overflow + multiplication by 10 is likely
// cheaper than arbitrary mult.
@ -283,16 +283,36 @@ really_inline bool parse_decimal(UNUSED const uint8_t *const src, const char *&p
p += 8;
}
#endif
while (is_integer(*p)) {
digit = static_cast<unsigned char>(*p - '0');
while (digit <= 9) {
++p;
i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
// because we have parse_highprecision_float later.
digit = static_cast<unsigned char>(*p - '0');
}
exponent = first_after_period - p;
return true;
}
// Appends one decimal digit to a running integer value.
//
// If c is an ASCII digit ('0'..'9'), i becomes 10 * i + digit and true is
// returned. Otherwise i is left untouched and false is returned.
//
// The arithmetic is carried out in uint64_t, so an oversized input wraps around
// instead of causing signed-overflow undefined behavior when I is a signed type
// such as int64_t. Detecting that the value overflowed is left to the caller.
template<typename I>
really_inline bool parse_digit(const char c, I &i) {
  // For any c outside '0'..'9' the subtraction, viewed as an unsigned char,
  // is greater than 9, so a single comparison covers both ends of the range.
  const unsigned char digit = static_cast<unsigned char>(c - '0');
  if (digit > 9) { return false; }
  // a multiplication by 10 is cheaper than an arbitrary integer multiplication
  i = static_cast<I>(10 * static_cast<uint64_t>(i) + digit); // may wrap; handled later by the caller
  return true;
}
// Reads the leading digit of a number.
//
// i is always overwritten with (c - '0') reinterpreted as an unsigned char,
// even when c is not a digit; the return value tells the caller whether that
// value is a real decimal digit (0..9) and therefore whether i is meaningful.
template<typename I>
really_inline bool parse_first_digit(const char c, I &i) {
  const auto value = static_cast<unsigned char>(c - '0');
  const bool is_decimal_digit = !(value > 9);
  i = value;
  return is_decimal_digit;
}
really_inline bool parse_exponent(UNUSED const uint8_t *const src, const char *&p, int64_t &exponent) {
bool neg_exp = false;
if ('-' == *p) {
@ -303,26 +323,15 @@ really_inline bool parse_exponent(UNUSED const uint8_t *const src, const char *&
}
// e[+-] must be followed by a number
if (!is_integer(*p)) { return INVALID_NUMBER(src); }
unsigned char digit = static_cast<unsigned char>(*p - '0');
int64_t exp_number = digit;
p++;
if (is_integer(*p)) {
digit = static_cast<unsigned char>(*p - '0');
exp_number = 10 * exp_number + digit;
int64_t exp_number;
if (!parse_first_digit(*p, exp_number)) { return INVALID_NUMBER(src); }
++p;
}
if (is_integer(*p)) {
digit = static_cast<unsigned char>(*p - '0');
exp_number = 10 * exp_number + digit;
if (parse_digit(*p, exp_number)) { ++p; }
if (parse_digit(*p, exp_number)) { ++p; }
while (parse_digit(*p, exp_number)) {
++p;
}
while (is_integer(*p)) {
// we need to check for overflows; we refuse to parse this
if (exp_number > 0x100000000) { return INVALID_NUMBER(src); }
digit = static_cast<unsigned char>(*p - '0');
exp_number = 10 * exp_number + digit;
++p;
}
exponent += (neg_exp ? -exp_number : exp_number);
return true;
@ -403,34 +412,23 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
if (found_minus) {
++p;
negative = true;
// a negative sign must be followed by an integer
if (!is_integer(*p)) { return INVALID_NUMBER(src); }
}
const char *const start_digits = p;
uint64_t i; // an unsigned int avoids signed overflows (which are bad)
if (*p == '0') {
//
// Parse the integer part.
//
const char *const start_digits = p;
uint64_t i;
if (!parse_first_digit(*p, i)) { return INVALID_NUMBER(src); }
++p;
if (is_integer(*p)) { return INVALID_NUMBER(src); } // 0 cannot be followed by an integer
i = 0;
if (i == 0) {
// If the integer starts with 0, just check that there are no more digits.
if (static_cast<unsigned char>(*p - '0') <= 9) { return INVALID_NUMBER(src); } // 0 cannot be followed by an integer
} else {
// NOTE: This is a redundant check--either we're negative, in which case we checked whether this
// is a digit above, or the caller already determined we start with a digit. But removing this
// check seems to make things slower: https://github.com/simdjson/simdjson/pull/990#discussion_r448512448
// Please do try yourself, or think of ways to explain it--we'd love to understand :)
if (!is_integer(*p)) { return INVALID_NUMBER(src); } // must start with an integer
unsigned char digit = static_cast<unsigned char>(*p - '0');
i = digit;
p++;
// the is_made_of_eight_digits_fast routine is unlikely to help here because
// we rarely see large integer parts like 123456789
while (is_integer(*p)) {
digit = static_cast<unsigned char>(*p - '0');
// a multiplication by 10 is cheaper than an arbitrary integer
// multiplication
i = 10 * i + digit; // might overflow, we will handle the overflow later
++p;
}
// Integer starts with 1-9. Parse the rest of the integer
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
while (parse_digit(*p, i)) { p++; }
}
//