From da093c1982a481e43b25148176a08d8d7bcbd46f Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Tue, 29 Sep 2020 19:17:03 -0400 Subject: [PATCH] Fixing "undefined behavior" issue in new fast_itoa functions (#1186) * Fixing "undefined behavior" issue. * Simplifying our custom atoi * Fixing minor bug --- include/simdjson/dom/serialization-inl.h | 76 ++++++++++++------------ tests/basictests.cpp | 18 +++++- 2 files changed, 55 insertions(+), 39 deletions(-) diff --git a/include/simdjson/dom/serialization-inl.h b/include/simdjson/dom/serialization-inl.h index 96c4dfec..35ef1438 100644 --- a/include/simdjson/dom/serialization-inl.h +++ b/include/simdjson/dom/serialization-inl.h @@ -43,29 +43,36 @@ struct escape_sequence { */ char *fast_itoa(char *output, int64_t value) noexcept { // This is a standard implementation of itoa. - // We first write in reverse order and then reverse. + char buffer[20]; + uint64_t value_positive; + // In general, negating a signed integer is unsafe. if(value < 0) { *output++ = '-'; - value = -value; + // Doing value_positive = -value; while avoiding + // undefined behavior warnings. + // It assumes two complement's which is universal at this + // point in time. + std::memcpy(&value_positive, &value, sizeof(value)); + value_positive = (~value_positive) + 1; // this is a negation + } else { + value_positive = value; } - char *write_pointer = output; - do { - *write_pointer++ = char('0' + (value % 10)); - value /= 10; - } while (value != 0); - // then we reverse the result - char *const answer = write_pointer; - char *second_write_pointer = output; - write_pointer -= 1; - while (second_write_pointer < write_pointer) { - char c1 = *write_pointer; - char c2 = *second_write_pointer; - *second_write_pointer = c1; - *write_pointer = c2; - write_pointer--; - second_write_pointer++; + // We work solely with value_positive. It *might* be easier + // for an optimizing compiler to deal with an unsigned variable + // as far as performance goes. + const char *const end_buffer = buffer + 20; + char *write_pointer = buffer + 19; + // A faster approach is possible if we expect large integers: + // unroll the loop (work in 100s, 1000s) and use some kind of + // memoization. + while(value_positive >= 10) { + *write_pointer-- = char('0' + (value_positive % 10)); + value_positive /= 10; } - return answer; + *write_pointer = char('0' + value_positive); + size_t len = end_buffer - write_pointer; + std::memcpy(output, write_pointer, len); + return output + len; } /**@private * This converts an unsigned integer into a character sequence. @@ -78,25 +85,20 @@ char *fast_itoa(char *output, int64_t value) noexcept { */ char *fast_itoa(char *output, uint64_t value) noexcept { // This is a standard implementation of itoa. - // We first write in reverse order and then reverse. - char *write_pointer = output; - do { - *write_pointer++ = char('0' + (value % 10)); + char buffer[20]; + const char *const end_buffer = buffer + 20; + char *write_pointer = buffer + 19; + // A faster approach is possible if we expect large integers: + // unroll the loop (work in 100s, 1000s) and use some kind of + // memoization. + while(value >= 10) { + *write_pointer-- = char('0' + (value % 10)); value /= 10; - } while (value != 0); - // then we reverse the result - char *const answer = write_pointer; - char *second_write_pointer = output; - write_pointer -= 1; - while (second_write_pointer < write_pointer) { - char c1 = *write_pointer; - char c2 = *second_write_pointer; - *second_write_pointer = c1; - *write_pointer = c2; - write_pointer--; - second_write_pointer++; - } - return answer; + }; + *write_pointer = char('0' + value); + size_t len = end_buffer - write_pointer; + std::memcpy(output, write_pointer, len); + return output + len; } } // anonymous namespace namespace internal { diff --git a/tests/basictests.cpp b/tests/basictests.cpp index 7c04e2dd..bfc0120e 100644 --- a/tests/basictests.cpp +++ b/tests/basictests.cpp @@ -1677,6 +1677,19 @@ namespace to_string_tests { return true; } + bool print_to_string_large_int() { + std::cout << "Running " << __func__ << std::endl; + dom::parser parser; + dom::element doc; + ASSERT_SUCCESS( parser.parse("-922337203685477580"_padded).get(doc) ); + ostringstream s; + s << to_string(doc); + if(s.str() != "-922337203685477580") { + cerr << "failed to parse -922337203685477580" << endl; + return false; + } + return true; + } bool print_to_string_parser_parse() { std::cout << "Running " << __func__ << std::endl; @@ -1785,10 +1798,11 @@ namespace to_string_tests { #endif // SIMDJSON_EXCEPTIONS bool run() { - return print_to_string_parser_parse() && + return print_to_string_large_int() && + print_to_string_parser_parse() && print_to_string_element() && print_to_string_array() && - print_to_string_object() && + print_to_string_object() && #if SIMDJSON_EXCEPTIONS print_to_string_parser_parse_exception() && print_to_string_element_result_exception() &&