simdjson/include/simdjson/generic/stringparsing.h

134 lines
5.5 KiB
C++

// This file contains the common code every implementation uses
// It is intended to be included multiple times and compiled multiple times
namespace simdjson {
namespace SIMDJSON_IMPLEMENTATION {
namespace {
namespace stringparsing {
// begin copypasta
// These chars yield themselves: " \ /
// b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab
// u not handled in this table as it's complex
static const uint8_t escape_map[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0.
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0x22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2f,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4.
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0, // 0x5.
0, 0, 0x08, 0, 0, 0, 0x0c, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0, // 0x6.
0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x7.
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
// handle a unicode codepoint
// write appropriate values into dest
// src will advance 6 bytes or 12 bytes
// dest will advance a variable amount (return via pointer)
// return true if the unicode codepoint was valid
// We work in little-endian then swap at write time
SIMDJSON_WARN_UNUSED
simdjson_really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr,
uint8_t **dst_ptr) {
// jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the
// conversion isn't valid; we defer the check for this to inside the
// multilingual plane check
uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2);
*src_ptr += 6;
// check for low surrogate for characters outside the Basic
// Multilingual Plane.
if (code_point >= 0xd800 && code_point < 0xdc00) {
if (((*src_ptr)[0] != '\\') || (*src_ptr)[1] != 'u') {
return false;
}
uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2);
// if the first code point is invalid we will get here, as we will go past
// the check for being outside the Basic Multilingual plane. If we don't
// find a \u immediately afterwards we fail out anyhow, but if we do,
// this check catches both the case of the first code point being invalid
// or the second code point being invalid.
if ((code_point | code_point_2) >> 16) {
return false;
}
code_point =
(((code_point - 0xd800) << 10) | (code_point_2 - 0xdc00)) + 0x10000;
*src_ptr += 6;
}
size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr);
*dst_ptr += offset;
return offset > 0;
}
SIMDJSON_WARN_UNUSED simdjson_really_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst) {
while (1) {
// Copy the next n bytes, and find the backslash and quote in them.
auto bs_quote = backslash_and_quote::copy_and_find(src, dst);
// If the next thing is the end quote, copy and return
if (bs_quote.has_quote_first()) {
// we encountered quotes first. Move dst to point to quotes and exit
return dst + bs_quote.quote_index();
}
if (bs_quote.has_backslash()) {
/* find out where the backspace is */
auto bs_dist = bs_quote.backslash_index();
uint8_t escape_char = src[bs_dist + 1];
/* we encountered backslash first. Handle backslash */
if (escape_char == 'u') {
/* move src/dst up to the start; they will be further adjusted
within the unicode codepoint handling code. */
src += bs_dist;
dst += bs_dist;
if (!handle_unicode_codepoint(&src, &dst)) {
return nullptr;
}
} else {
/* simple 1:1 conversion. Will eat bs_dist+2 characters in input and
* write bs_dist+1 characters to output
* note this may reach beyond the part of the buffer we've actually
* seen. I think this is ok */
uint8_t escape_result = escape_map[escape_char];
if (escape_result == 0u) {
return nullptr; /* bogus escape value is an error */
}
dst[bs_dist] = escape_result;
src += bs_dist + 2;
dst += bs_dist + 1;
}
} else {
/* they are the same. Since they can't co-occur, it means we
* encountered neither. */
src += backslash_and_quote::BYTES_PROCESSED;
dst += backslash_and_quote::BYTES_PROCESSED;
}
}
/* can't be reached */
return nullptr;
}
SIMDJSON_UNUSED SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_string_to_buffer(const uint8_t *src, uint8_t *&current_string_buf_loc, std::string_view &s) {
if (*(src++) != '"') { return STRING_ERROR; }
auto end = stringparsing::parse_string(src, current_string_buf_loc);
if (!end) { return STRING_ERROR; }
s = std::string_view((const char *)current_string_buf_loc, end-current_string_buf_loc);
current_string_buf_loc = end;
return SUCCESS;
}
} // namespace stringparsing
} // unnamed namespace
} // namespace SIMDJSON_IMPLEMENTATION
} // namespace simdjson