134 lines
5.5 KiB
C++
134 lines
5.5 KiB
C++
// This file contains the common code that every implementation uses.
// It is intended to be included multiple times and compiled multiple times.
|
|
|
|
namespace simdjson {
|
|
namespace SIMDJSON_IMPLEMENTATION {
|
|
namespace {
|
|
namespace stringparsing {
|
|
|
|
// Lookup table for the single-character JSON escapes, indexed by the byte
// that follows the backslash. A non-zero entry is the byte the escape expands
// to; a zero entry means the byte is not a simple escape.
//
//   \"  \\  \/   expand to themselves
//   \b -> backspace, \f -> form feed, \n -> newline,
//   \r -> carriage return, \t -> horizontal tab
//
// 'u' is deliberately left at zero here: \uXXXX escapes are more complex and
// are not handled through this table.
static const uint8_t escape_map[256] = {
    // 0x00 - 0x0f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x10 - 0x1f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x20 - 0x2f: '"' (0x22) and '/' (0x2f)
    0, 0, '"', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '/',
    // 0x30 - 0x3f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // 0x40 - 0x4f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x50 - 0x5f: '\\' (0x5c)
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\\', 0, 0, 0,
    // 0x60 - 0x6f: 'b' (0x62), 'f' (0x66), 'n' (0x6e)
    0, 0, '\b', 0, 0, 0, '\f', 0, 0, 0, 0, 0, 0, 0, '\n', 0,
    // 0x70 - 0x7f: 'r' (0x72), 't' (0x74)
    0, 0, '\r', 0, '\t', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // 0x80 - 0xbf: no escapes
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // 0xc0 - 0xff: no escapes
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
|
|
|
|
// handle a unicode codepoint
|
|
// write appropriate values into dest
|
|
// src will advance 6 bytes or 12 bytes
|
|
// dest will advance a variable amount (return via pointer)
|
|
// return true if the unicode codepoint was valid
|
|
// We work in little-endian then swap at write time
|
|
SIMDJSON_WARN_UNUSED
|
|
simdjson_really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr,
|
|
uint8_t **dst_ptr) {
|
|
// jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the
|
|
// conversion isn't valid; we defer the check for this to inside the
|
|
// multilingual plane check
|
|
uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2);
|
|
*src_ptr += 6;
|
|
// check for low surrogate for characters outside the Basic
|
|
// Multilingual Plane.
|
|
if (code_point >= 0xd800 && code_point < 0xdc00) {
|
|
if (((*src_ptr)[0] != '\\') || (*src_ptr)[1] != 'u') {
|
|
return false;
|
|
}
|
|
uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2);
|
|
|
|
// if the first code point is invalid we will get here, as we will go past
|
|
// the check for being outside the Basic Multilingual plane. If we don't
|
|
// find a \u immediately afterwards we fail out anyhow, but if we do,
|
|
// this check catches both the case of the first code point being invalid
|
|
// or the second code point being invalid.
|
|
if ((code_point | code_point_2) >> 16) {
|
|
return false;
|
|
}
|
|
|
|
code_point =
|
|
(((code_point - 0xd800) << 10) | (code_point_2 - 0xdc00)) + 0x10000;
|
|
*src_ptr += 6;
|
|
}
|
|
size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr);
|
|
*dst_ptr += offset;
|
|
return offset > 0;
|
|
}
|
|
|
|
// Unescape a JSON string body from src into dst.
//
// Works chunk by chunk: backslash_and_quote::copy_and_find copies the next
// chunk of src to dst and reports where the first backslash and quote are.
//
// Returns dst advanced to the position of the closing quote once a quote is
// seen before any backslash, or nullptr if an escape sequence is invalid.
SIMDJSON_WARN_UNUSED simdjson_really_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst) {
  for (;;) {
    // Copy the next chunk and locate any backslash / quote inside it.
    auto found = backslash_and_quote::copy_and_find(src, dst);

    // Closing quote comes first: the string ends here.
    if (found.has_quote_first()) {
      return dst + found.quote_index();
    }

    // Neither a backslash nor a quote in this chunk (they can't co-occur at
    // this point): everything copied is plain content, move on to the next
    // chunk.
    if (!found.has_backslash()) {
      src += backslash_and_quote::BYTES_PROCESSED;
      dst += backslash_and_quote::BYTES_PROCESSED;
      continue;
    }

    // A backslash comes first: find out where it is and what follows it.
    auto backslash_pos = found.backslash_index();
    uint8_t escaped = src[backslash_pos + 1];

    if (escaped == 'u') {
      // Move src/dst up to the backslash; the unicode handler advances both
      // of them further itself.
      src += backslash_pos;
      dst += backslash_pos;
      if (!handle_unicode_codepoint(&src, &dst)) {
        return nullptr;
      }
      continue;
    }

    // Simple 1:1 escape: consumes backslash_pos + 2 input bytes and produces
    // backslash_pos + 1 output bytes. Note this may reach beyond the part of
    // the buffer we've actually seen; the original author considered that ok.
    uint8_t replacement = escape_map[escaped];
    if (replacement == 0u) {
      return nullptr; /* bogus escape value is an error */
    }
    dst[backslash_pos] = replacement;
    src += backslash_pos + 2;
    dst += backslash_pos + 1;
  }
  /* can't be reached */
  return nullptr;
}
|
|
|
|
// Parse the JSON string starting at src into the string buffer.
//
// src:                    must point at the opening '"' of the string.
// current_string_buf_loc: in/out. The unescaped bytes are written starting
//                         here; on success it is advanced to the pointer
//                         returned by parse_string.
// s:                      out. On success, views the bytes between the old
//                         and new values of current_string_buf_loc.
//
// Returns STRING_ERROR if src does not start with a quote or if parse_string
// reports failure (nullptr); otherwise SUCCESS. On failure neither
// current_string_buf_loc nor s is modified.
SIMDJSON_UNUSED SIMDJSON_WARN_UNUSED simdjson_really_inline error_code parse_string_to_buffer(const uint8_t *src, uint8_t *&current_string_buf_loc, std::string_view &s) {
  if (*(src++) != '"') { return STRING_ERROR; }
  auto end = stringparsing::parse_string(src, current_string_buf_loc);
  if (!end) { return STRING_ERROR; }
  s = std::string_view(reinterpret_cast<const char *>(current_string_buf_loc),
                       size_t(end - current_string_buf_loc));
  current_string_buf_loc = end;
  return SUCCESS;
}
|
|
|
|
} // namespace stringparsing
|
|
} // unnamed namespace
|
|
} // namespace SIMDJSON_IMPLEMENTATION
|
|
} // namespace simdjson
|