Pick new number parser as winner; move string parsing to own header
This commit is contained in:
parent
577d6792f4
commit
ceb55cc8db
2
Makefile
2
Makefile
|
@ -9,7 +9,7 @@
|
|||
CXXFLAGS = -std=c++11 -g2 -O2 -march=native -Wall -Wextra -Wshadow -Iinclude -Ibenchmark/linux -Idependencies/rapidjson/include -Idependencies/sajson/include
|
||||
EXECUTABLES=parse jsoncheck numberparsingcheck minifiercompetition parsingcompetition minify
|
||||
|
||||
HEADERS= include/jsonparser/numberparsing.h include/jsonparser/jsonparser.h include/jsonparser/common_defs.h include/jsonparser/jsonioutil.h benchmark/benchmark.h benchmark/linux/linux-perf-events.h include/jsonparser/simdjson_internal.h include/jsonparser/stage1_find_marks.h include/jsonparser/stage2_flatten.h include/jsonparser/stage34_unified.h
|
||||
HEADERS= include/jsonparser/stringparsing.h include/jsonparser/numberparsing.h include/jsonparser/jsonparser.h include/jsonparser/common_defs.h include/jsonparser/jsonioutil.h benchmark/benchmark.h benchmark/linux/linux-perf-events.h include/jsonparser/simdjson_internal.h include/jsonparser/stage1_find_marks.h include/jsonparser/stage2_flatten.h include/jsonparser/stage34_unified.h
|
||||
LIBFILES=src/jsonioutil.cpp src/jsonparser.cpp src/stage1_find_marks.cpp src/stage2_flatten.cpp src/stage34_unified.cpp
|
||||
MINIFIERHEADERS=include/jsonparser/jsonminifier.h include/jsonparser/simdprune_tables.h
|
||||
MINIFIERLIBFILES=src/jsonminifier.cpp
|
||||
|
|
|
@ -0,0 +1,184 @@
|
|||
#pragma once
|
||||
|
||||
#include "common_defs.h"
|
||||
#include "jsonparser/simdjson_internal.h"
|
||||
#include "jsonparser/jsoncharutils.h"
|
||||
|
||||
|
||||
// begin copypasta
|
||||
// These chars yield themselves: " \ /
|
||||
// b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab
|
||||
// u not handled in this table as it's complex
|
||||
static const u8 escape_map[256] = {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0.
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0x22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2f,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4.
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0, // 0x5.
|
||||
0, 0, 0x08, 0, 0, 0, 0x12, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0, // 0x6.
|
||||
0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x7.
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
};
|
||||
|
||||
static const u32 leading_zeros_to_utf_bytes[33] = {
|
||||
1, 1, 1, 1, 1, 1, 1, 1, // 7 bits for first one
|
||||
2, 2, 2, 2, // 11 bits for next
|
||||
3, 3, 3, 3, 3, // 16 bits for next
|
||||
4, 4, 4, 4, 4, // 21 bits for next
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // error
|
||||
|
||||
static const u32 UTF_PDEP_MASK[5] = {0x00, // error
|
||||
0x7f, 0x1f3f, 0x0f3f3f, 0x073f3f3f};
|
||||
|
||||
static const u32 UTF_OR_MASK[5] = {0x00, // error
|
||||
0x00, 0xc080, 0xe08080, 0xf0808080};
|
||||
|
||||
|
||||
|
||||
// handle a unicode codepoint
|
||||
// write appropriate values into dest
|
||||
// src will always advance 6 bytes
|
||||
// dest will advance a variable amount (return via pointer)
|
||||
// return true if the unicode codepoint was valid
|
||||
// We work in little-endian then swap at write time
|
||||
really_inline bool handle_unicode_codepoint(const u8 **src_ptr, u8 **dst_ptr) {
|
||||
u32 code_point = 0; // read the hex, potentially reading another \u beyond if
|
||||
// it's a // wacky one
|
||||
if (!hex_to_u32(*src_ptr + 2, &code_point)) {
|
||||
return false;
|
||||
}
|
||||
*src_ptr += 6;
|
||||
// check for the weirdo double-UTF-16 nonsense for things outside Basic
|
||||
// Multilingual Plane.
|
||||
if (code_point >= 0xd800 && code_point < 0xdc00) {
|
||||
// TODO: sanity check and clean up; snippeted from RapidJSON and poorly
|
||||
// understood at the moment
|
||||
if (((*src_ptr)[0] != '\\') || (*src_ptr)[1] != 'u') {
|
||||
return false;
|
||||
}
|
||||
u32 code_point_2 = 0;
|
||||
if (!hex_to_u32(*src_ptr + 2, &code_point_2)) {
|
||||
return false;
|
||||
}
|
||||
if (code_point_2 < 0xdc00 || code_point_2 > 0xdfff) {
|
||||
return false;
|
||||
}
|
||||
code_point =
|
||||
(((code_point - 0xd800) << 10) | (code_point_2 - 0xdc00)) + 0x10000;
|
||||
*src_ptr += 6;
|
||||
}
|
||||
// TODO: check to see whether the below code is nonsense (it's really only a
|
||||
// sketch at this point)
|
||||
u32 lz = __builtin_clz(code_point);
|
||||
u32 utf_bytes = leading_zeros_to_utf_bytes[lz];
|
||||
u32 tmp =
|
||||
_pdep_u32(code_point, UTF_PDEP_MASK[utf_bytes]) | UTF_OR_MASK[utf_bytes];
|
||||
// swap and move to the other side of the register
|
||||
tmp = __builtin_bswap32(tmp);
|
||||
tmp >>= ((4 - utf_bytes) * 8) & 31; // if utf_bytes, this could become a shift
|
||||
// by 32, hence the mask with 31
|
||||
// use memcpy to avoid undefined behavior:
|
||||
std::memcpy(*(u32 **)dst_ptr, &tmp, sizeof(u32)); //**(u32 **)dst_ptr = tmp;
|
||||
*dst_ptr += utf_bytes;
|
||||
return true;
|
||||
}
|
||||
|
||||
really_inline bool parse_string(const u8 *buf, UNUSED size_t len,
|
||||
ParsedJson &pj, u32 depth, u32 offset) {
|
||||
using namespace std;
|
||||
const u8 *src = &buf[offset + 1]; // we know that buf at offset is a "
|
||||
u8 *dst = pj.current_string_buf_loc;
|
||||
#ifdef DEBUG
|
||||
cout << "Entering parse string with offset " << offset << "\n";
|
||||
#endif
|
||||
while (1) {
|
||||
#ifdef DEBUG
|
||||
for (u32 j = 0; j < 32; j++) {
|
||||
char c = *(src + j);
|
||||
if (isprint(c)) {
|
||||
cout << c;
|
||||
} else {
|
||||
cout << '_';
|
||||
}
|
||||
}
|
||||
cout << "| ... string handling input\n";
|
||||
#endif
|
||||
m256 v = _mm256_loadu_si256((const m256 *)(src));
|
||||
u32 bs_bits =
|
||||
(u32)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\')));
|
||||
dumpbits32(bs_bits, "backslash bits 2");
|
||||
u32 quote_bits =
|
||||
(u32)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('"')));
|
||||
dumpbits32(quote_bits, "quote_bits");
|
||||
u32 quote_dist = __builtin_ctz(quote_bits);
|
||||
u32 bs_dist = __builtin_ctz(bs_bits);
|
||||
// store to dest unconditionally - we can overwrite the bits we don't like
|
||||
// later
|
||||
_mm256_storeu_si256((m256 *)(dst), v);
|
||||
#ifdef DEBUG
|
||||
cout << "quote dist: " << quote_dist << " bs dist: " << bs_dist << "\n";
|
||||
#endif
|
||||
|
||||
if (quote_dist < bs_dist) {
|
||||
#ifdef DEBUG
|
||||
cout << "Found end, leaving!\n";
|
||||
#endif
|
||||
// we encountered quotes first. Move dst to point to quotes and exit
|
||||
dst[quote_dist] = 0; // null terminate and get out
|
||||
|
||||
pj.write_tape(depth, pj.current_string_buf_loc - pj.string_buf, '"');
|
||||
|
||||
pj.current_string_buf_loc = dst + quote_dist + 1;
|
||||
|
||||
return true;
|
||||
} else if (quote_dist > bs_dist) {
|
||||
u8 escape_char = src[bs_dist + 1];
|
||||
#ifdef DEBUG
|
||||
cout << "Found escape char: " << escape_char << "\n";
|
||||
#endif
|
||||
// we encountered backslash first. Handle backslash
|
||||
if (escape_char == 'u') {
|
||||
// move src/dst up to the start; they will be further adjusted
|
||||
// within the unicode codepoint handling code.
|
||||
src += bs_dist;
|
||||
dst += bs_dist;
|
||||
if (!handle_unicode_codepoint(&src, &dst)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
// simple 1:1 conversion. Will eat bs_dist+2 characters in input and
|
||||
// write bs_dist+1 characters to output
|
||||
// note this may reach beyond the part of the buffer we've actually
|
||||
// seen. I think this is ok
|
||||
u8 escape_result = escape_map[escape_char];
|
||||
if (!escape_result)
|
||||
return false; // bogus escape value is an error
|
||||
dst[bs_dist] = escape_result;
|
||||
src += bs_dist + 2;
|
||||
dst += bs_dist + 1;
|
||||
}
|
||||
} else {
|
||||
// they are the same. Since they can't co-occur, it means we encountered
|
||||
// neither.
|
||||
src += 32;
|
||||
dst += 32;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
// can't be reached
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -12,6 +12,8 @@
|
|||
#include "jsonparser/common_defs.h"
|
||||
#include "jsonparser/simdjson_internal.h"
|
||||
#include "jsonparser/jsoncharutils.h"
|
||||
#include "jsonparser/numberparsing.h"
|
||||
#include "jsonparser/stringparsing.h"
|
||||
|
||||
#include <iostream>
|
||||
//#define DEBUG
|
||||
|
@ -29,446 +31,6 @@
|
|||
|
||||
using namespace std;
|
||||
|
||||
|
||||
// begin copypasta
|
||||
// These chars yield themselves: " \ /
|
||||
// b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab
|
||||
// u not handled in this table as it's complex
|
||||
const u8 escape_map[256] = {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0.
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0x22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2f,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4.
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0, // 0x5.
|
||||
0, 0, 0x08, 0, 0, 0, 0x12, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0, // 0x6.
|
||||
0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x7.
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
};
|
||||
|
||||
const u32 leading_zeros_to_utf_bytes[33] = {
|
||||
1, 1, 1, 1, 1, 1, 1, 1, // 7 bits for first one
|
||||
2, 2, 2, 2, // 11 bits for next
|
||||
3, 3, 3, 3, 3, // 16 bits for next
|
||||
4, 4, 4, 4, 4, // 21 bits for next
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // error
|
||||
|
||||
const u32 UTF_PDEP_MASK[5] = {0x00, // error
|
||||
0x7f, 0x1f3f, 0x0f3f3f, 0x073f3f3f};
|
||||
|
||||
const u32 UTF_OR_MASK[5] = {0x00, // error
|
||||
0x00, 0xc080, 0xe08080, 0xf0808080};
|
||||
|
||||
|
||||
|
||||
// handle a unicode codepoint
|
||||
// write appropriate values into dest
|
||||
// src will always advance 6 bytes
|
||||
// dest will advance a variable amount (return via pointer)
|
||||
// return true if the unicode codepoint was valid
|
||||
// We work in little-endian then swap at write time
|
||||
really_inline bool handle_unicode_codepoint(const u8 **src_ptr, u8 **dst_ptr) {
|
||||
u32 code_point = 0; // read the hex, potentially reading another \u beyond if
|
||||
// it's a // wacky one
|
||||
if (!hex_to_u32(*src_ptr + 2, &code_point)) {
|
||||
return false;
|
||||
}
|
||||
*src_ptr += 6;
|
||||
// check for the weirdo double-UTF-16 nonsense for things outside Basic
|
||||
// Multilingual Plane.
|
||||
if (code_point >= 0xd800 && code_point < 0xdc00) {
|
||||
// TODO: sanity check and clean up; snippeted from RapidJSON and poorly
|
||||
// understood at the moment
|
||||
if (((*src_ptr)[0] != '\\') || (*src_ptr)[1] != 'u') {
|
||||
return false;
|
||||
}
|
||||
u32 code_point_2 = 0;
|
||||
if (!hex_to_u32(*src_ptr + 2, &code_point_2)) {
|
||||
return false;
|
||||
}
|
||||
if (code_point_2 < 0xdc00 || code_point_2 > 0xdfff) {
|
||||
return false;
|
||||
}
|
||||
code_point =
|
||||
(((code_point - 0xd800) << 10) | (code_point_2 - 0xdc00)) + 0x10000;
|
||||
*src_ptr += 6;
|
||||
}
|
||||
// TODO: check to see whether the below code is nonsense (it's really only a
|
||||
// sketch at this point)
|
||||
u32 lz = __builtin_clz(code_point);
|
||||
u32 utf_bytes = leading_zeros_to_utf_bytes[lz];
|
||||
u32 tmp =
|
||||
_pdep_u32(code_point, UTF_PDEP_MASK[utf_bytes]) | UTF_OR_MASK[utf_bytes];
|
||||
// swap and move to the other side of the register
|
||||
tmp = __builtin_bswap32(tmp);
|
||||
tmp >>= ((4 - utf_bytes) * 8) & 31; // if utf_bytes, this could become a shift
|
||||
// by 32, hence the mask with 31
|
||||
// use memcpy to avoid undefined behavior:
|
||||
std::memcpy(*(u32 **)dst_ptr, &tmp, sizeof(u32)); //**(u32 **)dst_ptr = tmp;
|
||||
*dst_ptr += utf_bytes;
|
||||
return true;
|
||||
}
|
||||
|
||||
really_inline bool parse_string(const u8 *buf, UNUSED size_t len,
|
||||
ParsedJson &pj, u32 depth, u32 offset) {
|
||||
|
||||
const u8 *src = &buf[offset + 1]; // we know that buf at offset is a "
|
||||
u8 *dst = pj.current_string_buf_loc;
|
||||
#ifdef DEBUG
|
||||
cout << "Entering parse string with offset " << offset << "\n";
|
||||
#endif
|
||||
while (1) {
|
||||
#ifdef DEBUG
|
||||
for (u32 j = 0; j < 32; j++) {
|
||||
char c = *(src + j);
|
||||
if (isprint(c)) {
|
||||
cout << c;
|
||||
} else {
|
||||
cout << '_';
|
||||
}
|
||||
}
|
||||
cout << "| ... string handling input\n";
|
||||
#endif
|
||||
m256 v = _mm256_loadu_si256((const m256 *)(src));
|
||||
u32 bs_bits =
|
||||
(u32)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('\\')));
|
||||
dumpbits32(bs_bits, "backslash bits 2");
|
||||
u32 quote_bits =
|
||||
(u32)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, _mm256_set1_epi8('"')));
|
||||
dumpbits32(quote_bits, "quote_bits");
|
||||
u32 quote_dist = __builtin_ctz(quote_bits);
|
||||
u32 bs_dist = __builtin_ctz(bs_bits);
|
||||
// store to dest unconditionally - we can overwrite the bits we don't like
|
||||
// later
|
||||
_mm256_storeu_si256((m256 *)(dst), v);
|
||||
#ifdef DEBUG
|
||||
cout << "quote dist: " << quote_dist << " bs dist: " << bs_dist << "\n";
|
||||
#endif
|
||||
|
||||
if (quote_dist < bs_dist) {
|
||||
#ifdef DEBUG
|
||||
cout << "Found end, leaving!\n";
|
||||
#endif
|
||||
// we encountered quotes first. Move dst to point to quotes and exit
|
||||
dst[quote_dist] = 0; // null terminate and get out
|
||||
|
||||
pj.write_tape(depth, pj.current_string_buf_loc - pj.string_buf, '"');
|
||||
|
||||
pj.current_string_buf_loc = dst + quote_dist + 1;
|
||||
|
||||
return true;
|
||||
} else if (quote_dist > bs_dist) {
|
||||
u8 escape_char = src[bs_dist + 1];
|
||||
#ifdef DEBUG
|
||||
cout << "Found escape char: " << escape_char << "\n";
|
||||
#endif
|
||||
// we encountered backslash first. Handle backslash
|
||||
if (escape_char == 'u') {
|
||||
// move src/dst up to the start; they will be further adjusted
|
||||
// within the unicode codepoint handling code.
|
||||
src += bs_dist;
|
||||
dst += bs_dist;
|
||||
if (!handle_unicode_codepoint(&src, &dst)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
// simple 1:1 conversion. Will eat bs_dist+2 characters in input and
|
||||
// write bs_dist+1 characters to output
|
||||
// note this may reach beyond the part of the buffer we've actually
|
||||
// seen. I think this is ok
|
||||
u8 escape_result = escape_map[escape_char];
|
||||
if (!escape_result)
|
||||
return false; // bogus escape value is an error
|
||||
dst[bs_dist] = escape_result;
|
||||
src += bs_dist + 2;
|
||||
dst += bs_dist + 1;
|
||||
}
|
||||
} else {
|
||||
// they are the same. Since they can't co-occur, it means we encountered
|
||||
// neither.
|
||||
src += 32;
|
||||
dst += 32;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
// can't be reached
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#define NEWPARSENUMBER
|
||||
#ifdef NEWPARSENUMBER
|
||||
#include "jsonparser/numberparsing.h"
|
||||
#else
|
||||
|
||||
// does not validation whatsoever, assumes that all digit
|
||||
// this is CS 101
|
||||
inline u64 naivestrtoll(const char *p, const char *end) {
|
||||
if(p == end) return 0; // should be an error?
|
||||
// this code could get a whole lot smarter if we have many long ints:
|
||||
u64 x = *p - '0';
|
||||
p++;
|
||||
for(;p < end;p++) {
|
||||
x = (x*10) + (*p - '0');
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
// put a parsed version of number (either as a double or a signed long) into the
|
||||
// number buffer, put a 'tag' indicating which type and where it is back onto
|
||||
// the tape at that location return false if we can't parse the number which
|
||||
// means either (a) the number isn't valid, or (b) the number is followed by
|
||||
// something that isn't whitespace, comma or a close }] character which are the
|
||||
// only things that should follow a number at this stage bools to detect what we
|
||||
// found in our initial character already here - we are already switching on 0
|
||||
// vs 1-9 vs - so we may as well keep separate paths where that's useful
|
||||
|
||||
// TODO: see if we really need a separate number_buf or whether we should just
|
||||
// have a generic scratch - would need to align before using for this
|
||||
really_inline bool parse_number(const u8 *buf, UNUSED size_t len,
|
||||
ParsedJson &pj,
|
||||
u32 depth, u32 offset,
|
||||
UNUSED bool found_zero, bool found_minus) {
|
||||
if (found_minus) {
|
||||
offset++;
|
||||
}
|
||||
const u8 *src = &buf[offset];
|
||||
// this can read past the string content, so we need to have overallocated
|
||||
m256 v = _mm256_loadu_si256((const m256 *)(src));
|
||||
u64 error_sump = 0;
|
||||
#ifdef DEBUG
|
||||
for (u32 j = 0; j < 32; j++) {
|
||||
char c = *(src + j);
|
||||
if (isprint(c)) {
|
||||
cout << c;
|
||||
} else {
|
||||
cout << '_';
|
||||
}
|
||||
}
|
||||
cout << "| ... number handling input\n";
|
||||
#endif
|
||||
|
||||
// categories to extract
|
||||
// Digits:
|
||||
// 0 (0x30) - bucket 0
|
||||
// 1-9 (never any distinction except if we didn't get the free kick at 0 due
|
||||
// to the leading minus) (0x31-0x39) - bucket 1
|
||||
// . (0x2e) - bucket 2
|
||||
// E or e - no distinction (0x45/0x65) - bucket 3
|
||||
// + (0x2b) - bucket 4
|
||||
// - (0x2d) - bucket 4
|
||||
// Terminators
|
||||
// Whitespace: 0x20, 0x09, 0x0a, 0x0d - bucket 5+6
|
||||
// Comma and the closes: 0x2c is comma, } is 0x5d, ] is 0x7d - bucket 5+7
|
||||
|
||||
// Another shufti - also a bit hand-hacked. Need to make a better construction
|
||||
const m256 low_nibble_mask = _mm256_setr_epi8(
|
||||
// 0 1 2 3 4 5 6 7 8 9 a b c d e f
|
||||
33, 2, 2, 2, 2, 10, 2, 2, 2, 66, 64, 16, 32, 0xd0, 4, 0, 33, 2, 2, 2, 2,
|
||||
10, 2, 2, 2, 66, 64, 16, 32, 0xd0, 4, 0);
|
||||
const m256 high_nibble_mask = _mm256_setr_epi8(
|
||||
// 0 1 2 3 4 5 6 7 8 9 a b c d e f
|
||||
64, 0, 52, 3, 8, -128, 8, 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 64, 0, 52, 3, 8,
|
||||
-128, 8, 0x80, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||
|
||||
m256 tmp = _mm256_and_si256(
|
||||
_mm256_shuffle_epi8(low_nibble_mask, v),
|
||||
_mm256_shuffle_epi8(
|
||||
high_nibble_mask,
|
||||
_mm256_and_si256(_mm256_srli_epi32(v, 4), _mm256_set1_epi8(0x7f))));
|
||||
#ifdef DEBUG
|
||||
// let us print out the magic:
|
||||
uint8_t buffer[32];
|
||||
_mm256_storeu_si256((__m256i *)buffer,tmp);
|
||||
for(int k = 0; k < 32; k++)
|
||||
printf("%.2x ",buffer[k]);
|
||||
printf("\n");
|
||||
#endif
|
||||
m256 enders_mask = _mm256_set1_epi8(0xe0);
|
||||
m256 tmp_enders = _mm256_cmpeq_epi8(_mm256_and_si256(tmp, enders_mask),
|
||||
_mm256_set1_epi8(0));
|
||||
u32 enders = ~(u32)_mm256_movemask_epi8(tmp_enders);
|
||||
dumpbits32(enders, "ender characters");
|
||||
//dumpbits32_always(enders, "ender characters");
|
||||
|
||||
if (enders == 0) {
|
||||
error_sump = 1;
|
||||
// if enders == 0 we have
|
||||
// a heroically long number string or some garbage
|
||||
}
|
||||
// TODO: make a mask that indicates where our digits are // DANIEL: Isn't that digit_characters?
|
||||
u32 number_mask = ~enders & (enders - 1);
|
||||
dumpbits32(number_mask, "number mask");
|
||||
//dumpbits32_always(number_mask, "number mask");
|
||||
m256 n_mask = _mm256_set1_epi8(0x1f);
|
||||
m256 tmp_n =
|
||||
_mm256_cmpeq_epi8(_mm256_and_si256(tmp, n_mask), _mm256_set1_epi8(0));
|
||||
u32 number_characters = ~(u32)_mm256_movemask_epi8(tmp_n);
|
||||
|
||||
// put something into our error sump if we have something
|
||||
// before our ending characters that isn't a valid character
|
||||
// for the inside of our JSON
|
||||
number_characters &= number_mask;
|
||||
error_sump |= number_characters ^ number_mask;
|
||||
dumpbits32(number_characters, "number characters");
|
||||
|
||||
m256 d_mask = _mm256_set1_epi8(0x03);
|
||||
m256 tmp_d =
|
||||
_mm256_cmpeq_epi8(_mm256_and_si256(tmp, d_mask), _mm256_set1_epi8(0));
|
||||
u32 digit_characters = ~(u32)_mm256_movemask_epi8(tmp_d);
|
||||
digit_characters &= number_mask;
|
||||
dumpbits32(digit_characters, "digit characters");
|
||||
// dumpbits32_always(digit_characters, "digit characters");
|
||||
|
||||
|
||||
m256 p_mask = _mm256_set1_epi8(0x04);
|
||||
m256 tmp_p =
|
||||
_mm256_cmpeq_epi8(_mm256_and_si256(tmp, p_mask), _mm256_set1_epi8(0));
|
||||
u32 decimal_characters = ~(u32)_mm256_movemask_epi8(tmp_p);
|
||||
decimal_characters &= number_mask;
|
||||
dumpbits32(decimal_characters, "decimal characters");
|
||||
|
||||
m256 e_mask = _mm256_set1_epi8(0x08);
|
||||
m256 tmp_e =
|
||||
_mm256_cmpeq_epi8(_mm256_and_si256(tmp, e_mask), _mm256_set1_epi8(0));
|
||||
u32 exponent_characters = ~(u32)_mm256_movemask_epi8(tmp_e);
|
||||
exponent_characters &= number_mask;
|
||||
dumpbits32(exponent_characters, "exponent characters");
|
||||
|
||||
|
||||
m256 zero_mask = _mm256_set1_epi8(0x1);
|
||||
m256 tmp_zero =
|
||||
_mm256_cmpeq_epi8(tmp, zero_mask);
|
||||
u32 zero_characters = (u32)_mm256_movemask_epi8(tmp_zero);
|
||||
dumpbits32(zero_characters, "zero characters");
|
||||
|
||||
// if the zero character is in first position, it
|
||||
// needs to be followed by decimal or exponent or ender (note: we
|
||||
// handle found_minus separately)
|
||||
u32 expo_or_decimal_or_ender = exponent_characters | decimal_characters | enders;
|
||||
error_sump |= zero_characters & 0x01 & (~(expo_or_decimal_or_ender >> 1));
|
||||
|
||||
m256 s_mask = _mm256_set1_epi8(0x10);
|
||||
m256 tmp_s =
|
||||
_mm256_cmpeq_epi8(_mm256_and_si256(tmp, s_mask), _mm256_set1_epi8(0));
|
||||
u32 sign_characters = ~(u32)_mm256_movemask_epi8(tmp_s);
|
||||
sign_characters &= number_mask;
|
||||
dumpbits32(sign_characters, "sign characters");
|
||||
|
||||
u32 digit_edges = ~(digit_characters << 1) & digit_characters;
|
||||
dumpbits32(digit_edges, "digit_edges");
|
||||
|
||||
// check that we have 1-3 'edges' only
|
||||
u32 t = digit_edges;
|
||||
t &= t - 1;
|
||||
t &= t - 1;
|
||||
t &= t - 1;
|
||||
error_sump |= t;
|
||||
|
||||
// check that we start with a digit
|
||||
error_sump |= ~digit_characters & 0x1;
|
||||
|
||||
// having done some checks, get lazy and fall back
|
||||
// to strtoll or strtod
|
||||
// TODO: handle the easy cases ourselves; these are
|
||||
// expensive and we've done a lot of the prepwork.
|
||||
// return errors if strto* fail, otherwise fill in a code on the tape
|
||||
// 'd' for floating point and 'l' for long and put a pointer to the
|
||||
// spot in the buffer.
|
||||
if ( digit_edges == 1) {
|
||||
//if (__builtin_popcount(digit_edges) == 1) { // DANIEL : shouldn't we have digit_edges == 1
|
||||
#define NAIVEINTPARSING
|
||||
#ifdef NAIVEINTPARSING
|
||||
// this is faster, maybe, because we use a naive strtoll
|
||||
// should be all digits?
|
||||
error_sump |= number_characters ^ digit_characters;
|
||||
int stringlength = __builtin_ctz(~digit_characters);
|
||||
const char *end = (const char *)src + stringlength;
|
||||
u64 result = naivestrtoll((const char *)src,end);
|
||||
if (found_minus) { // unfortunate that it is a branch?
|
||||
result = -result;
|
||||
}
|
||||
#else
|
||||
// try a strtoll
|
||||
char *end;
|
||||
s64 result = strtoll((const char *)src, &end, 10);
|
||||
if ((errno != 0) || (end == (const char *)src)) {
|
||||
error_sump |= 1;
|
||||
}
|
||||
error_sump |= is_not_structural_or_whitespace(*end);
|
||||
if (found_minus) {
|
||||
result = -result;
|
||||
}
|
||||
#endif
|
||||
#ifdef DEBUG
|
||||
cout << "Found number " << result << "\n";
|
||||
#endif
|
||||
pj.write_tape_s64(depth, result);
|
||||
} else {
|
||||
// try a strtod
|
||||
char *end;
|
||||
double result = strtod((const char *)src, &end);
|
||||
if ((errno != 0) || (end == (const char *)src)) {
|
||||
error_sump |= 1;
|
||||
}
|
||||
error_sump |= is_not_structural_or_whitespace(*end);
|
||||
if (found_minus) {
|
||||
result = -result;
|
||||
}
|
||||
#ifdef DEBUG
|
||||
cout << "Found number " << result << "\n";
|
||||
#endif
|
||||
pj.write_tape_double(depth, result);
|
||||
// HACK: return true regardless
|
||||
return true; // FIXME: we have a spurious error here
|
||||
}
|
||||
// TODO: check the MSB element is a digit
|
||||
|
||||
// TODO: a whole bunch of checks
|
||||
|
||||
// TODO: <=1 decimal point, eE mark, +- construct
|
||||
|
||||
// TODO: first and last character in mask region must be
|
||||
// digit
|
||||
|
||||
// TODO: if it exists,
|
||||
// Decimal point is after the first cluster of numbers only
|
||||
// and before the second cluster of numbers only. It must
|
||||
// be digit_or_zero . digit_or_zero strictly
|
||||
|
||||
// TODO: eE mark and +- construct are adjacent with eE first
|
||||
// eE mark preceeds final cluster of numbers only
|
||||
// and immediately follows second-last cluster of numbers only (not
|
||||
// necessarily second, as we may have 4e10).
|
||||
// it may suffice to insist that eE is preceeded immediately
|
||||
// by a digit of any kind and that it's followed locally by
|
||||
// a digit immediately or a +- construct then a digit.
|
||||
|
||||
// TODO: if we have both . and the eE mark then the . must
|
||||
// precede the eE mark
|
||||
|
||||
if (error_sump)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
// end copypasta
|
||||
|
||||
really_inline bool is_valid_true_atom(const u8 * loc) {
|
||||
u64 tv = *(const u64 *)"true ";
|
||||
u64 mask4 = 0x00000000ffffffff;
|
||||
|
@ -550,7 +112,6 @@ bool unified_machine(const u8 *buf, size_t len, ParsedJson &pj) {
|
|||
goto SITE_LABEL; \
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////// START STATE /////////////////////////////
|
||||
|
||||
DEBUG_PRINTF("at start\n");
|
||||
|
|
Loading…
Reference in New Issue