Even faster.
This commit is contained in:
parent
7dd590c43c
commit
78e75a8bae
31
Makefile
31
Makefile
|
@ -6,10 +6,12 @@
|
|||
|
||||
.PHONY: clean cleandist
|
||||
|
||||
CXXFLAGS = -std=c++11 -g2 -O3 -march=native -Wall -Wextra -Wshadow -Iinclude -Ibenchmark/linux -Idependencies/rapidjson/include -Idependencies/sajson/include
|
||||
CXXFLAGS = -std=c++11 -march=native -Wall -Wextra -Wshadow -Iinclude -Ibenchmark/linux -Idependencies/rapidjson/include -Idependencies/sajson/include
|
||||
|
||||
ifeq ($(SANITIZE),1)
|
||||
CXXFLAGS += -g2 -fsanitize=address -fno-omit-frame-pointer -fsanitize=undefined
|
||||
CXXFLAGS += -g3 -O0 -fsanitize=address -fno-omit-frame-pointer -fsanitize=undefined
|
||||
else
|
||||
CXXFLAGS += -O3
|
||||
endif
|
||||
|
||||
EXECUTABLES=parse jsoncheck numberparsingcheck stringparsingcheck minifiercompetition parsingcompetition minify allparserscheckfile
|
||||
|
@ -19,7 +21,6 @@ LIBFILES=src/jsonioutil.cpp src/jsonparser.cpp src/stage1_find_marks.cpp src
|
|||
MINIFIERHEADERS=include/jsonparser/jsonminifier.h include/jsonparser/simdprune_tables.h
|
||||
MINIFIERLIBFILES=src/jsonminifier.cpp
|
||||
|
||||
EXTRA_EXECUTABLES=parsenocheesy parsenodep8
|
||||
|
||||
RAPIDJSON_INCLUDE:=dependencies/rapidjson/include
|
||||
SAJSON_INCLUDE:=dependencies/sajson/include
|
||||
|
@ -79,30 +80,6 @@ allparserscheckfile: tests/allparserscheckfile.cpp $(HEADERS) $(LIBFILES)
|
|||
parsehisto: benchmark/parse.cpp $(HEADERS) $(LIBFILES)
|
||||
$(CXX) $(CXXFLAGS) -o parsehisto benchmark/parse.cpp $(LIBFILES) $(LIBFLAGS) -DBUILDHISTOGRAM
|
||||
|
||||
testflatten: parse parsenocheesy parsenodep8 parsenodep10 parsenodep12
|
||||
for filename in jsonexamples/twitter.json jsonexamples/gsoc-2018.json jsonexamples/citm_catalog.json jsonexamples/canada.json ; do \
|
||||
echo $$filename ; \
|
||||
set -x; \
|
||||
./parsenocheesy $$filename ; \
|
||||
./parse $$filename ; \
|
||||
./parsenodep8 $$filename ; \
|
||||
./parsenodep10 $$filename ; \
|
||||
./parsenodep12 $$filename ; \
|
||||
set +x; \
|
||||
done
|
||||
|
||||
parsenocheesy: benchmark/parse.cpp $(HEADERS) $(LIBFILES)
|
||||
$(CXX) $(CXXFLAGS) -o parsenocheesy benchmark/parse.cpp $(LIBFILES) -DSUPPRESS_CHEESY_FLATTEN
|
||||
|
||||
parsenodep8: benchmark/parse.cpp $(HEADERS) $(LIBFILES)
|
||||
$(CXX) $(CXXFLAGS) -o parsenodep8 benchmark/parse.cpp $(LIBFILES) -DNO_PDEP_PLEASE -DNO_PDEP_WIDTH=8
|
||||
|
||||
parsenodep10: benchmark/parse.cpp $(HEADERS) $(LIBFILES)
|
||||
$(CXX) $(CXXFLAGS) -o parsenodep12 benchmark/parse.cpp $(LIBFILES) -DNO_PDEP_PLEASE -DNO_PDEP_WIDTH=10
|
||||
|
||||
parsenodep12: benchmark/parse.cpp $(HEADERS) $(LIBFILES)
|
||||
$(CXX) $(CXXFLAGS) -o parsenodep12 benchmark/parse.cpp $(LIBFILES) -DNO_PDEP_PLEASE -DNO_PDEP_WIDTH=12
|
||||
|
||||
clean:
|
||||
rm -f $(EXECUTABLES) $(EXTRA_EXECUTABLES)
|
||||
|
||||
|
|
|
@ -49,6 +49,7 @@ const char digittoval[256] = {
|
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1};
|
||||
|
||||
// return true if we have a valid hex between 0000 and FFFF
|
||||
inline bool hex_to_u32(const u8 *src, u32 *res) {
|
||||
u8 v1 = src[0];
|
||||
u8 v2 = src[1];
|
||||
|
@ -59,6 +60,16 @@ inline bool hex_to_u32(const u8 *src, u32 *res) {
|
|||
return (int32_t)(*res) >= 0;
|
||||
}
|
||||
|
||||
// returns a value with the highest bit set if it is not valid
|
||||
uint32_t hex_to_u32_nocheck(const u8 *src) {
|
||||
u8 v1 = src[0];
|
||||
u8 v2 = src[1];
|
||||
u8 v3 = src[2];
|
||||
u8 v4 = src[3];
|
||||
return digittoval[v1] << 12 | digittoval[v2] << 8 | digittoval[v3] << 4 |
|
||||
digittoval[v4];
|
||||
}
|
||||
|
||||
// given a code point cp, writes to c
|
||||
// the utf-8 code, outputting the length in
|
||||
// bytes, if the length is zero, the code point
|
||||
|
@ -83,7 +94,7 @@ inline size_t codepoint_to_utf8(uint32_t cp, u8 *c) {
|
|||
c[1] = ((cp >> 6) & 63) + 128;
|
||||
c[2] = (cp & 63) + 128;
|
||||
return 3;
|
||||
} else if (cp <= 0x10FFFF) {
|
||||
} else if (cp <= 0x10FFFF) { // if you know you have a valid code point, this is not needed
|
||||
c[0] = (cp >> 18) + 240;
|
||||
c[1] = ((cp >> 12) & 63) + 128;
|
||||
c[2] = ((cp >> 6) & 63) + 128;
|
||||
|
|
|
@ -39,10 +39,7 @@ static const u8 escape_map[256] = {
|
|||
// return true if the unicode codepoint was valid
|
||||
// We work in little-endian then swap at write time
|
||||
really_inline bool handle_unicode_codepoint(const u8 **src_ptr, u8 **dst_ptr) {
|
||||
u32 code_point = 0; // read the hex, potentially reading another \u beyond if it is a surrogate pair
|
||||
if (!hex_to_u32(*src_ptr + 2, &code_point)) {
|
||||
return false;
|
||||
}
|
||||
u32 code_point = hex_to_u32_nocheck(*src_ptr + 2);
|
||||
*src_ptr += 6;
|
||||
// check for low surrogate for characters outside the Basic
|
||||
// Multilingual Plane.
|
||||
|
@ -50,21 +47,14 @@ really_inline bool handle_unicode_codepoint(const u8 **src_ptr, u8 **dst_ptr) {
|
|||
if (((*src_ptr)[0] != '\\') || (*src_ptr)[1] != 'u') {
|
||||
return false;
|
||||
}
|
||||
u32 code_point_2 = 0;
|
||||
if (!hex_to_u32(*src_ptr + 2, &code_point_2)) {
|
||||
return false;
|
||||
}
|
||||
if (code_point_2 < 0xdc00 || code_point_2 > 0xdfff) {
|
||||
return false;
|
||||
}
|
||||
u32 code_point_2 = hex_to_u32_nocheck(*src_ptr + 2);
|
||||
code_point =
|
||||
(((code_point - 0xd800) << 10) | (code_point_2 - 0xdc00)) + 0x10000;
|
||||
*src_ptr += 6;
|
||||
}
|
||||
size_t offset = codepoint_to_utf8(code_point, *dst_ptr);
|
||||
// assert(offset > 0);
|
||||
*dst_ptr += offset;
|
||||
return true;
|
||||
return offset > 0;
|
||||
}
|
||||
|
||||
really_inline bool parse_string(const u8 *buf, UNUSED size_t len,
|
||||
|
|
Loading…
Reference in New Issue