Even faster.
This commit is contained in:
parent
7dd590c43c
commit
78e75a8bae
31
Makefile
31
Makefile
|
@ -6,10 +6,12 @@
|
||||||
|
|
||||||
.PHONY: clean cleandist
|
.PHONY: clean cleandist
|
||||||
|
|
||||||
CXXFLAGS = -std=c++11 -g2 -O3 -march=native -Wall -Wextra -Wshadow -Iinclude -Ibenchmark/linux -Idependencies/rapidjson/include -Idependencies/sajson/include
|
CXXFLAGS = -std=c++11 -march=native -Wall -Wextra -Wshadow -Iinclude -Ibenchmark/linux -Idependencies/rapidjson/include -Idependencies/sajson/include
|
||||||
|
|
||||||
ifeq ($(SANITIZE),1)
|
ifeq ($(SANITIZE),1)
|
||||||
CXXFLAGS += -g2 -fsanitize=address -fno-omit-frame-pointer -fsanitize=undefined
|
CXXFLAGS += -g3 -O0 -fsanitize=address -fno-omit-frame-pointer -fsanitize=undefined
|
||||||
|
else
|
||||||
|
CXXFLAGS += -O3
|
||||||
endif
|
endif
|
||||||
|
|
||||||
EXECUTABLES=parse jsoncheck numberparsingcheck stringparsingcheck minifiercompetition parsingcompetition minify allparserscheckfile
|
EXECUTABLES=parse jsoncheck numberparsingcheck stringparsingcheck minifiercompetition parsingcompetition minify allparserscheckfile
|
||||||
|
@ -19,7 +21,6 @@ LIBFILES=src/jsonioutil.cpp src/jsonparser.cpp src/stage1_find_marks.cpp src
|
||||||
MINIFIERHEADERS=include/jsonparser/jsonminifier.h include/jsonparser/simdprune_tables.h
|
MINIFIERHEADERS=include/jsonparser/jsonminifier.h include/jsonparser/simdprune_tables.h
|
||||||
MINIFIERLIBFILES=src/jsonminifier.cpp
|
MINIFIERLIBFILES=src/jsonminifier.cpp
|
||||||
|
|
||||||
EXTRA_EXECUTABLES=parsenocheesy parsenodep8
|
|
||||||
|
|
||||||
RAPIDJSON_INCLUDE:=dependencies/rapidjson/include
|
RAPIDJSON_INCLUDE:=dependencies/rapidjson/include
|
||||||
SAJSON_INCLUDE:=dependencies/sajson/include
|
SAJSON_INCLUDE:=dependencies/sajson/include
|
||||||
|
@ -79,30 +80,6 @@ allparserscheckfile: tests/allparserscheckfile.cpp $(HEADERS) $(LIBFILES)
|
||||||
parsehisto: benchmark/parse.cpp $(HEADERS) $(LIBFILES)
|
parsehisto: benchmark/parse.cpp $(HEADERS) $(LIBFILES)
|
||||||
$(CXX) $(CXXFLAGS) -o parsehisto benchmark/parse.cpp $(LIBFILES) $(LIBFLAGS) -DBUILDHISTOGRAM
|
$(CXX) $(CXXFLAGS) -o parsehisto benchmark/parse.cpp $(LIBFILES) $(LIBFLAGS) -DBUILDHISTOGRAM
|
||||||
|
|
||||||
testflatten: parse parsenocheesy parsenodep8 parsenodep10 parsenodep12
|
|
||||||
for filename in jsonexamples/twitter.json jsonexamples/gsoc-2018.json jsonexamples/citm_catalog.json jsonexamples/canada.json ; do \
|
|
||||||
echo $$filename ; \
|
|
||||||
set -x; \
|
|
||||||
./parsenocheesy $$filename ; \
|
|
||||||
./parse $$filename ; \
|
|
||||||
./parsenodep8 $$filename ; \
|
|
||||||
./parsenodep10 $$filename ; \
|
|
||||||
./parsenodep12 $$filename ; \
|
|
||||||
set +x; \
|
|
||||||
done
|
|
||||||
|
|
||||||
parsenocheesy: benchmark/parse.cpp $(HEADERS) $(LIBFILES)
|
|
||||||
$(CXX) $(CXXFLAGS) -o parsenocheesy benchmark/parse.cpp $(LIBFILES) -DSUPPRESS_CHEESY_FLATTEN
|
|
||||||
|
|
||||||
parsenodep8: benchmark/parse.cpp $(HEADERS) $(LIBFILES)
|
|
||||||
$(CXX) $(CXXFLAGS) -o parsenodep8 benchmark/parse.cpp $(LIBFILES) -DNO_PDEP_PLEASE -DNO_PDEP_WIDTH=8
|
|
||||||
|
|
||||||
parsenodep10: benchmark/parse.cpp $(HEADERS) $(LIBFILES)
|
|
||||||
$(CXX) $(CXXFLAGS) -o parsenodep12 benchmark/parse.cpp $(LIBFILES) -DNO_PDEP_PLEASE -DNO_PDEP_WIDTH=10
|
|
||||||
|
|
||||||
parsenodep12: benchmark/parse.cpp $(HEADERS) $(LIBFILES)
|
|
||||||
$(CXX) $(CXXFLAGS) -o parsenodep12 benchmark/parse.cpp $(LIBFILES) -DNO_PDEP_PLEASE -DNO_PDEP_WIDTH=12
|
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -f $(EXECUTABLES) $(EXTRA_EXECUTABLES)
|
rm -f $(EXECUTABLES) $(EXTRA_EXECUTABLES)
|
||||||
|
|
||||||
|
|
|
@ -49,6 +49,7 @@ const char digittoval[256] = {
|
||||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||||
-1, -1, -1, -1, -1, -1, -1, -1, -1};
|
-1, -1, -1, -1, -1, -1, -1, -1, -1};
|
||||||
|
|
||||||
|
// return true if we have a valid hex between 0000 and FFFF
|
||||||
inline bool hex_to_u32(const u8 *src, u32 *res) {
|
inline bool hex_to_u32(const u8 *src, u32 *res) {
|
||||||
u8 v1 = src[0];
|
u8 v1 = src[0];
|
||||||
u8 v2 = src[1];
|
u8 v2 = src[1];
|
||||||
|
@ -59,6 +60,16 @@ inline bool hex_to_u32(const u8 *src, u32 *res) {
|
||||||
return (int32_t)(*res) >= 0;
|
return (int32_t)(*res) >= 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// returns a value with the highest bit set if it is not valid
|
||||||
|
uint32_t hex_to_u32_nocheck(const u8 *src) {
|
||||||
|
u8 v1 = src[0];
|
||||||
|
u8 v2 = src[1];
|
||||||
|
u8 v3 = src[2];
|
||||||
|
u8 v4 = src[3];
|
||||||
|
return digittoval[v1] << 12 | digittoval[v2] << 8 | digittoval[v3] << 4 |
|
||||||
|
digittoval[v4];
|
||||||
|
}
|
||||||
|
|
||||||
// given a code point cp, writes to c
|
// given a code point cp, writes to c
|
||||||
// the utf-8 code, outputting the length in
|
// the utf-8 code, outputting the length in
|
||||||
// bytes, if the length is zero, the code point
|
// bytes, if the length is zero, the code point
|
||||||
|
@ -83,7 +94,7 @@ inline size_t codepoint_to_utf8(uint32_t cp, u8 *c) {
|
||||||
c[1] = ((cp >> 6) & 63) + 128;
|
c[1] = ((cp >> 6) & 63) + 128;
|
||||||
c[2] = (cp & 63) + 128;
|
c[2] = (cp & 63) + 128;
|
||||||
return 3;
|
return 3;
|
||||||
} else if (cp <= 0x10FFFF) {
|
} else if (cp <= 0x10FFFF) { // if you know you have a valid code point, this is not needed
|
||||||
c[0] = (cp >> 18) + 240;
|
c[0] = (cp >> 18) + 240;
|
||||||
c[1] = ((cp >> 12) & 63) + 128;
|
c[1] = ((cp >> 12) & 63) + 128;
|
||||||
c[2] = ((cp >> 6) & 63) + 128;
|
c[2] = ((cp >> 6) & 63) + 128;
|
||||||
|
|
|
@ -39,10 +39,7 @@ static const u8 escape_map[256] = {
|
||||||
// return true if the unicode codepoint was valid
|
// return true if the unicode codepoint was valid
|
||||||
// We work in little-endian then swap at write time
|
// We work in little-endian then swap at write time
|
||||||
really_inline bool handle_unicode_codepoint(const u8 **src_ptr, u8 **dst_ptr) {
|
really_inline bool handle_unicode_codepoint(const u8 **src_ptr, u8 **dst_ptr) {
|
||||||
u32 code_point = 0; // read the hex, potentially reading another \u beyond if it is a surrogate pair
|
u32 code_point = hex_to_u32_nocheck(*src_ptr + 2);
|
||||||
if (!hex_to_u32(*src_ptr + 2, &code_point)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
*src_ptr += 6;
|
*src_ptr += 6;
|
||||||
// check for low surrogate for characters outside the Basic
|
// check for low surrogate for characters outside the Basic
|
||||||
// Multilingual Plane.
|
// Multilingual Plane.
|
||||||
|
@ -50,21 +47,14 @@ really_inline bool handle_unicode_codepoint(const u8 **src_ptr, u8 **dst_ptr) {
|
||||||
if (((*src_ptr)[0] != '\\') || (*src_ptr)[1] != 'u') {
|
if (((*src_ptr)[0] != '\\') || (*src_ptr)[1] != 'u') {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
u32 code_point_2 = 0;
|
u32 code_point_2 = hex_to_u32_nocheck(*src_ptr + 2);
|
||||||
if (!hex_to_u32(*src_ptr + 2, &code_point_2)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (code_point_2 < 0xdc00 || code_point_2 > 0xdfff) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
code_point =
|
code_point =
|
||||||
(((code_point - 0xd800) << 10) | (code_point_2 - 0xdc00)) + 0x10000;
|
(((code_point - 0xd800) << 10) | (code_point_2 - 0xdc00)) + 0x10000;
|
||||||
*src_ptr += 6;
|
*src_ptr += 6;
|
||||||
}
|
}
|
||||||
size_t offset = codepoint_to_utf8(code_point, *dst_ptr);
|
size_t offset = codepoint_to_utf8(code_point, *dst_ptr);
|
||||||
// assert(offset > 0);
|
|
||||||
*dst_ptr += offset;
|
*dst_ptr += offset;
|
||||||
return true;
|
return offset > 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
really_inline bool parse_string(const u8 *buf, UNUSED size_t len,
|
really_inline bool parse_string(const u8 *buf, UNUSED size_t len,
|
||||||
|
|
Loading…
Reference in New Issue