Even faster.

2018-11-20 11:56:10 -05:00 · 2018-11-20 11:56:10 -05:00 · 78e75a8bae
parent 7dd590c43c
commit 78e75a8bae
3 changed files with 19 additions and 41 deletions
--- a/31
+++ b/31
@ -6,10 +6,12 @@

 .PHONY: clean cleandist

-CXXFLAGS =  -std=c++11 -g2 -O3 -march=native -Wall -Wextra -Wshadow -Iinclude  -Ibenchmark/linux  -Idependencies/rapidjson/include -Idependencies/sajson/include 
+CXXFLAGS =  -std=c++11  -march=native -Wall -Wextra -Wshadow -Iinclude  -Ibenchmark/linux  -Idependencies/rapidjson/include -Idependencies/sajson/include 

 ifeq ($(SANITIZE),1)
-	CXXFLAGS += -g2 -fsanitize=address -fno-omit-frame-pointer -fsanitize=undefined
+	CXXFLAGS += -g3 -O0  -fsanitize=address -fno-omit-frame-pointer -fsanitize=undefined
+else
+	CXXFLAGS += -O3
 endif

 EXECUTABLES=parse jsoncheck numberparsingcheck stringparsingcheck minifiercompetition parsingcompetition minify allparserscheckfile
@ -19,7 +21,6 @@ LIBFILES=src/jsonioutil.cpp src/jsonparser.cpp src/stage1_find_marks.cpp     src
 MINIFIERHEADERS=include/jsonparser/jsonminifier.h include/jsonparser/simdprune_tables.h
 MINIFIERLIBFILES=src/jsonminifier.cpp

-EXTRA_EXECUTABLES=parsenocheesy parsenodep8

 RAPIDJSON_INCLUDE:=dependencies/rapidjson/include
 SAJSON_INCLUDE:=dependencies/sajson/include
@ -79,30 +80,6 @@ allparserscheckfile: tests/allparserscheckfile.cpp $(HEADERS) $(LIBFILES)
 parsehisto: benchmark/parse.cpp  $(HEADERS) $(LIBFILES)
 	$(CXX) $(CXXFLAGS) -o parsehisto benchmark/parse.cpp $(LIBFILES) $(LIBFLAGS) -DBUILDHISTOGRAM

-testflatten: parse parsenocheesy parsenodep8 parsenodep10 parsenodep12
-	for filename in jsonexamples/twitter.json jsonexamples/gsoc-2018.json jsonexamples/citm_catalog.json jsonexamples/canada.json ; do \
-        	echo $$filename ; \
-		set -x; \
-		./parsenocheesy $$filename ; \
-		./parse $$filename ; \
-		./parsenodep8 $$filename ; \
-		./parsenodep10 $$filename ; \
-		./parsenodep12 $$filename ; \
-		set +x; \
-	done
-
-parsenocheesy: benchmark/parse.cpp  $(HEADERS) $(LIBFILES)
-	$(CXX) $(CXXFLAGS) -o parsenocheesy benchmark/parse.cpp $(LIBFILES) -DSUPPRESS_CHEESY_FLATTEN
-
-parsenodep8: benchmark/parse.cpp  $(HEADERS) $(LIBFILES)
-	$(CXX) $(CXXFLAGS) -o parsenodep8 benchmark/parse.cpp $(LIBFILES) -DNO_PDEP_PLEASE -DNO_PDEP_WIDTH=8
-
-parsenodep10: benchmark/parse.cpp  $(HEADERS) $(LIBFILES)
-	$(CXX) $(CXXFLAGS) -o parsenodep12 benchmark/parse.cpp $(LIBFILES) -DNO_PDEP_PLEASE -DNO_PDEP_WIDTH=10
-
-parsenodep12: benchmark/parse.cpp  $(HEADERS) $(LIBFILES)
-	$(CXX) $(CXXFLAGS) -o parsenodep12 benchmark/parse.cpp $(LIBFILES) -DNO_PDEP_PLEASE -DNO_PDEP_WIDTH=12
-
 clean:
 	rm -f $(EXECUTABLES) $(EXTRA_EXECUTABLES)

--- a/include/jsonparser/jsoncharutils.h
+++ b/include/jsonparser/jsoncharutils.h
@ -49,6 +49,7 @@ const char digittoval[256] = {
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1};

+// return true if we have a valid hex between 0000 and FFFF
 inline bool hex_to_u32(const u8 *src, u32 *res) {
  u8 v1 = src[0];
  u8 v2 = src[1];
@ -59,6 +60,16 @@ inline bool hex_to_u32(const u8 *src, u32 *res) {
  return (int32_t)(*res) >= 0;
 }

+// returns a value with the highest bit set if it is not valid
+uint32_t hex_to_u32_nocheck(const u8 *src) {
+  u8 v1 = src[0];
+  u8 v2 = src[1];
+  u8 v3 = src[2];
+  u8 v4 = src[3];
+  return digittoval[v1] << 12 | digittoval[v2] << 8 | digittoval[v3] << 4 |
+         digittoval[v4];
+}
+
 // given a code point cp, writes to c
 // the utf-8 code, outputting the length in
 // bytes, if the length is zero, the code point
@ -83,7 +94,7 @@ inline size_t codepoint_to_utf8(uint32_t cp, u8 *c) {
    c[1] = ((cp >> 6) & 63) + 128;
    c[2] = (cp & 63) + 128;
    return 3;
-  } else if (cp <= 0x10FFFF) {
+  } else if (cp <= 0x10FFFF) { // if you know you have a valid code point, this is not needed
    c[0] = (cp >> 18) + 240;
    c[1] = ((cp >> 12) & 63) + 128;
    c[2] = ((cp >> 6) & 63) + 128;
--- a/include/jsonparser/stringparsing.h
+++ b/include/jsonparser/stringparsing.h
@ -39,10 +39,7 @@ static const u8 escape_map[256] = {
 // return true if the unicode codepoint was valid
 // We work in little-endian then swap at write time
 really_inline bool handle_unicode_codepoint(const u8 **src_ptr, u8 **dst_ptr) {
-  u32 code_point = 0; // read the hex, potentially reading another \u beyond if it is a surrogate pair
-  if (!hex_to_u32(*src_ptr + 2, &code_point)) {
-    return false;
-  }
+  u32 code_point = hex_to_u32_nocheck(*src_ptr + 2);
  *src_ptr += 6;
  // check for low surrogate for characters outside the Basic
  // Multilingual Plane.
@ -50,21 +47,14 @@ really_inline bool handle_unicode_codepoint(const u8 **src_ptr, u8 **dst_ptr) {
    if (((*src_ptr)[0] != '\\') || (*src_ptr)[1] != 'u') {
      return false;
    }
-    u32 code_point_2 = 0;
-    if (!hex_to_u32(*src_ptr + 2, &code_point_2)) {
-      return false;
-    }
-    if (code_point_2 < 0xdc00 || code_point_2 > 0xdfff) {
-      return false;
-    }
+    u32 code_point_2 = hex_to_u32_nocheck(*src_ptr + 2);
    code_point =
        (((code_point - 0xd800) << 10) | (code_point_2 - 0xdc00)) + 0x10000;
    *src_ptr += 6;
  }
  size_t offset = codepoint_to_utf8(code_point, *dst_ptr); 
-  // assert(offset > 0);
  *dst_ptr += offset;
-  return true;
+  return offset > 0;
 }

 really_inline  bool parse_string(const u8 *buf, UNUSED size_t len,