Trying a detailed analysis.

2018-12-19 21:23:37 -05:00 · 2018-12-19 21:23:37 -05:00 · 20133963bc
parent 7d37dd5dea
commit 20133963bc
6 changed files with 73 additions and 17 deletions
--- a/21
+++ b/21
@ -25,6 +25,7 @@ endif
 MAINEXECUTABLES=parse minify json2json
 TESTEXECUTABLES=jsoncheck numberparsingcheck stringparsingcheck 
 COMPARISONEXECUTABLES=minifiercompetition parsingcompetition parseandstatcompetition distinctuseridcompetition allparserscheckfile
+SUPPLEMENTARYEXECUTABLES=parse_noutf8validation parse_nonumberparsing parse_nostringparsing

 HEADERS= include/simdjson/simdutf8check.h include/simdjson/stringparsing.h include/simdjson/numberparsing.h include/simdjson/jsonparser.h include/simdjson/common_defs.h include/simdjson/jsonioutil.h benchmark/benchmark.h benchmark/linux/linux-perf-events.h include/simdjson/parsedjson.h include/simdjson/stage1_find_marks.h include/simdjson/stage2_flatten.h include/simdjson/stage34_unified.h include/simdjson/jsoncharutils.h include/simdjson/jsonformatutils.h
 LIBFILES=src/jsonioutil.cpp src/jsonparser.cpp src/stage1_find_marks.cpp     src/stage2_flatten.cpp        src/stage34_unified.cpp
@ -85,6 +86,16 @@ $(UJSON4C_INCLUDE):
 parse: benchmark/parse.cpp $(HEADERS) $(LIBFILES)
 	$(CXX) $(CXXFLAGS) -o parse $(LIBFILES) benchmark/parse.cpp $(LIBFLAGS)

+parse_noutf8validation: benchmark/parse.cpp $(HEADERS) $(LIBFILES) # parse benchmark with UTF-8 validation compiled out
+	$(CXX) $(CXXFLAGS) -DSIMDJSON_SKIPUTF8VALIDATION -o $@ $(LIBFILES) $< $(LIBFLAGS)
+
+parse_nonumberparsing: benchmark/parse.cpp $(HEADERS) $(LIBFILES) # parse benchmark whose parse_number only writes a zero to the tape
+	$(CXX) $(CXXFLAGS) -DSIMDJSON_SKIPNUMBERPARSING -o $@ $(LIBFILES) $< $(LIBFLAGS)
+
+parse_nostringparsing: benchmark/parse.cpp $(HEADERS) $(LIBFILES) # parse benchmark whose parse_string only writes a tape entry
+	$(CXX) $(CXXFLAGS) -DSIMDJSON_SKIPSTRINGPARSING -o $@ $(LIBFILES) $< $(LIBFLAGS)
+
+
 jsoncheck:tests/jsoncheck.cpp $(HEADERS) $(LIBFILES)
 	$(CXX) $(CXXFLAGS) -o jsoncheck $(LIBFILES) tests/jsoncheck.cpp -I. $(LIBFLAGS)

@ -115,11 +126,9 @@ parseandstatcompetition: benchmark/parseandstatcompetition.cpp $(HEADERS) $(LIBF
 distinctuseridcompetition: benchmark/distinctuseridcompetition.cpp $(HEADERS) $(LIBFILES) 
 	$(CXX) $(CXXFLAGS)  -o distinctuseridcompetition $(LIBFILES) benchmark/distinctuseridcompetition.cpp  -I. $(LIBFLAGS) $(COREDEPSINCLUDE)

-
-parsingcompetition: benchmark/parsingcompetition.cpp $(HEADERS) $(LIBFILES) #$(EXTRAOBJECTS) 
+parsingcompetition: benchmark/parsingcompetition.cpp $(HEADERS) $(LIBFILES) 
 	$(CXX) $(CXXFLAGS)  -o parsingcompetition $(LIBFILES) benchmark/parsingcompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE) 
-#$(EXTRADEPSINCLUDE)
-#$(EXTRAOBJECTS) 
+

 allparserscheckfile: tests/allparserscheckfile.cpp $(HEADERS) $(LIBFILES) $(EXTRAOBJECTS) 
 	$(CXX) $(CXXFLAGS) -o allparserscheckfile $(LIBFILES) tests/allparserscheckfile.cpp $(EXTRAOBJECTS) -I. $(LIBFLAGS) $(COREDEPSINCLUDE) $(EXTRADEPSINCLUDE)
@ -132,7 +141,7 @@ cppcheck:


 clean:
-	rm -f $(EXTRAOBJECTS) $(MAINEXECUTABLES) $(EXTRA_EXECUTABLES) $(TESTEXECUTABLES) $(COMPARISONEXECUTABLES)
+	rm -f $(EXTRAOBJECTS) $(MAINEXECUTABLES) $(EXTRA_EXECUTABLES) $(TESTEXECUTABLES) $(COMPARISONEXECUTABLES) $(SUPPLEMENTARYEXECUTABLES)

 cleandist:
-	rm -f $(EXTRAOBJECTS) $(MAINEXECUTABLES) $(EXTRA_EXECUTABLES) $(TESTEXECUTABLES) $(COMPARISONEXECUTABLES)
+	rm -f $(EXTRAOBJECTS) $(MAINEXECUTABLES) $(EXTRA_EXECUTABLES) $(TESTEXECUTABLES) $(COMPARISONEXECUTABLES) $(SUPPLEMENTARYEXECUTABLES)
--- a/include/simdjson/numberparsing.h
+++ b/include/simdjson/numberparsing.h
@ -340,6 +340,10 @@ static really_inline bool parse_number(const u8 *const buf,
                                       ParsedJson &pj, 
                                       const u32 offset, 
                                       bool found_minus) {
+#ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes useful to skip parsing
+  pj.write_tape_s64(0); // always write zero
+  return true; // always succeeds
+#else
  const char *p = (const char *)(buf + offset);
  bool negative = false;
  if (found_minus) {
@ -493,4 +497,5 @@ static really_inline bool parse_number(const u8 *const buf,
 #endif
  }
  return  is_structural_or_whitespace(*p);
+#endif // SIMDJSON_SKIPNUMBERPARSING
 }
--- a/include/simdjson/stringparsing.h
+++ b/include/simdjson/stringparsing.h
@ -61,7 +61,10 @@ really_inline bool handle_unicode_codepoint(const u8 **src_ptr, u8 **dst_ptr) {
 WARN_UNUSED
 really_inline  bool parse_string(const u8 *buf, UNUSED size_t len,
                                ParsedJson &pj, UNUSED const u32 depth, u32 offset) {
-  using namespace std;
+#ifdef SIMDJSON_SKIPSTRINGPARSING // for performance analysis, it is sometimes useful to skip parsing
+  pj.write_tape(0, '"');// don't bother with the string parsing at all
+  return true; // always succeeds
+#else
  const u8 *src = &buf[offset + 1]; // we know that buf at offset is a "
  u8 *dst = pj.current_string_buf_loc;
 #ifdef JSON_TEST_STRINGS // for unit testing
@ -195,6 +198,7 @@ really_inline  bool parse_string(const u8 *buf, UNUSED size_t len,
  }
  // can't be reached
  return true;
+#endif // SIMDJSON_SKIPSTRINGPARSING
 }


--- a/scripts/plotparse.sh
+++ b/scripts/plotparse.sh
@ -13,15 +13,35 @@ if [ "$os" = "Linux" ]; then
  echo "You are using linux."
  echo "We are going to just parse using simdjson, and collect perf stats."

-  make parse
+  make parse parse_noutf8validation parse_nonumberparsing parse_nostringparsing
  myfile=$plotdirectory"/parselinuxtable.txt"
  echo $myfile
  echo "" > $myfile
+
+  myfile_noutf8validation=$plotdirectory"/parselinuxtable_noutf8validation.txt" # scratch table for the parse_noutf8validation runs
+  echo $myfile_noutf8validation
+  echo "" > $myfile_noutf8validation
+  # scratch table for the parse_nonumberparsing runs; it has its own variable so $myfile keeps naming the baseline table
+  myfile_nonumberparsing=$plotdirectory"/parselinuxtable_nonumberparsing.txt"
+  echo $myfile_nonumberparsing
+  echo "" > $myfile_nonumberparsing
+  # scratch table for the parse_nostringparsing runs
+  myfile_nostringparsing=$plotdirectory"/parselinuxtable_nostringparsing.txt"
+  echo $myfile_nostringparsing
+  echo "" > $myfile_nostringparsing
+
+
  for i in $SCRIPTPATH/../jsonexamples/*.json; do
    [ -f "$i" ] || break
    echo $i
    $SCRIPTPATH/../parse -t "$i" >> "$myfile"
+    $SCRIPTPATH/../parse_noutf8validation -t "$i" >> "$myfile_noutf8validation"
+    $SCRIPTPATH/../parse_nonumberparsing -t "$i" >> "$myfile_nonumberparsing"
+    $SCRIPTPATH/../parse_nostringparsing -t "$i" >> "$myfile_nostringparsing"
  done
+  paste $myfile $myfile_noutf8validation $myfile_nonumberparsing $myfile_nostringparsing > $myfile.tmp
+  mv $myfile.tmp $myfile
+  rm  $myfile_noutf8validation $myfile_nonumberparsing $myfile_nostringparsing
  gnuplot -e "filename='$myfile';name='$plotdirectory/stackedperf.pdf'" $SCRIPTPATH/stackbar.gnuplot
 fi

--- a/scripts/stackbar.gnuplot
+++ b/scripts/stackbar.gnuplot
@ -21,7 +21,8 @@ set ytics nomirror

 set yrange [0:]

-set key right
+#set key right
+set key outside
 set style data histograms
 set style histogram rowstacked
 set xtic rotate by 300 scale 1
@ -29,6 +30,20 @@ set xtic rotate by 300 scale 1
 set style line 1 lt rgb "#A00000" lw 1 pt 1 ps 1
 set style line 2 lt rgb "#00A000" lw 1 pt 1 ps 1
 set style line 3 lt rgb "#5060D0" lw 1 pt 1 ps 1
-set style line 4 lt rgb "#FF1493" lw 1 pt 1 ps 1
+set style line 4 lt rgb "red" lw 1 pt 1 ps 1
+set style line 5 lt rgb "#808000" lw 1 pt 1 ps 1
+set style line 6 lt rgb "#00008B" lw 1 pt 1 ps 1
+set style line 7 lt rgb "black" lw 1 pt 1 ps 1
+set style line 8 lt rgb "blue" lw 1 pt 1 ps 1
+set style line 9 lt rgb "violet" lw 1 pt 1 ps 1

-plot filename using 3 t "stage 1" ls 2, '' using 4 t "stage 2" ls 3, '' using 5:xtic(1) t "stage 3" ls 1
+# plot filename using 3 t "stage 1" ls 2, '' using 4 t "stage 2" ls 3, '' using 5:xtic(1) t "stage 3" ls 1
+plot filename using 8 t "stage 1 without utf8 validation" ls 1, '' using ($3-$8)  t "utf8 validation (stage 1)" ls 2, '' using 4 t "stage 2" ls 3, '' using ($20 + $15 - $5) t "stage 3 (no number or string)" ls 4, '' using ($5 - $20) t "string parsing (stage 3)" ls 5,  '' using ($5 - $15):xtic(1) t "number parsing (stage 3)" ls 6
+
+
+
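+# Column layout of the pasted table, five columns per build (the first column of each group is the row label):
+#   baseline  (parse):                   1,  2 mem,  3 stage 1,  4 stage 2,  5 stage 3
+#   noutf8    (parse_noutf8validation):  6,  7 mem,  8 stage 1,  9 stage 2, 10 stage 3
+#   nonumber / nostring:                11, 12 mem, 13 stage 1, 14 stage 2, 15 stage 3  /  16, 17 mem, 18 stage 1, 19 stage 2, 20 stage 3
+# string cost = $5 - $20, number cost = $5 - $15, stage 3 with neither = $5 - ($5 - $20) - ($5 - $15) = $20 + $15 - $5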
--- a/src/stage1_find_marks.cpp
+++ b/src/stage1_find_marks.cpp
@ -10,11 +10,14 @@
 #include "simdjson/common_defs.h"
 #include "simdjson/parsedjson.h"

-#define UTF8VALIDATE
+#ifndef SIMDJSON_SKIPUTF8VALIDATION
+#define SIMDJSON_UTF8VALIDATE
+#endif
+
 // It seems that many parsers do UTF-8 validation.
 // RapidJSON does not do it by default, but a flag
 // allows it. 
-#ifdef UTF8VALIDATE
+#ifdef SIMDJSON_UTF8VALIDATE
 #include "simdjson/simdutf8check.h"
 #endif
 using namespace std;
@ -37,7 +40,7 @@ WARN_UNUSED
    cerr << "Your ParsedJson object only supports documents up to "<< pj.bytecapacity << " bytes but you are trying to process " <<  len  << " bytes\n";
    return false;
  }
-#ifdef UTF8VALIDATE
+#ifdef SIMDJSON_UTF8VALIDATE
  __m256i has_error = _mm256_setzero_si256();
  struct avx_processed_utf_bytes previous = {
      .rawbytes = _mm256_setzero_si256(),
@ -78,7 +81,7 @@ WARN_UNUSED
 #endif
    m256 input_lo = _mm256_loadu_si256((const m256 *)(buf + idx + 0));
    m256 input_hi = _mm256_loadu_si256((const m256 *)(buf + idx + 32));
-#ifdef UTF8VALIDATE
+#ifdef SIMDJSON_UTF8VALIDATE
    m256 highbit = _mm256_set1_epi8(0x80);
    if((_mm256_testz_si256(_mm256_or_si256(input_lo, input_hi),highbit)) == 1) {
        // it is ascii, we just check continuation
@ -261,7 +264,7 @@ WARN_UNUSED
    memcpy(tmpbuf,buf+idx,len - idx);
    m256 input_lo = _mm256_loadu_si256((const m256 *)(tmpbuf + 0));
    m256 input_hi = _mm256_loadu_si256((const m256 *)(tmpbuf + 32));
-#ifdef UTF8VALIDATE
+#ifdef SIMDJSON_UTF8VALIDATE
    m256 highbit = _mm256_set1_epi8(0x80);
    if((_mm256_testz_si256(_mm256_or_si256(input_lo, input_hi),highbit)) == 1) {
        // it is ascii, we just check continuation
@ -402,7 +405,7 @@ WARN_UNUSED
    structurals &= ~(quote_bits & ~quote_mask);
    *(u64 *)(pj.structurals + idx / 8) = structurals;
  }
-#ifdef UTF8VALIDATE
+#ifdef SIMDJSON_UTF8VALIDATE
  return _mm256_testz_si256(has_error, has_error);
 #else
  return true;