More cleaning.

2018-11-30 21:31:05 -05:00 · 2018-11-30 21:31:05 -05:00 · c11eefca32
parent 0e4804137c
commit c11eefca32
12 changed files with 199 additions and 56 deletions
--- a/README.md
+++ b/README.md
@ -12,11 +12,11 @@ Goal: Speed up the parsing of JSON per se.
 /...
 const char * filename = ... //
-simdjsonstring p = get_corpus(filename);
+std::string_view p = get_corpus(filename);
 ParsedJson pj;
 size_t maxdepth = 1024; // support documents have nesting "depth" up to 1024
 pj.allocateCapacity(p.size(), maxdepth); // allocate memory for parsing up to p.size() bytes
-bool is_ok = json_parse(p.first, p.second, pj); // do the parsing, return false on error
+bool is_ok = json_parse(p, pj); // do the parsing, return false on error
 // parsing is done!
 // pj can be reused with other json_parse calls.
 ```
--- a/benchmark/minifiercompetition.cpp
+++ b/benchmark/minifiercompetition.cpp
@ -61,7 +61,7 @@ int main(int argc, char *argv[]) {
    exit(1);
  }
  const char * filename = argv[optind];
-  simdjsonstring p;
+  std::string_view p;
  try {
    p = get_corpus(filename);
  } catch (const std::exception& e) { // caught by reference to base
@ -79,20 +79,20 @@ int main(int argc, char *argv[]) {
    std::cout << std::endl;
  }
  char *buffer = allocate_aligned_buffer(p.size() + 1);
-  memcpy(buffer, p.c_str(), p.size());
+  memcpy(buffer, p.data(), p.size());
  buffer[p.size()] = '\0';
  int repeat = 10;
  int volume = p.size();
-  size_t strlength = rapidstringme((char *)p.c_str()).size();
+  size_t strlength = rapidstringme((char *)p.data()).size();
  if (verbose)
    std::cout << "input length is " << p.size() << " stringified length is "
              << strlength << std::endl;
-  BEST_TIME_NOCHECK("despacing with RapidJSON", rapidstringme((char *)p.c_str()), , repeat, volume, true);
+  BEST_TIME_NOCHECK("despacing with RapidJSON", rapidstringme((char *)p.data()), , repeat, volume, true);
  BEST_TIME_NOCHECK("despacing with RapidJSON Insitu", rapidstringmeInsitu((char *)buffer),
-                    memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
+                    memcpy(buffer, p.data(), p.size()), repeat, volume, true);
-  memcpy(buffer, p.c_str(), p.size());
+  memcpy(buffer, p.data(), p.size());
  size_t outlength =
      jsonminify((const uint8_t *)buffer, p.size(), (uint8_t *)buffer);
@ -101,7 +101,7 @@ int main(int argc, char *argv[]) {
  uint8_t *cbuffer = (uint8_t *)buffer;
  BEST_TIME("jsonminify", jsonminify(cbuffer, p.size(), cbuffer), outlength,
-            memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
+            memcpy(buffer, p.data(), p.size()), repeat, volume, true);
  printf("minisize = %zu, original size = %zu  (minified down to %.2f percent of original) \n", outlength, p.size(), outlength * 100.0 / p.size());
  /***
@ -109,10 +109,10 @@ int main(int argc, char *argv[]) {
   ***/
  rapidjson::Document d;
  BEST_TIME("RapidJSON Insitu orig", d.ParseInsitu(buffer).HasParseError(), false,
-            memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
+            memcpy(buffer, p.data(), p.size()), repeat, volume, true);
  char *minibuffer = allocate_aligned_buffer(p.size() + 1);
-  size_t minisize = jsonminify((const uint8_t *)p.c_str(), p.size(), (uint8_t*) minibuffer);
+  size_t minisize = jsonminify((const uint8_t *)p.data(), p.size(), (uint8_t*) minibuffer);
  minibuffer[minisize] = '\0';
  BEST_TIME("RapidJSON Insitu despaced", d.ParseInsitu(buffer).HasParseError(), false,
@ -122,14 +122,14 @@ int main(int argc, char *argv[]) {
  size_t astbuffersize = p.size() * 2;
  size_t * ast_buffer = (size_t *) malloc(astbuffersize * sizeof(size_t));
-  BEST_TIME("sajson orig", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
+  BEST_TIME("sajson orig", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
  BEST_TIME("sajson despaced", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(minisize, buffer)).is_valid(), true, memcpy(buffer, minibuffer, p.size()), repeat, volume, true);
  ParsedJson pj;
  pj.allocateCapacity(p.size(), 1024);
-  BEST_TIME("json_parse orig", json_parse((const u8*)buffer, p.size(), pj), true, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
+  BEST_TIME("json_parse orig", json_parse((const u8*)buffer, p.size(), pj), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
  ParsedJson pj2;
  pj2.allocateCapacity(p.size(), 1024);
--- a/benchmark/parse.cpp
+++ b/benchmark/parse.cpp
@ -65,7 +65,7 @@ int main(int argc, char *argv[]) {
    cerr << "warning: ignoring everything after " << argv[optind  + 1] << endl;
  }
  if(verbose) cout << "[verbose] loading " << filename << endl;
-  simdjsonstring p;
+  std::string_view p;
  try {
    p = get_corpus(filename);
  } catch (const std::exception& e) { // caught by reference to base
@ -118,7 +118,7 @@ int main(int argc, char *argv[]) {
 #ifndef SQUASH_COUNTERS
    unified.start();
 #endif
-    isok = find_structural_bits(p.c_str(), p.size(), pj);
+    isok = find_structural_bits(p.data(), p.size(), pj);
 #ifndef SQUASH_COUNTERS
    unified.end(results);
    cy1 += results[0];
@ -147,7 +147,7 @@ int main(int argc, char *argv[]) {
    unified.start();
 #endif
-    isok = isok && unified_machine(p.c_str(), p.size(), pj);
+    isok = isok && unified_machine(p.data(), p.size(), pj);
 #ifndef SQUASH_COUNTERS
    unified.end(results);
    cy3 += results[0];
--- a/benchmark/parsingcompetition.cpp
+++ b/benchmark/parsingcompetition.cpp
@ -61,7 +61,7 @@ int main(int argc, char *argv[]) {
  if(optind + 1 < argc) {
    cerr << "warning: ignoring everything after " << argv[optind  + 1] << endl;
  }
-  simdjsonstring p;
+  std::string_view p;
  try {
    p = get_corpus(filename);
  } catch (const std::exception& e) { // caught by reference to base
@ -93,32 +93,32 @@ int main(int argc, char *argv[]) {
  rapidjson::Document d;
  char *buffer = (char *)malloc(p.size() + 1);
-  memcpy(buffer, p.c_str(), p.size());
+  memcpy(buffer, p.data(), p.size());
  buffer[p.size()] = '\0';
  BEST_TIME("RapidJSON", 
      d.Parse<kParseValidateEncodingFlag>((const char *)buffer).HasParseError(),
-      false, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
+      false, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
  BEST_TIME("RapidJSON Insitu", d.ParseInsitu<kParseValidateEncodingFlag>(buffer).HasParseError(), false,
-            memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
+            memcpy(buffer, p.data(), p.size()), repeat, volume, true);
-  BEST_TIME("sajson (dynamic mem)", sajson::parse(sajson::dynamic_allocation(), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
+  BEST_TIME("sajson (dynamic mem)", sajson::parse(sajson::dynamic_allocation(), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
  size_t astbuffersize = p.size();
  size_t * ast_buffer = (size_t *) malloc(astbuffersize * sizeof(size_t));
-  BEST_TIME("sajson (static alloc)", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
+  BEST_TIME("sajson (static alloc)", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
  std::string json11err;
-  if(all) BEST_TIME("dropbox (json11)     ",  (( json11::Json::parse(buffer,json11err).is_null() ) || ( ! json11err.empty() )), false, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
+  if(all) BEST_TIME("dropbox (json11)     ",  (( json11::Json::parse(buffer,json11err).is_null() ) || ( ! json11err.empty() )), false, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
-  if(all) BEST_TIME("fastjson             ", fastjson_parse(buffer), true, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
+  if(all) BEST_TIME("fastjson             ", fastjson_parse(buffer), true, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
  JsonValue value;
  JsonAllocator allocator;
  char *endptr;
-  if(all) BEST_TIME("gason             ", jsonParse(buffer, &endptr, &value, allocator), JSON_OK, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
+  if(all) BEST_TIME("gason             ", jsonParse(buffer, &endptr, &value, allocator), JSON_OK, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
  void *state;
-  if(all) BEST_TIME("ultrajson         ", (UJDecode(buffer, p.size(), NULL, &state) == NULL), false, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
+  if(all) BEST_TIME("ultrajson         ", (UJDecode(buffer, p.size(), NULL, &state) == NULL), false, memcpy(buffer, p.data(), p.size()), repeat, volume, true);
-  BEST_TIME("memcpy            ", (memcpy(buffer, p.c_str(), p.size()) == buffer), true, , repeat, volume, true);
+  BEST_TIME("memcpy            ", (memcpy(buffer, p.data(), p.size()) == buffer), true, , repeat, volume, true);
  free(ast_buffer);
  free(buffer);
 }
--- a/include/simdjson/jsonminifier.h
+++ b/include/simdjson/jsonminifier.h
@ -14,6 +14,6 @@ static inline size_t jsonminify(const char *buf, size_t len, char *out) {
 }
-static inline size_t jsonminify(const simdjsonstring & p, char *out) {
+static inline size_t jsonminify(const std::string_view & p, char *out) {
-    return jsonminify(p.c_str(), p.size(), out);
+    return jsonminify(p.data(), p.size(), out);
 }
--- a/src/jsonioutil.cpp
+++ b/src/jsonioutil.cpp
@ -10,10 +10,6 @@ char * allocate_aligned_buffer(size_t length) {
    if (posix_memalign((void **)&aligned_buffer, 64, totalpaddedlength)) {
      throw std::runtime_error("Could not allocate sufficient memory");
    };
    aligned_buffer[length] = '\0';
    for(size_t i = length + 1; i < totalpaddedlength; i++) aligned_buffer[i] = 0x20;
    //aligned_buffer[paddedlength] = '\0';
    //memset(aligned_buffer + length, 0x20, paddedlength - length);
    return aligned_buffer;
 }
@ -29,6 +25,7 @@ std::string_view get_corpus(std::string filename) {
    }
    std::rewind(fp);
    std::fread(buf, 1, len, fp);
    buf[len] = '\0';
    std::fclose(fp);
    return std::string_view(buf,len);
  }
--- a/src/jsonparser.cpp
+++ b/src/jsonparser.cpp
@ -11,9 +11,13 @@ bool json_parse(const u8 *buf, size_t len, ParsedJson &pj) {
  bool isok = find_structural_bits(buf, len, pj);
  if (isok) {
    isok = flatten_indexes(len, pj);
  } else {
    return false;
  }
  if (isok) {
    isok = unified_machine(buf, len, pj);
  } else {
    return false;
  }
  return isok;
 }
--- a/src/stage1_find_marks.cpp
+++ b/src/stage1_find_marks.cpp
@ -61,8 +61,9 @@ WARN_UNUSED
  // effectively the very first char is considered to follow "whitespace" for the
  // purposes of pseudo-structural character detection
  u64 prev_iter_ends_pseudo_pred = 1ULL;
-
+  size_t lenminus64 = len + 1 < 64 ? 0 : len + 1  - 64; // len + 1 because of the NULL termination
-  for (size_t idx = 0; idx < len; idx += 64) {
+  size_t idx = 0;
  for (; idx < lenminus64; idx += 64) {
    __builtin_prefetch(buf + idx + 128);
 #ifdef DEBUG
    cout << "Idx is " << idx << "\n";
@ -249,21 +250,163 @@ WARN_UNUSED
        "final structurals and pseudo structurals after close quote removal");
    *(u64 *)(pj.structurals + idx / 8) = structurals;
  }
  ////////////////
  /// we use a giant copy-paste which is ugly.
  /// but otherwise the string needs to be properly padded or else we
  /// risk invalidating the UTF-8 checks.
  ////////////
  if (idx < len + 1) { // +1 due to NULL termination
    u8 tmpbuf[64];
    memset(tmpbuf,0x20,64);
    memcpy(tmpbuf,buf+idx,len - idx + 1);// +1 due to NULL termination
    m256 input_lo = _mm256_loadu_si256((const m256 *)(tmpbuf + 0));
    m256 input_hi = _mm256_loadu_si256((const m256 *)(tmpbuf + 32));
 #ifdef UTF8VALIDATE
    m256 highbit = _mm256_set1_epi8(0x80);
    if((_mm256_testz_si256(_mm256_or_si256(input_lo, input_hi),highbit)) == 1) {
        // it is ascii, we just check continuation
        has_error = _mm256_or_si256(
          _mm256_cmpgt_epi8(previous.carried_continuations,
                          _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
                                           9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
                                           9, 9, 9, 9, 9, 9, 9, 1)),has_error);
    } else {
        // it is not ascii so we have to do heavy work
        previous = avxcheckUTF8Bytes(input_lo, &previous, &has_error);
        previous = avxcheckUTF8Bytes(input_hi, &previous, &has_error);
    }
 #endif
    ////////////////////////////////////////////////////////////////////////////////////////////
    //     Step 1: detect odd sequences of backslashes
    ////////////////////////////////////////////////////////////////////////////////////////////
    u64 bs_bits =
        cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('\\'));
    u64 start_edges = bs_bits & ~(bs_bits << 1);
    // flip lowest if we have an odd-length run at the end of the prior
    // iteration
    u64 even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
    u64 even_starts = start_edges & even_start_mask;
    u64 odd_starts = start_edges & ~even_start_mask;
    u64 even_carries = bs_bits + even_starts;
    u64 odd_carries;
    // must record the carry-out of our odd-carries out of bit 63; this
    // indicates whether the sense of any edge going to the next iteration
    // should be flipped
    bool iter_ends_odd_backslash =
        __builtin_uaddll_overflow(bs_bits, odd_starts, &odd_carries);
    odd_carries |=
        prev_iter_ends_odd_backslash; // push in bit zero as a potential end
                                      // if we had an odd-numbered run at the
                                      // end of the previous iteration
    prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
    u64 even_carry_ends = even_carries & ~bs_bits;
    u64 odd_carry_ends = odd_carries & ~bs_bits;
    u64 even_start_odd_end = even_carry_ends & odd_bits;
    u64 odd_start_even_end = odd_carry_ends & even_bits;
    u64 odd_ends = even_start_odd_end | odd_start_even_end;
    ////////////////////////////////////////////////////////////////////////////////////////////
    //     Step 2: detect insides of quote pairs
    ////////////////////////////////////////////////////////////////////////////////////////////
    u64 quote_bits =
        cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('"'));
    quote_bits = quote_bits & ~odd_ends;
    u64 quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
        _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
    quote_mask ^= prev_iter_inside_quote;
    prev_iter_inside_quote = (u64)((s64)quote_mask >> 63); // right shift of a signed value expected to be well-defined and standard compliant as of C++20
    // How do we build up a user traversable data structure
    // first, do a 'shufti' to detect structural JSON characters
    // they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c
    // these go into the first 3 buckets of the comparison (1/2/4)
    // we are also interested in the four whitespace characters
    // space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
    // these go into the next 2 buckets of the comparison (8/16)
    const m256 low_nibble_mask = _mm256_setr_epi8(
        //  0                           9  a   b  c  d
        16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0, 0,
        0, 0, 8, 12, 1, 2, 9, 0, 0);
    const m256 high_nibble_mask = _mm256_setr_epi8(
        //  0     2   3     5     7
        8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, 8, 0, 18, 4, 0, 1, 0,
        1, 0, 0, 0, 3, 2, 1, 0, 0);
    m256 structural_shufti_mask = _mm256_set1_epi8(0x7);
    m256 whitespace_shufti_mask = _mm256_set1_epi8(0x18);
    m256 v_lo = _mm256_and_si256(
        _mm256_shuffle_epi8(low_nibble_mask, input_lo),
        _mm256_shuffle_epi8(high_nibble_mask,
                            _mm256_and_si256(_mm256_srli_epi32(input_lo, 4),
                                             _mm256_set1_epi8(0x7f))));
    m256 v_hi = _mm256_and_si256(
        _mm256_shuffle_epi8(low_nibble_mask, input_hi),
        _mm256_shuffle_epi8(high_nibble_mask,
                            _mm256_and_si256(_mm256_srli_epi32(input_hi, 4),
                                             _mm256_set1_epi8(0x7f))));
    m256 tmp_lo = _mm256_cmpeq_epi8(
        _mm256_and_si256(v_lo, structural_shufti_mask), _mm256_set1_epi8(0));
    m256 tmp_hi = _mm256_cmpeq_epi8(
        _mm256_and_si256(v_hi, structural_shufti_mask), _mm256_set1_epi8(0));
    u64 structural_res_0 = (u32)_mm256_movemask_epi8(tmp_lo);
    u64 structural_res_1 = _mm256_movemask_epi8(tmp_hi);
    u64 structurals = ~(structural_res_0 | (structural_res_1 << 32));
    // this additional mask and transfer is non-trivially expensive,
    // unfortunately
    m256 tmp_ws_lo = _mm256_cmpeq_epi8(
        _mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0));
    m256 tmp_ws_hi = _mm256_cmpeq_epi8(
        _mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0));
    u64 ws_res_0 = (u32)_mm256_movemask_epi8(tmp_ws_lo);
    u64 ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
    u64 whitespace = ~(ws_res_0 | (ws_res_1 << 32));
    // mask off anything inside quotes
    structurals &= ~quote_mask;
    // add the real quote bits back into our bitmask as well, so we can
    // quickly traverse the strings we've spent all this trouble gathering
    structurals |= quote_bits;
    // Now, establish "pseudo-structural characters". These are non-whitespace
    // characters that are (a) outside quotes and (b) have a predecessor that's
    // either whitespace or a structural character. This means that subsequent
    // passes will get a chance to encounter the first character of every string
    // of non-whitespace and, if we're parsing an atom like true/false/null or a
    // number we can stop at the first whitespace or structural character
    // following it.
    // a qualified predecessor is something that can happen 1 position before a
    // pseudo-structural character
    u64 pseudo_pred = structurals | whitespace;
    u64 shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
    prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
    u64 pseudo_structurals =
        shifted_pseudo_pred & (~whitespace) & (~quote_mask);
    structurals |= pseudo_structurals;
    // now, we've used our close quotes all we need to. So let's switch them off
    // they will be off in the quote mask and on in quote bits.
    structurals &= ~(quote_bits & ~quote_mask);
    *(u64 *)(pj.structurals + idx / 8) = structurals;
  }
  if(buf[len] != '\0') {
      std::cerr << "Your string should be NULL terminated." << std::endl;
      return false;
  }
  // we are going to zero out everything after len:
  size_t count_last_64bits = len % 64;
  if(count_last_64bits != 0) { // we have a "final" word where only count_last_64bits matter
      u64 lastword = *(u64 *)(pj.structurals + len / 8);
      printf("last word %zu \n", lastword);
      printf("count_last_64bits%zu \n", count_last_64bits);
      lastword &= ( UINT64_C(1) << count_last_64bits) - 1;
      *(u64 *)(pj.structurals + len / 8) = lastword;
  }
  //pj.structural_indexes[pj.n_structural_indexes++] = len; // the final NULL is used as a pseudo-structural character
 #ifdef UTF8VALIDATE
  return _mm256_testz_si256(has_error, has_error);
 #else
--- a/src/stage2_flatten.cpp
+++ b/src/stage2_flatten.cpp
@ -119,7 +119,7 @@ bool flatten_indexes(size_t len, ParsedJson &pj) {
  }
  pj.n_structural_indexes = base;
  if(len != base_ptr[pj.n_structural_indexes-1]) {
-    printf("last structural should be pointing at the end of the string\n");
+    // can happen with malformed JSON such as unclosed quotes (["this is an unclosed string ])
    return false;
  }
  base_ptr[pj.n_structural_indexes] = 0; // make it safe to dereference one beyond this array
--- a/tests/allparserscheckfile.cpp
+++ b/tests/allparserscheckfile.cpp
@ -58,7 +58,7 @@ int main(int argc, char *argv[]) {
    exit(1);
  }
  const char * filename = argv[optind];
-  simdjsonstring p;
+  std::string_view p;
  try {
    p = get_corpus(filename);
  } catch (const std::exception& e) { // caught by reference to base
@ -86,7 +86,7 @@ int main(int argc, char *argv[]) {
  rapidjson::Document d;
  char *buffer = (char *)malloc(p.size() + 1);
-  memcpy(buffer, p.c_str(), p.size());
+  memcpy(buffer, p.data(), p.size());
  buffer[p.size()] = '\0';
  bool rapid_correct = (d.Parse((const char *)buffer).HasParseError() == false);
  bool rapid_correct_checkencoding = (d.Parse<kParseValidateEncodingFlag>((const char *)buffer).HasParseError() == false);
--- a/tests/jsoncheck.cpp
+++ b/tests/jsoncheck.cpp
@ -49,7 +49,8 @@ bool validate(const char *dirname) {
  for (int i = 0; i < c; i++) {
    const char *name = entry_list[i]->d_name;
    if (hasExtension(name, extension)) {
-      //printf("validating: file %s \n", name);
+      printf("validating: file %s ", name);
      fflush(NULL);
      size_t filelen = strlen(name);
      char *fullpath = (char *)malloc(dirlen + filelen + 1 + 1);
      strcpy(fullpath, dirname);
@ -74,6 +75,7 @@ bool validate(const char *dirname) {
      }
      ++howmany;
      bool isok = json_parse(p, pj);
      printf("%s\n", isok ? "ok" : "invalid");
      if(contains("EXCLUDE",name)) {
        // skipping
        howmany--;
@ -89,9 +91,6 @@ bool validate(const char *dirname) {
          printf("warning: file %s should fail but it passes.\n", name);
          everythingfine = false;
        }
      } else {
        printf("File %s %s.\n", name,
               isok ? " is valid JSON " : " is not valid JSON");
      } 
      free(fullpath);
    }
--- a/tools/minify.cpp
+++ b/tools/minify.cpp
@ -8,7 +8,7 @@ int main(int argc, char *argv[]) {
    std::cerr << "Usage: " << argv[0] << " <jsonfile>\n";
    exit(1);
  }
-  simdjsonstring p;
+  std::string_view p;
  std::string filename = argv[argc - 1];
  try{
    p = get_corpus(filename);
@ -16,6 +16,6 @@ int main(int argc, char *argv[]) {
        std::cout << "Could not load the file " << filename << std::endl;
        return EXIT_FAILURE;
  }
-  jsonminify(p,  &p[0]);
+  jsonminify(p, (char *)p.data());
  printf("%s",p.data());
 }