faster minifier (#568)
* Fallback should use our scalar code.
* parse should have a nicer error message.
* Making it so that "minify" can use different architectures.
* Let us change the minifier competition so that it tests all implementations.
* Documenting the untaken optimization opportunity.

Co-authored-by: John Keiser <john@johnkeiser.com>
parent 293ec7aec5
commit 5d1e3efce8

Makefile (4 changed lines)
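In short: the old free function simdjson::json_minify (deleted at the bottom of this commit) is replaced by a per-architecture virtual on simdjson::implementation. Both signatures are copied from the hunks below, shown together here for orientation:

// Before: one free function; the accelerated path was AVX2-only and there was
// no runtime dispatch (see the deleted src/jsonminifier.cpp at the end).
size_t json_minify(const uint8_t *buf, size_t len, uint8_t *out);

// After: every implementation (haswell, westmere, arm64, fallback) overrides
// this; dst_len receives the number of bytes written.
WARN_UNUSED virtual error_code minify(const uint8_t *buf, size_t len,
                                      uint8_t *dst, size_t &dst_len) const noexcept = 0;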
@@ -53,7 +53,7 @@ endif # ifeq ($(SANITIZE),1)
endif # ifeq ($(MEMSANITIZE),1)

# Headers and sources
SRCHEADERS_GENERIC=src/generic/atomparsing.h src/generic/numberparsing.h src/generic/json_scanner.h src/generic/json_string_scanner.h src/generic/json_structural_indexer.h src/generic/stage2_build_tape.h src/generic/stringparsing.h src/generic/stage2_streaming_build_tape.h src/generic/utf8_fastvalidate_algorithm.h src/generic/utf8_lookup_algorithm.h src/generic/utf8_lookup2_algorithm.h src/generic/utf8_range_algorithm.h src/generic/utf8_zwegner_algorithm.h
SRCHEADERS_GENERIC=src/generic/atomparsing.h src/generic/numberparsing.h src/generic/json_scanner.h src/generic/json_string_scanner.h src/generic/json_structural_indexer.h src/generic/json_minifier.h src/generic/buf_block_reader.h src/generic/stage2_build_tape.h src/generic/stringparsing.h src/generic/stage2_streaming_build_tape.h src/generic/utf8_fastvalidate_algorithm.h src/generic/utf8_lookup_algorithm.h src/generic/utf8_lookup2_algorithm.h src/generic/utf8_range_algorithm.h src/generic/utf8_zwegner_algorithm.h
SRCHEADERS_ARM64= src/arm64/bitmanipulation.h src/arm64/bitmask.h src/arm64/intrinsics.h src/arm64/numberparsing.h src/arm64/simd.h src/arm64/stage1_find_marks.h src/arm64/stage2_build_tape.h src/arm64/stringparsing.h
SRCHEADERS_HASWELL= src/haswell/bitmanipulation.h src/haswell/bitmask.h src/haswell/intrinsics.h src/haswell/numberparsing.h src/haswell/simd.h src/haswell/stage1_find_marks.h src/haswell/stage2_build_tape.h src/haswell/stringparsing.h
SRCHEADERS_FALLBACK= src/fallback/bitmanipulation.h src/fallback/implementation.h src/fallback/numberparsing.h src/fallback/stage1_find_marks.h src/fallback/stage2_build_tape.h src/fallback/stringparsing.h

@@ -61,7 +61,7 @@ SRCHEADERS_WESTMERE=src/westmere/bitmanipulation.h src/westmere/bitmask.h src/we
SRCHEADERS_SRC=src/isadetection.h src/jsoncharutils.h src/simdprune_tables.h src/implementation.cpp src/stage1_find_marks.cpp src/stage2_build_tape.cpp src/document_parser_callbacks.h
SRCHEADERS=$(SRCHEADERS_SRC) $(SRCHEADERS_GENERIC) $(SRCHEADERS_ARM64) $(SRCHEADERS_HASWELL) $(SRCHEADERS_WESTMERE) $(SRCHEADERS_FALLBACK)

INCLUDEHEADERS=include/simdjson.h include/simdjson/common_defs.h include/simdjson/internal/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonminifier.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/inline/padded_string.h include/simdjson/document.h include/simdjson/inline/document.h include/simdjson/document_iterator.h include/simdjson/inline/document_iterator.h include/simdjson/document_stream.h include/simdjson/inline/document_stream.h include/simdjson/implementation.h include/simdjson/parsedjson.h include/simdjson/jsonstream.h include/simdjson/inline/jsonstream.h include/simdjson/portability.h include/simdjson/error.h include/simdjson/inline/error.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h
INCLUDEHEADERS=include/simdjson.h include/simdjson/common_defs.h include/simdjson/internal/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/inline/padded_string.h include/simdjson/document.h include/simdjson/inline/document.h include/simdjson/document_iterator.h include/simdjson/inline/document_iterator.h include/simdjson/document_stream.h include/simdjson/inline/document_stream.h include/simdjson/implementation.h include/simdjson/parsedjson.h include/simdjson/jsonstream.h include/simdjson/inline/jsonstream.h include/simdjson/portability.h include/simdjson/error.h include/simdjson/inline/error.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h

ifeq ($(SIMDJSON_TEST_AMALGAMATED_HEADERS),1)
HEADERS=singleheader/simdjson.h
@@ -143,7 +143,7 @@ int main(int argc, char *argv[]) {
  // parse_many
  const char * filename2 = argv[2];
  for (auto result : parser.load_many(filename2)) {
    error = result.error;
    error = result.error();
  }
  if (error) {
    std::cout << "parse_many failed" << std::endl;
@@ -98,16 +98,14 @@ int main(int argc, char *argv[]) {
            "despacing with RapidJSON Insitu", rapid_stringme_insitu((char *)buffer),
            memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data);
  memcpy(buffer, p.data(), p.size());

  size_t outlength = simdjson::json_minify((const uint8_t *)buffer, p.size(),
                                           (uint8_t *)buffer);
  if (verbose)
    std::cout << "json_minify length is " << outlength << std::endl;

  size_t outlength;
  uint8_t *cbuffer = (uint8_t *)buffer;
  BEST_TIME("json_minify", simdjson::json_minify(cbuffer, p.size(), cbuffer),
  for (auto imple : simdjson::available_implementations) {
    BEST_TIME((std::string("simdjson->minify+")+imple->name()).c_str(), (imple->minify(cbuffer, p.size(), cbuffer, outlength), outlength),
              outlength, memcpy(buffer, p.data(), p.size()), repeat, volume,
              !just_data);
  }

  printf("minisize = %zu, original size = %zu (minified down to %.2f percent "
         "of original) \n",
         outlength, p.size(), outlength * 100.0 / p.size());
@@ -121,8 +119,9 @@ int main(int argc, char *argv[]) {
            !just_data);

  char *mini_buffer = simdjson::internal::allocate_padded_buffer(p.size() + 1);
  size_t minisize = simdjson::json_minify((const uint8_t *)p.data(), p.size(),
                                          (uint8_t *)mini_buffer);
  size_t minisize;
  simdjson::active_implementation->minify((const uint8_t *)p.data(), p.size(),
                                          (uint8_t *)mini_buffer, minisize);
  mini_buffer[minisize] = '\0';

  BEST_TIME("RapidJSON Insitu despaced", d.ParseInsitu(buffer).HasParseError(),
@@ -171,6 +170,7 @@ int main(int argc, char *argv[]) {
            automated_reallocation),
            simdjson::SUCCESS, memcpy(buffer, mini_buffer, p.size()), repeat, volume,
            !just_data);

  free(buffer);
  free(ast_buffer);
  free(mini_buffer);
@@ -109,7 +109,12 @@ struct option_struct {
    case 'a': {
      const implementation *impl = simdjson::available_implementations[optarg];
      if (!impl) {
        exit_usage(string("Unsupported option value -a ") + optarg + ": expected -a haswell, westmere or arm64");
        std::string exit_message = string("Unsupported option value -a ") + optarg + ": expected -a with one of ";
        for (auto imple : simdjson::available_implementations) {
          exit_message += imple->name();
          exit_message += " ";
        }
        exit_usage(exit_message);
      }
      simdjson::active_implementation = impl;
      break;
@@ -16,7 +16,6 @@ set(SIMDJSON_INCLUDE
  ${SIMDJSON_INCLUDE_DIR}/simdjson/inline/padded_string.h
  ${SIMDJSON_INCLUDE_DIR}/simdjson/internal/jsonformatutils.h
  ${SIMDJSON_INCLUDE_DIR}/simdjson/jsonioutil.h
  ${SIMDJSON_INCLUDE_DIR}/simdjson/jsonminifier.h
  ${SIMDJSON_INCLUDE_DIR}/simdjson/jsonparser.h
  ${SIMDJSON_INCLUDE_DIR}/simdjson/jsonstream.h
  ${SIMDJSON_INCLUDE_DIR}/simdjson/padded_string.h
@@ -10,7 +10,6 @@
#include "simdjson/implementation.h"
#include "simdjson/document.h"
#include "simdjson/document_stream.h"
#include "simdjson/jsonminifier.h"

// Deprecated API
#include "simdjson/parsedjsoniterator.h"
@@ -77,6 +77,17 @@ private:
 */
template<typename T>
struct simdjson_result : public std::pair<T, error_code> {
  /**
   * Move the value and the error to the provided variables.
   */
  void tie(T& t, error_code & e) {
    // on the clang compiler that comes with current macOS (Apple clang version 11.0.0),
    // tie(width, error) = size["w"].as_uint64_t();
    // fails with "error: no viable overloaded '='"
    t = std::move(this->first);
    e = std::move(this->second);
  }

  /**
   * The error.
   */

@@ -138,6 +149,7 @@ struct simdjson_move_result : std::pair<T, error_code> {
    t = std::move(this->first);
    e = std::move(this->second);
  }

  /**
   * The error.
   */
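Given the compiler limitation the comment describes, the call pattern this tie() member enables presumably looks like the following sketch (width, error and size are the names used in the comment above; this is an illustration, not a verbatim API example):

uint64_t width;
simdjson::error_code error;
// What fails on Apple clang 11: std::tie(width, error) = size["w"].as_uint64_t();
// The member function sidesteps the missing operator=:
size["w"].as_uint64_t().tie(width, error);
if (error) { /* handle the error */ }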
@@ -56,6 +56,19 @@ public:
   */
  WARN_UNUSED virtual error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept = 0;

  /**
   * Minify the JSON document.
   *
   * Overridden by each implementation.
   *
   * @param buf the json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes.
   * @param len the length of the json document.
   * @param dst the buffer to write the minified document to. *MUST* be allocated up to len + SIMDJSON_PADDING bytes.
   * @param dst_len the number of bytes written. Output only.
   * @return the error code, or SUCCESS if there was no error.
   */
  WARN_UNUSED virtual error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept = 0;

  /**
   * Stage 1 of the document parser.
   *

@@ -182,6 +195,9 @@ public:
  WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final {
    return set_best()->parse(buf, len, parser);
  }
  WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final {
    return set_best()->minify(buf, len, dst, dst_len);
  }
  WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept final {
    return set_best()->stage1(buf, len, parser, streaming);
  }
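Because minify() is now a virtual on implementation, callers can rely on the set_best() runtime dispatch above, or pin an architecture explicitly — which is exactly what the -a option and the benchmark loop earlier in this diff do. A hedged sketch using only calls that appear in this commit (buf/dst are assumed padded as the documentation above requires):

#include "simdjson.h"

// Sketch: explicit architecture selection (what -a does) and iteration over
// all implementations (what the minifier competition now does).
void minify_with_each(const uint8_t *buf, size_t len, uint8_t *dst) {
  const simdjson::implementation *impl = simdjson::available_implementations["fallback"];
  if (impl) { simdjson::active_implementation = impl; }
  for (auto imple : simdjson::available_implementations) {
    size_t dst_len;
    auto error = imple->minify(buf, len, dst, dst_len);
    if (error) { /* e.g. UNSUPPORTED_ARCHITECTURE on the wrong CPU */ }
  }
}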
@@ -1,32 +0,0 @@
#ifndef SIMDJSON_JSONMINIFIER_H
#define SIMDJSON_JSONMINIFIER_H

#include "simdjson/padded_string.h"
#include <cstddef>
#include <cstdint>
#include <string_view>

namespace simdjson {

// Take input from buf and remove useless whitespace, write it to out; buf and
// out can be the same pointer. Result is null terminated,
// return the string length (minus the null termination).
// The accelerated version of this function only runs on AVX2 hardware.
size_t json_minify(const uint8_t *buf, size_t len, uint8_t *out);

static inline size_t json_minify(const char *buf, size_t len, char *out) {
  return json_minify(reinterpret_cast<const uint8_t *>(buf), len,
                     reinterpret_cast<uint8_t *>(out));
}

static inline size_t json_minify(const std::string_view &p, char *out) {
  return json_minify(p.data(), p.size(), out);
}

static inline size_t json_minify(const padded_string &p, char *out) {
  return json_minify(p.data(), p.size(), out);
}

} // namespace simdjson

#endif // SIMDJSON_JSONMINIFIER_H
@@ -29,7 +29,6 @@ set(SIMDJSON_SRC
set(SIMDJSON_SRC_HEADERS
  implementation.cpp
  isadetection.h
  jsonminifier.cpp
  simdprune_tables.h
  stage1_find_marks.cpp
  stage2_build_tape.cpp
@@ -48,7 +48,7 @@ really_inline int leading_zeroes(uint64_t input_num) {
}

/* result might be undefined when input_num is zero */
really_inline int hamming(uint64_t input_num) {
really_inline int count_ones(uint64_t input_num) {
  return vaddv_u8(vcnt_u8((uint8x8_t)input_num));
}
@@ -10,6 +10,7 @@ class implementation final : public simdjson::implementation {
public:
  really_inline implementation() : simdjson::implementation("arm64", "ARM NEON", instruction_set::NEON) {}
  WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
  WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
  WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept final;
  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser, size_t &next_json) const noexcept final;
@@ -2,6 +2,8 @@
#define SIMDJSON_ARM64_SIMD_H

#include "simdjson.h"
#include "simdprune_tables.h"
#include "arm64/bitmanipulation.h"
#include "arm64/intrinsics.h"

namespace simdjson::arm64::simd {
@@ -142,6 +144,43 @@ namespace simdjson::arm64::simd {
  really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    return lookup_table.apply_lookup_16_to(*this);
  }


  // Copies to 'output' all bytes corresponding to a 0 in the mask (interpreted as a bitset).
  // Passing a 0 value for mask would be equivalent to writing out every byte to output.
  // Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes
  // get written.
  // Design consideration: it seems like a function with the
  // signature simd8<L> compress(uint16_t mask) would be
  // sensible, but the AVX ISA makes this kind of approach difficult.
  template<typename L>
  really_inline void compress(uint16_t mask, L * output) const {
    // this particular implementation was inspired by work done by @animetosho
    // we do it in two steps, first 8 bytes and then second 8 bytes
    uint8_t mask1 = static_cast<uint8_t>(mask); // least significant 8 bits
    uint8_t mask2 = static_cast<uint8_t>(mask >> 8); // most significant 8 bits
    // next line just loads the 64-bit values thintable_epi8[mask1] and
    // thintable_epi8[mask2] into a 128-bit register, using only
    // two instructions on most compilers.
    uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]};
    uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64);
    // we increment by 0x08 the second half of the mask
    uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
    shufmask = vaddq_u8(shufmask, inc);
    // this is the version "nearly pruned"
    uint8x16_t pruned = vqtbl1q_u8(*this, shufmask);
    // we still need to put the two halves together.
    // we compute the popcount of the first half:
    int pop1 = BitsSetTable256mul2[mask1];
    // then load the corresponding mask, what it does is to write
    // only the first pop1 bytes from the first 8 bytes, and then
    // it fills in with the bytes from the second 8 bytes + some filling
    // at the end.
    uint8x16_t compactmask = vld1q_u8((const uint8_t *)(pshufb_combine_table + pop1 * 8));
    uint8x16_t answer = vqtbl1q_u8(pruned, compactmask);
    vst1q_u8((uint8_t*) output, answer);
  }

  template<typename L>
  really_inline simd8<L> lookup_16(
      L replace0, L replace1, L replace2, L replace3,
@@ -267,6 +306,13 @@ namespace simdjson::arm64::simd {
    this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
  }

  really_inline void compress(uint64_t mask, T * output) const {
    this->chunks[0].compress(mask, output);
    this->chunks[1].compress(mask >> 16, output + 16 - count_ones(mask & 0xFFFF));
    this->chunks[2].compress(mask >> 32, output + 32 - count_ones(mask & 0xFFFFFFFF));
    this->chunks[3].compress(mask >> 48, output + 48 - count_ones(mask & 0xFFFFFFFFFFFF));
  }

  template <typename F>
  static really_inline void each_index(F const& each) {
    each(0);

@@ -339,7 +385,6 @@ namespace simdjson::arm64::simd {
    const simd8<T> mask = simd8<T>::splat(m);
    return this->map( [&](auto a) { return a <= mask; } ).to_bitmask();
  }

}; // struct simd8x64<T>

} // namespace simdjson::arm64::simd
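The compress() overloads above are the heart of the SIMD minifier. Their contract, stated in the comments, reduces to the following scalar model (a reference sketch, not part of the commit); it also explains the `output + 16 - count_ones(...)` offsets: each chunk writes its survivors immediately after the previous chunk's survivors.

#include <cstddef>
#include <cstdint>

// Scalar model of simd8x64<T>::compress(mask, output): keep every byte whose
// mask bit is 0, drop every byte whose mask bit is 1, writing survivors
// contiguously. Returns the number of bytes kept, i.e. 64 - count_ones(mask).
static size_t compress_model(const uint8_t in[64], uint64_t mask, uint8_t *output) {
  size_t pos = 0;
  for (size_t i = 0; i < 64; i++) {
    if (((mask >> i) & 1) == 0) { output[pos++] = in[i]; }
  }
  return pos;
}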
@@ -31,6 +31,23 @@ really_inline json_character_block json_character_block::classify(const simd::si
    return shuf_lo & shuf_hi;
  });

  // We compute whitespace and op separately. If the code later only uses one or the
  // other, given the fact that all functions are aggressively inlined, we can
  // hope that useless computations will be omitted. This is namely the case when
  // minifying (we only need whitespace). *However* if we only need spaces,
  // it is likely that we will still compute 'v' above with two lookup_16: one
  // could do it a bit cheaper. This is in contrast with the x64 implementations
  // where we can, efficiently, do the white space and structural matching
  // separately. One reason for this difference is that on ARM NEON, the table
  // lookups either zero or leave unchanged the characters exceeding 0xF whereas
  // on x64, the equivalent instruction (pshufb) automatically applies a mask,
  // ignoring the 4 most significant bits. Thus the x64 implementation is
  // optimized differently. This being said, if you use this code strictly
  // just for minification (or just to identify the structural characters),
  // there is a small untaken optimization opportunity here. We deliberately
  // do not pick it up.

  uint64_t op = v.map([&](simd8<uint8_t> _v) { return _v.any_bits_set(0x7); }).to_bitmask();
  uint64_t whitespace = v.map([&](simd8<uint8_t> _v) { return _v.any_bits_set(0x18); }).to_bitmask();
  return { whitespace, op };
@@ -53,11 +70,17 @@ really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8
  return is_second_byte ^ is_third_byte ^ is_fourth_byte;
}

#include "generic/utf8_lookup2_algorithm.h"
#include "generic/buf_block_reader.h"
#include "generic/json_string_scanner.h"
#include "generic/json_scanner.h"
#include "generic/json_structural_indexer.h"

#include "generic/json_minifier.h"
WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
  return arm64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
}

#include "generic/utf8_lookup2_algorithm.h"
#include "generic/json_structural_indexer.h"
WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept {
  return arm64::stage1::json_structural_indexer::index<64>(buf, len, parser, streaming);
}
@@ -14,6 +14,7 @@ public:
    0
  ) {}
  WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
  WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
  WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept final;
  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser, size_t &next_json) const noexcept final;
@@ -151,6 +151,62 @@ WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, do
  return scanner.scan();
}

// big table for the minifier
static uint8_t jump_table[256 * 3] = {
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
    1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
};

WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
  size_t i = 0, pos = 0;
  uint8_t quote = 0;
  uint8_t nonescape = 1;

  while (i < len) {
    unsigned char c = buf[i];
    uint8_t *meta = jump_table + 3 * c;

    quote = quote ^ (meta[0] & nonescape);
    dst[pos] = c;
    pos += meta[2] | quote;

    i += 1;
    nonescape = (~nonescape) | (meta[1]);
  }
  dst_len = pos; // we intentionally do not work with a reference
                 // for fear of aliasing
  return SUCCESS;
}

} // namespace simdjson::fallback

#endif // SIMDJSON_FALLBACK_STAGE1_FIND_MARKS_H
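The commit does not document jump_table; under my reading of the loop above, the three bytes at jump_table + 3*c encode, for each byte value c: meta[0] (c is a double quote, toggling the in-string state), meta[1] (c is not a backslash; a backslash masks the next quote toggle) and meta[2] (c is not JSON whitespace, so the byte is kept). With that reading, the branchless loop matches this straightforward reference (a hedged sketch, not the commit's code):

#include <cstddef>
#include <cstdint>

// Branchy reference for the fallback minifier: drop the four JSON whitespace
// bytes, except inside strings, honoring backslash escapes before quotes.
static size_t minify_reference(const uint8_t *buf, size_t len, uint8_t *dst) {
  size_t pos = 0;
  bool in_string = false, escaped = false;
  for (size_t i = 0; i < len; i++) {
    uint8_t c = buf[i];
    if (in_string) {
      dst[pos++] = c; // everything inside a string survives, whitespace included
      if (escaped) { escaped = false; }
      else if (c == '\\') { escaped = true; }
      else if (c == '"') { in_string = false; }
    } else if (c == '"') {
      in_string = true;
      dst[pos++] = c;
    } else if (c != ' ' && c != '\t' && c != '\n' && c != '\r') {
      dst[pos++] = c; // only JSON whitespace is removed
    }
  }
  return pos;
}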
@@ -0,0 +1,48 @@
// Walks through a buffer in block-sized increments, loading the last part with spaces
template<size_t STEP_SIZE>
struct buf_block_reader {
public:
  really_inline buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
  really_inline size_t block_index() { return idx; }
  really_inline bool has_full_block() const {
    return idx < lenminusstep;
  }
  really_inline const uint8_t *full_block() const {
    return &buf[idx];
  }
  really_inline bool has_remainder() const {
    return idx < len;
  }
  really_inline void get_remainder(uint8_t *tmp_buf) const {
    memset(tmp_buf, 0x20, STEP_SIZE);
    memcpy(tmp_buf, buf + idx, len - idx);
  }
  really_inline void advance() {
    idx += STEP_SIZE;
  }
private:
  const uint8_t *buf;
  const size_t len;
  const size_t lenminusstep;
  size_t idx;
};

// Routines to print masks and text for debugging bitmask operations
UNUSED static char * format_input_text(const simd8x64<uint8_t> in) {
  static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1);
  in.store((uint8_t*)buf);
  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
    if (buf[i] < ' ') { buf[i] = '_'; }
  }
  buf[sizeof(simd8x64<uint8_t>)] = '\0';
  return buf;
}

UNUSED static char * format_mask(uint64_t mask) {
  static char *buf = (char*)malloc(64 + 1);
  for (size_t i=0; i<64; i++) {
    buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
  }
  buf[64] = '\0';
  return buf;
}
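buf_block_reader is always driven with the same two-phase loop; the json_minifier and json_structural_indexer below both use it. A minimal sketch of that driver (STEP_SIZE is 64 or 128 in this commit):

template<size_t STEP_SIZE>
void for_each_block(const uint8_t *buf, size_t len) {
  buf_block_reader<STEP_SIZE> reader(buf, len);
  while (reader.has_full_block()) {
    const uint8_t *block = reader.full_block();
    // ... process STEP_SIZE bytes at `block` ...
    (void)block;
    reader.advance();
  }
  if (reader.has_remainder()) {
    uint8_t block[STEP_SIZE];
    reader.get_remainder(block); // tail bytes, right-padded with 0x20 spaces
    // ... process the final padded block ...
  }
}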
@@ -0,0 +1,73 @@
// This file contains the common code every implementation uses in stage1
// It is intended to be included multiple times and compiled multiple times
// We assume the file in which it is included already includes
// "simdjson/stage1_find_marks.h" (this simplifies amalgamation)

namespace stage1 {

class json_minifier {
public:
  template<size_t STEP_SIZE>
  static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept;

private:
  really_inline json_minifier(uint8_t *_dst) : dst{_dst} {}
  template<size_t STEP_SIZE>
  really_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept;
  really_inline void next(simd::simd8x64<uint8_t> in, json_block block);
  really_inline error_code finish(uint8_t *dst_start, size_t &dst_len);
  json_scanner scanner;
  uint8_t *dst;
};

really_inline void json_minifier::next(simd::simd8x64<uint8_t> in, json_block block) {
  uint64_t mask = block.whitespace();
  in.compress(mask, dst);
  dst += 64 - count_ones(mask);
}

really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
  *dst = '\0';
  error_code error = scanner.finish(false);
  if (error) { dst_len = 0; return error; }
  dst_len = dst - dst_start;
  return SUCCESS;
}

template<>
really_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept {
  simd::simd8x64<uint8_t> in_1(block_buf);
  simd::simd8x64<uint8_t> in_2(block_buf+64);
  json_block block_1 = scanner.next(in_1);
  json_block block_2 = scanner.next(in_2);
  this->next(in_1, block_1);
  this->next(in_2, block_2);
  reader.advance();
}

template<>
really_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept {
  simd::simd8x64<uint8_t> in_1(block_buf);
  json_block block_1 = scanner.next(in_1);
  this->next(block_buf, block_1);
  reader.advance();
}

template<size_t STEP_SIZE>
error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept {
  buf_block_reader<STEP_SIZE> reader(buf, len);
  json_minifier minifier(dst);
  while (reader.has_full_block()) {
    minifier.step<STEP_SIZE>(reader.full_block(), reader);
  }

  if (likely(reader.has_remainder())) {
    uint8_t block[STEP_SIZE];
    reader.get_remainder(block);
    minifier.step<STEP_SIZE>(block, reader);
  }

  return minifier.finish(dst, dst_len);
}

} // namespace stage1
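End to end, the generic minifier above gives the following observable behavior. A hedged usage sketch against this commit's public API (buffer sizing simplified; padded_string and active_implementation as they appear elsewhere in this diff):

#include "simdjson.h"
#include <cstdio>
#include <cstring>

int main() {
  const char *text = "{ \"greeting\" : \"hello world\" }";
  simdjson::padded_string json(text, std::strlen(text));
  // dst must be allocated up to len + SIMDJSON_PADDING bytes (see implementation.h);
  // 64 extra bytes is a comfortable margin for this sketch.
  char out[sizeof("{ \"greeting\" : \"hello world\" }") + 64];
  size_t out_len;
  auto error = simdjson::active_implementation->minify(
      (const uint8_t *)json.data(), json.size(), (uint8_t *)out, out_len);
  if (!error) {
    // whitespace outside strings is gone; the space in "hello world" survives:
    printf("%.*s\n", (int)out_len, out); // {"greeting":"hello world"}
  }
  return 0;
}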
@@ -5,23 +5,33 @@ namespace stage1 {
 */
struct json_block {
public:
  // the start of structurals that are not inside strings
  /** The start of structurals */
  really_inline uint64_t structural_start() { return potential_structural_start() & ~_string.string_tail(); }
  /** All JSON whitespace (i.e. not in a string) */
  really_inline uint64_t whitespace() { return non_quote_outside_string(_characters.whitespace()); }

  // operators plus scalar starts like 123, true and "abc"
  really_inline uint64_t potential_structural_start() { return _characters.op() | potential_scalar_start(); }
  // the start of non-operator runs, like 123, true and "abc"
  really_inline uint64_t potential_scalar_start() { return _characters.scalar() & ~follows_potential_scalar(); }
  // whether the given character is immediately after a non-operator like 123, true or "
  really_inline uint64_t follows_potential_scalar() { return _follows_potential_scalar; }
  // Return a mask of whether the given characters are inside a string (only works on non-quotes)
  // Helpers

  /** Whether the given characters are inside a string (only works on non-quotes) */
  really_inline uint64_t non_quote_inside_string(uint64_t mask) { return _string.non_quote_inside_string(mask); }
  /** Whether the given characters are outside a string (only works on non-quotes) */
  really_inline uint64_t non_quote_outside_string(uint64_t mask) { return _string.non_quote_outside_string(mask); }

  // string and escape characters
  json_string_block _string;
  // whitespace, operators, scalars
  json_character_block _characters;
  // whether the previous character was a scalar
  uint64_t _follows_potential_scalar;
private:
  // Potential structurals (i.e. disregarding strings)

  /** operators plus scalar starts like 123, true and "abc" */
  really_inline uint64_t potential_structural_start() { return _characters.op() | potential_scalar_start(); }
  /** the start of non-operator runs, like 123, true and "abc" */
  really_inline uint64_t potential_scalar_start() { return _characters.scalar() & ~follows_potential_scalar(); }
  /** whether the given character is immediately after a non-operator like 123, true or " */
  really_inline uint64_t follows_potential_scalar() { return _follows_potential_scalar; }
};

/**
@@ -14,7 +14,9 @@ struct json_string_block {
  // Only characters inside the string (not including the quotes)
  really_inline uint64_t string_content() const { return _in_string & ~_quote; }
  // Return a mask of whether the given characters are inside a string (only works on non-quotes)
  really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return _in_string & mask; }
  really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; }
  // Return a mask of whether the given characters are outside a string (only works on non-quotes)
  really_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; }
  // Tail of string (everything except the start quote)
  really_inline uint64_t string_tail() const { return _in_string ^ _quote; }
@@ -22,7 +22,7 @@ public:
    // it helps tremendously.
    if (bits == 0)
      return;
    uint32_t cnt = hamming(bits);
    uint32_t cnt = count_ones(bits);

    // Do the first 8 all together
    for (int i=0; i<8; i++) {
@@ -55,55 +55,6 @@ public:
  }
};

// Routines to print masks and text for debugging bitmask operations
UNUSED static char * format_input_text(const simd8x64<uint8_t> in) {
  static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1);
  in.store((uint8_t*)buf);
  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
    if (buf[i] < ' ') { buf[i] = '_'; }
  }
  buf[sizeof(simd8x64<uint8_t>)] = '\0';
  return buf;
}

UNUSED static char * format_mask(uint64_t mask) {
  static char *buf = (char*)malloc(64 + 1);
  for (size_t i=0; i<64; i++) {
    buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
  }
  buf[64] = '\0';
  return buf;
}

// Walks through a buffer in block-sized increments, loading the last part with spaces
template<size_t STEP_SIZE>
struct buf_block_reader {
public:
  really_inline buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
  really_inline size_t block_index() { return idx; }
  really_inline bool has_full_block() const {
    return idx < lenminusstep;
  }
  really_inline const uint8_t *full_block() const {
    return &buf[idx];
  }
  really_inline bool has_remainder() const {
    return idx < len;
  }
  really_inline void get_remainder(uint8_t *tmp_buf) const {
    memset(tmp_buf, 0x20, STEP_SIZE);
    memcpy(tmp_buf, buf + idx, len - idx);
  }
  really_inline void advance() {
    idx += STEP_SIZE;
  }
private:
  const uint8_t *buf;
  const size_t len;
  const size_t lenminusstep;
  size_t idx;
};

class json_structural_indexer {
public:
  template<size_t STEP_SIZE>
@@ -112,7 +63,7 @@ public:
private:
  really_inline json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {}
  template<size_t STEP_SIZE>
  really_inline void index_step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
  really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
  really_inline void next(simd::simd8x64<uint8_t> in, json_block block, size_t idx);
  really_inline error_code finish(document::parser &parser, size_t idx, size_t len, bool streaming);

@@ -162,7 +113,7 @@ really_inline error_code json_structural_indexer::finish(document::parser &parse
}

template<>
really_inline void json_structural_indexer::index_step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
  simd::simd8x64<uint8_t> in_1(block);
  simd::simd8x64<uint8_t> in_2(block+64);
  json_block block_1 = scanner.next(in_1);

@@ -173,7 +124,7 @@ really_inline void json_structural_indexer::index_step<128>(const uint8_t *block
}

template<>
really_inline void json_structural_indexer::index_step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
  simd::simd8x64<uint8_t> in_1(block);
  json_block block_1 = scanner.next(in_1);
  this->next(in_1, block_1, reader.block_index());

@@ -209,13 +160,13 @@ error_code json_structural_indexer::index(const uint8_t *buf, size_t len, docume
  buf_block_reader<STEP_SIZE> reader(buf, len);
  json_structural_indexer indexer(parser.structural_indexes.get());
  while (reader.has_full_block()) {
    indexer.index_step<STEP_SIZE>(reader.full_block(), reader);
    indexer.step<STEP_SIZE>(reader.full_block(), reader);
  }

  if (likely(reader.has_remainder())) {
    uint8_t block[STEP_SIZE];
    reader.get_remainder(block);
    indexer.index_step<STEP_SIZE>(block, reader);
    indexer.step<STEP_SIZE>(block, reader);
  }

  return indexer.finish(parser, reader.block_index(), len, streaming);
@@ -37,7 +37,7 @@ really_inline int leading_zeroes(uint64_t input_num) {
  return static_cast<int>(_lzcnt_u64(input_num));
}

really_inline int hamming(uint64_t input_num) {
really_inline int count_ones(uint64_t input_num) {
#ifdef _MSC_VER
  // note: we do not support legacy 32-bit Windows
  return __popcnt64(input_num);// Visual Studio wants two underscores
@@ -14,6 +14,7 @@ public:
    instruction_set::AVX2 | instruction_set::PCLMULQDQ | instruction_set::BMI1 | instruction_set::BMI2
  ) {}
  WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
  WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
  WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept final;
  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser, size_t &next_json) const noexcept final;
@@ -2,6 +2,8 @@
#define SIMDJSON_HASWELL_SIMD_H

#include "simdjson.h"
#include "simdprune_tables.h"
#include "haswell/bitmanipulation.h"
#include "haswell/intrinsics.h"

TARGET_HASWELL
@@ -109,6 +111,57 @@ namespace simdjson::haswell::simd {
  really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    return _mm256_shuffle_epi8(lookup_table, *this);
  }

  // Copies to 'output' all bytes corresponding to a 0 in the mask (interpreted as a bitset).
  // Passing a 0 value for mask would be equivalent to writing out every byte to output.
  // Only the first 32 - count_ones(mask) bytes of the result are significant but 32 bytes
  // get written.
  // Design consideration: it seems like a function with the
  // signature simd8<L> compress(uint32_t mask) would be
  // sensible, but the AVX ISA makes this kind of approach difficult.
  template<typename L>
  really_inline void compress(uint32_t mask, L * output) const {
    // this particular implementation was inspired by work done by @animetosho
    // we do it in four steps, first 8 bytes and then second 8 bytes...
    uint8_t mask1 = static_cast<uint8_t>(mask); // least significant 8 bits
    uint8_t mask2 = static_cast<uint8_t>(mask >> 8); // second least significant 8 bits
    uint8_t mask3 = static_cast<uint8_t>(mask >> 16); // ...
    uint8_t mask4 = static_cast<uint8_t>(mask >> 24); // ...
    // next line just loads the 64-bit values thintable_epi8[mask1] through
    // thintable_epi8[mask4] into a 256-bit register.
    __m256i shufmask = _mm256_set_epi64x(thintable_epi8[mask4], thintable_epi8[mask3],
                                         thintable_epi8[mask2], thintable_epi8[mask1]);
    // we increment by 0x08 the second half of the mask and so forth
    shufmask =
        _mm256_add_epi8(shufmask, _mm256_set_epi32(0x18181818, 0x18181818,
                                                   0x10101010, 0x10101010, 0x08080808, 0x08080808, 0, 0));
    // this is the version "nearly pruned"
    __m256i pruned = _mm256_shuffle_epi8(*this, shufmask);
    // we still need to put the pieces back together.
    // we compute the popcount of the first words:
    int pop1 = BitsSetTable256mul2[mask1];
    int pop3 = BitsSetTable256mul2[mask3];

    // then load the corresponding mask
    // could be done with _mm256_loadu2_m128i but many standard libraries omit this intrinsic.
    __m256i v256 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(pshufb_combine_table + pop1 * 8)));
    __m256i compactmask = _mm256_insertf128_si256(v256,
        _mm_loadu_si128((const __m128i *)(pshufb_combine_table + pop3 * 8)), 1);
    __m256i almostthere = _mm256_shuffle_epi8(pruned, compactmask);
    // We just need to write out the result.
    // This is the tricky bit that is hard to do
    // if we want to return a SIMD register, since there
    // is no single-instruction approach to recombine
    // the two 128-bit lanes with an offset.
    __m128i v128;
    v128 = _mm256_castsi256_si128(almostthere);
    _mm_storeu_si128( (__m128i *)output, v128);
    v128 = _mm256_extractf128_si256(almostthere, 1);
    _mm_storeu_si128( (__m128i *)(output + 16 - count_ones(mask & 0xFFFF)), v128);
  }

  template<typename L>
  really_inline simd8<L> lookup_16(
      L replace0, L replace1, L replace2, L replace3,
@@ -249,6 +302,13 @@ namespace simdjson::haswell::simd {
    each(1);
  }

  really_inline void compress(uint64_t mask, T * output) const {
    uint32_t mask1 = static_cast<uint32_t>(mask);
    uint32_t mask2 = static_cast<uint32_t>(mask >> 32);
    this->chunks[0].compress(mask1, output);
    this->chunks[1].compress(mask2, output + 32 - count_ones(mask1));
  }

  really_inline void store(T ptr[64]) const {
    this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
    this->chunks[1].store(ptr+sizeof(simd8<T>)*1);

@@ -269,6 +329,8 @@ namespace simdjson::haswell::simd {
    );
  }


  template <typename R=bool, typename F>
  really_inline simd8x64<R> map(const simd8x64<uint8_t> b, F const& map_chunk) const {
    return simd8x64<R>(

@@ -302,7 +364,6 @@ namespace simdjson::haswell::simd {
    const simd8<T> mask = simd8<T>::splat(m);
    return this->map( [&](auto a) { return a <= mask; } ).to_bitmask();
  }

}; // struct simd8x64<T>

} // namespace simdjson::haswell::simd
@@ -30,6 +30,11 @@ really_inline json_character_block json_character_block::classify(const simd::si
  auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
  auto op_table = simd8<uint8_t>::repeat_16(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{');

  // We compute whitespace and op separately. If the code later only uses one or the
  // other, given the fact that all functions are aggressively inlined, we can
  // hope that useless computations will be omitted. This is namely the case when
  // minifying (we only need whitespace).

  uint64_t whitespace = in.map([&](simd8<uint8_t> _in) {
    return _in == simd8<uint8_t>(_mm256_shuffle_epi8(whitespace_table, _in));
  }).to_bitmask();
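The whitespace_table above relies on a classic pshufb trick: the shuffle indexes the 16-entry table by each byte's low nibble, and the entries are chosen so that table[c & 0xF] == c holds exactly for the four JSON whitespace bytes. A scalar model (my illustration, not part of the commit):

#include <cstdint>

// table[c & 0xF] == c exactly when c is ' ' (0x20), '\t' (0x09), '\n' (0x0A)
// or '\r' (0x0D); the filler values (100, 17, 113, 2, 112) never collide with
// any byte sharing their low nibble. pshufb also zeroes lanes whose high bit
// is set, which cannot create a false match since every entry is below 0x80.
static bool is_json_whitespace_model(uint8_t c) {
  static const uint8_t table[16] = {' ', 100,  100,  100, 17,  100,  113,  2,
                                    100, '\t', '\n', 112, 100, '\r', 100, 100};
  return table[c & 0xF] == c;
}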
@@ -54,11 +59,17 @@ really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8
  return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
}

#include "generic/utf8_lookup2_algorithm.h"
#include "generic/buf_block_reader.h"
#include "generic/json_string_scanner.h"
#include "generic/json_scanner.h"
#include "generic/json_structural_indexer.h"

#include "generic/json_minifier.h"
WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
  return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len);
}

#include "generic/utf8_lookup2_algorithm.h"
#include "generic/json_structural_indexer.h"
WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept {
  return haswell::stage1::json_structural_indexer::index<128>(buf, len, parser, streaming);
}
@@ -1,5 +1,7 @@
#include "simdjson.h"
#include "isadetection.h"
#include "simdprune_tables.h"

#include <initializer_list>

// Static array of known implementations. We're hoping these get baked into the executable

@@ -48,6 +50,9 @@ public:
  WARN_UNUSED error_code parse(const uint8_t *, size_t, document::parser &) const noexcept final {
    return UNSUPPORTED_ARCHITECTURE;
  }
  WARN_UNUSED error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final {
    return UNSUPPORTED_ARCHITECTURE;
  }
  WARN_UNUSED error_code stage1(const uint8_t *, size_t, document::parser &, bool) const noexcept final {
    return UNSUPPORTED_ARCHITECTURE;
  }
@ -1,478 +0,0 @@
|
|||
#include "simdjson.h"
|
||||
#include <cstdint>
|
||||
|
||||
#ifndef SIMDJSON_ISSUE384RESOLVED // to avoid tripping users
|
||||
|
||||
namespace simdjson {
|
||||
static uint8_t jump_table[256 * 3] = {
|
||||
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
|
||||
1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
|
||||
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
|
||||
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
|
||||
1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
|
||||
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
|
||||
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
|
||||
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
|
||||
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
|
||||
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
|
||||
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
|
||||
1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
|
||||
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
|
||||
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
|
||||
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
|
||||
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
|
||||
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
|
||||
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
|
||||
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
|
||||
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
|
||||
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
|
||||
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
|
||||
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
|
||||
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
|
||||
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
|
||||
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
|
||||
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
|
||||
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
|
||||
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
|
||||
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
|
||||
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
|
||||
};
|
||||
|
||||
size_t json_minify(const unsigned char *bytes, size_t how_many,
|
||||
unsigned char *out) {
|
||||
size_t i = 0, pos = 0;
|
||||
uint8_t quote = 0;
|
||||
uint8_t nonescape = 1;
|
||||
|
||||
while (i < how_many) {
|
||||
unsigned char c = bytes[i];
|
||||
uint8_t *meta = jump_table + 3 * c;
|
||||
|
||||
quote = quote ^ (meta[0] & nonescape);
|
||||
out[pos] = c;
|
||||
pos += meta[2] | quote;
|
||||
|
||||
i += 1;
|
||||
nonescape = (~nonescape) | (meta[1]);
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
} // namespace simdjson
|
||||
#else
|
||||
|
||||
//
|
||||
// This fast code is disabled.
|
||||
// See issue https://github.com/lemire/simdjson/issues/384
|
||||
//
|
||||
#include "simdprune_tables.h"
|
||||
#include <cstring>
|
||||
#include <x86intrin.h> // currently, there is no runtime dispatch for the minifier
|
||||
|
||||
namespace simdjson {
|
||||
|
||||
// a straightforward comparison of a mask against input.
|
||||
static uint64_t cmp_mask_against_input_mini(__m256i input_lo, __m256i input_hi,
|
||||
__m256i mask) {
|
||||
__m256i cmp_res_0 = _mm256_cmpeq_epi8(input_lo, mask);
|
||||
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
|
||||
__m256i cmp_res_1 = _mm256_cmpeq_epi8(input_hi, mask);
|
||||
uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
|
||||
return res_0 | (res_1 << 32);
|
||||
}
|
||||
|
||||
// Write up to 16 bytes, only the bytes corresponding to a 1-bit are written
|
||||
// out. credit: Anime Tosho
|
||||
static __m128i skinnycleanm128(__m128i x, int mask) {
|
||||
int mask1 = mask & 0xFF;
|
||||
int mask2 = (mask >> 8) & 0xFF;
|
||||
__m128i shufmask = _mm_castps_si128(
|
||||
_mm_loadh_pi(_mm_castsi128_ps(_mm_loadl_epi64(
|
||||
(const __m128i *)(thintable_epi8 + mask1))),
|
||||
(const __m64 *)(thintable_epi8 + mask2)));
|
||||
shufmask =
|
||||
_mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
|
||||
__m128i pruned = _mm_shuffle_epi8(x, shufmask);
|
||||
intptr_t popx2 = BitsSetTable256mul2[mask1];
|
||||
__m128i compactmask =
|
||||
_mm_loadu_si128((const __m128i *)(pshufb_combine_table + popx2 * 8));
|
||||
return _mm_shuffle_epi8(pruned, compactmask);
|
||||
}
|
||||
|
||||
// take input from buf and remove useless whitespace, input and output can be
|
||||
// the same, result is null terminated, return the string length (minus the null
|
||||
// termination)
|
||||
size_t json_minify(const uint8_t *buf, size_t len, uint8_t *out) {
|
||||
// Useful constant masks
|
||||
const uint64_t even_bits = 0x5555555555555555ULL;
|
||||
const uint64_t odd_bits = ~even_bits;
|
||||
uint8_t *initout(out);
|
||||
uint64_t prev_iter_ends_odd_backslash =
|
||||
0ULL; // either 0 or 1, but a 64-bit value
|
||||
uint64_t prev_iter_inside_quote = 0ULL; // either all zeros or all ones
|
||||
size_t idx = 0;
|
||||
if (len >= 64) {
|
||||
size_t avx_len = len - 63;
|
||||
|
||||
for (; idx < avx_len; idx += 64) {
|
||||
__m256i input_lo =
|
||||
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 0));
|
||||
__m256i input_hi =
|
||||
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 32));
|
||||
uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi,
|
||||
_mm256_set1_epi8('\\'));
|
||||
uint64_t start_edges = bs_bits & ~(bs_bits << 1);
|
||||
uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
|
||||
uint64_t even_starts = start_edges & even_start_mask;
|
||||
uint64_t odd_starts = start_edges & ~even_start_mask;
|
||||
uint64_t even_carries = bs_bits + even_starts;
|
||||
uint64_t odd_carries;
|
||||
bool iter_ends_odd_backslash =
|
||||
add_overflow(bs_bits, odd_starts, &odd_carries);
|
||||
odd_carries |= prev_iter_ends_odd_backslash;
|
||||
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
|
||||
uint64_t even_carry_ends = even_carries & ~bs_bits;
|
||||
uint64_t odd_carry_ends = odd_carries & ~bs_bits;
|
||||
uint64_t even_start_odd_end = even_carry_ends & odd_bits;
|
||||
uint64_t odd_start_even_end = odd_carry_ends & even_bits;
|
||||
uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
|
||||
uint64_t quote_bits = cmp_mask_against_input_mini(input_lo, input_hi,
|
||||
_mm256_set1_epi8('"'));
|
||||
quote_bits = quote_bits & ~odd_ends;
|
||||
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
|
||||
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
|
||||
quote_mask ^= prev_iter_inside_quote;
|
||||
prev_iter_inside_quote = static_cast<uint64_t>(
|
||||
static_cast<int64_t>(quote_mask) >>
|
||||
63); // might be undefined behavior, should be fully defined in C++20,
|
||||
// ok according to John Regher from Utah University
|
||||
const __m256i low_nibble_mask = _mm256_setr_epi8(
|
||||
// 0 9 a b c d
|
||||
16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
|
||||
const __m256i high_nibble_mask = _mm256_setr_epi8(
|
||||
// 0 2 3 5 7
|
||||
8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, 8, 0, 18, 4, 0, 1, 0,
|
||||
1, 0, 0, 0, 3, 2, 1, 0, 0);
|
||||
__m256i whitespace_shufti_mask = _mm256_set1_epi8(0x18);
|
||||
__m256i v_lo = _mm256_and_si256(
|
||||
_mm256_shuffle_epi8(low_nibble_mask, input_lo),
|
||||
_mm256_shuffle_epi8(high_nibble_mask,
|
||||
_mm256_and_si256(_mm256_srli_epi32(input_lo, 4),
|
||||
_mm256_set1_epi8(0x7f))));
|
||||
|
||||
__m256i v_hi = _mm256_and_si256(
|
||||
_mm256_shuffle_epi8(low_nibble_mask, input_hi),
|
||||
_mm256_shuffle_epi8(high_nibble_mask,
|
||||
_mm256_and_si256(_mm256_srli_epi32(input_hi, 4),
|
||||
_mm256_set1_epi8(0x7f))));
|
||||
__m256i tmp_ws_lo = _mm256_cmpeq_epi8(
|
||||
_mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0));
|
||||
__m256i tmp_ws_hi = _mm256_cmpeq_epi8(
|
||||
_mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0));
|
||||
|
||||
uint64_t ws_res_0 =
|
||||
static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
|
||||
uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
|
||||
uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));
|
||||
whitespace &= ~quote_mask;
|
||||
|
||||
uint64_t non_whitespace = ~whitespace;
|
||||
|
||||
__m128i x1 = _mm256_extracti128_si256(input_lo, 0);
|
||||
__m128i x2 = _mm256_extracti128_si256(input_lo, 1);
|
||||
__m128i x3 = _mm256_extracti128_si256(input_hi, 0);
|
||||
__m128i x4 = _mm256_extracti128_si256(input_hi, 1);
|
||||
|
||||
int mask1 = non_whitespace & 0xFFFF;
|
||||
int mask2 = (non_whitespace >> 16) & 0xFFFF;
|
||||
int mask3 = (non_whitespace >> 32) & 0xFFFF;
|
||||
int mask4 = (non_whitespace >> 48) & 0xFFFF;
|
||||
|
||||
x1 = skinnycleanm128(x1, mask1);
|
||||
x2 = skinnycleanm128(x2, mask2);
|
||||
x3 = skinnycleanm128(x3, mask3);
|
||||
x4 = skinnycleanm128(x4, mask4);
|
||||
int pop1 = hamming(non_whitespace & 0xFFFF);
|
||||
int pop2 = hamming(non_whitespace & UINT64_C(0xFFFFFFFF));
|
||||
int pop3 = hamming(non_whitespace & UINT64_C(0xFFFFFFFFFFFF));
|
||||
int pop4 = hamming(non_whitespace);
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i *>(out), x1);
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop1), x2);
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop2), x3);
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop3), x4);
|
||||
out += pop4;
|
||||
}
|
||||
}
|
||||
  // we finish off the job... copying and pasting the code is not ideal here,
  // but it gets the job done.
  if (idx < len) {
    uint8_t buffer[64];
    memset(buffer, 0, 64);
    memcpy(buffer, buf + idx, len - idx);
    __m256i input_lo =
        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer));
    __m256i input_hi =
        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer + 32));
    uint64_t bs_bits =
        cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('\\'));
    uint64_t start_edges = bs_bits & ~(bs_bits << 1);
    uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
    uint64_t even_starts = start_edges & even_start_mask;
    uint64_t odd_starts = start_edges & ~even_start_mask;
    uint64_t even_carries = bs_bits + even_starts;
    uint64_t odd_carries;
    // bool iter_ends_odd_backslash =
    add_overflow(bs_bits, odd_starts, &odd_carries);
    odd_carries |= prev_iter_ends_odd_backslash;
    // prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
    // we never use it on the final partial block
    uint64_t even_carry_ends = even_carries & ~bs_bits;
    uint64_t odd_carry_ends = odd_carries & ~bs_bits;
    uint64_t even_start_odd_end = even_carry_ends & odd_bits;
    uint64_t odd_start_even_end = odd_carry_ends & even_bits;
    uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
    uint64_t quote_bits =
        cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('"'));
    quote_bits = quote_bits & ~odd_ends;
    uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
        _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
    quote_mask ^= prev_iter_inside_quote;
    // prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);
    // we don't need it anymore on the final partial block

    __m256i mask_20 = _mm256_set1_epi8(0x20); // c == 32
    __m256i mask_70 =
        _mm256_set1_epi8(0x70); // adding 0x70 does not check the low 4 bits
    // but moves any value >= 16 above 128

    __m256i lut_cntrl = _mm256_setr_epi8(
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0x00,
        0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00);

    __m256i tmp_ws_lo = _mm256_or_si256(
        _mm256_cmpeq_epi8(mask_20, input_lo),
        _mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_lo)));
    __m256i tmp_ws_hi = _mm256_or_si256(
        _mm256_cmpeq_epi8(mask_20, input_hi),
        _mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_hi)));
    uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
    uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
    uint64_t whitespace = (ws_res_0 | (ws_res_1 << 32));
    whitespace &= ~quote_mask;

    if (len - idx < 64) {
      whitespace |= UINT64_C(0xFFFFFFFFFFFFFFFF) << (len - idx);
    }
    uint64_t non_whitespace = ~whitespace;

    __m128i x1 = _mm256_extracti128_si256(input_lo, 0);
    __m128i x2 = _mm256_extracti128_si256(input_lo, 1);
    __m128i x3 = _mm256_extracti128_si256(input_hi, 0);
    __m128i x4 = _mm256_extracti128_si256(input_hi, 1);

    int mask1 = non_whitespace & 0xFFFF;
    int mask2 = (non_whitespace >> 16) & 0xFFFF;
    int mask3 = (non_whitespace >> 32) & 0xFFFF;
    int mask4 = (non_whitespace >> 48) & 0xFFFF;

    x1 = skinnycleanm128(x1, mask1);
    x2 = skinnycleanm128(x2, mask2);
    x3 = skinnycleanm128(x3, mask3);
    x4 = skinnycleanm128(x4, mask4);
    int pop1 = hamming(non_whitespace & 0xFFFF);
    int pop2 = hamming(non_whitespace & UINT64_C(0xFFFFFFFF));
    int pop3 = hamming(non_whitespace & UINT64_C(0xFFFFFFFFFFFF));
    int pop4 = hamming(non_whitespace);
    // full 16-byte stores: the output buffer must have enough slack
    _mm_storeu_si128(reinterpret_cast<__m128i *>(out), x1);
    _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop1), x2);
    _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop2), x3);
    _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop3), x4);
    out += pop4;
  }
  *out = '\0'; // NUL termination
  return out - initout;
}
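
An aside on the quote-mask step used in both tails above: carry-lessly multiplying the quote bitset by an all-ones 64-bit constant computes a prefix XOR, so bit i of quote_mask is the XOR of all quote bits at positions 0 through i, which is 1 exactly when position i lies inside a quoted region. A minimal scalar sketch of the same computation (a hypothetical helper for illustration, not code from this commit):

#include <cstdint>

// Scalar equivalent of _mm_clmulepi64_si128(quote_bits, all-ones, 0):
// bit i of the result is the XOR of quote bits 0..i (a running XOR).
static inline uint64_t prefix_xor_scalar(uint64_t quote_bits) {
  uint64_t mask = quote_bits;
  mask ^= mask << 1;
  mask ^= mask << 2;
  mask ^= mask << 4;
  mask ^= mask << 8;
  mask ^= mask << 16;
  mask ^= mask << 32;
  return mask;
}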
size_t oldjson_minify(const uint8_t *buf, size_t len, uint8_t *out) {
  // Useful constant masks
  const uint64_t even_bits = 0x5555555555555555ULL;
  const uint64_t odd_bits = ~even_bits;
  uint8_t *initout(out);
  uint64_t prev_iter_ends_odd_backslash =
      0ULL;                               // either 0 or 1, but a 64-bit value
  uint64_t prev_iter_inside_quote = 0ULL; // either all zeros or all ones
  size_t idx = 0;
  if (len >= 64) {
    size_t avx_len = len - 63;

    for (; idx < avx_len; idx += 64) {
      __m256i input_lo =
          _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 0));
      __m256i input_hi =
          _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 32));
      uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi,
                                                     _mm256_set1_epi8('\\'));
      uint64_t start_edges = bs_bits & ~(bs_bits << 1);
      uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
      uint64_t even_starts = start_edges & even_start_mask;
      uint64_t odd_starts = start_edges & ~even_start_mask;
      uint64_t even_carries = bs_bits + even_starts;
      uint64_t odd_carries;
      bool iter_ends_odd_backslash =
          add_overflow(bs_bits, odd_starts, &odd_carries);
      odd_carries |= prev_iter_ends_odd_backslash;
      prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
      uint64_t even_carry_ends = even_carries & ~bs_bits;
      uint64_t odd_carry_ends = odd_carries & ~bs_bits;
      uint64_t even_start_odd_end = even_carry_ends & odd_bits;
      uint64_t odd_start_even_end = odd_carry_ends & even_bits;
      uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
      uint64_t quote_bits = cmp_mask_against_input_mini(input_lo, input_hi,
                                                        _mm256_set1_epi8('"'));
      quote_bits = quote_bits & ~odd_ends;
      uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
          _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
      quote_mask ^= prev_iter_inside_quote;
      prev_iter_inside_quote = static_cast<uint64_t>(
          static_cast<int64_t>(quote_mask) >>
          63); // implementation-defined before C++20, fully defined in C++20,
               // ok according to John Regehr from the University of Utah
      const __m256i low_nibble_mask = _mm256_setr_epi8(
          //  0                          9   a  b  c  d
          16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0,
          0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
      const __m256i high_nibble_mask = _mm256_setr_epi8(
          // 0     2  3     5     7
          8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, 8, 0, 18, 4, 0, 1, 0,
          1, 0, 0, 0, 3, 2, 1, 0, 0);
      __m256i whitespace_shufti_mask = _mm256_set1_epi8(0x18);
      __m256i v_lo = _mm256_and_si256(
          _mm256_shuffle_epi8(low_nibble_mask, input_lo),
          _mm256_shuffle_epi8(high_nibble_mask,
                              _mm256_and_si256(_mm256_srli_epi32(input_lo, 4),
                                               _mm256_set1_epi8(0x7f))));

      __m256i v_hi = _mm256_and_si256(
          _mm256_shuffle_epi8(low_nibble_mask, input_hi),
          _mm256_shuffle_epi8(high_nibble_mask,
                              _mm256_and_si256(_mm256_srli_epi32(input_hi, 4),
                                               _mm256_set1_epi8(0x7f))));
      __m256i tmp_ws_lo = _mm256_cmpeq_epi8(
          _mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0));
      __m256i tmp_ws_hi = _mm256_cmpeq_epi8(
          _mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0));

      uint64_t ws_res_0 =
          static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
      uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
      uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));
      whitespace &= ~quote_mask;
      int mask1 = whitespace & 0xFFFF;
      int mask2 = (whitespace >> 16) & 0xFFFF;
      int mask3 = (whitespace >> 32) & 0xFFFF;
      int mask4 = (whitespace >> 48) & 0xFFFF;
      int pop1 = hamming((~whitespace) & 0xFFFF);
      int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF));
      int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
      int pop4 = hamming((~whitespace));
      __m128i x1 = _mm256_extracti128_si256(input_lo, 0);
      __m128i x2 = _mm256_extracti128_si256(input_lo, 1);
      __m128i x3 = _mm256_extracti128_si256(input_hi, 0);
      __m128i x4 = _mm256_extracti128_si256(input_hi, 1);
      x1 = skinnycleanm128(x1, mask1);
      x2 = skinnycleanm128(x2, mask2);
      x3 = skinnycleanm128(x3, mask3);
      x4 = skinnycleanm128(x4, mask4);
      _mm_storeu_si128(reinterpret_cast<__m128i *>(out), x1);
      _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop1), x2);
      _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop2), x3);
      _mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop3), x4);
      out += pop4;
    }
  }
  // we finish off the job... copying and pasting the code is not ideal here,
  // but it gets the job done.
  if (idx < len) {
    uint8_t buffer[64];
    memset(buffer, 0, 64);
    memcpy(buffer, buf + idx, len - idx);
    __m256i input_lo =
        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer));
    __m256i input_hi =
        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer + 32));
    uint64_t bs_bits =
        cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('\\'));
    uint64_t start_edges = bs_bits & ~(bs_bits << 1);
    uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
    uint64_t even_starts = start_edges & even_start_mask;
    uint64_t odd_starts = start_edges & ~even_start_mask;
    uint64_t even_carries = bs_bits + even_starts;
    uint64_t odd_carries;
    // bool iter_ends_odd_backslash =
    add_overflow(bs_bits, odd_starts, &odd_carries);
    odd_carries |= prev_iter_ends_odd_backslash;
    // prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
    // we never use it
    uint64_t even_carry_ends = even_carries & ~bs_bits;
    uint64_t odd_carry_ends = odd_carries & ~bs_bits;
    uint64_t even_start_odd_end = even_carry_ends & odd_bits;
    uint64_t odd_start_even_end = odd_carry_ends & even_bits;
    uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
    uint64_t quote_bits =
        cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('"'));
    quote_bits = quote_bits & ~odd_ends;
    uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
        _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
    quote_mask ^= prev_iter_inside_quote;
    // prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);
    // we don't need it anymore

    __m256i mask_20 = _mm256_set1_epi8(0x20); // c == 32
    __m256i mask_70 =
        _mm256_set1_epi8(0x70); // adding 0x70 does not check the low 4 bits
    // but moves any value >= 16 above 128

    __m256i lut_cntrl = _mm256_setr_epi8(
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0x00,
        0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00);

    __m256i tmp_ws_lo = _mm256_or_si256(
        _mm256_cmpeq_epi8(mask_20, input_lo),
        _mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_lo)));
    __m256i tmp_ws_hi = _mm256_or_si256(
        _mm256_cmpeq_epi8(mask_20, input_hi),
        _mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_hi)));
    uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
    uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
    uint64_t whitespace = (ws_res_0 | (ws_res_1 << 32));
    whitespace &= ~quote_mask;

    if (len - idx < 64) {
      whitespace |= UINT64_C(0xFFFFFFFFFFFFFFFF) << (len - idx);
    }
    int mask1 = whitespace & 0xFFFF;
    int mask2 = (whitespace >> 16) & 0xFFFF;
    int mask3 = (whitespace >> 32) & 0xFFFF;
    int mask4 = (whitespace >> 48) & 0xFFFF;
    int pop1 = hamming((~whitespace) & 0xFFFF);
    int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF));
    int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
    int pop4 = hamming((~whitespace));
    __m128i x1 = _mm256_extracti128_si256(input_lo, 0);
    __m128i x2 = _mm256_extracti128_si256(input_lo, 1);
    __m128i x3 = _mm256_extracti128_si256(input_hi, 0);
    __m128i x4 = _mm256_extracti128_si256(input_hi, 1);
    x1 = skinnycleanm128(x1, mask1);
    x2 = skinnycleanm128(x2, mask2);
    x3 = skinnycleanm128(x3, mask3);
    x4 = skinnycleanm128(x4, mask4);
    _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer), x1);
    _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + pop1), x2);
    _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + pop2), x3);
    _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + pop3), x4);
    memcpy(out, buffer, pop4);
    out += pop4;
  }
  *out = '\0'; // NUL termination
  return out - initout;
}
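
A quick driver for oldjson_minify (a hypothetical usage sketch, assuming the function is visible to the caller; it is not part of this commit). The minified text is never longer than the input, and the tail block stages its stores through a local buffer, so len + 1 output bytes suffice here:

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
  const char *json = "{ \"a b\" : [ 1, 2, 3 ] }";
  size_t len = strlen(json);
  std::vector<uint8_t> out(len + 1); // + 1 for the terminating NUL
  size_t n = simdjson::oldjson_minify(
      reinterpret_cast<const uint8_t *>(json), len, out.data());
  printf("%zu bytes: %s\n", n, out.data()); // {"a b":[1,2,3]} -- the space
                                            // inside the string is preserved
}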

} // namespace simdjson
#endif
@@ -1,5 +1,4 @@
#include "simdjson.h"
#include "implementation.cpp"
#include "jsonminifier.cpp"
#include "stage1_find_marks.cpp"
#include "stage2_build_tape.cpp"
@@ -46,7 +46,7 @@ really_inline int leading_zeroes(uint64_t input_num) {
#endif // _MSC_VER
}

really_inline int hamming(uint64_t input_num) {
really_inline int count_ones(uint64_t input_num) {
#ifdef _MSC_VER
  // note: we do not support legacy 32-bit Windows
  return __popcnt64(input_num); // Visual Studio wants two underscores
@@ -11,6 +11,7 @@ class implementation final : public simdjson::implementation {
public:
  really_inline implementation() : simdjson::implementation("westmere", "Intel/AMD SSE4.2", instruction_set::SSE42 | instruction_set::PCLMULQDQ) {}
  WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
  WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
  WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept final;
  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser, size_t &next_json) const noexcept final;
@@ -2,8 +2,12 @@
#define SIMDJSON_WESTMERE_SIMD_H

#include "simdjson.h"
#include "simdprune_tables.h"
#include "westmere/bitmanipulation.h"
#include "westmere/intrinsics.h"

TARGET_WESTMERE
namespace simdjson::westmere::simd {
@@ -106,6 +110,42 @@ namespace simdjson::westmere::simd {
    really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
      return _mm_shuffle_epi8(lookup_table, *this);
    }

    // Copies to 'output' all bytes corresponding to a 0 in the mask (interpreted as a bitset).
    // Passing a 0 value for mask would be equivalent to writing out every byte to output.
    // Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes
    // get written.
    // Design consideration: it seems like a function with the
    // signature simd8<L> compress(uint32_t mask) would be
    // sensible, but the AVX ISA makes this kind of approach difficult.
    template<typename L>
    really_inline void compress(uint16_t mask, L * output) const {
      // this particular implementation was inspired by work done by @animetosho
      // we do it in two steps, first 8 bytes and then second 8 bytes
      uint8_t mask1 = static_cast<uint8_t>(mask);      // least significant 8 bits
      uint8_t mask2 = static_cast<uint8_t>(mask >> 8); // most significant 8 bits
      // the next line loads the 64-bit values thintable_epi8[mask1] and
      // thintable_epi8[mask2] into a 128-bit register, using only
      // two instructions on most compilers.
      __m128i shufmask = _mm_set_epi64x(thintable_epi8[mask2], thintable_epi8[mask1]);
      // we increment by 0x08 the second half of the mask
      shufmask =
          _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
      // this is the "nearly pruned" version
      __m128i pruned = _mm_shuffle_epi8(*this, shufmask);
      // we still need to put the two halves together.
      // we compute the popcount of the first half:
      int pop1 = BitsSetTable256mul2[mask1];
      // then load the corresponding mask: it writes
      // only the first pop1 bytes from the first 8 bytes, and then
      // fills in with the bytes from the second 8 bytes, plus some filler
      // at the end.
      __m128i compactmask =
          _mm_loadu_si128((const __m128i *)(pshufb_combine_table + pop1 * 8));
      __m128i answer = _mm_shuffle_epi8(pruned, compactmask);
      _mm_storeu_si128((__m128i *)(output), answer);
    }
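
For intuition, here is a scalar model of what compress computes (my sketch, not code from this commit): bytes whose mask bit is 1 are dropped, survivors are packed to the front, and everything past the survivors is insignificant filler.

#include <cstddef>
#include <cstdint>

// Scalar model of simd8<L>::compress(mask, output): keep bytes whose mask
// bit is 0, in order; the last count_ones(mask) output bytes are filler.
void compress_model(const uint8_t in[16], uint16_t mask, uint8_t out[16]) {
  size_t pos = 0;
  for (size_t i = 0; i < 16; i++) {
    if (!((mask >> i) & 1)) { out[pos++] = in[i]; }
  }
  // out[pos..15] is written by the SIMD version but carries no meaning
}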

    template<typename L>
    really_inline simd8<L> lookup_16(
        L replace0, L replace1, L replace2, L replace3,
@@ -235,6 +275,13 @@ namespace simdjson::westmere::simd {
      this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
    }

    really_inline void compress(uint64_t mask, T * output) const {
      this->chunks[0].compress(mask, output);
      this->chunks[1].compress(mask >> 16, output + 16 - count_ones(mask & 0xFFFF));
      this->chunks[2].compress(mask >> 32, output + 32 - count_ones(mask & 0xFFFFFFFF));
      this->chunks[3].compress(mask >> 48, output + 48 - count_ones(mask & 0xFFFFFFFFFFFF));
    }
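
The offsets are what make this work: each chunk's survivors must land immediately after the previous chunk's, and 16 - count_ones(mask & 0xFFFF) is precisely how many bytes chunk 0 kept. A 64-byte scalar model (my sketch, not part of the commit):

#include <cstddef>
#include <cstdint>

// Scalar model of simd8x64<T>::compress: drop bytes whose mask bit is 1.
size_t compress64_model(const uint8_t in[64], uint64_t mask, uint8_t *out) {
  size_t pos = 0;
  for (size_t i = 0; i < 64; i++) {
    if (!((mask >> i) & 1)) { out[pos++] = in[i]; }
  }
  return pos; // == 64 - count_ones(mask)
}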

    template <typename F>
    static really_inline void each_index(F const& each) {
      each(0);
@@ -302,7 +349,6 @@ namespace simdjson::westmere::simd {
      const simd8<T> mask = simd8<T>::splat(m);
      return this->map( [&](auto a) { return a <= mask; } ).to_bitmask();
    }

  }; // struct simd8x64<T>

} // namespace simdjson::westmere::simd
@@ -29,6 +29,11 @@ really_inline json_character_block json_character_block::classify(const simd::si
  auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
  auto op_table = simd8<uint8_t>::repeat_16(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{');

  // We compute whitespace and op separately. If later code uses only one or the
  // other, then, since all of these functions are aggressively inlined, we can
  // hope that the useless computation will be optimized away. This is notably
  // the case when minifying (we only need whitespace).

  uint64_t whitespace = in.map([&](simd8<uint8_t> _in) {
    return _in == simd8<uint8_t>(_mm_shuffle_epi8(whitespace_table, _in));
  }).to_bitmask();
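
The whitespace_table deserves a gloss: a byte c is classified as whitespace when the table entry selected by its low nibble equals c itself, and the filler values (100, 17, 113, 2, 112) are chosen so that no input byte can ever collide with them. A scalar model, assuming pshufb semantics (my sketch, not code from this commit):

#include <cstdint>

// Scalar model of the whitespace_table lookup. _mm_shuffle_epi8 returns 0
// when the index byte has its high bit set, otherwise table[c & 0xF].
bool is_json_whitespace_model(uint8_t c) {
  static const uint8_t table[16] = {' ', 100, 100, 100, 17, 100, 113, 2,
                                    100, '\t', '\n', 112, 100, '\r', 100, 100};
  if (c & 0x80) { return false; } // pshufb yields 0, which cannot equal c
  return table[c & 0xF] == c;     // true only for ' ', '\t', '\n', '\r'
}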
@@ -53,11 +58,17 @@ really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8
  return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
}

#include "generic/utf8_lookup2_algorithm.h"
#include "generic/buf_block_reader.h"
#include "generic/json_string_scanner.h"
#include "generic/json_scanner.h"
#include "generic/json_structural_indexer.h"

#include "generic/json_minifier.h"
WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
  return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
}

#include "generic/utf8_lookup2_algorithm.h"
#include "generic/json_structural_indexer.h"
WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept {
  return westmere::stage1::json_structural_indexer::index<64>(buf, len, parser, streaming);
}
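
Callers are expected to reach this per-architecture minify through the dispatch pointer rather than by naming westmere directly. A hedged sketch of the calling pattern (it mirrors the minify tool further down; the slack added to the destination buffer is my assumption, not something this commit specifies):

#include <cstdint>
#include <vector>
#include "simdjson.h"

std::vector<uint8_t> minify_bytes(const uint8_t *buf, size_t len) {
  std::vector<uint8_t> dst(len + 64); // minified text is at most len bytes
  size_t dst_len = 0;
  auto error = simdjson::active_implementation->minify(buf, len, dst.data(), dst_len);
  dst.resize(error ? 0 : dst_len);
  return dst;
}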
@@ -892,7 +892,8 @@ namespace dom_api {
  if (doc["a"].as_uint64_t().first != 1) { cerr << "Expected uint64_t(doc[\"a\"]) to be 1, was " << doc["a"].first << endl; return false; }

  UNUSED document::element val;
  tie(val, error) = doc["d"];
  // tie(val, error) = doc["d"]; fails with "no viable overloaded '='" on Apple clang version 11.0.0
  doc["d"].tie(val, error);
  if (error != simdjson::NO_SUCH_FIELD) { cerr << "Expected NO_SUCH_FIELD error for uint64_t(doc[\"d\"]), got " << error << endl; return false; }
  return true;
}
@@ -906,11 +907,11 @@ namespace dom_api {
  if (doc["obj"]["a"].as_uint64_t().first != 1) { cerr << "Expected uint64_t(doc[\"obj\"][\"a\"]) to be 1, was " << doc["obj"]["a"].first << endl; return false; }

  document::object obj;
  tie(obj, error) = doc.as_object();
  doc.as_object().tie(obj, error); // tie(...) = fails with "no viable overloaded '='" on Apple clang version 11.0.0
  if (error) { cerr << "Error: " << error << endl; return false; }
  if (obj["obj"]["a"].as_uint64_t().first != 1) { cerr << "Expected uint64_t(doc[\"obj\"][\"a\"]) to be 1, was " << doc["obj"]["a"].first << endl; return false; }

  tie(obj, error) = obj["obj"].as_object();
  obj["obj"].as_object().tie(obj, error); // tie(...) = fails with "no viable overloaded '='" on Apple clang version 11.0.0
  if (obj["a"].as_uint64_t().first != 1) { cerr << "Expected uint64_t(obj[\"a\"]) to be 1, was " << obj["a"].first << endl; return false; }
  if (obj["b"].as_uint64_t().first != 2) { cerr << "Expected uint64_t(obj[\"b\"]) to be 2, was " << obj["b"].first << endl; return false; }
  if (obj["c"].as_uint64_t().first != 3) { cerr << "Expected uint64_t(obj[\"c\"]) to be 3, was " << obj["c"].first << endl; return false; }
@@ -920,7 +921,7 @@ namespace dom_api {
  if (obj["a"].as_uint64_t().first != 1) { cerr << "Expected uint64_t(obj[\"a\"]) to be 1, was " << obj["a"].first << endl; return false; }

  UNUSED document::element val;
  tie(val, error) = doc["d"];
  doc["d"].tie(val, error); // tie(...) = fails with "no viable overloaded '='" on Apple clang version 11.0.0
  if (error != simdjson::NO_SUCH_FIELD) { cerr << "Expected NO_SUCH_FIELD error for uint64_t(obj[\"d\"]), got " << error << endl; return false; }
  return true;
}
@@ -944,14 +945,14 @@ namespace dom_api {
  if (error) { cerr << "Error: " << error << endl; return false; }
  for (auto tweet : tweets) {
    document::object user;
    tie(user, error) = tweet["user"].as_object();
    tweet["user"].as_object().tie(user, error); // tie(...) = fails with "no viable overloaded '='" on Apple clang version 11.0.0
    if (error) { cerr << "Error: " << error << endl; return false; }
    bool default_profile;
    tie(default_profile, error) = user["default_profile"].as_bool();
    user["default_profile"].as_bool().tie(default_profile, error); // tie(...) = fails with "no viable overloaded '='" on Apple clang version 11.0.0
    if (error) { cerr << "Error: " << error << endl; return false; }
    if (default_profile) {
      std::string_view screen_name;
      tie(screen_name, error) = user["screen_name"].as_string();
      user["screen_name"].as_string().tie(screen_name, error); // tie(...) = fails with "no viable overloaded '='" on Apple clang version 11.0.0
      if (error) { cerr << "Error: " << error << endl; return false; }
      default_users.insert(screen_name);
    }
@@ -972,13 +973,13 @@ namespace dom_api {
    if (!not_found) {
      for (auto image : media) {
        document::object sizes;
        tie(sizes, error) = image["sizes"].as_object();
        image["sizes"].as_object().tie(sizes, error); // tie(...) = fails with "no viable overloaded '='" on Apple clang version 11.0.0
        if (error) { cerr << "Error: " << error << endl; return false; }
        for (auto [key, size] : sizes) {
          uint64_t width, height;
          tie(width, error) = size["w"].as_uint64_t();
          size["w"].as_uint64_t().tie(width, error); // tie(...) = fails with "no viable overloaded '='" on Apple clang version 11.0.0
          if (error) { cerr << "Error: " << error << endl; return false; }
          tie(height, error) = size["h"].as_uint64_t();
          size["h"].as_uint64_t().tie(height, error); // tie(...) = fails with "no viable overloaded '='" on Apple clang version 11.0.0
          if (error) { cerr << "Error: " << error << endl; return false; }
          image_sizes.insert(make_pair(width, height));
        }
@@ -1,18 +1,90 @@
#include <iostream>
#ifndef _MSC_VER
#include <dirent.h>
#include <unistd.h>
#endif

#include "simdjson.h"

int main(int argc, char *argv[]) {
  if (argc != 2) {
    std::cerr << "Usage: " << argv[0] << " <jsonfile>\n";
    exit(1);
// Stash the exe_name in main() for functions to use
char* exe_name;

void print_usage(std::ostream& out) {
  out << "Usage: " << exe_name << " [-a ARCH] <jsonfile>" << std::endl;
  out << std::endl;
  out << "Minifies the given JSON file, writing the result to standard output." << std::endl;
  out << std::endl;
  out << "Options:" << std::endl;
  out << std::endl;
  out << "-a IMPL - Use the given parser implementation. By default, detects the most advanced" << std::endl;
  out << "          implementation supported on the host machine." << std::endl;
  for (auto impl : simdjson::available_implementations) {
    out << "-a " << std::left << std::setw(9) << impl->name() << " - Use the " << impl->description() << " parser implementation." << std::endl;
  }
  std::string filename = argv[argc - 1];
}

void exit_usage(std::string message) {
  std::cerr << message << std::endl;
  std::cerr << std::endl;
  print_usage(std::cerr);
  exit(EXIT_FAILURE);
}

struct option_struct {
  char* filename;

  option_struct(int argc, char **argv) {
#ifndef _MSC_VER
    int c;

    while ((c = getopt(argc, argv, "a:")) != -1) {
      switch (c) {
      case 'a': {
        const simdjson::implementation *impl = simdjson::available_implementations[optarg];
        if (!impl) {
          std::string exit_message = std::string("Unsupported option value -a ") + optarg + ": expected -a with one of ";
          for (auto imple : simdjson::available_implementations) {
            exit_message += imple->name();
            exit_message += " ";
          }
          exit_usage(exit_message);
        }
        simdjson::active_implementation = impl;
        break;
      }
      default:
        // reaching here means an argument was given to getopt() which did not have a case label
        exit_usage("Unexpected argument - missing case for option " +
                   std::string(1, static_cast<char>(c)) +
                   " (programming error)");
      }
    }
#else
    int optind = 1;
#endif

    // All remaining arguments are considered to be files
    if (optind + 1 == argc) {
      filename = argv[optind];
    } else {
      exit_usage("Please specify exactly one input file.");
    }
  }
};

int main(int argc, char *argv[]) {
  exe_name = argv[0];
  option_struct options(argc, argv);
  std::string filename = options.filename;
  auto [p, error] = simdjson::padded_string::load(filename);
  if (error) {
    std::cerr << "Could not load the file " << filename << std::endl;
    return EXIT_FAILURE;
  }
  simdjson::json_minify(p, p.data());
  printf("%s", p.data());
  simdjson::padded_string copy(p.length());
  size_t copy_len;
  error = simdjson::active_implementation->minify((const uint8_t*)p.data(), p.length(), (uint8_t*)copy.data(), copy_len);
  if (error) { std::cerr << error << std::endl; return 1; }
  printf("%s", copy.data());
}
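
A hypothetical invocation, assuming the tool is built under the name minify (the build target is not shown in this commit):

./minify -a westmere twitter.json > twitter.min.json

Note that main() prints the legacy json_minify output and then the new implementation->minify output, so the redirected file contains the minified document twice; handy for checking that the two paths agree, but presumably temporary.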