Merge branch 'master' of github.com:lemire/simdjson

Conflicts:
	include/simdjson/jsonstream.h
Daniel Lemire 2020-03-22 12:40:34 -04:00
commit 3e39a998ce
54 changed files with 9598 additions and 5729 deletions

View File

@ -1,4 +1,17 @@
kind: pipeline
name: x64-quicktests-libc
platform:
os: linux
arch: amd64
steps:
- name: quicktests
image: conanio/clang8
user: root
commands: [ EXTRAFLAGS=-stdlib=libc++ make quicktests ]
---
kind: pipeline
name: x64-quicktests
platform:

View File

@ -6,7 +6,7 @@ In particular, the following contributions are invited:
- The library is focused on performance. Well-documented performance optimizations are invited.
- Fixes to known or newly discovered bugs are always welcome. Typically, a bug fix should come with a test demonstrating that the bug has been fixed.
- The simdjson library is advanced software and maintanability and flexibility are always a concern. Specific contributions to improve maintanability and flexibility are invited.
- The simdjson library is advanced software and maintainability and flexibility are always a concern. Specific contributions to improve maintainability and flexibility are invited.
@ -28,5 +28,5 @@ Contributors are encouraged to
Though we do not have a formal code of conduct, we will not tolerate bullying, bigotery or intimidation. Everyone is welcome to contribute.
Though we do not have a formal code of conduct, we will not tolerate bullying, bigotry or intimidation. Everyone is welcome to contribute.

View File

@ -53,7 +53,7 @@ endif # ifeq ($(SANITIZE),1)
endif # ifeq ($(MEMSANITIZE),1)
# Headers and sources
SRCHEADERS_GENERIC=src/generic/atomparsing.h src/generic/numberparsing.h src/generic/stage1_find_marks.h src/generic/stage2_build_tape.h src/generic/stringparsing.h src/generic/stage2_streaming_build_tape.h src/generic/utf8_fastvalidate_algorithm.h src/generic/utf8_lookup_algorithm.h src/generic/utf8_lookup2_algorithm.h src/generic/utf8_range_algorithm.h src/generic/utf8_zwegner_algorithm.h
SRCHEADERS_GENERIC=src/generic/atomparsing.h src/generic/numberparsing.h src/generic/json_scanner.h src/generic/json_string_scanner.h src/generic/json_structural_indexer.h src/generic/json_minifier.h src/generic/buf_block_reader.h src/generic/stage2_build_tape.h src/generic/stringparsing.h src/generic/stage2_streaming_build_tape.h src/generic/utf8_fastvalidate_algorithm.h src/generic/utf8_lookup_algorithm.h src/generic/utf8_lookup2_algorithm.h src/generic/utf8_range_algorithm.h src/generic/utf8_zwegner_algorithm.h
SRCHEADERS_ARM64= src/arm64/bitmanipulation.h src/arm64/bitmask.h src/arm64/intrinsics.h src/arm64/numberparsing.h src/arm64/simd.h src/arm64/stage1_find_marks.h src/arm64/stage2_build_tape.h src/arm64/stringparsing.h
SRCHEADERS_HASWELL= src/haswell/bitmanipulation.h src/haswell/bitmask.h src/haswell/intrinsics.h src/haswell/numberparsing.h src/haswell/simd.h src/haswell/stage1_find_marks.h src/haswell/stage2_build_tape.h src/haswell/stringparsing.h
SRCHEADERS_FALLBACK= src/fallback/bitmanipulation.h src/fallback/implementation.h src/fallback/numberparsing.h src/fallback/stage1_find_marks.h src/fallback/stage2_build_tape.h src/fallback/stringparsing.h
@ -61,7 +61,7 @@ SRCHEADERS_WESTMERE=src/westmere/bitmanipulation.h src/westmere/bitmask.h src/we
SRCHEADERS_SRC=src/isadetection.h src/jsoncharutils.h src/simdprune_tables.h src/implementation.cpp src/stage1_find_marks.cpp src/stage2_build_tape.cpp src/document_parser_callbacks.h
SRCHEADERS=$(SRCHEADERS_SRC) $(SRCHEADERS_GENERIC) $(SRCHEADERS_ARM64) $(SRCHEADERS_HASWELL) $(SRCHEADERS_WESTMERE) $(SRCHEADERS_FALLBACK)
INCLUDEHEADERS=include/simdjson.h include/simdjson/common_defs.h include/simdjson/internal/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonminifier.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/inline/padded_string.h include/simdjson/document.h include/simdjson/inline/document.h include/simdjson/document_iterator.h include/simdjson/inline/document_iterator.h include/simdjson/document_stream.h include/simdjson/inline/document_stream.h include/simdjson/implementation.h include/simdjson/parsedjson.h include/simdjson/jsonstream.h include/simdjson/inline/jsonstream.h include/simdjson/portability.h include/simdjson/error.h include/simdjson/inline/error.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h
INCLUDEHEADERS=include/simdjson.h include/simdjson/common_defs.h include/simdjson/internal/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/inline/padded_string.h include/simdjson/document.h include/simdjson/inline/document.h include/simdjson/document_iterator.h include/simdjson/inline/document_iterator.h include/simdjson/document_stream.h include/simdjson/inline/document_stream.h include/simdjson/implementation.h include/simdjson/parsedjson.h include/simdjson/jsonstream.h include/simdjson/inline/jsonstream.h include/simdjson/portability.h include/simdjson/error.h include/simdjson/inline/error.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h
ifeq ($(SIMDJSON_TEST_AMALGAMATED_HEADERS),1)
HEADERS=singleheader/simdjson.h

View File

@ -1,16 +1,44 @@
# simdjson : Parsing gigabytes of JSON per second
[![Build Status](https://cloud.drone.io/api/badges/lemire/simdjson/status.svg)](https://cloud.drone.io/lemire/simdjson/)
[![CircleCI](https://circleci.com/gh/lemire/simdjson.svg?style=svg)](https://circleci.com/gh/lemire/simdjson)
[![Build Status](https://img.shields.io/appveyor/ci/lemire/simdjson/master.svg)](https://ci.appveyor.com/project/lemire/simdjson)
# simdjson : Parsing gigabytes of JSON per second
<img src="images/logo.png" width="10%" style="float: right">
JSON is everywhere on the Internet. Servers spend a *lot* of time parsing it. We need a fresh approach. simdjson uses commonly available SIMD instructions and microparallel algorithms to parse JSON 2.5x faster than anything else out there.
* **Ludicrous Speed:** Over 2.5x faster than other production-grade JSON parsers.
* **Delightfully Easy:** First-class, easy to use API.
* **Complete Validation:** Full JSON and UTF-8 validation, with no compromises.
* **Rock-Solid Reliability:** From memory allocation to error handling, simdjson's design avoids surprises.
This library is part of the [Awesome Modern C++](https://awesomecpp.com) list.
[![Build Status](https://cloud.drone.io/api/badges/simdjson/simdjson/status.svg)](https://cloud.drone.io/simdjson/simdjson)
[![CircleCI](https://circleci.com/gh/simdjson/simdjson.svg?style=svg)](https://circleci.com/gh/simdjson/simdjson)
[![Build status](https://ci.appveyor.com/api/projects/status/ae77wp5v3lebmu6n/branch/master?svg=true)](https://ci.appveyor.com/project/lemire/simdjson-jmmti/branch/master)
[![][license img]][license]
[![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/simdjson.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:simdjson)
## Quick Start
## A C++ library to see how fast we can parse JSON with complete validation.
simdjson is easily consumable with a single .h and .cpp file.
JSON documents are everywhere on the Internet. Servers spend a lot of time parsing these documents. We want to accelerate the parsing of JSON per se using commonly available SIMD instructions as much as possible while doing full validation (including character encoding). This library is part of the [Awesome Modern C++](https://awesomecpp.com) list.
0. Prerequisites: `g++` or `clang++`.
1. Pull [simdjson.h](singleheader/simdjson.h) and [simdjson.cpp](singleheader/simdjson.cpp) into a directory, along with the sample file [twitter.json](jsonexamples/twitter.json).
```
wget https://raw.githubusercontent.com/simdjson/simdjson/master/singleheader/simdjson.h https://raw.githubusercontent.com/simdjson/simdjson/master/singleheader/simdjson.cpp https://raw.githubusercontent.com/simdjson/simdjson/master/jsonexamples/twitter.json
```
2. Create `parser.cpp`:
<img src="images/logo.png" width="10%">
```c++
#include <iostream>
#include "simdjson.h"

int main(void) {
  simdjson::document::parser parser;
  simdjson::document& tweets = parser.load("twitter.json");
  std::cout << tweets["search_metadata"]["count"] << " results." << std::endl;
}
```
3. `g++ -std=c++17 -o parser parser.cpp simdjson.cpp` (or clang++)
4. `./parser`
```
100 results.
```
## Real-world usage
@ -110,7 +138,7 @@ be concerned with computed gotos.
## Thread safety
The simdjson library is mostly single-threaded. Thread safety is the responsability of the caller: it is unsafe to reuse a document::parser object between different threads.
The simdjson library is mostly single-threaded. Thread safety is the responsibility of the caller: it is unsafe to reuse a document::parser object between different threads.
If you are on an x64 processor, the runtime dispatching assigns the right code path the first time that parsing is attempted. The runtime dispatching is thread-safe.
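As an illustrative sketch (not code from this commit), the intended pattern is one parser per thread, each reusing its own buffers:
```c++
#include <string>
#include <thread>
#include <vector>
#include "simdjson.h"

// Hypothetical helper: give every thread its own document::parser, since a parser
// owns reusable buffers plus the document it produced and must not be shared.
void parse_all(const std::vector<std::string> &paths) {
  std::vector<std::thread> workers;
  for (const auto &path : paths) {
    workers.emplace_back([path]() {
      simdjson::document::parser parser;      // thread-local parser
      auto [doc, error] = parser.load(path);  // runtime dispatch itself is thread-safe
      if (error) { return; }
      // use doc here, before this parser goes out of scope
      (void)doc;
    });
  }
  for (auto &w : workers) { w.join(); }
}
```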
@ -136,23 +164,23 @@ All examples below use `#include "simdjson.h"`, `#include "simdjson.cpp"` an
The simplest API to get started is `document::parse()`, which allocates a new parser, parses a string, and returns the DOM. This is less efficient if you're going to read multiple documents, but as long as you're only parsing a single document, this will do just fine.
```c++
auto [doc, error] = document::parse(string("[ 1, 2, 3 ]"));
if (error) { cerr << "Error: " << error_message(error) << endl; exit(1); }
auto [doc, error] = document::parse("[ 1, 2, 3 ]"_padded);
if (error) { cerr << "Error: " << error << endl; exit(1); }
cout << doc;
```
If you're using exceptions, it gets even simpler (simdjson won't use exceptions internally, so you'll only pay the performance cost of exceptions in your own calling code):
```c++
document doc = document::parse(string("[ 1, 2, 3 ]"));
cout << doc;
cout << document::parse("[ 1, 2, 3 ]"_padded);
```
The simdjson library requires SIMDJSON_PADDING extra bytes at the end of a string (it doesn't matter if the bytes are initialized). The `padded_string` class is an easy way to ensure this is accomplished up front and prevent the extra allocation:
If you're wondering why the examples above use `_padded`, it's because the simdjson library requires SIMDJSON_PADDING extra bytes at the end of a string (it doesn't matter what those bytes are initialized to). `_padded`
is a user-defined literal that builds a `padded_string`, which guarantees that enough memory is allocated.
```c++
document doc = document::parse(padded_string(string("[ 1, 2, 3 ]")));
cout << doc;
padded_string json = "[ 1, 2, 3 ]"_padded;
cout << document::parse(json);
```
You can also load from a file with `parser.load()`:
@ -463,7 +491,7 @@ You then have access to the following methods on the resulting `simdjson::docume
* `bool move_to_key(const char *key, uint32_t length)`: as above except that the target can contain NULL characters
* `void move_to_value()`: when at a key location within an object, this moves to the accompanying value (located next to it). This is equivalent to, but much faster than, calling `next()`.
* `bool move_to_index(uint32_t index)`: when at `[`, go one level deep, and advance to the given index; if successful, we are left pointing at the value, if not, we are still pointing at the array
* `bool move_to(const char *pointer, uint32_t length)`: Moves the iterator to the value correspoding to the json pointer. Always search from the root of the document. If successful, we are left pointing at the value, if not, we are still pointing the same value we were pointing before the call. The json pointer follows the rfc6901 standard's syntax: https://tools.ietf.org/html/rfc6901
* `bool move_to(const char *pointer, uint32_t length)`: Moves the iterator to the value corresponding to the json pointer. It always searches from the root of the document. If successful, we are left pointing at the value; if not, we are still pointing at the same value we were pointing at before the call. The json pointer follows the rfc6901 standard's syntax: https://tools.ietf.org/html/rfc6901
* `bool move_to(const std::string &pointer) `: same as above but with a std::string parameter
* `bool next()`: Within a given scope (series of nodes at the same depth within either an array or an object), we move forward. Thus, given [true, null, {"a":1}, [1,2]], we would visit true, null, { and [. At the object ({) or at the array ([), you can issue a "down" to visit their content. valid if we're not at the end of a scope (returns true).
* `bool prev()`: Within a given scope (series of nodes at the same depth within either an
@ -567,7 +595,7 @@ make allparsingcompetition
```
Both the `parsingcompetition` and `allparsingcompetition` tools take a `-t` flag which produces
a table-oriented output that can be conventiently parsed by other tools.
a table-oriented output that can be conveniently parsed by other tools.
## Docker
@ -575,7 +603,7 @@ a table-oriented output that can be conventiently parsed by other tools.
One can run tests and benchmarks using docker. It especially makes sense under Linux. A privileged access may be needed to get performance counters.
```
git clone https://github.com/lemire/simdjson.git
git clone https://github.com/simdjson/simdjson.git
cd simdjson
docker build -t simdjson .
docker run --privileged -t simdjson

View File

@ -143,7 +143,7 @@ int main(int argc, char *argv[]) {
// parse_many
const char * filename2 = argv[2];
for (auto result : parser.load_many(filename2)) {
error = result.error;
error = result.error();
}
if (error) {
std::cout << "parse_many failed" << std::endl;

View File

@ -263,7 +263,7 @@ struct benchmarker {
: filename(_filename), collector(_collector), stats(NULL) {
verbose() << "[verbose] loading " << filename << endl;
simdjson::error_code error;
std::tie(this->json, error) = padded_string::load(filename);
padded_string::load(filename).tie(this->json, error);
if (error) {
exit_error(string("Could not load the file ") + filename);
}

View File

@ -98,16 +98,14 @@ int main(int argc, char *argv[]) {
"despacing with RapidJSON Insitu", rapid_stringme_insitu((char *)buffer),
memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data);
memcpy(buffer, p.data(), p.size());
size_t outlength = simdjson::json_minify((const uint8_t *)buffer, p.size(),
(uint8_t *)buffer);
if (verbose)
std::cout << "json_minify length is " << outlength << std::endl;
size_t outlength;
uint8_t *cbuffer = (uint8_t *)buffer;
BEST_TIME("json_minify", simdjson::json_minify(cbuffer, p.size(), cbuffer),
for (auto imple : simdjson::available_implementations) {
BEST_TIME((std::string("simdjson->minify+")+imple->name()).c_str(), (imple->minify(cbuffer, p.size(), cbuffer, outlength), outlength),
outlength, memcpy(buffer, p.data(), p.size()), repeat, volume,
!just_data);
}
printf("minisize = %zu, original size = %zu (minified down to %.2f percent "
"of original) \n",
outlength, p.size(), outlength * 100.0 / p.size());
@ -121,8 +119,9 @@ int main(int argc, char *argv[]) {
!just_data);
char *mini_buffer = simdjson::internal::allocate_padded_buffer(p.size() + 1);
size_t minisize = simdjson::json_minify((const uint8_t *)p.data(), p.size(),
(uint8_t *)mini_buffer);
size_t minisize;
simdjson::active_implementation->minify((const uint8_t *)p.data(), p.size(),
(uint8_t *)mini_buffer, minisize);
mini_buffer[minisize] = '\0';
BEST_TIME("RapidJSON Insitu despaced", d.ParseInsitu(buffer).HasParseError(),
@ -171,6 +170,7 @@ int main(int argc, char *argv[]) {
automated_reallocation),
simdjson::SUCCESS, memcpy(buffer, mini_buffer, p.size()), repeat, volume,
!just_data);
free(buffer);
free(ast_buffer);
free(mini_buffer);

View File

@ -109,7 +109,12 @@ struct option_struct {
case 'a': {
const implementation *impl = simdjson::available_implementations[optarg];
if (!impl) {
exit_usage(string("Unsupported option value -a ") + optarg + ": expected -a haswell, westmere or arm64");
std::string exit_message = string("Unsupported option value -a ") + optarg + ": expected -a with one of ";
for (auto imple : simdjson::available_implementations) {
exit_message += imple->name();
exit_message += " ";
}
exit_usage(exit_message);
}
simdjson::active_implementation = impl;
break;

View File

@ -84,12 +84,12 @@ Simple JSON nodes are represented with one tape element:
## Integer and Double values
Integer values are represented as two 64-bit tape elements:
- The 64-bit value `('l' << 56)` followed by the 64-bit integer value litterally. Integer values are assumed to be signed 64-bit values, using two's complement notation.
- The 64-bit value `('u' << 56)` followed by the 64-bit integer value litterally. Integer values are assumed to be unsigned 64-bit values.
- The 64-bit value `('l' << 56)` followed by the 64-bit integer value literally. Integer values are assumed to be signed 64-bit values, using two's complement notation.
- The 64-bit value `('u' << 56)` followed by the 64-bit integer value literally. Integer values are assumed to be unsigned 64-bit values.
Float values are represented as two 64-bit tape elements:
- The 64-bit value `('d' << 56)` followed by the 64-bit double value litterally in standard IEEE 754 notation.
- The 64-bit value `('d' << 56)` followed by the 64-bit double value literally in standard IEEE 754 notation.
Performance consideration: We store numbers on the main tape because we believe that locality of reference is helpful for performance.
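To make the layout concrete, here is a small illustrative decoder (not the library's API; it only mirrors the two-word number encoding described above):
```c++
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Illustrative only: walk a tape-like array and decode the two-word number entries.
// The type tag lives in the most significant byte; the value occupies the next word.
void dump_numbers(const std::vector<uint64_t> &tape) {
  for (size_t i = 0; i + 1 < tape.size(); i++) {
    char type = char(tape[i] >> 56);
    if (type == 'l') {            // signed 64-bit integer, two's complement
      int64_t value;
      std::memcpy(&value, &tape[i + 1], sizeof(value));
      std::cout << "int64: " << value << "\n";
      i++;
    } else if (type == 'u') {     // unsigned 64-bit integer
      std::cout << "uint64: " << tape[i + 1] << "\n";
      i++;
    } else if (type == 'd') {     // IEEE 754 double
      double value;
      std::memcpy(&value, &tape[i + 1], sizeof(value));
      std::cout << "double: " << value << "\n";
      i++;
    }
  }
}
```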

View File

@ -16,7 +16,6 @@ set(SIMDJSON_INCLUDE
${SIMDJSON_INCLUDE_DIR}/simdjson/inline/padded_string.h
${SIMDJSON_INCLUDE_DIR}/simdjson/internal/jsonformatutils.h
${SIMDJSON_INCLUDE_DIR}/simdjson/jsonioutil.h
${SIMDJSON_INCLUDE_DIR}/simdjson/jsonminifier.h
${SIMDJSON_INCLUDE_DIR}/simdjson/jsonparser.h
${SIMDJSON_INCLUDE_DIR}/simdjson/jsonstream.h
${SIMDJSON_INCLUDE_DIR}/simdjson/padded_string.h

View File

@ -10,7 +10,6 @@
#include "simdjson/implementation.h"
#include "simdjson/document.h"
#include "simdjson/document_stream.h"
#include "simdjson/jsonminifier.h"
// Deprecated API
#include "simdjson/parsedjsoniterator.h"

View File

@ -1522,6 +1522,14 @@ private:
//
size_t _max_depth;
//
// The loaded buffer (reused each time load() is called)
//
std::unique_ptr<char[], decltype(&aligned_free_char)> loaded_bytes;
// Capacity of loaded_bytes buffer.
size_t _loaded_bytes_capacity{0};
// all nodes are stored on the doc.tape using a 64-bit word.
//
// strings, double and ints are stored as
@ -1543,6 +1551,11 @@ private:
// and auto-allocate if not.
inline error_code ensure_capacity(size_t desired_capacity) noexcept;
//
// Read the file into loaded_bytes
//
inline simdjson_result<size_t> read_file(const std::string &path) noexcept;
#if SIMDJSON_EXCEPTIONS
// Used internally to get the document
inline const document &get_document() const noexcept(false);
@ -1555,7 +1568,7 @@ private:
/**
* Minifies a JSON element or document, printing the smallest possible valid JSON.
*
* document doc = document::parse(" [ 1 , 2 , 3 ] "_pad);
* document doc = document::parse(" [ 1 , 2 , 3 ] "_padded);
* cout << minify(doc) << endl; // prints [1,2,3]
*
*/

View File

@ -24,10 +24,10 @@ public:
inline bool is_ok() const;
// useful for debuging purposes
// useful for debugging purposes
inline size_t get_tape_location() const;
// useful for debuging purposes
// useful for debugging purposes
inline size_t get_tape_length() const;
// returns the current depth (start at 1 with 0 reserved for the fictitious
@ -165,7 +165,7 @@ public:
// if not, we are still pointing at the array ([)
inline bool move_to_index(uint32_t index);
// Moves the iterator to the value correspoding to the json pointer.
// Moves the iterator to the value corresponding to the json pointer.
// Always search from the root of the document.
// if successful, we are left pointing at the value,
// if not, we are still pointing at the same value we were pointing at before the
@ -177,7 +177,7 @@ public:
// jsonpointer string ('pointer').
bool move_to(const char *pointer, uint32_t length);
// Moves the iterator to the value correspoding to the json pointer.
// Moves the iterator to the value corresponding to the json pointer.
// Always search from the root of the document.
// if successful, we are left pointing at the value,
// if not, we are still pointing at the same value we were pointing at before the
@ -191,7 +191,7 @@ public:
}
private:
// Almost the same as move_to(), except it searchs from the current
// Almost the same as move_to(), except it searches from the current
// position. The pointer's syntax is identical, though that case is not
// handled by the rfc6901 standard. The '/' is still required at the
// beginning. However, contrary to move_to(), the URI Fragment Identifier

View File

@ -1,522 +0,0 @@
#ifndef SIMDJSON_DOCUMENT_PARSER_H
#define SIMDJSON_DOCUMENT_PARSER_H
#include "simdjson/document.h"
#include "simdjson/common_defs.h"
#include "simdjson/error.h"
#include "simdjson/padded_string.h"
#include <string>
namespace simdjson {
/**
* A persistent document parser.
*
* Use this if you intend to parse more than one document. It holds the internal memory necessary
* to do parsing, as well as memory for a single document that is overwritten on each parse.
*
* This class cannot be copied, only moved, to avoid unintended allocations.
*
* @note This is not thread safe: one parser cannot produce two documents at the same time!
*/
class document::parser {
public:
/**
* Create a JSON parser with zero capacity. Call allocate_capacity() to initialize it.
*/
parser()=default;
~parser()=default;
/**
* Take another parser's buffers and state.
*
* @param other The parser to take. Its capacity is zeroed.
*/
parser(document::parser &&other) = default;
parser(const document::parser &) = delete; // Disallow copying
/**
* Take another parser's buffers and state.
*
* @param other The parser to take. Its capacity is zeroed.
*/
parser &operator=(document::parser &&other) = default;
parser &operator=(const document::parser &) = delete; // Disallow copying
/**
* Parse a JSON document and return a reference to it.
*
* The JSON document still lives in the parser: this is the most efficient way to parse JSON
* documents because it reuses the same buffers, but you *must* use the document before you
* destroy the parser or call parse() again.
*
* The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
* those bytes are initialized to, as long as they are allocated. If realloc_if_needed is true,
* it is assumed that the buffer does *not* have enough padding, and it is reallocated, enlarged
* and copied before parsing.
*
* @param buf The JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes, unless
* realloc_if_needed is true.
* @param len The length of the JSON.
* @param realloc_if_needed Whether to reallocate and enlarge the JSON buffer to add padding.
* @return the document, or an error if the JSON is invalid.
*/
inline doc_ref_result parse(const uint8_t *buf, size_t len, bool realloc_if_needed = true) noexcept;
/**
* Parse a JSON document and return a reference to it.
*
* The JSON document still lives in the parser: this is the most efficient way to parse JSON
* documents because it reuses the same buffers, but you *must* use the document before you
* destroy the parser or call parse() again.
*
* The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
* those bytes are initialized to, as long as they are allocated. If realloc_if_needed is true,
* it is assumed that the buffer does *not* have enough padding, and it is reallocated, enlarged
* and copied before parsing.
*
* @param buf The JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes, unless
* realloc_if_needed is true.
* @param len The length of the JSON.
* @param realloc_if_needed Whether to reallocate and enlarge the JSON buffer to add padding.
* @return the document, or an error if the JSON is invalid.
*/
really_inline doc_ref_result parse(const char *buf, size_t len, bool realloc_if_needed = true) noexcept;
/**
* Parse a JSON document and return a reference to it.
*
* The JSON document still lives in the parser: this is the most efficient way to parse JSON
* documents because it reuses the same buffers, but you *must* use the document before you
* destroy the parser or call parse() again.
*
* The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
* those bytes are initialized to, as long as they are allocated. If `str.capacity() - str.size()
* < SIMDJSON_PADDING`, the string will be copied to a string with larger capacity before parsing.
*
* @param s The JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes, or
* a new string will be created with the extra padding.
* @return the document, or an error if the JSON is invalid.
*/
really_inline doc_ref_result parse(const std::string &s) noexcept;
/**
* Parse a JSON document and return a reference to it.
*
* The JSON document still lives in the parser: this is the most efficient way to parse JSON
* documents because it reuses the same buffers, but you *must* use the document before you
* destroy the parser or call parse() again.
*
* @param s The JSON to parse.
* @return the document, or an error if the JSON is invalid.
*/
really_inline doc_ref_result parse(const padded_string &s) noexcept;
// We do not want to allow implicit conversion from C string to std::string.
really_inline doc_ref_result parse(const char *buf) noexcept = delete;
/**
* Parse a buffer containing many JSON documents.
*
* document::parser parser;
* for (const document &doc : parser.parse_many(buf, len)) {
* cout << std::string(doc["title"]) << endl;
* }
*
* ### Format
*
* The buffer must contain a series of one or more JSON documents, concatenated into a single
* buffer, separated by whitespace. It effectively parses until it has a fully valid document,
* then starts parsing the next document at that point. (It does this with more parallelism and
* lookahead than you might think, though.)
*
* documents that consist of an object or array may omit the whitespace between them, concatenating
* with no separator. documents that consist of a single primitive (i.e. documents that are not
* arrays or objects) MUST be separated with whitespace.
*
* ### Error Handling
*
* All errors are returned during iteration: if there is a global error such as memory allocation,
* it will be yielded as the first result. Iteration always stops after the first error.
*
* As with all other simdjson methods, non-exception error handling is readily available through
* the same interface, requiring you to check the error before using the document:
*
* document::parser parser;
* for (auto [doc, error] : parser.parse_many(buf, len)) {
* if (error) { cerr << error << endl; exit(1); }
* cout << std::string(doc["title"]) << endl;
* }
*
* ### REQUIRED: Buffer Padding
*
* The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
* those bytes are initialized to, as long as they are allocated.
*
* ### Threads
*
* When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the
* hood to do some lookahead.
*
* ### Parser Capacity
*
* If the parser is unallocated, it will be auto-allocated to batch_size. If it is already
* allocated, it must have a capacity at least as large as batch_size.
*
* @param buf The concatenated JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes.
* @param len The length of the concatenated JSON.
* @param batch_size The batch size to use. MUST be larger than the largest document. The sweet
* spot is cache-related: small enough to fit in cache, yet big enough to
* parse as many documents as possible in one tight loop.
* Defaults to 10MB, which has been a reasonable sweet spot in our tests.
* @return The stream. If there is an error, it will be returned during iteration. An empty input
* will yield 0 documents rather than an EMPTY error. Errors:
* - MEMALLOC if the parser is unallocated and memory allocation fails
* - CAPACITY if the parser already has a capacity, and it is less than batch_size
* - other json errors if parsing fails.
*/
inline stream parse_many(const uint8_t *buf, size_t len, size_t batch_size = 1000000) noexcept;
/**
* Parse a buffer containing many JSON documents.
*
* document::parser parser;
* for (const document &doc : parser.parse_many(buf, len)) {
* cout << std::string(doc["title"]) << endl;
* }
*
* ### Format
*
* The buffer must contain a series of one or more JSON documents, concatenated into a single
* buffer, separated by whitespace. It effectively parses until it has a fully valid document,
* then starts parsing the next document at that point. (It does this with more parallelism and
* lookahead than you might think, though.)
*
* documents that consist of an object or array may omit the whitespace between them, concatenating
* with no separator. documents that consist of a single primitive (i.e. documents that are not
* arrays or objects) MUST be separated with whitespace.
*
* ### Error Handling
*
* All errors are returned during iteration: if there is a global error such as memory allocation,
* it will be yielded as the first result. Iteration always stops after the first error.
*
* As with all other simdjson methods, non-exception error handling is readily available through
* the same interface, requiring you to check the error before using the document:
*
* document::parser parser;
* for (auto [doc, error] : parser.parse_many(buf, len)) {
* if (error) { cerr << error << endl; exit(1); }
* cout << std::string(doc["title"]) << endl;
* }
*
* ### REQUIRED: Buffer Padding
*
* The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
* those bytes are initialized to, as long as they are allocated.
*
* ### Threads
*
* When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the
* hood to do some lookahead.
*
* ### Parser Capacity
*
* If the parser is unallocated, it will be auto-allocated to batch_size. If it is already
* allocated, it must have a capacity at least as large as batch_size.
*
* @param buf The concatenated JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes.
* @param len The length of the concatenated JSON.
* @param batch_size The batch size to use. MUST be larger than the largest document. The sweet
* spot is cache-related: small enough to fit in cache, yet big enough to
* parse as many documents as possible in one tight loop.
* Defaults to 10MB, which has been a reasonable sweet spot in our tests.
* @return The stream. If there is an error, it will be returned during iteration. An empty input
* will yield 0 documents rather than an EMPTY error. Errors:
* - MEMALLOC if the parser is unallocated and memory allocation fails
* - CAPACITY if the parser already has a capacity, and it is less than batch_size
* - other json errors if parsing fails
*/
inline stream parse_many(const char *buf, size_t len, size_t batch_size = 1000000) noexcept;
/**
* Parse a buffer containing many JSON documents.
*
* document::parser parser;
* for (const document &doc : parser.parse_many(buf, len)) {
* cout << std::string(doc["title"]) << endl;
* }
*
* ### Format
*
* The buffer must contain a series of one or more JSON documents, concatenated into a single
* buffer, separated by whitespace. It effectively parses until it has a fully valid document,
* then starts parsing the next document at that point. (It does this with more parallelism and
* lookahead than you might think, though.)
*
* documents that consist of an object or array may omit the whitespace between them, concatenating
* with no separator. documents that consist of a single primitive (i.e. documents that are not
* arrays or objects) MUST be separated with whitespace.
*
* ### Error Handling
*
* All errors are returned during iteration: if there is a global error such as memory allocation,
* it will be yielded as the first result. Iteration always stops after the first error.
*
* As with all other simdjson methods, non-exception error handling is readily available through
* the same interface, requiring you to check the error before using the document:
*
* document::parser parser;
* for (auto [doc, error] : parser.parse_many(buf, len)) {
* if (error) { cerr << error << endl; exit(1); }
* cout << std::string(doc["title"]) << endl;
* }
*
* ### REQUIRED: Buffer Padding
*
* The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
* those bytes are initialized to, as long as they are allocated.
*
* ### Threads
*
* When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the
* hood to do some lookahead.
*
* ### Parser Capacity
*
* If the parser is unallocated, it will be auto-allocated to batch_size. If it is already
* allocated, it must have a capacity at least as large as batch_size.
*
* @param s The concatenated JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes.
* @param batch_size The batch size to use. MUST be larger than the largest document. The sweet
* spot is cache-related: small enough to fit in cache, yet big enough to
* parse as many documents as possible in one tight loop.
* Defaults to 10MB, which has been a reasonable sweet spot in our tests.
* @return the stream. If there is an error, it will be returned during iteration. An empty input
* will yield 0 documents rather than an EMPTY error. Errors:
* - MEMALLOC if the parser is unallocated and memory allocation fails
* - CAPACITY if the parser already has a capacity, and it is less than batch_size
* - other json errors if parsing fails
*/
inline stream parse_many(const std::string &s, size_t batch_size = 1000000) noexcept;
/**
* Parse a buffer containing many JSON documents.
*
* document::parser parser;
* for (const document &doc : parser.parse_many(buf, len)) {
* cout << std::string(doc["title"]) << endl;
* }
*
* ### Format
*
* The buffer must contain a series of one or more JSON documents, concatenated into a single
* buffer, separated by whitespace. It effectively parses until it has a fully valid document,
* then starts parsing the next document at that point. (It does this with more parallelism and
* lookahead than you might think, though.)
*
* documents that consist of an object or array may omit the whitespace between them, concatenating
* with no separator. documents that consist of a single primitive (i.e. documents that are not
* arrays or objects) MUST be separated with whitespace.
*
* ### Error Handling
*
* All errors are returned during iteration: if there is a global error such as memory allocation,
* it will be yielded as the first result. Iteration always stops after the first error.
*
* As with all other simdjson methods, non-exception error handling is readily available through
* the same interface, requiring you to check the error before using the document:
*
* document::parser parser;
* for (auto [doc, error] : parser.parse_many(buf, len)) {
* if (error) { cerr << error << endl; exit(1); }
* cout << std::string(doc["title"]) << endl;
* }
*
* ### REQUIRED: Buffer Padding
*
* The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
* those bytes are initialized to, as long as they are allocated.
*
* ### Threads
*
* When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the
* hood to do some lookahead.
*
* ### Parser Capacity
*
* If the parser is unallocated, it will be auto-allocated to batch_size. If it is already
* allocated, it must have a capacity at least as large as batch_size.
*
* @param s The concatenated JSON to parse.
* @param batch_size The batch size to use. MUST be larger than the largest document. The sweet
* spot is cache-related: small enough to fit in cache, yet big enough to
* parse as many documents as possible in one tight loop.
* Defaults to 10MB, which has been a reasonable sweet spot in our tests.
* @return the stream. If there is an error, it will be returned during iteration. An empty input
* will yield 0 documents rather than an EMPTY error. Errors:
* - MEMALLOC if the parser is unallocated and memory allocation fails
* - CAPACITY if the parser already has a capacity, and it is less than batch_size
* - other json errors if parsing fails
*/
inline stream parse_many(const padded_string &s, size_t batch_size = 1000000) noexcept;
// We do not want to allow implicit conversion from C string to std::string.
really_inline doc_ref_result parse_many(const char *buf, size_t batch_size = 1000000) noexcept = delete;
/**
* Current capacity: the largest document this parser can support without reallocating.
*/
really_inline size_t capacity() const noexcept;
/**
* The maximum level of nested object and arrays supported by this parser.
*/
really_inline size_t max_depth() const noexcept;
/**
* Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length
* and `max_depth` depth.
*/
WARN_UNUSED inline bool allocate_capacity(size_t capacity, size_t max_depth = DEFAULT_MAX_DEPTH);
// type aliases for backcompat
using Iterator = document::iterator;
using InvalidJSON = simdjson_error;
// Next location to write to in the tape
uint32_t current_loc{0};
// structural indices passed from stage 1 to stage 2
uint32_t n_structural_indexes{0};
std::unique_ptr<uint32_t[]> structural_indexes;
// location and return address of each open { or [
std::unique_ptr<uint32_t[]> containing_scope_offset;
#ifdef SIMDJSON_USE_COMPUTED_GOTO
std::unique_ptr<void*[]> ret_address;
#else
std::unique_ptr<char[]> ret_address;
#endif
// Next place to write a string
uint8_t *current_string_buf_loc;
bool valid{false};
error_code error{UNINITIALIZED};
// Document we're writing to
document doc;
//
// TODO these are deprecated; use the results of parse instead.
//
// returns true if the document parsed was valid
inline bool is_valid() const noexcept;
// return an error code corresponding to the last parsing attempt (see
// simdjson.h); will return UNINITIALIZED if no parsing was attempted
inline int get_error_code() const noexcept;
// return the string equivalent of "get_error_code"
inline std::string get_error_message() const noexcept;
// print the json to std::ostream (should be valid)
// return false if the tape is likely wrong (e.g., you did not parse a valid
// JSON).
/** @deprecated Use cout << parser.parse() */
inline bool print_json(std::ostream &os) const noexcept;
inline bool dump_raw_tape(std::ostream &os) const noexcept;
//
// Parser callbacks: these are internal!
//
// TODO find a way to do this without exposing the interface or crippling performance
//
// this should be called when parsing (right before writing the tapes)
inline void init_stage2() noexcept;
really_inline error_code on_error(error_code new_error_code) noexcept;
really_inline error_code on_success(error_code success_code) noexcept;
really_inline bool on_start_document(uint32_t depth) noexcept;
really_inline bool on_start_object(uint32_t depth) noexcept;
really_inline bool on_start_array(uint32_t depth) noexcept;
// TODO we're not checking this bool
really_inline bool on_end_document(uint32_t depth) noexcept;
really_inline bool on_end_object(uint32_t depth) noexcept;
really_inline bool on_end_array(uint32_t depth) noexcept;
really_inline bool on_true_atom() noexcept;
really_inline bool on_false_atom() noexcept;
really_inline bool on_null_atom() noexcept;
really_inline uint8_t *on_start_string() noexcept;
really_inline bool on_end_string(uint8_t *dst) noexcept;
really_inline bool on_number_s64(int64_t value) noexcept;
really_inline bool on_number_u64(uint64_t value) noexcept;
really_inline bool on_number_double(double value) noexcept;
//
// Called before a parse is initiated.
//
// - Returns CAPACITY if the document is too large
// - Returns MEMALLOC if we needed to allocate memory and could not
//
WARN_UNUSED inline error_code init_parse(size_t len) noexcept;
private:
//
// The maximum document length this parser supports.
//
// Buffers are large enough to handle any document up to this length.
//
size_t _capacity{0};
//
// The maximum depth (number of nested objects and arrays) supported by this parser.
//
// Defaults to DEFAULT_MAX_DEPTH.
//
size_t _max_depth{0};
// all nodes are stored on the doc.tape using a 64-bit word.
//
// strings, double and ints are stored as
// a 64-bit word with a pointer to the actual value
//
//
//
// for objects or arrays, store [ or { at the beginning and } and ] at the
// end. For the openings ([ or {), we annotate them with a reference to the
// location on the doc.tape of the end, and for the closings (} and ]), we
// annotate them with a reference to the location of the opening
//
//
inline void write_tape(uint64_t val, tape_type t) noexcept;
inline void annotate_previous_loc(uint32_t saved_loc, uint64_t val) noexcept;
//
// Set the current capacity: the largest document this parser can support without reallocating.
//
// This will allocate *or deallocate* as necessary.
//
// Returns false if allocation fails.
//
inline WARN_UNUSED bool set_capacity(size_t capacity);
//
// Set the maximum level of nested object and arrays supported by this parser.
//
// This will allocate *or deallocate* as necessary.
//
// Returns false if allocation fails.
//
inline WARN_UNUSED bool set_max_depth(size_t max_depth);
// Used internally to get the document
inline const document &get_document() const noexcept(false);
template<size_t max_depth> friend class document_iterator;
}; // class parser
} // namespace simdjson
#endif // SIMDJSON_DOCUMENT_PARSER_H

View File

@ -77,6 +77,17 @@ private:
*/
template<typename T>
struct simdjson_result : public std::pair<T, error_code> {
/**
* Move the value and the error to the provided variables.
*/
void tie(T& t, error_code & e) {
// on the clang compiler that comes with current macOS (Apple clang version 11.0.0),
// tie(width, error) = size["w"].as_uint64_t();
// fails with "error: no viable overloaded '='""
t = std::move(this->first);
e = std::move(this->second);
}
/**
* The error.
*/
@ -128,6 +139,17 @@ struct simdjson_result : public std::pair<T, error_code> {
*/
template<typename T>
struct simdjson_move_result : std::pair<T, error_code> {
/**
* Move the value and the error to the provided variables.
*/
void tie(T& t, error_code & e) {
// on the clang compiler that comes with current macOS (Apple clang version 11.0.0),
// std::tie(this->json, error) = padded_string::load(filename);
// fails with "benchmark/benchmarker.h:266:33: error: no viable overloaded '='""
t = std::move(this->first);
e = std::move(this->second);
}
/**
* The error.
*/
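A minimal standalone sketch of the new `tie()` helper in use, mirroring the benchmarker change earlier in this commit (the file name is only an example):
```c++
#include <iostream>
#include "simdjson.h"

int main() {
  simdjson::padded_string json;
  simdjson::error_code error = simdjson::UNINITIALIZED;
  // tie() moves the (value, error) pair into existing variables, sidestepping the
  // "no viable overloaded '='" issue that std::tie hits on Apple clang.
  simdjson::padded_string::load("twitter.json").tie(json, error);
  if (error) { std::cerr << "load failed: " << error << std::endl; return 1; }
  std::cout << "loaded " << json.size() << " bytes" << std::endl;
  return 0;
}
```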

View File

@ -56,6 +56,19 @@ public:
*/
WARN_UNUSED virtual error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept = 0;
/**
* Minify the input JSON, writing the smallest possible valid JSON to dst.
*
* Overridden by each implementation.
*
* @param buf the json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes.
* @param len the length of the json document.
* @param dst the buffer to write the minified document to. *MUST* be allocated up to len + SIMDJSON_PADDING bytes.
* @param dst_len the number of bytes written. Output only.
* @return the error code, or SUCCESS if there was no error.
*/
WARN_UNUSED virtual error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept = 0;
/**
* Stage 1 of the document parser.
*
@ -182,6 +195,9 @@ public:
WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final {
return set_best()->parse(buf, len, parser);
}
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final {
return set_best()->minify(buf, len, dst, dst_len);
}
WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept final {
return set_best()->stage1(buf, len, parser, streaming);
}
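A hedged sketch of calling the new virtual `minify` through the runtime-dispatched `active_implementation`, mirroring the benchmark change earlier in this commit (the sample JSON and buffer sizing are illustrative):
```c++
#include <iostream>
#include <string>
#include "simdjson.h"

using namespace simdjson;

int main() {
  // The input must have SIMDJSON_PADDING readable bytes past the end; padded_string provides that.
  padded_string json = "{ \"key\" : [ 1, 2, 3 ] }"_padded;
  // The destination is likewise expected to have len + SIMDJSON_PADDING bytes available.
  std::string out(json.size() + SIMDJSON_PADDING, '\0');
  size_t out_len = 0;
  auto error = active_implementation->minify(
      (const uint8_t *)json.data(), json.size(), (uint8_t *)out.data(), out_len);
  if (error) { std::cerr << error << std::endl; return 1; }
  std::cout << out.substr(0, out_len) << std::endl;  // prints {"key":[1,2,3]}
  return 0;
}
```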

View File

@ -9,6 +9,7 @@
#include "simdjson/padded_string.h"
#include "simdjson/internal/jsonformatutils.h"
#include <iostream>
#include <climits>
namespace simdjson {
@ -361,7 +362,7 @@ inline document::element_result document::doc_move_result::operator[](const char
// document::parser inline implementation
//
really_inline document::parser::parser(size_t max_capacity, size_t max_depth) noexcept
: _max_capacity{max_capacity}, _max_depth{max_depth} {
: _max_capacity{max_capacity}, _max_depth{max_depth}, loaded_bytes(nullptr, &aligned_free_char) {
}
inline bool document::parser::is_valid() const noexcept { return valid; }
@ -387,15 +388,54 @@ inline const document &document::parser::get_document() const noexcept(false) {
#endif // SIMDJSON_EXCEPTIONS
inline simdjson_result<size_t> document::parser::read_file(const std::string &path) noexcept {
// Open the file
std::FILE *fp = std::fopen(path.c_str(), "rb");
if (fp == nullptr) {
return IO_ERROR;
}
// Get the file size
if(std::fseek(fp, 0, SEEK_END) < 0) {
std::fclose(fp);
return IO_ERROR;
}
long len = std::ftell(fp);
if((len < 0) || (len == LONG_MAX)) {
std::fclose(fp);
return IO_ERROR;
}
// Make sure we have enough capacity to load the file
if (_loaded_bytes_capacity < size_t(len)) {
loaded_bytes.reset( internal::allocate_padded_buffer(len) );
if (!loaded_bytes) {
std::fclose(fp);
return MEMALLOC;
}
_loaded_bytes_capacity = len;
}
// Read the string
std::rewind(fp);
size_t bytes_read = std::fread(loaded_bytes.get(), 1, len, fp);
if (std::fclose(fp) != 0 || bytes_read != size_t(len)) {
return IO_ERROR;
}
return bytes_read;
}
inline document::doc_result document::parser::load(const std::string &path) noexcept {
auto [json, _error] = padded_string::load(path);
if (_error) { return doc_result(doc, _error); }
return parse(json);
auto [len, code] = read_file(path);
if (code) { return doc_result(doc, code); }
return parse(loaded_bytes.get(), len, false);
}
inline document::stream document::parser::load_many(const std::string &path, size_t batch_size) noexcept {
auto [json, _error] = padded_string::load(path);
return stream(*this, reinterpret_cast<const uint8_t*>(json.data()), json.length(), batch_size, _error);
auto [len, code] = read_file(path);
return stream(*this, (const uint8_t*)loaded_bytes.get(), len, batch_size, code);
}
inline document::doc_result document::parser::parse(const uint8_t *buf, size_t len, bool realloc_if_needed) noexcept {
@ -480,7 +520,7 @@ inline error_code document::parser::set_capacity(size_t capacity) noexcept {
// Initialize stage 1 output
//
uint32_t max_structures = ROUNDUP_N(capacity, 64) + 2 + 7;
structural_indexes.reset( new (std::nothrow) uint32_t[max_structures]); // TODO realloc
structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] ); // TODO realloc
if (!structural_indexes) {
return MEMALLOC;
}

View File

@ -12,13 +12,13 @@ WARN_UNUSED bool document_iterator<max_depth>::is_ok() const {
return location < tape_length;
}
// useful for debuging purposes
// useful for debugging purposes
template <size_t max_depth>
size_t document_iterator<max_depth>::get_tape_location() const {
return location;
}
// useful for debuging purposes
// useful for debugging purposes
template <size_t max_depth>
size_t document_iterator<max_depth>::get_tape_length() const {
return tape_length;

View File

@ -1,32 +0,0 @@
#ifndef SIMDJSON_JSONMINIFIER_H
#define SIMDJSON_JSONMINIFIER_H
#include "simdjson/padded_string.h"
#include <cstddef>
#include <cstdint>
#include <string_view>
namespace simdjson {
// Take input from buf and remove useless whitespace, write it to out; buf and
// out can be the same pointer. Result is null terminated,
// return the string length (minus the null termination).
// The accelerated version of this function only runs on AVX2 hardware.
size_t json_minify(const uint8_t *buf, size_t len, uint8_t *out);
static inline size_t json_minify(const char *buf, size_t len, char *out) {
return json_minify(reinterpret_cast<const uint8_t *>(buf), len,
reinterpret_cast<uint8_t *>(out));
}
static inline size_t json_minify(const std::string_view &p, char *out) {
return json_minify(p.data(), p.size(), out);
}
static inline size_t json_minify(const padded_string &p, char *out) {
return json_minify(p.data(), p.size(), out);
}
} // namespace simdjson
#endif // SIMDJSON_JSONMINIFIER_H

View File

@ -35,7 +35,7 @@ namespace simdjson {
* to a char* and to the number of bytes respectively.
* The simdjson parser may read up to SIMDJSON_PADDING bytes beyond the end
* of the string, so if you do not use a padded_string container,
* you have the responsability to overallocate. If you fail to
* you have the responsibility to overallocate. If you fail to
* do so, your software may crash if you cross a page boundary,
* and you should expect memory checkers to object.
* Most users should use a simdjson::padded_string.

View File

@ -105,6 +105,10 @@ private:
}; // padded_string
inline padded_string operator "" _padded(const char *str, size_t len) {
return padded_string(str, len);
}
} // namespace simdjson
namespace simdjson::internal {

View File

@ -1,4 +1,4 @@
/* auto-generated on Thu Mar 5 10:30:07 PST 2020. Do not edit! */
/* auto-generated on Fri Mar 20 11:47:31 PDT 2020. Do not edit! */
#include <iostream>
#include "simdjson.h"
@ -8,37 +8,31 @@ int main(int argc, char *argv[]) {
std::cerr << "Please specify at least one file name. " << std::endl;
}
const char * filename = argv[1];
simdjson::padded_string p = simdjson::get_corpus(filename);
auto [doc, error] = simdjson::document::parse(p); // do the parsing
simdjson::document::parser parser;
auto [doc, error] = parser.load(filename); // do the parsing
if (error) {
std::cout << "document::parse failed" << std::endl;
std::cout << "parse failed" << std::endl;
std::cout << "error code: " << error << std::endl;
std::cout << error << std::endl;
} else {
std::cout << "document::parse valid" << std::endl;
std::cout << "parse valid" << std::endl;
}
if(argc == 2) {
return EXIT_SUCCESS;
}
//JsonStream
// parse_many
const char * filename2 = argv[2];
simdjson::padded_string p2 = simdjson::get_corpus(filename2);
simdjson::document::parser parser;
simdjson::JsonStream js{p2};
int parse_res = simdjson::SUCCESS_AND_HAS_MORE;
while (parse_res == simdjson::SUCCESS_AND_HAS_MORE) {
parse_res = js.json_parse(parser);
for (auto result : parser.load_many(filename2)) {
error = result.error();
}
if( ! parser.is_valid()) {
std::cout << "JsonStream not valid" << std::endl;
if (error) {
std::cout << "parse_many failed" << std::endl;
std::cout << "error code: " << error << std::endl;
std::cout << error << std::endl;
} else {
std::cout << "JsonStream valid" << std::endl;
std::cout << "parse_many valid" << std::endl;
}
return EXIT_SUCCESS;
}

File diff suppressed because it is too large.

File diff suppressed because it is too large.

View File

@ -29,7 +29,6 @@ set(SIMDJSON_SRC
set(SIMDJSON_SRC_HEADERS
implementation.cpp
isadetection.h
jsonminifier.cpp
simdprune_tables.h
stage1_find_marks.cpp
stage2_build_tape.cpp
@ -46,8 +45,10 @@ set(SIMDJSON_SRC_HEADERS
fallback/stage1_find_marks.h
fallback/stage2_build_tape.h
generic/atomparsing.h
generic/json_scanner.h
generic/json_string_scanner.h
generic/json_structural_indexer.h
generic/numberparsing.h
generic/stage1_find_marks.h
generic/stage2_build_tape.h
generic/stage2_streaming_build_tape.h
generic/stringparsing.h

View File

@ -48,7 +48,7 @@ really_inline int leading_zeroes(uint64_t input_num) {
}
/* result might be undefined when input_num is zero */
really_inline int hamming(uint64_t input_num) {
really_inline int count_ones(uint64_t input_num) {
return vaddv_u8(vcnt_u8((uint8x8_t)input_num));
}

View File

@ -10,6 +10,7 @@ class implementation final : public simdjson::implementation {
public:
really_inline implementation() : simdjson::implementation("arm64", "ARM NEON", instruction_set::NEON) {}
WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept final;
WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser, size_t &next_json) const noexcept final;

View File

@ -2,6 +2,8 @@
#define SIMDJSON_ARM64_SIMD_H
#include "simdjson.h"
#include "simdprune_tables.h"
#include "arm64/bitmanipulation.h"
#include "arm64/intrinsics.h"
namespace simdjson::arm64::simd {
@ -142,6 +144,43 @@ namespace simdjson::arm64::simd {
really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
return lookup_table.apply_lookup_16_to(*this);
}
// Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset).
// Passing a 0 value for mask would be equivalent to writing out every byte to output.
// Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes
// get written.
// Design consideration: it seems like a function with the
// signature simd8<L> compress(uint16_t mask) would be
// sensible, but the AVX ISA makes this kind of approach difficult.
template<typename L>
really_inline void compress(uint16_t mask, L * output) const {
// this particular implementation was inspired by work done by @animetosho
// we do it in two steps, first 8 bytes and then second 8 bytes
uint8_t mask1 = static_cast<uint8_t>(mask); // least significant 8 bits
uint8_t mask2 = static_cast<uint8_t>(mask >> 8); // most significant 8 bits
// next line just loads the 64-bit values thintable_epi8[mask1] and
// thintable_epi8[mask2] into a 128-bit register, using only
// two instructions on most compilers.
uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]};
uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64);
// we increment by 0x08 the second half of the mask
uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
shufmask = vaddq_u8(shufmask, inc);
// this is the version "nearly pruned"
uint8x16_t pruned = vqtbl1q_u8(*this, shufmask);
// we still need to put the two halves together.
// we compute the popcount of the first half:
int pop1 = BitsSetTable256mul2[mask1];
// then load the corresponding mask, what it does is to write
// only the first pop1 bytes from the first 8 bytes, and then
// it fills in with the bytes from the second 8 bytes + some filling
// at the end.
uint8x16_t compactmask = vld1q_u8((const uint8_t *)(pshufb_combine_table + pop1 * 8));
uint8x16_t answer = vqtbl1q_u8(pruned, compactmask);
vst1q_u8((uint8_t*) output, answer);
}
template<typename L>
really_inline simd8<L> lookup_16(
L replace0, L replace1, L replace2, L replace3,
@ -267,6 +306,13 @@ namespace simdjson::arm64::simd {
this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
}
really_inline void compress(uint64_t mask, T * output) const {
this->chunks[0].compress(mask, output);
this->chunks[1].compress(mask >> 16, output + 16 - count_ones(mask & 0xFFFF));
this->chunks[2].compress(mask >> 32, output + 32 - count_ones(mask & 0xFFFFFFFF));
this->chunks[3].compress(mask >> 48, output + 48 - count_ones(mask & 0xFFFFFFFFFFFF));
}
template <typename F>
static really_inline void each_index(F const& each) {
each(0);
@ -339,7 +385,6 @@ namespace simdjson::arm64::simd {
const simd8<T> mask = simd8<T>::splat(m);
return this->map( [&](auto a) { return a <= mask; } ).to_bitmask();
}
}; // struct simd8x64<T>
} // namespace simdjson::arm64::simd
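For intuition, here is a scalar reference for the `compress` semantics used above (keep the bytes whose mask bit is 0, packed to the front); this is purely illustrative and not the NEON code path:
```c++
#include <cstdint>

// Scalar sketch of compress(): copy to `output` every byte whose bit in `mask` is 0,
// packed contiguously. The SIMD version writes a full 16 bytes, but only the first
// 16 - count_ones(mask) of them are significant.
static inline int compress_scalar(const uint8_t in[16], uint16_t mask, uint8_t *output) {
  int written = 0;
  for (int i = 0; i < 16; i++) {
    if (((mask >> i) & 1) == 0) {   // a 0 bit means "keep this byte"
      output[written++] = in[i];
    }
  }
  return written;                    // equals 16 - count_ones(mask)
}
```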

View File

@ -11,10 +11,18 @@ namespace simdjson::arm64 {
using namespace simd;
really_inline void find_whitespace_and_operators(
const simd::simd8x64<uint8_t> in,
uint64_t &whitespace, uint64_t &op) {
struct json_character_block {
static really_inline json_character_block classify(const simd::simd8x64<uint8_t> in);
really_inline uint64_t whitespace() const { return _whitespace; }
really_inline uint64_t op() const { return _op; }
really_inline uint64_t scalar() { return ~(op() | whitespace()); }
uint64_t _whitespace;
uint64_t _op;
};
really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t> in) {
auto v = in.map<uint8_t>([&](simd8<uint8_t> chunk) {
auto nib_lo = chunk & 0xf;
auto nib_hi = chunk.shr<4>();
@ -23,8 +31,26 @@ really_inline void find_whitespace_and_operators(
return shuf_lo & shuf_hi;
});
op = v.map([&](simd8<uint8_t> _v) { return _v.any_bits_set(0x7); }).to_bitmask();
whitespace = v.map([&](simd8<uint8_t> _v) { return _v.any_bits_set(0x18); }).to_bitmask();
// We compute whitespace and op separately. If the code later only uses one or the
// other, given the fact that all functions are aggressively inlined, we can
// hope that useless computations will be omitted. This is namely the case when
// minifying (we only need whitespace). *However* if we only need spaces,
// it is likely that we will still compute 'v' above with two lookup_16: one
// could do it a bit cheaper. This is in contrast with the x64 implementations
// where we can, efficiently, do the white space and structural matching
// separately. One reason for this difference is that on ARM NEON, the table
// lookups either zero or leave unchanged the characters exceeding 0xF whereas
// on x64, the equivalent instruction (pshufb) automatically applies a mask,
// ignoring the 4 most significant bits. Thus the x64 implementation is
// optimized differently. This being said, if you use this code strictly
// just for minification (or just to identify the structural characters),
// there is a small untaken optimization opportunity here. We deliberately
// do not pick it up.
uint64_t op = v.map([&](simd8<uint8_t> _v) { return _v.any_bits_set(0x7); }).to_bitmask();
uint64_t whitespace = v.map([&](simd8<uint8_t> _v) { return _v.any_bits_set(0x18); }).to_bitmask();
return { whitespace, op };
}
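As a cross-check of what the nibble lookups above are meant to compute, here is a hedged scalar model of the classification: the whitespace set is the four JSON whitespace characters and the op set is the six structural characters; scalar() is then everything else. The helper name classify_scalar is hypothetical, for illustration only.
// Hedged scalar model of json_character_block::classify for one 64-byte block:
// bit i of 'whitespace' / 'op' is set when byte i is JSON whitespace / an operator.
#include <cstdint>
#include <cstring>
static void classify_scalar(const uint8_t in[64], uint64_t &whitespace, uint64_t &op) {
  whitespace = op = 0;
  for (int i = 0; i < 64; i++) {
    if (in[i] == ' ' || in[i] == '\t' || in[i] == '\n' || in[i] == '\r') { whitespace |= 1ULL << i; }
    if (std::memchr(",:[]{}", in[i], 6) != nullptr) { op |= 1ULL << i; }
  }
  // scalar() in the struct above is simply ~(op | whitespace).
}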
really_inline bool is_ascii(simd8x64<uint8_t> input) {
@ -44,11 +70,19 @@ really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8
return is_second_byte ^ is_third_byte ^ is_fourth_byte;
}
#include "generic/utf8_lookup2_algorithm.h"
#include "generic/stage1_find_marks.h"
#include "generic/buf_block_reader.h"
#include "generic/json_string_scanner.h"
#include "generic/json_scanner.h"
#include "generic/json_minifier.h"
WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
return arm64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
}
#include "generic/utf8_lookup2_algorithm.h"
#include "generic/json_structural_indexer.h"
WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept {
return arm64::stage1::find_structural_bits<64>(buf, len, parser, streaming);
return arm64::stage1::json_structural_indexer::index<64>(buf, len, parser, streaming);
}
} // namespace simdjson::arm64

View File

@ -14,6 +14,7 @@ public:
0
) {}
WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept final;
WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser, size_t &next_json) const noexcept final;

View File

@ -151,6 +151,62 @@ WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, do
return scanner.scan();
}
// big table for the minifier
static uint8_t jump_table[256 * 3] = {
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
};
WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
size_t i = 0, pos = 0;
uint8_t quote = 0;
uint8_t nonescape = 1;
while (i < len) {
unsigned char c = buf[i];
uint8_t *meta = jump_table + 3 * c;
quote = quote ^ (meta[0] & nonescape);
dst[pos] = c;
pos += meta[2] | quote;
i += 1;
nonescape = (~nonescape) | (meta[1]);
}
dst_len = pos; // we intentionally update the by-reference output only once, at the end,
// rather than inside the loop, for fear of aliasing
return SUCCESS;
}
} // namespace simdjson::fallback
#endif // SIMDJSON_FALLBACK_STAGE1_FIND_MARKS_H
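A hedged usage sketch of the minify() entry point introduced in this diff. How the implementation reference is obtained is left out (dispatch wiring is an assumption); only the buf/len/dst/dst_len contract visible above is exercised, with the destination sized to the input since minified output is never longer.
// Hedged usage sketch of the minify() signature added in this diff; illustration only.
#include "simdjson.h"
#include <cstring>
#include <vector>
size_t minify_with(const simdjson::implementation &impl, const char *json) {
  size_t len = std::strlen(json);
  std::vector<uint8_t> dst(len + 1);      // minified output is never longer than the input
  size_t dst_len = 0;
  simdjson::error_code error =
      impl.minify(reinterpret_cast<const uint8_t *>(json), len, dst.data(), dst_len);
  if (error != simdjson::SUCCESS) { return 0; }
  return dst_len;                         // e.g. { "a" : 1 }  ->  {"a":1}
}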

View File

@ -0,0 +1,48 @@
// Walks through a buffer in block-sized increments, loading the last part with spaces
template<size_t STEP_SIZE>
struct buf_block_reader {
public:
really_inline buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
really_inline size_t block_index() { return idx; }
really_inline bool has_full_block() const {
return idx < lenminusstep;
}
really_inline const uint8_t *full_block() const {
return &buf[idx];
}
really_inline bool has_remainder() const {
return idx < len;
}
really_inline void get_remainder(uint8_t *tmp_buf) const {
memset(tmp_buf, 0x20, STEP_SIZE);
memcpy(tmp_buf, buf + idx, len - idx);
}
really_inline void advance() {
idx += STEP_SIZE;
}
private:
const uint8_t *buf;
const size_t len;
const size_t lenminusstep;
size_t idx;
};
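The reader above is all the state the block loops later in this diff need. A hedged sketch of the intended calling pattern, assuming buf_block_reader is in scope and with process_block as a hypothetical callback standing in for the step() functions:
// Hedged sketch of the buf_block_reader calling pattern; illustration only.
#include <cstdint>
#include <cstring>
template<size_t STEP_SIZE, typename F>
void for_each_block(const uint8_t *buf, size_t len, F process_block) {
  buf_block_reader<STEP_SIZE> reader(buf, len);
  while (reader.has_full_block()) {
    process_block(reader.full_block());   // full STEP_SIZE bytes straight from buf
    reader.advance();
  }
  if (reader.has_remainder()) {
    uint8_t tmp[STEP_SIZE];
    reader.get_remainder(tmp);            // tail copied out and padded with spaces
    process_block(tmp);
    reader.advance();
  }
}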
// Routines to print masks and text for debugging bitmask operations
UNUSED static char * format_input_text(const simd8x64<uint8_t> in) {
static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1);
in.store((uint8_t*)buf);
for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
if (buf[i] < ' ') { buf[i] = '_'; }
}
buf[sizeof(simd8x64<uint8_t>)] = '\0';
return buf;
}
UNUSED static char * format_mask(uint64_t mask) {
static char *buf = (char*)malloc(64 + 1);
for (size_t i=0; i<64; i++) {
buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
}
buf[64] = '\0';
return buf;
}

View File

@ -0,0 +1,73 @@
// This file contains the common code every implementation uses in stage1
// It is intended to be included multiple times and compiled multiple times
// We assume the file in which it is included already includes
// "simdjson/stage1_find_marks.h" (this simplifies amalgation)
namespace stage1 {
class json_minifier {
public:
template<size_t STEP_SIZE>
static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept;
private:
really_inline json_minifier(uint8_t *_dst) : dst{_dst} {}
template<size_t STEP_SIZE>
really_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept;
really_inline void next(simd::simd8x64<uint8_t> in, json_block block);
really_inline error_code finish(uint8_t *dst_start, size_t &dst_len);
json_scanner scanner;
uint8_t *dst;
};
really_inline void json_minifier::next(simd::simd8x64<uint8_t> in, json_block block) {
uint64_t mask = block.whitespace();
in.compress(mask, dst);
dst += 64 - count_ones(mask);
}
really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
*dst = '\0';
error_code error = scanner.finish(false);
if (error) { dst_len = 0; return error; }
dst_len = dst - dst_start;
return SUCCESS;
}
template<>
really_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept {
simd::simd8x64<uint8_t> in_1(block_buf);
simd::simd8x64<uint8_t> in_2(block_buf+64);
json_block block_1 = scanner.next(in_1);
json_block block_2 = scanner.next(in_2);
this->next(in_1, block_1);
this->next(in_2, block_2);
reader.advance();
}
template<>
really_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept {
simd::simd8x64<uint8_t> in_1(block_buf);
json_block block_1 = scanner.next(in_1);
this->next(in_1, block_1);
reader.advance();
}
template<size_t STEP_SIZE>
error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept {
buf_block_reader<STEP_SIZE> reader(buf, len);
json_minifier minifier(dst);
while (reader.has_full_block()) {
minifier.step<STEP_SIZE>(reader.full_block(), reader);
}
if (likely(reader.has_remainder())) {
uint8_t block[STEP_SIZE];
reader.get_remainder(block);
minifier.step<STEP_SIZE>(block, reader);
}
return minifier.finish(dst, dst_len);
}
} // namespace stage1

103 src/generic/json_scanner.h Normal file
View File

@ -0,0 +1,103 @@
namespace stage1 {
/**
* A block of scanned json, with information on operators and scalars.
*/
struct json_block {
public:
/** The start of structurals */
really_inline uint64_t structural_start() { return potential_structural_start() & ~_string.string_tail(); }
/** All JSON whitespace (i.e. not in a string) */
really_inline uint64_t whitespace() { return non_quote_outside_string(_characters.whitespace()); }
// Helpers
/** Whether the given characters are inside a string (only works on non-quotes) */
really_inline uint64_t non_quote_inside_string(uint64_t mask) { return _string.non_quote_inside_string(mask); }
/** Whether the given characters are outside a string (only works on non-quotes) */
really_inline uint64_t non_quote_outside_string(uint64_t mask) { return _string.non_quote_outside_string(mask); }
// string and escape characters
json_string_block _string;
// whitespace, operators, scalars
json_character_block _characters;
// whether the previous character was a scalar
uint64_t _follows_potential_scalar;
private:
// Potential structurals (i.e. disregarding strings)
/** operators plus scalar starts like 123, true and "abc" */
really_inline uint64_t potential_structural_start() { return _characters.op() | potential_scalar_start(); }
/** the start of non-operator runs, like 123, true and "abc" */
really_inline uint64_t potential_scalar_start() { return _characters.scalar() & ~follows_potential_scalar(); }
/** whether the given character is immediately after a non-operator like 123, true or " */
really_inline uint64_t follows_potential_scalar() { return _follows_potential_scalar; }
};
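To make the mask algebra above concrete, here is a tiny hand-worked check on the 7-byte input {"a":1}, where bit i corresponds to byte i. All masks below are hand-derived assumptions for illustration; they are not produced by the SIMD classifier.
// Toy, scalar walk-through of the json_block mask algebra for the input {"a":1}.
#include <cassert>
#include <cstdint>
int main() {
  const uint64_t op             = 0b1010001;                 // {   :   }
  const uint64_t whitespace     = 0;
  const uint64_t scalar         = ~(op | whitespace) & 0x7F; // "  a  "  1
  const uint64_t follows_scalar = (scalar << 1) & 0x7F;      // no carry-in on a fresh block
  const uint64_t quote          = 0b0001010;                 // both quotes
  const uint64_t in_string      = 0b0000110;                 // start quote + 'a' (end quote excluded)
  const uint64_t string_tail    = in_string ^ quote;         // the string minus its start quote
  const uint64_t potential_structural_start = op | (scalar & ~follows_scalar);
  const uint64_t structural_start = potential_structural_start & ~string_tail;
  assert(structural_start == 0b1110011);                     // {  "  :  1  }
  return 0;
}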
/**
* Scans JSON for important bits: operators, strings, and scalars.
*
* The scanner starts by calculating two distinct things:
* - string characters (taking \" into account)
* - operators ([]{},:) and scalars (runs of non-operators like 123, true and "abc")
*
* To minimize data dependency (a key component of the scanner's speed), it finds these in parallel:
* in particular, the operator/scalar bit will find plenty of things that are actually part of
* strings. When we're done, json_block will fuse the two together by masking out tokens that are
* part of a string.
*/
class json_scanner {
public:
really_inline json_block next(const simd::simd8x64<uint8_t> in);
really_inline error_code finish(bool streaming);
private:
// Whether the last character of the previous iteration is part of a scalar token
// (anything except whitespace or an operator).
uint64_t prev_scalar = 0ULL;
json_string_scanner string_scanner;
};
//
// Check if the current character immediately follows a matching character.
//
// For example, this checks for quotes with backslashes in front of them:
//
// const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\\'), prev_backslash);
//
really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
const uint64_t result = match << 1 | overflow;
overflow = match >> 63;
return result;
}
//
// Check if the current character follows a matching character, with possible "filler" between.
// For example, this checks for empty curly braces, e.g.
//
// in.eq('}') & follows(in.eq('{'), in.eq(' '), prev_empty_array) // { <whitespace>* }
//
really_inline uint64_t follows(const uint64_t match, const uint64_t filler, uint64_t &overflow) {
uint64_t follows_match = follows(match, overflow);
uint64_t result;
overflow |= uint64_t(add_overflow(follows_match, filler, &result));
return result;
}
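A tiny scalar check of the single-argument follows() above on an 8-byte toy input; the masks are hand-made assumptions, for illustration only.
// Toy check of follows(): for the input "ab, cd" (bit i = byte i), the result marks
// the byte immediately after the comma.
#include <cassert>
#include <cstdint>
static uint64_t follows_toy(uint64_t match, uint64_t &overflow) {
  uint64_t result = match << 1 | overflow;  // same logic as follows() above
  overflow = match >> 63;                   // bit 63 carries into the next block
  return result;
}
int main() {
  uint64_t comma = 1ULL << 2;               // ',' is byte 2 of "ab, cd"
  uint64_t overflow = 0;
  assert(follows_toy(comma, overflow) == (1ULL << 3)); // the space right after the comma
  assert(overflow == 0);                    // a ',' in byte 63 would carry into the next block
  return 0;
}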
really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t> in) {
json_string_block strings = string_scanner.next(in);
json_character_block characters = json_character_block::classify(in);
uint64_t follows_scalar = follows(characters.scalar(), prev_scalar);
return {
strings,
characters,
follows_scalar
};
}
really_inline error_code json_scanner::finish(bool streaming) {
return string_scanner.finish(streaming);
}
} // namespace stage1

View File

@ -0,0 +1,127 @@
namespace stage1 {
struct json_string_block {
// Escaped characters (characters following an escape() character)
really_inline uint64_t escaped() const { return _escaped; }
// Escape characters (backslashes that are not escaped--i.e. in \\, includes only the first \)
really_inline uint64_t escape() const { return _backslash & ~_escaped; }
// Real (non-backslashed) quotes
really_inline uint64_t quote() const { return _quote; }
// Start quotes of strings
really_inline uint64_t string_start() const { return _quote & _in_string; }
// End quotes of strings
really_inline uint64_t string_end() const { return _quote & ~_in_string; }
// Only characters inside the string (not including the quotes)
really_inline uint64_t string_content() const { return _in_string & ~_quote; }
// Return a mask of whether the given characters are inside a string (only works on non-quotes)
really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; }
// Return a mask of whether the given characters are inside a string (only works on non-quotes)
really_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; }
// Tail of string (everything except the start quote)
really_inline uint64_t string_tail() const { return _in_string ^ _quote; }
// backslash characters
uint64_t _backslash;
// escaped characters (backslashed--does not include the hex characters after \u)
uint64_t _escaped;
// real quotes (non-backslashed ones)
uint64_t _quote;
// string characters (includes start quote but not end quote)
uint64_t _in_string;
};
// Scans blocks for string characters, storing the state necessary to do so
class json_string_scanner {
public:
really_inline json_string_block next(const simd::simd8x64<uint8_t> in);
really_inline error_code finish(bool streaming);
private:
really_inline uint64_t find_escaped(uint64_t escape);
// Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
uint64_t prev_in_string = 0ULL;
// Whether the first character of the next iteration is escaped.
uint64_t prev_escaped = 0ULL;
};
//
// Finds escaped characters (characters following \).
//
// Handles runs of backslashes like \\\" and \\\\" correctly (yielding 0101 and 01010, respectively).
//
// Does this by:
// - Shift the escape mask to get potentially escaped characters (characters after backslashes).
// - Mask escaped sequences that start on *even* bits with 1010101010 (odd bits are escaped, even bits are not)
// - Mask escaped sequences that start on *odd* bits with 0101010101 (even bits are escaped, odd bits are not)
//
// To distinguish between escaped sequences starting on even/odd bits, it finds the start of all
// escape sequences, filters out the ones that start on even bits, and adds that to the mask of
// escape sequences. This causes the addition to clear out the sequences starting on odd bits (since
// the start bit causes a carry), and leaves even-bit sequences alone.
//
// Example:
//
// text | \\\ | \\\"\\\" \\\" \\"\\" |
// escape | xxx | xx xxx xxx xx xx | Removed overflow backslash; will | it into follows_escape
// odd_starts | x | x x x | escape & ~even_bits & ~follows_escape
// even_seq | c| cxxx c xx c | c = carry bit -- will be masked out later
// invert_mask | | cxxx c xx c| even_seq << 1
// follows_escape | xx | x xx xxx xxx xx xx | Includes overflow bit
// escaped | x | x x x x x x x x |
// desired | x | x x x x x x x x |
// text | \\\ | \\\"\\\" \\\" \\"\\" |
//
really_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) {
// If there was overflow, pretend the first character isn't a backslash
backslash &= ~prev_escaped;
uint64_t follows_escape = backslash << 1 | prev_escaped;
// Get sequences starting on even bits by clearing out the odd series using +
const uint64_t even_bits = 0x5555555555555555ULL;
uint64_t odd_sequence_starts = backslash & ~even_bits & ~follows_escape;
uint64_t sequences_starting_on_even_bits;
prev_escaped = add_overflow(odd_sequence_starts, backslash, &sequences_starting_on_even_bits);
uint64_t invert_mask = sequences_starting_on_even_bits << 1; // The mask we want to return is the *escaped* bits, not escapes.
// Mask every other backslashed character as an escaped character
// Flip the mask for sequences that start on even bits, to correct them
return (even_bits ^ invert_mask) & follows_escape;
}
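The two example strings in the comment above can be checked with a stand-alone scalar copy of the routine. The toy_add_overflow helper and find_escaped_toy copy are hand-rolled stand-ins, for illustration only.
// Scalar re-check of find_escaped() on the documented inputs \\\" and \\\\" .
#include <cassert>
#include <cstdint>
static bool toy_add_overflow(uint64_t a, uint64_t b, uint64_t *out) {
  *out = a + b;
  return *out < a;                          // carry out of bit 63
}
static uint64_t find_escaped_toy(uint64_t backslash, uint64_t &prev_escaped) {
  backslash &= ~prev_escaped;
  uint64_t follows_escape = backslash << 1 | prev_escaped;
  const uint64_t even_bits = 0x5555555555555555ULL;
  uint64_t odd_sequence_starts = backslash & ~even_bits & ~follows_escape;
  uint64_t sequences_starting_on_even_bits;
  prev_escaped = toy_add_overflow(odd_sequence_starts, backslash, &sequences_starting_on_even_bits);
  uint64_t invert_mask = sequences_starting_on_even_bits << 1;
  return (even_bits ^ invert_mask) & follows_escape;
}
int main() {
  uint64_t prev = 0;
  // \\\"  : backslashes at bytes 0,1,2; escaped characters at bytes 1 and 3 ("0101")
  assert(find_escaped_toy(0b0111, prev) == 0b1010);
  prev = 0;
  // \\\\" : backslashes at bytes 0..3; escaped characters at bytes 1 and 3 ("01010")
  assert(find_escaped_toy(0b01111, prev) == 0b01010);
  return 0;
}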
//
// Return a mask of all string characters plus end quotes.
//
// prev_escaped carries the overflow: whether the first character of the next block is escaped.
// prev_in_string carries the overflow: whether we are still inside a string.
//
// Backslash sequences outside of quotes will be detected in stage 2.
//
really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t> in) {
const uint64_t backslash = in.eq('\\');
const uint64_t escaped = find_escaped(backslash);
const uint64_t quote = in.eq('"') & ~escaped;
// prefix_xor flips on bits inside the string (and flips off the end quote).
// Then we xor with prev_in_string: if we were in a string already, its effect is flipped
// (characters inside strings are outside, and characters outside strings are inside).
const uint64_t in_string = prefix_xor(quote) ^ prev_in_string;
// A right shift of a signed value is expected to be well-defined and
// standard-compliant as of C++20; John Regehr from Utah U. says this is fine code.
prev_in_string = static_cast<uint64_t>(static_cast<int64_t>(in_string) >> 63);
// Use ^ to turn the beginning quote off, and the end quote on.
return {
backslash,
escaped,
quote,
in_string
};
}
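The prefix_xor step above can be understood with a scalar stand-in: bit i of prefix_xor(quote) is the parity of quote bits 0..i, which turns each pair of quotes into an in-string region that includes the opening quote and excludes the closing one. The doubling-shift implementation below is an assumption for illustration; the library computes this with a carry-less multiply.
// Scalar stand-in for prefix_xor(): bit i of the result is the XOR of input bits 0..i.
#include <cassert>
#include <cstdint>
static uint64_t prefix_xor_toy(uint64_t bitmask) {
  bitmask ^= bitmask << 1;
  bitmask ^= bitmask << 2;
  bitmask ^= bitmask << 4;
  bitmask ^= bitmask << 8;
  bitmask ^= bitmask << 16;
  bitmask ^= bitmask << 32;
  return bitmask;
}
int main() {
  // "ab" as a 4-byte block: quotes at bytes 0 and 3.
  uint64_t quote = 0b1001;
  // in_string covers bytes 0..2: the opening quote and the content, not the closing quote.
  assert(prefix_xor_toy(quote) == 0b0111);
  return 0;
}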
really_inline error_code json_string_scanner::finish(bool streaming) {
if (prev_in_string and (not streaming)) {
return UNCLOSED_STRING;
}
return SUCCESS;
}
} // namespace stage1

View File

@ -0,0 +1,175 @@
// This file contains the common code every implementation uses in stage1
// It is intended to be included multiple times and compiled multiple times
// We assume the file in which it is included already includes
// "simdjson/stage1_find_marks.h" (this simplifies amalgation)
namespace stage1 {
class bit_indexer {
public:
uint32_t *tail;
really_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
// Flatten out the set bits in 'bits', assuming each is to have the value of idx
// plus its position in the bitvector, and store these indexes at 'tail',
// advancing 'tail' as we go.
// This may store extra values beyond the end of the valid bits, so the output
// buffer needs to be large enough to handle this.
really_inline void write(uint32_t idx, uint64_t bits) {
// In some instances, the next branch is expensive because it is mispredicted.
// Unfortunately, in other cases,
// it helps tremendously.
if (bits == 0)
return;
uint32_t cnt = count_ones(bits);
// Do the first 8 all together
for (int i=0; i<8; i++) {
this->tail[i] = idx + trailing_zeroes(bits);
bits = clear_lowest_bit(bits);
}
// Do the next 8 all together (we hope in most cases it won't happen at all
// and the branch is easily predicted).
if (unlikely(cnt > 8)) {
for (int i=8; i<16; i++) {
this->tail[i] = idx + trailing_zeroes(bits);
bits = clear_lowest_bit(bits);
}
// Most files don't have 16+ structurals per block, so we take several basically guaranteed
// branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
// or the start of a value ("abc" true 123) every four characters.
if (unlikely(cnt > 16)) {
uint32_t i = 16;
do {
this->tail[i] = idx + trailing_zeroes(bits);
bits = clear_lowest_bit(bits);
i++;
} while (i < cnt);
}
}
this->tail += cnt;
}
};
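A hedged sketch of the write() contract above, using a plain loop instead of the unrolled groups of 8: for a block starting at idx, each set bit at position p becomes the index idx + p, and tail advances by the popcount (even though the unrolled version may scribble extra slots past that point, as the comment notes). The builtins below are GCC/Clang stand-ins for the library's count_ones/trailing_zeroes helpers; illustration only.
// Scalar model of bit_indexer::write(); illustration only.
#include <cassert>
#include <cstdint>
static uint32_t *write_toy(uint32_t *tail, uint32_t idx, uint64_t bits) {
  int cnt = __builtin_popcountll(bits);     // count_ones()
  for (int i = 0; i < cnt; i++) {
    tail[i] = idx + __builtin_ctzll(bits);  // index of the lowest remaining set bit
    bits &= bits - 1;                       // clear_lowest_bit()
  }
  return tail + cnt;                        // tail advances by the popcount
}
int main() {
  uint32_t out[8];
  uint32_t *end = write_toy(out, 64, (1ULL << 0) | (1ULL << 5) | (1ULL << 63));
  assert(end - out == 3 && out[0] == 64 && out[1] == 69 && out[2] == 127);
  return 0;
}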
class json_structural_indexer {
public:
template<size_t STEP_SIZE>
static error_code index(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) noexcept;
private:
really_inline json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {}
template<size_t STEP_SIZE>
really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
really_inline void next(simd::simd8x64<uint8_t> in, json_block block, size_t idx);
really_inline error_code finish(document::parser &parser, size_t idx, size_t len, bool streaming);
json_scanner scanner;
utf8_checker checker{};
bit_indexer indexer;
uint64_t prev_structurals = 0;
uint64_t unescaped_chars_error = 0;
};
really_inline void json_structural_indexer::next(simd::simd8x64<uint8_t> in, json_block block, size_t idx) {
uint64_t unescaped = in.lteq(0x1F);
checker.check_next_input(in);
indexer.write(idx-64, prev_structurals); // Output *last* iteration's structurals to the parser
prev_structurals = block.structural_start();
unescaped_chars_error |= block.non_quote_inside_string(unescaped);
}
really_inline error_code json_structural_indexer::finish(document::parser &parser, size_t idx, size_t len, bool streaming) {
// Write out the final iteration's structurals
indexer.write(idx-64, prev_structurals);
error_code error = scanner.finish(streaming);
if (unlikely(error != SUCCESS)) { return error; }
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
parser.n_structural_indexes = indexer.tail - parser.structural_indexes.get();
/* a valid JSON file cannot have zero structural indexes - we should have
* found something */
if (unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (len != parser.structural_indexes[parser.n_structural_indexes - 1]) {
/* the string might not be NULL terminated, but we add a virtual NULL
* ending character. */
parser.structural_indexes[parser.n_structural_indexes++] = len;
}
/* make it safe to dereference one beyond this array */
parser.structural_indexes[parser.n_structural_indexes] = 0;
return checker.errors();
}
template<>
really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
simd::simd8x64<uint8_t> in_1(block);
simd::simd8x64<uint8_t> in_2(block+64);
json_block block_1 = scanner.next(in_1);
json_block block_2 = scanner.next(in_2);
this->next(in_1, block_1, reader.block_index());
this->next(in_2, block_2, reader.block_index()+64);
reader.advance();
}
template<>
really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
simd::simd8x64<uint8_t> in_1(block);
json_block block_1 = scanner.next(in_1);
this->next(in_1, block_1, reader.block_index());
reader.advance();
}
//
// Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
//
// PERF NOTES:
// We pipe 2 inputs through these stages:
// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
// 2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
// The output of step 1 depends entirely on this information. These functions don't quite use
// up enough CPU: the second half of the functions is highly serial, only using 1 execution core
// at a time. The second input's scans have some dependency on the first one's finishing, but
// they can make a lot of progress before they need that information.
// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
// to finish: utf-8 checks and generating the output from the last iteration.
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
// Setting the streaming parameter to true allows the indexer to tolerate unclosed strings.
// The caller should still ensure that the input is valid UTF-8. If you are processing substrings,
// you may want to call a function like trimmed_length_safe_utf8.
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) noexcept {
if (unlikely(len > parser.capacity())) { return CAPACITY; }
buf_block_reader<STEP_SIZE> reader(buf, len);
json_structural_indexer indexer(parser.structural_indexes.get());
while (reader.has_full_block()) {
indexer.step<STEP_SIZE>(reader.full_block(), reader);
}
if (likely(reader.has_remainder())) {
uint8_t block[STEP_SIZE];
reader.get_remainder(block);
indexer.step<STEP_SIZE>(block, reader);
}
return indexer.finish(parser, reader.block_index(), len, streaming);
}
} // namespace stage1

View File

@ -1,425 +0,0 @@
// This file contains the common code every implementation uses in stage1
// It is intended to be included multiple times and compiled multiple times
// We assume the file in which it is included already includes
// "simdjson/stage1_find_marks.h" (this simplifies amalgation)
namespace stage1 {
class bit_indexer {
public:
uint32_t *tail;
bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
// flatten out values in 'bits' assuming that they are to have values of idx
// plus their position in the bitvector, and store these indexes at
// base_ptr[base] incrementing base as we go
// will potentially store extra values beyond end of valid bits, so base_ptr
// needs to be large enough to handle this
really_inline void write_indexes(uint32_t idx, uint64_t bits) {
// In some instances, the next branch is expensive because it is mispredicted.
// Unfortunately, in other cases,
// it helps tremendously.
if (bits == 0)
return;
uint32_t cnt = hamming(bits);
// Do the first 8 all together
for (int i=0; i<8; i++) {
this->tail[i] = idx + trailing_zeroes(bits);
bits = clear_lowest_bit(bits);
}
// Do the next 8 all together (we hope in most cases it won't happen at all
// and the branch is easily predicted).
if (unlikely(cnt > 8)) {
for (int i=8; i<16; i++) {
this->tail[i] = idx + trailing_zeroes(bits);
bits = clear_lowest_bit(bits);
}
// Most files don't have 16+ structurals per block, so we take several basically guaranteed
// branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
// or the start of a value ("abc" true 123) every four characters.
if (unlikely(cnt > 16)) {
uint32_t i = 16;
do {
this->tail[i] = idx + trailing_zeroes(bits);
bits = clear_lowest_bit(bits);
i++;
} while (i < cnt);
}
}
this->tail += cnt;
}
};
class json_structural_scanner {
public:
// Whether the first character of the next iteration is escaped.
uint64_t prev_escaped = 0ULL;
// Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
uint64_t prev_in_string = 0ULL;
// Whether the last character of the previous iteration is a primitive value character
// (anything except whitespace, braces, comma or colon).
uint64_t prev_primitive = 0ULL;
// Mask of structural characters from the last iteration.
// Kept around for performance reasons, so we can call flatten_bits to soak up some unused
// CPU capacity while the next iteration is busy with an expensive clmul in compute_quote_mask.
uint64_t prev_structurals = 0;
// Errors with unescaped characters in strings (ASCII codepoints < 0x20)
uint64_t unescaped_chars_error = 0;
bit_indexer structural_indexes;
json_structural_scanner(uint32_t *_structural_indexes) : structural_indexes{_structural_indexes} {}
//
// Finish the scan and return any errors.
//
// This may detect errors as well, such as unclosed string and certain UTF-8 errors.
// if streaming is set to true, an unclosed string is allowed.
//
really_inline error_code detect_errors_on_eof(bool streaming = false);
//
// Return a mask of all string characters plus end quotes.
//
// prev_escaped is overflow saying whether the next character is escaped.
// prev_in_string is overflow saying whether we're still in a string.
//
// Backslash sequences outside of quotes will be detected in stage 2.
//
really_inline uint64_t find_strings(const simd::simd8x64<uint8_t> in);
//
// Determine which characters are *structural*:
// - braces: [] and {}
// - the start of primitives (123, true, false, null)
// - the start of invalid non-whitespace (+, &, ture, UTF-8)
//
// Also detects value sequence errors:
// - two values with no separator between ("hello" "world")
// - separators with no values ([1,] [1,,]and [,2])
//
// This method will find all of the above whether it is in a string or not.
//
// To reduce dependency on the expensive "what is in a string" computation, this method treats the
// contents of a string the same as content outside. Errors and structurals inside the string or on
// the trailing quote will need to be removed later when the correct string information is known.
//
really_inline uint64_t find_potential_structurals(const simd::simd8x64<uint8_t> in);
//
// Find the important bits of JSON in a STEP_SIZE-byte chunk, and add them to structural_indexes.
//
template<size_t STEP_SIZE>
really_inline void scan_step(const uint8_t *buf, const size_t idx, utf8_checker &utf8_checker);
//
// Parse the entire input in STEP_SIZE-byte chunks.
//
template<size_t STEP_SIZE>
really_inline void scan(const uint8_t *buf, const size_t len, utf8_checker &utf8_checker);
};
// Routines to print masks and text for debugging bitmask operations
UNUSED static char * format_input_text(const simd8x64<uint8_t> in) {
static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1);
in.store((uint8_t*)buf);
for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
if (buf[i] < ' ') { buf[i] = '_'; }
}
buf[sizeof(simd8x64<uint8_t>)] = '\0';
return buf;
}
UNUSED static char * format_mask(uint64_t mask) {
static char *buf = (char*)malloc(64 + 1);
for (size_t i=0; i<64; i++) {
buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
}
buf[64] = '\0';
return buf;
}
//
// Finds escaped characters (characters following \).
//
// Handles runs of backslashes like \\\" and \\\\" correctly (yielding 0101 and 01010, respectively).
//
// Does this by:
// - Shift the escape mask to get potentially escaped characters (characters after backslashes).
// - Mask escaped sequences that start on *even* bits with 1010101010 (odd bits are escaped, even bits are not)
// - Mask escaped sequences that start on *odd* bits with 0101010101 (even bits are escaped, odd bits are not)
//
// To distinguish between escaped sequences starting on even/odd bits, it finds the start of all
// escape sequences, filters out the ones that start on even bits, and adds that to the mask of
// escape sequences. This causes the addition to clear out the sequences starting on odd bits (since
// the start bit causes a carry), and leaves even-bit sequences alone.
//
// Example:
//
// text | \\\ | \\\"\\\" \\\" \\"\\" |
// escape | xxx | xx xxx xxx xx xx | Removed overflow backslash; will | it into follows_escape
// odd_starts | x | x x x | escape & ~even_bits & ~follows_escape
// even_seq | c| cxxx c xx c | c = carry bit -- will be masked out later
// invert_mask | | cxxx c xx c| even_seq << 1
// follows_escape | xx | x xx xxx xxx xx xx | Includes overflow bit
// escaped | x | x x x x x x x x |
// desired | x | x x x x x x x x |
// text | \\\ | \\\"\\\" \\\" \\"\\" |
//
really_inline uint64_t find_escaped(uint64_t escape, uint64_t &escaped_overflow) {
// If there was overflow, pretend the first character isn't a backslash
escape &= ~escaped_overflow;
uint64_t follows_escape = escape << 1 | escaped_overflow;
// Get sequences starting on even bits by clearing out the odd series using +
const uint64_t even_bits = 0x5555555555555555ULL;
uint64_t odd_sequence_starts = escape & ~even_bits & ~follows_escape;
uint64_t sequences_starting_on_even_bits;
escaped_overflow = add_overflow(odd_sequence_starts, escape, &sequences_starting_on_even_bits);
uint64_t invert_mask = sequences_starting_on_even_bits << 1; // The mask we want to return is the *escaped* bits, not escapes.
// Mask every other backslashed character as an escaped character
// Flip the mask for sequences that start on even bits, to correct them
return (even_bits ^ invert_mask) & follows_escape;
}
//
// Check if the current character immediately follows a matching character.
//
// For example, this checks for quotes with backslashes in front of them:
//
// const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\\'), prev_backslash);
//
really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
const uint64_t result = match << 1 | overflow;
overflow = match >> 63;
return result;
}
//
// Check if the current character follows a matching character, with possible "filler" between.
// For example, this checks for empty curly braces, e.g.
//
// in.eq('}') & follows(in.eq('{'), in.eq(' '), prev_empty_array) // { <whitespace>* }
//
really_inline uint64_t follows(const uint64_t match, const uint64_t filler, uint64_t &overflow) {
uint64_t follows_match = follows(match, overflow);
uint64_t result;
overflow |= uint64_t(add_overflow(follows_match, filler, &result));
return result;
}
really_inline error_code json_structural_scanner::detect_errors_on_eof(bool streaming) {
if ((prev_in_string) and (not streaming)) {
return UNCLOSED_STRING;
}
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
return SUCCESS;
}
//
// Return a mask of all string characters plus end quotes.
//
// prev_escaped is overflow saying whether the next character is escaped.
// prev_in_string is overflow saying whether we're still in a string.
//
// Backslash sequences outside of quotes will be detected in stage 2.
//
really_inline uint64_t json_structural_scanner::find_strings(const simd::simd8x64<uint8_t> in) {
const uint64_t backslash = in.eq('\\');
const uint64_t escaped = find_escaped(backslash, prev_escaped);
const uint64_t quote = in.eq('"') & ~escaped;
// prefix_xor flips on bits inside the string (and flips off the end quote).
const uint64_t in_string = prefix_xor(quote) ^ prev_in_string;
/* right shift of a signed value expected to be well-defined and standard
* compliant as of C++20,
* John Regehr from Utah U. says this is fine code */
prev_in_string = static_cast<uint64_t>(static_cast<int64_t>(in_string) >> 63);
// Use ^ to turn the beginning quote off, and the end quote on.
return in_string ^ quote;
}
//
// Determine which characters are *structural*:
// - braces: [] and {}
// - the start of primitives (123, true, false, null)
// - the start of invalid non-whitespace (+, &, ture, UTF-8)
//
// Also detects value sequence errors:
// - two values with no separator between ("hello" "world")
// - separators with no values ([1,] [1,,]and [,2])
//
// This method will find all of the above whether it is in a string or not.
//
// To reduce dependency on the expensive "what is in a string" computation, this method treats the
// contents of a string the same as content outside. Errors and structurals inside the string or on
// the trailing quote will need to be removed later when the correct string information is known.
//
really_inline uint64_t json_structural_scanner::find_potential_structurals(const simd::simd8x64<uint8_t> in) {
// These use SIMD so let's kick them off before running the regular 64-bit stuff ...
uint64_t whitespace, op;
find_whitespace_and_operators(in, whitespace, op);
// Detect the start of a run of primitive characters. Includes numbers, booleans, and strings (").
// Everything except whitespace, braces, colon and comma.
const uint64_t primitive = ~(op | whitespace);
const uint64_t follows_primitive = follows(primitive, prev_primitive);
const uint64_t start_primitive = primitive & ~follows_primitive;
// Return final structurals
return op | start_primitive;
}
//
// Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
//
// PERF NOTES:
// We pipe 2 inputs through these stages:
// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
// 2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
// 2. Scan the JSON for critical data: strings, primitives and operators. This is the critical path.
// The output of step 1 depends entirely on this information. These functions don't quite use
// up enough CPU: the second half of the functions is highly serial, only using 1 execution core
// at a time. The second input's scans have some dependency on the first one's finishing, but
// they can make a lot of progress before they need that information.
// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
// to finish: utf-8 checks and generating the output from the last iteration.
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
template<>
really_inline void json_structural_scanner::scan_step<128>(const uint8_t *buf, const size_t idx, utf8_checker &utf8_checker) {
//
// Load up all 128 bytes into SIMD registers
//
simd::simd8x64<uint8_t> in_1(buf);
simd::simd8x64<uint8_t> in_2(buf+64);
//
// Find the strings and potential structurals (operators / primitives).
//
// This will include false structurals that are *inside* strings--we'll filter strings out
// before we return.
//
uint64_t string_1 = this->find_strings(in_1);
uint64_t structurals_1 = this->find_potential_structurals(in_1);
uint64_t string_2 = this->find_strings(in_2);
uint64_t structurals_2 = this->find_potential_structurals(in_2);
//
// Do miscellaneous work while the processor is busy calculating strings and structurals.
//
// After that, weed out structurals that are inside strings and find invalid string characters.
//
uint64_t unescaped_1 = in_1.lteq(0x1F);
utf8_checker.check_next_input(in_1);
this->structural_indexes.write_indexes(idx-64, this->prev_structurals); // Output *last* iteration's structurals to the parser
this->prev_structurals = structurals_1 & ~string_1;
this->unescaped_chars_error |= unescaped_1 & string_1;
uint64_t unescaped_2 = in_2.lteq(0x1F);
utf8_checker.check_next_input(in_2);
this->structural_indexes.write_indexes(idx, this->prev_structurals); // Output *last* iteration's structurals to the parser
this->prev_structurals = structurals_2 & ~string_2;
this->unescaped_chars_error |= unescaped_2 & string_2;
}
//
// Find the important bits of JSON in a 64-byte chunk, and add them to structural_indexes.
//
template<>
really_inline void json_structural_scanner::scan_step<64>(const uint8_t *buf, const size_t idx, utf8_checker &utf8_checker) {
//
// Load up bytes into SIMD registers
//
simd::simd8x64<uint8_t> in_1(buf);
//
// Find the strings and potential structurals (operators / primitives).
//
// This will include false structurals that are *inside* strings--we'll filter strings out
// before we return.
//
uint64_t string_1 = this->find_strings(in_1);
uint64_t structurals_1 = this->find_potential_structurals(in_1);
//
// Do miscellaneous work while the processor is busy calculating strings and structurals.
//
// After that, weed out structurals that are inside strings and find invalid string characters.
//
uint64_t unescaped_1 = in_1.lteq(0x1F);
utf8_checker.check_next_input(in_1);
this->structural_indexes.write_indexes(idx-64, this->prev_structurals); // Output *last* iteration's structurals to ParsedJson
this->prev_structurals = structurals_1 & ~string_1;
this->unescaped_chars_error |= unescaped_1 & string_1;
}
template<size_t STEP_SIZE>
really_inline void json_structural_scanner::scan(const uint8_t *buf, const size_t len, utf8_checker &utf8_checker) {
size_t lenminusstep = len < STEP_SIZE ? 0 : len - STEP_SIZE;
size_t idx = 0;
for (; idx < lenminusstep; idx += STEP_SIZE) {
this->scan_step<STEP_SIZE>(&buf[idx], idx, utf8_checker);
}
/* If we have a final chunk of less than STEP_SIZE bytes, pad it to STEP_SIZE with
* spaces before processing it (otherwise, we risk invalidating the UTF-8
* checks). */
if (likely(idx < len)) {
uint8_t tmp_buf[STEP_SIZE];
memset(tmp_buf, 0x20, STEP_SIZE);
memcpy(tmp_buf, buf + idx, len - idx);
this->scan_step<STEP_SIZE>(&tmp_buf[0], idx, utf8_checker);
idx += STEP_SIZE;
}
/* finally, flatten out the remaining structurals from the last iteration */
this->structural_indexes.write_indexes(idx-64, this->prev_structurals);
}
// Setting the streaming parameter to true allows the find_structural_bits to tolerate unclosed strings.
// The caller should still ensure that the input is valid UTF-8. If you are processing substrings,
// you may want to call on a function like trimmed_length_safe_utf8.
template<size_t STEP_SIZE>
error_code find_structural_bits(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) {
if (unlikely(len > parser.capacity())) {
return CAPACITY;
}
utf8_checker utf8_checker{};
json_structural_scanner scanner{parser.structural_indexes.get()};
scanner.scan<STEP_SIZE>(buf, len, utf8_checker);
// we might tolerate an unclosed string if streaming is true
error_code error = scanner.detect_errors_on_eof(streaming);
if (unlikely(error != SUCCESS)) {
return error;
}
parser.n_structural_indexes = scanner.structural_indexes.tail - parser.structural_indexes.get();
/* a valid JSON file cannot have zero structural indexes - we should have
* found something */
if (unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (len != parser.structural_indexes[parser.n_structural_indexes - 1]) {
/* the string might not be NULL terminated, but we add a virtual NULL
* ending character. */
parser.structural_indexes[parser.n_structural_indexes++] = len;
}
/* make it safe to dereference one beyond this array */
parser.structural_indexes[parser.n_structural_indexes] = 0;
return utf8_checker.errors();
}
} // namespace stage1

View File

@ -101,7 +101,6 @@ public:
return next_structural;
}
private:
const uint8_t* const buf;
const size_t len;
const uint32_t* const structural_indexes;

View File

@ -37,7 +37,7 @@ really_inline int leading_zeroes(uint64_t input_num) {
return static_cast<int>(_lzcnt_u64(input_num));
}
really_inline int hamming(uint64_t input_num) {
really_inline int count_ones(uint64_t input_num) {
#ifdef _MSC_VER
// note: we do not support legacy 32-bit Windows
return __popcnt64(input_num);// Visual Studio wants two underscores

View File

@ -14,6 +14,7 @@ public:
instruction_set::AVX2 | instruction_set::PCLMULQDQ | instruction_set::BMI1 | instruction_set::BMI2
) {}
WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept final;
WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser, size_t &next_json) const noexcept final;

View File

@ -2,6 +2,8 @@
#define SIMDJSON_HASWELL_SIMD_H
#include "simdjson.h"
#include "simdprune_tables.h"
#include "haswell/bitmanipulation.h"
#include "haswell/intrinsics.h"
TARGET_HASWELL
@ -109,6 +111,57 @@ namespace simdjson::haswell::simd {
really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
return _mm256_shuffle_epi8(lookup_table, *this);
}
// Copies to 'output' all bytes corresponding to a 0 in the mask (interpreted as a bitset).
// Passing a 0 value for mask would be equivalent to writing out every byte to output.
// Only the first 32 - count_ones(mask) bytes of the result are significant but 32 bytes
// get written.
// Design consideration: it seems like a function with the
// signature simd8<L> compress(uint32_t mask) would be
// sensible, but the AVX ISA makes this kind of approach difficult.
template<typename L>
really_inline void compress(uint32_t mask, L * output) const {
// this particular implementation was inspired by work done by @animetosho
// we do it in four steps, first 8 bytes and then second 8 bytes...
uint8_t mask1 = static_cast<uint8_t>(mask); // least significant 8 bits
uint8_t mask2 = static_cast<uint8_t>(mask >> 8); // second least significant 8 bits
uint8_t mask3 = static_cast<uint8_t>(mask >> 16); // ...
uint8_t mask4 = static_cast<uint8_t>(mask >> 24); // ...
// the next line loads the four 64-bit values thintable_epi8[mask1] through
// thintable_epi8[mask4] into a single 256-bit register.
__m256i shufmask = _mm256_set_epi64x(thintable_epi8[mask4], thintable_epi8[mask3],
thintable_epi8[mask2], thintable_epi8[mask1]);
// we increment by 0x08 the second half of the mask and so forth
shufmask =
_mm256_add_epi8(shufmask, _mm256_set_epi32(0x18181818, 0x18181818,
0x10101010, 0x10101010, 0x08080808, 0x08080808, 0, 0));
// this is the version "nearly pruned"
__m256i pruned = _mm256_shuffle_epi8(*this, shufmask);
// we still need to put the pieces back together.
// we compute the popcounts of the first and third 8-bit words:
int pop1 = BitsSetTable256mul2[mask1];
int pop3 = BitsSetTable256mul2[mask3];
// then load the corresponding mask
// could be done with _mm256_loadu2_m128i but many standard libraries omit this intrinsic.
__m256i v256 = _mm256_castsi128_si256(
_mm_loadu_si128((const __m128i *)(pshufb_combine_table + pop1 * 8)));
__m256i compactmask = _mm256_insertf128_si256(v256,
_mm_loadu_si128((const __m128i *)(pshufb_combine_table + pop3 * 8)), 1);
__m256i almostthere = _mm256_shuffle_epi8(pruned, compactmask);
// We just need to write out the result.
// This is the tricky bit that is hard to do
// if we want to return a SIMD register, since there
// is no single-instruction approach to recombine
// the two 128-bit lanes with an offset.
__m128i v128;
v128 = _mm256_castsi256_si128(almostthere);
_mm_storeu_si128( (__m128i *)output, v128);
v128 = _mm256_extractf128_si256(almostthere, 1);
_mm_storeu_si128( (__m128i *)(output + 16 - count_ones(mask & 0xFFFF)), v128);
}
template<typename L>
really_inline simd8<L> lookup_16(
L replace0, L replace1, L replace2, L replace3,
@ -249,6 +302,13 @@ namespace simdjson::haswell::simd {
each(1);
}
really_inline void compress(uint64_t mask, T * output) const {
uint32_t mask1 = static_cast<uint32_t>(mask);
uint32_t mask2 = static_cast<uint32_t>(mask >> 32);
this->chunks[0].compress(mask1, output);
this->chunks[1].compress(mask2, output + 32 - count_ones(mask1));
}
really_inline void store(T ptr[64]) const {
this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
@ -269,6 +329,8 @@ namespace simdjson::haswell::simd {
);
}
template <typename R=bool, typename F>
really_inline simd8x64<R> map(const simd8x64<uint8_t> b, F const& map_chunk) const {
return simd8x64<R>(
@ -302,7 +364,6 @@ namespace simdjson::haswell::simd {
const simd8<T> mask = simd8<T>::splat(m);
return this->map( [&](auto a) { return a <= mask; } ).to_bitmask();
}
}; // struct simd8x64<T>
} // namespace simdjson::haswell::simd

View File

@ -13,21 +13,37 @@ namespace simdjson::haswell {
using namespace simd;
really_inline void find_whitespace_and_operators(simd8x64<uint8_t> in, uint64_t &whitespace, uint64_t &op) {
struct json_character_block {
static really_inline json_character_block classify(const simd::simd8x64<uint8_t> in);
really_inline uint64_t whitespace() const { return _whitespace; }
really_inline uint64_t op() const { return _op; }
really_inline uint64_t scalar() { return ~(op() | whitespace()); }
uint64_t _whitespace;
uint64_t _op;
};
really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t> in) {
// These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why
// we can't use the generic lookup_16.
auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
auto op_table = simd8<uint8_t>::repeat_16(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{');
whitespace = in.map([&](simd8<uint8_t> _in) {
// We compute whitespace and op separately. If later code only uses one or the
// other, then, given that all functions are aggressively inlined, we can
// hope that the useless computations will be omitted. This is notably the case
// when minifying (we only need whitespace).
uint64_t whitespace = in.map([&](simd8<uint8_t> _in) {
return _in == simd8<uint8_t>(_mm256_shuffle_epi8(whitespace_table, _in));
}).to_bitmask();
op = in.map([&](simd8<uint8_t> _in) {
uint64_t op = in.map([&](simd8<uint8_t> _in) {
// | 32 handles the fact that { } and [ ] are exactly 32 bytes apart
return (_in | 32) == simd8<uint8_t>(_mm256_shuffle_epi8(op_table, _in-','));
}).to_bitmask();
return { whitespace, op };
}
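The | 32 trick in the op lookup above rests on simple ASCII arithmetic: '[' (0x5B) and '{' (0x7B), as well as ']' (0x5D) and '}' (0x7D), differ only in bit 5, so OR-ing with 32 folds both bracket kinds onto the curly-brace entries of op_table, while ',' and ':' already have that bit set. A quick stand-alone check of that arithmetic, hand-written and illustrative only:
// Stand-alone check of the ASCII identity behind the "| 32" in the op lookup.
#include <cassert>
int main() {
  static_assert(('[' | 32) == '{' && (']' | 32) == '}', "brackets fold onto braces");
  static_assert(('{' | 32) == '{' && ('}' | 32) == '}', "braces are unchanged");
  assert((',' | 32) == ',' && (':' | 32) == ':');       // comma and colon already have bit 5 set
  return 0;
}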
really_inline bool is_ascii(simd8x64<uint8_t> input) {
@ -43,11 +59,19 @@ really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8
return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
}
#include "generic/utf8_lookup2_algorithm.h"
#include "generic/stage1_find_marks.h"
#include "generic/buf_block_reader.h"
#include "generic/json_string_scanner.h"
#include "generic/json_scanner.h"
#include "generic/json_minifier.h"
WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len);
}
#include "generic/utf8_lookup2_algorithm.h"
#include "generic/json_structural_indexer.h"
WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept {
return haswell::stage1::find_structural_bits<128>(buf, len, parser, streaming);
return haswell::stage1::json_structural_indexer::index<128>(buf, len, parser, streaming);
}
} // namespace simdjson::haswell

View File

@ -1,5 +1,7 @@
#include "simdjson.h"
#include "isadetection.h"
#include "simdprune_tables.h"
#include <initializer_list>
// Static array of known implementations. We're hoping these get baked into the executable
@ -48,6 +50,9 @@ public:
WARN_UNUSED error_code parse(const uint8_t *, size_t, document::parser &) const noexcept final {
return UNSUPPORTED_ARCHITECTURE;
}
WARN_UNUSED error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final {
return UNSUPPORTED_ARCHITECTURE;
}
WARN_UNUSED error_code stage1(const uint8_t *, size_t, document::parser &, bool) const noexcept final {
return UNSUPPORTED_ARCHITECTURE;
}

View File

@ -1,478 +0,0 @@
#include "simdjson.h"
#include <cstdint>
#ifndef SIMDJSON_ISSUE384RESOLVED // to avoid tripping users
namespace simdjson {
static uint8_t jump_table[256 * 3] = {
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
};
size_t json_minify(const unsigned char *bytes, size_t how_many,
unsigned char *out) {
size_t i = 0, pos = 0;
uint8_t quote = 0;
uint8_t nonescape = 1;
while (i < how_many) {
unsigned char c = bytes[i];
uint8_t *meta = jump_table + 3 * c;
quote = quote ^ (meta[0] & nonescape);
out[pos] = c;
pos += meta[2] | quote;
i += 1;
nonescape = (~nonescape) | (meta[1]);
}
return pos;
}
} // namespace simdjson
#else
//
// This fast code is disabled.
// See issue https://github.com/lemire/simdjson/issues/384
//
#include "simdprune_tables.h"
#include <cstring>
#include <x86intrin.h> // currently, there is no runtime dispatch for the minifier
namespace simdjson {
// a straightforward comparison of a mask against input.
static uint64_t cmp_mask_against_input_mini(__m256i input_lo, __m256i input_hi,
__m256i mask) {
__m256i cmp_res_0 = _mm256_cmpeq_epi8(input_lo, mask);
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
__m256i cmp_res_1 = _mm256_cmpeq_epi8(input_hi, mask);
uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
return res_0 | (res_1 << 32);
}
// Write up to 16 bytes, only the bytes corresponding to a 1-bit are written
// out. credit: Anime Tosho
static __m128i skinnycleanm128(__m128i x, int mask) {
int mask1 = mask & 0xFF;
int mask2 = (mask >> 8) & 0xFF;
__m128i shufmask = _mm_castps_si128(
_mm_loadh_pi(_mm_castsi128_ps(_mm_loadl_epi64(
(const __m128i *)(thintable_epi8 + mask1))),
(const __m64 *)(thintable_epi8 + mask2)));
shufmask =
_mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
__m128i pruned = _mm_shuffle_epi8(x, shufmask);
intptr_t popx2 = BitsSetTable256mul2[mask1];
__m128i compactmask =
_mm_loadu_si128((const __m128i *)(pshufb_combine_table + popx2 * 8));
return _mm_shuffle_epi8(pruned, compactmask);
}
// take input from buf and remove useless whitespace, input and output can be
// the same, result is null terminated, return the string length (minus the null
// termination)
size_t json_minify(const uint8_t *buf, size_t len, uint8_t *out) {
// Useful constant masks
const uint64_t even_bits = 0x5555555555555555ULL;
const uint64_t odd_bits = ~even_bits;
uint8_t *initout(out);
uint64_t prev_iter_ends_odd_backslash =
0ULL; // either 0 or 1, but a 64-bit value
uint64_t prev_iter_inside_quote = 0ULL; // either all zeros or all ones
size_t idx = 0;
if (len >= 64) {
size_t avx_len = len - 63;
for (; idx < avx_len; idx += 64) {
__m256i input_lo =
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 0));
__m256i input_hi =
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 32));
uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi,
_mm256_set1_epi8('\\'));
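// The next few lines identify quotes preceded by an odd-length run of backslashes
// (and therefore escaped): add the backslash bitset to the run starts, separately
// for runs beginning at even and odd positions, and carry the parity across 64-byte
// blocks via prev_iter_ends_odd_backslash.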
uint64_t start_edges = bs_bits & ~(bs_bits << 1);
uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
uint64_t even_starts = start_edges & even_start_mask;
uint64_t odd_starts = start_edges & ~even_start_mask;
uint64_t even_carries = bs_bits + even_starts;
uint64_t odd_carries;
bool iter_ends_odd_backslash =
add_overflow(bs_bits, odd_starts, &odd_carries);
odd_carries |= prev_iter_ends_odd_backslash;
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
uint64_t even_carry_ends = even_carries & ~bs_bits;
uint64_t odd_carry_ends = odd_carries & ~bs_bits;
uint64_t even_start_odd_end = even_carry_ends & odd_bits;
uint64_t odd_start_even_end = odd_carry_ends & even_bits;
uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
uint64_t quote_bits = cmp_mask_against_input_mini(input_lo, input_hi,
_mm256_set1_epi8('"'));
quote_bits = quote_bits & ~odd_ends;
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
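// The carry-less multiplication above by an all-ones constant computes the prefix
// XOR of quote_bits, i.e. a mask whose bits are set between an (unescaped) opening
// quote and the matching closing quote.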
quote_mask ^= prev_iter_inside_quote;
prev_iter_inside_quote = static_cast<uint64_t>(
static_cast<int64_t>(quote_mask) >>
63); // right shift of a negative signed value is implementation-defined before
// C++20 (fully defined in C++20); ok in practice according to John Regehr
// (University of Utah)
const __m256i low_nibble_mask = _mm256_setr_epi8(
// 0 9 a b c d
16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0,
0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
const __m256i high_nibble_mask = _mm256_setr_epi8(
// 0 2 3 5 7
8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, 8, 0, 18, 4, 0, 1, 0,
1, 0, 0, 0, 3, 2, 1, 0, 0);
__m256i whitespace_shufti_mask = _mm256_set1_epi8(0x18);
__m256i v_lo = _mm256_and_si256(
_mm256_shuffle_epi8(low_nibble_mask, input_lo),
_mm256_shuffle_epi8(high_nibble_mask,
_mm256_and_si256(_mm256_srli_epi32(input_lo, 4),
_mm256_set1_epi8(0x7f))));
__m256i v_hi = _mm256_and_si256(
_mm256_shuffle_epi8(low_nibble_mask, input_hi),
_mm256_shuffle_epi8(high_nibble_mask,
_mm256_and_si256(_mm256_srli_epi32(input_hi, 4),
_mm256_set1_epi8(0x7f))));
__m256i tmp_ws_lo = _mm256_cmpeq_epi8(
_mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0));
__m256i tmp_ws_hi = _mm256_cmpeq_epi8(
_mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0));
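// A byte is JSON whitespace exactly when the AND of the two nibble lookups has one
// of the 0x18 bits set; the compare-with-zero therefore flags non-whitespace bytes,
// which is why the combined bitmask is negated below.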
uint64_t ws_res_0 =
static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));
whitespace &= ~quote_mask;
uint64_t non_whitespace = ~whitespace;
__m128i x1 = _mm256_extracti128_si256(input_lo, 0);
__m128i x2 = _mm256_extracti128_si256(input_lo, 1);
__m128i x3 = _mm256_extracti128_si256(input_hi, 0);
__m128i x4 = _mm256_extracti128_si256(input_hi, 1);
int mask1 = non_whitespace & 0xFFFF;
int mask2 = (non_whitespace >> 16) & 0xFFFF;
int mask3 = (non_whitespace >> 32) & 0xFFFF;
int mask4 = (non_whitespace >> 48) & 0xFFFF;
x1 = skinnycleanm128(x1, mask1);
x2 = skinnycleanm128(x2, mask2);
x3 = skinnycleanm128(x3, mask3);
x4 = skinnycleanm128(x4, mask4);
int pop1 = hamming(non_whitespace & 0xFFFF);
int pop2 = hamming(non_whitespace & UINT64_C(0xFFFFFFFF));
int pop3 = hamming(non_whitespace & UINT64_C(0xFFFFFFFFFFFF));
int pop4 = hamming(non_whitespace);
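// pop1..pop3 are cumulative counts of non-whitespace bytes in the preceding
// 16-byte lanes; using them as store offsets packs the compacted lanes tightly,
// with each later 16-byte store overwriting the insignificant tail bytes of the
// previous one.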
_mm_storeu_si128(reinterpret_cast<__m128i *>(out), x1);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop1), x2);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop2), x3);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop3), x4);
out += pop4;
}
}
// we finish off the job... copying and pasting the code is not ideal here,
// but it gets the job done.
if (idx < len) {
uint8_t buffer[64];
memset(buffer, 0, 64);
memcpy(buffer, buf + idx, len - idx);
__m256i input_lo =
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer));
__m256i input_hi =
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer + 32));
uint64_t bs_bits =
cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('\\'));
uint64_t start_edges = bs_bits & ~(bs_bits << 1);
uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
uint64_t even_starts = start_edges & even_start_mask;
uint64_t odd_starts = start_edges & ~even_start_mask;
uint64_t even_carries = bs_bits + even_starts;
uint64_t odd_carries;
// bool iter_ends_odd_backslash =
add_overflow(bs_bits, odd_starts, &odd_carries);
odd_carries |= prev_iter_ends_odd_backslash;
// prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
// // we never use it
uint64_t even_carry_ends = even_carries & ~bs_bits;
uint64_t odd_carry_ends = odd_carries & ~bs_bits;
uint64_t even_start_odd_end = even_carry_ends & odd_bits;
uint64_t odd_start_even_end = odd_carry_ends & even_bits;
uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
uint64_t quote_bits =
cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('"'));
quote_bits = quote_bits & ~odd_ends;
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
quote_mask ^= prev_iter_inside_quote;
// prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);// we
// don't need this anymore
__m256i mask_20 = _mm256_set1_epi8(0x20); // c==32
__m256i mask_70 =
_mm256_set1_epi8(0x70); // saturated addition of 0x70 keeps the low 4 bits usable
// as a pshufb index for values < 0x10, and pushes any value >= 0x10
// to 0x80 or above, where pshufb returns zero
__m256i lut_cntrl = _mm256_setr_epi8(
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0x00,
0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00);
__m256i tmp_ws_lo = _mm256_or_si256(
_mm256_cmpeq_epi8(mask_20, input_lo),
_mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_lo)));
__m256i tmp_ws_hi = _mm256_or_si256(
_mm256_cmpeq_epi8(mask_20, input_hi),
_mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_hi)));
uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
uint64_t whitespace = (ws_res_0 | (ws_res_1 << 32));
whitespace &= ~quote_mask;
if (len - idx < 64) {
whitespace |= UINT64_C(0xFFFFFFFFFFFFFFFF) << (len - idx);
}
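// the check above forces the bytes of the padded 64-byte buffer that lie beyond the
// end of the input to count as whitespace, so the padding never reaches the output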
uint64_t non_whitespace = ~whitespace;
// the tail block needs its own 128-bit lanes: the x1..x4 from the main loop are
// out of scope here
__m128i x1 = _mm256_extracti128_si256(input_lo, 0);
__m128i x2 = _mm256_extracti128_si256(input_lo, 1);
__m128i x3 = _mm256_extracti128_si256(input_hi, 0);
__m128i x4 = _mm256_extracti128_si256(input_hi, 1);
int mask1 = non_whitespace & 0xFFFF;
int mask2 = (non_whitespace >> 16) & 0xFFFF;
int mask3 = (non_whitespace >> 32) & 0xFFFF;
int mask4 = (non_whitespace >> 48) & 0xFFFF;
x1 = skinnycleanm128(x1, mask1);
x2 = skinnycleanm128(x2, mask2);
x3 = skinnycleanm128(x3, mask3);
x4 = skinnycleanm128(x4, mask4);
int pop1 = hamming(non_whitespace & 0xFFFF);
int pop2 = hamming(non_whitespace & UINT64_C(0xFFFFFFFF));
int pop3 = hamming(non_whitespace & UINT64_C(0xFFFFFFFFFFFF));
int pop4 = hamming(non_whitespace);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out), x1);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop1), x2);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop2), x3);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop3), x4);
out += pop4;
}
*out = '\0'; // NULL termination
return out - initout;
}
size_t oldjson_minify(const uint8_t *buf, size_t len, uint8_t *out) {
// Useful constant masks
const uint64_t even_bits = 0x5555555555555555ULL;
const uint64_t odd_bits = ~even_bits;
uint8_t *initout(out);
uint64_t prev_iter_ends_odd_backslash =
0ULL; // either 0 or 1, but a 64-bit value
uint64_t prev_iter_inside_quote = 0ULL; // either all zeros or all ones
size_t idx = 0;
if (len >= 64) {
size_t avx_len = len - 63;
for (; idx < avx_len; idx += 64) {
__m256i input_lo =
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 0));
__m256i input_hi =
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + idx + 32));
uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi,
_mm256_set1_epi8('\\'));
uint64_t start_edges = bs_bits & ~(bs_bits << 1);
uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
uint64_t even_starts = start_edges & even_start_mask;
uint64_t odd_starts = start_edges & ~even_start_mask;
uint64_t even_carries = bs_bits + even_starts;
uint64_t odd_carries;
bool iter_ends_odd_backslash =
add_overflow(bs_bits, odd_starts, &odd_carries);
odd_carries |= prev_iter_ends_odd_backslash;
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
uint64_t even_carry_ends = even_carries & ~bs_bits;
uint64_t odd_carry_ends = odd_carries & ~bs_bits;
uint64_t even_start_odd_end = even_carry_ends & odd_bits;
uint64_t odd_start_even_end = odd_carry_ends & even_bits;
uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
uint64_t quote_bits = cmp_mask_against_input_mini(input_lo, input_hi,
_mm256_set1_epi8('"'));
quote_bits = quote_bits & ~odd_ends;
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
quote_mask ^= prev_iter_inside_quote;
prev_iter_inside_quote = static_cast<uint64_t>(
static_cast<int64_t>(quote_mask) >>
63); // right shift of a negative signed value is implementation-defined before
// C++20 (fully defined in C++20); ok in practice according to John Regehr
// (University of Utah)
const __m256i low_nibble_mask = _mm256_setr_epi8(
// 0 9 a b c d
16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, 16, 0, 0, 0, 0, 0,
0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
const __m256i high_nibble_mask = _mm256_setr_epi8(
// 0 2 3 5 7
8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, 8, 0, 18, 4, 0, 1, 0,
1, 0, 0, 0, 3, 2, 1, 0, 0);
__m256i whitespace_shufti_mask = _mm256_set1_epi8(0x18);
__m256i v_lo = _mm256_and_si256(
_mm256_shuffle_epi8(low_nibble_mask, input_lo),
_mm256_shuffle_epi8(high_nibble_mask,
_mm256_and_si256(_mm256_srli_epi32(input_lo, 4),
_mm256_set1_epi8(0x7f))));
__m256i v_hi = _mm256_and_si256(
_mm256_shuffle_epi8(low_nibble_mask, input_hi),
_mm256_shuffle_epi8(high_nibble_mask,
_mm256_and_si256(_mm256_srli_epi32(input_hi, 4),
_mm256_set1_epi8(0x7f))));
__m256i tmp_ws_lo = _mm256_cmpeq_epi8(
_mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0));
__m256i tmp_ws_hi = _mm256_cmpeq_epi8(
_mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0));
uint64_t ws_res_0 =
static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));
whitespace &= ~quote_mask;
int mask1 = whitespace & 0xFFFF;
int mask2 = (whitespace >> 16) & 0xFFFF;
int mask3 = (whitespace >> 32) & 0xFFFF;
int mask4 = (whitespace >> 48) & 0xFFFF;
int pop1 = hamming((~whitespace) & 0xFFFF);
int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF));
int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
int pop4 = hamming((~whitespace));
__m128i x1 = _mm256_extracti128_si256(input_lo, 0);
__m128i x2 = _mm256_extracti128_si256(input_lo, 1);
__m128i x3 = _mm256_extracti128_si256(input_hi, 0);
__m128i x4 = _mm256_extracti128_si256(input_hi, 1);
x1 = skinnycleanm128(x1, mask1);
x2 = skinnycleanm128(x2, mask2);
x3 = skinnycleanm128(x3, mask3);
x4 = skinnycleanm128(x4, mask4);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out), x1);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop1), x2);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop2), x3);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + pop3), x4);
out += pop4;
}
}
// we finish off the job... copying and pasting the code is not ideal here,
// but it gets the job done.
if (idx < len) {
uint8_t buffer[64];
memset(buffer, 0, 64);
memcpy(buffer, buf + idx, len - idx);
__m256i input_lo =
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer));
__m256i input_hi =
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer + 32));
uint64_t bs_bits =
cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('\\'));
uint64_t start_edges = bs_bits & ~(bs_bits << 1);
uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
uint64_t even_starts = start_edges & even_start_mask;
uint64_t odd_starts = start_edges & ~even_start_mask;
uint64_t even_carries = bs_bits + even_starts;
uint64_t odd_carries;
// bool iter_ends_odd_backslash =
add_overflow(bs_bits, odd_starts, &odd_carries);
odd_carries |= prev_iter_ends_odd_backslash;
// prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
// // we never use it
uint64_t even_carry_ends = even_carries & ~bs_bits;
uint64_t odd_carry_ends = odd_carries & ~bs_bits;
uint64_t even_start_odd_end = even_carry_ends & odd_bits;
uint64_t odd_start_even_end = odd_carry_ends & even_bits;
uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
uint64_t quote_bits =
cmp_mask_against_input_mini(input_lo, input_hi, _mm256_set1_epi8('"'));
quote_bits = quote_bits & ~odd_ends;
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
quote_mask ^= prev_iter_inside_quote;
// prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);// we
// don't need this anymore
__m256i mask_20 = _mm256_set1_epi8(0x20); // c==32
__m256i mask_70 =
_mm256_set1_epi8(0x70); // saturated addition of 0x70 keeps the low 4 bits usable
// as a pshufb index for values < 0x10, and pushes any value >= 0x10
// to 0x80 or above, where pshufb returns zero
__m256i lut_cntrl = _mm256_setr_epi8(
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0x00,
0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00);
__m256i tmp_ws_lo = _mm256_or_si256(
_mm256_cmpeq_epi8(mask_20, input_lo),
_mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_lo)));
__m256i tmp_ws_hi = _mm256_or_si256(
_mm256_cmpeq_epi8(mask_20, input_hi),
_mm256_shuffle_epi8(lut_cntrl, _mm256_adds_epu8(mask_70, input_hi)));
uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
uint64_t whitespace = (ws_res_0 | (ws_res_1 << 32));
whitespace &= ~quote_mask;
if (len - idx < 64) {
whitespace |= UINT64_C(0xFFFFFFFFFFFFFFFF) << (len - idx);
}
int mask1 = whitespace & 0xFFFF;
int mask2 = (whitespace >> 16) & 0xFFFF;
int mask3 = (whitespace >> 32) & 0xFFFF;
int mask4 = (whitespace >> 48) & 0xFFFF;
int pop1 = hamming((~whitespace) & 0xFFFF);
int pop2 = hamming((~whitespace) & UINT64_C(0xFFFFFFFF));
int pop3 = hamming((~whitespace) & UINT64_C(0xFFFFFFFFFFFF));
int pop4 = hamming((~whitespace));
__m128i x1 = _mm256_extracti128_si256(input_lo, 0);
__m128i x2 = _mm256_extracti128_si256(input_lo, 1);
__m128i x3 = _mm256_extracti128_si256(input_hi, 0);
__m128i x4 = _mm256_extracti128_si256(input_hi, 1);
x1 = skinnycleanm128(x1, mask1);
x2 = skinnycleanm128(x2, mask2);
x3 = skinnycleanm128(x3, mask3);
x4 = skinnycleanm128(x4, mask4);
_mm_storeu_si128(reinterpret_cast<__m128i *>(buffer), x1);
_mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + pop1), x2);
_mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + pop2), x3);
_mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + pop3), x4);
memcpy(out, buffer, pop4);
out += pop4;
}
*out = '\0'; // NULL termination
return out - initout;
}
} // namespace simdjson
#endif

View File

@ -1,5 +1,4 @@
#include "simdjson.h"
#include "implementation.cpp"
#include "jsonminifier.cpp"
#include "stage1_find_marks.cpp"
#include "stage2_build_tape.cpp"

View File

@ -46,7 +46,7 @@ really_inline int leading_zeroes(uint64_t input_num) {
#endif// _MSC_VER
}
really_inline int hamming(uint64_t input_num) {
really_inline int count_ones(uint64_t input_num) {
#ifdef _MSC_VER
// note: we do not support legacy 32-bit Windows
return __popcnt64(input_num);// Visual Studio wants two underscores

View File

@ -11,6 +11,7 @@ class implementation final : public simdjson::implementation {
public:
really_inline implementation() : simdjson::implementation("westmere", "Intel/AMD SSE4.2", instruction_set::SSE42 | instruction_set::PCLMULQDQ) {}
WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept final;
WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser) const noexcept final;
WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, document::parser &parser, size_t &next_json) const noexcept final;

View File

@ -2,8 +2,12 @@
#define SIMDJSON_WESTMERE_SIMD_H
#include "simdjson.h"
#include "simdprune_tables.h"
#include "westmere/bitmanipulation.h"
#include "westmere/intrinsics.h"
TARGET_WESTMERE
namespace simdjson::westmere::simd {
@ -106,6 +110,42 @@ namespace simdjson::westmere::simd {
really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
return _mm_shuffle_epi8(lookup_table, *this);
}
// Copies to 'output' all bytes corresponding to a 0 in the mask (interpreted as a bitset).
// Passing a 0 value for mask would be equivalent to writing out every byte to output.
// Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes
// get written.
// Design consideration: it seems like a function with the
// signature simd8<L> compress(uint32_t mask) would be
// sensible, but the AVX ISA makes this kind of approach difficult.
template<typename L>
really_inline void compress(uint16_t mask, L * output) const {
// this particular implementation was inspired by work done by @animetosho
// we do it in two steps, first 8 bytes and then second 8 bytes
uint8_t mask1 = static_cast<uint8_t>(mask); // least significant 8 bits
uint8_t mask2 = static_cast<uint8_t>(mask >> 8); // most significant 8 bits
// next line just loads the 64-bit values thintable_epi8[mask1] and
// thintable_epi8[mask2] into a 128-bit register, using only
// two instructions on most compilers.
__m128i shufmask = _mm_set_epi64x(thintable_epi8[mask2], thintable_epi8[mask1]);
// we add 0x08 to the second half of the shuffle mask so it indexes the upper 8 input bytes
shufmask =
_mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
// this gives the "nearly pruned" version: each 8-byte half is compacted independently
__m128i pruned = _mm_shuffle_epi8(*this, shufmask);
// we still need to put the two halves together.
// we fetch twice the popcount of the first half (multiplied by 8 below, it selects a 16-byte row):
int pop1 = BitsSetTable256mul2[mask1];
// then load the corresponding combine mask: it keeps the significant (non-pruned)
// bytes from the first 8-byte half and places the bytes from the second 8-byte half
// immediately after them (plus some padding at the end).
__m128i compactmask =
_mm_loadu_si128((const __m128i *)(pshufb_combine_table + pop1 * 8));
__m128i answer = _mm_shuffle_epi8(pruned, compactmask);
_mm_storeu_si128(( __m128i *)(output), answer);
}
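// For intuition, here is a stand-alone scalar sketch of what compress() computes for
// one 16-byte lane (illustrative only, not part of simdjson; the name
// compress16_scalar is made up): bytes whose mask bit is 0 are kept, in order, and
// only the first 16 - count_ones(mask) output bytes are significant.
//
// #include <cstddef>
// #include <cstdint>
// inline void compress16_scalar(const uint8_t in[16], uint16_t mask, uint8_t out[16]) {
//   size_t pos = 0;
//   for (int i = 0; i < 16; i++) {
//     if (((mask >> i) & 1) == 0) { out[pos++] = in[i]; } // keep bytes at 0-bit positions
//   }
// }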
template<typename L>
really_inline simd8<L> lookup_16(
L replace0, L replace1, L replace2, L replace3,
@ -235,6 +275,13 @@ namespace simdjson::westmere::simd {
this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
}
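// Compresses the four 16-byte chunks according to the 64-bit mask (0-bit bytes are
// kept). Each chunk writes a full 16 bytes but only its kept bytes are significant,
// so each destination is advanced by 16 minus the number of 1-bits consumed so far;
// later stores overwrite the slack.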
really_inline void compress(uint64_t mask, T * output) const {
this->chunks[0].compress(mask, output);
this->chunks[1].compress(mask >> 16, output + 16 - count_ones(mask & 0xFFFF));
this->chunks[2].compress(mask >> 32, output + 32 - count_ones(mask & 0xFFFFFFFF));
this->chunks[3].compress(mask >> 48, output + 48 - count_ones(mask & 0xFFFFFFFFFFFF));
}
template <typename F>
static really_inline void each_index(F const& each) {
each(0);
@ -302,7 +349,6 @@ namespace simdjson::westmere::simd {
const simd8<T> mask = simd8<T>::splat(m);
return this->map( [&](auto a) { return a <= mask; } ).to_bitmask();
}
}; // struct simd8x64<T>
} // namespace simdjson::westmere::simd

View File

@ -12,23 +12,37 @@ namespace simdjson::westmere {
using namespace simd;
really_inline void find_whitespace_and_operators(
const simd8x64<uint8_t> in,
uint64_t &whitespace, uint64_t &op) {
struct json_character_block {
static really_inline json_character_block classify(const simd::simd8x64<uint8_t> in);
really_inline uint64_t whitespace() const { return _whitespace; }
really_inline uint64_t op() const { return _op; }
really_inline uint64_t scalar() { return ~(op() | whitespace()); }
uint64_t _whitespace;
uint64_t _op;
};
really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t> in) {
// These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why
// we can't use the generic lookup_16.
auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
auto op_table = simd8<uint8_t>::repeat_16(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{');
whitespace = in.map([&](simd8<uint8_t> _in) {
// We compute whitespace and op separately. If the code later only uses one or the
// other, then, given that all functions are aggressively inlined, we can hope that
// the useless computation will be omitted. This is notably the case when minifying
// (we only need whitespace).
uint64_t whitespace = in.map([&](simd8<uint8_t> _in) {
return _in == simd8<uint8_t>(_mm_shuffle_epi8(whitespace_table, _in));
}).to_bitmask();
op = in.map([&](simd8<uint8_t> _in) {
uint64_t op = in.map([&](simd8<uint8_t> _in) {
// | 32 handles the fact that { } and [ ] are exactly 32 bytes apart
return (_in | 32) == simd8<uint8_t>(_mm_shuffle_epi8(op_table, _in-','));
}).to_bitmask();
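// for example, '[' is 0x5B and ']' is 0x5D; OR-ing with 0x20 maps them to '{' (0x7B)
// and '}' (0x7D), so the 16-entry table only needs entries for , : { and }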
return { whitespace, op };
}
really_inline bool is_ascii(simd8x64<uint8_t> input) {
@ -44,11 +58,19 @@ really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8
return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
}
#include "generic/utf8_lookup2_algorithm.h"
#include "generic/stage1_find_marks.h"
#include "generic/buf_block_reader.h"
#include "generic/json_string_scanner.h"
#include "generic/json_scanner.h"
#include "generic/json_minifier.h"
WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
}
#include "generic/utf8_lookup2_algorithm.h"
#include "generic/json_structural_indexer.h"
WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept {
return westmere::stage1::find_structural_bits<64>(buf, len, parser, streaming);
return westmere::stage1::json_structural_indexer::index<64>(buf, len, parser, streaming);
}
} // namespace simdjson::westmere

View File

@ -19,7 +19,7 @@ add_cpp_test(parse_many_test)
add_cpp_test(pointercheck)
add_cpp_test(integer_tests)
target_compile_definitions(basictests PRIVATE JSON_TEST_PATH="${PROJECT_SOURCE_DIR}/jsonexamples/twitter.json")
target_compile_definitions(basictests PRIVATE JSON_TEST_PATH="${PROJECT_SOURCE_DIR}/jsonexamples/twitter.json" NDJSON_TEST_PATH="${PROJECT_SOURCE_DIR}/jsonexamples/amazon_cellphones.ndjson")
target_compile_definitions(errortests PRIVATE JSON_TEST_PATH="${PROJECT_SOURCE_DIR}/jsonexamples/twitter.json")
## This causes problems

File diff suppressed because it is too large

View File

@ -101,6 +101,14 @@ bool validate(const char *dirname) {
printf("size of file in bytes: %zu \n", json.size());
everything_fine = false;
}
if(!error) {
// issue 570, we just want to check for segfault
simdjson::document::parser parser;
for (const simdjson::document &doc : parser.load_many(fullpath)) {
auto iter = simdjson::document::iterator(doc);
//do something
}
}
free(fullpath);
}
}

View File

@ -6,8 +6,7 @@ using namespace simdjson;
void document_parse_error_code() {
cout << __func__ << endl;
string json("[ 1, 2, 3 ]");
auto [doc, error] = document::parse(json);
auto [doc, error] = document::parse("[ 1, 2, 3 ]"_padded);
if (error) { cerr << "Error: " << error << endl; exit(1); }
cout << doc << endl;
}
@ -31,7 +30,7 @@ void parser_parse_many_error_code() {
cout << __func__ << endl;
// Read files with the parser
padded_string json = string("[1, 2, 3] true [ true, false ]");
auto json = "[1, 2, 3] true [ true, false ]"_padded;
cout << "Parsing " << json.data() << " ..." << endl;
document::parser parser;
for (auto [doc, error] : parser.parse_many(json)) {
@ -41,6 +40,8 @@ void parser_parse_many_error_code() {
}
void parser_parse_max_capacity() {
cout << __func__ << endl;
int argc = 2;
padded_string argv[] { string("[1,2,3]"), string("true") };
document::parser parser(1024*1024); // Set max capacity to 1MB
@ -53,6 +54,8 @@ void parser_parse_max_capacity() {
}
void parser_parse_fixed_capacity() {
cout << __func__ << endl;
int argc = 2;
padded_string argv[] { string("[1,2,3]"), string("true") };
document::parser parser(0); // This parser is not allowed to auto-allocate
@ -71,14 +74,13 @@ void parser_parse_fixed_capacity() {
void document_parse_exception() {
cout << __func__ << endl;
string json("[ 1, 2, 3 ]");
cout << document::parse(json) << endl;
cout << document::parse("[ 1, 2, 3 ]"_padded) << endl;
}
void document_parse_padded_string() {
cout << __func__ << endl;
padded_string json(string("[ 1, 2, 3 ]"));
auto json = "[ 1, 2, 3 ]"_padded;
cout << document::parse(json) << endl;
}
@ -112,7 +114,7 @@ void parser_parse_many_exception() {
cout << __func__ << endl;
// Read files with the parser
padded_string json = string("[1, 2, 3] true [ true, false ]");
auto json = "[1, 2, 3] true [ true, false ]"_padded;
cout << "Parsing " << json.data() << " ..." << endl;
document::parser parser;
for (const document &doc : parser.parse_many(json)) {

View File

@ -1,18 +1,90 @@
#include <iostream>
#ifndef _MSC_VER
#include <dirent.h>
#include <unistd.h>
#endif
#include "simdjson.h"
int main(int argc, char *argv[]) {
if (argc != 2) {
std::cerr << "Usage: " << argv[0] << " <jsonfile>\n";
exit(1);
// Stash the exe_name in main() for functions to use
char* exe_name;
void print_usage(std::ostream& out) {
out << "Usage: " << exe_name << " [-a ARCH] <jsonfile>" << std::endl;
out << std::endl;
out << "Runs the parser against the given json files in a loop, measuring speed and other statistics." << std::endl;
out << std::endl;
out << "Options:" << std::endl;
out << std::endl;
out << "-a IMPL - Use the given parser implementation. By default, detects the most advanced" << std::endl;
out << " implementation supported on the host machine." << std::endl;
for (auto impl : simdjson::available_implementations) {
out << "-a " << std::left << std::setw(9) << impl->name() << " - Use the " << impl->description() << " parser implementation." << std::endl;
}
std::string filename = argv[argc - 1];
}
void exit_usage(std::string message) {
std::cerr << message << std::endl;
std::cerr << std::endl;
print_usage(std::cerr);
exit(EXIT_FAILURE);
}
struct option_struct {
char* filename;
option_struct(int argc, char **argv) {
#ifndef _MSC_VER
int c;
while ((c = getopt(argc, argv, "a:")) != -1) {
switch (c) {
case 'a': {
const simdjson::implementation *impl = simdjson::available_implementations[optarg];
if (!impl) {
std::string exit_message = std::string("Unsupported option value -a ") + optarg + ": expected -a with one of ";
for (auto imple : simdjson::available_implementations) {
exit_message += imple->name();
exit_message += " ";
}
exit_usage(exit_message);
}
simdjson::active_implementation = impl;
break;
}
default:
// reaching here means an argument was given to getopt() which did not have a case label
exit_usage("Unexpected argument - missing case for option "+
std::string(1,static_cast<char>(c))+
" (programming error)");
}
}
#else
int optind = 1;
#endif
// All remaining arguments are considered to be files
if(optind + 1 == argc) {
filename = argv[optind];
} else {
exit_usage("Please specify exactly one input file.");
}
}
};
int main(int argc, char *argv[]) {
exe_name = argv[0];
option_struct options(argc, argv);
std::string filename = options.filename;
auto [p, error] = simdjson::padded_string::load(filename);
if (error) {
std::cerr << "Could not load the file " << filename << std::endl;
return EXIT_FAILURE;
}
simdjson::json_minify(p, p.data());
printf("%s", p.data());
simdjson::padded_string copy(p.length());
size_t copy_len;
error = simdjson::active_implementation->minify((const uint8_t*)p.data(), p.length(), (uint8_t*)copy.data(), copy_len);
if (error) { std::cerr << error << std::endl; return 1; }
printf("%s", copy.data());
}