From 861a6a17e4ab470148c30de5c9c032dc5832e0e2 Mon Sep 17 00:00:00 2001
From: ioioioio <iodadi@gmail.com>
Date: Wed, 3 Jul 2019 17:15:21 -0400
Subject: [PATCH 1/9] SSE implementation integrated

---
 include/simdjson/numberparsing.h     |   2 +-
 include/simdjson/portability.h       |   2 +-
 include/simdjson/simdutf8check.h     | 315 ++++++++++++++++++++++++---
 include/simdjson/stage1_find_marks.h | 270 ++++++++++++++++++++---
 include/simdjson/stringparsing.h     |  26 ++-
 src/jsonparser.cpp                   |  12 +-
 6 files changed, 565 insertions(+), 62 deletions(-)
diff --git a/include/simdjson/numberparsing.h b/include/simdjson/numberparsing.h
index aba966e0..82787ea4 100644
--- a/include/simdjson/numberparsing.h
+++ b/include/simdjson/numberparsing.h
@@ -114,7 +114,7 @@ is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) {
   return structural_or_whitespace_or_exponent_or_decimal_negated[c];
 }
 
-#ifdef __AVX2__
+#if defined (__AVX2__) || defined (__SSE4_2__)
 #define SWAR_NUMBER_PARSING
 #endif
 
diff --git a/include/simdjson/portability.h b/include/simdjson/portability.h
index 2069cf72..ec5a409e 100644
--- a/include/simdjson/portability.h
+++ b/include/simdjson/portability.h
@@ -40,7 +40,7 @@ static inline int hamming(uint64_t input_num) {
 #include <cstdint>
 #include <cstdlib>
 
-#if defined(__BMI2__) || defined(__POPCOUNT__) || defined(__AVX2__)
+#if defined(__BMI2__) || defined(__POPCOUNT__) || defined(__AVX2__) || defined(__SSE4_2__)
 #include <x86intrin.h>
 #endif
 namespace simdjson {
diff --git a/include/simdjson/simdutf8check.h b/include/simdjson/simdutf8check.h
index fe198991..ecbc235b 100644
--- a/include/simdjson/simdutf8check.h
+++ b/include/simdjson/simdutf8check.h
@@ -1,12 +1,12 @@
-
-#ifndef SIMDJSON_SIMDUTF8CHECK_H
-#define SIMDJSON_SIMDUTF8CHECK_H
-
-
+#ifndef SIMDUTF8CHECK_H
+#define SIMDUTF8CHECK_H
+#include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
 #include <string.h>
-#include "simdjson/portability.h"
+#include <x86intrin.h>
+
+namespace simdjson {
 /*
  * legal utf-8 byte sequence
  * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
@@ -25,9 +25,171 @@
  */
 
 // all byte values must be no larger than 0xF4
+static inline void checkSmallerThan0xF4(__m128i current_bytes,
+                                        __m128i *has_error) {
+  // unsigned, saturates to 0 below max
+  *has_error = _mm_or_si128(*has_error,
+                            _mm_subs_epu8(current_bytes, _mm_set1_epi8(0xF4)));
+}
+
+static inline __m128i continuationLengths(__m128i high_nibbles) {
+  return _mm_shuffle_epi8(
+      _mm_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII)
+                    0, 0, 0, 0,             // 10xx (continuation)
+                    2, 2,                   // 110x
+                    3,                      // 1110
+                    4), // 1111, next should be 0 (not checked here)
+      high_nibbles);
+}
+
+static inline __m128i carryContinuations(__m128i initial_lengths,
+                                         __m128i previous_carries) {
+
+  __m128i right1 =
+      _mm_subs_epu8(_mm_alignr_epi8(initial_lengths, previous_carries, 16 - 1),
+                    _mm_set1_epi8(1));
+  __m128i sum = _mm_add_epi8(initial_lengths, right1);
+
+  __m128i right2 = _mm_subs_epu8(_mm_alignr_epi8(sum, previous_carries, 16 - 2),
+                                 _mm_set1_epi8(2));
+  return _mm_add_epi8(sum, right2);
+}
+
+static inline void checkContinuations(__m128i initial_lengths, __m128i carries,
+                                      __m128i *has_error) {
+
+  // overlap || underlap
+  // carry > length && length > 0 || !(carry > length) && !(length > 0)
+  // (carries > length) == (lengths > 0)
+  __m128i overunder =
+      _mm_cmpeq_epi8(_mm_cmpgt_epi8(carries, initial_lengths),
+                     _mm_cmpgt_epi8(initial_lengths, _mm_setzero_si128()));
+
+  *has_error = _mm_or_si128(*has_error, overunder);
+}
+
+// when 0xED is found, next byte must be no larger than 0x9F
+// when 0xF4 is found, next byte must be no larger than 0x8F
+// next byte must be continuation, ie sign bit is set, so signed < is ok
+static inline void checkFirstContinuationMax(__m128i current_bytes,
+                                             __m128i off1_current_bytes,
+                                             __m128i *has_error) {
+  __m128i maskED = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xED));
+  __m128i maskF4 = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xF4));
+
+  __m128i badfollowED =
+      _mm_and_si128(_mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x9F)), maskED);
+  __m128i badfollowF4 =
+      _mm_and_si128(_mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x8F)), maskF4);
+
+  *has_error = _mm_or_si128(*has_error, _mm_or_si128(badfollowED, badfollowF4));
+}
+
+// map off1_hibits => error condition
+// hibits     off1    cur
+// C       => < C2 && true
+// E       => < E1 && < A0
+// F       => < F1 && < 90
+// else      false && false
+static inline void checkOverlong(__m128i current_bytes,
+                                 __m128i off1_current_bytes, __m128i hibits,
+                                 __m128i previous_hibits, __m128i *has_error) {
+  __m128i off1_hibits = _mm_alignr_epi8(hibits, previous_hibits, 16 - 1);
+  __m128i initial_mins = _mm_shuffle_epi8(
+      _mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+                    -128, -128, // 10xx => false
+                    0xC2, -128, // 110x
+                    0xE1,       // 1110
+                    0xF1),
+      off1_hibits);
+
+  __m128i initial_under = _mm_cmpgt_epi8(initial_mins, off1_current_bytes);
+
+  __m128i second_mins = _mm_shuffle_epi8(
+      _mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+                    -128, -128, // 10xx => false
+                    127, 127,   // 110x => true
+                    0xA0,       // 1110
+                    0x90),
+      off1_hibits);
+  __m128i second_under = _mm_cmpgt_epi8(second_mins, current_bytes);
+  *has_error =
+      _mm_or_si128(*has_error, _mm_and_si128(initial_under, second_under));
+}
+
+struct processed_utf_bytes {
+  __m128i rawbytes;
+  __m128i high_nibbles;
+  __m128i carried_continuations;
+};
+
+static inline void count_nibbles(__m128i bytes,
+                                 struct processed_utf_bytes *answer) {
+  answer->rawbytes = bytes;
+  answer->high_nibbles =
+      _mm_and_si128(_mm_srli_epi16(bytes, 4), _mm_set1_epi8(0x0F));
+}
+
+// check whether the current bytes are valid UTF-8
+// at the end of the function, previous gets updated
+static struct processed_utf_bytes
+checkUTF8Bytes(__m128i current_bytes, struct processed_utf_bytes *previous,
+               __m128i *has_error) {
+  struct processed_utf_bytes pb;
+  count_nibbles(current_bytes, &pb);
+
+  checkSmallerThan0xF4(current_bytes, has_error);
+
+  __m128i initial_lengths = continuationLengths(pb.high_nibbles);
+
+  pb.carried_continuations =
+      carryContinuations(initial_lengths, previous->carried_continuations);
+
+  checkContinuations(initial_lengths, pb.carried_continuations, has_error);
+
+  __m128i off1_current_bytes =
+      _mm_alignr_epi8(pb.rawbytes, previous->rawbytes, 16 - 1);
+  checkFirstContinuationMax(current_bytes, off1_current_bytes, has_error);
+
+  checkOverlong(current_bytes, off1_current_bytes, pb.high_nibbles,
+                previous->high_nibbles, has_error);
+  return pb;
+}
+
+static bool validate_utf8_fast(const char *src, size_t len) {
+  size_t i = 0;
+  __m128i has_error = _mm_setzero_si128();
+  struct processed_utf_bytes previous = {.rawbytes = _mm_setzero_si128(),
+                                         .high_nibbles = _mm_setzero_si128(),
+                                         .carried_continuations =
+                                             _mm_setzero_si128()};
+  if (len >= 16) {
+    for (; i <= len - 16; i += 16) {
+      __m128i current_bytes = _mm_loadu_si128((const __m128i *)(src + i));
+      previous = checkUTF8Bytes(current_bytes, &previous, &has_error);
+    }
+  }
+
+  // last part
+  if (i < len) {
+    char buffer[16];
+    memset(buffer, 0, 16);
+    memcpy(buffer, src + i, len - i);
+    __m128i current_bytes = _mm_loadu_si128((const __m128i *)(buffer));
+    previous = checkUTF8Bytes(current_bytes, &previous, &has_error);
+  } else {
+    has_error =
+        _mm_or_si128(_mm_cmpgt_epi8(previous.carried_continuations,
+                                    _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+                                                  9, 9, 9, 9, 9, 1)),
+                     has_error);
+  }
+
+  return _mm_testz_si128(has_error, has_error);
+}
 
-namespace simdjson {
 #ifdef __AVX2__
+
 /*****************************/
 static inline __m256i push_last_byte_of_a_to_b(__m256i a, __m256i b) {
   return _mm256_alignr_epi8(b, _mm256_permute2x128_si256(a, b, 0x21), 15);
@@ -119,31 +281,29 @@ static inline void avxcheckOverlong(__m256i current_bytes,
                                     __m256i *has_error) {
   __m256i off1_hibits = push_last_byte_of_a_to_b(previous_hibits, hibits);
   __m256i initial_mins = _mm256_shuffle_epi8(
-      _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128,
+      _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128,
+                       -128, -128, -128, // 10xx => false
+                       0xC2, -128,       // 110x
+                       0xE1,             // 1110
+                       0xF1, -128, -128, -128, -128, -128, -128, -128, -128,
                        -128, -128, -128, -128, // 10xx => false
                        0xC2, -128,             // 110x
                        0xE1,                   // 1110
-                       0xF1,                   // 1111
-                       -128, -128, -128, -128, -128, -128, -128, -128,
-                       -128, -128, -128, -128, // 10xx => false
-                       0xC2, -128,             // 110x
-                       0xE1,                   // 1110
-                       0xF1),                  // 1111
+                       0xF1),
       off1_hibits);
 
   __m256i initial_under = _mm256_cmpgt_epi8(initial_mins, off1_current_bytes);
 
   __m256i second_mins = _mm256_shuffle_epi8(
-      _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128,
+      _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128,
+                       -128, -128, -128, // 10xx => false
+                       127, 127,         // 110x => true
+                       0xA0,             // 1110
+                       0x90, -128, -128, -128, -128, -128, -128, -128, -128,
                        -128, -128, -128, -128, // 10xx => false
                        127, 127,               // 110x => true
                        0xA0,                   // 1110
-                       0x90,                   // 1111
-                       -128, -128, -128, -128, -128, -128, -128, -128,
-                       -128, -128, -128, -128, // 10xx => false
-                       127, 127,               // 110x => true
-                       0xA0,                   // 1110
-                       0x90),                  // 1111
+                       0x90),
       off1_hibits);
   __m256i second_under = _mm256_cmpgt_epi8(second_mins, current_bytes);
   *has_error = _mm256_or_si256(*has_error,
@@ -165,11 +325,11 @@ static inline void avx_count_nibbles(__m256i bytes,
 
 // check whether the current bytes are valid UTF-8
 // at the end of the function, previous gets updated
-static inline struct avx_processed_utf_bytes
+static struct avx_processed_utf_bytes
 avxcheckUTF8Bytes(__m256i current_bytes,
                   struct avx_processed_utf_bytes *previous,
                   __m256i *has_error) {
-  struct avx_processed_utf_bytes pb{};
+  struct avx_processed_utf_bytes pb;
   avx_count_nibbles(current_bytes, &pb);
 
   avxcheckSmallerThan0xF4(current_bytes, has_error);
@@ -190,8 +350,111 @@ avxcheckUTF8Bytes(__m256i current_bytes,
   return pb;
 }
 
-#else // __AVX2__
-#warning "We require AVX2 support!"
+// check whether the current bytes are valid UTF-8
+// at the end of the function, previous gets updated
+static struct avx_processed_utf_bytes
+avxcheckUTF8Bytes_asciipath(__m256i current_bytes,
+                            struct avx_processed_utf_bytes *previous,
+                            __m256i *has_error) {
+  if (_mm256_testz_si256(current_bytes,
+                         _mm256_set1_epi8(0x80))) { // fast ascii path
+    *has_error = _mm256_or_si256(
+        _mm256_cmpgt_epi8(previous->carried_continuations,
+                          _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+                                           9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+                                           9, 9, 9, 9, 9, 9, 9, 1)),
+        *has_error);
+    return *previous;
+  }
+
+  struct avx_processed_utf_bytes pb;
+  avx_count_nibbles(current_bytes, &pb);
+
+  avxcheckSmallerThan0xF4(current_bytes, has_error);
+
+  __m256i initial_lengths = avxcontinuationLengths(pb.high_nibbles);
+
+  pb.carried_continuations =
+      avxcarryContinuations(initial_lengths, previous->carried_continuations);
+
+  avxcheckContinuations(initial_lengths, pb.carried_continuations, has_error);
+
+  __m256i off1_current_bytes =
+      push_last_byte_of_a_to_b(previous->rawbytes, pb.rawbytes);
+  avxcheckFirstContinuationMax(current_bytes, off1_current_bytes, has_error);
+
+  avxcheckOverlong(current_bytes, off1_current_bytes, pb.high_nibbles,
+                   previous->high_nibbles, has_error);
+  return pb;
+}
+
+static bool validate_utf8_fast_avx_asciipath(const char *src, size_t len) {
+  size_t i = 0;
+  __m256i has_error = _mm256_setzero_si256();
+  struct avx_processed_utf_bytes previous = {
+      .rawbytes = _mm256_setzero_si256(),
+      .high_nibbles = _mm256_setzero_si256(),
+      .carried_continuations = _mm256_setzero_si256()};
+  if (len >= 32) {
+    for (; i <= len - 32; i += 32) {
+      __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(src + i));
+      previous =
+          avxcheckUTF8Bytes_asciipath(current_bytes, &previous, &has_error);
+    }
+  }
+
+  // last part
+  if (i < len) {
+    char buffer[32];
+    memset(buffer, 0, 32);
+    memcpy(buffer, src + i, len - i);
+    __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(buffer));
+    previous = avxcheckUTF8Bytes(current_bytes, &previous, &has_error);
+  } else {
+    has_error = _mm256_or_si256(
+        _mm256_cmpgt_epi8(previous.carried_continuations,
+                          _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+                                           9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+                                           9, 9, 9, 9, 9, 9, 9, 1)),
+        has_error);
+  }
+
+  return _mm256_testz_si256(has_error, has_error);
+}
+
+static bool validate_utf8_fast_avx(const char *src, size_t len) {
+  size_t i = 0;
+  __m256i has_error = _mm256_setzero_si256();
+  struct avx_processed_utf_bytes previous = {
+      .rawbytes = _mm256_setzero_si256(),
+      .high_nibbles = _mm256_setzero_si256(),
+      .carried_continuations = _mm256_setzero_si256()};
+  if (len >= 32) {
+    for (; i <= len - 32; i += 32) {
+      __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(src + i));
+      previous = avxcheckUTF8Bytes(current_bytes, &previous, &has_error);
+    }
+  }
+
+  // last part
+  if (i < len) {
+    char buffer[32];
+    memset(buffer, 0, 32);
+    memcpy(buffer, src + i, len - i);
+    __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(buffer));
+    previous = avxcheckUTF8Bytes(current_bytes, &previous, &has_error);
+  } else {
+    has_error = _mm256_or_si256(
+        _mm256_cmpgt_epi8(previous.carried_continuations,
+                          _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+                                           9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+                                           9, 9, 9, 9, 9, 9, 9, 1)),
+        has_error);
+  }
+
+  return _mm256_testz_si256(has_error, has_error);
+}
+
 #endif // __AVX2__
 }
-#endif
+#endif
\ No newline at end of file
diff --git a/include/simdjson/stage1_find_marks.h b/include/simdjson/stage1_find_marks.h
index 6c025080..121c9454 100644
--- a/include/simdjson/stage1_find_marks.h
+++ b/include/simdjson/stage1_find_marks.h
@@ -6,7 +6,7 @@
 #include "simdjson/parsedjson.h"
 #include "simdjson/portability.h"
 
-#ifdef __AVX2__
+#if defined (__AVX2__) || (__SSE4_2__)
 
 #ifndef SIMDJSON_SKIPUTF8VALIDATION
 #define SIMDJSON_UTF8VALIDATE
@@ -21,7 +21,7 @@
 #else
 #warning It appears that neither ARM NEON nor AVX2 are detected.
 #endif // __ARM_NEON
-#endif // __AVX2__
+#endif // (__AVX2__) || (__SSE4_2__)
 
 // It seems that many parsers do UTF-8 validation.
 // RapidJSON does not do it by default, but a flag
@@ -35,6 +35,7 @@
 namespace simdjson {
 template<instruction_set>
 struct simd_input;
+
 #ifdef __AVX2__
 template<>
 struct simd_input<instruction_set::avx2>
@@ -44,6 +45,17 @@ struct simd_input<instruction_set::avx2>
 };
 #endif
 
+#ifdef __SSE4_2__
+template<>
+struct simd_input<instruction_set::sse4_2>
+{
+  __m128i v0;
+  __m128i v1;
+  __m128i v2;
+  __m128i v3;
+};
+#endif
+
 #ifdef __ARM_NEON
 template<> struct simd_input<instruction_set::neon>
 {
@@ -122,8 +134,7 @@ uint64_t compute_quote_mask(uint64_t quote_bits);
 // where clmul is not supported, so check for both, to be sure.
 #ifdef SIMDJSON_AVOID_CLMUL
 template<instruction_set T> really_inline
-uint64_t compute_quote_mask(uint64_t quote_bits)
-{
+uint64_t compute_quote_mask(uint64_t quote_bits) {
   uint64_t quote_mask = quote_bits ^ (quote_bits << 1);
   quote_mask = quote_mask ^ (quote_mask << 2);
   quote_mask = quote_mask ^ (quote_mask << 4);
@@ -145,6 +156,15 @@ uint64_t compute_quote_mask<instruction_set::avx2>(uint64_t quote_bits) {
 }
 #endif
 
+#ifdef __SSE4_2__
+template<> really_inline
+uint64_t compute_quote_mask<instruction_set::sse4_2>(uint64_t quote_bits) {
+  uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
+      _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
+  return quote_mask;
+}
+#endif
+
 #ifdef __ARM_NEON
 template<> really_inline
 uint64_t compute_quote_mask<instruction_set::neon>(uint64_t quote_bits) {
@@ -158,30 +178,115 @@ uint64_t compute_quote_mask<instruction_set::neon>(uint64_t quote_bits) {
   return quote_mask;
 }
 #endif
-#endif
+#endif // SIMDJSON_AVOID_CLMUL
 
 #ifdef SIMDJSON_UTF8VALIDATE
-template<instruction_set T>really_inline
-void check_utf8(simd_input<T> in,
-                __m256i &has_error,
-                struct avx_processed_utf_bytes &previous) {
+// some hack to bypass the impossibily to overload the check_utf8() specialized template
+template<instruction_set>
+struct check_utf8_helper;
+
+#ifdef __AVX2__
+template<>
+struct check_utf8_helper<instruction_set::avx2>
+{
+  __m256i has_error = _mm256_setzero_si256();
+  avx_processed_utf_bytes previous {
+    _mm256_setzero_si256(),
+    _mm256_setzero_si256(),
+    _mm256_setzero_si256()
+  };
+};
+#endif
+
+#ifdef __SSE4_2__
+template<>
+struct check_utf8_helper<instruction_set::sse4_2>
+{
+  __m128i has_error = _mm_setzero_si128();
+  processed_utf_bytes previous {
+    _mm_setzero_si128(),
+    _mm_setzero_si128(),
+    _mm_setzero_si128()
+  };
+};
+#endif
+
+template<instruction_set T>
+void check_utf8(simd_input<T> in, check_utf8_helper<T>& helper);
+
+#ifdef __AVX2__
+template<> really_inline
+void check_utf8<instruction_set::avx2>(simd_input<instruction_set::avx2> in,
+                check_utf8_helper<instruction_set::avx2>& helper) {
   __m256i highbit = _mm256_set1_epi8(0x80);
   if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), highbit)) == 1) {
     // it is ascii, we just check continuation
-    has_error = _mm256_or_si256(
+    helper.has_error = _mm256_or_si256(
         _mm256_cmpgt_epi8(
-            previous.carried_continuations,
+            helper.previous.carried_continuations,
             _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
                              9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)),
-        has_error);
+        helper.has_error);
   } else {
     // it is not ascii so we have to do heavy work
-    previous = avxcheckUTF8Bytes(in.lo, &previous, &has_error);
-    previous = avxcheckUTF8Bytes(in.hi, &previous, &has_error);
+    helper.previous = avxcheckUTF8Bytes(in.lo, &(helper.previous), &(helper.has_error));
+    helper.previous = avxcheckUTF8Bytes(in.hi, &(helper.previous), &(helper.has_error));
   }
 }
+#endif //__AVX2__
+
+#ifdef __SSE4_2__
+template<> really_inline
+void check_utf8<instruction_set::sse4_2>(simd_input<instruction_set::sse4_2> in,
+                check_utf8_helper<instruction_set::sse4_2>& helper) {
+  __m128i highbit = _mm_set1_epi8(0x80);
+  if ((_mm_testz_si128(_mm_or_si128(in.v0, in.v1), highbit)) == 1) {
+    // it is ascii, we just check continuation
+    helper.has_error = _mm_or_si128(
+        _mm_cmpgt_epi8(
+            helper.previous.carried_continuations,
+            _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)),
+        helper.has_error);
+  } else {
+    // it is not ascii so we have to do heavy work
+    helper.previous = checkUTF8Bytes(in.v0, &(helper.previous), &(helper.has_error));
+    helper.previous = checkUTF8Bytes(in.v1, &(helper.previous), &(helper.has_error));
+  }
+
+  if ((_mm_testz_si128(_mm_or_si128(in.v2, in.v3), highbit)) == 1) {
+    // it is ascii, we just check continuation
+    helper.has_error = _mm_or_si128(
+            _mm_cmpgt_epi8(
+                    helper.previous.carried_continuations,
+                    _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)),
+            helper.has_error);
+  } else {
+    // it is not ascii so we have to do heavy work
+    helper.previous = checkUTF8Bytes(in.v2, &(helper.previous), &(helper.has_error));
+    helper.previous = checkUTF8Bytes(in.v3, &(helper.previous), &(helper.has_error));
+  }
+}
+#endif // __SSE4_2
+
+// Checks if the utf8 validation has found any error.
+template<instruction_set T>
+errorValues check_utf8_errors(check_utf8_helper<T>& helper);
+
+#ifdef __AVX2__
+template<> really_inline
+errorValues check_utf8_errors<instruction_set::avx2>(check_utf8_helper<instruction_set::avx2>& helper) {
+  return _mm256_testz_si256(helper.has_error, helper.has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
+}
 #endif
 
+#ifdef __SSE4_2__
+template<> really_inline
+errorValues check_utf8_errors<instruction_set::sse4_2>(check_utf8_helper<instruction_set::sse4_2>& helper) {
+  return _mm_testz_si128(helper.has_error, helper.has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
+}
+#endif
+#endif // SIMDJSON_UTF8VALIDATE
+
 template<instruction_set T>
 simd_input<T> fill_input(const uint8_t * ptr);
 
@@ -195,6 +300,18 @@ simd_input<instruction_set::avx2> fill_input<instruction_set::avx2>(const uint8_
 }
 #endif
 
+#ifdef __SSE4_2__
+template<> really_inline
+simd_input<instruction_set::sse4_2> fill_input<instruction_set::sse4_2>(const uint8_t * ptr) {
+  struct simd_input<instruction_set::sse4_2> in;
+  in.v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 0));
+  in.v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16));
+  in.v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 32));
+  in.v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 48));
+  return in;
+}
+#endif
+
 #ifdef __ARM_NEON
 template<> really_inline
 simd_input<instruction_set::neon> fill_input<instruction_set::neon>(const uint8_t * ptr) {
@@ -219,7 +336,6 @@ uint64_t cmp_mask_against_input(simd_input<T> in, uint8_t m);
 #ifdef __AVX2__
 template<> really_inline
 uint64_t cmp_mask_against_input<instruction_set::avx2>(simd_input<instruction_set::avx2> in, uint8_t m) {
-
   const __m256i mask = _mm256_set1_epi8(m);
   __m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask);
   uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
@@ -229,6 +345,23 @@ uint64_t cmp_mask_against_input<instruction_set::avx2>(simd_input<instruction_se
 }
 #endif
 
+#ifdef __SSE4_2__
+template<> really_inline
+uint64_t cmp_mask_against_input<instruction_set::sse4_2>(simd_input<instruction_set::sse4_2> in, uint8_t m) {
+  const __m128i mask = _mm_set1_epi8(m);
+  __m128i cmp_res_0 = _mm_cmpeq_epi8(in.v0, mask);
+  uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
+  __m128i cmp_res_1 = _mm_cmpeq_epi8(in.v1, mask);
+  uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
+  __m128i cmp_res_2 = _mm_cmpeq_epi8(in.v2, mask);
+  uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
+  __m128i cmp_res_3 = _mm_cmpeq_epi8(in.v3, mask);
+  uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
+  return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
+  return res_0 | (res_1 << 32);
+}
+#endif
+
 #ifdef __ARM_NEON
 template<> really_inline
 uint64_t cmp_mask_against_input<instruction_set::neon>(simd_input<instruction_set::neon> in, uint8_t m) {
@@ -257,6 +390,22 @@ uint64_t unsigned_lteq_against_input<instruction_set::avx2>(simd_input<instructi
 }
 #endif
 
+#ifdef __SSE4_2__
+template<> really_inline
+uint64_t unsigned_lteq_against_input<instruction_set::sse4_2>(simd_input<instruction_set::sse4_2> in, uint8_t m) {
+  const __m128i maxval = _mm_set1_epi8(m);
+  __m128i cmp_res_0 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v0),maxval);
+  uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
+  __m128i cmp_res_1 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v1),maxval);
+  uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
+  __m128i cmp_res_2 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v2),maxval);
+  uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
+  __m128i cmp_res_3 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v3),maxval);
+  uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
+  return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
+}
+#endif
+
 #ifdef __ARM_NEON
 template<> really_inline
 uint64_t unsigned_lteq_against_input<instruction_set::neon>(simd_input<instruction_set::neon> in, uint8_t m) {
@@ -447,7 +596,78 @@ void find_whitespace_and_structurals<instruction_set::avx2>(simd_input<instructi
   whitespace = ~(ws_res_0 | (ws_res_1 << 32));
 #endif // SIMDJSON_NAIVE_STRUCTURAL
 }
-#endif
+#endif // __AVX2__
+
+#ifdef __SSE4_2__
+template<> really_inline
+void find_whitespace_and_structurals<instruction_set::sse4_2>(simd_input<instruction_set::sse4_2> in,
+                                                     uint64_t &whitespace,
+                                                     uint64_t &structurals) {
+  const __m128i low_nibble_mask = _mm_setr_epi8(
+      16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
+  const __m128i high_nibble_mask = _mm_setr_epi8(
+      8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);
+
+  __m128i structural_shufti_mask = _mm_set1_epi8(0x7);
+  __m128i whitespace_shufti_mask = _mm_set1_epi8(0x18);
+
+  __m128i v_0 = _mm_and_si128(
+      _mm_shuffle_epi8(low_nibble_mask, in.v0),
+      _mm_shuffle_epi8(high_nibble_mask,
+                          _mm_and_si128(_mm_srli_epi32(in.v0, 4),
+                                           _mm_set1_epi8(0x7f))));
+
+  __m128i v_1 = _mm_and_si128(
+      _mm_shuffle_epi8(low_nibble_mask, in.v1),
+      _mm_shuffle_epi8(high_nibble_mask,
+                          _mm_and_si128(_mm_srli_epi32(in.v1, 4),
+                                           _mm_set1_epi8(0x7f))));
+
+  __m128i v_2 = _mm_and_si128(
+      _mm_shuffle_epi8(low_nibble_mask, in.v2),
+      _mm_shuffle_epi8(high_nibble_mask,
+                         _mm_and_si128(_mm_srli_epi32(in.v2, 4),
+                                           _mm_set1_epi8(0x7f))));
+
+  __m128i v_3 = _mm_and_si128(
+      _mm_shuffle_epi8(low_nibble_mask, in.v3),
+      _mm_shuffle_epi8(high_nibble_mask,
+                         _mm_and_si128(_mm_srli_epi32(in.v3, 4),
+                                           _mm_set1_epi8(0x7f))));
+
+  __m128i tmp_v0 = _mm_cmpeq_epi8(
+      _mm_and_si128(v_0, structural_shufti_mask), _mm_set1_epi8(0));
+  __m128i tmp_v1 = _mm_cmpeq_epi8(
+      _mm_and_si128(v_1, structural_shufti_mask), _mm_set1_epi8(0));
+  __m128i tmp_v2 = _mm_cmpeq_epi8(
+      _mm_and_si128(v_2, structural_shufti_mask), _mm_set1_epi8(0));
+  __m128i tmp_v3 = _mm_cmpeq_epi8(
+      _mm_and_si128(v_3, structural_shufti_mask), _mm_set1_epi8(0));
+
+  uint64_t structural_res_0 = _mm_movemask_epi8(tmp_v0);
+  uint64_t structural_res_1 = _mm_movemask_epi8(tmp_v1);
+  uint64_t structural_res_2 = _mm_movemask_epi8(tmp_v2);
+  uint64_t structural_res_3 = _mm_movemask_epi8(tmp_v3);
+
+  structurals = ~(structural_res_0 | (structural_res_1 << 16) | (structural_res_2 << 32) | (structural_res_3 << 48));
+
+  __m128i tmp_ws_v0 = _mm_cmpeq_epi8(
+      _mm_and_si128(v_0, whitespace_shufti_mask), _mm_set1_epi8(0));
+  __m128i tmp_ws_v1 = _mm_cmpeq_epi8(
+      _mm_and_si128(v_1, whitespace_shufti_mask), _mm_set1_epi8(0));
+  __m128i tmp_ws_v2 = _mm_cmpeq_epi8(
+      _mm_and_si128(v_2, whitespace_shufti_mask), _mm_set1_epi8(0));
+  __m128i tmp_ws_v3 = _mm_cmpeq_epi8(
+      _mm_and_si128(v_3, whitespace_shufti_mask), _mm_set1_epi8(0));
+
+  uint64_t ws_res_0 = _mm_movemask_epi8(tmp_ws_v0);
+  uint64_t ws_res_1 = _mm_movemask_epi8(tmp_ws_v1);
+  uint64_t ws_res_2 = _mm_movemask_epi8(tmp_ws_v2);
+  uint64_t ws_res_3 = _mm_movemask_epi8(tmp_ws_v3);
+
+  whitespace = ~(ws_res_0 | (ws_res_1 << 16) | (ws_res_2 << 32) | (ws_res_3 << 48));
+}
+#endif // __SSE4_2__
 
 #ifdef __ARM_NEON
 template<> really_inline
@@ -569,9 +789,9 @@ void find_whitespace_and_structurals<instruction_set::neon>(
 
   structurals = neonmovemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
   whitespace = neonmovemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
-#endif
+#endif // FUNKY_BAD_TABLE
 }
-#endif
+#endif // __ARM_NEON
 
 
 #ifdef SIMDJSON_NAIVE_FLATTEN // useful for benchmarking
@@ -657,7 +877,7 @@ really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base,
   }
   base = next_base;
 }
-#endif
+#endif // SIMDJSON_NAIVE_FLATTEN
 
 // return a updated structural bit vector with quoted contents cleared out and
 // pseudo-structural characters added to the mask
@@ -711,11 +931,7 @@ WARN_UNUSED
   uint32_t *base_ptr = pj.structural_indexes;
   uint32_t base = 0;
 #ifdef SIMDJSON_UTF8VALIDATE
-  __m256i has_error = _mm256_setzero_si256();
-  struct avx_processed_utf_bytes previous {};
-  previous.rawbytes = _mm256_setzero_si256();
-  previous.high_nibbles = _mm256_setzero_si256();
-  previous.carried_continuations = _mm256_setzero_si256();
+  check_utf8_helper<T> helper;
 #endif
 
   // we have padded the input out to 64 byte multiple with the remainder being
@@ -751,7 +967,7 @@ WARN_UNUSED
 #endif
     simd_input<T> in = fill_input<T>(buf+idx);
 #ifdef SIMDJSON_UTF8VALIDATE
-    check_utf8(in, has_error, previous);
+    check_utf8<T>(in, helper);
 #endif
     // detect odd sequences of backslashes
     uint64_t odd_ends = find_odd_backslash_sequences<T>(
@@ -786,7 +1002,7 @@ WARN_UNUSED
     memcpy(tmpbuf, buf + idx, len - idx);
     simd_input<T> in = fill_input<T>(tmpbuf);
 #ifdef SIMDJSON_UTF8VALIDATE
-    check_utf8(in, has_error, previous);
+    check_utf8<T>(in, helper);
 #endif
 
     // detect odd sequences of backslashes
@@ -843,7 +1059,7 @@ WARN_UNUSED
     return simdjson::UNESCAPED_CHARS;
   }
 #ifdef SIMDJSON_UTF8VALIDATE
-    return _mm256_testz_si256(has_error, has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
+  return check_utf8_errors<T>(helper);
 #else
   return simdjson::SUCCESS;
 #endif
diff --git a/include/simdjson/stringparsing.h b/include/simdjson/stringparsing.h
index 148678f8..87687846 100644
--- a/include/simdjson/stringparsing.h
+++ b/include/simdjson/stringparsing.h
@@ -109,6 +109,23 @@ parse_string_helper find_bs_bits_and_quote_bits<instruction_set::avx2> (const ui
 }
 #endif
 
+#ifdef __SSE4_2__
+template<> really_inline
+parse_string_helper find_bs_bits_and_quote_bits<instruction_set::sse4_2> (const uint8_t *src, uint8_t *dst) {
+    // this can read up to 31 bytes beyond the buffer size, but we require 
+    // SIMDJSON_PADDING of padding
+    __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
+    // store to dest unconditionally - we can overwrite the bits we don't like
+    // later
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), v);
+    auto quote_mask = _mm_cmpeq_epi8(v, _mm_set1_epi8('"'));
+    return {
+      static_cast<uint32_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(v, _mm_set1_epi8('\\')))), // bs_bits
+      static_cast<uint32_t>(_mm_movemask_epi8(quote_mask)) // quote_bits
+    };
+}
+#endif
+
 #ifdef __ARM_NEON
 template<> really_inline
 parse_string_helper find_bs_bits_and_quote_bits<instruction_set::neon> (const uint8_t *src, uint8_t *dst) {
@@ -221,8 +238,13 @@ bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
     } else {
       // they are the same. Since they can't co-occur, it means we encountered
       // neither.
-      src += 32;
-      dst += 32;
+      if constexpr(T == instruction_set::sse4_2) {
+        src += 16;
+        dst += 16;
+      } else {
+        src += 32;
+        dst += 32;
+      }
     }
   }
   // can't be reached
diff --git a/src/jsonparser.cpp b/src/jsonparser.cpp
index 538ca813..ed4e3912 100644
--- a/src/jsonparser.cpp
+++ b/src/jsonparser.cpp
@@ -15,7 +15,7 @@ int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool rea
   json_parse_functype* avx_implementation = &json_parse_implementation<instruction_set::avx2>;
 #endif
 #ifdef __SSE4_2__
-  // json_parse_functype* sse4_2_implementation = &json_parse_implementation<instruction_set::sse4_2>; // not implemented yet
+  json_parse_functype* sse4_2_implementation = &json_parse_implementation<instruction_set::sse4_2>;
 #endif
 #ifdef __ARM_NEON
   json_parse_functype* neon_implementation = &json_parse_implementation<instruction_set::neon>;
@@ -39,11 +39,13 @@ int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool rea
   case instruction_set::avx2 :
     json_parse_ptr = avx_implementation;
     break;
-#elif defined (__SSE4_2__)
-  /*case instruction_set::sse4_2 :
+#endif
+#ifdef __SSE4_2__
+  case instruction_set::sse4_2 :
     json_parse_ptr = sse4_2_implementation;
-    break;*/
-#elif defined (__ARM_NEON)
+    break;
+#endif
+#ifdef __ARM_NEON
   case instruction_set::neon :
     json_parse_ptr = neon_implementation;
     break;

From f7ea2629e4599569e688aa4dd300445fe7ffb92b Mon Sep 17 00:00:00 2001
From: ioioioio <iodadi@gmail.com>
Date: Thu, 4 Jul 2019 10:13:40 -0400
Subject: [PATCH 2/9] Fixing warnings and Microsoft intinsics.

---
 include/simdjson/simdutf8check.h | 187 +++++--------------------------
 1 file changed, 28 insertions(+), 159 deletions(-)

diff --git a/include/simdjson/simdutf8check.h b/include/simdjson/simdutf8check.h
index ecbc235b..40fc921a 100644
--- a/include/simdjson/simdutf8check.h
+++ b/include/simdjson/simdutf8check.h
@@ -1,12 +1,12 @@
-#ifndef SIMDUTF8CHECK_H
-#define SIMDUTF8CHECK_H
-#include <stdbool.h>
+
+#ifndef SIMDJSON_SIMDUTF8CHECK_H
+#define SIMDJSON_SIMDUTF8CHECK_H
+
+
 #include <stddef.h>
 #include <stdint.h>
 #include <string.h>
-#include <x86intrin.h>
-
-namespace simdjson {
+#include "simdjson/portability.h"
 /*
  * legal utf-8 byte sequence
  * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
@@ -25,6 +25,9 @@ namespace simdjson {
  */
 
 // all byte values must be no larger than 0xF4
+
+namespace simdjson {
+// all byte values must be no larger than 0xF4
 static inline void checkSmallerThan0xF4(__m128i current_bytes,
                                         __m128i *has_error) {
   // unsigned, saturates to 0 below max
@@ -156,40 +159,7 @@ checkUTF8Bytes(__m128i current_bytes, struct processed_utf_bytes *previous,
   return pb;
 }
 
-static bool validate_utf8_fast(const char *src, size_t len) {
-  size_t i = 0;
-  __m128i has_error = _mm_setzero_si128();
-  struct processed_utf_bytes previous = {.rawbytes = _mm_setzero_si128(),
-                                         .high_nibbles = _mm_setzero_si128(),
-                                         .carried_continuations =
-                                             _mm_setzero_si128()};
-  if (len >= 16) {
-    for (; i <= len - 16; i += 16) {
-      __m128i current_bytes = _mm_loadu_si128((const __m128i *)(src + i));
-      previous = checkUTF8Bytes(current_bytes, &previous, &has_error);
-    }
-  }
-
-  // last part
-  if (i < len) {
-    char buffer[16];
-    memset(buffer, 0, 16);
-    memcpy(buffer, src + i, len - i);
-    __m128i current_bytes = _mm_loadu_si128((const __m128i *)(buffer));
-    previous = checkUTF8Bytes(current_bytes, &previous, &has_error);
-  } else {
-    has_error =
-        _mm_or_si128(_mm_cmpgt_epi8(previous.carried_continuations,
-                                    _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                                                  9, 9, 9, 9, 9, 1)),
-                     has_error);
-  }
-
-  return _mm_testz_si128(has_error, has_error);
-}
-
 #ifdef __AVX2__
-
 /*****************************/
 static inline __m256i push_last_byte_of_a_to_b(__m256i a, __m256i b) {
   return _mm256_alignr_epi8(b, _mm256_permute2x128_si256(a, b, 0x21), 15);
@@ -281,29 +251,31 @@ static inline void avxcheckOverlong(__m256i current_bytes,
                                     __m256i *has_error) {
   __m256i off1_hibits = push_last_byte_of_a_to_b(previous_hibits, hibits);
   __m256i initial_mins = _mm256_shuffle_epi8(
-      _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128,
-                       -128, -128, -128, // 10xx => false
-                       0xC2, -128,       // 110x
-                       0xE1,             // 1110
-                       0xF1, -128, -128, -128, -128, -128, -128, -128, -128,
+      _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128,
                        -128, -128, -128, -128, // 10xx => false
                        0xC2, -128,             // 110x
                        0xE1,                   // 1110
-                       0xF1),
+                       0xF1,                   // 1111
+                       -128, -128, -128, -128, -128, -128, -128, -128,
+                       -128, -128, -128, -128, // 10xx => false
+                       0xC2, -128,             // 110x
+                       0xE1,                   // 1110
+                       0xF1),                  // 1111
       off1_hibits);
 
   __m256i initial_under = _mm256_cmpgt_epi8(initial_mins, off1_current_bytes);
 
   __m256i second_mins = _mm256_shuffle_epi8(
-      _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128,
-                       -128, -128, -128, // 10xx => false
-                       127, 127,         // 110x => true
-                       0xA0,             // 1110
-                       0x90, -128, -128, -128, -128, -128, -128, -128, -128,
+      _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128,
                        -128, -128, -128, -128, // 10xx => false
                        127, 127,               // 110x => true
                        0xA0,                   // 1110
-                       0x90),
+                       0x90,                   // 1111
+                       -128, -128, -128, -128, -128, -128, -128, -128,
+                       -128, -128, -128, -128, // 10xx => false
+                       127, 127,               // 110x => true
+                       0xA0,                   // 1110
+                       0x90),                  // 1111
       off1_hibits);
   __m256i second_under = _mm256_cmpgt_epi8(second_mins, current_bytes);
   *has_error = _mm256_or_si256(*has_error,
@@ -325,11 +297,11 @@ static inline void avx_count_nibbles(__m256i bytes,
 
 // check whether the current bytes are valid UTF-8
 // at the end of the function, previous gets updated
-static struct avx_processed_utf_bytes
+static inline struct avx_processed_utf_bytes
 avxcheckUTF8Bytes(__m256i current_bytes,
                   struct avx_processed_utf_bytes *previous,
                   __m256i *has_error) {
-  struct avx_processed_utf_bytes pb;
+  struct avx_processed_utf_bytes pb{};
   avx_count_nibbles(current_bytes, &pb);
 
   avxcheckSmallerThan0xF4(current_bytes, has_error);
@@ -350,111 +322,8 @@ avxcheckUTF8Bytes(__m256i current_bytes,
   return pb;
 }
 
-// check whether the current bytes are valid UTF-8
-// at the end of the function, previous gets updated
-static struct avx_processed_utf_bytes
-avxcheckUTF8Bytes_asciipath(__m256i current_bytes,
-                            struct avx_processed_utf_bytes *previous,
-                            __m256i *has_error) {
-  if (_mm256_testz_si256(current_bytes,
-                         _mm256_set1_epi8(0x80))) { // fast ascii path
-    *has_error = _mm256_or_si256(
-        _mm256_cmpgt_epi8(previous->carried_continuations,
-                          _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                                           9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                                           9, 9, 9, 9, 9, 9, 9, 1)),
-        *has_error);
-    return *previous;
-  }
-
-  struct avx_processed_utf_bytes pb;
-  avx_count_nibbles(current_bytes, &pb);
-
-  avxcheckSmallerThan0xF4(current_bytes, has_error);
-
-  __m256i initial_lengths = avxcontinuationLengths(pb.high_nibbles);
-
-  pb.carried_continuations =
-      avxcarryContinuations(initial_lengths, previous->carried_continuations);
-
-  avxcheckContinuations(initial_lengths, pb.carried_continuations, has_error);
-
-  __m256i off1_current_bytes =
-      push_last_byte_of_a_to_b(previous->rawbytes, pb.rawbytes);
-  avxcheckFirstContinuationMax(current_bytes, off1_current_bytes, has_error);
-
-  avxcheckOverlong(current_bytes, off1_current_bytes, pb.high_nibbles,
-                   previous->high_nibbles, has_error);
-  return pb;
-}
-
-static bool validate_utf8_fast_avx_asciipath(const char *src, size_t len) {
-  size_t i = 0;
-  __m256i has_error = _mm256_setzero_si256();
-  struct avx_processed_utf_bytes previous = {
-      .rawbytes = _mm256_setzero_si256(),
-      .high_nibbles = _mm256_setzero_si256(),
-      .carried_continuations = _mm256_setzero_si256()};
-  if (len >= 32) {
-    for (; i <= len - 32; i += 32) {
-      __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(src + i));
-      previous =
-          avxcheckUTF8Bytes_asciipath(current_bytes, &previous, &has_error);
-    }
-  }
-
-  // last part
-  if (i < len) {
-    char buffer[32];
-    memset(buffer, 0, 32);
-    memcpy(buffer, src + i, len - i);
-    __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(buffer));
-    previous = avxcheckUTF8Bytes(current_bytes, &previous, &has_error);
-  } else {
-    has_error = _mm256_or_si256(
-        _mm256_cmpgt_epi8(previous.carried_continuations,
-                          _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                                           9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                                           9, 9, 9, 9, 9, 9, 9, 1)),
-        has_error);
-  }
-
-  return _mm256_testz_si256(has_error, has_error);
-}
-
-static bool validate_utf8_fast_avx(const char *src, size_t len) {
-  size_t i = 0;
-  __m256i has_error = _mm256_setzero_si256();
-  struct avx_processed_utf_bytes previous = {
-      .rawbytes = _mm256_setzero_si256(),
-      .high_nibbles = _mm256_setzero_si256(),
-      .carried_continuations = _mm256_setzero_si256()};
-  if (len >= 32) {
-    for (; i <= len - 32; i += 32) {
-      __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(src + i));
-      previous = avxcheckUTF8Bytes(current_bytes, &previous, &has_error);
-    }
-  }
-
-  // last part
-  if (i < len) {
-    char buffer[32];
-    memset(buffer, 0, 32);
-    memcpy(buffer, src + i, len - i);
-    __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(buffer));
-    previous = avxcheckUTF8Bytes(current_bytes, &previous, &has_error);
-  } else {
-    has_error = _mm256_or_si256(
-        _mm256_cmpgt_epi8(previous.carried_continuations,
-                          _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                                           9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                                           9, 9, 9, 9, 9, 9, 9, 1)),
-        has_error);
-  }
-
-  return _mm256_testz_si256(has_error, has_error);
-}
-
+#else // __AVX2__
+#warning "We require AVX2 support!"
 #endif // __AVX2__
 }
-#endif
\ No newline at end of file
+#endif

From 2b2d93b05f1838addca9a925007d9ab1e64adb42 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <lemire@gmail.com>
Date: Thu, 4 Jul 2019 17:19:05 -0400
Subject: [PATCH 3/9] Various minor tweaks.

---
 .appveyor.yml                        |  7 ++++-
 .travis.yml                          |  3 ++
 CMakeLists.txt                       |  2 ++
 Makefile                             | 17 +++++++----
 README.md                            | 11 +++----
 include/simdjson/jsonminifier.h      |  2 ++
 include/simdjson/simdutf8check.h     |  2 --
 include/simdjson/stage1_find_marks.h | 43 +++++++++++++++++-----------
 tests/CMakeLists.txt                 |  9 +++---
 tools/cmake/FindOptions.cmake        | 18 ++++++++----
 10 files changed, 74 insertions(+), 40 deletions(-)

diff --git a/.appveyor.yml b/.appveyor.yml
index 5372da62..30549a97 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -9,9 +9,14 @@ clone_folder: c:\projects\simdjson
 platform:
 - x64
 
+environment:
+  matrix:
+    - AVXFLAG: "OFF"
+    - AVXFLAG: "ON"
+
 build_script:
   - mkdir build
   - cd build
-  - ps: cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..
+  - ps: cmake -DSIMDJSON_DISABLE_AVX="$env:AVXFLAG"  -DCMAKE_GENERATOR_PLATFORM=x64 ..
   - cmake --build .
   - ctest --verbose
diff --git a/.travis.yml b/.travis.yml
index b625f367..bafe5d5c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -19,3 +19,6 @@ script:
    - make test 
    - make clean
    - make SANITIZEGOLD=1 test
+   - make clean
+   - ARCHFLAGS="-march=nehalem" make 
+   - ARCHFLAGS="-march=nehalem" make test
diff --git a/CMakeLists.txt b/CMakeLists.txt
index eba12585..88e9d1d1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,6 +5,8 @@ if(ltoresult)
 set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
 endif()
 
+option(SIMDJSON_DISABLE_AVX "Forcefully disable AVX even if hardware supports it" OFF)
+	
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_MACOSX_RPATH OFF)
diff --git a/Makefile b/Makefile
index 603ac88d..a99cf7b0 100644
--- a/Makefile
+++ b/Makefile
@@ -9,16 +9,21 @@ COREDEPSINCLUDE = -Idependencies/rapidjson/include -Idependencies/sajson/include
 EXTRADEPSINCLUDE =  -Idependencies/jsoncppdist -Idependencies/json11 -Idependencies/fastjson/src -Idependencies/fastjson/include -Idependencies/gason/src -Idependencies/ujson4c/3rdparty -Idependencies/ujson4c/src
 # users can provide their own additional flags with make EXTRAFLAGS=something
 architecture:=$(shell arch)
-CXXFLAGS =  -std=c++17   -Wall -Wextra -Wshadow -Iinclude  -Ibenchmark/linux $(EXTRAFLAGS)
-CFLAGS =   -Idependencies/ujson4c/3rdparty -Idependencies/ujson4c/src $(EXTRAFLAGS)
+
+####
+# If you want to specify your own target architecture,
+# then define ARCHFLAGS. Otherwise, we set good default.
+###
 ifeq ($(architecture),aarch64)
-CXXFLAGS += -march=armv8-a+crc+crypto
-CFLAGS += -march=armv8-a+crc+crypto
+ARCHFLAGS ?= -march=armv8-a+crc+crypto
 else
-CXXFLAGS += -march=native
-CFLAGS += -march=native
+ARCHFLAGS ?= -march=native
 endif
 
+CXXFLAGS = $(ARCHFLAGS) -std=c++17   -Wall -Wextra -Wshadow -Iinclude  -Ibenchmark/linux $(EXTRAFLAGS)
+CFLAGS =  $(ARCHFLAGS)  -Idependencies/ujson4c/3rdparty -Idependencies/ujson4c/src $(EXTRAFLAGS)
+
+
 # This is a convenience flag
 ifdef SANITIZEGOLD
     SANITIZE = 1
diff --git a/README.md b/README.md
index 3efe5f0e..f49b9d9e 100644
--- a/README.md
+++ b/README.md
@@ -51,7 +51,7 @@ On a Skylake processor, the parsing speeds (in GB/s) of various processors on th
 ## Requirements
 
 - We support platforms like Linux or macOS, as well as Windows through Visual Studio 2017 or later.
-- A processor with AVX2 (i.e., Intel processors starting with the Haswell microarchitecture released 2013 and AMD processors starting with the Zen microarchitecture released 2017).
+- A processor with AVX2 (i.e., Intel processors starting with the Haswell microarchitecture released 2013 and AMD processors starting with the Zen microarchitecture released 2017) or at least SSE 4.2 (i.e., Intel processors going back to Nehalem released in 2008 or AMD processors starting with the Jaguar used in the PS4 and XBox One).
 - A recent C++ compiler (e.g., GNU GCC or LLVM CLANG or Visual Studio 2017), we assume C++17. GNU GCC 7 or better or LLVM's clang 6 or better.
 - Some benchmark scripts assume bash and other common utilities, but they are optional.
 
@@ -168,7 +168,7 @@ int main(int argc, char *argv[]) {
 }
 ```
 
-We require hardware support for AVX2 instructions. You have to make sure that you instruct your 
+On Intel and AMD processors, we get best performance by using the hardware support for AVX2 instructions. You have to make sure that you instruct your 
 compiler to use these instructions as needed. Under compilers such as GNU GCC or LLVM clang, the
 flag `-march=native` used on a recent Intel processor (Haswell or better) is sufficient. For portability
 of the binary files you can also specify directly the Haswell processor (`-march=haswell`). You may 
@@ -260,14 +260,15 @@ make test
 
 ## Usage (CMake on Windows using Visual Studio)
 
-We assume you have a common Windows PC with at least Visual Studio 2017 and an x64 processor with AVX2 support (2013 Intel Haswell or later).
+We assume you have a common Windows PC with at least Visual Studio 2017 and an x64 processor with AVX2 support (2013 Intel Haswell or later) or SSE 4.2 (2008 Nehalem or later). 
 
 - Grab the simdjson code from GitHub, e.g., by cloning it using [GitHub Desktop](https://desktop.github.com/).
 - Install [CMake](https://cmake.org/download/). When you install it, make sure to ask that `cmake` be made available from the command line. Please choose a recent version of cmake.
 - Create a subdirectory within simdjson, such as `VisualStudio`.
 - Using a shell, go to this newly created directory.
-- Type `cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..` in the shell while in the `VisualStudio` repository. (Alternatively, if you want to build a DLL, you may use the command line `cmake -DCMAKE_GENERATOR_PLATFORM=x64 -DSIMDJSON_BUILD_STATIC=OFF ..`.)
-- This last command created a Visual Studio solution file in the newly created directory (e.g., `simdjson.sln`). Open this file in Visual Studio. You should now be able to build the project and run the tests. For example, in the `Solution Explorer` window (available from the `View` menu), right-click `ALL_BUILD` and select `Build`. To test the code, still in the `Solution Explorer` window, select `RUN_TESTS` and select `Build`.
+- Type `cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..` in the shell while in the `VisualStudio` repository. (Alternatively, if you want to build a DLL, you may use the command line `cmake -DCMAKE_GENERATOR_PLATFORM=x64 -DSIMDJSON_BUILD_STATIC=OFF ..`.) This will build the code with AVX2 instructions. If your target processor does not support AVX2, you need to replace `cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..` by `cmake  -DSIMDJSON_DISABLE_AVX=on -DCMAKE_GENERATOR_PLATFORM=x64 ..` . That is, you need to set the flag to forcefully disable AVX support since we compile with AVX2 instructions *by default*.
+- This last command (`cmake ...`) created a Visual Studio solution file in the newly created directory (e.g., `simdjson.sln`). Open this file in Visual Studio. You should now be able to build the project and run the tests. For example, in the `Solution Explorer` window (available from the `View` menu), right-click `ALL_BUILD` and select `Build`. To test the code, still in the `Solution Explorer` window, select `RUN_TESTS` and select `Build`.
+
 
 
 ## Usage (Using `vcpkg` on Windows, Linux and MacOS)
diff --git a/include/simdjson/jsonminifier.h b/include/simdjson/jsonminifier.h
index a588338c..c5cf0bb4 100644
--- a/include/simdjson/jsonminifier.h
+++ b/include/simdjson/jsonminifier.h
@@ -5,9 +5,11 @@
 #include <cstdint>
 
 namespace simdjson {
+
 // Take input from buf and remove useless whitespace, write it to out; buf and
 // out can be the same pointer. Result is null terminated,
 // return the string length (minus the null termination).
+// The accelerated version of this function only runs on AVX2 hardware.
 size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out);
 
 
diff --git a/include/simdjson/simdutf8check.h b/include/simdjson/simdutf8check.h
index 40fc921a..79e67567 100644
--- a/include/simdjson/simdutf8check.h
+++ b/include/simdjson/simdutf8check.h
@@ -322,8 +322,6 @@ avxcheckUTF8Bytes(__m256i current_bytes,
   return pb;
 }
 
-#else // __AVX2__
-#warning "We require AVX2 support!"
 #endif // __AVX2__
 }
 #endif
diff --git a/include/simdjson/stage1_find_marks.h b/include/simdjson/stage1_find_marks.h
index 121c9454..4b0df107 100644
--- a/include/simdjson/stage1_find_marks.h
+++ b/include/simdjson/stage1_find_marks.h
@@ -126,6 +126,19 @@ uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16
 template<instruction_set T>
 uint64_t compute_quote_mask(uint64_t quote_bits);
 
+namespace {
+  // for when clmul is unavailable
+  [[maybe_unused]] uint64_t portable_compute_quote_mask(uint64_t quote_bits) {
+    uint64_t quote_mask = quote_bits ^ (quote_bits << 1);
+    quote_mask = quote_mask ^ (quote_mask << 2);
+    quote_mask = quote_mask ^ (quote_mask << 4);
+    quote_mask = quote_mask ^ (quote_mask << 8);
+    quote_mask = quote_mask ^ (quote_mask << 16);
+    quote_mask = quote_mask ^ (quote_mask << 32);
+    return quote_mask;
+  }
+}
+
 // In practice, if you have NEON or __PCLMUL__, you would
 // always want to use them, but it might be useful, for research
 // purposes, to disable it willingly, that's what SIMDJSON_AVOID_CLMUL
@@ -135,13 +148,7 @@ uint64_t compute_quote_mask(uint64_t quote_bits);
 #ifdef SIMDJSON_AVOID_CLMUL
 template<instruction_set T> really_inline
 uint64_t compute_quote_mask(uint64_t quote_bits) {
-  uint64_t quote_mask = quote_bits ^ (quote_bits << 1);
-  quote_mask = quote_mask ^ (quote_mask << 2);
-  quote_mask = quote_mask ^ (quote_mask << 4);
-  quote_mask = quote_mask ^ (quote_mask << 8);
-  quote_mask = quote_mask ^ (quote_mask << 16);
-  quote_mask = quote_mask ^ (quote_mask << 32);
-  return quote_mask;
+  return portable_compute_quote_mask(quote_bits);
 }
 #else
 template<instruction_set>
@@ -150,6 +157,8 @@ uint64_t compute_quote_mask(uint64_t quote_bits);
 #ifdef __AVX2__ 
 template<> really_inline
 uint64_t compute_quote_mask<instruction_set::avx2>(uint64_t quote_bits) {
+  // There should be no such thing with a processing supporting avx2
+  // but not clmul.
   uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
       _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
   return quote_mask;
@@ -159,23 +168,25 @@ uint64_t compute_quote_mask<instruction_set::avx2>(uint64_t quote_bits) {
 #ifdef __SSE4_2__
 template<> really_inline
 uint64_t compute_quote_mask<instruction_set::sse4_2>(uint64_t quote_bits) {
-  uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
+  // CLMUL is supported on some SSE42 hardware such as Sandy Bridge,
+  // but not on others.
+#ifdef __PCLMUL__
+  return _mm_cvtsi128_si64(_mm_clmulepi64_si128(
       _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
-  return quote_mask;
+#else
+  return portable_compute_quote_mask(quote_bits);
+#endif
 }
 #endif
 
 #ifdef __ARM_NEON
 template<> really_inline
 uint64_t compute_quote_mask<instruction_set::neon>(uint64_t quote_bits) {
-#ifdef __PCLMUL__ // Might cause problems on runtime dispatch
-  uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
-                                          _mm_set_epi64x(0ULL, quote_bits),
-                                          _mm_set1_epi8(0xFF), 0));
+#ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension
+  return vmull_p64( -1ULL, quote_bits);
 #else
-  uint64_t quote_mask = vmull_p64( -1ULL, quote_bits);
-#endif
-  return quote_mask;
+  return portable_compute_quote_mask(quote_bits);
+#endif 
 }
 #endif
 #endif // SIMDJSON_AVOID_CLMUL
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index be75702b..229df38b 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -7,7 +7,8 @@ endif()
 add_cpp_test(basictests)
 add_cpp_test(jsoncheck)
 
-add_executable(singleheader ./singleheadertest.cpp ${PROJECT_SOURCE_DIR}/singleheader/simdjson.cpp)
-target_compile_definitions(singleheader PRIVATE JSON_TEST_PATH="${PROJECT_SOURCE_DIR}/jsonexamples/twitter.json")
-target_link_libraries(singleheader ${SIMDJSON_LIB_NAME})
-add_test(singleheader singleheader)
\ No newline at end of file
+## This causes problems
+# add_executable(singleheader ./singleheadertest.cpp ${PROJECT_SOURCE_DIR}/singleheader/simdjson.cpp)
+# target_compile_definitions(singleheader PRIVATE JSON_TEST_PATH="${PROJECT_SOURCE_DIR}/jsonexamples/twitter.json")
+# target_link_libraries(singleheader ${SIMDJSON_LIB_NAME})
+# add_test(singleheader singleheader)
\ No newline at end of file
diff --git a/tools/cmake/FindOptions.cmake b/tools/cmake/FindOptions.cmake
index 3c4596cb..89f2c611 100644
--- a/tools/cmake/FindOptions.cmake
+++ b/tools/cmake/FindOptions.cmake
@@ -13,15 +13,21 @@ if(SIMDJSON_SANITIZE)
 endif()
 
 
-
-# some compilers like clang do not automagically define __AVX2__ and __BMI2__ even when the hardware supports it
-if(NOT MSVC)
+if(SIMDJSON_DISABLE_AVX) 
+  if(NOT MSVC)
+   set (OPT_FLAGS "${OPT_FLAGS} -mno-avx -mno-bmi -mno-pclmul -msse4.2")
+  else()
+   set (OPT_FLAGS "${OPT_FLAGS}")
+  endif()
+else() 
+  # some compilers like clang do not automagically define __AVX2__ and __BMI2__ even when the hardware supports it
+  if(NOT MSVC)
    set (OPT_FLAGS "${OPT_FLAGS} -mavx2 -mbmi -mbmi2 -mpclmul")
-else()
-   set (OPT_FLAGS "${OPT_FLAGS} /arch:AVX2 /std:c++latest")
+  else()
+   set (OPT_FLAGS "${OPT_FLAGS} /arch:AVX2")
+  endif()
 endif()
 
-
 if(NOT MSVC)
 set(CXXSTD_FLAGS "-std=c++17 -fPIC")
 endif()

From 19cdc09928fa321c3d6430890d1b5269832fa8f1 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <lemire@gmail.com>
Date: Thu, 4 Jul 2019 17:36:26 -0400
Subject: [PATCH 4/9] Improving support for VS

---
 include/simdjson/stage1_find_marks.h | 37 ++++++++++++++--------------
 include/simdjson/stringparsing.h     |  2 +-
 src/jsonparser.cpp                   |  2 +-
 3 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/include/simdjson/stage1_find_marks.h b/include/simdjson/stage1_find_marks.h
index 4b0df107..65525557 100644
--- a/include/simdjson/stage1_find_marks.h
+++ b/include/simdjson/stage1_find_marks.h
@@ -6,17 +6,16 @@
 #include "simdjson/parsedjson.h"
 #include "simdjson/portability.h"
 
-#if defined (__AVX2__) || (__SSE4_2__)
+#if defined (__AVX2__) || defined (__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
 
 #ifndef SIMDJSON_SKIPUTF8VALIDATION
 #define SIMDJSON_UTF8VALIDATE
-
 #endif
 #else
 // currently we don't UTF8 validate for ARM
 // also we assume that if you're not __AVX2__ 
 // you're ARM, which is a bit dumb. TODO: Fix...
-#ifdef __ARM_NEON
+#if defined(__ARM_NEON)   || (defined(_MSC_VER) && defined(_M_ARM64))
 #include <arm_neon.h>
 #else
 #warning It appears that neither ARM NEON nor AVX2 are detected.
@@ -45,7 +44,7 @@ struct simd_input<instruction_set::avx2>
 };
 #endif
 
-#ifdef __SSE4_2__
+#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
 template<>
 struct simd_input<instruction_set::sse4_2>
 {
@@ -56,7 +55,7 @@ struct simd_input<instruction_set::sse4_2>
 };
 #endif
 
-#ifdef __ARM_NEON
+#if defined(__ARM_NEON)  || (defined(_MSC_VER) && defined(_M_ARM64))
 template<> struct simd_input<instruction_set::neon>
 {
 #ifndef TRANSPOSE
@@ -70,7 +69,7 @@ template<> struct simd_input<instruction_set::neon>
 };
 #endif
 
-#ifdef __ARM_NEON
+#if defined(__ARM_NEON)  || (defined(_MSC_VER) && defined(_M_ARM64))
 really_inline
 uint16_t neonmovemask(uint8x16_t input) {
   const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
@@ -165,7 +164,7 @@ uint64_t compute_quote_mask<instruction_set::avx2>(uint64_t quote_bits) {
 }
 #endif
 
-#ifdef __SSE4_2__
+#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
 template<> really_inline
 uint64_t compute_quote_mask<instruction_set::sse4_2>(uint64_t quote_bits) {
   // CLMUL is supported on some SSE42 hardware such as Sandy Bridge,
@@ -179,7 +178,7 @@ uint64_t compute_quote_mask<instruction_set::sse4_2>(uint64_t quote_bits) {
 }
 #endif
 
-#ifdef __ARM_NEON
+#if defined(__ARM_NEON)  || (defined(_MSC_VER) && defined(_M_ARM64))
 template<> really_inline
 uint64_t compute_quote_mask<instruction_set::neon>(uint64_t quote_bits) {
 #ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension
@@ -209,7 +208,7 @@ struct check_utf8_helper<instruction_set::avx2>
 };
 #endif
 
-#ifdef __SSE4_2__
+#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
 template<>
 struct check_utf8_helper<instruction_set::sse4_2>
 {
@@ -246,7 +245,7 @@ void check_utf8<instruction_set::avx2>(simd_input<instruction_set::avx2> in,
 }
 #endif //__AVX2__
 
-#ifdef __SSE4_2__
+#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
 template<> really_inline
 void check_utf8<instruction_set::sse4_2>(simd_input<instruction_set::sse4_2> in,
                 check_utf8_helper<instruction_set::sse4_2>& helper) {
@@ -290,7 +289,7 @@ errorValues check_utf8_errors<instruction_set::avx2>(check_utf8_helper<instructi
 }
 #endif
 
-#ifdef __SSE4_2__
+#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
 template<> really_inline
 errorValues check_utf8_errors<instruction_set::sse4_2>(check_utf8_helper<instruction_set::sse4_2>& helper) {
   return _mm_testz_si128(helper.has_error, helper.has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
@@ -311,7 +310,7 @@ simd_input<instruction_set::avx2> fill_input<instruction_set::avx2>(const uint8_
 }
 #endif
 
-#ifdef __SSE4_2__
+#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
 template<> really_inline
 simd_input<instruction_set::sse4_2> fill_input<instruction_set::sse4_2>(const uint8_t * ptr) {
   struct simd_input<instruction_set::sse4_2> in;
@@ -323,7 +322,7 @@ simd_input<instruction_set::sse4_2> fill_input<instruction_set::sse4_2>(const ui
 }
 #endif
 
-#ifdef __ARM_NEON
+#if defined(__ARM_NEON)  || (defined(_MSC_VER) && defined(_M_ARM64))
 template<> really_inline
 simd_input<instruction_set::neon> fill_input<instruction_set::neon>(const uint8_t * ptr) {
   struct simd_input<instruction_set::neon> in;
@@ -356,7 +355,7 @@ uint64_t cmp_mask_against_input<instruction_set::avx2>(simd_input<instruction_se
 }
 #endif
 
-#ifdef __SSE4_2__
+#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
 template<> really_inline
 uint64_t cmp_mask_against_input<instruction_set::sse4_2>(simd_input<instruction_set::sse4_2> in, uint8_t m) {
   const __m128i mask = _mm_set1_epi8(m);
@@ -373,7 +372,7 @@ uint64_t cmp_mask_against_input<instruction_set::sse4_2>(simd_input<instruction_
 }
 #endif
 
-#ifdef __ARM_NEON
+#if defined(__ARM_NEON)  || (defined(_MSC_VER) && defined(_M_ARM64))
 template<> really_inline
 uint64_t cmp_mask_against_input<instruction_set::neon>(simd_input<instruction_set::neon> in, uint8_t m) {
   const uint8x16_t mask = vmovq_n_u8(m); 
@@ -401,7 +400,7 @@ uint64_t unsigned_lteq_against_input<instruction_set::avx2>(simd_input<instructi
 }
 #endif
 
-#ifdef __SSE4_2__
+#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
 template<> really_inline
 uint64_t unsigned_lteq_against_input<instruction_set::sse4_2>(simd_input<instruction_set::sse4_2> in, uint8_t m) {
   const __m128i maxval = _mm_set1_epi8(m);
@@ -417,7 +416,7 @@ uint64_t unsigned_lteq_against_input<instruction_set::sse4_2>(simd_input<instruc
 }
 #endif
 
-#ifdef __ARM_NEON
+#if defined(__ARM_NEON)  || (defined(_MSC_VER) && defined(_M_ARM64))
 template<> really_inline
 uint64_t unsigned_lteq_against_input<instruction_set::neon>(simd_input<instruction_set::neon> in, uint8_t m) {
   const uint8x16_t mask = vmovq_n_u8(m); 
@@ -609,7 +608,7 @@ void find_whitespace_and_structurals<instruction_set::avx2>(simd_input<instructi
 }
 #endif // __AVX2__
 
-#ifdef __SSE4_2__
+#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
 template<> really_inline
 void find_whitespace_and_structurals<instruction_set::sse4_2>(simd_input<instruction_set::sse4_2> in,
                                                      uint64_t &whitespace,
@@ -680,7 +679,7 @@ void find_whitespace_and_structurals<instruction_set::sse4_2>(simd_input<instruc
 }
 #endif // __SSE4_2__
 
-#ifdef __ARM_NEON
+#if defined(__ARM_NEON)  || (defined(_MSC_VER) && defined(_M_ARM64))
 template<> really_inline
 void find_whitespace_and_structurals<instruction_set::neon>(
                                                   simd_input<instruction_set::neon> in,
diff --git a/include/simdjson/stringparsing.h b/include/simdjson/stringparsing.h
index 87687846..c9be1788 100644
--- a/include/simdjson/stringparsing.h
+++ b/include/simdjson/stringparsing.h
@@ -109,7 +109,7 @@ parse_string_helper find_bs_bits_and_quote_bits<instruction_set::avx2> (const ui
 }
 #endif
 
-#ifdef __SSE4_2__
+#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
 template<> really_inline
 parse_string_helper find_bs_bits_and_quote_bits<instruction_set::sse4_2> (const uint8_t *src, uint8_t *dst) {
     // this can read up to 31 bytes beyond the buffer size, but we require 
diff --git a/src/jsonparser.cpp b/src/jsonparser.cpp
index ed4e3912..7f82471a 100644
--- a/src/jsonparser.cpp
+++ b/src/jsonparser.cpp
@@ -40,7 +40,7 @@ int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool rea
     json_parse_ptr = avx_implementation;
     break;
 #endif
-#ifdef __SSE4_2__
+#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
   case instruction_set::sse4_2 :
     json_parse_ptr = sse4_2_implementation;
     break;

From fba27ef4b99ee3e93a1e8ade22f94601db4f2a92 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <lemire@gmail.com>
Date: Thu, 4 Jul 2019 17:45:45 -0400
Subject: [PATCH 5/9] I missed a few. Building up VS support.

---
 include/simdjson/simdjson.h |  2 +-
 src/jsonparser.cpp          | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/simdjson/simdjson.h b/include/simdjson/simdjson.h
index 9a16692d..4190aeaf 100644
--- a/include/simdjson/simdjson.h
+++ b/include/simdjson/simdjson.h
@@ -12,7 +12,7 @@ enum class instruction_set {
 // the 'native' enum class value should point at a good default on the current machine
 #ifdef __AVX2__
   native = avx2
-#elif defined(__ARM_NEON)
+#elif defined(__ARM_NEON)  || (defined(_MSC_VER) && defined(_M_ARM64))
   native = neon
 #else
   // Let us assume that we have an old x64 processor, but one that has SSE (i.e., something
diff --git a/src/jsonparser.cpp b/src/jsonparser.cpp
index 7f82471a..be17b069 100644
--- a/src/jsonparser.cpp
+++ b/src/jsonparser.cpp
@@ -14,10 +14,10 @@ int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool rea
 #ifdef __AVX2__
   json_parse_functype* avx_implementation = &json_parse_implementation<instruction_set::avx2>;
 #endif
-#ifdef __SSE4_2__
+#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
   json_parse_functype* sse4_2_implementation = &json_parse_implementation<instruction_set::sse4_2>;
 #endif
-#ifdef __ARM_NEON
+#if  defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
   json_parse_functype* neon_implementation = &json_parse_implementation<instruction_set::neon>;
 #endif
 
@@ -25,9 +25,9 @@ int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool rea
   // Should be done at runtime. Does not make any sense on preprocessor.
 #ifdef __AVX2__
   instruction_set best_implementation = instruction_set::avx2;
-#elif defined (__SSE4_2__)
+#elif defined (__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
   instruction_set best_implementation = instruction_set::sse4_2;
-#elif defined (__ARM_NEON)
+#elif defined (__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
   instruction_set best_implementation = instruction_set::neon;
 #else
   instruction_set best_implementation = instruction_set::none;
@@ -45,7 +45,7 @@ int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool rea
     json_parse_ptr = sse4_2_implementation;
     break;
 #endif
-#ifdef __ARM_NEON
+#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
   case instruction_set::neon :
     json_parse_ptr = neon_implementation;
     break;

From 0c2f58e40c25c9cd9c5632994183efbdb22ee88c Mon Sep 17 00:00:00 2001
From: Daniel Lemire <lemire@gmail.com>
Date: Thu, 4 Jul 2019 17:58:45 -0400
Subject: [PATCH 6/9] Extending the no-avx tests on circleci.

---
 .circleci/config.yml | 78 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index e73a9654..86889c0b 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -38,6 +38,44 @@ jobs:
             cd build
             make test
 
+  "gccnoavx":
+    docker:
+      - image: ubuntu:18.04
+        environment:
+          CXX: g++-7
+    steps:
+      - checkout
+
+      - run: apt-get update -qq
+      - run: >
+          apt-get install -y
+          build-essential
+          cmake
+          g++-7
+          git
+
+      - run:
+          name: Building (gcc)
+          command: ARCHFLAGS="-march=nehalem" make
+
+      - run:
+          name: Running tests (gcc)
+          command: ARCHFLAGS="-march=nehalem" make quiettest amalgamate
+
+      - run:
+          name: Building (gcc, cmake)
+          command: |
+            mkdir build
+            cd build
+            cmake -DSIMDJSON_DISABLE_AVX=on ..
+            make
+
+      - run:
+          name: Running tests (gcc, cmake)
+          command: |
+            cd build
+            make test
+
   "clang":
     docker:
       - image: ubuntu:18.04
@@ -76,9 +114,49 @@ jobs:
             cd build
             make test
 
+  "clangnoavx":
+    docker:
+      - image: ubuntu:18.04
+        environment:
+          CXX: clang++-6.0
+    steps:
+      - checkout
+
+      - run: apt-get update -qq
+      - run: >
+          apt-get install -y
+          build-essential
+          cmake
+          clang-6.0
+          git
+
+      - run:
+          name: Building (clang)
+          command: ARCHFLAGS="-march=nehalem"  make
+
+      - run:
+          name: Running tests (clang)
+          command: ARCHFLAGS="-march=nehalem"  make quiettest amalgamate
+
+      - run:
+          name: Building (clang, cmake)
+          command: |
+            mkdir build
+            cd build
+            cmake -DSIMDJSON_DISABLE_AVX=on ..
+            make
+
+      - run:
+          name: Running tests (clang, cmake)
+          command: |
+            cd build
+            make test
+
 workflows:
   version: 2
   build_and_test:
     jobs:
       - "clang"
       - "gcc"
+      - "clangnoavx"
+      - "gccnoavx"
\ No newline at end of file

From d7b9a29dc6f0ff73c345f19aa71245debd2d2749 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <lemire@gmail.com>
Date: Thu, 4 Jul 2019 19:10:05 -0400
Subject: [PATCH 7/9] Adding comments.

---
 CMakeLists.txt | 1 +
 Makefile       | 1 +
 2 files changed, 2 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 88e9d1d1..d2da0f6a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,6 +5,7 @@ if(ltoresult)
 set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
 endif()
 
+# usage: cmake -DSIMDJSON_DISABLE_AVX=on ..
 option(SIMDJSON_DISABLE_AVX "Forcefully disable AVX even if hardware supports it" OFF)
 	
 set(CMAKE_CXX_STANDARD 17)
diff --git a/Makefile b/Makefile
index a99cf7b0..e9edf55a 100644
--- a/Makefile
+++ b/Makefile
@@ -13,6 +13,7 @@ architecture:=$(shell arch)
 ####
 # If you want to specify your own target architecture,
 # then define ARCHFLAGS. Otherwise, we set good default.
+# E.g., type ' ARCHFLAGS="-march=nehalem" make parse '
 ###
 ifeq ($(architecture),aarch64)
 ARCHFLAGS ?= -march=armv8-a+crc+crypto

From b0d9c074e1179d6ff6ef915c8f02259d20fe6cef Mon Sep 17 00:00:00 2001
From: ioioioio <iodadi@gmail.com>
Date: Fri, 5 Jul 2019 11:09:28 -0400
Subject: [PATCH 8/9] check_utf8_helper has a more meaningful name

---
 include/simdjson/stage1_find_marks.h | 75 ++++++++++++++--------------
 1 file changed, 37 insertions(+), 38 deletions(-)

diff --git a/include/simdjson/stage1_find_marks.h b/include/simdjson/stage1_find_marks.h
index 65525557..e707d624 100644
--- a/include/simdjson/stage1_find_marks.h
+++ b/include/simdjson/stage1_find_marks.h
@@ -191,56 +191,56 @@ uint64_t compute_quote_mask<instruction_set::neon>(uint64_t quote_bits) {
 #endif // SIMDJSON_AVOID_CLMUL
 
 #ifdef SIMDJSON_UTF8VALIDATE
-// some hack to bypass the impossibily to overload the check_utf8() specialized template
+// Holds the state required to perform check_utf8().
 template<instruction_set>
-struct check_utf8_helper;
+struct utf8_checking_state;
 
 #ifdef __AVX2__
 template<>
-struct check_utf8_helper<instruction_set::avx2>
+struct utf8_checking_state<instruction_set::avx2>
 {
   __m256i has_error = _mm256_setzero_si256();
   avx_processed_utf_bytes previous {
-    _mm256_setzero_si256(),
-    _mm256_setzero_si256(),
-    _mm256_setzero_si256()
+    _mm256_setzero_si256(), // rawbytes
+    _mm256_setzero_si256(), // high_nibbles
+    _mm256_setzero_si256()  // carried_continuations
   };
 };
 #endif
 
 #if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
 template<>
-struct check_utf8_helper<instruction_set::sse4_2>
+struct utf8_checking_state<instruction_set::sse4_2>
 {
   __m128i has_error = _mm_setzero_si128();
   processed_utf_bytes previous {
-    _mm_setzero_si128(),
-    _mm_setzero_si128(),
-    _mm_setzero_si128()
+    _mm_setzero_si128(), // rawbytes
+    _mm_setzero_si128(), // high_nibbles
+    _mm_setzero_si128()  // carried_continuations
   };
 };
 #endif
 
 template<instruction_set T>
-void check_utf8(simd_input<T> in, check_utf8_helper<T>& helper);
+void check_utf8(simd_input<T> in, utf8_checking_state<T>& state);
 
 #ifdef __AVX2__
 template<> really_inline
 void check_utf8<instruction_set::avx2>(simd_input<instruction_set::avx2> in,
-                check_utf8_helper<instruction_set::avx2>& helper) {
+                utf8_checking_state<instruction_set::avx2>& state) {
   __m256i highbit = _mm256_set1_epi8(0x80);
   if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), highbit)) == 1) {
     // it is ascii, we just check continuation
-    helper.has_error = _mm256_or_si256(
+    state.has_error = _mm256_or_si256(
         _mm256_cmpgt_epi8(
-            helper.previous.carried_continuations,
+            state.previous.carried_continuations,
             _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
                              9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)),
-        helper.has_error);
+        state.has_error);
   } else {
     // it is not ascii so we have to do heavy work
-    helper.previous = avxcheckUTF8Bytes(in.lo, &(helper.previous), &(helper.has_error));
-    helper.previous = avxcheckUTF8Bytes(in.hi, &(helper.previous), &(helper.has_error));
+    state.previous = avxcheckUTF8Bytes(in.lo, &(state.previous), &(state.has_error));
+    state.previous = avxcheckUTF8Bytes(in.hi, &(state.previous), &(state.has_error));
   }
 }
 #endif //__AVX2__
@@ -248,51 +248,51 @@ void check_utf8<instruction_set::avx2>(simd_input<instruction_set::avx2> in,
 #if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
 template<> really_inline
 void check_utf8<instruction_set::sse4_2>(simd_input<instruction_set::sse4_2> in,
-                check_utf8_helper<instruction_set::sse4_2>& helper) {
+                utf8_checking_state<instruction_set::sse4_2>& state) {
   __m128i highbit = _mm_set1_epi8(0x80);
   if ((_mm_testz_si128(_mm_or_si128(in.v0, in.v1), highbit)) == 1) {
     // it is ascii, we just check continuation
-    helper.has_error = _mm_or_si128(
+    state.has_error = _mm_or_si128(
         _mm_cmpgt_epi8(
-            helper.previous.carried_continuations,
+            state.previous.carried_continuations,
             _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)),
-        helper.has_error);
+        state.has_error);
   } else {
     // it is not ascii so we have to do heavy work
-    helper.previous = checkUTF8Bytes(in.v0, &(helper.previous), &(helper.has_error));
-    helper.previous = checkUTF8Bytes(in.v1, &(helper.previous), &(helper.has_error));
+    state.previous = checkUTF8Bytes(in.v0, &(state.previous), &(state.has_error));
+    state.previous = checkUTF8Bytes(in.v1, &(state.previous), &(state.has_error));
   }
 
   if ((_mm_testz_si128(_mm_or_si128(in.v2, in.v3), highbit)) == 1) {
     // it is ascii, we just check continuation
-    helper.has_error = _mm_or_si128(
+    state.has_error = _mm_or_si128(
             _mm_cmpgt_epi8(
-                    helper.previous.carried_continuations,
+                    state.previous.carried_continuations,
                     _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)),
-            helper.has_error);
+            state.has_error);
   } else {
     // it is not ascii so we have to do heavy work
-    helper.previous = checkUTF8Bytes(in.v2, &(helper.previous), &(helper.has_error));
-    helper.previous = checkUTF8Bytes(in.v3, &(helper.previous), &(helper.has_error));
+    state.previous = checkUTF8Bytes(in.v2, &(state.previous), &(state.has_error));
+    state.previous = checkUTF8Bytes(in.v3, &(state.previous), &(state.has_error));
   }
 }
 #endif // __SSE4_2
 
 // Checks if the utf8 validation has found any error.
 template<instruction_set T>
-errorValues check_utf8_errors(check_utf8_helper<T>& helper);
+errorValues check_utf8_errors(utf8_checking_state<T>& state);
 
 #ifdef __AVX2__
 template<> really_inline
-errorValues check_utf8_errors<instruction_set::avx2>(check_utf8_helper<instruction_set::avx2>& helper) {
-  return _mm256_testz_si256(helper.has_error, helper.has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
+errorValues check_utf8_errors<instruction_set::avx2>(utf8_checking_state<instruction_set::avx2>& state) {
+  return _mm256_testz_si256(state.has_error, state.has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
 }
 #endif
 
 #if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
 template<> really_inline
-errorValues check_utf8_errors<instruction_set::sse4_2>(check_utf8_helper<instruction_set::sse4_2>& helper) {
-  return _mm_testz_si128(helper.has_error, helper.has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
+errorValues check_utf8_errors<instruction_set::sse4_2>(utf8_checking_state<instruction_set::sse4_2>& state) {
+  return _mm_testz_si128(state.has_error, state.has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
 }
 #endif
 #endif // SIMDJSON_UTF8VALIDATE
@@ -368,7 +368,6 @@ uint64_t cmp_mask_against_input<instruction_set::sse4_2>(simd_input<instruction_
   __m128i cmp_res_3 = _mm_cmpeq_epi8(in.v3, mask);
   uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
   return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
-  return res_0 | (res_1 << 32);
 }
 #endif
 
@@ -941,7 +940,7 @@ WARN_UNUSED
   uint32_t *base_ptr = pj.structural_indexes;
   uint32_t base = 0;
 #ifdef SIMDJSON_UTF8VALIDATE
-  check_utf8_helper<T> helper;
+  utf8_checking_state<T> state;
 #endif
 
   // we have padded the input out to 64 byte multiple with the remainder being
@@ -977,7 +976,7 @@ WARN_UNUSED
 #endif
     simd_input<T> in = fill_input<T>(buf+idx);
 #ifdef SIMDJSON_UTF8VALIDATE
-    check_utf8<T>(in, helper);
+    check_utf8<T>(in, state);
 #endif
     // detect odd sequences of backslashes
     uint64_t odd_ends = find_odd_backslash_sequences<T>(
@@ -1012,7 +1011,7 @@ WARN_UNUSED
     memcpy(tmpbuf, buf + idx, len - idx);
     simd_input<T> in = fill_input<T>(tmpbuf);
 #ifdef SIMDJSON_UTF8VALIDATE
-    check_utf8<T>(in, helper);
+    check_utf8<T>(in, state);
 #endif
 
     // detect odd sequences of backslashes
@@ -1069,7 +1068,7 @@ WARN_UNUSED
     return simdjson::UNESCAPED_CHARS;
   }
 #ifdef SIMDJSON_UTF8VALIDATE
-  return check_utf8_errors<T>(helper);
+  return check_utf8_errors<T>(state);
 #else
   return simdjson::SUCCESS;
 #endif

From a1f692408d9a3846e513ee63d57e6c18533c092a Mon Sep 17 00:00:00 2001
From: ioioioio <iodadi@gmail.com>
Date: Fri, 5 Jul 2019 11:38:32 -0400
Subject: [PATCH 9/9] Adding Sunny Gleason to the contributors list

---
 CONTRIBUTORS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index 35fb3b30..9b11cbec 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -20,5 +20,6 @@ Tom Dyson
 Ihor Dotsenko
 Alexey Milovidov
 Chang Liu
+Sunny Gleason
 # if you have contributed to the project and your name does not 
 # appear in this list, please let us know!