Merge pull request #202 from lemire/sse_integration

SSE integration (PR#139)
This commit is contained in:
ioioioio 2019-07-05 12:25:06 -04:00 committed by GitHub
commit 3bd3116cf8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
17 changed files with 572 additions and 87 deletions

View File

@ -9,9 +9,14 @@ clone_folder: c:\projects\simdjson
platform:
- x64
environment:
matrix:
- AVXFLAG: "OFF"
- AVXFLAG: "ON"
build_script:
- mkdir build
- cd build
- ps: cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..
- ps: cmake -DSIMDJSON_DISABLE_AVX="$env:AVXFLAG" -DCMAKE_GENERATOR_PLATFORM=x64 ..
- cmake --build .
- ctest --verbose

View File

@ -38,6 +38,44 @@ jobs:
cd build
make test
"gccnoavx":
docker:
- image: ubuntu:18.04
environment:
CXX: g++-7
steps:
- checkout
- run: apt-get update -qq
- run: >
apt-get install -y
build-essential
cmake
g++-7
git
- run:
name: Building (gcc)
command: ARCHFLAGS="-march=nehalem" make
- run:
name: Running tests (gcc)
command: ARCHFLAGS="-march=nehalem" make quiettest amalgamate
- run:
name: Building (gcc, cmake)
command: |
mkdir build
cd build
cmake -DSIMDJSON_DISABLE_AVX=on ..
make
- run:
name: Running tests (gcc, cmake)
command: |
cd build
make test
"clang":
docker:
- image: ubuntu:18.04
@ -76,9 +114,49 @@ jobs:
cd build
make test
"clangnoavx":
docker:
- image: ubuntu:18.04
environment:
CXX: clang++-6.0
steps:
- checkout
- run: apt-get update -qq
- run: >
apt-get install -y
build-essential
cmake
clang-6.0
git
- run:
name: Building (clang)
command: ARCHFLAGS="-march=nehalem" make
- run:
name: Running tests (clang)
command: ARCHFLAGS="-march=nehalem" make quiettest amalgamate
- run:
name: Building (clang, cmake)
command: |
mkdir build
cd build
cmake -DSIMDJSON_DISABLE_AVX=on ..
make
- run:
name: Running tests (clang, cmake)
command: |
cd build
make test
workflows:
version: 2
build_and_test:
jobs:
- "clang"
- "gcc"
- "clangnoavx"
- "gccnoavx"

View File

@ -19,3 +19,6 @@ script:
- make test
- make clean
- make SANITIZEGOLD=1 test
- make clean
- ARCHFLAGS="-march=nehalem" make
- ARCHFLAGS="-march=nehalem" make test

View File

@ -5,6 +5,9 @@ if(ltoresult)
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
endif()
# usage: cmake -DSIMDJSON_DISABLE_AVX=on ..
option(SIMDJSON_DISABLE_AVX "Forcefully disable AVX even if hardware supports it" OFF)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_MACOSX_RPATH OFF)

View File

@ -20,5 +20,6 @@ Tom Dyson
Ihor Dotsenko
Alexey Milovidov
Chang Liu
Sunny Gleason
# if you have contributed to the project and your name does not
# appear in this list, please let us know!

View File

@ -9,16 +9,22 @@ COREDEPSINCLUDE = -Idependencies/rapidjson/include -Idependencies/sajson/include
EXTRADEPSINCLUDE = -Idependencies/jsoncppdist -Idependencies/json11 -Idependencies/fastjson/src -Idependencies/fastjson/include -Idependencies/gason/src -Idependencies/ujson4c/3rdparty -Idependencies/ujson4c/src
# users can provide their own additional flags with make EXTRAFLAGS=something
architecture:=$(shell arch)
CXXFLAGS = -std=c++17 -Wall -Wextra -Wshadow -Iinclude -Ibenchmark/linux $(EXTRAFLAGS)
CFLAGS = -Idependencies/ujson4c/3rdparty -Idependencies/ujson4c/src $(EXTRAFLAGS)
####
# If you want to specify your own target architecture,
# then define ARCHFLAGS. Otherwise, we set good default.
# E.g., type ' ARCHFLAGS="-march=nehalem" make parse '
###
ifeq ($(architecture),aarch64)
CXXFLAGS += -march=armv8-a+crc+crypto
CFLAGS += -march=armv8-a+crc+crypto
ARCHFLAGS ?= -march=armv8-a+crc+crypto
else
CXXFLAGS += -march=native
CFLAGS += -march=native
ARCHFLAGS ?= -march=native
endif
CXXFLAGS = $(ARCHFLAGS) -std=c++17 -Wall -Wextra -Wshadow -Iinclude -Ibenchmark/linux $(EXTRAFLAGS)
CFLAGS = $(ARCHFLAGS) -Idependencies/ujson4c/3rdparty -Idependencies/ujson4c/src $(EXTRAFLAGS)
# This is a convenience flag
ifdef SANITIZEGOLD
SANITIZE = 1

View File

@ -52,7 +52,7 @@ On a Skylake processor, the parsing speeds (in GB/s) of various processors on th
## Requirements
- We support platforms like Linux or macOS, as well as Windows through Visual Studio 2017 or later.
- A processor with AVX2 (i.e., Intel processors starting with the Haswell microarchitecture released 2013 and AMD processors starting with the Zen microarchitecture released 2017).
- A processor with AVX2 (i.e., Intel processors starting with the Haswell microarchitecture released 2013 and AMD processors starting with the Zen microarchitecture released 2017) or at least SSE 4.2 (i.e., Intel processors going back to Nehalem released in 2008 or AMD processors starting with the Jaguar used in the PS4 and Xbox One).
- A recent C++ compiler (e.g., GNU GCC or LLVM CLANG or Visual Studio 2017), we assume C++17. GNU GCC 7 or better or LLVM's clang 6 or better.
- Some benchmark scripts assume bash and other common utilities, but they are optional.
@ -169,7 +169,7 @@ int main(int argc, char *argv[]) {
}
```
We require hardware support for AVX2 instructions. You have to make sure that you instruct your
On Intel and AMD processors, we get best performance by using the hardware support for AVX2 instructions. You have to make sure that you instruct your
compiler to use these instructions as needed. Under compilers such as GNU GCC or LLVM clang, the
flag `-march=native` used on a recent Intel processor (Haswell or better) is sufficient. For portability
of the binary files you can also specify directly the Haswell processor (`-march=haswell`). You may
@ -261,14 +261,15 @@ make test
## Usage (CMake on Windows using Visual Studio)
We assume you have a common Windows PC with at least Visual Studio 2017 and an x64 processor with AVX2 support (2013 Intel Haswell or later).
We assume you have a common Windows PC with at least Visual Studio 2017 and an x64 processor with AVX2 support (2013 Intel Haswell or later) or SSE 4.2 (2008 Nehalem or later).
- Grab the simdjson code from GitHub, e.g., by cloning it using [GitHub Desktop](https://desktop.github.com/).
- Install [CMake](https://cmake.org/download/). When you install it, make sure to ask that `cmake` be made available from the command line. Please choose a recent version of cmake.
- Create a subdirectory within simdjson, such as `VisualStudio`.
- Using a shell, go to this newly created directory.
- Type `cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..` in the shell while in the `VisualStudio` repository. (Alternatively, if you want to build a DLL, you may use the command line `cmake -DCMAKE_GENERATOR_PLATFORM=x64 -DSIMDJSON_BUILD_STATIC=OFF ..`.)
- This last command created a Visual Studio solution file in the newly created directory (e.g., `simdjson.sln`). Open this file in Visual Studio. You should now be able to build the project and run the tests. For example, in the `Solution Explorer` window (available from the `View` menu), right-click `ALL_BUILD` and select `Build`. To test the code, still in the `Solution Explorer` window, select `RUN_TESTS` and select `Build`.
- Type `cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..` in the shell while in the `VisualStudio` repository. (Alternatively, if you want to build a DLL, you may use the command line `cmake -DCMAKE_GENERATOR_PLATFORM=x64 -DSIMDJSON_BUILD_STATIC=OFF ..`.) This will build the code with AVX2 instructions. If your target processor does not support AVX2, you need to replace `cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..` by `cmake -DSIMDJSON_DISABLE_AVX=on -DCMAKE_GENERATOR_PLATFORM=x64 ..` . That is, you need to set the flag to forcefully disable AVX support since we compile with AVX2 instructions *by default*.
- This last command (`cmake ...`) created a Visual Studio solution file in the newly created directory (e.g., `simdjson.sln`). Open this file in Visual Studio. You should now be able to build the project and run the tests. For example, in the `Solution Explorer` window (available from the `View` menu), right-click `ALL_BUILD` and select `Build`. To test the code, still in the `Solution Explorer` window, select `RUN_TESTS` and select `Build`.
## Usage (Using `vcpkg` on Windows, Linux and MacOS)

View File

@ -5,9 +5,11 @@
#include <cstdint>
namespace simdjson {
// Take input from buf and remove useless whitespace, write it to out; buf and
// out can be the same pointer. Result is null terminated,
// return the string length (minus the null termination).
// The accelerated version of this function only runs on AVX2 hardware.
size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out);

View File

@ -114,7 +114,7 @@ is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) {
return structural_or_whitespace_or_exponent_or_decimal_negated[c];
}
#ifdef __AVX2__
#if defined (__AVX2__) || defined (__SSE4_2__)
#define SWAR_NUMBER_PARSING
#endif

View File

@ -40,7 +40,7 @@ static inline int hamming(uint64_t input_num) {
#include <cstdint>
#include <cstdlib>
#if defined(__BMI2__) || defined(__POPCOUNT__) || defined(__AVX2__)
#if defined(__BMI2__) || defined(__POPCOUNT__) || defined(__AVX2__) || defined(__SSE4_2__)
#include <x86intrin.h>
#endif
namespace simdjson {

View File

@ -12,7 +12,7 @@ enum class instruction_set {
// the 'native' enum class value should point at a good default on the current machine
#ifdef __AVX2__
native = avx2
#elif defined(__ARM_NEON)
#elif defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
native = neon
#else
// Let us assume that we have an old x64 processor, but one that has SSE (i.e., something

View File

@ -27,6 +27,138 @@
// all byte values must be no larger than 0xF4
namespace simdjson {
// Flag every byte value strictly larger than 0xF4: no byte of a valid
// UTF-8 stream may exceed 0xF4. Errors accumulate into *has_error.
static inline void checkSmallerThan0xF4(__m128i current_bytes,
                                        __m128i *has_error) {
  const __m128i max_byte = _mm_set1_epi8(0xF4);
  // Saturating unsigned subtraction: lanes <= 0xF4 become zero, anything
  // above leaves a nonzero residue that is OR-ed into the error register.
  __m128i residue = _mm_subs_epu8(current_bytes, max_byte);
  *has_error = _mm_or_si128(*has_error, residue);
}
// Map each byte's high nibble to the length of the UTF-8 sequence it
// would start: ASCII -> 1, continuation byte -> 0, 2/3/4-byte leading
// bytes -> 2/3/4. (Out-of-range leads such as 0xF5+ also map to 4 here;
// they are rejected separately by checkSmallerThan0xF4.)
static inline __m128i continuationLengths(__m128i high_nibbles) {
  return _mm_shuffle_epi8(
      _mm_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII)
                    0, 0, 0, 0,             // 10xx (continuation)
                    2, 2,                   // 110x
                    3,                      // 1110
                    4), // 1111, next should be 0 (not checked here)
      high_nibbles);
}
// Propagate expected-continuation counts forward: a leading byte of
// length N must be followed by N-1 continuation positions. Two rounds of
// byte-shift (via alignr) plus saturating subtraction cover sequences up
// to 4 bytes; previous_carries carries the counts that spill over from
// the preceding 16-byte block.
static inline __m128i carryContinuations(__m128i initial_lengths,
                                         __m128i previous_carries) {
  // Lengths shifted right by one byte position, decremented (saturating).
  __m128i right1 =
      _mm_subs_epu8(_mm_alignr_epi8(initial_lengths, previous_carries, 16 - 1),
                    _mm_set1_epi8(1));
  __m128i sum = _mm_add_epi8(initial_lengths, right1);
  // Partial sums shifted right by two positions, decremented by two.
  __m128i right2 = _mm_subs_epu8(_mm_alignr_epi8(sum, previous_carries, 16 - 2),
                                 _mm_set1_epi8(2));
  return _mm_add_epi8(sum, right2);
}
// Verify that continuation bytes sit exactly where the leading bytes
// promised them. A position is invalid when it has a pending carry on
// top of a fresh leading byte (overlap), or when it should be a
// continuation but no carry reaches it (underlap). Both conditions
// collapse to: (carries > lengths) == (lengths > 0).
static inline void checkContinuations(__m128i initial_lengths, __m128i carries,
                                      __m128i *has_error) {
  __m128i carry_exceeds_len = _mm_cmpgt_epi8(carries, initial_lengths);
  __m128i len_positive = _mm_cmpgt_epi8(initial_lengths, _mm_setzero_si128());
  // Lanes where the two masks agree are the invalid positions.
  __m128i bad_lanes = _mm_cmpeq_epi8(carry_exceeds_len, len_positive);
  *has_error = _mm_or_si128(*has_error, bad_lanes);
}
// when 0xED is found, next byte must be no larger than 0x9F
// when 0xF4 is found, next byte must be no larger than 0x8F
// next byte must be continuation, ie sign bit is set, so signed < is ok
static inline void checkFirstContinuationMax(__m128i current_bytes,
                                             __m128i off1_current_bytes,
                                             __m128i *has_error) {
  // Lanes whose previous byte was 0xED / 0xF4.
  __m128i is_ed = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xED));
  __m128i is_f4 = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xF4));
  // Signed greater-than is valid here because the follower is expected
  // to be a continuation byte, whose sign bit is set.
  __m128i above_9f = _mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x9F));
  __m128i above_8f = _mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x8F));
  __m128i bad = _mm_or_si128(_mm_and_si128(above_9f, is_ed),
                             _mm_and_si128(above_8f, is_f4));
  *has_error = _mm_or_si128(*has_error, bad);
}
// Detect overlong encodings: for each high nibble of the previous byte,
// look up the minimum legal value of that lead byte and of the byte that
// follows it; an error requires BOTH to be below their minima.
// map off1_hibits => error condition
// hibits     off1    cur
// C       => < C2 && true
// E       => < E1 && < A0
// F       => < F1 && < 90
// else      false && false
static inline void checkOverlong(__m128i current_bytes,
                                 __m128i off1_current_bytes, __m128i hibits,
                                 __m128i previous_hibits, __m128i *has_error) {
  // High nibble of the previous byte at each position (crossing blocks).
  __m128i off1_hibits = _mm_alignr_epi8(hibits, previous_hibits, 16 - 1);
  // Minimum legal value of the lead byte itself, keyed by its nibble.
  __m128i initial_mins = _mm_shuffle_epi8(
      _mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
                    -128, -128, // 10xx => false
                    0xC2, -128, // 110x
                    0xE1,       // 1110
                    0xF1),
      off1_hibits);
  __m128i initial_under = _mm_cmpgt_epi8(initial_mins, off1_current_bytes);
  // Minimum legal value of the byte following the lead, keyed the same way.
  __m128i second_mins = _mm_shuffle_epi8(
      _mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
                    -128, -128, // 10xx => false
                    127, 127,   // 110x => true
                    0xA0,       // 1110
                    0x90),
      off1_hibits);
  __m128i second_under = _mm_cmpgt_epi8(second_mins, current_bytes);
  // Only the conjunction of both "too small" conditions is an error.
  *has_error =
      _mm_or_si128(*has_error, _mm_and_si128(initial_under, second_under));
}
// State carried from one 16-byte block of checkUTF8Bytes to the next.
struct processed_utf_bytes {
  __m128i rawbytes;              // the 16 input bytes, unmodified
  __m128i high_nibbles;          // per-byte high nibble (byte >> 4)
  __m128i carried_continuations; // pending continuation counts per lane
};
// Record the raw input bytes and extract each byte's high nibble into
// answer, priming the state used by the per-block UTF-8 checks.
static inline void count_nibbles(__m128i bytes,
                                 struct processed_utf_bytes *answer) {
  answer->rawbytes = bytes;
  // No 8-bit shift exists; shift 16-bit lanes and mask off the bits that
  // bled in from the neighboring byte.
  __m128i shifted = _mm_srli_epi16(bytes, 4);
  answer->high_nibbles = _mm_and_si128(shifted, _mm_set1_epi8(0x0F));
}
// check whether the current bytes are valid UTF-8
// at the end of the function, previous gets updated
// Runs every per-block check on 16 input bytes: byte-range (<= 0xF4),
// continuation placement, ED/F4 follower limits, and overlong forms.
// Errors only accumulate into *has_error; the caller inspects the error
// register once at the end of the input.
static struct processed_utf_bytes
checkUTF8Bytes(__m128i current_bytes, struct processed_utf_bytes *previous,
               __m128i *has_error) {
  struct processed_utf_bytes pb;
  count_nibbles(current_bytes, &pb);
  checkSmallerThan0xF4(current_bytes, has_error);
  __m128i initial_lengths = continuationLengths(pb.high_nibbles);
  pb.carried_continuations =
      carryContinuations(initial_lengths, previous->carried_continuations);
  checkContinuations(initial_lengths, pb.carried_continuations, has_error);
  // The previous byte at each position, crossing the block boundary.
  __m128i off1_current_bytes =
      _mm_alignr_epi8(pb.rawbytes, previous->rawbytes, 16 - 1);
  checkFirstContinuationMax(current_bytes, off1_current_bytes, has_error);
  checkOverlong(current_bytes, off1_current_bytes, pb.high_nibbles,
                previous->high_nibbles, has_error);
  return pb;
}
#ifdef __AVX2__
/*****************************/
static inline __m256i push_last_byte_of_a_to_b(__m256i a, __m256i b) {
@ -190,8 +322,6 @@ avxcheckUTF8Bytes(__m256i current_bytes,
return pb;
}
#else // __AVX2__
#warning "We require AVX2 support!"
#endif // __AVX2__
}
#endif

View File

@ -6,22 +6,21 @@
#include "simdjson/parsedjson.h"
#include "simdjson/portability.h"
#ifdef __AVX2__
#if defined (__AVX2__) || defined (__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
#ifndef SIMDJSON_SKIPUTF8VALIDATION
#define SIMDJSON_UTF8VALIDATE
#endif
#else
// currently we don't UTF8 validate for ARM
// also we assume that if you're not __AVX2__
// you're ARM, which is a bit dumb. TODO: Fix...
#ifdef __ARM_NEON
#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
#include <arm_neon.h>
#else
#warning It appears that neither ARM NEON nor AVX2 are detected.
#endif // __ARM_NEON
#endif // __AVX2__
#endif // (__AVX2__) || (__SSE4_2__)
// It seems that many parsers do UTF-8 validation.
// RapidJSON does not do it by default, but a flag
@ -35,6 +34,7 @@
namespace simdjson {
template<instruction_set>
struct simd_input;
#ifdef __AVX2__
template<>
struct simd_input<instruction_set::avx2>
@ -44,7 +44,18 @@ struct simd_input<instruction_set::avx2>
};
#endif
#ifdef __ARM_NEON
#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
template<>
struct simd_input<instruction_set::sse4_2>
{
__m128i v0;
__m128i v1;
__m128i v2;
__m128i v3;
};
#endif
#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
template<> struct simd_input<instruction_set::neon>
{
#ifndef TRANSPOSE
@ -58,7 +69,7 @@ template<> struct simd_input<instruction_set::neon>
};
#endif
#ifdef __ARM_NEON
#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
really_inline
uint16_t neonmovemask(uint8x16_t input) {
const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
@ -114,6 +125,19 @@ uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16
template<instruction_set T>
uint64_t compute_quote_mask(uint64_t quote_bits);
namespace {
// for when clmul is unavailable: software prefix-XOR. For every bit
// position i, the result bit is the XOR (parity) of input bits 0..i —
// the same value a carry-less multiply by all-ones would produce.
[[maybe_unused]] uint64_t portable_compute_quote_mask(uint64_t quote_bits) {
  uint64_t mask = quote_bits;
  // log2(64) doubling steps of the classic parallel-prefix pattern.
  for (unsigned shift = 1; shift < 64; shift *= 2) {
    mask ^= mask << shift;
  }
  return mask;
}
}
// In practice, if you have NEON or __PCLMUL__, you would
// always want to use them, but it might be useful, for research
// purposes, to disable it willingly, that's what SIMDJSON_AVOID_CLMUL
@ -122,15 +146,8 @@ uint64_t compute_quote_mask(uint64_t quote_bits);
// where clmul is not supported, so check for both, to be sure.
#ifdef SIMDJSON_AVOID_CLMUL
template<instruction_set T> really_inline
uint64_t compute_quote_mask(uint64_t quote_bits)
{
uint64_t quote_mask = quote_bits ^ (quote_bits << 1);
quote_mask = quote_mask ^ (quote_mask << 2);
quote_mask = quote_mask ^ (quote_mask << 4);
quote_mask = quote_mask ^ (quote_mask << 8);
quote_mask = quote_mask ^ (quote_mask << 16);
quote_mask = quote_mask ^ (quote_mask << 32);
return quote_mask;
uint64_t compute_quote_mask(uint64_t quote_bits) {
return portable_compute_quote_mask(quote_bits);
}
#else
template<instruction_set>
@ -139,49 +156,147 @@ uint64_t compute_quote_mask(uint64_t quote_bits);
#ifdef __AVX2__
template<> really_inline
uint64_t compute_quote_mask<instruction_set::avx2>(uint64_t quote_bits) {
  // There should be no such thing as a processor supporting avx2
  // but not clmul: carry-less multiply by all-ones computes the
  // prefix-XOR of quote_bits in one instruction.
  uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
      _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
  return quote_mask;
}
#endif
#ifdef __ARM_NEON
#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
template<> really_inline
uint64_t compute_quote_mask<instruction_set::neon>(uint64_t quote_bits) {
#ifdef __PCLMUL__ // Might cause problems on runtime dispatch
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
_mm_set_epi64x(0ULL, quote_bits),
_mm_set1_epi8(0xFF), 0));
uint64_t compute_quote_mask<instruction_set::sse4_2>(uint64_t quote_bits) {
// CLMUL is supported on some SSE42 hardware such as Sandy Bridge,
// but not on others.
#ifdef __PCLMUL__
return _mm_cvtsi128_si64(_mm_clmulepi64_si128(
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
#else
uint64_t quote_mask = vmull_p64( -1ULL, quote_bits);
return portable_compute_quote_mask(quote_bits);
#endif
return quote_mask;
}
#endif
#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
template<> really_inline
uint64_t compute_quote_mask<instruction_set::neon>(uint64_t quote_bits) {
#ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension
  // Polynomial (carry-less) multiply by all-ones == prefix-XOR.
  return vmull_p64( -1ULL, quote_bits);
#else
  return portable_compute_quote_mask(quote_bits);
#endif
}
#endif
#endif // SIMDJSON_AVOID_CLMUL
#ifdef SIMDJSON_UTF8VALIDATE
template<instruction_set T>really_inline
void check_utf8(simd_input<T> in,
__m256i &has_error,
struct avx_processed_utf_bytes &previous) {
// Holds the state required to perform check_utf8().
template<instruction_set>
struct utf8_checking_state;
#ifdef __AVX2__
template<>
struct utf8_checking_state<instruction_set::avx2>
{
  // Accumulated error lanes; nonzero anywhere means invalid UTF-8.
  __m256i has_error = _mm256_setzero_si256();
  // UTF-8 state carried from the previous 32-byte block.
  avx_processed_utf_bytes previous {
    _mm256_setzero_si256(), // rawbytes
    _mm256_setzero_si256(), // high_nibbles
    _mm256_setzero_si256()  // carried_continuations
  };
};
#endif
#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
template<>
struct utf8_checking_state<instruction_set::sse4_2>
{
  // Accumulated error lanes; nonzero anywhere means invalid UTF-8.
  __m128i has_error = _mm_setzero_si128();
  // UTF-8 state carried from the previous 16-byte block.
  processed_utf_bytes previous {
    _mm_setzero_si128(), // rawbytes
    _mm_setzero_si128(), // high_nibbles
    _mm_setzero_si128()  // carried_continuations
  };
};
#endif
template<instruction_set T>
void check_utf8(simd_input<T> in, utf8_checking_state<T>& state);
#ifdef __AVX2__
template<> really_inline
void check_utf8<instruction_set::avx2>(simd_input<instruction_set::avx2> in,
utf8_checking_state<instruction_set::avx2>& state) {
__m256i highbit = _mm256_set1_epi8(0x80);
if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), highbit)) == 1) {
// it is ascii, we just check continuation
has_error = _mm256_or_si256(
state.has_error = _mm256_or_si256(
_mm256_cmpgt_epi8(
previous.carried_continuations,
state.previous.carried_continuations,
_mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)),
has_error);
state.has_error);
} else {
// it is not ascii so we have to do heavy work
previous = avxcheckUTF8Bytes(in.lo, &previous, &has_error);
previous = avxcheckUTF8Bytes(in.hi, &previous, &has_error);
state.previous = avxcheckUTF8Bytes(in.lo, &(state.previous), &(state.has_error));
state.previous = avxcheckUTF8Bytes(in.hi, &(state.previous), &(state.has_error));
}
}
#endif //__AVX2__
#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
// Validate 64 input bytes (four 16-byte vectors) as UTF-8, carrying
// state across calls. Each 32-byte half takes an ASCII fast path (no
// high bit set) that only verifies no multi-byte sequence was left
// dangling by the previous bytes.
template<> really_inline
void check_utf8<instruction_set::sse4_2>(simd_input<instruction_set::sse4_2> in,
                                         utf8_checking_state<instruction_set::sse4_2>& state) {
  __m128i highbit = _mm_set1_epi8(0x80);
  if ((_mm_testz_si128(_mm_or_si128(in.v0, in.v1), highbit)) == 1) {
    // it is ascii, we just check continuation
    state.has_error = _mm_or_si128(
        _mm_cmpgt_epi8(
            state.previous.carried_continuations,
            _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)),
        state.has_error);
  } else {
    // it is not ascii so we have to do heavy work
    state.previous = checkUTF8Bytes(in.v0, &(state.previous), &(state.has_error));
    state.previous = checkUTF8Bytes(in.v1, &(state.previous), &(state.has_error));
  }
  if ((_mm_testz_si128(_mm_or_si128(in.v2, in.v3), highbit)) == 1) {
    // it is ascii, we just check continuation
    state.has_error = _mm_or_si128(
        _mm_cmpgt_epi8(
            state.previous.carried_continuations,
            _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)),
        state.has_error);
  } else {
    // it is not ascii so we have to do heavy work
    state.previous = checkUTF8Bytes(in.v2, &(state.previous), &(state.has_error));
    state.previous = checkUTF8Bytes(in.v3, &(state.previous), &(state.has_error));
  }
}
#endif // __SSE4_2__
// Checks if the utf8 validation has found any error.
template<instruction_set T>
errorValues check_utf8_errors(utf8_checking_state<T>& state);
#ifdef __AVX2__
// Collapse the accumulated AVX2 error vector into a final status code.
template<> really_inline
errorValues check_utf8_errors<instruction_set::avx2>(utf8_checking_state<instruction_set::avx2>& state) {
  // testz returns 0 when at least one error bit is set.
  return _mm256_testz_si256(state.has_error, state.has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
}
#endif
#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
// Collapse the accumulated SSE error vector into a final status code.
template<> really_inline
errorValues check_utf8_errors<instruction_set::sse4_2>(utf8_checking_state<instruction_set::sse4_2>& state) {
  // testz returns 0 when at least one error bit is set.
  return _mm_testz_si128(state.has_error, state.has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
}
#endif
#endif // SIMDJSON_UTF8VALIDATE
template<instruction_set T>
simd_input<T> fill_input(const uint8_t * ptr);
@ -195,7 +310,19 @@ simd_input<instruction_set::avx2> fill_input<instruction_set::avx2>(const uint8_
}
#endif
#ifdef __ARM_NEON
#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
// Load 64 bytes from ptr into four unaligned 128-bit registers.
template<> really_inline
simd_input<instruction_set::sse4_2> fill_input<instruction_set::sse4_2>(const uint8_t * ptr) {
  struct simd_input<instruction_set::sse4_2> in;
  in.v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 0));
  in.v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16));
  in.v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 32));
  in.v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 48));
  return in;
}
#endif
#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
template<> really_inline
simd_input<instruction_set::neon> fill_input<instruction_set::neon>(const uint8_t * ptr) {
struct simd_input<instruction_set::neon> in;
@ -219,7 +346,6 @@ uint64_t cmp_mask_against_input(simd_input<T> in, uint8_t m);
#ifdef __AVX2__
template<> really_inline
uint64_t cmp_mask_against_input<instruction_set::avx2>(simd_input<instruction_set::avx2> in, uint8_t m) {
const __m256i mask = _mm256_set1_epi8(m);
__m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask);
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
@ -229,7 +355,23 @@ uint64_t cmp_mask_against_input<instruction_set::avx2>(simd_input<instruction_se
}
#endif
#ifdef __ARM_NEON
#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
// Compare all 64 input bytes against byte value m; bit i of the result
// is set when input byte i equals m.
template<> really_inline
uint64_t cmp_mask_against_input<instruction_set::sse4_2>(simd_input<instruction_set::sse4_2> in, uint8_t m) {
  const __m128i mask = _mm_set1_epi8(m);
  __m128i cmp_res_0 = _mm_cmpeq_epi8(in.v0, mask);
  uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
  __m128i cmp_res_1 = _mm_cmpeq_epi8(in.v1, mask);
  uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
  __m128i cmp_res_2 = _mm_cmpeq_epi8(in.v2, mask);
  uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
  __m128i cmp_res_3 = _mm_cmpeq_epi8(in.v3, mask);
  uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
  // Each 16-bit movemask fills its own 16-bit slice of the 64-bit result.
  return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
}
#endif
#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
template<> really_inline
uint64_t cmp_mask_against_input<instruction_set::neon>(simd_input<instruction_set::neon> in, uint8_t m) {
const uint8x16_t mask = vmovq_n_u8(m);
@ -257,7 +399,23 @@ uint64_t unsigned_lteq_against_input<instruction_set::avx2>(simd_input<instructi
}
#endif
#ifdef __ARM_NEON
#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
// Unsigned byte-wise "<= m" over all 64 input bytes; bit i of the result
// is set when input byte i <= m. Uses max(m, x) == m  <=>  x <= m, since
// there is no unsigned byte compare instruction.
template<> really_inline
uint64_t unsigned_lteq_against_input<instruction_set::sse4_2>(simd_input<instruction_set::sse4_2> in, uint8_t m) {
  const __m128i maxval = _mm_set1_epi8(m);
  __m128i cmp_res_0 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v0),maxval);
  uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
  __m128i cmp_res_1 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v1),maxval);
  uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
  __m128i cmp_res_2 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v2),maxval);
  uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
  __m128i cmp_res_3 = _mm_cmpeq_epi8(_mm_max_epu8(maxval,in.v3),maxval);
  uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
  // Each 16-bit movemask fills its own 16-bit slice of the 64-bit result.
  return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
}
#endif
#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
template<> really_inline
uint64_t unsigned_lteq_against_input<instruction_set::neon>(simd_input<instruction_set::neon> in, uint8_t m) {
const uint8x16_t mask = vmovq_n_u8(m);
@ -447,9 +605,80 @@ void find_whitespace_and_structurals<instruction_set::avx2>(simd_input<instructi
whitespace = ~(ws_res_0 | (ws_res_1 << 32));
#endif // SIMDJSON_NAIVE_STRUCTURAL
}
#endif
#endif // __AVX2__
#ifdef __ARM_NEON
#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
// Classify all 64 input bytes with the "shufti" technique: a low-nibble
// table lookup AND-ed with a high-nibble table lookup gives each byte a
// class bitmap, in which bits 0x7 mark JSON structural characters and
// bits 0x18 mark whitespace. Produces one 64-bit mask for each class.
template<> really_inline
void find_whitespace_and_structurals<instruction_set::sse4_2>(simd_input<instruction_set::sse4_2> in,
                                                              uint64_t &whitespace,
                                                              uint64_t &structurals) {
  const __m128i low_nibble_mask = _mm_setr_epi8(
      16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
  const __m128i high_nibble_mask = _mm_setr_epi8(
      8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);
  __m128i structural_shufti_mask = _mm_set1_epi8(0x7);
  __m128i whitespace_shufti_mask = _mm_set1_epi8(0x18);
  // Per-byte class bitmaps. (No 8-bit shift exists, hence the 32-bit
  // shift followed by a 0x7f mask to drop the bled-in bits.)
  __m128i v_0 = _mm_and_si128(
      _mm_shuffle_epi8(low_nibble_mask, in.v0),
      _mm_shuffle_epi8(high_nibble_mask,
                       _mm_and_si128(_mm_srli_epi32(in.v0, 4),
                                     _mm_set1_epi8(0x7f))));
  __m128i v_1 = _mm_and_si128(
      _mm_shuffle_epi8(low_nibble_mask, in.v1),
      _mm_shuffle_epi8(high_nibble_mask,
                       _mm_and_si128(_mm_srli_epi32(in.v1, 4),
                                     _mm_set1_epi8(0x7f))));
  __m128i v_2 = _mm_and_si128(
      _mm_shuffle_epi8(low_nibble_mask, in.v2),
      _mm_shuffle_epi8(high_nibble_mask,
                       _mm_and_si128(_mm_srli_epi32(in.v2, 4),
                                     _mm_set1_epi8(0x7f))));
  __m128i v_3 = _mm_and_si128(
      _mm_shuffle_epi8(low_nibble_mask, in.v3),
      _mm_shuffle_epi8(high_nibble_mask,
                       _mm_and_si128(_mm_srli_epi32(in.v3, 4),
                                     _mm_set1_epi8(0x7f))));
  // Structural characters: class bitmap intersects 0x7.
  __m128i tmp_v0 = _mm_cmpeq_epi8(
      _mm_and_si128(v_0, structural_shufti_mask), _mm_set1_epi8(0));
  __m128i tmp_v1 = _mm_cmpeq_epi8(
      _mm_and_si128(v_1, structural_shufti_mask), _mm_set1_epi8(0));
  __m128i tmp_v2 = _mm_cmpeq_epi8(
      _mm_and_si128(v_2, structural_shufti_mask), _mm_set1_epi8(0));
  __m128i tmp_v3 = _mm_cmpeq_epi8(
      _mm_and_si128(v_3, structural_shufti_mask), _mm_set1_epi8(0));
  uint64_t structural_res_0 = _mm_movemask_epi8(tmp_v0);
  uint64_t structural_res_1 = _mm_movemask_epi8(tmp_v1);
  uint64_t structural_res_2 = _mm_movemask_epi8(tmp_v2);
  uint64_t structural_res_3 = _mm_movemask_epi8(tmp_v3);
  // The comparisons above flagged NON-members, so invert at the end.
  structurals = ~(structural_res_0 | (structural_res_1 << 16) | (structural_res_2 << 32) | (structural_res_3 << 48));
  // Whitespace characters: class bitmap intersects 0x18.
  __m128i tmp_ws_v0 = _mm_cmpeq_epi8(
      _mm_and_si128(v_0, whitespace_shufti_mask), _mm_set1_epi8(0));
  __m128i tmp_ws_v1 = _mm_cmpeq_epi8(
      _mm_and_si128(v_1, whitespace_shufti_mask), _mm_set1_epi8(0));
  __m128i tmp_ws_v2 = _mm_cmpeq_epi8(
      _mm_and_si128(v_2, whitespace_shufti_mask), _mm_set1_epi8(0));
  __m128i tmp_ws_v3 = _mm_cmpeq_epi8(
      _mm_and_si128(v_3, whitespace_shufti_mask), _mm_set1_epi8(0));
  uint64_t ws_res_0 = _mm_movemask_epi8(tmp_ws_v0);
  uint64_t ws_res_1 = _mm_movemask_epi8(tmp_ws_v1);
  uint64_t ws_res_2 = _mm_movemask_epi8(tmp_ws_v2);
  uint64_t ws_res_3 = _mm_movemask_epi8(tmp_ws_v3);
  whitespace = ~(ws_res_0 | (ws_res_1 << 16) | (ws_res_2 << 32) | (ws_res_3 << 48));
}
#endif // __SSE4_2__
#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
template<> really_inline
void find_whitespace_and_structurals<instruction_set::neon>(
simd_input<instruction_set::neon> in,
@ -569,9 +798,9 @@ void find_whitespace_and_structurals<instruction_set::neon>(
structurals = neonmovemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
whitespace = neonmovemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
#endif
#endif // FUNKY_BAD_TABLE
}
#endif
#endif // __ARM_NEON
#ifdef SIMDJSON_NAIVE_FLATTEN // useful for benchmarking
@ -657,7 +886,7 @@ really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base,
}
base = next_base;
}
#endif
#endif // SIMDJSON_NAIVE_FLATTEN
// return a updated structural bit vector with quoted contents cleared out and
// pseudo-structural characters added to the mask
@ -711,11 +940,7 @@ WARN_UNUSED
uint32_t *base_ptr = pj.structural_indexes;
uint32_t base = 0;
#ifdef SIMDJSON_UTF8VALIDATE
__m256i has_error = _mm256_setzero_si256();
struct avx_processed_utf_bytes previous {};
previous.rawbytes = _mm256_setzero_si256();
previous.high_nibbles = _mm256_setzero_si256();
previous.carried_continuations = _mm256_setzero_si256();
utf8_checking_state<T> state;
#endif
// we have padded the input out to 64 byte multiple with the remainder being
@ -751,7 +976,7 @@ WARN_UNUSED
#endif
simd_input<T> in = fill_input<T>(buf+idx);
#ifdef SIMDJSON_UTF8VALIDATE
check_utf8(in, has_error, previous);
check_utf8<T>(in, state);
#endif
// detect odd sequences of backslashes
uint64_t odd_ends = find_odd_backslash_sequences<T>(
@ -786,7 +1011,7 @@ WARN_UNUSED
memcpy(tmpbuf, buf + idx, len - idx);
simd_input<T> in = fill_input<T>(tmpbuf);
#ifdef SIMDJSON_UTF8VALIDATE
check_utf8(in, has_error, previous);
check_utf8<T>(in, state);
#endif
// detect odd sequences of backslashes
@ -843,7 +1068,7 @@ WARN_UNUSED
return simdjson::UNESCAPED_CHARS;
}
#ifdef SIMDJSON_UTF8VALIDATE
return _mm256_testz_si256(has_error, has_error) == 0 ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
return check_utf8_errors<T>(state);
#else
return simdjson::SUCCESS;
#endif

View File

@ -109,6 +109,23 @@ parse_string_helper find_bs_bits_and_quote_bits<instruction_set::avx2> (const ui
}
#endif
#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
// Locate backslash and quote characters in the next 16 bytes of the
// string, copying those bytes to dst as a side effect.
template<> really_inline
parse_string_helper find_bs_bits_and_quote_bits<instruction_set::sse4_2> (const uint8_t *src, uint8_t *dst) {
  // this can read up to 15 bytes beyond the buffer size, but we require
  // SIMDJSON_PADDING of padding
  __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
  // store to dest unconditionally - we can overwrite the bits we don't like
  // later
  _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), v);
  auto quote_mask = _mm_cmpeq_epi8(v, _mm_set1_epi8('"'));
  return {
    static_cast<uint32_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(v, _mm_set1_epi8('\\')))), // bs_bits
    static_cast<uint32_t>(_mm_movemask_epi8(quote_mask)) // quote_bits
  };
}
#endif
#ifdef __ARM_NEON
template<> really_inline
parse_string_helper find_bs_bits_and_quote_bits<instruction_set::neon> (const uint8_t *src, uint8_t *dst) {
@ -221,8 +238,13 @@ bool parse_string(UNUSED const uint8_t *buf, UNUSED size_t len,
} else {
// they are the same. Since they can't co-occur, it means we encountered
// neither.
src += 32;
dst += 32;
if constexpr(T == instruction_set::sse4_2) {
src += 16;
dst += 16;
} else {
src += 32;
dst += 32;
}
}
}
// can't be reached

View File

@ -14,10 +14,10 @@ int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool rea
#ifdef __AVX2__
json_parse_functype* avx_implementation = &json_parse_implementation<instruction_set::avx2>;
#endif
#ifdef __SSE4_2__
// json_parse_functype* sse4_2_implementation = &json_parse_implementation<instruction_set::sse4_2>; // not implemented yet
#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
json_parse_functype* sse4_2_implementation = &json_parse_implementation<instruction_set::sse4_2>;
#endif
#ifdef __ARM_NEON
#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
json_parse_functype* neon_implementation = &json_parse_implementation<instruction_set::neon>;
#endif
@ -25,9 +25,9 @@ int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool rea
// Should be done at runtime. Does not make any sense on preprocessor.
#ifdef __AVX2__
instruction_set best_implementation = instruction_set::avx2;
#elif defined (__SSE4_2__)
#elif defined (__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
instruction_set best_implementation = instruction_set::sse4_2;
#elif defined (__ARM_NEON)
#elif defined (__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
instruction_set best_implementation = instruction_set::neon;
#else
instruction_set best_implementation = instruction_set::none;
@ -39,11 +39,13 @@ int json_parse_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj, bool rea
case instruction_set::avx2 :
json_parse_ptr = avx_implementation;
break;
#elif defined (__SSE4_2__)
/*case instruction_set::sse4_2 :
#endif
#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
case instruction_set::sse4_2 :
json_parse_ptr = sse4_2_implementation;
break;*/
#elif defined (__ARM_NEON)
break;
#endif
#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
case instruction_set::neon :
json_parse_ptr = neon_implementation;
break;

View File

@ -7,7 +7,8 @@ endif()
add_cpp_test(basictests)
add_cpp_test(jsoncheck)
add_executable(singleheader ./singleheadertest.cpp ${PROJECT_SOURCE_DIR}/singleheader/simdjson.cpp)
target_compile_definitions(singleheader PRIVATE JSON_TEST_PATH="${PROJECT_SOURCE_DIR}/jsonexamples/twitter.json")
target_link_libraries(singleheader ${SIMDJSON_LIB_NAME})
add_test(singleheader singleheader)
## This causes problems
# add_executable(singleheader ./singleheadertest.cpp ${PROJECT_SOURCE_DIR}/singleheader/simdjson.cpp)
# target_compile_definitions(singleheader PRIVATE JSON_TEST_PATH="${PROJECT_SOURCE_DIR}/jsonexamples/twitter.json")
# target_link_libraries(singleheader ${SIMDJSON_LIB_NAME})
# add_test(singleheader singleheader)

View File

@ -13,15 +13,21 @@ if(SIMDJSON_SANITIZE)
endif()
# some compilers like clang do not automagically define __AVX2__ and __BMI2__ even when the hardware supports it
if(NOT MSVC)
if(SIMDJSON_DISABLE_AVX)
if(NOT MSVC)
set (OPT_FLAGS "${OPT_FLAGS} -mno-avx -mno-bmi -mno-pclmul -msse4.2")
else()
set (OPT_FLAGS "${OPT_FLAGS}")
endif()
else()
# some compilers like clang do not automagically define __AVX2__ and __BMI2__ even when the hardware supports it
if(NOT MSVC)
set (OPT_FLAGS "${OPT_FLAGS} -mavx2 -mbmi -mbmi2 -mpclmul")
else()
set (OPT_FLAGS "${OPT_FLAGS} /arch:AVX2 /std:c++latest")
else()
set (OPT_FLAGS "${OPT_FLAGS} /arch:AVX2")
endif()
endif()
if(NOT MSVC)
set(CXXSTD_FLAGS "-std=c++17 -fPIC")
endif()