Fixing clang under visual studio (#1028)

* Lots of fixes * Removing some lambdas * Removing some functional programming. Co-authored-by: Daniel Lemire <lemire@gmai.com>
2020-07-06 18:58:19 -04:00 · 2020-07-06 18:58:19 -04:00 · d0ce2f0b5a
parent a19f635a6a
commit d0ce2f0b5a
13 changed files with 1165 additions and 1500 deletions
--- a/.github/workflows/vs16-clang-ci.yml
+++ b/.github/workflows/vs16-clang-ci.yml
@ -0,0 +1,25 @@
 name: VS16-CLANG-CI
 on: push
 jobs:
  ci:
    name: windows-vs16
    runs-on: windows-latest
    steps:
    - uses: actions/checkout@v2
    - name: 'Run CMake with VS16'
      uses: lukka/run-cmake@v2
      with:
        cmakeListsOrSettingsJson: CMakeListsTxtAdvanced
        cmakeListsTxtPath: '${{ github.workspace }}/CMakeLists.txt'
        buildDirectory: "${{ github.workspace }}/../../_temp/windows"
        cmakeBuildType: Release   
        buildWithCMake: true
        cmakeGenerator: VS16Win64 
        cmakeAppendedArgs:  -T ClangCL -DSIMDJSON_COMPETITION=OFF  -DSIMDJSON_BUILD_STATIC=ON
        buildWithCMakeArgs: --config Release  
    - name: 'Run CTest'
      run: ctest -C Release -E checkperf  --output-on-failure 
      working-directory: "${{ github.workspace }}/../../_temp/windows"
--- a/benchmark/benchmark.h
+++ b/benchmark/benchmark.h
@ -30,7 +30,6 @@
      event_count allocate_count = collector.end();                            \
      aggregate << allocate_count;                                             \
    }                                                                          \
    uint64_t S = size;                                                         \
    if (collector.has_events()) {                                              \
      printf("%7.3f", aggregate.best.cycles() / static_cast<double>(size));    \
      if (verbose) {                                                           \
@ -76,7 +75,6 @@
      event_count allocate_count = collector.end();                            \
      aggregate << allocate_count;                                             \
    }                                                                          \
    uint64_t S = size;                                                         \
    if (collector.has_events()) {                                              \
      printf("%7.3f", aggregate.best.cycles() / static_cast<double>(size));    \
      if (verbose) {                                                           \
--- a/singleheader/amalgamate_demo.cpp
+++ b/singleheader/amalgamate_demo.cpp
@ -1,4 +1,4 @@
-/* auto-generated on Wed Jul  1 14:00:57 EDT 2020. Do not edit! */
+/* auto-generated on Mon Jul  6 18:16:52 EDT 2020. Do not edit! */
 #include <iostream>
 #include "simdjson.h"
--- a/singleheader/simdjson.cpp
+++ b/singleheader/simdjson.cpp
--- a/singleheader/simdjson.h
+++ b/singleheader/simdjson.h
@ -1,4 +1,4 @@
-/* auto-generated on Wed Jul  1 14:00:57 EDT 2020. Do not edit! */
+/* auto-generated on Mon Jul  6 18:16:52 EDT 2020. Do not edit! */
 /* begin file include/simdjson.h */
 #ifndef SIMDJSON_H
 #define SIMDJSON_H
--- a/src/arm64/dom_parser_implementation.cpp
+++ b/src/arm64/dom_parser_implementation.cpp
@ -26,13 +26,24 @@ struct json_character_block {
 };
 really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t> in) {
-  auto v = in.map<uint8_t>([&](simd8<uint8_t> chunk) {
+  // Functional programming causes trouble with Visual Studio.
-    auto nib_lo = chunk & 0xf;
+  // Keeping this version in comments since it is much nicer:
-    auto nib_hi = chunk.shr<4>();
+  // auto v = in.map<uint8_t>([&](simd8<uint8_t> chunk) {
-    auto shuf_lo = nib_lo.lookup_16<uint8_t>(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
+  //  auto nib_lo = chunk & 0xf;
-    auto shuf_hi = nib_hi.lookup_16<uint8_t>(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);
+  //  auto nib_hi = chunk.shr<4>();
-    return shuf_lo & shuf_hi;
+  //  auto shuf_lo = nib_lo.lookup_16<uint8_t>(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
-  });
+  //  auto shuf_hi = nib_hi.lookup_16<uint8_t>(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);
  //  return shuf_lo & shuf_hi;
  // });
  const simd8<uint8_t> table1(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
  const simd8<uint8_t> table2(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);
  auto v = simd8x64<uint8_t>(
     (in.chunks[0] & 0xf).lookup_16(table1) & (in.chunks[0].shr<4>()).lookup_16(table2),
     (in.chunks[1] & 0xf).lookup_16(table1) & (in.chunks[1].shr<4>()).lookup_16(table2),
     (in.chunks[2] & 0xf).lookup_16(table1) & (in.chunks[2].shr<4>()).lookup_16(table2),
     (in.chunks[3] & 0xf).lookup_16(table1) & (in.chunks[3].shr<4>()).lookup_16(table2)
  );
  // We compute whitespace and op separately. If the code later only use one or the
@ -51,13 +62,25 @@ really_inline json_character_block json_character_block::classify(const simd::si
  // there is a small untaken optimization opportunity here. We deliberately
  // do not pick it up.
-  uint64_t op = v.map([&](simd8<uint8_t> _v) { return _v.any_bits_set(0x7); }).to_bitmask();
+  uint64_t op = simd8x64<bool>(
-  uint64_t whitespace = v.map([&](simd8<uint8_t> _v) { return _v.any_bits_set(0x18); }).to_bitmask();
+        v.chunks[0].any_bits_set(0x7),
        v.chunks[1].any_bits_set(0x7),
        v.chunks[2].any_bits_set(0x7),
        v.chunks[3].any_bits_set(0x7)
  ).to_bitmask();
  uint64_t whitespace = simd8x64<bool>(
        v.chunks[0].any_bits_set(0x18),
        v.chunks[1].any_bits_set(0x18),
        v.chunks[2].any_bits_set(0x18),
        v.chunks[3].any_bits_set(0x18)
  ).to_bitmask();
  return { whitespace, op };
 }
 really_inline bool is_ascii(simd8x64<uint8_t> input) {
-    simd8<uint8_t> bits = input.reduce([&](simd8<uint8_t> a,simd8<uint8_t> b) { return a|b; });
+    simd8<uint8_t> bits = (input.chunks[0] | input.chunks[1]) | (input.chunks[2] | input.chunks[3]);
    return bits.max() < 0b10000000u;
 }
--- a/src/arm64/simd.h
+++ b/src/arm64/simd.h
@ -442,43 +442,6 @@ really_inline int8x16_t make_int8x16_t(int8_t x1,  int8_t x2,  int8_t x3,  int8_
      each(3);
    }
    template <typename F>
    really_inline void each(F const& each_chunk) const
    {
      each_chunk(this->chunks[0]);
      each_chunk(this->chunks[1]);
      each_chunk(this->chunks[2]);
      each_chunk(this->chunks[3]);
    }
    template <typename R=bool, typename F>
    really_inline simd8x64<R> map(F const& map_chunk) const {
      return simd8x64<R>(
        map_chunk(this->chunks[0]),
        map_chunk(this->chunks[1]),
        map_chunk(this->chunks[2]),
        map_chunk(this->chunks[3])
      );
    }
    template <typename R=bool, typename F>
    really_inline simd8x64<R> map(const simd8x64<T> b, F const& map_chunk) const {
      return simd8x64<R>(
        map_chunk(this->chunks[0], b.chunks[0]),
        map_chunk(this->chunks[1], b.chunks[1]),
        map_chunk(this->chunks[2], b.chunks[2]),
        map_chunk(this->chunks[3], b.chunks[3])
      );
    }
    template <typename F>
    really_inline simd8<T> reduce(F const& reduce_pair) const {
      return reduce_pair(
        reduce_pair(this->chunks[0], this->chunks[1]),
        reduce_pair(this->chunks[2], this->chunks[3])
      );
    }
    really_inline uint64_t to_bitmask() const {
 #ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
      const uint8x16_t bit_mask = make_uint8x16_t(
@ -501,17 +464,32 @@ really_inline int8x16_t make_int8x16_t(int8_t x1,  int8_t x2,  int8_t x3,  int8_
    really_inline simd8x64<T> bit_or(const T m) const {
      const simd8<T> mask = simd8<T>::splat(m);
-      return this->map( [&](simd8<T> a) { return a | mask; } );
+      return simd8x64<T>(
        this->chunks[0] | mask,
        this->chunks[1] | mask,
        this->chunks[2] | mask,
        this->chunks[3] | mask
      );
    }
    really_inline uint64_t eq(const T m) const {
      const simd8<T> mask = simd8<T>::splat(m);
-      return this->map( [&](simd8<T> a) { return a == mask; } ).to_bitmask();
+      return  simd8x64<bool>(
        this->chunks[0] == mask,
        this->chunks[1] == mask,
        this->chunks[2] == mask,
        this->chunks[3] == mask
      ).to_bitmask();
    }
    really_inline uint64_t lteq(const T m) const {
      const simd8<T> mask = simd8<T>::splat(m);
-      return this->map( [&](simd8<T> a) { return a <= mask; } ).to_bitmask();
+      return  simd8x64<bool>(
        this->chunks[0] <= mask,
        this->chunks[1] <= mask,
        this->chunks[2] <= mask,
        this->chunks[3] <= mask
      ).to_bitmask();
    }
  }; // struct simd8x64<T>
--- a/src/generic/stage2/structural_iterator.h
+++ b/src/generic/stage2/structural_iterator.h
@ -31,31 +31,7 @@ public:
  really_inline size_t remaining_len() {
    return parser.len - *current_structural;
  }
-  template<typename F>
+
  really_inline bool with_space_terminated_copy(const F& f) {
    /**
    * We need to make a copy to make sure that the string is space terminated.
    * This is not about padding the input, which should already padded up
    * to len + SIMDJSON_PADDING. However, we have no control at this stage
    * on how the padding was done. What if the input string was padded with nulls?
    * It is quite common for an input string to have an extra null character (C string).
    * We do not want to allow 9\0 (where \0 is the null character) inside a JSON
    * document, but the string "9\0" by itself is fine. So we make a copy and
    * pad the input with spaces when we know that there is just one input element.
    * This copy is relatively expensive, but it will almost never be called in
    * practice unless you are in the strange scenario where you have many JSON
    * documents made of single atoms.
    */
    char *copy = static_cast<char *>(malloc(parser.len + SIMDJSON_PADDING));
    if (copy == nullptr) {
      return true;
    }
    memcpy(copy, buf, parser.len);
    memset(copy + parser.len, ' ', SIMDJSON_PADDING);
    bool result = f(reinterpret_cast<const uint8_t*>(copy), *current_structural);
    free(copy);
    return result;
  }
  really_inline bool past_end(uint32_t n_structural_indexes) {
    return current_structural >= &parser.structural_indexes[n_structural_indexes];
  }
--- a/src/generic/stage2/structural_parser.h
+++ b/src/generic/stage2/structural_parser.h
@ -169,6 +169,31 @@ struct structural_parser : structural_iterator {
    return parse_number(current(), found_minus);
  }
  really_inline bool parse_number_with_space_terminated_copy(const bool is_negative) {
    /**
    * We need to make a copy to make sure that the string is space terminated.
    * This is not about padding the input, which should already padded up
    * to len + SIMDJSON_PADDING. However, we have no control at this stage
    * on how the padding was done. What if the input string was padded with nulls?
    * It is quite common for an input string to have an extra null character (C string).
    * We do not want to allow 9\0 (where \0 is the null character) inside a JSON
    * document, but the string "9\0" by itself is fine. So we make a copy and
    * pad the input with spaces when we know that there is just one input element.
    * This copy is relatively expensive, but it will almost never be called in
    * practice unless you are in the strange scenario where you have many JSON
    * documents made of single atoms.
    */
    uint8_t *copy = static_cast<uint8_t *>(malloc(parser.len + SIMDJSON_PADDING));
    if (copy == nullptr) {
      return true;
    }
    memcpy(copy, buf, parser.len);
    memset(copy + parser.len, ' ', SIMDJSON_PADDING);
    size_t idx = *current_structural;
    bool result = parse_number(&copy[idx], is_negative); // parse_number does not throw
    free(copy);
    return result;
  }
  WARN_UNUSED really_inline ret_address_t parse_value(const unified_machine_addresses &addresses, ret_address_t continue_state) {
    switch (advance_char()) {
    case '"':
@ -306,6 +331,7 @@ struct structural_parser : structural_iterator {
 #undef FAIL_IF
 #define FAIL_IF(EXPR) { if (EXPR) { goto error; } }
 template<bool STREAMING>
 WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept {
  dom_parser.doc = &doc;
@ -351,18 +377,16 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p
    goto finish;
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
-    FAIL_IF(
+    // Next line used to be an interesting functional programming exercise with
-      parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+    // a lambda that gets passed to another function via a closure. This would confuse the
-        return parser.parse_number(&copy[idx], false);
+    // clangcl compiler under Visual Studio 2019 (recent release).
-      })
+    { if(parser.parse_number_with_space_terminated_copy(false)) { goto error; }}
    );
    goto finish;
  case '-':
-    FAIL_IF(
+    // Next line used to be an interesting functional programming exercise with
-      parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+    // a lambda that gets passed to another function via a closure. This would confuse the
-        return parser.parse_number(&copy[idx], true);
+    // clangcl compiler under Visual Studio 2019 (recent release).
-      })
+    { if(parser.parse_number_with_space_terminated_copy(true)) { goto error; }}
    );
    goto finish;
  default:
    parser.log_error("Document starts with a non-value character");
--- a/src/haswell/dom_parser_implementation.cpp
+++ b/src/haswell/dom_parser_implementation.cpp
@ -37,19 +37,20 @@ really_inline json_character_block json_character_block::classify(const simd::si
  // hope that useless computations will be omitted. This is namely case when
  // minifying (we only need whitespace).
-  uint64_t whitespace = in.map([&](simd8<uint8_t> _in) {
+  uint64_t whitespace = simd8x64<bool>(
-    return _in == simd8<uint8_t>(_mm256_shuffle_epi8(whitespace_table, _in));
+        in.chunks[0] == simd8<uint8_t>(_mm256_shuffle_epi8(whitespace_table, in.chunks[0])),
-  }).to_bitmask();
+        in.chunks[1] == simd8<uint8_t>(_mm256_shuffle_epi8(whitespace_table, in.chunks[1]))
-
+  ).to_bitmask();
-  uint64_t op = in.map([&](simd8<uint8_t> _in) {
+  
-    // | 32 handles the fact that { } and [ ] are exactly 32 bytes apart
+  uint64_t op = simd8x64<bool>(
-    return (_in | 32) == simd8<uint8_t>(_mm256_shuffle_epi8(op_table, _in-','));
+        (in.chunks[0] | 32) == simd8<uint8_t>(_mm256_shuffle_epi8(op_table, in.chunks[0]-',')),
-  }).to_bitmask();
+        (in.chunks[1] | 32) == simd8<uint8_t>(_mm256_shuffle_epi8(op_table, in.chunks[1]-','))
  ).to_bitmask();
  return { whitespace, op };
 }
 really_inline bool is_ascii(simd8x64<uint8_t> input) {
-  simd8<uint8_t> bits = input.reduce([&](simd8<uint8_t> a,simd8<uint8_t> b) { return a|b; });
+  simd8<uint8_t> bits = (input.chunks[0] | input.chunks[1]);
  return !bits.any_bits_set_anywhere(0b10000000u);
 }
--- a/src/haswell/simd.h
+++ b/src/haswell/simd.h
@ -316,36 +316,6 @@ namespace simd {
      this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
    }
    template <typename F>
    really_inline void each(F const& each_chunk) const
    {
      each_chunk(this->chunks[0]);
      each_chunk(this->chunks[1]);
    }
    template <typename R=bool, typename F>
    really_inline simd8x64<R> map(F const& map_chunk) const {
      return simd8x64<R>(
        map_chunk(this->chunks[0]),
        map_chunk(this->chunks[1])
      );
    }
    template <typename R=bool, typename F>
    really_inline simd8x64<R> map(const simd8x64<uint8_t> b, F const& map_chunk) const {
      return simd8x64<R>(
        map_chunk(this->chunks[0], b.chunks[0]),
        map_chunk(this->chunks[1], b.chunks[1])
      );
    }
    template <typename F>
    really_inline simd8<T> reduce(F const& reduce_pair) const {
      return reduce_pair(this->chunks[0], this->chunks[1]);
    }
    really_inline uint64_t to_bitmask() const {
      uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
      uint64_t r_hi =                       this->chunks[1].to_bitmask();
@ -354,17 +324,26 @@ namespace simd {
    really_inline simd8x64<T> bit_or(const T m) const {
      const simd8<T> mask = simd8<T>::splat(m);
-      return this->map( [&](simd8<T> a) { return a | mask; } );
+      return simd8x64<T>(
        this->chunks[0] | mask,
        this->chunks[1] | mask
      );
    }
    really_inline uint64_t eq(const T m) const {
      const simd8<T> mask = simd8<T>::splat(m);
-      return this->map( [&](simd8<T> a) { return a == mask; } ).to_bitmask();
+      return  simd8x64<bool>(
        this->chunks[0] == mask,
        this->chunks[1] == mask
      ).to_bitmask();
    }
    really_inline uint64_t lteq(const T m) const {
      const simd8<T> mask = simd8<T>::splat(m);
-      return this->map( [&](simd8<T> a) { return a <= mask; } ).to_bitmask();
+      return  simd8x64<bool>(
        this->chunks[0] <= mask,
        this->chunks[1] <= mask
      ).to_bitmask();
    }
  }; // struct simd8x64<T>
--- a/src/westmere/dom_parser_implementation.cpp
+++ b/src/westmere/dom_parser_implementation.cpp
@ -38,19 +38,25 @@ really_inline json_character_block json_character_block::classify(const simd::si
  // hope that useless computations will be omitted. This is namely case when
  // minifying (we only need whitespace).
-  uint64_t whitespace = in.map([&](simd8<uint8_t> _in) {
+  uint64_t whitespace = simd8x64<bool>(
-    return _in == simd8<uint8_t>(_mm_shuffle_epi8(whitespace_table, _in));
+        in.chunks[0] == simd8<uint8_t>(_mm_shuffle_epi8(whitespace_table, in.chunks[0])),
-  }).to_bitmask();
+        in.chunks[1] == simd8<uint8_t>(_mm_shuffle_epi8(whitespace_table, in.chunks[1])),
        in.chunks[2] == simd8<uint8_t>(_mm_shuffle_epi8(whitespace_table, in.chunks[2])),
        in.chunks[3] == simd8<uint8_t>(_mm_shuffle_epi8(whitespace_table, in.chunks[3]))
  ).to_bitmask();
-  uint64_t op = in.map([&](simd8<uint8_t> _in) {
+  // | 32 handles the fact that { } and [ ] are exactly 32 bytes apart
-    // | 32 handles the fact that { } and [ ] are exactly 32 bytes apart
+  uint64_t op = simd8x64<bool>(
-    return (_in | 32) == simd8<uint8_t>(_mm_shuffle_epi8(op_table, _in-','));
+        (in.chunks[0] | 32) == simd8<uint8_t>(_mm_shuffle_epi8(op_table, in.chunks[0]-',')),
-  }).to_bitmask();
+        (in.chunks[1] | 32) == simd8<uint8_t>(_mm_shuffle_epi8(op_table, in.chunks[1]-',')),
        (in.chunks[2] | 32) == simd8<uint8_t>(_mm_shuffle_epi8(op_table, in.chunks[2]-',')),
        (in.chunks[3] | 32) == simd8<uint8_t>(_mm_shuffle_epi8(op_table, in.chunks[3]-','))
  ).to_bitmask();
  return { whitespace, op };
 }
 really_inline bool is_ascii(simd8x64<uint8_t> input) {
-  simd8<uint8_t> bits = input.reduce([&](simd8<uint8_t> a,simd8<uint8_t> b) { return a|b; });
+  simd8<uint8_t> bits = (input.chunks[0] | input.chunks[1]) | (input.chunks[2] | input.chunks[3]);
  return !bits.any_bits_set_anywhere(0b10000000u);
 }
--- a/src/westmere/simd.h
+++ b/src/westmere/simd.h
@ -292,43 +292,6 @@ namespace simd {
      each(3);
    }
    template <typename F>
    really_inline void each(F const& each_chunk) const
    {
      each_chunk(this->chunks[0]);
      each_chunk(this->chunks[1]);
      each_chunk(this->chunks[2]);
      each_chunk(this->chunks[3]);
    }
    template <typename F, typename R=bool>
    really_inline simd8x64<R> map(F const& map_chunk) const {
      return simd8x64<R>(
        map_chunk(this->chunks[0]),
        map_chunk(this->chunks[1]),
        map_chunk(this->chunks[2]),
        map_chunk(this->chunks[3])
      );
    }
    template <typename F, typename R=bool>
    really_inline simd8x64<R> map(const simd8x64<uint8_t> b, F const& map_chunk) const {
      return simd8x64<R>(
        map_chunk(this->chunks[0], b.chunks[0]),
        map_chunk(this->chunks[1], b.chunks[1]),
        map_chunk(this->chunks[2], b.chunks[2]),
        map_chunk(this->chunks[3], b.chunks[3])
      );
    }
    template <typename F>
    really_inline simd8<T> reduce(F const& reduce_pair) const {
      return reduce_pair(
        reduce_pair(this->chunks[0], this->chunks[1]),
        reduce_pair(this->chunks[2], this->chunks[3])
      );
    }
    really_inline uint64_t to_bitmask() const {
      uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
      uint64_t r1 =                       this->chunks[1].to_bitmask();
@ -339,17 +302,32 @@ namespace simd {
    really_inline simd8x64<T> bit_or(const T m) const {
      const simd8<T> mask = simd8<T>::splat(m);
-      return this->map( [&](simd8<T> a) { return a | mask; } );
+      return simd8x64<T>(
        this->chunks[0] | mask,
        this->chunks[1] | mask,
        this->chunks[2] | mask,
        this->chunks[3] | mask
      );
    }
    really_inline uint64_t eq(const T m) const {
      const simd8<T> mask = simd8<T>::splat(m);
-      return this->map( [&](simd8<T> a) { return a == mask; } ).to_bitmask();
+      return  simd8x64<bool>(
        this->chunks[0] == mask,
        this->chunks[1] == mask,
        this->chunks[2] == mask,
        this->chunks[3] == mask
      ).to_bitmask();
    }
    really_inline uint64_t lteq(const T m) const {
      const simd8<T> mask = simd8<T>::splat(m);
-      return this->map( [&](simd8<T> a) { return a <= mask; } ).to_bitmask();
+      return  simd8x64<bool>(
        this->chunks[0] <= mask,
        this->chunks[1] <= mask,
        this->chunks[2] <= mask,
        this->chunks[3] <= mask
      ).to_bitmask();
    }
  }; // struct simd8x64<T>