Fixing clang under visual studio (#1028)

* Lots of fixes

* Removing some lambdas

* Removing some functional programming.

Co-authored-by: Daniel Lemire <lemire@gmai.com>
This commit is contained in:
Daniel Lemire 2020-07-06 18:58:19 -04:00 committed by GitHub
parent a19f635a6a
commit d0ce2f0b5a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 1165 additions and 1500 deletions

25
.github/workflows/vs16-clang-ci.yml vendored Normal file
View File

@ -0,0 +1,25 @@
# CI workflow: configure and build with the Visual Studio 16 generator using
# the ClangCL toolset, then run the test suite with CTest.
name: VS16-CLANG-CI
# Runs on every push.
on: push
jobs:
ci:
name: windows-vs16
runs-on: windows-latest
steps:
- uses: actions/checkout@v2
- name: 'Run CMake with VS16'
uses: lukka/run-cmake@v2
with:
cmakeListsOrSettingsJson: CMakeListsTxtAdvanced
cmakeListsTxtPath: '${{ github.workspace }}/CMakeLists.txt'
# Build tree lives outside the checkout; the CTest step below uses the same path.
buildDirectory: "${{ github.workspace }}/../../_temp/windows"
cmakeBuildType: Release
buildWithCMake: true
cmakeGenerator: VS16Win64
# -T ClangCL asks the generator for the clang-cl toolset; the -D options are
# cache entries handed to the project's CMakeLists.txt.
cmakeAppendedArgs: -T ClangCL -DSIMDJSON_COMPETITION=OFF -DSIMDJSON_BUILD_STATIC=ON
buildWithCMakeArgs: --config Release
- name: 'Run CTest'
# -C Release selects the configuration to test; -E checkperf excludes tests
# whose name matches "checkperf"; failing tests print their output.
run: ctest -C Release -E checkperf --output-on-failure
working-directory: "${{ github.workspace }}/../../_temp/windows"

View File

@ -30,7 +30,6 @@
event_count allocate_count = collector.end(); \ event_count allocate_count = collector.end(); \
aggregate << allocate_count; \ aggregate << allocate_count; \
} \ } \
uint64_t S = size; \
if (collector.has_events()) { \ if (collector.has_events()) { \
printf("%7.3f", aggregate.best.cycles() / static_cast<double>(size)); \ printf("%7.3f", aggregate.best.cycles() / static_cast<double>(size)); \
if (verbose) { \ if (verbose) { \
@ -76,7 +75,6 @@
event_count allocate_count = collector.end(); \ event_count allocate_count = collector.end(); \
aggregate << allocate_count; \ aggregate << allocate_count; \
} \ } \
uint64_t S = size; \
if (collector.has_events()) { \ if (collector.has_events()) { \
printf("%7.3f", aggregate.best.cycles() / static_cast<double>(size)); \ printf("%7.3f", aggregate.best.cycles() / static_cast<double>(size)); \
if (verbose) { \ if (verbose) { \

View File

@ -1,4 +1,4 @@
/* auto-generated on Wed Jul 1 14:00:57 EDT 2020. Do not edit! */ /* auto-generated on Mon Jul 6 18:16:52 EDT 2020. Do not edit! */
#include <iostream> #include <iostream>
#include "simdjson.h" #include "simdjson.h"

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,4 @@
/* auto-generated on Wed Jul 1 14:00:57 EDT 2020. Do not edit! */ /* auto-generated on Mon Jul 6 18:16:52 EDT 2020. Do not edit! */
/* begin file include/simdjson.h */ /* begin file include/simdjson.h */
#ifndef SIMDJSON_H #ifndef SIMDJSON_H
#define SIMDJSON_H #define SIMDJSON_H

View File

@ -26,13 +26,24 @@ struct json_character_block {
}; };
really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t> in) { really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t> in) {
auto v = in.map<uint8_t>([&](simd8<uint8_t> chunk) { // Functional programming causes trouble with Visual Studio.
auto nib_lo = chunk & 0xf; // Keeping this version in comments since it is much nicer:
auto nib_hi = chunk.shr<4>(); // auto v = in.map<uint8_t>([&](simd8<uint8_t> chunk) {
auto shuf_lo = nib_lo.lookup_16<uint8_t>(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0); // auto nib_lo = chunk & 0xf;
auto shuf_hi = nib_hi.lookup_16<uint8_t>(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0); // auto nib_hi = chunk.shr<4>();
return shuf_lo & shuf_hi; // auto shuf_lo = nib_lo.lookup_16<uint8_t>(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
}); // auto shuf_hi = nib_hi.lookup_16<uint8_t>(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);
// return shuf_lo & shuf_hi;
// });
const simd8<uint8_t> table1(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
const simd8<uint8_t> table2(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);
auto v = simd8x64<uint8_t>(
(in.chunks[0] & 0xf).lookup_16(table1) & (in.chunks[0].shr<4>()).lookup_16(table2),
(in.chunks[1] & 0xf).lookup_16(table1) & (in.chunks[1].shr<4>()).lookup_16(table2),
(in.chunks[2] & 0xf).lookup_16(table1) & (in.chunks[2].shr<4>()).lookup_16(table2),
(in.chunks[3] & 0xf).lookup_16(table1) & (in.chunks[3].shr<4>()).lookup_16(table2)
);
// We compute whitespace and op separately. If the code later only use one or the // We compute whitespace and op separately. If the code later only use one or the
@ -51,13 +62,25 @@ really_inline json_character_block json_character_block::classify(const simd::si
// there is a small untaken optimization opportunity here. We deliberately // there is a small untaken optimization opportunity here. We deliberately
// do not pick it up. // do not pick it up.
uint64_t op = v.map([&](simd8<uint8_t> _v) { return _v.any_bits_set(0x7); }).to_bitmask(); uint64_t op = simd8x64<bool>(
uint64_t whitespace = v.map([&](simd8<uint8_t> _v) { return _v.any_bits_set(0x18); }).to_bitmask(); v.chunks[0].any_bits_set(0x7),
v.chunks[1].any_bits_set(0x7),
v.chunks[2].any_bits_set(0x7),
v.chunks[3].any_bits_set(0x7)
).to_bitmask();
uint64_t whitespace = simd8x64<bool>(
v.chunks[0].any_bits_set(0x18),
v.chunks[1].any_bits_set(0x18),
v.chunks[2].any_bits_set(0x18),
v.chunks[3].any_bits_set(0x18)
).to_bitmask();
return { whitespace, op }; return { whitespace, op };
} }
really_inline bool is_ascii(simd8x64<uint8_t> input) { really_inline bool is_ascii(simd8x64<uint8_t> input) {
simd8<uint8_t> bits = input.reduce([&](simd8<uint8_t> a,simd8<uint8_t> b) { return a|b; }); simd8<uint8_t> bits = (input.chunks[0] | input.chunks[1]) | (input.chunks[2] | input.chunks[3]);
return bits.max() < 0b10000000u; return bits.max() < 0b10000000u;
} }

View File

@ -442,43 +442,6 @@ really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_
each(3); each(3);
} }
// Invokes each_chunk once per chunk, in index order 0..3.
template <typename F>
really_inline void each(F const& each_chunk) const
{
  for (int idx = 0; idx < 4; ++idx) {
    each_chunk(this->chunks[idx]);
  }
}
// Applies map_chunk to each of the four chunks and packs the four results
// into a simd8x64<R>.
template <typename R=bool, typename F>
really_inline simd8x64<R> map(F const& map_chunk) const {
  const auto mapped0 = map_chunk(this->chunks[0]);
  const auto mapped1 = map_chunk(this->chunks[1]);
  const auto mapped2 = map_chunk(this->chunks[2]);
  const auto mapped3 = map_chunk(this->chunks[3]);
  return simd8x64<R>(mapped0, mapped1, mapped2, mapped3);
}
// Pairwise variant: applies map_chunk to chunk i of this object and chunk i
// of b, for i = 0..3, and packs the four results into a simd8x64<R>.
template <typename R=bool, typename F>
really_inline simd8x64<R> map(const simd8x64<T> b, F const& map_chunk) const {
  const auto mapped0 = map_chunk(this->chunks[0], b.chunks[0]);
  const auto mapped1 = map_chunk(this->chunks[1], b.chunks[1]);
  const auto mapped2 = map_chunk(this->chunks[2], b.chunks[2]);
  const auto mapped3 = map_chunk(this->chunks[3], b.chunks[3]);
  return simd8x64<R>(mapped0, mapped1, mapped2, mapped3);
}
// Folds the four chunks into one value as a balanced tree:
// reduce_pair(reduce_pair(c0, c1), reduce_pair(c2, c3)).
template <typename F>
really_inline simd8<T> reduce(F const& reduce_pair) const {
  const auto lower_half = reduce_pair(this->chunks[0], this->chunks[1]);
  const auto upper_half = reduce_pair(this->chunks[2], this->chunks[3]);
  return reduce_pair(lower_half, upper_half);
}
really_inline uint64_t to_bitmask() const { really_inline uint64_t to_bitmask() const {
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO #ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
const uint8x16_t bit_mask = make_uint8x16_t( const uint8x16_t bit_mask = make_uint8x16_t(
@ -501,17 +464,32 @@ really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_
really_inline simd8x64<T> bit_or(const T m) const { really_inline simd8x64<T> bit_or(const T m) const {
const simd8<T> mask = simd8<T>::splat(m); const simd8<T> mask = simd8<T>::splat(m);
return this->map( [&](simd8<T> a) { return a | mask; } ); return simd8x64<T>(
this->chunks[0] | mask,
this->chunks[1] | mask,
this->chunks[2] | mask,
this->chunks[3] | mask
);
} }
really_inline uint64_t eq(const T m) const { really_inline uint64_t eq(const T m) const {
const simd8<T> mask = simd8<T>::splat(m); const simd8<T> mask = simd8<T>::splat(m);
return this->map( [&](simd8<T> a) { return a == mask; } ).to_bitmask(); return simd8x64<bool>(
this->chunks[0] == mask,
this->chunks[1] == mask,
this->chunks[2] == mask,
this->chunks[3] == mask
).to_bitmask();
} }
really_inline uint64_t lteq(const T m) const { really_inline uint64_t lteq(const T m) const {
const simd8<T> mask = simd8<T>::splat(m); const simd8<T> mask = simd8<T>::splat(m);
return this->map( [&](simd8<T> a) { return a <= mask; } ).to_bitmask(); return simd8x64<bool>(
this->chunks[0] <= mask,
this->chunks[1] <= mask,
this->chunks[2] <= mask,
this->chunks[3] <= mask
).to_bitmask();
} }
}; // struct simd8x64<T> }; // struct simd8x64<T>

View File

@ -31,31 +31,7 @@ public:
really_inline size_t remaining_len() { really_inline size_t remaining_len() {
return parser.len - *current_structural; return parser.len - *current_structural;
} }
/**
 * Calls f(copy, index) on a temporary copy of the input that is followed by
 * SIMDJSON_PADDING space characters, and returns what f returns. Returns true
 * without calling f if the temporary buffer cannot be allocated.
 *
 * Why a copy: the input should already be padded up to len + SIMDJSON_PADDING,
 * but at this stage we have no control over what the padding contains. It may
 * well be nulls; an extra null character after the input is common (C string).
 * We do not want to accept 9\0 (where \0 is the null character) inside a JSON
 * document, yet the string "9\0" by itself is fine. Padding a copy with
 * spaces settles the question when there is just one input element.
 *
 * The copy is relatively expensive, but it is almost never made in practice
 * unless you have many JSON documents made of single atoms.
 */
template<typename F>
really_inline bool with_space_terminated_copy(const F& f) {
  const size_t padded_len = parser.len + SIMDJSON_PADDING;
  void *storage = malloc(padded_len);
  if (!storage) {
    return true;
  }
  char *padded = static_cast<char *>(storage);
  memcpy(padded, buf, parser.len);
  // Overwrite the whole padding region with spaces.
  memset(padded + parser.len, ' ', SIMDJSON_PADDING);
  const bool outcome = f(reinterpret_cast<const uint8_t*>(padded), *current_structural);
  free(padded);
  return outcome;
}
really_inline bool past_end(uint32_t n_structural_indexes) { really_inline bool past_end(uint32_t n_structural_indexes) {
return current_structural >= &parser.structural_indexes[n_structural_indexes]; return current_structural >= &parser.structural_indexes[n_structural_indexes];
} }

View File

@ -169,6 +169,31 @@ struct structural_parser : structural_iterator {
return parse_number(current(), found_minus); return parse_number(current(), found_minus);
} }
/**
 * Runs parse_number on a temporary copy of the input that is followed by
 * SIMDJSON_PADDING space characters, starting at the current structural
 * index, and returns parse_number's result. Returns true without parsing if
 * the temporary buffer cannot be allocated.
 *
 * Why a copy: the input should already be padded up to len + SIMDJSON_PADDING,
 * but at this stage we have no control over what the padding contains. It may
 * well be nulls; an extra null character after the input is common (C string).
 * We do not want to accept 9\0 (where \0 is the null character) inside a JSON
 * document, yet the string "9\0" by itself is fine. Padding a copy with
 * spaces settles the question when there is just one input element.
 *
 * The copy is relatively expensive, but it is almost never made in practice
 * unless you have many JSON documents made of single atoms.
 */
really_inline bool parse_number_with_space_terminated_copy(const bool is_negative) {
  const size_t padded_len = parser.len + SIMDJSON_PADDING;
  uint8_t *padded = static_cast<uint8_t *>(malloc(padded_len));
  if (padded == nullptr) { return true; }
  memcpy(padded, buf, parser.len);
  // Overwrite the whole padding region with spaces.
  memset(padded + parser.len, ' ', SIMDJSON_PADDING);
  const size_t start = *current_structural;
  // parse_number does not throw, so the buffer is always released below.
  const bool outcome = parse_number(padded + start, is_negative);
  free(padded);
  return outcome;
}
WARN_UNUSED really_inline ret_address_t parse_value(const unified_machine_addresses &addresses, ret_address_t continue_state) { WARN_UNUSED really_inline ret_address_t parse_value(const unified_machine_addresses &addresses, ret_address_t continue_state) {
switch (advance_char()) { switch (advance_char()) {
case '"': case '"':
@ -306,6 +331,7 @@ struct structural_parser : structural_iterator {
#undef FAIL_IF #undef FAIL_IF
#define FAIL_IF(EXPR) { if (EXPR) { goto error; } } #define FAIL_IF(EXPR) { if (EXPR) { goto error; } }
template<bool STREAMING> template<bool STREAMING>
WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept { WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept {
dom_parser.doc = &doc; dom_parser.doc = &doc;
@ -351,18 +377,16 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p
goto finish; goto finish;
case '0': case '1': case '2': case '3': case '4': case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9': case '5': case '6': case '7': case '8': case '9':
FAIL_IF( // Next line used to be an interesting functional programming exercise with
parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { // a lambda that gets passed to another function via a closure. This would confuse the
return parser.parse_number(&copy[idx], false); // clangcl compiler under Visual Studio 2019 (recent release).
}) { if(parser.parse_number_with_space_terminated_copy(false)) { goto error; }}
);
goto finish; goto finish;
case '-': case '-':
FAIL_IF( // Next line used to be an interesting functional programming exercise with
parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { // a lambda that gets passed to another function via a closure. This would confuse the
return parser.parse_number(&copy[idx], true); // clangcl compiler under Visual Studio 2019 (recent release).
}) { if(parser.parse_number_with_space_terminated_copy(true)) { goto error; }}
);
goto finish; goto finish;
default: default:
parser.log_error("Document starts with a non-value character"); parser.log_error("Document starts with a non-value character");

View File

@ -37,19 +37,20 @@ really_inline json_character_block json_character_block::classify(const simd::si
// hope that useless computations will be omitted. This is namely case when // hope that useless computations will be omitted. This is namely case when
// minifying (we only need whitespace). // minifying (we only need whitespace).
uint64_t whitespace = in.map([&](simd8<uint8_t> _in) { uint64_t whitespace = simd8x64<bool>(
return _in == simd8<uint8_t>(_mm256_shuffle_epi8(whitespace_table, _in)); in.chunks[0] == simd8<uint8_t>(_mm256_shuffle_epi8(whitespace_table, in.chunks[0])),
}).to_bitmask(); in.chunks[1] == simd8<uint8_t>(_mm256_shuffle_epi8(whitespace_table, in.chunks[1]))
).to_bitmask();
uint64_t op = in.map([&](simd8<uint8_t> _in) {
// | 32 handles the fact that { } and [ ] are exactly 32 bytes apart uint64_t op = simd8x64<bool>(
return (_in | 32) == simd8<uint8_t>(_mm256_shuffle_epi8(op_table, _in-',')); (in.chunks[0] | 32) == simd8<uint8_t>(_mm256_shuffle_epi8(op_table, in.chunks[0]-',')),
}).to_bitmask(); (in.chunks[1] | 32) == simd8<uint8_t>(_mm256_shuffle_epi8(op_table, in.chunks[1]-','))
).to_bitmask();
return { whitespace, op }; return { whitespace, op };
} }
really_inline bool is_ascii(simd8x64<uint8_t> input) { really_inline bool is_ascii(simd8x64<uint8_t> input) {
simd8<uint8_t> bits = input.reduce([&](simd8<uint8_t> a,simd8<uint8_t> b) { return a|b; }); simd8<uint8_t> bits = (input.chunks[0] | input.chunks[1]);
return !bits.any_bits_set_anywhere(0b10000000u); return !bits.any_bits_set_anywhere(0b10000000u);
} }

View File

@ -316,36 +316,6 @@ namespace simd {
this->chunks[1].store(ptr+sizeof(simd8<T>)*1); this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
} }
// Invokes each_chunk once per chunk, in index order 0..1.
template <typename F>
really_inline void each(F const& each_chunk) const
{
  for (int idx = 0; idx < 2; ++idx) {
    each_chunk(this->chunks[idx]);
  }
}
// Applies map_chunk to both chunks and packs the two results into a
// simd8x64<R>.
template <typename R=bool, typename F>
really_inline simd8x64<R> map(F const& map_chunk) const {
  const auto mapped_lo = map_chunk(this->chunks[0]);
  const auto mapped_hi = map_chunk(this->chunks[1]);
  return simd8x64<R>(mapped_lo, mapped_hi);
}
// Pairwise variant: applies map_chunk to chunk i of this object and chunk i
// of b, for i = 0..1, and packs the two results into a simd8x64<R>.
template <typename R=bool, typename F>
really_inline simd8x64<R> map(const simd8x64<uint8_t> b, F const& map_chunk) const {
  const auto mapped_lo = map_chunk(this->chunks[0], b.chunks[0]);
  const auto mapped_hi = map_chunk(this->chunks[1], b.chunks[1]);
  return simd8x64<R>(mapped_lo, mapped_hi);
}
// Combines the two chunks into one value with a single reduce_pair call.
template <typename F>
really_inline simd8<T> reduce(F const& reduce_pair) const {
  const auto& lo_chunk = this->chunks[0];
  const auto& hi_chunk = this->chunks[1];
  return reduce_pair(lo_chunk, hi_chunk);
}
really_inline uint64_t to_bitmask() const { really_inline uint64_t to_bitmask() const {
uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask()); uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
uint64_t r_hi = this->chunks[1].to_bitmask(); uint64_t r_hi = this->chunks[1].to_bitmask();
@ -354,17 +324,26 @@ namespace simd {
really_inline simd8x64<T> bit_or(const T m) const { really_inline simd8x64<T> bit_or(const T m) const {
const simd8<T> mask = simd8<T>::splat(m); const simd8<T> mask = simd8<T>::splat(m);
return this->map( [&](simd8<T> a) { return a | mask; } ); return simd8x64<T>(
this->chunks[0] | mask,
this->chunks[1] | mask
);
} }
really_inline uint64_t eq(const T m) const { really_inline uint64_t eq(const T m) const {
const simd8<T> mask = simd8<T>::splat(m); const simd8<T> mask = simd8<T>::splat(m);
return this->map( [&](simd8<T> a) { return a == mask; } ).to_bitmask(); return simd8x64<bool>(
this->chunks[0] == mask,
this->chunks[1] == mask
).to_bitmask();
} }
really_inline uint64_t lteq(const T m) const { really_inline uint64_t lteq(const T m) const {
const simd8<T> mask = simd8<T>::splat(m); const simd8<T> mask = simd8<T>::splat(m);
return this->map( [&](simd8<T> a) { return a <= mask; } ).to_bitmask(); return simd8x64<bool>(
this->chunks[0] <= mask,
this->chunks[1] <= mask
).to_bitmask();
} }
}; // struct simd8x64<T> }; // struct simd8x64<T>

View File

@ -38,19 +38,25 @@ really_inline json_character_block json_character_block::classify(const simd::si
// hope that useless computations will be omitted. This is namely case when // hope that useless computations will be omitted. This is namely case when
// minifying (we only need whitespace). // minifying (we only need whitespace).
uint64_t whitespace = in.map([&](simd8<uint8_t> _in) { uint64_t whitespace = simd8x64<bool>(
return _in == simd8<uint8_t>(_mm_shuffle_epi8(whitespace_table, _in)); in.chunks[0] == simd8<uint8_t>(_mm_shuffle_epi8(whitespace_table, in.chunks[0])),
}).to_bitmask(); in.chunks[1] == simd8<uint8_t>(_mm_shuffle_epi8(whitespace_table, in.chunks[1])),
in.chunks[2] == simd8<uint8_t>(_mm_shuffle_epi8(whitespace_table, in.chunks[2])),
in.chunks[3] == simd8<uint8_t>(_mm_shuffle_epi8(whitespace_table, in.chunks[3]))
).to_bitmask();
uint64_t op = in.map([&](simd8<uint8_t> _in) { // | 32 handles the fact that { } and [ ] are exactly 32 bytes apart
// | 32 handles the fact that { } and [ ] are exactly 32 bytes apart uint64_t op = simd8x64<bool>(
return (_in | 32) == simd8<uint8_t>(_mm_shuffle_epi8(op_table, _in-',')); (in.chunks[0] | 32) == simd8<uint8_t>(_mm_shuffle_epi8(op_table, in.chunks[0]-',')),
}).to_bitmask(); (in.chunks[1] | 32) == simd8<uint8_t>(_mm_shuffle_epi8(op_table, in.chunks[1]-',')),
(in.chunks[2] | 32) == simd8<uint8_t>(_mm_shuffle_epi8(op_table, in.chunks[2]-',')),
(in.chunks[3] | 32) == simd8<uint8_t>(_mm_shuffle_epi8(op_table, in.chunks[3]-','))
).to_bitmask();
return { whitespace, op }; return { whitespace, op };
} }
really_inline bool is_ascii(simd8x64<uint8_t> input) { really_inline bool is_ascii(simd8x64<uint8_t> input) {
simd8<uint8_t> bits = input.reduce([&](simd8<uint8_t> a,simd8<uint8_t> b) { return a|b; }); simd8<uint8_t> bits = (input.chunks[0] | input.chunks[1]) | (input.chunks[2] | input.chunks[3]);
return !bits.any_bits_set_anywhere(0b10000000u); return !bits.any_bits_set_anywhere(0b10000000u);
} }

View File

@ -292,43 +292,6 @@ namespace simd {
each(3); each(3);
} }
// Invokes each_chunk once per chunk, in index order 0..3.
template <typename F>
really_inline void each(F const& each_chunk) const
{
  for (int idx = 0; idx < 4; ++idx) {
    each_chunk(this->chunks[idx]);
  }
}
// Applies map_chunk to each of the four chunks and packs the four results
// into a simd8x64<R>.
template <typename F, typename R=bool>
really_inline simd8x64<R> map(F const& map_chunk) const {
  const auto mapped0 = map_chunk(this->chunks[0]);
  const auto mapped1 = map_chunk(this->chunks[1]);
  const auto mapped2 = map_chunk(this->chunks[2]);
  const auto mapped3 = map_chunk(this->chunks[3]);
  return simd8x64<R>(mapped0, mapped1, mapped2, mapped3);
}
// Pairwise variant: applies map_chunk to chunk i of this object and chunk i
// of b, for i = 0..3, and packs the four results into a simd8x64<R>.
template <typename F, typename R=bool>
really_inline simd8x64<R> map(const simd8x64<uint8_t> b, F const& map_chunk) const {
  const auto mapped0 = map_chunk(this->chunks[0], b.chunks[0]);
  const auto mapped1 = map_chunk(this->chunks[1], b.chunks[1]);
  const auto mapped2 = map_chunk(this->chunks[2], b.chunks[2]);
  const auto mapped3 = map_chunk(this->chunks[3], b.chunks[3]);
  return simd8x64<R>(mapped0, mapped1, mapped2, mapped3);
}
// Folds the four chunks into one value as a balanced tree:
// reduce_pair(reduce_pair(c0, c1), reduce_pair(c2, c3)).
template <typename F>
really_inline simd8<T> reduce(F const& reduce_pair) const {
  const auto lower_half = reduce_pair(this->chunks[0], this->chunks[1]);
  const auto upper_half = reduce_pair(this->chunks[2], this->chunks[3]);
  return reduce_pair(lower_half, upper_half);
}
really_inline uint64_t to_bitmask() const { really_inline uint64_t to_bitmask() const {
uint64_t r0 = uint32_t(this->chunks[0].to_bitmask()); uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
uint64_t r1 = this->chunks[1].to_bitmask(); uint64_t r1 = this->chunks[1].to_bitmask();
@ -339,17 +302,32 @@ namespace simd {
really_inline simd8x64<T> bit_or(const T m) const { really_inline simd8x64<T> bit_or(const T m) const {
const simd8<T> mask = simd8<T>::splat(m); const simd8<T> mask = simd8<T>::splat(m);
return this->map( [&](simd8<T> a) { return a | mask; } ); return simd8x64<T>(
this->chunks[0] | mask,
this->chunks[1] | mask,
this->chunks[2] | mask,
this->chunks[3] | mask
);
} }
really_inline uint64_t eq(const T m) const { really_inline uint64_t eq(const T m) const {
const simd8<T> mask = simd8<T>::splat(m); const simd8<T> mask = simd8<T>::splat(m);
return this->map( [&](simd8<T> a) { return a == mask; } ).to_bitmask(); return simd8x64<bool>(
this->chunks[0] == mask,
this->chunks[1] == mask,
this->chunks[2] == mask,
this->chunks[3] == mask
).to_bitmask();
} }
really_inline uint64_t lteq(const T m) const { really_inline uint64_t lteq(const T m) const {
const simd8<T> mask = simd8<T>::splat(m); const simd8<T> mask = simd8<T>::splat(m);
return this->map( [&](simd8<T> a) { return a <= mask; } ).to_bitmask(); return simd8x64<bool>(
this->chunks[0] <= mask,
this->chunks[1] <= mask,
this->chunks[2] <= mask,
this->chunks[3] <= mask
).to_bitmask();
} }
}; // struct simd8x64<T> }; // struct simd8x64<T>