Remove lookup_lower_4_bits

It only works in its current uses by coincidence: it does not do what
the name says. In particular, if the high bit of an index byte is set,
the lookup yields 0 even when the lower 4 bits would select a
different table entry.
Author: John Keiser
Date:   2019-11-19 10:29:25 -08:00
Parent: c5504ef50b
Commit: 7d7bec856d

5 changed files with 94 additions and 60 deletions
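
The behavior described in the message comes from pshufb on x86: whenever bit 7 of an index byte is set, the instruction ignores the low 4 bits and forces that lane to 0. A minimal standalone sketch of this (my own illustration, not part of the commit; compile with -mssse3):

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  // Table entry i holds i+1 so we can tell which slot a lookup picked.
  __m128i table = _mm_setr_epi8(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16);
  // Two indices with the same low 4 bits (3): high bit clear vs. high bit set.
  __m128i idx = _mm_setr_epi8(0x03, (char)0x83, 0,0,0,0,0,0,0,0,0,0,0,0,0,0);
  uint8_t out[16];
  _mm_storeu_si128((__m128i *)out, _mm_shuffle_epi8(table, idx));
  // Prints "4 0": with the high bit set, pshufb zeroes the lane instead of
  // using the lower 4 bits, so a "lookup_lower_4_bits" built on it is misnamed.
  printf("%u %u\n", out[0], out[1]);
}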


@@ -91,6 +91,16 @@ namespace simdjson::arm64::simd {
       v0, v1, v2, v3, v4, v5, v6, v7,
       v8, v9, v10,v11,v12,v13,v14,v15
     }) {}
+    // Repeat 16 values as many times as necessary (usually for lookup tables)
+    really_inline static simd8<uint8_t> repeat_16(
+      uint8_t v0, uint8_t v1, uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
+      uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
+    ) {
+      return simd8<uint8_t>(
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15
+      );
+    }
     // Store to array
     really_inline void store(uint8_t dst[16]) { return vst1q_u8(dst, *this); }

@@ -119,7 +129,7 @@ namespace simdjson::arm64::simd {
     template<int N>
     really_inline simd8<uint8_t> shl() const { return vshlq_n_u8(*this, N); }
-    // Perform a lookup assuming no value is larger than 16
+    // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
     template<typename L>
     really_inline simd8<L> lookup_16(
       L replace0, L replace1, L replace2, L replace3,

@@ -135,23 +145,9 @@ namespace simdjson::arm64::simd {
       return lookup_table.apply_lookup_16_to(*this);
     }
-    // Perform a lookup of the lower 4 bits
-    template<typename L>
-    really_inline simd8<L> lookup_lower_4_bits(
-      L replace0, L replace1, L replace2, L replace3,
-      L replace4, L replace5, L replace6, L replace7,
-      L replace8, L replace9, L replace10, L replace11,
-      L replace12, L replace13, L replace14, L replace15) const {
-      return (*this & 0xF).lookup_16(
-        replace0, replace1, replace2, replace3,
-        replace4, replace5, replace6, replace7,
-        replace8, replace9, replace10, replace11,
-        replace12, replace13, replace14, replace15
-      );
-    }
-    really_inline simd8<uint8_t> apply_lookup_16_to(const simd8<uint8_t> original) {
-      return vqtbl1q_u8(*this, original);
+    template<typename T>
+    really_inline simd8<uint8_t> apply_lookup_16_to(const simd8<T> original) {
+      return vqtbl1q_u8(*this, simd8<uint8_t>(original));
     }
   };

@@ -183,6 +179,16 @@ namespace simdjson::arm64::simd {
       v0, v1, v2, v3, v4, v5, v6, v7,
       v8, v9, v10,v11,v12,v13,v14,v15
     }) {}
+    // Repeat 16 values as many times as necessary (usually for lookup tables)
+    really_inline static simd8<int8_t> repeat_16(
+      int8_t v0, int8_t v1, int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
+      int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
+    ) {
+      return simd8<int8_t>(
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15
+      );
+    }
     // Store to array
     really_inline void store(int8_t dst[16]) { return vst1q_s8(dst, *this); }

@@ -223,7 +229,8 @@ namespace simdjson::arm64::simd {
       );
     }
-    really_inline simd8<int8_t> apply_lookup_16_to(const simd8<uint8_t> original) {
+    template<typename T>
+    really_inline simd8<int8_t> apply_lookup_16_to(const simd8<T> original) {
       return vqtbl1q_s8(*this, original);
     }
   };
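
For reference (my own note, not from the commit): on arm64 these lookups compile to the TBL instruction via vqtbl1q_u8, which returns 0 for any index outside 0..15. Declaring out-of-range indices undefined in the class comment lets that zeroing behavior and the x86 pshufb behavior fit under one portable contract. A small sketch, assuming an AArch64 target:

#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

int main() {
  const uint8_t table_bytes[16] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
  const uint8_t index_bytes[16] = {3, 0x83, 19, 0,0,0,0,0,0,0,0,0,0,0,0,0};
  uint8x16_t table = vld1q_u8(table_bytes);
  uint8x16_t idx   = vld1q_u8(index_bytes);
  uint8_t out[16];
  vst1q_u8(out, vqtbl1q_u8(table, idx));  // lanes with an index >= 16 become 0
  printf("%u %u %u\n", out[0], out[1], out[2]);  // prints "4 0 0"
}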


@@ -90,39 +90,23 @@ namespace simdjson::haswell::simd {
     really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *this; }
     really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *this; }
-    // Perform a lookup of the lower 4 bits
+    // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
     template<typename L>
-    really_inline simd8<L> lookup_lower_4_bits(
-      L replace0, L replace1, L replace2, L replace3,
-      L replace4, L replace5, L replace6, L replace7,
-      L replace8, L replace9, L replace10, L replace11,
-      L replace12, L replace13, L replace14, L replace15) const {
-      simd8<L> lookup_table(
-        replace0, replace1, replace2, replace3,
-        replace4, replace5, replace6, replace7,
-        replace8, replace9, replace10, replace11,
-        replace12, replace13, replace14, replace15,
-        replace0, replace1, replace2, replace3,
-        replace4, replace5, replace6, replace7,
-        replace8, replace9, replace10, replace11,
-        replace12, replace13, replace14, replace15
-      );
+    really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
       return _mm256_shuffle_epi8(lookup_table, *this);
     }
-    // Perform a lookup assuming the value is between 0 and 16
     template<typename L>
     really_inline simd8<L> lookup_16(
       L replace0, L replace1, L replace2, L replace3,
       L replace4, L replace5, L replace6, L replace7,
       L replace8, L replace9, L replace10, L replace11,
       L replace12, L replace13, L replace14, L replace15) const {
-      return lookup_lower_4_bits(
+      return lookup_16(simd8<L>::repeat_16(
         replace0, replace1, replace2, replace3,
         replace4, replace5, replace6, replace7,
         replace8, replace9, replace10, replace11,
         replace12, replace13, replace14, replace15
-      );
+      ));
     }
   };

@@ -147,6 +131,18 @@ namespace simdjson::haswell::simd {
       v16,v17,v18,v19,v20,v21,v22,v23,
       v24,v25,v26,v27,v28,v29,v30,v31
     )) {}
+    // Repeat 16 values as many times as necessary (usually for lookup tables)
+    really_inline static simd8<int8_t> repeat_16(
+      int8_t v0, int8_t v1, int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
+      int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
+    ) {
+      return simd8<int8_t>(
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15,
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15
+      );
+    }
     // Order-sensitive comparisons
     really_inline simd8<int8_t> max(const simd8<int8_t> other) const { return _mm256_max_epi8(*this, other); }

@@ -175,6 +171,18 @@ namespace simdjson::haswell::simd {
       v16,v17,v18,v19,v20,v21,v22,v23,
       v24,v25,v26,v27,v28,v29,v30,v31
     )) {}
+    // Repeat 16 values as many times as necessary (usually for lookup tables)
+    really_inline static simd8<uint8_t> repeat_16(
+      uint8_t v0, uint8_t v1, uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
+      uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
+    ) {
+      return simd8<uint8_t>(
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15,
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15
+      );
+    }
     // Saturated math
     really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm256_adds_epu8(*this, other); }
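
One non-obvious detail in the 256-bit repeat_16 above (my own note, not from the commit): vpshufb shuffles each 128-bit lane independently, so the 16 table entries have to be duplicated into both halves for indices 0..15 to select the same values in both lanes. A standalone sketch, assuming an AVX2 target:

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  // The same 16 entries in both 128-bit halves, as repeat_16 produces.
  __m256i table = _mm256_setr_epi8(
    10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,
    10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25);
  __m256i idx = _mm256_set1_epi8(5);  // look up entry 5 in every byte position
  uint8_t out[32];
  _mm256_storeu_si256((__m256i *)out, _mm256_shuffle_epi8(table, idx));
  // Both lanes read their own copy of the table: prints "15 15". If the upper
  // half of the table were left as zeros, out[16] would be 0 instead.
  printf("%u %u\n", out[0], out[16]);
}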


@@ -18,12 +18,18 @@ really_inline void find_whitespace_and_operators(
   const simd::simd8x64<uint8_t> in,
   uint64_t &whitespace, uint64_t &op) {
+  // These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why
+  // we can't use the generic lookup_16.
+  auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
+  auto op_table = simd8<uint8_t>::repeat_16(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{');
   whitespace = in.map([&](simd8<uint8_t> _in) {
-    return _in == _in.lookup_lower_4_bits<uint8_t>(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
+    return _in == simd8<uint8_t>(_mm256_shuffle_epi8(whitespace_table, _in));
   }).to_bitmask();
   op = in.map([&](simd8<uint8_t> _in) {
-    return (_in | 32) == (_in+0xd4u).lookup_lower_4_bits<uint8_t>(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{');
+    // | 32 handles the fact that { } and [ ] are exactly 32 bytes apart
+    return (_in | 32) == simd8<uint8_t>(_mm256_shuffle_epi8(op_table, _in-','));
   }).to_bitmask();
 }
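
To unpack the two comments added above, here is a scalar model of the same classification (my own sketch, not part of the commit): a byte below 128 indexes the table by its low 4 bits, a byte with the high bit set shuffles to 0 and can never match, and `| 32` folds '[' onto '{' and ']' onto '}' because each pair differs by exactly 32. Subtracting ',' plays the role of the old `+0xd4u` (0xd4 == 256 - ','), lining the structural characters up with the table slots:

#include <cstdint>
#include <cstdio>

// One byte of pshufb: zero if the high bit of the index is set, otherwise the
// entry selected by the low 4 bits.
static uint8_t shuffle_byte(const uint8_t table[16], uint8_t idx) {
  return (idx & 0x80) ? 0 : table[idx & 0x0F];
}

int main() {
  const uint8_t whitespace_table[16] =
      {' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100};
  const uint8_t op_table[16] =
      {',', '}', 0, 0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{'};

  for (int c = 0; c < 256; c++) {
    uint8_t in = (uint8_t)c;
    bool is_whitespace = in == shuffle_byte(whitespace_table, in);
    bool is_op = (uint8_t)(in | 32) == shuffle_byte(op_table, (uint8_t)(in - ','));
    if (is_whitespace || is_op)
      printf("0x%02x -> %s\n", in, is_whitespace ? "whitespace" : "op");
  }
  // Prints exactly 0x09, 0x0a, 0x0d, 0x20 as whitespace and , : [ ] { } as op.
}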


@@ -91,36 +91,23 @@ namespace simdjson::westmere::simd {
     really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *this; }
     really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *this; }
-    // Perform a lookup of the lower 4 bits
+    // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
     template<typename L>
-    really_inline simd8<L> lookup_lower_4_bits(
-      L replace0, L replace1, L replace2, L replace3,
-      L replace4, L replace5, L replace6, L replace7,
-      L replace8, L replace9, L replace10, L replace11,
-      L replace12, L replace13, L replace14, L replace15) const {
-      simd8<L> lookup_table(
-        replace0, replace1, replace2, replace3,
-        replace4, replace5, replace6, replace7,
-        replace8, replace9, replace10, replace11,
-        replace12, replace13, replace14, replace15
-      );
+    really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
       return _mm_shuffle_epi8(lookup_table, *this);
     }
-    // Perform a lookup assuming the value is between 0 and 16
     template<typename L>
     really_inline simd8<L> lookup_16(
       L replace0, L replace1, L replace2, L replace3,
       L replace4, L replace5, L replace6, L replace7,
       L replace8, L replace9, L replace10, L replace11,
       L replace12, L replace13, L replace14, L replace15) const {
-      return lookup_lower_4_bits(
+      return lookup_16(simd8<L>::repeat_16(
         replace0, replace1, replace2, replace3,
         replace4, replace5, replace6, replace7,
         replace8, replace9, replace10, replace11,
         replace12, replace13, replace14, replace15
-      );
+      ));
     }
   };

@@ -141,6 +128,16 @@ namespace simdjson::westmere::simd {
       v0, v1, v2, v3, v4, v5, v6, v7,
       v8, v9, v10,v11,v12,v13,v14,v15
     )) {}
+    // Repeat 16 values as many times as necessary (usually for lookup tables)
+    really_inline static simd8<int8_t> repeat_16(
+      int8_t v0, int8_t v1, int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
+      int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
+    ) {
+      return simd8<int8_t>(
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15
+      );
+    }
     // Order-sensitive comparisons
     really_inline simd8<int8_t> max(const simd8<int8_t> other) const { return _mm_max_epi8(*this, other); }

@@ -165,6 +162,16 @@ namespace simdjson::westmere::simd {
       v0, v1, v2, v3, v4, v5, v6, v7,
       v8, v9, v10,v11,v12,v13,v14,v15
     )) {}
+    // Repeat 16 values as many times as necessary (usually for lookup tables)
+    really_inline static simd8<uint8_t> repeat_16(
+      uint8_t v0, uint8_t v1, uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
+      uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
+    ) {
+      return simd8<uint8_t>(
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15
+      );
+    }
     // Saturated math
     really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm_adds_epu8(*this, other); }
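
The westmere (SSE) refactor above has the same shape as the AVX2 one, minus the lane duplication: repeat_16 on a 128-bit vector is just the plain 16-byte constructor, and the scalar-argument lookup_16 is now sugar over the table-argument overload. A standalone sketch mirroring that API shape (my own, not code from the commit; compile with -mssse3):

#include <immintrin.h>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Table-argument form: a single pshufb.
static inline __m128i lookup_16(__m128i indices, __m128i table) {
  return _mm_shuffle_epi8(table, indices);
}

// Scalar-argument form: build the table (no duplication needed at 128 bits)
// and forward to the table-argument form.
static inline __m128i lookup_16(__m128i indices,
    uint8_t v0, uint8_t v1, uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
    uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15) {
  __m128i table = _mm_setr_epi8(v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15);
  return lookup_16(indices, table);
}

int main() {
  __m128i idx = _mm_setr_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
  __m128i a = lookup_16(idx, _mm_setr_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0));
  __m128i b = lookup_16(idx, 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
  printf("%d\n", !memcmp(&a, &b, 16));  // prints "1": both overloads agree
}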


@@ -18,12 +18,18 @@ really_inline void find_whitespace_and_operators(
   const simd8x64<uint8_t> in,
   uint64_t &whitespace, uint64_t &op) {
+  // These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why
+  // we can't use the generic lookup_16.
+  auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
+  auto op_table = simd8<uint8_t>::repeat_16(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{');
   whitespace = in.map([&](simd8<uint8_t> _in) {
-    return _in == _in.lookup_lower_4_bits<uint8_t>(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
+    return _in == simd8<uint8_t>(_mm_shuffle_epi8(whitespace_table, _in));
   }).to_bitmask();
   op = in.map([&](simd8<uint8_t> _in) {
-    return (_in | 32) == (_in+0xd4u).lookup_lower_4_bits<uint8_t>(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{');
+    // | 32 handles the fact that { } and [ ] are exactly 32 bytes apart
+    return (_in | 32) == simd8<uint8_t>(_mm_shuffle_epi8(op_table, _in-','));
   }).to_bitmask();
 }