Add AMD64 build_bitmask

parent cf4ae61ac6
commit 441963c84c
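Note: several of the diffs below are in auto-generated files (see the "auto-generated ... Do not edit!" headers), which appear to be the amalgamated single-header build outputs; the same changes show up again further down in the source files they are generated from.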
@@ -1,4 +1,4 @@
-/* auto-generated on Fri Aug 23 10:23:28 DST 2019. Do not edit! */
+/* auto-generated on Fri Aug 23 11:02:39 DST 2019. Do not edit! */
 #include <iostream>
 #include "simdjson.h"

@@ -1,4 +1,4 @@
-/* auto-generated on Fri Aug 23 10:23:28 DST 2019. Do not edit! */
+/* auto-generated on Fri Aug 23 11:02:39 DST 2019. Do not edit! */
 #include "simdjson.h"

 /* used for http://dmalloc.com/ Dmalloc - Debug Malloc Library */

@@ -574,22 +574,38 @@ struct simd_input<Architecture::ARM64> {
     this->i3 = vld1q_u8(ptr + 48);
   }

+  template <typename F>
+  really_inline uint64_t build_bitmask(F const& chunk_to_mask) {
+    uint8x16_t r0 = chunk_to_mask(this->i0);
+    uint8x16_t r1 = chunk_to_mask(this->i1);
+    uint8x16_t r2 = chunk_to_mask(this->i2);
+    uint8x16_t r3 = chunk_to_mask(this->i3);
+    return neon_movemask_bulk(r0, r1, r2, r3);
+  }
+
+  template <typename F>
+  really_inline simd_input<Architecture::ARM64> map(F const& map_chunk) {
+    simd_input<Architecture::ARM64> result = {
+      map_chunk(this->i0),
+      map_chunk(this->i1),
+      map_chunk(this->i2),
+      map_chunk(this->i3)
+    };
+    return result;
+  }
+
   really_inline uint64_t eq(uint8_t m) {
     const uint8x16_t mask = vmovq_n_u8(m);
-    uint8x16_t cmp_res_0 = vceqq_u8(this->i0, mask);
-    uint8x16_t cmp_res_1 = vceqq_u8(this->i1, mask);
-    uint8x16_t cmp_res_2 = vceqq_u8(this->i2, mask);
-    uint8x16_t cmp_res_3 = vceqq_u8(this->i3, mask);
-    return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
+    return this->build_bitmask([&](uint8x16_t chunk) {
+      return vceqq_u8(chunk, mask);
+    });
   }

   really_inline uint64_t lteq(uint8_t m) {
     const uint8x16_t mask = vmovq_n_u8(m);
-    uint8x16_t cmp_res_0 = vcleq_u8(this->i0, mask);
-    uint8x16_t cmp_res_1 = vcleq_u8(this->i1, mask);
-    uint8x16_t cmp_res_2 = vcleq_u8(this->i2, mask);
-    uint8x16_t cmp_res_3 = vcleq_u8(this->i3, mask);
-    return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
+    return this->build_bitmask([&](uint8x16_t chunk) {
+      return vcleq_u8(chunk, mask);
+    });
   }

 }; // struct simd_input

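Note: build_bitmask takes a callable that turns one 16-byte chunk into a per-byte match mask and funnels the four results through neon_movemask_bulk, which is why eq and lteq collapse to one lambda each. A portable scalar sketch of the pattern (illustrative only; chunk, simd_input_sketch, and the explicit bit-packing are stand-ins for uint8x16_t, simd_input, and neon_movemask_bulk):

    // Portable sketch only -- assumed stand-ins, not simdjson's NEON code.
    #include <array>
    #include <cstdint>
    #include <cstdio>

    using chunk = std::array<uint8_t, 16>;   // stand-in for uint8x16_t

    struct simd_input_sketch {
      chunk i0, i1, i2, i3;                  // 64 input bytes, as in simd_input

      // chunk_to_mask returns a 16-bit mask (bit b set iff byte b "matched");
      // concatenating the four results is the job neon_movemask_bulk does for
      // the real 0x00/0xFF vector compare results.
      template <typename F>
      uint64_t build_bitmask(F const& chunk_to_mask) const {
        return  uint64_t(chunk_to_mask(i0))
             | (uint64_t(chunk_to_mask(i1)) << 16)
             | (uint64_t(chunk_to_mask(i2)) << 32)
             | (uint64_t(chunk_to_mask(i3)) << 48);
      }

      // eq expressed through build_bitmask, mirroring the diff above.
      uint64_t eq(uint8_t m) const {
        return build_bitmask([&](chunk const& c) {
          uint16_t mask = 0;
          for (int b = 0; b < 16; b++)
            if (c[b] == m) mask |= uint16_t(1) << b;
          return mask;
        });
      }
    };

    int main() {
      simd_input_sketch in{};   // all 64 bytes zero
      in.i1[2] = '"';           // byte 18 of the 64-byte block
      std::printf("%016llx\n", (unsigned long long)in.eq('"'));  // bit 18 -> 0x40000
    }
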
@@ -1467,45 +1483,25 @@ really_inline void find_whitespace_and_structurals(
       (uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
   const uint8x16_t high_nibble_mask =
       (uint8x16_t){8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0};
-  const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
-  const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
   const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf);

-  uint8x16_t nib_0_lo = vandq_u8(in.i0, low_nib_and_mask);
-  uint8x16_t nib_0_hi = vshrq_n_u8(in.i0, 4);
-  uint8x16_t shuf_0_lo = vqtbl1q_u8(low_nibble_mask, nib_0_lo);
-  uint8x16_t shuf_0_hi = vqtbl1q_u8(high_nibble_mask, nib_0_hi);
-  uint8x16_t v_0 = vandq_u8(shuf_0_lo, shuf_0_hi);
+  simd_input<ARCHITECTURE> v = in.map([&](auto chunk) {
+    uint8x16_t nib_lo = vandq_u8(chunk, low_nib_and_mask);
+    uint8x16_t nib_hi = vshrq_n_u8(chunk, 4);
+    uint8x16_t shuf_lo = vqtbl1q_u8(low_nibble_mask, nib_lo);
+    uint8x16_t shuf_hi = vqtbl1q_u8(high_nibble_mask, nib_hi);
+    return vandq_u8(shuf_lo, shuf_hi);
+  });

-  uint8x16_t nib_1_lo = vandq_u8(in.i1, low_nib_and_mask);
-  uint8x16_t nib_1_hi = vshrq_n_u8(in.i1, 4);
-  uint8x16_t shuf_1_lo = vqtbl1q_u8(low_nibble_mask, nib_1_lo);
-  uint8x16_t shuf_1_hi = vqtbl1q_u8(high_nibble_mask, nib_1_hi);
-  uint8x16_t v_1 = vandq_u8(shuf_1_lo, shuf_1_hi);
+  const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
+  structurals = v.build_bitmask([&](auto chunk) {
+    return vtstq_u8(chunk, structural_shufti_mask);
+  });

-  uint8x16_t nib_2_lo = vandq_u8(in.i2, low_nib_and_mask);
-  uint8x16_t nib_2_hi = vshrq_n_u8(in.i2, 4);
-  uint8x16_t shuf_2_lo = vqtbl1q_u8(low_nibble_mask, nib_2_lo);
-  uint8x16_t shuf_2_hi = vqtbl1q_u8(high_nibble_mask, nib_2_hi);
-  uint8x16_t v_2 = vandq_u8(shuf_2_lo, shuf_2_hi);
-
-  uint8x16_t nib_3_lo = vandq_u8(in.i3, low_nib_and_mask);
-  uint8x16_t nib_3_hi = vshrq_n_u8(in.i3, 4);
-  uint8x16_t shuf_3_lo = vqtbl1q_u8(low_nibble_mask, nib_3_lo);
-  uint8x16_t shuf_3_hi = vqtbl1q_u8(high_nibble_mask, nib_3_hi);
-  uint8x16_t v_3 = vandq_u8(shuf_3_lo, shuf_3_hi);
-
-  uint8x16_t tmp_0 = vtstq_u8(v_0, structural_shufti_mask);
-  uint8x16_t tmp_1 = vtstq_u8(v_1, structural_shufti_mask);
-  uint8x16_t tmp_2 = vtstq_u8(v_2, structural_shufti_mask);
-  uint8x16_t tmp_3 = vtstq_u8(v_3, structural_shufti_mask);
-  structurals = neon_movemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
-
-  uint8x16_t tmp_ws_0 = vtstq_u8(v_0, whitespace_shufti_mask);
-  uint8x16_t tmp_ws_1 = vtstq_u8(v_1, whitespace_shufti_mask);
-  uint8x16_t tmp_ws_2 = vtstq_u8(v_2, whitespace_shufti_mask);
-  uint8x16_t tmp_ws_3 = vtstq_u8(v_3, whitespace_shufti_mask);
-  whitespace = neon_movemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
+  const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
+  whitespace = v.build_bitmask([&](auto chunk) {
+    return vtstq_u8(chunk, whitespace_shufti_mask);
+  });
 }

 // This file contains a non-architecture-specific version of "flatten" used in stage1.

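Note: the vqtbl1q_u8 pair implements the "shufti" trick: each byte's low and high nibbles index the two tables, and ANDing the two lookups leaves class bits, 0x7 marking structural characters and 0x18 whitespace, which is what the vtstq_u8 masks then test. A scalar model using the same tables (illustrative only):

    #include <cstdint>
    #include <cstdio>

    // Same tables as in the diff; the index is a nibble value (0..15).
    static const uint8_t low_nibble_mask[16] =
        {16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
    static const uint8_t high_nibble_mask[16] =
        {8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0};

    // Scalar equivalent of the per-byte work the vectorized map() body performs.
    static uint8_t classify(uint8_t c) {
      return low_nibble_mask[c & 0xf] & high_nibble_mask[c >> 4];
    }

    int main() {
      // e.g. ',' = 0x2C: low 0xC -> 2, high 0x2 -> 18, 2 & 18 = 2  -> structural.
      //      ' ' = 0x20: low 0x0 -> 16, high 0x2 -> 18, 16 & 18 = 16 -> whitespace.
      for (uint8_t c : {',', ':', '{', '}', ' ', '\t', '\n'}) {
        uint8_t v = classify(c);
        std::printf("0x%02x structural=%d whitespace=%d\n",
                    c, (v & 0x7) != 0, (v & 0x18) != 0);
      }
    }
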
@@ -1,4 +1,4 @@
-/* auto-generated on Fri Aug 23 10:23:28 DST 2019. Do not edit! */
+/* auto-generated on Fri Aug 23 11:02:39 DST 2019. Do not edit! */
 /* begin file include/simdjson/simdjson_version.h */
 // /include/simdjson/simdjson_version.h automatically generated by release.py,
 // do not change by hand

@@ -46,22 +46,45 @@ struct simd_input<Architecture::ARM64> {
     this->i3 = vld1q_u8(ptr + 48);
   }

+  really_inline simd_input(uint8x16_t i0, uint8x16_t i1, uint8x16_t i2, uint8x16_t i3) {
+    this->i0 = i0;
+    this->i1 = i1;
+    this->i2 = i2;
+    this->i3 = i3;
+  }
+
+  template <typename F>
+  really_inline uint64_t build_bitmask(F const& chunk_to_mask) {
+    uint8x16_t r0 = chunk_to_mask(this->i0);
+    uint8x16_t r1 = chunk_to_mask(this->i1);
+    uint8x16_t r2 = chunk_to_mask(this->i2);
+    uint8x16_t r3 = chunk_to_mask(this->i3);
+    return neon_movemask_bulk(r0, r1, r2, r3);
+  }
+
+  template <typename F>
+  really_inline simd_input<Architecture::ARM64> map(F const& map_chunk) {
+    simd_input<Architecture::ARM64> result = {
+      map_chunk(this->i0),
+      map_chunk(this->i1),
+      map_chunk(this->i2),
+      map_chunk(this->i3)
+    };
+    return result;
+  }
+
   really_inline uint64_t eq(uint8_t m) {
     const uint8x16_t mask = vmovq_n_u8(m);
-    uint8x16_t cmp_res_0 = vceqq_u8(this->i0, mask);
-    uint8x16_t cmp_res_1 = vceqq_u8(this->i1, mask);
-    uint8x16_t cmp_res_2 = vceqq_u8(this->i2, mask);
-    uint8x16_t cmp_res_3 = vceqq_u8(this->i3, mask);
-    return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
+    return this->build_bitmask([&](uint8x16_t chunk) {
+      return vceqq_u8(chunk, mask);
+    });
   }

   really_inline uint64_t lteq(uint8_t m) {
     const uint8x16_t mask = vmovq_n_u8(m);
-    uint8x16_t cmp_res_0 = vcleq_u8(this->i0, mask);
-    uint8x16_t cmp_res_1 = vcleq_u8(this->i1, mask);
-    uint8x16_t cmp_res_2 = vcleq_u8(this->i2, mask);
-    uint8x16_t cmp_res_3 = vcleq_u8(this->i3, mask);
-    return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
+    return this->build_bitmask([&](uint8x16_t chunk) {
+      return vcleq_u8(chunk, mask);
+    });
   }

 }; // struct simd_input

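Note: the source-file version of this diff also adds a four-argument constructor, presumably because map() needs it: simd_input already has a pointer constructor and is therefore not an aggregate, so map()'s braced return value must match a real constructor. The one helper not shown here, neon_movemask_bulk, is defined elsewhere in the tree; a scalar model of its assumed behavior (illustrative only):

    // Scalar model of neon_movemask_bulk's assumed behavior: each input byte
    // is 0xFF where a compare held and 0x00 otherwise; keep one bit per byte
    // and pack the 4 x 16 lanes, in order, into a single 64-bit mask.
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    static uint64_t movemask_bulk_model(const uint8_t r0[16], const uint8_t r1[16],
                                        const uint8_t r2[16], const uint8_t r3[16]) {
      const uint8_t* r[4] = {r0, r1, r2, r3};
      uint64_t mask = 0;
      for (int c = 0; c < 4; c++)
        for (int b = 0; b < 16; b++)
          if (r[c][b]) mask |= uint64_t(1) << (16 * c + b);
      return mask;
    }

    int main() {
      uint8_t r0[16] = {0}, r1[16], r2[16] = {0}, r3[16] = {0};
      std::memset(r1, 0xFF, sizeof(r1));  // pretend every byte of chunk 1 matched
      // bits 16..31 set -> prints 00000000ffff0000
      std::printf("%016llx\n", (unsigned long long)movemask_bulk_model(r0, r1, r2, r3));
    }
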
@@ -28,45 +28,25 @@ really_inline void find_whitespace_and_structurals(
       (uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
   const uint8x16_t high_nibble_mask =
       (uint8x16_t){8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0};
-  const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
-  const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
   const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf);

-  uint8x16_t nib_0_lo = vandq_u8(in.i0, low_nib_and_mask);
-  uint8x16_t nib_0_hi = vshrq_n_u8(in.i0, 4);
-  uint8x16_t shuf_0_lo = vqtbl1q_u8(low_nibble_mask, nib_0_lo);
-  uint8x16_t shuf_0_hi = vqtbl1q_u8(high_nibble_mask, nib_0_hi);
-  uint8x16_t v_0 = vandq_u8(shuf_0_lo, shuf_0_hi);
+  simd_input<ARCHITECTURE> v = in.map([&](auto chunk) {
+    uint8x16_t nib_lo = vandq_u8(chunk, low_nib_and_mask);
+    uint8x16_t nib_hi = vshrq_n_u8(chunk, 4);
+    uint8x16_t shuf_lo = vqtbl1q_u8(low_nibble_mask, nib_lo);
+    uint8x16_t shuf_hi = vqtbl1q_u8(high_nibble_mask, nib_hi);
+    return vandq_u8(shuf_lo, shuf_hi);
+  });

-  uint8x16_t nib_1_lo = vandq_u8(in.i1, low_nib_and_mask);
-  uint8x16_t nib_1_hi = vshrq_n_u8(in.i1, 4);
-  uint8x16_t shuf_1_lo = vqtbl1q_u8(low_nibble_mask, nib_1_lo);
-  uint8x16_t shuf_1_hi = vqtbl1q_u8(high_nibble_mask, nib_1_hi);
-  uint8x16_t v_1 = vandq_u8(shuf_1_lo, shuf_1_hi);
+  const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
+  structurals = v.build_bitmask([&](auto chunk) {
+    return vtstq_u8(chunk, structural_shufti_mask);
+  });

-  uint8x16_t nib_2_lo = vandq_u8(in.i2, low_nib_and_mask);
-  uint8x16_t nib_2_hi = vshrq_n_u8(in.i2, 4);
-  uint8x16_t shuf_2_lo = vqtbl1q_u8(low_nibble_mask, nib_2_lo);
-  uint8x16_t shuf_2_hi = vqtbl1q_u8(high_nibble_mask, nib_2_hi);
-  uint8x16_t v_2 = vandq_u8(shuf_2_lo, shuf_2_hi);
-
-  uint8x16_t nib_3_lo = vandq_u8(in.i3, low_nib_and_mask);
-  uint8x16_t nib_3_hi = vshrq_n_u8(in.i3, 4);
-  uint8x16_t shuf_3_lo = vqtbl1q_u8(low_nibble_mask, nib_3_lo);
-  uint8x16_t shuf_3_hi = vqtbl1q_u8(high_nibble_mask, nib_3_hi);
-  uint8x16_t v_3 = vandq_u8(shuf_3_lo, shuf_3_hi);
-
-  uint8x16_t tmp_0 = vtstq_u8(v_0, structural_shufti_mask);
-  uint8x16_t tmp_1 = vtstq_u8(v_1, structural_shufti_mask);
-  uint8x16_t tmp_2 = vtstq_u8(v_2, structural_shufti_mask);
-  uint8x16_t tmp_3 = vtstq_u8(v_3, structural_shufti_mask);
-  structurals = neon_movemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
-
-  uint8x16_t tmp_ws_0 = vtstq_u8(v_0, whitespace_shufti_mask);
-  uint8x16_t tmp_ws_1 = vtstq_u8(v_1, whitespace_shufti_mask);
-  uint8x16_t tmp_ws_2 = vtstq_u8(v_2, whitespace_shufti_mask);
-  uint8x16_t tmp_ws_3 = vtstq_u8(v_3, whitespace_shufti_mask);
-  whitespace = neon_movemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
+  const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
+  whitespace = v.build_bitmask([&](auto chunk) {
+    return vtstq_u8(chunk, whitespace_shufti_mask);
+  });
 }

 #include "generic/stage1_find_marks_flatten.h"