Add AMD64 build_bitmask

John Keiser 2019-08-23 10:49:26 -07:00
parent cf4ae61ac6
commit 441963c84c
5 changed files with 92 additions and 93 deletions
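The diff below factors the repeated four-chunk NEON code in simd_input<Architecture::ARM64> into two helpers: build_bitmask(), which applies a caller-supplied per-chunk comparison and folds the four 16-byte results into one 64-bit bitmask via neon_movemask_bulk(), and map(), which applies a per-chunk transform and returns a new simd_input. eq(), lteq(), and find_whitespace_and_structurals() then reduce to short lambdas. As a rough, portable sketch of the same pattern (illustrative only, not part of this commit), the chunks below are plain 16-byte arrays rather than uint8x16_t registers and the fold is a simple shift-and-or rather than neon_movemask_bulk():

/* Illustrative sketch only -- not part of this commit. It mirrors the new
 * build_bitmask() helper with plain 16-byte arrays instead of uint8x16_t
 * NEON registers, so it compiles and runs anywhere. */
#include <cstdint>
#include <cstdio>
#include <cstring>

struct chunked_input {
  uint8_t chunk[4][16]; // 64 input bytes, split into four 16-byte chunks

  // Apply chunk_to_mask to each chunk (it returns a 16-bit mask, one bit per
  // byte) and concatenate the four results into a single 64-bit bitmask.
  template <typename F>
  uint64_t build_bitmask(F const &chunk_to_mask) const {
    uint64_t r0 = chunk_to_mask(chunk[0]);
    uint64_t r1 = chunk_to_mask(chunk[1]);
    uint64_t r2 = chunk_to_mask(chunk[2]);
    uint64_t r3 = chunk_to_mask(chunk[3]);
    return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
  }

  // eq() becomes a one-liner over build_bitmask, as in the ARM64 diff below.
  uint64_t eq(uint8_t m) const {
    return build_bitmask([&](const uint8_t *c) {
      uint64_t mask = 0;
      for (int i = 0; i < 16; i++) {
        if (c[i] == m) { mask |= uint64_t(1) << i; }
      }
      return mask;
    });
  }
};

int main() {
  chunked_input in;
  std::memset(in.chunk, 0, sizeof(in.chunk));
  in.chunk[0][0] = '{';  // sets bit 0 of the result
  in.chunk[3][15] = '{'; // sets bit 63 of the result
  std::printf("0x%016llx\n", (unsigned long long)in.eq('{')); // 0x8000000000000001
  return 0;
}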

View File

@@ -1,4 +1,4 @@
/* auto-generated on Fri Aug 23 10:23:28 DST 2019. Do not edit! */
/* auto-generated on Fri Aug 23 11:02:39 DST 2019. Do not edit! */
#include <iostream>
#include "simdjson.h"

View File

@@ -1,4 +1,4 @@
/* auto-generated on Fri Aug 23 10:23:28 DST 2019. Do not edit! */
/* auto-generated on Fri Aug 23 11:02:39 DST 2019. Do not edit! */
#include "simdjson.h"
/* used for http://dmalloc.com/ Dmalloc - Debug Malloc Library */
@@ -574,22 +574,38 @@ struct simd_input<Architecture::ARM64> {
this->i3 = vld1q_u8(ptr + 48);
}
template <typename F>
really_inline uint64_t build_bitmask(F const& chunk_to_mask) {
uint8x16_t r0 = chunk_to_mask(this->i0);
uint8x16_t r1 = chunk_to_mask(this->i1);
uint8x16_t r2 = chunk_to_mask(this->i2);
uint8x16_t r3 = chunk_to_mask(this->i3);
return neon_movemask_bulk(r0, r1, r2, r3);
}
template <typename F>
really_inline simd_input<Architecture::ARM64> map(F const& map_chunk) {
simd_input<Architecture::ARM64> result = {
map_chunk(this->i0),
map_chunk(this->i1),
map_chunk(this->i2),
map_chunk(this->i3)
};
return result;
}
really_inline uint64_t eq(uint8_t m) {
const uint8x16_t mask = vmovq_n_u8(m);
uint8x16_t cmp_res_0 = vceqq_u8(this->i0, mask);
uint8x16_t cmp_res_1 = vceqq_u8(this->i1, mask);
uint8x16_t cmp_res_2 = vceqq_u8(this->i2, mask);
uint8x16_t cmp_res_3 = vceqq_u8(this->i3, mask);
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
return this->build_bitmask([&](uint8x16_t chunk) {
return vceqq_u8(chunk, mask);
});
}
really_inline uint64_t lteq(uint8_t m) {
const uint8x16_t mask = vmovq_n_u8(m);
uint8x16_t cmp_res_0 = vcleq_u8(this->i0, mask);
uint8x16_t cmp_res_1 = vcleq_u8(this->i1, mask);
uint8x16_t cmp_res_2 = vcleq_u8(this->i2, mask);
uint8x16_t cmp_res_3 = vcleq_u8(this->i3, mask);
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
return this->build_bitmask([&](uint8x16_t chunk) {
return vcleq_u8(chunk, mask);
});
}
}; // struct simd_input
@@ -1467,45 +1483,25 @@ really_inline void find_whitespace_and_structurals(
const uint8x16_t low_nibble_mask =
(uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
const uint8x16_t high_nibble_mask =
(uint8x16_t){8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0};
const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf);
uint8x16_t nib_0_lo = vandq_u8(in.i0, low_nib_and_mask);
uint8x16_t nib_0_hi = vshrq_n_u8(in.i0, 4);
uint8x16_t shuf_0_lo = vqtbl1q_u8(low_nibble_mask, nib_0_lo);
uint8x16_t shuf_0_hi = vqtbl1q_u8(high_nibble_mask, nib_0_hi);
uint8x16_t v_0 = vandq_u8(shuf_0_lo, shuf_0_hi);
simd_input<ARCHITECTURE> v = in.map([&](auto chunk) {
uint8x16_t nib_lo = vandq_u8(chunk, low_nib_and_mask);
uint8x16_t nib_hi = vshrq_n_u8(chunk, 4);
uint8x16_t shuf_lo = vqtbl1q_u8(low_nibble_mask, nib_lo);
uint8x16_t shuf_hi = vqtbl1q_u8(high_nibble_mask, nib_hi);
return vandq_u8(shuf_lo, shuf_hi);
});
uint8x16_t nib_1_lo = vandq_u8(in.i1, low_nib_and_mask);
uint8x16_t nib_1_hi = vshrq_n_u8(in.i1, 4);
uint8x16_t shuf_1_lo = vqtbl1q_u8(low_nibble_mask, nib_1_lo);
uint8x16_t shuf_1_hi = vqtbl1q_u8(high_nibble_mask, nib_1_hi);
uint8x16_t v_1 = vandq_u8(shuf_1_lo, shuf_1_hi);
const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
structurals = v.build_bitmask([&](auto chunk) {
return vtstq_u8(chunk, structural_shufti_mask);
});
uint8x16_t nib_2_lo = vandq_u8(in.i2, low_nib_and_mask);
uint8x16_t nib_2_hi = vshrq_n_u8(in.i2, 4);
uint8x16_t shuf_2_lo = vqtbl1q_u8(low_nibble_mask, nib_2_lo);
uint8x16_t shuf_2_hi = vqtbl1q_u8(high_nibble_mask, nib_2_hi);
uint8x16_t v_2 = vandq_u8(shuf_2_lo, shuf_2_hi);
uint8x16_t nib_3_lo = vandq_u8(in.i3, low_nib_and_mask);
uint8x16_t nib_3_hi = vshrq_n_u8(in.i3, 4);
uint8x16_t shuf_3_lo = vqtbl1q_u8(low_nibble_mask, nib_3_lo);
uint8x16_t shuf_3_hi = vqtbl1q_u8(high_nibble_mask, nib_3_hi);
uint8x16_t v_3 = vandq_u8(shuf_3_lo, shuf_3_hi);
uint8x16_t tmp_0 = vtstq_u8(v_0, structural_shufti_mask);
uint8x16_t tmp_1 = vtstq_u8(v_1, structural_shufti_mask);
uint8x16_t tmp_2 = vtstq_u8(v_2, structural_shufti_mask);
uint8x16_t tmp_3 = vtstq_u8(v_3, structural_shufti_mask);
structurals = neon_movemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
uint8x16_t tmp_ws_0 = vtstq_u8(v_0, whitespace_shufti_mask);
uint8x16_t tmp_ws_1 = vtstq_u8(v_1, whitespace_shufti_mask);
uint8x16_t tmp_ws_2 = vtstq_u8(v_2, whitespace_shufti_mask);
uint8x16_t tmp_ws_3 = vtstq_u8(v_3, whitespace_shufti_mask);
whitespace = neon_movemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
whitespace = v.build_bitmask([&](auto chunk) {
return vtstq_u8(chunk, whitespace_shufti_mask);
});
}
// This file contains a non-architecture-specific version of "flatten" used in stage1.

View File

@@ -1,4 +1,4 @@
/* auto-generated on Fri Aug 23 10:23:28 DST 2019. Do not edit! */
/* auto-generated on Fri Aug 23 11:02:39 DST 2019. Do not edit! */
/* begin file include/simdjson/simdjson_version.h */
// /include/simdjson/simdjson_version.h automatically generated by release.py,
// do not change by hand

View File

@@ -46,22 +46,45 @@ struct simd_input<Architecture::ARM64> {
this->i3 = vld1q_u8(ptr + 48);
}
really_inline simd_input(uint8x16_t i0, uint8x16_t i1, uint8x16_t i2, uint8x16_t i3) {
this->i0 = i0;
this->i1 = i1;
this->i2 = i2;
this->i3 = i3;
}
template <typename F>
really_inline uint64_t build_bitmask(F const& chunk_to_mask) {
uint8x16_t r0 = chunk_to_mask(this->i0);
uint8x16_t r1 = chunk_to_mask(this->i1);
uint8x16_t r2 = chunk_to_mask(this->i2);
uint8x16_t r3 = chunk_to_mask(this->i3);
return neon_movemask_bulk(r0, r1, r2, r3);
}
template <typename F>
really_inline simd_input<Architecture::ARM64> map(F const& map_chunk) {
simd_input<Architecture::ARM64> result = {
map_chunk(this->i0),
map_chunk(this->i1),
map_chunk(this->i2),
map_chunk(this->i3)
};
return result;
}
really_inline uint64_t eq(uint8_t m) {
const uint8x16_t mask = vmovq_n_u8(m);
uint8x16_t cmp_res_0 = vceqq_u8(this->i0, mask);
uint8x16_t cmp_res_1 = vceqq_u8(this->i1, mask);
uint8x16_t cmp_res_2 = vceqq_u8(this->i2, mask);
uint8x16_t cmp_res_3 = vceqq_u8(this->i3, mask);
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
return this->build_bitmask([&](uint8x16_t chunk) {
return vceqq_u8(chunk, mask);
});
}
really_inline uint64_t lteq(uint8_t m) {
const uint8x16_t mask = vmovq_n_u8(m);
uint8x16_t cmp_res_0 = vcleq_u8(this->i0, mask);
uint8x16_t cmp_res_1 = vcleq_u8(this->i1, mask);
uint8x16_t cmp_res_2 = vcleq_u8(this->i2, mask);
uint8x16_t cmp_res_3 = vcleq_u8(this->i3, mask);
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
return this->build_bitmask([&](uint8x16_t chunk) {
return vcleq_u8(chunk, mask);
});
}
}; // struct simd_input

View File

@@ -28,45 +28,25 @@ really_inline void find_whitespace_and_structurals(
const uint8x16_t low_nibble_mask =
(uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
const uint8x16_t high_nibble_mask =
(uint8x16_t){8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0};
const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf);
uint8x16_t nib_0_lo = vandq_u8(in.i0, low_nib_and_mask);
uint8x16_t nib_0_hi = vshrq_n_u8(in.i0, 4);
uint8x16_t shuf_0_lo = vqtbl1q_u8(low_nibble_mask, nib_0_lo);
uint8x16_t shuf_0_hi = vqtbl1q_u8(high_nibble_mask, nib_0_hi);
uint8x16_t v_0 = vandq_u8(shuf_0_lo, shuf_0_hi);
simd_input<ARCHITECTURE> v = in.map([&](auto chunk) {
uint8x16_t nib_lo = vandq_u8(chunk, low_nib_and_mask);
uint8x16_t nib_hi = vshrq_n_u8(chunk, 4);
uint8x16_t shuf_lo = vqtbl1q_u8(low_nibble_mask, nib_lo);
uint8x16_t shuf_hi = vqtbl1q_u8(high_nibble_mask, nib_hi);
return vandq_u8(shuf_lo, shuf_hi);
});
uint8x16_t nib_1_lo = vandq_u8(in.i1, low_nib_and_mask);
uint8x16_t nib_1_hi = vshrq_n_u8(in.i1, 4);
uint8x16_t shuf_1_lo = vqtbl1q_u8(low_nibble_mask, nib_1_lo);
uint8x16_t shuf_1_hi = vqtbl1q_u8(high_nibble_mask, nib_1_hi);
uint8x16_t v_1 = vandq_u8(shuf_1_lo, shuf_1_hi);
const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
structurals = v.build_bitmask([&](auto chunk) {
return vtstq_u8(chunk, structural_shufti_mask);
});
uint8x16_t nib_2_lo = vandq_u8(in.i2, low_nib_and_mask);
uint8x16_t nib_2_hi = vshrq_n_u8(in.i2, 4);
uint8x16_t shuf_2_lo = vqtbl1q_u8(low_nibble_mask, nib_2_lo);
uint8x16_t shuf_2_hi = vqtbl1q_u8(high_nibble_mask, nib_2_hi);
uint8x16_t v_2 = vandq_u8(shuf_2_lo, shuf_2_hi);
uint8x16_t nib_3_lo = vandq_u8(in.i3, low_nib_and_mask);
uint8x16_t nib_3_hi = vshrq_n_u8(in.i3, 4);
uint8x16_t shuf_3_lo = vqtbl1q_u8(low_nibble_mask, nib_3_lo);
uint8x16_t shuf_3_hi = vqtbl1q_u8(high_nibble_mask, nib_3_hi);
uint8x16_t v_3 = vandq_u8(shuf_3_lo, shuf_3_hi);
uint8x16_t tmp_0 = vtstq_u8(v_0, structural_shufti_mask);
uint8x16_t tmp_1 = vtstq_u8(v_1, structural_shufti_mask);
uint8x16_t tmp_2 = vtstq_u8(v_2, structural_shufti_mask);
uint8x16_t tmp_3 = vtstq_u8(v_3, structural_shufti_mask);
structurals = neon_movemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
uint8x16_t tmp_ws_0 = vtstq_u8(v_0, whitespace_shufti_mask);
uint8x16_t tmp_ws_1 = vtstq_u8(v_1, whitespace_shufti_mask);
uint8x16_t tmp_ws_2 = vtstq_u8(v_2, whitespace_shufti_mask);
uint8x16_t tmp_ws_3 = vtstq_u8(v_3, whitespace_shufti_mask);
whitespace = neon_movemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
whitespace = v.build_bitmask([&](auto chunk) {
return vtstq_u8(chunk, whitespace_shufti_mask);
});
}
#include "generic/stage1_find_marks_flatten.h"