diff --git a/singleheader/amalgamation_demo.cpp b/singleheader/amalgamation_demo.cpp
index 4b132c70..04913179 100644
--- a/singleheader/amalgamation_demo.cpp
+++ b/singleheader/amalgamation_demo.cpp
@@ -1,4 +1,4 @@
-/* auto-generated on Fri Aug 23 10:23:28 DST 2019. Do not edit! */
+/* auto-generated on Fri Aug 23 11:02:39 DST 2019. Do not edit! */
 #include <iostream>
 #include "simdjson.h"
diff --git a/singleheader/simdjson.cpp b/singleheader/simdjson.cpp
index 43099bd8..0fb694fd 100644
--- a/singleheader/simdjson.cpp
+++ b/singleheader/simdjson.cpp
@@ -1,4 +1,4 @@
-/* auto-generated on Fri Aug 23 10:23:28 DST 2019. Do not edit! */
+/* auto-generated on Fri Aug 23 11:02:39 DST 2019. Do not edit! */
 #include "simdjson.h"
 /* used for http://dmalloc.com/ Dmalloc - Debug Malloc Library */
@@ -574,22 +574,38 @@ struct simd_input {
     this->i3 = vld1q_u8(ptr + 48);
   }
 
+  template <typename F>
+  really_inline uint64_t build_bitmask(F const& chunk_to_mask) {
+    uint8x16_t r0 = chunk_to_mask(this->i0);
+    uint8x16_t r1 = chunk_to_mask(this->i1);
+    uint8x16_t r2 = chunk_to_mask(this->i2);
+    uint8x16_t r3 = chunk_to_mask(this->i3);
+    return neon_movemask_bulk(r0, r1, r2, r3);
+  }
+
+  template <typename F>
+  really_inline simd_input map(F const& map_chunk) {
+    simd_input result = {
+      map_chunk(this->i0),
+      map_chunk(this->i1),
+      map_chunk(this->i2),
+      map_chunk(this->i3)
+    };
+    return result;
+  }
+
   really_inline uint64_t eq(uint8_t m) {
     const uint8x16_t mask = vmovq_n_u8(m);
-    uint8x16_t cmp_res_0 = vceqq_u8(this->i0, mask);
-    uint8x16_t cmp_res_1 = vceqq_u8(this->i1, mask);
-    uint8x16_t cmp_res_2 = vceqq_u8(this->i2, mask);
-    uint8x16_t cmp_res_3 = vceqq_u8(this->i3, mask);
-    return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
+    return this->build_bitmask([&](uint8x16_t chunk) {
+      return vceqq_u8(chunk, mask);
+    });
   }
 
   really_inline uint64_t lteq(uint8_t m) {
     const uint8x16_t mask = vmovq_n_u8(m);
-    uint8x16_t cmp_res_0 = vcleq_u8(this->i0, mask);
-    uint8x16_t cmp_res_1 = vcleq_u8(this->i1, mask);
-    uint8x16_t cmp_res_2 = vcleq_u8(this->i2, mask);
-    uint8x16_t cmp_res_3 = vcleq_u8(this->i3, mask);
-    return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
+    return this->build_bitmask([&](uint8x16_t chunk) {
+      return vcleq_u8(chunk, mask);
+    });
   }
 }; // struct simd_input
 
@@ -1467,45 +1483,25 @@ really_inline void find_whitespace_and_structurals(
       (uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
   const uint8x16_t high_nibble_mask =
       (uint8x16_t){8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0};
-  const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
-  const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
   const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf);
 
-  uint8x16_t nib_0_lo = vandq_u8(in.i0, low_nib_and_mask);
-  uint8x16_t nib_0_hi = vshrq_n_u8(in.i0, 4);
-  uint8x16_t shuf_0_lo = vqtbl1q_u8(low_nibble_mask, nib_0_lo);
-  uint8x16_t shuf_0_hi = vqtbl1q_u8(high_nibble_mask, nib_0_hi);
-  uint8x16_t v_0 = vandq_u8(shuf_0_lo, shuf_0_hi);
+  simd_input<Architecture::ARM64> v = in.map([&](auto chunk) {
+    uint8x16_t nib_lo = vandq_u8(chunk, low_nib_and_mask);
+    uint8x16_t nib_hi = vshrq_n_u8(chunk, 4);
+    uint8x16_t shuf_lo = vqtbl1q_u8(low_nibble_mask, nib_lo);
+    uint8x16_t shuf_hi = vqtbl1q_u8(high_nibble_mask, nib_hi);
+    return vandq_u8(shuf_lo, shuf_hi);
+  });
 
-  uint8x16_t nib_1_lo = vandq_u8(in.i1, low_nib_and_mask);
-  uint8x16_t nib_1_hi = vshrq_n_u8(in.i1, 4);
-  uint8x16_t shuf_1_lo = vqtbl1q_u8(low_nibble_mask, nib_1_lo);
-  uint8x16_t shuf_1_hi = vqtbl1q_u8(high_nibble_mask, nib_1_hi);
-  uint8x16_t v_1 = vandq_u8(shuf_1_lo, shuf_1_hi);
+  const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
+  structurals = v.build_bitmask([&](auto chunk) {
+    return vtstq_u8(chunk, structural_shufti_mask);
+  });
 
-  uint8x16_t nib_2_lo = vandq_u8(in.i2, low_nib_and_mask);
-  uint8x16_t nib_2_hi = vshrq_n_u8(in.i2, 4);
-  uint8x16_t shuf_2_lo = vqtbl1q_u8(low_nibble_mask, nib_2_lo);
-  uint8x16_t shuf_2_hi = vqtbl1q_u8(high_nibble_mask, nib_2_hi);
-  uint8x16_t v_2 = vandq_u8(shuf_2_lo, shuf_2_hi);
-
-  uint8x16_t nib_3_lo = vandq_u8(in.i3, low_nib_and_mask);
-  uint8x16_t nib_3_hi = vshrq_n_u8(in.i3, 4);
-  uint8x16_t shuf_3_lo = vqtbl1q_u8(low_nibble_mask, nib_3_lo);
-  uint8x16_t shuf_3_hi = vqtbl1q_u8(high_nibble_mask, nib_3_hi);
-  uint8x16_t v_3 = vandq_u8(shuf_3_lo, shuf_3_hi);
-
-  uint8x16_t tmp_0 = vtstq_u8(v_0, structural_shufti_mask);
-  uint8x16_t tmp_1 = vtstq_u8(v_1, structural_shufti_mask);
-  uint8x16_t tmp_2 = vtstq_u8(v_2, structural_shufti_mask);
-  uint8x16_t tmp_3 = vtstq_u8(v_3, structural_shufti_mask);
-  structurals = neon_movemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
-
-  uint8x16_t tmp_ws_0 = vtstq_u8(v_0, whitespace_shufti_mask);
-  uint8x16_t tmp_ws_1 = vtstq_u8(v_1, whitespace_shufti_mask);
-  uint8x16_t tmp_ws_2 = vtstq_u8(v_2, whitespace_shufti_mask);
-  uint8x16_t tmp_ws_3 = vtstq_u8(v_3, whitespace_shufti_mask);
-  whitespace = neon_movemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
+  const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
+  whitespace = v.build_bitmask([&](auto chunk) {
+    return vtstq_u8(chunk, whitespace_shufti_mask);
+  });
 }
 
 // This file contains a non-architecture-specific version of "flatten" used in stage1.
diff --git a/singleheader/simdjson.h b/singleheader/simdjson.h
index 95ee157d..2340da6c 100644
--- a/singleheader/simdjson.h
+++ b/singleheader/simdjson.h
@@ -1,4 +1,4 @@
-/* auto-generated on Fri Aug 23 10:23:28 DST 2019. Do not edit! */
+/* auto-generated on Fri Aug 23 11:02:39 DST 2019. Do not edit! */
 /* begin file include/simdjson/simdjson_version.h */
 // /include/simdjson/simdjson_version.h automatically generated by release.py,
 // do not change by hand
diff --git a/src/arm64/simd_input.h b/src/arm64/simd_input.h
index de973ad7..59e6d593 100644
--- a/src/arm64/simd_input.h
+++ b/src/arm64/simd_input.h
@@ -46,22 +46,45 @@ struct simd_input {
     this->i3 = vld1q_u8(ptr + 48);
   }
 
+  really_inline simd_input(uint8x16_t i0, uint8x16_t i1, uint8x16_t i2, uint8x16_t i3) {
+    this->i0 = i0;
+    this->i1 = i1;
+    this->i2 = i2;
+    this->i3 = i3;
+  }
+
+  template <typename F>
+  really_inline uint64_t build_bitmask(F const& chunk_to_mask) {
+    uint8x16_t r0 = chunk_to_mask(this->i0);
+    uint8x16_t r1 = chunk_to_mask(this->i1);
+    uint8x16_t r2 = chunk_to_mask(this->i2);
+    uint8x16_t r3 = chunk_to_mask(this->i3);
+    return neon_movemask_bulk(r0, r1, r2, r3);
+  }
+
+  template <typename F>
+  really_inline simd_input map(F const& map_chunk) {
+    simd_input result = {
+      map_chunk(this->i0),
+      map_chunk(this->i1),
+      map_chunk(this->i2),
+      map_chunk(this->i3)
+    };
+    return result;
+  }
+
   really_inline uint64_t eq(uint8_t m) {
     const uint8x16_t mask = vmovq_n_u8(m);
-    uint8x16_t cmp_res_0 = vceqq_u8(this->i0, mask);
-    uint8x16_t cmp_res_1 = vceqq_u8(this->i1, mask);
-    uint8x16_t cmp_res_2 = vceqq_u8(this->i2, mask);
-    uint8x16_t cmp_res_3 = vceqq_u8(this->i3, mask);
-    return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
+    return this->build_bitmask([&](uint8x16_t chunk) {
+      return vceqq_u8(chunk, mask);
+    });
   }
 
   really_inline uint64_t lteq(uint8_t m) {
     const uint8x16_t mask = vmovq_n_u8(m);
-    uint8x16_t cmp_res_0 = vcleq_u8(this->i0, mask);
-    uint8x16_t cmp_res_1 = vcleq_u8(this->i1, mask);
-    uint8x16_t cmp_res_2 = vcleq_u8(this->i2, mask);
-    uint8x16_t cmp_res_3 = vcleq_u8(this->i3, mask);
-    return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
+    return this->build_bitmask([&](uint8x16_t chunk) {
+      return vcleq_u8(chunk, mask);
+    });
   }
 }; // struct simd_input
 
diff --git a/src/arm64/stage1_find_marks.h b/src/arm64/stage1_find_marks.h
index 3fd0d429..4f93e2d8 100644
--- a/src/arm64/stage1_find_marks.h
+++ b/src/arm64/stage1_find_marks.h
@@ -28,45 +28,25 @@ really_inline void find_whitespace_and_structurals(
       (uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
   const uint8x16_t high_nibble_mask =
       (uint8x16_t){8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0};
-  const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
-  const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
   const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf);
 
-  uint8x16_t nib_0_lo = vandq_u8(in.i0, low_nib_and_mask);
-  uint8x16_t nib_0_hi = vshrq_n_u8(in.i0, 4);
-  uint8x16_t shuf_0_lo = vqtbl1q_u8(low_nibble_mask, nib_0_lo);
-  uint8x16_t shuf_0_hi = vqtbl1q_u8(high_nibble_mask, nib_0_hi);
-  uint8x16_t v_0 = vandq_u8(shuf_0_lo, shuf_0_hi);
+  simd_input<Architecture::ARM64> v = in.map([&](auto chunk) {
+    uint8x16_t nib_lo = vandq_u8(chunk, low_nib_and_mask);
+    uint8x16_t nib_hi = vshrq_n_u8(chunk, 4);
+    uint8x16_t shuf_lo = vqtbl1q_u8(low_nibble_mask, nib_lo);
+    uint8x16_t shuf_hi = vqtbl1q_u8(high_nibble_mask, nib_hi);
+    return vandq_u8(shuf_lo, shuf_hi);
+  });
 
-  uint8x16_t nib_1_lo = vandq_u8(in.i1, low_nib_and_mask);
-  uint8x16_t nib_1_hi = vshrq_n_u8(in.i1, 4);
-  uint8x16_t shuf_1_lo = vqtbl1q_u8(low_nibble_mask, nib_1_lo);
-  uint8x16_t shuf_1_hi = vqtbl1q_u8(high_nibble_mask, nib_1_hi);
-  uint8x16_t v_1 = vandq_u8(shuf_1_lo, shuf_1_hi);
+  const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
+  structurals = v.build_bitmask([&](auto chunk) {
+    return vtstq_u8(chunk, structural_shufti_mask);
+  });
 
-  uint8x16_t nib_2_lo = vandq_u8(in.i2, low_nib_and_mask);
-  uint8x16_t nib_2_hi = vshrq_n_u8(in.i2, 4);
-  uint8x16_t shuf_2_lo = vqtbl1q_u8(low_nibble_mask, nib_2_lo);
-  uint8x16_t shuf_2_hi = vqtbl1q_u8(high_nibble_mask, nib_2_hi);
-  uint8x16_t v_2 = vandq_u8(shuf_2_lo, shuf_2_hi);
-
-  uint8x16_t nib_3_lo = vandq_u8(in.i3, low_nib_and_mask);
-  uint8x16_t nib_3_hi = vshrq_n_u8(in.i3, 4);
-  uint8x16_t shuf_3_lo = vqtbl1q_u8(low_nibble_mask, nib_3_lo);
-  uint8x16_t shuf_3_hi = vqtbl1q_u8(high_nibble_mask, nib_3_hi);
-  uint8x16_t v_3 = vandq_u8(shuf_3_lo, shuf_3_hi);
-
-  uint8x16_t tmp_0 = vtstq_u8(v_0, structural_shufti_mask);
-  uint8x16_t tmp_1 = vtstq_u8(v_1, structural_shufti_mask);
-  uint8x16_t tmp_2 = vtstq_u8(v_2, structural_shufti_mask);
-  uint8x16_t tmp_3 = vtstq_u8(v_3, structural_shufti_mask);
-  structurals = neon_movemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
-
-  uint8x16_t tmp_ws_0 = vtstq_u8(v_0, whitespace_shufti_mask);
-  uint8x16_t tmp_ws_1 = vtstq_u8(v_1, whitespace_shufti_mask);
-  uint8x16_t tmp_ws_2 = vtstq_u8(v_2, whitespace_shufti_mask);
-  uint8x16_t tmp_ws_3 = vtstq_u8(v_3, whitespace_shufti_mask);
-  whitespace = neon_movemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
+  const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
+  whitespace = v.build_bitmask([&](auto chunk) {
+    return vtstq_u8(chunk, whitespace_shufti_mask);
+  });
 }
 
 #include "generic/stage1_find_marks_flatten.h"
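
Side note for readers of the patch: the change is mechanical. Every four-times-unrolled NEON sequence becomes a single lambda handed to build_bitmask (four chunks in, one 64-bit mask out) or map (four chunks in, four transformed chunks out). The sketch below reproduces that shape on plain bytes instead of uint8x16_t registers so it compiles anywhere; simd_input_sketch, the one-bit-per-chunk packing, and the main driver are illustrative stand-ins for this note, not simdjson code (the real build_bitmask packs 4 x 16 lane masks through neon_movemask_bulk).

// sketch.cpp -- illustrative only, not part of the patch above.
#include <cstdint>
#include <cstdio>

// Stand-in for simd_input<Architecture::ARM64>: four "chunks", modeled as one
// byte each instead of four 16-byte uint8x16_t NEON registers.
struct simd_input_sketch {
  uint8_t i0, i1, i2, i3;

  // Run chunk_to_mask over each chunk and pack the results into one bitmask.
  // The real build_bitmask combines 4 x 16 lane masks via neon_movemask_bulk;
  // this sketch packs one bit per chunk.
  template <typename F>
  uint64_t build_bitmask(F const &chunk_to_mask) const {
    return (uint64_t)(chunk_to_mask(i0) ? 1 : 0) << 0 |
           (uint64_t)(chunk_to_mask(i1) ? 1 : 0) << 1 |
           (uint64_t)(chunk_to_mask(i2) ? 1 : 0) << 2 |
           (uint64_t)(chunk_to_mask(i3) ? 1 : 0) << 3;
  }

  // Run map_chunk over each chunk, producing a new simd_input_sketch.
  template <typename F>
  simd_input_sketch map(F const &map_chunk) const {
    return {map_chunk(i0), map_chunk(i1), map_chunk(i2), map_chunk(i3)};
  }

  // With the two helpers, eq/lteq collapse from four unrolled intrinsic
  // statements each to a single lambda, mirroring the patch.
  uint64_t eq(uint8_t m) const {
    return build_bitmask([&](uint8_t chunk) { return chunk == m; });
  }
  uint64_t lteq(uint8_t m) const {
    return build_bitmask([&](uint8_t chunk) { return chunk <= m; });
  }
};

int main() {
  simd_input_sketch in{'{', ' ', 'a', '}'};
  // Chunks 0 and 3 hold braces, so bits 0 and 3 are set: prints 9.
  std::printf("eq: %llx\n", (unsigned long long)(in.eq('{') | in.eq('}')));

  // map once, then build_bitmask twice: the same dataflow as the rewritten
  // find_whitespace_and_structurals (classify each chunk, test the result).
  simd_input_sketch v = in.map([](uint8_t c) -> uint8_t {
    return (c == '{' || c == '}') ? 0x7 : (c == ' ' ? 0x18 : 0);
  });
  uint64_t structurals =
      v.build_bitmask([](uint8_t chunk) { return (chunk & 0x7) != 0; });
  uint64_t whitespace =
      v.build_bitmask([](uint8_t chunk) { return (chunk & 0x18) != 0; });
  std::printf("structurals: %llx whitespace: %llx\n",
              (unsigned long long)structurals, (unsigned long long)whitespace);
  return 0;
}

Because build_bitmask and map are templates taking the lambda by const reference, each call site instantiates and inlines its own copy; combined with really_inline in the real code, the refactor should compile down to the same instruction sequence as the hand-unrolled original, which is presumably why the patch can shrink the source without touching the hot-loop structure.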