Optimizing stage1 (#216)

* Optimizing stage 1: AVX edition.

* Optimizing SSE.

* Saving 0.5% in instruction count (NEON).
Daniel Lemire, 2019-07-11 20:59:21 -04:00 (committed by GitHub)
parent 37fa6affc8
commit 6c168f046d
1 changed file with 69 additions and 96 deletions


@@ -305,10 +305,10 @@ void check_utf8<instruction_set::neon>(simd_input<instruction_set::neon> in,
 if (check_ascii_neon(in)) {
   // All bytes are ascii. Therefore the byte that was just before must be ascii too.
   // We only check the byte that was just before simd_input. Nines are arbitrary values.
-  int8_t _verror[] = {9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1};
+  const int8x16_t verror = (int8x16_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1};
   state.has_error =
       vorrq_s8(vreinterpretq_s8_u8(vcgtq_s8(state.previous.carried_continuations,
-                                            vld1q_s8(_verror))),
+                                            verror)),
                state.has_error);
 } else {
   // it is not ascii so we have to do heavy work
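The NEON hunk above replaces a `vld1q_s8` load from a temporary stack array with a vector literal, so the constant can be materialized by the compiler (in a register or the constant pool) rather than rebuilt and reloaded on every call. A minimal before/after sketch, assuming GCC or Clang since the `(int8x16_t){...}` literal used by the patch is a GNU extension (the function names here are illustrative):

```cpp
#include <arm_neon.h>
#include <cstdint>

// Old approach: materialize the constant in a stack array, then load it.
int8x16_t verror_via_load() {
  int8_t tmp[] = {9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1};
  return vld1q_s8(tmp); // explicit 16-byte load from the stack
}

// New approach: a vector literal the compiler can fold into a constant.
int8x16_t verror_via_literal() {
  return (int8x16_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1};
}
```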
@@ -616,45 +616,35 @@ void find_whitespace_and_structurals<instruction_set::avx2>(simd_input<instruction_set::avx2> in,
 // end of naive approach
 #else  // SIMDJSON_NAIVE_STRUCTURAL
-  const __m256i low_nibble_mask = _mm256_setr_epi8(
-      16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0,
-      16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
-  const __m256i high_nibble_mask = _mm256_setr_epi8(
-      8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0,
-      8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);
-  __m256i structural_shufti_mask = _mm256_set1_epi8(0x7);
-  __m256i whitespace_shufti_mask = _mm256_set1_epi8(0x18);
-  __m256i v_lo = _mm256_and_si256(
-      _mm256_shuffle_epi8(low_nibble_mask, in.lo),
-      _mm256_shuffle_epi8(high_nibble_mask,
-                          _mm256_and_si256(_mm256_srli_epi32(in.lo, 4),
-                                           _mm256_set1_epi8(0x7f))));
-  __m256i v_hi = _mm256_and_si256(
-      _mm256_shuffle_epi8(low_nibble_mask, in.hi),
-      _mm256_shuffle_epi8(high_nibble_mask,
-                          _mm256_and_si256(_mm256_srli_epi32(in.hi, 4),
-                                           _mm256_set1_epi8(0x7f))));
-  __m256i tmp_lo = _mm256_cmpeq_epi8(
-      _mm256_and_si256(v_lo, structural_shufti_mask), _mm256_set1_epi8(0));
-  __m256i tmp_hi = _mm256_cmpeq_epi8(
-      _mm256_and_si256(v_hi, structural_shufti_mask), _mm256_set1_epi8(0));
+  const __m256i structural_table = _mm256_setr_epi8(
+      44, 125, 0, 0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123,
+      44, 125, 0, 0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
+  const __m256i white_table = _mm256_setr_epi8(
+      32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100,
+      32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100);
+  const __m256i struct_offset = _mm256_set1_epi8(0xd4);
+  const __m256i struct_mask = _mm256_set1_epi8(32);
+  __m256i lo_white = _mm256_cmpeq_epi8(in.lo,
+                                       _mm256_shuffle_epi8(white_table, in.lo));
+  __m256i hi_white = _mm256_cmpeq_epi8(in.hi,
+                                       _mm256_shuffle_epi8(white_table, in.hi));
+  uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(lo_white));
+  uint64_t ws_res_1 = _mm256_movemask_epi8(hi_white);
+  whitespace = (ws_res_0 | (ws_res_1 << 32));
+  __m256i lo_struct_r1 = _mm256_add_epi8(struct_offset, in.lo);
+  __m256i hi_struct_r1 = _mm256_add_epi8(struct_offset, in.hi);
+  __m256i lo_struct_r2 = _mm256_or_si256(in.lo, struct_mask);
+  __m256i hi_struct_r2 = _mm256_or_si256(in.hi, struct_mask);
+  __m256i lo_struct_r3 = _mm256_shuffle_epi8(structural_table, lo_struct_r1);
+  __m256i hi_struct_r3 = _mm256_shuffle_epi8(structural_table, hi_struct_r1);
+  __m256i lo_struct = _mm256_cmpeq_epi8(lo_struct_r2, lo_struct_r3);
+  __m256i hi_struct = _mm256_cmpeq_epi8(hi_struct_r2, hi_struct_r3);
   uint64_t structural_res_0 =
-      static_cast<uint32_t>(_mm256_movemask_epi8(tmp_lo));
-  uint64_t structural_res_1 = _mm256_movemask_epi8(tmp_hi);
-  structurals = ~(structural_res_0 | (structural_res_1 << 32));
-  __m256i tmp_ws_lo = _mm256_cmpeq_epi8(
-      _mm256_and_si256(v_lo, whitespace_shufti_mask), _mm256_set1_epi8(0));
-  __m256i tmp_ws_hi = _mm256_cmpeq_epi8(
-      _mm256_and_si256(v_hi, whitespace_shufti_mask), _mm256_set1_epi8(0));
-  uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(tmp_ws_lo));
-  uint64_t ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
-  whitespace = ~(ws_res_0 | (ws_res_1 << 32));
+      static_cast<uint32_t>(_mm256_movemask_epi8(lo_struct));
+  uint64_t structural_res_1 = _mm256_movemask_epi8(hi_struct);
+  structurals = (structural_res_0 | (structural_res_1 << 32));
 #endif // SIMDJSON_NAIVE_STRUCTURAL
 }
 #endif // __AVX2__
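The rewritten AVX2 path drops the two nibble-indexed "shufti" tables in favor of a direct lookup per category: whitespace becomes one `vpshufb` plus one byte compare, and the six structural characters become an add, an OR, one `vpshufb`, and one compare. The final `~` disappears because the compare now matches the wanted characters directly, where the old code flagged the complement and negated the bitmask at the end. A scalar model of how one byte is classified, written as a sketch for this page (the helper names are mine, not from simdjson):

```cpp
#include <cstdint>

// Scalar model (one byte at a time) of the vectorized lookups above.

// Whitespace: a byte c matches iff white_table[c & 0x0f] == c. The filler
// values (100, 17, 113, 2, 112) have low nibbles that disagree with their
// table index, so no byte selecting them can equal them; only ' ', '\t',
// '\n', '\r' survive.
bool is_json_whitespace(uint8_t c) {
  static const uint8_t white_table[16] = {
      32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100};
  if (c & 0x80) return false; // pshufb zeroes lanes whose index has the top bit set
  return white_table[c & 0x0f] == c;
}

// Structural: add 0xd4 (i.e. -44), index the table with the low nibble of
// the sum (pshufb semantics), and compare against c | 0x20. The OR folds
// '[' (91) onto '{' (123) and ']' (93) onto '}' (125), so one 16-entry
// table covers all six characters: , : [ ] { }
bool is_json_structural(uint8_t c) {
  static const uint8_t structural_table[16] = {
      44, 125, 0, 0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123};
  uint8_t r1 = static_cast<uint8_t>(c + 0xd4);            // wraps mod 256, like vpaddb
  uint8_t r3 = (r1 & 0x80) ? 0 : structural_table[r1 & 0x0f];
  return r3 == (c | 0x20);
}
```

The sentinel entries 0 and 0xc0 can never produce a false match: `c | 0x20` always has bit 5 set, while 0 and 0xc0 do not.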
@@ -662,71 +652,54 @@ void find_whitespace_and_structurals<instruction_set::avx2>(simd_input<instruction_set::avx2> in,
 #if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(_M_AMD64))
 template<> really_inline
 void find_whitespace_and_structurals<instruction_set::sse4_2>(simd_input<instruction_set::sse4_2> in,
-                                                              uint64_t &whitespace,
-                                                              uint64_t &structurals) {
-  const __m128i low_nibble_mask = _mm_setr_epi8(
-      16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
-  const __m128i high_nibble_mask = _mm_setr_epi8(
-      8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);
+                                                              uint64_t &whitespace, uint64_t &structurals) {
+  const __m128i structural_table = _mm_setr_epi8(44, 125, 0, 0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
+  const __m128i white_table = _mm_setr_epi8(
+      32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100);
+  const __m128i struct_offset = _mm_set1_epi8(0xd4);
+  const __m128i struct_mask = _mm_set1_epi8(32);
-  __m128i structural_shufti_mask = _mm_set1_epi8(0x7);
-  __m128i whitespace_shufti_mask = _mm_set1_epi8(0x18);
+  __m128i white0 = _mm_cmpeq_epi8(in.v0,
+                                  _mm_shuffle_epi8(white_table, in.v0));
+  __m128i white1 = _mm_cmpeq_epi8(in.v1,
+                                  _mm_shuffle_epi8(white_table, in.v1));
+  __m128i white2 = _mm_cmpeq_epi8(in.v2,
+                                  _mm_shuffle_epi8(white_table, in.v2));
+  __m128i white3 = _mm_cmpeq_epi8(in.v3,
+                                  _mm_shuffle_epi8(white_table, in.v3));
+  uint64_t ws_res_0 = _mm_movemask_epi8(white0);
+  uint64_t ws_res_1 = _mm_movemask_epi8(white1);
+  uint64_t ws_res_2 = _mm_movemask_epi8(white2);
+  uint64_t ws_res_3 = _mm_movemask_epi8(white3);
-  __m128i v_0 = _mm_and_si128(
-      _mm_shuffle_epi8(low_nibble_mask, in.v0),
-      _mm_shuffle_epi8(high_nibble_mask,
-                       _mm_and_si128(_mm_srli_epi32(in.v0, 4),
-                                     _mm_set1_epi8(0x7f))));
+  whitespace = (ws_res_0 | (ws_res_1 << 16) | (ws_res_2 << 32) | (ws_res_3 << 48));
-  __m128i v_1 = _mm_and_si128(
-      _mm_shuffle_epi8(low_nibble_mask, in.v1),
-      _mm_shuffle_epi8(high_nibble_mask,
-                       _mm_and_si128(_mm_srli_epi32(in.v1, 4),
-                                     _mm_set1_epi8(0x7f))));
+  __m128i struct1_r1 = _mm_add_epi8(struct_offset, in.v0);
+  __m128i struct2_r1 = _mm_add_epi8(struct_offset, in.v1);
+  __m128i struct3_r1 = _mm_add_epi8(struct_offset, in.v2);
+  __m128i struct4_r1 = _mm_add_epi8(struct_offset, in.v3);
-  __m128i v_2 = _mm_and_si128(
-      _mm_shuffle_epi8(low_nibble_mask, in.v2),
-      _mm_shuffle_epi8(high_nibble_mask,
-                       _mm_and_si128(_mm_srli_epi32(in.v2, 4),
-                                     _mm_set1_epi8(0x7f))));
+  __m128i struct1_r2 = _mm_or_si128(in.v0, struct_mask);
+  __m128i struct2_r2 = _mm_or_si128(in.v1, struct_mask);
+  __m128i struct3_r2 = _mm_or_si128(in.v2, struct_mask);
+  __m128i struct4_r2 = _mm_or_si128(in.v3, struct_mask);
-  __m128i v_3 = _mm_and_si128(
-      _mm_shuffle_epi8(low_nibble_mask, in.v3),
-      _mm_shuffle_epi8(high_nibble_mask,
-                       _mm_and_si128(_mm_srli_epi32(in.v3, 4),
-                                     _mm_set1_epi8(0x7f))));
+  __m128i struct1_r3 = _mm_shuffle_epi8(structural_table, struct1_r1);
+  __m128i struct2_r3 = _mm_shuffle_epi8(structural_table, struct2_r1);
+  __m128i struct3_r3 = _mm_shuffle_epi8(structural_table, struct3_r1);
+  __m128i struct4_r3 = _mm_shuffle_epi8(structural_table, struct4_r1);
-  __m128i tmp_v0 = _mm_cmpeq_epi8(
-      _mm_and_si128(v_0, structural_shufti_mask), _mm_set1_epi8(0));
-  __m128i tmp_v1 = _mm_cmpeq_epi8(
-      _mm_and_si128(v_1, structural_shufti_mask), _mm_set1_epi8(0));
-  __m128i tmp_v2 = _mm_cmpeq_epi8(
-      _mm_and_si128(v_2, structural_shufti_mask), _mm_set1_epi8(0));
-  __m128i tmp_v3 = _mm_cmpeq_epi8(
-      _mm_and_si128(v_3, structural_shufti_mask), _mm_set1_epi8(0));
+  __m128i struct1 = _mm_cmpeq_epi8(struct1_r2, struct1_r3);
+  __m128i struct2 = _mm_cmpeq_epi8(struct2_r2, struct2_r3);
+  __m128i struct3 = _mm_cmpeq_epi8(struct3_r2, struct3_r3);
+  __m128i struct4 = _mm_cmpeq_epi8(struct4_r2, struct4_r3);
-  uint64_t structural_res_0 = _mm_movemask_epi8(tmp_v0);
-  uint64_t structural_res_1 = _mm_movemask_epi8(tmp_v1);
-  uint64_t structural_res_2 = _mm_movemask_epi8(tmp_v2);
-  uint64_t structural_res_3 = _mm_movemask_epi8(tmp_v3);
+  uint64_t structural_res_0 = _mm_movemask_epi8(struct1);
+  uint64_t structural_res_1 = _mm_movemask_epi8(struct2);
+  uint64_t structural_res_2 = _mm_movemask_epi8(struct3);
+  uint64_t structural_res_3 = _mm_movemask_epi8(struct4);
-  structurals = ~(structural_res_0 | (structural_res_1 << 16) | (structural_res_2 << 32) | (structural_res_3 << 48));
-  __m128i tmp_ws_v0 = _mm_cmpeq_epi8(
-      _mm_and_si128(v_0, whitespace_shufti_mask), _mm_set1_epi8(0));
-  __m128i tmp_ws_v1 = _mm_cmpeq_epi8(
-      _mm_and_si128(v_1, whitespace_shufti_mask), _mm_set1_epi8(0));
-  __m128i tmp_ws_v2 = _mm_cmpeq_epi8(
-      _mm_and_si128(v_2, whitespace_shufti_mask), _mm_set1_epi8(0));
-  __m128i tmp_ws_v3 = _mm_cmpeq_epi8(
-      _mm_and_si128(v_3, whitespace_shufti_mask), _mm_set1_epi8(0));
-  uint64_t ws_res_0 = _mm_movemask_epi8(tmp_ws_v0);
-  uint64_t ws_res_1 = _mm_movemask_epi8(tmp_ws_v1);
-  uint64_t ws_res_2 = _mm_movemask_epi8(tmp_ws_v2);
-  uint64_t ws_res_3 = _mm_movemask_epi8(tmp_ws_v3);
-  whitespace = ~(ws_res_0 | (ws_res_1 << 16) | (ws_res_2 << 32) | (ws_res_3 << 48));
+  structurals = (structural_res_0 | (structural_res_1 << 16) | (structural_res_2 << 32) | (structural_res_3 << 48));
 }
 #endif // __SSE4_2__
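The SSE path is the same table trick applied to four 128-bit registers, with the four 16-bit movemasks packed into one 64-bit word. As a sanity check, the lookup can be verified exhaustively against a plain character comparison over all 256 byte values; the self-test below is a sketch of mine, not part of the patch (SSSE3 suffices for `_mm_shuffle_epi8`; compile with e.g. `-mssse3`):

```cpp
#include <cstdint>
#include <cstdio>
#include <tmmintrin.h> // SSSE3: _mm_shuffle_epi8

int main() {
  const __m128i structural_table = _mm_setr_epi8(
      44, 125, 0, 0, (char)0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
  const __m128i white_table = _mm_setr_epi8(
      32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100);
  const __m128i struct_offset = _mm_set1_epi8((char)0xd4);
  const __m128i struct_mask = _mm_set1_epi8(32);
  for (int c = 0; c < 256; c++) {
    __m128i v = _mm_set1_epi8((char)c);
    // whitespace: shuffle by the byte itself, compare for equality
    bool white = _mm_movemask_epi8(
                     _mm_cmpeq_epi8(v, _mm_shuffle_epi8(white_table, v))) & 1;
    // structural: offset, OR-fold, table lookup, compare
    __m128i r1 = _mm_add_epi8(struct_offset, v);
    __m128i r2 = _mm_or_si128(v, struct_mask);
    __m128i r3 = _mm_shuffle_epi8(structural_table, r1);
    bool structural = _mm_movemask_epi8(_mm_cmpeq_epi8(r2, r3)) & 1;
    bool want_white = (c == ' ' || c == '\t' || c == '\n' || c == '\r');
    bool want_structural = (c == ',' || c == ':' || c == '[' || c == ']' ||
                            c == '{' || c == '}');
    if (white != want_white || structural != want_structural) {
      printf("mismatch for byte %d\n", c);
      return 1;
    }
  }
  printf("all 256 byte values classified correctly\n");
  return 0;
}
```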