Cleaning up the code.

This commit is contained in:
Daniel Lemire 2018-10-17 21:31:22 -04:00
parent 35381279c3
commit 8315f4c888
2 changed files with 13 additions and 25 deletions

View File

@ -5,6 +5,7 @@
#include <stddef.h> #include <stddef.h>
#include <stdint.h> #include <stdint.h>
#include <x86intrin.h> #include <x86intrin.h>
#include <string.h>
/* /*
* legal utf-8 byte sequence * legal utf-8 byte sequence
* http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94 * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
@ -396,7 +397,6 @@ avxcheckUTF8Bytes_asciipath(__m256i current_bytes,
9, 9, 9, 9, 9, 9, 9, 1)),*has_error); 9, 9, 9, 9, 9, 9, 9, 1)),*has_error);
return *previous; return *previous;
} }
struct avx_processed_utf_bytes pb; struct avx_processed_utf_bytes pb;
avx_count_nibbles(current_bytes, &pb); avx_count_nibbles(current_bytes, &pb);

View File

@ -40,10 +40,6 @@ WARN_UNUSED
return false; return false;
} }
#ifdef UTF8VALIDATE #ifdef UTF8VALIDATE
#define TRYASCIIFIRST // we try parsing it as ASCII and fall over to UTF-8 only as needed.
#ifdef TRYASCIIFIRST
bool isasciisofar = true;
#endif
__m256i has_error = _mm256_setzero_si256(); __m256i has_error = _mm256_setzero_si256();
struct avx_processed_utf_bytes previous = { struct avx_processed_utf_bytes previous = {
.rawbytes = _mm256_setzero_si256(), .rawbytes = _mm256_setzero_si256(),
@ -84,24 +80,20 @@ WARN_UNUSED
m256 input_lo = _mm256_load_si256((const m256 *)(buf + idx + 0)); m256 input_lo = _mm256_load_si256((const m256 *)(buf + idx + 0));
m256 input_hi = _mm256_load_si256((const m256 *)(buf + idx + 32)); m256 input_hi = _mm256_load_si256((const m256 *)(buf + idx + 32));
#ifdef UTF8VALIDATE #ifdef UTF8VALIDATE
#ifdef TRYASCIIFIRST m256 highbit = _mm256_set1_epi8(0x80);
if(isasciisofar) { if((_mm256_testz_si256(_mm256_or_si256(input_lo, input_hi),highbit)) == 1) {
m256 highbit = _mm256_set1_epi8(0x80); // it is ascii, we just check continuation
if((_mm256_testz_si256(input_lo,highbit) & _mm256_testz_si256(input_hi,highbit)) != 1) { has_error = _mm256_or_si256(
isasciisofar = false; _mm256_cmpgt_epi8(previous.carried_continuations,
_mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 1)),has_error);
} else {
// it is not ascii so we have to do heavy work
previous = avxcheckUTF8Bytes(input_lo, &previous, &has_error); previous = avxcheckUTF8Bytes(input_lo, &previous, &has_error);
previous = avxcheckUTF8Bytes(input_hi, &previous, &has_error); previous = avxcheckUTF8Bytes(input_hi, &previous, &has_error);
} }
} else {
#endif // TRYASCIIFIRST
previous = avxcheckUTF8Bytes_asciipath(input_lo, &previous, &has_error);
previous = avxcheckUTF8Bytes_asciipath(input_hi, &previous, &has_error);
//previous = avxcheckUTF8Bytes(input_lo, &previous, &has_error);
//previous = avxcheckUTF8Bytes(input_hi, &previous, &has_error);
#ifdef TRYASCIIFIRST
}
#endif // TRYASCIIFIRST
#endif #endif
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// Step 1: detect odd sequences of backslashes // Step 1: detect odd sequences of backslashes
@ -259,11 +251,7 @@ WARN_UNUSED
*(u64 *)(pj.structurals + idx / 8) = structurals; *(u64 *)(pj.structurals + idx / 8) = structurals;
} }
#ifdef UTF8VALIDATE #ifdef UTF8VALIDATE
#ifdef TRYASCIIFIRST
return isasciisofar || ((!isasciisofar) && (_mm256_testz_si256(has_error, has_error)));
#else
return _mm256_testz_si256(has_error, has_error); return _mm256_testz_si256(has_error, has_error);
#endif
#else #else
return true; return true;
#endif #endif