Code reformatting.

This commit is contained in:
Daniel Lemire 2020-06-22 16:50:57 -04:00
parent 26baf70912
commit 2bb101bd19
2 changed files with 24 additions and 64 deletions

View File

@ -243,17 +243,17 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
// for fear of aliasing
return SUCCESS;
}
// credit: based on code from Google Fuchsia (Apache Licensed)
WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
const uint8_t *data = (const uint8_t *)buf;
uint64_t pos = 0;
uint64_t next_pos = 0;
uint32_t code_point = 0;
while (pos < len) {
// check of the next 8 bytes are ascii.
next_pos = pos + 16;
if (next_pos <=
len) { // if it is safe to read 8 more bytes, check that they are ascii
if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii
uint64_t v1;
memcpy(&v1, data + pos, sizeof(uint64_t));
uint64_t v2;
@ -265,34 +265,21 @@ WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) cons
}
}
unsigned char byte = data[pos];
if (byte < 0b10000000) {
pos++;
continue;
} else if ((byte & 0b11100000) == 0b11000000) {
next_pos = pos + 2;
if (next_pos > len) {
return false;
}
if ((data[pos + 1] & 0b11000000) != 0b10000000) {
return false;
}
if (next_pos > len) { return false; }
if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
// range check
code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
if (code_point < 0x80 || 0x7ff < code_point) {
return false;
}
if (code_point < 0x80 || 0x7ff < code_point) { return false; }
} else if ((byte & 0b11110000) == 0b11100000) {
next_pos = pos + 3;
if (next_pos > len) {
return false;
}
if ((data[pos + 1] & 0b11000000) != 0b10000000) {
return false;
}
if ((data[pos + 2] & 0b11000000) != 0b10000000) {
return false;
}
if (next_pos > len) { return false; }
if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
// range check
code_point = (byte & 0b00001111) << 12 |
(data[pos + 1] & 0b00111111) << 6 |
@ -303,25 +290,15 @@ WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) cons
}
} else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
next_pos = pos + 4;
if (next_pos > len) {
return false;
}
if ((data[pos + 1] & 0b11000000) != 0b10000000) {
return false;
}
if ((data[pos + 2] & 0b11000000) != 0b10000000) {
return false;
}
if ((data[pos + 3] & 0b11000000) != 0b10000000) {
return false;
}
if (next_pos > len) { return false; }
if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; }
// range check
code_point =
(byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
(data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
if (code_point < 0xffff || 0x10ffff < code_point) {
return false;
}
if (code_point < 0xffff || 0x10ffff < code_point) { return false; }
} else {
// we may have a continuation
return false;

View File

@ -98,6 +98,7 @@ std::vector<uint8_t> RandomUTF8::generate(size_t output_bytes, long seed) {
return generate(output_bytes);
}
// credit: based on code from Google Fuchsia (Apache Licensed)
WARN_UNUSED bool basic_validate_utf8(const char *buf, size_t len) noexcept {
const uint8_t *data = (const uint8_t *)buf;
uint64_t pos = 0;
@ -110,28 +111,18 @@ WARN_UNUSED bool basic_validate_utf8(const char *buf, size_t len) noexcept {
continue;
} else if ((byte & 0b11100000) == 0b11000000) {
next_pos = pos + 2;
if (next_pos > len) {
return false;
}
if (next_pos > len) { return false; }
if ((data[pos + 1] & 0b11000000) != 0b10000000) {
return false;
}
// range check
code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
if (code_point < 0x80 || 0x7ff < code_point) {
return false;
}
if (code_point < 0x80 || 0x7ff < code_point) { return false; }
} else if ((byte & 0b11110000) == 0b11100000) {
next_pos = pos + 3;
if (next_pos > len) {
return false;
}
if ((data[pos + 1] & 0b11000000) != 0b10000000) {
return false;
}
if ((data[pos + 2] & 0b11000000) != 0b10000000) {
return false;
}
if (next_pos > len) { return false; }
if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
// range check
code_point = (byte & 0b00001111) << 12 |
(data[pos + 1] & 0b00111111) << 6 |
@ -142,18 +133,10 @@ WARN_UNUSED bool basic_validate_utf8(const char *buf, size_t len) noexcept {
}
} else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
next_pos = pos + 4;
if (next_pos > len) {
return false;
}
if ((data[pos + 1] & 0b11000000) != 0b10000000) {
return false;
}
if ((data[pos + 2] & 0b11000000) != 0b10000000) {
return false;
}
if ((data[pos + 3] & 0b11000000) != 0b10000000) {
return false;
}
if (next_pos > len) { return false; }
if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; }
// range check
code_point =
(byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |