Fixing issue 906 (#912)

* Fixing issue 906

* Safe patching.

* Now with explanations.

* Bumping up memory allocation.

* Putting the patch back.

* fallback fixes.

Co-authored-by: Daniel Lemire <lemire@gmai.com>
This commit is contained in:
Daniel Lemire 2020-06-05 15:37:09 -04:00 committed by GitHub
parent 351717414d
commit 7a69da16e4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 66 additions and 5 deletions

View File

@ -33,7 +33,7 @@ inline error_code document::allocate(size_t capacity) noexcept {
// worse with "[7,7,7,7,6,7,7,7,6,7,7,6,[7,7,7,7,6,7,7,7,6,7,7,6,7,7,7,7,7,7,6"
//where len + 1 tape elements are
// generated, see issue https://github.com/lemire/simdjson/issues/345
size_t tape_capacity = ROUNDUP_N(capacity + 2, 64);
size_t tape_capacity = ROUNDUP_N(capacity + 3, 64);
// a document with only zero-length strings... could have len/3 string
// and we would need len/3 * 5 bytes on the string buffer
size_t string_capacity = ROUNDUP_N(5 * capacity / 3 + 32, 64);

View File

@ -112,6 +112,8 @@ namespace arm64 {
WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
error_code err = stage1(_buf, _len, false);
if (err) { return err; }
err = check_for_unclosed_array();
if (err) { return err; }
return stage2(_doc);
}

View File

@ -133,6 +133,10 @@ really_inline error_code scan() {
}
*next_structural_index = len;
next_structural_index++;
// We pad beyond.
// https://github.com/simdjson/simdjson/issues/906
next_structural_index[0] = len;
next_structural_index[1] = 0;
parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get());
return error;
}
@ -234,6 +238,8 @@ namespace fallback {
WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
error_code err = stage1(_buf, _len, false);
if (err) { return err; }
err = check_for_unclosed_array();
if (err) { return err; }
return stage2(_doc);
}

View File

@ -31,6 +31,7 @@ public:
WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final;
WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, bool streaming) noexcept final;
WARN_UNUSED error_code check_for_unclosed_array() noexcept;
WARN_UNUSED error_code stage2(dom::document &doc) noexcept final;
WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, dom::document &doc, size_t &next_json) noexcept final;
WARN_UNUSED error_code set_capacity(size_t capacity) noexcept final;
@ -56,3 +57,22 @@ WARN_UNUSED error_code dom_parser_implementation::set_max_depth(size_t max_depth
_max_depth = max_depth;
return SUCCESS;
}
WARN_UNUSED error_code dom_parser_implementation::check_for_unclosed_array() noexcept {
// Before we engage stage 2, we want to make sure there is no risk that we could end with [ and
// loop back at the start with [. That is, we want to make sure that if the first character is [, then
// the last one is ].
// See https://github.com/simdjson/simdjson/issues/906 for details.
if(n_structural_indexes < 2) {
return UNEXPECTED_ERROR;
}
const size_t first_index = structural_indexes[0];
const size_t last_index = structural_indexes[n_structural_indexes - 2];
const char first_character = char(buf[first_index]);
const char last_character = char(buf[last_index]);
if((first_character == '[') and (last_character != ']')) {
return TAPE_ERROR;
}
return SUCCESS;
}

View File

@ -109,7 +109,22 @@ really_inline error_code json_structural_indexer::finish(dom_parser_implementati
parser.structural_indexes[parser.n_structural_indexes++] = uint32_t(len);
}
/* make it safe to dereference one beyond this array */
parser.structural_indexes[parser.n_structural_indexes] = 0;
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 1] = 0;
/***
* This is related to https://github.com/simdjson/simdjson/issues/906
* Basically, we want to make sure that if the parsing continues beyond the last (valid)
* structural character, it quickly stops.
* Only three structural characters can be repeated without triggering an error in JSON: [,] and }.
* We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
* continues, then it must be [,] or }.
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
* It can't be a comma, a colon or any simple value. So the only way we could continue is
* if the repeated character is [. But if so, the document must start with [. But if the document
* starts with [, it should end with ]. If we enforce that rule, then we would get
* ][[ which is invalid.
**/
return checker.errors();
}

View File

@ -7,6 +7,7 @@ namespace logger {
static constexpr const int LOG_EVENT_LEN = 30;
static constexpr const int LOG_BUFFER_LEN = 20;
static constexpr const int LOG_DETAIL_LEN = 50;
static constexpr const int LOG_INDEX_LEN = 10;
static int log_depth; // Not threadsafe. Log only.
@ -24,8 +25,8 @@ namespace logger {
if (LOG_ENABLED) {
log_depth = 0;
printf("\n");
printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", LOG_DETAIL_LEN, "Detail");
printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, 4+2, DASHES, 4+2, DASHES, 5+2, DASHES, LOG_DETAIL_LEN+2, DASHES);
printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", LOG_DETAIL_LEN, "Detail", LOG_INDEX_LEN, "index");
printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, 4+2, DASHES, 4+2, DASHES, 5+2, DASHES, LOG_DETAIL_LEN+2, DASHES, LOG_INDEX_LEN+2, DASHES);
}
}
@ -57,6 +58,7 @@ namespace logger {
printf("| %c ", printable_char(structurals.peek_char()));
printf("| %5zd ", structurals.next_structural);
printf("| %-*s ", LOG_DETAIL_LEN, detail);
printf("| %*zu ", LOG_INDEX_LEN, structurals.idx);
printf("|\n");
}
}

View File

@ -101,6 +101,8 @@ namespace haswell {
WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
error_code err = stage1(_buf, _len, false);
if (err) { return err; }
err = check_for_unclosed_array();
if (err) { return err; }
return stage2(_doc);
}

View File

@ -102,6 +102,8 @@ namespace westmere {
WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
error_code err = stage1(_buf, _len, false);
if (err) { return err; }
err = check_for_unclosed_array();
if (err) { return err; }
return stage2(_doc);
}

View File

@ -233,6 +233,17 @@ namespace document_tests {
}
return true;
}
bool padded_with_open_bracket() {
std::cout << __func__ << std::endl;
simdjson::dom::parser parser;
// This is an invalid document padded with open braces.
auto error1 = parser.parse("[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[", 2, false).error();
if (!error1) { std::cerr << "We expected an error but got: " << error1 << std::endl; return false; }
// This is a valid document padded with open braces.
auto error2 = parser.parse("[][[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[", 2, false).error();
if (error2) { std::cerr << "Error: " << error2 << std::endl; return false; }
return true;
}
// returns true if successful
bool stable_test() {
std::cout << __func__ << std::endl;
@ -340,7 +351,8 @@ namespace document_tests {
return true;
}
bool run() {
return bad_example() &&
return padded_with_open_bracket() &&
bad_example() &&
count_array_example() &&
count_object_example() &&
stable_test() &&