Adding 'count_elements' method. (#1577)

* Adding 'count_elements' method.

* Actually reporting errors.

* removing white space.

* Removing white space again.

* Adding an extra example.

* Prettier.

* Making the functionality more error-proof.

* Avoiding exceptions.

* Various fixes including extending count_elements to value types.

* Various fixes.

* Minor fixes.

* Correcting comment.

* Trimming white spaces.
This commit is contained in:
Daniel Lemire 2021-06-06 17:56:00 -04:00 committed by GitHub
parent eb0ae041e3
commit 16e8db1f17
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 343 additions and 18 deletions

View File

@ -311,7 +311,33 @@ cout << doc["str"]["123"]["abc"].get_double() << endl; // Prints 3.14
cout << value << endl; // Prints 3.14
```
Sometimes it is useful to scan an array to determine its length prior to parsing it.
For this purpose, `array` instances have a `count_elements` method. Users should be
aware that the `count_elements` method can be costly since it requires scanning the
whole array. You may use it as follows if your document is itself an array:
```C++
auto cars_json = R"( [ 40.1, 39.9, 37.7, 40.4 ] )"_padded;
auto doc = parser.iterate(cars_json);
size_t count = doc.count_elements();
std::vector<double> values(count);
size_t index = 0;
for(double x : doc) { values[index++] = x; }
```
If you access an array inside a document, you can use the `count_elements` method as follows.
Keep the array instance alive after calling `count_elements`, and consume that same instance, as in this example:
``` C++
ondemand::parser parser;
auto cars_json = R"( { "test":[ { "val1":1, "val2":2 }, { "val1":1, "val2":2 } ] } )"_padded;
auto doc = parser.iterate(cars_json);
auto test_array = doc.find_field("test").get_array();
size_t count = test_array.count_elements();
std::cout << "Number of elements: " << count << std::endl;
for(ondemand::object elem: test_array) {
std::cout << simdjson::to_string(elem);
}
```
Tree Walking and JSON Element Types: Sometimes you don't necessarily have a document with a known type, and are trying to generically inspect or walk over JSON elements. To do that, you can use iterators and the type() method. For example, here's a quick and dirty recursive function that verbosely prints the JSON document as JSON:

View File

@ -287,7 +287,6 @@ struct simdjson_result : public internal::simdjson_result_base<T> {
/**
 * Print the value held by a simdjson_result to an output stream.
 *
 * Deliberately not marked noexcept: this overload lives in the
 * exceptions-only section and goes through value.value(), which is expected
 * to report a stored error by throwing; streaming itself may also throw.
 * With noexcept, any such exception would terminate the program instead of
 * reaching the caller.
 *
 * @param out   the stream to write to
 * @param value the result whose value is printed
 * @return out, to allow chaining
 */
template<typename T>
inline std::ostream& operator<<(std::ostream& out, simdjson_result<T> value) { return out << value.value(); }
#endif // SIMDJSON_EXCEPTIONS
#ifndef SIMDJSON_DISABLE_DEPRECATED_API

View File

@ -72,6 +72,27 @@ simdjson_really_inline simdjson_result<array_iterator> array::end() noexcept {
return array_iterator(iter);
}
simdjson_really_inline simdjson_result<size_t> array::count_elements() & noexcept {
  // Counting is only valid on an array that has not been iterated yet:
  // the iterator must still be sitting on the opening `[`.
#ifdef SIMDJSON_DEVELOPMENT_CHECKS
  // Same guard as in array::begin().
  if(!iter.is_at_container_start()) { return OUT_OF_ORDER_ITERATION; }
#endif
  // Step inside the container (depth one level down, positioned on the first
  // element) so that the loop below can walk the elements.
  iter.enter_at_container_start();
  size_t element_count = 0;
  // Walk over the elements without consuming any of the values.
  for(simdjson_unused auto element : *this) { ++element_count; }
  // The loop itself always completes; problems found while scanning are
  // recorded on the iterator and must be reported to the caller.
  const auto scan_error = iter.error();
  if(scan_error) { return scan_error; }
  // Go back to the first element: callers are expected to iterate through
  // the array after counting its elements.
  iter.enter_at_container_start();
  return element_count;
}
} // namespace ondemand
} // namespace SIMDJSON_IMPLEMENTATION
} // namespace simdjson
@ -101,5 +122,8 @@ simdjson_really_inline simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::array_
if (error()) { return error(); }
return first.end();
}
simdjson_really_inline simdjson_result<size_t> simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::array>::count_elements() & noexcept {
  // An error already stored in this result is passed on unchanged;
  // otherwise the wrapped array does the counting.
  const auto stored_error = error();
  if (stored_error) { return stored_error; }
  return first.count_elements();
}
} // namespace simdjson

View File

@ -31,7 +31,18 @@ public:
* Part of the std::iterable interface.
*/
simdjson_really_inline simdjson_result<array_iterator> end() noexcept;
/**
 * Scan the array and count its elements.
 *
 * Call this method before you begin iterating through the array: the array
 * is expected to still be positioned at its beginning.
 *
 * The running time is linear in the size of the array. On success, the array
 * is rewound to its beginning, as if it had never been accessed, so it can
 * be iterated afterwards. If the JSON is malformed (e.g., there is a missing
 * comma), an error is returned and it is no longer safe to continue.
 *
 * The method is lvalue-qualified (`&`), so it cannot be called on a
 * temporary array.
 *
 * @return the number of elements, or an error.
 */
simdjson_really_inline simdjson_result<size_t> count_elements() & noexcept;
protected:
/**
* Begin array iteration.
@ -98,6 +109,7 @@ public:
simdjson_really_inline simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::array_iterator> begin() noexcept;
simdjson_really_inline simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::array_iterator> end() noexcept;
/** Returns the stored error if this result holds one; otherwise forwards to count_elements() on the wrapped value. */
simdjson_really_inline simdjson_result<size_t> count_elements() & noexcept;
};
} // namespace simdjson

View File

@ -92,7 +92,13 @@ simdjson_really_inline document::operator std::string_view() noexcept(false) { r
// Conversion to raw_json_string: yields whatever get_raw_json_string() returns.
// Declared noexcept(false), i.e. the conversion is allowed to throw.
simdjson_really_inline document::operator raw_json_string() noexcept(false) { return get_raw_json_string(); }
// Conversion to bool: yields whatever get_bool() returns.
// Declared noexcept(false), i.e. the conversion is allowed to throw.
simdjson_really_inline document::operator bool() noexcept(false) { return get_bool(); }
#endif
simdjson_really_inline simdjson_result<size_t> document::count_elements() & noexcept {
  // The count is taken on the array obtained from the document itself.
  auto root_array = get_array();
  simdjson_result<size_t> element_count = root_array.count_elements();
  // On failure there is nothing to undo: hand the error back as is.
  if(element_count.error() != SUCCESS) { return element_count; }
  // If there was an array, we are now left pointing at its first element,
  // one level below the document. Undo that increment so that we are back
  // at the document depth.
  iter._depth -= 1;
  return element_count;
}
simdjson_really_inline simdjson_result<array_iterator> document::begin() & noexcept {
  // Iterating over a document means iterating over the array obtained from it.
  auto root_array = get_array();
  return root_array.begin();
}
@ -150,6 +156,10 @@ simdjson_really_inline simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::docume
)
{
}
simdjson_really_inline simdjson_result<size_t> simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::document>::count_elements() & noexcept {
  // Pass an already-stored error straight through; count only on a valid document.
  const auto stored_error = error();
  if (stored_error) { return stored_error; }
  return first.count_elements();
}
simdjson_really_inline error_code simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::document>::rewind() noexcept {
if (error()) { return error(); }
first.rewind();

View File

@ -198,7 +198,18 @@ public:
*/
simdjson_really_inline operator bool() noexcept(false);
#endif
/**
 * Scan the array held by this document and count its elements.
 *
 * The document is accessed as an array. Call this method before you begin
 * iterating through the array: it is expected that you are pointing at the
 * beginning of the array.
 *
 * The running time is linear in the size of the array. On success, the array
 * is rewound to its beginning, as if it had never been accessed. If the JSON
 * is malformed (e.g., there is a missing comma), an error is returned and it
 * is no longer safe to continue.
 *
 * @return the number of elements, or an error.
 */
simdjson_really_inline simdjson_result<size_t> count_elements() & noexcept;
/**
* Begin array iteration.
*
@ -371,7 +382,7 @@ public:
simdjson_really_inline operator SIMDJSON_IMPLEMENTATION::ondemand::raw_json_string() noexcept(false);
simdjson_really_inline operator bool() noexcept(false);
#endif
/** Returns the stored error if this result holds one; otherwise forwards to count_elements() on the wrapped value. */
simdjson_really_inline simdjson_result<size_t> count_elements() & noexcept;
simdjson_really_inline simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::array_iterator> begin() & noexcept;
simdjson_really_inline simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::array_iterator> end() & noexcept;
simdjson_really_inline simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::value> find_field(std::string_view key) & noexcept;

View File

@ -51,7 +51,6 @@ public:
simdjson_really_inline json_iterator &operator=(json_iterator &&other) noexcept;
simdjson_really_inline explicit json_iterator(const json_iterator &other) noexcept = default;
simdjson_really_inline json_iterator &operator=(const json_iterator &other) noexcept = default;
/**
* Skips a JSON value, whether it is a scalar, array or object.
*/
@ -194,14 +193,13 @@ public:
simdjson_really_inline token_position start_position(depth_t depth) const noexcept;
simdjson_really_inline void set_start_position(depth_t depth, token_position position) noexcept;
#endif
/**
 * Produces a std::string for this iterator.
 * Useful for debugging and logging purposes.
 */
inline std::string to_string() const noexcept;
/**
 * Updates this json iterator so that it is back at the beginning of the document,
 * as if it had just been created.
 */
inline void rewind() noexcept;
protected:
simdjson_really_inline json_iterator(const uint8_t *buf, ondemand::parser *parser) noexcept;
simdjson_really_inline token_position last_document_position() const noexcept;

View File

@ -95,6 +95,16 @@ simdjson_really_inline simdjson_result<array_iterator> value::begin() & noexcept
simdjson_really_inline simdjson_result<array_iterator> value::end() & noexcept {
  // A value-initialized result serves as the end iterator.
  return simdjson_result<array_iterator>{};
}
simdjson_really_inline simdjson_result<size_t> value::count_elements() & noexcept {
  auto as_array = get_array();
  simdjson_result<size_t> element_count = as_array.count_elements();
  // Counting leaves us inside the array, on its first element. Move back onto
  // the opening '[' so that the caller can still create a new array from this
  // value (which requires pointing at '[').
  iter.move_at_start();
  return element_count;
}
simdjson_really_inline simdjson_result<value> value::find_field(std::string_view key) noexcept {
return start_or_resume_object().find_field(key);
@ -145,7 +155,10 @@ simdjson_really_inline simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::value>
implementation_simdjson_result_base<SIMDJSON_IMPLEMENTATION::ondemand::value>(error)
{
}
simdjson_really_inline simdjson_result<size_t> simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::value>::count_elements() & noexcept {
  // A result that already carries an error reports it; otherwise the wrapped value counts.
  const auto stored_error = error();
  if (stored_error) { return stored_error; }
  return first.count_elements();
}
simdjson_really_inline simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::array_iterator> simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::value>::begin() & noexcept {
if (error()) { return error(); }
return first.begin();

View File

@ -208,7 +208,18 @@ public:
* Part of the std::iterable interface.
*/
simdjson_really_inline simdjson_result<array_iterator> end() & noexcept;
/**
 * Scan the array held by this value and count its elements.
 *
 * The value is accessed as an array. Call this method before you begin
 * iterating through the array: it is expected that you are pointing at the
 * beginning of the array.
 *
 * The running time is linear in the size of the array. On success, the array
 * is rewound to its beginning, as if it had never been accessed. If the JSON
 * is malformed (e.g., there is a missing comma), an error is returned and it
 * is no longer safe to continue.
 *
 * @return the number of elements, or an error.
 */
simdjson_really_inline simdjson_result<size_t> count_elements() & noexcept;
/**
* Look up a field by name on an object (order-sensitive).
*
@ -382,7 +393,7 @@ public:
simdjson_really_inline operator SIMDJSON_IMPLEMENTATION::ondemand::raw_json_string() noexcept(false);
simdjson_really_inline operator bool() noexcept(false);
#endif
/** Returns the stored error if this result holds one; otherwise forwards to count_elements() on the wrapped value. */
simdjson_really_inline simdjson_result<size_t> count_elements() & noexcept;
simdjson_really_inline simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::array_iterator> begin() & noexcept;
simdjson_really_inline simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::array_iterator> end() & noexcept;

View File

@ -631,6 +631,17 @@ inline void value_iterator::assert_at_next() const noexcept {
SIMDJSON_ASSUME( _depth > 0 );
}
simdjson_really_inline void value_iterator::enter_at_container_start() noexcept {
  // Place the json iterator just inside this container: one position past
  // the container's start and one level deeper than the container itself.
  auto &json_iter = *_json_iter;
  json_iter.token.index = _start_position + 1;
  json_iter._depth = _depth + 1;
}
simdjson_really_inline void value_iterator::move_at_start() noexcept {
  // Put the json iterator back on this value's start position, at this value's depth.
  auto &json_iter = *_json_iter;
  json_iter.token.index = _start_position;
  json_iter._depth = _depth;
}
inline void value_iterator::assert_at_child() const noexcept {
SIMDJSON_ASSUME( _json_iter->token.index > _start_position );
SIMDJSON_ASSUME( _json_iter->_depth == _depth + 1 );

View File

@ -280,11 +280,17 @@ public:
simdjson_really_inline bool is_valid() const noexcept;
/** @} */
protected:
/**
 * Moves the underlying json iterator back to the start position of this
 * value (so that at_start() is true) and sets its depth to this value's depth.
 */
simdjson_really_inline void move_at_start() noexcept;
/**
 * enter_at_container_start is similar to move_at_start(), except that it
 * places the underlying json iterator just inside the container: the depth
 * is set one level deeper than this value's depth, and the position is set
 * one past this value's start position, i.e., at the first element.
 **/
simdjson_really_inline void enter_at_container_start() noexcept;
/**
 * Produces a std::string for this iterator.
 * Useful for debugging and logging purposes.
 */
inline std::string to_string() const noexcept;
simdjson_really_inline value_iterator(json_iterator *json_iter, depth_t depth, token_position start_index) noexcept;
simdjson_really_inline bool parse_null(const uint8_t *json) const noexcept;

View File

@ -17,6 +17,7 @@ endfunction(add_dual_compile_test)
add_dual_compile_test(example_compiletest)
# These don't compile with exceptions off
if (SIMDJSON_EXCEPTIONS)
add_dual_compile_test(bad_array_count)
add_dual_compile_test(dangling_parser_load)
add_dual_compile_test(dangling_parser_parse_uint8)
add_dual_compile_test(dangling_parser_parse_uchar)

View File

@ -0,0 +1,21 @@
#include "simdjson.h"
#include <iostream>
using namespace simdjson;
// Dual compile test for count_elements().
// With COMPILATION_TEST_USE_FAILING_CODE set, the first branch is built and is
// meant to be rejected by the compiler; otherwise the second branch is built
// and must compile.
int main(void) {
ondemand::parser parser;
auto cars_json = R"( { "test":[ { "val1":1, "val2":2 }, { "val1":1, "val2":2 } ] } )"_padded;
auto doc = parser.iterate(cars_json);
auto testArray = doc.find_field("test");
#if COMPILATION_TEST_USE_FAILING_CODE
// Failing variant: count_elements() is lvalue-qualified (`&`), but here it is
// invoked directly on the result of get_array(), which is not stored anywhere.
size_t count = testArray.get_array().count_elements();
std::cout << "Number of elements: " << count << std::endl;
for(simdjson_unused ondemand::object elem : testArray) {}
#else
// Working variant: the array is kept in a named variable, counted, and then
// that same instance is iterated.
ondemand::array a = testArray.get_array();
size_t count = a.count_elements();
std::cout << "Number of elements: " << count << std::endl;
for(simdjson_unused ondemand::object elem : a) {}
#endif
}

View File

@ -136,6 +136,98 @@ namespace array_tests {
TEST_SUCCEED();
}
// Counts the elements of an array of objects nested under the "test" key, then
// iterates the same array and checks that both numbers agree.
bool iterate_complex_array_count() {
TEST_START();
ondemand::parser parser;
auto cars_json = R"( { "test":[ { "val1":1, "val2":2 }, { "val1":1, "val2":2 } ] } )"_padded;
ondemand::document doc;
ASSERT_SUCCESS(parser.iterate(cars_json).get(doc));
ondemand::array myarray;
ASSERT_SUCCESS(doc.find_field("test").get_array().get(myarray));
size_t count;
ASSERT_SUCCESS(myarray.count_elements().get(count));
// Iterating after counting must still visit every element.
size_t new_count = 0;
for(simdjson_unused auto elem: myarray) { new_count++; }
ASSERT_EQUAL(count, new_count);
TEST_SUCCEED();
}
// Calls count_elements() on two ondemand::value instances looked up by key
// ("test" with 3 elements, then "joe" with 2 elements).
bool iterate_sub_array_count() {
TEST_START();
ondemand::parser parser;
auto key_value_json = R"( { "test":[ 1,2,3], "joe": [1,2] } )"_padded;
ondemand::document doc;
ASSERT_SUCCESS(parser.iterate(key_value_json).get(doc));
// NOTE(review): obj is not used after this point; the lookups below go through doc.
ondemand::object obj;
ASSERT_SUCCESS(doc.get_object().get(obj));
ondemand::value v;
ASSERT_SUCCESS(doc.find_field("test").get(v));
size_t count;
ASSERT_SUCCESS(v.count_elements().get(count));
ASSERT_EQUAL(count, 3);
// Second lookup, reusing the same value and count variables.
ASSERT_SUCCESS(doc.find_field("joe").get(v));
ASSERT_SUCCESS(v.count_elements().get(count));
ASSERT_EQUAL(count, 2);
TEST_SUCCEED();
}
/**
 * Checks count_elements() on a well-formed top-level array: first the count
 * on its own, then the count followed by a full iteration of the same array.
 */
bool iterate_array_count() {
  TEST_START();
  const auto json = R"([ 1, 10, 100 ])"_padded;
  const std::vector<uint64_t> expected_value = { 1, 10, 100 };

  SUBTEST("ondemand::count_elements", test_ondemand_doc(json, [&](auto doc_result) {
    ondemand::array array;
    ASSERT_RESULT( doc_result.type(), json_type::array );
    ASSERT_SUCCESS( doc_result.get(array) );
    size_t count;
    ASSERT_SUCCESS( array.count_elements().get(count) );
    ASSERT_EQUAL(count, expected_value.size());
    return true;
  }));
  SUBTEST("ondemand::count_elements_and_decode", test_ondemand_doc(json, [&](auto doc_result) {
    ondemand::array array;
    ASSERT_RESULT( doc_result.type(), json_type::array );
    ASSERT_SUCCESS( doc_result.get(array) );
    size_t count;
    ASSERT_SUCCESS( array.count_elements().get(count) );
    ASSERT_EQUAL(count, expected_value.size());
    // The array must still be fully iterable after it has been counted.
    size_t i = 0;
    std::vector<uint64_t> receiver(count);
    for (auto value : array) {
      uint64_t actual;
      ASSERT_SUCCESS( value.get(actual) );
      ASSERT_EQUAL(actual, expected_value[i]);
      receiver[i] = actual;
      i++;
    }
    // The iteration must yield exactly as many elements as were counted.
    ASSERT_EQUAL(i, count);
    return true;
  }));
  TEST_SUCCEED();
}
/**
 * Checks that count_elements() reports TAPE_ERROR on an array with a
 * missing comma between two elements.
 */
bool iterate_bad_array_count() {
  TEST_START();
  const auto badjson = R"([ 1, 10 100 ])"_padded;

  SUBTEST("ondemand::count_elements", test_ondemand_doc(badjson, [&](auto doc_result) {
    ondemand::array array;
    ASSERT_RESULT( doc_result.type(), json_type::array );
    ASSERT_SUCCESS( doc_result.get(array) );
    // Initialized so that the diagnostic below never prints an indeterminate
    // value when count_elements() fails with an unexpected error and may have
    // left the output untouched.
    size_t count{0};
    auto e = array.count_elements().get(count);
    if( e != TAPE_ERROR) {
      std::cout << e << "\n";
      std::cout << "expected: " << TAPE_ERROR << "\n";
      std::cout << "count = " << count << "\n";
      return false;
    }
    return true;
  }));
  TEST_SUCCEED();
}
bool iterate_document_array() {
TEST_START();
const auto json = R"([ 1, 10, 100 ])"_padded;
@ -513,6 +605,10 @@ namespace array_tests {
bool run() {
return
iterate_sub_array_count() &&
iterate_complex_array_count() &&
iterate_bad_array_count() &&
iterate_array_count() &&
issue1588() &&
iterate_array() &&
iterate_document_array() &&

View File

@ -44,6 +44,88 @@ bool basics_3() {
TEST_SUCCEED();
}
/**
 * README example check: count the elements of a top-level array through an
 * array instance (twice), then iterate that same instance.
 */
bool json_array_with_array_count() {
  TEST_START();
  ondemand::parser parser;
  auto cars_json = R"( [ 40.1, 39.9, 37.7, 40.4 ] )"_padded;
  auto doc = parser.iterate(cars_json);
  auto arr = doc.get_array();
  size_t count = arr.count_elements();
  // ASSERT_EQUAL takes (ACTUAL, EXPECTED): the measured count goes first.
  ASSERT_EQUAL(count, 4);
  std::cout << count << std::endl;
  // We deliberately do it twice:
  count = arr.count_elements();
  ASSERT_EQUAL(count, 4);
  std::cout << count << std::endl;
  // Next, we check whether we can iterate normally:
  std::vector<double> values(count);
  size_t index = 0;
  for(double x : arr) { values[index++] = x; }
  ASSERT_EQUAL(index, count);
  TEST_SUCCEED();
}
/**
 * README example check: count the elements of an array reached through a
 * value (doc["array"]), twice, then iterate that same value.
 */
bool json_value_with_array_count() {
  TEST_START();
  ondemand::parser parser;
  auto cars_json = R"( {"array":[ 40.1, 39.9, 37.7, 40.4 ]} )"_padded;
  auto doc = parser.iterate(cars_json);
  auto val = doc["array"];
  size_t count = val.count_elements();
  // ASSERT_EQUAL takes (ACTUAL, EXPECTED): the measured count goes first.
  ASSERT_EQUAL(count, 4);
  std::cout << count << std::endl;
  // We deliberately do it twice:
  count = val.count_elements();
  ASSERT_EQUAL(count, 4);
  std::cout << count << std::endl;
  std::vector<double> values(count);
  // Next, we check whether we can iterate normally:
  size_t index = 0;
  for(double x : val) { values[index++] = x; }
  ASSERT_EQUAL(index, count);
  TEST_SUCCEED();
}
/**
 * README example check: count the elements of a document that is itself an
 * array (twice), then iterate the document.
 */
bool json_array_count() {
  TEST_START();
  ondemand::parser parser;
  auto cars_json = R"( [ 40.1, 39.9, 37.7, 40.4 ] )"_padded;
  auto doc = parser.iterate(cars_json);
  size_t count = doc.count_elements();
  // ASSERT_EQUAL takes (ACTUAL, EXPECTED): the measured count goes first.
  ASSERT_EQUAL(count, 4);
  std::cout << count << std::endl;
  // We deliberately do it twice:
  count = doc.count_elements();
  ASSERT_EQUAL(count, 4);
  std::cout << count << std::endl;
  std::vector<double> values(count);
  size_t index = 0;
  for(double x : doc) { values[index++] = x; }
  ASSERT_EQUAL(index, count);
  TEST_SUCCEED();
}
// README example check: count the objects of an array nested under "test",
// then iterate the same array instance, printing each object, and verify that
// the iteration visits as many elements as were counted.
bool json_array_count_complex() {
TEST_START();
ondemand::parser parser;
auto cars_json = R"( { "test":[ { "val1":1, "val2":2 }, { "val1":1, "val2":2 }, { "val1":1, "val2":2 } ] } )"_padded;
auto doc = parser.iterate(cars_json);
// The array is kept in a named variable: it is counted first and iterated next.
auto test_array = doc.find_field("test").get_array();
size_t count = test_array.count_elements();
std::cout << "Number of elements: " << count << std::endl;
size_t c = 0;
for(ondemand::object elem : test_array) {
std::cout << simdjson::to_string(elem);
c++;
}
std::cout << std::endl;
ASSERT_EQUAL(c, count);
TEST_SUCCEED();
}
bool using_the_parsed_json_1() {
TEST_START();
@ -215,6 +297,10 @@ int main() {
true
#if SIMDJSON_EXCEPTIONS
// && basics_1() // Fails because twitter.json isn't in current directory. Compile test only.
&& json_value_with_array_count()
&& json_array_with_array_count()
&& json_array_count_complex()
&& json_array_count()
&& using_the_parsed_json_rewind()
&& basics_2()
&& using_the_parsed_json_1()

View File

@ -100,8 +100,8 @@ simdjson_really_inline bool assert_iterate_error(T &arr, simdjson::error_code ex
}
return assert_equal( count, 1, operation );
}
/* Prints a banner carrying the name of the enclosing test function. */
#define TEST_START() do { std::cout << "> Running " << __func__ << " ..." << std::endl; } while(0);
/* Prints a banner for the subtest NAME, then evaluates TEST; when TEST is false, returns false from the enclosing function. */
#define SUBTEST(NAME, TEST) do { std::cout << " - Subtest " << (NAME) << " ..." << std::endl; if (!(TEST)) { return false; } } while (0);
/*
 * Each ASSERT_* macro forwards its arguments to the matching ::assert_* helper,
 * passing the stringified first argument along, and returns false from the
 * enclosing function when the helper reports failure.
 * Argument order for the two-argument forms is (ACTUAL, EXPECTED).
 */
#define ASSERT_EQUAL(ACTUAL, EXPECTED) do { if (!::assert_equal ((ACTUAL), (EXPECTED), #ACTUAL)) { return false; } } while (0);
#define ASSERT_RESULT(ACTUAL, EXPECTED) do { if (!::assert_result ((ACTUAL), (EXPECTED), #ACTUAL)) { return false; } } while (0);
#define ASSERT_SUCCESS(ACTUAL) do { if (!::assert_success((ACTUAL), #ACTUAL)) { return false; } } while (0);