Merge branch 'master' into jkeiser/ondemand-scalar-order

This commit is contained in:
Daniel Lemire 2021-01-13 14:21:16 -05:00
commit 990da22249
22 changed files with 120 additions and 37 deletions

View File

@ -10,6 +10,9 @@ on:
jobs:
ubuntu-build:
if: >-
! contains(toJSON(github.event.commits.*.message), '[skip ci]') &&
! contains(toJSON(github.event.commits.*.message), '[skip github]')
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2

View File

@ -11,7 +11,10 @@ on:
- cron: 23 */8 * * *
jobs:
build:
build:
if: >-
! contains(toJSON(github.event.commits.*.message), '[skip ci]') &&
! contains(toJSON(github.event.commits.*.message), '[skip github]')
runs-on: ubuntu-latest
env:
# fuzzers that change behaviour with SIMDJSON_FORCE_IMPLEMENTATION

View File

@ -15,6 +15,9 @@ on:
jobs:
ci:
if: >-
! contains(toJSON(github.event.commits.*.message), '[skip ci]') &&
! contains(toJSON(github.event.commits.*.message), '[skip github]')
name: windows-gcc
runs-on: windows-2016

View File

@ -15,6 +15,9 @@ on:
jobs:
ci:
if: >-
! contains(toJSON(github.event.commits.*.message), '[skip ci]') &&
! contains(toJSON(github.event.commits.*.message), '[skip github]')
name: windows-gcc
runs-on: windows-2016

View File

@ -10,6 +10,9 @@ on:
jobs:
windows-mingw:
if: >-
! contains(toJSON(github.event.commits.*.message), '[skip ci]') &&
! contains(toJSON(github.event.commits.*.message), '[skip github]')
name: ${{ matrix.msystem }}
runs-on: windows-latest
defaults:

View File

@ -8,6 +8,9 @@ on:
jobs:
armv7_job:
if: >-
! contains(toJSON(github.event.commits.*.message), '[skip ci]') &&
! contains(toJSON(github.event.commits.*.message), '[skip github]')
# The host should always be Linux
runs-on: ubuntu-20.04
name: Build on ubuntu-20.04 ppc64le

View File

@ -10,6 +10,9 @@ on:
jobs:
ubuntu-build:
if: >-
! contains(toJSON(github.event.commits.*.message), '[skip ci]') &&
! contains(toJSON(github.event.commits.*.message), '[skip github]')
runs-on: ubuntu-18.04
steps:
- uses: actions/checkout@v2

View File

@ -10,6 +10,9 @@ on:
jobs:
ubuntu-build:
if: >-
! contains(toJSON(github.event.commits.*.message), '[skip ci]') &&
! contains(toJSON(github.event.commits.*.message), '[skip github]')
runs-on: ubuntu-18.04
steps:
- uses: actions/checkout@v2

View File

@ -10,6 +10,9 @@ on:
jobs:
ubuntu-build:
if: >-
! contains(toJSON(github.event.commits.*.message), '[skip ci]') &&
! contains(toJSON(github.event.commits.*.message), '[skip github]')
runs-on: ubuntu-18.04
steps:
- uses: actions/checkout@v2

View File

@ -10,6 +10,9 @@ on:
jobs:
ubuntu-build:
if: >-
! contains(toJSON(github.event.commits.*.message), '[skip ci]') &&
! contains(toJSON(github.event.commits.*.message), '[skip github]')
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2

View File

@ -10,6 +10,9 @@ on:
jobs:
ubuntu-build:
if: >-
! contains(toJSON(github.event.commits.*.message), '[skip ci]') &&
! contains(toJSON(github.event.commits.*.message), '[skip github]')
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2

View File

@ -10,6 +10,9 @@ on:
jobs:
ubuntu-build:
if: >-
! contains(toJSON(github.event.commits.*.message), '[skip ci]') &&
! contains(toJSON(github.event.commits.*.message), '[skip github]')
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2

View File

@ -10,6 +10,9 @@ on:
jobs:
ci:
if: >-
! contains(toJSON(github.event.commits.*.message), '[skip ci]') &&
! contains(toJSON(github.event.commits.*.message), '[skip github]')
name: windows-vs16
runs-on: windows-latest
steps:

View File

@ -10,6 +10,9 @@ on:
jobs:
ci:
if: >-
! contains(toJSON(github.event.commits.*.message), '[skip ci]') &&
! contains(toJSON(github.event.commits.*.message), '[skip github]')
name: windows-vs16
runs-on: windows-latest
steps:

View File

@ -10,6 +10,9 @@ on:
jobs:
ci:
if: >-
! contains(toJSON(github.event.commits.*.message), '[skip ci]') &&
! contains(toJSON(github.event.commits.*.message), '[skip github]')
name: windows-vs16
runs-on: windows-latest
steps:

View File

@ -70,6 +70,8 @@ Pull requests are always invited. However, we ask that you follow these guidelin
- Changes should be focused and minimal. You should change as few lines of code as possible. Please do not reformat or touch files needlessly.
- New features must be accompanied by new tests, in general.
- Your code should pass our continuous-integration tests. It is your responsibility to ensure that your proposal passes the tests. We do not merge pull requests that would break our build.
- An exception to this would be changes to non-code files, such as documentation and assets, or trivial changes to code, such as comments, where it is encouraged to explicitly ask for skipping a CI run using the `[skip ci]` prefix in your Pull Request title **and** in the first line of the most recent commit in a push. Example for such a commit: `[skip ci] Fixed typo in power_of_ten's docs`
This benefits the project in such a way that the CI pipeline is not burdened by running jobs on changes that don't change any behavior in the code, which reduces wait times for other Pull Requests that do change behavior and require testing.
If the benefits of your proposed code remain unclear, we may choose to discard your code: that is not an insult, we frequently discard our own code. We may also consider various alternatives and choose another path. Again, that is not an insult or a sign that you have wasted your time.

View File

@ -1,18 +1,18 @@
On Demand Basics
================
On Demand is a new, faster simdjson API with all the ease-of-use you're used to. While it provides a
familiar DOM interface, under the hood it is anything but: it is parsing values *as you use them.*
This means you don't waste time parsing JSON you don't use, and you don't pay the cost of generating
On Demand is a new, faster simdjson API with all the ease-of-use you are used to. While it provides a
familiar DOM interface, under the hood it is different: it is parsing values *as you use them.*
With On Demand, you do not waste time parsing JSON you do not use, and you do not pay the cost of generating
an intermediate DOM tree.
An overview of what you need to know to use simdjson, with examples.
We provide an overview of what you need to know to use the simdjson On Demand API, with examples.
* [Including ondemand](#including-ondemand)
* [Including On Demand](#including-on-demand)
* [The Basics: Loading and Parsing JSON Documents](#the-basics-loading-and-parsing-json-documents)
* [Using the Parsed JSON](#using-the-parsed-json)
ondemand supports the same JSON standards and C++ compilers as simdjson's older DOM API. Refer to the DOM docs for more information:
The On Demand API supports the same JSON standards and C++ compilers as simdjson's DOM API. Refer to the DOM docs for more information:
* [Requirements](basics.md#requirements)
* [Using simdjson as a CMake dependency](#using-simdjson-as-a-cmake-dependency)
@ -25,10 +25,10 @@ ondemand supports the same JSON standards and C++ compilers as simdjson's older
* [C++17 Support](basics.md#c17-support)
* [Backwards Compatibility](basics.md#backwards-compatibility)
For deeper information about the design and implementation of simdjson's ondemand API, refer to
For deeper information about the design and implementation of the simdjson On Demand API, refer to
the [design document](ondemand.md).
Including ondemand
Including On Demand
------------------
To include simdjson, copy [simdjson.h](/singleheader/simdjson.h) and [simdjson.cpp](/singleheader/simdjson.cpp)
@ -37,32 +37,20 @@ into your project. Then include it in your project with:
```c++
#include "simdjson.h"
using namespace simdjson; // optional
using namespace simdjson::builtin; // optional, for ondemand
using namespace simdjson::builtin; // optional, for On Demand
```
You can compile with:
You can generally compile with:
```
c++ -march=native myproject.cpp simdjson.cpp
c++ -O3 myproject.cpp simdjson.cpp
```
Note:
- Users on macOS and other platforms where compilers do not provide C++11 compliance by default
should request it with the appropriate flag (e.g., `c++ -march=native -std=c++17 myproject.cpp simdjson.cpp`).
### The native architecture flag
Passing `-march=native` to the compiler makes On Demand much faster by allowing it to use
optimizations specific to your machine. You cannot do this, however, if you are compiling code
that might be run on less advanced machines.
On Demand uses advanced architecture-specific code for many common processors to make JSON
preprocessing and string parsing faster. By default, however, most c++ compilers will compile to the
least common denominator (since the program could theoretically be run anywhere). Since On Demand is
inlined into your own code, it cannot use these advanced versions unless the compiler is told to
target them. -march=native says "target the current computer," which is a reasonable default for
many applications which both compile and run on the same processor.
The Basics: Loading and Parsing JSON Documents
----------------------------------------------
@ -90,8 +78,8 @@ Documents Are Iterators
A `document` is *not* a fully-parsed JSON value; rather, it is an **iterator** over the JSON text.
This means that while you iterate an array, or search for a field in an object, it is actually
walking through the original JSON text, merrily reading commas and colons and brackets to make sure
you get where you're going. This is the key to On Demand's performance: since it's just an iterator,
it lets you parse values as you use them. And particularly, it lets you *skip* values you don't want
you get where you are going. This is the key to On Demand's performance: since it's just an iterator,
it lets you parse values as you use them. And particularly, it lets you *skip* values you do not want
to use.
### Parser, Document and JSON Scope
@ -100,7 +88,7 @@ Because a document is an iterator over the JSON text, both the JSON text and the
remain alive (in scope) while you are using it. Further, a `parser` may have at most
one document open at a time, since it holds allocated memory used for the parsing.
During the `iterate` call, the original JSON text is never modified--only read. After you're done
During the `iterate` call, the original JSON text is never modified--only read. After you are done
with the document, the source (whether file or string) can be safely discarded.
For best performance, a `parser` instance should be reused over several files: otherwise you will
@ -120,7 +108,7 @@ support for users who avoid exceptions. See [the simdjson DOM API's error handli
or the initial `[` or `{` will be verified. An exception is thrown if the cast is not possible.
> IMPORTANT NOTE: values can only be parsed once. Since documents are *iterators*, once you have
> parsed a value (such as by casting to double), you can't get at it again.
> parsed a value (such as by casting to double), you cannot get at it again.
* **Field Access:** To get the value of the "foo" field in an object, use `object["foo"]`. This will
scan through the object looking for the field with the matching string.

View File

@ -721,7 +721,7 @@ The On Demand approach has some limitations:
There are currently additional technical limitations which we expect to resolve in future releases of the simdjson library:
* The simdjson library offers runtime dispatching which allows you to compile one binary and have it run at full speed on different processors, taking advantage of the specific features of the processor. The On Demand API does not have runtime dispatch support at this time. To benefit from the On Demand API, you must compile your code for a specific processor. E.g., if your processor supports AVX2 instructions, you should compile your binary executable with AVX2 instruction support (by using your compiler's commands). If you are sufficiently technically proficient, you can implement runtime dispatching within your application, by compiling your On Demand code for different processors.
* The simdjson library offers runtime dispatching which allows you to compile one binary and have it run at full speed on different processors, taking advantage of the specific features of the processor. The On Demand API has limited runtime dispatch support. Under x64 systems, to fully benefit from the On Demand API, we recommend that you compile your code for a specific processor. E.g., if your processor supports AVX2 instructions, you should compile your binary executable with AVX2 instruction support (by using your compiler's commands). If you are sufficiently technically proficient, you can implement runtime dispatching within your application, by compiling your On Demand code for different processors.
* There is an initial phase which scans the entire document quickly, irrespective of the size of the document. We plan to break this phase into distinct steps for large files in a future release as we have done with other components of our API (e.g., `parse_many`).
* The On Demand API does not support JSON Pointer. This capability is currently limited to our core API.
@ -737,23 +737,42 @@ At this time we recommend the On Demand API in the following cases:
Good applications for the On Demand API might be:
* You are working from pre-existing large JSON files that have been vetted. You expect them to be well formed according to a known JSON dialect and to have a consistent layout. For example, you might be doing biomedical research or machine learning on top of static data dumps in JSON.
* You have a closed system on predetermined hardware. Both the generation and the consumption of JSON data is within your system. Your team controls both the software that produces the JSON and the software the parses it, your team knows and control the hardware. Thus you can fully test your system.
* Both the generation and the consumption of JSON data are within your system. Your team controls both the software that produces the JSON and the software that parses it, and your team knows and controls the hardware. Thus you can fully test your system.
* You are working with stable JSON APIs which have a consistent layout and JSON dialect.
## Checking Your CPU Selection
## Checking Your CPU Selection (x64 systems)
Given that the On Demand API does not offer runtime dispatching, your code is compiled against a specific CPU target. You should
verify that the code is compiled against the target you expect: `haswell` (AVX2 x64 processors), `westmere` (SSE4 x64 processors), `arm64` (64-bit ARM), `ppc64` (64-bit POWER), `fallback` (others). Under x64 processors, many programmers will want to target `haswell` whereas under ARM,
most programmers will want to target `arm64`. The `fallback` is probably only good for testing purposes, not for deployment.
The On Demand API uses advanced architecture-specific code for many common processors to make JSON preprocessing and string parsing faster. By default, however, most c++ compilers will compile to the least common denominator (since the program could theoretically be run anywhere). Since On Demand is inlined into your own code, it cannot always use these advanced versions unless the compiler is told to target them.
On relevant systems, the On Demand API provides some support for runtime dispatching: that is, it will attempt to detect, at runtime, the instructions that your processor supports and optimize the code accordingly. However, it cannot always make full use of the features of your processor.
Some users wish to run at the best possible speed. Under recent Intel and AMD processors, these users should take additional steps to verify that their code is well optimized.
Given that the On Demand API offers limited runtime dispatching, it matters that your code is compiled against a specific CPU target. You should verify that the code is compiled against the target you expect. Thankfully, the simdjson library will tell you exactly what it detects as an implementation: `haswell` (AVX2 x64 processors), `westmere` (SSE4 x64 processors), `arm64` (64-bit ARM), `ppc64` (64-bit POWER), `fallback` (others). Under x64 processors, many programmers will want to target `haswell` whereas under ARM, most programmers will want to target `arm64` (and it should do so automatically). The `fallback` is probably only good for testing purposes, not for deployment.
```C++
std::cout << simdjson::builtin_implementation()->name() << std::endl;
```
If you are using CMake for your C++ project, then you can pass compilation flags to your compiler by using
the `CMAKE_CXX_FLAGS` variable:
If the `simdjson::builtin_implementation()->name()` call does not return the architecture you wish to target, you may need to pass flags to your compiler.
If you are using CMake for your C++ project, then you can pass compilation flags to your compiler by using the `CMAKE_CXX_FLAGS` variable:
```
cmake -DCMAKE_CXX_FLAGS="-march=haswell" -B build_haswell
cmake --build build_haswell
```
You can also pass the flags directly to your compiler when compiling 'by hand':
````
c++ -march=haswell -O3 myproject.cpp simdjson.cpp
````
In these examples, the `-march=haswell` flag targets a haswell processor and the resulting binary will run on processors that support all features of the haswell processors.
Instead of specifying a specific microarchitecture, you can let your compiler do the work. The `-march=native` flag says "target the current computer," which is a reasonable default for many applications which both compile and run on the same processor.
Passing `-march=native` to the compiler may make On Demand faster by allowing it to use optimizations specific to your machine. You cannot do this, however, if you are compiling code that might be run on less advanced machines. That is, be mindful that when compiling with the `-march=native` flag, the resulting binary will run on the current system but may not run on other systems (e.g., on an old processor).
If you are compiling on an ARM or POWER system, you do not need to be concerned with CPU selection during compilation. The `-march=native` flag is useful for best performance on x64 (e.g., Intel) systems but it is generally unsupported on some platforms such as ARM (aarch64) or POWER.

View File

@ -31,10 +31,12 @@ add_cpp_test(minefieldcheck LABELS dom acceptance per_implementation
add_cpp_test(parse_many_test LABELS dom acceptance per_implementation)
add_cpp_test(pointercheck LABELS dom acceptance per_implementation) # https://tools.ietf.org/html/rfc6901
add_cpp_test(unicode_tests LABELS dom acceptance per_implementation)
add_cpp_test(trivially_copyable_test LABELS dom acceptance per_implementation)
add_cpp_test(minify_tests LABELS other acceptance per_implementation)
add_cpp_test(padded_string_tests LABELS other acceptance )
find_program(BASH bash)
# Below we skip anything on Windows, not just visual studio, because running bash under Windows requires you to

View File

@ -65,6 +65,8 @@ bool validate(const char *dirname) {
auto error = simdjson::padded_string::load(fullpath).get(p);
if (error) {
std::cerr << "Could not load the file " << fullpath << std::endl;
free(fullpath);
delete[] is_file_as_expected;
return EXIT_FAILURE;
}
simdjson::dom::parser parser;

View File

@ -63,6 +63,8 @@ bool validate_minefield(const char *dirname) {
auto error = simdjson::padded_string::load(fullpath).get(p);
if (error) {
std::cerr << "Could not load the file " << fullpath << std::endl;
free(fullpath);
delete[] is_file_as_expected;
return EXIT_FAILURE;
}
simdjson::dom::parser parser;

View File

@ -0,0 +1,23 @@
// This file is not part of our main, regular tests.
#include "../singleheader/simdjson.h"
#include <iostream>
#include <type_traits>
// Entry point: checks that simdjson::dom::element, simdjson::dom::array and
// simdjson::dom::object are trivially copyable. The first type that fails the
// check is reported on stderr and the program exits with EXIT_FAILURE; when
// all three pass, the program exits with EXIT_SUCCESS.
int main() {
  // Message for the first offending type, or nullptr when every check passes.
  const char *complaint = nullptr;
  if (!std::is_trivially_copyable<simdjson::dom::element>::value) {
    complaint = "simdjson::dom::element must be trivially copyable";
  } else if (!std::is_trivially_copyable<simdjson::dom::array>::value) {
    complaint = "simdjson::dom::array must be trivially copyable";
  } else if (!std::is_trivially_copyable<simdjson::dom::object>::value) {
    complaint = "simdjson::dom::object must be trivially copyable";
  }

  if (complaint == nullptr) {
    return EXIT_SUCCESS;
  }

  // Single reporting site; the emitted text matches the selected message.
  std::cerr << complaint << std::endl;
  return EXIT_FAILURE;
}