Merge branch 'master' into fix_minor_problems

This commit is contained in:
Kai Wolf 2019-02-25 21:05:29 +01:00 committed by GitHub
commit e7683820d5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 1718 additions and 21 deletions

View File

@ -1,9 +1,82 @@
version: 2 version: 2
jobs: jobs:
build: "gcc":
docker: docker:
- image: gcc:8 - image: ubuntu:18.04
environment:
CXX: g++-7
steps: steps:
- checkout - checkout
- run: make
- run: make quiettest - run: apt-get update -qq
- run: >
apt-get install -y
build-essential
cmake
g++-7
- run:
name: Building (gcc)
command: make
- run:
name: Running tests (gcc)
command: make quiettest
- run:
name: Building (gcc, cmake)
command: |
mkdir build
cd build
cmake ..
make
- run:
name: Running tests (gcc, cmake)
command: |
cd build
make test
"clang":
docker:
- image: ubuntu:18.04
environment:
CXX: clang++-6.0
steps:
- checkout
- run: apt-get update -qq
- run: >
apt-get install -y
build-essential
cmake
clang-6.0
- run:
name: Building (clang)
command: make
- run:
name: Running tests (clang)
command: make quiettest
- run:
name: Building (clang, cmake)
command: |
mkdir build
cd build
cmake ..
make
- run:
name: Running tests (clang, cmake)
command: |
cd build
make test
workflows:
version: 2
build_and_test:
jobs:
- "clang"
- "gcc"

View File

@ -5,5 +5,5 @@ steps:
- name: test - name: test
image: gcc:8 image: gcc:8
commands: commands:
- make - make -j2
- make quiettest - make quiettest -j2

View File

@ -1,4 +1,4 @@
language: c++ language: cpp
sudo: false sudo: false
addons: addons:
apt: apt:
@ -15,6 +15,5 @@ branches:
script: script:
- export CXX=g++-7 - export CXX=g++-7
- export CC=gcc-7 - export CC=gcc-7
- make - make
- make test - make test

View File

@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.8) cmake_minimum_required(VERSION 3.8...3.13)
set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_MACOSX_RPATH OFF) set(CMAKE_MACOSX_RPATH OFF)
@ -34,3 +34,18 @@ add_subdirectory(src)
add_subdirectory(tools) add_subdirectory(tools)
add_subdirectory(tests) add_subdirectory(tests)
add_subdirectory(benchmark) add_subdirectory(benchmark)
set(CPACK_PACKAGE_VENDOR "Daniel Lemire")
set(CPACK_PACKAGE_CONTACT "lemire@gmail.com")
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Parsing gigabytes of JSON per second")
set(CPACK_PACKAGE_VERSION_MAJOR ${PROJECT_VERSION_MAJOR})
set(CPACK_PACKAGE_VERSION_MINOR ${PROJECT_VERSION_MINOR})
set(CPACK_PACKAGE_VERSION_PATCH ${PROJECT_VERSION_PATCH})
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
set(CPACK_RESOURCE_FILE_README "${CMAKE_CURRENT_SOURCE_DIR}/README.md")
set(CPACK_RPM_PACKAGE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
set(CPACK_SOURCE_GENERATOR "TGZ;ZIP")
include(CPack)

View File

@ -9,7 +9,9 @@
JSON documents are everywhere on the Internet. Servers spend a lot of time parsing these documents. We want to accelerate the parsing of JSON per se using commonly available SIMD instructions as much as possible while doing full validation (including character encoding). JSON documents are everywhere on the Internet. Servers spend a lot of time parsing these documents. We want to accelerate the parsing of JSON per se using commonly available SIMD instructions as much as possible while doing full validation (including character encoding).
## Paper
A description of the design and implementation of simdjson appears at https://arxiv.org/abs/1902.08318 and an informal blog post providing some background and context is at https://branchfree.org/2019/02/25/paper-parsing-gigabytes-of-json-per-second/.
## Some performance results ## Some performance results
We can use a quarter or fewer instructions than a state-of-the-art parser like RapidJSON, and half as many as sajson. To our knowledge, simdjson is the first fully-validating JSON parser to run at gigabytes per second on commodity processors. We can use a quarter or fewer instructions than a state-of-the-art parser like RapidJSON, and half as many as sajson. To our knowledge, simdjson is the first fully-validating JSON parser to run at gigabytes per second on commodity processors.
@ -221,7 +223,7 @@ To simplify the engineering, we make some assumptions.
- We assume AVX2 support which is available in all recent mainstream x86 processors produced by AMD and Intel. No support for non-x86 processors is included though it can be done. We plan to support ARM processors (help is invited). - We assume AVX2 support which is available in all recent mainstream x86 processors produced by AMD and Intel. No support for non-x86 processors is included though it can be done. We plan to support ARM processors (help is invited).
- In cases of failure, we just report a failure without any indication as to the nature of the problem. (This can be easily improved without affecting performance.) - In cases of failure, we just report a failure without any indication as to the nature of the problem. (This can be easily improved without affecting performance.)
- As allowed by the specification, we allow repeated keys within an object (other parsers like sajson do the same). - As allowed by the specification, we allow repeated keys within an object (other parsers like sajson do the same).
- Performance is optimized for JSON documents spanning at least a few kilobytes up to many megabytes: the performance issues with having to parse many tiny JSON documents or one truly enormous JSON document are different. - Performance is optimized for JSON documents spanning at least tens of kilobytes up to many megabytes: the performance issues with having to parse many tiny JSON documents or one truly enormous JSON document are different.
*We do not aim to provide a general-purpose JSON library.* A library like RapidJSON offers much more than just parsing, it helps you generate JSON and offers various other convenient functions. We merely parse the document. *We do not aim to provide a general-purpose JSON library.* A library like RapidJSON offers much more than just parsing, it helps you generate JSON and offers various other convenient functions. We merely parse the document.
@ -361,9 +363,10 @@ make allparsingcompetition
./allparsingcompetition myfile.json ./allparsingcompetition myfile.json
``` ```
## Python bindings ## Other programming languages
- [pysimdjson](https://github.com/TkTech/pysimdjson): Python bindings for the simdjson project. - [pysimdjson](https://github.com/TkTech/pysimdjson): Python bindings for the simdjson project.
- [SimdJsonSharp](https://github.com/EgorBo/SimdJsonSharp): C# version for .NET Core

View File

@ -27,7 +27,7 @@
#define really_inline inline #define really_inline inline
#define never_inline inline #define never_inline __declspec(noinline)
#define UNUSED #define UNUSED
#define WARN_UNUSED #define WARN_UNUSED

1
jsonchecker/pass15.json Normal file
View File

@ -0,0 +1 @@
[-65.619720000000029]

View File

@ -0,0 +1,4 @@
Files from https://github.com/plokhotnyuk/jsoniter-scala/tree/master/jsoniter-scala-benchmark/src/main/resources/com/github/plokhotnyuk/jsoniter_scala/benchmark
See issue "Lower performance on small files":
https://github.com/lemire/simdjson/issues/70

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,428 @@
[
{
"created_at": "Thu Apr 06 15:28:43 +0000 2017",
"id": 850007368138018817,
"id_str": "850007368138018817",
"text": "RT @TwitterDev: 1/ Today were sharing our vision for the future of the Twitter API platform!\nhttps://t.co/XweGngmxlP",
"truncated": false,
"entities": {
"hashtags": [],
"symbols": [],
"user_mentions": [
{
"screen_name": "TwitterDev",
"name": "TwitterDev",
"id": 2244994945,
"id_str": "2244994945",
"indices": [
3,
14
]
}
],
"urls": [
{
"url": "https://t.co/XweGngmxlP",
"expanded_url": "https://cards.twitter.com/cards/18ce53wgo4h/3xo1c",
"display_url": "cards.twitter.com/cards/18ce53wg…",
"indices": [
94,
117
]
}
]
},
"source": "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>",
"in_reply_to_status_id": null,
"in_reply_to_status_id_str": null,
"in_reply_to_user_id": null,
"in_reply_to_user_id_str": null,
"in_reply_to_screen_name": null,
"user": {
"id": 6253282,
"id_str": "6253282",
"name": "Twitter API",
"screen_name": "twitterapi",
"location": "San Francisco, CA",
"description": "The Real Twitter API. I tweet about API changes, service issues and happily answer questions about Twitter and our API. Don't get an answer? It's on my website.",
"url": "http://t.co/78pYTvWfJd",
"entities": {
"url": {
"urls": [
{
"url": "http://t.co/78pYTvWfJd",
"expanded_url": "https://dev.twitter.com",
"display_url": "dev.twitter.com",
"indices": [
0,
22
]
}
]
},
"description": {
"urls": []
}
},
"protected": false,
"followers_count": 6172353,
"friends_count": 46,
"listed_count": 13091,
"created_at": "Wed May 23 06:01:13 +0000 2007",
"favourites_count": 26,
"utc_offset": -25200,
"time_zone": "Pacific Time (US & Canada)",
"geo_enabled": true,
"verified": true,
"statuses_count": 3583,
"lang": "en",
"contributors_enabled": false,
"is_translator": false,
"is_translation_enabled": false,
"profile_background_color": "C0DEED",
"profile_background_image_url": "http://pbs.twimg.com/profile_background_images/656927849/miyt9dpjz77sc0w3d4vj.png",
"profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/656927849/miyt9dpjz77sc0w3d4vj.png",
"profile_background_tile": true,
"profile_image_url": "http://pbs.twimg.com/profile_images/2284174872/7df3h38zabcvjylnyfe3_normal.png",
"profile_image_url_https": "https://pbs.twimg.com/profile_images/2284174872/7df3h38zabcvjylnyfe3_normal.png",
"profile_banner_url": "https://pbs.twimg.com/profile_banners/6253282/1431474710",
"profile_link_color": "0084B4",
"profile_sidebar_border_color": "C0DEED",
"profile_sidebar_fill_color": "DDEEF6",
"profile_text_color": "333333",
"profile_use_background_image": true,
"has_extended_profile": false,
"default_profile": false,
"default_profile_image": false,
"following": true,
"follow_request_sent": false,
"notifications": false,
"translator_type": "regular"
},
"geo": null,
"coordinates": null,
"place": null,
"contributors": null,
"retweeted_status": {
"created_at": "Thu Apr 06 15:24:15 +0000 2017",
"id": 850006245121695744,
"id_str": "850006245121695744",
"text": "1/ Today were sharing our vision for the future of the Twitter API platform!\nhttps://t.co/XweGngmxlP",
"truncated": false,
"entities": {
"hashtags": [],
"symbols": [],
"user_mentions": [],
"urls": [
{
"url": "https://t.co/XweGngmxlP",
"expanded_url": "https://cards.twitter.com/cards/18ce53wgo4h/3xo1c",
"display_url": "cards.twitter.com/cards/18ce53wg…",
"indices": [
78,
101
]
}
]
},
"source": "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>",
"in_reply_to_status_id": null,
"in_reply_to_status_id_str": null,
"in_reply_to_user_id": null,
"in_reply_to_user_id_str": null,
"in_reply_to_screen_name": null,
"user": {
"id": 2244994945,
"id_str": "2244994945",
"name": "TwitterDev",
"screen_name": "TwitterDev",
"location": "Internet",
"description": "Your official source for Twitter Platform news, updates & events. Need technical help? Visit https://t.co/mGHnxZCxkt ⌨️ #TapIntoTwitter",
"url": "https://t.co/66w26cua1O",
"entities": {
"url": {
"urls": [
{
"url": "https://t.co/66w26cua1O",
"expanded_url": "https://dev.twitter.com/",
"display_url": "dev.twitter.com",
"indices": [
0,
23
]
}
]
},
"description": {
"urls": [
{
"url": "https://t.co/mGHnxZCxkt",
"expanded_url": "https://twittercommunity.com/",
"display_url": "twittercommunity.com",
"indices": [
93,
116
]
}
]
}
},
"protected": false,
"followers_count": 465425,
"friends_count": 1523,
"listed_count": 1168,
"created_at": "Sat Dec 14 04:35:55 +0000 2013",
"favourites_count": 2098,
"utc_offset": -25200,
"time_zone": "Pacific Time (US & Canada)",
"geo_enabled": true,
"verified": true,
"statuses_count": 3031,
"lang": "en",
"contributors_enabled": false,
"is_translator": false,
"is_translation_enabled": false,
"profile_background_color": "FFFFFF",
"profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png",
"profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png",
"profile_background_tile": false,
"profile_image_url": "http://pbs.twimg.com/profile_images/530814764687949824/npQQVkq8_normal.png",
"profile_image_url_https": "https://pbs.twimg.com/profile_images/530814764687949824/npQQVkq8_normal.png",
"profile_banner_url": "https://pbs.twimg.com/profile_banners/2244994945/1396995246",
"profile_link_color": "0084B4",
"profile_sidebar_border_color": "FFFFFF",
"profile_sidebar_fill_color": "DDEEF6",
"profile_text_color": "333333",
"profile_use_background_image": false,
"has_extended_profile": false,
"default_profile": false,
"default_profile_image": false,
"following": true,
"follow_request_sent": false,
"notifications": false,
"translator_type": "regular"
},
"geo": null,
"coordinates": null,
"place": null,
"contributors": null,
"is_quote_status": false,
"retweet_count": 284,
"favorite_count": 399,
"favorited": false,
"retweeted": false,
"possibly_sensitive": false,
"lang": "en"
},
"is_quote_status": false,
"retweet_count": 284,
"favorite_count": 0,
"favorited": false,
"retweeted": false,
"possibly_sensitive": false,
"lang": "en"
},
{
"created_at": "Mon Apr 03 16:09:50 +0000 2017",
"id": 848930551989915648,
"id_str": "848930551989915648",
"text": "RT @TwitterMktg: Starting today, businesses can request and share locations when engaging with people in Direct Messages. https://t.co/rpYn…",
"truncated": false,
"entities": {
"hashtags": [],
"symbols": [],
"user_mentions": [
{
"screen_name": "TwitterMktg",
"name": "Twitter Marketing",
"id": 357750891,
"id_str": "357750891",
"indices": [
3,
15
]
}
],
"urls": []
},
"source": "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>",
"in_reply_to_status_id": null,
"in_reply_to_status_id_str": null,
"in_reply_to_user_id": null,
"in_reply_to_user_id_str": null,
"in_reply_to_screen_name": null,
"user": {
"id": 6253282,
"id_str": "6253282",
"name": "Twitter API",
"screen_name": "twitterapi",
"location": "San Francisco, CA",
"description": "The Real Twitter API. I tweet about API changes, service issues and happily answer questions about Twitter and our API. Don't get an answer? It's on my website.",
"url": "http://t.co/78pYTvWfJd",
"entities": {
"url": {
"urls": [
{
"url": "http://t.co/78pYTvWfJd",
"expanded_url": "https://dev.twitter.com",
"display_url": "dev.twitter.com",
"indices": [
0,
22
]
}
]
},
"description": {
"urls": []
}
},
"protected": false,
"followers_count": 6172353,
"friends_count": 46,
"listed_count": 13091,
"created_at": "Wed May 23 06:01:13 +0000 2007",
"favourites_count": 26,
"utc_offset": -25200,
"time_zone": "Pacific Time (US & Canada)",
"geo_enabled": true,
"verified": true,
"statuses_count": 3583,
"lang": "en",
"contributors_enabled": false,
"is_translator": false,
"is_translation_enabled": false,
"profile_background_color": "C0DEED",
"profile_background_image_url": "http://pbs.twimg.com/profile_background_images/656927849/miyt9dpjz77sc0w3d4vj.png",
"profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/656927849/miyt9dpjz77sc0w3d4vj.png",
"profile_background_tile": true,
"profile_image_url": "http://pbs.twimg.com/profile_images/2284174872/7df3h38zabcvjylnyfe3_normal.png",
"profile_image_url_https": "https://pbs.twimg.com/profile_images/2284174872/7df3h38zabcvjylnyfe3_normal.png",
"profile_banner_url": "https://pbs.twimg.com/profile_banners/6253282/1431474710",
"profile_link_color": "0084B4",
"profile_sidebar_border_color": "C0DEED",
"profile_sidebar_fill_color": "DDEEF6",
"profile_text_color": "333333",
"profile_use_background_image": true,
"has_extended_profile": false,
"default_profile": false,
"default_profile_image": false,
"following": true,
"follow_request_sent": false,
"notifications": false,
"translator_type": "regular"
},
"geo": null,
"coordinates": null,
"place": null,
"contributors": null,
"retweeted_status": {
"created_at": "Mon Apr 03 16:05:05 +0000 2017",
"id": 848929357519241216,
"id_str": "848929357519241216",
"text": "Starting today, businesses can request and share locations when engaging with people in Direct Messages. https://t.co/rpYndqWfQw",
"truncated": false,
"entities": {
"hashtags": [],
"symbols": [],
"user_mentions": [],
"urls": [
{
"url": "https://t.co/rpYndqWfQw",
"expanded_url": "https://cards.twitter.com/cards/5wzucr/3x700",
"display_url": "cards.twitter.com/cards/5wzucr/3…",
"indices": [
105,
128
]
}
]
},
"source": "<a href=\"https://ads.twitter.com\" rel=\"nofollow\">Twitter Ads</a>",
"in_reply_to_status_id": null,
"in_reply_to_status_id_str": null,
"in_reply_to_user_id": null,
"in_reply_to_user_id_str": null,
"in_reply_to_screen_name": null,
"user": {
"id": 357750891,
"id_str": "357750891",
"name": "Twitter Marketing",
"screen_name": "TwitterMktg",
"location": "Twitter HQ ",
"description": "Twitters place for marketers, agencies, and creative thinkers ⭐ Bringing you insights, news, updates, and inspiration. Visit @TwitterAdsHelp for Ads support.",
"url": "https://t.co/Tfo4moo92y",
"entities": {
"url": {
"urls": [
{
"url": "https://t.co/Tfo4moo92y",
"expanded_url": "https://marketing.twitter.com",
"display_url": "marketing.twitter.com",
"indices": [
0,
23
]
}
]
},
"description": {
"urls": []
}
},
"protected": false,
"followers_count": 924546,
"friends_count": 661,
"listed_count": 3893,
"created_at": "Thu Aug 18 21:08:15 +0000 2011",
"favourites_count": 1934,
"utc_offset": -25200,
"time_zone": "Pacific Time (US & Canada)",
"geo_enabled": true,
"verified": true,
"statuses_count": 6329,
"lang": "en",
"contributors_enabled": false,
"is_translator": false,
"is_translation_enabled": false,
"profile_background_color": "C0DEED",
"profile_background_image_url": "http://pbs.twimg.com/profile_background_images/662767273/jvmxdpdrplhxcw8yvkv2.png",
"profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/662767273/jvmxdpdrplhxcw8yvkv2.png",
"profile_background_tile": true,
"profile_image_url": "http://pbs.twimg.com/profile_images/800953549697888256/UlXXL5h5_normal.jpg",
"profile_image_url_https": "https://pbs.twimg.com/profile_images/800953549697888256/UlXXL5h5_normal.jpg",
"profile_banner_url": "https://pbs.twimg.com/profile_banners/357750891/1487188210",
"profile_link_color": "19CF86",
"profile_sidebar_border_color": "FFFFFF",
"profile_sidebar_fill_color": "DDEEF6",
"profile_text_color": "333333",
"profile_use_background_image": true,
"has_extended_profile": false,
"default_profile": false,
"default_profile_image": false,
"following": false,
"follow_request_sent": false,
"notifications": false,
"translator_type": "none"
},
"geo": null,
"coordinates": null,
"place": null,
"contributors": null,
"is_quote_status": false,
"retweet_count": 111,
"favorite_count": 162,
"favorited": false,
"retweeted": false,
"possibly_sensitive": false,
"lang": "en"
},
"is_quote_status": false,
"retweet_count": 111,
"favorite_count": 0,
"favorited": false,
"retweeted": false,
"possibly_sensitive": false,
"lang": "en"
}
]

View File

@ -48,16 +48,16 @@ bool ParsedJson::allocateCapacity(size_t len, size_t maxdepth) {
bytecapacity = 0; // will only set it to len after allocations are a success bytecapacity = 0; // will only set it to len after allocations are a success
n_structural_indexes = 0; n_structural_indexes = 0;
uint32_t max_structures = ROUNDUP_N(len, 64) + 2 + 7; uint32_t max_structures = ROUNDUP_N(len, 64) + 2 + 7;
structural_indexes = new uint32_t[max_structures]; structural_indexes = new (std::nothrow) uint32_t[max_structures];
size_t localtapecapacity = ROUNDUP_N(len, 64); size_t localtapecapacity = ROUNDUP_N(len, 64);
size_t localstringcapacity = ROUNDUP_N(len + 32, 64); size_t localstringcapacity = ROUNDUP_N(len + 32, 64);
string_buf = new uint8_t[localstringcapacity]; string_buf = new (std::nothrow) uint8_t[localstringcapacity];
tape = new uint64_t[localtapecapacity]; tape = new (std::nothrow) uint64_t[localtapecapacity];
containing_scope_offset = new uint32_t[maxdepth]; containing_scope_offset = new (std::nothrow) uint32_t[maxdepth];
#ifdef SIMDJSON_USE_COMPUTED_GOTO #ifdef SIMDJSON_USE_COMPUTED_GOTO
ret_address = new void *[maxdepth]; ret_address = new (std::nothrow) void *[maxdepth];
#else #else
ret_address = new char[maxdepth]; ret_address = new (std::nothrow) char[maxdepth];
#endif #endif
if ((string_buf == nullptr) || (tape == nullptr) || if ((string_buf == nullptr) || (tape == nullptr) ||
(containing_scope_offset == nullptr) || (ret_address == nullptr) || (structural_indexes == nullptr)) { (containing_scope_offset == nullptr) || (ret_address == nullptr) || (structural_indexes == nullptr)) {