PPC64 support (#1254)

* Initial PPC64 support

* Add travis CI

* Fix outdated cmake version for travis

* Fix indentation

* Try another workaround for outdated cmake in travis

* Try beta cmake

* Add dash before beta

* Use builtin snaps

* Use cmake the same way rocksdb does

* Test cmake on bionic

* Remove unnecessary things from travis

* Remove unnecessary things from travis

* Another try of compiler install

* Add all major compilers

* Add all major compilers

* Add all major compilers

* Tweak travis a bit

* Typo

* More robust travis

* Typos typos typos

* Add fewer compilers, add non-specific builds for clang and gcc; should be the final config

* CMAKE_FLAGS was in an incorrect place

* Remove default implementation

* Limit build thread number

* Fall back prefix_xor to the usual implementation; no performance boost was noticed

* Test for power9 as it is the main architecture for OpenPOWER right now

* Add documentation on building with power9, as the implementation is compatible but compiler optimizations are not

* Replace ARM with PPC in the comment
Danila Kutenin 2020-10-28 01:43:39 +03:00 committed by GitHub
parent 1fd0447dbb
commit f46a0f64f2
29 changed files with 1246 additions and 128 deletions

@@ -1,30 +1,179 @@
language: cpp
sudo: false
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- gcc-7
- g++-7
- clang-format
- python
branches:
only:
- master
dist: bionic
arch:
- ppc64le
matrix:
include:
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-8
env:
- COMPILER="CC=gcc-8 && CXX=g++-8"
compiler: gcc-8
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-9
env:
- COMPILER="CC=gcc-9 && CXX=g++-9"
compiler: gcc-9
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-10
env:
- COMPILER="CC=gcc-10 && CXX=g++-10"
compiler: gcc-10
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-10
env:
- COMPILER="CC=gcc-10 && CXX=g++-10"
- SANITIZE="on"
compiler: gcc-10-sanitize
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-10
env:
- COMPILER="CC=gcc-10 && CXX=g++-10"
- STATIC="on"
compiler: gcc-10-static
- os: linux
addons:
apt:
sources:
- llvm-toolchain-bionic-6.0
packages:
- clang-6.0
env:
- COMPILER="CC=clang-6.0 && CXX=clang++-6.0"
compiler: clang-6
- os: linux
addons:
apt:
sources:
- llvm-toolchain-bionic-7
packages:
- clang-7
env:
- COMPILER="CC=clang-7 && CXX=clang++-7"
compiler: clang-7
- os: linux
addons:
apt:
sources:
- llvm-toolchain-bionic-8
packages:
- clang-8
env:
- COMPILER="CC=clang-8 && CXX=clang++-8"
compiler: clang-8
- os: linux
addons:
apt:
sources:
- llvm-toolchain-bionic-9
packages:
- clang-9
env:
- COMPILER="CC=clang-9 && CXX=clang++-9"
compiler: clang-9
- os: linux
addons:
apt:
packages:
- clang-10
sources:
- ubuntu-toolchain-r-test
- sourceline: 'deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main'
key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key'
env:
- COMPILER="CC=clang-10 && CXX=clang++-10"
compiler: clang-10
- os: linux
addons:
apt:
packages:
- clang-10
sources:
- ubuntu-toolchain-r-test
- sourceline: 'deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main'
key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key'
env:
- COMPILER="CC=clang-10 && CXX=clang++-10"
- STATIC="on"
compiler: clang-10-static
- os: linux
addons:
apt:
packages:
- clang-10
sources:
- ubuntu-toolchain-r-test
- sourceline: 'deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main'
key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key'
env:
- COMPILER="CC=clang-10 && CXX=clang++-10"
- SANITIZE="on"
compiler: clang-10-sanitize
before_install:
- eval "${COMPILER}"
install:
- if [[ "${TRAVIS_CPU_ARCH}" == "ppc64le" ]]; then
sudo apt-get install libuv1 rhash libstdc++6;
wget https://anaconda.org/conda-forge/cmake/3.17.0/download/linux-ppc64le/cmake-3.17.0-hfb1cb51_0.tar.bz2;
mkdir $HOME/cmake;
tar -xjf cmake-3.17.0-hfb1cb51_0.tar.bz2 -C $HOME/cmake;
export PATH=$HOME/cmake/bin:$PATH;
fi
- export CMAKE_CXX_FLAGS="-maltivec -mcpu=power9 -mtune=power9"
- export CMAKE_C_FLAGS="${CMAKE_CXX_FLAGS}"
- export CMAKE_FLAGS="-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} -DSIMDJSON_IMPLEMENTATION=ppc64;fallback";
- if [[ "${SANITIZE}" == "on" ]]; then
export CMAKE_FLAGS="${CMAKE_FLAGS} -DSIMDJSON_SANITIZE=ON";
export ASAN_OPTIONS="detect_leaks=0";
fi
- if [[ "${STATIC}" == "on" ]]; then
export CMAKE_FLAGS="${CMAKE_FLAGS} -DSIMDJSON_BUILD_STATIC=ON";
fi
- export CTEST_FLAGS="-j4 --output-on-failure -E checkperf"
script:
- export CXX=g++-7
- export CC=gcc-7
- make
- make test
- make everything
- make amalgamate
- make clean
- make SANITIZEGOLD=1 test
- make clean
- ARCHFLAGS="-march=nehalem" make
- ARCHFLAGS="-march=nehalem" make test
- ARCHFLAGS="-march=nehalem" make everything
- ./style/run-clang-format.py -r include/ benchmark/ src/ tests/
- mkdir build
- cd build
- cmake $CMAKE_FLAGS ..
- cmake --build . -- -j2
- SIMDJSON_FORCE_IMPLEMENTATION=ppc64 ctest $CTEST_FLAGS -L per_implementation
- SIMDJSON_FORCE_IMPLEMENTATION=fallback ctest $CTEST_FLAGS -L per_implementation
- ctest $CTEST_FLAGS -LE "acceptance|per_implementation"

@@ -48,7 +48,7 @@ simdjson's source structure, from the top level, looks like this:
implementations).
* simdjson.cpp: A "main source" that includes all implementation files from src/. This is
equivalent to the distributed simdjson.cpp.
-* arm64/|fallback/|haswell/|westmere/: Architecture-specific implementations. All functions are
+* arm64/|fallback/|haswell/|ppc64/|westmere/: Architecture-specific implementations. All functions are
Each architecture defines its own namespace, e.g. simdjson::haswell.
* generic/: Generic implementations of the simdjson parser. These files may be included and
compiled multiple times, from whichever architectures use them. They assume they are already

@@ -63,8 +63,8 @@ void print_usage(ostream& out) {
out << "-s STAGE - Stop after the given stage." << endl;
out << " -s stage1 - Stop after find_structural_bits." << endl;
out << " -s all - Run all stages." << endl;
out << "-a ARCH - Use the parser with the designated architecture (HASWELL, WESTMERE" << endl;
out << " or ARM64). By default, detects best supported architecture." << endl;
out << "-a ARCH - Use the parser with the designated architecture (HASWELL, WESTMERE," << endl;
out << " PPC64 or ARM64). By default, detects best supported architecture." << endl;
}
void exit_usage(string message) {

@@ -101,7 +101,7 @@ endif()
#
# Implementation selection
#
-set(SIMDJSON_ALL_IMPLEMENTATIONS "fallback;westmere;haswell;arm64")
+set(SIMDJSON_ALL_IMPLEMENTATIONS "fallback;westmere;haswell;arm64;ppc64")
set(SIMDJSON_IMPLEMENTATION "" CACHE STRING "Semicolon-separated list of implementations to include (${SIMDJSON_ALL_IMPLEMENTATIONS}). If this is not set, any implementations that are supported at compile time and may be selected at runtime will be included.")
foreach(implementation ${SIMDJSON_IMPLEMENTATION})
@@ -110,7 +110,7 @@ foreach(implementation ${SIMDJSON_IMPLEMENTATION})
endif()
endforeach(implementation)
-set(SIMDJSON_EXCLUDE_IMPLEMENTATION "" CACHE STRING "Semicolon-separated list of implementations to exclude (haswell/westmere/arm64/fallback). By default, excludes any implementations that are unsupported at compile time or cannot be selected at runtime.")
+set(SIMDJSON_EXCLUDE_IMPLEMENTATION "" CACHE STRING "Semicolon-separated list of implementations to exclude (haswell/westmere/arm64/ppc64/fallback). By default, excludes any implementations that are unsupported at compile time or cannot be selected at runtime.")
foreach(implementation ${SIMDJSON_EXCLUDE_IMPLEMENTATION})
if(NOT (implementation IN_LIST SIMDJSON_ALL_IMPLEMENTATIONS))
message(ERROR "Implementation ${implementation} not supported by simdjson. Possible implementations: ${SIMDJSON_ALL_IMPLEMENTATIONS}")
@@ -161,6 +161,11 @@ if(NOT SIMDJSON_IMPLEMENTATION_ARM64)
message(DEPRECATION "SIMDJSON_IMPLEMENTATION_ARM64 is deprecated. Use SIMDJSON_IMPLEMENTATION=-arm64 instead.")
target_compile_definitions(simdjson-flags INTERFACE SIMDJSON_IMPLEMENTATION_ARM64=0)
endif()
option(SIMDJSON_IMPLEMENTATION_PPC64 "Include the ppc64 implementation" ON)
if(NOT SIMDJSON_IMPLEMENTATION_PPC64)
message(DEPRECATION "SIMDJSON_IMPLEMENTATION_PPC64 is deprecated. Use SIMDJSON_IMPLEMENTATION=-ppc64 instead.")
target_compile_definitions(simdjson-flags INTERFACE SIMDJSON_IMPLEMENTATION_PPC64=0)
endif()
option(SIMDJSON_IMPLEMENTATION_FALLBACK "Include the fallback implementation" ON)
if(NOT SIMDJSON_IMPLEMENTATION_FALLBACK)
message(DEPRECATION "SIMDJSON_IMPLEMENTATION_FALLBACK is deprecated. Use SIMDJSON_IMPLEMENTATION=-fallback instead.")

@@ -25,7 +25,7 @@ An overview of what you need to know to use simdjson, with examples.
Requirements
------------------
-- A recent compiler (LLVM clang6 or better, GNU GCC 7.4 or better) on a 64-bit (ARM or x64 Intel/AMD) POSIX systems such as macOS, freeBSD or Linux. We require that the compiler supports the C++11 standard or better.
+- A recent compiler (LLVM clang6 or better, GNU GCC 7.4 or better) on a 64-bit (PPC, ARM or x64 Intel/AMD) POSIX system such as macOS, freeBSD or Linux. We require that the compiler supports the C++11 standard or better.
- Visual Studio 2017 or better under 64-bit Windows. Users should target a 64-bit build (x64) instead of a 32-bit build (x86). We support the LLVM clang compiler under Visual Studio (clangcl) as well as the regular Visual Studio compiler. We also support MinGW 64-bit under Windows.
Including simdjson

@@ -7,7 +7,7 @@ An overview of what you need to know to use simdjson, with examples.
Requirements
------------------
-- A recent compiler (LLVM clang6 or better, GNU GCC 7 or better) on a 64-bit (ARM or x64 Intel/AMD) POSIX systems such as macOS, freeBSD or Linux. We require that the compiler supports the C++11 standard or better.
+- A recent compiler (LLVM clang6 or better, GNU GCC 7 or better) on a 64-bit (PPC, ARM or x64 Intel/AMD) POSIX system such as macOS, freeBSD or Linux. We require that the compiler supports the C++11 standard or better.
- Visual Studio 2017 or better under 64-bit Windows. Users should target a 64-bit build (x64) instead of a 32-bit build (x86). We support the LLVM clang compiler under Visual Studio (clangcl) as well as the regular Visual Studio compiler.
Including simdjson

@@ -20,20 +20,22 @@ The current implementations are:
* haswell: AVX2 (2013 Intel Haswell or later)
* westmere: SSE4.2 (2010 Westmere or later).
* arm64: 64-bit ARMv8-A NEON
* ppc64: 64-bit POWER8 and POWER9 with the VSX and ALTIVEC extensions. Both big-endian and little-endian modes are implemented, depending on the compiler you use.
* fallback: A generic implementation that runs on any 64-bit processor.
In many cases, you don't know where your compiled binary is going to run, so simdjson automatically
compiles *all* the implementations into the executable. On Intel, it will include 3 implementations
-(haswell, westmere and fallback), and on ARM it will include 2 (arm64 and fallback).
+(haswell, westmere and fallback), on ARM it will include 2 (arm64 and fallback), and on PPC it will include 2 (ppc64 and fallback).
If you know more about where you're going to run and want to save the space, you can disable any of
these implementations at compile time with `-DSIMDJSON_IMPLEMENTATION_X=0` (where X is HASWELL,
-WESTMERE, ARM64 and FALLBACK).
+WESTMERE, ARM64, PPC64 and FALLBACK).
The simdjson library automatically sets header flags for each implementation as it compiles; there
is no need to set architecture-specific flags yourself (e.g., `-mavx2`, `/AVX2` or
`-march=haswell`), and it may even break runtime dispatch and your binaries will fail to run on
-older processors.
+older processors. _Note:_ for POWER9 processors, make sure you compile with `-mcpu=power9` and `-mtune=power9` to
+get maximum performance.
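To double-check the selection, you can list the compiled-in kernels and the one runtime dispatch picked. A minimal sketch; it assumes the `simdjson::available_implementations` list and the `simdjson::active_implementation` pointer described elsewhere in this documentation:

```C++
#include "simdjson.h"
#include <iostream>

int main() {
  // Every kernel compiled into this binary (e.g. ppc64 and fallback on POWER).
  for (auto implementation : simdjson::available_implementations) {
    std::cout << implementation->name() << ": "
              << implementation->description() << std::endl;
  }
  // The kernel that runtime dispatch selected on this machine.
  std::cout << "active: " << simdjson::active_implementation->name() << std::endl;
}
```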
Runtime CPU Detection
---------------------

@@ -613,7 +613,7 @@ Good applications for the On Demand API might be:
## Checking Your CPU Selection
Given that the On Demand API does not offer runtime dispatching, your code is compiled against a specific CPU target. You should
-verify that the code is compiled against the target you expect: `haswell` (AVX2 x64 processors), `westmere` (SSE4 x64 processors), `arm64` (64-bit ARM), `fallback` (others). Under x64 processors, many programmers will want to target `haswell` whereas under ARM,
+verify that the code is compiled against the target you expect: `haswell` (AVX2 x64 processors), `westmere` (SSE4 x64 processors), `arm64` (64-bit ARM), `ppc64` (64-bit POWER), `fallback` (others). Under x64 processors, many programmers will want to target `haswell` whereas under ARM,
most programmers will want to target `arm64`. The `fallback` is probably only good for testing purposes, not for deployment.
```C++
#include "simdjson.h"
#include <iostream>

int main() {
  // Minimal sketch: simdjson::builtin_implementation() (declared alongside
  // simdjson/builtin.h) reports the kernel this binary was compiled against.
  std::cout << simdjson::builtin_implementation()->name() << std::endl;
}
```

@@ -157,7 +157,7 @@ Downclocking
SIMD instructions are the public transportation of computing. Instead of using 4 distinct instructions to add numbers, you can replace them with a single instruction that does the same work. Though the one instruction is slightly more expensive, the energy used per unit of work is much less with SIMD. If you can increase your speed using SIMD instructions (NEON, SSE, AVX), you should expect to reduce your power usage.
The SIMD instructions that simdjson relies upon (SSE and AVX under x64, NEON under ARM) are routinely part of runtime libraries (e.g., [Go](https://golang.org/src/runtime/memmove_amd64.s), [Glibc](https://github.com/ihtsae/glibc/commit/5f3d0b78e011d2a72f9e88b0e9ef5bc081d18f97), [LLVM](https://github.com/llvm/llvm-project/blob/96f3ea0d21b48ca088355db10d4d1a2e9bc9f884/lldb/tools/debugserver/source/MacOSX/i386/DNBArchImplI386.cpp), [Rust](https://github.com/rust-lang/rust/commit/070fad1701fb36b112853b0a6a9787a7bb7ff34c), [Java](http://hg.openjdk.java.net/jdk8u/jdk8u/hotspot/file/c1374141598c/src/cpu/x86/vm/stubGenerator_x86_64.cpp#l1297), [PHP](https://github.com/php/php-src/blob/e5cb53ec68603d4dbdd780fd3ecfca943b4fd383/ext/standard/string.c)). What distinguishes the simdjson library is that it is built from the ground up to benefit from these instructions.
The SIMD instructions that simdjson relies upon (SSE and AVX under x64, NEON under ARM, ALTIVEC under PPC) are routinely part of runtime libraries (e.g., [Go](https://golang.org/src/runtime/memmove_amd64.s), [Glibc](https://github.com/ihtsae/glibc/commit/5f3d0b78e011d2a72f9e88b0e9ef5bc081d18f97), [LLVM](https://github.com/llvm/llvm-project/blob/96f3ea0d21b48ca088355db10d4d1a2e9bc9f884/lldb/tools/debugserver/source/MacOSX/i386/DNBArchImplI386.cpp), [Rust](https://github.com/rust-lang/rust/commit/070fad1701fb36b112853b0a6a9787a7bb7ff34c), [Java](http://hg.openjdk.java.net/jdk8u/jdk8u/hotspot/file/c1374141598c/src/cpu/x86/vm/stubGenerator_x86_64.cpp#l1297), [PHP](https://github.com/php/php-src/blob/e5cb53ec68603d4dbdd780fd3ecfca943b4fd383/ext/standard/string.c)). What distinguishes the simdjson library is that it is built from the ground up to benefit from these instructions.
You should not expect the simdjson library to cause *downclocking* of your recent Intel CPU cores.

@@ -85,6 +85,7 @@ SIMDJSON_DISABLE_UNDESIRED_WARNINGS
#include "simdjson/arm64.h"
#include "simdjson/haswell.h"
#include "simdjson/westmere.h"
#include "simdjson/ppc64.h"
#include "simdjson/fallback.h"
#include "simdjson/builtin.h"

@@ -10,6 +2,8 @@
#define SIMDJSON_BUILTIN_IMPLEMENTATION westmere
#elif SIMDJSON_CAN_ALWAYS_RUN_ARM64
#define SIMDJSON_BUILTIN_IMPLEMENTATION arm64
#elif SIMDJSON_CAN_ALWAYS_RUN_PPC64
#define SIMDJSON_BUILTIN_IMPLEMENTATION ppc64
#elif SIMDJSON_CAN_ALWAYS_RUN_FALLBACK
#define SIMDJSON_BUILTIN_IMPLEMENTATION fallback
#else

@@ -5,7 +5,7 @@
// Default Fallback to on unless a builtin implementation has already been selected.
#ifndef SIMDJSON_IMPLEMENTATION_FALLBACK
-#define SIMDJSON_IMPLEMENTATION_FALLBACK 1 // (!SIMDJSON_CAN_ALWAYS_RUN_ARM64 && !SIMDJSON_CAN_ALWAYS_RUN_HASWELL && !SIMDJSON_CAN_ALWAYS_RUN_WESTMERE)
+#define SIMDJSON_IMPLEMENTATION_FALLBACK 1 // (!SIMDJSON_CAN_ALWAYS_RUN_ARM64 && !SIMDJSON_CAN_ALWAYS_RUN_HASWELL && !SIMDJSON_CAN_ALWAYS_RUN_WESTMERE && !SIMDJSON_CAN_ALWAYS_RUN_PPC64)
#endif
#define SIMDJSON_CAN_ALWAYS_RUN_FALLBACK SIMDJSON_IMPLEMENTATION_FALLBACK

@@ -65,10 +65,17 @@ enum instruction_set {
SSE42 = 0x8,
PCLMULQDQ = 0x10,
BMI1 = 0x20,
-BMI2 = 0x40
+BMI2 = 0x40,
+ALTIVEC = 0x80
};
-#if defined(__arm__) || defined(__aarch64__) // incl. armel, armhf, arm64
+#if defined(__PPC64__)
+static inline uint32_t detect_supported_architectures() {
+return instruction_set::ALTIVEC;
+}
+#elif defined(__arm__) || defined(__aarch64__) // incl. armel, armhf, arm64
#if defined(__ARM_NEON)

@@ -43,6 +43,8 @@
#define SIMDJSON_IS_X86_64 1
#elif defined(__aarch64__) || defined(_M_ARM64)
#define SIMDJSON_IS_ARM64 1
#elif defined(__PPC64__) || defined(_M_PPC64)
#define SIMDJSON_IS_PPC64 1
#else
#define SIMDJSON_IS_32BITS 1
@@ -52,6 +54,8 @@
#define SIMDJSON_IS_X86_32BITS 1
#elif defined(__arm__) || defined(_M_ARM)
#define SIMDJSON_IS_ARM_32BITS 1
#elif defined(__PPC__) || defined(_M_PPC)
#define SIMDJSON_IS_PPC_32BITS 1
#endif
#endif // defined(__x86_64__) || defined(_M_AMD64)
@@ -61,7 +65,7 @@
for 64-bit processors and it seems that you are not \
compiling for a known 64-bit platform. All fast kernels \
will be disabled and performance may be poor. Please \
-use a 64-bit target such as x64 or 64-bit ARM.")
+use a 64-bit target such as x64, 64-bit ARM or 64-bit PPC.")
#endif // SIMDJSON_IS_32BITS
// this is almost standard?

include/simdjson/ppc64.h (new file)

@@ -0,0 +1,49 @@
#ifndef SIMDJSON_PPC64_H
#define SIMDJSON_PPC64_H
#ifdef SIMDJSON_FALLBACK_H
#error "ppc64.h must be included before fallback.h"
#endif
#include "simdjson/portability.h"
#include "simdjson/internal/isadetection.h"
#include "simdjson/internal/jsoncharutils_tables.h"
#include "simdjson/internal/numberparsing_tables.h"
#include "simdjson/internal/simdprune_tables.h"
#if SIMDJSON_IMPLEMENTATION_PPC64
namespace simdjson {
/**
* Implementation for ALTIVEC (PPC64).
*/
namespace ppc64 {
} // namespace ppc64
} // namespace simdjson
#include "simdjson/ppc64/implementation.h"
#include "simdjson/ppc64/begin.h"
// Declarations
#include "simdjson/generic/dom_parser_implementation.h"
#include "simdjson/ppc64/intrinsics.h"
#include "simdjson/ppc64/bitmanipulation.h"
#include "simdjson/ppc64/bitmask.h"
#include "simdjson/ppc64/simd.h"
#include "simdjson/generic/jsoncharutils.h"
#include "simdjson/generic/atomparsing.h"
#include "simdjson/ppc64/stringparsing.h"
#include "simdjson/ppc64/numberparsing.h"
#include "simdjson/generic/implementation_simdjson_result_base.h"
#include "simdjson/generic/ondemand.h"
// Inline definitions
#include "simdjson/generic/implementation_simdjson_result_base-inl.h"
#include "simdjson/generic/ondemand-inl.h"
#include "simdjson/ppc64/end.h"
#endif // SIMDJSON_IMPLEMENTATION_PPC64
#endif // SIMDJSON_PPC64_H

@@ -0,0 +1 @@
#define SIMDJSON_IMPLEMENTATION ppc64
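// begin.h is included before each kernel's sources: the generic/*.h files are
// compiled once per architecture inside the namespace this macro names, and
// ppc64/end.h undefines it again.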

@@ -0,0 +1,70 @@
#ifndef SIMDJSON_PPC64_BITMANIPULATION_H
#define SIMDJSON_PPC64_BITMANIPULATION_H
namespace simdjson {
namespace SIMDJSON_IMPLEMENTATION {
namespace {
// We sometimes call trailing_zero on inputs that are zero,
// but the algorithms do not end up using the returned value.
// Sadly, sanitizers are not smart enough to figure it out.
NO_SANITIZE_UNDEFINED
simdjson_really_inline int trailing_zeroes(uint64_t input_num) {
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
unsigned long ret;
// Search the mask data from least significant bit (LSB)
// to the most significant bit (MSB) for a set bit (1).
_BitScanForward64(&ret, input_num);
return (int)ret;
#else // SIMDJSON_REGULAR_VISUAL_STUDIO
return __builtin_ctzll(input_num);
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
}
/* result might be undefined when input_num is zero */
simdjson_really_inline uint64_t clear_lowest_bit(uint64_t input_num) {
return input_num & (input_num - 1);
}
/* result might be undefined when input_num is zero */
simdjson_really_inline int leading_zeroes(uint64_t input_num) {
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
unsigned long leading_zero = 0;
// Search the mask data from most significant bit (MSB)
// to least significant bit (LSB) for a set bit (1).
if (_BitScanReverse64(&leading_zero, input_num))
return (int)(63 - leading_zero);
else
return 64;
#else
return __builtin_clzll(input_num);
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
}
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
simdjson_really_inline int count_ones(uint64_t input_num) {
// note: we do not support legacy 32-bit Windows
return __popcnt64(input_num); // Visual Studio wants two underscores
}
#else
simdjson_really_inline int count_ones(uint64_t input_num) {
return __builtin_popcountll(input_num);
}
#endif
simdjson_really_inline bool add_overflow(uint64_t value1, uint64_t value2,
uint64_t *result) {
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
*result = value1 + value2;
return *result < value1;
#else
return __builtin_uaddll_overflow(value1, value2,
(unsigned long long *)result);
#endif
}
} // unnamed namespace
} // namespace SIMDJSON_IMPLEMENTATION
} // namespace simdjson
#endif // SIMDJSON_PPC64_BITMANIPULATION_H

@@ -0,0 +1,42 @@
#ifndef SIMDJSON_PPC64_BITMASK_H
#define SIMDJSON_PPC64_BITMASK_H
namespace simdjson {
namespace SIMDJSON_IMPLEMENTATION {
namespace {
//
// Perform a "cumulative bitwise xor," flipping bits each time a 1 is
// encountered.
//
// For example, prefix_xor(00100100) == 00011100
//
simdjson_really_inline uint64_t prefix_xor(uint64_t bitmask) {
// You can use the version below; however, gcc sometimes miscompiles
// vec_pmsum_be (somewhere between versions 8 and 9), and the performance
// boost was not noticeable anyway, so we fall back to the usual
// implementation.
// __vector unsigned long long all_ones = {~0ull, ~0ull};
// __vector unsigned long long mask = {bitmask, 0};
// // Clang and GCC return different values for pmsum for ull so cast it to one.
// // Generally it is not specified by ALTIVEC ISA what is returned by
// // vec_pmsum_be.
// #if defined(__LITTLE_ENDIAN__)
// return (uint64_t)(((__vector unsigned long long)vec_pmsum_be(all_ones, mask))[0]);
// #else
// return (uint64_t)(((__vector unsigned long long)vec_pmsum_be(all_ones, mask))[1]);
// #endif
bitmask ^= bitmask << 1;
bitmask ^= bitmask << 2;
bitmask ^= bitmask << 4;
bitmask ^= bitmask << 8;
bitmask ^= bitmask << 16;
bitmask ^= bitmask << 32;
return bitmask;
}
} // unnamed namespace
} // namespace SIMDJSON_IMPLEMENTATION
} // namespace simdjson
#endif // SIMDJSON_PPC64_BITMASK_H
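As a quick check of the cascade above, here is a self-contained sketch (a hypothetical test file, not part of this commit) comparing it with a naive bit-by-bit prefix XOR:

```C++
#include <cassert>
#include <cstdint>

// Same shift-xor cascade as prefix_xor above: bit i of the result is the
// XOR of input bits 0 through i.
static uint64_t prefix_xor_scalar(uint64_t bitmask) {
  bitmask ^= bitmask << 1;
  bitmask ^= bitmask << 2;
  bitmask ^= bitmask << 4;
  bitmask ^= bitmask << 8;
  bitmask ^= bitmask << 16;
  bitmask ^= bitmask << 32;
  return bitmask;
}

int main() {
  // The example from the comment: prefix_xor(00100100) == 00011100.
  assert(prefix_xor_scalar(0b00100100) == 0b00011100);
  // Cross-check against a naive bit-by-bit prefix XOR.
  uint64_t x = 0x9e3779b97f4a7c15ULL;
  uint64_t expected = 0, running = 0;
  for (int i = 0; i < 64; i++) {
    running ^= (x >> i) & 1;
    expected |= running << i;
  }
  assert(prefix_xor_scalar(x) == expected);
}
```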

@@ -0,0 +1 @@
#undef SIMDJSON_IMPLEMENTATION

@@ -0,0 +1,34 @@
#ifndef SIMDJSON_PPC64_IMPLEMENTATION_H
#define SIMDJSON_PPC64_IMPLEMENTATION_H
#include "simdjson.h"
#include "simdjson/internal/isadetection.h"
namespace simdjson {
namespace ppc64 {
namespace {
using namespace simdjson;
using namespace simdjson::dom;
} // namespace
class implementation final : public simdjson::implementation {
public:
simdjson_really_inline implementation()
: simdjson::implementation("ppc64", "PPC64 ALTIVEC",
internal::instruction_set::ALTIVEC) {}
simdjson_warn_unused error_code create_dom_parser_implementation(
size_t capacity, size_t max_length,
std::unique_ptr<internal::dom_parser_implementation> &dst)
const noexcept final;
simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len,
uint8_t *dst,
size_t &dst_len) const noexcept final;
simdjson_warn_unused bool validate_utf8(const char *buf,
size_t len) const noexcept final;
};
} // namespace ppc64
} // namespace simdjson
#endif // SIMDJSON_PPC64_IMPLEMENTATION_H

@@ -0,0 +1,19 @@
#ifndef SIMDJSON_PPC64_INTRINSICS_H
#define SIMDJSON_PPC64_INTRINSICS_H
#include "simdjson.h"
// This should be the correct header whether
// you use visual studio or other compilers.
#include <altivec.h>
// These are defined by altivec.h in the GCC toolchain; it is safe to undef them.
#ifdef bool
#undef bool
#endif
#ifdef vector
#undef vector
#endif
#endif // SIMDJSON_PPC64_INTRINSICS_H

@@ -0,0 +1,32 @@
#ifndef SIMDJSON_PPC64_NUMBERPARSING_H
#define SIMDJSON_PPC64_NUMBERPARSING_H
#include <byteswap.h>
namespace simdjson {
namespace SIMDJSON_IMPLEMENTATION {
namespace {
// we don't have appropriate SIMD instructions, so let us use a scalar function
// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
static simdjson_really_inline uint32_t
parse_eight_digits_unrolled(const uint8_t *chars) {
uint64_t val;
memcpy(&val, chars, sizeof(uint64_t));
#ifdef __BIG_ENDIAN__
val = bswap_64(val);
#endif
val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8;
val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16;
return uint32_t((val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32);
}
} // unnamed namespace
} // namespace SIMDJSON_IMPLEMENTATION
} // namespace simdjson
#define SWAR_NUMBER_PARSING
#include "simdjson/generic/numberparsing.h"
#endif // SIMDJSON_PPC64_NUMBERPARSING_H
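The three multiply-shift steps above merge adjacent digits, then digit pairs, then both four-digit halves. A self-contained sketch of the same reduction (a hypothetical test file, not part of this commit; it assumes a little-endian host, which the original handles with bswap_64):

```C++
#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t parse_eight_digits(const char *chars) {
  uint64_t val;
  std::memcpy(&val, chars, sizeof(val));
  // '0'..'9' -> 0..9 per byte, then combine neighbors: d0*10 + d1 per pair.
  val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8;     // 2561 = 10*256 + 1
  val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16; // 6553601 = 100*65536 + 1
  return uint32_t((val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32); // 10000*2^32 + 1
}

int main() {
  assert(parse_eight_digits("12345678") == 12345678);
  assert(parse_eight_digits("00000042") == 42);
}
```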

@@ -0,0 +1,471 @@
#ifndef SIMDJSON_PPC64_SIMD_H
#define SIMDJSON_PPC64_SIMD_H
#include "simdjson.h"
#include "simdjson/internal/simdprune_tables.h"
#include "simdjson/ppc64/bitmanipulation.h"
#include <type_traits>
namespace simdjson {
namespace SIMDJSON_IMPLEMENTATION {
namespace {
namespace simd {
using __m128i = __vector unsigned char;
template <typename Child> struct base {
__m128i value;
// Zero constructor
simdjson_really_inline base() : value{__m128i()} {}
// Conversion from SIMD register
simdjson_really_inline base(const __m128i _value) : value(_value) {}
// Conversion to SIMD register
simdjson_really_inline operator const __m128i &() const {
return this->value;
}
simdjson_really_inline operator __m128i &() { return this->value; }
// Bit operations
simdjson_really_inline Child operator|(const Child other) const {
return vec_or(this->value, (__m128i)other);
}
simdjson_really_inline Child operator&(const Child other) const {
return vec_and(this->value, (__m128i)other);
}
simdjson_really_inline Child operator^(const Child other) const {
return vec_xor(this->value, (__m128i)other);
}
simdjson_really_inline Child bit_andnot(const Child other) const {
return vec_andc(this->value, (__m128i)other);
}
simdjson_really_inline Child &operator|=(const Child other) {
auto this_cast = (Child *)this;
*this_cast = *this_cast | other;
return *this_cast;
}
simdjson_really_inline Child &operator&=(const Child other) {
auto this_cast = (Child *)this;
*this_cast = *this_cast & other;
return *this_cast;
}
simdjson_really_inline Child &operator^=(const Child other) {
auto this_cast = (Child *)this;
*this_cast = *this_cast ^ other;
return *this_cast;
}
};
// Forward-declared so they can be used by splat and friends.
template <typename T> struct simd8;
template <typename T, typename Mask = simd8<bool>>
struct base8 : base<simd8<T>> {
typedef uint16_t bitmask_t;
typedef uint32_t bitmask2_t;
simdjson_really_inline base8() : base<simd8<T>>() {}
simdjson_really_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
simdjson_really_inline Mask operator==(const simd8<T> other) const {
return (__m128i)vec_cmpeq(this->value, (__m128i)other);
}
static const int SIZE = sizeof(base<simd8<T>>::value);
template <int N = 1>
simdjson_really_inline simd8<T> prev(simd8<T> prev_chunk) const {
__m128i chunk = this->value;
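// vec_sld concatenates two vectors and extracts 16 bytes at a byte offset,
// which assumes big-endian lane order; on little-endian targets we reverse
// both inputs (and the result below) so that the last N bytes of prev_chunk
// end up in front of the first 16-N bytes of this chunk.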
#ifdef __LITTLE_ENDIAN__
chunk = (__m128i)vec_reve(this->value);
prev_chunk = (__m128i)vec_reve((__m128i)prev_chunk);
#endif
chunk = (__m128i)vec_sld((__m128i)prev_chunk, (__m128i)chunk, 16 - N);
#ifdef __LITTLE_ENDIAN__
chunk = (__m128i)vec_reve((__m128i)chunk);
#endif
return chunk;
}
};
// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd8<bool> : base8<bool> {
static simdjson_really_inline simd8<bool> splat(bool _value) {
return (__m128i)vec_splats((unsigned char)(-(!!_value)));
}
simdjson_really_inline simd8<bool>() : base8() {}
simdjson_really_inline simd8<bool>(const __m128i _value)
: base8<bool>(_value) {}
// Splat constructor
simdjson_really_inline simd8<bool>(bool _value)
: base8<bool>(splat(_value)) {}
simdjson_really_inline int to_bitmask() const {
__vector unsigned long long result;
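// vec_vbpermq gathers one bit per byte: each perm_mask entry is a bit index
// into the 128-bit vector, and 0x78, 0x70, ..., 0x00 pick the most
// significant bit of each byte, packing the comparison mask into 16 bits.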
const __m128i perm_mask = {0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};
result = ((__vector unsigned long long)vec_vbpermq((__m128i)this->value,
(__m128i)perm_mask));
#ifdef __LITTLE_ENDIAN__
return static_cast<int>(result[1]);
#else
return static_cast<int>(result[0]);
#endif
}
simdjson_really_inline bool any() const {
return !vec_all_eq(this->value, (__m128i)vec_splats(0));
}
simdjson_really_inline simd8<bool> operator~() const {
return this->value ^ (__m128i)splat(true);
}
};
template <typename T> struct base8_numeric : base8<T> {
static simdjson_really_inline simd8<T> splat(T value) {
(void)value;
return (__m128i)vec_splats(value);
}
static simdjson_really_inline simd8<T> zero() { return splat(0); }
static simdjson_really_inline simd8<T> load(const T values[16]) {
return (__m128i)(vec_vsx_ld(0, (const uint8_t *)values));
}
// Repeat 16 values as many times as necessary (usually for lookup tables)
static simdjson_really_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
T v5, T v6, T v7, T v8, T v9,
T v10, T v11, T v12, T v13,
T v14, T v15) {
return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
v14, v15);
}
simdjson_really_inline base8_numeric() : base8<T>() {}
simdjson_really_inline base8_numeric(const __m128i _value)
: base8<T>(_value) {}
// Store to array
simdjson_really_inline void store(T dst[16]) const {
vec_vsx_st(this->value, 0, reinterpret_cast<__m128i *>(dst));
}
// Override to distinguish from bool version
simdjson_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
// Addition/subtraction are the same for signed and unsigned
simdjson_really_inline simd8<T> operator+(const simd8<T> other) const {
return (__m128i)((__m128i)this->value + (__m128i)other);
}
simdjson_really_inline simd8<T> operator-(const simd8<T> other) const {
return (__m128i)((__m128i)this->value - (__m128i)other);
}
simdjson_really_inline simd8<T> &operator+=(const simd8<T> other) {
*this = *this + other;
return *(simd8<T> *)this;
}
simdjson_really_inline simd8<T> &operator-=(const simd8<T> other) {
*this = *this - other;
return *(simd8<T> *)this;
}
// Perform a lookup assuming the value is between 0 and 16 (undefined behavior
// for out of range values)
template <typename L>
simdjson_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
return (__m128i)vec_perm((__m128i)lookup_table, (__m128i)lookup_table, this->value);
}
// Copies to 'output' all bytes corresponding to a 0 in the mask (interpreted
// as a bitset). Passing a 0 value for mask would be equivalent to writing out
// every byte to output. Only the first 16 - count_ones(mask) bytes of the
// result are significant but 16 bytes get written. Design consideration: it
// seems like a function with the signature simd8<L> compress(uint32_t mask)
// would be sensible, but the AVX ISA makes this kind of approach difficult.
template <typename L>
simdjson_really_inline void compress(uint16_t mask, L *output) const {
using internal::BitsSetTable256mul2;
using internal::pshufb_combine_table;
using internal::thintable_epi8;
// this particular implementation was inspired by work done by @animetosho
// we do it in two steps, first 8 bytes and then second 8 bytes
uint8_t mask1 = uint8_t(mask); // least significant 8 bits
uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
// next line just loads the 64-bit values thintable_epi8[mask1] and
// thintable_epi8[mask2] into a 128-bit register, using only
// two instructions on most compilers.
#ifdef __LITTLE_ENDIAN__
__m128i shufmask = (__m128i)(__vector unsigned long long){
thintable_epi8[mask1], thintable_epi8[mask2]};
#else
__m128i shufmask = (__m128i)(__vector unsigned long long){
thintable_epi8[mask2], thintable_epi8[mask1]};
shufmask = (__m128i)vec_reve((__m128i)shufmask);
#endif
// we increment by 0x08 the second half of the mask
shufmask = ((__m128i)shufmask) +
((__m128i)(__vector int){0, 0, 0x08080808, 0x08080808});
// this is the version "nearly pruned"
__m128i pruned = vec_perm(this->value, this->value, shufmask);
// we still need to put the two halves together.
// we compute the popcount of the first half:
int pop1 = BitsSetTable256mul2[mask1];
// then load the corresponding mask, what it does is to write
// only the first pop1 bytes from the first 8 bytes, and then
// it fills in with the bytes from the second 8 bytes + some filling
// at the end.
__m128i compactmask =
vec_vsx_ld(0, (const uint8_t *)(pshufb_combine_table + pop1 * 8));
__m128i answer = vec_perm(pruned, (__m128i)vec_splats(0), compactmask);
vec_vsx_st(answer, 0, (__m128i *)(output));
}
template <typename L>
simdjson_really_inline simd8<L>
lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
L replace5, L replace6, L replace7, L replace8, L replace9,
L replace10, L replace11, L replace12, L replace13, L replace14,
L replace15) const {
return lookup_16(simd8<L>::repeat_16(
replace0, replace1, replace2, replace3, replace4, replace5, replace6,
replace7, replace8, replace9, replace10, replace11, replace12,
replace13, replace14, replace15));
}
};
// Signed bytes
template <> struct simd8<int8_t> : base8_numeric<int8_t> {
simdjson_really_inline simd8() : base8_numeric<int8_t>() {}
simdjson_really_inline simd8(const __m128i _value)
: base8_numeric<int8_t>(_value) {}
// Splat constructor
simdjson_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
// Array constructor
simdjson_really_inline simd8(const int8_t *values) : simd8(load(values)) {}
// Member-by-member initialization
simdjson_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
int8_t v4, int8_t v5, int8_t v6, int8_t v7,
int8_t v8, int8_t v9, int8_t v10, int8_t v11,
int8_t v12, int8_t v13, int8_t v14, int8_t v15)
: simd8((__m128i)(__vector signed char){v0, v1, v2, v3, v4, v5, v6, v7,
v8, v9, v10, v11, v12, v13, v14,
v15}) {}
// Repeat 16 values as many times as necessary (usually for lookup tables)
simdjson_really_inline static simd8<int8_t>
repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5,
int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11,
int8_t v12, int8_t v13, int8_t v14, int8_t v15) {
return simd8<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
v13, v14, v15);
}
// Order-sensitive comparisons
simdjson_really_inline simd8<int8_t>
max_val(const simd8<int8_t> other) const {
return (__m128i)vec_max((__vector signed char)this->value,
(__vector signed char)(__m128i)other);
}
simdjson_really_inline simd8<int8_t>
min_val(const simd8<int8_t> other) const {
return (__m128i)vec_min((__vector signed char)this->value,
(__vector signed char)(__m128i)other);
}
simdjson_really_inline simd8<bool>
operator>(const simd8<int8_t> other) const {
return (__m128i)vec_cmpgt((__vector signed char)this->value,
(__vector signed char)(__m128i)other);
}
simdjson_really_inline simd8<bool>
operator<(const simd8<int8_t> other) const {
return (__m128i)vec_cmplt((__vector signed char)this->value,
(__vector signed char)(__m128i)other);
}
};
// Unsigned bytes
template <> struct simd8<uint8_t> : base8_numeric<uint8_t> {
simdjson_really_inline simd8() : base8_numeric<uint8_t>() {}
simdjson_really_inline simd8(const __m128i _value)
: base8_numeric<uint8_t>(_value) {}
// Splat constructor
simdjson_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
// Array constructor
simdjson_really_inline simd8(const uint8_t *values) : simd8(load(values)) {}
// Member-by-member initialization
simdjson_really_inline
simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
: simd8((__m128i){v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
v13, v14, v15}) {}
// Repeat 16 values as many times as necessary (usually for lookup tables)
simdjson_really_inline static simd8<uint8_t>
repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
uint8_t v15) {
return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
v13, v14, v15);
}
// Saturated math
simdjson_really_inline simd8<uint8_t>
saturating_add(const simd8<uint8_t> other) const {
return (__m128i)vec_adds(this->value, (__m128i)other);
}
simdjson_really_inline simd8<uint8_t>
saturating_sub(const simd8<uint8_t> other) const {
return (__m128i)vec_subs(this->value, (__m128i)other);
}
// Order-specific operations
simdjson_really_inline simd8<uint8_t>
max_val(const simd8<uint8_t> other) const {
return (__m128i)vec_max(this->value, (__m128i)other);
}
simdjson_really_inline simd8<uint8_t>
min_val(const simd8<uint8_t> other) const {
return (__m128i)vec_min(this->value, (__m128i)other);
}
// Same as >, but only guarantees true is nonzero (< guarantees true = -1)
simdjson_really_inline simd8<uint8_t>
gt_bits(const simd8<uint8_t> other) const {
return this->saturating_sub(other);
}
// Same as <, but only guarantees true is nonzero (< guarantees true = -1)
simdjson_really_inline simd8<uint8_t>
lt_bits(const simd8<uint8_t> other) const {
return other.saturating_sub(*this);
}
simdjson_really_inline simd8<bool>
operator<=(const simd8<uint8_t> other) const {
return other.max_val(*this) == other;
}
simdjson_really_inline simd8<bool>
operator>=(const simd8<uint8_t> other) const {
return other.min_val(*this) == other;
}
simdjson_really_inline simd8<bool>
operator>(const simd8<uint8_t> other) const {
return this->gt_bits(other).any_bits_set();
}
simdjson_really_inline simd8<bool>
operator<(const simd8<uint8_t> other) const {
return this->lt_bits(other).any_bits_set();
}
// Bit-specific operations
simdjson_really_inline simd8<bool> bits_not_set() const {
return (__m128i)vec_cmpeq(this->value, (__m128i)vec_splats(uint8_t(0)));
}
simdjson_really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const {
return (*this & bits).bits_not_set();
}
simdjson_really_inline simd8<bool> any_bits_set() const {
return ~this->bits_not_set();
}
simdjson_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const {
return ~this->bits_not_set(bits);
}
simdjson_really_inline bool bits_not_set_anywhere() const {
return vec_all_eq(this->value, (__m128i)vec_splats(0));
}
simdjson_really_inline bool any_bits_set_anywhere() const {
return !bits_not_set_anywhere();
}
simdjson_really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const {
return vec_all_eq(vec_and(this->value, (__m128i)bits),
(__m128i)vec_splats(0));
}
simdjson_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const {
return !bits_not_set_anywhere(bits);
}
template <int N> simdjson_really_inline simd8<uint8_t> shr() const {
return simd8<uint8_t>(
(__m128i)vec_sr(this->value, (__m128i)vec_splat_u8(N)));
}
template <int N> simdjson_really_inline simd8<uint8_t> shl() const {
return simd8<uint8_t>(
(__m128i)vec_sl(this->value, (__m128i)vec_splat_u8(N)));
}
};
template <typename T> struct simd8x64 {
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
static_assert(NUM_CHUNKS == 4,
"Westmere kernel should use four registers per 64-byte block.");
const simd8<T> chunks[NUM_CHUNKS];
simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
simd8x64<T> &
operator=(const simd8<T> other) = delete; // no assignment allowed
simd8x64() = delete; // no default constructor allowed
simdjson_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
const simd8<T> chunk2, const simd8<T> chunk3)
: chunks{chunk0, chunk1, chunk2, chunk3} {}
simdjson_really_inline simd8x64(const T ptr[64])
: chunks{simd8<T>::load(ptr), simd8<T>::load(ptr + 16),
simd8<T>::load(ptr + 32), simd8<T>::load(ptr + 48)} {}
simdjson_really_inline void store(T ptr[64]) const {
this->chunks[0].store(ptr + sizeof(simd8<T>) * 0);
this->chunks[1].store(ptr + sizeof(simd8<T>) * 1);
this->chunks[2].store(ptr + sizeof(simd8<T>) * 2);
this->chunks[3].store(ptr + sizeof(simd8<T>) * 3);
}
simdjson_really_inline simd8<T> reduce_or() const {
return (this->chunks[0] | this->chunks[1]) |
(this->chunks[2] | this->chunks[3]);
}
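// Each chunk appends right after the bytes kept by earlier chunks: the
// output offset is the input offset (16, 32, 48) minus the bytes already
// discarded, i.e. the popcount of the lower mask bits.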
simdjson_really_inline void compress(uint64_t mask, T *output) const {
this->chunks[0].compress(uint16_t(mask), output);
this->chunks[1].compress(uint16_t(mask >> 16),
output + 16 - count_ones(mask & 0xFFFF));
this->chunks[2].compress(uint16_t(mask >> 32),
output + 32 - count_ones(mask & 0xFFFFFFFF));
this->chunks[3].compress(uint16_t(mask >> 48),
output + 48 - count_ones(mask & 0xFFFFFFFFFFFF));
}
simdjson_really_inline uint64_t to_bitmask() const {
uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
uint64_t r1 = this->chunks[1].to_bitmask();
uint64_t r2 = this->chunks[2].to_bitmask();
uint64_t r3 = this->chunks[3].to_bitmask();
return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
}
simdjson_really_inline uint64_t eq(const T m) const {
const simd8<T> mask = simd8<T>::splat(m);
return simd8x64<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
this->chunks[2] == mask, this->chunks[3] == mask)
.to_bitmask();
}
simdjson_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
return simd8x64<bool>(this->chunks[0] == other.chunks[0],
this->chunks[1] == other.chunks[1],
this->chunks[2] == other.chunks[2],
this->chunks[3] == other.chunks[3])
.to_bitmask();
}
simdjson_really_inline uint64_t lteq(const T m) const {
const simd8<T> mask = simd8<T>::splat(m);
return simd8x64<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
this->chunks[2] <= mask, this->chunks[3] <= mask)
.to_bitmask();
}
}; // struct simd8x64<T>
} // namespace simd
} // unnamed namespace
} // namespace SIMDJSON_IMPLEMENTATION
} // namespace simdjson
#endif // SIMDJSON_PPC64_SIMD_H

@@ -0,0 +1,65 @@
#ifndef SIMDJSON_PPC64_STRINGPARSING_H
#define SIMDJSON_PPC64_STRINGPARSING_H
#include "simdjson.h"
#include "simdjson/ppc64/bitmanipulation.h"
#include "simdjson/ppc64/simd.h"
namespace simdjson {
namespace SIMDJSON_IMPLEMENTATION {
namespace {
using namespace simd;
// Holds backslashes and quotes locations.
struct backslash_and_quote {
public:
static constexpr uint32_t BYTES_PROCESSED = 32;
simdjson_really_inline static backslash_and_quote
copy_and_find(const uint8_t *src, uint8_t *dst);
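// bs_bits - 1 sets every bit below the first backslash (all bits if there
// is none), so ANDing it with quote_bits asks whether a quote occurs before
// the first backslash.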
simdjson_really_inline bool has_quote_first() {
return ((bs_bits - 1) & quote_bits) != 0;
}
simdjson_really_inline bool has_backslash() { return bs_bits != 0; }
simdjson_really_inline int quote_index() {
return trailing_zeroes(quote_bits);
}
simdjson_really_inline int backslash_index() {
return trailing_zeroes(bs_bits);
}
uint32_t bs_bits;
uint32_t quote_bits;
}; // struct backslash_and_quote
simdjson_really_inline backslash_and_quote
backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) {
// this can read up to 31 bytes beyond the buffer size, but we require
// SIMDJSON_PADDING of padding
static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1),
"backslash and quote finder must process fewer than "
"SIMDJSON_PADDING bytes");
simd8<uint8_t> v0(src);
simd8<uint8_t> v1(src + sizeof(v0));
v0.store(dst);
v1.store(dst + sizeof(v0));
// Getting a 64-bit bitmask is much cheaper than multiple 16-bit bitmasks on
// PPC; therefore, we smash them together into a 64-byte mask and get the
// bitmask from there.
uint64_t bs_and_quote =
simd8x64<bool>(v0 == '\\', v1 == '\\', v0 == '"', v1 == '"').to_bitmask();
return {
uint32_t(bs_and_quote), // bs_bits
uint32_t(bs_and_quote >> 32) // quote_bits
};
}
} // unnamed namespace
} // namespace SIMDJSON_IMPLEMENTATION
} // namespace simdjson
#include "simdjson/generic/stringparsing.h"
#endif // SIMDJSON_PPC64_STRINGPARSING_H

@@ -23,6 +23,9 @@ const westmere::implementation westmere_singleton{};
#if SIMDJSON_IMPLEMENTATION_ARM64
const arm64::implementation arm64_singleton{};
#endif // SIMDJSON_IMPLEMENTATION_ARM64
#if SIMDJSON_IMPLEMENTATION_PPC64
const ppc64::implementation ppc64_singleton{};
#endif // SIMDJSON_IMPLEMENTATION_PPC64
#if SIMDJSON_IMPLEMENTATION_FALLBACK
const fallback::implementation fallback_singleton{};
#endif // SIMDJSON_IMPLEMENTATION_FALLBACK
@@ -65,6 +68,9 @@ const std::initializer_list<const implementation *> available_implementation_poi
#if SIMDJSON_IMPLEMENTATION_ARM64
&arm64_singleton,
#endif
#if SIMDJSON_IMPLEMENTATION_PPC64
&ppc64_singleton,
#endif
#if SIMDJSON_IMPLEMENTATION_FALLBACK
&fallback_singleton,
#endif

@@ -1,4 +1,4 @@
-#if SIMDJSON_IMPLEMENTATION_ARM64 || SIMDJSON_IMPLEMENTATION_HASWELL || SIMDJSON_IMPLEMENTATION_WESTMERE
+#if SIMDJSON_IMPLEMENTATION_ARM64 || SIMDJSON_IMPLEMENTATION_HASWELL || SIMDJSON_IMPLEMENTATION_WESTMERE || SIMDJSON_IMPLEMENTATION_PPC64
#include <cstdint>
@@ -23,16 +23,16 @@ SIMDJSON_DLLIMPORTEXPORT const unsigned char BitsSetTable256mul2[256] = {
SIMDJSON_DLLIMPORTEXPORT const uint8_t pshufb_combine_table[272] = {
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x08,
-0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x80, 0x00, 0x01, 0x02, 0x03,
-0x04, 0x05, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x80, 0x80,
+0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0x00, 0x01, 0x02, 0x03,
+0x04, 0x05, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff,
0x00, 0x01, 0x02, 0x03, 0x04, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
-0x0f, 0x80, 0x80, 0x80, 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
-0x0c, 0x0d, 0x0e, 0x0f, 0x80, 0x80, 0x80, 0x80, 0x00, 0x01, 0x02, 0x08,
-0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x80, 0x80, 0x80, 0x80, 0x80,
-0x00, 0x01, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x80, 0x80,
-0x80, 0x80, 0x80, 0x80, 0x00, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
-0x0f, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x08, 0x09, 0x0a, 0x0b,
-0x0c, 0x0d, 0x0e, 0x0f, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+0x0f, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
+0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x08,
+0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff,
+0x00, 0x01, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff,
+0xff, 0xff, 0xff, 0xff, 0x00, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
+0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x08, 0x09, 0x0a, 0x0b,
+0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
};
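// In the table above, the filler entries are 0xff rather than 0x80 (the x86
// variant) because vec_perm on ppc64 honors only the low five bits of each
// index: 0xff selects a byte from the zeroed second operand, while 0x80
// would wrongly select byte 0 of the first. pshufb on x86 zeroes any lane
// whose index has the high bit set, so 0xff stays correct there as well.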
// 256 * 8 bytes = 2kB, easily fits in cache.
@@ -128,4 +128,4 @@ SIMDJSON_DLLIMPORTEXPORT const uint64_t thintable_epi8[256] = {
} // namespace internal
} // namespace simdjson
-#endif // SIMDJSON_IMPLEMENTATION_ARM64 || SIMDJSON_IMPLEMENTATION_HASWELL || SIMDJSON_IMPLEMENTATION_WESTMERE
+#endif // SIMDJSON_IMPLEMENTATION_ARM64 || SIMDJSON_IMPLEMENTATION_HASWELL || SIMDJSON_IMPLEMENTATION_WESTMERE || SIMDJSON_IMPLEMENTATION_PPC64

@@ -0,0 +1,133 @@
#include "simdjson/ppc64/begin.h"
//
// Stage 1
//
namespace simdjson {
namespace SIMDJSON_IMPLEMENTATION {
namespace {
using namespace simd;
struct json_character_block {
static simdjson_really_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
simdjson_really_inline uint64_t whitespace() const { return _whitespace; }
simdjson_really_inline uint64_t op() const { return _op; }
simdjson_really_inline uint64_t scalar() { return ~(op() | whitespace()); }
uint64_t _whitespace;
uint64_t _op;
};
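// Each input byte is looked up twice, by its low nibble in table1 and by its
// high nibble in table2; a byte is classified only where both lookups agree
// (the bitwise AND below). Bits 0x7 mark structural characters and bits 0x18
// mark whitespace, the same nibble-lookup trick as the arm64 and westmere
// kernels.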
simdjson_really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
const simd8<uint8_t> table1(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
const simd8<uint8_t> table2(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);
simd8x64<uint8_t> v(
(in.chunks[0] & 0xf).lookup_16(table1) & (in.chunks[0].shr<4>()).lookup_16(table2),
(in.chunks[1] & 0xf).lookup_16(table1) & (in.chunks[1].shr<4>()).lookup_16(table2),
(in.chunks[2] & 0xf).lookup_16(table1) & (in.chunks[2].shr<4>()).lookup_16(table2),
(in.chunks[3] & 0xf).lookup_16(table1) & (in.chunks[3].shr<4>()).lookup_16(table2)
);
uint64_t op = simd8x64<bool>(
v.chunks[0].any_bits_set(0x7),
v.chunks[1].any_bits_set(0x7),
v.chunks[2].any_bits_set(0x7),
v.chunks[3].any_bits_set(0x7)
).to_bitmask();
uint64_t whitespace = simd8x64<bool>(
v.chunks[0].any_bits_set(0x18),
v.chunks[1].any_bits_set(0x18),
v.chunks[2].any_bits_set(0x18),
v.chunks[3].any_bits_set(0x18)
).to_bitmask();
return { whitespace, op };
}
simdjson_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
return input.reduce_or().saturating_sub(0b10000000u).bits_not_set_anywhere();
}
simdjson_unused simdjson_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
// Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
}
simdjson_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
// Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
}
} // unnamed namespace
} // namespace SIMDJSON_IMPLEMENTATION
} // namespace simdjson
#include "generic/stage1/utf8_lookup4_algorithm.h"
#include "generic/stage1/json_structural_indexer.h"
#include "generic/stage1/utf8_validator.h"
//
// Stage 2
//
#include "generic/stage2/tape_builder.h"
//
// Implementation-specific overrides
//
namespace simdjson {
namespace SIMDJSON_IMPLEMENTATION {
namespace {
namespace stage1 {
simdjson_really_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) {
// On PPC, we don't short-circuit this if there are no backslashes, because the branch gives us no
// benefit and therefore makes things worse.
// if (!backslash) { uint64_t escaped = prev_escaped; prev_escaped = 0; return escaped; }
return find_escaped_branchless(backslash);
}
} // namespace stage1
} // unnamed namespace
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
return ppc64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
}
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
this->buf = _buf;
this->len = _len;
return ppc64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming);
}
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
return ppc64::stage1::generic_validate_utf8(buf,len);
}
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
return stage2::tape_builder::parse_document<false>(*this, _doc);
}
simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
return stage2::tape_builder::parse_document<true>(*this, _doc);
}
simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
auto error = stage1(_buf, _len, false);
if (error) { return error; }
return stage2(_doc);
}
} // namespace SIMDJSON_IMPLEMENTATION
} // namespace simdjson
#include "simdjson/ppc64/end.h"

@@ -0,0 +1,21 @@
#include "simdjson/ppc64/begin.h"
namespace simdjson {
namespace SIMDJSON_IMPLEMENTATION {
simdjson_warn_unused error_code implementation::create_dom_parser_implementation(
size_t capacity,
size_t max_depth,
std::unique_ptr<internal::dom_parser_implementation>& dst
) const noexcept {
dst.reset( new (std::nothrow) dom_parser_implementation() );
if (!dst) { return MEMALLOC; }
dst->set_capacity(capacity);
dst->set_max_depth(max_depth);
return SUCCESS;
}
} // namespace SIMDJSON_IMPLEMENTATION
} // namespace simdjson
#include "simdjson/ppc64/end.h"

@@ -23,6 +23,10 @@ SIMDJSON_DISABLE_UNDESIRED_WARNINGS
#include "haswell/implementation.cpp"
#include "haswell/dom_parser_implementation.cpp"
#endif
#if SIMDJSON_IMPLEMENTATION_PPC64
#include "ppc64/implementation.cpp"
#include "ppc64/dom_parser_implementation.cpp"
#endif
#if SIMDJSON_IMPLEMENTATION_WESTMERE
#include "westmere/implementation.cpp"
#include "westmere/dom_parser_implementation.cpp"