simdjson/fuzz/fuzz_implementations.cpp

/*
 * For fuzzing all of the implementations supported at runtime
 * (e.g. haswell/westmere/fallback on x64), finding any difference
 * between the output of each, which would indicate inconsistency.
 * Also, it gives the non-default backends some fuzzing love.
 *
 * Copyright Paul Dreik 20200909 for the simdjson project.
 */

#include "simdjson.h"
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>
#include "supported_implementations.h"


// Bundles one backend implementation together with its intermediate
// results (parser, parsed element, error code, serialized output).
// Keeping them side by side makes things easier to debug in case this
// fuzzer ever catches anything.
struct Impl {
    // im: the backend this slot refers to. It defaults to null so that a
    // slot can be default-constructed and have `impl` assigned afterwards.
    explicit Impl(const simdjson::implementation* im=nullptr)
        : impl(im),
          parser(),
          element(),
          error(),
          output() {}

    // copy operations are deleted to silence -Weffc++
    Impl(const Impl&)=delete;
    Impl& operator=(const Impl&)=delete;

    // backend used when parsing with this slot
    const simdjson::implementation* impl;
    // parser instance owned by this slot
    simdjson::dom::parser parser;
    // root element produced by the parse
    simdjson::dom::element element;
    // error code reported by the parse
    simdjson::error_code error;
    // textual form of `element`, filled in only when parsing succeeded
    std::string output;
};

// Writes the name and error code of every entry in [first,last) to
// std::cerr, flushes the stream and then terminates via std::abort().
// Iterator must dereference to an object exposing `impl` and `error`.
template<class Iterator>
void showErrorAndAbort(Iterator first, Iterator last) {
    for(auto cur=first; cur!=last; cur++) {
        const auto& entry=*cur;
        std::cerr<<"Implementation: "<<entry.impl->name()<<"\tError:"<<entry.error<<'\n';
    }
    std::cerr.flush();
    std::abort();
}

// Writes the serialized output of every entry in [first,last) to
// std::cerr, followed by a match / no-match line for each distinct pair
// of entries, then flushes the stream and terminates via std::abort().
// Iterator must dereference to an object exposing `impl` and `output`.
template<class Iterator>
void showOutputAndAbort(Iterator first, Iterator last) {
    // dump what each implementation produced
    auto cursor=first;
    while(cursor!=last) {
        std::cerr<<"Implementation: "<<cursor->impl->name()<<"\tOutput: "<<cursor->output<<'\n';
        ++cursor;
    }

    // show the pairwise results: each entry is compared against every
    // entry that comes after it, so a pair is reported exactly once
    for(auto lhs=first; lhs!=last; ++lhs) {
        auto rhs=lhs;
        for(++rhs; rhs!=last; ++rhs) {
            const char* const verdict=(lhs->output==rhs->output)?" match.":" do NOT match.";
            std::cerr<<"Implementation "<<lhs->impl->name()<<" and "<<rhs->impl->name()<<verdict<<'\n';
        }
    }
    std::cerr.flush();
    std::abort();
}

// Fuzzer entry point: parses the same input with every implementation
// returned by get_runtime_supported_implementations() and aborts the
// process if they disagree.
//
// Two properties are checked:
//   1. either all implementations report an error, or none does
//   2. when all succeed, writing the parsed element back out as text
//      yields the same string for every implementation
//
// Data/Size: the input bytes to parse.
// Returns 0 on every non-aborting path.
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {

    // since this check is expensive, only do it once
    static const auto supported_implementations=get_runtime_supported_implementations();

    // make this dynamic, so it works regardless of how it was compiled
    // or what hardware it runs on. The storage below has a fixed
    // capacity, only the first Nimplementations slots are used.
    constexpr std::size_t Nimplementations_max=3;
    const std::size_t Nimplementations = supported_implementations.size();

    if(Nimplementations>Nimplementations_max) {
        // there is another backend added, please bump Nimplementations_max!
        // Say so on stderr, otherwise the abort is impossible to diagnose
        // from the crash report alone.
        std::cerr<<"Found "<<Nimplementations<<" supported implementations but Nimplementations_max is "
                 <<Nimplementations_max<<", please bump Nimplementations_max!\n";
        std::cerr.flush();
        std::abort();
    }

    // get pointers to the backend implementation
    std::array<Impl,Nimplementations_max> implementations;
    {
        std::size_t i=0;
        for(auto& e: supported_implementations) {
            implementations[i++].impl=e;
        }
    }

    // let each implementation parse and store the result
    std::size_t nerrors=0;
    for(std::size_t i=0; i<Nimplementations; ++i) {
        auto& e=implementations[i];
        // select the backend to use for the parse below
        simdjson::active_implementation=e.impl;
        e.error=e.parser.parse(Data,Size).get(e.element);
        if(e.error) {
            ++nerrors;
        } else {
            // keep the textual form around for the comparison further down
            std::ostringstream oss;
            oss<<e.element;
            e.output=oss.str();
        }
    }

    // we should either have no errors, or all should error
    if(nerrors!=0) {
        if(nerrors!=Nimplementations) {
            showErrorAndAbort(implementations.begin(),
                              implementations.begin()+Nimplementations);
        }
        return 0;
    }

    // parsing went well for all. compare the output against the first.
    const std::string& reference=implementations[0].output;
    for(std::size_t i=1; i<Nimplementations; ++i) {
        if(implementations[i].output!=reference) {
            showOutputAndAbort(implementations.begin(),
                               implementations.begin()+Nimplementations);
        }
    }

    // all is well
    return 0;
}
add multi implementation fuzzer (#1162) This adds a fuzzer which parses the same input using all the available implementations (haswell, westmere, fallback on x64). This should get the otherwise uncovered sourcefiles (mostly fallback) to show up in the fuzz coverage. For instance, the fallback directory has only one line covered. As of the 20200909 report, 1866 lines are covered out of 4478. Also, it will detect if the implementations behave differently: by making sure they all succeed, or all error turning the parsed data into text again, should produce equal results While at it, I corrected some minor things: clean up building too many variants, run with forced implementation (closes #815 ) always store crashes as artefacts, good in case the fuzzer finds something return value of the fuzzer function should always be 0 reduce log spam introduce max size for the seed corpus and the CI fuzzer 2020-09-12 05:46:22 +08:00			`/*`
			`* For fuzzing all of the implementations (haswell/fallback/westmere),`
			`* finding any difference between the output of each which would`
			`* indicate inconsistency. Also, it gets the non-default backend`
			`* some fuzzing love.`
			`*`
			`* Copyright Paul Dreik 20200909 for the simdjson project.`
			`*/`

			`#include "simdjson.h"`
			`#include <cstddef>`
			`#include <cstdint>`
			`#include <cstdlib>`
			`#include <string>`
			`#include <array>`
simplify fuzzing only dynamically supported implementations (#1201) This refactors the dynamic check of which implementations are supported at runtime. It also reduces duplicated effort in the CI fuzzing job, the differential fuzzers don't need to run with different values of SIMDJSON_FORCE_IMPLEMENTATION. There is also a convenience script to run the fuzzers locally, to quickly check that the fuzzers still build, run and no easy to find bugs are there. It should be handy not only when developing the fuzzers, but also when modifying simdjson. 2020-10-09 11:29:54 +08:00			`#include "supported_implementations.h"`
add multi implementation fuzzer (#1162) This adds a fuzzer which parses the same input using all the available implementations (haswell, westmere, fallback on x64). This should get the otherwise uncovered sourcefiles (mostly fallback) to show up in the fuzz coverage. For instance, the fallback directory has only one line covered. As of the 20200909 report, 1866 lines are covered out of 4478. Also, it will detect if the implementations behave differently: by making sure they all succeed, or all error turning the parsed data into text again, should produce equal results While at it, I corrected some minor things: clean up building too many variants, run with forced implementation (closes #815 ) always store crashes as artefacts, good in case the fuzzer finds something return value of the fuzzer function should always be 0 reduce log spam introduce max size for the seed corpus and the CI fuzzer 2020-09-12 05:46:22 +08:00

			`// store each implementation along with it's intermediate results,`
			`// which would make things easier to debug in case this fuzzer ever`
			`// catches anything`
			`struct Impl {`
			`explicit Impl(const simdjson::implementation* im=nullptr) : impl(im),parser(),element(),error(),output(){}`
			`//silence -Weffc++`
			`Impl(const Impl&)=delete;`
			`Impl& operator=(const Impl&)=delete;`

			`const simdjson::implementation* impl;`
			`simdjson::dom::parser parser;`
			`simdjson::dom::element element;`
			`simdjson::error_code error;`
			`std::string output;`
			`};`

			`template<class Iterator>`
			`void showErrorAndAbort(Iterator first, Iterator last) {`
			`auto it=first;`
			`while(it!=last) {`
			`std::cerr<<"Implementation: "<<it->impl->name()<<"\tError:"<<it->error<<'\n';`
			`it++;`
			`}`
			`std::cerr.flush();`
			`std::abort();`
			`}`

			`template<class Iterator>`
			`void showOutputAndAbort(Iterator first, Iterator last) {`

			`for(auto it=first;it!=last;++it) {`
			`std::cerr<<"Implementation: "<<it->impl->name()<<"\tOutput: "<<it->output<<'\n';`
			`}`

			`// show the pairwise results`
			`for(auto it1=first; it1!=last; ++it1) {`
			`for(auto it2=it1; it2!=last; ++it2) {`
			`if(it1!=it2) {`
			`const bool matches=(it1->output==it2->output);`
			`std::cerr<<"Implementation "<<it1->impl->name()<<" and "<<it2->impl->name()<<(matches?" match.":" do NOT match.")<<'\n';`
			`}`
			`}`
			`}`
			`std::cerr.flush();`
			`std::abort();`
			`}`

			`extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {`

simplify fuzzing only dynamically supported implementations (#1201) This refactors the dynamic check of which implementations are supported at runtime. It also reduces duplicated effort in the CI fuzzing job, the differential fuzzers don't need to run with different values of SIMDJSON_FORCE_IMPLEMENTATION. There is also a convenience script to run the fuzzers locally, to quickly check that the fuzzers still build, run and no easy to find bugs are there. It should be handy not only when developing the fuzzers, but also when modifying simdjson. 2020-10-09 11:29:54 +08:00			`// since this check is expensive, only do it once`
			`static const auto supported_implementations=get_runtime_supported_implementations();`


add multi implementation fuzzer (#1162) This adds a fuzzer which parses the same input using all the available implementations (haswell, westmere, fallback on x64). This should get the otherwise uncovered sourcefiles (mostly fallback) to show up in the fuzz coverage. For instance, the fallback directory has only one line covered. As of the 20200909 report, 1866 lines are covered out of 4478. Also, it will detect if the implementations behave differently: by making sure they all succeed, or all error turning the parsed data into text again, should produce equal results While at it, I corrected some minor things: clean up building too many variants, run with forced implementation (closes #815 ) always store crashes as artefacts, good in case the fuzzer finds something return value of the fuzzer function should always be 0 reduce log spam introduce max size for the seed corpus and the CI fuzzer 2020-09-12 05:46:22 +08:00			`// make this dynamic, so it works regardless of how it was compiled`
			`// or what hardware it runs on`
			`constexpr std::size_t Nimplementations_max=3;`
simplify fuzzing only dynamically supported implementations (#1201) This refactors the dynamic check of which implementations are supported at runtime. It also reduces duplicated effort in the CI fuzzing job, the differential fuzzers don't need to run with different values of SIMDJSON_FORCE_IMPLEMENTATION. There is also a convenience script to run the fuzzers locally, to quickly check that the fuzzers still build, run and no easy to find bugs are there. It should be handy not only when developing the fuzzers, but also when modifying simdjson. 2020-10-09 11:29:54 +08:00			`const std::size_t Nimplementations = supported_implementations.size();`
Make it possible to check that an implementation is supported at runtime (#1197) * Make it possible to check that an implementation is supported at runtime. * add CI fuzzing on arm 64 bit This adds fuzzing on drone.io arm64 For some reason, leak detection had to be disabled. If it is enabled, the fuzzer falsely reports a crash at the end of fuzzing. Closes: #1188 * Guarding the implementation accesses. * Better doc. * Updating cxxopts. * Make it possible to check that an implementation is supported at runtime. * Guarding the implementation accesses. * Better doc. * Updating cxxopts. * We need to accomodate cxxopts Co-authored-by: Paul Dreik <github@pauldreik.se> 2020-10-02 23:04:51 +08:00
add multi implementation fuzzer (#1162) This adds a fuzzer which parses the same input using all the available implementations (haswell, westmere, fallback on x64). This should get the otherwise uncovered sourcefiles (mostly fallback) to show up in the fuzz coverage. For instance, the fallback directory has only one line covered. As of the 20200909 report, 1866 lines are covered out of 4478. Also, it will detect if the implementations behave differently: by making sure they all succeed, or all error turning the parsed data into text again, should produce equal results While at it, I corrected some minor things: clean up building too many variants, run with forced implementation (closes #815 ) always store crashes as artefacts, good in case the fuzzer finds something return value of the fuzzer function should always be 0 reduce log spam introduce max size for the seed corpus and the CI fuzzer 2020-09-12 05:46:22 +08:00			`if(Nimplementations>Nimplementations_max) {`
			`//there is another backend added, please bump Nimplementations_max!`
			`std::abort();`
			`}`

			`// get pointers to the backend implementation`
			`std::array<Impl,Nimplementations_max> implementations;`
			`{`
			`std::size_t i=0;`
simplify fuzzing only dynamically supported implementations (#1201) This refactors the dynamic check of which implementations are supported at runtime. It also reduces duplicated effort in the CI fuzzing job, the differential fuzzers don't need to run with different values of SIMDJSON_FORCE_IMPLEMENTATION. There is also a convenience script to run the fuzzers locally, to quickly check that the fuzzers still build, run and no easy to find bugs are there. It should be handy not only when developing the fuzzers, but also when modifying simdjson. 2020-10-09 11:29:54 +08:00			`for(auto& e: supported_implementations) {`
Make it possible to check that an implementation is supported at runtime (#1197) * Make it possible to check that an implementation is supported at runtime. * add CI fuzzing on arm 64 bit This adds fuzzing on drone.io arm64 For some reason, leak detection had to be disabled. If it is enabled, the fuzzer falsely reports a crash at the end of fuzzing. Closes: #1188 * Guarding the implementation accesses. * Better doc. * Updating cxxopts. * Make it possible to check that an implementation is supported at runtime. * Guarding the implementation accesses. * Better doc. * Updating cxxopts. * We need to accomodate cxxopts Co-authored-by: Paul Dreik <github@pauldreik.se> 2020-10-02 23:04:51 +08:00			`implementations[i++].impl=e;`
add multi implementation fuzzer (#1162) This adds a fuzzer which parses the same input using all the available implementations (haswell, westmere, fallback on x64). This should get the otherwise uncovered sourcefiles (mostly fallback) to show up in the fuzz coverage. For instance, the fallback directory has only one line covered. As of the 20200909 report, 1866 lines are covered out of 4478. Also, it will detect if the implementations behave differently: by making sure they all succeed, or all error turning the parsed data into text again, should produce equal results While at it, I corrected some minor things: clean up building too many variants, run with forced implementation (closes #815 ) always store crashes as artefacts, good in case the fuzzer finds something return value of the fuzzer function should always be 0 reduce log spam introduce max size for the seed corpus and the CI fuzzer 2020-09-12 05:46:22 +08:00			`}`
			`}`

			`// let each implementation parse and store the result`
			`std::size_t nerrors=0;`
add CI fuzzing on arm 64 bit This adds fuzzing on drone.io arm64 For some reason, leak detection had to be disabled. If it is enabled, the fuzzer falsely reports a crash at the end of fuzzing. Closes: #1188 2020-10-01 16:12:37 +08:00			`for(std::size_t i=0; i<Nimplementations; ++i) {`
			`auto& e=implementations[i];`
add multi implementation fuzzer (#1162) This adds a fuzzer which parses the same input using all the available implementations (haswell, westmere, fallback on x64). This should get the otherwise uncovered sourcefiles (mostly fallback) to show up in the fuzz coverage. For instance, the fallback directory has only one line covered. As of the 20200909 report, 1866 lines are covered out of 4478. Also, it will detect if the implementations behave differently: by making sure they all succeed, or all error turning the parsed data into text again, should produce equal results While at it, I corrected some minor things: clean up building too many variants, run with forced implementation (closes #815 ) always store crashes as artefacts, good in case the fuzzer finds something return value of the fuzzer function should always be 0 reduce log spam introduce max size for the seed corpus and the CI fuzzer 2020-09-12 05:46:22 +08:00			`simdjson::active_implementation=e.impl;`
			`e.error=e.parser.parse(Data,Size).get(e.element);`
			`if(e.error) {`
			`++nerrors;`
			`} else {`
			`std::ostringstream oss;`
			`oss<<e.element;`
			`e.output=oss.str();`
			`}`
			`}`

			`//we should either have no errors, or all should error`
			`if(nerrors!=0) {`
			`if(nerrors!=Nimplementations) {`
			`showErrorAndAbort(implementations.begin(),`
			`implementations.begin()+Nimplementations);`
			`}`
			`return 0;`
			`}`

			`//parsing went well for all. compare the output against the first.`
			`const std::string& reference=implementations[0].output;`
			`for(std::size_t i=1; i<Nimplementations; ++i) {`
			`if(implementations[i].output!=reference) {`
			`showOutputAndAbort(implementations.begin(),`
			`implementations.begin()+Nimplementations);`
			`}`
			`}`

			`//all is well`
			`return 0;`
			`}`