/*=============================================================================
# Filename: Database.h
# Author: Bookug Lobert
# Mail: 1181955272@qq.com
# Last Modified: 2015-10-23 14:20
# Description: originally written by liyouhuan, modified by zengli and chenjiaqi
=============================================================================*/
# ifndef _DATABASE_DATABASE_H
# define _DATABASE_DATABASE_H
# include "../Util/Util.h"
# include "../Util/Triple.h"
# include "Join.h"
# include "../Query/IDList.h"
# include "../Query/ResultSet.h"
# include "../Query/SPARQLquery.h"
# include "../Query/BasicQuery.h"
# include "../Signature/SigEntry.h"
# include "../VSTree/VSTree.h"
2016-09-25 22:14:36 +08:00
# include "../KVstore/KVstore.h"
# include "../StringIndex/StringIndex.h"
2016-09-18 20:01:57 +08:00
# include "../Parser/DBparser.h"
# include "../Parser/RDFParser.h"
# include "../Parser/SparqlParser.h"
# include "../Query/GeneralEvaluation.h"
class Database
{
public :
2017-03-24 20:10:43 +08:00
//static const bool only_sub2idpre2id = true;
//static const int internal = 100 * 1000;
//void test();
//void test_build_sig();
//void test_join();
//void printIDlist(int _i, int* _list, int _len, std::string _log);
//void printPairList(int _i, int* _list, int _len, std::string _log);
2016-09-18 20:01:57 +08:00
2017-01-16 14:12:57 +08:00
//when encode EntitySig, one way uses STRING-hash, the other one uses ID-hash
//depending on this->encode_mode
2016-09-18 20:01:57 +08:00
static const int STRING_MODE = 1 ;
static const int ID_MODE = 2 ;
Database ( ) ;
Database ( std : : string _name ) ;
void release ( FILE * fp0 ) ;
~ Database ( ) ;
bool load ( ) ;
bool unload ( ) ;
2017-04-11 15:26:35 +08:00
void clear ( ) ;
2017-02-17 17:09:47 +08:00
int query ( const string _query , ResultSet & _result_set , FILE * _fp = stdout ) ;
2016-09-18 20:01:57 +08:00
2017-01-16 14:12:57 +08:00
//1. if subject of _triple doesn't exist,
//then assign a new subid, and insert a new SigEntry
//2. assign new tuple_id to tuple, if predicate or object doesn't exist before too;
//3. if subject exist, update SigEntry, and update spo, ops... etc. if needed
2016-09-18 20:01:57 +08:00
2017-01-16 14:12:57 +08:00
bool build ( const string & _rdf_file ) ;
2016-09-18 20:01:57 +08:00
//interfaces to insert/delete from given rdf file
2017-04-11 15:26:35 +08:00
bool insert ( std : : string _rdf_file , bool _is_restore = false ) ;
bool remove ( std : : string _rdf_file , bool _is_restore = false ) ;
bool backup ( ) ;
bool restore ( ) ;
2016-09-18 20:01:57 +08:00
2017-03-29 13:48:39 +08:00
//name of this DB
2016-09-18 20:01:57 +08:00
string getName ( ) ;
2017-06-15 21:22:03 +08:00
//get infos
TYPE_TRIPLE_NUM getTripleNum ( ) ;
TYPE_ENTITY_LITERAL_ID getEntityNum ( ) ;
TYPE_ENTITY_LITERAL_ID getLiteralNum ( ) ;
TYPE_ENTITY_LITERAL_ID getSubNum ( ) ;
TYPE_PREDICATE_ID getPreNum ( ) ;
2017-03-29 13:48:39 +08:00
//root Path of this DB + sixTuplesFile
2016-09-18 20:01:57 +08:00
string getSixTuplesFile ( ) ;
2017-03-29 13:48:39 +08:00
//root Path of this DB + signatureBFile
2016-09-18 20:01:57 +08:00
string getSignatureBFile ( ) ;
2017-03-29 13:48:39 +08:00
//root Path of this DB + DBInfoFile
2017-01-16 14:12:57 +08:00
string getDBInfoFile ( ) ;
2016-09-18 20:01:57 +08:00
2017-03-29 13:48:39 +08:00
//id tuples file
string getIDTuplesFile ( ) ;
2016-09-18 20:01:57 +08:00
private :
string name ;
2017-03-23 21:32:41 +08:00
string store_path ;
2016-09-18 20:01:57 +08:00
bool is_active ;
2017-03-24 20:10:43 +08:00
TYPE_TRIPLE_NUM triples_num ;
TYPE_ENTITY_LITERAL_ID entity_num ;
TYPE_ENTITY_LITERAL_ID sub_num ;
2017-06-15 21:22:03 +08:00
//BETTER: add object num
2017-03-24 20:10:43 +08:00
TYPE_PREDICATE_ID pre_num ;
TYPE_ENTITY_LITERAL_ID literal_num ;
2016-09-18 20:01:57 +08:00
int encode_mode ;
2017-03-23 21:32:41 +08:00
bool if_loaded ;
2016-09-18 20:01:57 +08:00
VSTree * vstree ;
KVstore * kvstore ;
2016-09-25 22:14:36 +08:00
StringIndex * stringindex ;
2016-09-18 20:01:57 +08:00
Join * join ;
2017-01-16 14:12:57 +08:00
//metadata of this database: sub_num, pre_num, obj_num, literal_num, etc.
string db_info_file ;
2016-09-18 20:01:57 +08:00
2017-01-16 14:12:57 +08:00
//six tuples: <sub pre obj sid pid oid>
2016-09-18 20:01:57 +08:00
string six_tuples_file ;
2017-03-29 13:48:39 +08:00
2017-01-16 14:12:57 +08:00
//B means binary
2016-09-18 20:01:57 +08:00
string signature_binary_file ;
2017-01-16 14:12:57 +08:00
2017-03-29 13:48:39 +08:00
//id tuples file
string id_tuples_file ;
2017-04-11 15:26:35 +08:00
string update_log ;
string update_log_since_backup ;
2017-03-29 13:48:39 +08:00
2017-01-16 14:12:57 +08:00
//pre2num mapping
2017-03-24 20:10:43 +08:00
TYPE_TRIPLE_NUM * pre2num ;
2017-02-17 17:09:47 +08:00
//valid: check from minNumPID to maxNumPID
2017-03-24 20:10:43 +08:00
TYPE_PREDICATE_ID maxNumPID , minNumPID ;
2017-01-16 14:12:57 +08:00
void setPreMap ( ) ;
2017-03-24 20:10:43 +08:00
//TODO: set the buffer capacity as dynamic according to the current memory usage
2017-01-16 14:12:57 +08:00
//string buffer
Buffer * entity_buffer ;
//unsigned offset; //maybe let id start from an offset
unsigned entity_buffer_size ;
Buffer * literal_buffer ;
unsigned literal_buffer_size ;
2017-03-24 20:10:43 +08:00
2017-01-16 14:12:57 +08:00
void setStringBuffer ( ) ;
void warmUp ( ) ;
2017-03-24 20:10:43 +08:00
//BETTER:add a predicate buffer for ?p query
//However, I think this is not necessary because ?p is rare and the p2xx tree is small enough
2017-01-16 14:12:57 +08:00
2017-06-06 18:56:46 +08:00
void check ( ) ;
//used for multiple threads
void load_vstree ( unsigned _vstree_size ) ;
void load_entity2id ( int _mode ) ;
void load_id2entity ( int _mode ) ;
void load_literal2id ( int _mode ) ;
void load_id2literal ( int _mode ) ;
void load_predicate2id ( int _mode ) ;
void load_id2predicate ( int _mode ) ;
void load_sub2values ( int _mode ) ;
void load_obj2values ( int _mode ) ;
void load_pre2values ( int _mode ) ;
2016-09-18 20:01:57 +08:00
//triple num per group for insert/delete
//can not be too high, otherwise the heap will over
static const int GROUP_SIZE = 1000 ;
//manage the ID allocate and garbage
2017-03-24 20:10:43 +08:00
static const TYPE_ENTITY_LITERAL_ID START_ID_NUM = 0 ;
2017-01-16 14:12:57 +08:00
//static const int START_ID_NUM = 1000;
2016-09-18 20:01:57 +08:00
/////////////////////////////////////////////////////////////////////////////////
//NOTICE:error if >= LITERAL_FIRST_ID
string free_id_file_entity ; //the first is limitID, then free id list
2017-03-24 20:10:43 +08:00
TYPE_ENTITY_LITERAL_ID limitID_entity ; //the current maxium ID num(maybe not used so much)
2016-09-18 20:01:57 +08:00
BlockInfo * freelist_entity ; //free id list, reuse BlockInfo for Storage class
2017-03-24 20:10:43 +08:00
TYPE_ENTITY_LITERAL_ID allocEntityID ( ) ;
void freeEntityID ( TYPE_ENTITY_LITERAL_ID _id ) ;
2016-09-18 20:01:57 +08:00
/////////////////////////////////////////////////////////////////////////////////
//NOTICE:error if >= 2*LITERAL_FIRST_ID
string free_id_file_literal ;
2017-03-24 20:10:43 +08:00
TYPE_ENTITY_LITERAL_ID limitID_literal ;
2017-01-16 14:12:57 +08:00
BlockInfo * freelist_literal ;
2017-03-24 20:10:43 +08:00
TYPE_ENTITY_LITERAL_ID allocLiteralID ( ) ;
void freeLiteralID ( TYPE_ENTITY_LITERAL_ID _id ) ;
2016-09-18 20:01:57 +08:00
/////////////////////////////////////////////////////////////////////////////////
//NOTICE:error if >= 2*LITERAL_FIRST_ID
string free_id_file_predicate ;
2017-03-24 20:10:43 +08:00
TYPE_PREDICATE_ID limitID_predicate ;
2017-01-16 14:12:57 +08:00
BlockInfo * freelist_predicate ;
2017-03-24 20:10:43 +08:00
TYPE_PREDICATE_ID allocPredicateID ( ) ;
void freePredicateID ( TYPE_PREDICATE_ID _id ) ;
2016-09-18 20:01:57 +08:00
/////////////////////////////////////////////////////////////////////////////////
void initIDinfo ( ) ; //initialize the members
void resetIDinfo ( ) ; //reset the id info for build
void readIDinfo ( ) ; //read and build the free list
void writeIDinfo ( ) ; //write and empty the free list
2017-01-16 14:12:57 +08:00
bool saveDBInfoFile ( ) ;
bool loadDBInfoFile ( ) ;
2016-09-18 20:01:57 +08:00
string getStorePath ( ) ;
2017-01-16 14:12:57 +08:00
//encode relative signature data of all Basic Graph Query, who union together into SPARQLquery
2016-09-18 20:01:57 +08:00
void buildSparqlSignature ( SPARQLquery & _sparql_q ) ;
2017-01-16 14:12:57 +08:00
//encode Triple into Subject EntityBitSet
bool encodeTriple2SubEntityBitSet ( EntityBitSet & _bitset , const Triple * _p_triple ) ;
2017-05-19 23:05:38 +08:00
//NOTICE: the encodeTriple with Triple* is invalid now(not enocde the linkage of neighbor-predicate)
bool encodeTriple2SubEntityBitSet ( EntityBitSet & _bitset , TYPE_PREDICATE_ID _pre_id , TYPE_ENTITY_LITERAL_ID _obj_id ) ;
2017-01-16 14:12:57 +08:00
//encode Triple into Object EntityBitSet
bool encodeTriple2ObjEntityBitSet ( EntityBitSet & _bitset , const Triple * _p_triple ) ;
2017-05-19 23:05:38 +08:00
bool encodeTriple2ObjEntityBitSet ( EntityBitSet & _bitset , TYPE_PREDICATE_ID _pre_id , TYPE_ENTITY_LITERAL_ID _sub_id ) ;
2016-09-18 20:01:57 +08:00
2017-03-24 20:10:43 +08:00
bool calculateEntityBitSet ( TYPE_ENTITY_LITERAL_ID _entity_id , EntityBitSet & _bitset ) ;
2016-09-18 20:01:57 +08:00
2017-01-16 14:12:57 +08:00
//check whether the relative 3-tuples exist
//usually, through sp2olist
2017-03-24 20:10:43 +08:00
bool exist_triple ( TYPE_ENTITY_LITERAL_ID _sub_id , TYPE_PREDICATE_ID _pre_id , TYPE_ENTITY_LITERAL_ID _obj_id ) ;
2017-04-11 15:26:35 +08:00
bool exist_triple ( const TripleWithObjType & _triple ) ;
2016-09-18 20:01:57 +08:00
2017-01-16 14:12:57 +08:00
//* _rdf_file denotes the path of the RDF file, where stores the rdf data
//* there are many step in this function, each one responds to an sub-function
//* 1. map sub2id and pre2id
//* 2. map literal2id and encode RDF data into signature,
//* storing in binary file: this->getSignatureBFile(), the order responds to subID
//* also, store six_tuples in file: this->getSixTuplesFile()
//* 3. build: subID2objIDlist, <subIDpreID>2objIDlist subID2<preIDobjID>list
//* 4. build: objID2subIDlist, <objIDpreID>2subIDlist objID2<preIDsubID>list
2016-09-18 20:01:57 +08:00
//encodeRDF_new invoke new rdfParser to solve task 1 & 2 in one time scan.
bool encodeRDF_new ( const string _rdf_file ) ;
2017-03-29 13:48:39 +08:00
void readIDTuples ( ID_TUPLE * & _p_id_tuples ) ;
void build_s2xx ( ID_TUPLE * ) ;
void build_o2xx ( ID_TUPLE * ) ;
void build_p2xx ( ID_TUPLE * ) ;
2016-09-18 20:01:57 +08:00
//insert and delete, notice that modify is not needed here
//we can read from file or use sparql syntax
2017-03-24 20:10:43 +08:00
bool insertTriple ( const TripleWithObjType & _triple , vector < unsigned > * _vertices = NULL , vector < unsigned > * _predicates = NULL ) ;
bool removeTriple ( const TripleWithObjType & _triple , vector < unsigned > * _vertices = NULL , vector < unsigned > * _predicates = NULL ) ;
2016-09-18 20:01:57 +08:00
//NOTICE:one by one is too costly, sort and insert/delete at a time will be better
2017-05-18 23:27:58 +08:00
unsigned insert ( const TripleWithObjType * _triples , TYPE_TRIPLE_NUM _triple_num , bool _is_restore = false ) ;
2016-09-18 20:01:57 +08:00
//bool insert(const vector<TripleWithObjType>& _triples, vector<int>& _vertices, vector<int>& _predicates);
2017-05-18 23:27:58 +08:00
unsigned remove ( const TripleWithObjType * _triples , TYPE_TRIPLE_NUM _triple_num , bool _is_restore = false ) ;
2016-09-18 20:01:57 +08:00
2017-03-29 13:48:39 +08:00
bool sub2id_pre2id_obj2id_RDFintoSignature ( const string _rdf_file ) ;
2017-03-24 20:10:43 +08:00
//bool literal2id_RDFintoSignature(const string _rdf_file, int** _p_id_tuples, TYPE_TRIPLE_NUM _id_tuples_max);
2017-01-16 14:12:57 +08:00
2017-03-24 20:10:43 +08:00
bool objIDIsEntityID ( TYPE_ENTITY_LITERAL_ID _id ) ;
2016-09-18 20:01:57 +08:00
2017-01-16 14:12:57 +08:00
//* join on the vector of CandidateList, available after retrieve from the VSTREE
//* and store the resut in _result_set
2016-09-18 20:01:57 +08:00
//bool join(vector<int*>& _result_list, int _var_id, int _pre_id, int _var_id2, const char _edge_type, int _var_num, bool shouldAddLiteral, IDList& _can_list);
//bool select(vector<int*>& _result_list, int _var_id, int _pre_id, int _var_id2, const char _edge_type, int _var_num);
2017-01-16 14:12:57 +08:00
//get the final string result_set from SPARQLquery
2016-09-18 20:01:57 +08:00
bool getFinalResult ( SPARQLquery & _sparql_q , ResultSet & _result_set ) ;
2017-04-11 15:26:35 +08:00
static int read_update_log ( const string _path , multiset < string > & _i , multiset < string > & _r ) ;
bool restore_update ( multiset < string > & _i , multiset < string > & _r ) ;
void clear_update_log ( ) ;
2016-09-18 20:01:57 +08:00
} ;
# endif //_DATABASE_DATABASE_H