diff --git a/Database/Database.cpp b/Database/Database.cpp index 85bbe66..4a8816e 100644 --- a/Database/Database.cpp +++ b/Database/Database.cpp @@ -13,6 +13,7 @@ using namespace std; Database::Database() { this->name = ""; + this->store_path = ""; string store_path = "."; this->signature_binary_file = "signature.binary"; this->six_tuples_file = "six_tuples"; @@ -27,7 +28,8 @@ Database::Database() string stringindex_store_path = store_path + "/stringindex_store"; this->stringindex = new StringIndex(stringindex_store_path); - this->encode_mode = Database::STRING_MODE; + //this->encode_mode = Database::STRING_MODE; + this->encode_mode = Database::ID_MODE; this->is_active = false; this->sub_num = 0; this->pre_num = 0; @@ -42,6 +44,8 @@ Database::Database() this->literal_buffer = NULL; this->literal_buffer_size = 0; + this->if_loaded = false; + //this->resetIDinfo(); this->initIDinfo(); } @@ -49,7 +53,7 @@ Database::Database() Database::Database(string _name) { this->name = _name; - string store_path = this->name; + this->store_path = Util::global_config["db_home"] + "/" + this->name + Util::global_config["db_suffix"]; this->signature_binary_file = "signature.binary"; this->six_tuples_file = "six_tuples"; @@ -64,7 +68,8 @@ Database::Database(string _name) string stringindex_store_path = store_path + "/stringindex_store"; this->stringindex = new StringIndex(stringindex_store_path); - this->encode_mode = Database::STRING_MODE; + //this->encode_mode = Database::STRING_MODE; + this->encode_mode = Database::ID_MODE; this->is_active = false; this->sub_num = 0; this->pre_num = 0; @@ -72,6 +77,8 @@ Database::Database(string _name) this->entity_num = 0; this->triples_num = 0; + this->if_loaded = false; + this->join = NULL; this->pre2num = NULL; this->entity_buffer = NULL; @@ -531,7 +538,11 @@ Database::warmUp() bool Database::load() { - //DEBUG:what if loaded several times?to check if loaded? 
+ if(this->if_loaded) + { + return true; + } + bool flag = (this->vstree)->loadTree(); if (!flag) { @@ -562,6 +573,7 @@ Database::load() //this->warmUp(); //DEBUG:the warmUp() calls query(), which will also output results, this is not we want + this->if_loaded = true; cout << "finish load" << endl; return true; @@ -586,6 +598,9 @@ Database::unload() this->saveDBInfoFile(); this->writeIDinfo(); this->initIDinfo(); + + this->if_loaded = false; + return true; } @@ -727,8 +742,8 @@ Database::build(const string& _rdf_file) string ret = Util::getExactPath(_rdf_file.c_str()); long tv_build_begin = Util::get_cur_time(); - string store_path = this->name; - Util::create_dir(store_path); + //string store_path = this->name; + Util::create_dir(this->store_path); string kv_store_path = store_path + "/kv_store"; Util::create_dir(kv_store_path); @@ -834,10 +849,10 @@ Database::saveDBInfoFile() fwrite(&this->encode_mode, sizeof(int), 1, filePtr); fclose(filePtr); - Util::triple_num = this->triples_num; - Util::pre_num = this->pre_num; - Util::entity_num = this->entity_num; - Util::literal_num = this->literal_num; + //Util::triple_num = this->triples_num; + //Util::pre_num = this->pre_num; + //Util::entity_num = this->entity_num; + //Util::literal_num = this->literal_num; return true; } @@ -863,10 +878,10 @@ Database::loadDBInfoFile() fread(&this->encode_mode, sizeof(int), 1, filePtr); fclose(filePtr); - Util::triple_num = this->triples_num; - Util::pre_num = this->pre_num; - Util::entity_num = this->entity_num; - Util::literal_num = this->literal_num; + //Util::triple_num = this->triples_num; + //Util::pre_num = this->pre_num; + //Util::entity_num = this->entity_num; + //Util::literal_num = this->literal_num; return true; } @@ -874,7 +889,7 @@ Database::loadDBInfoFile() string Database::getStorePath() { - return this->name; + return this->store_path; } //encode relative signature data of the query graph @@ -934,22 +949,30 @@ Database::calculateEntityBitSet(int _entity_id, 
EntityBitSet & _bitset) bool Database::encodeTriple2SubEntityBitSet(EntityBitSet& _bitset, const Triple* _p_triple) { - int _pre_id = -1; + int _pre_id = (this->kvstore)->getIDByPredicate(_p_triple->predicate); + if(_pre_id != -1) { - _pre_id = (this->kvstore)->getIDByPredicate(_p_triple->predicate); - //BETTER: checking whether _pre_id is -1 or not will be more reliable + Signature::encodePredicate2Entity(_bitset, _pre_id, Util::EDGE_OUT); } - Signature::encodePredicate2Entity(_pre_id, _bitset, Util::EDGE_OUT); - if (this->encode_mode == Database::ID_MODE) + int _obj_id = (this->kvstore)->getIDByEntity(_p_triple->object); + if(_obj_id == -1) { - //TODO + _obj_id = (this->kvstore)->getIDByLiteral(_p_triple->object); } - else if (this->encode_mode == Database::STRING_MODE) + if(_obj_id != -1) { - Signature::encodeStr2Entity((_p_triple->object).c_str(), _bitset); + Signature::encodeStr2Entity(_bitset, _obj_id, Util::EDGE_OUT); } + //if (this->encode_mode == Database::ID_MODE) + //{ + //} + //else if (this->encode_mode == Database::STRING_MODE) + //{ + //Signature::encodeStr2Entity((_p_triple->object).c_str(), _bitset); + //} + return true; } @@ -957,22 +980,27 @@ Database::encodeTriple2SubEntityBitSet(EntityBitSet& _bitset, const Triple* _p_t bool Database::encodeTriple2ObjEntityBitSet(EntityBitSet& _bitset, const Triple* _p_triple) { - int _pre_id = -1; + int _pre_id = (this->kvstore)->getIDByPredicate(_p_triple->predicate); + if(_pre_id != -1) { - _pre_id = (this->kvstore)->getIDByPredicate(_p_triple->predicate); - //BETTER: checking whether _pre_id is -1 or not will be more reliable + Signature::encodePredicate2Entity(_bitset, _pre_id, Util::EDGE_IN); } - Signature::encodePredicate2Entity(_pre_id, _bitset, Util::EDGE_IN); - if (this->encode_mode == Database::ID_MODE) + int _sub_id = (this->kvstore)->getIDByEntity(_p_triple->subject); + if(_sub_id != -1) { - //TODO - } - else if (this->encode_mode == Database::STRING_MODE) - { - 
Signature::encodeStr2Entity((_p_triple->subject).c_str(), _bitset); + Signature::encodeStr2Entity(_bitset, _sub_id, Util::EDGE_IN); } + //Signature::encodePredicate2Entity(_pre_id, _bitset, Util::EDGE_IN); + //if (this->encode_mode == Database::ID_MODE) + //{ + //} + //else if (this->encode_mode == Database::STRING_MODE) + //{ + //Signature::encodeStr2Entity((_p_triple->subject).c_str(), _bitset); + //} + return true; } @@ -1017,12 +1045,27 @@ Database::encodeRDF_new(const string _rdf_file) long t1 = Util::get_cur_time(); + //NOTICE: in encode process, we should not divide ID of entity and literal totally apart, i.e. entity is a system + //while literal is another system + //The reason is that if we divide entity and literal, then in triple_array and final_result we can not decide a given + //ID is entity or not + //(one way is to add a more structure to tell us which is entity, but this is costly) + //map sub2id, pre2id, entity/literal in obj2id, store in kvstore, encode RDF data into signature if (!this->sub2id_pre2id_obj2id_RDFintoSignature(_rdf_file, _p_id_tuples, _id_tuples_max)) { return false; } + //TODO+BETTER:after encode, we can know the exact entity num, so we can decide if our system can run this dataset + //based on the current available memory(need a memory manager globally) + //If unbale to run, should exit and give a prompt + //User can modify the config file to run anyway, but gStore will not ensure correctness + //What is more, in load process, we also need to decide if gStore can run it + // + //TODO+BETTER: a global ID manager module, should be based on type template + //this can be used in vstree, storage and Database + long t2 = Util::get_cur_time(); cout << "after encode, used " << (t2 - t1) << "ms." 
<< endl; @@ -1032,6 +1075,8 @@ Database::encodeRDF_new(const string _rdf_file) this->stringindex->setNum(StringIndexFile::Predicate, this->pre_num); this->stringindex->save(*this->kvstore); + //cout<<"special id: "<kvstore->getIDByEntity("")<kvstore->close_entity2id(); this->kvstore->close_id2entity(); @@ -1040,15 +1085,18 @@ Database::encodeRDF_new(const string _rdf_file) this->kvstore->close_predicate2id(); this->kvstore->close_id2predicate(); - this->kvstore->build_subID2values(_p_id_tuples, this->triples_num); + //this->kvstore->build_subID2values(_p_id_tuples, this->triples_num); + this->build_s2xx(_p_id_tuples); long t3 = Util::get_cur_time(); cout << "after s2xx, used " << (t3 - t2) << "ms." << endl; - this->kvstore->build_objID2values(_p_id_tuples, this->triples_num); + //this->kvstore->build_objID2values(_p_id_tuples, this->triples_num); + this->build_o2xx(_p_id_tuples); long t4 = Util::get_cur_time(); cout << "after o2xx, used " << (t4 - t3) << "ms." << endl; - this->kvstore->build_preID2values(_p_id_tuples, this->triples_num); + //this->kvstore->build_preID2values(_p_id_tuples, this->triples_num); + this->build_p2xx(_p_id_tuples); long t5 = Util::get_cur_time(); cout << "after p2xx, used " << (t5 - t4) << "ms." 
<< endl; @@ -1070,6 +1118,202 @@ Database::encodeRDF_new(const string _rdf_file) return true; } +void +Database::build_s2xx(int** _p_id_tuples) +{ + qsort(_p_id_tuples, this->triples_num, sizeof(int*), Util::_spo_cmp); + this->kvstore->build_subID2values(_p_id_tuples, this->triples_num); + + //save all entity_signature into binary file + string sig_binary_file = this->getSignatureBFile(); + FILE* sig_fp = fopen(sig_binary_file.c_str(), "wb"); + if (sig_fp == NULL) + { + cout << "Failed to open : " << sig_binary_file << endl; + return; + } + + //NOTICE:in build process, all IDs are continuous growing + EntityBitSet tmp_bitset; + tmp_bitset.reset(); + for(int i = 0; i < this->entity_num; ++i) + { + SigEntry* sig = new SigEntry(EntitySig(tmp_bitset), -1); + fwrite(sig, sizeof(SigEntry), 1, sig_fp); + delete sig; + } + + //TODO:use unsigned for type and -1 should be changed + int prev_entity_id = -1; + for (int i = 0; i < this->triples_num; ++i) + { + int subid = _p_id_tuples[i][0]; + int preid = _p_id_tuples[i][1]; + int objid = _p_id_tuples[i][2]; + if(subid != prev_entity_id) + { + if(prev_entity_id != -1) + { +#ifdef DEBUG + //if(prev_entity_id == 13) + //{ + //cout<<"yy: "<triples_num, sizeof(int*), Util::_ops_cmp); + this->kvstore->build_objID2values(_p_id_tuples, this->triples_num); + + //save all entity_signature into binary file + string sig_binary_file = this->getSignatureBFile(); + //NOTICE: this is different from build_s2xx, the file already exists + FILE* sig_fp = fopen(sig_binary_file.c_str(), "rb+"); + if (sig_fp == NULL) + { + cout << "Failed to open : " << sig_binary_file << endl; + return; + } + + //NOTICE:in build process, all IDs are continuous growing + //TODO:use unsigned for type and -1 should be changed + int prev_entity_id = -1; + EntityBitSet tmp_bitset; + for (int i = 0; i < this->triples_num; ++i) + { + int subid = _p_id_tuples[i][0]; + int preid = _p_id_tuples[i][1]; + int objid = _p_id_tuples[i][2]; + + + if(Util::is_literal_ele(objid)) + 
{ + continue; + } + + if(objid != prev_entity_id) + { + if(prev_entity_id != -1) + { + //NOTICE: we must do twice, we need to locate on the same entry to deal, so we must place in order + fseek(sig_fp, sizeof(SigEntry) * prev_entity_id, SEEK_SET); + SigEntry* old_sig = new SigEntry(); + fread(old_sig, sizeof(SigEntry), 1, sig_fp); +#ifdef DEBUG + //cout<<"to write a signature: "<getEntitySig().entityBitSet; + delete old_sig; + +#ifdef DEBUG + //if(prev_entity_id == 13) + //{ + //cout<<"yy: "<getEntitySig().entityBitSet; + delete old_sig; + //write the sig entry + SigEntry* sig = new SigEntry(EntitySig(tmp_bitset), prev_entity_id); + fseek(sig_fp, sizeof(SigEntry) * prev_entity_id, SEEK_SET); + fwrite(sig, sizeof(SigEntry), 1, sig_fp); + //_all_bitset |= *_entity_bitset[i]; + delete sig; + } + + fclose(sig_fp); +} + +void +Database::build_p2xx(int** _p_id_tuples) +{ + qsort(_p_id_tuples, this->triples_num, sizeof(int*), Util::_pso_cmp); + this->kvstore->build_preID2values(_p_id_tuples, this->triples_num); +} + +//NOTICE:in here and there in the insert/delete, we may get the maxium tuples num first +//and so we can avoid the cost of memcpy(scan quickly or use wc -l) +//However, if use compressed RDF format, how can we do it fi not using parser? +//CONSIDER: just an estimated value is ok or use vector!!!(but vector also copy when enlarge) +//and read file line numbers are also costly! bool Database::sub2id_pre2id_obj2id_RDFintoSignature(const string _rdf_file, int**& _p_id_tuples, int & _id_tuples_max) { @@ -1117,14 +1361,18 @@ Database::sub2id_pre2id_obj2id_RDFintoSignature(const string _rdf_file, int**& _ //don't know the number of entity //pre allocate entitybitset_max EntityBitSet for storing signature, double the space until the _entity_bitset is used up. 
- int entitybitset_max = 10000; - EntityBitSet** _entity_bitset = new EntityBitSet*[entitybitset_max]; - for (int i = 0; i < entitybitset_max; i++) - { - _entity_bitset[i] = new EntityBitSet(); - _entity_bitset[i]->reset(); - } - EntityBitSet _tmp_bitset; + // + //TODO:改为用外存两轮排序,注意文件定位,需要修改Signature中string编码为ID(设置ID_MODE),且应绑定点和边 + //应该跑通4.5亿的数据集 + //int entitybitset_max = 10000000; //set larger to avoid the copy cost + //int entitybitset_max = 10000; + //EntityBitSet** _entity_bitset = new EntityBitSet*[entitybitset_max]; + //for (int i = 0; i < entitybitset_max; i++) + //{ + //_entity_bitset[i] = new EntityBitSet(); + //_entity_bitset[i]->reset(); + //} + //EntityBitSet _tmp_bitset; //parse a file RDFParser _parser(_fin); @@ -1238,98 +1486,100 @@ Database::sub2id_pre2id_obj2id_RDFintoSignature(const string _rdf_file, int**& _ #ifdef DEBUG_PRECISE // save six tuples { - _six_tuples_fout << _sub_id << '\t' - << _pre_id << '\t' - << _obj_id << '\t' - << _sub << '\t' - << _pre << '\t' - << _obj << endl; + //_six_tuples_fout << _sub_id << '\t' + //<< _pre_id << '\t' + //<< _obj_id << '\t' + //<< _sub << '\t' + //<< _pre << '\t' + //<< _obj << endl; } #endif + + //TODO:the memory cost maybe too larger if combine teh below process here + //we can do below after this function or after all B+trees are built and closed + //and we can decide the length of signature according to entity num then + //1. after all b+trees: empty id_tuples and only open id2string, reload rdf file and encode(using string for entity/literal) + // + //2. after this function or after all B+trees: close others and only use id_tuples to encode(no need to read file again, which is too costly) + //not encoded with string but all IDs(not using encode for string regex matching, then this is ok!) 
+ //Because we encode with ID, then Signature has to be changed(and dynamic sig length) + //use same encode strategy for entity/literal/predicate, and adjust the rate of the 3 parts according to real case + //What is more, if the system memory is enough(precisely, the memory you want to assign to gstore - to vstree/entity_sig_array), + //we can also set the sig length larger(which should be included in config file) + //_entity_bitset is used up, double the space - if (this->entity_num >= entitybitset_max) - { - //cout<<"to double entity bitset num"<entity_num >= entitybitset_max) + //{ + ////cout<<"to double entity bitset num"<reset(); - } + //int tmp = entitybitset_max * 2; + //for (int i = entitybitset_max; i < tmp; i++) + //{ + //_entity_bitset[i] = new EntityBitSet(); + //_entity_bitset[i]->reset(); + //} - entitybitset_max = tmp; - } + //entitybitset_max = tmp; + //} - { - _tmp_bitset.reset(); - Signature::encodePredicate2Entity(_pre_id, _tmp_bitset, Util::EDGE_OUT); - Signature::encodeStr2Entity(_obj.c_str(), _tmp_bitset); - *_entity_bitset[_sub_id] |= _tmp_bitset; - } + //{ + //_tmp_bitset.reset(); + //Signature::encodePredicate2Entity(_pre_id, _tmp_bitset, Util::EDGE_OUT); + //Signature::encodeStr2Entity(_obj.c_str(), _tmp_bitset); + //*_entity_bitset[_sub_id] |= _tmp_bitset; + //} - if (triple_array[i].isObjEntity()) - { - _tmp_bitset.reset(); - Signature::encodePredicate2Entity(_pre_id, _tmp_bitset, Util::EDGE_IN); - Signature::encodeStr2Entity(_sub.c_str(), _tmp_bitset); - //cout<<"objid: "<<_obj_id < end while(true)"); + cout<<"==> end while(true)"<getSignatureBFile(); - FILE* _sig_fp = fopen(_sig_binary_file.c_str(), "wb"); - if (_sig_fp == NULL) { - cout << "Failed to open : " << _sig_binary_file << endl; - } + //for (int i = 0; i < entitybitset_max; i++) + //{ + //delete _entity_bitset[i]; + //} + //delete[] _entity_bitset; - //EntityBitSet _all_bitset; - for (int i = 0; i < this->entity_num; i++) - { - SigEntry* _sig = new 
SigEntry(EntitySig(*_entity_bitset[i]), i); - fwrite(_sig, sizeof(SigEntry), 1, _sig_fp); - //_all_bitset |= *_entity_bitset[i]; - delete _sig; - } - fclose(_sig_fp); - - for (int i = 0; i < entitybitset_max; i++) - { - delete _entity_bitset[i]; - } - delete[] _entity_bitset; - } - - { - stringstream _ss; - _ss << "finish sub2id pre2id obj2id" << endl; - _ss << "tripleNum is " << this->triples_num << endl; - _ss << "entityNum is " << this->entity_num << endl; - _ss << "preNum is " << this->pre_num << endl; - _ss << "literalNum is " << this->literal_num << endl; - Util::logging(_ss.str()); - cout << _ss.str() << endl; - } + cout << "finish sub2id pre2id obj2id" << endl; + cout << "tripleNum is " << this->triples_num << endl; + cout << "entityNum is " << this->entity_num << endl; + cout << "preNum is " << this->pre_num << endl; + cout << "literalNum is " << this->literal_num << endl; + + //{ + //stringstream _ss; + //_ss << "finish sub2id pre2id obj2id" << endl; + //_ss << "tripleNum is " << this->triples_num << endl; + //_ss << "entityNum is " << this->entity_num << endl; + //_ss << "preNum is " << this->pre_num << endl; + //_ss << "literalNum is " << this->literal_num << endl; + //Util::logging(_ss.str()); + //cout << _ss.str() << endl; + //} return true; } @@ -1498,7 +1748,7 @@ Database::insertTriple(const TripleWithObjType& _triple, vector* _vertices, //if new entity then insert it, else update it. if (_is_new_sub) { - cout<<"to insert: "<<_sub_id<<" "<kvstore->getEntityByID(_sub_id)<kvstore->getEntityByID(_sub_id)<vstree)->insertEntry(_sig); } @@ -1685,6 +1935,7 @@ Database::removeTriple(const TripleWithObjType& _triple, vector* _vertices, bool Database::insert(std::string _rdf_file) { + //cout<<"to load in insert"<load(); if (!flag) { @@ -1849,6 +2100,11 @@ Database::remove(std::string _rdf_file) cout << "remove rdf triples done." 
<< endl; cout<<"removed triples num: "<vstree->isEmpty()) + { + this->resetIDinfo(); + } + return true; } @@ -2645,6 +2901,13 @@ Database::remove(const TripleWithObjType* _triples, int _triple_num) this->stringindex->disable(vertices, true); this->stringindex->disable(predicates, false); + //BETTER+TODO:this will require us to lock all when remove process not ends(in multiple threads cases) + //An considerable idea is to check if empty after every triple removed + if(this->vstree->isEmpty()) + { + this->resetIDinfo(); + } + return valid_num; } diff --git a/Database/Database.h b/Database/Database.h index 6408226..b992de5 100644 --- a/Database/Database.h +++ b/Database/Database.h @@ -72,6 +72,7 @@ public: private: string name; + string store_path; bool is_active; int triples_num; int entity_num; @@ -81,6 +82,8 @@ private: int encode_mode; + bool if_loaded; + VSTree* vstree; KVstore* kvstore; StringIndex* stringindex; @@ -107,6 +110,7 @@ private: unsigned literal_buffer_size; void setStringBuffer(); void warmUp(); + //BETTER+TODO:add a predicate buffer for ?p query //triple num per group for insert/delete //can not be too high, otherwise the heap will over @@ -170,6 +174,9 @@ private: //* 4. build: objID2subIDlist, 2subIDlist objID2list //encodeRDF_new invoke new rdfParser to solve task 1 & 2 in one time scan. bool encodeRDF_new(const string _rdf_file); + void build_s2xx(int**); + void build_o2xx(int**); + void build_p2xx(int**); //insert and delete, notice that modify is not needed here //we can read from file or use sparql syntax diff --git a/Database/Join.cpp b/Database/Join.cpp index 7073e9a..aa0e675 100644 --- a/Database/Join.cpp +++ b/Database/Join.cpp @@ -951,9 +951,15 @@ Join::update_answer_list(IDList*& valid_ans_list, IDList& _can_list, int* id_lis } } -//TODO:consider two directions according to table1 size and table2 size +//NOTICE: consider two directions according to table1 size and table2 size //1. 
-> add ID mapping record for the first linking column, whole(offset, size) zengli //2. <- join using inverted index for each column, offset and size for each column, hulin +//However, the result is that this case is rare, and not really better +// +//NOTICE: you may think that when joining to enlarge the current table, there maybe exist many duplicates in a column, +//which causes too many redunt linking operations. +//However, the case is really rare in our test(the reason may be that the web graph is always very sparse) +//If we add a buffer for this case, will cause worse performance bool Join::join_two(vector< vector >& _edges, IDList& _can_list, int _can_list_size, int _id, bool _is_literal) { @@ -1100,6 +1106,8 @@ Join::join_two(vector< vector >& _edges, IDList& _can_list, int _can_list_s } if(exist_constant_pre) { + //NOTICE: this means there exists constant pre in parallel edges, so update_answer_list has already been used + //in this case, later we needn't do s2o_pre_var or o2s_pre_var because sp2o and op2s is more precise continue; } //all pres are variable, so use s2o or o2s to add diff --git a/KVstore/ISTree/ISTree.cpp b/KVstore/ISTree/ISTree.cpp index 18202bb..5a23fcb 100644 --- a/KVstore/ISTree/ISTree.cpp +++ b/KVstore/ISTree/ISTree.cpp @@ -654,4 +654,4 @@ ISTree::print(string s) } else; #endif -} \ No newline at end of file +} diff --git a/KVstore/ISTree/storage/ISStorage.cpp b/KVstore/ISTree/storage/ISStorage.cpp index 00932c0..4f6e223 100644 --- a/KVstore/ISTree/storage/ISStorage.cpp +++ b/KVstore/ISTree/storage/ISStorage.cpp @@ -405,6 +405,8 @@ ISStorage::writeNode(ISNode* _np) t = 0; fwrite(&t, sizeof(unsigned), 1, treefp); //the end-block //_np->setFlag(_np->getFlag() & ~Node::NF_ID); + //NOTICE:we may store the dirty bit into the tree file, but that is ok + //Each time we read the tree file to construct a node, we always set the drity bit to 0 _np->delDirty(); return true; } @@ -573,7 +575,7 @@ ISStorage::updateHeap(ISNode* _np, unsigned _rank, 
bool _inheap) const } } -void +bool ISStorage::request(long long _needmem) //aligned to byte { //NOTICE: <0 means release //cout<<"freemem: "<freemem<<" needmem: "<<_needmem<handler(_needmem - freemem)) //disaster in buffer memory { print(string("error in request: out of buffer-mem, now to exit")); - exit(1); + //exit(1); + return false; } this->freemem -= _needmem; + return true; } bool @@ -670,4 +674,4 @@ ISStorage::print(string s) fputs(s.c_str(), Util::debug_kvstore); fputs("\n", Util::debug_kvstore); #endif -} \ No newline at end of file +} diff --git a/KVstore/ISTree/storage/ISStorage.h b/KVstore/ISTree/storage/ISStorage.h index 93dd44c..f59c7ac 100644 --- a/KVstore/ISTree/storage/ISStorage.h +++ b/KVstore/ISTree/storage/ISStorage.h @@ -62,11 +62,11 @@ public: bool writeBstr(const Bstr* _bp, unsigned* _curnum, bool& _SpecialBlock); bool writeTree(ISNode* _np); void updateHeap(ISNode* _np, unsigned _rank, bool _inheap) const; - void request(long long _needmem); //deal with memory request + bool request(long long _needmem); //deal with memory request bool handler(unsigned long long _needmem); //swap some nodes out //bool update(); //update InMem Node's rank, with clock ~ISStorage(); void print(std::string s); //DEBUG }; -#endif \ No newline at end of file +#endif diff --git a/KVstore/KVstore.cpp b/KVstore/KVstore.cpp index bbaa377..1928799 100644 --- a/KVstore/KVstore.cpp +++ b/KVstore/KVstore.cpp @@ -1146,7 +1146,7 @@ bool KVstore::close_subID2values() { bool KVstore::build_subID2values(int** _p_id_tuples, int _triples_num) { cout << "Begin building subID2values..." << endl; - qsort(_p_id_tuples, _triples_num, sizeof(int*), KVstore::_spo_cmp); + //qsort(_p_id_tuples, _triples_num, sizeof(int*), Util::_spo_cmp); vector _oidlist_s; vector _pidoffsetlist_s; int _entity_num = 0; @@ -1363,7 +1363,7 @@ bool KVstore::close_objID2values() { bool KVstore::build_objID2values(int** _p_id_tuples, int _triples_num) { cout << "Begin building objID2values..." 
<< endl; - qsort(_p_id_tuples, _triples_num, sizeof(int*), KVstore::_ops_cmp); + //qsort(_p_id_tuples, _triples_num, sizeof(int*), Util::_ops_cmp); vector _sidlist_o; vector _pidoffsetlist_o; @@ -1552,7 +1552,7 @@ bool KVstore::close_preID2values() { bool KVstore::build_preID2values(int** _p_id_tuples, int _triples_num) { cout << "Begin building preID2values..." << endl; - qsort(_p_id_tuples, _triples_num, sizeof(int*), KVstore::_pso_cmp); + //qsort(_p_id_tuples, _triples_num, sizeof(int*), Util::_pso_cmp); vector _sidlist_p; vector _oidlist_p; @@ -1734,80 +1734,6 @@ bool KVstore::getpreIDlistBysubIDobjID(int _subid, int _objid, int*& _preidlist, return true; } -int KVstore::_spo_cmp(const void* _a, const void* _b) { - int** _p_a = (int**)_a; - int** _p_b = (int**)_b; - - int _sub_id_a = (*_p_a)[0]; - int _sub_id_b = (*_p_b)[0]; - if (_sub_id_a != _sub_id_b) { - return _sub_id_a - _sub_id_b; - } - - int _pre_id_a = (*_p_a)[1]; - int _pre_id_b = (*_p_b)[1]; - if (_pre_id_a != _pre_id_b) { - return _pre_id_a - _pre_id_b; - } - - int _obj_id_a = (*_p_a)[2]; - int _obj_id_b = (*_p_b)[2]; - if (_obj_id_a != _obj_id_b) { - return _obj_id_a - _obj_id_b; - } - - return 0; -} - -int KVstore::_ops_cmp(const void* _a, const void* _b) { - int** _p_a = (int**)_a; - int** _p_b = (int**)_b; - - int _obj_id_a = (*_p_a)[2]; - int _obj_id_b = (*_p_b)[2]; - if (_obj_id_a != _obj_id_b) { - return _obj_id_a - _obj_id_b; - } - - int _pre_id_a = (*_p_a)[1]; - int _pre_id_b = (*_p_b)[1]; - if (_pre_id_a != _pre_id_b) { - return _pre_id_a - _pre_id_b; - } - - int _sub_id_a = (*_p_a)[0]; - int _sub_id_b = (*_p_b)[0]; - if (_sub_id_a != _sub_id_b) { - return _sub_id_a - _sub_id_b; - } - - return 0; -} - -int KVstore::_pso_cmp(const void* _a, const void* _b) { - int** _p_a = (int**)_a; - int** _p_b = (int**)_b; - - int _pre_id_a = (*_p_a)[1]; - int _pre_id_b = (*_p_b)[1]; - if (_pre_id_a != _pre_id_b) { - return _pre_id_a - _pre_id_b; - } - - int _sub_id_a = (*_p_a)[0]; - int _sub_id_b = 
(*_p_b)[0]; - if (_sub_id_a != _sub_id_b) { - return _sub_id_a - _sub_id_b; - } - - int _obj_id_a = (*_p_a)[2]; - int _obj_id_b = (*_p_b)[2]; - if (_obj_id_a != _obj_id_b) { - return _obj_id_a - _obj_id_b; - } - - return 0; -} bool KVstore::open(SITree*& _p_btree, string _tree_name, int _mode, unsigned long long _buffer_size) { if (_p_btree != NULL) { diff --git a/KVstore/KVstore.h b/KVstore/KVstore.h index 27399b8..dfdfd77 100644 --- a/KVstore/KVstore.h +++ b/KVstore/KVstore.h @@ -133,12 +133,6 @@ public: //for so2p bool getpreIDlistBysubIDobjID(int _subID, int _objID, int*& _preidlist, int& _list_len, bool _no_duplicate = false) const; - //=============================================================================== - //sort functions for qsort - - static int _spo_cmp(const void* _a, const void* _b); - static int _ops_cmp(const void* _a, const void* _b); - static int _pso_cmp(const void* _a, const void* _b); private: std::string store_path; @@ -210,4 +204,4 @@ private: static bool isEntity(int id); }; -#endif //_KVSTORE_KVSTORE_H \ No newline at end of file +#endif //_KVSTORE_KVSTORE_H diff --git a/KVstore/SITree/storage/SIStorage.cpp b/KVstore/SITree/storage/SIStorage.cpp index df062c3..39022c0 100644 --- a/KVstore/SITree/storage/SIStorage.cpp +++ b/KVstore/SITree/storage/SIStorage.cpp @@ -42,6 +42,8 @@ SIStorage::SIStorage(string& _filepath, string& _mode, unsigned* _height, unsign } this->treeheight = _height; //originally set to 0 this->max_buffer_size = _buffer_size; + //cout<<"buffer size: "<max_buffer_size<heap_size = this->max_buffer_size / SINode::INTL_SIZE; this->freemem = this->max_buffer_size; this->freelist = new BlockInfo; //null-head @@ -573,16 +575,18 @@ SIStorage::updateHeap(SINode* _np, unsigned _rank, bool _inheap) const } } -void +bool SIStorage::request(long long _needmem) //aligned to byte { //NOTICE: <0 means release if (_needmem > 0 && this->freemem < (unsigned long long)_needmem) if (!this->handler(_needmem - freemem)) //disaster in 
buffer memory { print(string("error in request: out of buffer-mem, now to exit")); - exit(1); + //exit(1); + return false;; } this->freemem -= _needmem; + return true; } bool @@ -652,4 +656,4 @@ SIStorage::print(string s) fputs(s.c_str(), Util::debug_kvstore); fputs("\n", Util::debug_kvstore); #endif -} \ No newline at end of file +} diff --git a/KVstore/SITree/storage/SIStorage.h b/KVstore/SITree/storage/SIStorage.h index 1f4a6b5..3b454ba 100644 --- a/KVstore/SITree/storage/SIStorage.h +++ b/KVstore/SITree/storage/SIStorage.h @@ -62,11 +62,11 @@ public: bool writeBstr(const Bstr* _bp, unsigned* _curnum, bool& _SpecialBlock); bool writeTree(SINode* _np); void updateHeap(SINode* _np, unsigned _rank, bool _inheap) const; - void request(long long _needmem); //deal with memory request + bool request(long long _needmem); //deal with memory request bool handler(unsigned long long _needmem); //swap some nodes out //bool update(); //update InMem Node's rank, with clock ~SIStorage(); void print(std::string s); //DEBUG }; -#endif \ No newline at end of file +#endif diff --git a/Main/gadd.cpp b/Main/gadd.cpp index 48fd4ab..498f86d 100644 --- a/Main/gadd.cpp +++ b/Main/gadd.cpp @@ -16,6 +16,14 @@ main(int argc, char * argv[]) cout << "insert file:" << argv[2] << endl; string db_folder = string(argv[1]); + int len = db_folder.length(); + + if(db_folder.substr(len-3, 3) == ".db") + { + cout<<"your database can not end with .db"<& args) //native mode if (flag) { - string database = Util::getItemsFromDir(Util::db_home); + string database = Util::getItemsFromDir(db_home); if (database.empty()) { database = "No databases."; } @@ -870,6 +874,14 @@ int build_handler(const vector& args) { cout << "Import dataset to build database..." 
<< endl; cout << "DB_store: " << database << "\tRDF_data: " << dataset << endl; + int len = database.length(); + + if(database.substr(len-3, 3) == ".db") + { + cerr<<"your database can not end with .db"<build(dataset); delete current_database; diff --git a/Main/gquery.cpp b/Main/gquery.cpp index acc65b4..7121b40 100644 --- a/Main/gquery.cpp +++ b/Main/gquery.cpp @@ -60,6 +60,13 @@ main(int argc, char * argv[]) } string db_folder = string(argv[1]); + int len = db_folder.length(); + if(db_folder.substr(len-3, 3) == ".db") + { + cout<<"your database can not end with .db"<=0) +--- +将B+tree中叶节点的大的value分离出来,新建一套缓存,使用block机制,标记length为0表示未读取 +类型bstr的length问题也需要解决 +如果把类型直接改成long long,空间开销一下子就上升了一倍 +解决方法:对于ID2string,仍然用char*和unsigned,但对于s2xx p2xx o2xx,应该用unsigned long long*和unsigned来表示,这样最高可支持到40亿triple +--- +那么是否可以调整entity与literal的分界线,如果entity数目一般都比literal数目多的话 +直接把literal从大到小编号,可在ID模块中指定顺序,这样每个Datbase模块应该有自己独特的分界线,其他模块用时也需要注意 +编号完成之后可以取剩下的编号的中间值作为边界,这样entity跟literal都有一定的增长空间 + + +加锁后就不必因为交换而调堆,写的时候要锁住整棵树,因为得从root节点锁住(整颗树用一把读写锁) +可以考虑不使用锁,而是在node中用一位来标志是否被锁住,锁住则不能交换出去或写入,但可以继续查看内容 +在操作缓存数组的时候,也要考虑多线程冲突问题 +--- +B+树中的heap策略是否真的高效?? 交换时与其update heap权值,不如直接加锁,或者如果是堆顶元素不应换出,可以先移除后插入。update heap中查找太耗时 +kvstore的heap可以通过沉底的方法保证多线程一致么? 
+ +kvstore变慢:s2o这种索引的性能不关键,关键在于压缩后sp2o和op2s的性能查了不少!是否应二分查找pre,好像s2po中不同的pre并不多,避免线性查找 +但根据排查,sp2o不是慢在得到po list后找p的过程,而是慢在读叶节点的过程,按理说这是应该比找sp要快的 +也可以考虑为B+树节点设计两套block,因为大block跳数少,小block更能有效利用空间。 +在同一个文件中很难处理两套block,但可以用两套文件,分别对应一套block,设计高位来支持逻辑地址,判断属于哪个文件 + +缓存是先申请先得到足够的,设置安全上限,如总共100G,已用不能超过90G,因为还有很多零散的内存是没有统计的 +--- +缓存管理模块,负责主要模块的缓存申请和释放,包括各个数据库等,调度各棵树的内存,不负责树内部的管理 +线程管理模块和缓存管理模块都应该掌控全局 +全局只写内存(除非内存放不下而发生内外存交换),空闲时或者定期刷到磁盘,但如果为了可恢复性就要记log,这是很耗时的。 +BufferManager和ThreadManager在最底层,新建GstoreApplication统领全局,无论是main里面的应用还是server应用,都调用GA里的函数进行处理 +NOTICE: BufferManager只支持大体的管理,最好是预申请一大片缓存或者总体申请报销,零零碎碎的new/malloc不用管,避免反复调用开销太大 +GA里面可以管理多个数据库,第一件事就是启动GA,读取init.conf进行配置,包含当前数据库列表的set,新建时已存在则提示,加载时不存在则报错 +注意保持API不变 +每个数据库在一个时刻只能被一个GA使用,因为若是两个GA同时使用,内存独立,会有各种预料不到的问题。一般使用都是CS模式,所以一个GstoreApplication就足够了。 +通过加锁,数据库被GA使用时则加锁,设置标记,不等待,避免server导入数据库后又用gbuild重建数据库 +--- +树索引的内外存可以和读写共用读写锁,即要换出外存或写入时都要获得读写锁中的写锁 +(因为更新操作不多,所以锁等待时间不必过于在意,但树节点过多导致的锁开销可能很大) +(树从上到下获得锁,走严格的树协议,原来的storage堆替换策略也可以继续用,不过要加上锁的保护) +一般来说,保证弱一致性就可以了。 + +开发专门对kvstore的性能测试工具,包括增删查等,默认是用dbpedia170M数据集,需要修改makefile和test/kvstore_test.cpp +join缓存 vstree交换(先把capacity设小) +成组插删还没完全支持和测试,需要修改KVstore.cpp + +hulin添加join缓存,这样的话两表拼接就不适合多线程并行,本身中间表那种添加方式就很难并行化(要改表要加锁,不好) +后面实现全新的join,两表采用索引拼接的方式,就可用多线程并行,但递归调整得在此轮的所有两表拼接完成后(后面再考虑调整时如何并行) +索引拼接的坏处:当重复ID很少时空间开销可能比中间表还大;拼接完需要递归更新直到全局收敛,次数是中间表的行数乘上列数;编程复杂度高。 +实际上大部分图都是稀疏图,基本不会出现重复ID,所以还是用中间表的方式比较好(中间表上的处理也已经非常完善) +join_two里面可以分块并行,但不能用太多线程,在末尾添加新记录时要加锁 + + +gstore后续需要开发的地方: +事务操作:最小粒度是一个sparql/BGP一个事务,要支持workload(多个查询)为一个事务 最终一致性加锁就行了,顺序一致性则要考虑各种先后关系回滚等等,多版本两阶段锁协议 +多领域多库解决方案。 + +任务分配: +--- +数据库连接池 保持连接而不是每次都用socket(http?,用boost库) +分页查询(先将整个查询结果缓存,需要考虑内外存交换) +陈佳棋的任务:s和p对应同一个实体,应该先重命名,再过滤。还有一种情况是两者只是名字相同,实则并无关系 +高阶谓词逻辑 +--- +陈佳棋找人负责: +模仿海量图大作业,基于gStore开发一个社交应用,要求可以批量导入且实时查询 +查询级别的缓存(测试时可将查询复制后再随机打乱) +多查询优化 +--- +王力博: +安全备份 数据库的多版本备份,动态删除 +gserver for multiple users, not load db for each connection, but get the same db pointer +assign the pointer to the user who require this db, and 
ensure that each db is just kept once in memory +有什么办法去检测一个db是否存在呢?(首先要支持导入多个数据库) +如果不存在 就新建一个 再进行查询 如果存在 就直接进行查询 +gserver-gclient if gclient quit(without unload), then restart, there maybe exist too many gserver process(so 3305 is occupied) +or the reason maybe gserver still dealing with the previous job, then a new connection and a new job comes +如何避免整个server崩溃或卡死,无论单个查询/建立等操作遇到任何问题 +--- +胡琳: +彭鹏师兄的数据集bug +优化谓词查询,谓词少而entity/literal多,所以先过滤得到谓词的解是一种可以考虑的策略 +以谓词为节点,以s/o为信息,来过滤得到谓词的结果 +需要一个查询计划进行选择,可能有些?p应该先做,有些?s/?o应该先做 +--- +陈语嫣: +网站设计以及和外包方的联系 +之后用pthread将join_two函数内部的拼接并行化,先实现一个最基本的版本即可 +--- +张雨的任务:单起点单终点的正则表达式路径问题,如果是多起点多终点? +--- +WARN:B+树删除时,向旁边兄弟借或者合并,要注意兄弟的定义,应该是同一父节点才对! +考虑使用 sigmod2016 那篇图同构的论文方法,实现一套join +但那个是基于路径的,BFS和索引连接的思想值得借用,可作为另外一套join方法 +@hulin +叶子节点是否也可以先过滤先join,或者说非核心节点,sparql查询图中算度数时是否不要考虑常量? +新的方法:基于谓词的频率动态调整顺序,但中间结果的保留是另一个问题,是用中间表还是索引连接,是否需要multiJoin? +另外超级点和超级边的概念也需要被提出 +--- +李荆的任务: +考虑出入度数,编码为1的个数?应该不用,在编码的邻居信息中能够得到体现。 +第二步编码应该更长,点和边对应着放在一起。按出入边分区,一步点,二步边分区和二步点。 +对一个节点保留其最长链信息没啥用,因为数据图基本是连通的,最长链就是图的最长链。 +多步编码不应分开,而应在编码逐一扩展,第二步可以依旧保留详细信息,最好用更长编码,因为信息更多。 +可能有相同的谓词对应多个邻居,同样的谓词只要保留一个即可,不同邻居可以重合。 +第三步可以只记谓词,第四步可以只记边的出入向。 +vstree并行过滤 +--- +实现其他的join思路,比如基于过滤效果 + +如何在preFilter和join的开销之间做平衡 +preFilter中的限制条件是否过于严格 + +寻找查询图的特征,分类做查询计划: +先对于每个查询,确定各部分的开销比例 + +fix the full_test:how to sum the db size right? +for virtuoso, better to reset each time and the given configure file(need to reserver the temp logs) + +load过程先导入满足内存的内容,或者先来几轮搜索但不输出结果,避免开头的查询要读磁盘。vstree直接全导入内存? +先完成合并测试,再测lubm500M和bsbm500M -- 90 server + +jemalloc?? + +各版本对比的表格中应加一列几何平均数,现实中大多数查询是简单查询,最好还有一个平均数,对应着把数据做归一化后求和 +dbpedia q6现在应该统计并对比结果了,包含在测试结果中 + +在改善kvstore后,build过程明显加快了很多,现在vstree的建立时间成了瓶颈 +preFilter中不仅要看谓词,也要看表大小以及度数,有些节点最好过滤!!! + +允许同时使用多个数据库,查询时指明数据库,这样的话可以支持from,但各个数据库是相互独立的; +添加scala API; +git管理开发; + +从下往上建立B+树和vstree树 +加快更新操作,如insert和delete等,是否考虑增加修改的源操作 +删除时数据全部删光会出大问题,保留空树?或者插入时考虑为空树的情况? 
+--- +更新量太多可直接重建索引,少量更新可写到overflow page中,在系统闲置时合并 +比如维护一个insert和delete记录索引,每次需要查多个,判断关系 +同一triple在同一索引中最多出现一次:原来已有/没有,先删后插,先插后删时等价的? +若原来已有,插入应该被放弃?若原来没有,删除应该被放弃? + +常量triple怎么办,有些点比较重要必须精确过滤,不仅仅看谓词还要看度数等等 +--- +评估函数用机器学习的方式?学习参数?学习模型? +但机器学习对新问题没有一个基本的界,不像数学化的评估函数,对任何情况都能保证一个上界 +(目前数据库里面很少有人利用机器学习来估价) + +URI是唯一的,RDF图中不存在相同标签的节点,无法合并相似节点,也没法利用自同构的预处理加快查询 +DBpedia最新的数据集,原来的相对太小了 +count函数的优化:深搜计数即可,不必生成中间表 +B+树每个节点内部添加索引,对keys分段? + + +是否可以考虑把literal也作为vstree中的节点,加快过滤? +join过程中生成literal可能开销大,且可以考虑用周围边对literal进行过滤。。。 +但sp2o生成的literal链应该不会很长? + +join buffer:use array instead of map, an ID can be in several columns, id2string maybe error(different pre, different list)! +use Bstr[2^16], only for entity, also as buffers(after intersecting with candidates) +if in id range, then use, otherwise generate(not update?) (when merging with hulin, discuss if exists) +Method:scan all linking point from left to right to find a node without literals(if not?), and buffer for this column +scan the column and get the maxID and minID, set minID as offset and only need to build a maxID-minID array +(the interval smaller, the better, how about using hash-conflict-list here) + +set different buffer size for different trees in build/query, for example, sp2o and op2s with 16G, p2so with 16G +set string2id and id2string as 1-2G + +测试时最好也统计出最长的string,和string的总大小 + +--- + +方正智汇:调用gStore做后台数据库,需要定义范式,比如领域、模型等等 +递归删除的效率问题:比如删除一整个模型(没有关系数据库删除整个表快) +可以考虑加一个域的约束,根据这种边来全部删除 +domain可用前缀表表达,但x可属于多个domain + +批量插入批量修改的规模很大 +需要支持修改谓词,不会出现p p o的情况 +加谓词比较好加 +删类的谓词,那么实例有该谓词的都得删 + +模型/领域 每个entity一个领域id? 只给class分配id?只给最上层class分配id?(这样查询太复杂,效率低) +划分多个数据库,同样考虑模式,如果领域和领域之间没有交集 +如果领域之间不完全独立,不能直接划分为多个数据库!如果划分需要考虑连接,划分后可直接用于分布式数据库应用 + +--- + +bsbm300M q8.sql - new gstore worser +it is because this query is linear, which means the dfs join order not optimized, and the time spent in preFilter maybe not change +maybe using non-dfs order for join? 
+we need to compare the time of each part in q8.sql + +用vf2来对比?借鉴它的思路?不过vf2做的是标准的子图同构,而非子图包含,需要修改 +vf2与gStore相互借鉴,另一种过滤方式? +客观来说,多边时vstree还是比较好的 + +对B+树加锁,可以直接更新节点的rank值,使其rank值最大(最高位或次高位),解锁时清除相应位 +(不过这样会引入比较大的开销,因为内存堆的修改过程是比较耗时的,需要线性的查找时间) +实际上B+树应侧重于读的性能而非读写的平衡,比如不用块结构,直接把一个节点存成一片。 +也可以使得每个节点固定大小,顺序存储,所有value存成记录文件,只增不减。 +query过程几乎没有修改,需要free的block几乎没有,可以直接存free块号和当前最大块号, +而无需存所有块的使用情况。也是由于这个原因,树的查询中几乎不对堆做修改,避开了耗时的堆指针查询操作。 + +修复:需要返回pre结果的谓词查询,首先要知道哪些谓词位于select中,这需要扫描查询 + +change to BestJoin using index lists and super edges: +the memory cost of join will be cut down, so we can cache more for querying! +define different cache size for build and query in makefile with -D +?how about BFS instead of DFS? + +需要一个整体的并行模块: +并行的线程总数要根据cpu的线程总数来设置,不能超过太多 +三种粒度,多个sparql query的并行,一个sparql内部多个basic query的并行,一个basic query内部的join流水线,两表拼接时的分块并行 +并行时对kvstore部分的影响(vstree因为基本是全内存,所以没啥影响),换入换出问题,需要加锁(禁止换出) +树协议加锁? +多query并行时join的内存开销可能超标,需要在vector申请失败时抛出异常,其他地方捕捉异常处理(比如等待内存空闲) + +关注jena的更新,以及其是否修复bug +每个模块的时间和提升 +gStore性能表格,前后对比 + +to support 10 billion in a single machine, the memory should > 100G +change Bstr length to long long type +(also update anywhere related to length or strlen, such as Signature::encodeStr2Entity) +change entity/literal ID type to unsigned ++++++++++++++++++++++ +新建一个ID管理模块,要求高效稳定,放在Util中 +在storage, Database, VStree多个地方用到 +better move StringIndex to KVstore +In unit test, insert/delete should be tested to improve coverage!!! 
+too many entities for lubm500M, so it is very slow + + +应统计并降低IO开销(主要是kvstore),尽量顺序连续、预读取 +比如加载时除了索引结构外,可以先读入一批节点 +另外如果可以把bstr和节点分离,确保每个节点都是相同大小(预先分配定量数组) +所有bstr串单独放到一个文件中,不删除不修改,只能附加 +大小超过一个阈值后,进行整理,移除无效的字段 +节点中只保留Bstr的文件偏移,用long long类型 +可以考虑将id2string直接使用倒排表实现,不用B+树(因为插删是很少的) +但string2id很麻烦,不能直接用倒排表计算 + +get 不能全部返回去重的,因为插删也需要调用get +除了join模块,其他只和查询相关的模块,最好也改为去重的 + +watdiv5000 exit unexpectedly -- p2so index +541740149 549240150 +entity 26060745 +literal 23964574 +pre 86 + +调试成组插删,提升效率 +目前存在的问题在于small_aa small_ab q3.sql +toStartJoin to add literals p2olist the olist is not right + +精简索引,如p2s和p2o可借用p2so来完成,同样得比如s2po和o2ps +而且sp2o普遍比较大,也可以用s2po加二分查找来完成 +最终只需要完成6+3棵kv树 +(很多问题,实现s2o要排序,代价可能非常高) +(可以考虑共用s2索引,同时存下o和po作为value,一个key对应两个Bstr) + +implement so2p with s2p and o2p may cause error +s p o1 +s2 p o + +test on real data of applications is needed +when case is ?s p1 o ?o p2 o +this will be viewed as two basic queries and answered separately +is this good? or if considered as one BGP, how can we do 2-hop encoding on it? 
+ +join(preFilter根据p的num数量)和stream中的问题(判断失误或内存泄露) +将b树返回num改为直接用已有的实现(应该在底层实现么,如何保证效率) +在Query中建立p2num和sp2num等对应,避免反复搜索。 +(o2p o2ps o2s是否应该划分为entity/literal来加速, id2string是否用倒排表) + +考虑用内联汇编在大循环里做优化(测试对比效率提升) +最好把entity和literal彻底分开 + +Join::preFilter -- 某些时候先过滤,某些时候后过滤,视结果数目和边的过滤性能而定 +不用似乎也不会出错 +when using allFilterByPres, not always good, sometimes too costly!(dbpedia2014, self6.sql) +use "well-designed" in GeneralEvaluation to enable not all selected query; +三个瓶颈:getFinalResult copyToResult allPreFilter +join_basic的效率非常关键,是核心 + +另一种过滤:用sp2num预估数量,或者也预估vstree的结果数量,set triple dealed(以后避免重复处理) +可以考虑只在拼接时add literal(需要考虑常量约束,entity与literal分开考虑),最多提前加一个保证join启动 +在idlist中寻找entity和literal的临界点时,注意左小右大,不要单纯扫描 +是否最后再考虑卫星边的约束更好?(卫星边一定是单度的) +entity和literal没必要放在一起作交,另外交的顺序或许也很重要 +(每次应选最小的两个,或者多路merge的形式) + ++++++++++++++++++++++ +带标签的路径,而不是过滤加验证的方式 |图仿真避开无用的候选集和join +分析cas的结构和查询模式 +ask转换为常量三元组的判断 + +统计数据库大小时使用du -h还是ls -lR是个问题,但一般也不会相差太多 +gStore中的内存空洞问题,一方面是可能某个块没有写满,另一方面是可能预申请的块太多。 + +- - - + +# TEST + +signature.binary in db file are not removed in program, but it should be delete to save space + +单元测试: +http://www.cnblogs.com/linux-sir/archive/2012/08/25/2654557.html +http://www.ibm.com/developerworks/cn/linux/l-cn-cppunittest/ +https://my.oschina.net/vaero/blog/214893 + +watdiv5000在建立p2?索引时出错(p2so索引太大?) +jena在测试lubm5000的查询时,似乎答案有问题,如q0.sql + +若将signature编码长度由800扩大到1200+1000,build时间大幅上升,查询时间有些不变甚至加长,有些减小一半。 +for q6.txt in DBpedia, the latter signature length returns the time of 16366841ms, while teh size is the same as original, 457393467 +(but the size of q6.sql in jena is 17852675, which one is right?) + +建立日志系统方便调试? +出错时如何回滚,直接进行版本管理? +(每次插删前,将原来的索引做一个备份?) 
+测试时每个查询除了时间外,还要记录一下结果数,备用 +a whole test on dbpedia must be done before committed!!!(just check query answer size) +dbpedia q0.sql gstore比virtuoso慢,因为过滤花了很久,而且候选集很大,之后又要反复判断 +bsbm100000 self3.sql pre_filter too costly +3204145 for ?v0 1 for ?v1 total 3204145 +maybe we should start from ?v1 or using p2s or p2o(maybe p2so first, then pre_filter, then generate) +self5.sql 中也是preFilter时间太长 +可以考虑视情况进行preFilter,或者研究下开销为何有时那么大 +或许可以不用,或者仍然全用s2p或改判断条件,或者只过滤单度顶点的边约束 + +virtuoso: dbpedia2014 这个测试很慢,容易出问题,最好单列 +100441525 ms to load +and the size is 17672699904/1000-39846 KB +50ms for q0.sql +217ms for q1.sql +210ms for q2.sql +23797ms for q3.sql +5536ms for q4.sql +2736ms for q5.sql +9515231ms for q9.sql + +virtuoso: bsbm_10000 +40137ms to load ,the size is 635437056/1000-39846 KB + +#### using multi-join and stream, compared with jena +gstore performs worse than jena in these cases: +bsbm series: self1.sql, self3.sql, self8.sql (self4,5,6) +dbpedia series: q3.sql, q4.sql, q5.sql (q9.sql) +lubm series: q0.sql, q2.sql, q13.sql, q16.sql +watdiv series: C1.sql, F3.sql + +#### performance of vstree has great impact on join process, due to the candidate size +the build time is 3 orders...it seems not directly related with the length of signature +(so we can extend to better length!) + +#### BSBM: when query contains "^^"(self0.sql), gstore output nothing. and when the results contain "^^"(self3.sql, self8.sql), gstore will omit the thing linked by "^^". +(this causes mismatch because the other three dbms support this symbol. Virtuoso, however, can deal with queries containing "^^" + but will not output "^^..." in results) + +#### sesame does not support lubm(invalid IRI), and unable to deal with too large datasets like dbpedia2014, watdiv_300M, bsbm_100000... + +- - - + +# DEBUG + +error when use query "...."
> ans.txt in gconsole + +lubm5000 q1 q2 delete/insert/query +1003243 none after vstree + +dbpedia q6.sql + +build db error if triple num > 500M + +- - - + +# BETTER + +#### 在BasicQuery.cpp中的encodeBasicQuery函数中发现有pre_id==-1时就可以直接中止查询,返回空值! + +#### 将KVstore模块中在堆中寻找Node*的操作改为用treap实现(或多存指针避开搜索?) + +#### 无法查询谓词,因为VSTREE中只能过滤得到点的候选解,如果有对边的查询是否可以分离另加考虑(hard to join)。(或集成到vstree中, add 01/10 at the beginning to divide s/o and p. however, result is 11 after OR, predicates are not so many, so if jump into s/o branch, too costly. and ho wabout the information encoding?) + +- - - + +# DOCS: + +#### how about STL: +http://www.zhihu.com/question/38225973?sort=created +http://www.zhihu.com/question/20201972 +http://www.oschina.net/question/188977_58777 + +- - - + +# WARN + +重定义问题绝对不能忍受,现已全部解决(否则会影响其他库的使用,vim的quickfix也会一直显示) +(因为和QueryTree冲突,最终搁置) +类型不匹配问题也要注意,尽量不要有(SparqlLexer.c的多字节常量问题) +变量定义但未使用,对antlr3生成的解析部分可以不考虑(文件太大,自动生成,影响不大) +以后最好使用antlr最新版(支持C++的)来重新生成,基于面向对象,防止与Linux库中的定义冲突! +(目前是在重定义项前加前缀) + +- - - + +# KEEP + +大量循环内的语句必须尽可能地优化!!! + +- gStore also use subgraph homomorphism! + +- always use valgrind to test and improve + +- 此版本用于开发,最终需要将整个项目转到gStore仓库中用于发布。转移时先删除gStore中除.git和LICENSE外的所有文件,然后复制即可(不要复制LICENSE,因为版本不同;也不用复制NOTES.md,因为仅用于记录开发事项) + +- build git-page for gStore + +- 测试时应都在热启动的情况下比较才有意义!!!gStore应该开-O2优化,且注释掉-g以及代码中所有debug宏 + +- 新算法在冷启动时时间不理想,即便只是0轮join开销也很大,对比时采用热启动 + +- 如果select中某变量未出现在查询图中,应该对该变量返回空值还是直接认为该查询非法 + +- - - + +# ADVICE + +#### 数值型查询 实数域 [-bound, bound] 类型很难匹配,有必要单独编码么? 数据集中不应有范围 Query中编码过滤后还需验证 +x>a, x=, <=, a时不直接取字符串,而是转换为数值并编码 +难点:约束条件转换为析取范式, 同一变量的约束,不同变量间的约束 + +#### 如何用RDF表示社交网络,以及社交应用的并发问题 + +#### construct a paper, revisit gStore: join method, encoding way, query plan + +#### 阅读分析PG源代码 + +#### 单元测试保证正确性和效率,valgrind分析性能瓶颈与内存泄露,重复越多的部分越应该深入优化(甚至是汇编级别) + +#### we can try on watdiv_1000/watdiv_2000/watdiv_1000M/freebase/bio2rdf + +#### limited depth recursive encoding strategy: build using topological ordering, how about updates? 
(append neighbor vertices encodings) (efficience and correctness?) + +#### 目前的vstree按B+树形式实现,其实没有必要将一个节点的标签存两次。可以父节点放child子节点放entry,也可以在父节点存entry数组,根节点的entry单独提出。 +(这样需要从上到下考虑) + +#### 论文中的vs*tree结构其实更复杂,节点之间有很多边相连,每层做一次子图匹配,过滤效果应该会更好,之后join代价很可能也会更小。目前的vstree仿照B+树实现更简单,每次单独过滤出一个节点的候选集,之后再统一join,但效果差强人意。 + +#### 在index_join中考虑所有判断情况、一次多边、交集/过滤等等,multi_join不动 + +#### 在join时两个表时有太多策略和条件,对大的需要频繁使用的数据列可考虑建立BloomFilter进行过滤 + +#### not operate on the same db when connect to local server and native! + +#### VStree部分的内存和时间开销都很大,测试gstore时应打印/分析各部分的时间。打印编码实例用于分析和测试,如何划分或运算空间(异或最大或夹角最大,立方体,只取决于方向而非长度) + +#### full_test中捕捉stderr信息(重要信息如时间结果应该确保是标准输出?),结果第一行是变量名(有select *时应该和jena分列比) + +#### Can not operate on the same db when connect to local server and native! + +#### auto关键字和智能指针? + +#### 实现内存池来管理内存? + +#### join可以不按照树序,考虑评估每两个表的连接代价 +1. 用机器学习(对查询分类寻找最优,梯度下降,调参)评估深搜顺序的好坏 +2. 压缩字符串:整体控制是否启动(比如安装时),同时/不同时用于内存和硬盘。对单个string根据结构判断是否压缩?(一个标志位)关键词映射?string相关操作主要是比较,相关压缩算法必须有效且不能太复杂! +3. 实现对谓词的查询(再看论文) +4. 将查询里的常量加入变量集,否则可能不连通而无法查询,也可能影响join效率。如A->c, B->c,本来应该是通过c相连的子图才对,但目前的gstore无法识别。 + +#### 考虑使用并行优化:load过程能否并行?vstree过滤时可多个点并行进行,另外join时可用pipeline策略,每得到一个就传给后续去操作。 + +#### 写清楚每个人的贡献 + + +## SSD +how to set /home/ssd r/w/x for everybody +mkfs.ext4 -E stride=128,stripe-width=128 /dev/sdb +tune2fs -O ^has_journal /dev/sdb +modify /etc/fstab: +/dev/sdb /home/ssd ext4 noatime,commit=600,errors=remount-ro 0 1 +to check if valid: mount /home/ssd +ensure that IO scheduler is noop(just ssd) or deadline(ssd+disk), otherwise: +add in rc.local: echo deadline(noop,cfq) >(>>) /sys/block/sdb/queue/scheduler +(open trim if ssd and system permitted) + +## 考虑支持GPU (可作为企业版的扩展功能) + +## RAID + +## 学习Orient DB的方式,可同时支持无模式、全模式和混合模式 +ACID? neo4j GraphDB + +## 单个文件的gStore?嵌入式,轻便,类似sqlite,方便移植,做成库的方式给python等调用 + +## 联邦数据库,避免数据重导入,上层查询分块 + +## 没必要关闭IO缓冲同步,因为用的基本都是C语言的输入输出操作 + +是否可以考虑用vf2算法来作子图同构?比较效率,相互结合? 
+考虑按谓词频度划分,比如建立两棵sp2o树,两者的缓存大小应该不同 + +Consider the use of Bloom Filter and FM-sketches + +--- + +## DataSet + +http://www.hprd.org/download/ + diff --git a/Query/BasicQuery.cpp b/Query/BasicQuery.cpp index 44be04e..cbafaf5 100644 --- a/Query/BasicQuery.cpp +++ b/Query/BasicQuery.cpp @@ -30,14 +30,14 @@ BasicQuery::clear() for(int i = 0; i < BasicQuery::MAX_VAR_NUM; i ++) { - delete[] this->edge_sig[i]; + //delete[] this->edge_sig[i]; delete[] this->edge_id[i]; delete[] this->edge_nei_id[i]; delete[] this->edge_pre_id[i]; delete[] this->edge_type[i]; } - delete[] this->edge_sig; + //delete[] this->edge_sig; delete[] this->edge_id; delete[] this->edge_nei_id; delete[] this->edge_pre_id; @@ -47,7 +47,7 @@ BasicQuery::clear() this->var_sig = NULL; this->var_name = NULL; - this->edge_sig = NULL; + //this->edge_sig = NULL; this->edge_id = NULL; this->edge_nei_id = NULL; this->edge_pre_id = NULL; @@ -339,55 +339,66 @@ BasicQuery::setReady(int _var) } void -BasicQuery::updateSubSig(int _sub_id, int _pre_id, int _obj_id, string _obj,int _line_id) +BasicQuery::updateSubSig(int _sub_var_id, int _pre_id, int _obj_id, int _line_id, int _obj_var_id) { + cout<<"sub var id: "<<_sub_var_id<= 0) { - Signature::encodeStr2Entity(_obj.c_str(), this->var_sig[_sub_id]); + //Signature::encodeStr2Entity(_obj.c_str(), this->var_sig[_sub_id]); + Signature::encodeStr2Entity(this->var_sig[_sub_var_id], _obj_id, Util::EDGE_OUT); } if(_pre_id >= 0) { - Signature::encodePredicate2Entity(_pre_id, this->var_sig[_sub_id], Util::EDGE_OUT); + Signature::encodePredicate2Entity(this->var_sig[_sub_var_id], _pre_id, Util::EDGE_OUT); } // update var(sub)_degree & edge_id according to this triple - int sub_degree = this->var_degree[_sub_id]; + int sub_degree = this->var_degree[_sub_var_id]; // edge_id[var_id][i] : the ID of the i-th edge of the var - this->edge_id[_sub_id][sub_degree] = _line_id; - this->edge_nei_id[_sub_id][sub_degree] = _obj_id; - this->edge_type[_sub_id][sub_degree] = Util::EDGE_OUT; - 
this->edge_pre_id[_sub_id][sub_degree] = _pre_id; - this->var_degree[_sub_id] ++; + this->edge_id[_sub_var_id][sub_degree] = _line_id; + this->edge_nei_id[_sub_var_id][sub_degree] = _obj_var_id; + this->edge_type[_sub_var_id][sub_degree] = Util::EDGE_OUT; + this->edge_pre_id[_sub_var_id][sub_degree] = _pre_id; + this->var_degree[_sub_var_id] ++; } void -BasicQuery::updateObjSig(int _obj_id, int _pre_id, int _sub_id, string _sub,int _line_id) +BasicQuery::updateObjSig(int _obj_var_id, int _pre_id, int _sub_id, int _line_id, int _sub_var_id) { + cout<<"obj var id: "<<_obj_var_id<= 0) { - cout << "str2entity" << endl; - Signature::encodeStr2Entity(_sub.c_str(), this->var_sig[_obj_id]); + //cout << "str2entity" << endl; + Signature::encodeStr2Entity(this->var_sig[_obj_var_id], _sub_id, Util::EDGE_IN); } if(_pre_id >= 0) { - cout << "pre2entity" << endl; - Signature::encodePredicate2Entity(_pre_id, this->var_sig[_obj_id], Util::EDGE_IN); + //cout << "pre2entity" << endl; + Signature::encodePredicate2Entity(this->var_sig[_obj_var_id], _pre_id, Util::EDGE_IN); +#ifdef DEBUG + //if(_obj_var_id == 1) + //{ + //cout<<"yy: "<var_sig[1])<var_degree[_obj_id]; + int obj_degree = this->var_degree[_obj_var_id]; // edge_id[var_id][i] : the ID of the i-th edge of the var - this->edge_id[_obj_id][obj_degree] = _line_id; - this->edge_nei_id[_obj_id][obj_degree] = _sub_id; - this->edge_type[_obj_id][obj_degree] = Util::EDGE_IN; - this->edge_pre_id[_obj_id][obj_degree] = _pre_id; - this->var_degree[_obj_id] ++; + this->edge_id[_obj_var_id][obj_degree] = _line_id; + this->edge_nei_id[_obj_var_id][obj_degree] = _sub_var_id; + this->edge_type[_obj_var_id][obj_degree] = Util::EDGE_IN; + this->edge_pre_id[_obj_var_id][obj_degree] = _pre_id; + this->var_degree[_obj_var_id] ++; } // encode relative signature data of the query graph @@ -505,9 +516,9 @@ BasicQuery::encodeBasicQuery(KVstore* _p_kvstore, const vector& _query_v // -1 if not found, this means this query is invalid pre_id = 
_p_kvstore->getIDByPredicate(pre); { - stringstream _ss; - _ss << "pre2id: " << pre << "=>" << pre_id << endl; - Util::logging(_ss.str()); + //stringstream _ss; + //_ss << "pre2id: " << pre << "=>" << pre_id << endl; + //Util::logging(_ss.str()); } } if(pre_id == -1) @@ -516,66 +527,86 @@ BasicQuery::encodeBasicQuery(KVstore* _p_kvstore, const vector& _query_v return false; } - int sub_id = -1; - int obj_id = -1; + int sub_var_id = -1; + int obj_var_id = -1; // -1 if not found, this means this subject is a constant map::iterator _find_sub_itr = (this->var_str2id).find(sub); if(_find_sub_itr != this->var_str2id.end()) { - sub_id = _find_sub_itr->second; + sub_var_id = _find_sub_itr->second; } // -1 if not found, this means this object is a constant(string) map::iterator _find_obj_itr = (this->var_str2id).find(obj); if(_find_obj_itr != this->var_str2id.end()) { - obj_id = _find_obj_itr->second; + obj_var_id = _find_obj_itr->second; } // sub is either a var or a string - bool sub_is_var = (sub_id != -1); + bool sub_is_var = (sub_var_id != -1); if(sub_is_var) { - this->updateSubSig(sub_id, pre_id, obj_id, obj,i); + int obj_id = -1; + if(obj.at(0) != '?') + { + obj_id = _p_kvstore->getIDByEntity(obj); + if(obj_id == -1) + { + obj_id = _p_kvstore->getIDByLiteral(obj); + } + } + //cout<<"to update sub: "<updateSubSig(sub_var_id, pre_id, obj_id, i, obj_var_id); //debug { - stringstream _ss; - _ss << "updateSubSig:\tsub:" << sub_id << "; pre:" << pre_id << "; obj:" << obj_id; - _ss << "; [" << obj << "]"; - Util::logging(_ss.str()); + //stringstream _ss; + //_ss << "updateSubSig:\tsub:" << sub_var_id << "; pre:" << pre_id << "; obj:" << obj_var_id; + //_ss << "; [" << obj << "]"; + //Util::logging(_ss.str()); } } // obj is either a var or a string - bool obj_is_var = (obj_id != -1); + bool obj_is_var = (obj_var_id != -1); if(obj_is_var) { - this->updateObjSig(obj_id, pre_id, sub_id, sub,i); + int sub_id = -1; + if(sub.at(0) != '?') + { + sub_id = 
_p_kvstore->getIDByEntity(sub); + } + //cout<<"to update obj: "<updateObjSig(obj_var_id, pre_id, sub_id, i, sub_var_id); //debug { - stringstream _ss; - _ss << "updateObjSig:\tobj:" << obj_id << "; pre:" << pre_id << "; sub:" << sub_id; - _ss << "; [" << sub << "]"; - Util::logging(_ss.str()); + //stringstream _ss; + //_ss << "updateObjSig:\tobj:" << obj_var_id << "; pre:" << pre_id << "; sub:" << sub_var_id; + //_ss << "; [" << sub << "]"; + //Util::logging(_ss.str()); } } // if both end points are variables - bool two_var_edge = (sub_is_var && obj_is_var); - if(two_var_edge) - { - if(pre_id >= 0) - { - cout << "pre2edge" << endl; - Signature::encodePredicate2Edge(pre_id, this->edge_sig[sub_id][obj_id]); -// this->edge_pre_id[sub_id][obj_id] = pre_id; - } - } - + //bool two_var_edge = (sub_is_var && obj_is_var); + //if(two_var_edge) + //{ + //if(pre_id >= 0) + //{ + //cout << "pre2edge" << endl; + //Signature::encodePredicate2Edge(pre_id, this->edge_sig[sub_id][obj_id]); + ////this->edge_pre_id[sub_id][obj_id] = pre_id; + //} + //} } +#ifdef DEBUG + //cout<<"yy: "<var_sig[2])<var_name[2]< 1 this->retrieve_var_num = 0; for(int i = 0; i < this->graph_var_num; ++i) @@ -764,7 +795,7 @@ BasicQuery::null_initial() this->edge_pre_id = NULL; this->edge_type = NULL; this->var_sig = NULL; - this->edge_sig = NULL; + //this->edge_sig = NULL; this->encode_method = BasicQuery::NOT_JUST_SELECT; this->candidate_list = NULL; this->graph_var_num = 0; @@ -784,7 +815,7 @@ BasicQuery::initial() this->var_sig = new EntityBitSet[BasicQuery::MAX_VAR_NUM]; this->var_name = new string[BasicQuery::MAX_VAR_NUM]; - this->edge_sig = new EdgeBitSet*[BasicQuery::MAX_VAR_NUM]; + //this->edge_sig = new EdgeBitSet*[BasicQuery::MAX_VAR_NUM]; this->edge_id = new int*[BasicQuery::MAX_VAR_NUM]; this->edge_nei_id = new int*[BasicQuery::MAX_VAR_NUM]; this->edge_pre_id = new int*[BasicQuery::MAX_VAR_NUM]; @@ -803,7 +834,7 @@ BasicQuery::initial() this->ready[i] = false; this->need_retrieve[i] = false; - 
this->edge_sig[i] = new EdgeBitSet[BasicQuery::MAX_VAR_NUM]; + //this->edge_sig[i] = new EdgeBitSet[BasicQuery::MAX_VAR_NUM]; this->edge_id[i] = new int[BasicQuery::MAX_VAR_NUM]; this->edge_nei_id[i] = new int[BasicQuery::MAX_VAR_NUM]; this->edge_pre_id[i] = new int[BasicQuery::MAX_VAR_NUM]; @@ -811,7 +842,7 @@ BasicQuery::initial() for(int j = 0; j < BasicQuery::MAX_VAR_NUM; ++j) { - this->edge_sig[i][j].reset(); + //this->edge_sig[i][j].reset(); this->edge_id[i][j] = -1; this->edge_nei_id[i][j] = -1; this->edge_pre_id[i][j] = -1; @@ -1127,18 +1158,18 @@ string BasicQuery::to_str() _ss << endl; } - for(int i = 0; i < this->graph_var_num; i ++) - { - for(int j = 0; j < BasicQuery::MAX_VAR_NUM; j ++) - { - if(edge_sig[i][j].count() != 0) - { - _ss << "pre_id=" << edge_pre_id[i][j] << "\t"; - _ss << i << ":" << j << "\t" << edge_sig[i][j] << endl; - } - } - _ss << endl; - } + //for(int i = 0; i < this->graph_var_num; i ++) + //{ + //for(int j = 0; j < BasicQuery::MAX_VAR_NUM; j ++) + //{ + //if(edge_sig[i][j].count() != 0) + //{ + //_ss << "pre_id=" << edge_pre_id[i][j] << "\t"; + //_ss << i << ":" << j << "\t" << edge_sig[i][j] << endl; + //} + //} + //_ss << endl; + //} Util::logging(_ss.str()); //debug diff --git a/Query/BasicQuery.h b/Query/BasicQuery.h index 2a5d074..94128b3 100644 --- a/Query/BasicQuery.h +++ b/Query/BasicQuery.h @@ -152,7 +152,7 @@ private: // BETTER:edge sig is of little importance // edge_sig[sub_id][obj_id] - EdgeBitSet** edge_sig; + //EdgeBitSet** edge_sig; void addInVarNotInSelect(); void findVarNotInSelect(); @@ -160,8 +160,10 @@ private: void initial(); void null_initial(); - void updateSubSig(int _sub_id, int _pre_id, int _obj_id, std::string _obj, int _line_id); - void updateObjSig(int _obj_id, int _pre_id, int _sub_id, std::string _sub, int _line_id); + //void updateSubSig(int _sub_id, int _pre_id, int _obj_id, std::string _obj, int _line_id); + //void updateObjSig(int _obj_id, int _pre_id, int _sub_id, std::string _sub, int 
_line_id); + void updateSubSig(int _sub_var_id, int _pre_id, int _obj_id, int _line_id, int _obj_var_id); + void updateObjSig(int _obj_var_id, int _pre_id, int _sub_id, int _line_id, int _sub_var_id); //infos for predicate variables vector pre_var; diff --git a/Query/SPARQLquery.h b/Query/SPARQLquery.h index 38d8033..f004676 100644 --- a/Query/SPARQLquery.h +++ b/Query/SPARQLquery.h @@ -16,6 +16,7 @@ class SPARQLquery private: vector query_union; vector query_var; + public: SPARQLquery(const string& _query); diff --git a/Server/Server.cpp b/Server/Server.cpp index 41ff517..11279e3 100644 --- a/Server/Server.cpp +++ b/Server/Server.cpp @@ -16,6 +16,8 @@ Server::Server() this->connectionMaxNum = Socket::MAX_CONNECTIONS; this->databaseMaxNum = 1; // will be updated when supporting multiple databases. this->database = NULL; + this->db_home = Util::global_config["db_home"]; + this->db_suffix = Util::global_config["db_suffix"]; } Server::Server(unsigned short _port) @@ -335,12 +337,14 @@ Server::dropDatabase(std::string _db_name, std::string _ac_name, std::string& _r } size_t length = _db_name.length(); - if (length < 3 || _db_name.substr(length - 3, 3) != ".db") { - _ret_msg = "you can only drop databases whose names end with \".db\""; + if (length < 3 || _db_name.substr(length - 3, 3) == ".db") { + _ret_msg = "you can not only drop databases whose names end with \".db\""; return false; } - std::string cmd = std::string("rm -rf ") + _db_name; + string store_path = this->db_home + "/" + _db_name + this->db_suffix; + + std::string cmd = std::string("rm -rf ") + store_path; int ret = system(cmd.c_str()); if (ret == 0) { _ret_msg = "drop database done."; @@ -518,7 +522,7 @@ Server::showDatabases(string _para, string _ac_name, string& _ret_msg) { if (_para == "all") { - _ret_msg = Util::getItemsFromDir(Util::db_home); + _ret_msg = Util::getItemsFromDir(this->db_home); return true; } if (this->database != NULL) diff --git a/Server/Server.h b/Server/Server.h index 
3f409e0..38dae8a 100644 --- a/Server/Server.h +++ b/Server/Server.h @@ -58,6 +58,8 @@ private: int databaseMaxNum; Socket socket; Database* database; + std::string db_home; + std::string db_suffix; }; diff --git a/Signature/Signature.cpp b/Signature/Signature.cpp index 4f52731..2796355 100644 --- a/Signature/Signature.cpp +++ b/Signature/Signature.cpp @@ -31,9 +31,25 @@ Signature::BitSet2str(const EntityBitSet& _bitset) return _ss.str(); } -void -Signature::encodePredicate2Entity(int _pre_id, EntityBitSet& _entity_bs, const char _type) +void +Signature::encodeEdge2Entity(EntityBitSet& _entity_bs, int _pre_id, int _neighbor_id, const char _type) { + Signature::encodePredicate2Entity(_entity_bs, _pre_id, _type); + +#ifdef DEBUG + //if(_neighbor_id == 438460) + //{ + //cout<<"predicate encoded"<0 && _str[0] == '?') - return; + //NOTICE: we assume the parameter is always valid(invalid args should not be passed here) + long long id = _neighbor_id; + //NOTICE: in * maybe the int will overflow + long long seed = id * 5003 % 49957; + seed = seed % Signature::STR_SIG_INTERVAL_BASE; + seed = seed + (id % Signature::STR_SIG_INTERVAL_NUM) * Signature::STR_SIG_INTERVAL_BASE; - int length = (int)strlen(_str); - unsigned int hashKey = 0; - unsigned int pos = 0; - char *str2 = (char*)calloc(length + 1, sizeof(char)); - strcpy(str2, _str); - char *str = str2; - - unsigned base = Signature::STR_SIG_BASE * (Signature::HASH_NUM - 1); - for (int i = Signature::HASH_NUM - 1; i >= 0; --i) + if(Util::is_literal_ele(_neighbor_id)) { - HashFunction hf = Util::hash[i]; - if (hf == NULL) - break; - hashKey = hf(str); - str = str2; - pos = base + hashKey % Signature::STR_SIG_BASE; - base -= Signature::STR_SIG_BASE; - if (_str[0] == '"') - { - pos += Signature::STR_SIG_LENGTH2; - } - else if (_str[0] != '<') - { -#ifdef DEBUG_VSTREE - cerr << "error in encodeStr2Entity(): neighbor is neither a literal or entity!" 
<< endl; -#endif - } - _entity_bs.set(pos); + seed += Signature::STR_SIG_ENTITY; } + else //entity part + { + //entity can be in edge or out edge + if (_type == Util::EDGE_OUT) + { + seed += Signature::STR_SIG_LITERAL; + } + } + + //if(_neighbor_id == 438460) + //{ + //cout<<_neighbor_id<<" "<0 && _str[0] == '?') + //return; + //int length = (int)strlen(_str); + //unsigned int hashKey = 0; + //unsigned int pos = 0; + //char *str2 = (char*)calloc(length + 1, sizeof(char)); + //strcpy(str2, _str); + //char *str = str2; + //unsigned base = Signature::STR_SIG_BASE * (Signature::HASH_NUM - 1); + //for (int i = Signature::HASH_NUM - 1; i >= 0; --i) + //{ + //HashFunction hf = Util::hash[i]; + //if (hf == NULL) + //break; + //hashKey = hf(str); + //str = str2; + //pos = base + hashKey % Signature::STR_SIG_BASE; + //base -= Signature::STR_SIG_BASE; + //if (_str[0] == '"') + //{ + //pos += Signature::STR_SIG_LENGTH2; + //} + //else if (_str[0] != '<') + //{ +//#ifdef DEBUG_VSTREE + //cerr << "error in encodeStr2Entity(): neighbor is neither a literal or entity!" 
<< endl; +//#endif + //} + //_entity_bs.set(pos); + //} //BETTER: use multiple threads for different hash functions #ifdef DEBUG_VSTREE @@ -151,14 +194,14 @@ Signature::encodeStr2Entity(const char* _str, EntityBitSet& _entity_bs) //_ss << "encodeStr2Entity:" << str2 << endl; //Util::logging(_ss.str()); #endif - free(str2); + //free(str2); } -void -Signature::encodeStrID2Entity(int _str_id, EntityBitSet& _entity_bs) -{ - //TODO -} +//void +//Signature::encodeStrID2Entity(int _str_id, EntityBitSet& _entity_bs) +//{ + ////NOT USED NOW +//} EntitySig::EntitySig() { @@ -216,35 +259,35 @@ EntitySig::getBitset()const return this->entityBitSet; } -EdgeSig::EdgeSig() -{ - this->edgeBitSet.reset(); -} +//EdgeSig::EdgeSig() +//{ + //this->edgeBitSet.reset(); +//} -EdgeSig::EdgeSig(const EdgeSig* _p_sig) -{ - this->edgeBitSet.reset(); - this->edgeBitSet |= _p_sig->edgeBitSet; -} +//EdgeSig::EdgeSig(const EdgeSig* _p_sig) +//{ + //this->edgeBitSet.reset(); + //this->edgeBitSet |= _p_sig->edgeBitSet; +//} -EdgeSig::EdgeSig(const EdgeSig& _sig) -{ - this->edgeBitSet.reset(); - this->edgeBitSet |= _sig.edgeBitSet; -} +//EdgeSig::EdgeSig(const EdgeSig& _sig) +//{ + //this->edgeBitSet.reset(); + //this->edgeBitSet |= _sig.edgeBitSet; +//} -EdgeSig::EdgeSig(const EdgeBitSet& _bitset) -{ - this->edgeBitSet.reset(); - this->edgeBitSet |= _bitset; -} +//EdgeSig::EdgeSig(const EdgeBitSet& _bitset) +//{ + //this->edgeBitSet.reset(); + //this->edgeBitSet |= _bitset; +//} -EdgeSig& -EdgeSig::operator|=(const EdgeSig& _sig) -{ - this->edgeBitSet |= _sig.edgeBitSet; - return *this; -} +//EdgeSig& +//EdgeSig::operator|=(const EdgeSig& _sig) +//{ + //this->edgeBitSet |= _sig.edgeBitSet; + //return *this; +//} string EntitySig::to_str() const @@ -254,4 +297,5 @@ EntitySig::to_str() const _ss << Signature::BitSet2str(this->entityBitSet); return _ss.str(); -} \ No newline at end of file +} + diff --git a/Signature/Signature.h b/Signature/Signature.h index fcab605..cea8fd2 100644 --- 
a/Signature/Signature.h +++ b/Signature/Signature.h @@ -12,44 +12,59 @@ #include "../Util/Util.h" +//NOTICE: +//1. it is hard to set the parameter dynamically in Signature +//2. unable to bind an edge's neighbor and pre because either pre or neighbor can be a variable in query class Signature { public: + //NOTICE: the match can be 160 * 3 + 160 * 2 + //or 200 * 3 + 100 * 2 + //TODO: we should adjust the parameter to harvest the best performance + //static HashFunction hash[HashNum]; //must make sure: ENTITY_SIG_LENGTH = EDGE_SIG_LENGTH + STR_SIG_LENGTH - //const static int ENTITY_SIG_LENGTH = 400; - static const int STR_SIG_BASE = 100; - //NOTICE: we can also use id here, but string is recommended due to special structure - //(maybe needed later, for example, wildcards) - //Th ehash function is costly, so just use two - static const int HASH_NUM = 3; //no more than Util::HashNum - //NOTICE:if using str id, we can also divide like EDGE_SIG - //here we divide as entity neighbors and literal neighbors: ENTITY, LITERAL - static const int STR_SIG_LENGTH = 2 * STR_SIG_BASE * HASH_NUM; //250 - static const int STR_SIG_LENGTH2 = STR_SIG_BASE * HASH_NUM; + static const int STR_SIG_INTERVAL_NUM = 20; + //static const int STR_SIG_INTERVAL_NUM = 16; + static const int STR_SIG_INTERVAL_BASE = 10; + static const int STR_SIG_LITERAL = STR_SIG_INTERVAL_NUM * STR_SIG_INTERVAL_BASE; + static const int STR_SIG_ENTITY = STR_SIG_LITERAL * 2; + //here we divide as entity neighbors and literal neighbors: ENTITY(in and out), LITERAL(only for out edges) + static const int STR_SIG_LENGTH = STR_SIG_ENTITY + STR_SIG_LITERAL; //600 - //QUERY:I think that str filter is more important in VSTree than predicate, because - //a predicate may correspond to a lot of entities and predicate num is usually small - static const int EDGE_SIG_INTERVAL_NUM_HALF = 5; //in edge or out edge - static const int EDGE_SIG_INTERVAL_NUM = 2 * EDGE_SIG_INTERVAL_NUM_HALF; - static const int EDGE_SIG_INTERVAL_BASE = 20;
- static const int EDGE_SIG_LENGTH = EDGE_SIG_INTERVAL_NUM * EDGE_SIG_INTERVAL_BASE; //150 - static const int EDGE_SIG_LENGTH2 = EDGE_SIG_INTERVAL_NUM_HALF * EDGE_SIG_INTERVAL_BASE; //150 + //NOTICE: after vstree filter, all constant neighbors will be used again to do precise filtering + //however, only few constant pres will be used again for filtering later + //So we must make most use of the pres here, while keeping the effect of string part + //(otherwise the result will be too large) - static const int ENTITY_SIG_LENGTH = STR_SIG_LENGTH + EDGE_SIG_LENGTH; + //str filter is more important in VSTree than predicate, because + //a predicate may correspond to a lot of entities and predicate num is usually small + static const int EDGE_SIG_INTERVAL_NUM_HALF = 10; //in edge or out edge + //static const int EDGE_SIG_INTERVAL_NUM_HALF = 16; //in edge or out edge + static const int EDGE_SIG_INTERVAL_NUM = 2 * EDGE_SIG_INTERVAL_NUM_HALF; + static const int EDGE_SIG_INTERVAL_BASE = 10; + static const int EDGE_SIG_LENGTH = EDGE_SIG_INTERVAL_NUM * EDGE_SIG_INTERVAL_BASE; //200 + //static const int EDGE_SIG_LENGTH2 = EDGE_SIG_INTERVAL_NUM_HALF * EDGE_SIG_INTERVAL_BASE; + + static const int ENTITY_SIG_LENGTH = STR_SIG_LENGTH + EDGE_SIG_LENGTH; //800 //static const int ENTITY_SIG_LENGTH = STR_SIG_LENGTH + EDGE_SIG_LENGTH + NEIGHBOR_SIG_LENGTH; - typedef std::bitset EdgeBitSet; + //QUERY: the num of bitset must be based on 16, i.e. unsigned short?
1000 is not allowed + //but 800, 500 is ok + + //typedef std::bitset EdgeBitSet; typedef std::bitset EntityBitSet; static std::string BitSet2str(const EntityBitSet& _bitset); //NOTICE: there are two predicate encoding method now, see the encoding functions @Signature.cpp for details const static int PREDICATE_ENCODE_METHOD = 1; - static void encodePredicate2Entity(int _pre_id, EntityBitSet& _entity_bs, const char _type); - static void encodePredicate2Edge(int _pre_id, EdgeBitSet& _edge_bs); - static void encodeStr2Entity(const char* _str, EntityBitSet& _entity_bs); //_str is subject or object(literal) - static void encodeStrID2Entity(int _str_id, EntityBitSet& _entity_bs); + static void encodePredicate2Entity(EntityBitSet& _entity_bs, int _pre_id, const char _type); + static void encodeStr2Entity(EntityBitSet& _entity_bs, int _neighbor_id, const char _type); + static void encodeEdge2Entity(EntityBitSet& _entity_bs, int _pre_id, int _neighbor_id, const char _type); + //static void encodeStrID2Entity(int _str_id, EntityBitSet& _entity_bs); + //static void encodePredicate2Edge(int _pre_id, EdgeBitSet& _edge_bs); + //Signature() //{ //NOTICE:not exceed the HashNum @@ -79,7 +94,7 @@ public: //WARN:also defined in Signature, must be same!!! 
//NOTICE:EdgeBitSet is only used in Query, not for VSTree -typedef std::bitset EdgeBitSet; +//typedef std::bitset EdgeBitSet; typedef std::bitset EntityBitSet; class EntitySig : Signature{ @@ -98,15 +113,15 @@ public: std::string to_str() const; }; -class EdgeSig : Signature{ -public: - EdgeBitSet edgeBitSet; - EdgeSig(); - EdgeSig(const EdgeSig* _p_sig); - EdgeSig(const EdgeSig& _sig); - EdgeSig(const EdgeBitSet& _bitset); - EdgeSig& operator|=(const EdgeSig& _sig); -}; +//class EdgeSig : Signature{ +//public: + //EdgeBitSet edgeBitSet; + //EdgeSig(); + //EdgeSig(const EdgeSig* _p_sig); + //EdgeSig(const EdgeSig& _sig); + //EdgeSig(const EdgeBitSet& _bitset); + //EdgeSig& operator|=(const EdgeSig& _sig); +//}; #endif // _SIGNATURE_SIGNATURE_H diff --git a/Util/Util.cpp b/Util/Util.cpp index b3fd62b..9a0a883 100644 --- a/Util/Util.cpp +++ b/Util/Util.cpp @@ -12,27 +12,53 @@ using namespace std; -//NOTICE:used in Database, Join and Strategy -int Util::triple_num = 0; -int Util::pre_num = 0; -int Util::entity_num = 0; -int Util::literal_num = 0; +//================================================================================================================== +//configure() to config the basic options of gStore system +//================================================================================================================== + +//string Util::profile = "../init.conf"; +string Util::profile = "init.conf"; + +map Util::global_config; //database home directory, which is an absolute path by config //TODO:everywhere using database, the prefix should be it -string Util::db_home = "."; +//string Util::db_home = "."; //false:single true:distribute -bool Util::gStore_mode = false; +//bool Util::gstore_mode = false; + +//control the debug information +//string Util::debug_level = "simple"; + +//database placed in which path +//string Util::db_path = "."; + +//the suffix to be added to database name +//string Util::db_suffix = ".db"; + +//the maxium buffer size 
assigned to gStore system +//string Util::buffer_maxium = "100"; //the unit is GB + +//the maxium thread num assigned to gStore system +//string Util::thread_maxium = "1000"; + +//if record logs in gStore system(to be recoverable or faster) +//string Util::operation_logs = "true"; + +//================================================================================================================== + +//NOTICE:used in Database, Join and Strategy +//int Util::triple_num = 0; +//int Util::pre_num = 0; +//int Util::entity_num = 0; +//int Util::literal_num = 0; //string Util::tmp_path = "../.tmp/"; //string Util::debug_path = "../.debug/"; string Util::tmp_path = ".tmp/"; string Util::debug_path = ".debug/"; -//string Util::profile = "../init.conf"; -string Util::profile = "init.conf"; - //QUERY: assign all in Util()? //BETTER:assigned in KVstore, not one tree? FILE* Util::debug_kvstore = NULL; //used by KVstore @@ -85,7 +111,89 @@ Util::a_trim(char * szOutput, const char * szInput) bool Util::configure() { - return Util::config_setting() && Util::config_debug() && Util::config_advanced(); + const unsigned len = 505; + char *buf, *c; + char buf_i[len], buf_o[len]; + FILE *fp = NULL; + char keyname[len]; + char keyval[len]; + + //initialize the settings + Util::global_config["gstore_mode"] = "single"; + //NOTICE+BETTER+TODO:use macro is better to avoid too many judging on this variable(add a DEBUG macro at the outer) + Util::global_config["debug_level"] = "simple"; + Util::global_config["db_home"] = "."; + Util::global_config["db_suffix"] = ".db"; + Util::global_config["buffer_maxium"] = "100"; + Util::global_config["thread_maxium"] = "1000"; + //TODO:to be recoverable + Util::global_config["operation_logs"] = "true"; + +#ifdef DEBUG + fprintf(stderr, "profile: %s\n", profile.c_str()); +#endif + if((fp = fopen(profile.c_str(), "r")) == NULL) //NOTICE: this is not a binary file + { +#ifdef DEBUG + fprintf(stderr, "openfile [%s] error [%s]\n", profile.c_str(), 
strerror(errno)); +#endif + return false; + } + fseek(fp, 0, SEEK_SET); + + while(!feof(fp) && fgets(buf_i, len, fp) != NULL) + { + //fprintf(stderr, "buffer: %s\n", buf_i); + Util::l_trim(buf_o, buf_i); + if(strlen(buf_o) <= 0) + continue; + buf = NULL; + buf = buf_o; + if(buf[0] == '#') + { + continue; + } + else if(buf[0] == '[') + { + continue; + } + if((c = (char*)strchr(buf, '=')) == NULL) + continue; + memset(keyname, 0, sizeof(keyname)); + sscanf(buf, "%[^=|^ |^\t]", keyname); +#ifdef DEBUG + //fprintf(stderr, "keyname: %s\n", keyname); +#endif + sscanf(++c, "%[^\n]", keyval); + char *keyval_o = (char *)calloc(strlen(keyval) + 1, sizeof(char)); + if(keyval_o != NULL) + { + Util::a_trim(keyval_o, keyval); +#ifdef DEBUG + //fprintf(stderr, "keyval: %s\n", keyval_o); +#endif + if(keyval_o && strlen(keyval_o) > 0) + { + //strcpy(keyval, keyval_o); + global_config[string(keyname)] = string(keyval_o); + } + xfree(keyval_o); + } + } + + fclose(fp); + //display all settings here + cout<<"the current settings are as below: "<::iterator it = global_config.begin(); it != global_config.end(); ++it) + { + cout<first<<" : "<second< //NOTICE:below are libraries need to link +#include #include #include #include +//if use pthread and lock +//#define THREAD_ON 1 +//if use stream module if result is too large than memory can hold #define STREAM_ON 1 //when used as C/S, if output query result in the server port: default not(you can see the result in the client) //#define OUTPUT_QUERY_RESULT 1 #define SERVER_SEND_JSON 1 +//if to use readline library for console(open by default) #define READLINE_ON 1 +//if to use multiple strategy for answering queries #define MULTI_INDEX 1 //#define SO2P 1 //#define USE_GROUP_INSERT 1 @@ -81,7 +87,8 @@ in the sparql query can point to the same node in data graph) //#define DEBUG_STREAM //#define DEBUG_PRECISE 1 all information //#define DEBUG_KVSTORE 1 //in KVstore -//#define DEBUG_VSTREE 1 //in Database +#define DEBUG_VSTREE 1 //in Database 
+//#define DEBUG_LRUCACHE 1 //#define DEBUG_DATABASE 1 //in Database // // @@ -136,18 +143,28 @@ typedef unsigned(*HashFunction)(const char*); //http://kb.cnblogs.com/page/189480/ // //type for the triple num +//TODO:this should use unsigned (triple num may > 2500000000) typedef int TNUM; //type for entity/literal/predicate ID typedef int ELPID; +//TODO:typedef several ID typesand new a ID module +//what is more, the str length and Block ID in kvstore +typedef unsigned PREDICATE_ID; +//TODO:encode entity from low to high, encode literal from high to low(finally select the mid of space as border) +typedef unsigned ENTITY_LITERAL_ID; +typedef unsigned NODE_ID; +//can use `man limits.h` to see more +#define INVALID UINT_MAX + /******** all static&universal constants and fucntions ********/ class Util { public: - static int triple_num; - static int pre_num; - static int entity_num; - static int literal_num; + //static int triple_num; + //static int pre_num; + //static int entity_num; + //static int literal_num; static const unsigned MB = 1048576; static const unsigned GB = 1073741824; @@ -177,15 +194,6 @@ public: static const int II_TREE = 2; static const int IS_TREE = 3; - static std::string db_home; - static std::string tmp_path; - // this are for debugging - //to build logs-system, each class: print() in time - static std::string debug_path; - static FILE* debug_kvstore; - static FILE* debug_database; - static FILE* debug_vstree; - static int memUsedPercentage(); static int memoryLeft(); static int compare(const char* _str1, unsigned _len1, const char* _str2, unsigned _len2); //QUERY(how to use default args) @@ -254,7 +262,23 @@ public: static bool config_setting(); static bool config_advanced(); static bool config_debug(); - static bool gStore_mode; + //static bool gStore_mode; + static std::map global_config; + //static std::string db_home; + + //sort functions for qsort + static int _spo_cmp(const void* _a, const void* _b); + static int _ops_cmp(const void* _a, 
const void* _b); + static int _pso_cmp(const void* _a, const void* _b); + + static std::string tmp_path; + // this are for debugging + //to build logs-system, each class: print() in time + static std::string debug_path; + static FILE* debug_kvstore; + static FILE* debug_database; + static FILE* debug_vstree; + private: static bool isValidIPV4(std::string); diff --git a/VSTree/LRUCache.cpp b/VSTree/LRUCache.cpp index b19393e..d2461cf 100644 --- a/VSTree/LRUCache.cpp +++ b/VSTree/LRUCache.cpp @@ -16,17 +16,34 @@ using namespace std; //NOTICE:In fact, real graph is not linear, we can assume that 1 billion triples contains at most 1 billion entities //then the memory cost at most is 23448 * 10M = 200G, which is also too large //But we can only see at most 200M entities in web graphs, then the memory cost is 40G, which is affordable -//TODO+BETTER:support memory-disk swap in vstree -int LRUCache::DEFAULT_CAPACITY = 10000000; -//int LRUCache::DEFAULT_CAPACITY = 1 * 1000 * 1000; +// +//CONSIDER:support memory-disk swap in vstree +//However, if we adjust the sig length according to entity num, and VNODE size is decided by sig length, we can control the +//whole vstree memory cost almost 20G +//What is more, if the system memory is enough(precisely, the memory you want to assign to gstore), +//we can also set the sig length larger(which should be included in config file) +//int LRUCache::DEFAULT_CAPACITY = 10000000; +int LRUCache::DEFAULT_CAPACITY = 1 * 1000 * 1000; //about 20G memory for vstree +//int LRUCache::DEFAULT_CAPACITY = 1000; +//TODO:10^6 is a good parameter, at most use 20G LRUCache::LRUCache(int _capacity) { - cout<<"size of VNODE: "<cache_lock), NULL); +#endif + + //cout<<"size of VNODE: "<capacity = _capacity > 0 ? _capacity : LRUCache::DEFAULT_CAPACITY; + + // TODO+DEBUG:it seems that a minium size is required, for example, multiple path down(the height?) + //at least 3*h + // // we should guarantee the cache is big enough. 
- this->capacity = std::max(this->capacity, VNode::MAX_CHILD_NUM * 2000); + //this->capacity = std::max(this->capacity, VNode::MAX_CHILD_NUM * 2000); this->next = new int[this->capacity + 2]; this->prev = new int[this->capacity + 2]; @@ -61,8 +78,14 @@ LRUCache::~LRUCache() delete this->values[i]; } delete[] this->values; + + //destroy the lock +#ifdef THREAD_ON + pthread_rwlock_destroy(&(this->cache_lock)); +#endif } +//NOTICE:this must be done in one thread(and only one time) //load cache's elements from an exist data file. bool LRUCache::loadCache(string _filePath) { @@ -78,7 +101,8 @@ bool LRUCache::loadCache(string _filePath) //NOTICE:here we set it to the maxium, to ensure all VNODE in memory int defaultLoadSize = this->capacity; //int defaultLoadSize = this->capacity / 2; - size_t vNodeSize = sizeof(VNode); + size_t vNodeSize = VNode::VNODE_SIZE; + //size_t vNodeSize = sizeof(VNode); int flag = 0; flag = fseek(filePtr, 0, SEEK_SET); @@ -89,22 +113,32 @@ bool LRUCache::loadCache(string _filePath) return false; } - int _tmp_cycle_count = 0; + //int _tmp_cycle_count = 0; while (this->size < defaultLoadSize) { - VNode* nodePtr = new VNode(); bool is_reach_EOF = feof(filePtr); - bool is_node_read = (fread((char *)nodePtr, vNodeSize, 1, filePtr) == 1); - - if (is_reach_EOF || !is_node_read) + if(is_reach_EOF) { break; } + VNode* nodePtr = new VNode(true); + //VNode* nodePtr = NULL; + //bool is_node_read = (fread((char *)nodePtr, vNodeSize, 1, filePtr) == 1); + bool is_node_read = nodePtr->readNode(filePtr); + + if (!is_node_read) + { + delete nodePtr; + break; + } + //NOTICE:not consider invalid node if(nodePtr->getFileLine() < 0) { + //remove invalid node + delete nodePtr; continue; } @@ -123,7 +157,7 @@ bool LRUCache::loadCache(string _filePath) //} //} - _tmp_cycle_count++; + //_tmp_cycle_count++; } fclose(filePtr); @@ -152,6 +186,11 @@ bool LRUCache::createCache(string _filePath) //set the key(node's file line) and value(node's pointer). 
if the key exists now, the value of this key will be overwritten. bool LRUCache::set(int _key, VNode * _value) { +#ifdef THREAD_ON + pthread_rwlock_wrlock(&(this->cache_lock)); + pthread_mutex_lock(&(_value->node_lock)); +#endif + map::iterator iter = this->key2pos.find(_key); // if the _key is found, overwrite its mapping value. @@ -160,25 +199,54 @@ bool LRUCache::set(int _key, VNode * _value) int pos = iter->second; this->freeElem(pos); this->setElem(pos, _key, _value); + //this->refresh(pos); } // if the cache is not full now, just put the key-value to the free slot. else if (this->size < this->capacity) { +#ifdef DEBUG_LRUCACHE + //cout<<"to insert a node in LRU cache"<size; this->setElem(pos, _key, _value); + //this->refresh(pos); } // if the cache is full, should swap out the least recently used one to hard disk. else { - cout<<"memory-disk swap hadppened in VSTree - LRUCache"<next[LRUCache::START_INDEX]; + //cout<getFileLine()<values[pos]->node_lock)); +#endif + //TODO:scan and select a unlocked one to swap, if no, then wait by cond + if(ret != 0) //not success + { + cout<<"error: fail to get the vnode lock in LRUCache::set()"<values[pos]->node_lock)); +#endif + this->writeOut(pos, this->keys[pos]); this->freeElem(pos); // set the new one to the memory pool. 
this->setElem(pos, _key, _value); + //this->refresh(pos); } + +#ifdef THREAD_ON + pthread_rwlock_unlock(&(this->cache_lock)); +#endif return false; } @@ -186,7 +254,11 @@ bool LRUCache::set(int _key, VNode * _value) bool LRUCache::del(int _key) { -#ifdef DEBUG +#ifdef THREAD_ON + pthread_rwlock_wrlock(&(this->cache_lock)); +#endif + +#ifdef DEBUG_LRUCACHE cout<<"to del in LRUCache "<<_key<::iterator iter = this->key2pos.find(_key); @@ -194,27 +266,45 @@ LRUCache::del(int _key) { int pos1 = iter->second; int pos2 = LRUCache::DEFAULT_NUM + this->size - 1; +#ifdef DEBUG_LRUCACHE cout<<"pos 1: "<values[pos1]->getFileLine() != _key) { +#ifdef DEBUG_LRUCACHE cout<<"error in del() - file line not mapping"<fillElem(pos1, pos2); + //this->refresh(pos1); //NOTICE:we do not need to update the file now //We only record the freed file_line, and not used now //When this file_line is allocated again, then the new node can //be written into the unused file part //(VNode size is fixed) +#ifdef THREAD_ON + pthread_rwlock_unlock(&(this->cache_lock)); +#endif + return true; } +#ifdef THREAD_ON + pthread_rwlock_unlock(&(this->cache_lock)); +#endif + return false; } //get the value(node's pointer) by key(node's file line). VNode* LRUCache::get(int _key) { +#ifdef THREAD_ON + pthread_rwlock_rdlock(&(this->cache_lock)); +#endif + VNode* ret = NULL; //NOTICE:use map[] will cause the rbtree to enlarge, so we should use find map::iterator iter = this->key2pos.find(_key); @@ -223,39 +313,89 @@ VNode* LRUCache::get(int _key) { int pos = iter->second; ret = this->values[pos]; + this->refresh(pos); } // the value is not in memory now, should load it from hard disk. else if (this->size < this->capacity) { +#ifdef THREAD_ON + pthread_rwlock_unlock(&(this->cache_lock)); + pthread_rwlock_wrlock(&(this->cache_lock)); +#endif + //NOTICE+DEBUG:now all are loaded and there should not be any not read, goes here means error! 
//And this will cause error in multiple threads program(even if only read) +#ifdef DEBUG_LRUCACHE cout<<"new read hadppened in VSTree - LRUCache"<size; if (this->readIn(pos, _key)) { ret = this->values[pos]; + this->refresh(pos); + } + else + { + cout<<"LRUCache::get() - readIn error in the second case"<cache_lock)); + pthread_rwlock_wrlock(&(this->cache_lock)); +#endif + +#ifdef DEBUG_LRUCACHE + //cout<<"memory-disk swap hadppened in VSTree - LRUCache::get()"<next[LRUCache::START_INDEX]; + + int retval = 0; +#ifdef THREAD_ON + retval = pthread_mutex_trylock(&(this->values[pos]->node_lock)); +#endif + //TODO:scan and select a unlocked one to swap, if no, then wait by cond + if(retval != 0) //not success + { + cout<<"error: fail to get the vnode lock in LRUCache::set()"<values[pos]->node_lock)); +#endif + this->writeOut(pos, this->keys[pos]); this->freeElem(pos); + //NOTICE: readIn will call setElem to add the new node to tail + //swap the head and push new to tail, so this is a LRU strategy if (this->readIn(pos, _key)) { ret = this->values[pos]; + this->refresh(pos); + } + else + { + cout<<"LRUCache::get() - readIn error in the third case"<node_lock)); + pthread_rwlock_unlock(&(this->cache_lock)); +#endif + return ret; } //update the _key's mapping _value. if the key do not exist, this operation will fail and return false. bool LRUCache::update(int _key, VNode* _value) { +#ifdef THREAD_ON + pthread_rwlock_wrlock(&(this->cache_lock)); +#endif + // should swap it into cache first. VNode* valuePtr = this->get(_key); @@ -266,15 +406,28 @@ bool LRUCache::update(int _key, VNode* _value) if (this->keys[pos] != _key) { cerr << "error, the pos is wrong. 
@LRUCache::update" << endl; + +#ifdef THREAD_ON + pthread_rwlock_unlock(&(this->cache_lock)); +#endif + return false; } this->values[pos] = _value; +#ifdef THREAD_ON + pthread_rwlock_unlock(&(this->cache_lock)); +#endif + return true; } cerr << "error:the key not exist!"<cache_lock)); +#endif + return false; } @@ -282,29 +435,64 @@ int LRUCache::getCapacity() { return this->capacity; } + int LRUCache::getRestAmount() { - return this->capacity - this->size; +#ifdef THREAD_ON + pthread_rwlock_rdlock(&(this->cache_lock)); +#endif + int t = this->size; +#ifdef THREAD_ON + pthread_rwlock_unlock(&(this->cache_lock)); +#endif + + return this->capacity - t; + //return this->capacity - this->size; } + void LRUCache::showAmount() { +#ifdef THREAD_ON + pthread_rwlock_rdlock(&(this->cache_lock)); +#endif + printf( "TotalAmount=%d\tUsedAmount=%d\tUsedPercent=%.2f%%\n", this->capacity, this->size, (double)this->size / this->capacity * 100.0); -} -bool LRUCache::isFull() -{ - return this->size == this->capacity; + +#ifdef THREAD_ON + pthread_rwlock_unlock(&(this->cache_lock)); +#endif } -//put the new visited one to the tail +bool LRUCache::isFull() +{ +#ifdef THREAD_ON + pthread_rwlock_rdlock(&(this->cache_lock)); +#endif + bool ret = this->size == this->capacity; +#ifdef THREAD_ON + pthread_rwlock_unlock(&(this->cache_lock)); +#endif + + return ret; + //return this->size == this->capacity; +} + +//LRU: put the new visited one to the tail void LRUCache::refresh(int _pos) { int prevPos, nextPos; - prevPos = this->prev[_pos]; nextPos = this->next[_pos]; + if(nextPos == LRUCache::END_INDEX) + { + //already the last element + return; + } + + prevPos = this->prev[_pos]; this->next[prevPos] = nextPos; this->prev[nextPos] = prevPos; @@ -347,6 +535,7 @@ void LRUCache::freeElem(int _pos) this->size--; } +//NOTICE: setElem will append the ele to the end, so LRU is ok //set the memory of the _pos element in cache void LRUCache::setElem(int _pos, int _key, VNode* _value) { @@ -365,6 +554,7 
@@ void LRUCache::setElem(int _pos, int _key, VNode* _value) this->size++; } +//NOTICE: fillElem will change the pos1's next to the end, so LRU is ok(pos2 is always the current maximium position) //move pos2 ele to pos1, and pos1 ele should be freed void LRUCache::fillElem(int _pos1, int _pos2) { @@ -376,12 +566,17 @@ void LRUCache::fillElem(int _pos1, int _pos2) this->freeElem(_pos1); if(_pos1 >= _pos2) //0 ele or 1 ele(just remove the only one) { +#ifdef DEBUG_LRUCACHE cout<<"LRUCache::fillElem() - no need to fill"<keys[_pos2]; +#ifdef DEBUG_LRUCACHE cout<<"another key in fillElem() - "<values[_pos2] == NULL) { cout<<"error in fillElem() - value for pos2 is NULL"<prev[_pos2]; int nextPos = this->next[_pos2]; + //a real LRU strategy should be used, i.e. push new in tail, remove the oldest in head + //TODO:change to list and LRU, search only keep current node is ok, but update not + //lock: tree and buffer + // //QUERY:if pos1 and pos2 are neighbors in prev-next relations //can this conflict with freeElem? this->next[prevPos] = _pos1; @@ -420,7 +619,8 @@ LRUCache::freeDisk(int _pos) return false; } - size_t vNodeSize = sizeof(VNode); + //size_t vNodeSize = sizeof(VNode); + size_t vNodeSize = VNode::VNODE_SIZE; int line = nodePtr->getFileLine(); int flag = 0; long long seekPos = (long long)line * vNodeSize; @@ -434,7 +634,8 @@ LRUCache::freeDisk(int _pos) } nodePtr->setFileLine(-1); - fwrite((char *)nodePtr, vNodeSize, 1, filePtr); + //fwrite((char *)nodePtr, vNodeSize, 1, filePtr); + nodePtr->writeNode(filePtr); fclose(filePtr); @@ -461,11 +662,23 @@ LRUCache::writeOut(int _pos, int _fileLine) if (nodePtr->getFileLine() != _fileLine) { - cerr << "error, fileLine " << _fileLine << "wrong. @LRUCache::writeOut" << endl; + cerr << "error, fileLine " << _fileLine <<" "<< nodePtr->getFileLine() << " wrong. @LRUCache::writeOut" << endl; + } + + if(!nodePtr->isDirty()) + { + //cout<<"the node not dirty!"<setDirty(false); } int line = _fileLine == -1 ? 
nodePtr->getFileLine() : _fileLine; - size_t vNodeSize = sizeof(VNode); + size_t vNodeSize = VNode::VNODE_SIZE; + //size_t vNodeSize = sizeof(VNode); int flag = 0; long long seekPos = (long long)line * vNodeSize; @@ -477,7 +690,8 @@ LRUCache::writeOut(int _pos, int _fileLine) return false; } - fwrite((char *)nodePtr, vNodeSize, 1, filePtr); + //fwrite((char *)nodePtr, vNodeSize, 1, filePtr); + nodePtr->writeNode(filePtr); fclose(filePtr); return true; @@ -487,14 +701,19 @@ LRUCache::writeOut(int _pos, int _fileLine) //before use it, you must make sure that the _pos element in cache is free(unoccupied). bool LRUCache::readIn(int _pos, int _fileLine) { - VNode* nodePtr = new VNode(); +#ifdef DEBUG_LRUCACHE + //cout<<"pos: "<<_pos<<" "<<"fileline: "<<_fileLine<dataFilePath.c_str(), "rb"); - if (nodePtr == NULL) - { - cerr << "error, can not new a VNode. @LRUCache::readIn" << endl; - return false; - } + //if (nodePtr == NULL) + //{ + //cerr << "error, can not new a VNode. @LRUCache::readIn" << endl; + //return false; + //} + if (filePtr == NULL) { cerr << "error, can't open " << @@ -504,7 +723,8 @@ bool LRUCache::readIn(int _pos, int _fileLine) } int line = _fileLine; - size_t vNodeSize = sizeof(VNode); + size_t vNodeSize = VNode::VNODE_SIZE; + //size_t vNodeSize = sizeof(VNode); int flag = 0; long long seekPos = (long long)line * vNodeSize; @@ -517,11 +737,13 @@ bool LRUCache::readIn(int _pos, int _fileLine) } //bool is_node_read = (fread((char *)nodePtr, vNodeSize, 1, filePtr) == 1); - fread((char *)nodePtr, vNodeSize, 1, filePtr); + //fread((char *)nodePtr, vNodeSize, 1, filePtr); + nodePtr->readNode(filePtr); fclose(filePtr); if (nodePtr == NULL || nodePtr->getFileLine() != _fileLine) { + cout<<"node file line: "<getFileLine()<dataFilePath.c_str(), "r+b"); if (filePtr == NULL) @@ -544,7 +769,8 @@ bool LRUCache::flush() int startIndex = LRUCache::DEFAULT_NUM; int endIndex = startIndex + this->size; - size_t vNodeSize = sizeof(VNode); + size_t vNodeSize = 
VNode::VNODE_SIZE; + //size_t vNodeSize = sizeof(VNode); //NOTICE:values are continuous for (int i = startIndex; i < endIndex; ++i) @@ -554,10 +780,10 @@ bool LRUCache::flush() //cout<<"file line to write "<getFileLine() != line) - { - cout << "line error at !!!" << line << " " << nodePtr->getFileLine() << endl; - } + if (nodePtr->getFileLine() != line) + { + cout << "line error at !!!" << line << " " << nodePtr->getFileLine() << endl; + } #endif if (nodePtr == NULL) @@ -566,6 +792,11 @@ bool LRUCache::flush() return false; } + if(!nodePtr->isDirty()) + { + continue; + } + int flag = 0; long long seekPos = (long long)line * vNodeSize; flag = fseek(filePtr, seekPos, SEEK_SET); @@ -576,7 +807,8 @@ bool LRUCache::flush() return false; } - fwrite((char *)nodePtr, vNodeSize, 1, filePtr); + //fwrite((char *)nodePtr, vNodeSize, 1, filePtr); + nodePtr->writeNode(filePtr); } fclose(filePtr); diff --git a/VSTree/LRUCache.h b/VSTree/LRUCache.h index e88c937..086b7a6 100644 --- a/VSTree/LRUCache.h +++ b/VSTree/LRUCache.h @@ -13,6 +13,10 @@ class VNode; +//NOTICE: we should implement the LRU, not simply FIFO +//not only consider updates, but also visits +//TODO:this may cause the cost of mutiple-thread-sync very high + // before using the cache, you must loadCache or createCache. class LRUCache { @@ -33,7 +37,7 @@ public: //delete a node from LRUcache and file bool del(int _key); //update the _key's mapping _value. if the key do not exist, this operation will fail and return false. - bool update(int _key, VNode* _value); + bool update(int _key, VNode* _value); //write out all the elements to hard disk. bool flush(); int getCapacity(); @@ -51,6 +55,7 @@ private: //(pos is needed due to swap-in-out and insert/delete) int* keys; VNode** values; + //NOTICE: the key is node file line, i.e. vstree node ID std::map key2pos; // mapping from key to pos. 
std::string dataFilePath; static const int DEFAULT_NUM = 2; @@ -59,7 +64,7 @@ private: static const int NULL_INDEX = -1; static const int EOF_FLAG = -1; //put the new visited one to the tail - void refresh(int _pos); + void refresh(int _pos); //free the memory of the _pos element in cache. void freeElem(int _pos); //set the memory of the _pos element in cache @@ -72,6 +77,19 @@ private: //read the value from hard disk, and put it to the values[_pos]. //before use it, you must make sure that the _pos element in cache is free(unoccupied). bool readIn(int _pos, int _fileLine); + + //NOTICE: cost of rw lock is higher than mutex + //By default, read lock is recursive, write lock not, we can set mutex as recursive + //You can acquire read lock first and then upgrade to write lock, finally unlock twice + //R/W lock only fits for many-read and rare-write cases + // + //lock the whole buffer if get/set/swap element +#ifdef THREAD_ON + pthread_rwlock_t cache_lock; + //TODO:if find no unlocked one to swap out, then need to wait by cond + //pthread_cond_t cache_cond; +#endif + }; #endif //_VSTREE_LRUCACHE_H diff --git a/VSTree/VNode.cpp b/VSTree/VNode.cpp index 7cb2e96..9a39db9 100644 --- a/VSTree/VNode.cpp +++ b/VSTree/VNode.cpp @@ -12,51 +12,150 @@ using namespace std; VNode::VNode() { - this->is_leaf = false; - this->is_root = false; - this->child_num = 0; + //this->is_leaf = false; + //this->is_root = false; + //this->child_num = 0; + this->flag = 0; this->self_file_line = -1; this->father_file_line = -1; + + this->child_file_lines = new int[VNode::MAX_CHILD_NUM]; + for(int i = 0; i < VNode::MAX_CHILD_NUM; i ++) + { + this->child_file_lines[i] = -1; + } + + InitLock(); +} + +VNode::VNode(bool _is_leaf) +{ + //this->is_leaf = false; + //this->is_root = false; + //this->child_num = 0; + this->flag = 0; + this->self_file_line = -1; + this->father_file_line = -1; + + if(_is_leaf) + { + this->child_file_lines = NULL; + //return; + } + else + { + this->AllocChilds(); + } + + 
InitLock(); +} + +VNode::~VNode() +{ + delete[] this->child_file_lines; + this->child_file_lines = NULL; + +#ifdef THREAD_ON + pthread_mutex_destroy(&(this->node_lock)); +#endif +} + +void +VNode::AllocChilds() +{ + this->child_file_lines = new int[VNode::MAX_CHILD_NUM]; for(int i = 0; i < VNode::MAX_CHILD_NUM; i ++) { this->child_file_lines[i] = -1; } } +void +VNode::InitLock() +{ +#ifdef THREAD_ON + pthread_mutexattr_t attr; + pthread_mutexattr_init(&attr); + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); + pthread_mutex_init(&(this->node_lock), &attr); +#endif +} + +void +VNode::setFlag(unsigned _flag) +{ + this->flag = _flag; +} + +bool +VNode::isDirty() const +{ + return this->flag & VNode::DIRTY_PART; +} + +void +VNode::setDirty(bool _flag) +{ + if(_flag) + { + this->flag |= VNode::DIRTY_PART; + } + else + { + this->flag &= VNode::DEL_DIRTY_PART; + } +} + bool VNode::isLeaf() const { - return this->is_leaf; + return this->flag & VNode::LEAF_PART; } bool VNode::isRoot() const { - return this->is_root; + return this->flag & VNode::ROOT_PART; } bool VNode::isFull() const { - return (this->child_num == VNode::MAX_CHILD_NUM - 1); // need one slot for splitting node. + return (this->getChildNum() == VNode::MAX_CHILD_NUM - 1); // need one slot for splitting node. 
} void VNode::setAsLeaf(bool _isLeaf) { - this->is_leaf = _isLeaf; + if(_isLeaf) + { + this->flag |= VNode::LEAF_PART; + } + else + { + this->flag &= VNode::DEL_LEAF_PART; + } } void VNode::setAsRoot(bool _isRoot) { - this->is_root = _isRoot; + if(_isRoot) + { + this->flag |= VNode::ROOT_PART; + } + else + { + this->flag &= VNode::DEL_ROOT_PART; + } + //this->setDirty(); } int VNode::getChildNum() const { - return this->child_num; + //return this->child_num; + return this->flag & VNode::NUM_PART; } int @@ -80,13 +179,17 @@ VNode::getChildFileLine(int _i) const void VNode::setChildNum(int _num) { - this->child_num = _num; + //this->child_num = _num; + this->flag &= VNode::DEL_NUM_PART; + this->flag |= _num; + //this->setDirty(); } void VNode::setFileLine(int _line) { this->self_file_line = _line; + //this->setDirty(); } void VNode::setFatherFileLine(int _line) @@ -161,8 +264,10 @@ bool VNode::addChildEntry(const SigEntry _entry, bool _is_splitting) //return false; //} - this->setChildEntry(this->child_num, _entry); - this->child_num ++; + int child_num = this->getChildNum(); + this->setChildEntry(child_num, _entry); + //this->child_num ++; + this->setChildNum(child_num+1); return true; } @@ -182,8 +287,10 @@ bool VNode::addChildNode(VNode* _p_child_node, bool _is_splitting) //return false; //} + int child_num = this->getChildNum(); _p_child_node->setFatherFileLine(this->self_file_line); - this->setChildFileLine(this->child_num, _p_child_node->getFileLine()); + _p_child_node->setDirty(); + this->setChildFileLine(child_num, _p_child_node->getFileLine()); this->addChildEntry( _p_child_node->getEntry(), _is_splitting); //NOTICE:this function calls addChildEntry(), which already add the child_num //this->child_num ++; @@ -193,18 +300,32 @@ bool VNode::addChildNode(VNode* _p_child_node, bool _is_splitting) bool VNode::removeChild(int _i) { - if (_i < 0 || _i >= this->child_num) + int child_num = this->getChildNum(); + if (_i < 0 || _i >= child_num) { cerr<< "error, 
illegal child index. @VNode::removeChild" << endl; return false; } - for (int j = _i + 1; j < this->child_num; ++j) - { - child_entries[j-1] = child_entries[j]; - child_file_lines[j-1] = child_file_lines[j]; - } - this->child_num --; + if(this->isLeaf()) + { + for (int j = _i + 1; j < child_num; ++j) + { + child_entries[j-1] = child_entries[j]; + //child_file_lines[j-1] = child_file_lines[j]; + } + } + else + { + for (int j = _i + 1; j < child_num; ++j) + { + child_entries[j-1] = child_entries[j]; + child_file_lines[j-1] = child_file_lines[j]; + } + } + + //this->child_num --; + this->setChildNum(child_num-1); return true; } @@ -223,19 +344,26 @@ int VNode::getIndexInFatherNode(LRUCache& _nodeBuffer) { if (fatherNodePtr->getChildFileLine(i) == this->self_file_line) { +#ifdef THREAD_ON + pthread_mutex_unlock(&(fatherNodePtr->node_lock)); +#endif return i; } } - cerr << "error, can not find rank in father node. @VNode::getRankFatherNode" << endl; +#ifdef THREAD_ON + pthread_mutex_unlock(&(fatherNodePtr->node_lock)); +#endif + cerr << "error, can not find rank in father node. @VNode::getIndexInFatherNode" << endl; return 0; } void VNode::refreshSignature() { EntitySig sig; + int child_num = this->getChildNum(); - for (int i=0;ichild_num;i++) + for (int i = 0; i < child_num; i++) { sig |= this->child_entries[i].getEntitySig(); } @@ -255,6 +383,9 @@ void VNode::refreshAncestorSignature(LRUCache& _nodeBuffer) this->refreshSignature(); // refresh father node's signature. 
+#ifdef DEBUG_VSTREE + //cout<<"VNode::refreshAncestorSignature() - to get father"<getFather(_nodeBuffer); if (fatherNodePtr == NULL) { @@ -266,9 +397,13 @@ void VNode::refreshAncestorSignature(LRUCache& _nodeBuffer) int rank = this->getIndexInFatherNode(_nodeBuffer); if (fatherNodePtr->getChildEntry(rank).getEntitySig() != this->entry.getEntitySig()) { + fatherNodePtr->setDirty(); fatherNodePtr->setChildEntry(rank, this->entry); fatherNodePtr->refreshAncestorSignature(_nodeBuffer); } +#ifdef THREAD_ON + pthread_mutex_unlock(&(fatherNodePtr->node_lock)); +#endif } bool VNode::retrieveChild(vector& _child_vec, const EntitySig _filter_sig, LRUCache& _nodeBuffer) @@ -279,7 +414,8 @@ bool VNode::retrieveChild(vector& _child_vec, const EntitySig _filter_si return false; } - for (int i=0;ichild_num;i++) + int child_num = this->getChildNum(); + for (int i = 0; i < child_num; i++) { if (this->child_entries[i].cover(_filter_sig)) { @@ -298,7 +434,8 @@ bool VNode::retrieveEntry(vector& _entry_vec, const EntitySig _filter_ return false; } - for (int i=0;ichild_num;i++) + int child_num = this->getChildNum(); + for (int i = 0 ; i < child_num; i++) { if (this->child_entries[i].cover(_filter_sig)) { @@ -314,7 +451,8 @@ bool VNode::checkState() if (this->getFileLine() < 0) return false; - for (int i=0;ichild_num;i++) + int child_num = this->getChildNum(); + for (int i = 0; i < child_num; i++) if (!this->isLeaf() && this->getChildFileLine(i) < 0) { return false; @@ -327,13 +465,14 @@ std::string VNode::to_str() std::stringstream _ss; _ss << "VNode:" << endl; _ss << "\tEntityID:" << entry.getEntityId() << endl; - _ss << "\tisLeaf:" << this->is_leaf << endl; - _ss << "\tisRoot:" << this->is_root << endl; + _ss << "\tisLeaf:" << this->isLeaf() << endl; + _ss << "\tisRoot:" << this->isRoot() << endl; _ss << "\tfileline:" << this->self_file_line << endl; _ss << "\tsignature:" << Signature::BitSet2str(this->entry.getEntitySig().entityBitSet ) << endl; - _ss << "\tchildNum:" << 
this->child_num << endl << "\t"; - for(int i = 0; i < this->child_num; i ++) + int child_num = this->getChildNum(); + _ss << "\tchildNum:" << child_num << endl << "\t"; + for(int i = 0; i < child_num; i ++) { if(! this->isLeaf()){ _ss << "[" << this->getChildFileLine(i) << "]\t"; @@ -349,3 +488,95 @@ std::string VNode::to_str() return _ss.str(); } + +//TODO: keep a lock for each node, but not write to disk + +bool +VNode::readNode(FILE* _fp) +{ + int ret = fread(&(this->flag), sizeof(unsigned), 1, _fp); + if(ret == 0) //the edn of file + { + return false; + } + fread(&(this->self_file_line), sizeof(int), 1, _fp); + //cout<<"to read node: "<self_file_line<father_file_line), sizeof(int), 1, _fp); + fread(&(this->entry), sizeof(SigEntry), 1, _fp); + + //for(int i = 0; i < VNode::MAX_CHILD_NUM; ++i) + //{ + //fread(&(this->child_entries[i]), sizeof(SigEntry), 1, _fp); + //} + fread(this->child_entries, sizeof(SigEntry), VNode::MAX_CHILD_NUM, _fp); + + if(!this->isLeaf()) //internal node + { + this->child_file_lines = new int[VNode::MAX_CHILD_NUM]; + //for(int i = 0; i < VNode::MAX_CHILD_NUM; ++i) + //{ + //fread(&(this->child_file_lines[i]), sizeof(int), 1, _fp); + //} + fread(this->child_file_lines, sizeof(int), VNode::MAX_CHILD_NUM, _fp); + } + else //move to the end of the node block + { + fseek(_fp, sizeof(int) * VNode::MAX_CHILD_NUM, SEEK_CUR); + } + //this->setDirty(false); + + return true; +} + +bool +VNode::writeNode(FILE* _fp) +{ + //clean, then no need to write + //if(!this->isDirty()) + //{ + //return true; + //} + //NOTICE:already dealed in LRUCache + //this->setDirty(false); + + //cout<<"to write node: "<self_file_line<flag), sizeof(unsigned), 1, _fp); + fwrite(&(this->self_file_line), sizeof(int), 1, _fp); + //NOTICE: this must be a old node(not new inserted node), so no need to write more + if(this->self_file_line < 0) + { + return true; + } + + fwrite(&(this->father_file_line), sizeof(int), 1, _fp); + fwrite(&(this->entry), sizeof(SigEntry), 1, _fp); + 
+ //for(int i = 0; i < VNode::MAX_CHILD_NUM; ++i) + //{ + //fwrite(&(this->child_entries[i]), sizeof(SigEntry), 1, _fp); + //} + fwrite(this->child_entries, sizeof(SigEntry), VNode::MAX_CHILD_NUM, _fp); + + if(!this->isLeaf()) //internal node + { + //for(int i = 0; i < VNode::MAX_CHILD_NUM; ++i) + //{ + //fwrite(&(this->child_file_lines[i]), sizeof(int), 1, _fp); + //} + fwrite(this->child_file_lines, sizeof(int), VNode::MAX_CHILD_NUM, _fp); + } + else //move to the end of the node block + { + //fseek(_fp, sizeof(int) * VNode::MAX_CHILD_NUM, SEEK_CUR); + int t = 0; + for(int i = 0; i < VNode::MAX_CHILD_NUM; ++i) + { + fwrite(&t, sizeof(int), 1, _fp); + } + } + + return true; +} + diff --git a/VSTree/VNode.h b/VSTree/VNode.h index d1926b8..4e8a721 100644 --- a/VSTree/VNode.h +++ b/VSTree/VNode.h @@ -24,38 +24,74 @@ public: //static const int MAX_CHILD_NUM = 151; static const int MIN_CHILD_NUM = VNode::DEGREE; //static const int MIN_CHILD_NUM = 60; + + //size of Vnode + static const int VNODE_SIZE = sizeof(int) * (3+MAX_CHILD_NUM) + sizeof(SigEntry) * (MAX_CHILD_NUM+1); + + //extract different parts of flag + static const unsigned DIRTY_PART = 0x08000000; + static const unsigned DEL_DIRTY_PART = ~DIRTY_PART; + static const unsigned LEAF_PART = 0x04000000; + static const unsigned DEL_LEAF_PART = ~LEAF_PART; + static const unsigned ROOT_PART = 0x02000000; + static const unsigned DEL_ROOT_PART = ~ROOT_PART; + //NOTICE:child num is below 256, so use 8 bits is ok + static const unsigned NUM_PART = 0x000000ff; + static const unsigned DEL_NUM_PART = ~NUM_PART; + +#ifdef THREAD_ON + //NOTICE: rw lock has higher cost than mutex lock + //mutex lock: if in memory + pthread_mutex_t node_lock; +#endif //debug // static const int MAX_CHILD_NUM = 50; // static const int MIN_CHILD_NUM = 20; VNode(); - bool isLeaf()const; - bool isRoot()const; - bool isFull()const; + VNode(bool _is_leaf); + ~VNode(); + + void setFlag(unsigned _flag); + + bool isLeaf() const; + bool isRoot() const; + 
bool isFull() const; + void setAsLeaf(bool _isLeaf); void setAsRoot(bool _isRoot); - int getChildNum()const; - int getFileLine()const; - int getFatherFileLine()const; - int getChildFileLine(int _i)const; + + bool isDirty() const; + void setDirty(bool _flag = true); + + int getChildNum() const; + int getFileLine() const; + int getFatherFileLine() const; + int getChildFileLine(int _i) const; + void setChildNum(int _num); void setFileLine(int _line); void setFatherFileLine(int _line); void setChildFileLine(int _i, int _line); + const SigEntry& getEntry()const; - const SigEntry& getChildEntry(int _i)const; + const SigEntry& getChildEntry(int _i) const; void setEntry(const SigEntry _entry); void setChildEntry(int _i, const SigEntry _entry); - VNode* getFather(LRUCache& _nodeBuffer)const; // get the father node's pointer. - VNode* getChild(int _i, LRUCache& _nodeBuffer)const; // get the _i-th child node's pointer. + + VNode* getFather(LRUCache& _nodeBuffer) const; // get the father node's pointer. + VNode* getChild(int _i, LRUCache& _nodeBuffer) const; // get the _i-th child node's pointer. + /* add one child node to this node. when splitting this node, can add a new child to it. */ bool addChildNode(VNode* _p_child_node, bool _is_splitting = false); /* add one child entry to this node. when splitting this node, can add a new entry to it. */ bool addChildEntry(const SigEntry _entry, bool _is_splitting = false); bool removeChild(int _i); + int getIndexInFatherNode(LRUCache& _nodeBuffer); void refreshSignature(); // just refresh itself signature. void refreshAncestorSignature(LRUCache& _nodeBuffer); // refresh self and its ancestor's signature. 
+ /* used by internal Node */ bool retrieveChild(std::vector& _child_vec, const EntitySig _filter_sig, LRUCache& _nodeBuffer); /* only used by leaf Node */ @@ -66,17 +102,29 @@ public: std::string to_str(); + //NOTICE: read and write based on unequal IntlNode and LeafNode + bool readNode(FILE* _fp); + bool writeNode(FILE* _fp); + private: - bool is_leaf; - bool is_root; - int child_num; + //BETTER:including height and modify the memory-disk swap strategy + //then is_root flag is unnecessary + unsigned flag; + //bool dirty; + //bool is_leaf; + //bool is_root; + //int child_num; int self_file_line; int father_file_line; SigEntry entry; - //BETTER:is this necessary? too much memory? - //DEBUG:add 1 in case of error SigEntry child_entries[VNode::MAX_CHILD_NUM]; - int child_file_lines[VNode::MAX_CHILD_NUM]; + + //NOTICE: in leaf node, no need to keep the big child array + //int child_file_lines[VNode::MAX_CHILD_NUM]; + int* child_file_lines; + + void AllocChilds(); + void InitLock(); }; #endif // _VSTREE_VNODE_H diff --git a/VSTree/VSTree.cpp b/VSTree/VSTree.cpp index 613b63b..a2ee983 100644 --- a/VSTree/VSTree.cpp +++ b/VSTree/VSTree.cpp @@ -39,6 +39,12 @@ VSTree::~VSTree() this->max_nid_alloc = 0; } +bool +VSTree::isEmpty() const +{ + return this->height == 0; +} + int VSTree::getHeight() const { @@ -49,6 +55,10 @@ VSTree::getHeight() const VNode* VSTree::getRoot() { + //if(this->root_file_line < 0) + //{ + //return NULL; + //} return (this->node_buffer)->get(this->root_file_line); } @@ -59,6 +69,7 @@ VSTree::getNode(int _line) //if (_line >= this->node_num) if (_line >= this->max_nid_alloc) { + //cout<<_line <<" "<max_nid_alloc<root_file_line < 0) + { + return; + } + Util::logging("IN retrieve"); //debug @@ -141,6 +157,7 @@ void VSTree::retrieve(SPARQLquery& _query) Util::logging("OUT retrieve"); } +//NOTICE:this can only be done by one thread //build the VSTree from the _entity_signature_file. 
bool VSTree::buildTree(std::string _entry_file_path) @@ -148,7 +165,8 @@ VSTree::buildTree(std::string _entry_file_path) Util::logging("IN VSTree::buildTree"); // create the entry buffer and node buffer. - this->entry_buffer = new EntryBuffer(EntryBuffer::DEFAULT_CAPACITY); + this->entry_buffer = new EntryBuffer(EntryBuffer::DEFAULT_CAPACITY); + //cout<<"entry buffer newed"<node_buffer = new LRUCache(LRUCache::DEFAULT_CAPACITY); // create the root node. @@ -183,6 +201,11 @@ VSTree::buildTree(std::string _entry_file_path) { SigEntry* entryPtr = this->entry_buffer->getElem(i); + //if(entryPtr->getEntityId() < 0) + //{ + //cout<<"error: "<insertEntry(*entryPtr); @@ -193,20 +216,20 @@ VSTree::buildTree(std::string _entry_file_path) } //debug - Util::logging("insert entries to tree done."); + //Util::logging("insert entries to tree done."); //bool flag = this->node_buffer->flush(); bool flag = this->saveTree(); //debug { - stringstream _ss; - _ss << "tree height: " << this->getHeight() << endl; - _ss << "node num: " << this->node_num << endl; - Util::logging(_ss.str()); + //stringstream _ss; + //_ss << "tree height: " << this->getHeight() << endl; + //_ss << "node num: " << this->node_num << endl; + //Util::logging(_ss.str()); } - Util::logging("OUT VSTree::buildTree"); + //Util::logging("OUT VSTree::buildTree"); //debug // { @@ -302,9 +325,18 @@ bool VSTree::updateEntry(int _entity_id, const EntityBitSet& _bitset) if (!findFlag) { cerr<< "error, can not find the mapping child entry in the leaf node. 
@VSTree::updateEntry" << endl; +#ifdef THREAD_ON + pthread_mutex_unlock(&(leafNodePtr->node_lock)); +#endif return false; } + //the node has been changed + leafNodePtr->setDirty(); + +#ifdef THREAD_ON + pthread_mutex_unlock(&(leafNodePtr->node_lock)); +#endif return true; } @@ -313,6 +345,11 @@ bool VSTree::updateEntry(int _entity_id, const EntityBitSet& _bitset) bool VSTree::replaceEntry(int _entity_id, const EntityBitSet& _bitset) { + if(this->root_file_line < 0) + { + return false; + } + //cout<<"begin replaceEntry()"<getLeafNodeByEntityID(_entity_id); @@ -356,9 +393,17 @@ VSTree::replaceEntry(int _entity_id, const EntityBitSet& _bitset) if (!findFlag) { cerr << "error, can not find the mapping child entry in the leaf node. @VSTree::replaceEntry" << endl; +#ifdef THREAD_ON + pthread_mutex_unlock(&(leafNodePtr->node_lock)); +#endif return false; } + leafNodePtr->setDirty(); + +#ifdef THREAD_ON + pthread_mutex_unlock(&(leafNodePtr->node_lock)); +#endif return true; } @@ -370,11 +415,21 @@ VSTree::replaceEntry(int _entity_id, const EntityBitSet& _bitset) bool VSTree::insertEntry(const SigEntry& _entry) { - //WARN:we do not deal with the case:the vstree is already empty, - //then to insert now - //choose the best leaf node to insert the _entry - VNode* choosedNodePtr = this->chooseNode(this->getRoot(), _entry); + VNode* choosedNodePtr = NULL; + if(this->root_file_line < 0) + { + choosedNodePtr = this->createNode(); + choosedNodePtr->setAsRoot(true); + choosedNodePtr->setAsLeaf(true); + this->height ++; + this->root_file_line = 0; + } + else + { + choosedNodePtr = this->chooseNode(this->getRoot(), _entry); + } + #ifdef DEBUG_VSTREE if(_entry.getEntityId() == 200) @@ -400,6 +455,7 @@ VSTree::insertEntry(const SigEntry& _entry) return false; } + choosedNodePtr->setDirty(); if (choosedNodePtr->isFull()) { #ifdef DEBUG_VSTREE @@ -434,6 +490,11 @@ VSTree::insertEntry(const SigEntry& _entry) // update the entityID2FileLineMap. 
this->entityID2FileLineMap[_entry.getEntityId()] = choosedNodePtr->getFileLine(); } + +#ifdef THREAD_ON + pthread_mutex_unlock(&(choosedNodePtr->node_lock)); +#endif + this->entry_num ++; #ifdef DEBUG_VSTREE @@ -447,6 +508,11 @@ VSTree::insertEntry(const SigEntry& _entry) bool VSTree::removeEntry(int _entity_id) { + if(this->root_file_line < 0) + { + return false; + } + //cout<<"file line check: "<entityID2FileLineMap[200]<getLeafNodeByEntityID(_entity_id); @@ -478,21 +544,30 @@ VSTree::removeEntry(int _entity_id) return false; } + leafNodePtr->setDirty(); //BETTER?:consider up->bopttom to deal, not find leaf and recursively if(leafNodePtr->isRoot()) { if(childNum == 1) { //the tree is empty now + cout<<"the vstree is empty now!"<removeChild(entryIndex); leafNodePtr->refreshAncestorSignature(*(this->node_buffer)); this->removeNode(leafNodePtr); + leafNodePtr = NULL; + //DEBUG: already deleted in freeElem //delete leafNodePtr; //leafNodePtr = NULL; + this->root_file_line = -1; this->height = 0; this->entry_num = 0; this->node_num = 0; + + //reset the ID info + this->free_nid_list.clear(); + this->max_nid_alloc = 0; } else { @@ -528,6 +603,12 @@ VSTree::removeEntry(int _entity_id) } } + if(leafNodePtr != NULL) + { +#ifdef THREAD_ON + pthread_mutex_unlock(&(leafNodePtr->node_lock)); +#endif + } this->entry_num--; this->entityID2FileLineMap.erase(_entity_id); @@ -566,8 +647,10 @@ VSTree::loadTree() { cout << "load VSTree..." 
<< endl; (this->node_buffer) = new LRUCache(LRUCache::DEFAULT_CAPACITY); + cout<<"LRU cache built"<loadTreeInfo(); + cout<<"tree info loaded"<node_lock)); +#endif + } return candidateLeafPtr; } if(minDis > curDis) { + if(ret != NULL) + { +#ifdef THREAD_ON + pthread_mutex_unlock(&(ret->node_lock)); +#endif + } minDis = curDis; ret = candidateLeafPtr; } + else + { +#ifdef THREAD_ON + pthread_mutex_unlock(&(candidateLeafPtr->node_lock)); +#endif + } } +#ifdef THREAD_ON + pthread_mutex_unlock(&(_p_node->node_lock)); +#endif + return ret; } } @@ -672,7 +778,9 @@ VSTree::split(VNode* _p_node_being_split, const SigEntry& _insert_entry, VNode* cout << "split happen" << endl; cout<<_p_node_being_split->getFileLine() << endl; cout<<"to insert id "<<_insert_entry.getEntityId()<height<getFileLine(); // first, add the new child node(if not leaf) or child entry(if leaf) to the full node. bool just_insert_entry = (_p_insert_node == NULL); if(just_insert_entry) @@ -682,6 +790,7 @@ VSTree::split(VNode* _p_node_being_split, const SigEntry& _insert_entry, VNode* else { _p_node_being_split->addChildNode(_p_insert_node, true); + //pthread_mutex_unlock(&(_p_insert_node->node_lock)); } //NOTICE:now the child num in this node is exactly MAX_CHILD_NUM @@ -690,9 +799,11 @@ VSTree::split(VNode* _p_node_being_split, const SigEntry& _insert_entry, VNode* //BETTER: use hanming, xor result or the vector included angle to guess the distince. //And then also use the farest two as seeds. // - //two seeds to generate two new nodes. - //seedA kernel: the SigEntry with the minimal count of signature. - //seedB kernel: the SigEntry with the maximal count of signature. + //two seeds to generate two new nodes. + //seedA kernel: the SigEntry with the minimal count of signature. + //seedB kernel: the SigEntry with the maximal count of signature. + // + //AIM: divide the entries into two parts, and the or result of the two parts are very different, i.e. 
a^b is the maxium int maxCount = 0; // record the minimal signature count. @@ -708,51 +819,52 @@ VSTree::split(VNode* _p_node_being_split, const SigEntry& _insert_entry, VNode* } entryA = _p_node_being_split->getChildEntry(entryA_index); - maxCount = 0; - int entryB_index = 0; // record the seedB kernel index. - for(int i = 0; i < VNode::MAX_CHILD_NUM; i++) - { - //NOTICE:I think xOR should be used here to choose the farest two - int currentCount = entryA.xOR(_p_node_being_split->getChildEntry(i)); - //int currentCount = entryA.xEpsilen(_p_node_being_split->getChildEntry(i)); - if(i != entryA_index && maxCount <= currentCount) - { - maxCount = currentCount; - entryB_index = i; - } - } - entryB = _p_node_being_split->getChildEntry(entryB_index); + //maxCount = 0; + //int entryB_index = 0; // record the seedB kernel index. + //for(int i = 0; i < VNode::MAX_CHILD_NUM; i++) + //{ + ////NOTICE:I think xOR should be used here to choose the farest two + //int currentCount = entryA.xOR(_p_node_being_split->getChildEntry(i)); + ////int currentCount = entryA.xEpsilen(_p_node_being_split->getChildEntry(i)); + //if(i != entryA_index && maxCount <= currentCount) + //{ + //maxCount = currentCount; + //entryB_index = i; + //} + //} + //entryB = _p_node_being_split->getChildEntry(entryB_index); // AEntryIndex: the entry index near seedA. // BEntryIndex: the entry index near seedB. 
std::vector entryIndex_nearA, entryIndex_nearB; entryIndex_nearA.clear(); entryIndex_nearB.clear(); - entryIndex_nearA.push_back(entryA_index); - entryIndex_nearB.push_back(entryB_index); - int cnt = 1, i; - //BETTER:maybe sort and add?(how to sort according to two seeds) - for(i = 0; i < VNode::MAX_CHILD_NUM; ++i) + + multimap dist; + for(int i = 0; i < VNode::MAX_CHILD_NUM; ++i) { - if(i == entryA_index || i == entryB_index) - { - continue; - } - if(cnt > VNode::MIN_CHILD_NUM) //num+1 + int d = entryA.xEpsilen(_p_node_being_split->getChildEntry(i)); + dist.insert(pair(d, i)); + } + + int cnt = 0; + multimap::iterator it; + for(it = dist.begin(); it != dist.end(); ++it) + { + if(cnt > VNode::MIN_CHILD_NUM) { break; } + entryIndex_nearA.push_back(it->second); cnt++; - entryIndex_nearA.push_back(i); } - for(; i < VNode::MAX_CHILD_NUM; ++i) + + for(; it != dist.end(); ++it) { - if(i == entryA_index || i == entryB_index) - { - continue; - } - entryIndex_nearB.push_back(i); + entryIndex_nearB.push_back(it->second); } + dist.clear(); + //NOTICE: code below maybe exist error, can not divide evenly(and maybe not necessary to compute distance) // @@ -811,35 +923,49 @@ VSTree::split(VNode* _p_node_being_split, const SigEntry& _insert_entry, VNode* cout<<"A: "<isLeaf(); // then create a new node to act as BEntryIndex's father. - VNode* newNodePtr = this->createNode(); + VNode* newNodePtr = this->createNode(is_leaf); #ifdef DEBUG cout<<"new node file line: "<getFileLine()<isLeaf()) + if(is_leaf) { newNodePtr->setAsLeaf(true); } - //add all the entries in BEntryIndex into the new node child entry array, + //TODO:use lock, then can be used in parallism + //oldNodePtr = this->getNode(oldnode_fileline); + //add all the entries in BEntryIndex into the new node child entry array, //and calculate the new node's entry. 
- for(unsigned i = 0; i < entryIndex_nearB.size(); i++) - { - if(oldNodePtr->isLeaf()) - { - newNodePtr->addChildEntry(oldNodePtr->getChildEntry(entryIndex_nearB[i]), false); - } - else - { - VNode* childPtr = oldNodePtr->getChild(entryIndex_nearB[i], *(this->node_buffer)); - newNodePtr->addChildNode(childPtr); - } - } + if(is_leaf) + { + //NOTICE:I think write the judgement outside is better(no need to judge each time in loop) + for(unsigned i = 0; i < entryIndex_nearB.size(); i++) + { + //SigEntry st = oldNodePtr->getChildEntry(entryIndex_nearB[i]); + newNodePtr->addChildEntry(oldNodePtr->getChildEntry(entryIndex_nearB[i]), false); + } + } + else + { + for(unsigned i = 0; i < entryIndex_nearB.size(); i++) + { + //NOTICE: this requires that the buffer should be able to place half childs of a node + //(to set the father as the new node) + //If split occur recursively, the buffer size should be at least 101 * h + h = 102h + VNode* childPtr = oldNodePtr->getChild(entryIndex_nearB[i], *(this->node_buffer)); + newNodePtr->addChildNode(childPtr); +#ifdef THREAD_ON + pthread_mutex_unlock(&(childPtr->node_lock)); +#endif + } + } newNodePtr->refreshSignature(); //label the child being removed with -1, @@ -847,36 +973,47 @@ VSTree::split(VNode* _p_node_being_split, const SigEntry& _insert_entry, VNode* sort(entryIndex_nearA.begin(), entryIndex_nearA.end(), less()); #ifdef DEBUG_VSTREE - stringstream _ss1; - { - _ss1 << "nearA: "; - for(unsigned i = 0; i < entryIndex_nearA.size(); i++) - { - _ss1 << entryIndex_nearA[i] << " "; - } - _ss1 << endl; + //stringstream _ss1; + //{ + //_ss1 << "nearA: "; + //for(unsigned i = 0; i < entryIndex_nearA.size(); i++) + //{ + //_ss1 << entryIndex_nearA[i] << " "; + //} + //_ss1 << endl; - _ss1 << "nearB: "; - for(unsigned i = 0; i < entryIndex_nearB.size(); i++) - { - _ss1 << entryIndex_nearB[i] << " "; - } - _ss1 << endl; - } - Util::logging(_ss1.str()); + //_ss1 << "nearB: "; + //for(unsigned i = 0; i < entryIndex_nearB.size(); i++) + 
//{ + //_ss1 << entryIndex_nearB[i] << " "; + //} + //_ss1 << endl; + //} + //Util::logging(_ss1.str()); #endif - for(unsigned i = 0; i < entryIndex_nearA.size(); i++) - { - oldNodePtr->setChildEntry(i, oldNodePtr->getChildEntry(entryIndex_nearA[i])); - oldNodePtr->setChildFileLine(i, oldNodePtr->getChildFileLine(entryIndex_nearA[i])); - } + //NOTICE:we need to sort entryIndex_nearA first because we must get from oldNodePtr and set oldNodePtr + //(in case of overwriting!) + if(is_leaf) + { + for(unsigned i = 0; i < entryIndex_nearA.size(); i++) + { + oldNodePtr->setChildEntry(i, oldNodePtr->getChildEntry(entryIndex_nearA[i])); + //WARN:no child file line for leaf node + //oldNodePtr->setChildFileLine(i, oldNodePtr->getChildFileLine(entryIndex_nearA[i])); + } + } + else + { + for(unsigned i = 0; i < entryIndex_nearA.size(); i++) + { + oldNodePtr->setChildEntry(i, oldNodePtr->getChildEntry(entryIndex_nearA[i])); + oldNodePtr->setChildFileLine(i, oldNodePtr->getChildFileLine(entryIndex_nearA[i])); + } + } oldNodePtr->setChildNum(entryIndex_nearA.size()); oldNodePtr->refreshSignature(); - int oldNode_index = oldNodePtr->getIndexInFatherNode(*(this->node_buffer)); - // full node's father pointer. - VNode* oldNodeFatherPtr = oldNodePtr->getFather(*(this->node_buffer)); if(oldNodePtr->isRoot()) { #ifdef DEBUG @@ -885,15 +1022,15 @@ VSTree::split(VNode* _p_node_being_split, const SigEntry& _insert_entry, VNode* //if the old node is root, //split the root, create a new root, //and the tree height will be increased. - VNode* RootNewPtr = this->createNode(); + VNode* RootNewPtr = this->createNode(false); //change the old root node to not-root node, //and set the RootNew to root node. oldNodePtr->setAsRoot(false); RootNewPtr->setAsRoot(true); - //set the split two node(old node and new node) as the new root's child, - //and update signatures. + //set the split two node(old node and new node) as the new root's child, + //and update signatures. 
RootNewPtr->addChildNode(oldNodePtr); RootNewPtr->addChildNode(newNodePtr); RootNewPtr->refreshSignature(); @@ -906,8 +1043,8 @@ VSTree::split(VNode* _p_node_being_split, const SigEntry& _insert_entry, VNode* // Util::logging(_ss.str()); // } - //should keep the root node always being - //at the first line(line zero) of the tree node file. + //should keep the root node always being + //at the first line(line zero) of the tree node file. this->swapNodeFileLine(RootNewPtr, oldNodePtr); cout<<"new root: "<getFileLine()<height++; @@ -927,13 +1064,19 @@ VSTree::split(VNode* _p_node_being_split, const SigEntry& _insert_entry, VNode* // _ss << RootNewPtr->to_str() << endl; // Util::logging(_ss.str()); // } +#ifdef THREAD_ON + pthread_mutex_unlock(&(RootNewPtr->node_lock)); +#endif } else { - //if the (OldNode) is not Root, - //change the old node's signature to A's signature. + int oldNode_index = oldNodePtr->getIndexInFatherNode(*(this->node_buffer)); + // full node's father pointer. + VNode* oldNodeFatherPtr = oldNodePtr->getFather(*(this->node_buffer)); + //if the (OldNode) is not Root, + //change the old node's signature to A's signature. oldNodeFatherPtr->setChildEntry(oldNode_index, oldNodePtr->getEntry()); - + oldNodeFatherPtr->setDirty(); if(oldNodeFatherPtr->isFull()) { @@ -945,6 +1088,10 @@ VSTree::split(VNode* _p_node_being_split, const SigEntry& _insert_entry, VNode* oldNodeFatherPtr->addChildNode(newNodePtr); oldNodeFatherPtr->refreshAncestorSignature(*(this->node_buffer)); } + +#ifdef THREAD_ON + pthread_mutex_unlock(&(oldNodeFatherPtr->node_lock)); +#endif } //debug @@ -964,12 +1111,17 @@ VSTree::split(VNode* _p_node_being_split, const SigEntry& _insert_entry, VNode* // update the entityID2FileLineMap by these two nodes. 
this->updateEntityID2FileLineMap(oldNodePtr); this->updateEntityID2FileLineMap(newNodePtr); + + //pthread_mutex_unlock(&(oldNodePtr->node_lock)); +#ifdef THREAD_ON + pthread_mutex_unlock(&(newNodePtr->node_lock)); +#endif } //the _entry_index in _child is to be removed. //node can only be deleted in this function. void -VSTree::coalesce(VNode* _child, int _entry_index) +VSTree::coalesce(VNode*& _child, int _entry_index) { #ifdef DEBUG cout << "coalesce happen" <getChild(0, *(this->node_buffer)); newRoot->setAsRoot(true); + newRoot->setDirty(); #ifdef DEBUG cout<<"shrink root in coalesce() -- to swap node file"<swapNodeFileLine(newRoot, _child); + +#ifdef THREAD_ON + pthread_mutex_unlock(&(newRoot->node_lock)); +#endif + //this->root_file_line = newRoot->getFileLine(); this->height--; this->removeNode(_child); + _child = NULL; } return; } @@ -1072,6 +1231,9 @@ VSTree::coalesce(VNode* _child, int _entry_index) } if(ccase > 2) { +#ifdef THREAD_ON + pthread_mutex_unlock(&(p->node_lock)); +#endif p = tp; n = tn; } @@ -1100,11 +1262,11 @@ VSTree::coalesce(VNode* _child, int _entry_index) case 1: //union right to this if(_child->isLeaf()) { - _child->setChildFileLine(_entry_index, p->getChildFileLine(0)); + //_child->setChildFileLine(_entry_index, p->getChildFileLine(0)); _child->setChildEntry(_entry_index, p->getChildEntry(0)); for(int i = 1; i < n; ++i) { - _child->setChildFileLine(cn+i-1, p->getChildFileLine(i)); + //_child->setChildFileLine(cn+i-1, p->getChildFileLine(i)); _child->addChildEntry(p->getChildEntry(i)); } } @@ -1114,6 +1276,10 @@ VSTree::coalesce(VNode* _child, int _entry_index) _child->setChildEntry(_entry_index, p->getChildEntry(0)); tmp = p->getChild(0, *(this->node_buffer)); tmp->setFatherFileLine(child_no); + tmp->setDirty(); +#ifdef THREAD_ON + pthread_mutex_unlock(&(tmp->node_lock)); +#endif for(int i = 1; i < n; ++i) { tmp = p->getChild(i, *(this->node_buffer)); @@ -1121,25 +1287,41 @@ VSTree::coalesce(VNode* _child, int _entry_index) 
_child->addChildNode(tmp); //_child->setChildNum(cn+i); //tmp->setFatherFileLine(child_no); +#ifdef THREAD_ON + pthread_mutex_unlock(&(tmp->node_lock)); +#endif } } + p->setDirty(); this->removeNode(p); + //p = NULL; _child->refreshSignature(); + _father->setDirty(); + _father->setChildEntry(_child_index, _child->getEntry()); //recursive:to remove child index+1 in father this->coalesce(_father, _child_index+1); break; case 2: //move one from right - _child->setChildFileLine(_entry_index, p->getChildFileLine(n-1)); + if(!_child->isLeaf()) + { + _child->setChildFileLine(_entry_index, p->getChildFileLine(n-1)); + } _child->setChildEntry(_entry_index, p->getChildEntry(n-1)); _child->refreshSignature(); if(!_child->isLeaf()) { tmp = p->getChild(n-1, *(this->node_buffer)); tmp->setFatherFileLine(child_no); + tmp->setDirty(); +#ifdef THREAD_ON + pthread_mutex_unlock(&(tmp->node_lock)); +#endif } + p->setDirty(); p->removeChild(n-1); p->refreshSignature(); + _father->setDirty(); _father->setChildEntry(_child_index, _child->getEntry()); _father->setChildEntry(_child_index+1, p->getEntry()); _father->refreshAncestorSignature(*(this->node_buffer)); @@ -1148,11 +1330,11 @@ VSTree::coalesce(VNode* _child, int _entry_index) case 3: //union left to this if(_child->isLeaf()) { - _child->setChildFileLine(_entry_index, p->getChildFileLine(0)); + //_child->setChildFileLine(_entry_index, p->getChildFileLine(0)); _child->setChildEntry(_entry_index, p->getChildEntry(0)); for(int i = 1; i < n; ++i) { - _child->setChildFileLine(cn+i-1, p->getChildFileLine(i)); + //_child->setChildFileLine(cn+i-1, p->getChildFileLine(i)); _child->addChildEntry(p->getChildEntry(i)); } } @@ -1162,6 +1344,10 @@ VSTree::coalesce(VNode* _child, int _entry_index) _child->setChildEntry(_entry_index, p->getChildEntry(0)); tmp = p->getChild(0, *(this->node_buffer)); tmp->setFatherFileLine(child_no); + tmp->setDirty(); +#ifdef THREAD_ON + pthread_mutex_unlock(&(tmp->node_lock)); +#endif for(int i = 1; i < n; ++i) { 
tmp = p->getChild(i, *(this->node_buffer)); @@ -1169,25 +1355,40 @@ VSTree::coalesce(VNode* _child, int _entry_index) _child->addChildNode(tmp); //_child->setChildNum(cn+i); //tmp->setFatherFileLine(child_no); +#ifdef THREAD_ON + pthread_mutex_unlock(&(tmp->node_lock)); +#endif } } + p->setDirty(); this->removeNode(p); _child->refreshSignature(); + _father->setDirty(); + _father->setChildEntry(_child_index, _child->getEntry()); //recursive:to remove child index-1 in father this->coalesce(_father, _child_index-1); break; case 4: //move one from left - _child->setChildFileLine(_entry_index, p->getChildFileLine(n-1)); + if(!_child->isLeaf()) + { + _child->setChildFileLine(_entry_index, p->getChildFileLine(n-1)); + } _child->setChildEntry(_entry_index, p->getChildEntry(n-1)); _child->refreshSignature(); if(!_child->isLeaf()) { VNode* tmp = p->getChild(n-1, *(this->node_buffer)); tmp->setFatherFileLine(child_no); + tmp->setDirty(); +#ifdef THREAD_ON + pthread_mutex_unlock(&(tmp->node_lock)); +#endif } + p->setDirty(); p->removeChild(n-1); p->refreshSignature(); + _father->setDirty(); _father->setChildEntry(_child_index, _child->getEntry()); _father->setChildEntry(_child_index-1, p->getEntry()); _father->refreshAncestorSignature(*(this->node_buffer)); @@ -1198,6 +1399,10 @@ VSTree::coalesce(VNode* _child, int _entry_index) break; } +#ifdef THREAD_ON + pthread_mutex_unlock(&(_father->node_lock)); +#endif + //BETTER:this maybe very costly because many entity no need to update if(_child->isLeaf()) { @@ -1205,6 +1410,9 @@ VSTree::coalesce(VNode* _child, int _entry_index) if(ccase == 2 || ccase == 4) { this->updateEntityID2FileLineMap(p); +#ifdef THREAD_ON + pthread_mutex_unlock(&(p->node_lock)); +#endif } else { @@ -1217,9 +1425,9 @@ VSTree::coalesce(VNode* _child, int _entry_index) //create a new node when one node need splitting. 
VNode* -VSTree::createNode() +VSTree::createNode(bool _is_leaf) { - VNode* newNodePtr = new VNode(); + VNode* newNodePtr = new VNode(_is_leaf); int key = -1; if(this->free_nid_list.empty()) { @@ -1237,6 +1445,7 @@ VSTree::createNode() } //key = this->node_num; newNodePtr->setFileLine(key); + newNodePtr->setDirty(); this->node_buffer->set(key, newNodePtr); this->node_num++; @@ -1288,12 +1497,20 @@ VSTree::swapNodeFileLine(VNode* _p_node_a, VNode* _p_node_b) nodeBFatherPtr->setChildFileLine(nodeBRank, newNodeBFileLine); } +#ifdef THREAD_ON + pthread_mutex_unlock(&(nodeAFatherPtr->node_lock)); + pthread_mutex_unlock(&(nodeBFatherPtr->node_lock)); +#endif + // update nodes' children's father file line. if (!_p_node_a->isLeaf()) { for (int i = 0; i < nodeAChildNum; ++i) { nodeAChildPtr[i]->setFatherFileLine(newNodeAFileLine); +#ifdef THREAD_ON + pthread_mutex_unlock(&(nodeAChildPtr[i]->node_lock)); +#endif } } if (!_p_node_b->isLeaf()) @@ -1301,6 +1518,9 @@ VSTree::swapNodeFileLine(VNode* _p_node_a, VNode* _p_node_b) for (int i = 0; i < nodeBChildNum; ++i) { nodeBChildPtr[i]->setFatherFileLine(newNodeBFileLine); +#ifdef THREAD_ON + pthread_mutex_unlock(&(nodeBChildPtr[i]->node_lock)); +#endif } } @@ -1372,6 +1592,7 @@ VSTree::loadTreeInfo() cerr << "error, can not open tree file:[" << VSTree::tree_info_file_path << "]@VSTree::loadTreeInfo" << endl; + fclose(filePtr); return false; } @@ -1419,13 +1640,15 @@ VSTree::loadTreeInfo() this->free_nid_list.push_back(key); fread(&key, sizeof(int), 1, filePtr); } + fclose(filePtr); - if(this->root_file_line < 0 || this->root_file_line >= this->max_nid_alloc) + //NOTICE: the tree can be empty + //if(this->root_file_line < 0 || this->root_file_line >= this->max_nid_alloc) + if(this->root_file_line >= this->max_nid_alloc) { return false; } - fclose(filePtr); return true; } @@ -1455,13 +1678,16 @@ VSTree::loadEntityID2FileLineMap() this->entityID2FileLineMap.clear(); - VNode* nodePtr = new VNode(); - int cycle_count = 0; + //int 
cycle_count = 0; + while (!feof(filePtr)) { - bool is_node_read = (fread((char *)nodePtr,vNodeSize,1,filePtr) == 1); + VNode* nodePtr = new VNode(true); + //bool is_node_read = (fread((char *)nodePtr,vNodeSize,1,filePtr) == 1); + bool is_node_read = nodePtr->readNode(filePtr); + //NOTICE:not consider invalid node - if (is_node_read && nodePtr->getFileLine() >= 0) + if (is_node_read && nodePtr->getFileLine() >= 0 && nodePtr->isLeaf()) { this->updateEntityID2FileLineMap(nodePtr); //debug @@ -1473,10 +1699,10 @@ VSTree::loadEntityID2FileLineMap() //Util::logging(_ss.str()); //} //} - cycle_count ++; + //cycle_count ++; } + delete nodePtr; } - delete nodePtr; fclose(filePtr); @@ -1487,8 +1713,6 @@ VSTree::loadEntityID2FileLineMap() void VSTree::updateEntityID2FileLineMap(VNode* _p_node) { - if (_p_node->isLeaf()) - { int line = _p_node->getFileLine(); //cout<<"updateEntityID2FileLineMap() - file line "<getChildNum(); @@ -1513,7 +1737,6 @@ VSTree::updateEntityID2FileLineMap(VNode* _p_node) //} //} } - } } //get the leaf node pointer by the given _entityID @@ -1537,6 +1760,11 @@ VSTree::getLeafNodeByEntityID(int _entityID) void VSTree::retrieveEntity(const EntityBitSet& _entity_bit_set, IDList* _p_id_list) { + if(this->root_file_line < 0) + { + return; + } + //NOTICE:this may cause parallism error //Util::logging("IN retrieveEntity"); EntitySig filterSig(_entity_bit_set); @@ -1574,10 +1802,13 @@ VSTree::retrieveEntity(const EntityBitSet& _entity_bit_set, IDList* _p_id_list) while (!nodeQueue.empty()) { int currentNodeFileLine = nodeQueue.front(); + //cout<<"current node file line: "<getNode(currentNodeFileLine); int childNum = currentNodePtr->getChildNum(); + //cout<<"child num: "<getChildFileLine(i); nodeQueue.push(childNodeFileLine); + //the root + //if(currentNodePtr->getFileLine() == 0) + //{ + //cout<<"root: "<node_lock)); +#endif } //Util::logging("OUT retrieveEntity"); } diff --git a/VSTree/VSTree.h b/VSTree/VSTree.h index afee94d..a9215ff 100644 --- 
a/VSTree/VSTree.h +++ b/VSTree/VSTree.h @@ -28,6 +28,9 @@ public: bool buildTree(std::string _entity_signature_file); bool deleteTree(); + //if the tree is empty + bool isEmpty() const; + //Incrementally update bitset of _entity_id conduct OR operation on Entry(_entity_id)'s EntityBitSet with _bitset //Entry of _entity_id must exists bool updateEntry(int _entity_id, const EntityBitSet& _bitset); @@ -55,6 +58,11 @@ public: void retrieveEntity(const EntityBitSet& _entity_bit_set, IDList* _p_id_list); private: + //TODO:add a tree lock(read-write), if a thread is writing, lock the whole tree + //NOTICE: all updates occur in one-thread, the buffer ensures that 3*h nodes can be loaded is ok + //However, only-read queries can occur in many threads, but a query at a time only need to keep one node + //But, how can you ensure that for a thread, its original node is at the top of the list? we must keep a lock for a node(but no need to write to disk)!!! + //(and each time select a unlocked node to swap out) int root_file_line; int node_num; int entry_num; @@ -82,9 +90,9 @@ private: //need to be insert to the _p_full_node. void split(VNode* _p_full_node, const SigEntry& _insert_entry, VNode* _p_insert_node); //deal when _child key num not enough - void coalesce(VNode* _child, int _entry_index); + void coalesce(VNode*& _child, int _entry_index); //create a new node when one node need splitting. - VNode* createNode(); + VNode* createNode(bool _is_leaf = true); //swap two nodes' file line, their related nodes(father and children nodes) will also be updated. void swapNodeFileLine(VNode* _p_node_a, VNode* _p_node_b); //save VSTree's information to tree_info_file_path, such as node_num, entry_num, height, etc. diff --git a/data/para1.sql b/data/para1.sql deleted file mode 100644 index bcdec9b..0000000 --- a/data/para1.sql +++ /dev/null @@ -1,4 +0,0 @@ -select ?p where -{ - ?p . 
-} diff --git a/data/para2.sql b/data/para2.sql deleted file mode 100644 index e04d0c0..0000000 --- a/data/para2.sql +++ /dev/null @@ -1,5 +0,0 @@ -select ?s ?o where -{ -?s ?o . -?s ?o . -} diff --git a/docs/INSTALL.md b/docs/INSTALL.md index 5dbd646..ec71dd4 100644 --- a/docs/INSTALL.md +++ b/docs/INSTALL.md @@ -1,8 +1,10 @@ +You are advised to read init.conf file, and modify it as you wish. (this file will configure the basic options of gStore system) + gStore is a green software, and you just need to compile it with one command. Please run `make` -in the gStore root directory to compile the gStore code, link the ANTLR lib, and build executable "gload", "gquery", "gserver", "gclient", "gconsole". What is more, the api of gStore is also built now. +in the gStore root directory to compile the gStore code, link the ANTLR lib, and build executable "gbuild", "gquery", "gserver", "gclient", "gconsole". What is more, the api of gStore is also built now. If you want to use API examples of gStore, please run `make APIexample` to compile example codes for both C++ API and Java API. For details of API, please visit [API](API.md) chapter. 
diff --git a/init.conf b/init.conf index 0943918..da57dea 100644 --- a/init.conf +++ b/init.conf @@ -1,17 +1,41 @@ # NOTICE: this file configures the gStore system, please remember to edit it before using gStore # NOTICE: each line should not exceed 500 +# NOTICE: the settings are for all databases in this gStore application [setting] # Basic settings for gStore system to run correctly as you want # MUST: please give your choice about using gStore for a single machine(mode = single) or using distributed gStore(mode = distribute) -mode = single +gstore_mode = single + +# you can choose to output how much debug information when the system is running: +# no(only result), simple(by default, only necessary information), all(all information, used to debug program) +# For more choices(for example, you want to control the output of each module, so you can debug a single module), +# please set this option to all and go to modify the debug macros in Util/Util.h (choose to comment out the debug option or not) +debug_level = simple [option] -#BETTER:the position is the root of Gstore by default -#(or change to a specified folder later) +# This option means which directory do you want to place your database in(the directory will be created if not exists) +# NOTICE:the position is the root of gStore system directory by default +# db_home = . +# user_home = . -#DBpath = . +# which suffix do you want to add to your database name? 
please set it here +# NOTICE:all database name ends with ".db" by default, but you shouldn't add ".db" to the name you pass to gStore system +# db_suffix = .db + +# how much memory(the maxium) do you give for gStore in your system: the unit is GB +# NOTICE: if the memory you assign is not adequate for your dataset, then gStore will run into error +buffer_maxium = 100 + +# NOTICE: please uncomment and modify the option below if you want to set the threads num that gStore can use(unlimited by default) +# thread_maxium = 1 +# thread_maxium = 1000 + + +# you can choose if to record the operation log in gStore, by default it is opened. +# If it is closed(that is, the option is uncommented and set to false), then gStore will run fatser but maybe not safe and recoverable +# operation_logs = true diff --git a/makefile b/makefile index 0d2bd68..2d27bda 100644 --- a/makefile +++ b/makefile @@ -43,11 +43,11 @@ CC = ccache g++ #NOTICE: -O2 is recommended, while -O3 is dangerous #when developing, not use -O because it will disturb the normal #routine. use it for test and release. 
-#CFLAGS = -c -Wall -g #-fprofile-arcs -ftest-coverage #-pg -#EXEFLAG = -g #-fprofile-arcs -ftest-coverage #-pg +CFLAGS = -c -Wall -g -pthread #-fprofile-arcs -ftest-coverage #-pg +EXEFLAG = -g -pthread #-fprofile-arcs -ftest-coverage #-pg #-coverage -CFLAGS = -c -Wall -O2 -EXEFLAG = -O2 +#CFLAGS = -c -Wall -O2 -pthread +#EXEFLAG = -O2 -pthread #add -lreadline -ltermcap if using readline or objs contain readline library = -ltermcap -lreadline -L./lib -lantlr -lgcov @@ -408,7 +408,7 @@ dist: clean tarball: tar -czvf devGstore.tar.gz api bin lib tools .debug .tmp .objs test docs data makefile \ - Main Database KVstore Util Query Signature VSTree Parser Server README.md init.conf NOTES.md StringIndex COVERAGE + Main Database KVstore Util Query Signature VSTree Parser Server README.md init.conf NOTES.md StringIndex COVERAGE LICENSE APIexample: $(api_cpp) $(api_java) $(MAKE) -C api/cpp/example @@ -452,3 +452,7 @@ fulltest: cd ~ bash full_test.sh +#test the efficience of kvstore, insert/delete/search, use dbpedia170M by default +test-kvstore: + echo "TODO" + diff --git a/test/kvstore_test.cpp b/test/kvstore_test.cpp new file mode 100644 index 0000000..1be1d31 --- /dev/null +++ b/test/kvstore_test.cpp @@ -0,0 +1,2217 @@ +/*============================================================================= +# Filename: kvstore_test.cpp +# Author: Bookug Lobert +# Mail: zengli-bookug@pku.edu.cn +# Last Modified: 2017-03-06 15:46 +# Description: test the efficience of insert, delete, search +=============================================================================*/ + +#include "../Util/Util.h" +#include "../Util/Triple.h" +#include "../KVstore/KVstore.h" +#include "../Parser/DBparser.h" +#include "../Parser/RDFParser.h" + +using namespace std; + +string db_name = "kvstore_test.db"; +string rdf_path = "/home/data/DBpedia/database/dbpedia170M.nt"; + +int triples_num; +int entity_num; +int sub_num; +int pre_num; +int literal_num; + +KVstore* kvstore; +string six_tuples_file; + 
+//triple num per group for insert/delete +//can not be too high, otherwise the heap will over +static const int GROUP_SIZE = 1000; +//manage the ID allocate and garbage +static const int START_ID_NUM = 0; +//static const int START_ID_NUM = 1000; +///////////////////////////////////////////////////////////////////////////////// +//NOTICE:error if >= LITERAL_FIRST_ID +string free_id_file_entity; //the first is limitID, then free id list +int limitID_entity; //the current maxium ID num(maybe not used so much) +BlockInfo* freelist_entity; //free id list, reuse BlockInfo for Storage class +int allocEntityID(); +void freeEntityID(int _id); +///////////////////////////////////////////////////////////////////////////////// +//NOTICE:error if >= 2*LITERAL_FIRST_ID +string free_id_file_literal; +int limitID_literal; +BlockInfo* freelist_literal; +int allocLiteralID(); +void freeLiteralID(int _id); +///////////////////////////////////////////////////////////////////////////////// +//NOTICE:error if >= 2*LITERAL_FIRST_ID +string free_id_file_predicate; +int limitID_predicate; +BlockInfo* freelist_predicate; +int allocPredicateID(); +void freePredicateID(int _id); +///////////////////////////////////////////////////////////////////////////////// +void initIDinfo(); //initialize the members +void resetIDinfo(); //reset the id info for build +void readIDinfo(); //read and build the free list +void writeIDinfo(); //write and empty the free list + + +//================================================================================================================================================== +//NOTICE: +//there are 3 ways to manage a dynamic-garbage ID list +//1. push all unused IDs into list, get top each time to alloc, push freed one to tail, push more large one if NULL +//(can use bit array if stored in disk) +//2. when free, change the mapping for non-free one whose ID is the largest currently +//(sometimes maybe the copy cost is very high) +//3. 
NULL first and push one if freed, get one if not empty(limit+1 if NULL) +//(when stored in disk, maybe consume larger space) +//However, this method can keep the used largest easily(no!! 1->2->3, delete 2 and delete 3, then the max 1 is not kept by limit!) + +void initIDinfo() +{ + //NOTICE:keep that limit-1 the maxium using ID + this->free_id_file_entity = this->getStorePath() + "/freeEntityID.dat"; + this->limitID_entity = 0; + this->freelist_entity = NULL; + + this->free_id_file_literal = this->getStorePath() + "/freeLiteralID.dat"; + this->limitID_literal = 0; + this->freelist_literal = NULL; + + this->free_id_file_predicate = this->getStorePath() + "/freePredicateID.dat"; + this->limitID_predicate = 0; + this->freelist_predicate = NULL; +} + +void +Database::resetIDinfo() +{ + this->initIDinfo(); + + //this->limitID_entity = Database::START_ID_NUM; + //NOTICE:add base LITERAL_FIRST_ID for literals + //this->limitID_literal = Database::START_ID_NUM; + //this->limitID_predicate = Database::START_ID_NUM; + + //BlockInfo* tmp = NULL; + //for (int i = Database::START_ID_NUM - 1; i >= 0; --i) + //{ + //tmp = new BlockInfo(i, this->freelist_entity); + //this->freelist_entity = tmp; + //tmp = new BlockInfo(i, this->freelist_literal); + //this->freelist_literal = tmp; + //tmp = new BlockInfo(i, this->freelist_predicate); + //this->freelist_predicate = tmp; + //} +} + +void +Database::readIDinfo() +{ + this->initIDinfo(); + + FILE* fp = NULL; + int t = -1; + BlockInfo* bp = NULL; + + fp = fopen(this->free_id_file_entity.c_str(), "r"); + if (fp == NULL) + { + cout << "read entity id info error" << endl; + return; + } + //QUERY:this will reverse the original order, if change? 
+ //Notice that if we cannot ensure that IDs are uporder and continuous, we can + //not keep an array for IDs like _entity_bitset + BlockInfo *tmp = NULL, *cur = NULL; + fread(&(this->limitID_entity), sizeof(int), 1, fp); + fread(&t, sizeof(int), 1, fp); + while (!feof(fp)) + { + //if(t == 14912) + //{ + //cout<<"Database::readIDinfo() - get 14912"<freelist_entity); + //this->freelist_entity = bp; + tmp = new BlockInfo(t); + if (cur == NULL) + { + this->freelist_entity = cur = tmp; + } + else + { + cur->next = tmp; + cur = tmp; + } + fread(&t, sizeof(int), 1, fp); + } + fclose(fp); + fp = NULL; + + + fp = fopen(this->free_id_file_literal.c_str(), "r"); + if (fp == NULL) + { + cout << "read literal id info error" << endl; + return; + } + fread(&(this->limitID_literal), sizeof(int), 1, fp); + fread(&t, sizeof(int), 1, fp); + while (!feof(fp)) + { + bp = new BlockInfo(t, this->freelist_literal); + this->freelist_literal = bp; + fread(&t, sizeof(int), 1, fp); + } + fclose(fp); + fp = NULL; + + fp = fopen(this->free_id_file_predicate.c_str(), "r"); + if (fp == NULL) + { + cout << "read predicate id info error" << endl; + return; + } + fread(&(this->limitID_predicate), sizeof(int), 1, fp); + fread(&t, sizeof(int), 1, fp); + while (!feof(fp)) + { + bp = new BlockInfo(t, this->freelist_predicate); + this->freelist_predicate = bp; + fread(&t, sizeof(int), 1, fp); + } + fclose(fp); + fp = NULL; +} + +void +Database::writeIDinfo() +{ + //cout<<"now to write the id info"<free_id_file_entity.c_str(), "w+"); + if (fp == NULL) + { + cout << "write entity id info error" << endl; + return; + } + fwrite(&(this->limitID_entity), sizeof(int), 1, fp); + bp = this->freelist_entity; + while (bp != NULL) + { + //if(bp->num == 14912) + //{ + //cout<<"Database::writeIDinfo() - get 14912"<num), sizeof(int), 1, fp); + tp = bp->next; + delete bp; + bp = tp; + } + fclose(fp); + fp = NULL; + + fp = fopen(this->free_id_file_literal.c_str(), "w+"); + if (fp == NULL) + { + cout << "write literal id 
info error" << endl; + return; + } + fwrite(&(this->limitID_literal), sizeof(int), 1, fp); + bp = this->freelist_literal; + while (bp != NULL) + { + fwrite(&(bp->num), sizeof(int), 1, fp); + tp = bp->next; + delete bp; + bp = tp; + } + fclose(fp); + fp = NULL; + + fp = fopen(this->free_id_file_predicate.c_str(), "w+"); + if (fp == NULL) + { + cout << "write predicate id info error" << endl; + return; + } + fwrite(&(this->limitID_predicate), sizeof(int), 1, fp); + bp = this->freelist_predicate; + while (bp != NULL) + { + fwrite(&(bp->num), sizeof(int), 1, fp); + tp = bp->next; + delete bp; + bp = tp; + } + fclose(fp); + fp = NULL; +} + +//ID alloc garbage error(LITERAL_FIRST_ID or double) add base for literal +int +Database::allocEntityID() +{ + int t; + if (this->freelist_entity == NULL) + { + t = this->limitID_entity++; + if (this->limitID_entity >= Util::LITERAL_FIRST_ID) + { + cout << "fail to alloc id for entity" << endl; + return -1; + } + } + else + { + t = this->freelist_entity->num; + BlockInfo* op = this->freelist_entity; + this->freelist_entity = this->freelist_entity->next; + delete op; + } + + this->entity_num++; + return t; +} + +void +Database::freeEntityID(int _id) +{ + if (_id == this->limitID_entity - 1) + { + this->limitID_entity--; + } + else + { + BlockInfo* p = new BlockInfo(_id, this->freelist_entity); + this->freelist_entity = p; + } + + this->entity_num--; +} + +int +Database::allocLiteralID() +{ + int t; + if (this->freelist_literal == NULL) + { + t = this->limitID_literal++; + if (this->limitID_literal >= Util::LITERAL_FIRST_ID) + { + cout << "fail to alloc id for literal" << endl; + return -1; + } + } + else + { + t = this->freelist_literal->num; + BlockInfo* op = this->freelist_literal; + this->freelist_literal = this->freelist_literal->next; + delete op; + } + + this->literal_num++; + return t + Util::LITERAL_FIRST_ID; +} + +void +Database::freeLiteralID(int _id) +{ + if (_id == this->limitID_literal - 1) + { + this->limitID_literal--; 
+ } + else + { + BlockInfo* p = new BlockInfo(_id - Util::LITERAL_FIRST_ID, this->freelist_literal); + this->freelist_literal = p; + } + + this->literal_num--; +} + +int +Database::allocPredicateID() +{ + int t; + if (this->freelist_predicate == NULL) + { + t = this->limitID_predicate++; + if (this->limitID_predicate >= Util::LITERAL_FIRST_ID) + { + cout << "fail to alloc id for predicate" << endl; + return -1; + } + } + else + { + t = this->freelist_predicate->num; + BlockInfo* op = this->freelist_predicate; + this->freelist_predicate = this->freelist_predicate->next; + delete op; + } + + this->pre_num++; + return t; +} + +void +Database::freePredicateID(int _id) +{ + if (_id == this->limitID_predicate - 1) + { + this->limitID_predicate--; + } + else + { + BlockInfo* p = new BlockInfo(_id, this->freelist_predicate); + this->freelist_predicate = p; + } + + this->pre_num--; +} + +//NOTICE+QUERY:to save memory for large cases, we can consider building one tree at a time(then release) +//Or read the rdf file on separate segments +//WARN:the ID type is int, and entity/literal are just separated by a limit +//which means that entity num <= 10^9 literal num <= 10^9 predicate num <= 2*10^9 +//If we use unsigned as type, then max triple can be 10^9(edge case) +//If we use long long, no more problem, but wasteful +//Or we can consider divide entity and literal totally +//In distributed gStore, each machine's graph should be based on unique encoding IDs, +//and require that triples in each graph no more than a limit(maybe 10^9) +bool +Database::build(const string& _rdf_file) +{ + //manage the id for a new database + this->resetIDinfo(); + + string ret = Util::getExactPath(_rdf_file.c_str()); + long tv_build_begin = Util::get_cur_time(); + + string kv_store_path = store_path + "/kv_store"; + Util::create_dir(kv_store_path); + + string vstree_store_path = store_path + "/vs_store"; + Util::create_dir(vstree_store_path); + + if (!this->encodeRDF_new(ret)) //<-- 
this->kvstore->id2* trees are closed + { + return false; + } + cout << "finish encode." << endl; + + //cout<<"test kv"<kvstore->getIDByPredicate("")<kvstore->flush(); + delete this->kvstore; + this->kvstore = NULL; + //sync(); + //cout << "sync kvstore" << endl; + //this->kvstore->release(); + //cout<<"release kvstore"<vstree->saveTree(); + //delete this->vstree; + //this->vstree = NULL; + //sync(); + //cout << "sync vstree" << endl; + + //string cmd = "rm -rf " + _entry_file; + //system(cmd.c_str()); + //cout << "signature file removed" << endl; + + return true; +} + +//root Path of this DB + sixTuplesFile +string +Database::getSixTuplesFile() +{ + return this->getStorePath() + "/" + this->six_tuples_file; +} + +/* root Path of this DB + signatureBFile */ +string +Database::getSignatureBFile() +{ + return this->getStorePath() + "/" + this->signature_binary_file; +} + +/* root Path of this DB + DBInfoFile */ +string +Database::getDBInfoFile() +{ + return this->getStorePath() + "/" + this->db_info_file; +} + +bool +Database::saveDBInfoFile() +{ + FILE* filePtr = fopen(this->getDBInfoFile().c_str(), "wb"); + + if (filePtr == NULL) + { + cout << "error, can not create db info file. @Database::saveDBInfoFile" << endl; + return false; + } + + fseek(filePtr, 0, SEEK_SET); + + fwrite(&this->triples_num, sizeof(int), 1, filePtr); + fwrite(&this->entity_num, sizeof(int), 1, filePtr); + fwrite(&this->sub_num, sizeof(int), 1, filePtr); + fwrite(&this->pre_num, sizeof(int), 1, filePtr); + fwrite(&this->literal_num, sizeof(int), 1, filePtr); + fwrite(&this->encode_mode, sizeof(int), 1, filePtr); + fclose(filePtr); + + Util::triple_num = this->triples_num; + Util::pre_num = this->pre_num; + Util::entity_num = this->entity_num; + Util::literal_num = this->literal_num; + + return true; +} + +bool +Database::loadDBInfoFile() +{ + FILE* filePtr = fopen(this->getDBInfoFile().c_str(), "rb"); + + if (filePtr == NULL) + { + cout << "error, can not open db info file. 
@Database::loadDBInfoFile" << endl; + return false; + } + + fseek(filePtr, 0, SEEK_SET); + + fread(&this->triples_num, sizeof(int), 1, filePtr); + fread(&this->entity_num, sizeof(int), 1, filePtr); + fread(&this->sub_num, sizeof(int), 1, filePtr); + fread(&this->pre_num, sizeof(int), 1, filePtr); + fread(&this->literal_num, sizeof(int), 1, filePtr); + fread(&this->encode_mode, sizeof(int), 1, filePtr); + fclose(filePtr); + + Util::triple_num = this->triples_num; + Util::pre_num = this->pre_num; + Util::entity_num = this->entity_num; + Util::literal_num = this->literal_num; + + return true; +} + +//check whether the relative 3-tuples exist usually, through sp2olist +bool +Database::exist_triple(int _sub_id, int _pre_id, int _obj_id) +{ + int* _objidlist = NULL; + int _list_len = 0; + (this->kvstore)->getobjIDlistBysubIDpreID(_sub_id, _pre_id, _objidlist, _list_len); + + bool is_exist = false; + // for(int i = 0; i < _list_len; i ++) + // { + // if(_objidlist[i] == _obj_id) + // { + // is_exist = true; + // break; + // } + // } + if (Util::bsearch_int_uporder(_obj_id, _objidlist, _list_len) != -1) + { + is_exist = true; + } + delete[] _objidlist; + + return is_exist; +} + +//NOTICE: all constants are transfered to ids in memory +//this maybe not ok when size is too large! +bool +Database::encodeRDF_new(const string _rdf_file) +{ +#ifdef DEBUG + //cout<< "now to log!!!" << endl; + Util::logging("In encodeRDF_new"); + //cout<< "end log!!!" << endl; +#endif + int** _p_id_tuples = NULL; + int _id_tuples_max = 0; + + long t1 = Util::get_cur_time(); + + //map sub2id, pre2id, entity/literal in obj2id, store in kvstore, encode RDF data into signature + if (!this->sub2id_pre2id_obj2id_RDFintoSignature(_rdf_file, _p_id_tuples, _id_tuples_max)) + { + return false; + } + + long t2 = Util::get_cur_time(); + cout << "after encode, used " << (t2 - t1) << "ms." 
<< endl; + + //NOTICE:close these trees now to save memory + this->kvstore->close_entity2id(); + this->kvstore->close_id2entity(); + this->kvstore->close_literal2id(); + this->kvstore->close_id2literal(); + this->kvstore->close_predicate2id(); + this->kvstore->close_id2predicate(); + + this->kvstore->build_subID2values(_p_id_tuples, this->triples_num); + long t3 = Util::get_cur_time(); + cout << "after s2xx, used " << (t3 - t2) << "ms." << endl; + + this->kvstore->build_objID2values(_p_id_tuples, this->triples_num); + long t4 = Util::get_cur_time(); + cout << "after o2xx, used " << (t4 - t3) << "ms." << endl; + + this->kvstore->build_preID2values(_p_id_tuples, this->triples_num); + long t5 = Util::get_cur_time(); + cout << "after p2xx, used " << (t5 - t4) << "ms." << endl; + + //WARN:we must free the memory for id_tuples array + for (int i = 0; i < this->triples_num; ++i) + { + delete[] _p_id_tuples[i]; + } + delete[] _p_id_tuples; + + bool flag = this->saveDBInfoFile(); + if (!flag) + { + return false; + } + + Util::logging("finish encodeRDF_new"); + + return true; +} + +//NOTICE:in here and there in the insert/delete, we may get the maxium tuples num first +//and so we can avoid the cost of memcpy(scan quickly or use wc -l) +//However, if use compressed RDF format, how can we do it fi not using parser? +//CONSIDER: just an estimated value is ok or use vector!!!(but vector also copy when enlarge) +//and read file line numbers are also costly! 
+bool +Database::sub2id_pre2id_obj2id_RDFintoSignature(const string _rdf_file, int**& _p_id_tuples, int & _id_tuples_max) +{ + int _id_tuples_size; + { + //initial + _id_tuples_max = 10 * 1000 * 1000; + _p_id_tuples = new int*[_id_tuples_max]; + _id_tuples_size = 0; + this->sub_num = 0; + this->pre_num = 0; + this->entity_num = 0; + this->literal_num = 0; + this->triples_num = 0; + (this->kvstore)->open_entity2id(KVstore::CREATE_MODE); + (this->kvstore)->open_id2entity(KVstore::CREATE_MODE); + (this->kvstore)->open_predicate2id(KVstore::CREATE_MODE); + (this->kvstore)->open_id2predicate(KVstore::CREATE_MODE); + (this->kvstore)->open_literal2id(KVstore::CREATE_MODE); + (this->kvstore)->open_id2literal(KVstore::CREATE_MODE); + } + + //Util::logging("finish initial sub2id_pre2id_obj2id"); + cout << "finish initial sub2id_pre2id_obj2id" << endl; + + //BETTER?:close the stdio buffer sync?? + ifstream _fin(_rdf_file.c_str()); + if (!_fin) + { + cout << "sub2id&pre2id&obj2id: Fail to open : " << _rdf_file << endl; + //exit(0); + return false; + } + + string _six_tuples_file = this->getSixTuplesFile(); + ofstream _six_tuples_fout(_six_tuples_file.c_str()); + if (!_six_tuples_fout) + { + cout << "sub2id&pre2id&obj2id: Fail to open: " << _six_tuples_file << endl; + //exit(0); + return false; + } + + TripleWithObjType* triple_array = new TripleWithObjType[RDFParser::TRIPLE_NUM_PER_GROUP]; + + //don't know the number of entity + //pre allocate entitybitset_max EntityBitSet for storing signature, double the space until the _entity_bitset is used up. 
+ int entitybitset_max = 10000000; //set larger to avoid the copy cost + //int entitybitset_max = 10000; + EntityBitSet** _entity_bitset = new EntityBitSet*[entitybitset_max]; + for (int i = 0; i < entitybitset_max; i++) + { + _entity_bitset[i] = new EntityBitSet(); + _entity_bitset[i]->reset(); + } + EntityBitSet _tmp_bitset; + + //parse a file + RDFParser _parser(_fin); + + Util::logging("==> while(true)"); + + while (true) + { + int parse_triple_num = 0; + + _parser.parseFile(triple_array, parse_triple_num); + + { + stringstream _ss; + _ss << "finish rdfparser" << this->triples_num << endl; + //Util::logging(_ss.str()); + cout << _ss.str() << endl; + } + cout << "after info in sub2id_" << endl; + + if (parse_triple_num == 0) + { + break; + } + + //Process the Triple one by one + for (int i = 0; i < parse_triple_num; i++) + { + this->triples_num++; + + //if the _id_tuples exceeds, double the space + if (_id_tuples_size == _id_tuples_max) + { + int _new_tuples_len = _id_tuples_max * 2; + int** _new_id_tuples = new int*[_new_tuples_len]; + memcpy(_new_id_tuples, _p_id_tuples, sizeof(int*) * _id_tuples_max); + delete[] _p_id_tuples; + _p_id_tuples = _new_id_tuples; + _id_tuples_max = _new_tuples_len; + } + + // For subject + // (all subject is entity, some object is entity, the other is literal) + string _sub = triple_array[i].getSubject(); + int _sub_id = (this->kvstore)->getIDByEntity(_sub); + if (_sub_id == -1) + { + //_sub_id = this->entity_num; + _sub_id = this->allocEntityID(); + //this->entity_num++; + (this->kvstore)->setIDByEntity(_sub, _sub_id); + (this->kvstore)->setEntityByID(_sub_id, _sub); + } + // For predicate + string _pre = triple_array[i].getPredicate(); + int _pre_id = (this->kvstore)->getIDByPredicate(_pre); + if (_pre_id == -1) + { + //_pre_id = this->pre_num; + _pre_id = this->allocPredicateID(); + //this->pre_num++; + (this->kvstore)->setIDByPredicate(_pre, _pre_id); + (this->kvstore)->setPredicateByID(_pre_id, _pre); + } + + // For object + 
string _obj = triple_array[i].getObject(); + int _obj_id = -1; + // obj is entity + if (triple_array[i].isObjEntity()) + { + _obj_id = (this->kvstore)->getIDByEntity(_obj); + if (_obj_id == -1) + { + //_obj_id = this->entity_num; + _obj_id = this->allocEntityID(); + //this->entity_num++; + (this->kvstore)->setIDByEntity(_obj, _obj_id); + (this->kvstore)->setEntityByID(_obj_id, _obj); + } + } + //obj is literal + if (triple_array[i].isObjLiteral()) + { + _obj_id = (this->kvstore)->getIDByLiteral(_obj); + if (_obj_id == -1) + { + //_obj_id = Util::LITERAL_FIRST_ID + (this->literal_num); + _obj_id = this->allocLiteralID(); + //this->literal_num++; + (this->kvstore)->setIDByLiteral(_obj, _obj_id); + (this->kvstore)->setLiteralByID(_obj_id, _obj); + //#ifdef DEBUG + //if(_obj == "\"Bob\"") + //{ + //cout << "this is id for Bob: " << _obj_id << endl; + //} + //cout<<"literal should be bob: " << kvstore->getLiteralByID(_obj_id)<getIDByLiteral("\"Bob\"")<entity_num >= entitybitset_max) + { + //cout<<"to double entity bitset num"<reset(); + } + + entitybitset_max = tmp; + } + + { + _tmp_bitset.reset(); + Signature::encodePredicate2Entity(_pre_id, _tmp_bitset, Util::EDGE_OUT); + Signature::encodeStr2Entity(_obj.c_str(), _tmp_bitset); + *_entity_bitset[_sub_id] |= _tmp_bitset; + } + + if (triple_array[i].isObjEntity()) + { + _tmp_bitset.reset(); + Signature::encodePredicate2Entity(_pre_id, _tmp_bitset, Util::EDGE_IN); + Signature::encodeStr2Entity(_sub.c_str(), _tmp_bitset); + //cout<<"objid: "<<_obj_id < end while(true)"<getSignatureBFile(); + FILE* _sig_fp = fopen(_sig_binary_file.c_str(), "wb"); + if (_sig_fp == NULL) { + cout << "Failed to open : " << _sig_binary_file << endl; + } + + //NOTICE:in build process, all IDs are continuous growing + //EntityBitSet _all_bitset; + for (int i = 0; i < this->entity_num; i++) + { + SigEntry* _sig = new SigEntry(EntitySig(*_entity_bitset[i]), i); + fwrite(_sig, sizeof(SigEntry), 1, _sig_fp); + //_all_bitset |= *_entity_bitset[i]; + 
delete _sig; + } + fclose(_sig_fp); + + for (int i = 0; i < entitybitset_max; i++) + { + delete _entity_bitset[i]; + } + delete[] _entity_bitset; + } + + { + stringstream _ss; + _ss << "finish sub2id pre2id obj2id" << endl; + _ss << "tripleNum is " << this->triples_num << endl; + _ss << "entityNum is " << this->entity_num << endl; + _ss << "preNum is " << this->pre_num << endl; + _ss << "literalNum is " << this->literal_num << endl; + Util::logging(_ss.str()); + cout << _ss.str() << endl; + } + + return true; +} + +bool +Database::insertTriple(const TripleWithObjType& _triple, vector* _vertices, vector* _predicates) +{ + //cout<kvstore->getIDByEntity(""); + //int sid2 = this->kvstore->getIDByEntity(""); + //int oid1 = this->kvstore->getIDByEntity(""); + //int oid2 = this->kvstore->getIDByEntity(""); + //cout<kvstore)->getIDByEntity(_triple.subject); + bool _is_new_sub = false; + //if sub does not exist + if (_sub_id == -1) + { + _is_new_sub = true; + //_sub_id = this->entity_num++; + _sub_id = this->allocEntityID(); + //cout<<"this is a new sub id"<sub_num++; + (this->kvstore)->setIDByEntity(_triple.subject, _sub_id); + (this->kvstore)->setEntityByID(_sub_id, _triple.subject); + + //update the string buffer + if (_sub_id < this->entity_buffer_size) + { + this->entity_buffer->set(_sub_id, _triple.subject); + } + + if (_vertices != NULL) + _vertices->push_back(_sub_id); + } + + int _pre_id = (this->kvstore)->getIDByPredicate(_triple.predicate); + bool _is_new_pre = false; + //if pre does not exist + if (_pre_id == -1) + { + _is_new_pre = true; + //_pre_id = this->pre_num++; + _pre_id = this->allocPredicateID(); + (this->kvstore)->setIDByPredicate(_triple.predicate, _pre_id); + (this->kvstore)->setPredicateByID(_pre_id, _triple.predicate); + + if (_predicates != NULL) + _predicates->push_back(_pre_id); + } + + //object is either entity or literal + int _obj_id = -1; + bool _is_new_obj = false; + bool is_obj_entity = _triple.isObjEntity(); + if (is_obj_entity) + { + 
_obj_id = (this->kvstore)->getIDByEntity(_triple.object); + + if (_obj_id == -1) + { + _is_new_obj = true; + //_obj_id = this->entity_num++; + _obj_id = this->allocEntityID(); + (this->kvstore)->setIDByEntity(_triple.object, _obj_id); + (this->kvstore)->setEntityByID(_obj_id, _triple.object); + + //update the string buffer + if (_obj_id < this->entity_buffer_size) + { + this->entity_buffer->set(_obj_id, _triple.object); + } + + if (_vertices != NULL) + _vertices->push_back(_obj_id); + } + } + else + { + _obj_id = (this->kvstore)->getIDByLiteral(_triple.object); + + if (_obj_id == -1) + { + _is_new_obj = true; + //_obj_id = Util::LITERAL_FIRST_ID + this->literal_num; + _obj_id = this->allocLiteralID(); + (this->kvstore)->setIDByLiteral(_triple.object, _obj_id); + (this->kvstore)->setLiteralByID(_obj_id, _triple.object); + + //update the string buffer + int tid = _obj_id - Util::LITERAL_FIRST_ID; + if (tid < this->literal_buffer_size) + { + this->literal_buffer->set(tid, _triple.object); + } + + if (_vertices != NULL) + _vertices->push_back(_obj_id); + } + } + + //if this is not a new triple, return directly + bool _triple_exist = false; + if (!_is_new_sub && !_is_new_pre && !_is_new_obj) + { + _triple_exist = this->exist_triple(_sub_id, _pre_id, _obj_id); + } + + //debug + // { + // stringstream _ss; + // _ss << this->literal_num << endl; + // _ss <<"ids: " << _sub_id << " " << _pre_id << " " << _obj_id << " " << _triple_exist << endl; + // Util::logging(_ss.str()); + // } + + if (_triple_exist) + { + cout << "this triple already exist" << endl; + return false; + } + else + { + this->triples_num++; + } + //cout<<"the triple spo ids: "<<_sub_id<<" "<<_pre_id<<" "<<_obj_id<kvstore)->updateTupleslist_insert(_sub_id, _pre_id, _obj_id); + + //int* list = NULL; + //int len = 0; + //int root = this->kvstore->getIDByEntity(""); + //int contain = this->kvstore->getIDByPredicate(""); + //this->kvstore->getobjIDlistBysubIDpreID(root, contain, list, len); + 
//cout<kvstore->getEntityByID(list[i])<<" "<encodeTriple2SubEntityBitSet(_sub_entity_bitset, &_triple); + + //if new entity then insert it, else update it. + if (_is_new_sub) + { + cout<<"to insert: "<<_sub_id<<" "<kvstore->getEntityByID(_sub_id)<vstree)->insertEntry(_sig); + } + else + { + //cout<<"to update: "<<_sub_id<<" "<kvstore->getEntityByID(_sub_id)<vstree)->updateEntry(_sub_id, _sub_entity_bitset); + } + + //if the object is an entity, then update or insert this entity's entry. + if (is_obj_entity) + { + EntityBitSet _obj_entity_bitset; + _obj_entity_bitset.reset(); + + this->encodeTriple2ObjEntityBitSet(_obj_entity_bitset, &_triple); + + if (_is_new_obj) + { + //cout<<"to insert: "<<_obj_id<<" "<kvstore->getEntityByID(_obj_id)<vstree)->insertEntry(_sig); + } + else + { + //cout<<"to update: "<<_obj_id<<" "<kvstore->getEntityByID(_obj_id)<vstree)->updateEntry(_obj_id, _obj_entity_bitset); + } + } + + long tv_vs_store_end = Util::get_cur_time(); + + //debug + { + cout << "update kv_store, used " << (tv_kv_store_end - tv_kv_store_begin) << "ms." << endl; + cout << "update vs_store, used " << (tv_vs_store_end - tv_kv_store_end) << "ms." 
<< endl; + } + + return true; + //return updateLen; +} + +bool +Database::removeTriple(const TripleWithObjType& _triple, vector* _vertices, vector* _predicates) +{ + long tv_kv_store_begin = Util::get_cur_time(); + + int _sub_id = (this->kvstore)->getIDByEntity(_triple.subject); + int _pre_id = (this->kvstore)->getIDByPredicate(_triple.predicate); + int _obj_id = (this->kvstore)->getIDByEntity(_triple.object); + if (_obj_id == -1) + { + _obj_id = (this->kvstore)->getIDByLiteral(_triple.object); + } + + if (_sub_id == -1 || _pre_id == -1 || _obj_id == -1) + { + return false; + } + bool _exist_triple = this->exist_triple(_sub_id, _pre_id, _obj_id); + if (!_exist_triple) + { + return false; + } + else + { + this->triples_num--; + } + + cout << "triple existence checked" << endl; + + //remove from sp2o op2s s2po o2ps s2o o2s + //sub2id, pre2id and obj2id will not be updated + (this->kvstore)->updateTupleslist_remove(_sub_id, _pre_id, _obj_id); + cout << "11 trees updated" << endl; + + long tv_kv_store_end = Util::get_cur_time(); + + int sub_degree = (this->kvstore)->getEntityDegree(_sub_id); + //if subject become an isolated point, remove its corresponding entry + if (sub_degree == 0) + { + cout<<"to remove entry for sub"<kvstore->getEntityByID(_sub_id)<kvstore->subEntityByID(_sub_id); + this->kvstore->subIDByEntity(_triple.subject); + (this->vstree)->removeEntry(_sub_id); + this->freeEntityID(_sub_id); + this->sub_num--; + //update the string buffer + if (_sub_id < this->entity_buffer_size) + { + this->entity_buffer->del(_sub_id); + } + if (_vertices != NULL) + _vertices->push_back(_sub_id); + } + //else re-calculate the signature of subject & replace that in vstree + else + { + //cout<<"to replace entry for sub"<kvstore->getEntityByID(_sub_id)<calculateEntityBitSet(_sub_id, _entity_bitset); + //NOTICE:can not use updateEntry as insert because this is in remove + //In insert we can add a OR operation and all is ok + (this->vstree)->replaceEntry(_sub_id, 
_entity_bitset); + } + //cout<<"subject dealed"<kvstore->getEntityDegree(_obj_id); + if (obj_degree == 0) + { + //cout<<"to remove entry for obj"<kvstore->getEntityByID(_obj_id)<kvstore->subEntityByID(_obj_id); + this->kvstore->subIDByEntity(_triple.object); + this->vstree->removeEntry(_obj_id); + this->freeEntityID(_obj_id); + //update the string buffer + if (_obj_id < this->entity_buffer_size) + { + this->entity_buffer->del(_obj_id); + } + if (_vertices != NULL) + _vertices->push_back(_obj_id); + } + else + { + //cout<<"to replace entry for obj"<kvstore->getEntityByID(_obj_id)<calculateEntityBitSet(_obj_id, _entity_bitset); + this->vstree->replaceEntry(_obj_id, _entity_bitset); + } + } + else + { + obj_degree = this->kvstore->getLiteralDegree(_obj_id); + if (obj_degree == 0) + { + this->kvstore->subLiteralByID(_obj_id); + this->kvstore->subIDByLiteral(_triple.object); + this->freeLiteralID(_obj_id); + //update the string buffer + int tid = _obj_id - Util::LITERAL_FIRST_ID; + if (tid < this->literal_buffer_size) + { + this->literal_buffer->del(tid); + } + if (_vertices != NULL) + _vertices->push_back(_obj_id); + } + } + //cout<<"object dealed"<kvstore->getPredicateDegree(_pre_id); + if (pre_degree == 0) + { + this->kvstore->subPredicateByID(_pre_id); + this->kvstore->subIDByPredicate(_triple.predicate); + this->freePredicateID(_pre_id); + if (_predicates != NULL) + _predicates->push_back(_pre_id); + } + //cout<<"predicate dealed"<load(); + if (!flag) + { + return false; + } + cout << "finish loading" << endl; + + long tv_load = Util::get_cur_time(); + + int success_num = 0; + + ifstream _fin(_rdf_file.c_str()); + if (!_fin) + { + cout << "fail to open : " << _rdf_file << ".@insert_test" << endl; + //exit(0); + return false; + } + + //NOTICE+WARN:we can not load all triples into memory all at once!!! 
+ //the parameter in build and insert must be the same, because RDF parser also use this + //for build process, this one can be big enough if memory permits + //for insert/delete process, this can not be too large, otherwise too costly + TripleWithObjType* triple_array = new TripleWithObjType[RDFParser::TRIPLE_NUM_PER_GROUP]; + //parse a file + RDFParser _parser(_fin); + + int triple_num = 0; +#ifdef DEBUG + Util::logging("==> while(true)"); +#endif + while (true) + { + int parse_triple_num = 0; + _parser.parseFile(triple_array, parse_triple_num); +#ifdef DEBUG + stringstream _ss; + //NOTICE:this is not same as others, use parse_triple_num directly + _ss << "finish rdfparser" << parse_triple_num << endl; + Util::logging(_ss.str()); + cout << _ss.str() << endl; +#endif + if (parse_triple_num == 0) + { + break; + } + + //Process the Triple one by one + success_num += this->insert(triple_array, parse_triple_num); + //some maybe invalid or duplicate + //triple_num += parse_triple_num; + } + + delete[] triple_array; + long tv_insert = Util::get_cur_time(); + cout << "after insert, used " << (tv_insert - tv_load) << "ms." << endl; + //BETTER:update kvstore and vstree separately, to lower the memory cost + //flag = this->vstree->saveTree(); + //if (!flag) + //{ + //return false; + //} + //flag = this->saveDBInfoFile(); + //if (!flag) + //{ + //return false; + //} + + cout << "insert rdf triples done." 
<< endl; + cout<<"inserted triples num: "<kvstore->getIDByEntity(""); + //int contain = this->kvstore->getIDByPredicate(""); + //this->kvstore->getobjIDlistBysubIDpreID(root, contain, list, len); + //cout<kvstore->getEntityByID(list[i])<<" "<kvstore->getIDByEntity(""); + //cout<kvstore->getEntityByID(0)<load(); + if (!flag) + { + return false; + } + cout << "finish loading" << endl; + + long tv_load = Util::get_cur_time(); + int success_num = 0; + + ifstream _fin(_rdf_file.c_str()); + if (!_fin) + { + cout << "fail to open : " << _rdf_file << ".@remove_test" << endl; + return false; + } + + //NOTICE+WARN:we can not load all triples into memory all at once!!! + TripleWithObjType* triple_array = new TripleWithObjType[RDFParser::TRIPLE_NUM_PER_GROUP]; + //parse a file + RDFParser _parser(_fin); + + //int triple_num = 0; +#ifdef DEBUG + Util::logging("==> while(true)"); +#endif + while (true) + { + int parse_triple_num = 0; + _parser.parseFile(triple_array, parse_triple_num); +#ifdef DEBUG + stringstream _ss; + //NOTICE:this is not same as others, use parse_triple_num directly + _ss << "finish rdfparser" << parse_triple_num << endl; + Util::logging(_ss.str()); + cout << _ss.str() << endl; +#endif + if (parse_triple_num == 0) + { + break; + } + + success_num += this->remove(triple_array, parse_triple_num); + //some maybe invalid or duplicate + //triple_num -= parse_triple_num; + } + + //TODO:better to free this just after id_tuples are ok + //(only when using group insertion/deletion) + //or reduce the array size + delete[] triple_array; + long tv_remove = Util::get_cur_time(); + cout << "after remove, used " << (tv_remove - tv_load) << "ms." << endl; + + //flag = this->vstree->saveTree(); + //if (!flag) + //{ + //return false; + //} + //flag = this->saveDBInfoFile(); + //if (!flag) + //{ + //return false; + //} + + cout << "remove rdf triples done." 
<< endl; + cout<<"removed triples num: "< vertices, predicates; + int valid_num = 0; + +#ifdef USE_GROUP_INSERT + //NOTICE:this is called by insert(file) or query()(but can not be too large), + //assume that db is loaded already + int** id_tuples = new int*[_triple_num]; + int i = 0; + //for(i = 0; i < _triple_num; ++i) + //{ + //id_tuples[i] = new int[3]; + //} + map old_sigmap; + map new_sigmap; + set new_entity; + map::iterator it; + EntityBitSet tmpset; + tmpset.reset(); + + int subid, objid, preid; + bool is_obj_entity; + for (i = 0; i < _triple_num; ++i) + { + bool is_new_sub = false, is_new_pre = false, is_new_obj = false; + + string sub = _triples[i].getSubject(); + subid = this->kvstore->getIDByEntity(sub); + if (subid == -1) + { + is_new_sub = true; + subid = this->allocEntityID(); + cout << "this is a new subject: " << sub << " " << subid << endl; + this->sub_num++; + this->kvstore->setIDByEntity(sub, subid); + this->kvstore->setEntityByID(subid, sub); + new_entity.insert(subid); + //add info and update buffer + vertices.push_back(subid); + if (subid < this->entity_buffer_size) + { + this->entity_buffer->set(subid, sub); + } + } + + string pre = _triples[i].getPredicate(); + preid = this->kvstore->getIDByPredicate(pre); + if (preid == -1) + { + is_new_pre = true; + preid = this->allocPredicateID(); + this->kvstore->setIDByPredicate(pre, preid); + this->kvstore->setPredicateByID(preid, pre); + predicates.push_back(preid); + } + + is_obj_entity = _triples[i].isObjEntity(); + string obj = _triples[i].getObject(); + if (is_obj_entity) + { + objid = this->kvstore->getIDByEntity(obj); + if (objid == -1) + { + is_new_obj = true; + objid = this->allocEntityID(); + cout << "this is a new object: " << obj << " " << objid << endl; + //this->obj_num++; + this->kvstore->setIDByEntity(obj, objid); + this->kvstore->setEntityByID(objid, obj); + new_entity.insert(objid); + //add info and update + vertices.push_back(objid); + if (objid < this->entity_buffer_size) + { + 
this->entity_buffer->set(objid, obj); + } + } + } + else //isObjLiteral + { + objid = this->kvstore->getIDByLiteral(obj); + if (objid == -1) + { + is_new_obj = true; + objid = this->allocLiteralID(); + //this->obj_num++; + this->kvstore->setIDByLiteral(obj, objid); + this->kvstore->setLiteralByID(objid, obj); + //add info and update + vertices.push_back(objid); + int tid = objid - Util::LITERAL_FIRST_ID; + if (tid < this->literal_buffer_size) + { + this->literal_buffer->set(tid, obj); + } + } + } + + bool triple_exist = false; + if (!is_new_sub && !is_new_pre && !is_new_obj) + { + triple_exist = this->exist_triple(subid, preid, objid); + } + if (triple_exist) + { +#ifdef DEBUG + cout << "this triple exist" << endl; +#endif + continue; + } +#ifdef DEBUG + cout << "this triple not exist" << endl; +#endif + + id_tuples[valid_num] = new int[3]; + id_tuples[valid_num][0] = subid; + id_tuples[valid_num][1] = preid; + id_tuples[valid_num][2] = objid; + this->triples_num++; + valid_num++; + + tmpset.reset(); + Signature::encodePredicate2Entity(preid, tmpset, Util::EDGE_OUT); + Signature::encodeStr2Entity(obj.c_str(), tmpset); + if (new_entity.find(subid) != new_entity.end()) + { + it = new_sigmap.find(subid); + if (it != new_sigmap.end()) + { + it->second |= tmpset; + } + else + { + new_sigmap[subid] = tmpset; + } + } + else + { + it = old_sigmap.find(subid); + if (it != old_sigmap.end()) + { + it->second |= tmpset; + } + else + { + old_sigmap[subid] = tmpset; + } + } + + if (is_obj_entity) + { + tmpset.reset(); + Signature::encodePredicate2Entity(preid, tmpset, Util::EDGE_IN); + Signature::encodeStr2Entity(sub.c_str(), tmpset); + if (new_entity.find(objid) != new_entity.end()) + { + it = new_sigmap.find(objid); + if (it != new_sigmap.end()) + { + it->second |= tmpset; + } + else + { + new_sigmap[objid] = tmpset; + } + } + else + { + it = old_sigmap.find(objid); + if (it != old_sigmap.end()) + { + it->second |= tmpset; + } + else + { + old_sigmap[objid] = tmpset; + } + } + 
} + } + +#ifdef DEBUG + cout << "old sigmap size: " << old_sigmap.size() << endl; + cout << "new sigmap size: " << new_sigmap.size() << endl; + cout << "valid num: " << valid_num << endl; +#endif + + //NOTICE:need to sort and remove duplicates, update the valid num + //Notice that duplicates in a group can csuse problem + //We finish this by spo cmp + + //this->kvstore->updateTupleslist_insert(_sub_id, _pre_id, _obj_id); + //sort and update kvstore: 11 indexes + // + //BETTER:maybe also use int* here with a size to start + //NOTICE:all kvtrees are opened now, one by one if memory is bottleneck + // + //spo cmp: s2p s2o s2po sp2o + { +#ifdef DEBUG + cout << "INSRET PROCESS: to spo cmp and update" << endl; +#endif + qsort(id_tuples, valid_num, sizeof(int*), KVstore::_spo_cmp); + + //To remove duplicates + //int ti = 1, tj = 1; + //while(tj < valid_num) + //{ + //if(id_tuples[tj][0] != id_tuples[tj-1][0] || id_tuples[tj][1] != id_tuples[tj-1][1] || id_tuples[tj][2] != id_tuples[tj-1][2]) + //{ + //id_tuples[ti][0] = id_tuples[tj][0]; + //id_tuples[ti][1] = id_tuples[tj][1]; + //id_tuples[ti][2] = id_tuples[tj][2]; + //ti++; + //} + //tj++; + //} + //for(tj = ti; tj < valid_num; ++tj) + //{ + //delete[] id_tuples[tj]; + //id_tuples[tj] = NULL; + //} + //valid_num = ti; + // + //Notice that below already consider duplicates in loop + + vector oidlist_s; + vector pidlist_s; + vector oidlist_sp; + vector pidoidlist_s; + + bool _sub_change = true; + bool _sub_pre_change = true; + bool _pre_change = true; + + for (int i = 0; i < valid_num; ++i) + if (i + 1 == valid_num || (id_tuples[i][0] != id_tuples[i + 1][0] || id_tuples[i][1] != id_tuples[i + 1][1] || id_tuples[i][2] != id_tuples[i + 1][2])) + { + int _sub_id = id_tuples[i][0]; + int _pre_id = id_tuples[i][1]; + int _obj_id = id_tuples[i][2]; + + oidlist_s.push_back(_obj_id); + oidlist_sp.push_back(_obj_id); + pidoidlist_s.push_back(_pre_id); + pidoidlist_s.push_back(_obj_id); + pidlist_s.push_back(_pre_id); + + 
_sub_change = (i + 1 == valid_num) || (id_tuples[i][0] != id_tuples[i + 1][0]); + _pre_change = (i + 1 == valid_num) || (id_tuples[i][1] != id_tuples[i + 1][1]); + _sub_pre_change = _sub_change || _pre_change; + + if (_sub_pre_change) + { +#ifdef DEBUG + cout << "update sp2o: " << _sub_id << " " << _pre_id << " " << oidlist_sp.size() << endl; +#endif + cout << this->kvstore->getEntityByID(_sub_id) << endl; + cout << this->kvstore->getPredicateByID(_pre_id) << endl; + //this->kvstore->updateInsert_sp2o(_sub_id, _pre_id, oidlist_sp); + oidlist_sp.clear(); + } + + if (_sub_change) + { +#ifdef DEBUG + cout << "update s2p: " << _sub_id << " " << pidlist_s.size() << endl; +#endif + //this->kvstore->updateInsert_s2p(_sub_id, pidlist_s); + pidlist_s.clear(); + +#ifdef DEBUG + cout << "update s2po: " << _sub_id << " " << pidoidlist_s.size() << endl; +#endif + this->kvstore->updateInsert_s2values(_sub_id, pidoidlist_s); + pidoidlist_s.clear(); + +#ifdef DEBUG + cout << "update s2o: " << _sub_id << " " << oidlist_s.size() << endl; +#endif + sort(oidlist_s.begin(), oidlist_s.end()); + //this->kvstore->updateInsert_s2o(_sub_id, oidlist_s); + oidlist_s.clear(); + } + + } +#ifdef DEBUG + cout << "INSERT PROCESS: OUT s2po..." 
<< endl; +#endif + } + //ops cmp: o2p o2s o2ps op2s + { +#ifdef DEBUG + cout << "INSRET PROCESS: to ops cmp and update" << endl; +#endif + qsort(id_tuples, valid_num, sizeof(int**), KVstore::_ops_cmp); + vector sidlist_o; + vector sidlist_op; + vector pidsidlist_o; + vector pidlist_o; + + bool _obj_change = true; + bool _pre_change = true; + bool _obj_pre_change = true; + + for (int i = 0; i < valid_num; ++i) + if (i + 1 == valid_num || (id_tuples[i][0] != id_tuples[i + 1][0] || id_tuples[i][1] != id_tuples[i + 1][1] || id_tuples[i][2] != id_tuples[i + 1][2])) + { + int _sub_id = id_tuples[i][0]; + int _pre_id = id_tuples[i][1]; + int _obj_id = id_tuples[i][2]; + + sidlist_o.push_back(_sub_id); + sidlist_op.push_back(_sub_id); + pidsidlist_o.push_back(_pre_id); + pidsidlist_o.push_back(_sub_id); + pidlist_o.push_back(_pre_id); + + _obj_change = (i + 1 == valid_num) || (id_tuples[i][2] != id_tuples[i + 1][2]); + _pre_change = (i + 1 == valid_num) || (id_tuples[i][1] != id_tuples[i + 1][1]); + _obj_pre_change = _obj_change || _pre_change; + + if (_obj_pre_change) + { +#ifdef DEBUG + cout << "update op2s: " << _obj_id << " " << _pre_id << " " << sidlist_op.size() << endl; +#endif + //this->kvstore->updateInsert_op2s(_obj_id, _pre_id, sidlist_op); + sidlist_op.clear(); + } + + if (_obj_change) + { +#ifdef DEBUG + cout << "update o2s: " << _obj_id << " " << sidlist_o.size() << endl; +#endif + sort(sidlist_o.begin(), sidlist_o.end()); + //this->kvstore->updateInsert_o2s(_obj_id, sidlist_o); + sidlist_o.clear(); + +#ifdef DEBUG + cout << "update o2ps: " << _obj_id << " " << pidsidlist_o.size() << endl; +#endif + this->kvstore->updateInsert_o2values(_obj_id, pidsidlist_o); + pidsidlist_o.clear(); + +#ifdef DEBUG + cout << "update o2p: " << _obj_id << " " << pidlist_o.size() << endl; +#endif + //this->kvstore->updateInsert_o2p(_obj_id, pidlist_o); + pidlist_o.clear(); + } + + } +#ifdef DEBUG + cout << "INSERT PROCESS: OUT o2ps..." 
<< endl; +#endif + } + //pso cmp: p2s p2o p2so + { +#ifdef DEBUG + cout << "INSRET PROCESS: to pso cmp and update" << endl; +#endif + qsort(id_tuples, valid_num, sizeof(int*), KVstore::_pso_cmp); + vector sidlist_p; + vector oidlist_p; + vector sidoidlist_p; + + bool _pre_change = true; + bool _sub_change = true; + //bool _pre_sub_change = true; + + for (int i = 0; i < valid_num; i++) + if (i + 1 == valid_num || (id_tuples[i][0] != id_tuples[i + 1][0] || id_tuples[i][1] != id_tuples[i + 1][1] || id_tuples[i][2] != id_tuples[i + 1][2])) + { + int _sub_id = id_tuples[i][0]; + int _pre_id = id_tuples[i][1]; + int _obj_id = id_tuples[i][2]; + + oidlist_p.push_back(_obj_id); + sidoidlist_p.push_back(_sub_id); + sidoidlist_p.push_back(_obj_id); + sidlist_p.push_back(_sub_id); + + _pre_change = (i + 1 == valid_num) || (id_tuples[i][1] != id_tuples[i + 1][1]); + _sub_change = (i + 1 == valid_num) || (id_tuples[i][0] != id_tuples[i + 1][0]); + //_pre_sub_change = _pre_change || _sub_change; + + if (_pre_change) + { +#ifdef DEBUG + cout << "update p2s: " << _pre_id << " " << sidlist_p.size() << endl; +#endif + //this->kvstore->updateInsert_p2s(_pre_id, sidlist_p); + sidlist_p.clear(); + +#ifdef DEBUG + cout << "update p2o: " << _pre_id << " " << oidlist_p.size() << endl; +#endif + sort(oidlist_p.begin(), oidlist_p.end()); + //this->kvstore->updateInsert_p2o(_pre_id, oidlist_p); + oidlist_p.clear(); + +#ifdef DEBUG + cout << "update p2so: " << _pre_id << " " << sidoidlist_p.size() << endl; +#endif + this->kvstore->updateInsert_p2values(_pre_id, sidoidlist_p); + sidoidlist_p.clear(); + } + } +#ifdef DEBUG + cout << "INSERT PROCESS: OUT p2so..." 
<< endl; +#endif + } + + + for (int i = 0; i < valid_num; ++i) + { + delete[] id_tuples[i]; + } + delete[] id_tuples; + + for (it = old_sigmap.begin(); it != old_sigmap.end(); ++it) + { + this->vstree->updateEntry(it->first, it->second); + } + for (it = new_sigmap.begin(); it != new_sigmap.end(); ++it) + { + SigEntry _sig(it->first, it->second); + this->vstree->insertEntry(_sig); + } +#else + //NOTICE:we deal with insertions one by one here + //Callers should save the vstree(node and info) after calling this function + for (int i = 0; i < _triple_num; ++i) + { + bool ret = this->insertTriple(_triples[i], &vertices, &predicates); + if(ret) + { + valid_num++; + } + } +#endif + + return valid_num; +} + +int +Database::remove(const TripleWithObjType* _triples, int _triple_num) +{ + vector vertices, predicates; + int valid_num = 0; + +#ifdef USE_GROUP_DELETE + //NOTICE:this is called by remove(file) or query()(but can not be too large), + //assume that db is loaded already + int** id_tuples = new int*[_triple_num]; + int i = 0; + //for(i = 0; i < _triple_num; ++i) + //{ + //id_tuples[i] = new int[3]; + //} + //map sigmap; + //map::iterator it; + EntityBitSet tmpset; + tmpset.reset(); + + int subid, objid, preid; + bool is_obj_entity; + for (i = 0; i < _triple_num; ++i) + { + string sub = _triples[i].getSubject(); + subid = this->kvstore->getIDByEntity(sub); + if(subid == -1) + { + continue; + } + + string pre = _triples[i].getPredicate(); + preid = this->kvstore->getIDByPredicate(pre); + if(preid == -1) + { + continue; + } + + is_obj_entity = _triples[i].isObjEntity(); + string obj = _triples[i].getObject(); + if (is_obj_entity) + { + objid = this->kvstore->getIDByEntity(obj); + } + else //isObjLiteral + { + objid = this->kvstore->getIDByLiteral(obj); + } + if(objid == -1) + { + continue; + } + + //if (subid == -1 || preid == -1 || objid == -1) + //{ + //continue; + //} + bool _exist_triple = this->exist_triple(subid, preid, objid); + if (!_exist_triple) + { + continue; 
+ } + + id_tuples[valid_num] = new int[3]; + id_tuples[valid_num][0] = subid; + id_tuples[valid_num][1] = preid; + id_tuples[valid_num][2] = objid; + this->triples_num--; + valid_num++; + } + + //NOTICE:sort and remove duplicates, update the valid num + //Notice that duplicates in a group can cause problem + + int sub_degree, obj_degree, pre_degree; + string tmpstr; + //sort and update kvstore: 11 indexes + // + //BETTER:maybe also use int* here with a size to start + //NOTICE:all kvtrees are opened now, one by one if memory is bottleneck + // + //spo cmp: s2p s2o s2po sp2o + { +#ifdef DEBUG + cout << "INSRET PROCESS: to spo cmp and update" << endl; +#endif + qsort(id_tuples, valid_num, sizeof(int*), KVstore::_spo_cmp); + vector oidlist_s; + vector pidlist_s; + vector oidlist_sp; + vector pidoidlist_s; + + bool _sub_change = true; + bool _sub_pre_change = true; + bool _pre_change = true; + + for (int i = 0; i < valid_num; ++i) + if (i + 1 == valid_num || (id_tuples[i][0] != id_tuples[i + 1][0] || id_tuples[i][1] != id_tuples[i + 1][1] || id_tuples[i][2] != id_tuples[i + 1][2])) + { + int _sub_id = id_tuples[i][0]; + int _pre_id = id_tuples[i][1]; + int _obj_id = id_tuples[i][2]; + + oidlist_s.push_back(_obj_id); + oidlist_sp.push_back(_obj_id); + pidoidlist_s.push_back(_pre_id); + pidoidlist_s.push_back(_obj_id); + pidlist_s.push_back(_pre_id); + + _sub_change = (i + 1 == valid_num) || (id_tuples[i][0] != id_tuples[i + 1][0]); + _pre_change = (i + 1 == valid_num) || (id_tuples[i][1] != id_tuples[i + 1][1]); + _sub_pre_change = _sub_change || _pre_change; + + if (_sub_pre_change) + { + this->kvstore->updateRemove_sp2o(_sub_id, _pre_id, oidlist_sp); + oidlist_sp.clear(); + } + + if (_sub_change) + { + this->kvstore->updateRemove_s2p(_sub_id, pidlist_s); + pidlist_s.clear(); + this->kvstore->updateRemove_s2po(_sub_id, pidoidlist_s); + pidoidlist_s.clear(); + + sort(oidlist_s.begin(), oidlist_s.end()); + this->kvstore->updateRemove_s2o(_sub_id, oidlist_s); + 
oidlist_s.clear(); + + sub_degree = (this->kvstore)->getEntityDegree(_sub_id); + if (sub_degree == 0) + { + tmpstr = this->kvstore->getEntityByID(_sub_id); + this->kvstore->subEntityByID(_sub_id); + this->kvstore->subIDByEntity(tmpstr); + (this->vstree)->removeEntry(_sub_id); + this->freeEntityID(_sub_id); + this->sub_num--; + //add info and update buffer + vertices.push_back(_sub_id); + if (_sub_id < this->entity_buffer_size) + { + this->entity_buffer->del(_sub_id); + } + } + else + { + tmpset.reset(); + this->calculateEntityBitSet(_sub_id, tmpset); + this->vstree->replaceEntry(_sub_id, tmpset); + } + } + + } +#ifdef DEBUG + cout << "INSERT PROCESS: OUT s2po..." << endl; +#endif + } + //ops cmp: o2p o2s o2ps op2s + { +#ifdef DEBUG + cout << "INSRET PROCESS: to ops cmp and update" << endl; +#endif + qsort(id_tuples, valid_num, sizeof(int**), KVstore::_ops_cmp); + vector sidlist_o; + vector sidlist_op; + vector pidsidlist_o; + vector pidlist_o; + + bool _obj_change = true; + bool _pre_change = true; + bool _obj_pre_change = true; + + for (int i = 0; i < valid_num; ++i) + if (i + 1 == valid_num || (id_tuples[i][0] != id_tuples[i + 1][0] || id_tuples[i][1] != id_tuples[i + 1][1] || id_tuples[i][2] != id_tuples[i + 1][2])) + { + int _sub_id = id_tuples[i][0]; + int _pre_id = id_tuples[i][1]; + int _obj_id = id_tuples[i][2]; + + sidlist_o.push_back(_sub_id); + sidlist_op.push_back(_sub_id); + pidsidlist_o.push_back(_pre_id); + pidsidlist_o.push_back(_sub_id); + pidlist_o.push_back(_pre_id); + + _obj_change = (i + 1 == valid_num) || (id_tuples[i][2] != id_tuples[i + 1][2]); + _pre_change = (i + 1 == valid_num) || (id_tuples[i][1] != id_tuples[i + 1][1]); + _obj_pre_change = _obj_change || _pre_change; + + if (_obj_pre_change) + { + this->kvstore->updateRemove_op2s(_obj_id, _pre_id, sidlist_op); + sidlist_op.clear(); + } + + if (_obj_change) + { + sort(sidlist_o.begin(), sidlist_o.end()); + this->kvstore->updateRemove_o2s(_obj_id, sidlist_o); + sidlist_o.clear(); + 
this->kvstore->updateRemove_o2ps(_obj_id, pidsidlist_o); + pidsidlist_o.clear(); + + this->kvstore->updateRemove_o2p(_obj_id, pidlist_o); + pidlist_o.clear(); + + is_obj_entity = this->objIDIsEntityID(_obj_id); + if (is_obj_entity) + { + obj_degree = this->kvstore->getEntityDegree(_obj_id); + if (obj_degree == 0) + { + tmpstr = this->kvstore->getEntityByID(_obj_id); + this->kvstore->subEntityByID(_obj_id); + this->kvstore->subIDByEntity(tmpstr); + (this->vstree)->removeEntry(_obj_id); + this->freeEntityID(_obj_id); + //add info and update buffer + vertices.push_back(_obj_id); + if (_obj_id < this->entity_buffer_size) + { + this->entity_buffer->del(_obj_id); + } + } + else + { + tmpset.reset(); + this->calculateEntityBitSet(_obj_id, tmpset); + this->vstree->replaceEntry(_obj_id, tmpset); + } + } + else + { + obj_degree = this->kvstore->getLiteralDegree(_obj_id); + if (obj_degree == 0) + { + tmpstr = this->kvstore->getLiteralByID(_obj_id); + this->kvstore->subLiteralByID(_obj_id); + this->kvstore->subIDByLiteral(tmpstr); + this->freeLiteralID(_obj_id); + //add info and update buffer + vertices.push_back(_obj_id); + int tid = _obj_id - Util::LITERAL_FIRST_ID; + if (tid < this->literal_buffer_size) + { + this->literal_buffer->del(tid); + } + } + } + } + + } +#ifdef DEBUG + cout << "INSERT PROCESS: OUT o2ps..." 
<< endl; +#endif + } + //pso cmp: p2s p2o p2so + { +#ifdef DEBUG + cout << "INSRET PROCESS: to pso cmp and update" << endl; +#endif + qsort(id_tuples, valid_num, sizeof(int*), KVstore::_pso_cmp); + vector sidlist_p; + vector oidlist_p; + vector sidoidlist_p; + + bool _pre_change = true; + bool _sub_change = true; + //bool _pre_sub_change = true; + + for (int i = 0; i < valid_num; i++) + if (i + 1 == valid_num || (id_tuples[i][0] != id_tuples[i + 1][0] || id_tuples[i][1] != id_tuples[i + 1][1] || id_tuples[i][2] != id_tuples[i + 1][2])) + { + int _sub_id = id_tuples[i][0]; + int _pre_id = id_tuples[i][1]; + int _obj_id = id_tuples[i][2]; + + oidlist_p.push_back(_obj_id); + sidoidlist_p.push_back(_sub_id); + sidoidlist_p.push_back(_obj_id); + sidlist_p.push_back(_sub_id); + + _pre_change = (i + 1 == valid_num) || (id_tuples[i][1] != id_tuples[i + 1][1]); + _sub_change = (i + 1 == valid_num) || (id_tuples[i][0] != id_tuples[i + 1][0]); + //_pre_sub_change = _pre_change || _sub_change; + + if (_pre_change) + { + this->kvstore->updateRemove_p2s(_pre_id, sidlist_p); + sidlist_p.clear(); + + sort(oidlist_p.begin(), oidlist_p.end()); + this->kvstore->updateRemove_p2o(_pre_id, oidlist_p); + oidlist_p.clear(); + + this->kvstore->updateRemove_p2so(_pre_id, sidoidlist_p); + sidoidlist_p.clear(); + + pre_degree = this->kvstore->getPredicateDegree(_pre_id); + if (pre_degree == 0) + { + tmpstr = this->kvstore->getPredicateByID(_pre_id); + this->kvstore->subPredicateByID(_pre_id); + this->kvstore->subIDByPredicate(tmpstr); + this->freePredicateID(_pre_id); + //add pre info + predicates.push_back(_pre_id); + } + } + } +#ifdef DEBUG + cout << "INSERT PROCESS: OUT p2so..." 
<< endl; +#endif + } + + + for (int i = 0; i < valid_num; ++i) + { + delete[] id_tuples[i]; + } + delete[] id_tuples; +#else + //NOTICE:we deal with deletions one by one here + //Callers should save the vstree(node and info) after calling this function + for (int i = 0; i < _triple_num; ++i) + { + bool ret = this->removeTriple(_triples[i], &vertices, &predicates); + if(ret) + { + valid_num++; + } + } +#endif + + return valid_num; +} + +bool +Database::objIDIsEntityID(int _id) +{ + return _id < Util::LITERAL_FIRST_ID; +} + +int +main(int argc, char * argv[]) +{ + //chdir(dirname(argv[0])); +#ifdef DEBUG + Util util; +#endif + + return 0; +} + diff --git a/test/signature_bind/Signature.cpp b/test/signature_bind/Signature.cpp new file mode 100644 index 0000000..dde9134 --- /dev/null +++ b/test/signature_bind/Signature.cpp @@ -0,0 +1,293 @@ +/*============================================================================= +# Filename: Signature.cpp +# Author: Bookug Lobert +# Mail: zengli-bookug@pku.edu.cn +# Last Modified: 2016-04-11 13:18 +# Description: +=============================================================================*/ + +#include "Signature.h" + +using namespace std; + +std::string +Signature::BitSet2str(const EntityBitSet& _bitset) +{ + std::stringstream _ss; + bool any = false; + for (unsigned i = 0; i < _bitset.size(); i++) + { + if (_bitset.test(i)) + { + _ss << "[" << i << "] "; + any = true; + } + } + if (!any) + { + _ss << "empty" << endl; + } + _ss << endl; + return _ss.str(); +} + +//void +//Signature::encodePredicate2Entity(int _pre_id, EntityBitSet& _entity_bs, const char _type) +//{ + //if (Signature::PREDICATE_ENCODE_METHOD == 0) + //{ + ////WARN:change if need to use again, because the encoding method has changed now! 
+ //int pos = ((_pre_id + 10) % Signature::EDGE_SIG_LENGTH) + Signature::STR_SIG_LENGTH; + //_entity_bs.set(pos); + //} + //else + //{ + //int seed_num = _pre_id % Signature::EDGE_SIG_INTERVAL_NUM_HALF; + + //if (_type == Util::EDGE_OUT) + //{ + //seed_num += Signature::EDGE_SIG_INTERVAL_NUM_HALF; + //} + + ////int primeSize = 5; + ////int prime1[]={5003,5009,5011,5021,5023}; + ////int prime2[]={49943,49957,49991,49993,49999}; + + ////NOTICE: more ones in the bitset(use more primes) means less conflicts, but also weakens the filtration of VSTree. + //// when the data set is big enough, cutting down the size of candidate list should come up to our primary consideration. + //// in this case we should not encode too many ones in entities' signature. + //// also, when the data set is small, hash conflicts can hardly happen. + //// therefore, I think using 2 primes(set up two ones in bitset) is enough. + //// --by hanshuo. + ////int primeSize = 2; + ////int prime1[] = {5003, 5011}; + ////int prime2[] = {49957, 49993}; + + ////for(int i = 0; i < primeSize; i++) + ////{ + ////int seed = _pre_id * prime1[i] % prime2[i]; + ////int pos = (seed % Signature::EDGE_SIG_INTERVAL_BASE) + Signature::STR_SIG_LENGTH + Signature::EDGE_SIG_INTERVAL_BASE * seed_num; + ////_entity_bs.set(pos); + ////} + //int seed = _pre_id * 5003 % 49957; + //int pos = (seed % Signature::EDGE_SIG_INTERVAL_BASE) + Signature::STR_SIG_LENGTH + Signature::EDGE_SIG_INTERVAL_BASE * seed_num; + //_entity_bs.set(pos); + //} +//} + +//void +//Signature::encodePredicate2Edge(int _pre_id, EdgeBitSet& _edge_bs) +//{ + //if (Signature::PREDICATE_ENCODE_METHOD == 0) + //{ + //int pos = (_pre_id + 10) % Signature::EDGE_SIG_LENGTH; + //_edge_bs.set(pos); + //} + //else + //{ + //int seed_num = _pre_id % Signature::EDGE_SIG_INTERVAL_NUM_HALF; + ////int primeSize = 5; + ////int prime1[]={5003,5009,5011,5021,5023}; + ////int prime2[]={49943,49957,49991,49993,49999}; + + ////int primeSize = 2; + ////int prime1[] = 
{5003,5011}; + ////int prime2[] = {49957,49993}; + + ////for (int i = 0; i < primeSize; i++) + ////{ + ////int seed = _pre_id * prime1[i] % prime2[i]; + ////int pos = (seed % Signature::EDGE_SIG_INTERVAL_BASE) + Signature::EDGE_SIG_INTERVAL_BASE * seed_num; + ////_edge_bs.set(pos); + ////} + //int seed = _pre_id * 5003 % 49957; + //int pos = (seed % Signature::EDGE_SIG_INTERVAL_BASE) + Signature::EDGE_SIG_INTERVAL_BASE * seed_num; + //_edge_bs.set(pos); + //} +//} + +////NOTICE: no need to encode itself because only variable in query need to be filtered! +////So only consider all neighbors! +//void +//Signature::encodeStr2Entity(const char* _str, EntityBitSet& _entity_bs) +//{ + ////_str is subject or object or literal + //if (strlen(_str) >0 && _str[0] == '?') + //return; + + //int length = (int)strlen(_str); + //unsigned int hashKey = 0; + //unsigned int pos = 0; + //char *str2 = (char*)calloc(length + 1, sizeof(char)); + //strcpy(str2, _str); + //char *str = str2; + + //unsigned base = Signature::STR_SIG_BASE * (Signature::HASH_NUM - 1); + //for (int i = Signature::HASH_NUM - 1; i >= 0; --i) + //{ + //HashFunction hf = Util::hash[i]; + //if (hf == NULL) + //break; + //hashKey = hf(str); + //str = str2; + //pos = base + hashKey % Signature::STR_SIG_BASE; + //base -= Signature::STR_SIG_BASE; + //if (_str[0] == '"') + //{ + //pos += Signature::STR_SIG_LENGTH2; + //} + //else if (_str[0] != '<') + //{ +//#ifdef DEBUG_VSTREE + //cerr << "error in encodeStr2Entity(): neighbor is neither a literal or entity!" << endl; +//#endif + //} + //_entity_bs.set(pos); + //} + ////BETTER: use multiple threads for different hash functions + +//#ifdef DEBUG_VSTREE + ////std::stringstream _ss; + ////_ss << "encodeStr2Entity:" << str2 << endl; + ////Util::logging(_ss.str()); +//#endif + //free(str2); +//} + +//void +//Signature::encodeStrID2Entity(int _str_id, EntityBitSet& _entity_bs) +//{ + ////NO NEED +//} + +//TODO: what if pre or neighbor not exist in Query? 
how to ensure the containment?! +void +Signature::encodeEdge2Entity(EntityBitSet& _entity_bs, int _pre_id, int _neighbor_id, const char _type) +{ + //switch(_type) + //{ + //case Util::EDGE_IN: + //break; + //case Util::EDGE_OUT: + //break; + //default: + //cout<<"error in Signature::encodeEdge2Entity() - non seen type"<entityBitSet.reset(); +} + +EntitySig::EntitySig(const EntitySig* _p_sig) +{ + this->entityBitSet.reset(); + this->entityBitSet |= _p_sig->entityBitSet; +} + +EntitySig::EntitySig(const EntitySig& _sig) +{ + this->entityBitSet.reset(); + this->entityBitSet |= _sig.entityBitSet; +} + +EntitySig::EntitySig(const EntityBitSet& _bitset) +{ + this->entityBitSet.reset(); + this->entityBitSet |= _bitset; +} + +EntitySig& +EntitySig::operator|=(const EntitySig& _sig) +{ + this->entityBitSet |= _sig.entityBitSet; + return *this; +} + +bool +EntitySig::operator==(const EntitySig& _sig)const +{ + return (this->entityBitSet == _sig.entityBitSet); +} + +bool +EntitySig::operator!=(const EntitySig& _sig)const +{ + return (this->entityBitSet != _sig.entityBitSet); +} + +EntitySig& +EntitySig::operator=(const EntitySig& _sig) +{ + this->entityBitSet.reset(); + this->entityBitSet |= _sig.getBitset(); + return *this; +} + +const EntityBitSet& +EntitySig::getBitset()const +{ + return this->entityBitSet; +} + +//EdgeSig::EdgeSig() +//{ + //this->edgeBitSet.reset(); +//} + +//EdgeSig::EdgeSig(const EdgeSig* _p_sig) +//{ + //this->edgeBitSet.reset(); + //this->edgeBitSet |= _p_sig->edgeBitSet; +//} + +//EdgeSig::EdgeSig(const EdgeSig& _sig) +//{ + //this->edgeBitSet.reset(); + //this->edgeBitSet |= _sig.edgeBitSet; +//} + +//EdgeSig::EdgeSig(const EdgeBitSet& _bitset) +//{ + //this->edgeBitSet.reset(); + //this->edgeBitSet |= _bitset; +//} + +//EdgeSig& +//EdgeSig::operator|=(const EdgeSig& _sig) +//{ + //this->edgeBitSet |= _sig.edgeBitSet; + //return *this; +//} + +string +EntitySig::to_str() const +{ + std::stringstream _ss; + + _ss << 
Signature::BitSet2str(this->entityBitSet); + + return _ss.str(); +} + diff --git a/test/signature_bind/Signature.h b/test/signature_bind/Signature.h new file mode 100644 index 0000000..d244432 --- /dev/null +++ b/test/signature_bind/Signature.h @@ -0,0 +1,124 @@ +/*============================================================================= +# Filename: Signature.h +# Author: Bookug Lobert +# Mail: zengli-bookug@pku.edu.cn +# Last Modified: 2016-04-11 12:50 +# Description: written by liyouhuan and hanshuo, modified by zengli +=============================================================================*/ + + +#ifndef _SIGNATURE_SIGNATURE_H +#define _SIGNATURE_SIGNATURE_H + +#include "../Util/Util.h" + +class Signature +{ +public: + //TODO:how to set the length as a dynamic parameter? below use typedef bitset + //keep the parameter in Database and passed to here and Query?? + //QUERY: is this really needed?! + + //static HashFunction hash[HashNum]; + static const int PRE_SIG_BASE = 10; + static const int STR_SIG_BASE = PRE_SIG_BASE; + static const int STR_SIG_LENGTH = 3 * STR_SIG_BASE; + static const int ENTITY_SIG_INTERVAL_BASE = PRE_SIG_BASE + STR_SIG_LENGTH; + static const int EDGE_SIG_INTERVAL_NUM = 5; //in edge or out edge, entity or literal + static const int ENTITY_SIG_INTERVAL_HALF = 2 * EDGE_SIG_INTERVAL_NUM; + //STRUCT: in egde | out egde(neighbor is entity | literal(pre1 pre2 pre3 pre4 pre5(str pre * 3) ) ) + //NOTICE: the length should below 1000 + static const int ENTITY_SIG_LENGTH = 2 * 2 * EDGE_SIG_INTERVAL_NUM * ENTITY_SIG_INTERVAL_BASE; + + //NOTICE: we can also use id here, but string is recommended due to special structure + //(maybe needed later, for example, wildcards) + //Th ehash function is costly, so just use two + //static const int HASH_NUM = 3; //no more than Util::HashNum + //NOTICE:if using str id, we can also divide like EDGE_SIG + //here we divide as entity neighbors and literal neighbors: ENTITY, LITERAL + //static const int 
STR_SIG_LENGTH = 2 * STR_SIG_BASE * HASH_NUM; //250 + //static const int STR_SIG_LENGTH2 = STR_SIG_BASE * HASH_NUM; + + //QUERY:I think that str filter is more important in VSTree than predicate, because + //a predicate may correspond to a lot of entities and predicate num is usually small + //static const int EDGE_SIG_INTERVAL_NUM_HALF = 5; //in edge or out edge + //static const int EDGE_SIG_INTERVAL_NUM = 2 * EDGE_SIG_INTERVAL_NUM_HALF; + //static const int EDGE_SIG_INTERVAL_BASE = 20; + //static const int EDGE_SIG_LENGTH = EDGE_SIG_INTERVAL_NUM * EDGE_SIG_INTERVAL_BASE; //150 + //static const int EDGE_SIG_LENGTH2 = EDGE_SIG_INTERVAL_NUM_HALF * EDGE_SIG_INTERVAL_BASE; //150 + + //static const int ENTITY_SIG_LENGTH = STR_SIG_LENGTH + EDGE_SIG_LENGTH; + //static const int ENTITY_SIG_LENGTH = STR_SIG_LENGTH + EDGE_SIG_LENGTH + NEIGHBOR_SIG_LENGTH; + + //typedef std::bitset EdgeBitSet; + typedef std::bitset EntityBitSet; + + static std::string BitSet2str(const EntityBitSet& _bitset); + + //NOTICE: there are two predicate encoding method now, see the encoding functions @Signature.cpp for details + //const static int PREDICATE_ENCODE_METHOD = 1; + //static void encodePredicate2Entity(int _pre_id, EntityBitSet& _entity_bs, const char _type); + //static void encodePredicate2Edge(int _pre_id, EdgeBitSet& _edge_bs); + //static void encodeStr2Entity(const char* _str, EntityBitSet& _entity_bs); //_str is subject or object(literal) + //static void encodeStrID2Entity(int _str_id, EntityBitSet& _entity_bs); + static void encodeEdge2Entity(EntityBitSet& _entity_bs, int _pre_id, int _neighbor_id, const char _type); + //Signature() + //{ + //NOTICE:not exceed the HashNum + //this->hash = new HashFunction[HashNum]; + //this->hash[0] = Util::simpleHash; + //this->hash[1] = Util::APHash; + //this->hash[2] = Util::BKDRHash; + //this->hash[3] = Util::DJBHash; + //this->hash[4] = Util::ELFHash; + //this->hash[5] = Util::DEKHash; + //this->hash[6] = Util::BPHash; + //this->hash[7] = 
Util::FNVHash; + //this->hash[8] = Util::HFLPHash; + //this->hash[9] = Util::HFHash; + //this->hash[10] = Util::JSHash; + //this->hash[11] = Util::PJWHash; + //this->hash[12] = Util::RSHash; + //this->hash[13] = Util::SDBMHash; + //this->hash[14] = Util::StrHash; + //this->hash[15] = Util::TianlHash; + //} + //~Signature() + //{ + //delete[] this->hash; + //} +}; + +//WARN:also defined in Signature, must be same!!! +//NOTICE:EdgeBitSet is only used in Query, not for VSTree +//typedef std::bitset EdgeBitSet; +typedef std::bitset EntityBitSet; + +class EntitySig : Signature{ +public: + EntityBitSet entityBitSet; + EntitySig(); + EntitySig(const EntitySig* _p_sig); + EntitySig(const EntitySig& _sig); + EntitySig(const EntityBitSet& _bitset); + EntitySig& operator|=(const EntitySig& _sig); + bool operator==(const EntitySig& _sig)const; + bool operator!=(const EntitySig& _sig)const; + EntitySig& operator=(const EntitySig& _sig); + const EntityBitSet& getBitset()const; + void encode(const char * _str, int _pre_id); + std::string to_str() const; +}; + +//class EdgeSig : Signature{ +//public: + //EdgeBitSet edgeBitSet; + //EdgeSig(); + //EdgeSig(const EdgeSig* _p_sig); + //EdgeSig(const EdgeSig& _sig); + //EdgeSig(const EdgeBitSet& _bitset); + //EdgeSig& operator|=(const EdgeSig& _sig); +//}; + +#endif // _SIGNATURE_SIGNATURE_H + diff --git a/test/signature_separate/Signature.cpp b/test/signature_separate/Signature.cpp new file mode 100644 index 0000000..4f52731 --- /dev/null +++ b/test/signature_separate/Signature.cpp @@ -0,0 +1,257 @@ +/*============================================================================= +# Filename: Signature.cpp +# Author: Bookug Lobert +# Mail: zengli-bookug@pku.edu.cn +# Last Modified: 2016-04-11 13:18 +# Description: +=============================================================================*/ + +#include "Signature.h" + +using namespace std; + +std::string +Signature::BitSet2str(const EntityBitSet& _bitset) +{ + std::stringstream _ss; 
+ bool any = false; + for (unsigned i = 0; i < _bitset.size(); i++) + { + if (_bitset.test(i)) + { + _ss << "[" << i << "] "; + any = true; + } + } + if (!any) + { + _ss << "empty" << endl; + } + _ss << endl; + return _ss.str(); +} + +void +Signature::encodePredicate2Entity(int _pre_id, EntityBitSet& _entity_bs, const char _type) +{ + if (Signature::PREDICATE_ENCODE_METHOD == 0) + { + //WARN:change if need to use again, because the encoding method has changed now! + int pos = ((_pre_id + 10) % Signature::EDGE_SIG_LENGTH) + Signature::STR_SIG_LENGTH; + _entity_bs.set(pos); + } + else + { + int seed_num = _pre_id % Signature::EDGE_SIG_INTERVAL_NUM_HALF; + + if (_type == Util::EDGE_OUT) + { + seed_num += Signature::EDGE_SIG_INTERVAL_NUM_HALF; + } + + //int primeSize = 5; + //int prime1[]={5003,5009,5011,5021,5023}; + //int prime2[]={49943,49957,49991,49993,49999}; + + //NOTICE: more ones in the bitset(use more primes) means less conflicts, but also weakens the filtration of VSTree. + // when the data set is big enough, cutting down the size of candidate list should come up to our primary consideration. + // in this case we should not encode too many ones in entities' signature. + // also, when the data set is small, hash conflicts can hardly happen. + // therefore, I think using 2 primes(set up two ones in bitset) is enough. + // --by hanshuo. 
+ //int primeSize = 2; + //int prime1[] = {5003, 5011}; + //int prime2[] = {49957, 49993}; + + //for(int i = 0; i < primeSize; i++) + //{ + //int seed = _pre_id * prime1[i] % prime2[i]; + //int pos = (seed % Signature::EDGE_SIG_INTERVAL_BASE) + Signature::STR_SIG_LENGTH + Signature::EDGE_SIG_INTERVAL_BASE * seed_num; + //_entity_bs.set(pos); + //} + int seed = _pre_id * 5003 % 49957; + int pos = (seed % Signature::EDGE_SIG_INTERVAL_BASE) + Signature::STR_SIG_LENGTH + Signature::EDGE_SIG_INTERVAL_BASE * seed_num; + _entity_bs.set(pos); + } +} + +void +Signature::encodePredicate2Edge(int _pre_id, EdgeBitSet& _edge_bs) +{ + if (Signature::PREDICATE_ENCODE_METHOD == 0) + { + int pos = (_pre_id + 10) % Signature::EDGE_SIG_LENGTH; + _edge_bs.set(pos); + } + else + { + int seed_num = _pre_id % Signature::EDGE_SIG_INTERVAL_NUM_HALF; + //int primeSize = 5; + //int prime1[]={5003,5009,5011,5021,5023}; + //int prime2[]={49943,49957,49991,49993,49999}; + + //int primeSize = 2; + //int prime1[] = {5003,5011}; + //int prime2[] = {49957,49993}; + + //for (int i = 0; i < primeSize; i++) + //{ + //int seed = _pre_id * prime1[i] % prime2[i]; + //int pos = (seed % Signature::EDGE_SIG_INTERVAL_BASE) + Signature::EDGE_SIG_INTERVAL_BASE * seed_num; + //_edge_bs.set(pos); + //} + int seed = _pre_id * 5003 % 49957; + int pos = (seed % Signature::EDGE_SIG_INTERVAL_BASE) + Signature::EDGE_SIG_INTERVAL_BASE * seed_num; + _edge_bs.set(pos); + } +} + +//NOTICE: no need to encode itself because only variable in query need to be filtered! +//So only consider all neighbors! 
+void +Signature::encodeStr2Entity(const char* _str, EntityBitSet& _entity_bs) +{ + //_str is subject or object or literal + if (strlen(_str) >0 && _str[0] == '?') + return; + + int length = (int)strlen(_str); + unsigned int hashKey = 0; + unsigned int pos = 0; + char *str2 = (char*)calloc(length + 1, sizeof(char)); + strcpy(str2, _str); + char *str = str2; + + unsigned base = Signature::STR_SIG_BASE * (Signature::HASH_NUM - 1); + for (int i = Signature::HASH_NUM - 1; i >= 0; --i) + { + HashFunction hf = Util::hash[i]; + if (hf == NULL) + break; + hashKey = hf(str); + str = str2; + pos = base + hashKey % Signature::STR_SIG_BASE; + base -= Signature::STR_SIG_BASE; + if (_str[0] == '"') + { + pos += Signature::STR_SIG_LENGTH2; + } + else if (_str[0] != '<') + { +#ifdef DEBUG_VSTREE + cerr << "error in encodeStr2Entity(): neighbor is neither a literal or entity!" << endl; +#endif + } + _entity_bs.set(pos); + } + //BETTER: use multiple threads for different hash functions + +#ifdef DEBUG_VSTREE + //std::stringstream _ss; + //_ss << "encodeStr2Entity:" << str2 << endl; + //Util::logging(_ss.str()); +#endif + free(str2); +} + +void +Signature::encodeStrID2Entity(int _str_id, EntityBitSet& _entity_bs) +{ + //TODO +} + +EntitySig::EntitySig() +{ + this->entityBitSet.reset(); +} + +EntitySig::EntitySig(const EntitySig* _p_sig) +{ + this->entityBitSet.reset(); + this->entityBitSet |= _p_sig->entityBitSet; +} + +EntitySig::EntitySig(const EntitySig& _sig) +{ + this->entityBitSet.reset(); + this->entityBitSet |= _sig.entityBitSet; +} + +EntitySig::EntitySig(const EntityBitSet& _bitset) +{ + this->entityBitSet.reset(); + this->entityBitSet |= _bitset; +} + +EntitySig& +EntitySig::operator|=(const EntitySig& _sig) +{ + this->entityBitSet |= _sig.entityBitSet; + return *this; +} + +bool +EntitySig::operator==(const EntitySig& _sig)const +{ + return (this->entityBitSet == _sig.entityBitSet); +} + +bool +EntitySig::operator!=(const EntitySig& _sig)const +{ + return 
(this->entityBitSet != _sig.entityBitSet); +} + +EntitySig& +EntitySig::operator=(const EntitySig& _sig) +{ + this->entityBitSet.reset(); + this->entityBitSet |= _sig.getBitset(); + return *this; +} + +const EntityBitSet& +EntitySig::getBitset()const +{ + return this->entityBitSet; +} + +EdgeSig::EdgeSig() +{ + this->edgeBitSet.reset(); +} + +EdgeSig::EdgeSig(const EdgeSig* _p_sig) +{ + this->edgeBitSet.reset(); + this->edgeBitSet |= _p_sig->edgeBitSet; +} + +EdgeSig::EdgeSig(const EdgeSig& _sig) +{ + this->edgeBitSet.reset(); + this->edgeBitSet |= _sig.edgeBitSet; +} + +EdgeSig::EdgeSig(const EdgeBitSet& _bitset) +{ + this->edgeBitSet.reset(); + this->edgeBitSet |= _bitset; +} + +EdgeSig& +EdgeSig::operator|=(const EdgeSig& _sig) +{ + this->edgeBitSet |= _sig.edgeBitSet; + return *this; +} + +string +EntitySig::to_str() const +{ + std::stringstream _ss; + + _ss << Signature::BitSet2str(this->entityBitSet); + + return _ss.str(); +} \ No newline at end of file diff --git a/test/signature_separate/Signature.h b/test/signature_separate/Signature.h new file mode 100644 index 0000000..835e066 --- /dev/null +++ b/test/signature_separate/Signature.h @@ -0,0 +1,116 @@ +/*============================================================================= +# Filename: Signature.h +# Author: Bookug Lobert +# Mail: zengli-bookug@pku.edu.cn +# Last Modified: 2016-04-11 12:50 +# Description: written by liyouhuan and hanshuo, modified by zengli +=============================================================================*/ + + +#ifndef _SIGNATURE_SIGNATURE_H +#define _SIGNATURE_SIGNATURE_H + +#include "../Util/Util.h" + +class Signature +{ +public: + //static HashFunction hash[HashNum]; + //must make sure: ENTITY_SIG_LENGTH = EDGE_SIG_LENGTH + STR_SIG_LENGTH + //const static int ENTITY_SIG_LENGTH = 400; + static const int STR_SIG_BASE = 100; + //NOTICE: we can also use id here, but string is recommended due to special structure + //(maybe needed later, for example, wildcards) + //Th 
ehash function is costly, so just use two + static const int HASH_NUM = 3; //no more than Util::HashNum + //NOTICE:if using str id, we can also divide like EDGE_SIG + //here we divide as entity neighbors and literal neighbors: ENTITY, LITERAL + static const int STR_SIG_LENGTH = 2 * STR_SIG_BASE * HASH_NUM; //250 + static const int STR_SIG_LENGTH2 = STR_SIG_BASE * HASH_NUM; + + //TODO:how to set the length as a dynamic parameter? below use typedef bitset + //2 * 10 * (10+30) pre num is very small + + //QUERY:I think that str filter is more important in VSTree than predicate, because + //a predicate may correspond to a lot of entities and predicate num is usually small + static const int EDGE_SIG_INTERVAL_NUM_HALF = 5; //in edge or out edge + static const int EDGE_SIG_INTERVAL_NUM = 2 * EDGE_SIG_INTERVAL_NUM_HALF; + static const int EDGE_SIG_INTERVAL_BASE = 20; + static const int EDGE_SIG_LENGTH = EDGE_SIG_INTERVAL_NUM * EDGE_SIG_INTERVAL_BASE; //150 + static const int EDGE_SIG_LENGTH2 = EDGE_SIG_INTERVAL_NUM_HALF * EDGE_SIG_INTERVAL_BASE; //150 + + static const int ENTITY_SIG_LENGTH = STR_SIG_LENGTH + EDGE_SIG_LENGTH; + //static const int ENTITY_SIG_LENGTH = STR_SIG_LENGTH + EDGE_SIG_LENGTH + NEIGHBOR_SIG_LENGTH; + + typedef std::bitset EdgeBitSet; + typedef std::bitset EntityBitSet; + + static std::string BitSet2str(const EntityBitSet& _bitset); + + //NOTICE: there are two predicate encoding method now, see the encoding functions @Signature.cpp for details + const static int PREDICATE_ENCODE_METHOD = 1; + static void encodePredicate2Entity(int _pre_id, EntityBitSet& _entity_bs, const char _type); + static void encodePredicate2Edge(int _pre_id, EdgeBitSet& _edge_bs); + static void encodeStr2Entity(const char* _str, EntityBitSet& _entity_bs); //_str is subject or object(literal) + static void encodeStrID2Entity(int _str_id, EntityBitSet& _entity_bs); + static void encodeEdge2Entity(EntityBitSet& _entity_bs, int _pre_id, int _neighbor_id, const char _type); + 
//Signature() + //{ + //NOTICE:not exceed the HashNum + //this->hash = new HashFunction[HashNum]; + //this->hash[0] = Util::simpleHash; + //this->hash[1] = Util::APHash; + //this->hash[2] = Util::BKDRHash; + //this->hash[3] = Util::DJBHash; + //this->hash[4] = Util::ELFHash; + //this->hash[5] = Util::DEKHash; + //this->hash[6] = Util::BPHash; + //this->hash[7] = Util::FNVHash; + //this->hash[8] = Util::HFLPHash; + //this->hash[9] = Util::HFHash; + //this->hash[10] = Util::JSHash; + //this->hash[11] = Util::PJWHash; + //this->hash[12] = Util::RSHash; + //this->hash[13] = Util::SDBMHash; + //this->hash[14] = Util::StrHash; + //this->hash[15] = Util::TianlHash; + //} + //~Signature() + //{ + //delete[] this->hash; + //} +}; + +//WARN:also defined in Signature, must be same!!! +//NOTICE:EdgeBitSet is only used in Query, not for VSTree +typedef std::bitset EdgeBitSet; +typedef std::bitset EntityBitSet; + +class EntitySig : Signature{ +public: + EntityBitSet entityBitSet; + EntitySig(); + EntitySig(const EntitySig* _p_sig); + EntitySig(const EntitySig& _sig); + EntitySig(const EntityBitSet& _bitset); + EntitySig& operator|=(const EntitySig& _sig); + bool operator==(const EntitySig& _sig)const; + bool operator!=(const EntitySig& _sig)const; + EntitySig& operator=(const EntitySig& _sig); + const EntityBitSet& getBitset()const; + void encode(const char * _str, int _pre_id); + std::string to_str() const; +}; + +class EdgeSig : Signature{ +public: + EdgeBitSet edgeBitSet; + EdgeSig(); + EdgeSig(const EdgeSig* _p_sig); + EdgeSig(const EdgeSig& _sig); + EdgeSig(const EdgeBitSet& _bitset); + EdgeSig& operator|=(const EdgeSig& _sig); +}; + +#endif // _SIGNATURE_SIGNATURE_H + diff --git a/test/sumline.sh b/test/sumline.sh index 9ed7339..d612b57 100644 --- a/test/sumline.sh +++ b/test/sumline.sh @@ -1,3 +1,3 @@ rm Parser/Sparql* -find . -type f -print | grep -E "\.(c(pp)?|h)$" | xargs wc -l +find . 
-type f -print | grep -E "(makefile|Makefile|\.(c(pp)?|h|sh|py|ini|sql|conf))$" | xargs wc -l