From 2f7a7a8b69700f92fdc1eb5ab34f6a51346c3460 Mon Sep 17 00:00:00 2001 From: bookug Date: Wed, 29 Mar 2017 13:48:39 +0800 Subject: [PATCH] refactor: move id_tuples to disk in build to run large datasets like freebase by zengli, no changes to others --- .gitignore | 3 + Database/Database.cpp | 197 ++++++++++++++++++++++++++++++++---------- Database/Database.h | 25 ++++-- KVstore/KVstore.cpp | 66 ++++++++------ KVstore/KVstore.h | 6 +- NOTES.md | 23 ++--- Util/Util.cpp | 82 ++++++++++++++++++ Util/Util.h | 12 +++ package.json | 12 +++ 9 files changed, 323 insertions(+), 103 deletions(-) create mode 100644 package.json diff --git a/.gitignore b/.gitignore index ceb77fe..70c39fd 100644 --- a/.gitignore +++ b/.gitignore @@ -91,3 +91,6 @@ tags *.out *.bak~ +# modules +node_modules + diff --git a/Database/Database.cpp b/Database/Database.cpp index 0ac5e41..ade436d 100644 --- a/Database/Database.cpp +++ b/Database/Database.cpp @@ -18,6 +18,7 @@ Database::Database() this->signature_binary_file = "signature.binary"; this->six_tuples_file = "six_tuples"; this->db_info_file = "db_info_file.dat"; + this->id_tuples_file = "id_tuples"; string kv_store_path = store_path + "/kv_store"; this->kvstore = new KVstore(kv_store_path); @@ -58,6 +59,7 @@ Database::Database(string _name) this->signature_binary_file = "signature.binary"; this->six_tuples_file = "six_tuples"; this->db_info_file = "db_info_file.dat"; + this->id_tuples_file = "id_tuples"; string kv_store_path = store_path + "/kv_store"; this->kvstore = new KVstore(kv_store_path); @@ -804,6 +806,8 @@ Database::build(const string& _rdf_file) string _entry_file = this->getSignatureBFile(); cout << "begin build VS-Tree on " << ret << "..." << endl; + //TODO: we can use larger buffer for vstree in building process, because it does not compete with others + //we only need to build vstree in this phase(no need for id tuples anymore) (this->vstree)->buildTree(_entry_file); long tv_build_end = Util::get_cur_time(); @@ -832,20 +836,26 @@ Database::getSixTuplesFile() return this->getStorePath() + "/" + this->six_tuples_file; } -/* root Path of this DB + signatureBFile */ +//root Path of this DB + signatureBFile string Database::getSignatureBFile() { return this->getStorePath() + "/" + this->signature_binary_file; } -/* root Path of this DB + DBInfoFile */ +//root Path of this DB + DBInfoFile string Database::getDBInfoFile() { return this->getStorePath() + "/" + this->db_info_file; } +string +Database::getIDTuplesFile() +{ + return this->getStorePath() + "/" + this->id_tuples_file; +} + bool Database::saveDBInfoFile() { @@ -1059,7 +1069,9 @@ Database::encodeRDF_new(const string _rdf_file) Util::logging("In encodeRDF_new"); //cout<< "end log!!!" << endl; #endif - TYPE_ENTITY_LITERAL_ID** _p_id_tuples = NULL; + + //TYPE_ENTITY_LITERAL_ID** _p_id_tuples = NULL; + ID_TUPLE* _p_id_tuples = NULL; TYPE_TRIPLE_NUM _id_tuples_max = 0; long t1 = Util::get_cur_time(); @@ -1071,7 +1083,7 @@ Database::encodeRDF_new(const string _rdf_file) //(one way is to add a more structure to tell us which is entity, but this is costly) //map sub2id, pre2id, entity/literal in obj2id, store in kvstore, encode RDF data into signature - if (!this->sub2id_pre2id_obj2id_RDFintoSignature(_rdf_file, _p_id_tuples, _id_tuples_max)) + if (!this->sub2id_pre2id_obj2id_RDFintoSignature(_rdf_file)) { return false; } @@ -1094,6 +1106,9 @@ Database::encodeRDF_new(const string _rdf_file) this->stringindex->setNum(StringIndexFile::Predicate, this->pre_num); this->stringindex->save(*this->kvstore); + long t3 = Util::get_cur_time(); + cout << "after stringindex, used " << (t3 - t2) << "ms." << endl; + //cout<<"special id: "<kvstore->getIDByEntity("")<kvstore->close_predicate2id(); this->kvstore->close_id2predicate(); + long t4 = Util::get_cur_time(); + cout << "id2string and string2id closed, used " << (t4 - t3) << "ms." << endl; + + //after closing the 6 trees, read the id tuples again, and remove the file given num, a dimension,return a pointer + //NOTICE: the file can also be used for debugging, and a program can start just from the id tuples file + //(if copy the 6 id2string trees, no need to parse each time) + this->readIDTuples(_p_id_tuples); + + long t5 = Util::get_cur_time(); + cout << "id tuples read, used " << (t5 - t4) << "ms." << endl; + + //TODO: how to set the buffer of trees is a big question, fully utilize the availiable memory + //this->kvstore->build_subID2values(_p_id_tuples, this->triples_num); this->build_s2xx(_p_id_tuples); - long t3 = Util::get_cur_time(); - cout << "after s2xx, used " << (t3 - t2) << "ms." << endl; + + long t6 = Util::get_cur_time(); + cout << "after s2xx, used " << (t6 - t5) << "ms." << endl; //this->kvstore->build_objID2values(_p_id_tuples, this->triples_num); this->build_o2xx(_p_id_tuples); - long t4 = Util::get_cur_time(); - cout << "after o2xx, used " << (t4 - t3) << "ms." << endl; + + long t7 = Util::get_cur_time(); + cout << "after o2xx, used " << (t7 - t6) << "ms." << endl; //this->kvstore->build_preID2values(_p_id_tuples, this->triples_num); this->build_p2xx(_p_id_tuples); - long t5 = Util::get_cur_time(); - cout << "after p2xx, used " << (t5 - t4) << "ms." << endl; + + long t8 = Util::get_cur_time(); + cout << "after p2xx, used " << (t8 - t7) << "ms." << endl; //WARN:we must free the memory for id_tuples array - for (TYPE_TRIPLE_NUM i = 0; i < this->triples_num; ++i) - { - delete[] _p_id_tuples[i]; - } delete[] _p_id_tuples; + //for (TYPE_TRIPLE_NUM i = 0; i < this->triples_num; ++i) + //{ + //delete[] _p_id_tuples[i]; + //} + //delete[] _p_id_tuples; bool flag = this->saveDBInfoFile(); if (!flag) @@ -1132,15 +1164,47 @@ Database::encodeRDF_new(const string _rdf_file) return false; } - Util::logging("finish encodeRDF_new"); + long t9 = Util::get_cur_time(); + cout << "db info saved, used " << (t9 - t8) << "ms." << endl; + + //Util::logging("finish encodeRDF_new"); return true; } -void -Database::build_s2xx(TYPE_ENTITY_LITERAL_ID** _p_id_tuples) +void +Database::readIDTuples(ID_TUPLE*& _p_id_tuples) { - qsort(_p_id_tuples, this->triples_num, sizeof(int*), Util::_spo_cmp); + _p_id_tuples = NULL; + string fname = this->getIDTuplesFile(); + FILE* fp = fopen(fname.c_str(), "rb"); + if(fp == NULL) + { + cout<<"error in Database::readIDTuples() -- unable to open file "<triples_num * 3; + //_p_id_tuples = new TYPE_ENTITY_LITERAL_ID[total_num]; + _p_id_tuples = new ID_TUPLE[this->triples_num]; + fread(_p_id_tuples, sizeof(ID_TUPLE), this->triples_num, fp); + + fclose(fp); + //NOTICE: choose to empty the file or not + Util::empty_file(fname.c_str()); + + //return NULL; +} + +void +Database::build_s2xx(ID_TUPLE* _p_id_tuples) +{ + //NOTICE: STL sort() is generally fatser than C qsort, especially when qsort is very slow + //STL sort() not only use qsort algorithm, it can also choose heap-sort method + sort(_p_id_tuples, _p_id_tuples + this->triples_num, Util::spo_cmp_idtuple); + //qsort(_p_id_tuples, this->triples_num, sizeof(int*), Util::_spo_cmp); this->kvstore->build_subID2values(_p_id_tuples, this->triples_num); //save all entity_signature into binary file @@ -1164,11 +1228,17 @@ Database::build_s2xx(TYPE_ENTITY_LITERAL_ID** _p_id_tuples) TYPE_ENTITY_LITERAL_ID prev_entity_id = INVALID_ENTITY_LITERAL_ID; //int prev_entity_id = -1; + + //NOTICE: i*3 + j maybe break the unsigned limit + //for (unsigned long i = 0; i < this->triples_num; ++i) for (TYPE_TRIPLE_NUM i = 0; i < this->triples_num; ++i) { - TYPE_ENTITY_LITERAL_ID subid = _p_id_tuples[i][0]; - TYPE_PREDICATE_ID preid = _p_id_tuples[i][1]; - TYPE_ENTITY_LITERAL_ID objid = _p_id_tuples[i][2]; + TYPE_ENTITY_LITERAL_ID subid = _p_id_tuples[i].subid; + TYPE_PREDICATE_ID preid = _p_id_tuples[i].preid; + TYPE_ENTITY_LITERAL_ID objid = _p_id_tuples[i].objid; + //TYPE_ENTITY_LITERAL_ID subid = _p_id_tuples[i*3+0]; + //TYPE_PREDICATE_ID preid = _p_id_tuples[i*3+1]; + //TYPE_ENTITY_LITERAL_ID objid = _p_id_tuples[i*3+2]; if(subid != prev_entity_id) { if(prev_entity_id != INVALID_ENTITY_LITERAL_ID) @@ -1216,9 +1286,10 @@ Database::build_s2xx(TYPE_ENTITY_LITERAL_ID** _p_id_tuples) } void -Database::build_o2xx(TYPE_ENTITY_LITERAL_ID** _p_id_tuples) +Database::build_o2xx(ID_TUPLE* _p_id_tuples) { - qsort(_p_id_tuples, this->triples_num, sizeof(int*), Util::_ops_cmp); + sort(_p_id_tuples, _p_id_tuples + this->triples_num, Util::ops_cmp_idtuple); + //qsort(_p_id_tuples, this->triples_num, sizeof(int*), Util::_ops_cmp); this->kvstore->build_objID2values(_p_id_tuples, this->triples_num); //save all entity_signature into binary file @@ -1236,11 +1307,17 @@ Database::build_o2xx(TYPE_ENTITY_LITERAL_ID** _p_id_tuples) TYPE_ENTITY_LITERAL_ID prev_entity_id = INVALID_ENTITY_LITERAL_ID; //int prev_entity_id = -1; EntityBitSet tmp_bitset; + + //NOTICE: i*3 + j maybe break the unsigned limit + //for (unsigned long i = 0; i < this->triples_num; ++i) for (TYPE_TRIPLE_NUM i = 0; i < this->triples_num; ++i) { - TYPE_ENTITY_LITERAL_ID subid = _p_id_tuples[i][0]; - TYPE_PREDICATE_ID preid = _p_id_tuples[i][1]; - TYPE_ENTITY_LITERAL_ID objid = _p_id_tuples[i][2]; + TYPE_ENTITY_LITERAL_ID subid = _p_id_tuples[i].subid; + TYPE_PREDICATE_ID preid = _p_id_tuples[i].preid; + TYPE_ENTITY_LITERAL_ID objid = _p_id_tuples[i].objid; + //TYPE_ENTITY_LITERAL_ID subid = _p_id_tuples[i*3+0]; + //TYPE_PREDICATE_ID preid = _p_id_tuples[i*3+1]; + //TYPE_ENTITY_LITERAL_ID objid = _p_id_tuples[i*3+2]; if(Util::is_literal_ele(objid)) @@ -1327,9 +1404,10 @@ Database::build_o2xx(TYPE_ENTITY_LITERAL_ID** _p_id_tuples) } void -Database::build_p2xx(TYPE_ENTITY_LITERAL_ID** _p_id_tuples) +Database::build_p2xx(ID_TUPLE* _p_id_tuples) { - qsort(_p_id_tuples, this->triples_num, sizeof(int*), Util::_pso_cmp); + sort(_p_id_tuples, _p_id_tuples + this->triples_num, Util::pso_cmp_idtuple); + //qsort(_p_id_tuples, this->triples_num, sizeof(int*), Util::_pso_cmp); this->kvstore->build_preID2values(_p_id_tuples, this->triples_num); } @@ -1339,14 +1417,33 @@ Database::build_p2xx(TYPE_ENTITY_LITERAL_ID** _p_id_tuples) //CONSIDER: just an estimated value is ok or use vector!!!(but vector also copy when enlarge) //and read file line numbers are also costly! bool -Database::sub2id_pre2id_obj2id_RDFintoSignature(const string _rdf_file, TYPE_ENTITY_LITERAL_ID**& _p_id_tuples, TYPE_TRIPLE_NUM & _id_tuples_max) +Database::sub2id_pre2id_obj2id_RDFintoSignature(const string _rdf_file) { + //NOTICE: if we keep the id_tuples always in memory, i.e. [unsigned*] each unsigned* is [3] + //then for freebase, there is 2.5B triples. the mmeory cost of this array is 25*10^8*3*4 + 25*10^8*8 = 50G + // + //So I choose not to store the id_tuples in memory in this function, but to store them in file and read again after this function + //Notice that the most memory-costly part of building process is this function, setup 6 trees together + //later we can read the id_tuples and stored as [num][3], only cost 25*10^8*3*4 = 30G, and later we only build one tree at a time + + string fname = this->getIDTuplesFile(); + FILE* fp = fopen(fname.c_str(), "wb"); + if(fp == NULL) + { + cout<<"error in Database::sub2id_pre2id_obj2id() -- unable to open file to write "<sub_num = 0; this->pre_num = 0; this->entity_num = 0; @@ -1430,15 +1527,15 @@ Database::sub2id_pre2id_obj2id_RDFintoSignature(const string _rdf_file, TYPE_ENT this->triples_num++; //if the _id_tuples exceeds, double the space - if (_id_tuples_size == _id_tuples_max) - { - TYPE_TRIPLE_NUM _new_tuples_len = _id_tuples_max * 2; - TYPE_ENTITY_LITERAL_ID** _new_id_tuples = new TYPE_ENTITY_LITERAL_ID*[_new_tuples_len]; - memcpy(_new_id_tuples, _p_id_tuples, sizeof(TYPE_ENTITY_LITERAL_ID*) * _id_tuples_max); - delete[] _p_id_tuples; - _p_id_tuples = _new_id_tuples; - _id_tuples_max = _new_tuples_len; - } + //if (_id_tuples_size == _id_tuples_max) + //{ + //TYPE_TRIPLE_NUM _new_tuples_len = _id_tuples_max * 2; + //TYPE_ENTITY_LITERAL_ID** _new_id_tuples = new TYPE_ENTITY_LITERAL_ID*[_new_tuples_len]; + //memcpy(_new_id_tuples, _p_id_tuples, sizeof(TYPE_ENTITY_LITERAL_ID*) * _id_tuples_max); + //delete[] _p_id_tuples; + //_p_id_tuples = _new_id_tuples; + //_id_tuples_max = _new_tuples_len; + //} // For subject // (all subject is entity, some object is entity, the other is literal) @@ -1508,11 +1605,18 @@ Database::sub2id_pre2id_obj2id_RDFintoSignature(const string _rdf_file, TYPE_ENT } // For id_tuples - _p_id_tuples[_id_tuples_size] = new TYPE_ENTITY_LITERAL_ID[3]; - _p_id_tuples[_id_tuples_size][0] = _sub_id; - _p_id_tuples[_id_tuples_size][1] = _pre_id; - _p_id_tuples[_id_tuples_size][2] = _obj_id; - _id_tuples_size++; + //_p_id_tuples[_id_tuples_size] = new TYPE_ENTITY_LITERAL_ID[3]; + //_p_id_tuples[_id_tuples_size][0] = _sub_id; + //_p_id_tuples[_id_tuples_size][1] = _pre_id; + //_p_id_tuples[_id_tuples_size][2] = _obj_id; + //_id_tuples_size++; + tmp_id_tuple.subid = _sub_id; + tmp_id_tuple.preid = _pre_id; + tmp_id_tuple.objid = _obj_id; + fwrite(&tmp_id_tuple, sizeof(ID_TUPLE), 1, fp); + //fwrite(&_sub_id, sizeof(TYPE_ENTITY_LITERAL_ID), 1, fp); + //fwrite(&_pre_id, sizeof(TYPE_ENTITY_LITERAL_ID), 1, fp); + //fwrite(&_obj_id, sizeof(TYPE_ENTITY_LITERAL_ID), 1, fp); #ifdef DEBUG_PRECISE //// save six tuples @@ -1585,6 +1689,7 @@ Database::sub2id_pre2id_obj2id_RDFintoSignature(const string _rdf_file, TYPE_ENT delete[] triple_array; _fin.close(); _six_tuples_fout.close(); + fclose(fp); //for (int i = 0; i < entitybitset_max; i++) diff --git a/Database/Database.h b/Database/Database.h index b09a6f3..195620a 100644 --- a/Database/Database.h +++ b/Database/Database.h @@ -60,17 +60,21 @@ public: bool insert(std::string _rdf_file); bool remove(std::string _rdf_file); - /* name of this DB*/ + //name of this DB string getName(); - /* root Path of this DB + sixTuplesFile */ + + //root Path of this DB + sixTuplesFile string getSixTuplesFile(); - /* root Path of this DB + signatureBFile */ + //root Path of this DB + signatureBFile string getSignatureBFile(); - /* root Path of this DB + DBInfoFile */ + //root Path of this DB + DBInfoFile string getDBInfoFile(); + //id tuples file + string getIDTuplesFile(); + private: string name; string store_path; @@ -95,9 +99,13 @@ private: //six tuples: string six_tuples_file; + //B means binary string signature_binary_file; + //id tuples file + string id_tuples_file; + //pre2num mapping TYPE_TRIPLE_NUM* pre2num; //valid: check from minNumPID to maxNumPID @@ -179,9 +187,10 @@ private: //* 4. build: objID2subIDlist, 2subIDlist objID2list //encodeRDF_new invoke new rdfParser to solve task 1 & 2 in one time scan. bool encodeRDF_new(const string _rdf_file); - void build_s2xx(TYPE_ENTITY_LITERAL_ID**); - void build_o2xx(TYPE_ENTITY_LITERAL_ID**); - void build_p2xx(TYPE_ENTITY_LITERAL_ID**); + void readIDTuples(ID_TUPLE*& _p_id_tuples); + void build_s2xx(ID_TUPLE*); + void build_o2xx(ID_TUPLE*); + void build_p2xx(ID_TUPLE*); //insert and delete, notice that modify is not needed here //we can read from file or use sparql syntax @@ -193,7 +202,7 @@ private: unsigned remove(const TripleWithObjType* _triples, TYPE_TRIPLE_NUM _triple_num); //bool remove(const vector& _triples, vector& _vertices, vector& _predicates); - bool sub2id_pre2id_obj2id_RDFintoSignature(const string _rdf_file, TYPE_ENTITY_LITERAL_ID**& _p_id_tuples, TYPE_TRIPLE_NUM & _id_tuples_max); + bool sub2id_pre2id_obj2id_RDFintoSignature(const string _rdf_file); //bool literal2id_RDFintoSignature(const string _rdf_file, int** _p_id_tuples, TYPE_TRIPLE_NUM _id_tuples_max); bool objIDIsEntityID(TYPE_ENTITY_LITERAL_ID _id); diff --git a/KVstore/KVstore.cpp b/KVstore/KVstore.cpp index cac84f1..3df4dc4 100644 --- a/KVstore/KVstore.cpp +++ b/KVstore/KVstore.cpp @@ -1247,7 +1247,7 @@ bool KVstore::close_subID2values() { } bool -KVstore::build_subID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_NUM _triples_num) +KVstore::build_subID2values(ID_TUPLE* _p_id_tuples, TYPE_TRIPLE_NUM _triples_num) { cout << "Begin building subID2values..." << endl; //qsort(_p_id_tuples, _triples_num, sizeof(int*), Util::_spo_cmp); @@ -1264,20 +1264,26 @@ KVstore::build_subID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_N this->open_subID2values(KVstore::CREATE_MODE); - for (unsigned i = 0; i < _triples_num; i++) { - if (i + 1 == _triples_num || _p_id_tuples[i][0] != _p_id_tuples[i + 1][0] - || _p_id_tuples[i][1] != _p_id_tuples[i + 1][1] || _p_id_tuples[i][2] != _p_id_tuples[i + 1][2]) { - if (_sub_change) { + //NOTICE: i*3 + j maybe break the unsigned limit + //for (unsigned long i = 0; i < _triples_num; i++) + for (TYPE_TRIPLE_NUM i = 0; i < _triples_num; i++) + { + if (i + 1 == _triples_num || _p_id_tuples[i].subid != _p_id_tuples[i+1].subid + || _p_id_tuples[i].preid != _p_id_tuples[i+1].preid || _p_id_tuples[i].objid != _p_id_tuples[i+1].objid) + { + if (_sub_change) + { _pidoffsetlist_s.clear(); _oidlist_s.clear(); _entity_num = 0; } - TYPE_ENTITY_LITERAL_ID _sub_id = _p_id_tuples[i][0]; - TYPE_PREDICATE_ID _pre_id = _p_id_tuples[i][1]; - TYPE_ENTITY_LITERAL_ID _obj_id = _p_id_tuples[i][2]; + TYPE_ENTITY_LITERAL_ID _sub_id = _p_id_tuples[i].subid; + TYPE_PREDICATE_ID _pre_id = _p_id_tuples[i].preid; + TYPE_ENTITY_LITERAL_ID _obj_id = _p_id_tuples[i].objid; - if (_sub_pre_change) { + if (_sub_pre_change) + { _pidoffsetlist_s.push_back(_pre_id); _pidoffsetlist_s.push_back(_oidlist_s.size()); } @@ -1287,8 +1293,8 @@ KVstore::build_subID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_N _entity_num++; } - _sub_change = (i + 1 == _triples_num) || (_p_id_tuples[i][0] != _p_id_tuples[i + 1][0]); - _pre_change = (i + 1 == _triples_num) || (_p_id_tuples[i][1] != _p_id_tuples[i + 1][1]); + _sub_change = (i + 1 == _triples_num) || (_p_id_tuples[i].subid != _p_id_tuples[i+1].subid); + _pre_change = (i + 1 == _triples_num) || (_p_id_tuples[i].preid != _p_id_tuples[i+1].preid); _sub_pre_change = _sub_change || _pre_change; if (_sub_change) { @@ -1478,7 +1484,7 @@ bool KVstore::close_objID2values() { } bool -KVstore::build_objID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_NUM _triples_num) +KVstore::build_objID2values(ID_TUPLE* _p_id_tuples, TYPE_TRIPLE_NUM _triples_num) { cout << "Begin building objID2values..." << endl; //qsort(_p_id_tuples, _triples_num, sizeof(int*), Util::_ops_cmp); @@ -1494,17 +1500,19 @@ KVstore::build_objID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_N this->open_objID2values(KVstore::CREATE_MODE); - for (TYPE_TRIPLE_NUM i = 0; i < _triples_num; i++) { - if (i + 1 == _triples_num || _p_id_tuples[i][2] != _p_id_tuples[i + 1][2] - || _p_id_tuples[i][1] != _p_id_tuples[i + 1][1] || _p_id_tuples[i][0] != _p_id_tuples[i + 1][0]) { + //for (unsigned long i = 0; i < _triples_num; i++) + for (TYPE_TRIPLE_NUM i = 0; i < _triples_num; i++) + { + if (i + 1 == _triples_num || _p_id_tuples[i].subid != _p_id_tuples[i+1].subid + || _p_id_tuples[i].preid != _p_id_tuples[i+1].preid || _p_id_tuples[i].objid != _p_id_tuples[i+1].objid) { if (_obj_change) { _pidoffsetlist_o.clear(); _sidlist_o.clear(); } - TYPE_ENTITY_LITERAL_ID _sub_id = _p_id_tuples[i][0]; - TYPE_PREDICATE_ID _pre_id = _p_id_tuples[i][1]; - TYPE_ENTITY_LITERAL_ID _obj_id = _p_id_tuples[i][2]; + TYPE_ENTITY_LITERAL_ID _sub_id = _p_id_tuples[i].subid; + TYPE_PREDICATE_ID _pre_id = _p_id_tuples[i].preid; + TYPE_ENTITY_LITERAL_ID _obj_id = _p_id_tuples[i].objid; if (_obj_pre_change) { _pidoffsetlist_o.push_back(_pre_id); @@ -1513,8 +1521,8 @@ KVstore::build_objID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_N _sidlist_o.push_back(_sub_id); - _obj_change = (i + 1 == _triples_num) || (_p_id_tuples[i][2] != _p_id_tuples[i + 1][2]); - _pre_change = (i + 1 == _triples_num) || (_p_id_tuples[i][1] != _p_id_tuples[i + 1][1]); + _obj_change = (i + 1 == _triples_num) || (_p_id_tuples[i].objid != _p_id_tuples[i+1].objid); + _pre_change = (i + 1 == _triples_num) || (_p_id_tuples[i].preid != _p_id_tuples[i+1].preid); _obj_pre_change = _obj_change || _pre_change; if (_obj_change) { @@ -1683,7 +1691,7 @@ bool KVstore::close_preID2values() { } bool -KVstore::build_preID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_NUM _triples_num) +KVstore::build_preID2values(ID_TUPLE* _p_id_tuples, TYPE_TRIPLE_NUM _triples_num) { cout << "Begin building preID2values..." << endl; //qsort(_p_id_tuples, _triples_num, sizeof(int*), Util::_pso_cmp); @@ -1695,22 +1703,24 @@ KVstore::build_preID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_N this->open_preID2values(KVstore::CREATE_MODE); - for (TYPE_TRIPLE_NUM i = 0; i < _triples_num; i++) { - if (i + 1 == _triples_num || _p_id_tuples[i][0] != _p_id_tuples[i + 1][0] - || _p_id_tuples[i][1] != _p_id_tuples[i + 1][1] || _p_id_tuples[i][2] != _p_id_tuples[i + 1][2]) { + //for (unsigned long i = 0; i < _triples_num; i++) + for (TYPE_TRIPLE_NUM i = 0; i < _triples_num; i++) + { + if (i + 1 == _triples_num || _p_id_tuples[i].subid != _p_id_tuples[i+1].subid + || _p_id_tuples[i].preid != _p_id_tuples[i+1].preid || _p_id_tuples[i].objid != _p_id_tuples[i+1].objid) { if (_pre_change) { _sidlist_p.clear(); _oidlist_p.clear(); } - TYPE_ENTITY_LITERAL_ID _sub_id = _p_id_tuples[i][0]; - TYPE_PREDICATE_ID _pre_id = _p_id_tuples[i][1]; - TYPE_ENTITY_LITERAL_ID _obj_id = _p_id_tuples[i][2]; + TYPE_ENTITY_LITERAL_ID _sub_id = _p_id_tuples[i].subid; + TYPE_PREDICATE_ID _pre_id = _p_id_tuples[i].preid; + TYPE_ENTITY_LITERAL_ID _obj_id = _p_id_tuples[i].objid; _sidlist_p.push_back(_sub_id); _oidlist_p.push_back(_obj_id); - _pre_change = (i + 1 == _triples_num) || (_p_id_tuples[i][1] != _p_id_tuples[i + 1][1]); + _pre_change = (i + 1 == _triples_num) || (_p_id_tuples[i].preid != _p_id_tuples[i+1].preid); if (_pre_change) { unsigned* _entrylist_p = new unsigned[1 + _sidlist_p.size() * 2]; diff --git a/KVstore/KVstore.h b/KVstore/KVstore.h index e0e4cd8..a305058 100644 --- a/KVstore/KVstore.h +++ b/KVstore/KVstore.h @@ -107,7 +107,7 @@ public: //for subID2values bool open_subID2values(int _mode); bool close_subID2values(); - bool build_subID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_NUM _triples_num); + bool build_subID2values(ID_TUPLE* _p_id_tuples, TYPE_TRIPLE_NUM _triples_num); bool getpreIDlistBysubID(TYPE_ENTITY_LITERAL_ID _subid, unsigned*& _preidlist, unsigned& _list_len, bool _no_duplicate = false) const; bool getobjIDlistBysubID(TYPE_ENTITY_LITERAL_ID _subid, unsigned*& _objidlist, unsigned& _list_len, bool _no_duplicate = false) const; bool getobjIDlistBysubIDpreID(TYPE_ENTITY_LITERAL_ID _subid, TYPE_PREDICATE_ID _preid, unsigned*& _objidlist, unsigned& _list_len, bool _no_duplicate = false) const; @@ -116,7 +116,7 @@ public: //for objID2values bool open_objID2values(int _mode); bool close_objID2values(); - bool build_objID2values(unsigned** _p_id_tuples, unsigned _triples_num); + bool build_objID2values(ID_TUPLE* _p_id_tuples, TYPE_TRIPLE_NUM _triples_num); bool getpreIDlistByobjID(TYPE_ENTITY_LITERAL_ID _objid, unsigned*& _preidlist, unsigned& _list_len, bool _no_duplicate = false) const; bool getsubIDlistByobjID(TYPE_ENTITY_LITERAL_ID _objid, unsigned*& _subidlist, unsigned& _list_len, bool _no_duplicate = false) const; bool getsubIDlistByobjIDpreID(TYPE_ENTITY_LITERAL_ID _objid, TYPE_PREDICATE_ID _preid, unsigned*& _subidlist, unsigned& _list_len, bool _no_duplicate = false) const; @@ -125,7 +125,7 @@ public: //for preID2values bool open_preID2values(int _mode); bool close_preID2values(); - bool build_preID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_NUM _triples_num); + bool build_preID2values(ID_TUPLE* _p_id_tuples, TYPE_TRIPLE_NUM _triples_num); bool getsubIDlistBypreID(TYPE_PREDICATE_ID _preid, unsigned*& _subidlist, unsigned& _list_len, bool _no_duplicate = false) const; bool getobjIDlistBypreID(TYPE_PREDICATE_ID _preid, unsigned*& _objidlist, unsigned& _list_len, bool _no_duplicate = false) const; bool getsubIDobjIDlistBypreID(TYPE_PREDICATE_ID _preid, unsigned*& _subid_objidlist, unsigned& _list_len, bool _no_duplicate = false) const; diff --git a/NOTES.md b/NOTES.md index 1a699f1..99aa0ae 100644 --- a/NOTES.md +++ b/NOTES.md @@ -7,6 +7,11 @@ 在使用gserver时,不能在数据库没有unload时再用gbuild或其他命令修改数据库,仅限于C/S模式 将IRC聊天放到gstore文档上,freenode #gStore +另外要有桌面应用或者网页应用,以可视化的方式操作数据库,类似virtuoso和neo4j那种 +server 118.89.115.42 gstore-pku.com + +考虑使用hbase,结合云平台 + --- 论文:新的join策略,特殊的子图同态问题,如何选择顺序 @@ -20,24 +25,6 @@ --- -# 推广 - -必须建立一个官方网站,可以展示下团队、demo,需要建立社区/论坛并维护 -另外要有桌面应用或者网页应用,以可视化的方式操作数据库,类似virtuoso和neo4j那种 -server 118.89.115.42 gstore-pku.com - -自己的网站可以用实验室的服务器,gstore网站最好用云服务,图个稳定 -但用实验室主机,备案时是否更麻烦?得以企业为单位,而且解析是否更麻烦? -gstore网站中的demo应用的主体可以放在实验室主机上,至少是gstore数据库应抽离出来,但若实验室主机不开外网,应如何而配置代理? -demo应用全部外链,具体服务放在实验室公开的主机上,通过ip:port连接 - -方正 -微生物所 -社交网络(正在让北师那个学生在做) -DBpeida数据集上SPARQL查询接口 - ---- - # 并行策略- 线程控制模块 不宜使用并行框架,可使用C的pthread,boost的thread库,或者启用C++11,gcc编译器需要高于4.8.1才能完整支持C++11 diff --git a/Util/Util.cpp b/Util/Util.cpp index 612cd0e..1cb29aa 100644 --- a/Util/Util.cpp +++ b/Util/Util.cpp @@ -1537,3 +1537,85 @@ Util::_pso_cmp(const void* _a, const void* _b) return 0; } +bool +Util::spo_cmp_idtuple(const ID_TUPLE& a, const ID_TUPLE& b) +{ + if(a.subid != b.subid) + { + return a.subid < b.subid; + } + + if(a.preid != b.preid) + { + return a.preid < b.preid; + } + + if(a.objid != b.objid) + { + return a.objid < b.objid; + } + + //all are equal, no need to sort this two + return false; +} + +bool +Util::ops_cmp_idtuple(const ID_TUPLE& a, const ID_TUPLE& b) +{ + if(a.objid != b.objid) + { + return a.objid < b.objid; + } + + if(a.preid != b.preid) + { + return a.preid < b.preid; + } + + if(a.subid != b.subid) + { + return a.subid < b.subid; + } + + //all are equal, no need to sort this two + return false; +} + +bool +Util::pso_cmp_idtuple(const ID_TUPLE& a, const ID_TUPLE& b) +{ + if(a.preid != b.preid) + { + return a.preid < b.preid; + } + + if(a.subid != b.subid) + { + return a.subid < b.subid; + } + + if(a.objid != b.objid) + { + return a.objid < b.objid; + } + + //all are equal, no need to sort this two + return false; +} + +void +Util::empty_file(const char* _fname) +{ + FILE * fp; + //NOTICE: if exist, then overwrite and create a empty file + fp = fopen(_fname, "w"); + if(fp == NULL) + { + printf("do empty file %s failed\n", _fname); + } + else + { + fclose(fp); + } +} + diff --git a/Util/Util.h b/Util/Util.h index 6cee72d..9b9800a 100644 --- a/Util/Util.h +++ b/Util/Util.h @@ -175,6 +175,13 @@ static const unsigned INVALID = UINT_MAX; // //NOTICE: if use define, the type is none +typedef struct TYPE_ID_TUPLE +{ + TYPE_ENTITY_LITERAL_ID subid; + TYPE_ENTITY_LITERAL_ID preid; + TYPE_ENTITY_LITERAL_ID objid; +}ID_TUPLE; + /******** all static&universal constants and fucntions ********/ class Util { @@ -247,6 +254,7 @@ public: static std::string getExactPath(const char* path); static std::string getItemsFromDir(std::string path); static void logging(std::string _str); + static void empty_file(const char* _fname); // Below are some useful hash functions for string static unsigned simpleHash(const char *_str); @@ -295,6 +303,10 @@ public: static int _spo_cmp(const void* _a, const void* _b); static int _ops_cmp(const void* _a, const void* _b); static int _pso_cmp(const void* _a, const void* _b); + //sort functions for sort on ID_TUPLE + static bool spo_cmp_idtuple(const ID_TUPLE& a, const ID_TUPLE& b); + static bool ops_cmp_idtuple(const ID_TUPLE& a, const ID_TUPLE& b); + static bool pso_cmp_idtuple(const ID_TUPLE& a, const ID_TUPLE& b); static std::string tmp_path; // this are for debugging diff --git a/package.json b/package.json new file mode 100644 index 0000000..fe9e70c --- /dev/null +++ b/package.json @@ -0,0 +1,12 @@ +{ + "config": { + "ghooks": { + "commit-msg": "validate-commit-msg" + } + }, + + "scripts": { + "changelog-all": "conventional-changelog -p angular -i CHANGELOG.md -w -r 0", + "changelog": "conventional-changelog -p angular -i CHANGELOG.md -w" + } +}