refactor: move id_tuples to disk in build

to run large datasets like freebase

by zengli, no changes to others
This commit is contained in:
bookug 2017-03-29 13:48:39 +08:00
parent 8e74a29f07
commit 2f7a7a8b69
9 changed files with 323 additions and 103 deletions

3
.gitignore vendored
View File

@ -91,3 +91,6 @@ tags
*.out
*.bak~
# modules
node_modules

View File

@ -18,6 +18,7 @@ Database::Database()
this->signature_binary_file = "signature.binary";
this->six_tuples_file = "six_tuples";
this->db_info_file = "db_info_file.dat";
this->id_tuples_file = "id_tuples";
string kv_store_path = store_path + "/kv_store";
this->kvstore = new KVstore(kv_store_path);
@ -58,6 +59,7 @@ Database::Database(string _name)
this->signature_binary_file = "signature.binary";
this->six_tuples_file = "six_tuples";
this->db_info_file = "db_info_file.dat";
this->id_tuples_file = "id_tuples";
string kv_store_path = store_path + "/kv_store";
this->kvstore = new KVstore(kv_store_path);
@ -804,6 +806,8 @@ Database::build(const string& _rdf_file)
string _entry_file = this->getSignatureBFile();
cout << "begin build VS-Tree on " << ret << "..." << endl;
//TODO: we can use larger buffer for vstree in building process, because it does not compete with others
//we only need to build vstree in this phase(no need for id tuples anymore)
(this->vstree)->buildTree(_entry_file);
long tv_build_end = Util::get_cur_time();
@ -832,20 +836,26 @@ Database::getSixTuplesFile()
return this->getStorePath() + "/" + this->six_tuples_file;
}
/* root Path of this DB + signatureBFile */
//root Path of this DB + signatureBFile
string
Database::getSignatureBFile()
{
	//store directory of this DB, then the binary signature file name
	string path = this->getStorePath();
	path += "/";
	path += this->signature_binary_file;
	return path;
}
/* root Path of this DB + DBInfoFile */
//root Path of this DB + DBInfoFile
string
Database::getDBInfoFile()
{
	//store directory of this DB, then the db info file name
	string path = this->getStorePath();
	path += "/";
	path += this->db_info_file;
	return path;
}
//root Path of this DB + IDTuplesFile
string
Database::getIDTuplesFile()
{
	//store directory of this DB, then the id tuples file name
	string path = this->getStorePath();
	path += "/";
	path += this->id_tuples_file;
	return path;
}
bool
Database::saveDBInfoFile()
{
@ -1059,7 +1069,9 @@ Database::encodeRDF_new(const string _rdf_file)
Util::logging("In encodeRDF_new");
//cout<< "end log!!!" << endl;
#endif
TYPE_ENTITY_LITERAL_ID** _p_id_tuples = NULL;
//TYPE_ENTITY_LITERAL_ID** _p_id_tuples = NULL;
ID_TUPLE* _p_id_tuples = NULL;
TYPE_TRIPLE_NUM _id_tuples_max = 0;
long t1 = Util::get_cur_time();
@ -1071,7 +1083,7 @@ Database::encodeRDF_new(const string _rdf_file)
//(one way is to add a more structure to tell us which is entity, but this is costly)
//map sub2id, pre2id, entity/literal in obj2id, store in kvstore, encode RDF data into signature
if (!this->sub2id_pre2id_obj2id_RDFintoSignature(_rdf_file, _p_id_tuples, _id_tuples_max))
if (!this->sub2id_pre2id_obj2id_RDFintoSignature(_rdf_file))
{
return false;
}
@ -1094,6 +1106,9 @@ Database::encodeRDF_new(const string _rdf_file)
this->stringindex->setNum(StringIndexFile::Predicate, this->pre_num);
this->stringindex->save(*this->kvstore);
long t3 = Util::get_cur_time();
cout << "after stringindex, used " << (t3 - t2) << "ms." << endl;
//cout<<"special id: "<<this->kvstore->getIDByEntity("<point7>")<<endl;
//NOTICE:close these trees now to save memory
@ -1104,27 +1119,44 @@ Database::encodeRDF_new(const string _rdf_file)
this->kvstore->close_predicate2id();
this->kvstore->close_id2predicate();
long t4 = Util::get_cur_time();
cout << "id2string and string2id closed, used " << (t4 - t3) << "ms." << endl;
//after closing the 6 trees, read the id tuples again, and remove the file given num, a dimension,return a pointer
//NOTICE: the file can also be used for debugging, and a program can start just from the id tuples file
//(if copy the 6 id2string trees, no need to parse each time)
this->readIDTuples(_p_id_tuples);
long t5 = Util::get_cur_time();
cout << "id tuples read, used " << (t5 - t4) << "ms." << endl;
//TODO: how to set the buffer of the trees is a big question; we should fully utilize the available memory
//this->kvstore->build_subID2values(_p_id_tuples, this->triples_num);
this->build_s2xx(_p_id_tuples);
long t3 = Util::get_cur_time();
cout << "after s2xx, used " << (t3 - t2) << "ms." << endl;
long t6 = Util::get_cur_time();
cout << "after s2xx, used " << (t6 - t5) << "ms." << endl;
//this->kvstore->build_objID2values(_p_id_tuples, this->triples_num);
this->build_o2xx(_p_id_tuples);
long t4 = Util::get_cur_time();
cout << "after o2xx, used " << (t4 - t3) << "ms." << endl;
long t7 = Util::get_cur_time();
cout << "after o2xx, used " << (t7 - t6) << "ms." << endl;
//this->kvstore->build_preID2values(_p_id_tuples, this->triples_num);
this->build_p2xx(_p_id_tuples);
long t5 = Util::get_cur_time();
cout << "after p2xx, used " << (t5 - t4) << "ms." << endl;
long t8 = Util::get_cur_time();
cout << "after p2xx, used " << (t8 - t7) << "ms." << endl;
//WARN:we must free the memory for id_tuples array
for (TYPE_TRIPLE_NUM i = 0; i < this->triples_num; ++i)
{
delete[] _p_id_tuples[i];
}
delete[] _p_id_tuples;
//for (TYPE_TRIPLE_NUM i = 0; i < this->triples_num; ++i)
//{
//delete[] _p_id_tuples[i];
//}
//delete[] _p_id_tuples;
bool flag = this->saveDBInfoFile();
if (!flag)
@ -1132,15 +1164,47 @@ Database::encodeRDF_new(const string _rdf_file)
return false;
}
Util::logging("finish encodeRDF_new");
long t9 = Util::get_cur_time();
cout << "db info saved, used " << (t9 - t8) << "ms." << endl;
//Util::logging("finish encodeRDF_new");
return true;
}
void
Database::build_s2xx(TYPE_ENTITY_LITERAL_ID** _p_id_tuples)
//Load the id tuples file back into a freshly allocated array.
//
//_p_id_tuples: output; set to a new[]-allocated array of this->triples_num
//              ID_TUPLE records on success (the caller owns it and must
//              delete[] it), or to NULL when the file cannot be opened or
//              holds fewer records than expected.
//
//On success the file is truncated afterwards; on failure it is left untouched
//so that it can still be inspected.
void
Database::readIDTuples(ID_TUPLE*& _p_id_tuples)
{
	_p_id_tuples = NULL;
	string fname = this->getIDTuplesFile();
	FILE* fp = fopen(fname.c_str(), "rb");
	if(fp == NULL)
	{
		cout<<"error in Database::readIDTuples() -- unable to open file "<<fname<<endl;
		return;
	}

	//NOTICE: avoid to break the unsigned limit, size_t is used in Linux C
	//size_t means long unsigned int in 64-bit machine
	_p_id_tuples = new ID_TUPLE[this->triples_num];
	size_t num_read = fread(_p_id_tuples, sizeof(ID_TUPLE), this->triples_num, fp);
	fclose(fp);

	//a short read would leave the tail of the array uninitialized, so treat it
	//the same way as a file that could not be opened
	if(num_read != this->triples_num)
	{
		cout<<"error in Database::readIDTuples() -- expected "<<this->triples_num<<" id tuples but read "<<num_read<<" from file "<<fname<<endl;
		delete[] _p_id_tuples;
		_p_id_tuples = NULL;
		return;
	}

	//NOTICE: choose to empty the file or not
	Util::empty_file(fname.c_str());
}
void
Database::build_s2xx(ID_TUPLE* _p_id_tuples)
{
//NOTICE: STL sort() is generally faster than C qsort, especially when qsort is very slow
//STL sort() does not only use the quicksort algorithm; it can also switch to heap sort
sort(_p_id_tuples, _p_id_tuples + this->triples_num, Util::spo_cmp_idtuple);
//qsort(_p_id_tuples, this->triples_num, sizeof(int*), Util::_spo_cmp);
this->kvstore->build_subID2values(_p_id_tuples, this->triples_num);
//save all entity_signature into binary file
@ -1164,11 +1228,17 @@ Database::build_s2xx(TYPE_ENTITY_LITERAL_ID** _p_id_tuples)
TYPE_ENTITY_LITERAL_ID prev_entity_id = INVALID_ENTITY_LITERAL_ID;
//int prev_entity_id = -1;
//NOTICE: i*3 + j maybe break the unsigned limit
//for (unsigned long i = 0; i < this->triples_num; ++i)
for (TYPE_TRIPLE_NUM i = 0; i < this->triples_num; ++i)
{
TYPE_ENTITY_LITERAL_ID subid = _p_id_tuples[i][0];
TYPE_PREDICATE_ID preid = _p_id_tuples[i][1];
TYPE_ENTITY_LITERAL_ID objid = _p_id_tuples[i][2];
TYPE_ENTITY_LITERAL_ID subid = _p_id_tuples[i].subid;
TYPE_PREDICATE_ID preid = _p_id_tuples[i].preid;
TYPE_ENTITY_LITERAL_ID objid = _p_id_tuples[i].objid;
//TYPE_ENTITY_LITERAL_ID subid = _p_id_tuples[i*3+0];
//TYPE_PREDICATE_ID preid = _p_id_tuples[i*3+1];
//TYPE_ENTITY_LITERAL_ID objid = _p_id_tuples[i*3+2];
if(subid != prev_entity_id)
{
if(prev_entity_id != INVALID_ENTITY_LITERAL_ID)
@ -1216,9 +1286,10 @@ Database::build_s2xx(TYPE_ENTITY_LITERAL_ID** _p_id_tuples)
}
void
Database::build_o2xx(TYPE_ENTITY_LITERAL_ID** _p_id_tuples)
Database::build_o2xx(ID_TUPLE* _p_id_tuples)
{
qsort(_p_id_tuples, this->triples_num, sizeof(int*), Util::_ops_cmp);
sort(_p_id_tuples, _p_id_tuples + this->triples_num, Util::ops_cmp_idtuple);
//qsort(_p_id_tuples, this->triples_num, sizeof(int*), Util::_ops_cmp);
this->kvstore->build_objID2values(_p_id_tuples, this->triples_num);
//save all entity_signature into binary file
@ -1236,11 +1307,17 @@ Database::build_o2xx(TYPE_ENTITY_LITERAL_ID** _p_id_tuples)
TYPE_ENTITY_LITERAL_ID prev_entity_id = INVALID_ENTITY_LITERAL_ID;
//int prev_entity_id = -1;
EntityBitSet tmp_bitset;
//NOTICE: i*3 + j maybe break the unsigned limit
//for (unsigned long i = 0; i < this->triples_num; ++i)
for (TYPE_TRIPLE_NUM i = 0; i < this->triples_num; ++i)
{
TYPE_ENTITY_LITERAL_ID subid = _p_id_tuples[i][0];
TYPE_PREDICATE_ID preid = _p_id_tuples[i][1];
TYPE_ENTITY_LITERAL_ID objid = _p_id_tuples[i][2];
TYPE_ENTITY_LITERAL_ID subid = _p_id_tuples[i].subid;
TYPE_PREDICATE_ID preid = _p_id_tuples[i].preid;
TYPE_ENTITY_LITERAL_ID objid = _p_id_tuples[i].objid;
//TYPE_ENTITY_LITERAL_ID subid = _p_id_tuples[i*3+0];
//TYPE_PREDICATE_ID preid = _p_id_tuples[i*3+1];
//TYPE_ENTITY_LITERAL_ID objid = _p_id_tuples[i*3+2];
if(Util::is_literal_ele(objid))
@ -1327,9 +1404,10 @@ Database::build_o2xx(TYPE_ENTITY_LITERAL_ID** _p_id_tuples)
}
void
Database::build_p2xx(TYPE_ENTITY_LITERAL_ID** _p_id_tuples)
Database::build_p2xx(ID_TUPLE* _p_id_tuples)
{
qsort(_p_id_tuples, this->triples_num, sizeof(int*), Util::_pso_cmp);
sort(_p_id_tuples, _p_id_tuples + this->triples_num, Util::pso_cmp_idtuple);
//qsort(_p_id_tuples, this->triples_num, sizeof(int*), Util::_pso_cmp);
this->kvstore->build_preID2values(_p_id_tuples, this->triples_num);
}
@ -1339,14 +1417,33 @@ Database::build_p2xx(TYPE_ENTITY_LITERAL_ID** _p_id_tuples)
//CONSIDER: just an estimated value is ok or use vector!!!(but vector also copy when enlarge)
//and read file line numbers are also costly!
bool
Database::sub2id_pre2id_obj2id_RDFintoSignature(const string _rdf_file, TYPE_ENTITY_LITERAL_ID**& _p_id_tuples, TYPE_TRIPLE_NUM & _id_tuples_max)
Database::sub2id_pre2id_obj2id_RDFintoSignature(const string _rdf_file)
{
//NOTICE: if we always keep the id_tuples in memory, i.e. as an array of [unsigned*] where each unsigned* points to [3],
//then for freebase, which has 2.5B triples, the memory cost of this array is 25*10^8*3*4 + 25*10^8*8 = 50G
//
//So I choose not to keep the id_tuples in memory in this function, but to store them in a file and read them again after this function
//Notice that the most memory-costly part of the building process is this function, which sets up 6 trees together
//later we can read the id_tuples back and store them as [num][3], which only costs 25*10^8*3*4 = 30G, and afterwards we only build one tree at a time
string fname = this->getIDTuplesFile();
FILE* fp = fopen(fname.c_str(), "wb");
if(fp == NULL)
{
cout<<"error in Database::sub2id_pre2id_obj2id() -- unable to open file to write "<<fname<<endl;
return false;
}
ID_TUPLE tmp_id_tuple;
//NOTICE: avoid to break the unsigned limit, size_t is used in Linux C
//size_t means long unsigned int in 64-bit machine
//fread(_p_id_tuples, sizeof(TYPE_ENTITY_LITERAL_ID), total_num, fp);
TYPE_TRIPLE_NUM _id_tuples_size;
{
//initial
_id_tuples_max = 10 * 1000 * 1000;
_p_id_tuples = new TYPE_ENTITY_LITERAL_ID*[_id_tuples_max];
_id_tuples_size = 0;
//_id_tuples_max = 10 * 1000 * 1000;
//_p_id_tuples = new TYPE_ENTITY_LITERAL_ID*[_id_tuples_max];
//_id_tuples_size = 0;
this->sub_num = 0;
this->pre_num = 0;
this->entity_num = 0;
@ -1430,15 +1527,15 @@ Database::sub2id_pre2id_obj2id_RDFintoSignature(const string _rdf_file, TYPE_ENT
this->triples_num++;
//if the _id_tuples exceeds, double the space
if (_id_tuples_size == _id_tuples_max)
{
TYPE_TRIPLE_NUM _new_tuples_len = _id_tuples_max * 2;
TYPE_ENTITY_LITERAL_ID** _new_id_tuples = new TYPE_ENTITY_LITERAL_ID*[_new_tuples_len];
memcpy(_new_id_tuples, _p_id_tuples, sizeof(TYPE_ENTITY_LITERAL_ID*) * _id_tuples_max);
delete[] _p_id_tuples;
_p_id_tuples = _new_id_tuples;
_id_tuples_max = _new_tuples_len;
}
//if (_id_tuples_size == _id_tuples_max)
//{
//TYPE_TRIPLE_NUM _new_tuples_len = _id_tuples_max * 2;
//TYPE_ENTITY_LITERAL_ID** _new_id_tuples = new TYPE_ENTITY_LITERAL_ID*[_new_tuples_len];
//memcpy(_new_id_tuples, _p_id_tuples, sizeof(TYPE_ENTITY_LITERAL_ID*) * _id_tuples_max);
//delete[] _p_id_tuples;
//_p_id_tuples = _new_id_tuples;
//_id_tuples_max = _new_tuples_len;
//}
// For subject
// (all subject is entity, some object is entity, the other is literal)
@ -1508,11 +1605,18 @@ Database::sub2id_pre2id_obj2id_RDFintoSignature(const string _rdf_file, TYPE_ENT
}
// For id_tuples
_p_id_tuples[_id_tuples_size] = new TYPE_ENTITY_LITERAL_ID[3];
_p_id_tuples[_id_tuples_size][0] = _sub_id;
_p_id_tuples[_id_tuples_size][1] = _pre_id;
_p_id_tuples[_id_tuples_size][2] = _obj_id;
_id_tuples_size++;
//_p_id_tuples[_id_tuples_size] = new TYPE_ENTITY_LITERAL_ID[3];
//_p_id_tuples[_id_tuples_size][0] = _sub_id;
//_p_id_tuples[_id_tuples_size][1] = _pre_id;
//_p_id_tuples[_id_tuples_size][2] = _obj_id;
//_id_tuples_size++;
tmp_id_tuple.subid = _sub_id;
tmp_id_tuple.preid = _pre_id;
tmp_id_tuple.objid = _obj_id;
fwrite(&tmp_id_tuple, sizeof(ID_TUPLE), 1, fp);
//fwrite(&_sub_id, sizeof(TYPE_ENTITY_LITERAL_ID), 1, fp);
//fwrite(&_pre_id, sizeof(TYPE_ENTITY_LITERAL_ID), 1, fp);
//fwrite(&_obj_id, sizeof(TYPE_ENTITY_LITERAL_ID), 1, fp);
#ifdef DEBUG_PRECISE
//// save six tuples
@ -1585,6 +1689,7 @@ Database::sub2id_pre2id_obj2id_RDFintoSignature(const string _rdf_file, TYPE_ENT
delete[] triple_array;
_fin.close();
_six_tuples_fout.close();
fclose(fp);
//for (int i = 0; i < entitybitset_max; i++)

View File

@ -60,17 +60,21 @@ public:
bool insert(std::string _rdf_file);
bool remove(std::string _rdf_file);
/* name of this DB*/
//name of this DB
string getName();
/* root Path of this DB + sixTuplesFile */
//root Path of this DB + sixTuplesFile
string getSixTuplesFile();
/* root Path of this DB + signatureBFile */
//root Path of this DB + signatureBFile
string getSignatureBFile();
/* root Path of this DB + DBInfoFile */
//root Path of this DB + DBInfoFile
string getDBInfoFile();
//id tuples file
string getIDTuplesFile();
private:
string name;
string store_path;
@ -95,9 +99,13 @@ private:
//six tuples: <sub pre obj sid pid oid>
string six_tuples_file;
//B means binary
string signature_binary_file;
//id tuples file
string id_tuples_file;
//pre2num mapping
TYPE_TRIPLE_NUM* pre2num;
//valid: check from minNumPID to maxNumPID
@ -179,9 +187,10 @@ private:
//* 4. build: objID2subIDlist, <objIDpreID>2subIDlist objID2<preIDsubID>list
//encodeRDF_new invoke new rdfParser to solve task 1 & 2 in one time scan.
bool encodeRDF_new(const string _rdf_file);
void build_s2xx(TYPE_ENTITY_LITERAL_ID**);
void build_o2xx(TYPE_ENTITY_LITERAL_ID**);
void build_p2xx(TYPE_ENTITY_LITERAL_ID**);
void readIDTuples(ID_TUPLE*& _p_id_tuples);
void build_s2xx(ID_TUPLE*);
void build_o2xx(ID_TUPLE*);
void build_p2xx(ID_TUPLE*);
//insert and delete, notice that modify is not needed here
//we can read from file or use sparql syntax
@ -193,7 +202,7 @@ private:
unsigned remove(const TripleWithObjType* _triples, TYPE_TRIPLE_NUM _triple_num);
//bool remove(const vector<TripleWithObjType>& _triples, vector<int>& _vertices, vector<int>& _predicates);
bool sub2id_pre2id_obj2id_RDFintoSignature(const string _rdf_file, TYPE_ENTITY_LITERAL_ID**& _p_id_tuples, TYPE_TRIPLE_NUM & _id_tuples_max);
bool sub2id_pre2id_obj2id_RDFintoSignature(const string _rdf_file);
//bool literal2id_RDFintoSignature(const string _rdf_file, int** _p_id_tuples, TYPE_TRIPLE_NUM _id_tuples_max);
bool objIDIsEntityID(TYPE_ENTITY_LITERAL_ID _id);

View File

@ -1247,7 +1247,7 @@ bool KVstore::close_subID2values() {
}
bool
KVstore::build_subID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_NUM _triples_num)
KVstore::build_subID2values(ID_TUPLE* _p_id_tuples, TYPE_TRIPLE_NUM _triples_num)
{
cout << "Begin building subID2values..." << endl;
//qsort(_p_id_tuples, _triples_num, sizeof(int*), Util::_spo_cmp);
@ -1264,20 +1264,26 @@ KVstore::build_subID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_N
this->open_subID2values(KVstore::CREATE_MODE);
for (unsigned i = 0; i < _triples_num; i++) {
if (i + 1 == _triples_num || _p_id_tuples[i][0] != _p_id_tuples[i + 1][0]
|| _p_id_tuples[i][1] != _p_id_tuples[i + 1][1] || _p_id_tuples[i][2] != _p_id_tuples[i + 1][2]) {
if (_sub_change) {
//NOTICE: i*3 + j maybe break the unsigned limit
//for (unsigned long i = 0; i < _triples_num; i++)
for (TYPE_TRIPLE_NUM i = 0; i < _triples_num; i++)
{
if (i + 1 == _triples_num || _p_id_tuples[i].subid != _p_id_tuples[i+1].subid
|| _p_id_tuples[i].preid != _p_id_tuples[i+1].preid || _p_id_tuples[i].objid != _p_id_tuples[i+1].objid)
{
if (_sub_change)
{
_pidoffsetlist_s.clear();
_oidlist_s.clear();
_entity_num = 0;
}
TYPE_ENTITY_LITERAL_ID _sub_id = _p_id_tuples[i][0];
TYPE_PREDICATE_ID _pre_id = _p_id_tuples[i][1];
TYPE_ENTITY_LITERAL_ID _obj_id = _p_id_tuples[i][2];
TYPE_ENTITY_LITERAL_ID _sub_id = _p_id_tuples[i].subid;
TYPE_PREDICATE_ID _pre_id = _p_id_tuples[i].preid;
TYPE_ENTITY_LITERAL_ID _obj_id = _p_id_tuples[i].objid;
if (_sub_pre_change) {
if (_sub_pre_change)
{
_pidoffsetlist_s.push_back(_pre_id);
_pidoffsetlist_s.push_back(_oidlist_s.size());
}
@ -1287,8 +1293,8 @@ KVstore::build_subID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_N
_entity_num++;
}
_sub_change = (i + 1 == _triples_num) || (_p_id_tuples[i][0] != _p_id_tuples[i + 1][0]);
_pre_change = (i + 1 == _triples_num) || (_p_id_tuples[i][1] != _p_id_tuples[i + 1][1]);
_sub_change = (i + 1 == _triples_num) || (_p_id_tuples[i].subid != _p_id_tuples[i+1].subid);
_pre_change = (i + 1 == _triples_num) || (_p_id_tuples[i].preid != _p_id_tuples[i+1].preid);
_sub_pre_change = _sub_change || _pre_change;
if (_sub_change) {
@ -1478,7 +1484,7 @@ bool KVstore::close_objID2values() {
}
bool
KVstore::build_objID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_NUM _triples_num)
KVstore::build_objID2values(ID_TUPLE* _p_id_tuples, TYPE_TRIPLE_NUM _triples_num)
{
cout << "Begin building objID2values..." << endl;
//qsort(_p_id_tuples, _triples_num, sizeof(int*), Util::_ops_cmp);
@ -1494,17 +1500,19 @@ KVstore::build_objID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_N
this->open_objID2values(KVstore::CREATE_MODE);
for (TYPE_TRIPLE_NUM i = 0; i < _triples_num; i++) {
if (i + 1 == _triples_num || _p_id_tuples[i][2] != _p_id_tuples[i + 1][2]
|| _p_id_tuples[i][1] != _p_id_tuples[i + 1][1] || _p_id_tuples[i][0] != _p_id_tuples[i + 1][0]) {
//for (unsigned long i = 0; i < _triples_num; i++)
for (TYPE_TRIPLE_NUM i = 0; i < _triples_num; i++)
{
if (i + 1 == _triples_num || _p_id_tuples[i].subid != _p_id_tuples[i+1].subid
|| _p_id_tuples[i].preid != _p_id_tuples[i+1].preid || _p_id_tuples[i].objid != _p_id_tuples[i+1].objid) {
if (_obj_change) {
_pidoffsetlist_o.clear();
_sidlist_o.clear();
}
TYPE_ENTITY_LITERAL_ID _sub_id = _p_id_tuples[i][0];
TYPE_PREDICATE_ID _pre_id = _p_id_tuples[i][1];
TYPE_ENTITY_LITERAL_ID _obj_id = _p_id_tuples[i][2];
TYPE_ENTITY_LITERAL_ID _sub_id = _p_id_tuples[i].subid;
TYPE_PREDICATE_ID _pre_id = _p_id_tuples[i].preid;
TYPE_ENTITY_LITERAL_ID _obj_id = _p_id_tuples[i].objid;
if (_obj_pre_change) {
_pidoffsetlist_o.push_back(_pre_id);
@ -1513,8 +1521,8 @@ KVstore::build_objID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_N
_sidlist_o.push_back(_sub_id);
_obj_change = (i + 1 == _triples_num) || (_p_id_tuples[i][2] != _p_id_tuples[i + 1][2]);
_pre_change = (i + 1 == _triples_num) || (_p_id_tuples[i][1] != _p_id_tuples[i + 1][1]);
_obj_change = (i + 1 == _triples_num) || (_p_id_tuples[i].objid != _p_id_tuples[i+1].objid);
_pre_change = (i + 1 == _triples_num) || (_p_id_tuples[i].preid != _p_id_tuples[i+1].preid);
_obj_pre_change = _obj_change || _pre_change;
if (_obj_change) {
@ -1683,7 +1691,7 @@ bool KVstore::close_preID2values() {
}
bool
KVstore::build_preID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_NUM _triples_num)
KVstore::build_preID2values(ID_TUPLE* _p_id_tuples, TYPE_TRIPLE_NUM _triples_num)
{
cout << "Begin building preID2values..." << endl;
//qsort(_p_id_tuples, _triples_num, sizeof(int*), Util::_pso_cmp);
@ -1695,22 +1703,24 @@ KVstore::build_preID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_N
this->open_preID2values(KVstore::CREATE_MODE);
for (TYPE_TRIPLE_NUM i = 0; i < _triples_num; i++) {
if (i + 1 == _triples_num || _p_id_tuples[i][0] != _p_id_tuples[i + 1][0]
|| _p_id_tuples[i][1] != _p_id_tuples[i + 1][1] || _p_id_tuples[i][2] != _p_id_tuples[i + 1][2]) {
//for (unsigned long i = 0; i < _triples_num; i++)
for (TYPE_TRIPLE_NUM i = 0; i < _triples_num; i++)
{
if (i + 1 == _triples_num || _p_id_tuples[i].subid != _p_id_tuples[i+1].subid
|| _p_id_tuples[i].preid != _p_id_tuples[i+1].preid || _p_id_tuples[i].objid != _p_id_tuples[i+1].objid) {
if (_pre_change) {
_sidlist_p.clear();
_oidlist_p.clear();
}
TYPE_ENTITY_LITERAL_ID _sub_id = _p_id_tuples[i][0];
TYPE_PREDICATE_ID _pre_id = _p_id_tuples[i][1];
TYPE_ENTITY_LITERAL_ID _obj_id = _p_id_tuples[i][2];
TYPE_ENTITY_LITERAL_ID _sub_id = _p_id_tuples[i].subid;
TYPE_PREDICATE_ID _pre_id = _p_id_tuples[i].preid;
TYPE_ENTITY_LITERAL_ID _obj_id = _p_id_tuples[i].objid;
_sidlist_p.push_back(_sub_id);
_oidlist_p.push_back(_obj_id);
_pre_change = (i + 1 == _triples_num) || (_p_id_tuples[i][1] != _p_id_tuples[i + 1][1]);
_pre_change = (i + 1 == _triples_num) || (_p_id_tuples[i].preid != _p_id_tuples[i+1].preid);
if (_pre_change) {
unsigned* _entrylist_p = new unsigned[1 + _sidlist_p.size() * 2];

View File

@ -107,7 +107,7 @@ public:
//for subID2values
bool open_subID2values(int _mode);
bool close_subID2values();
bool build_subID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_NUM _triples_num);
bool build_subID2values(ID_TUPLE* _p_id_tuples, TYPE_TRIPLE_NUM _triples_num);
bool getpreIDlistBysubID(TYPE_ENTITY_LITERAL_ID _subid, unsigned*& _preidlist, unsigned& _list_len, bool _no_duplicate = false) const;
bool getobjIDlistBysubID(TYPE_ENTITY_LITERAL_ID _subid, unsigned*& _objidlist, unsigned& _list_len, bool _no_duplicate = false) const;
bool getobjIDlistBysubIDpreID(TYPE_ENTITY_LITERAL_ID _subid, TYPE_PREDICATE_ID _preid, unsigned*& _objidlist, unsigned& _list_len, bool _no_duplicate = false) const;
@ -116,7 +116,7 @@ public:
//for objID2values
bool open_objID2values(int _mode);
bool close_objID2values();
bool build_objID2values(unsigned** _p_id_tuples, unsigned _triples_num);
bool build_objID2values(ID_TUPLE* _p_id_tuples, TYPE_TRIPLE_NUM _triples_num);
bool getpreIDlistByobjID(TYPE_ENTITY_LITERAL_ID _objid, unsigned*& _preidlist, unsigned& _list_len, bool _no_duplicate = false) const;
bool getsubIDlistByobjID(TYPE_ENTITY_LITERAL_ID _objid, unsigned*& _subidlist, unsigned& _list_len, bool _no_duplicate = false) const;
bool getsubIDlistByobjIDpreID(TYPE_ENTITY_LITERAL_ID _objid, TYPE_PREDICATE_ID _preid, unsigned*& _subidlist, unsigned& _list_len, bool _no_duplicate = false) const;
@ -125,7 +125,7 @@ public:
//for preID2values
bool open_preID2values(int _mode);
bool close_preID2values();
bool build_preID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_NUM _triples_num);
bool build_preID2values(ID_TUPLE* _p_id_tuples, TYPE_TRIPLE_NUM _triples_num);
bool getsubIDlistBypreID(TYPE_PREDICATE_ID _preid, unsigned*& _subidlist, unsigned& _list_len, bool _no_duplicate = false) const;
bool getobjIDlistBypreID(TYPE_PREDICATE_ID _preid, unsigned*& _objidlist, unsigned& _list_len, bool _no_duplicate = false) const;
bool getsubIDobjIDlistBypreID(TYPE_PREDICATE_ID _preid, unsigned*& _subid_objidlist, unsigned& _list_len, bool _no_duplicate = false) const;

View File

@ -7,6 +7,11 @@
在使用gserver时不能在数据库没有unload时再用gbuild或其他命令修改数据库仅限于C/S模式
将IRC聊天放到gstore文档上freenode #gStore
另外要有桌面应用或者网页应用以可视化的方式操作数据库类似virtuoso和neo4j那种
server 118.89.115.42 gstore-pku.com
考虑使用hbase结合云平台
---
论文新的join策略特殊的子图同态问题如何选择顺序
@ -20,24 +25,6 @@
---
# 推广
必须建立一个官方网站可以展示下团队、demo需要建立社区/论坛并维护
另外要有桌面应用或者网页应用以可视化的方式操作数据库类似virtuoso和neo4j那种
server 118.89.115.42 gstore-pku.com
自己的网站可以用实验室的服务器gstore网站最好用云服务图个稳定
但用实验室主机,备案时是否更麻烦?得以企业为单位,而且解析是否更麻烦?
gstore网站中的demo应用的主体可以放在实验室主机上至少是gstore数据库应抽离出来但若实验室主机不开外网应如何而配置代理
demo应用全部外链具体服务放在实验室公开的主机上通过ip:port连接
方正
微生物所
社交网络(正在让北师那个学生在做)
DBpeida数据集上SPARQL查询接口
---
# 并行策略- 线程控制模块
不宜使用并行框架可使用C的pthreadboost的thread库或者启用C++11gcc编译器需要高于4.8.1才能完整支持C++11

View File

@ -1537,3 +1537,85 @@ Util::_pso_cmp(const void* _a, const void* _b)
return 0;
}
//"less than" for ID_TUPLE ordered by subject, then predicate, then object;
//returns false for tuples whose three fields are all equal
bool
Util::spo_cmp_idtuple(const ID_TUPLE& a, const ID_TUPLE& b)
{
	if(a.subid < b.subid) return true;
	if(b.subid < a.subid) return false;

	//subjects are equal, fall back to the predicate
	if(a.preid < b.preid) return true;
	if(b.preid < a.preid) return false;

	//subjects and predicates are equal, the object decides
	return a.objid < b.objid;
}
//"less than" for ID_TUPLE ordered by object, then predicate, then subject;
//returns false for tuples whose three fields are all equal
bool
Util::ops_cmp_idtuple(const ID_TUPLE& a, const ID_TUPLE& b)
{
	if(a.objid < b.objid) return true;
	if(b.objid < a.objid) return false;

	//objects are equal, fall back to the predicate
	if(a.preid < b.preid) return true;
	if(b.preid < a.preid) return false;

	//objects and predicates are equal, the subject decides
	return a.subid < b.subid;
}
//"less than" for ID_TUPLE ordered by predicate, then subject, then object;
//returns false for tuples whose three fields are all equal
bool
Util::pso_cmp_idtuple(const ID_TUPLE& a, const ID_TUPLE& b)
{
	if(a.preid < b.preid) return true;
	if(b.preid < a.preid) return false;

	//predicates are equal, fall back to the subject
	if(a.subid < b.subid) return true;
	if(b.subid < a.subid) return false;

	//predicates and subjects are equal, the object decides
	return a.objid < b.objid;
}
//Truncate the file named _fname to zero length, creating it if it does not exist.
//A failure to open the file is only reported on stdout.
void
Util::empty_file(const char* _fname)
{
	//NOTICE: mode "w" overwrites an existing file with an empty one
	FILE* handle = fopen(_fname, "w");
	if(handle != NULL)
	{
		fclose(handle);
		return;
	}
	printf("do empty file %s failed\n", _fname);
}

View File

@ -175,6 +175,13 @@ static const unsigned INVALID = UINT_MAX;
//
//NOTICE: if use define, the type is none
//One triple stored as three numeric ids: subject, predicate, object.
//Records of this type are written to and read back from the id tuples file
//as raw sizeof(ID_TUPLE) chunks (fwrite/fread), so changing the field order
//or field types changes the on-disk layout of that file.
typedef struct TYPE_ID_TUPLE
{
//id of the subject
TYPE_ENTITY_LITERAL_ID subid;
//id of the predicate
//NOTE(review): declared as TYPE_ENTITY_LITERAL_ID although readers copy it into TYPE_PREDICATE_ID variables -- confirm both types are compatible
TYPE_ENTITY_LITERAL_ID preid;
//id of the object
TYPE_ENTITY_LITERAL_ID objid;
}ID_TUPLE;
/******** all static&universal constants and functions ********/
class Util
{
@ -247,6 +254,7 @@ public:
static std::string getExactPath(const char* path);
static std::string getItemsFromDir(std::string path);
static void logging(std::string _str);
static void empty_file(const char* _fname);
// Below are some useful hash functions for string
static unsigned simpleHash(const char *_str);
@ -295,6 +303,10 @@ public:
static int _spo_cmp(const void* _a, const void* _b);
static int _ops_cmp(const void* _a, const void* _b);
static int _pso_cmp(const void* _a, const void* _b);
//sort functions for sort on ID_TUPLE
static bool spo_cmp_idtuple(const ID_TUPLE& a, const ID_TUPLE& b);
static bool ops_cmp_idtuple(const ID_TUPLE& a, const ID_TUPLE& b);
static bool pso_cmp_idtuple(const ID_TUPLE& a, const ID_TUPLE& b);
static std::string tmp_path;
// this are for debugging

12
package.json Normal file
View File

@ -0,0 +1,12 @@
{
"config": {
"ghooks": {
"commit-msg": "validate-commit-msg"
}
},
"scripts": {
"changelog-all": "conventional-changelog -p angular -i CHANGELOG.md -w -r 0",
"changelog": "conventional-changelog -p angular -i CHANGELOG.md -w"
}
}