refactor: move id_tuples to disk in build
to run large datasets like freebase by zengli, no changes to others
This commit is contained in:
parent
8e74a29f07
commit
2f7a7a8b69
|
@ -91,3 +91,6 @@ tags
|
|||
*.out
|
||||
*.bak~
|
||||
|
||||
# modules
|
||||
node_modules
|
||||
|
||||
|
|
|
@ -18,6 +18,7 @@ Database::Database()
|
|||
this->signature_binary_file = "signature.binary";
|
||||
this->six_tuples_file = "six_tuples";
|
||||
this->db_info_file = "db_info_file.dat";
|
||||
this->id_tuples_file = "id_tuples";
|
||||
|
||||
string kv_store_path = store_path + "/kv_store";
|
||||
this->kvstore = new KVstore(kv_store_path);
|
||||
|
@ -58,6 +59,7 @@ Database::Database(string _name)
|
|||
this->signature_binary_file = "signature.binary";
|
||||
this->six_tuples_file = "six_tuples";
|
||||
this->db_info_file = "db_info_file.dat";
|
||||
this->id_tuples_file = "id_tuples";
|
||||
|
||||
string kv_store_path = store_path + "/kv_store";
|
||||
this->kvstore = new KVstore(kv_store_path);
|
||||
|
@ -804,6 +806,8 @@ Database::build(const string& _rdf_file)
|
|||
string _entry_file = this->getSignatureBFile();
|
||||
|
||||
cout << "begin build VS-Tree on " << ret << "..." << endl;
|
||||
//TODO: we can use larger buffer for vstree in building process, because it does not compete with others
|
||||
//we only need to build vstree in this phase(no need for id tuples anymore)
|
||||
(this->vstree)->buildTree(_entry_file);
|
||||
|
||||
long tv_build_end = Util::get_cur_time();
|
||||
|
@ -832,20 +836,26 @@ Database::getSixTuplesFile()
|
|||
return this->getStorePath() + "/" + this->six_tuples_file;
|
||||
}
|
||||
|
||||
/* root Path of this DB + signatureBFile */
|
||||
//root Path of this DB + signatureBFile
|
||||
string
Database::getSignatureBFile()
{
	//the signature binary file is located directly under the store path of this DB
	string full_path = this->getStorePath();
	full_path += "/";
	full_path += this->signature_binary_file;
	return full_path;
}
|
||||
|
||||
/* root Path of this DB + DBInfoFile */
|
||||
//root Path of this DB + DBInfoFile
|
||||
string
Database::getDBInfoFile()
{
	//the db info file is located directly under the store path of this DB
	string full_path = this->getStorePath();
	full_path += "/";
	full_path += this->db_info_file;
	return full_path;
}
|
||||
|
||||
//root Path of this DB + id tuples file
string
Database::getIDTuplesFile()
{
	string full_path = this->getStorePath();
	full_path += "/";
	full_path += this->id_tuples_file;
	return full_path;
}
|
||||
|
||||
bool
|
||||
Database::saveDBInfoFile()
|
||||
{
|
||||
|
@ -1059,7 +1069,9 @@ Database::encodeRDF_new(const string _rdf_file)
|
|||
Util::logging("In encodeRDF_new");
|
||||
//cout<< "end log!!!" << endl;
|
||||
#endif
|
||||
TYPE_ENTITY_LITERAL_ID** _p_id_tuples = NULL;
|
||||
|
||||
//TYPE_ENTITY_LITERAL_ID** _p_id_tuples = NULL;
|
||||
ID_TUPLE* _p_id_tuples = NULL;
|
||||
TYPE_TRIPLE_NUM _id_tuples_max = 0;
|
||||
|
||||
long t1 = Util::get_cur_time();
|
||||
|
@ -1071,7 +1083,7 @@ Database::encodeRDF_new(const string _rdf_file)
|
|||
//(one way is to add a more structure to tell us which is entity, but this is costly)
|
||||
|
||||
//map sub2id, pre2id, entity/literal in obj2id, store in kvstore, encode RDF data into signature
|
||||
if (!this->sub2id_pre2id_obj2id_RDFintoSignature(_rdf_file, _p_id_tuples, _id_tuples_max))
|
||||
if (!this->sub2id_pre2id_obj2id_RDFintoSignature(_rdf_file))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
@ -1094,6 +1106,9 @@ Database::encodeRDF_new(const string _rdf_file)
|
|||
this->stringindex->setNum(StringIndexFile::Predicate, this->pre_num);
|
||||
this->stringindex->save(*this->kvstore);
|
||||
|
||||
long t3 = Util::get_cur_time();
|
||||
cout << "after stringindex, used " << (t3 - t2) << "ms." << endl;
|
||||
|
||||
//cout<<"special id: "<<this->kvstore->getIDByEntity("<point7>")<<endl;
|
||||
|
||||
//NOTICE:close these trees now to save memory
|
||||
|
@ -1104,27 +1119,44 @@ Database::encodeRDF_new(const string _rdf_file)
|
|||
this->kvstore->close_predicate2id();
|
||||
this->kvstore->close_id2predicate();
|
||||
|
||||
long t4 = Util::get_cur_time();
|
||||
cout << "id2string and string2id closed, used " << (t4 - t3) << "ms." << endl;
|
||||
|
||||
//after closing the 6 trees, read the id tuples back from the file (which is then emptied); the one-dimensional array of triples_num tuples is returned through the pointer argument
|
||||
//NOTICE: the file can also be used for debugging, and a program can start just from the id tuples file
|
||||
//(if copy the 6 id2string trees, no need to parse each time)
|
||||
this->readIDTuples(_p_id_tuples);
|
||||
|
||||
long t5 = Util::get_cur_time();
|
||||
cout << "id tuples read, used " << (t5 - t4) << "ms." << endl;
|
||||
|
||||
//TODO: how to set the buffer of trees is a big question, fully utilize the available memory
|
||||
|
||||
//this->kvstore->build_subID2values(_p_id_tuples, this->triples_num);
|
||||
this->build_s2xx(_p_id_tuples);
|
||||
long t3 = Util::get_cur_time();
|
||||
cout << "after s2xx, used " << (t3 - t2) << "ms." << endl;
|
||||
|
||||
long t6 = Util::get_cur_time();
|
||||
cout << "after s2xx, used " << (t6 - t5) << "ms." << endl;
|
||||
|
||||
//this->kvstore->build_objID2values(_p_id_tuples, this->triples_num);
|
||||
this->build_o2xx(_p_id_tuples);
|
||||
long t4 = Util::get_cur_time();
|
||||
cout << "after o2xx, used " << (t4 - t3) << "ms." << endl;
|
||||
|
||||
long t7 = Util::get_cur_time();
|
||||
cout << "after o2xx, used " << (t7 - t6) << "ms." << endl;
|
||||
|
||||
//this->kvstore->build_preID2values(_p_id_tuples, this->triples_num);
|
||||
this->build_p2xx(_p_id_tuples);
|
||||
long t5 = Util::get_cur_time();
|
||||
cout << "after p2xx, used " << (t5 - t4) << "ms." << endl;
|
||||
|
||||
long t8 = Util::get_cur_time();
|
||||
cout << "after p2xx, used " << (t8 - t7) << "ms." << endl;
|
||||
|
||||
//WARN:we must free the memory for id_tuples array
|
||||
for (TYPE_TRIPLE_NUM i = 0; i < this->triples_num; ++i)
|
||||
{
|
||||
delete[] _p_id_tuples[i];
|
||||
}
|
||||
delete[] _p_id_tuples;
|
||||
//for (TYPE_TRIPLE_NUM i = 0; i < this->triples_num; ++i)
|
||||
//{
|
||||
//delete[] _p_id_tuples[i];
|
||||
//}
|
||||
//delete[] _p_id_tuples;
|
||||
|
||||
bool flag = this->saveDBInfoFile();
|
||||
if (!flag)
|
||||
|
@ -1132,15 +1164,47 @@ Database::encodeRDF_new(const string _rdf_file)
|
|||
return false;
|
||||
}
|
||||
|
||||
Util::logging("finish encodeRDF_new");
|
||||
long t9 = Util::get_cur_time();
|
||||
cout << "db info saved, used " << (t9 - t8) << "ms." << endl;
|
||||
|
||||
//Util::logging("finish encodeRDF_new");
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
Database::build_s2xx(TYPE_ENTITY_LITERAL_ID** _p_id_tuples)
|
||||
void
|
||||
Database::readIDTuples(ID_TUPLE*& _p_id_tuples)
|
||||
{
|
||||
qsort(_p_id_tuples, this->triples_num, sizeof(int*), Util::_spo_cmp);
|
||||
_p_id_tuples = NULL;
|
||||
string fname = this->getIDTuplesFile();
|
||||
FILE* fp = fopen(fname.c_str(), "rb");
|
||||
if(fp == NULL)
|
||||
{
|
||||
cout<<"error in Database::readIDTuples() -- unable to open file "<<fname<<endl;
|
||||
return;
|
||||
}
|
||||
|
||||
//NOTICE: avoid to break the unsigned limit, size_t is used in Linux C
|
||||
//size_t means long unsigned int in 64-bit machine
|
||||
//unsigned long total_num = this->triples_num * 3;
|
||||
//_p_id_tuples = new TYPE_ENTITY_LITERAL_ID[total_num];
|
||||
_p_id_tuples = new ID_TUPLE[this->triples_num];
|
||||
fread(_p_id_tuples, sizeof(ID_TUPLE), this->triples_num, fp);
|
||||
|
||||
fclose(fp);
|
||||
//NOTICE: choose to empty the file or not
|
||||
Util::empty_file(fname.c_str());
|
||||
|
||||
//return NULL;
|
||||
}
|
||||
|
||||
void
|
||||
Database::build_s2xx(ID_TUPLE* _p_id_tuples)
|
||||
{
|
||||
//NOTICE: STL sort() is generally faster than C qsort, especially when qsort is very slow
|
||||
//STL sort() does not only use the quicksort algorithm, it can also switch to heap-sort
|
||||
sort(_p_id_tuples, _p_id_tuples + this->triples_num, Util::spo_cmp_idtuple);
|
||||
//qsort(_p_id_tuples, this->triples_num, sizeof(int*), Util::_spo_cmp);
|
||||
this->kvstore->build_subID2values(_p_id_tuples, this->triples_num);
|
||||
|
||||
//save all entity_signature into binary file
|
||||
|
@ -1164,11 +1228,17 @@ Database::build_s2xx(TYPE_ENTITY_LITERAL_ID** _p_id_tuples)
|
|||
|
||||
TYPE_ENTITY_LITERAL_ID prev_entity_id = INVALID_ENTITY_LITERAL_ID;
|
||||
//int prev_entity_id = -1;
|
||||
|
||||
//NOTICE: i*3 + j maybe break the unsigned limit
|
||||
//for (unsigned long i = 0; i < this->triples_num; ++i)
|
||||
for (TYPE_TRIPLE_NUM i = 0; i < this->triples_num; ++i)
|
||||
{
|
||||
TYPE_ENTITY_LITERAL_ID subid = _p_id_tuples[i][0];
|
||||
TYPE_PREDICATE_ID preid = _p_id_tuples[i][1];
|
||||
TYPE_ENTITY_LITERAL_ID objid = _p_id_tuples[i][2];
|
||||
TYPE_ENTITY_LITERAL_ID subid = _p_id_tuples[i].subid;
|
||||
TYPE_PREDICATE_ID preid = _p_id_tuples[i].preid;
|
||||
TYPE_ENTITY_LITERAL_ID objid = _p_id_tuples[i].objid;
|
||||
//TYPE_ENTITY_LITERAL_ID subid = _p_id_tuples[i*3+0];
|
||||
//TYPE_PREDICATE_ID preid = _p_id_tuples[i*3+1];
|
||||
//TYPE_ENTITY_LITERAL_ID objid = _p_id_tuples[i*3+2];
|
||||
if(subid != prev_entity_id)
|
||||
{
|
||||
if(prev_entity_id != INVALID_ENTITY_LITERAL_ID)
|
||||
|
@ -1216,9 +1286,10 @@ Database::build_s2xx(TYPE_ENTITY_LITERAL_ID** _p_id_tuples)
|
|||
}
|
||||
|
||||
void
|
||||
Database::build_o2xx(TYPE_ENTITY_LITERAL_ID** _p_id_tuples)
|
||||
Database::build_o2xx(ID_TUPLE* _p_id_tuples)
|
||||
{
|
||||
qsort(_p_id_tuples, this->triples_num, sizeof(int*), Util::_ops_cmp);
|
||||
sort(_p_id_tuples, _p_id_tuples + this->triples_num, Util::ops_cmp_idtuple);
|
||||
//qsort(_p_id_tuples, this->triples_num, sizeof(int*), Util::_ops_cmp);
|
||||
this->kvstore->build_objID2values(_p_id_tuples, this->triples_num);
|
||||
|
||||
//save all entity_signature into binary file
|
||||
|
@ -1236,11 +1307,17 @@ Database::build_o2xx(TYPE_ENTITY_LITERAL_ID** _p_id_tuples)
|
|||
TYPE_ENTITY_LITERAL_ID prev_entity_id = INVALID_ENTITY_LITERAL_ID;
|
||||
//int prev_entity_id = -1;
|
||||
EntityBitSet tmp_bitset;
|
||||
|
||||
//NOTICE: i*3 + j maybe break the unsigned limit
|
||||
//for (unsigned long i = 0; i < this->triples_num; ++i)
|
||||
for (TYPE_TRIPLE_NUM i = 0; i < this->triples_num; ++i)
|
||||
{
|
||||
TYPE_ENTITY_LITERAL_ID subid = _p_id_tuples[i][0];
|
||||
TYPE_PREDICATE_ID preid = _p_id_tuples[i][1];
|
||||
TYPE_ENTITY_LITERAL_ID objid = _p_id_tuples[i][2];
|
||||
TYPE_ENTITY_LITERAL_ID subid = _p_id_tuples[i].subid;
|
||||
TYPE_PREDICATE_ID preid = _p_id_tuples[i].preid;
|
||||
TYPE_ENTITY_LITERAL_ID objid = _p_id_tuples[i].objid;
|
||||
//TYPE_ENTITY_LITERAL_ID subid = _p_id_tuples[i*3+0];
|
||||
//TYPE_PREDICATE_ID preid = _p_id_tuples[i*3+1];
|
||||
//TYPE_ENTITY_LITERAL_ID objid = _p_id_tuples[i*3+2];
|
||||
|
||||
|
||||
if(Util::is_literal_ele(objid))
|
||||
|
@ -1327,9 +1404,10 @@ Database::build_o2xx(TYPE_ENTITY_LITERAL_ID** _p_id_tuples)
|
|||
}
|
||||
|
||||
void
|
||||
Database::build_p2xx(TYPE_ENTITY_LITERAL_ID** _p_id_tuples)
|
||||
Database::build_p2xx(ID_TUPLE* _p_id_tuples)
|
||||
{
|
||||
qsort(_p_id_tuples, this->triples_num, sizeof(int*), Util::_pso_cmp);
|
||||
sort(_p_id_tuples, _p_id_tuples + this->triples_num, Util::pso_cmp_idtuple);
|
||||
//qsort(_p_id_tuples, this->triples_num, sizeof(int*), Util::_pso_cmp);
|
||||
this->kvstore->build_preID2values(_p_id_tuples, this->triples_num);
|
||||
}
|
||||
|
||||
|
@ -1339,14 +1417,33 @@ Database::build_p2xx(TYPE_ENTITY_LITERAL_ID** _p_id_tuples)
|
|||
//CONSIDER: just an estimated value is ok or use vector!!!(but vector also copy when enlarge)
|
||||
//and read file line numbers are also costly!
|
||||
bool
|
||||
Database::sub2id_pre2id_obj2id_RDFintoSignature(const string _rdf_file, TYPE_ENTITY_LITERAL_ID**& _p_id_tuples, TYPE_TRIPLE_NUM & _id_tuples_max)
|
||||
Database::sub2id_pre2id_obj2id_RDFintoSignature(const string _rdf_file)
|
||||
{
|
||||
//NOTICE: if we keep the id_tuples always in memory, i.e. [unsigned*] each unsigned* is [3]
|
||||
//then for freebase, there are 2.5B triples. the memory cost of this array is 25*10^8*3*4 + 25*10^8*8 = 50G
|
||||
//
|
||||
//So I choose not to store the id_tuples in memory in this function, but to store them in file and read again after this function
|
||||
//Notice that the most memory-costly part of building process is this function, setup 6 trees together
|
||||
//later we can read the id_tuples and stored as [num][3], only cost 25*10^8*3*4 = 30G, and later we only build one tree at a time
|
||||
|
||||
string fname = this->getIDTuplesFile();
|
||||
FILE* fp = fopen(fname.c_str(), "wb");
|
||||
if(fp == NULL)
|
||||
{
|
||||
cout<<"error in Database::sub2id_pre2id_obj2id() -- unable to open file to write "<<fname<<endl;
|
||||
return false;
|
||||
}
|
||||
ID_TUPLE tmp_id_tuple;
|
||||
//NOTICE: avoid to break the unsigned limit, size_t is used in Linux C
|
||||
//size_t means long unsigned int in 64-bit machine
|
||||
//fread(_p_id_tuples, sizeof(TYPE_ENTITY_LITERAL_ID), total_num, fp);
|
||||
|
||||
TYPE_TRIPLE_NUM _id_tuples_size;
|
||||
{
|
||||
//initial
|
||||
_id_tuples_max = 10 * 1000 * 1000;
|
||||
_p_id_tuples = new TYPE_ENTITY_LITERAL_ID*[_id_tuples_max];
|
||||
_id_tuples_size = 0;
|
||||
//_id_tuples_max = 10 * 1000 * 1000;
|
||||
//_p_id_tuples = new TYPE_ENTITY_LITERAL_ID*[_id_tuples_max];
|
||||
//_id_tuples_size = 0;
|
||||
this->sub_num = 0;
|
||||
this->pre_num = 0;
|
||||
this->entity_num = 0;
|
||||
|
@ -1430,15 +1527,15 @@ Database::sub2id_pre2id_obj2id_RDFintoSignature(const string _rdf_file, TYPE_ENT
|
|||
this->triples_num++;
|
||||
|
||||
//if the _id_tuples exceeds, double the space
|
||||
if (_id_tuples_size == _id_tuples_max)
|
||||
{
|
||||
TYPE_TRIPLE_NUM _new_tuples_len = _id_tuples_max * 2;
|
||||
TYPE_ENTITY_LITERAL_ID** _new_id_tuples = new TYPE_ENTITY_LITERAL_ID*[_new_tuples_len];
|
||||
memcpy(_new_id_tuples, _p_id_tuples, sizeof(TYPE_ENTITY_LITERAL_ID*) * _id_tuples_max);
|
||||
delete[] _p_id_tuples;
|
||||
_p_id_tuples = _new_id_tuples;
|
||||
_id_tuples_max = _new_tuples_len;
|
||||
}
|
||||
//if (_id_tuples_size == _id_tuples_max)
|
||||
//{
|
||||
//TYPE_TRIPLE_NUM _new_tuples_len = _id_tuples_max * 2;
|
||||
//TYPE_ENTITY_LITERAL_ID** _new_id_tuples = new TYPE_ENTITY_LITERAL_ID*[_new_tuples_len];
|
||||
//memcpy(_new_id_tuples, _p_id_tuples, sizeof(TYPE_ENTITY_LITERAL_ID*) * _id_tuples_max);
|
||||
//delete[] _p_id_tuples;
|
||||
//_p_id_tuples = _new_id_tuples;
|
||||
//_id_tuples_max = _new_tuples_len;
|
||||
//}
|
||||
|
||||
// For subject
|
||||
// (all subject is entity, some object is entity, the other is literal)
|
||||
|
@ -1508,11 +1605,18 @@ Database::sub2id_pre2id_obj2id_RDFintoSignature(const string _rdf_file, TYPE_ENT
|
|||
}
|
||||
|
||||
// For id_tuples
|
||||
_p_id_tuples[_id_tuples_size] = new TYPE_ENTITY_LITERAL_ID[3];
|
||||
_p_id_tuples[_id_tuples_size][0] = _sub_id;
|
||||
_p_id_tuples[_id_tuples_size][1] = _pre_id;
|
||||
_p_id_tuples[_id_tuples_size][2] = _obj_id;
|
||||
_id_tuples_size++;
|
||||
//_p_id_tuples[_id_tuples_size] = new TYPE_ENTITY_LITERAL_ID[3];
|
||||
//_p_id_tuples[_id_tuples_size][0] = _sub_id;
|
||||
//_p_id_tuples[_id_tuples_size][1] = _pre_id;
|
||||
//_p_id_tuples[_id_tuples_size][2] = _obj_id;
|
||||
//_id_tuples_size++;
|
||||
tmp_id_tuple.subid = _sub_id;
|
||||
tmp_id_tuple.preid = _pre_id;
|
||||
tmp_id_tuple.objid = _obj_id;
|
||||
fwrite(&tmp_id_tuple, sizeof(ID_TUPLE), 1, fp);
|
||||
//fwrite(&_sub_id, sizeof(TYPE_ENTITY_LITERAL_ID), 1, fp);
|
||||
//fwrite(&_pre_id, sizeof(TYPE_ENTITY_LITERAL_ID), 1, fp);
|
||||
//fwrite(&_obj_id, sizeof(TYPE_ENTITY_LITERAL_ID), 1, fp);
|
||||
|
||||
#ifdef DEBUG_PRECISE
|
||||
//// save six tuples
|
||||
|
@ -1585,6 +1689,7 @@ Database::sub2id_pre2id_obj2id_RDFintoSignature(const string _rdf_file, TYPE_ENT
|
|||
delete[] triple_array;
|
||||
_fin.close();
|
||||
_six_tuples_fout.close();
|
||||
fclose(fp);
|
||||
|
||||
|
||||
//for (int i = 0; i < entitybitset_max; i++)
|
||||
|
|
|
@ -60,17 +60,21 @@ public:
|
|||
bool insert(std::string _rdf_file);
|
||||
bool remove(std::string _rdf_file);
|
||||
|
||||
/* name of this DB*/
|
||||
//name of this DB
|
||||
string getName();
|
||||
/* root Path of this DB + sixTuplesFile */
|
||||
|
||||
//root Path of this DB + sixTuplesFile
|
||||
string getSixTuplesFile();
|
||||
|
||||
/* root Path of this DB + signatureBFile */
|
||||
//root Path of this DB + signatureBFile
|
||||
string getSignatureBFile();
|
||||
|
||||
/* root Path of this DB + DBInfoFile */
|
||||
//root Path of this DB + DBInfoFile
|
||||
string getDBInfoFile();
|
||||
|
||||
//id tuples file
|
||||
string getIDTuplesFile();
|
||||
|
||||
private:
|
||||
string name;
|
||||
string store_path;
|
||||
|
@ -95,9 +99,13 @@ private:
|
|||
|
||||
//six tuples: <sub pre obj sid pid oid>
|
||||
string six_tuples_file;
|
||||
|
||||
//B means binary
|
||||
string signature_binary_file;
|
||||
|
||||
//id tuples file
|
||||
string id_tuples_file;
|
||||
|
||||
//pre2num mapping
|
||||
TYPE_TRIPLE_NUM* pre2num;
|
||||
//valid: check from minNumPID to maxNumPID
|
||||
|
@ -179,9 +187,10 @@ private:
|
|||
//* 4. build: objID2subIDlist, <objIDpreID>2subIDlist objID2<preIDsubID>list
|
||||
//encodeRDF_new invoke new rdfParser to solve task 1 & 2 in one time scan.
|
||||
bool encodeRDF_new(const string _rdf_file);
|
||||
void build_s2xx(TYPE_ENTITY_LITERAL_ID**);
|
||||
void build_o2xx(TYPE_ENTITY_LITERAL_ID**);
|
||||
void build_p2xx(TYPE_ENTITY_LITERAL_ID**);
|
||||
void readIDTuples(ID_TUPLE*& _p_id_tuples);
|
||||
void build_s2xx(ID_TUPLE*);
|
||||
void build_o2xx(ID_TUPLE*);
|
||||
void build_p2xx(ID_TUPLE*);
|
||||
|
||||
//insert and delete, notice that modify is not needed here
|
||||
//we can read from file or use sparql syntax
|
||||
|
@ -193,7 +202,7 @@ private:
|
|||
unsigned remove(const TripleWithObjType* _triples, TYPE_TRIPLE_NUM _triple_num);
|
||||
//bool remove(const vector<TripleWithObjType>& _triples, vector<int>& _vertices, vector<int>& _predicates);
|
||||
|
||||
bool sub2id_pre2id_obj2id_RDFintoSignature(const string _rdf_file, TYPE_ENTITY_LITERAL_ID**& _p_id_tuples, TYPE_TRIPLE_NUM & _id_tuples_max);
|
||||
bool sub2id_pre2id_obj2id_RDFintoSignature(const string _rdf_file);
|
||||
//bool literal2id_RDFintoSignature(const string _rdf_file, int** _p_id_tuples, TYPE_TRIPLE_NUM _id_tuples_max);
|
||||
|
||||
bool objIDIsEntityID(TYPE_ENTITY_LITERAL_ID _id);
|
||||
|
|
|
@ -1247,7 +1247,7 @@ bool KVstore::close_subID2values() {
|
|||
}
|
||||
|
||||
bool
|
||||
KVstore::build_subID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_NUM _triples_num)
|
||||
KVstore::build_subID2values(ID_TUPLE* _p_id_tuples, TYPE_TRIPLE_NUM _triples_num)
|
||||
{
|
||||
cout << "Begin building subID2values..." << endl;
|
||||
//qsort(_p_id_tuples, _triples_num, sizeof(int*), Util::_spo_cmp);
|
||||
|
@ -1264,20 +1264,26 @@ KVstore::build_subID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_N
|
|||
|
||||
this->open_subID2values(KVstore::CREATE_MODE);
|
||||
|
||||
for (unsigned i = 0; i < _triples_num; i++) {
|
||||
if (i + 1 == _triples_num || _p_id_tuples[i][0] != _p_id_tuples[i + 1][0]
|
||||
|| _p_id_tuples[i][1] != _p_id_tuples[i + 1][1] || _p_id_tuples[i][2] != _p_id_tuples[i + 1][2]) {
|
||||
if (_sub_change) {
|
||||
//NOTICE: i*3 + j maybe break the unsigned limit
|
||||
//for (unsigned long i = 0; i < _triples_num; i++)
|
||||
for (TYPE_TRIPLE_NUM i = 0; i < _triples_num; i++)
|
||||
{
|
||||
if (i + 1 == _triples_num || _p_id_tuples[i].subid != _p_id_tuples[i+1].subid
|
||||
|| _p_id_tuples[i].preid != _p_id_tuples[i+1].preid || _p_id_tuples[i].objid != _p_id_tuples[i+1].objid)
|
||||
{
|
||||
if (_sub_change)
|
||||
{
|
||||
_pidoffsetlist_s.clear();
|
||||
_oidlist_s.clear();
|
||||
_entity_num = 0;
|
||||
}
|
||||
|
||||
TYPE_ENTITY_LITERAL_ID _sub_id = _p_id_tuples[i][0];
|
||||
TYPE_PREDICATE_ID _pre_id = _p_id_tuples[i][1];
|
||||
TYPE_ENTITY_LITERAL_ID _obj_id = _p_id_tuples[i][2];
|
||||
TYPE_ENTITY_LITERAL_ID _sub_id = _p_id_tuples[i].subid;
|
||||
TYPE_PREDICATE_ID _pre_id = _p_id_tuples[i].preid;
|
||||
TYPE_ENTITY_LITERAL_ID _obj_id = _p_id_tuples[i].objid;
|
||||
|
||||
if (_sub_pre_change) {
|
||||
if (_sub_pre_change)
|
||||
{
|
||||
_pidoffsetlist_s.push_back(_pre_id);
|
||||
_pidoffsetlist_s.push_back(_oidlist_s.size());
|
||||
}
|
||||
|
@ -1287,8 +1293,8 @@ KVstore::build_subID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_N
|
|||
_entity_num++;
|
||||
}
|
||||
|
||||
_sub_change = (i + 1 == _triples_num) || (_p_id_tuples[i][0] != _p_id_tuples[i + 1][0]);
|
||||
_pre_change = (i + 1 == _triples_num) || (_p_id_tuples[i][1] != _p_id_tuples[i + 1][1]);
|
||||
_sub_change = (i + 1 == _triples_num) || (_p_id_tuples[i].subid != _p_id_tuples[i+1].subid);
|
||||
_pre_change = (i + 1 == _triples_num) || (_p_id_tuples[i].preid != _p_id_tuples[i+1].preid);
|
||||
_sub_pre_change = _sub_change || _pre_change;
|
||||
|
||||
if (_sub_change) {
|
||||
|
@ -1478,7 +1484,7 @@ bool KVstore::close_objID2values() {
|
|||
}
|
||||
|
||||
bool
|
||||
KVstore::build_objID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_NUM _triples_num)
|
||||
KVstore::build_objID2values(ID_TUPLE* _p_id_tuples, TYPE_TRIPLE_NUM _triples_num)
|
||||
{
|
||||
cout << "Begin building objID2values..." << endl;
|
||||
//qsort(_p_id_tuples, _triples_num, sizeof(int*), Util::_ops_cmp);
|
||||
|
@ -1494,17 +1500,19 @@ KVstore::build_objID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_N
|
|||
|
||||
this->open_objID2values(KVstore::CREATE_MODE);
|
||||
|
||||
for (TYPE_TRIPLE_NUM i = 0; i < _triples_num; i++) {
|
||||
if (i + 1 == _triples_num || _p_id_tuples[i][2] != _p_id_tuples[i + 1][2]
|
||||
|| _p_id_tuples[i][1] != _p_id_tuples[i + 1][1] || _p_id_tuples[i][0] != _p_id_tuples[i + 1][0]) {
|
||||
//for (unsigned long i = 0; i < _triples_num; i++)
|
||||
for (TYPE_TRIPLE_NUM i = 0; i < _triples_num; i++)
|
||||
{
|
||||
if (i + 1 == _triples_num || _p_id_tuples[i].subid != _p_id_tuples[i+1].subid
|
||||
|| _p_id_tuples[i].preid != _p_id_tuples[i+1].preid || _p_id_tuples[i].objid != _p_id_tuples[i+1].objid) {
|
||||
if (_obj_change) {
|
||||
_pidoffsetlist_o.clear();
|
||||
_sidlist_o.clear();
|
||||
}
|
||||
|
||||
TYPE_ENTITY_LITERAL_ID _sub_id = _p_id_tuples[i][0];
|
||||
TYPE_PREDICATE_ID _pre_id = _p_id_tuples[i][1];
|
||||
TYPE_ENTITY_LITERAL_ID _obj_id = _p_id_tuples[i][2];
|
||||
TYPE_ENTITY_LITERAL_ID _sub_id = _p_id_tuples[i].subid;
|
||||
TYPE_PREDICATE_ID _pre_id = _p_id_tuples[i].preid;
|
||||
TYPE_ENTITY_LITERAL_ID _obj_id = _p_id_tuples[i].objid;
|
||||
|
||||
if (_obj_pre_change) {
|
||||
_pidoffsetlist_o.push_back(_pre_id);
|
||||
|
@ -1513,8 +1521,8 @@ KVstore::build_objID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_N
|
|||
|
||||
_sidlist_o.push_back(_sub_id);
|
||||
|
||||
_obj_change = (i + 1 == _triples_num) || (_p_id_tuples[i][2] != _p_id_tuples[i + 1][2]);
|
||||
_pre_change = (i + 1 == _triples_num) || (_p_id_tuples[i][1] != _p_id_tuples[i + 1][1]);
|
||||
_obj_change = (i + 1 == _triples_num) || (_p_id_tuples[i].objid != _p_id_tuples[i+1].objid);
|
||||
_pre_change = (i + 1 == _triples_num) || (_p_id_tuples[i].preid != _p_id_tuples[i+1].preid);
|
||||
_obj_pre_change = _obj_change || _pre_change;
|
||||
|
||||
if (_obj_change) {
|
||||
|
@ -1683,7 +1691,7 @@ bool KVstore::close_preID2values() {
|
|||
}
|
||||
|
||||
bool
|
||||
KVstore::build_preID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_NUM _triples_num)
|
||||
KVstore::build_preID2values(ID_TUPLE* _p_id_tuples, TYPE_TRIPLE_NUM _triples_num)
|
||||
{
|
||||
cout << "Begin building preID2values..." << endl;
|
||||
//qsort(_p_id_tuples, _triples_num, sizeof(int*), Util::_pso_cmp);
|
||||
|
@ -1695,22 +1703,24 @@ KVstore::build_preID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_N
|
|||
|
||||
this->open_preID2values(KVstore::CREATE_MODE);
|
||||
|
||||
for (TYPE_TRIPLE_NUM i = 0; i < _triples_num; i++) {
|
||||
if (i + 1 == _triples_num || _p_id_tuples[i][0] != _p_id_tuples[i + 1][0]
|
||||
|| _p_id_tuples[i][1] != _p_id_tuples[i + 1][1] || _p_id_tuples[i][2] != _p_id_tuples[i + 1][2]) {
|
||||
//for (unsigned long i = 0; i < _triples_num; i++)
|
||||
for (TYPE_TRIPLE_NUM i = 0; i < _triples_num; i++)
|
||||
{
|
||||
if (i + 1 == _triples_num || _p_id_tuples[i].subid != _p_id_tuples[i+1].subid
|
||||
|| _p_id_tuples[i].preid != _p_id_tuples[i+1].preid || _p_id_tuples[i].objid != _p_id_tuples[i+1].objid) {
|
||||
if (_pre_change) {
|
||||
_sidlist_p.clear();
|
||||
_oidlist_p.clear();
|
||||
}
|
||||
|
||||
TYPE_ENTITY_LITERAL_ID _sub_id = _p_id_tuples[i][0];
|
||||
TYPE_PREDICATE_ID _pre_id = _p_id_tuples[i][1];
|
||||
TYPE_ENTITY_LITERAL_ID _obj_id = _p_id_tuples[i][2];
|
||||
TYPE_ENTITY_LITERAL_ID _sub_id = _p_id_tuples[i].subid;
|
||||
TYPE_PREDICATE_ID _pre_id = _p_id_tuples[i].preid;
|
||||
TYPE_ENTITY_LITERAL_ID _obj_id = _p_id_tuples[i].objid;
|
||||
|
||||
_sidlist_p.push_back(_sub_id);
|
||||
_oidlist_p.push_back(_obj_id);
|
||||
|
||||
_pre_change = (i + 1 == _triples_num) || (_p_id_tuples[i][1] != _p_id_tuples[i + 1][1]);
|
||||
_pre_change = (i + 1 == _triples_num) || (_p_id_tuples[i].preid != _p_id_tuples[i+1].preid);
|
||||
|
||||
if (_pre_change) {
|
||||
unsigned* _entrylist_p = new unsigned[1 + _sidlist_p.size() * 2];
|
||||
|
|
|
@ -107,7 +107,7 @@ public:
|
|||
//for subID2values
|
||||
bool open_subID2values(int _mode);
|
||||
bool close_subID2values();
|
||||
bool build_subID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_NUM _triples_num);
|
||||
bool build_subID2values(ID_TUPLE* _p_id_tuples, TYPE_TRIPLE_NUM _triples_num);
|
||||
bool getpreIDlistBysubID(TYPE_ENTITY_LITERAL_ID _subid, unsigned*& _preidlist, unsigned& _list_len, bool _no_duplicate = false) const;
|
||||
bool getobjIDlistBysubID(TYPE_ENTITY_LITERAL_ID _subid, unsigned*& _objidlist, unsigned& _list_len, bool _no_duplicate = false) const;
|
||||
bool getobjIDlistBysubIDpreID(TYPE_ENTITY_LITERAL_ID _subid, TYPE_PREDICATE_ID _preid, unsigned*& _objidlist, unsigned& _list_len, bool _no_duplicate = false) const;
|
||||
|
@ -116,7 +116,7 @@ public:
|
|||
//for objID2values
|
||||
bool open_objID2values(int _mode);
|
||||
bool close_objID2values();
|
||||
bool build_objID2values(unsigned** _p_id_tuples, unsigned _triples_num);
|
||||
bool build_objID2values(ID_TUPLE* _p_id_tuples, TYPE_TRIPLE_NUM _triples_num);
|
||||
bool getpreIDlistByobjID(TYPE_ENTITY_LITERAL_ID _objid, unsigned*& _preidlist, unsigned& _list_len, bool _no_duplicate = false) const;
|
||||
bool getsubIDlistByobjID(TYPE_ENTITY_LITERAL_ID _objid, unsigned*& _subidlist, unsigned& _list_len, bool _no_duplicate = false) const;
|
||||
bool getsubIDlistByobjIDpreID(TYPE_ENTITY_LITERAL_ID _objid, TYPE_PREDICATE_ID _preid, unsigned*& _subidlist, unsigned& _list_len, bool _no_duplicate = false) const;
|
||||
|
@ -125,7 +125,7 @@ public:
|
|||
//for preID2values
|
||||
bool open_preID2values(int _mode);
|
||||
bool close_preID2values();
|
||||
bool build_preID2values(TYPE_ENTITY_LITERAL_ID** _p_id_tuples, TYPE_TRIPLE_NUM _triples_num);
|
||||
bool build_preID2values(ID_TUPLE* _p_id_tuples, TYPE_TRIPLE_NUM _triples_num);
|
||||
bool getsubIDlistBypreID(TYPE_PREDICATE_ID _preid, unsigned*& _subidlist, unsigned& _list_len, bool _no_duplicate = false) const;
|
||||
bool getobjIDlistBypreID(TYPE_PREDICATE_ID _preid, unsigned*& _objidlist, unsigned& _list_len, bool _no_duplicate = false) const;
|
||||
bool getsubIDobjIDlistBypreID(TYPE_PREDICATE_ID _preid, unsigned*& _subid_objidlist, unsigned& _list_len, bool _no_duplicate = false) const;
|
||||
|
|
23
NOTES.md
23
NOTES.md
|
@ -7,6 +7,11 @@
|
|||
在使用gserver时,不能在数据库没有unload时再用gbuild或其他命令修改数据库,仅限于C/S模式
|
||||
将IRC聊天放到gstore文档上,freenode #gStore
|
||||
|
||||
另外要有桌面应用或者网页应用,以可视化的方式操作数据库,类似virtuoso和neo4j那种
|
||||
server 118.89.115.42 gstore-pku.com
|
||||
|
||||
考虑使用hbase,结合云平台
|
||||
|
||||
---
|
||||
|
||||
论文:新的join策略,特殊的子图同态问题,如何选择顺序
|
||||
|
@ -20,24 +25,6 @@
|
|||
|
||||
---
|
||||
|
||||
# 推广
|
||||
|
||||
必须建立一个官方网站,可以展示下团队、demo,需要建立社区/论坛并维护
|
||||
另外要有桌面应用或者网页应用,以可视化的方式操作数据库,类似virtuoso和neo4j那种
|
||||
server 118.89.115.42 gstore-pku.com
|
||||
|
||||
自己的网站可以用实验室的服务器,gstore网站最好用云服务,图个稳定
|
||||
但用实验室主机,备案时是否更麻烦?得以企业为单位,而且解析是否更麻烦?
|
||||
gstore网站中的demo应用的主体可以放在实验室主机上,至少是gstore数据库应抽离出来,但若实验室主机不开外网,应如何而配置代理?
|
||||
demo应用全部外链,具体服务放在实验室公开的主机上,通过ip:port连接
|
||||
|
||||
方正
|
||||
微生物所
|
||||
社交网络(正在让北师那个学生在做)
|
||||
DBpeida数据集上SPARQL查询接口
|
||||
|
||||
---
|
||||
|
||||
# 并行策略- 线程控制模块
|
||||
|
||||
不宜使用并行框架,可使用C的pthread,boost的thread库,或者启用C++11,gcc编译器需要高于4.8.1才能完整支持C++11
|
||||
|
|
|
@ -1537,3 +1537,85 @@ Util::_pso_cmp(const void* _a, const void* _b)
|
|||
return 0;
|
||||
}
|
||||
|
||||
bool
|
||||
Util::spo_cmp_idtuple(const ID_TUPLE& a, const ID_TUPLE& b)
|
||||
{
|
||||
if(a.subid != b.subid)
|
||||
{
|
||||
return a.subid < b.subid;
|
||||
}
|
||||
|
||||
if(a.preid != b.preid)
|
||||
{
|
||||
return a.preid < b.preid;
|
||||
}
|
||||
|
||||
if(a.objid != b.objid)
|
||||
{
|
||||
return a.objid < b.objid;
|
||||
}
|
||||
|
||||
//all are equal, no need to sort this two
|
||||
return false;
|
||||
}
|
||||
|
||||
bool
|
||||
Util::ops_cmp_idtuple(const ID_TUPLE& a, const ID_TUPLE& b)
|
||||
{
|
||||
if(a.objid != b.objid)
|
||||
{
|
||||
return a.objid < b.objid;
|
||||
}
|
||||
|
||||
if(a.preid != b.preid)
|
||||
{
|
||||
return a.preid < b.preid;
|
||||
}
|
||||
|
||||
if(a.subid != b.subid)
|
||||
{
|
||||
return a.subid < b.subid;
|
||||
}
|
||||
|
||||
//all are equal, no need to sort this two
|
||||
return false;
|
||||
}
|
||||
|
||||
bool
|
||||
Util::pso_cmp_idtuple(const ID_TUPLE& a, const ID_TUPLE& b)
|
||||
{
|
||||
if(a.preid != b.preid)
|
||||
{
|
||||
return a.preid < b.preid;
|
||||
}
|
||||
|
||||
if(a.subid != b.subid)
|
||||
{
|
||||
return a.subid < b.subid;
|
||||
}
|
||||
|
||||
if(a.objid != b.objid)
|
||||
{
|
||||
return a.objid < b.objid;
|
||||
}
|
||||
|
||||
//all are equal, no need to sort this two
|
||||
return false;
|
||||
}
|
||||
|
||||
void
|
||||
Util::empty_file(const char* _fname)
|
||||
{
|
||||
FILE * fp;
|
||||
//NOTICE: if exist, then overwrite and create a empty file
|
||||
fp = fopen(_fname, "w");
|
||||
if(fp == NULL)
|
||||
{
|
||||
printf("do empty file %s failed\n", _fname);
|
||||
}
|
||||
else
|
||||
{
|
||||
fclose(fp);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
12
Util/Util.h
12
Util/Util.h
|
@ -175,6 +175,13 @@ static const unsigned INVALID = UINT_MAX;
|
|||
//
|
||||
//NOTICE: if use define, the type is none
|
||||
|
||||
typedef struct TYPE_ID_TUPLE
|
||||
{
|
||||
TYPE_ENTITY_LITERAL_ID subid;
|
||||
TYPE_ENTITY_LITERAL_ID preid;
|
||||
TYPE_ENTITY_LITERAL_ID objid;
|
||||
}ID_TUPLE;
|
||||
|
||||
/******** all static&universal constants and functions ********/
|
||||
class Util
|
||||
{
|
||||
|
@ -247,6 +254,7 @@ public:
|
|||
static std::string getExactPath(const char* path);
|
||||
static std::string getItemsFromDir(std::string path);
|
||||
static void logging(std::string _str);
|
||||
static void empty_file(const char* _fname);
|
||||
|
||||
// Below are some useful hash functions for string
|
||||
static unsigned simpleHash(const char *_str);
|
||||
|
@ -295,6 +303,10 @@ public:
|
|||
static int _spo_cmp(const void* _a, const void* _b);
|
||||
static int _ops_cmp(const void* _a, const void* _b);
|
||||
static int _pso_cmp(const void* _a, const void* _b);
|
||||
//sort functions for sort on ID_TUPLE
|
||||
static bool spo_cmp_idtuple(const ID_TUPLE& a, const ID_TUPLE& b);
|
||||
static bool ops_cmp_idtuple(const ID_TUPLE& a, const ID_TUPLE& b);
|
||||
static bool pso_cmp_idtuple(const ID_TUPLE& a, const ID_TUPLE& b);
|
||||
|
||||
static std::string tmp_path;
|
||||
// this are for debugging
|
||||
|
|
|
@ -0,0 +1,12 @@
|
|||
{
|
||||
"config": {
|
||||
"ghooks": {
|
||||
"commit-msg": "validate-commit-msg"
|
||||
}
|
||||
},
|
||||
|
||||
"scripts": {
|
||||
"changelog-all": "conventional-changelog -p angular -i CHANGELOG.md -w -r 0",
|
||||
"changelog": "conventional-changelog -p angular -i CHANGELOG.md -w"
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue