refactor: add VList for IVTree

also, lower the copy cost in KVstore

by zengli, all changes closed in KVstore, using new/delete for all instead of malloc/free
This commit is contained in:
bookug 2017-04-01 16:03:05 +08:00
parent ceff3544ae
commit 80080d1bca
22 changed files with 317 additions and 82 deletions

View File

@ -800,6 +800,7 @@ Database::build(const string& _rdf_file)
//sync();
//cout << "sync vstree" << endl;
//TODO: use fopen w+ to remove signature.binary file
//string cmd = "rm -rf " + _entry_file;
//system(cmd.c_str());
//cout << "signature file removed" << endl;

View File

@ -419,7 +419,8 @@ ISStorage::readBstr(Bstr* _bp, unsigned* _next)
fread(&len, sizeof(unsigned), 1, this->treefp);
this->ReadAlign(_next);
//this->request(len);
char* s = (char*)malloc(len);
//char* s = (char*)malloc(len);
char* s = new char[len];
_bp->setLen(len);
for (i = 0; i + 4 < len; i += 4)
{

View File

@ -237,7 +237,11 @@ IVTree::insert(unsigned _key, char* _str, unsigned _len)
p->addKey(_key, i);
p->addValue(this->value_list, i, _str, _len, true);
p->addNum();
//NOTICE: if this is a vlist, then it will be freed, and should not be included in the request memory
if(!VList::isLongList(_len))
{
request += _len;
}
//request += val->getLen();
p->setDirty();
this->TSM->updateHeap(p, p->getRank(), true);
@ -272,13 +276,20 @@ IVTree::modify(unsigned _key, char* _str, unsigned _len)
//NOTICE+DEBUG: if this value is a long list, then it is not saved in memory, here should return 0 in Bstr
unsigned len = ret->getValue(store)->getLen();
if(ret->getValue(store)->isBstrLongList())
{
len = 0;
}
ret->setValue(this->value_list, store, _str, _len, true);
//ret->setValue(val, store, true);
//cout<<"value reset"<<endl;
//cout<<"newlen: "<<val->getLen()<<" oldlen: "<<len<<endl;
//request += (val->getLen() - len);
this->request = _len;
if(!VList::isLongList(_len))
{
this->request += _len;
}
//this->request = val->getLen();
this->request -= len;
ret->setDirty();
@ -416,8 +427,11 @@ IVTree::remove(unsigned _key)
i = p->searchKey_equal(_key);
//WARN+NOTICE: we must check here, because the key to remove may not exist
if (i != (int)p->getNum())
{
if(!p->getValue(i)->isBstrLongList())
{
request -= p->getValue(i)->getLen();
}
p->subKey(i); //to release
p->subValue(this->value_list, i, true); //to release
p->subNum();
@ -605,6 +619,8 @@ IVTree::release(IVNode* _np) const
IVTree::~IVTree()
{
delete this->value_list;
delete this->stream; //maybe NULL
delete TSM;
#ifdef DEBUG_KVSTORE

View File

@ -126,6 +126,9 @@ IVLeafNode::getValue(VList* _vlist, int _index, char*& _str, unsigned& _len) con
//read long list
if(this->values[_index].isBstrLongList())
{
#ifdef DEBUG_VLIST
cout<<"this is a vlist in get()"<<endl;
#endif
unsigned block_num = this->values[_index].getLen();
_vlist->readValue(block_num, _str, _len);
}
@ -150,6 +153,9 @@ IVLeafNode::setValue(VList* _vlist, int _index, char* _str, unsigned _len, bool
if(this->values[_index].isBstrLongList())
{
#ifdef DEBUG_VLIST
cout<<"this is a vlist in set()"<<endl;
#endif
unsigned block_num = this->values[_index].getLen();
_vlist->removeValue(block_num);
}
@ -173,6 +179,8 @@ IVLeafNode::setValue(VList* _vlist, int _index, char* _str, unsigned _len, bool
unsigned block_num = _vlist->writeValue(_str, _len);
this->values[_index].setStr(NULL);
this->values[_index].setLen(block_num);
//NOTICE: we need to free the long list value
delete[] _str;
}
else
{
@ -203,9 +211,17 @@ IVLeafNode::addValue(VList* _vlist, int _index, char* _str, unsigned _len, bool
if(VList::isLongList(_len))
{
#ifdef DEBUG_VLIST
cout<<"this is a vlist in add()"<<endl;
#endif
unsigned block_num = _vlist->writeValue(_str, _len);
this->values[_index].setStr(NULL);
this->values[_index].setLen(block_num);
//NOTICE: we need to free the long list value
delete[] _str;
#ifdef DEBUG_VLIST
//cout<<"to check vlist: "<<this->values[_index].getLen()<<endl;
#endif
}
else
{

View File

@ -444,6 +444,9 @@ IVStorage::readBstr(Bstr* _bp, unsigned* _next)
{
unsigned addr = 0;
fread(&addr, sizeof(unsigned), 1, this->treefp);
#ifdef DEBUG_VLIST
cout<<"read a vlist in IVStorage - addr: "<<addr<<endl;
#endif
_bp->setLen(addr);
_bp->setStr(NULL);
this->ReadAlign(_next);
@ -489,6 +492,9 @@ IVStorage::writeBstr(const Bstr* _bp, unsigned* _curnum, bool& _SpecialBlock)
this->WriteAlign(_curnum, _SpecialBlock);
//then this is the real block num
fwrite(&len, sizeof(unsigned), 1, treefp);
#ifdef DEBUG_VLIST
cout<<"to write a vlist in IVStorage::writeBstr() - blocknum: "<<len<<endl;
#endif
this->WriteAlign(_curnum, _SpecialBlock);
return true;
}

View File

@ -12,7 +12,8 @@ using namespace std;
//sets store_path as the root dir of this KVstore
//initial all Tree pointers as NULL
KVstore::KVstore(string _store_path) {
KVstore::KVstore(string _store_path)
{
this->store_path = _store_path;
this->entity2id = NULL;
@ -30,14 +31,17 @@ KVstore::KVstore(string _store_path) {
}
//Release all the memory used in this KVstore before destruction
KVstore::~KVstore() {
KVstore::~KVstore()
{
this->flush();
this->release();
}
//Flush all modified parts into the disk, which will not release any memory
//Does nothing to null tree pointers or parts that has not been modified
void KVstore::flush() {
void
KVstore::flush()
{
this->flush(this->entity2id);
this->flush(this->id2entity);
@ -52,7 +56,9 @@ void KVstore::flush() {
this->flush(this->objID2values);
}
void KVstore::release() {
void
KVstore::release()
{
delete this->entity2id;
this->entity2id = NULL;
delete this->id2entity;
@ -76,7 +82,9 @@ void KVstore::release() {
this->objID2values = NULL;
}
void KVstore::open() {
void
KVstore::open()
{
cout << "open KVstore" << endl;
this->open_entity2id(KVstore::READ_WRITE_MODE);
@ -93,102 +101,192 @@ void KVstore::open() {
this->open_preID2values(KVstore::READ_WRITE_MODE);
}
int KVstore::getEntityDegree(int _entity_id) const {
int
KVstore::getEntityDegree(int _entity_id) const
{
return this->getEntityInDegree(_entity_id) + this->getEntityOutDegree(_entity_id);
}
int KVstore::getEntityInDegree(int _entity_id) const {
int
KVstore::getEntityInDegree(int _entity_id) const
{
//cout << "In getEntityInDegree " << _entity_id << endl;
unsigned* _tmp = NULL;
unsigned _len = 0;
bool _get = this->getValueByKey(this->objID2values, _entity_id, (char*&)_tmp, _len);
if (!_get) {
return 0;
int ret = 0;
if (_get)
{
ret = _tmp[0];
}
return _tmp[0];
//if this is a long list, then we should remove itself after copying
//otherwise, we should not free the list memory
if(VList::isLongList(_len))
{
delete[] _tmp;
//_tmp = NULL;
}
return ret;
}
int KVstore::getEntityOutDegree(int _entity_id) const {
int
KVstore::getEntityOutDegree(int _entity_id) const
{
//cout << "In getEntityOutDegree " << _entity_id << endl;
unsigned* _tmp = NULL;
unsigned _len = 0;
bool _get = this->getValueByKey(this->subID2values, _entity_id, (char*&)_tmp, _len);
if (!_get) {
return 0;
int ret = 0;
if (_get)
{
ret = _tmp[0];
}
return _tmp[0];
//if this is a long list, then we should remove itself after copying
//otherwise, we should not free the list memory
if(VList::isLongList(_len))
{
delete[] _tmp;
//_tmp = NULL;
}
return ret;
}
int KVstore::getLiteralDegree(int _literal_id) const {
int
KVstore::getLiteralDegree(int _literal_id) const
{
//cout << "In getLiteralDegree " << _literal_id << endl;
unsigned* _tmp = NULL;
unsigned _len = 0;
bool _get = this->getValueByKey(this->objID2values, _literal_id, (char*&)_tmp, _len);
if (!_get) {
return 0;
int ret = 0;
if (_get)
{
ret = _tmp[0];
}
return _tmp[0];
//if this is a long list, then we should remove itself after copying
//otherwise, we should not free the list memory
if(VList::isLongList(_len))
{
delete[] _tmp;
//_tmp = NULL;
}
return ret;
}
int KVstore::getPredicateDegree(int _predicate_id) const {
int
KVstore::getPredicateDegree(int _predicate_id) const
{
//cout << "In getPredicate Degree " << _predicate_id << endl;
unsigned* _tmp = NULL;
unsigned _len = 0;
bool _get = this->getValueByKey(this->preID2values, _predicate_id, (char*&)_tmp, _len);
if (!_get) {
return 0;
int ret = 0;
if (_get)
{
ret = _tmp[0];
}
return _tmp[0];
//if this is a long list, then we should remove itself after copying
//otherwise, we should not free the list memory
if(VList::isLongList(_len))
{
delete[] _tmp;
//_tmp = NULL;
}
return ret;
}
int KVstore::getSubjectPredicateDegree(int _subid, int _preid) const {
int
KVstore::getSubjectPredicateDegree(int _subid, int _preid) const
{
//cout << "In getSubjectPredicateDegree " << _subid << ' ' << _preid << endl;
//TODO: use unsigned
int* _tmp = NULL;
unsigned _len = 0;
bool _get = this->getValueByKey(this->subID2values, _subid, (char*&)_tmp, _len);
if (!_get) {
return 0;
}
int ret = 0;
if(_get)
{
int _result = KVstore::binarySearch(_preid, _tmp + 3, _tmp[1], 2);
if (_result == -1) {
return 0;
}
if (_result != -1)
{
int _offset = _tmp[4 + 2 * _result];
int _offset_next;
if (_result == _tmp[1] - 1) {
if (_result == _tmp[1] - 1)
{
_offset_next = 3 + 2 * _tmp[1] + _tmp[0];
}
else {
else
{
_offset_next = _tmp[6 + 2 * _result];
}
return _offset_next - _offset;
ret = _offset_next - _offset;
}
}
//if this is a long list, then we should remove itself after copying
//otherwise, we should not free the list memory
if(VList::isLongList(_len))
{
delete[] _tmp;
//_tmp = NULL;
}
return ret;
}
int KVstore::getObjectPredicateDegree(int _objid, int _preid) const {
int
KVstore::getObjectPredicateDegree(int _objid, int _preid) const
{
//cout << "In getObjectPredicateDegree " << _objid << _preid << endl;
//TODO: use unsigned
int* _tmp = NULL;
unsigned _len = 0;
bool _get = this->getValueByKey(this->objID2values, _objid, (char*&)_tmp, _len);
if (!_get) {
return 0;
}
int ret = 0;
if (_get)
{
int _result = KVstore::binarySearch(_preid, _tmp + 2, _tmp[1], 2);
if (_result == -1) {
return 0;
}
if (_result != -1)
{
int _offset = _tmp[3 + 2 * _result];
int _offset_next;
if (_result == _tmp[1] - 1) {
if (_result == _tmp[1] - 1)
{
_offset_next = 2 + 2 * _tmp[1] + _tmp[0];
}
else {
else
{
_offset_next = _tmp[5 + 2 * _result];
}
return _offset_next - _offset;
ret = _offset_next - _offset;
}
}
//if this is a long list, then we should remove itself after copying
//otherwise, we should not free the list memory
if(VList::isLongList(_len))
{
delete[] _tmp;
//_tmp = NULL;
}
return ret;
}
bool KVstore::updateTupleslist_insert(int _sub_id, int _pre_id, int _obj_id) {

View File

@ -419,7 +419,8 @@ SIStorage::readBstr(Bstr* _bp, unsigned* _next)
fread(&len, sizeof(unsigned), 1, this->treefp);
this->ReadAlign(_next);
//this->request(len);
char* s = (char*)malloc(len);
//char* s = (char*)malloc(len);
char* s = new char[len];
_bp->setLen(len);
for (i = 0; i + 4 < len; i += 4)
{

View File

@ -89,6 +89,7 @@ http://blog.csdn.net/infoworld/article/details/8670951
最好在单机100G内存上支持起freebase(2.5B triples)这个规模的数据集就像jena和virtuoso一样慢不要紧
type分支中query过程可能还有问题需要修改Query/里面的类型另外stringindex中也要修改分界线已经是20亿且非法不再是-1
remove signature.binary, 合并两个分支type value
vstree在build和query时可以用不同大小的缓存来加速build过程
---
UBSTR: 类型bstr的length问题也需要解决 如果把类型直接改成long long空间开销一下子就上升了一倍
@ -469,6 +470,8 @@ build db error if triple num > 500M
# BETTER
#### 添加数据访问层,数据范式和生成数据访问的源码
#### 在BasicQuery.cpp中的encodeBasicQuery函数中发现有pre_id==-1时就可以直接中止查询返回空值
#### 将KVstore模块中在堆中寻找Node*的操作改为用treap实现(或多存指针避开搜索?)

View File

@ -28,7 +28,9 @@ Bstr::Bstr(const char* _str, unsigned _len, bool _nocopy)
//return;
//}
this->str = (char*)malloc(_len);
//NOTICE: we decide to use new/delete in global area
//this->str = (char*)malloc(_len);
this->str = new char[_len];
memcpy(this->str, _str, sizeof(char) * _len);
//this->str[_len]='\0';
}
@ -121,12 +123,14 @@ Bstr::operator != (const Bstr& _bstr)
unsigned
Bstr::getLen() const
{
//WARN: we should not include overly complicated logic here!!!!
//NOTICE: this is for VList
if(this->isBstrLongList())
//if(this->str == NULL)
{
return 0;
}
//if(this->isBstrLongList())
////if(this->str == NULL)
//{
//return 0;
//}
return length;
}
@ -158,15 +162,18 @@ Bstr::copy(const Bstr* _bp)
this->length = _bp->getLen();
//DEBUG!!!
//cerr<<"bstr length: "<<this->length<<endl;
this->str = (char*)malloc(this->length);
memcpy(this->str, _bp->getStr(), this->length);
//this->str = (char*)malloc(this->length);
this->str = new char[this->length];
memcpy(this->str, _bp->getStr(), sizeof(char) * this->length);
}
void
Bstr::copy(const char* _str, unsigned _len)
{
this->length = _len;
this->str = (char*)malloc(this->length);
//this->str = (char*)malloc(this->length);
this->str = new char[this->length];
memcpy(this->str, _str, this->length);
}
@ -180,7 +187,8 @@ Bstr::clear()
void
Bstr::release()
{
free(this->str); //ok to be null, do nothing
//free(this->str); //ok to be null, do nothing
delete[] this->str;
clear();
}

View File

@ -58,7 +58,8 @@ Stream::Stream(std::vector<int>& _keys, std::vector<bool>& _desc, unsigned _rown
this->record_size = new unsigned[this->colnum];
for(unsigned i = 0; i < this->colnum; ++i)
{
this->record[i].setStr((char*)malloc(Util::TRANSFER_SIZE));
char* tmptr = new char[Util::TRANSFER_SIZE];
this->record[i].setStr(tmptr);
this->record_size[i] = Util::TRANSFER_SIZE;
}
@ -148,7 +149,8 @@ Stream::copyToRecord(const char* _str, unsigned _len, unsigned _idx)
if(length + 1 > this->record_size[_idx])
{
this->record[_idx].release();
this->record[_idx].setStr((char*)malloc((length + 1) * sizeof(char)));
char* tmptr = new char[length+1];
this->record[_idx].setStr(tmptr);
this->record_size[_idx] = length + 1; //one more byte: convenient to add \0
}
@ -187,7 +189,8 @@ Stream::outputCache()
{
unsigned len;
fread(&len, sizeof(unsigned), 1, this->tempfp);
char* p = (char*)malloc(len * sizeof(char));
//char* p = (char*)malloc(len * sizeof(char));
char* p = new char[len];
fread(p, sizeof(char), len, this->tempfp);
bp[i].setLen(len);
bp[i].setStr(p);
@ -320,13 +323,16 @@ Stream::read()
//FILE* fp = (FILE*)(this->ans);
for(unsigned i = 0; i < this->colnum; ++i)
{
//BETTER:alloca and reuse the space in Bstr?
//BETTER:alloc and reuse the space in Bstr?
unsigned len;
fread(&len, sizeof(unsigned), 1, this->ansDisk);
char* s = (char*)calloc(len + 1, sizeof(char));
//char* s = (char*)calloc(len + 1, sizeof(char));
char* s = new char[len+1];
fread(s, sizeof(char), len, this->ansDisk);
s[len] = '\0';
this->copyToRecord(s, len, i);
free(s);
//free(s);
delete[] s;
}
}
this->xpos++;
@ -420,7 +426,9 @@ Stream::mergeSort()
#endif
break;
}
s = (char*)malloc(sizeof(char) * len);
//s = (char*)malloc(sizeof(char) * len);
s = new char[len];
fread(s, sizeof(char), len, tp);
bp[i].setLen(len);
bp[i].setStr(s);

View File

@ -651,7 +651,14 @@ Util::result_id_str(vector<int*>& _v, int _var_num)
bool
Util::dir_exist(const string _dir)
{
return (opendir(_dir.c_str()) != NULL);
DIR* dirptr = opendir(_dir.c_str());
if(dirptr != NULL)
{
closedir(dirptr);
return true;
}
return false;
}
bool

View File

@ -91,6 +91,7 @@ in the sparql query can point to the same node in data graph)
//#define DEBUG_VSTREE 1 //in Database
//#define DEBUG_LRUCACHE 1
//#define DEBUG_DATABASE 1 //in Database
//#define DEBUG_VLIST 1
//
//
@ -124,6 +125,12 @@ in the sparql query can point to the same node in data graph)
#endif
#endif
#ifdef DEBUG_VLIST
#ifndef DEBUG
#define DEBUG
#endif
#endif
#ifndef DEBUG
//#define DEBUG
#endif

View File

@ -180,6 +180,9 @@ VList::WriteAlign(unsigned* _curnum)
bool
VList::readValue(unsigned _block_num, char*& _str, unsigned& _len)
{
#ifdef DEBUG_VLIST
cout<<"to get value of block num: "<<_block_num<<endl;
#endif
fseek(valfp, Address(_block_num), SEEK_SET);
unsigned next;
fread(&next, sizeof(unsigned), 1, valfp);
@ -193,8 +196,14 @@ VList::writeValue(const char* _str, unsigned _len)
{
unsigned blocknum = this->AllocBlock();
unsigned curnum = blocknum;
//NOTICE: here we must skip the next position first
fseek(valfp, Address(curnum) + 4, SEEK_SET);
this->writeBstr(_str, _len, &curnum);
#ifdef DEBUG_VLIST
cout<<"to write value - block num: "<<blocknum<<endl;
#endif
return blocknum;
}
@ -222,9 +231,13 @@ VList::readBstr(char*& _str, unsigned& _len, unsigned* _next)
//long address;
unsigned len, i, j;
fread(&len, sizeof(unsigned), 1, this->valfp);
#ifdef DEBUG_VLIST
cout<<"the length of value: "<<len<<endl;
#endif
this->ReadAlign(_next);
char* s = (char*)malloc(len);
//char* s = (char*)malloc(len);
char* s = new char[len];
_len = len;
for (i = 0; i + 4 < len; i += 4)
@ -257,6 +270,7 @@ VList::writeBstr(const char* _str, unsigned _len, unsigned* _curnum)
unsigned i, j, len = _len;
fwrite(&len, sizeof(unsigned), 1, valfp);
this->WriteAlign(_curnum);
//cout<<"to write bstr, length: "<<len<<endl;
//BETTER: compute this need how many blocks first, then write a block a time
@ -289,7 +303,39 @@ VList::writeBstr(const char* _str, unsigned _len, unsigned* _curnum)
VList::~VList()
{
BlockInfo* bp = this->freelist;
//write the info back
fseek(this->valfp, 0, SEEK_SET);
fwrite(&cur_block_num, sizeof(unsigned), 1, valfp);//write current blocks num
fseek(valfp, BLOCK_SIZE, SEEK_SET);
int i, j = cur_block_num / 8; //(SuperNum-1)*BLOCK_SIZE;
for (i = 0; i < j; ++i)
{
//reset to 1 first
fputc(0xff, valfp);
}
char c;
BlockInfo* bp = this->freelist->next;
while (bp != NULL)
{
//if not-use then set 0, aligned to byte!
#ifdef DEBUG_KVSTORE
if (bp->num > cur_block_num)
{
printf("blocks num exceed, cur_block_num: %u\n", cur_block_num);
exit(1);
}
#endif
j = bp->num - 1;
i = j / 8;
j = 7 - j % 8;
fseek(valfp, BLOCK_SIZE + i, SEEK_SET);
c = fgetc(valfp);
fseek(valfp, -1, SEEK_CUR);
fputc(c & ~(1 << j), valfp);
bp = bp->next;
}
bp = this->freelist;
BlockInfo* next;
while (bp != NULL)
{

View File

@ -12,9 +12,6 @@
#include "Util.h"
#include "Bstr.h"
//TODO: all use new/delete for Bstr, KVstore and trees, including Stream
//then give a full test, including valgrind
//NOTICE: not keep long list in memory, read each time
//but when can you free the long list(kvstore should release it after parsing)
//
@ -31,15 +28,19 @@
//file1 is tree file, the long list is represented as: 0 real-address
//NOTICE: long list is not kept in memory for cache, it is read/updated each time on demand!
//TODO: use fread/fwrite here instead of fgetc/fputc
//including other trees
class VList
{
public:
//NOTICE: the border was 10^6 (now lowered to 1000, see below), but the block is larger, 1M
static const unsigned LENGTH_BORDER = 1000000;
//static const unsigned LENGTH_BORDER = 1000000;
static const unsigned LENGTH_BORDER = 1000;
static const unsigned BLOCK_SIZE = 1 << 20; //fixed size of disk-block
static const unsigned MAX_BLOCK_NUM = 1 << 23; //max block-num
//the two constants below must be exactly divisible by 8
static const unsigned SET_BLOCK_NUM = 1 << 2; //initial blocks num
static const unsigned SET_BLOCK_NUM = 1 << 3; //initial blocks num
static const unsigned SET_BLOCK_INC = SET_BLOCK_NUM; //base of blocks-num inc
static const unsigned SuperNum = MAX_BLOCK_NUM / (8 * BLOCK_SIZE) + 1;

5
data/bbug0.sql Normal file
View File

@ -0,0 +1,5 @@
INSERT DATA
{
<http://www.founder/102> <http://www.founder.20.link:52> <http://www.founder/106> .
<http://www.founder/102> <http://www.founder> <http://www.founder/73> .
}

1
data/bbug0d.sql Normal file
View File

@ -0,0 +1 @@
DELETE DATA { <http://www.founder/102> <http://www.founder.20.link:52> <http://www.founder/106> . }

1
data/bbug1.sql Normal file
View File

@ -0,0 +1 @@
select ?subject ?predict ?object WHERE { ?subject <http://www.founder.20.link:52> ?object; ?predict ?object . }

1
data/bbug2.sql Normal file
View File

@ -0,0 +1 @@
DELETE WHERE { <http://www.founder/101> ?predict ?object . }

1
data/bbug3.sql Normal file
View File

@ -0,0 +1 @@
select ?predict where {<http://www.founder/102> ?predict <http://www.founder/73> .}

5
data/bbug4.sql Normal file
View File

@ -0,0 +1,5 @@
select ?subject ?predict ?object where
{
<http://www.founder/102> <http://www.founder.20.link:52> ?object.
?subject ?predict ?object.
}

1
data/bbug5.sql Normal file
View File

@ -0,0 +1 @@
select ?subject ?predict ?object where {?subject <http://www.founder.20.link:52> <http://www.founder/106>; ?predict ?object . }

1
data/bbug6.sql Normal file
View File

@ -0,0 +1 @@
DELETE WHERE { ?subject <http://www.founder.20.link:52> ?objcet. }