refactor: adjust encode strategy

use segments for edge signature, not for str part

by zengli, no influence on other modules
This commit is contained in:
bookug 2017-04-16 19:49:46 +08:00
parent a47bf1d8ac
commit 309d0b9c0a
3 changed files with 24 additions and 8 deletions

View File

@ -79,6 +79,10 @@ http://blog.csdn.net/infoworld/article/details/8670951
要在单机支持到10亿triple，最坏情况下最多有20亿entity和20亿literal，目前的编号方式是不行的(int扩展为unsigned)。
最好在单机100G内存上支持起freebase(2.5B triples)这个规模的数据集，就像jena和virtuoso一样，慢不要紧。
OPTIMIZE: in pre filter and only_pre_after_filter, p2s and p2o should be used instead of a for-loop.
In addition, a pre with lower degree should be placed in an earlier position.
Use hash-link for large candidate lists; use sort-link for small candidate lists and for very large lists.
同时将ID的编码改为unsigned，无效标志-1改为最大值的宏，triple数目的类型也要改为unsigned。
注意pre的ID还可以为-2，或者对于pre仍然用int，或者改函数的返回值为long long (还有一些没有用-1而是>=0)。
---

View File

@ -62,11 +62,15 @@ Signature::encodePredicate2Entity(EntityBitSet& _entity_bs, int _pre_id, const c
long long id = _pre_id;
int seed_num = id % Signature::EDGE_SIG_INTERVAL_NUM_HALF;
//int pos = Signature::STR_SIG_LENGTH;
if (_type == Util::EDGE_OUT)
{
seed_num += Signature::EDGE_SIG_INTERVAL_NUM_HALF;
//pos += Signature::EDGE_SIG_IN;
}
//pos += (_pre_id % Signature::EDGE_SIG_OUT);
//int primeSize = 5;
//int prime1[]={5003,5009,5011,5021,5023};
//int prime2[]={49943,49957,49991,49993,49999};
@ -87,6 +91,7 @@ Signature::encodePredicate2Entity(EntityBitSet& _entity_bs, int _pre_id, const c
//int pos = (seed % Signature::EDGE_SIG_INTERVAL_BASE) + Signature::STR_SIG_LENGTH + Signature::EDGE_SIG_INTERVAL_BASE * seed_num;
//_entity_bs.set(pos);
//}
int seed = id * 5003 % 49957;
int pos = (seed % Signature::EDGE_SIG_INTERVAL_BASE) + Signature::STR_SIG_LENGTH + Signature::EDGE_SIG_INTERVAL_BASE * seed_num;
_entity_bs.set(pos);
@ -132,9 +137,11 @@ Signature::encodeStr2Entity(EntityBitSet& _entity_bs, int _neighbor_id, const ch
//NOTICE: we assume the parameter is always valid(invalid args should not be passed here)
long long id = _neighbor_id;
//NOTICE: in * maybe the int will overflow
long long seed = id * 5003 % 49957;
seed = seed % Signature::STR_SIG_INTERVAL_BASE;
seed = seed + (id % Signature::STR_SIG_INTERVAL_NUM) * Signature::STR_SIG_INTERVAL_BASE;
//long long seed = id * 5003 % 49957;
//seed = seed % Signature::STR_SIG_INTERVAL_BASE;
//seed = seed + (id % Signature::STR_SIG_INTERVAL_NUM) * Signature::STR_SIG_INTERVAL_BASE;
int seed = _neighbor_id % Signature::STR_SIG_LITERAL;
if(Util::is_literal_ele(_neighbor_id))
{

View File

@ -24,10 +24,11 @@ public:
//static HashFunction hash[HashNum];
//must make sure: ENTITY_SIG_LENGTH = EDGE_SIG_LENGTH + STR_SIG_LENGTH
static const int STR_SIG_INTERVAL_NUM = 20;
//static const int STR_SIG_INTERVAL_NUM = 20;
//static const int STR_SIG_INTERVAL_NUM = 16;
static const int STR_SIG_INTERVAL_BASE = 10;
static const int STR_SIG_LITERAL = STR_SIG_INTERVAL_NUM * STR_SIG_INTERVAL_BASE;
//static const int STR_SIG_INTERVAL_BASE = 10;
//static const int STR_SIG_LITERAL = STR_SIG_INTERVAL_NUM * STR_SIG_INTERVAL_BASE;
static const int STR_SIG_LITERAL = 200;
static const int STR_SIG_ENTITY = STR_SIG_LITERAL * 2;
//here we divide as entity neighbors and literal neighbors: ENTITY(in and out), LITERAL(only for out edges)
static const int STR_SIG_LENGTH = STR_SIG_ENTITY + STR_SIG_LITERAL; //600
@ -39,12 +40,16 @@ public:
//str filter is more important in VSTree than predicate, because
//a predicate may correspond to a lot of entities and predicate num is usually small
static const int EDGE_SIG_INTERVAL_NUM_HALF = 10; //in edge or out edge
//static const int EDGE_SIG_INTERVAL_NUM_HALF = 16; //in edge or out edge
static const int EDGE_SIG_INTERVAL_NUM_HALF = 10; //in edge or out edge
static const int EDGE_SIG_INTERVAL_NUM = 2 * EDGE_SIG_INTERVAL_NUM_HALF;
static const int EDGE_SIG_INTERVAL_BASE = 10;
static const int EDGE_SIG_LENGTH = EDGE_SIG_INTERVAL_NUM * EDGE_SIG_INTERVAL_BASE; //200
//static const int EDGE_SIG_LENGTH2 = EDGE_SIG_INTERVAL_NUM_HALF * EDGE_SIG_INTERVAL_BASE;
static const int EDGE_SIG_LENGTH2 = EDGE_SIG_INTERVAL_NUM_HALF * EDGE_SIG_INTERVAL_BASE;
//static const int EDGE_SIG_IN = 100;
//static const int EDGE_SIG_OUT = EDGE_SIG_IN;
//static const int EDGE_SIG_LENGTH = EDGE_SIG_IN + EDGE_SIG_OUT; //200
static const int ENTITY_SIG_LENGTH = STR_SIG_LENGTH + EDGE_SIG_LENGTH; //1000
//static const int ENTITY_SIG_LENGTH = STR_SIG_LENGTH + EDGE_SIG_LENGTH + NEIGHBOR_SIG_LENGTH;