refactor: adjust encode strategy
use segments for edge signature, not for str part by zengli, no influence on other modules
This commit is contained in:
parent
a47bf1d8ac
commit
309d0b9c0a
4
NOTES.md
4
NOTES.md
|
@ -79,6 +79,10 @@ http://blog.csdn.net/infoworld/article/details/8670951
|
|||
要在单机支持到10亿triple,最坏情况下最多有20亿entity和20亿literal,目前的编号方式是不行的(int扩展为unsigned)
|
||||
最好在单机100G内存上支持起freebase(2.5B triples)这个规模的数据集,就像jena和virtuoso一样,慢不要紧
|
||||
|
||||
OPTIMIZE: in pre filter and only_pre_after_filter, p2s and p2o should be used instead of a for-loop
|
||||
In addition, a pre with a lower degree should be placed in an earlier position
|
||||
use hash-link for large candidates; use sort-link for small candidates and very large lists
|
||||
|
||||
同时将ID的编码改为unsigned,无效标志-1改为最大值的宏, triple数目的类型也要改为unsigned
|
||||
注意pre的ID还可以为-2,或者对于pre仍然用int,或者改函数的返回值为long long (还有一些没有用-1而是>=0)
|
||||
---
|
||||
|
|
|
@ -62,11 +62,15 @@ Signature::encodePredicate2Entity(EntityBitSet& _entity_bs, int _pre_id, const c
|
|||
long long id = _pre_id;
|
||||
int seed_num = id % Signature::EDGE_SIG_INTERVAL_NUM_HALF;
|
||||
|
||||
//int pos = Signature::STR_SIG_LENGTH;
|
||||
if (_type == Util::EDGE_OUT)
|
||||
{
|
||||
seed_num += Signature::EDGE_SIG_INTERVAL_NUM_HALF;
|
||||
//pos += Signature::EDGE_SIG_IN;
|
||||
}
|
||||
|
||||
//pos += (_pre_id % Signature::EDGE_SIG_OUT);
|
||||
|
||||
//int primeSize = 5;
|
||||
//int prime1[]={5003,5009,5011,5021,5023};
|
||||
//int prime2[]={49943,49957,49991,49993,49999};
|
||||
|
@ -87,6 +91,7 @@ Signature::encodePredicate2Entity(EntityBitSet& _entity_bs, int _pre_id, const c
|
|||
//int pos = (seed % Signature::EDGE_SIG_INTERVAL_BASE) + Signature::STR_SIG_LENGTH + Signature::EDGE_SIG_INTERVAL_BASE * seed_num;
|
||||
//_entity_bs.set(pos);
|
||||
//}
|
||||
|
||||
int seed = id * 5003 % 49957;
|
||||
int pos = (seed % Signature::EDGE_SIG_INTERVAL_BASE) + Signature::STR_SIG_LENGTH + Signature::EDGE_SIG_INTERVAL_BASE * seed_num;
|
||||
_entity_bs.set(pos);
|
||||
|
@ -132,9 +137,11 @@ Signature::encodeStr2Entity(EntityBitSet& _entity_bs, int _neighbor_id, const ch
|
|||
//NOTICE: we assume the parameter is always valid(invalid args should not be passed here)
|
||||
long long id = _neighbor_id;
|
||||
//NOTICE: in * maybe the int will overflow
|
||||
long long seed = id * 5003 % 49957;
|
||||
seed = seed % Signature::STR_SIG_INTERVAL_BASE;
|
||||
seed = seed + (id % Signature::STR_SIG_INTERVAL_NUM) * Signature::STR_SIG_INTERVAL_BASE;
|
||||
//long long seed = id * 5003 % 49957;
|
||||
//seed = seed % Signature::STR_SIG_INTERVAL_BASE;
|
||||
//seed = seed + (id % Signature::STR_SIG_INTERVAL_NUM) * Signature::STR_SIG_INTERVAL_BASE;
|
||||
|
||||
int seed = _neighbor_id % Signature::STR_SIG_LITERAL;
|
||||
|
||||
if(Util::is_literal_ele(_neighbor_id))
|
||||
{
|
||||
|
|
|
@ -24,10 +24,11 @@ public:
|
|||
|
||||
//static HashFunction hash[HashNum];
|
||||
//must make sure: ENTITY_SIG_LENGTH = EDGE_SIG_LENGTH + STR_SIG_LENGTH
|
||||
static const int STR_SIG_INTERVAL_NUM = 20;
|
||||
//static const int STR_SIG_INTERVAL_NUM = 20;
|
||||
//static const int STR_SIG_INTERVAL_NUM = 16;
|
||||
static const int STR_SIG_INTERVAL_BASE = 10;
|
||||
static const int STR_SIG_LITERAL = STR_SIG_INTERVAL_NUM * STR_SIG_INTERVAL_BASE;
|
||||
//static const int STR_SIG_INTERVAL_BASE = 10;
|
||||
//static const int STR_SIG_LITERAL = STR_SIG_INTERVAL_NUM * STR_SIG_INTERVAL_BASE;
|
||||
static const int STR_SIG_LITERAL = 200;
|
||||
static const int STR_SIG_ENTITY = STR_SIG_LITERAL * 2;
|
||||
//here we divide as entity neighbors and literal neighbors: ENTITY(in and out), LITERAL(only for out edges)
|
||||
static const int STR_SIG_LENGTH = STR_SIG_ENTITY + STR_SIG_LITERAL; //600
|
||||
|
@ -39,12 +40,16 @@ public:
|
|||
|
||||
//str filter is more important in VSTree than predicate, because
|
||||
//a predicate may correspond to a lot of entities and predicate num is usually small
|
||||
static const int EDGE_SIG_INTERVAL_NUM_HALF = 10; //in edge or out edge
|
||||
//static const int EDGE_SIG_INTERVAL_NUM_HALF = 16; //in edge or out edge
|
||||
static const int EDGE_SIG_INTERVAL_NUM_HALF = 10; //in edge or out edge
|
||||
static const int EDGE_SIG_INTERVAL_NUM = 2 * EDGE_SIG_INTERVAL_NUM_HALF;
|
||||
static const int EDGE_SIG_INTERVAL_BASE = 10;
|
||||
static const int EDGE_SIG_LENGTH = EDGE_SIG_INTERVAL_NUM * EDGE_SIG_INTERVAL_BASE; //200
|
||||
//static const int EDGE_SIG_LENGTH2 = EDGE_SIG_INTERVAL_NUM_HALF * EDGE_SIG_INTERVAL_BASE;
|
||||
static const int EDGE_SIG_LENGTH2 = EDGE_SIG_INTERVAL_NUM_HALF * EDGE_SIG_INTERVAL_BASE;
|
||||
|
||||
//static const int EDGE_SIG_IN = 100;
|
||||
//static const int EDGE_SIG_OUT = EDGE_SIG_IN;
|
||||
//static const int EDGE_SIG_LENGTH = EDGE_SIG_IN + EDGE_SIG_OUT; //200
|
||||
|
||||
static const int ENTITY_SIG_LENGTH = STR_SIG_LENGTH + EDGE_SIG_LENGTH; //1000
|
||||
//static const int ENTITY_SIG_LENGTH = STR_SIG_LENGTH + EDGE_SIG_LENGTH + NEIGHBOR_SIG_LENGTH;
|
||||
|
|
Loading…
Reference in New Issue