gStore/Signature/Signature.cpp

389 lines
9.1 KiB
C++

/*
* Signature.cpp
*
* Created on: 2014-6-20
* Author: liyouhuan
* Implemented on: 2014-6-29
* Author: hanshuo
*/
#include "Signature.h"
#include "../Query/BasicQuery.h"
/*
 * Render the positions of all set bits of an entity bitset as text.
 *
 * Every set bit i is written as "[i] " in ascending order of i. If no bit is
 * set, the word "empty" followed by a newline is written instead. In both
 * cases one more newline terminates the returned string.
 */
std::string Signature::BitSet2str(const EntityBitSet& _bitset)
{
    std::stringstream text;
    bool found_set_bit = false;

    for (unsigned idx = 0; idx < _bitset.size(); ++idx)
    {
        if (!_bitset.test(idx))
        {
            continue;
        }
        text << "[" << idx << "] ";
        found_set_bit = true;
    }

    if (!found_set_bit)
    {
        text << "empty" << '\n';
    }
    text << '\n';

    return text.str();
}
/* for Signature */
/*
 * Set the bit(s) that represent predicate _pre_id inside an entity signature.
 *
 * Every position computed here is shifted by Signature::STR_SIG_LENGTH.
 *
 * _pre_id    : id of the predicate to encode.
 * _entity_bs : bitset that receives the bit(s).
 * _type      : edge direction. Only used when PREDICATE_ENCODE_METHOD != 0:
 *              for BasicQuery::EDGE_OUT the hash seed is shifted by 101, so
 *              the same predicate hashes differently from any other _type.
 *
 * PREDICATE_ENCODE_METHOD == 0 sets one bit derived directly from the id.
 * Any other value sets one bit per (multiplier, modulus) prime pair.
 */
void Signature::encodePredicate2Entity(int _pre_id, EntityBitSet& _entity_bs, const char _type)
{
    if (Signature::PREDICATE_ENCODE_METHOD == 0)
    {
        int pos = ( (_pre_id+10) % Signature::EDGE_SIG_LENGTH ) + Signature::STR_SIG_LENGTH;
        _entity_bs.set(pos);
        return;
    }

    // Widen to 64 bits before any arithmetic: id * 5011 does not fit in an int
    // once the id grows beyond roughly 428k, and signed overflow is undefined.
    long long seed_preid = _pre_id;
    if (_type == BasicQuery::EDGE_OUT)
    {
        seed_preid += 101;
    }

    // how to hash the predicate id to signature(bitset) better?
    // more ones in the bitset (more primes) means fewer conflicts, but also weakens the filtration of VSTree.
    // when the data set is big enough, cutting down the size of the candidate list should be the primary consideration,
    // so we should not encode too many ones in entities' signatures.
    // also, when the data set is small, hash conflicts can hardly happen.
    // therefore, using 2 primes (setting two ones in the bitset) is considered enough.
    // --by hanshuo.
    //
    // Earlier 5-prime configuration, kept for reference:
    //   prime1 = {5003,5009,5011,5021,5023}
    //   prime2 = {49943,49957,49991,49993,49999}
    const int primeSize = 2;
    const long long prime1[] = {5003, 5011};
    const long long prime2[] = {49957, 49993};
    for (int i = 0; i < primeSize; i++)
    {
        // The remainder is smaller in magnitude than prime2[i], so it fits in an int again.
        const int seed = static_cast<int>(seed_preid * prime1[i] % prime2[i]);
        const int pos = (seed % Signature::EDGE_SIG_LENGTH) + Signature::STR_SIG_LENGTH;
        _entity_bs.set(pos);
    }
}
/*
 * Set the bit(s) that represent predicate _pre_id inside an edge signature.
 *
 * _pre_id  : id of the predicate to encode.
 * _edge_bs : bitset that receives the bit(s).
 *
 * PREDICATE_ENCODE_METHOD == 0 sets the single bit (_pre_id + 10) % EDGE_SIG_LENGTH.
 * Any other value sets one bit per (multiplier, modulus) prime pair.
 * Unlike the entity variant, positions are not shifted by STR_SIG_LENGTH.
 */
void Signature::encodePredicate2Edge(int _pre_id, EdgeBitSet& _edge_bs)
{
    if (Signature::PREDICATE_ENCODE_METHOD == 0)
    {
        int pos = (_pre_id + 10) % Signature::EDGE_SIG_LENGTH;
        _edge_bs.set(pos);
        return;
    }

    // Earlier 5-prime configuration, kept for reference:
    //   prime1 = {5003,5009,5011,5021,5023}
    //   prime2 = {49943,49957,49991,49993,49999}
    const int primeSize = 2;
    const long long prime1[] = {5003, 5011};
    const long long prime2[] = {49957, 49993};

    // Widen to 64 bits before multiplying: id * 5011 does not fit in an int
    // once the id grows beyond roughly 428k, and signed overflow is undefined.
    const long long wide_id = _pre_id;
    for (int i = 0; i < primeSize; i++)
    {
        // The remainder is smaller in magnitude than prime2[i], so it fits in an int again.
        const int seed = static_cast<int>(wide_id * prime1[i] % prime2[i]);
        const int pos = seed % Signature::EDGE_SIG_LENGTH;
        _edge_bs.set(pos);
    }
}
/*
 * Encode a string into the string part of an entity signature.
 *
 * _str       : subject, object or literal text. A null pointer or a string
 *              starting with '?' is ignored and leaves the bitset untouched.
 * _entity_bs : bitset that receives the bits.
 *
 * Four hashes (simpleHash, RSHash, JSHash, PJWHash) are each reduced modulo
 * Signature::STR_SIG_LENGTH and the resulting bit is set. The empty string is
 * hashed like any other string.
 */
void Signature::encodeStr2Entity(const char* _str, EntityBitSet& _entity_bs) //_str is subject or object or literal
{
    if (!_str || _str[0] == '?')
    {
        return;
    }

    // The hash functions only read their argument, so the caller's buffer is
    // hashed directly; no temporary copy is needed.
    //
    // the same consideration as encodePredicate2Entity:
    // we should not set too many ones in entities' signatures.
    // ELFHash, SDBMHash, DJBHash, APHash and BKDRHash used to be applied here
    // as well and are intentionally left out.
    unsigned int pos = Signature::simpleHash(_str) % Signature::STR_SIG_LENGTH;
    _entity_bs.set(pos);

    pos = Signature::RSHash(_str) % Signature::STR_SIG_LENGTH;
    _entity_bs.set(pos);

    pos = Signature::JSHash(_str) % Signature::STR_SIG_LENGTH;
    _entity_bs.set(pos);

    pos = Signature::PJWHash(_str) % Signature::STR_SIG_LENGTH;
    _entity_bs.set(pos);
}
/*
 * Placeholder: encoding a string id into an entity signature is not
 * implemented yet. The call currently leaves _entity_bs unchanged.
 */
void Signature::encodeStrID2Entity(int _str_id, EntityBitSet& _entity_bs)
{
    // to be implemented
    (void)_str_id;
    (void)_entity_bs;
}
/*
 * Placeholder: not implemented yet. Always returns 0, whatever _str is.
 */
unsigned int Signature::hash(const char* _str)
{
    // to be implemented
    (void)_str;
    return 0;
}
/* some string hash functions */
/*
 * BKDR string hash: acc = acc * 131 + byte for every byte up to the
 * terminating NUL. The top bit of the result is cleared before returning.
 *
 * Bytes are added as plain char, so for bytes >= 0x80 the value depends on
 * whether char is signed on the target platform.
 */
unsigned int Signature::BKDRHash(const char *_str)
{
    const unsigned int multiplier = 131; // 31 131 1313 13131 131313 etc..
    unsigned int acc = 0;
    for (const char* cur = _str; *cur != '\0'; ++cur)
    {
        const char ch = *cur;
        acc = acc * multiplier + ch;
    }
    return acc & 0x7FFFFFFF;
}
/*
 * Multiplicative string hash: acc = 31 * acc + byte, with every byte read as
 * unsigned char, up to the terminating NUL. The top bit of the result is
 * cleared before returning.
 */
unsigned int Signature::simpleHash(const char *_str)
{
    const unsigned char* cur = reinterpret_cast<const unsigned char*>(_str);
    unsigned int acc = 0;
    while (*cur != 0)
    {
        acc = 31 * acc + *cur;
        ++cur;
    }
    return acc & 0x7FFFFFFF;
}
/*
 * RS string hash: acc = acc * a + byte, where the multiplier a starts at
 * 63689 and is itself multiplied by 378551 after every byte. The top bit of
 * the result is cleared before returning.
 *
 * Bytes are added as plain char, so for bytes >= 0x80 the value depends on
 * whether char is signed on the target platform.
 */
unsigned int Signature::RSHash(const char *_str)
{
    const unsigned int multiplier_step = 378551;
    unsigned int multiplier = 63689;
    unsigned int acc = 0;
    for (const char* cur = _str; *cur != '\0'; ++cur)
    {
        const char ch = *cur;
        acc = acc * multiplier + ch;
        multiplier *= multiplier_step;
    }
    return acc & 0x7FFFFFFF;
}
/*
 * JS string hash: starting from 1315423911, every byte is mixed in with
 * acc ^= (acc << 5) + byte + (acc >> 2). The top bit of the result is
 * cleared before returning.
 *
 * Bytes are added as plain char, so for bytes >= 0x80 the value depends on
 * whether char is signed on the target platform.
 */
unsigned int Signature::JSHash(const char *_str)
{
    unsigned int acc = 1315423911;
    for (const char* cur = _str; *cur != '\0'; ++cur)
    {
        const char ch = *cur;
        const unsigned int mixed = (acc << 5) + ch + (acc >> 2);
        acc ^= mixed;
    }
    return acc & 0x7FFFFFFF;
}
/*
 * PJW string hash. For every byte the accumulator is shifted left by one
 * eighth of the width of unsigned int and the byte is added. Whenever any of
 * the topmost bits (one eighth of the width) become set, they are folded
 * back in with a right shift of three quarters of the width and then cleared.
 * The top bit of the result is cleared before returning.
 *
 * Bytes are added as plain char, so for bytes >= 0x80 the value depends on
 * whether char is signed on the target platform.
 */
unsigned int Signature::PJWHash(const char *_str)
{
    const unsigned int total_bits = static_cast<unsigned int>(sizeof(unsigned int) * 8);
    const unsigned int fold_shift = static_cast<unsigned int>((total_bits * 3) / 4);
    const unsigned int step_shift = static_cast<unsigned int>(total_bits / 8);
    const unsigned int top_mask = static_cast<unsigned int>(0xFFFFFFFF) << (total_bits - step_shift);

    unsigned int acc = 0;
    for (const char* cur = _str; *cur != '\0'; ++cur)
    {
        const char ch = *cur;
        acc = (acc << step_shift) + ch;

        const unsigned int spilled = acc & top_mask;
        if (spilled != 0)
        {
            acc = (acc ^ (spilled >> fold_shift)) & (~top_mask);
        }
    }
    return acc & 0x7FFFFFFF;
}
/*
 * ELF string hash. For every byte the accumulator is shifted left by 4 and
 * the byte is added. If any of the bits in mask 0xF0000000 are set, they are
 * XOR-ed back in shifted right by 24 and then cleared. The top bit of the
 * result is cleared before returning.
 *
 * Bytes are added as plain char, so for bytes >= 0x80 the value depends on
 * whether char is signed on the target platform.
 */
unsigned int Signature::ELFHash(const char *_str)
{
    unsigned int acc = 0;
    for (const char* cur = _str; *cur != '\0'; ++cur)
    {
        const char ch = *cur;
        acc = (acc << 4) + ch;

        const unsigned int high_nibble = acc & 0xF0000000L;
        if (high_nibble != 0)
        {
            acc ^= (high_nibble >> 24);
            acc &= ~high_nibble;
        }
    }
    return acc & 0x7FFFFFFF;
}
/*
 * SDBM string hash: acc = byte + (acc << 6) + (acc << 16) - acc for every
 * byte up to the terminating NUL. The top bit of the result is cleared
 * before returning.
 *
 * Bytes are added as plain char, so for bytes >= 0x80 the value depends on
 * whether char is signed on the target platform.
 */
unsigned int Signature::SDBMHash(const char *_str)
{
    unsigned int acc = 0;
    for (const char* cur = _str; *cur != '\0'; ++cur)
    {
        const char ch = *cur;
        acc = ch + (acc << 6) + (acc << 16) - acc;
    }
    return acc & 0x7FFFFFFF;
}
/*
 * DJB string hash: starting from 5381, every byte updates the accumulator
 * with acc += (acc << 5) + byte. The top bit of the result is cleared before
 * returning.
 *
 * Bytes are added as plain char, so for bytes >= 0x80 the value depends on
 * whether char is signed on the target platform.
 */
unsigned int Signature::DJBHash(const char *_str)
{
    unsigned int acc = 5381;
    for (const char* cur = _str; *cur != '\0'; ++cur)
    {
        const char ch = *cur;
        const unsigned int increment = (acc << 5) + ch;
        acc += increment;
    }
    return acc & 0x7FFFFFFF;
}
/*
 * AP string hash. Bytes at even positions (0, 2, ...) are mixed in with
 * acc ^= (acc << 7) ^ byte ^ (acc >> 3); bytes at odd positions with
 * acc ^= ~((acc << 11) ^ byte ^ (acc >> 5)). The top bit of the result is
 * cleared before returning.
 *
 * Bytes are combined as plain char, so for bytes >= 0x80 the value depends
 * on whether char is signed on the target platform.
 */
unsigned int Signature::APHash(const char *_str)
{
    unsigned int acc = 0;
    bool even_position = true;
    for (const char* cur = _str; *cur != '\0'; ++cur)
    {
        const char ch = *cur;
        if (even_position)
        {
            acc ^= ((acc << 7) ^ ch ^ (acc >> 3));
        }
        else
        {
            acc ^= (~((acc << 11) ^ ch ^ (acc >> 5)));
        }
        even_position = !even_position;
    }
    return acc & 0x7FFFFFFF;
}
/* for ENTITYsig */
/* Build an entity signature with every bit cleared. */
EntitySig::EntitySig()
{
    entityBitSet.reset();
}
/*
 * Build an entity signature holding the same bits as *_p_sig.
 * _p_sig is dereferenced without a check and therefore must not be null.
 */
EntitySig::EntitySig(const EntitySig* _p_sig)
{
    const EntityBitSet& source_bits = _p_sig->entityBitSet;
    entityBitSet.reset();
    entityBitSet |= source_bits;
}
/* Copy constructor: start from all bits cleared, then OR in the bits of _sig. */
EntitySig::EntitySig(const EntitySig& _sig)
{
    const EntityBitSet& source_bits = _sig.entityBitSet;
    entityBitSet.reset();
    entityBitSet |= source_bits;
}
/* Build an entity signature from a raw bitset: clear all bits, then OR in _bitset. */
EntitySig::EntitySig(const EntityBitSet& _bitset)
{
    EntityBitSet& own_bits = entityBitSet;
    own_bits.reset();
    own_bits |= _bitset;
}
/* OR the bits of _sig into this signature and return this signature. */
EntitySig& EntitySig::operator|=(const EntitySig& _sig)
{
    const EntityBitSet& other_bits = _sig.entityBitSet;
    entityBitSet |= other_bits;
    return *this;
}
/* Two entity signatures are equal when their bitsets compare equal. */
bool EntitySig::operator==(const EntitySig& _sig)const
{
    const bool same_bits = (entityBitSet == _sig.entityBitSet);
    return same_bits;
}
/* Two entity signatures differ when their bitsets compare unequal. */
bool EntitySig::operator!=(const EntitySig& _sig)const
{
    const bool different_bits = (entityBitSet != _sig.entityBitSet);
    return different_bits;
}
/*
 * Copy assignment: make this signature hold exactly the bits of _sig.
 *
 * Self-assignment is a no-op. Without the guard the bitset would be cleared
 * first and then OR-ed with its own (now empty) contents, losing every bit.
 *
 * Returns a reference to this signature.
 */
EntitySig& EntitySig::operator=(const EntitySig& _sig)
{
    if (this != &_sig)
    {
        this->entityBitSet.reset();
        this->entityBitSet |= _sig.getBitset();
    }
    return *this;
}
/* Read-only access to the bitset stored in this signature. */
const EntityBitSet & EntitySig::getBitset()const
{
    const EntityBitSet& bits = entityBitSet;
    return bits;
}
/* for EDGEsig */
/* Build an edge signature with every bit cleared. */
EdgeSig::EdgeSig()
{
    edgeBitSet.reset();
}
/*
 * Build an edge signature holding the same bits as *_p_sig.
 * _p_sig is dereferenced without a check and therefore must not be null.
 */
EdgeSig::EdgeSig(const EdgeSig* _p_sig)
{
    const EdgeBitSet& source_bits = _p_sig->edgeBitSet;
    edgeBitSet.reset();
    edgeBitSet |= source_bits;
}
/* Copy constructor: start from all bits cleared, then OR in the bits of _sig. */
EdgeSig::EdgeSig(const EdgeSig& _sig)
{
    const EdgeBitSet& source_bits = _sig.edgeBitSet;
    edgeBitSet.reset();
    edgeBitSet |= source_bits;
}
/* Build an edge signature from a raw bitset: clear all bits, then OR in _bitset. */
EdgeSig::EdgeSig(const EdgeBitSet& _bitset)
{
    EdgeBitSet& own_bits = edgeBitSet;
    own_bits.reset();
    own_bits |= _bitset;
}
/* OR the bits of _sig into this signature and return this signature. */
EdgeSig& EdgeSig::operator|=(const EdgeSig& _sig)
{
    const EdgeBitSet& other_bits = _sig.edgeBitSet;
    edgeBitSet |= other_bits;
    return *this;
}