refactor: to use hash instead of sort for candidates in Join
by zengli
This commit is contained in:
parent
2c5ef57fe4
commit
a0e470727c
|
@ -1380,8 +1380,16 @@ Join::filter_before_join()
|
|||
IDList &can_list = this->basic_query->getCandidateList(i);
|
||||
//fprintf(stderr, "\t\tsize of canlist before filter: %d\n", can_list.size());
|
||||
cout << "\t\tsize of canlist before filter: " << can_list.size() << endl;
|
||||
|
||||
//NOTICE:must sort before using binary search.
|
||||
//However, sort-merge may not always be better, because sort() will take too much time if
|
||||
//the can_list size is large, i.e. > 1000000
|
||||
can_list.sort();
|
||||
//vstree ? place on ID?
|
||||
//TODO: use BoolArray instead of bitset
|
||||
//n is candidate num, m is sp2o num, then when n<m/(lg2(m)_lg2(n)), sort m and binary search in m
|
||||
//otherwise, use BoolArray for n, only construct a time
|
||||
//NOTICE: for parallelism, use a BoolArray for each BGP(either on join or in Strategy)
|
||||
|
||||
long begin = Util::get_cur_time();
|
||||
bool ret = this->constant_edge_filter(i);
|
||||
|
|
|
@ -1508,3 +1508,11 @@ Util::_pso_cmp(const void* _a, const void* _b)
|
|||
return 0;
|
||||
}
|
||||
|
||||
//Round _val up to the nearest multiple of _base and return that multiple.
//Requires _base >= 1 (a zero _base would divide by zero).
//WARN: if the rounded value itself does not fit in an unsigned, the result
//wraps around; that case is not checked here.
//NOTE(review): Util.h declares this as the static member Util::ceiling; confirm
//whether this definition is meant to be qualified as Util::ceiling.
unsigned
ceiling(unsigned _val, unsigned _base)
{
	//Work from the remainder instead of computing _val+_base-1: that sum can
	//wrap around even when the rounded result is representable
	//(e.g. _val == UINT_MAX with a _base that divides it).
	unsigned rest = _val % _base;
	if(rest == 0)
	{
		//already a multiple of _base
		return _val;
	}
	return _val + (_base - rest);
}
|
||||
|
||||
|
|
67
Util/Util.h
67
Util/Util.h
|
@ -225,6 +225,7 @@ public:
|
|||
static std::string getExactPath(const char* path);
|
||||
static std::string getItemsFromDir(std::string path);
|
||||
static void logging(std::string _str);
|
||||
static unsigned ceiling(unsigned _val, unsigned _base);
|
||||
|
||||
// Below are some useful hash functions for string
|
||||
static unsigned simpleHash(const char *_str);
|
||||
|
@ -362,5 +363,71 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
//NOTICE: bool used to be represented by int in C, but in C++ it only occupies a byte
|
||||
//But in 32-bit machine, read/write on 32-bit(4-byte) will be more efficient, so bools are compressed into 4-bytes
|
||||
//vector<bool> is not suggested:)
|
||||
//http://blog.csdn.net/liushu1231/article/details/8844631
|
||||
class BoolArray
|
||||
{
|
||||
private:
|
||||
unsigned size;
|
||||
char* arr;
|
||||
|
||||
public:
|
||||
BoolArray()
|
||||
{
|
||||
size = 0;
|
||||
arr = NULL;
|
||||
}
|
||||
BoolArray(unsigned _size)
|
||||
{
|
||||
//this->size = (_size+7)/8*8;
|
||||
this->size = Util::ceiling(_size, 8);
|
||||
this->arr = new char[this->size/8];
|
||||
}
|
||||
void fill(unsigned _size)
|
||||
{
|
||||
if(this->arr != NULL)
|
||||
{
|
||||
//unsigned tmp = (_size+7)/8*8;
|
||||
unsigned tmp = Util::ceiling(_size, 8);
|
||||
if(tmp > this->size)
|
||||
{
|
||||
this->size= tmp;
|
||||
delete[] this->arr;
|
||||
this->arr = new char[this->size/8];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
//this->size = (_size+7)/8*8;
|
||||
this->size = Util::ceiling(_size, 8);
|
||||
this->arr = new char[this->size/8];
|
||||
}
|
||||
}
|
||||
//void load()
|
||||
//{
|
||||
//}
|
||||
|
||||
bool exist()
|
||||
{
|
||||
return this->size > 0;
|
||||
}
|
||||
unsigned getSize()
|
||||
{
|
||||
return size;
|
||||
}
|
||||
void clear()
|
||||
{
|
||||
this->size = 0;
|
||||
delete[] arr;
|
||||
arr = NULL;
|
||||
}
|
||||
~BoolArray()
|
||||
{
|
||||
delete[] arr;
|
||||
}
|
||||
};
|
||||
|
||||
#endif //_UTIL_UTIL_H
|
||||
|
||||
|
|
Loading…
Reference in New Issue