refactor: to use hash instead of sort for candidates in Join

by zengli
This commit is contained in:
bookug 2017-04-13 20:01:17 +08:00
parent 2c5ef57fe4
commit a0e470727c
3 changed files with 83 additions and 0 deletions

View File

@ -1380,8 +1380,16 @@ Join::filter_before_join()
IDList &can_list = this->basic_query->getCandidateList(i);
//fprintf(stderr, "\t\tsize of canlist before filter: %d\n", can_list.size());
cout << "\t\tsize of canlist before filter: " << can_list.size() << endl;
//NOTICE:must sort before using binary search.
//However, sort-merge may not always be better, because sort() will take too much time if
//the can_list size is large, i.e. > 1000000
can_list.sort();
//vstree ? place on ID?
//TODO: use BoolArray instead of bitset
//n is candidate num, m is sp2o num, then when n<m/(lg2(m)_lg2(n)), sort m and binary search in m
//otherwise, use BoolArray for n, only construct a time
//NOTICE: for parallelism, use a BoolArray for each BGP(either on join or in Strategy)
long begin = Util::get_cur_time();
bool ret = this->constant_edge_filter(i);

View File

@ -1508,3 +1508,11 @@ Util::_pso_cmp(const void* _a, const void* _b)
return 0;
}
//Round _val up to the nearest multiple of _base and return it.
//Requires _base >= 1 (a zero _base would divide by zero).
//The remainder form is used instead of (_val+_base-1)/_base*_base so that the
//intermediate sum cannot wrap around: the result is exact whenever the rounded
//value itself fits in an unsigned.
//NOTE(review): the header declares this as the static member Util::ceiling and the
//neighbouring definitions are Util::-qualified; confirm whether this definition
//should be qualified as well.
unsigned
ceiling(unsigned _val, unsigned _base)
{
	const unsigned remainder = _val % _base;
	if(remainder == 0)
	{
		//already a multiple of _base
		return _val;
	}
	//WARN: if the rounded value does not fit in an unsigned, this addition still wraps
	return _val + (_base - remainder);
}

View File

@ -225,6 +225,7 @@ public:
static std::string getExactPath(const char* path);
static std::string getItemsFromDir(std::string path);
static void logging(std::string _str);
static unsigned ceiling(unsigned _val, unsigned _base);
// Below are some useful hash functions for string
static unsigned simpleHash(const char *_str);
@ -362,5 +363,71 @@ public:
}
};
//NOTICE: bool used to be represented by int in C, but in C++ it only occupies a byte
//But in 32-bit machine, read/write on 32-bit(4-byte) will be more efficient, so bools are compressed into 4-bytes
//vector<bool> is not suggested:)
//http://blog.csdn.net/liushu1231/article/details/8844631
//Owning byte buffer sized to hold a requested number of bits, rounded up to
//a whole number of bytes via Util::ceiling(_size, 8).
//Only storage management lives here; the class has no per-bit accessors yet.
class BoolArray
{
private:
	//capacity in bits, always a multiple of 8
	unsigned size;
	//heap buffer of size/8 bytes, or NULL when nothing is allocated
	char* arr;

	//Copying is disabled (declared, never defined): a member-wise copy would
	//make two objects delete[] the same buffer.
	BoolArray(const BoolArray&);
	BoolArray& operator=(const BoolArray&);

	//Allocate a zero-initialized buffer for _bits bits (_bits is a multiple of 8).
	static char* allocate(unsigned _bits)
	{
		return new char[_bits / 8]();
	}

public:
	//Create an empty array that owns no storage.
	BoolArray()
	{
		size = 0;
		arr = NULL;
	}

	//Create an array able to hold at least _size bits, all zeroed.
	BoolArray(unsigned _size)
	{
		this->size = Util::ceiling(_size, 8);
		this->arr = allocate(this->size);
	}

	//Ensure the array can hold at least _size bits.
	//An existing buffer that is already large enough is left untouched;
	//otherwise a new zeroed buffer replaces it and the old contents are dropped.
	void fill(unsigned _size)
	{
		const unsigned tmp = Util::ceiling(_size, 8);
		if(this->arr != NULL && tmp <= this->size)
		{
			return;
		}
		//allocate first: if new[] throws, the object still owns its old buffer
		char* fresh = allocate(tmp);
		delete[] this->arr;
		this->arr = fresh;
		this->size = tmp;
	}

	//void load()
	//{
	//}

	//Return true when the capacity is non-zero.
	bool exist() const
	{
		return this->size > 0;
	}

	//Return the capacity in bits.
	unsigned getSize() const
	{
		return size;
	}

	//Release the buffer and reset to the empty state.
	void clear()
	{
		this->size = 0;
		delete[] arr;
		arr = NULL;
	}

	//Release the buffer (delete[] on NULL is a no-op).
	~BoolArray()
	{
		delete[] arr;
	}
};
#endif //_UTIL_UTIL_H