217 lines
5.2 KiB
Plaintext
217 lines
5.2 KiB
Plaintext
// Author : Pinaggle
|
|
// Last Modified: 2019/9/22 14:38
|
|
#include <vector>
|
|
#include <iostream>
|
|
#include <iomanip>
|
|
#include <time.h>
|
|
using namespace std;
|
|
|
|
/*
|
|
* this function is used to do multi-join on GPU, the lists to be joined is recorded
|
|
* one by one in arr[],and the start position of each list is recorded in head[].
|
|
* table saves which lists are to be intersected. It's size is join_length*item_size.
|
|
* Each column of table represent a multi-way join, so we hacve item_size results at
|
|
* last.
|
|
*/
|
|
/*
|
|
void disp(const char* name,unsigned *arr,unsigned length,unsigned line)
|
|
{
|
|
cout << name << endl;
|
|
for(unsigned i = 0;i < length;i++)
|
|
{
|
|
cout <<arr[i] << ' ';
|
|
if(i % line == line-1)
|
|
cout << endl;
|
|
}
|
|
cout << endl;
|
|
}
|
|
*/
|
|
void start_clock(struct timespec &start)
|
|
{
|
|
clock_gettime(CLOCK_MONOTONIC,&start);
|
|
}
|
|
|
|
void gettime_and_reset_clock(struct timespec &start, const char* info)
|
|
{
|
|
struct timespec cnt;
|
|
clock_gettime(CLOCK_MONOTONIC,&cnt);
|
|
double period = 1000*(cnt.tv_sec - start.tv_sec)
|
|
+ 0.000001*(cnt.tv_nsec - start.tv_nsec);
|
|
cout << info << " used " << setiosflags(ios::fixed)
|
|
<< setprecision(2) << period <<"ms"<<endl;
|
|
start = cnt;
|
|
}
|
|
void gettime(struct timespec &start, const char* info)
|
|
{
|
|
|
|
struct timespec cnt;
|
|
clock_gettime(CLOCK_MONOTONIC,&cnt);
|
|
double period = 1000*(cnt.tv_sec - start.tv_sec)
|
|
+ 0.000001*(cnt.tv_nsec - start.tv_nsec);
|
|
cout << info << " used " << setiosflags(ios::fixed)
|
|
<< setprecision(2) << period <<"ms"<<endl;
|
|
}
|
|
|
|
void cudaInit()
|
|
{
|
|
unsigned *d_init;
|
|
cout << "CudaInitFinished" << endl;
|
|
cudaMalloc(&d_init,sizeof(unsigned));
|
|
cudaFree(d_init);
|
|
}
|
|
__global__ void binarySearch(unsigned *d_head,unsigned *d_arr,unsigned join_width,
|
|
unsigned table_length,unsigned *d_can_list,unsigned can_list_size)
|
|
{
|
|
unsigned idx = blockDim.x * blockIdx.x + threadIdx.x;
|
|
unsigned gridSize = gridDim.x*blockDim.x;
|
|
|
|
while(idx < d_head[table_length])
|
|
{
|
|
unsigned key = d_arr[idx];
|
|
unsigned l = 0,r = table_length,m,i;
|
|
while(l+1 < r)
|
|
{
|
|
m = (l+r)/2;
|
|
if(d_head[m] <= idx)
|
|
l = m;
|
|
else
|
|
r = m;
|
|
}
|
|
i = l;
|
|
bool found = true;
|
|
for(unsigned j = 1;j < join_width;j++)
|
|
{
|
|
l = d_head[j*table_length+i];
|
|
r = d_head[j*table_length+i+1];
|
|
while(l+1 < r)
|
|
{
|
|
m = (l+r)/2;
|
|
if(d_arr[m] <= key)
|
|
l = m;
|
|
else
|
|
r = m;
|
|
}
|
|
if((l == r)||(d_arr[l] != key))
|
|
{
|
|
found = false;
|
|
break;
|
|
}
|
|
}
|
|
if(found && (d_can_list != NULL))
|
|
{
|
|
l = 0;
|
|
r = can_list_size;
|
|
while(l+1 < r)
|
|
{
|
|
m = (l+r)/2;
|
|
if(d_can_list[m] <= key)
|
|
l = m;
|
|
else
|
|
r = m;
|
|
}
|
|
if((l == r)||(d_can_list[l] != key))
|
|
found = false;
|
|
}
|
|
if(!found)
|
|
d_arr[idx] = -1;
|
|
idx += gridSize;
|
|
}
|
|
|
|
}
|
|
|
|
void calc(unsigned *head,unsigned *arr, unsigned join_width,unsigned table_length,
|
|
unsigned item_num,unsigned *can_list,unsigned can_list_size)
|
|
{
|
|
bool use_gpu = true;
|
|
if(use_gpu)
|
|
{
|
|
struct timespec t;
|
|
start_clock(t);
|
|
unsigned *d_head, *d_arr, *d_can_list = NULL;
|
|
cudaMalloc(&d_head, sizeof(unsigned)*(join_width*table_length+1));
|
|
cudaMalloc(&d_arr, sizeof(unsigned)*item_num);
|
|
if(can_list != NULL)
|
|
cudaMalloc(&d_can_list, sizeof(unsigned)*can_list_size);
|
|
gettime_and_reset_clock(t,"cudaMalloc");
|
|
cudaMemcpy(d_head, head, sizeof(unsigned)*(join_width*table_length+1), cudaMemcpyHostToDevice);
|
|
cudaMemcpy(d_arr, arr, sizeof(unsigned)*item_num, cudaMemcpyHostToDevice);
|
|
if(can_list != NULL)
|
|
cudaMemcpy(d_can_list, can_list, sizeof(unsigned)*can_list_size, cudaMemcpyHostToDevice);
|
|
gettime_and_reset_clock(t,"cudaMemcpy");
|
|
int threadsPerBlock = 256;
|
|
int blocksPerGrid = (head[table_length] + threadsPerBlock - 1) / threadsPerBlock;
|
|
if(blocksPerGrid > 1024)
|
|
blocksPerGrid = 1024;
|
|
cudaEvent_t start,stop;
|
|
float period;
|
|
cudaEventCreate(&start);
|
|
cudaEventCreate(&stop);
|
|
cudaEventRecord(start,0);
|
|
binarySearch <<<blocksPerGrid, threadsPerBlock >>> (d_head,d_arr,
|
|
join_width,table_length,d_can_list,can_list_size);
|
|
cudaEventRecord(stop,0);
|
|
cudaEventSynchronize(stop);
|
|
cudaEventElapsedTime(&period,start,stop);
|
|
printf("****Kernel Execution time: %.2f ms\n",period);
|
|
gettime_and_reset_clock(t,"kernel execution");
|
|
cudaMemcpy(arr, d_arr, sizeof(unsigned)*head[table_length], cudaMemcpyDeviceToHost);
|
|
gettime_and_reset_clock(t,"cudaMemcpy");
|
|
cudaFree(d_head);
|
|
cudaFree(d_arr);
|
|
if(can_list != NULL)
|
|
cudaFree(d_can_list);
|
|
gettime_and_reset_clock(t,"cudaFree");
|
|
}
|
|
else
|
|
{
|
|
CPUMODE:
|
|
cout << "enter CPU mode" << endl;
|
|
struct timespec start;
|
|
for (unsigned i = 0; i < table_length; i++)
|
|
for(unsigned idx = head[i];idx < head[i+1];idx++)
|
|
{
|
|
unsigned key = arr[idx];
|
|
unsigned l,r,m;
|
|
bool found = true;
|
|
for (unsigned j = 1; j < join_width; j++)
|
|
{
|
|
l = head[j*table_length+i];
|
|
r = head[j*table_length+i+1];
|
|
while (l + 1 < r)
|
|
{
|
|
m = (l + r) / 2;
|
|
if (arr[m] <= key)
|
|
l = m;
|
|
else
|
|
r = m;
|
|
}
|
|
if ((l == r) || (arr[l] != key))
|
|
{
|
|
found = false;
|
|
break;
|
|
}
|
|
}
|
|
if(found && can_list != NULL)
|
|
{
|
|
l = 0;
|
|
r = can_list_size;
|
|
while (l + 1 < r)
|
|
{
|
|
m = (l + r) / 2;
|
|
if (can_list[m] <= key)
|
|
l = m;
|
|
else
|
|
r = m;
|
|
}
|
|
if ((l == r) || (can_list[l] != key))
|
|
{
|
|
found = false;
|
|
}
|
|
}
|
|
if(!found)
|
|
arr[idx] = -1;
|
|
}
|
|
}
|
|
}
|
|
|