239 lines
7.4 KiB
C
239 lines
7.4 KiB
C
|
/*=============================================================================
|
||
|
# Filename: Join.h
|
||
|
# Author: Bookug Lobert
|
||
|
# Mail: 1181955272@qq.com
|
||
|
# Last Modified: 2015-12-13 16:05
|
||
|
# Description: design join strategies and select/cost modules
|
||
|
=============================================================================*/
|
||
|
|
||
|
#ifndef _JOIN_JOIN_H
|
||
|
#define _JOIN_JOIN_H
|
||
|
|
||
|
#include "../Query/IDList.h"
|
||
|
#include "../Query/BasicQuery.h"
|
||
|
#include "../Query/SPARQLquery.h"
|
||
|
#include "../KVstore/KVstore.h"
|
||
|
#include "../Util/Util.h"
|
||
|
|
||
|
//BETTER?: place multi_join and index_join in separate files
|
||
|
|
||
|
//Container aliases shared by the join code.
//RecordType is one row of int values (presumably entity/literal ids -- confirm
//against Join.cpp); TableType is a linked list of such rows, so rows can be
//appended or erased without invalidating iterators to the other rows.
typedef std::vector<int> RecordType;
typedef RecordType::iterator RecordIterator;
typedef std::list<RecordType> TableType;
typedef TableType::iterator TableIterator;
typedef TableType::reverse_iterator TableReverseIterator;
//typedef list< vector<int> > TableType;
//typedef list< vector<int> >::iterator TableIterator;
//typedef list< vector<int> >::reverse_iterator TableReverseIterator;

//IdLists holds raw int* arrays and IdListsLen holds ints in the same
//two-level layout (presumably the length of each array at the matching
//position -- TODO confirm against acquire_all_id_lists).
typedef std::vector< std::vector<int*> > IdLists;
typedef std::vector< std::vector<int> > IdListsLen;
|
||
|
|
||
|
//One element of an IndexList: a candidate value plus the per-neighbor
//bookkeeping (travel/check) attached to it.
struct IndexItem
{
    int value;      //-1 when default-constructed, otherwise the value given to the constructor
    bool isValid;   //needed for final travelling; false only for default-constructed items

    //NOTICE: the size of vector is expected to be small
    //the order in vector must be same as in IndexList vector
    //travel[i] is a list of iterators pointing at items of another item list
    std::vector< std::list< std::list<struct IndexItem>::iterator > > travel;
    //check[i] is a set of int values used for verification
    std::vector< std::set<int> > check;
    //map< int, list < list<struct IndexItem>::iterator > > links;  //direct next index list id and linking
    //map< int, set <int> > check;  //indirect previous index list id and verifying

    //Build an invalid placeholder item (value -1, isValid false).
    IndexItem() : value(-1), isValid(false)
    {
    }

    //Build a valid item holding _val.
    //Deliberately left non-explicit so existing implicit int -> IndexItem
    //conversions in callers keep compiling.
    IndexItem(int _val) : value(_val), isValid(true)
    {
    }
};
|
||
|
|
||
|
typedef struct IndexList
|
||
|
{
|
||
|
//int next;
|
||
|
//NOTICE:the list should be ordered at the beginning
|
||
|
list<IndexItem> candidates;
|
||
|
list<IndexItem>::iterator border; //used to divide valid and invalid area
|
||
|
bool prepared; //find and set all invalid eles restricted by subtree in travelling
|
||
|
int position; //current neighbor to travel
|
||
|
vector<int> travel_map; //the mapping between links position and IndexList id
|
||
|
vector<int> check_map; //the mapping between check position and IndexList id
|
||
|
IndexList()
|
||
|
{
|
||
|
//this->next = -1;
|
||
|
this->prepared = false;
|
||
|
this->position = 0;
|
||
|
}
|
||
|
bool end()
|
||
|
{
|
||
|
return this->position == (int)this->travel_map.size();
|
||
|
}
|
||
|
int next()
|
||
|
{
|
||
|
return this->travel_map[this->position++];
|
||
|
}
|
||
|
//NOTICE:we can not use binary-search in list, but this search method maybe slow
|
||
|
//BETTER?:adjust the list to binary-tree or other struture?
|
||
|
list<IndexItem>::iterator search(int _val)
|
||
|
{
|
||
|
for(list<IndexItem>::iterator it = this->candidates.begin(); it != this->border; ++it)
|
||
|
{
|
||
|
if(it->value == _val)
|
||
|
return it;
|
||
|
}
|
||
|
return this->border;
|
||
|
}
|
||
|
}IndexList;
|
||
|
|
||
|
//Plain record bundling an id with a borrowed int array and its length.
//The constructor stores the pointer as given (no copy is made), so the array
//must outlive the Satellite; who frees it is not visible in this header.
struct Satellite
{
    int id;
    int* idlist;        //not copied; points at the caller's array
    int idlist_len;     //number of ints passed alongside idlist

    Satellite(int _id, int* _idlist, int _idlist_len)
        : id(_id), idlist(_idlist), idlist_len(_idlist_len)
    {
    }
};
|
||
|
|
||
|
typedef list<IndexItem> ItemList;
|
||
|
typedef list<IndexItem>::iterator ItemListIterator;
|
||
|
typedef list< list<struct IndexItem>::iterator > IteratorList;
|
||
|
|
||
|
//Database is expected to create (new) a Join and pass in resources such as the kvstore
|
||
|
class Join
|
||
|
{
|
||
|
private:
|
||
|
int start_id;
|
||
|
int var_num;
|
||
|
//bool* dealed_triple;
|
||
|
BasicQuery* basic_query;
|
||
|
KVstore* kvstore;
|
||
|
//used by score_node for parameters
|
||
|
static const unsigned PARAM_DEGREE = 1;
|
||
|
static const unsigned PARAM_SIZE = 100000;
|
||
|
static const unsigned PARAM_DENSE = 1;
|
||
|
static const double JUDGE_LIMIT = 0.5;
|
||
|
static const int LIMIT_CANDIDATE_LIST_SIZE = 1000;
|
||
|
//BETTER?:predefine size to avoid copy cost
|
||
|
TableType current_table;
|
||
|
TableIterator new_start; //keep to end() as default
|
||
|
//list<bool> table_row_new;
|
||
|
|
||
|
//keep the mapping for disordered ids in vector<int> table
|
||
|
int* id2pos;
|
||
|
int id_pos; //the num of id put into id2pos currently
|
||
|
int* pos2id;
|
||
|
bool* dealed_triple;
|
||
|
stack<int> mystack;
|
||
|
|
||
|
vector<int*>* result_list;
|
||
|
vector<Satellite> satellites;
|
||
|
int* record;
|
||
|
int record_len;
|
||
|
|
||
|
void init(BasicQuery* _basic_query);
|
||
|
void clear();
|
||
|
void add_id_pos_mapping(int _id);
|
||
|
void reset_id_pos_mapping();
|
||
|
|
||
|
//judge which method should be used according to
|
||
|
//the size of candidates and structure of quering graph
|
||
|
int judge(int _smallest, int _biggest);
|
||
|
|
||
|
//select the start point and search order
|
||
|
void select();
|
||
|
|
||
|
//score the cost to link two tables and the efficience
|
||
|
//of filtering
|
||
|
//int score(List1, List2);
|
||
|
|
||
|
//score the node according to degree and size
|
||
|
double score_node(unsigned _degree, unsigned _size);
|
||
|
|
||
|
void toStartJoin();
|
||
|
|
||
|
bool filter_before_join();
|
||
|
bool constant_edge_filter(int _var_i);
|
||
|
void preid_filter(int _var_i);
|
||
|
bool only_pre_filter_after_join();
|
||
|
void add_literal_candidate();
|
||
|
bool pre_var_handler();
|
||
|
//bool filterBySatellites(int _var, int _ele);
|
||
|
bool filterBySatellites(int _var);
|
||
|
bool allFilterByPres();
|
||
|
void generateAllSatellites();
|
||
|
void cartesian(int pos, int end);
|
||
|
|
||
|
//functions for help
|
||
|
//copy/add to the end of current_table and set true
|
||
|
void add_new_to_results(TableIterator it, int id);
|
||
|
|
||
|
//void set_results_old(list<bool>::iterator it);
|
||
|
int choose_next_node(int id);
|
||
|
|
||
|
bool is_literal_var(int id);
|
||
|
bool is_literal_ele(int _id);
|
||
|
|
||
|
void copyToResult();
|
||
|
|
||
|
//BETTER?:change these params to members in class
|
||
|
void acquire_all_id_lists(IdLists& _id_lists, IdListsLen& _id_lists_len, IDList& _can_list, vector<int>& _edges, int _id, int _can_list_size);
|
||
|
bool if_prepare_idlist(int _can_list_size, bool _is_literal);
|
||
|
bool new_join_with_multi_vars_prepared(IdLists& _id_lists, IdListsLen& _id_lists_len, vector<int>& _edges, IDList& _can_list, int _can_list_size);
|
||
|
bool new_join_with_multi_vars_not_prepared(vector<int>& _edges, IDList& _can_list, int _can_list_size, int _id, bool _is_literal);
|
||
|
|
||
|
bool multi_join();
|
||
|
|
||
|
|
||
|
|
||
|
//================================================================================================
|
||
|
//The index join method saves the memory cost because 2m+2mn < 3mn,
|
||
|
//and time may be reduced if the pre-process is not too costly
|
||
|
//because we can reuse the links other than recompute in temporal table
|
||
|
//New struct is needed for node, i.e. list<bool, int, list<iterator> >,
|
||
|
//because we may have to delete, but how can we know if an iterator
|
||
|
//is valid if the one it points to is removed?(remove if the other is removed; using end())
|
||
|
//1. based on edges: process each time only in valid area(already
|
||
|
//macthed with others, invalid is removed), and finally it must be
|
||
|
//all ok, just copy to result_list. We should select the edge order
|
||
|
//to better the efficiency, but how can we keep only a neighbor links
|
||
|
//set if we want to save memory?(ensure all can be linked later)
|
||
|
//2. based on points: search deeply like multi-index-join, only a
|
||
|
//neighbor links set is kept for a node(not every edge), so memory
|
||
|
//cost is low. Finally, travel around along valid iterator, copy...
|
||
|
|
||
|
IndexList* index_lists;
|
||
|
|
||
|
void buildIndexLists();
|
||
|
bool travel_init(int _lid);
|
||
|
|
||
|
bool index_link(int _nid, int _idx);
|
||
|
bool index_filter(int _nid, int _idx);
|
||
|
bool table_travel(int _id1, int _id2);
|
||
|
bool table_check(int _id1, int _id2);
|
||
|
|
||
|
bool index_travel_one();
|
||
|
bool index_travel_two();
|
||
|
bool index_travel();
|
||
|
bool index_join();
|
||
|
|
||
|
//NOTICE:this is only used to join a BasicQuery
|
||
|
bool join();
|
||
|
|
||
|
public:
|
||
|
Join();
|
||
|
Join(KVstore* _kvstore);
|
||
|
//these functions can be called by Database
|
||
|
bool join_sparql(SPARQLquery& _sparql_query);
|
||
|
bool join_basic(BasicQuery* _basic_query);
|
||
|
~Join();
|
||
|
};
|
||
|
|
||
|
#endif //_JOIN_JOIN_H
|
||
|
|