/*============================================================================= # Filename: Join.h # Author: Bookug Lobert # Mail: 1181955272@qq.com # Last Modified: 2015-12-13 16:05 # Description: design join strategies and select/cost modules =============================================================================*/ #ifndef _JOIN_JOIN_H #define _JOIN_JOIN_H #include "../Query/IDList.h" #include "../Query/BasicQuery.h" #include "../Query/SPARQLquery.h" #include "../KVstore/KVstore.h" #include "../Util/Util.h" //BETTER?:place multi_join and index_join in separated files typedef vector RecordType; typedef vector::iterator RecordIterator; typedef list TableType; typedef list::iterator TableIterator; typedef list::reverse_iterator TableReverseIterator; //typedef list< vector > TableType; //typedef list< vector >::iterator TableIterator; //typedef list< vector >::reverse_iterator TableReverseIterator; typedef vector< vector > IdLists; typedef vector< vector > IdListsLen; typedef struct IndexItem { int value; bool isValid; //needed for final travelling //NOTICE: the size of vector is expected to be small //the order in vector must be same as in IndexList vector vector< list< list::iterator > > travel; vector< set > check; //map< int, list < list::iterator > > links; //direct next index list id and linking //map< int, set > check; //indirect previous index list id and verifying IndexItem() { this->value = -1; this->isValid = false; } IndexItem(int _val) { this->value = _val; this->isValid = true; } }IndexItem; typedef struct IndexList { //int next; //NOTICE:the list should be ordered at the beginning list candidates; list::iterator border; //used to divide valid and invalid area bool prepared; //find and set all invalid eles restricted by subtree in travelling int position; //current neighbor to travel vector travel_map; //the mapping between links position and IndexList id vector check_map; //the mapping between check position and IndexList id IndexList() { //this->next = -1; this->prepared = false; this->position = 0; } bool end() { return this->position == (int)this->travel_map.size(); } int next() { return this->travel_map[this->position++]; } //NOTICE:we can not use binary-search in list, but this search method maybe slow //BETTER?:adjust the list to binary-tree or other struture? list::iterator search(int _val) { for(list::iterator it = this->candidates.begin(); it != this->border; ++it) { if(it->value == _val) return it; } return this->border; } }IndexList; typedef struct Satellite { int id; int* idlist; int idlist_len; Satellite(int _id, int* _idlist, int _idlist_len) { this->id = _id; this->idlist = _idlist; this->idlist_len = _idlist_len; } }Satellite; typedef list ItemList; typedef list::iterator ItemListIterator; typedef list< list::iterator > IteratorList; //Database new Join and pass something like kvstore class Join { private: int start_id; int var_num; //bool* dealed_triple; BasicQuery* basic_query; KVstore* kvstore; //used by score_node for parameters static const unsigned PARAM_DEGREE = 1; static const unsigned PARAM_SIZE = 100000; static const unsigned PARAM_DENSE = 1; static const double JUDGE_LIMIT = 0.5; static const int LIMIT_CANDIDATE_LIST_SIZE = 1000; //BETTER?:predefine size to avoid copy cost TableType current_table; TableIterator new_start; //keep to end() as default //list table_row_new; //keep the mapping for disordered ids in vector table int* id2pos; int id_pos; //the num of id put into id2pos currently int* pos2id; bool* dealed_triple; stack mystack; vector* result_list; vector satellites; int* record; int record_len; void init(BasicQuery* _basic_query); void clear(); void add_id_pos_mapping(int _id); void reset_id_pos_mapping(); //judge which method should be used according to //the size of candidates and structure of quering graph int judge(int _smallest, int _biggest); //select the start point and search order void select(); //score the cost to link two tables and the efficience //of filtering //int score(List1, List2); //score the node according to degree and size double score_node(unsigned _degree, unsigned _size); void toStartJoin(); bool filter_before_join(); bool constant_edge_filter(int _var_i); void preid_filter(int _var_i); bool only_pre_filter_after_join(); void add_literal_candidate(); bool pre_var_handler(); //bool filterBySatellites(int _var, int _ele); bool filterBySatellites(int _var); bool allFilterByPres(); void generateAllSatellites(); void cartesian(int pos, int end); //functions for help //copy/add to the end of current_table and set true void add_new_to_results(TableIterator it, int id); //void set_results_old(list::iterator it); int choose_next_node(int id); bool is_literal_var(int id); bool is_literal_ele(int _id); void copyToResult(); //BETTER?:change these params to members in class void acquire_all_id_lists(IdLists& _id_lists, IdListsLen& _id_lists_len, IDList& _can_list, vector& _edges, int _id, int _can_list_size); bool if_prepare_idlist(int _can_list_size, bool _is_literal); bool new_join_with_multi_vars_prepared(IdLists& _id_lists, IdListsLen& _id_lists_len, vector& _edges, IDList& _can_list, int _can_list_size); bool new_join_with_multi_vars_not_prepared(vector& _edges, IDList& _can_list, int _can_list_size, int _id, bool _is_literal); bool multi_join(); //================================================================================================ //The index join method saves the memory cost because 2m+2mn < 3mn, //and time may be reduced if the pre-process is not too costly //because we can reuse the links other than recompute in temporal table //New struct is needed for node, i.e. list >, //because we may have to delete, but how can we know if an iterator //is valid if the one it points to is removed?(remove if the other is removed; using end()) //1. based on edges: process each time only in valid area(already //macthed with others, invalid is removed), and finally it must be //all ok, just copy to result_list. We should select the edge order //to better the efficiency, but how can we keep only a neighbor links //set if we want to save memory?(ensure all can be linked later) //2. based on points: search deeply like multi-index-join, only a //neighbor links set is kept for a node(not every edge), so memory //cost is low. Finally, travel around along valid iterator, copy... IndexList* index_lists; void buildIndexLists(); bool travel_init(int _lid); bool index_link(int _nid, int _idx); bool index_filter(int _nid, int _idx); bool table_travel(int _id1, int _id2); bool table_check(int _id1, int _id2); bool index_travel_one(); bool index_travel_two(); bool index_travel(); bool index_join(); //NOTICE:this is only used to join a BasicQuery bool join(); public: Join(); Join(KVstore* _kvstore); //these functions can be called by Database bool join_sparql(SPARQLquery& _sparql_query); bool join_basic(BasicQuery* _basic_query); ~Join(); }; #endif //_JOIN_JOIN_H