gStore/test/IndexJoin.h

/*=============================================================================
# Filename: Join.h
# Author: Bookug Lobert 
# Mail: 1181955272@qq.com
# Last Modified: 2015-12-13 16:05
# Description: design join strategies and select/cost modules
=============================================================================*/

#ifndef _JOIN_JOIN_H
#define _JOIN_JOIN_H 

#include "../Query/IDList.h"
#include "../Query/BasicQuery.h"
#include "../Query/SPARQLquery.h"
#include "../KVstore/KVstore.h"
#include "../Util/Util.h"

//BETTER?:place multi_join and index_join in separate files

//A record is a vector of int values; a table is a list of such records.
//The iterator typedefs below are the matching iterator types of those containers.
typedef vector<int> RecordType;
typedef vector<int>::iterator RecordIterator;
typedef list<RecordType> TableType;
typedef list<RecordType>::iterator TableIterator;
typedef list<RecordType>::reverse_iterator TableReverseIterator;
//the same table typedefs spelled out without RecordType, kept for reference:
//typedef list< vector<int> > TableType;
//typedef list< vector<int> >::iterator TableIterator;
//typedef list< vector<int> >::reverse_iterator TableReverseIterator;
//Two-level vectors of int* arrays and of ints.
//NOTE(review): presumably IdListsLen[i][j] is the length of the array IdLists[i][j]
//(both are passed together to acquire_all_id_lists) -- confirm against the implementation.
typedef vector< vector<int*> > IdLists;
typedef vector< vector<int> > IdListsLen;

//One element of an IndexList: an int value, a validity flag, and two
//per-neighbour containers (iterator lists for travelling, int sets for checking).
typedef struct IndexItem
{
	int value;      //-1 when default-constructed
	bool isValid;   //needed for final travelling
	//NOTICE: both vectors are expected to stay small, and their element order
	//must be the same as in the corresponding IndexList vectors
	vector< list< list<struct IndexItem>::iterator > > travel;
	vector< set<int> > check;
	//earlier map-keyed layout, kept for reference:
	//map< int, list < list<struct IndexItem>::iterator > > links;  //direct next index list id and linking
	//map< int, set <int> > check;     //indirect previous index list id and verifying

	//Placeholder item: value -1, flagged invalid, empty travel/check.
	IndexItem() : value(-1), isValid(false)
	{
	}

	//Item holding _val, flagged valid, empty travel/check.
	IndexItem(int _val) : value(_val), isValid(true)
	{
	}
}IndexItem;

typedef struct IndexList
{
	//int next;
	//NOTICE:the list should be ordered at the beginning
	list<IndexItem> candidates;
	list<IndexItem>::iterator border;   //used to divide valid and invalid area
	bool prepared;                //find and set all invalid eles restricted by subtree in travelling  
	int position;                 //current neighbor to travel
	vector<int> travel_map;      //the mapping between links position and IndexList id
	vector<int> check_map;      //the mapping between check position and IndexList id
	IndexList()
	{
		//this->next = -1;
		this->prepared = false;
		this->position = 0;
	}
	bool end()
	{
		return this->position == (int)this->travel_map.size();
	}
	int next()
	{
		return this->travel_map[this->position++];
	}
	//NOTICE:we can not use binary-search in list, but this search method maybe slow
	//BETTER?:adjust the list to binary-tree or other struture?
	list<IndexItem>::iterator search(int _val)
	{
		for(list<IndexItem>::iterator it = this->candidates.begin(); it != this->border; ++it)
		{
			if(it->value == _val)
				return it;
		}
		return this->border;
	}
}IndexList;

//Plain record bundling an id with an int array and its length.
//The pointer is stored as given; this struct neither copies nor frees the array.
typedef struct Satellite
{
	int id;
	int* idlist;
	int idlist_len;

	//Stores the three arguments unchanged in the members of the same name.
	Satellite(int _id, int* _idlist, int _idlist_len)
		: id(_id), idlist(_idlist), idlist_len(_idlist_len)
	{
	}
}Satellite;

//Shorthands for the containers used by IndexItem/IndexList:
//ItemList is the type of IndexList::candidates, ItemListIterator its iterator,
//and IteratorList (a list of such iterators) is the element type of IndexItem::travel.
typedef list<IndexItem> ItemList;
typedef list<IndexItem>::iterator ItemListIterator;
typedef list< list<struct IndexItem>::iterator >  IteratorList;

//Join strategies (multi_join and index_join) for a BasicQuery.
//Original note: Database creates a Join with new and passes it something like the kvstore.
//NOTE(review): the class declares a destructor and holds several raw pointers, but declares
//no copy constructor or copy assignment. Whether Join owns those pointers is not visible in
//this header -- confirm against the implementation before copying Join objects.
class Join
{
private:
	int start_id;
	int var_num;
	//bool* dealed_triple;
	BasicQuery* basic_query;
	KVstore* kvstore;
	//used by score_node for parameters
	static const unsigned PARAM_DEGREE = 1;
	static const unsigned PARAM_SIZE = 100000;
	static const unsigned PARAM_DENSE = 1;
	//NOTE(review): an in-class initializer on a static const double is not standard C++
	//(GCC accepts it as an extension before C++11; C++11 and later require constexpr).
	//Keep in mind when changing the compiler or the -std level.
	static const double JUDGE_LIMIT = 0.5;
	static const int LIMIT_CANDIDATE_LIST_SIZE = 1000;
	//BETTER?:predefine size to avoid copy cost
	TableType current_table;
	TableIterator new_start;   //keep to end() as default
	//list<bool> table_row_new;
	
	//keep the mapping for disordered ids in vector<int> table
	int* id2pos; 
	int id_pos;   //the num of id put into id2pos currently
	int* pos2id; 
	bool* dealed_triple;
	stack<int> mystack;

	//NOTE(review): pointer to a vector of int* records; who allocates and frees it
	//is not visible in this header -- confirm in the implementation
	vector<int*>* result_list;
	vector<Satellite> satellites;
	int* record;
	int record_len;

	void init(BasicQuery* _basic_query);
	void clear();
	void add_id_pos_mapping(int _id);
	void reset_id_pos_mapping();

	//judge which method should be used according to 
	//the size of candidates and structure of the querying graph
	int judge(int _smallest, int _biggest);

	//select the start point and search order
	void select();

	//score the cost to link two tables and the efficiency
	//of filtering
	//int score(List1, List2);
	
	//score the node according to degree and size
	double score_node(unsigned _degree, unsigned _size);

	void toStartJoin();

	bool filter_before_join();
	bool constant_edge_filter(int _var_i);
	void preid_filter(int _var_i);
	bool only_pre_filter_after_join();
	void add_literal_candidate();
	bool pre_var_handler();
	//bool filterBySatellites(int _var, int _ele);
	bool filterBySatellites(int _var);
	bool allFilterByPres();
	void generateAllSatellites();
	void cartesian(int pos, int end);

	//helper functions
	//copy/add to the end of current_table and set true
	void add_new_to_results(TableIterator it, int id);

	//void set_results_old(list<bool>::iterator it);
	int choose_next_node(int id);

	bool is_literal_var(int id);
	bool is_literal_ele(int _id);
	
	void copyToResult();

	//BETTER?:change these params to members in class
	void acquire_all_id_lists(IdLists& _id_lists, IdListsLen& _id_lists_len, IDList& _can_list, vector<int>& _edges, int _id, int _can_list_size);
	bool if_prepare_idlist(int _can_list_size, bool _is_literal);
	bool new_join_with_multi_vars_prepared(IdLists& _id_lists, IdListsLen& _id_lists_len, vector<int>& _edges, IDList& _can_list, int _can_list_size);
	bool new_join_with_multi_vars_not_prepared(vector<int>& _edges, IDList& _can_list, int _can_list_size, int _id, bool _is_literal);

	bool multi_join();


//================================================================================================
	//The index join method saves memory because 2m+2mn < 3mn,
	//and time may be reduced if the pre-processing is not too costly,
	//because we can reuse the links rather than recompute them in a temporary table.
	//A new struct is needed for a node, i.e. list<bool, int, list<iterator> >, 
	//because we may have to delete, but how can we know if an iterator 
	//is valid if the one it points to is removed?(remove if the other is removed; using end())
	//1. based on edges: process each time only in the valid area(already 
	//matched with others, invalid is removed), and finally it must be 
	//all ok, just copy to result_list. We should select the edge order
	//to improve the efficiency, but how can we keep only one neighbor links
	//set if we want to save memory?(ensure all can be linked later)
	//2. based on points: search deeply like multi-index-join, only one 
	//neighbor links set is kept for a node(not every edge), so memory 
	//cost is low. Finally, travel around along valid iterators, copy...
	
	//NOTE(review): presumably an array of IndexList (one per query variable) -- confirm
	//allocation and release in buildIndexLists()/clear()
	IndexList* index_lists; 
	
	void buildIndexLists();
	bool travel_init(int _lid);

	bool index_link(int _nid, int _idx);
	bool index_filter(int _nid, int _idx);
	bool table_travel(int _id1, int _id2);
	bool table_check(int _id1, int _id2);

	bool index_travel_one();
	bool index_travel_two();
	bool index_travel();
	bool index_join();

	//NOTICE:this is only used to join a BasicQuery
	bool join();

public:
	Join();
	Join(KVstore* _kvstore);
	//these functions can be called by Database
	bool join_sparql(SPARQLquery& _sparql_query);
	bool join_basic(BasicQuery* _basic_query);
	~Join();
};

#endif //_JOIN_JOIN_H
version 0.4.0 2017-01-16 14:12:57 +08:00			`/*=============================================================================`
			`# Filename: Join.h`
			`# Author: Bookug Lobert`
			`# Mail: 1181955272@qq.com`
			`# Last Modified: 2015-12-13 16:05`
			`# Description: design join strategies and select/cost modules`
			`=============================================================================*/`

			`#ifndef _JOIN_JOIN_H`
			`#define _JOIN_JOIN_H`

			`#include "../Query/IDList.h"`
			`#include "../Query/BasicQuery.h"`
			`#include "../Query/SPARQLquery.h"`
			`#include "../KVstore/KVstore.h"`
			`#include "../Util/Util.h"`

			`//BETTER?:place multi_join and index_join in separated files`

			`typedef vector<int> RecordType;`
			`typedef vector<int>::iterator RecordIterator;`
			`typedef list<RecordType> TableType;`
			`typedef list<RecordType>::iterator TableIterator;`
			`typedef list<RecordType>::reverse_iterator TableReverseIterator;`
			`//typedef list< vector<int> > TableType;`
			`//typedef list< vector<int> >::iterator TableIterator;`
			`//typedef list< vector<int> >::reverse_iterator TableReverseIterator;`
			`typedef vector< vector<int*> > IdLists;`
			`typedef vector< vector<int> > IdListsLen;`

			`typedef struct IndexItem`
			`{`
			`int value;`
			`bool isValid; //needed for final travelling`
			`//NOTICE: the size of vector is expected to be small`
			`//the order in vector must be same as in IndexList vector`
			`vector< list< list<struct IndexItem>::iterator > > travel;`
			`vector< set<int> > check;`
			`//map< int, list < list<struct IndexItem>::iterator > > links; //direct next index list id and linking`
			`//map< int, set <int> > check; //indirect previous index list id and verifying`
			`IndexItem()`
			`{`
			`this->value = -1;`
			`this->isValid = false;`
			`}`
			`IndexItem(int _val)`
			`{`
			`this->value = _val;`
			`this->isValid = true;`
			`}`
			`}IndexItem;`

			`typedef struct IndexList`
			`{`
			`//int next;`
			`//NOTICE:the list should be ordered at the beginning`
			`list<IndexItem> candidates;`
			`list<IndexItem>::iterator border; //used to divide valid and invalid area`
			`bool prepared; //find and set all invalid eles restricted by subtree in travelling`
			`int position; //current neighbor to travel`
			`vector<int> travel_map; //the mapping between links position and IndexList id`
			`vector<int> check_map; //the mapping between check position and IndexList id`
			`IndexList()`
			`{`
			`//this->next = -1;`
			`this->prepared = false;`
			`this->position = 0;`
			`}`
			`bool end()`
			`{`
			`return this->position == (int)this->travel_map.size();`
			`}`
			`int next()`
			`{`
			`return this->travel_map[this->position++];`
			`}`
			`//NOTICE:we can not use binary-search in list, but this search method maybe slow`
			`//BETTER?:adjust the list to binary-tree or other struture?`
			`list<IndexItem>::iterator search(int _val)`
			`{`
			`for(list<IndexItem>::iterator it = this->candidates.begin(); it != this->border; ++it)`
			`{`
			`if(it->value == _val)`
			`return it;`
			`}`
			`return this->border;`
			`}`
			`}IndexList;`

			`typedef struct Satellite`
			`{`
			`int id;`
			`int* idlist;`
			`int idlist_len;`
			`Satellite(int _id, int* _idlist, int _idlist_len)`
			`{`
			`this->id = _id;`
			`this->idlist = _idlist;`
			`this->idlist_len = _idlist_len;`
			`}`
			`}Satellite;`

			`typedef list<IndexItem> ItemList;`
			`typedef list<IndexItem>::iterator ItemListIterator;`
			`typedef list< list<struct IndexItem>::iterator > IteratorList;`

			`//Database new Join and pass something like kvstore`
			`class Join`
			`{`
			`private:`
			`int start_id;`
			`int var_num;`
			`//bool* dealed_triple;`
			`BasicQuery* basic_query;`
			`KVstore* kvstore;`
			`//used by score_node for parameters`
			`static const unsigned PARAM_DEGREE = 1;`
			`static const unsigned PARAM_SIZE = 100000;`
			`static const unsigned PARAM_DENSE = 1;`
			`static const double JUDGE_LIMIT = 0.5;`
			`static const int LIMIT_CANDIDATE_LIST_SIZE = 1000;`
			`//BETTER?:predefine size to avoid copy cost`
			`TableType current_table;`
			`TableIterator new_start; //keep to end() as default`
			`//list<bool> table_row_new;`

			`//keep the mapping for disordered ids in vector<int> table`
			`int* id2pos;`
			`int id_pos; //the num of id put into id2pos currently`
			`int* pos2id;`
			`bool* dealed_triple;`
			`stack<int> mystack;`

			`vector<int> result_list;`
			`vector<Satellite> satellites;`
			`int* record;`
			`int record_len;`

			`void init(BasicQuery* _basic_query);`
			`void clear();`
			`void add_id_pos_mapping(int _id);`
			`void reset_id_pos_mapping();`

			`//judge which method should be used according to`
			`//the size of candidates and structure of quering graph`
			`int judge(int _smallest, int _biggest);`

			`//select the start point and search order`
			`void select();`

			`//score the cost to link two tables and the efficience`
			`//of filtering`
			`//int score(List1, List2);`

			`//score the node according to degree and size`
			`double score_node(unsigned _degree, unsigned _size);`

			`void toStartJoin();`

			`bool filter_before_join();`
			`bool constant_edge_filter(int _var_i);`
			`void preid_filter(int _var_i);`
			`bool only_pre_filter_after_join();`
			`void add_literal_candidate();`
			`bool pre_var_handler();`
			`//bool filterBySatellites(int _var, int _ele);`
			`bool filterBySatellites(int _var);`
			`bool allFilterByPres();`
			`void generateAllSatellites();`
			`void cartesian(int pos, int end);`

			`//functions for help`
			`//copy/add to the end of current_table and set true`
			`void add_new_to_results(TableIterator it, int id);`

			`//void set_results_old(list<bool>::iterator it);`
			`int choose_next_node(int id);`

			`bool is_literal_var(int id);`
			`bool is_literal_ele(int _id);`

			`void copyToResult();`

			`//BETTER?:change these params to members in class`
			`void acquire_all_id_lists(IdLists& _id_lists, IdListsLen& _id_lists_len, IDList& _can_list, vector<int>& _edges, int _id, int _can_list_size);`
			`bool if_prepare_idlist(int _can_list_size, bool _is_literal);`
			`bool new_join_with_multi_vars_prepared(IdLists& _id_lists, IdListsLen& _id_lists_len, vector<int>& _edges, IDList& _can_list, int _can_list_size);`
			`bool new_join_with_multi_vars_not_prepared(vector<int>& _edges, IDList& _can_list, int _can_list_size, int _id, bool _is_literal);`

			`bool multi_join();`



			`//================================================================================================`
			`//The index join method saves the memory cost because 2m+2mn < 3mn,`
			`//and time may be reduced if the pre-process is not too costly`
			`//because we can reuse the links other than recompute in temporal table`
			`//New struct is needed for node, i.e. list<bool, int, list<iterator> >,`
			`//because we may have to delete, but how can we know if an iterator`
			`//is valid if the one it points to is removed?(remove if the other is removed; using end())`
			`//1. based on edges: process each time only in valid area(already`
			`//macthed with others, invalid is removed), and finally it must be`
			`//all ok, just copy to result_list. We should select the edge order`
			`//to better the efficiency, but how can we keep only a neighbor links`
			`//set if we want to save memory?(ensure all can be linked later)`
			`//2. based on points: search deeply like multi-index-join, only a`
			`//neighbor links set is kept for a node(not every edge), so memory`
			`//cost is low. Finally, travel around along valid iterator, copy...`

			`IndexList* index_lists;`

			`void buildIndexLists();`
			`bool travel_init(int _lid);`

			`bool index_link(int _nid, int _idx);`
			`bool index_filter(int _nid, int _idx);`
			`bool table_travel(int _id1, int _id2);`
			`bool table_check(int _id1, int _id2);`

			`bool index_travel_one();`
			`bool index_travel_two();`
			`bool index_travel();`
			`bool index_join();`

			`//NOTICE:this is only used to join a BasicQuery`
			`bool join();`

			`public:`
			`Join();`
			`Join(KVstore* _kvstore);`
			`//these functions can be called by Database`
			`bool join_sparql(SPARQLquery& _sparql_query);`
			`bool join_basic(BasicQuery* _basic_query);`
			`~Join();`
			`};`

			`#endif //_JOIN_JOIN_H`