/*=============================================================================
# Filename: Join.cpp
# Author: Bookug Lobert
# Mail: 1181955272@qq.com
# Last Modified: 2015-12-13 16:44
# Description: implement functions in Join.h
=============================================================================*/

#include "Join.h"

using namespace std;

//Create a Join with no KVstore attached.
//Every buffer that clear() releases starts out as NULL, so calling clear()
//before init() is a no-op instead of freeing indeterminate pointers.
Join::Join()
{
	this->kvstore = NULL;
	this->basic_query = NULL;
	this->result_list = NULL;
	this->id2pos = NULL;
	this->pos2id = NULL;
	this->dealed_triple = NULL;
	this->index_lists = NULL;
}

//Create a Join that reads from _kvstore (the pointer is only stored here).
Join::Join(KVstore* _kvstore)
{
	this->kvstore = _kvstore;
	this->basic_query = NULL;
	this->result_list = NULL;
	this->id2pos = NULL;
	this->pos2id = NULL;
	this->dealed_triple = NULL;
	this->index_lists = NULL;
}

Join::~Join()
{
	//nothing to do necessarily
}

//Set up the per-query state for _basic_query: the id<->pos mapping arrays,
//the per-triple "dealed" flags and the pointer to the query's result list.
//Must be paired with clear().
void Join::init(BasicQuery* _basic_query)
{
	//BETTER:only common are placed here!
	this->basic_query = _basic_query;
	this->var_num = this->basic_query->getVarNum();

	//one slot per normal var plus one per predicate var
	//memset with -1 sets every byte to 0xFF, so each int slot reads as -1
	int mapping_len = this->basic_query->getPreVarNum() + this->var_num;
	size_t mapping_bytes = sizeof(int) * mapping_len;
	this->id2pos = (int*)malloc(mapping_bytes);
	memset(this->id2pos, -1, mapping_bytes);
	this->pos2id = (int*)malloc(mapping_bytes);
	memset(this->pos2id, -1, mapping_bytes);
	this->id_pos = 0;
	this->start_id = -1;

	//calloc leaves every flag false
	int triple_num = this->basic_query->getTripleNum();
	this->dealed_triple = (bool*)calloc(triple_num, sizeof(bool));
	this->index_lists = NULL;
	this->result_list = _basic_query->getResultListPointer();
}

//Release everything init() and the join itself allocated.
//Pointers are reset to NULL after release so a repeated clear() is harmless.
void Join::clear()
{
	//BETTER:only common are released here!
	free(this->id2pos);
	this->id2pos = NULL;
	free(this->pos2id);
	this->pos2id = NULL;

	//NOTICE:maybe many BasicQuery
	this->current_table.clear();
	while (!this->mystack.empty())
	{
		this->mystack.pop();
	}

	free(this->dealed_triple);
	this->dealed_triple = NULL;

	//NULL if using multi-join method
	delete[] this->index_lists;
	this->index_lists = NULL;

	this->result_list = NULL;
	this->satellites.clear();
}

//Score of a variable: PARAM_DEGREE * _degree + PARAM_SIZE / _size.
//A higher degree and a smaller candidate list both raise the score.
//BETTER?:use other cost model
double Join::score_node(unsigned _degree, unsigned _size)
{
	return Join::PARAM_DEGREE * (double)_degree + Join::PARAM_SIZE / (double)_size;
}

//Choose the join method: 0 for multi_join, 1 for index_join.
//Currently always answers 0; the estimate below the early return is kept
//for when index_join is usable again.
int Join::judge(int _smallest, int _biggest)
{
	return 0;	//DEBUG:remove when index_join is ok

	//BETTER?:use appropiate method according to size and structure
	int edge_num = this->basic_query->getTripleNum();
	double dense = (double)edge_num / this->var_num;
	//BETTER:how to guess the size of can_lists
	double size = (_smallest + _biggest) / 2.0;
	double ans = Join::PARAM_DENSE * dense - size / Join::PARAM_SIZE;
	if (ans > Join::JUDGE_LIMIT)
		return 0;	//multi_join method
	else
		return 1;	//index_join method
}

//select the start point, and maybe search order for index_join
//just for multi_join here, maybe different for index_join later
//Stores the best-scoring ready variable in start_id; start_id is left
//untouched (and an error is printed) when no ready variable scores above 0.
void Join::select()
{
	//NOTICE: only consider vars in select here
	double best_score = 0;
	int best_id = -1;
	for (int i = 0; i < this->var_num; ++i)
	{
		//satellites which are not retrieved
		if (!this->basic_query->isReady(i))
		{
			continue;
		}
		double score = this->score_node(this->basic_query->getVarDegree(i), this->basic_query->getCandidateSize(i));
		if (score > best_score)
		{
			best_score = score;
			best_id = i;
		}
	}

	if (best_id == -1)
	{
		cout << "error to select the first one to join" << endl;
	}
	else
	{
		this->start_id = best_id;
	}
#ifdef DEBUG_JOIN
	//printf("the start id is: %d\n", this->start_id);
	cerr << "the start id is: " << this->start_id << endl;
#endif
}

//join on the vector of CandidateList, available after
//retrieved from the VSTREE and store the result in _result_set
bool Join::join_sparql(SPARQLquery& _sparql_query)
{
	int
basic_query_num = _sparql_query.getBasicQueryNum(); //join each basic query for (int i = 0; i < basic_query_num; i++) { //fprintf(stderr, "Basic query %d\n", i); cerr << "Basic query " << i << endl; bool ret = this->join_basic(&(_sparql_query.getBasicQuery(i))); if (!ret) cerr << "end directly for this basic query: " << i << endl; } return true; } bool Join::join_basic(BasicQuery* _basic_query) { this->init(_basic_query); long begin = Util::get_cur_time(); bool ret1 = this->filter_before_join(); long after_constant_filter = Util::get_cur_time(); //fprintf(stderr, "after filter_before_join: used %ld ms\n", after_filter - begin); cerr << "after filter_before_join: used " << (after_constant_filter - begin) << " ms" << endl; if (!ret1) { this->clear(); return false; } this->add_literal_candidate(); long after_add_literal = Util::get_cur_time(); cerr << "after add_literal_candidate: used " << (after_add_literal - after_constant_filter) << " ms" << endl; bool ret2 = this->allFilterByPres(); //bool ret2 = true; long after_pre_filter = Util::get_cur_time(); cerr << "after allFilterByPres: used " << (after_pre_filter - after_add_literal) << " ms" << endl; if (!ret2) { this->clear(); return false; } bool ret3 = this->join(); long after_joinbasic = Util::get_cur_time(); cerr << "after join_basic: used " << (after_joinbasic - after_pre_filter) << " ms" << endl; if (!ret3) { this->clear(); return false; } //TODO+DEBUG:only_pre_filter needed to deal with undealed isolated edge //only when allFilterByPres not used!!! //BETTER:we need to choose a way to decide a set of isolated edges to be dealed before join(one by one in join or just before?) //the other should be dealed later(filter efficience is low!!!) 
//NOTICE:we do pre_var_handler first, and generate all satellites when coping to result list // //this->generateAllSatellites(); //long after_generate_satellite = Util::get_cur_time(); //cerr<<"after generate satellite: used "<<(after_generate_satellite - after_joinbasic)<<" ms"<pre_var_handler(); //TODO+BETTER:maybe also reduce to empty, return false long after_pre_var = Util::get_cur_time(); cerr << "after pre var: used " << (after_pre_var - after_joinbasic) << " ms" << endl; this->copyToResult(); long after_copy = Util::get_cur_time(); cerr << "after copy to result list: used " << (after_copy - after_pre_var) << " ms" << endl; cerr << "Final result size: " << this->basic_query->getResultList().size() << endl; this->clear(); return true; } void Join::generateAllSatellites() { //BETTER: directly generate to result list, avoiding copying cost? int core_var_num = this->basic_query->getRetrievedVarNum(); for (int i = 0; i < core_var_num; ++i) { int id = this->pos2id[i]; int degree = this->basic_query->getVarDegree(id); for (int j = 0; j < degree; ++j) { int id2 = this->basic_query->getEdgeNeighborID(id, j); if (this->basic_query->isSatelliteInJoin(id2) == false) continue; char edge_type = this->basic_query->getEdgeType(id, j); int preid = this->basic_query->getEdgePreID(id, j); for (TableIterator it = this->current_table.begin(); it != this->new_start; ++it) { //TODO:generate and add just like join } this->new_start = this->current_table.end(); } } } bool Join::pre_var_handler() { //int core_var_num = this->basic_query->getRetrievedVarNum(); unsigned pre_var_num = this->basic_query->getPreVarNum(); #ifdef DEBUG_JOIN cerr << "pre var num: " << pre_var_num << endl; #endif //QUERY+BETTER:filter by pre vars one by one or each record together? 
for (unsigned i = 0; i < pre_var_num; ++i) { #ifdef DEBUG_JOIN cerr << "current pre var id: " << i << endl; #endif const PreVar& pre_var = this->basic_query->getPreVarByID(i); #ifdef DEBUG_JOIN cerr << "current table size: " << this->current_table.size() << endl; #endif //WARN:do not conflict with original var id //1 core var, id can be 1, then pos can be 1 + 0 = 1 for pre var!!! conflict! //int pos = core_var_num + i; int pos = this->var_num + i; this->add_id_pos_mapping(pos); //cout<<"id 1 pos "<id2pos[1]<current_table.begin(); it != this->new_start;) { IDList valid_ans; //bool ok = true; unsigned triple_num = pre_var.triples.size(); #ifdef DEBUG_JOIN //cerr<<"triple num for this var: "<basic_query->getTriple(pre_var.triples[j]); string sub_name = triple.subject; string obj_name = triple.object; #ifdef DEBUG_JOIN //cerr << sub_name << endl << triple.predicate << endl << obj_name << endl; #endif int sub_id = -1, obj_id = -1, var1 = -1, var2 = -1; if (sub_name[0] != '?') { sub_id = this->kvstore->getIDByEntity(sub_name); } else { if (!(this->basic_query->isOneDegreeNotJoinVar(sub_name))) var1 = this->basic_query->getIDByVarName(sub_name); //satellite in join not retrieved if (var1 != -1 && this->basic_query->isSatelliteInJoin(var1)) var1 = -1; } if (obj_name[0] != '?') { obj_id = this->kvstore->getIDByEntity(obj_name); if (obj_id == -1) obj_id = this->kvstore->getIDByLiteral(obj_name); } else { if (!(this->basic_query->isOneDegreeNotJoinVar(obj_name))) var2 = this->basic_query->getIDByVarName(obj_name); //satellite in join not retrieved if (var2 != -1 && this->basic_query->isSatelliteInJoin(var2)) var2 = -1; } //cout<<"var1: "< 1 //(?s2 ?p ?o2 is not ok, not connected query graph) //WARN+QUERY: if only this triple, no answer for ?p //we need to output all predicates in data graph, so store one file containing //entity/literal/predicate num when building, and reoutput all when changing //(binary file i snot visible, so use character file) // //we shall deal with 
this case in the Strategy module in time } else if (var1 == -1 && var2 != -1) { //TODO+NOTICE: we must add literals here, also enum all predicates and using p2o //but literals should only be added once for each predicate //QUERY:maybe many in edges all with unbound predicates //we think this case is very rare, so not consider now // //how about bound predicates? ?s1 p1 ?o ?s2 p2 ?o (?o retrieved, p2o to add literals) //all unbound predicates: ?s1 ?p1 ?o ?s2 ?p2 ?o //if exist constant neighbor, just use s2o to add literals(already discussed) //NOTICE+WARN+TODO: in these cases, all subject degree is 1, but we can not start from ?o because //it is a literal var!!! //(so join can not be processed, however, do not need to join here) //must add literal for ?o before or treat it as special case, considered in Strategy!! //cout<<"pos: "<id2pos[var2]<<" ele: "<<(*it)[0]<kvstore->getpreIDlistByobjID((*it)[this->id2pos[var2]], id_list, id_list_len, true); } else if (var1 != -1 && var2 == -1) { this->kvstore->getpreIDlistBysubID((*it)[this->id2pos[var1]], id_list, id_list_len, true); } else if (var1 != -1 && var2 != -1) { //if(this->is_literal_var(var2)) //{ //int* oid_list = NULL; //int oid_list_len = 0; //this->kvstore->getobjIDlistBysubID((*it)[this->id2pos[var1]], oid_list, oid_list_len); //this->kvstore->getpreIDlistBysubID((*it)[this->id2pos[var1]], id_list, id_list_len); //} //cerr<<"sub str: "<kvstore->getEntityByID((*it)[this->id2pos[var1]])<kvstore->getEntityByID((*it)[this->id2pos[var2]])<kvstore->getpreIDlistBysubIDobjID((*it)[this->id2pos[var1]], (*it)[this->id2pos[var2]], id_list, id_list_len); int sid = (*it)[this->id2pos[var1]], oid = (*it)[this->id2pos[var2]]; this->kvstore->getpreIDlistBysubIDobjID(sid, oid, id_list, id_list_len, true); //NOTICE:no need to add literals here because they are added when join using s2o } } //two constants in query else if (sub_id != -1 && obj_id != -1) { //just use so2p in query graph to find predicates 
//this->kvstore->getpreIDlistBysubIDobjID(sub_id, obj_id, id_list, id_list_len); int sid = sub_id, oid = obj_id; this->kvstore->getpreIDlistBysubIDobjID(sid, oid, id_list, id_list_len, true); } //sub is var while obj is constant else if (sub_id == -1 && obj_id != -1) { if (var1 == -1) { this->kvstore->getpreIDlistByobjID(obj_id, id_list, id_list_len, true); } else { this->kvstore->getpreIDlistBysubIDobjID((*it)[this->id2pos[var1]], obj_id, id_list, id_list_len, true); int sid = (*it)[this->id2pos[var1]], oid = obj_id; this->kvstore->getpreIDlistBysubIDobjID(sid, oid, id_list, id_list_len, true); } } //sub is constant while obj is var else if (sub_id != -1 && obj_id == -1) { if (var2 == -1) { this->kvstore->getpreIDlistBysubID(sub_id, id_list, id_list_len, true); } else { //NOTICE:no need to add literals here because they are added in add_literal_candidate using s2o //this->kvstore->getpreIDlistBysubIDobjID(sub_id, (*it)[this->id2pos[var2]], id_list, id_list_len); int sid = sub_id, oid = (*it)[this->id2pos[var2]]; this->kvstore->getpreIDlistBysubIDobjID(sid, oid, id_list, id_list_len, true); } } //cout<<"the idlist len "<kvstore->getPredicateByID(valid_ans[k])< 0) { it->push_back(valid_ans[0]); int begin = 1; if (!if_new_start && size > 1) { this->add_new_to_results(it, valid_ans[1]); if_new_start = true; this->new_start = this->current_table.end(); this->new_start--; begin = 2; } for (int j = begin; j < size; ++j) { this->add_new_to_results(it, valid_ans[j]); } it++; } else { it = this->current_table.erase(it); } } this->new_start = this->current_table.end(); } cout << "table size after pre_var " << this->current_table.size() << endl; return true; } void Join::copyToResult() { //copy to result list, adjust the vars order this->result_list->clear(); int select_var_num = this->basic_query->getSelectVarNum(); int core_var_num = this->basic_query->getRetrievedVarNum(); int pre_var_num = this->basic_query->getPreVarNum(); //TODO:set right selected_pre_var_num here int 
selected_pre_var_num = pre_var_num; if (this->id_pos != core_var_num + selected_pre_var_num) { cerr << "terrible error in copyToResult!" << endl; return; } #ifdef DEBUG_JOIN cerr << "core var num: " << core_var_num << " select var num: " << select_var_num << endl; #endif this->record_len = select_var_num + pre_var_num; this->record = new int[this->record_len]; for (TableIterator it = this->current_table.begin(); it != this->current_table.end(); ++it) { int i = 0; for (; i < core_var_num; ++i) { //This is because sleect var id is always smaller if (this->pos2id[i] < select_var_num) this->record[this->pos2id[i]] = (*it)[i]; } #ifdef DEBUG_JOIN //cerr<<"current id_pos: "<id_pos<id_pos) { //TODO:only add selected ones //int pre_var_id = this->pos2id[i] - core_var_num; int pre_var_id = this->pos2id[i] - this->var_num; this->record[select_var_num + pre_var_id] = (*it)[i]; ++i; } //generate satellites when constructing records //NOTICE: satellites in join must be selected //core vertex maybe not in select // //vector satellites; for (i = 0; i < core_var_num; ++i) { int id = this->pos2id[i]; int ele = (*it)[i]; int degree = this->basic_query->getVarDegree(id); for (int j = 0; j < degree; ++j) { int id2 = this->basic_query->getEdgeNeighborID(id, j); if (this->basic_query->isSatelliteInJoin(id2) == false) continue; #ifdef DEBUG_JOIN //cerr << "to generate "<basic_query->getEdgeID(id, j); Triple triple = this->basic_query->getTriple(triple_id); int preid = this->basic_query->getEdgePreID(id, j); if (preid == -2) //?p { string predicate = triple.predicate; int pre_var_id = this->basic_query->getPreVarID(predicate); preid = (*it)[core_var_num + pre_var_id]; } else if (preid == -1) { //ERROR } char edge_type = this->basic_query->getEdgeType(id, j); if (edge_type == Util::EDGE_OUT) { this->kvstore->getobjIDlistBysubIDpreID(ele, preid, idlist, idlist_len, true); } else { this->kvstore->getsubIDlistByobjIDpreID(ele, preid, idlist, idlist_len, true); } 
this->satellites.push_back(Satellite(id2, idlist, idlist_len)); #ifdef DEBUG_JOIN //cerr<<"push a new satellite in"<cartesian(0, size); #ifdef DEBUG_JOIN //cerr<<"after cartesian"<satellites[k].idlist; //this->satellites[k].idlist = NULL; } //WARN:use this to avoid influence on the next loop this->satellites.clear(); #ifdef DEBUG_JOIN //cerr<<"after clear the satellites"<record; #ifdef DEBUG_JOIN //cerr<<"after delete the record"<record = NULL; this->record_len = 0; } void Join::cartesian(int pos, int end) { if (pos == end) { int* new_record = new int[this->record_len]; memcpy(new_record, this->record, sizeof(int) * this->record_len); this->result_list->push_back(new_record); return; } int size = this->satellites[pos].idlist_len; int id = this->satellites[pos].id; int* list = this->satellites[pos].idlist; for (int i = 0; i < size; ++i) { this->record[id] = list[i]; this->cartesian(pos + 1, end); } } void Join::toStartJoin() { for (int i = 0; i < this->var_num; ++i) { if (this->basic_query->isReady(i)) { return; } } cout << "toStartJoin(): need to prepare a ready node"<var_num; ++i) { if (!this->basic_query->isSatelliteInJoin(i)) { double tmp = this->score_node(this->basic_query->getVarDegree(i), this->basic_query->getCandidateSize(i)); if (tmp > max) { max = tmp; maxi = i; } } } //TODO+DEBUG:add literals error? 
//NOTICE:not add literal, so no constant neighbor, this must be free literal variable int var_id = maxi; int var_degree = this->basic_query->getVarDegree(var_id); //cout<<"var id: "<basic_query->getEdgeNeighborID(var_id, j); int predicate_id = this->basic_query->getEdgePreID(var_id, j); int triple_id = this->basic_query->getEdgeID(var_id, j); Triple triple = this->basic_query->getTriple(triple_id); string neighbor_name = triple.subject; IDList this_edge_literal_list; int* object_list = NULL; int object_list_len = 0; if (predicate_id >= 0) (this->kvstore)->getobjIDlistBypreID(predicate_id, object_list, object_list_len, true); //cout<<"predicate id: "<kvstore->getPredicateByID(predicate_id)<basic_query->getCandidateList(var_id); //int origin_candidate_list_len = origin_candidate_list.size(); origin_candidate_list.unionList(literal_candidate_list, true); //int after_add_literal_candidate_list_len = origin_candidate_list.size(); this->basic_query->setReady(var_id); cout<<"add literals num: "<toStartJoin(); //the smallest candidate list size of the not-satellite vars int id = this->basic_query->getVarID_FirstProcessWhenJoin(); int smallest = this->basic_query->getCandidateSize(id); if(!this->is_literal_var(id) && smallest == 0) return false; //empty result int biggest = this->basic_query->getVarID_MaxCandidateList(); int method = this->judge(smallest, biggest); bool ret = true; switch (method) { case 0: //printf("use multi-join here!\n"); cerr << "use multi-join here!" << endl; ret = this->multi_join(); break; case 1: //printf("use index-join here!\n"); cerr << "use index-join here!" << endl; ret = this->index_join(); break; default: //printf("ERROR: no method found!\n"); cerr << "ERROR: no method found!" 
<< endl; break; } return ret; } int Join::choose_next_node(int id) { //choose a child to search deeply int degree = this->basic_query->getVarDegree(id); int maxi = -1; double max = 0; for (int i = 0; i < degree; ++i) { int var_id2 = this->basic_query->getEdgeNeighborID(id, i); if (var_id2 == -1) //not in join, including constant { continue; } //satellites which are not retrieved if (this->basic_query->if_need_retrieve(var_id2) == false) { continue; } // each triple/edge need to be processed only once. int edge_id = this->basic_query->getEdgeID(id, i); if (this->dealed_triple[edge_id]) { continue; } //NTC:not using updated degrees, other not the whole loop double tmp = this->score_node(this->basic_query->getVarDegree(var_id2), this->basic_query->getCandidateSize(var_id2)); if (max < tmp) { max = tmp; maxi = i; } } return maxi; } bool Join::is_literal_var(int _id) { //if(!this->basic_query->isFreeLiteralVariable(_id) || this->basic_query->isAddedLiteralCandidate(_id)) //if(!this->basic_query->isFreeLiteralVariable(_id)) //{ //return false; //} //BETTER?:this is not needed because we ensure that //all dealed nodes's literals are added! 
//this->basic_query->setAddedLiteralCandidate(_id); //if(this->basic_query->isAddedLiteralCandidate(_id)) if (this->basic_query->isReady(_id)) return false; else return true; //NOTICE:satellites are not considered in join, so only free literal variable checked here //(some free literal var maybe also added) } //=================================================================================================== //Below are functions to do multi-join method //=================================================================================================== void Join::add_new_to_results(TableIterator it, int id) { //NTC:already have one more in *it if need to push back RecordType tmp(*it); *(tmp.rbegin()) = id; this->current_table.push_back(tmp); } void Join::acquire_all_id_lists(IdLists& _id_lists, IdListsLen& _id_lists_len, IDList& _can_list, vector& _edges, int _id, int can_list_size) { int* tmp_id_list; int tmp_id_list_len; for (int i = 0; i < this->id_pos; ++i) { // keep empty if not valid/used _id_lists.push_back(vector()); _id_lists_len.push_back(vector()); int edge_index = _edges[i]; if (edge_index != -1) { int pre_id = this->basic_query->getEdgePreID(_id, edge_index); //int edge_id = this->basic_query->getEdgeID(_id, edge_index); int edge_type = this->basic_query->getEdgeType(_id, edge_index); if (pre_id >= 0) // valid { for (int j = 0; j < can_list_size; ++j) { if (edge_type == Util::EDGE_IN) { this->kvstore->getsubIDlistByobjIDpreID(_can_list[i], \ pre_id, tmp_id_list, tmp_id_list_len, true); } else //EDGE_OUT { this->kvstore->getobjIDlistBysubIDpreID(_can_list[i], \ pre_id, tmp_id_list, tmp_id_list_len, true); } _id_lists.rbegin()->push_back(tmp_id_list); _id_lists_len.rbegin()->push_back(tmp_id_list_len); } } } } } //DEBUG:add debug info and check when the var is not free bool Join::new_join_with_multi_vars_prepared(IdLists& _id_lists, IdListsLen& _id_lists_len, vector& _edges, IDList& _can_list, int _can_list_size) { if (_can_list_size == 0) { return 
false; //empty result } bool found = false; //no record matched bool if_new_start = false; //the first to add to end in while //list< list > temp_table; for (TableIterator it0 = this->current_table.begin(); it0 != this->new_start;) { bool matched = false; //this record matched bool added = false; //if one ele added already for (int i = 0; i < _can_list_size; ++i) { int cnt = 0; bool linked = true; for (RecordIterator it1 = it0->begin(); it1 != it0->end(); ++it1, ++cnt) { int edge_index = _edges[cnt]; if (edge_index == -1) { continue; } int ele = *it1; if (_id_lists_len[cnt][i] == 0) { linked = false; break; } if (Util::bsearch_int_uporder(ele, _id_lists[cnt][i], _id_lists_len[cnt][i]) == -1) { linked = false; break; } } if (linked) { if (added) { this->add_new_to_results(it0, _can_list[i]); if (!if_new_start) { if_new_start = true; this->new_start = this->current_table.end(); this->new_start--; } } else { added = true; it0->push_back(_can_list[i]); } matched = true; } } if (matched) { found = true; it0++; //it3++; } else { it0 = this->current_table.erase(it0); //it3 = this->table_row_new.erase(it3); } } return found; } bool Join::new_join_with_multi_vars_not_prepared(vector& _edges, IDList& _can_list, int _can_list_size, int _id, bool _is_literal) { if (_can_list_size == 0 && !_is_literal) { return false; //empty result } bool found = false; bool if_new_start = false; //the first to add to end in while for (TableIterator it0 = this->current_table.begin(); it0 != this->new_start;) { #ifdef DEBUG_JOIN if (this->new_start != this->current_table.end()) { //printf("now the new_start is:"); cerr << "now the new_start is:"; for (RecordIterator it1 = this->new_start->begin(); it1 != this->new_start->end(); ++it1) { //printf(" %d", *it1); cerr << " " << *it1; } //printf("\n"); cerr << endl; } else //printf("new_start still in end?!\n"); cerr << "new_start still in end?!" 
<< endl; //printf("now the record is:"); cerr << "now the record is:"; for (RecordIterator it1 = it0->begin(); it1 != it0->end(); ++it1) { //printf(" %d", *it1); cerr << " " << *it1; } //printf("\n"); cerr << endl; #endif int cnt = 0; //update the valid id num according to restrictions by multi vars //also ordered while id_list and can_list are ordered //IDList valid_ans_list; IDList* valid_ans_list = NULL; //list valid_ans_list; bool matched = true; //NOTICE:we can generate cans from either direction, but this way is convenient and better for (RecordIterator it1 = it0->begin(); it1 != it0->end(); ++it1, ++cnt) { #ifdef DEBUG_JOIN //printf("cnt is: %d\n", cnt); cerr << "cnt is: " << cnt << endl; #endif int edge_index = _edges[cnt]; if (edge_index == -1) { continue; } #ifdef DEBUG_JOIN cerr << "edge exists!" << endl; #endif int ele = *it1; int edge_type = this->basic_query->getEdgeType(_id, edge_index); int pre_id = this->basic_query->getEdgePreID(_id, edge_index); if (pre_id == -2) //predicate var { #ifdef DEBUG_JOIN cerr << "this is a predicate var!" << endl; #endif //if(valid_ans_list == NULL) //{ //valid_ans_list = IDList::intersect(_can_list, NULL, 0); //} //else //{ //} //continue; } int* id_list; int id_list_len; if (edge_type == Util::EDGE_IN) { #ifdef DEBUG_JOIN //printf("this is an edge to our id to join!\n"); cerr << "this is an edge to our id to join!" << endl; #endif if (pre_id == -2) this->kvstore->getobjIDlistBysubID(ele, id_list, id_list_len, true); else if (pre_id >= 0) this->kvstore->getobjIDlistBysubIDpreID(ele, \ pre_id, id_list, id_list_len, true); } else { #ifdef DEBUG_JOIN //printf("this is an edge from our id to join!\n"); cerr << "this is an edge from our id to join!" 
<< endl; #endif if (pre_id == -2) this->kvstore->getsubIDlistByobjID(ele, id_list, id_list_len, true); else this->kvstore->getsubIDlistByobjIDpreID(ele, pre_id, id_list, id_list_len, true); } if (id_list_len == 0) { //id_list == NULL in this case, no need to free matched = false; #ifdef DEBUG_JOIN //printf("this id_list is empty!\n"); cerr << "this id_list is empty!" << endl; #endif break; } //NOTICE:using so2p to filter is not good //The cost to join two ordered lists is the basic operation //of the whole join process!(O(klogn) < O(k+n) gennerally, for k < n) //Notice that n is the candidate list size just retrieved from vstree //TODO+BETTER:compute all sets by multiple restrictions, and choose //the minimal to start intersect operations //only can occur the first time, means cnt == 0 //if(valid_ans_list.size() == 0) if (valid_ans_list == NULL) { //WARN:this is too costly due to coping elements! //valid_ans_list.unionList(_can_list); if (_is_literal) { int entity_len = 0; while (true) { if (entity_len == id_list_len || Util::is_literal_ele(id_list[entity_len])) break; entity_len++; } //valid_ans_list.intersectList(id_list, entity_len); valid_ans_list = IDList::intersect(_can_list, id_list, entity_len); valid_ans_list->unionList(id_list + entity_len, id_list_len - entity_len, true); //this->basic_query->setAddedLiteralCandidate(_id); } else { valid_ans_list = IDList::intersect(_can_list, id_list, id_list_len); } //for(int i = 0; i < id_list_len; ++i) //{ //if we found this element(entity/literal) in //var1's candidate list, or this is a literal //element and var2 is a free literal variable, //we should add this one to result. //bool flag = false; //NOTICE:this var is free, but it can also contain //entities. Candidates after retrieved from vstree will //contain all possible entities, but no literals. 
//if(Util::is_literal_ele(id_list[i])) //{ //if(_is_literal) //{ //flag = true; //#ifdef DEBUG_JOIN //printf("to add literal for free variable!\n"); //#endif //} //} //else //{ //flag = _can_list.bsearch_uporder(id_list[i]) >= 0; //} //if(!flag) continue; //printf("add the ele to list!\n"); //valid_ans_list.addID(id_list[i]); //} } else { valid_ans_list->intersectList(id_list, id_list_len); //for(list::iterator it2 = valid_ans_list.begin(); it2 != valid_ans_list.end();) //{ //int tmp = *it2; //if(Util::bsearch_int_uporder(tmp, id_list, id_list_len) == -1) //{ //it2 = valid_ans_list.erase(it2); //} //else //{ //it2++; //} //} } delete[] id_list; if (valid_ans_list->size() == 0) { matched = false; break; } } if (matched) { #ifdef DEBUG_JOIN //printf("this record is matched!!\n"); cerr << "this record is matched!!" << endl; #endif found = true; //bool added = false; //add new var results to table from valid_ans_list //for(list::iterator it2 = valid_ans_list.begin(); it2 != valid_ans_list.end(); ++it2) int size = valid_ans_list->size(); it0->push_back((*valid_ans_list)[0]); int begin = 1; if (!if_new_start && size > 1) { this->add_new_to_results(it0, (*valid_ans_list)[1]); if_new_start = true; //this->new_start = this->current_table.rbegin().base(); this->new_start = this->current_table.end(); this->new_start--; //-1 is not allowed begin = 2; } for (int i = begin; i < size; ++i) { //WARN+NOTICE:this strategy may cause that duplicates are not together! this->add_new_to_results(it0, (*valid_ans_list)[i]); } it0++; } else { it0 = this->current_table.erase(it0); #ifdef DEBUG_JOIN //printf("this record is not matched!\n"); cerr << "this record is not matched!" 
<< endl; #endif } delete valid_ans_list; valid_ans_list = NULL; } return found; } bool Join::if_prepare_idlist(int _can_list_size, bool _is_literal) { if (!_is_literal && _can_list_size < Join::LIMIT_CANDIDATE_LIST_SIZE) return true; else return false; } void Join::add_id_pos_mapping(int _id) { this->pos2id[this->id_pos] = _id; this->id2pos[_id] = this->id_pos; this->id_pos++; } void Join::reset_id_pos_mapping() { memset(this->id2pos, -1, sizeof(int) * this->var_num); memset(this->pos2id, -1, sizeof(int) * this->var_num); this->id_pos = 0; } //BETTER+QUERY:why this more costly in some query containing literal vars? //should not filter for literal var and just generate when join? //QUERY:is the allFilterBySatellites sometimes costly if candidate list is too large? //in this case we can join first and filter by edge later //TODO:check the time of each part in bsbm_100000, self5.sql, self6.sql bool Join::multi_join() { this->select(); //keep an increasing vector for temp results, not in id order //vals num generally < 10, so just enum them and check if conncted //finally, copy in order to result_list in BasicQuery TableIterator it0; list::iterator it1; vector::iterator it2; //list::iterator it3; //BETTER:filter all vertices first by allFilterByPres first and then select the minium? //QUERY+TODO:literal var not suitable for joining first! 
//The best strategy is to ensure that for each record, all satellite edges exist //then after join all core vertices, generate candidates for each satellite //and these are just the final accurate answer //It's out of question better than generating candidates for satellites now // //NOTICE:this should be done just once, so use it before pushing candidates //pruning the original candidates first(satellites only concerned with itself) //this->filterBySatellites(this->start_id); IDList& start_table = this->basic_query->getCandidateList(this->start_id); int start_size = this->basic_query->getCandidateSize(this->start_id); #ifdef DEBUG_JOIN cerr << "the start size " << start_size << endl; #endif for (int i = 0; i < start_size; ++i) { int ele = start_table.getID(i); RecordType record(1, ele); this->current_table.push_back(record); //this->table_row_new.push_back(false); } this->add_id_pos_mapping(this->start_id); //cout<<"the mapping is id "<start_id<<" and pos "<id2pos[this->start_id]<new_start = this->current_table.end(); //BETTER?:we can use nodes in stack to consider links instead of //nodes in current_table, but this needs the stack to be visited //below top, requiring us to implement on our own(array/vector) //DEBUG: var_num > 100, maybe using vector, increasing dynamicly //int mystack[100]; //int top = -1; //mystack[++top] = this->start_id; // //if using nodes in current_table to consider links, no [] //can be used(except changing to vector, but wasteful) //and then visit eles below top in stack is not ok, //so choose STL stack this->mystack.push(this->start_id); #ifdef DEBUG_JOIN //fprintf(stderr, "now to start the stack loop\n"); cerr << "now to start the stack loop" << endl; #endif while (!this->mystack.empty()) { int id = this->mystack.top(); #ifdef DEBUG_JOIN //fprintf(stderr, "the current id: %d\n", id); cerr << "the current id: " << id << endl; #endif //int id = mystack[top]; int maxi = this->choose_next_node(id); if (maxi == -1) //all edges of this node 
are dealed { #ifdef DEBUG_JOIN //fprintf(stderr, "the node is totally dealed: %d\n", id); cerr << "the node is totally dealed: " << id << endl; #endif //top--; this->mystack.pop(); continue; } int id2 = this->basic_query->getEdgeNeighborID(id, maxi); #ifdef DEBUG_JOIN //fprintf(stderr, "the next node id to join: %d\n", id2); cerr << "the next node id to join: " << id2 << endl; #endif //this->filterBySatellites(id2); #ifdef DEBUG_JOIN cerr << "the start size " << this->basic_query->getCandidateSize(id2) << endl; #endif //pre_id == -1 means we cannot find such predicate in rdf file, so the result set of this sparql should be empty. //note that we cannot support to query sparqls with predicate variables ?p. //TODO: if all missed?! //preid < 0 ! //if(id_list[cnt].empty()) //{ // ifEmpty = true; // break; //} vector edges; //the edge index for table column in id2 // the outer is node-loop, inner is canlist-loop vector< vector > id_lists; vector< vector > id_lists_len; //int* tmp_id_list; //int tmp_id_list_len; IDList& can_list = this->basic_query->getCandidateList(id2); int can_list_size = can_list.size(); for (int i = 0; i < this->id_pos; ++i) { int edge_index = this->basic_query->getEdgeIndex(id2, this->pos2id[i]); edges.push_back(edge_index); } //NOTICE: there are several ways to join two tables //h is the cost to search kvstore, m is the returned list size //n is the normal can_list_size, k is the vars num to //consider now, r is the record num //0. expand and intersect with another table: not ok! //1. given two node to find if exist right pre: //O(1) space, O(rhknlogn) time, //2. bsearch in can_list: O(mk+n) space, O(rmkhlogn) time //3. 
bsearch in id_list: O(nkm) space, O(rnklogm+knh) // //most queries will contain many constants(entity/literal) //var's can_list with one constant neighbor will be small, //otherwise will be big compared with id_list //the can_list of var representing literals is not valid, //must use kvstore->get...() to join bool is_literal = this->is_literal_var(id2); if (is_literal) { #ifdef DEBUG_PRECISE //fprintf(stderr, "this var may contain literals: %d\n", id2); cerr << "this var may contain literals: " << id2 << endl; #endif this->basic_query->setReady(id2); } else { #ifdef DEBUG_PRECISE //fprintf(stderr, "this var not contain literals: %d\n", id2); cerr << "this var not contain literals: " << id2 << endl; #endif } bool flag = false; bool if_prepare = this->if_prepare_idlist(can_list_size, is_literal); //#ifdef DEBUG_JOIN if_prepare = false; //#endif //needed if place can_list in the outer loop to join if (if_prepare) { #ifdef DEBUG_PRECISE //fprintf(stderr, "this edge uses prepared-join way\n"); cerr << "this edge uses prepared-join way" << endl; #endif this->acquire_all_id_lists(id_lists, id_lists_len, can_list, edges, id2, can_list_size); flag = this->new_join_with_multi_vars_prepared(id_lists, id_lists_len, edges, can_list, can_list_size); //need to release id_lists if using acquire_all_id_lists() firstly for (vector< vector >::iterator p1 = id_lists.begin(); p1 != id_lists.end(); ++p1) { for (vector::iterator p2 = p1->begin(); p2 != p1->end(); ++p2) { delete[] * p2; } } } else { #ifdef DEBUG_PRECISE //fprintf(stderr, "this edge uses not-prepared-join way\n"); cerr << "this edge uses not-prepared-join way" << endl; #endif flag = this->new_join_with_multi_vars_not_prepared(edges, can_list, can_list_size, id2, is_literal); } //if current_table is empty, ends directly if (!flag) { #ifdef DEBUG_JOIN //fprintf(stderr, "the result is already empty!!\n"); cerr << "the result is already empty!!" 
<< endl; #endif //break; return false; //to avoid later invalid copy } for (int i = 0; i < this->id_pos; ++i) { int edge_index = edges[i]; if (edge_index != -1) { int edge_id = this->basic_query->getEdgeID(id2, edge_index); dealed_triple[edge_id] = true; } } this->new_start = this->current_table.end(); this->add_id_pos_mapping(id2); this->mystack.push(id2); } #ifdef DEBUG_JOIN //fprintf(stderr, "now end the stack loop\n"); cerr << "now end the stack loop" << endl; #endif //BETTER?:though the whole current_table is ordered here, the //selected columns are not definitely ordered, needing to be //sorted at the end. We can join based on the selected var's //candidate to ensure the order, but this may be complicated. //If we want to ensure the order here, new table is a must! //and the duplicates cannot be checked unless the last step! //The result list will not be too large generally, and the sort //is not in any loop.(but if the size is too large?) return true; } //=================================================================================================== //Below are functions to do index-join method //=================================================================================================== void Join::buildIndexLists() { this->index_lists = new IndexList[this->var_num]; for (int i = 0; i < this->var_num; ++i) { IDList& can_list = this->basic_query->getCandidateList(i); int can_list_size = can_list.size(); for (int j = 0; j < can_list_size; ++j) { this->index_lists[i].candidates.push_back(IndexItem(can_list[j])); } this->index_lists[i].candidates.push_back(IndexItem()); this->index_lists[i].border = this->index_lists[i].candidates.end(); this->index_lists[i].border--; } } //NOTICE: list of _id1 is all ok, but maybe add literals for list of _id2 bool Join::index_link(int _nid, int _idx) { int _id1 = _nid, _id2 = this->basic_query->getEdgeNeighborID(_nid, _idx); bool is_literal = this->is_literal_var(_id2); list& can1 = 
this->index_lists[_id1].candidates; list& can2 = this->index_lists[_id2].candidates; this->index_lists[_id1].travel_map.push_back(_id2); //all set to false first, later change to valid if ok for (ItemListIterator it = can1.begin(); it != this->index_lists[_id1].border; ++it) { it->isValid = false; } for (ItemListIterator it = can2.begin(); it != this->index_lists[_id2].border; ++it) { it->isValid = false; } for (ItemListIterator it = can1.begin(); it != this->index_lists[_id1].border; ++it) { int edge_type = this->basic_query->getEdgeType(_id1, _idx); int pre_id = this->basic_query->getEdgePreID(_id1, _idx); int* id_list; int id_list_len; if (edge_type == Util::EDGE_IN) { #ifdef DEBUG_JOIN //fprintf(stderr, "this is an edge to our id to join!\n"); cerr << "this is an edge to our id to join!" << endl; #endif this->kvstore->getobjIDlistBysubIDpreID(it->value, pre_id, id_list, id_list_len, true); } else { #ifdef DEBUG_JOIN //fprintf(stderr, "this is an edge from our id to join!\n"); cerr << "this is an edge from our id to join!" << endl; #endif this->kvstore->getsubIDlistByobjIDpreID(it->value, pre_id, id_list, id_list_len, true); } if (id_list_len == 0) { //id_list is NULL in this case #ifdef DEBUG_JOIN //fprintf(stderr, "this id_list is empty!\n"); cerr << "this id_list is empty!" << endl; #endif continue; } it->travel.push_back(IteratorList()); for (int i = 0; i < id_list_len; ++i) { //if we found this element(entity/literal) in var1's candidate list, or this is a literal //element and var2 is a free literal variable, we should add this one to result. bool flag = false; ItemListIterator ret; if (Util::is_literal_ele(id_list[i])) { //NOTICE:literals cannot exist in the result from VStree, so no need to search //if added already, then the expression in if() returns false if (is_literal) { //QUERY:maybe same one between different records, and should be dealed to be ordered! 
flag = true; //BETTER?:the adding way is due to the not-sort and not-binary search method can2.push_back(IndexItem(id_list[i])); ret = --can2.end(); #ifdef DEBUG_JOIN //fprintf(stderr, "to add literal for free variable!\n"); cerr << "to add literal for free variable!" << endl; #endif } } else { //BETTER:currently we can search in the candidate list, but the iterator? ret = this->index_lists[_id2].search(id_list[i]); if (ret != this->index_lists[_id2].border) flag = true; } if (!flag) continue; //printf("add the ele to list!\n"); it->isValid = true; it->travel[it->travel.size() - 1].push_back(ret); ret->isValid = true; } delete[] id_list; } //deal with invalid eles in can1 and can2 for (ItemListIterator it = can1.begin(); it != this->index_lists[_id1].border;) { if (it->isValid) it++; else { //BETTER:no need to add to end if the start list can1.push_back(*it); it = can1.erase(it); } } for (ItemListIterator it = can2.begin(); it != this->index_lists[_id2].border;) { if (it->isValid) it++; else { //NOTICE:here we can removve directly it = can2.erase(it); } } //NOTICE:we deal with possible literals in the final to avoid meaningless search before //The list returned by get... 
is sorted, and all literal id > vertex id,, so if needing //sort, nonsense to compare the latter part with the former part //However, there maybe different ordered literals lists to be added, so the order won't be //naturally kept if (this->index_lists[_id2].border != --can2.end()) { //adjust the border in can2, also due to the structure and unsort,, not-binary search method can2.erase(this->index_lists[_id2].border); this->index_lists[_id2].border = --can2.end(); } return true; } //_nid is the newer bool Join::index_filter(int _nid, int _idx) { //TODO:the two lists are allok, only remove no add int _id1 = _nid, _id2 = this->basic_query->getEdgeNeighborID(_nid, _idx); bool is_literal = this->is_literal_var(_id2); list& can1 = this->index_lists[_id1].candidates; list& can2 = this->index_lists[_id2].candidates; this->index_lists[_id1].travel_map.push_back(_id2); //QUERY:how about the search? nonsense to search directly in the candidate list! //or search reversely in the list returned by get...? //Notice that the newer one maybe not ordered! // //TODO:the literals should join the filter process? //consider to reset the border of list after filter!!! 
// //reverse to filter, searching in objlist, which is generated by the smaller one, and this is ordered return true; } bool Join::table_travel(int _id1, int _id2) { //NOTICE: all is ok if in valid area, just travel and link the two return true; } bool Join::table_check(int _id1, int _id2) { //NOTICE: need to verify the linking, but exist-question is many and frustrating return true; } bool Join::travel_init(int _lid) { if (this->index_lists[_lid].prepared) return true; int size = this->index_lists[_lid].travel_map.size(); if (size == 0) { this->index_lists[_lid].prepared = true; return true; } list& can = this->index_lists[_lid].candidates; for (list::iterator it = can.begin(); it != this->index_lists[_lid].border;) { //deal with invalid eles according to neighbor list for (int i = 0; i < size; ++i) { int tid = this->index_lists[_lid].travel_map[i]; if (this->travel_init(tid) == false) { return false; } list< list::iterator >& next = it->travel[i]; for (list< list::iterator >::iterator it2 = next.begin(); it2 != next.end();) { if ((*it2)->isValid == false) { it2 = next.erase(it2); } else { it2++; } } if (next.empty()) { it->isValid = false; break; } } if (it->isValid) { it++; } else { IndexItem tmp = *it; it = can.erase(it); can.push_back(tmp); } } if (can.begin() == this->index_lists[_lid].border) return false; //BETTER:remove all invalid eles in the next lists now this->index_lists[_lid].prepared = true; return true; } //WARN:we nned to use IndexItem iterator/object instead of int in current table now, //NOTICE: this travel strategy is based on the relation between two index lists //otherwise things not work.. So, this strategy is not used for now. 
bool Join::index_travel_two() { if (this->travel_init(this->start_id) == false) return false; //reuse mystack because by now the stack is already empty this->reset_id_pos_mapping(); this->mystack.push(this->start_id); this->add_id_pos_mapping(this->start_id); //init the current table with the start index list list& can = this->index_lists[this->start_id].candidates; for (list::iterator it = can.begin(); it != this->index_lists[this->start_id].border; ++it) { RecordType record(1, it->value); this->current_table.push_back(record); } //fprintf(stderr, "now to travel and store in current table\n"); cerr << "now to travel and store in current table" << endl; while (!this->mystack.empty()) { int id = this->mystack.top(); //fprintf(stderr, "the current id: %d\n", id); cerr << "the current id: " << id << endl; if (this->index_lists[id].end()) //all linking of this index list are travelled { //fprintf(stderr, "the list is totally dealed: %d\n", id); cerr << "the list is totally dealed: " << id << endl; this->mystack.pop(); continue; } int id2 = this->index_lists[id].next(); //fprintf(stderr, "the next list id to travel: %d\n", id2); cerr << "the next list id to travel: " << id2 << endl; bool flag = true; //NOTICE: we assume that the former node is all ok, scanning it to border flag = this->table_travel(id, id2); //if already empty(fail to link the two lists), ends directly if (!flag) { //fprintf(stderr, "the result is already empty!!\n"); cerr << "the result is already empty!!" << endl; return false; //to avoid later invalid copy } int size = this->index_lists[id2].check_map.size(); for (int i = 0; i < size; ++i) { flag = this->table_check(id2, this->index_lists[id2].check_map[i]); if (!flag) { //fprintf(stderr, "the result is already empty!!\n"); cerr << "the result is already empty!!" 
<< endl; return false; //to avoid later invalid copy } } this->add_id_pos_mapping(id2); this->mystack.push(id2); } return true; } //NOTICE: this strtegy is based on one-line travesal, so prepare one iterator for each index list bool Join::index_travel_one() { //TODO return true; } bool Join::index_travel() { return this->index_travel_one(); } bool Join::index_join() { this->buildIndexLists(); //OPTION: remove contents in candidate_list now(which are originally removed in BasicQuery::clear()) this->select(); this->mystack.push(this->start_id); this->add_id_pos_mapping(this->start_id); //fprintf(stderr, "now to start the stack loop\n"); cerr << "now to start the stack loop" << endl; while (!this->mystack.empty()) { int id = this->mystack.top(); //fprintf(stderr, "the current id: %d\n", id); cerr << "the current id: " << id << endl; int maxi = this->choose_next_node(id); if (maxi == -1) //all edges of this node are dealed { //fprintf(stderr, "the node is totally dealed: %d\n", id); cerr << "theh node is totally dealed: " << id << endl; this->mystack.pop(); continue; } int id2 = this->basic_query->getEdgeNeighborID(id, maxi); //fprintf(stderr, "the next node id to join: %d\n", id2); cerr << "the next node id to join: " << id2 << endl; IDList& can_list = this->basic_query->getCandidateList(id2); //int can_list_size = can_list.size(); bool is_literal = this->is_literal_var(id2); if (is_literal) //fprintf(stderr, "this var may contain literals: %d\n", id2); cerr << "this var may contain literals: " << id2 << endl; else //fprintf(stderr, "this var not contain literals: %d\n", id2); cerr << "this var not contain literals: " << id2 << endl; bool flag = true; //NOTICE: we assume that the former node is all ok, scanning it to border //flag = this->index_link(id, id2); flag = this->index_link(id, maxi); //if already empty(fail to link the two lists), ends directly if (!flag) { //fprintf(stderr, "the result is already empty!!\n"); cerr << "the result is already empty!!" 
<< endl; return false; //to avoid later invalid copy } int edge_index, edge_id = this->basic_query->getEdgeID(id, maxi); this->dealed_triple[edge_id] = true; for (int i = 0; i < this->id_pos; ++i) { if (this->pos2id[i] == id) continue; edge_index = this->basic_query->getEdgeIndex(id2, this->pos2id[i]); if (edge_index == -1) continue; //flag = this->index_filter(id2, this->pos2id[i]); flag = this->index_filter(id2, edge_index); //if already empty(fail to link the two lists), ends directly if (!flag) { //fprintf(stderr, "the result is already empty!!\n"); cerr << "the result is already empty!!" << endl; return false; //to avoid later invalid copy } edge_id = this->basic_query->getEdgeID(id2, edge_index); this->dealed_triple[edge_id] = true; } this->add_id_pos_mapping(id2); this->mystack.push(id2); } //fprintf(stderr, "now end the stack loop\n"); cerr << "now end the stack loop" << endl; // To travel and store in current_table, then do the last filter if (this->index_travel() == false) return false; //printf("now to filter through only_pre_filter_after_join\n"); cerr << "now to filter through only_pre_filter_after_join" << endl; this->only_pre_filter_after_join(); //copy to result list, adjust the vars order vector& result_list = this->basic_query->getResultList(); result_list.clear(); int select_var_num = this->basic_query->getSelectVarNum(); for (TableIterator it0 = this->current_table.begin(); it0 != this->current_table.end(); ++it0) { int* record = (int*)malloc(sizeof(int) * select_var_num); for (int i = 0; i < this->id_pos; ++i) { if (this->pos2id[i] < select_var_num) record[this->pos2id[i]] = (*it0)[i]; } result_list.push_back(record); } return true; } //=================================================================================================== //Below are functions before or after Join //=================================================================================================== //sort the candidate lists and deal with all constant neigbors bool 
Join::filter_before_join() { //fprintf(stderr, "*****IIIIIIN filter_before_join\n"); cerr << "*****IN filter_before_join" << endl; for (int i = 0; i < this->var_num; i++) { bool flag = this->basic_query->isLiteralVariable(i); //fprintf(stderr, "\tVar%d %s\n", i, this->basic_query->getVarName(i).c_str()); cerr << "\tVar" << i << " " << this->basic_query->getVarName(i) << endl; IDList &can_list = this->basic_query->getCandidateList(i); //fprintf(stderr, "\t\tsize of canlist before filter: %d\n", can_list.size()); cerr << "\t\tsize of canlist before filter: " << can_list.size() << endl; //NOTICE:must sort before using binary search. can_list.sort(); long begin = Util::get_cur_time(); bool ret = this->constant_edge_filter(i); long after_constant_edge_filter = Util::get_cur_time(); //fprintf(stderr, "\t\tconstant_edge_filter: used %ld ms\n", after_constant_edge_filter - begin); cerr << "\t\tconstant_edge_filter: used " << (after_constant_edge_filter - begin) << " ms" << endl; // this->preid_filter(this->basic_query, i); // long after_preid_filter = Util::get_cur_time(); //cout << "\t\tafter_preid_filter: used " << (after_preid_filter-after_literal_edge_filter) << " ms" << endl; //fprintf(stderr, "\t\t[%d] after filter, candidate size = %d\n\n\n", i, can_list.size()); cerr << "\t\t[" << i << "] after filter, candidate size= " << can_list.size() << endl << endl << endl; //debug // { // stringstream _ss; // for(int i = 0; i < can_list.size(); i ++) // { // string _can = this->kvstore->getEntityByID(can_list[i]); // _ss << "[" << _can << ", " << can_list[i] << "]\t"; // } // _ss << endl; // Util::logging(_ss.str()); // cout << can_list.to_str() << endl; // } if (!flag && !ret) //already empty { return false; } } //fprintf(stderr, "OOOOOOUT filter_before_join\n"); cerr << "OUT filter_before_join" << endl; return true; } //decrease the candidates of _var_i using its constant neighbors bool Join::constant_edge_filter(int _var_i) { //Util::logging("IN literal_edge_filter"); 
//debug int var_degree = this->basic_query->getVarDegree(_var_i); IDList &_list = this->basic_query->getCandidateList(_var_i); for (int j = 0; j < var_degree; j++) { int neighbor_id = this->basic_query->getEdgeNeighborID(_var_i, j); //fprintf(stderr, "\t\t\tneighbor_id=%d\n", neighbor_id); cerr << "\t\t\tneighbor_id=" << neighbor_id << endl; if (neighbor_id != -1) //variables in join not considered here { continue; } char edge_type = this->basic_query->getEdgeType(_var_i, j); int triple_id = this->basic_query->getEdgeID(_var_i, j); Triple triple = this->basic_query->getTriple(triple_id); string neighbor_name; if (edge_type == Util::EDGE_OUT) { neighbor_name = triple.object; } else { neighbor_name = triple.subject; } //NOTICE: this is another case, vars not in join, we only need constants bool only_preid_filter = (this->basic_query->isOneDegreeNotJoinVar(neighbor_name)); if (only_preid_filter) { continue; } int pre_id = this->basic_query->getEdgePreID(_var_i, j); int lit_id = (this->kvstore)->getIDByEntity(neighbor_name); if (lit_id == -1) { lit_id = (this->kvstore)->getIDByLiteral(neighbor_name); } // cout << "\t\tedge[" << j << "] "<< lit_string << " has id " << lit_id << ""; // cout << " preid:" << pre_id << " type:" << edge_type // << endl; // { // stringstream _ss; // _ss << "\t\tedge[" << j << "] "<< lit_string << " has id " << lit_id << ""; // _ss << " preid:" << pre_id << " type:" << edge_type // << endl; // Util::logging(_ss.str()); // } int id_list_len = 0; int* id_list = NULL; if (pre_id >= 0) { if (edge_type == Util::EDGE_OUT) { (this->kvstore)->getsubIDlistByobjIDpreID(lit_id, pre_id, id_list, id_list_len, true); } else { (this->kvstore)->getobjIDlistBysubIDpreID(lit_id, pre_id, id_list, id_list_len, true); } } else if (pre_id == -2) { if (edge_type == Util::EDGE_OUT) { (this->kvstore)->getsubIDlistByobjID(lit_id, id_list, id_list_len, true); } else { (this->kvstore)->getobjIDlistBysubID(lit_id, id_list, id_list_len, true); } } else // pre_id == -1 
means we cannot find such predicate in rdf file, so the result set of this sparql should be empty. // note that we cannot support to query sparqls with predicate variables ?p. { id_list_len = 0; // if (edge_type == Util::EDGE_OUT) // { // (this->kvstore)->getsubIDlistByobjID(lit_id, id_list, id_list_len); // } // else // { // (this->kvstore)->getobjIDlistBysubID(lit_id, id_list, id_list_len); // } } //debug // { // stringstream _ss; // _ss << "id_list: "; // for (int i=0;ivar_num; i++) { //debug //{ // stringstream _ss; // _ss << "var[" << i << "]\t"; // if (this->basic_query->isLiteralVariable(i)) // { // _ss << "may have literal result."; // } // else // { // _ss << "do not have literal result."; // } // _ss << endl; // //Util::logging(_ss.str()); //} //if(!this->basic_query->isLiteralVariable(i)) //{ //// if this variable is not literal variable, we can assume that its literal candidates have been added. //this->basic_query->setAddedLiteralCandidate(i); //continue; //} if (this->basic_query->isReady(i)) { continue; } if (this->basic_query->isSatelliteInJoin(i)) { continue; } // for these literal variable without any linking entities(we call free literal variable), // we will add their literal candidates when join-step. if (this->basic_query->isFreeLiteralVariable(i)) { continue; } int var_id = i; int var_degree = this->basic_query->getVarDegree(var_id); IDList literal_candidate_list; bool flag = false; // intersect each edge's literal candidate. for (int j = 0; j < var_degree; j++) { int neighbor_id = this->basic_query->getEdgeNeighborID(var_id, j); int predicate_id = this->basic_query->getEdgePreID(var_id, j); int triple_id = this->basic_query->getEdgeID(var_id, j); Triple triple = this->basic_query->getTriple(triple_id); string neighbor_name = triple.subject; IDList this_edge_literal_list; // if the neighbor of this edge is an entity, we can add all literals which has an exact predicate edge linking to this entity. 
if (neighbor_id == -1) { int subject_id = (this->kvstore)->getIDByEntity(neighbor_name); int* object_list = NULL; int object_list_len = 0; if (predicate_id >= 0) (this->kvstore)->getobjIDlistBysubIDpreID(subject_id, predicate_id, object_list, object_list_len, true); else if (predicate_id == -2) { this->kvstore->getobjIDlistBysubID(subject_id, object_list, object_list_len, true); } //NOTICE:only literals should be unioned this_edge_literal_list.unionList(object_list, object_list_len, true); delete[]object_list; } // if the neighbor of this edge is variable, then the neighbor variable can not have any literal results, // we should add literals when join these two variables, see the Database::join function for details. // deprecated... // if the neighbor of this edge is variable, we should add all this neighbor variable's candidate entities' neighbor literal, // which has one corresponding predicate edge linking to this variable. else { continue; /* IDList& neighbor_candidate_list = this->basic_query->getCandidateList(neighbor_id); int neighbor_candidate_list_size = neighbor_candidate_list.size(); for (int k = 0;k < neighbor_candidate_list_size; k ++) { int subject_id = neighbor_candidate_list.getID(k); int* object_list = NULL; int object_list_len = 0; (this->kvstore)->getobjIDlistBysubIDpreID(subject_id, predicate_id, object_list, object_list_len); this_edge_literal_list.unionList(object_list, object_list_len); delete []object_list; } */ } if (!flag) { flag = true; literal_candidate_list.unionList(this_edge_literal_list); } else { literal_candidate_list.intersectList(this_edge_literal_list); } } // add the literal_candidate_list to the original candidate list. 
IDList& origin_candidate_list = this->basic_query->getCandidateList(var_id); //int origin_candidate_list_len = origin_candidate_list.size(); origin_candidate_list.unionList(literal_candidate_list, true); //int after_add_literal_candidate_list_len = origin_candidate_list.size(); // this variable's literal candidates have been added. //this->basic_query->setAddedLiteralCandidate(var_id); this->basic_query->setReady(var_id); //{ //stringstream _ss; //_ss << "var[" << var_id << "] candidate list after add literal:\t" //<< origin_candidate_list_len << "-->" << after_add_literal_candidate_list_len << endl; /* for (int i = 0; i < after_add_literal_candidate_list_len; i ++) { int candidate_id = origin_candidate_list.getID(i); string candidate_name; if (i < origin_candidate_list_len) { candidate_name = (this->kvstore)->getEntityByID(origin_candidate_list.getID(i)); } else { candidate_name = (this->kvstore)->getLiteralByID(origin_candidate_list.getID(i)); } _ss << candidate_name << "(" << candidate_id << ")\t"; } */ //Util::logging(_ss.str()); //} } //Util::logging("OUT add_literal_candidate"); } //NOTICE:I think we should use this instead of only_pre_filter_after_join //this function not only consider satellite predicates, but also one degree not selected var and other vars in join //(constants ar enot necessary considered here) //this check is fast because predicate num is small, but the performance can be very good //(instead of filter when joining, we do a precheck first!) 
bool Join::allFilterByPres() { //NOTICE:this check is a must to ensure that we can get all right answers //for core vertices after join, then we can generate satellites directly for (int i = 0; i < this->var_num; ++i) { if (this->basic_query->isSatelliteInJoin(i)) continue; if (this->filterBySatellites(i) == false) return false; } return true; } //NOTICE:we should only consider satellites, because constant neighbor edges are already dealed //and edge in join can be dealed quicked if not satisfy by sp2o or op2s bool //false when no result for this basicquery Join::filterBySatellites(int _var) { //TODO:not consider already dealed edge //TODO:not always filter by pres first, maybe not efficient //if cans size is small, then can precise to avoid burst-increment when deep-join //if size is very large, the cost is high and not many can be filtered! //(keep state for each one-degree node, if considered) IDList& cans = this->basic_query->getCandidateList(_var); int size = this->basic_query->getCandidateSize(_var); //result if already empty for non-literal variable if (size == 0 && !is_literal_var(_var)) return false; int var_degree = this->basic_query->getVarDegree(_var); vector in_edge_pre_id; vector out_edge_pre_id; for (int i = 0; i < var_degree; i++) { char edge_type = this->basic_query->getEdgeType(_var, i); int triple_id = this->basic_query->getEdgeID(_var, i); Triple triple = this->basic_query->getTriple(triple_id); string neighbor; if (edge_type == Util::EDGE_OUT) { neighbor = triple.object; } else { neighbor = triple.subject; } //not consider edge with constant neighbors here if (neighbor[0] != '?') { //cerr << "not to filter: " << neighbor_name << endl; continue; } //else //cerr << "need to filter: " << neighbor_name << endl; int pre_id = this->basic_query->getEdgePreID(_var, i); //WARN+BETTER:invalid(should be discarded in Query) or ?p(should not be considered here) if (pre_id < 0) { continue; } if (edge_type == Util::EDGE_OUT) { 
out_edge_pre_id.push_back(pre_id); } else { in_edge_pre_id.push_back(pre_id); } } //BETTER:maybe several duplicates of predicates //use set instead? if (in_edge_pre_id.empty() && out_edge_pre_id.empty()) { return true; } //QUERY:maybe we can divide edges into two separate groups according to the size of p2s //NOTICE+BETTER: the cost should be due to the cans size, p2s size and s2p size //generally, size of p2s is larger than s2p, but smaller than size of cans //The best way is to extract the features of dataset and keep //but we may use a simple strategy here: use p2s if cans size is too large, i.e. > size of p2s //(assuming 5000 here) //WARN:different edge may corresponding different size of subjects, like is too large //QUERY: erase is too costly, use an invalid[] array, maybe bitset due to large candidates size //only consider valid ones when join loop, but how about intersect and union? // //we build a new idlist with all valid ones, and update to the original idlist //(consider in current_table is not good, too many duplicates) //IDList* valid_list = NULL; //int *list = NULL; //int len = 0; ////TODO+BETTER:o2p and s2p may duplicate many times //if (!in_edge_pre_id.empty()) //{ // int size2 = in_edge_pre_id.size(); // for (int i = 0; i < size2; ++i) // { // int preid = in_edge_pre_id[i]; // this->kvstore->getobjIDlistBypreID(preid, list, len); // if (i == 0) // { // if (size > len) // { // valid_list = IDList::intersect(cans, list, len); // } // else // { // valid_list = new IDList; // int* list2 = NULL; // int len2 = 0; // for (int j = 0; j < size; ++j) // { // this->kvstore->getpreIDlistByobjID(cans[j], list2, len2); // if (Util::bsearch_int_uporder(preid, list2, len2) != -1) // { // valid_list->addID(cans[j]); // } // delete[] list2; // } // } // } // else // { // if (valid_list->size() > len) // { // valid_list->intersectList(list, len); // } // else // { // int* list2 = NULL; // int len2 = 0; // IDList* new_list = new IDList; // int size3 = 
valid_list->size(); // for (int j = 0; j < size3; ++j) // { // this->kvstore->getpreIDlistByobjID(valid_list->getID(j), list2, len2); // if (Util::bsearch_int_uporder(preid, list2, len2) != -1) // { // new_list->addID(valid_list->getID(j)); // } // delete[] list2; // } // delete valid_list; // valid_list = new_list; // } // } // delete[] list; // } //} //if (!is_literal_var(_var) && valid_list != NULL && valid_list->empty()) //{ // //cerr << "quit when empty in edge"<kvstore->getsubIDlistBypreID(preid, list, len); // //cerr<<"p2s len "< len) // { // valid_list = IDList::intersect(cans, list, len); // } // else // { // valid_list = new IDList; // int* list2 = NULL; // int len2 = 0; // for (int j = 0; j < size; ++j) // { // this->kvstore->getpreIDlistBysubID(cans[j], list2, len2); // if (Util::bsearch_int_uporder(preid, list2, len2) != -1) // { // valid_list->addID(cans[j]); // } // delete[] list2; // } // } // } // else // { // if (valid_list->size() > len) // { // valid_list->intersectList(list, len); // } // else // { // int* list2 = NULL; // int len2 = 0; // IDList* new_list = new IDList; // int size3 = valid_list->size(); // for (int j = 0; j < size3; ++j) // { // this->kvstore->getpreIDlistBysubID(valid_list->getID(j), list2, len2); // if (Util::bsearch_int_uporder(preid, list2, len2) != -1) // { // new_list->addID(valid_list->getID(j)); // } // delete[] list2; // } // delete valid_list; // valid_list = new_list; // } // } // delete[] list; // } //} //if (!is_literal_var(_var) && valid_list->empty()) //{ // //cerr << "quit when empty out edge"< valid_idlist; for(int i = 0; i < size; ++i) { int ele = cans[i]; int* list = NULL; int list_len = 0; bool exist_preid = true; if(exist_preid && !in_edge_pre_id.empty()) { //(this->kvstore)->getpreIDsubIDlistByobjID(entity_id, pair_list, pair_len); (this->kvstore)->getpreIDlistByobjID(ele, list, list_len, true); for(vector::iterator itr_pre = in_edge_pre_id.begin(); itr_pre != in_edge_pre_id.end(); itr_pre++) { int pre_id 
= (*itr_pre); //the return value is pos, -1 if not found if(Util::bsearch_int_uporder(pre_id, list, list_len) == -1) exist_preid = false; if(!exist_preid) { break; } } delete[] list; } //NOTICE:we do not use intersect here because the case is a little different //first the pre num is not so much in a query //second once a pre in query is not found, break directly if(exist_preid && !out_edge_pre_id.empty()) { //(this->kvstore)->getpreIDobjIDlistBysubID(entity_id, pair_list, pair_len); (this->kvstore)->getpreIDlistBysubID(ele, list, list_len, true); for(vector::iterator itr_pre = out_edge_pre_id.begin(); itr_pre != out_edge_pre_id.end(); itr_pre++) { int pre_id = (*itr_pre); if(Util::bsearch_int_uporder(pre_id, list, list_len) == -1) exist_preid = false; if(!exist_preid) { break; } } delete[] list; } //result sequence is illegal when there exists any missing filter predicate id. if(exist_preid) { valid_idlist.push_back(ele); } } //this is a core vertex, so if not literal var, exit when empty if(!is_literal_var(_var) && valid_idlist.empty()) { return false; } cans.copy(valid_idlist); cerr << "var " << _var << "size after pre_filter " << cans.size() << endl; return true; } //if neighbor is an var, but not in select //then, if its degree is 1, it has none contribution to filter //only its sole edge property(predicate) makes sense //we should make sure that current candidateVar has an edge matching the predicate bool Join::only_pre_filter_after_join() { for (int var_id = 0; var_id < this->var_num; var_id++) { int var_degree = this->basic_query->getVarDegree(var_id); //get all the only predicate filter edges for this variable. vector in_edge_pre_id; vector out_edge_pre_id; for (int i = 0; i < var_degree; i++) { //WARN:one degree not in select var's id is also -1 !! 
//constant neighbors already be dealed in literal_edge_filter //if(this->basic_query->getEdgeNeighborID(var_id, i) == -1) //continue; char edge_type = this->basic_query->getEdgeType(var_id, i); int triple_id = this->basic_query->getEdgeID(var_id, i); Triple triple = this->basic_query->getTriple(triple_id); string neighbor_name; if (edge_type == Util::EDGE_OUT) { neighbor_name = triple.object; } else { neighbor_name = triple.subject; } bool only_preid_filter = (this->basic_query->isOneDegreeNotJoinVar(neighbor_name)); if (!only_preid_filter) { //cerr << "not to filter: " << neighbor_name << endl; continue; } //else //cerr << "need to filter: " << neighbor_name << endl; int pre_id = this->basic_query->getEdgePreID(var_id, i); if (pre_id < 0) { continue; } if (edge_type == Util::EDGE_OUT) { out_edge_pre_id.push_back(pre_id); } else { in_edge_pre_id.push_back(pre_id); } } if (in_edge_pre_id.empty() && out_edge_pre_id.empty()) { continue; } for (TableIterator it = this->current_table.begin(); it != this->current_table.end();) { int entity_id = (*it)[this->id2pos[var_id]]; int* pair_list = NULL; int pair_len = 0; bool exist_preid = true; //NOTICE: four ways to judge if the predicates exist //getpreIDsubIDlistByobjID getpreIDobjIDlistBysubID //getsubIDlistBypreIDobjID getobjIDlistBysubIDpreID //I think the best one is: getpreIDlistBysubID getpreIDlistByobjID //how about getsubIDlistBypreID getobjIDlistBypreID // //the predicates in query can not be too large, so just loop //you can also use an intersect one if the two ordered list are both large if (exist_preid && !in_edge_pre_id.empty()) { //(this->kvstore)->getpreIDsubIDlistByobjID(entity_id, pair_list, pair_len); (this->kvstore)->getpreIDlistByobjID(entity_id, pair_list, pair_len, true); for (vector::iterator itr_pre = in_edge_pre_id.begin(); itr_pre != in_edge_pre_id.end(); itr_pre++) { int pre_id = (*itr_pre); //exist_preid = Util::bsearch_preid_uporder(pre_id, pair_list, pair_len); if 
(Util::bsearch_int_uporder(pre_id, pair_list, pair_len) == -1) exist_preid = false; if (!exist_preid) { break; } } delete[] pair_list; } if (exist_preid && !out_edge_pre_id.empty()) { //(this->kvstore)->getpreIDobjIDlistBysubID(entity_id, pair_list, pair_len); (this->kvstore)->getpreIDlistBysubID(entity_id, pair_list, pair_len, true); for (vector::iterator itr_pre = out_edge_pre_id.begin(); itr_pre != out_edge_pre_id.end(); itr_pre++) { int pre_id = (*itr_pre); //exist_preid = Util::bsearch_preid_uporder(pre_id, pair_list, pair_len); if (Util::bsearch_int_uporder(pre_id, pair_list, pair_len) == -1) exist_preid = false; if (!exist_preid) { break; } } delete[] pair_list; } //result sequence is illegal when there exists any missing filter predicate id. if (!exist_preid) { it = this->current_table.erase(it); } else { it++; } } if (this->current_table.empty()) { return false; } } return true; }