gStore/Parser/DBparser.cpp

317 lines
8.4 KiB
C++

/*
* DBparser.cpp
*
* Created on: 2014-6-20
* Author: liyouhuan
*/
#include "DBparser.h"
/* Build a parser whose prefix table starts out empty. */
DBparser::DBparser()
{
    this->_prefix_map.clear();
}
/* Parse the SPARQL text _sparql and fill _sparql_query from the resulting tree.
 *
 * The query is lexed and parsed with the generated Sparql lexer/parser, the
 * tree is dumped with printNode() and then walked with parseNode().
 *
 * The returned string is meant for the error log: it is empty on success and
 * holds a short description when one of the ANTLR objects could not be
 * created or when the parser produced no tree.
 * */
string DBparser::sparqlParser(const string& _sparql, SPARQLquery& _sparql_query)
{
    /* the ANTLR C runtime takes a non-const byte pointer, hence the casts */
    pANTLR3_INPUT_STREAM input = antlr3StringStreamNew(
            (ANTLR3_UINT8 *)(_sparql.c_str()), ANTLR3_ENC_UTF8,
            _sparql.length(), (ANTLR3_UINT8 *)"QueryString");
    if(input == NULL)
    {
        return "sparqlParser: unable to create the ANTLR input stream";
    }

    pSparqlLexer lex = SparqlLexerNew(input);
    if(lex == NULL)
    {
        input->close(input);
        return "sparqlParser: unable to create the SPARQL lexer";
    }

    pANTLR3_COMMON_TOKEN_STREAM tokens =
            antlr3CommonTokenStreamSourceNew(ANTLR3_SIZE_HINT, TOKENSOURCE(lex));
    if(tokens == NULL)
    {
        lex->free(lex);
        input->close(input);
        return "sparqlParser: unable to create the token stream";
    }

    pSparqlParser parser = SparqlParserNew(tokens);
    if(parser == NULL)
    {
        tokens->free(tokens);
        lex->free(lex);
        input->close(input);
        return "sparqlParser: unable to create the SPARQL parser";
    }

    string _error_msg = "";
    SparqlParser_workload_return r = parser->workload(parser);
    pANTLR3_BASE_TREE root = r.tree;
    if(root == NULL)
    {
        /* without a tree there is nothing to print or to walk */
        _error_msg = "sparqlParser: the parser produced no syntax tree";
    }
    else
    {
        printNode(root);
        parseNode(root, _sparql_query, 0);
    }

    /* release in reverse order of creation */
    parser->free(parser);
    tokens->free(tokens);
    lex->free(lex);
    input->close(input);
    return _error_msg;
}
/* helper of rdfParser(): index of the first '\t' in _buf[_from, _to), or -1 */
static int rdfParserFindTab(const char* _buf, int _from, int _to)
{
    for(int i = _from; i < _to; i ++)
    {
        if(_buf[i] == '\t')
        {
            return i;
        }
    }
    return -1;
}
/* Read the next group of triples from the RDF file behind _fin.
 *
 * _fin is opened by the caller (Database::encodeRDF according to the original
 * note) and rdfParser() is called repeatedly until the file is consumed.
 * Each call stores at most TRIPLE_NUM_PER_GROUP triples in _triple_array and
 * writes the number of stored triples to _triple_num.
 *
 * Every line is expected to hold one triple:
 *     subject '\t' predicate '\t' object '.'
 * Lines that are too short, too long for line_buf, or that lack the final '.'
 * or one of the two TABs are skipped; all but the short ones are reported on
 * cerr.
 *
 * The returned string is reserved for the error log; it is always "".
 */
string DBparser::rdfParser(ifstream& _fin, Triple* _triple_array, int& _triple_num)
{
    memset(line_buf, 0, buf_len);
    _triple_num = 0;

    while(_triple_num < DBparser::TRIPLE_NUM_PER_GROUP
            && (! _fin.eof()))
    {
        _fin.getline(line_buf, buf_len-1);
        if(_fin.fail())
        {
            /* nothing could be read: end of input or a broken stream */
            if(_fin.eof() || _fin.bad())
            {
                break;
            }
            /* failbit alone means the line did not fit into line_buf;
             * without clearing it every later getline() would fail and
             * this loop would never terminate */
            cerr << "line is too long (more than " << (buf_len-2)
                 << " characters), skipped" << endl;
            do
            {
                _fin.clear();
                _fin.getline(line_buf, buf_len-1);
            }
            while(_fin.fail() && (! _fin.eof()) && (! _fin.bad()));
            if(_fin.bad())
            {
                break;
            }
            continue;
        }

        const int _line_len = strlen(line_buf);
        /* maybe this is an empty line at the ending of file */
        if(_line_len < 4)
        {
            continue;
        }

        /* look for the closing '.', scanning backwards and giving up at
         * the last TAB: the dot has to come after the object */
        int _i_dot = -1;
        for(int i = _line_len-1; i >= 0 && line_buf[i] != '\t'; i --)
        {
            if(line_buf[i] == '.')
            {
                _i_dot = i;
                break;
            }
        }
        if(_i_dot < 0)
        {
            cerr << "'.' is expected at line:" << line_buf << endl;
            cerr << " line_length = " << _line_len << endl;
            continue;
        }
        /* cut everything behind the '.' */
        line_buf[_i_dot+1] = '\0';

        /* find the first TAB */
        const int _first_tab = rdfParserFindTab(line_buf, 0, _i_dot);
        if(_first_tab == -1)
        {
            cerr << "First TAB is expected at line:" << line_buf << endl;
            continue;
        }

        /* find the second TAB */
        const int _second_tab = rdfParserFindTab(line_buf, _first_tab+1, _i_dot);
        if(_second_tab == -1)
        {
            cerr << "Second TAB is expected at line:" << line_buf << endl;
            /* skip the line instead of building a triple from index -1 */
            continue;
        }

        /* get sub, pre, obj and add new triple; the '.' is not part of obj */
        Triple& _triple = _triple_array[_triple_num];
        _triple.subject = string(line_buf, _first_tab);
        _triple.predicate = string(line_buf + _first_tab + 1,
                                   _second_tab - _first_tab - 1);
        _triple.object = string(line_buf + _second_tab + 1,
                                _i_dot - _second_tab - 1);
        _triple_num ++;
    }
    return "";
}
/*
 * Line buffer shared by all DBparser instances. It is allocated once here so
 * that reading a line does not allocate new memory each time; buf_len is its
 * size in bytes.
 */
int DBparser::buf_len = 100 * 1000;
char* DBparser::line_buf = new char[DBparser::buf_len];
/* Copy the text of a tree node into str.
 *
 * Starting at node, the first child is followed depth times; the text of the
 * node reached that way is assigned to str.
 *
 * Returns 0 on success. Returns -1 and leaves str untouched when the tree is
 * shallower than depth or the node reached carries no text. A negative depth
 * is treated like 0.
 */
int DBparser::parseString(pANTLR3_BASE_TREE node,std::string& str,int depth){
    pANTLR3_BASE_TREE current = node;
    for (int level=0; level<depth && current!=NULL; level++){
        // getChild() yields NULL when the node has no such child
        current = (pANTLR3_BASE_TREE) current->getChild(current,0);
    }
    if (current==NULL){
        return -1;
    }
    pANTLR3_STRING text = current->getText(current);
    if (text==NULL || text->chars==NULL){
        return -1;
    }
    //std::cout<<"parseString: "<<text->chars<<std::endl;
    str = (const char*) text->chars;
    return 0;
}
/* Extract one prefix declaration from the subtree rooted at node.
 *
 * The direct children of node are scanned: the text of a child of type 136
 * becomes the key and the text of a child of type 89 becomes the value of
 * prefixPair. A part that is not present stays an empty string.
 * NOTE(review): 136 / 89 are presumably token types of the generated Sparql
 * grammar - confirm when the grammar is regenerated.
 *
 * Always returns 0.
 */
int DBparser::parsePrefix(pANTLR3_BASE_TREE node,std::pair<std::string,std::string>& prefixPair){
    std::string key;
    std::string value;
    const unsigned int childCount = node->getChildCount(node);
    for (unsigned int j=0;j<childCount;j++){
        pANTLR3_BASE_TREE childNode=(pANTLR3_BASE_TREE) node->getChild(node,j);
        switch (childNode->getType(childNode)){
            //prefix key string 136
            case 136:
                parseString(childNode,key);
                break;
            //prefix value URL 89
            case 89:
                parseString(childNode,value);
                break;
            default:
                break;
        }
    }
    prefixPair = make_pair(key,value);
    return 0;
}
/* Expand a prefixed name in place, using the prefixes collected in _prefix_map.
 *
 * str is left unchanged when it is empty, starts with '<', or contains no
 * ':'. Otherwise everything up to and including the first ':' is looked up
 * in _prefix_map; on a hit str becomes
 *     <mapped value without its last character> + <rest of str> + ">"
 * (presumably the mapped value has the form "<...>", so dropping its last
 * character removes the closing '>' - verify against parsePrefix callers).
 *
 * Progress is traced on stdout. Always returns 0.
 */
int DBparser::replacePrefix(string& str){
    if (str.empty() || str[0]=='<'){
        return 0;
    }
    const std::string::size_type sep=str.find(':');
    if (sep==std::string::npos){
        // no ':' at all, so there is no prefix part to expand
        return 0;
    }
    const std::string prefix=str.substr(0,sep+1);
    std::cout<<"prefix: "<<prefix<<std::endl;
    if (_prefix_map.find(prefix)!=_prefix_map.end()){
        const std::string& base=_prefix_map[prefix];
        str=base.substr(0,base.length()-1)+str.substr(sep+1)+">";
        std::cout<<"str: "<<str<<std::endl;
    }
    else{
        std::cout<<"prefix not found..."<<std::endl;
    }
    return 0;
}
/* Build one Triple from the subtree rooted at node.
 *
 * The direct children of node are scanned by type: 177 supplies the subject,
 * 142 the predicate and 119 the object. The text is fetched with
 * parseString() (1 level below the child for subject and object, 4 levels
 * for the predicate) and then passed through replacePrefix(). A part with no
 * matching child stays an empty string.
 * NOTE(review): the type numbers and depths presumably mirror the generated
 * Sparql grammar - confirm when the grammar changes.
 *
 * The resulting triple is assigned to triple and traced on stdout.
 * Always returns 0.
 */
int DBparser::parseTriple(pANTLR3_BASE_TREE node,Triple& triple){
    std::string subject="";
    std::string predicate="";
    std::string object="";
    const unsigned int childCount = node->getChildCount(node);
    for (unsigned int j=0;j<childCount;j++){
        pANTLR3_BASE_TREE childNode=(pANTLR3_BASE_TREE) node->getChild(node,j);
        switch (childNode->getType(childNode)){
            //subject 177
            case 177:
                parseString(childNode,subject,1);
                replacePrefix(subject);
                break;
            //predicate 142
            case 142:
                parseString(childNode,predicate,4);
                replacePrefix(predicate);
                break;
            //object 119
            case 119:
                parseString(childNode,object,1);
                replacePrefix(object);
                break;
            default:
                break;
        }
    }
    triple=Triple(subject,predicate,object);
    std::cout<<"Triple: \n\ts|"<<subject<<"|\n\tp|"<<predicate<<"|\n\to|"<<object<<"|"<<std::endl;
    return 0;
}
/* Collect the triples of one basic query.
 *
 * The type of every direct child of node is traced on stdout. Each child of
 * type 185 is converted with parseTriple() and added to basicQuery. Children
 * of type 195 are recognised but not handled yet (UNION part, see the TODO).
 *
 * Always returns 0.
 */
int DBparser::parseBasicQuery(pANTLR3_BASE_TREE node,BasicQuery& basicQuery){
    const unsigned int childCount = node->getChildCount(node);
    for (unsigned int j=0;j<childCount;j++){
        pANTLR3_BASE_TREE childNode=(pANTLR3_BASE_TREE) node->getChild(node,j);
        const ANTLR3_UINT32 childType = childNode->getType(childNode);
        std::cout<<"Child type: "<<childType<<endl;
        //triple 185
        if (childType==185){
            Triple triple;
            parseTriple(childNode,triple);
            basicQuery.addTriple(triple);
        }
        // TODO: type 195 is the Union part and is not parsed yet
        //parseUnion(childNode,U);
    }
    return 0;
}
/* Register the variables listed below node with the query.
 *
 * For every direct child of node whose type is 200, the child's own text is
 * read with parseString() (depth 0) and handed to query.addQueryVar().
 *
 * Always returns 0.
 */
int DBparser::parseVar(pANTLR3_BASE_TREE node,SPARQLquery& query){
    const unsigned int childCount = node->getChildCount(node);
    for (unsigned int j=0;j<childCount;j++){
        pANTLR3_BASE_TREE childNode=(pANTLR3_BASE_TREE) node->getChild(node,j);
        //var 200
        if (childNode->getType(childNode)!=200){
            continue;
        }
        std::string var="";
        parseString(childNode,var,0);
        query.addQueryVar(var);
    }
    return 0;
}
/* Walk the syntax tree below node and fill query.
 *
 * The type and text of node are printed first, preceded by one " " per
 * depth level. Every direct child is then dispatched on its type:
 *   199 - handed to parseVar()
 *   143 - handed to parsePrefix(); the pair is inserted into _prefix_map
 *   77  - a new BasicQuery is filled by parseBasicQuery() and added to query
 *   any other type - parseNode() recurses into the child with depth+1
 *
 * Always returns 0.
 */
int DBparser::parseNode(pANTLR3_BASE_TREE node, SPARQLquery& query,int depth){
    const char* text =(const char*) node->getText(node)->chars;
    const ANTLR3_UINT32 treeType = node->getType(node);
    for (int i=0;i<depth;i++){
        printf(" ");
    }
    // the node type is unsigned, so print it with %u
    printf("%u: %s\n",static_cast<unsigned int>(treeType),text);
    for (unsigned int j=0;j<node->getChildCount(node);j++){
        pANTLR3_BASE_TREE childNode=(pANTLR3_BASE_TREE) node->getChild(node,j);
        const int childNodeType = childNode->getType(childNode);
        switch (childNodeType){
            //var
            case 199:{
                parseVar(childNode,query);
                break;
            }
            //prefix
            case 143:{
                std::pair<std::string,std::string> prefixPair;
                parsePrefix(childNode,prefixPair);
                _prefix_map.insert(prefixPair);
                break;
            }
            //BasicQuery
            case 77:{
                // NOTE(review): presumably query takes ownership of the
                // pointer - confirm in SPARQLquery::addBasicQuery
                BasicQuery* basicQuery=new BasicQuery();
                parseBasicQuery(childNode,*basicQuery);
                query.addBasicQuery(basicQuery);
                break;
            }
            default:
                parseNode(childNode,query,depth+1);
                break;
        }
    }
    return 0;
}
/* Print the syntax tree below node, one node per line.
 *
 * Each line shows "<type>: <text>" and is preceded by one " " per depth
 * level; children are printed recursively with depth+1.
 *
 * Always returns 0.
 */
int DBparser::printNode(pANTLR3_BASE_TREE node, int depth){
    const char* text =(const char*) node->getText(node)->chars;
    const ANTLR3_UINT32 treeType = node->getType(node);
    for (int i=0;i<depth;i++){
        printf(" ");
    }
    // the node type is unsigned, so print it with %u
    printf("%u: %s\n",static_cast<unsigned int>(treeType),text);
    for (unsigned int j=0;j<node->getChildCount(node);j++){
        pANTLR3_BASE_TREE childNode=(pANTLR3_BASE_TREE) node->getChild(node,j);
        printNode(childNode,depth+1);
    }
    return 0;
}