// Implementation of TurtleParser::Exception, TurtleParser::Lexer and TurtleParser (lexing and parsing of Turtle input).
// NOTE(review): this copy has lost its original line breaks, so on every physical line a compiler treats
// everything after the first `//` as comment text. Text between angle brackets also appears to have been
// stripped (see `static_cast(code)`, `vector entries`, and the spans cut off after `if (readBufferStart` and
// after `for (unsigned index=0;index`). Restore the file from a pristine copy before building.
//
// Contents of the next line: Exception constructors (from std::string and from const char*) and destructor;
// Lexer constructor (no pending put-back token, line counter starts at 1, empty read buffer) and destructor;
// doRead, which refills readBuffer from the stream and returns false when no bytes were read (its tail is cut
// off here); then the integer part of number lexing: digits are appended to token and Token_Integer is
// returned at end of input or when a separator follows (the separator is pushed back with unread()).
#include "TurtleParser.h" //--------------------------------------------------------------------------- // RDF-3X // (c) 2008 Thomas Neumann. Web site: http://www.mpi-inf.mpg.de/~neumann/rdf3x // // This work is licensed under the Creative Commons // Attribution-Noncommercial-Share Alike 3.0 Unported License. To view a copy // of this license, visit http://creativecommons.org/licenses/by-nc-sa/3.0/ // or send a letter to Creative Commons, 171 Second Street, Suite 300, // San Francisco, California, 94105, USA. //--------------------------------------------------------------------------- using namespace std; //--------------------------------------------------------------------------- TurtleParser::Exception::Exception(const std::string& message) : message(message) // Constructor { } //--------------------------------------------------------------------------- TurtleParser::Exception::Exception(const char* message) : message(message) // Constructor { } //--------------------------------------------------------------------------- TurtleParser::Exception::~Exception() // Destructor { } //--------------------------------------------------------------------------- TurtleParser::Lexer::Lexer(istream& in) : in(in),putBack(Token_Eof),line(1),readBufferStart(0),readBufferEnd(0) // Constructor { } //--------------------------------------------------------------------------- TurtleParser::Lexer::~Lexer() // Destructor { } //--------------------------------------------------------------------------- bool TurtleParser::Lexer::doRead(char& c) // Read new characters { while (in) { readBufferStart=readBuffer; in.read(readBuffer,readBufferSize); if (!in.gcount()) return false; readBufferEnd=readBufferStart+in.gcount(); if (readBufferStart'9')) break; while ((c>='0')&&(c<='9')) { token+=c; if (!read(c)) return Token_Integer; } if (issep(c)) { unread(); return Token_Integer; } } // Dot? 
// Contents of the next line: the rest of number lexing. A '.' followed by digits yields Token_Decimal; an
// 'e'/'E' with optional sign and at least one digit yields Token_Double; every other path falls out of the
// loop and throws "invalid number" with the text collected so far plus the offending character.
// lexHexCode(len): reads exactly len hex digits (either case) into an unsigned value; end of input or a
// non-hex character throws "invalid unicode escape".
// encodeUtf8(code): 1 byte for 1..0x7f, 2 bytes for values below 0x800 (code 0 also takes this branch because
// of the `code&&` test), 3 bytes for everything else.
// NOTE(review): there is no 4-byte branch, yet lexEscape passes 8-digit \U values here, so code points above
// 0xFFFF look like they are truncated into a 3-byte sequence - confirm and fix in the pristine file.
// lexEscape starts at the end of this line; the backslash has already been consumed by the caller.
if (c=='.') { token+=c; if (!read(c)) break; // Second number block while ((c>='0')&&(c<='9')) { token+=c; if (!read(c)) return Token_Decimal; } if (issep(c)) { unread(); return Token_Decimal; } } // Exponent if ((c!='e')&&(c!='E')) break; token+=c; if (!read(c)) break; if ((c=='-')||(c=='+')) { token+=c; if (!read(c)) break; } if ((c<'0')||(c>'9')) break; while ((c>='0')&&(c<='9')) { token+=c; if (!read(c)) return Token_Double; } if (issep(c)) { unread(); return Token_Double; } break; } stringstream msg; msg << "lexer error in line " << line << ": invalid number " << token << c; throw Exception(msg.str()); } //--------------------------------------------------------------------------- unsigned TurtleParser::Lexer::lexHexCode(unsigned len) // Parse a hex code { unsigned result=0; for (unsigned index=0;;index++) { // Done? if (index==len) return result; // Read the next char char c; if (!read(c)) break; // Interpret it if ((c>='0')&&(c<='9')) result=(result<<4)|(c-'0'); else if ((c>='A')&&(c<='F')) result=(result<<4)|(c-'A'+10); else if ((c>='a')&&(c<='f')) result=(result<<4)|(c-'a'+10); else break; } stringstream msg; msg << "lexer error in line " << line << ": invalid unicode escape"; throw Exception(msg.str()); } //--------------------------------------------------------------------------- static string encodeUtf8(unsigned code) // Encode a unicode character as utf8 { string result; if (code&&(code<0x80)) { result+=static_cast(code); } else if (code<0x800) { result+=static_cast(0xc0 | (0x1f & (code >> 6))); result+=static_cast(0x80 | (0x3f & code)); } else { result+=static_cast(0xe0 | (0x0f & (code >> 12))); result+=static_cast(0x80 | (0x3f & (code >> 6))); result+=static_cast(0x80 | (0x3f & code)); } return result; } //--------------------------------------------------------------------------- void TurtleParser::Lexer::lexEscape(std::string& token) // Lex an escape sequence, \ already consumed { while (true) { char c; if (!read(c)) break; // Standard escapes? 
// Contents of the next line: the body of lexEscape. It appends the decoded character for the escapes
// t, n, r, double quote, '>' and backslash, and the UTF-8 encoding of the 4-digit (u) or 8-digit (U) hex code;
// end of input or any other character throws "invalid escape sequence".
// lexLongString: collects characters until three consecutive double quotes are read, handling backslash
// escapes via lexEscape and counting newlines in `line`; end of input throws "invalid string".
// NOTE(review): when a quote is followed by a non-quote character, only the quote(s) are appended before
// `continue`, so the character just read into c is not added to token (and a backslash in that position is not
// treated as an escape); for a pair, only two quotes are appended. Looks like a defect - confirm.
// lexString starts at the end of this line: it clears token and overwrites its by-value c argument with the
// first character after the opening quote; end of input at that point throws "invalid string".
if (c=='t') { token+='\t'; return; } if (c=='n') { token+='\n'; return; } if (c=='r') { token+='\r'; return; } if (c=='\"') { token+='\"'; return; } if (c=='>') { token+='>'; return; } if (c=='\\') { token+='\\'; return; } // Unicode sequences? if (c=='u') { unsigned code=lexHexCode(4); token+=encodeUtf8(code); return; } if (c=='U') { unsigned code=lexHexCode(8); token+=encodeUtf8(code); return; } // Invalid escape break; } stringstream msg; msg << "lexer error in line " << line << ": invalid escape sequence"; throw Exception(msg.str()); } //--------------------------------------------------------------------------- TurtleParser::Lexer::Token TurtleParser::Lexer::lexLongString(std::string& token) // Lex a long string, first """ already consumed { char c; while (read(c)) { if (c=='\"') { if (!read(c)) break; if (c!='\"') { token+='\"'; continue; } if (!read(c)) break; if (c!='\"') { token+="\"\""; continue; } return Token_String; } if (c=='\\') { lexEscape(token); } else { token+=c; if (c=='\n') line++; } } stringstream msg; msg << "lexer error in line " << line << ": invalid string"; throw Exception(msg.str()); } //--------------------------------------------------------------------------- TurtleParser::Lexer::Token TurtleParser::Lexer::lexString(std::string& token,char c) // Lex a string { token.resize(0); // Check the next character if (!read(c)) { stringstream msg; msg << "lexer error in line " << line << ": invalid string"; throw Exception(msg.str()); } // Another quote? 
// Contents of the next line: the rest of lexString. A second quote right after the opening one means either an
// empty string (returned with an empty token; the following character, if any, is pushed back) or, when a
// third quote follows, a long string handled by lexLongString. Otherwise characters are collected until the
// closing quote, with escapes delegated to lexEscape; a raw newline (pushed back first) or end of input throws
// "invalid string".
// lexURI: same scheme for the text after '<' up to the closing '>', throwing "invalid URI" on a raw newline or
// end of input; like lexString it overwrites its by-value c argument with the first read.
// next starts at the end of this line.
if (c=='\"') { if (!read(c)) return Token_String; if (c=='\"') return lexLongString(token); unread(); return Token_String; } // Process normally while (true) { if (c=='\"') return Token_String; if (c=='\\') { lexEscape(token); } else { token+=c; if (c == '\n') { unread(); stringstream msg; msg << "lexer error in line " << line << ": invalid string"; throw Exception(msg.str()); } } if (!read(c)) { stringstream msg; msg << "lexer error in line " << line << ": invalid string"; throw Exception(msg.str()); } } } //--------------------------------------------------------------------------- TurtleParser::Lexer::Token TurtleParser::Lexer::lexURI(std::string& token,char c) // Lex a URI { token.resize(0); // Check the next character if (!read(c)) { stringstream msg; msg << "lexer error in line " << line << ": invalid URI"; throw Exception(msg.str()); } // Process normally while (true) { if (c=='>') return Token_URI; if (c=='\\') { lexEscape(token); } else { token+=c; if (c == '\n') { unread(); stringstream msg; msg << "lexer error in line " << line << ": invalid URI"; throw Exception(msg.str()); } } if (!read(c)) { stringstream msg; msg << "lexer error in line " << line << ": invalid URI"; throw Exception(msg.str()); } } } //--------------------------------------------------------------------------- TurtleParser::Lexer::Token TurtleParser::Lexer::next(std::string& token) // Get the next token { // Do we already have one? 
// Contents of the next line: the main part of next. A pending put-back token (putBack != Token_Eof) is
// returned first, together with putBackValue, and the slot is cleared. Otherwise the loop skips blanks, tabs
// and carriage returns, counts '\n' in `line`, and skips '#' comments up to the end of the line. The first
// significant character selects the token: punctuation maps to its own token; '.' is a number when a digit
// follows (the digit is pushed back and lexNumber is started with '.'), otherwise Token_Dot; sign or digit
// starts lexNumber; '^' must be followed by a second '^' (Token_Type) or "'^' expected" is thrown; a double
// quote starts lexString and '<' starts lexURI.
if (putBack!=Token_Eof) { Token result=putBack; token=putBackValue; putBack=Token_Eof; return result; } // Read more char c; while (read(c)) { switch (c) { case ' ': case '\t': case '\r': continue; case '\n': line++; continue; case '#': while (read(c)) if ((c=='\n')||(c=='\r')) break; if (c=='\n') ++line; continue; case '.': if (!read(c)) return Token_Dot; unread(); if ((c>='0')&&(c<='9')) return lexNumber(token,'.'); return Token_Dot; case ':': return Token_Colon; case ';': return Token_Semicolon; case ',': return Token_Comma; case '[': return Token_LBracket; case ']': return Token_RBracket; case '(': return Token_LParen; case ')': return Token_RParen; case '@': return Token_At; case '+': case '-': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return lexNumber(token,c); case '^': if ((!read(c))||(c!='^')) { stringstream msg; msg << "lexer error in line " << line << ": '^' expected"; throw Exception(msg.str()); } return Token_Type; case '\"': return lexString(token,c); case '<': return lexURI(token,c); default: if (((c>='A')&&(c<='Z'))||((c>='a')&&(c<='z'))||(c=='_')) { // XXX unicode! 
// Contents of the next line: the default branch of next. A name starts with an ASCII letter or '_' and runs
// until issep() reports a separator (which is pushed back) or input ends; the exact names "a", "true" and
// "false" get their own tokens, everything else is Token_Name. Any other start character throws
// "unexpected character". When the input is exhausted next returns Token_Eof.
// readUntilSep: replaces value with the characters up to (not including) the next separator or end of input.
// TurtleParser constructor (triplesReader and nextBlank start at 0) and destructor.
// parseError: throws Exception with "parse error in line <lexer line>: <message>".
// newBlankNode: sets node to "_:_" followed by the current nextBlank counter, then increments the counter.
// constructAbsoluteURI starts at the end of this line; it leaves uri untouched when base is empty.
token=c; while (read(c)) { if (issep(c)) { unread(); break; } token+=c; } if (token=="a") return Token_A; if (token=="true") return Token_True; if (token=="false") return Token_False; return Token_Name; } else { stringstream msg; msg << "lexer error in line " << line << ": unexpected character " << c; throw Exception(msg.str()); } } } return Token_Eof; } //--------------------------------------------------------------------------- void TurtleParser::Lexer::readUntilSep(std::string& value) { value.resize(0); char c; while (read(c)) { if (issep(c)) { unread(); break; } value += c; } } //--------------------------------------------------------------------------- TurtleParser::TurtleParser(istream& in) : lexer(in),triplesReader(0),nextBlank(0) // Constructor { } //--------------------------------------------------------------------------- TurtleParser::~TurtleParser() // Destructor { } //--------------------------------------------------------------------------- void TurtleParser::parseError(const string& message) // Report an error { stringstream msg; msg << "parse error in line " << lexer.getLine() << ": " << message; throw Exception(msg.str()); } //--------------------------------------------------------------------------- void TurtleParser::newBlankNode(std::string& node) // Construct a new blank node { stringstream buffer; buffer << "_:_" << (nextBlank++); node=buffer.str(); } //--------------------------------------------------------------------------- void TurtleParser::constructAbsoluteURI(std::string& uri) // Convert a relative URI into an absolute one { // No base? if (base.empty()) return; // Already absolute? XXX fix the check! 
// Contents of the next line: the rest of constructAbsoluteURI. A uri is treated as absolute when "://" occurs
// at an index below 10; otherwise base is prepended by plain concatenation.
// NOTE(review): this is a heuristic (the original author marks it as needing a fix); schemes without "://"
// or longer than 9 characters would get the base prepended.
// parseDirective: reads the name after '@'. "base" stores the following URI token into base; "prefix" reads an
// optional prefix name, a mandatory ':', and a URI, and records prefixes[prefixName]=uri (an absent name maps
// to the empty string); any other name is reported as "unknown directive @<name>". Every directive must end
// with '.'.
// isName starts at the end of this line.
if (uri.find("://")<10) return; // Put the base in front uri=base+uri; } //--------------------------------------------------------------------------- void TurtleParser::parseDirective() // Parse a directive { std::string value; if (lexer.next(value)!=Lexer::Token_Name) parseError("directive name expected after '@'"); if (value=="base") { if (lexer.next(base)!=Lexer::Token_URI) parseError("URI expected after @base"); } else if (value=="prefix") { std::string prefixName; Lexer::Token token=lexer.next(prefixName); // A prefix name? if (token==Lexer::Token_Name) { token=lexer.next(); } else prefixName.resize(0); // Colon if (token!=Lexer::Token_Colon) parseError("':' expected after @prefix"); // URI std::string uri; if (lexer.next(uri)!=Lexer::Token_URI) parseError("URI expected after @prefix"); prefixes[prefixName]=uri; } else { parseError("unknown directive @"+value); } // Final dot if (lexer.next()!=Lexer::Token_Dot) parseError("'.' expected after directive"); } //--------------------------------------------------------------------------- inline bool TurtleParser::isName(Lexer::Token token) // Is a (generalized) name token? 
// Contents of the next line: the body of isName, which accepts Token_Name, Token_A, Token_True and
// Token_False.
// parseQualifiedName: requires ':' as the next token and a known prefix (otherwise a parse error is raised),
// then reads the local part with lexer.readUntilSep and stores <expanded prefix><local part> in name. The
// block inside /* ... */ is an older token-based variant that is disabled.
// parseBlank: dispatches on the next token. A name must be "_" followed by ':' and a name token and yields
// "_:<name>"; '[' creates a fresh blank node and, unless ']' follows immediately, parses a predicate/object
// list for it, queues the resulting triple in `triples` and requires a closing ']'; '(' collects the objects of
// a collection until ')' is seen.
{ return (token==Lexer::Token_Name)||(token==Lexer::Token_A)||(token==Lexer::Token_True)||(token==Lexer::Token_False); } //--------------------------------------------------------------------------- void TurtleParser::parseQualifiedName(const string& prefix,string& name) // Parse a qualified name { if (lexer.next()!=Lexer::Token_Colon) parseError("':' expected in qualified name"); if (!prefixes.count(prefix)) parseError("unknown prefix '"+prefix+"'"); string expandedPrefix=prefixes[prefix]; lexer.readUntilSep(name); name=expandedPrefix+name; /* Lexer::Token token=lexer.next(name); if (isName(token)) { name=expandedPrefix+name; } else { lexer.unget(token,name); name=expandedPrefix; } */ } //--------------------------------------------------------------------------- void TurtleParser::parseBlank(std::string& entry) // Parse a blank entry { Lexer::Token token=lexer.next(entry); switch (token) { case Lexer::Token_Name: if ((entry!="_")||(lexer.next()!=Lexer::Token_Colon)||(!isName(lexer.next(entry)))) parseError("blank nodes must start with '_:'"); entry="_:"+entry; return; case Lexer::Token_LBracket: { newBlankNode(entry); token=lexer.next(); if (token!=Lexer::Token_RBracket) { lexer.ungetIgnored(token); std::string predicate,object,objectSubType; Type::Type_ID objectType; parsePredicateObjectList(entry,predicate,object,objectType,objectSubType); triples.push_back(Triple(entry,predicate,object,objectType,objectSubType)); if (lexer.next()!=Lexer::Token_RBracket) parseError("']' expected"); } return; } case Lexer::Token_LParen: { // Collection vector entries,entrySubTypes; vector entryTypes; while ((token=lexer.next())!=Lexer::Token_RParen) { lexer.ungetIgnored(token); entries.push_back(string()); entryTypes.push_back(Type::Type_URI); entrySubTypes.push_back(string()); parseObject(entries.back(),entryTypes.back(),entrySubTypes.back()); } // Empty collection? 
// Contents of the next line: an empty collection is represented by the rdf:nil URI; otherwise one node slot
// per collected entry is prepared. The text is cut off inside the following `for` header, so the rest of
// parseBlank and whatever definitions followed it are not visible in this copy.
// The visible tail belongs to a function whose header is missing: it clears `triples` and resets triplesReader
// to 0 before returning true (the guarding condition is cut off); otherwise it reads tokens in a loop,
// returning false on Token_Eof and handling '@' via parseDirective, and passes the first other token to
// parseTriple before returning true.
if (entries.empty()) { entry="http://www.w3.org/1999/02/22-rdf-syntax-ns#nil"; return; } // Build blank nodes vector nodes; nodes.resize(entries.size()); for (unsigned index=0;index=triples.size()) { triples.clear(); triplesReader=0; } return true; } // No, check if the input is done Lexer::Token token; while (true) { token=lexer.next(subject); if (token==Lexer::Token_Eof) return false; // A directive? if (token==Lexer::Token_At) { parseDirective(); continue; } else break; } // No, parse a triple parseTriple(token,subject,predicate,object,objectType,objectSubType); return true; } //---------------------------------------------------------------------------