762 lines
25 KiB
C++
762 lines
25 KiB
C++
#include "TurtleParser.h"
|
|
#include <sstream>
|
|
//---------------------------------------------------------------------------
|
|
// RDF-3X
|
|
// (c) 2008 Thomas Neumann. Web site: http://www.mpi-inf.mpg.de/~neumann/rdf3x
|
|
//
|
|
// This work is licensed under the Creative Commons
|
|
// Attribution-Noncommercial-Share Alike 3.0 Unported License. To view a copy
|
|
// of this license, visit http://creativecommons.org/licenses/by-nc-sa/3.0/
|
|
// or send a letter to Creative Commons, 171 Second Street, Suite 300,
|
|
// San Francisco, California, 94105, USA.
|
|
//---------------------------------------------------------------------------
|
|
using namespace std;
|
|
//---------------------------------------------------------------------------
|
|
TurtleParser::Exception::Exception(const std::string& message)
|
|
: message(message)
|
|
// Constructor
|
|
{
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
TurtleParser::Exception::Exception(const char* message)
|
|
: message(message)
|
|
// Constructor
|
|
{
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
TurtleParser::Exception::~Exception()
|
|
// Destructor
|
|
{
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
TurtleParser::Lexer::Lexer(istream& in)
|
|
: in(in),putBack(Token_Eof),line(1),readBufferStart(0),readBufferEnd(0)
|
|
// Constructor
|
|
{
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
TurtleParser::Lexer::~Lexer()
|
|
// Destructor
|
|
{
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
bool TurtleParser::Lexer::doRead(char& c)
|
|
// Read new characters
|
|
{
|
|
while (in) {
|
|
readBufferStart=readBuffer;
|
|
in.read(readBuffer,readBufferSize);
|
|
if (!in.gcount()) return false;
|
|
readBufferEnd=readBufferStart+in.gcount();
|
|
|
|
if (readBufferStart<readBufferEnd) {
|
|
c=*(readBufferStart++);
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
static bool issep(char c) { return (c==' ')||(c=='\t')||(c=='\n')||(c=='\r')||(c=='[')||(c==']')||(c=='(')||(c==')')||(c==',')||(c==';')||(c==':')||(c=='.'); }
|
|
//---------------------------------------------------------------------------
|
|
TurtleParser::Lexer::Token TurtleParser::Lexer::lexNumber(std::string& token,char c)
|
|
// Lex a number
|
|
{
|
|
token.resize(0);
|
|
|
|
while (true) {
|
|
// Sign?
|
|
if ((c=='+')||(c=='-')) {
|
|
token+=c;
|
|
if (!read(c)) break;
|
|
}
|
|
|
|
// First number block
|
|
if (c!='.') {
|
|
if ((c<'0')||(c>'9')) break;
|
|
while ((c>='0')&&(c<='9')) {
|
|
token+=c;
|
|
if (!read(c)) return Token_Integer;
|
|
}
|
|
if (issep(c)) {
|
|
unread();
|
|
return Token_Integer;
|
|
}
|
|
}
|
|
|
|
// Dot?
|
|
if (c=='.') {
|
|
token+=c;
|
|
if (!read(c)) break;
|
|
// Second number block
|
|
while ((c>='0')&&(c<='9')) {
|
|
token+=c;
|
|
if (!read(c)) return Token_Decimal;
|
|
}
|
|
if (issep(c)) {
|
|
unread();
|
|
return Token_Decimal;
|
|
}
|
|
}
|
|
|
|
// Exponent
|
|
if ((c!='e')&&(c!='E')) break;
|
|
token+=c;
|
|
if (!read(c)) break;
|
|
if ((c=='-')||(c=='+')) {
|
|
token+=c;
|
|
if (!read(c)) break;
|
|
}
|
|
if ((c<'0')||(c>'9')) break;
|
|
while ((c>='0')&&(c<='9')) {
|
|
token+=c;
|
|
if (!read(c)) return Token_Double;
|
|
}
|
|
if (issep(c)) {
|
|
unread();
|
|
return Token_Double;
|
|
}
|
|
break;
|
|
}
|
|
stringstream msg;
|
|
msg << "lexer error in line " << line << ": invalid number " << token << c;
|
|
throw Exception(msg.str());
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
unsigned TurtleParser::Lexer::lexHexCode(unsigned len)
|
|
// Parse a hex code
|
|
{
|
|
unsigned result=0;
|
|
for (unsigned index=0;;index++) {
|
|
// Done?
|
|
if (index==len) return result;
|
|
|
|
// Read the next char
|
|
char c;
|
|
if (!read(c)) break;
|
|
|
|
// Interpret it
|
|
if ((c>='0')&&(c<='9')) result=(result<<4)|(c-'0'); else
|
|
if ((c>='A')&&(c<='F')) result=(result<<4)|(c-'A'+10); else
|
|
if ((c>='a')&&(c<='f')) result=(result<<4)|(c-'a'+10); else
|
|
break;
|
|
}
|
|
stringstream msg;
|
|
msg << "lexer error in line " << line << ": invalid unicode escape";
|
|
throw Exception(msg.str());
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
static string encodeUtf8(unsigned code)
|
|
// Encode a unicode character as utf8
|
|
{
|
|
string result;
|
|
if (code&&(code<0x80)) {
|
|
result+=static_cast<char>(code);
|
|
} else if (code<0x800) {
|
|
result+=static_cast<char>(0xc0 | (0x1f & (code >> 6)));
|
|
result+=static_cast<char>(0x80 | (0x3f & code));
|
|
} else {
|
|
result+=static_cast<char>(0xe0 | (0x0f & (code >> 12)));
|
|
result+=static_cast<char>(0x80 | (0x3f & (code >> 6)));
|
|
result+=static_cast<char>(0x80 | (0x3f & code));
|
|
}
|
|
return result;
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
void TurtleParser::Lexer::lexEscape(std::string& token)
|
|
// Lex an escape sequence, \ already consumed
|
|
{
|
|
while (true) {
|
|
char c;
|
|
if (!read(c)) break;
|
|
// Standard escapes?
|
|
if (c=='t') { token+='\t'; return; }
|
|
if (c=='n') { token+='\n'; return; }
|
|
if (c=='r') { token+='\r'; return; }
|
|
if (c=='\"') { token+='\"'; return; }
|
|
if (c=='>') { token+='>'; return; }
|
|
if (c=='\\') { token+='\\'; return; }
|
|
|
|
// Unicode sequences?
|
|
if (c=='u') {
|
|
unsigned code=lexHexCode(4);
|
|
token+=encodeUtf8(code);
|
|
return;
|
|
}
|
|
if (c=='U') {
|
|
unsigned code=lexHexCode(8);
|
|
token+=encodeUtf8(code);
|
|
return;
|
|
}
|
|
|
|
// Invalid escape
|
|
break;
|
|
}
|
|
stringstream msg;
|
|
msg << "lexer error in line " << line << ": invalid escape sequence";
|
|
throw Exception(msg.str());
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
TurtleParser::Lexer::Token TurtleParser::Lexer::lexLongString(std::string& token)
|
|
// Lex a long string, first """ already consumed
|
|
{
|
|
char c;
|
|
while (read(c)) {
|
|
if (c=='\"') {
|
|
if (!read(c)) break;
|
|
if (c!='\"') { token+='\"'; continue; }
|
|
if (!read(c)) break;
|
|
if (c!='\"') { token+="\"\""; continue; }
|
|
return Token_String;
|
|
}
|
|
if (c=='\\') {
|
|
lexEscape(token);
|
|
} else {
|
|
token+=c;
|
|
if (c=='\n') line++;
|
|
}
|
|
}
|
|
stringstream msg;
|
|
msg << "lexer error in line " << line << ": invalid string";
|
|
throw Exception(msg.str());
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
TurtleParser::Lexer::Token TurtleParser::Lexer::lexString(std::string& token,char c)
|
|
// Lex a string
|
|
{
|
|
token.resize(0);
|
|
|
|
// Check the next character
|
|
if (!read(c)) {
|
|
stringstream msg;
|
|
msg << "lexer error in line " << line << ": invalid string";
|
|
throw Exception(msg.str());
|
|
}
|
|
|
|
// Another quote?
|
|
if (c=='\"') {
|
|
if (!read(c))
|
|
return Token_String;
|
|
if (c=='\"')
|
|
return lexLongString(token);
|
|
unread();
|
|
return Token_String;
|
|
}
|
|
|
|
// Process normally
|
|
while (true) {
|
|
if (c=='\"') return Token_String;
|
|
if (c=='\\') {
|
|
lexEscape(token);
|
|
} else {
|
|
token+=c;
|
|
if (c == '\n')
|
|
{
|
|
unread();
|
|
stringstream msg;
|
|
msg << "lexer error in line " << line << ": invalid string";
|
|
throw Exception(msg.str());
|
|
}
|
|
}
|
|
if (!read(c)) {
|
|
stringstream msg;
|
|
msg << "lexer error in line " << line << ": invalid string";
|
|
throw Exception(msg.str());
|
|
}
|
|
}
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
TurtleParser::Lexer::Token TurtleParser::Lexer::lexURI(std::string& token,char c)
|
|
// Lex a URI
|
|
{
|
|
token.resize(0);
|
|
|
|
// Check the next character
|
|
if (!read(c)) {
|
|
stringstream msg;
|
|
msg << "lexer error in line " << line << ": invalid URI";
|
|
throw Exception(msg.str());
|
|
}
|
|
|
|
// Process normally
|
|
while (true) {
|
|
if (c=='>') return Token_URI;
|
|
if (c=='\\') {
|
|
lexEscape(token);
|
|
} else {
|
|
token+=c;
|
|
if (c == '\n')
|
|
{
|
|
unread();
|
|
stringstream msg;
|
|
msg << "lexer error in line " << line << ": invalid URI";
|
|
throw Exception(msg.str());
|
|
}
|
|
}
|
|
if (!read(c)) {
|
|
stringstream msg;
|
|
msg << "lexer error in line " << line << ": invalid URI";
|
|
throw Exception(msg.str());
|
|
}
|
|
}
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
TurtleParser::Lexer::Token TurtleParser::Lexer::next(std::string& token)
|
|
// Get the next token
|
|
{
|
|
// Do we already have one?
|
|
if (putBack!=Token_Eof) {
|
|
Token result=putBack;
|
|
token=putBackValue;
|
|
putBack=Token_Eof;
|
|
return result;
|
|
}
|
|
|
|
// Read more
|
|
char c;
|
|
while (read(c)) {
|
|
switch (c) {
|
|
case ' ': case '\t': case '\r': continue;
|
|
case '\n': line++; continue;
|
|
case '#': while (read(c)) if ((c=='\n')||(c=='\r')) break; if (c=='\n') ++line; continue;
|
|
case '.': if (!read(c)) return Token_Dot; unread(); if ((c>='0')&&(c<='9')) return lexNumber(token,'.'); return Token_Dot;
|
|
case ':': return Token_Colon;
|
|
case ';': return Token_Semicolon;
|
|
case ',': return Token_Comma;
|
|
case '[': return Token_LBracket;
|
|
case ']': return Token_RBracket;
|
|
case '(': return Token_LParen;
|
|
case ')': return Token_RParen;
|
|
case '@': return Token_At;
|
|
case '+': case '-': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
|
|
return lexNumber(token,c);
|
|
case '^':
|
|
if ((!read(c))||(c!='^')) {
|
|
stringstream msg;
|
|
msg << "lexer error in line " << line << ": '^' expected";
|
|
throw Exception(msg.str());
|
|
}
|
|
return Token_Type;
|
|
case '\"': return lexString(token,c);
|
|
case '<': return lexURI(token,c);
|
|
default:
|
|
if (((c>='A')&&(c<='Z'))||((c>='a')&&(c<='z'))||(c=='_')) { // XXX unicode!
|
|
token=c;
|
|
while (read(c)) {
|
|
if (issep(c)) { unread(); break; }
|
|
token+=c;
|
|
}
|
|
if (token=="a") return Token_A;
|
|
if (token=="true") return Token_True;
|
|
if (token=="false") return Token_False;
|
|
return Token_Name;
|
|
} else {
|
|
stringstream msg;
|
|
msg << "lexer error in line " << line << ": unexpected character " << c;
|
|
throw Exception(msg.str());
|
|
}
|
|
}
|
|
}
|
|
|
|
return Token_Eof;
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
TurtleParser::TurtleParser(istream& in)
|
|
: lexer(in),triplesReader(0),nextBlank(0)
|
|
// Constructor
|
|
{
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
TurtleParser::~TurtleParser()
|
|
// Destructor
|
|
{
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
void TurtleParser::parseError(const string& message)
|
|
// Report an error
|
|
{
|
|
stringstream msg;
|
|
msg << "parse error in line " << lexer.getLine() << ": " << message;
|
|
throw Exception(msg.str());
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
void TurtleParser::newBlankNode(std::string& node)
|
|
// Construct a new blank node
|
|
{
|
|
stringstream buffer;
|
|
buffer << "_:_" << (nextBlank++);
|
|
node=buffer.str();
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
void TurtleParser::constructAbsoluteURI(std::string& uri)
|
|
// Convert a relative URI into an absolute one
|
|
{
|
|
// No base?
|
|
if (base.empty())
|
|
return;
|
|
|
|
// Already absolute? XXX fix the check!
|
|
if (uri.find("://")<10)
|
|
return;
|
|
|
|
// Put the base in front
|
|
uri=base+uri;
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
void TurtleParser::parseDirective()
|
|
// Parse a directive
|
|
{
|
|
std::string value;
|
|
if (lexer.next(value)!=Lexer::Token_Name)
|
|
parseError("directive name expected after '@'");
|
|
|
|
if (value=="base") {
|
|
if (lexer.next(base)!=Lexer::Token_URI)
|
|
parseError("URI expected after @base");
|
|
} else if (value=="prefix") {
|
|
std::string prefixName;
|
|
Lexer::Token token=lexer.next(prefixName);
|
|
// A prefix name?
|
|
if (token==Lexer::Token_Name) {
|
|
token=lexer.next();
|
|
} else prefixName.resize(0);
|
|
// Colon
|
|
if (token!=Lexer::Token_Colon)
|
|
parseError("':' expected after @prefix");
|
|
// URI
|
|
std::string uri;
|
|
if (lexer.next(uri)!=Lexer::Token_URI)
|
|
parseError("URI expected after @prefix");
|
|
prefixes[prefixName]=uri;
|
|
} else {
|
|
parseError("unknown directive @"+value);
|
|
}
|
|
|
|
// Final dot
|
|
if (lexer.next()!=Lexer::Token_Dot)
|
|
parseError("'.' expected after directive");
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
inline bool TurtleParser::isName(Lexer::Token token)
|
|
// Is a (generalized) name token?
|
|
{
|
|
return (token==Lexer::Token_Name)||(token==Lexer::Token_A)||(token==Lexer::Token_True)||(token==Lexer::Token_False);
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
void TurtleParser::parseQualifiedName(const string& prefix,string& name)
|
|
// Parse a qualified name
|
|
{
|
|
if (lexer.next()!=Lexer::Token_Colon)
|
|
parseError("':' expected in qualified name");
|
|
if (!prefixes.count(prefix))
|
|
parseError("unknown prefix '"+prefix+"'");
|
|
string expandedPrefix=prefixes[prefix];
|
|
|
|
Lexer::Token token=lexer.next(name);
|
|
if (isName(token)) {
|
|
name=expandedPrefix+name;
|
|
} else {
|
|
lexer.unget(token,name);
|
|
name=expandedPrefix;
|
|
}
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
void TurtleParser::parseBlank(std::string& entry)
|
|
// Parse a blank entry
|
|
{
|
|
Lexer::Token token=lexer.next(entry);
|
|
switch (token) {
|
|
case Lexer::Token_Name:
|
|
if ((entry!="_")||(lexer.next()!=Lexer::Token_Colon)||(!isName(lexer.next(entry))))
|
|
parseError("blank nodes must start with '_:'");
|
|
entry="_:"+entry;
|
|
return;
|
|
case Lexer::Token_LBracket:
|
|
{
|
|
newBlankNode(entry);
|
|
token=lexer.next();
|
|
if (token!=Lexer::Token_RBracket) {
|
|
lexer.ungetIgnored(token);
|
|
std::string predicate,object,objectSubType;
|
|
Type::Type_ID objectType;
|
|
parsePredicateObjectList(entry,predicate,object,objectType,objectSubType);
|
|
triples.push_back(Triple(entry,predicate,object,objectType,objectSubType));
|
|
if (lexer.next()!=Lexer::Token_RBracket)
|
|
parseError("']' expected");
|
|
}
|
|
return;
|
|
}
|
|
case Lexer::Token_LParen:
|
|
{
|
|
// Collection
|
|
vector<string> entries,entrySubTypes;
|
|
vector<Type::Type_ID> entryTypes;
|
|
while ((token=lexer.next())!=Lexer::Token_RParen) {
|
|
lexer.ungetIgnored(token);
|
|
entries.push_back(string());
|
|
entryTypes.push_back(Type::Type_URI);
|
|
entrySubTypes.push_back(string());
|
|
parseObject(entries.back(),entryTypes.back(),entrySubTypes.back());
|
|
}
|
|
|
|
// Empty collection?
|
|
if (entries.empty()) {
|
|
entry="http://www.w3.org/1999/02/22-rdf-syntax-ns#nil";
|
|
return;
|
|
}
|
|
|
|
// Build blank nodes
|
|
vector<string> nodes;
|
|
nodes.resize(entries.size());
|
|
for (unsigned index=0;index<entries.size();index++)
|
|
newBlankNode(nodes[index]);
|
|
nodes.push_back("http://www.w3.org/1999/02/22-rdf-syntax-ns#nil");
|
|
|
|
// Derive triples
|
|
for (unsigned index=0;index<entries.size();index++) {
|
|
triples.push_back(Triple(nodes[index],"http://www.w3.org/1999/02/22-rdf-syntax-ns#first",entries[index],entryTypes[index],entrySubTypes[index]));
|
|
triples.push_back(Triple(nodes[index],"http://www.w3.org/1999/02/22-rdf-syntax-ns#rest",nodes[index+1],Type::Type_URI,""));
|
|
}
|
|
entry=nodes.front();
|
|
}
|
|
|
|
default: parseError("invalid blank entry");
|
|
}
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
void TurtleParser::parseSubject(Lexer::Token token,std::string& subject)
|
|
// Parse a subject
|
|
{
|
|
switch (token) {
|
|
case Lexer::Token_URI:
|
|
// URI
|
|
constructAbsoluteURI(subject);
|
|
return;
|
|
case Lexer::Token_A: subject="http://www.w3.org/1999/02/22-rdf-syntax-ns#type"; return;
|
|
case Lexer::Token_Colon:
|
|
// Qualified name with empty prefix?
|
|
lexer.unget(token,subject);
|
|
parseQualifiedName("",subject);
|
|
return;
|
|
case Lexer::Token_Name:
|
|
// Qualified name
|
|
// Blank node?
|
|
if (subject=="_") {
|
|
lexer.unget(token,subject);
|
|
parseBlank(subject);
|
|
return;
|
|
}
|
|
// No
|
|
parseQualifiedName(subject,subject);
|
|
return;
|
|
case Lexer::Token_LBracket: case Lexer::Token_LParen:
|
|
// Opening bracket/parenthesis
|
|
lexer.unget(token,subject);
|
|
parseBlank(subject);
|
|
default: parseError("invalid subject");
|
|
}
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
void TurtleParser::parseObject(std::string& object,Type::Type_ID& objectType,std::string& objectSubType)
|
|
// Parse an object
|
|
{
|
|
Lexer::Token token=lexer.next(object);
|
|
objectSubType="";
|
|
switch (token) {
|
|
case Lexer::Token_URI:
|
|
// URI
|
|
constructAbsoluteURI(object);
|
|
objectType=Type::Type_URI;
|
|
return;
|
|
case Lexer::Token_Colon:
|
|
// Qualified name with empty prefix?
|
|
lexer.unget(token,object);
|
|
parseQualifiedName("",object);
|
|
objectType=Type::Type_URI;
|
|
return;
|
|
case Lexer::Token_Name:
|
|
// Qualified name
|
|
// Blank node?
|
|
if (object=="_") {
|
|
lexer.unget(token,object);
|
|
parseBlank(object);
|
|
objectType=Type::Type_URI;
|
|
return;
|
|
}
|
|
// No
|
|
parseQualifiedName(object,object);
|
|
objectType=Type::Type_URI;
|
|
return;
|
|
case Lexer::Token_LBracket: case Lexer::Token_LParen:
|
|
// Opening bracket/parenthesis
|
|
lexer.unget(token,object);
|
|
parseBlank(object);
|
|
objectType=Type::Type_URI;
|
|
return;
|
|
case Lexer::Token_Integer:
|
|
// Literal
|
|
objectType=Type::Type_Integer;
|
|
return;
|
|
case Lexer::Token_Decimal:
|
|
// Literal
|
|
objectType=Type::Type_Decimal;
|
|
return;
|
|
case Lexer::Token_Double:
|
|
// Literal
|
|
objectType=Type::Type_Double;
|
|
return;
|
|
case Lexer::Token_A:
|
|
// Literal
|
|
object="http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
|
|
objectType=Type::Type_URI;
|
|
return;
|
|
case Lexer::Token_True:
|
|
// Literal
|
|
objectType=Type::Type_Boolean;
|
|
return;
|
|
case Lexer::Token_False:
|
|
// Literal
|
|
objectType=Type::Type_Boolean;
|
|
return;
|
|
case Lexer::Token_String:
|
|
// String literal
|
|
{
|
|
token=lexer.next();
|
|
objectType=Type::Type_Literal;
|
|
if (token==Lexer::Token_At) {
|
|
if (lexer.next(objectSubType)!=Lexer::Token_Name)
|
|
parseError("language tag expected");
|
|
objectType=Type::Type_CustomLanguage;
|
|
} else if (token==Lexer::Token_Type) {
|
|
string type;
|
|
token=lexer.next(type);
|
|
if (token==Lexer::Token_URI) {
|
|
// Already parsed
|
|
} else if (token==Lexer::Token_Colon) {
|
|
parseQualifiedName("",type);
|
|
} else if (token==Lexer::Token_Name) {
|
|
parseQualifiedName(type,type);
|
|
}
|
|
if (type=="http://www.w3.org/2001/XMLSchema#string") {
|
|
objectType=Type::Type_String;
|
|
} else if (type=="http://www.w3.org/2001/XMLSchema#integer") {
|
|
objectType=Type::Type_Integer;
|
|
} else if (type=="http://www.w3.org/2001/XMLSchema#decimal") {
|
|
objectType=Type::Type_Decimal;
|
|
} else if (type=="http://www.w3.org/2001/XMLSchema#double") {
|
|
objectType=Type::Type_Double;
|
|
} else if (type=="http://www.w3.org/2001/XMLSchema#boolean") {
|
|
objectType=Type::Type_Boolean;
|
|
} else {
|
|
objectType=Type::Type_CustomType;
|
|
objectSubType=type;
|
|
}
|
|
} else {
|
|
lexer.ungetIgnored(token);
|
|
}
|
|
return;
|
|
}
|
|
default: parseError("invalid object");
|
|
}
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
void TurtleParser::parsePredicateObjectList(const string& subject,string& predicate,string& object,Type::Type_ID& objectType,string& objectSubType)
|
|
// Parse a predicate object list
|
|
{
|
|
// Parse the first predicate
|
|
Lexer::Token token;
|
|
switch (token=lexer.next(predicate)) {
|
|
case Lexer::Token_URI: constructAbsoluteURI(predicate); break;
|
|
case Lexer::Token_A: predicate="http://www.w3.org/1999/02/22-rdf-syntax-ns#type"; break;
|
|
case Lexer::Token_Colon: lexer.unget(token,predicate); parseQualifiedName("",predicate); break;
|
|
case Lexer::Token_Name: if (predicate=="_") parseError("blank nodes not allowed as predicate"); parseQualifiedName(predicate,predicate); break;
|
|
default: parseError("invalid predicate");
|
|
}
|
|
|
|
// Parse the object
|
|
parseObject(object,objectType,objectSubType);
|
|
|
|
// Additional objects?
|
|
token=lexer.next();
|
|
while (token==Lexer::Token_Comma) {
|
|
string additionalObject,additionalObjectSubType;
|
|
Type::Type_ID additionalObjectType;
|
|
parseObject(additionalObject,additionalObjectType,additionalObjectSubType);
|
|
triples.push_back(Triple(subject,predicate,additionalObject,additionalObjectType,additionalObjectSubType));
|
|
token=lexer.next();
|
|
}
|
|
|
|
// Additional predicates?
|
|
while (token==Lexer::Token_Semicolon) {
|
|
// Parse the predicate
|
|
string additionalPredicate;
|
|
switch (token=lexer.next(additionalPredicate)) {
|
|
case Lexer::Token_URI: constructAbsoluteURI(additionalPredicate); break;
|
|
case Lexer::Token_A: additionalPredicate="http://www.w3.org/1999/02/22-rdf-syntax-ns#type"; break;
|
|
case Lexer::Token_Colon: lexer.unget(token,additionalPredicate); parseQualifiedName("",additionalPredicate); break;
|
|
case Lexer::Token_Name: if (additionalPredicate=="_") parseError("blank nodes not allowed as predicate"); parseQualifiedName(additionalPredicate,additionalPredicate); break;
|
|
default: lexer.unget(token,additionalPredicate); return;
|
|
}
|
|
|
|
// Parse the object
|
|
string additionalObject,additionalObjectSubType;
|
|
Type::Type_ID additionalObjectType;
|
|
parseObject(additionalObject,additionalObjectType,additionalObjectSubType);
|
|
triples.push_back(Triple(subject,additionalPredicate,additionalObject,additionalObjectType,additionalObjectSubType));
|
|
|
|
// Additional objects?
|
|
token=lexer.next();
|
|
while (token==Lexer::Token_Comma) {
|
|
parseObject(additionalObject,additionalObjectType,additionalObjectSubType);
|
|
triples.push_back(Triple(subject,additionalPredicate,additionalObject,additionalObjectType,additionalObjectSubType));
|
|
token=lexer.next();
|
|
}
|
|
}
|
|
lexer.ungetIgnored(token);
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
void TurtleParser::parseTriple(Lexer::Token token,std::string& subject,std::string& predicate,std::string& object,Type::Type_ID& objectType,std::string& objectSubType)
|
|
// Parse a triple
|
|
{
|
|
parseSubject(token,subject);
|
|
parsePredicateObjectList(subject,predicate,object,objectType,objectSubType);
|
|
if (lexer.next()!=Lexer::Token_Dot)
|
|
parseError("'.' expected after triple");
|
|
}
|
|
//---------------------------------------------------------------------------
|
|
bool TurtleParser::parse(std::string& subject,std::string& predicate,std::string& object,Type::Type_ID& objectType,std::string& objectSubType)
|
|
// Read the next triple
|
|
{
|
|
// Some triples left?
|
|
if (triplesReader<triples.size()) {
|
|
subject=triples[triplesReader].subject;
|
|
predicate=triples[triplesReader].predicate;
|
|
object=triples[triplesReader].object;
|
|
objectType=triples[triplesReader].objectType;
|
|
objectSubType=triples[triplesReader].objectSubType;
|
|
if ((++triplesReader)>=triples.size()) {
|
|
triples.clear();
|
|
triplesReader=0;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// No, check if the input is done
|
|
Lexer::Token token;
|
|
while (true) {
|
|
token=lexer.next(subject);
|
|
if (token==Lexer::Token_Eof) return false;
|
|
|
|
// A directive?
|
|
if (token==Lexer::Token_At) {
|
|
parseDirective();
|
|
continue;
|
|
} else break;
|
|
}
|
|
|
|
// No, parse a triple
|
|
parseTriple(token,subject,predicate,object,objectType,objectSubType);
|
|
return true;
|
|
}
|
|
//---------------------------------------------------------------------------
|