gStore/Parser/TurtleParser.h

170 lines
5.7 KiB
C++

#ifndef H_tools_rdf3xload_TurtleParser
#define H_tools_rdf3xload_TurtleParser
//---------------------------------------------------------------------------
// RDF-3X
// (c) 2008 Thomas Neumann. Web site: http://www.mpi-inf.mpg.de/~neumann/rdf3x
//
// This work is licensed under the Creative Commons
// Attribution-Noncommercial-Share Alike 3.0 Unported License. To view a copy
// of this license, visit http://creativecommons.org/licenses/by-nc-sa/3.0/
// or send a letter to Creative Commons, 171 Second Street, Suite 300,
// San Francisco, California, 94105, USA.
//---------------------------------------------------------------------------
#include "Type.h"
#include <istream>
#include <string>
#include <map>
#include <vector>
//---------------------------------------------------------------------------
/// Parse a turtle file
class TurtleParser
{
public:
/// A parse error
class Exception {
public:
/// The message
std::string message;
/// Constructor
Exception(const std::string& message);
/// Constructor
Exception(const char* message);
/// Destructor
~Exception();
};
private:
/// A turtle lexer
class Lexer {
public:
/// Possible tokens
enum Token { Token_Eof, Token_Dot, Token_Colon, Token_Comma, Token_Semicolon, Token_LBracket, Token_RBracket, Token_LParen, Token_RParen, Token_At, Token_Type, Token_Integer, Token_Decimal, Token_Double, Token_Name, Token_A, Token_True, Token_False, Token_String, Token_URI };
private:
/// The input
std::istream& in;
/// The putback
Token putBack;
/// The putback string
std::string putBackValue;
/// Buffer for parsing when ignoring the value
std::string ignored;
/// The current line
unsigned line;
/// Size of the read buffer
static const unsigned readBufferSize = 1024;
/// Read buffer
char readBuffer[readBufferSize];
/// Read buffer pointers
char* readBufferStart,*readBufferEnd;
/// Read new characters
bool doRead(char& c);
/// Read a character
bool read(char& c) { if (readBufferStart<readBufferEnd) { c=*(readBufferStart++); return true; } else return doRead(c); }
/// Unread the last character
void unread() { readBufferStart--; }
/// Lex a hex code
unsigned lexHexCode(unsigned len);
/// Lex an escape sequence
void lexEscape(std::string& token);
/// Lex a long string
Token lexLongString(std::string& token);
/// Lex a string
Token lexString(std::string& token,char c);
/// Lex a URI
Token lexURI(std::string& token,char c);
/// Lex a number
Token lexNumber(std::string& token,char c);
public:
/// Constructor
Lexer(std::istream& in);
/// Destructor
~Lexer();
/// The next token (including value)
Token next(std::string& value);
/// The next token (ignoring the value)
Token next() { return next(ignored); }
/// Put a token and a string back
void unget(Token t,const std::string& s) { putBack=t; if (t>=Token_Integer) putBackValue=s; }
/// Put a token back
void ungetIgnored(Token t) { putBack=t; if (t>=Token_Integer) putBackValue=ignored; }
/// Get the line
unsigned getLine() const { return line; }
void discardLine()
{
char c;
while (read(c) && c != '\n');
}
};
/// A triple
struct Triple {
/// The entries
std::string subject,predicate,object,objectSubType;
/// Type for the object
Type::Type_ID objectType;
/// Constructor
Triple(const std::string& subject,const std::string& predicate,const std::string& object,Type::Type_ID objectType,const std::string& objectSubType) : subject(subject),predicate(predicate),object(object),objectSubType(objectSubType),objectType(objectType) {}
};
/// The lexer
Lexer lexer;
/// The uri base
std::string base;
/// All known prefixes
std::map<std::string,std::string> prefixes;
/// The currently available triples
std::vector<Triple> triples;
/// Reader in the triples
unsigned triplesReader;
/// The next blank node id
unsigned nextBlank;
/// Is a (generalized) name token?
static inline bool isName(Lexer::Token token);
// Convert a relative URI into an absolute one
void constructAbsoluteURI(std::string& uri);
/// Construct a new blank node
void newBlankNode(std::string& node);
/// Report an error
void parseError(const std::string& message);
/// Parse a qualified name
void parseQualifiedName(const std::string& prefix,std::string& name);
/// Parse a blank entry
void parseBlank(std::string& entry);
/// Parse a subject
void parseSubject(Lexer::Token token,std::string& subject);
/// Parse an object
void parseObject(std::string& object,Type::Type_ID& objectType,std::string& objectSubType);
/// Parse a predicate object list
void parsePredicateObjectList(const std::string& subject,std::string& predicate,std::string& object,Type::Type_ID& objectType,std::string& objectSubType);
/// Parse a directive
void parseDirective();
/// Parse a new triple
void parseTriple(Lexer::Token token,std::string& subject,std::string& predicate,std::string& object,Type::Type_ID& objectType,std::string& objectSubType);
public:
/// Constructor
TurtleParser(std::istream& in);
/// Destructor
~TurtleParser();
/// Read the next triple
bool parse(std::string& subject,std::string& predicate,std::string& object,Type::Type_ID& objectType,std::string& objectSubType);
void discardLine()
{
lexer.discardLine();
}
};
//---------------------------------------------------------------------------
#endif