/*
 * [The "BSD license"]
 * Copyright (c) 2013 Terence Parr
 * Copyright (c) 2013 Sam Harwell
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
using System;
using System.Collections.Generic;
using System.Text;
using Antlr4.Runtime;
using Antlr4.Runtime.Atn;
using Antlr4.Runtime.Misc;
using Antlr4.Runtime.Sharpen;

namespace Antlr4.Runtime
{
    /// <summary>A lexer is a recognizer that draws input symbols from a character stream.</summary>
    /// <remarks>
    /// Lexer grammars result in a subclass of this object. A Lexer object
    /// uses simplified match() and error recovery mechanisms in the interest
    /// of speed.
    /// </remarks>
    // NOTE(review): the base type is written here without type arguments. If
    // Recognizer is declared as a generic type in this project, its arguments
    // must be restored - confirm against the Recognizer declaration.
    public abstract class Lexer : Recognizer, ITokenSource
    {
        /// <summary>The mode a lexer starts in and is put back into by <see cref="Reset"/>.</summary>
        public const int DefaultMode = 0;

        /// <summary>Alias for <c>TokenConstants.DefaultChannel</c>.</summary>
        public const int DefaultTokenChannel = TokenConstants.DefaultChannel;

        /// <summary>Alias for <c>TokenConstants.HiddenChannel</c>.</summary>
        public const int Hidden = TokenConstants.HiddenChannel;

        public const int MinCharValue = '\u0000';

        public const int MaxCharValue = '\uFFFE';

        /// <summary>The character stream this lexer reads from.</summary>
        public ICharStream _input;

        /// <summary>
        /// The (token source, character stream) pair handed to the token
        /// factory whenever a token is created.
        /// </summary>
        protected internal Tuple<ITokenSource, ICharStream> _tokenFactorySourcePair;

        /// <summary>How to create token objects.</summary>
        protected internal ITokenFactory _factory = CommonTokenFactory.Default;

        /// <summary>The goal of all lexer rules/methods is to create a token object.</summary>
        /// <remarks>
        /// This is an instance variable as multiple rules may collaborate to
        /// create a single token. nextToken will return this object after
        /// matching lexer rule(s). If you subclass to allow multiple token
        /// emissions, then set this to the last token to be matched or
        /// something nonnull so that the auto token emit mechanism will not
        /// emit another token.
        /// </remarks>
        public IToken _token;

        /// <summary>
        /// What character index in the stream did the current token start at?
        /// Needed, for example, to get the text for current token.
        /// </summary>
        /// <remarks>Set at the start of nextToken.</remarks>
        public int _tokenStartCharIndex = -1;

        /// <summary>The line on which the first character of the token resides.</summary>
        public int _tokenStartLine;

        /// <summary>The character position of first character within the line.</summary>
        public int _tokenStartCharPositionInLine;

        /// <summary>Once we see EOF on char stream, next token will be EOF.</summary>
        /// <remarks>If you have DONE : EOF ; then you see DONE EOF.</remarks>
        public bool _hitEOF;

        /// <summary>The channel number for the current token.</summary>
        public int _channel;

        /// <summary>The token type for the current token.</summary>
        public int _type;

        /// <summary>
        /// Modes saved by <see cref="PushMode"/>; the most recently pushed
        /// mode is the last element.
        /// </summary>
        public readonly List<int> _modeStack = new List<int>();

        /// <summary>The lexer mode currently passed to the interpreter when matching.</summary>
        public int _mode = Antlr4.Runtime.Lexer.DefaultMode;

        /// <summary>
        /// You can set the text for the current token to override what is in
        /// the input char buffer.
        /// </summary>
        /// <remarks>Use setText() or can set this instance var.</remarks>
        public string _text;

        /// <summary>Creates a lexer that reads from <paramref name="input"/>.</summary>
        /// <param name="input">The character stream to tokenize.</param>
        public Lexer(ICharStream input)
        {
            this._input = input;
            this._tokenFactorySourcePair = Tuple.Create((ITokenSource)this, input);
        }

        /// <summary>
        /// Rewinds the input (when there is one) to index 0 and restores all
        /// per-token and mode state to its initial values.
        /// </summary>
        public virtual void Reset()
        {
            // Rewind the input; a null stream is tolerated because
            // SetInputStream clears the field before calling Reset.
            if (_input != null)
            {
                _input.Seek(0);
            }

            // Wipe the lexer state variables.
            _token = null;
            _type = TokenConstants.InvalidType;
            _channel = TokenConstants.DefaultChannel;
            _tokenStartCharIndex = -1;
            _tokenStartCharPositionInLine = -1;
            _tokenStartLine = -1;
            _text = null;
            _hitEOF = false;
            _mode = Antlr4.Runtime.Lexer.DefaultMode;
            _modeStack.Clear();
            Interpreter.Reset();
        }

        /// <summary>
        /// Return a token from this source; i.e., match a token on the char
        /// stream.
        /// </summary>
        /// <returns>
        /// The next token that is not skipped, or the EOF token once the end
        /// of the stream has been seen.
        /// </returns>
        /// <exception cref="InvalidOperationException">The input stream is null.</exception>
        public virtual IToken NextToken()
        {
            if (_input == null)
            {
                throw new InvalidOperationException("nextToken requires a non-null input stream.");
            }

            // Mark start location in char stream so unbuffered streams are
            // guaranteed at least have text of current token.
            int tokenStartMarker = _input.Mark();
            try
            {
                while (true)
                {
                    if (_hitEOF)
                    {
                        EmitEOF();
                        return _token;
                    }

                    // Start a fresh token at the current input position.
                    _token = null;
                    _channel = TokenConstants.DefaultChannel;
                    _tokenStartCharIndex = _input.Index;
                    _tokenStartCharPositionInLine = Interpreter.Column;
                    _tokenStartLine = Interpreter.Line;
                    _text = null;

                    // Keep matching while rules ask for More, so that several
                    // rules can contribute to one token.
                    do
                    {
                        _type = TokenConstants.InvalidType;
                        int ttype;
                        try
                        {
                            ttype = Interpreter.Match(_input, _mode);
                        }
                        catch (LexerNoViableAltException e)
                        {
                            // Report the error, recover, and treat the
                            // offending text as skipped.
                            NotifyListeners(e);
                            Recover(e);
                            ttype = TokenTypes.Skip;
                        }

                        if (_input.La(1) == IntStreamConstants.Eof)
                        {
                            _hitEOF = true;
                        }

                        // A rule action may already have set _type (e.g. via
                        // Skip() or More()); only fall back to the matched
                        // type when it did not.
                        if (_type == TokenConstants.InvalidType)
                        {
                            _type = ttype;
                        }

                        if (_type == TokenTypes.Skip)
                        {
                            break;
                        }
                    }
                    while (_type == TokenTypes.More);

                    if (_type == TokenTypes.Skip)
                    {
                        // Nothing to emit for a skipped rule; look for the next token.
                        continue;
                    }

                    if (_token == null)
                    {
                        Emit();
                    }

                    return _token;
                }
            }
            finally
            {
                // Make sure we release marker after match or
                // unbuffered char stream will keep buffering.
                _input.Release(tokenStartMarker);
            }
        }

        /// <summary>
        /// Instruct the lexer to skip creating a token for current lexer rule
        /// and look for another token.
        /// </summary>
        /// <remarks>
        /// nextToken() knows to keep looking when a lexer rule finishes with
        /// the token type set to Skip. Recall that if token==null at end of
        /// any token rule, it creates one for you and emits it.
        /// </remarks>
        public virtual void Skip()
        {
            _type = TokenTypes.Skip;
        }

        /// <summary>
        /// Marks the current token type as More, which makes nextToken keep
        /// matching instead of emitting a token yet.
        /// </summary>
        public virtual void More()
        {
            _type = TokenTypes.More;
        }

        /// <summary>Switches the lexer to mode <paramref name="m"/>.</summary>
        /// <param name="m">The mode to use for subsequent matching.</param>
        public virtual void Mode(int m)
        {
            _mode = m;
        }

        /// <summary>
        /// Saves the current mode on the mode stack and switches to
        /// <paramref name="m"/>.
        /// </summary>
        /// <param name="m">The mode to switch to.</param>
        public virtual void PushMode(int m)
        {
            _modeStack.Add(_mode);
            Mode(m);
        }

        /// <summary>
        /// Restores the most recently pushed mode and removes it from the
        /// mode stack.
        /// </summary>
        /// <returns>The mode that is current after the pop.</returns>
        /// <exception cref="InvalidOperationException">The mode stack is empty.</exception>
        public virtual int PopMode()
        {
            if (_modeStack.Count == 0)
            {
                throw new InvalidOperationException("Cannot pop a lexer mode: the mode stack is empty.");
            }

            int last = _modeStack.Count - 1;
            int mode = _modeStack[last];
            _modeStack.RemoveAt(last);
            Mode(mode);
            return _mode;
        }

        /// <summary>Gets or sets the factory used to create token objects.</summary>
        public virtual ITokenFactory TokenFactory
        {
            get
            {
                return _factory;
            }
            set
            {
                this._factory = value;
            }
        }

        /// <summary>Set the char stream and reset the lexer.</summary>
        /// <param name="input">The new character stream to read from.</param>
        public virtual void SetInputStream(ICharStream input)
        {
            // Detach the old stream first so that Reset() does not seek it.
            this._input = null;
            this._tokenFactorySourcePair = Tuple.Create((ITokenSource)this, _input);
            Reset();
            this._input = input;
            this._tokenFactorySourcePair = Tuple.Create((ITokenSource)this, _input);
        }

        /// <summary>Gets the source name reported by the current input stream.</summary>
        public virtual string SourceName
        {
            get
            {
                return _input.SourceName;
            }
        }

        /// <summary>Gets the character stream this lexer reads from.</summary>
        public override IIntStream InputStream
        {
            get
            {
                return _input;
            }
        }

        /// <summary>Gets the input stream typed as a character stream.</summary>
        ICharStream ITokenSource.InputStream
        {
            get
            {
                return (ICharStream)InputStream;
            }
        }

        /// <summary>
        /// By default does not support multiple emits per nextToken invocation
        /// for efficiency reasons.
        /// </summary>
        /// <remarks>
        /// Subclass and override this method, nextToken, and getToken (to push
        /// tokens into a list and pull from that list rather than a single
        /// variable as this implementation does).
        /// </remarks>
        /// <param name="token">The token to record as the current token.</param>
        public virtual void Emit(IToken token)
        {
            this._token = token;
        }

        /// <summary>
        /// The standard method called to automatically emit a token at the
        /// outermost lexical rule.
        /// </summary>
        /// <remarks>
        /// The token object should point into the char buffer start..stop.
        /// If there is a text override in 'text', use that to set the token's
        /// text. Override this method to emit custom Token objects or provide
        /// a new factory.
        /// </remarks>
        /// <returns>The token that was created and emitted.</returns>
        public virtual IToken Emit()
        {
            IToken t = _factory.Create(
                _tokenFactorySourcePair,
                _type,
                _text,
                _channel,
                _tokenStartCharIndex,
                CharIndex - 1,
                _tokenStartLine,
                _tokenStartCharPositionInLine);
            Emit(t);
            return t;
        }

        /// <summary>Creates and emits the EOF token at the current input position.</summary>
        /// <returns>The EOF token that was emitted.</returns>
        public virtual IToken EmitEOF()
        {
            int cpos = Column;

            // The character position for EOF is one beyond the position of
            // the previous token's last character.
            if (_token != null)
            {
                int n = _token.StopIndex - _token.StartIndex + 1;
                cpos = _token.Column + n;
            }

            IToken eof = _factory.Create(
                _tokenFactorySourcePair,
                TokenConstants.Eof,
                null,
                TokenConstants.DefaultChannel,
                _input.Index,
                _input.Index - 1,
                Line,
                cpos);
            Emit(eof);
            return eof;
        }

        /// <summary>Gets or sets the interpreter's current line.</summary>
        public virtual int Line
        {
            get
            {
                return Interpreter.Line;
            }
            set
            {
                Interpreter.Line = value;
            }
        }

        /// <summary>Gets or sets the interpreter's current column.</summary>
        public virtual int Column
        {
            get
            {
                return Interpreter.Column;
            }
            set
            {
                Interpreter.Column = value;
            }
        }

        /// <summary>What is the index of the current character of lookahead?</summary>
        public virtual int CharIndex
        {
            get
            {
                return _input.Index;
            }
        }

        /// <summary>
        /// Gets the text matched so far for the current token or any text
        /// override; setting it replaces the complete text of this token.
        /// </summary>
        /// <remarks>Setting the text wipes any previous changes to the text.</remarks>
        public virtual string Text
        {
            get
            {
                if (_text != null)
                {
                    return _text;
                }

                return Interpreter.GetText(_input);
            }
            set
            {
                this._text = value;
            }
        }

        /// <summary>Gets or sets the current token. Override if emitting multiple tokens.</summary>
        public virtual IToken Token
        {
            get
            {
                return _token;
            }
            set
            {
                this._token = value;
            }
        }

        /// <summary>Gets or sets the token type for the current token.</summary>
        public virtual int Type
        {
            get
            {
                return _type;
            }
            set
            {
                _type = value;
            }
        }

        /// <summary>Gets or sets the channel number for the current token.</summary>
        public virtual int Channel
        {
            get
            {
                return _channel;
            }
            set
            {
                _channel = value;
            }
        }

        /// <summary>Gets the names of the lexer modes; this base implementation returns null.</summary>
        public virtual string[] ModeNames
        {
            get
            {
                return null;
            }
        }

        /// <summary>
        /// Used to print out token names like ID during debugging and
        /// error reporting.
        /// </summary>
        /// <remarks>
        /// The generated parsers implement a method that overrides this to
        /// point to their String[] tokenNames. This base implementation
        /// returns null.
        /// </remarks>
        public override string[] TokenNames
        {
            get
            {
                return null;
            }
        }

        /// <summary>Return a list of all Token objects in input char stream.</summary>
        /// <remarks>Forces load of all tokens. Does not include EOF token.</remarks>
        /// <returns>Every token produced by <see cref="NextToken"/> before the EOF token.</returns>
        public virtual IList<IToken> GetAllTokens()
        {
            IList<IToken> tokens = new List<IToken>();
            for (IToken t = NextToken(); t.Type != TokenConstants.Eof; t = NextToken())
            {
                tokens.Add(t);
            }

            return tokens;
        }

        /// <summary>
        /// Recovers from a failed match by consuming one character, unless the
        /// stream is already at EOF.
        /// </summary>
        /// <param name="e">The exception describing the failed match.</param>
        public virtual void Recover(LexerNoViableAltException e)
        {
            if (_input.La(1) != IntStreamConstants.Eof)
            {
                // Skip a char and try again.
                Interpreter.Consume(_input);
            }
        }

        /// <summary>
        /// Reports a token recognition error for the text from the start of
        /// the current token up to the current input index.
        /// </summary>
        /// <param name="e">The exception describing the failed match.</param>
        public virtual void NotifyListeners(LexerNoViableAltException e)
        {
            string text = _input.GetText(Interval.Of(_tokenStartCharIndex, _input.Index));
            string msg = "token recognition error at: '" + GetErrorDisplay(text) + "'";
            ErrorListenerDispatch.SyntaxError(this, 0, _tokenStartLine, _tokenStartCharPositionInLine, msg, e);
        }

        /// <summary>
        /// Builds a display form of <paramref name="s"/> in which newline,
        /// tab and carriage return are shown as escape sequences.
        /// </summary>
        /// <param name="s">The text to render.</param>
        /// <returns>The rendered text.</returns>
        public virtual string GetErrorDisplay(string s)
        {
            StringBuilder buf = new StringBuilder();
            foreach (char c in s)
            {
                buf.Append(GetErrorDisplay(c));
            }

            return buf.ToString();
        }

        /// <summary>Builds a display form of a single character code.</summary>
        /// <param name="c">The character code, or <c>TokenConstants.Eof</c>.</param>
        /// <returns>
        /// <c>&lt;EOF&gt;</c> for EOF, an escape sequence for newline, tab and
        /// carriage return, and the character itself otherwise.
        /// </returns>
        public virtual string GetErrorDisplay(int c)
        {
            switch (c)
            {
                case TokenConstants.Eof:
                {
                    // EOF has no printable form; show a readable marker
                    // instead of an empty string.
                    return "<EOF>";
                }

                case '\n':
                {
                    return "\\n";
                }

                case '\t':
                {
                    return "\\t";
                }

                case '\r':
                {
                    return "\\r";
                }

                default:
                {
                    return ((char)c).ToString();
                }
            }
        }

        /// <summary>
        /// Returns the display form of <paramref name="c"/> wrapped in single
        /// quotes.
        /// </summary>
        /// <param name="c">The character code, or <c>TokenConstants.Eof</c>.</param>
        /// <returns>The quoted display form.</returns>
        public virtual string GetCharErrorDisplay(int c)
        {
            return "'" + GetErrorDisplay(c) + "'";
        }

        /// <summary>
        /// Lexers can normally match any char in its vocabulary after matching
        /// a token, so do the easy thing and just kill a character and hope
        /// it all works out.
        /// </summary>
        /// <remarks>
        /// You can instead use the rule invocation stack to do sophisticated
        /// error recovery if you are in a fragment rule.
        /// </remarks>
        /// <param name="re">The recognition error being recovered from.</param>
        public virtual void Recover(RecognitionException re)
        {
            // TODO: Do we lose character or line position information?
            // Do not try to consume past the end of the stream; this mirrors
            // the EOF guard in Recover(LexerNoViableAltException).
            if (_input.La(1) != IntStreamConstants.Eof)
            {
                _input.Consume();
            }
        }
    }
}