antlr/Antlr4.Runtime/Lexer.cs

/*
 * [The "BSD license"]
 *  Copyright (c) 2013 Terence Parr
 *  Copyright (c) 2013 Sam Harwell
 *  All rights reserved.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *  1. Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *  2. Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *  3. The name of the author may not be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 *  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 *  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
using System;
using System.Collections.Generic;
using System.Text;
using Antlr4.Runtime;
using Antlr4.Runtime.Atn;
using Antlr4.Runtime.Misc;
using Sharpen;

namespace Antlr4.Runtime
{
    /// <summary>A lexer is recognizer that draws input symbols from a character stream.</summary>
    /// <remarks>
    /// A lexer is recognizer that draws input symbols from a character stream.
    /// lexer grammars result in a subclass of this object. A Lexer object
    /// uses simplified match() and error recovery mechanisms in the interest
    /// of speed.
    /// </remarks>
    public abstract class Lexer : Recognizer<int, LexerATNSimulator>, ITokenSource
    {
        public const int DefaultMode = 0;

        public const int DefaultTokenChannel = TokenConstants.DefaultChannel;

        public const int Hidden = TokenConstants.HiddenChannel;

        public const int MinCharValue = '\u0000';

        public const int MaxCharValue = '\uFFFE';

        public ICharStream _input;

        protected internal Tuple<ITokenSource, ICharStream> _tokenFactorySourcePair;

        /// <summary>How to create token objects</summary>
        protected internal ITokenFactory _factory = CommonTokenFactory.Default;

        /// <summary>The goal of all lexer rules/methods is to create a token object.</summary>
        /// <remarks>
        /// The goal of all lexer rules/methods is to create a token object.
        /// This is an instance variable as multiple rules may collaborate to
        /// create a single token.  nextToken will return this object after
        /// matching lexer rule(s).  If you subclass to allow multiple token
        /// emissions, then set this to the last token to be matched or
        /// something nonnull so that the auto token emit mechanism will not
        /// emit another token.
        /// </remarks>
        public IToken _token;

        /// <summary>
        /// What character index in the stream did the current token start at?
        /// Needed, for example, to get the text for current token.
        /// </summary>
        /// <remarks>
        /// What character index in the stream did the current token start at?
        /// Needed, for example, to get the text for current token.  Set at
        /// the start of nextToken.
        /// </remarks>
        public int _tokenStartCharIndex = -1;

        /// <summary>The line on which the first character of the token resides</summary>
        public int _tokenStartLine;

        /// <summary>The character position of first character within the line</summary>
        public int _tokenStartCharPositionInLine;

        /// <summary>Once we see EOF on char stream, next token will be EOF.</summary>
        /// <remarks>
        /// Once we see EOF on char stream, next token will be EOF.
        /// If you have DONE : EOF ; then you see DONE EOF.
        /// </remarks>
        public bool _hitEOF;

        /// <summary>The channel number for the current token</summary>
        public int _channel;

        /// <summary>The token type for the current token</summary>
        public int _type;

        public readonly List<int> _modeStack = new List<int>();

        public int _mode = Antlr4.Runtime.Lexer.DefaultMode;

        /// <summary>
        /// You can set the text for the current token to override what is in
        /// the input char buffer.
        /// </summary>
        /// <remarks>
        /// You can set the text for the current token to override what is in
        /// the input char buffer.  Use setText() or can set this instance var.
        /// </remarks>
        public string _text;

        public Lexer(ICharStream input)
        {
            this._input = input;
            this._tokenFactorySourcePair = Tuple.Create((ITokenSource)this, input);
        }

        public virtual void Reset()
        {
            // wack Lexer state variables
            if (_input != null)
            {
                _input.Seek(0);
            }
            // rewind the input
            _token = null;
            _type = TokenConstants.InvalidType;
            _channel = TokenConstants.DefaultChannel;
            _tokenStartCharIndex = -1;
            _tokenStartCharPositionInLine = -1;
            _tokenStartLine = -1;
            _text = null;
            _hitEOF = false;
            _mode = Antlr4.Runtime.Lexer.DefaultMode;
            _modeStack.Clear();
            Interpreter.Reset();
        }

        /// <summary>
        /// Return a token from this source; i.e., match a token on the char
        /// stream.
        /// </summary>
        /// <remarks>
        /// Return a token from this source; i.e., match a token on the char
        /// stream.
        /// </remarks>
        public virtual IToken NextToken()
        {
            if (_input == null)
            {
                throw new InvalidOperationException("nextToken requires a non-null input stream."
                    );
            }
            // Mark start location in char stream so unbuffered streams are
            // guaranteed at least have text of current token
            int tokenStartMarker = _input.Mark();
            try
            {
                while (true)
                {
                    if (_hitEOF)
                    {
                        EmitEOF();
                        return _token;
                    }
                    _token = null;
                    _channel = TokenConstants.DefaultChannel;
                    _tokenStartCharIndex = _input.Index;
                    _tokenStartCharPositionInLine = Interpreter.GetCharPositionInLine();
                    _tokenStartLine = Interpreter.GetLine();
                    _text = null;
                    do
                    {
                        _type = TokenConstants.InvalidType;
                        //				System.out.println("nextToken line "+tokenStartLine+" at "+((char)input.LA(1))+
                        //								   " in mode "+mode+
                        //								   " at index "+input.index());
                        int ttype;
                        try
                        {
                            ttype = Interpreter.Match(_input, _mode);
                        }
                        catch (LexerNoViableAltException e)
                        {
                            NotifyListeners(e);
                            // report error
                            Recover(e);
                            ttype = TokenTypes.Skip;
                        }
                        if (_input.La(1) == IntStreamConstants.Eof)
                        {
                            _hitEOF = true;
                        }
                        if (_type == TokenConstants.InvalidType)
                        {
                            _type = ttype;
                        }
                        if (_type == TokenTypes.Skip)
                        {
                            goto outer_continue;
                        }
                    }
                    while (_type == TokenTypes.More);
                    if (_token == null)
                    {
                        Emit();
                    }
                    return _token;
outer_continue: ;
                }
outer_break: ;
            }
            finally
            {
                // make sure we release marker after match or
                // unbuffered char stream will keep buffering
                _input.Release(tokenStartMarker);
            }
        }

        /// <summary>
        /// Instruct the lexer to skip creating a token for current lexer rule
        /// and look for another token.
        /// </summary>
        /// <remarks>
        /// Instruct the lexer to skip creating a token for current lexer rule
        /// and look for another token.  nextToken() knows to keep looking when
        /// a lexer rule finishes with token set to SKIP_TOKEN.  Recall that
        /// if token==null at end of any token rule, it creates one for you
        /// and emits it.
        /// </remarks>
        public virtual void Skip()
        {
            _type = TokenTypes.Skip;
        }

        public virtual void More()
        {
            _type = TokenTypes.More;
        }

        public virtual void Mode(int m)
        {
            _mode = m;
        }

        public virtual void PushMode(int m)
        {
            _modeStack.Add(_mode);
            Mode(m);
        }

        public virtual int PopMode()
        {
            if (_modeStack.Count == 0)
            {
                throw new InvalidOperationException();
            }
            Mode(_modeStack.Pop());
            return _mode;
        }

        public virtual ITokenFactory TokenFactory
        {
            get
            {
                return _factory;
            }
            set
            {
                ITokenFactory factory = value;
                this._factory = factory;
            }
        }

        /// <summary>Set the char stream and reset the lexer</summary>
        public virtual void SetInputStream(ICharStream input)
        {
            this._input = null;
            this._tokenFactorySourcePair = Tuple.Create((ITokenSource)this, _input);
            Reset();
            this._input = input;
            this._tokenFactorySourcePair = Tuple.Create((ITokenSource)this, _input);
        }

        public virtual string SourceName
        {
            get
            {
                return _input.SourceName;
            }
        }

        public override IIntStream InputStream
        {
            get
            {
                return _input;
            }
        }

        ICharStream ITokenSource.InputStream
        {
            get
            {
                return (ICharStream)InputStream;
            }
        }

        /// <summary>
        /// By default does not support multiple emits per nextToken invocation
        /// for efficiency reasons.
        /// </summary>
        /// <remarks>
        /// By default does not support multiple emits per nextToken invocation
        /// for efficiency reasons.  Subclass and override this method, nextToken,
        /// and getToken (to push tokens into a list and pull from that list
        /// rather than a single variable as this implementation does).
        /// </remarks>
        public virtual void Emit(IToken token)
        {
            //System.err.println("emit "+token);
            this._token = token;
        }

        /// <summary>
        /// The standard method called to automatically emit a token at the
        /// outermost lexical rule.
        /// </summary>
        /// <remarks>
        /// The standard method called to automatically emit a token at the
        /// outermost lexical rule.  The token object should point into the
        /// char buffer start..stop.  If there is a text override in 'text',
        /// use that to set the token's text.  Override this method to emit
        /// custom Token objects or provide a new factory.
        /// </remarks>
        public virtual IToken Emit()
        {
            IToken t = _factory.Create(_tokenFactorySourcePair, _type, _text, _channel, _tokenStartCharIndex
                , GetCharIndex() - 1, _tokenStartLine, _tokenStartCharPositionInLine);
            Emit(t);
            return t;
        }

        public virtual IToken EmitEOF()
        {
            int cpos = Column;
            // The character position for EOF is one beyond the position of
            // the previous token's last character
            if (_token != null)
            {
                int n = _token.StopIndex - _token.StartIndex + 1;
                cpos = _token.Column + n;
            }
            IToken eof = _factory.Create(_tokenFactorySourcePair, TokenConstants.Eof, null, TokenConstants
                .DefaultChannel, _input.Index, _input.Index - 1, Line, cpos);
            Emit(eof);
            return eof;
        }

        public virtual int Line
        {
            get
            {
                return Interpreter.GetLine();
            }
        }

        public virtual int Column
        {
            get
            {
                return Interpreter.GetCharPositionInLine();
            }
        }

        public virtual void SetLine(int line)
        {
            Interpreter.SetLine(line);
        }

        public virtual void SetCharPositionInLine(int charPositionInLine)
        {
            Interpreter.SetCharPositionInLine(charPositionInLine);
        }

        /// <summary>What is the index of the current character of lookahead?</summary>
        public virtual int GetCharIndex()
        {
            return _input.Index;
        }

        /// <summary>
        /// Return the text matched so far for the current token or any
        /// text override.
        /// </summary>
        /// <remarks>
        /// Return the text matched so far for the current token or any
        /// text override.
        /// </remarks>
        public virtual string GetText()
        {
            if (_text != null)
            {
                return _text;
            }
            return Interpreter.GetText(_input);
        }

        /// <summary>
        /// Set the complete text of this token; it wipes any previous
        /// changes to the text.
        /// </summary>
        /// <remarks>
        /// Set the complete text of this token; it wipes any previous
        /// changes to the text.
        /// </remarks>
        public virtual void SetText(string text)
        {
            this._text = text;
        }

        /// <summary>Override if emitting multiple tokens.</summary>
        /// <remarks>Override if emitting multiple tokens.</remarks>
        public virtual IToken GetToken()
        {
            return _token;
        }

        public virtual void SetToken(IToken _token)
        {
            this._token = _token;
        }

        public virtual void SetType(int ttype)
        {
            _type = ttype;
        }

        public virtual int GetType()
        {
            return _type;
        }

        public virtual void SetChannel(int channel)
        {
            _channel = channel;
        }

        public virtual int GetChannel()
        {
            return _channel;
        }

        public virtual string[] GetModeNames()
        {
            return null;
        }

        /// <summary>
        /// Used to print out token names like ID during debugging and
        /// error reporting.
        /// </summary>
        /// <remarks>
        /// Used to print out token names like ID during debugging and
        /// error reporting.  The generated parsers implement a method
        /// that overrides this to point to their String[] tokenNames.
        /// </remarks>
        public override string[] TokenNames
        {
            get
            {
                return null;
            }
        }

        /// <summary>Return a list of all Token objects in input char stream.</summary>
        /// <remarks>
        /// Return a list of all Token objects in input char stream.
        /// Forces load of all tokens. Does not include EOF token.
        /// </remarks>
        public virtual IList<IToken> GetAllTokens()
        {
            IList<IToken> tokens = new List<IToken>();
            IToken t = NextToken();
            while (t.Type != TokenConstants.Eof)
            {
                tokens.Add(t);
                t = NextToken();
            }
            return tokens;
        }

        public virtual void Recover(LexerNoViableAltException e)
        {
            if (_input.La(1) != IntStreamConstants.Eof)
            {
                // skip a char and try again
                Interpreter.Consume(_input);
            }
        }

        public virtual void NotifyListeners(LexerNoViableAltException e)
        {
            string text = _input.GetText(Interval.Of(_tokenStartCharIndex, _input.Index));
            string msg = "token recognition error at: '" + GetErrorDisplay(text) + "'";
            IAntlrErrorListener<int> listener = GetErrorListenerDispatch();
            listener.SyntaxError(this, 0, _tokenStartLine, _tokenStartCharPositionInLine,
                msg, e);
        }

        public virtual string GetErrorDisplay(string s)
        {
            StringBuilder buf = new StringBuilder();
            foreach (char c in s.ToCharArray())
            {
                buf.Append(GetErrorDisplay(c));
            }
            return buf.ToString();
        }

        public virtual string GetErrorDisplay(int c)
        {
            string s = ((char)c).ToString();
            switch (c)
            {
                case TokenConstants.Eof:
                {
                    s = "<EOF>";
                    break;
                }

                case '\n':
                {
                    s = "\\n";
                    break;
                }

                case '\t':
                {
                    s = "\\t";
                    break;
                }

                case '\r':
                {
                    s = "\\r";
                    break;
                }
            }
            return s;
        }

        public virtual string GetCharErrorDisplay(int c)
        {
            string s = GetErrorDisplay(c);
            return "'" + s + "'";
        }

        /// <summary>
        /// Lexers can normally match any char in it's vocabulary after matching
        /// a token, so do the easy thing and just kill a character and hope
        /// it all works out.
        /// </summary>
        /// <remarks>
        /// Lexers can normally match any char in it's vocabulary after matching
        /// a token, so do the easy thing and just kill a character and hope
        /// it all works out.  You can instead use the rule invocation stack
        /// to do sophisticated error recovery if you are in a fragment rule.
        /// </remarks>
        public virtual void Recover(RecognitionException re)
        {
            //System.out.println("consuming char "+(char)input.LA(1)+" during recovery");
            //re.printStackTrace();
            // TODO: Do we lose character or line position information?
            _input.Consume();
        }
    }
}