antlr/runtime/Swift/Sources/Antlr4/Lexer.swift

///
/// Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
/// Use of this file is governed by the BSD 3-clause license that
/// can be found in the LICENSE.txt file in the project root.
///


///
/// A lexer is a recognizer that draws input symbols from a character stream.
/// Lexer grammars result in a subclass of this object. A Lexer object
/// uses simplified match() and error recovery mechanisms in the interest
/// of speed.
///

import Foundation

open class Lexer: Recognizer<LexerATNSimulator>, TokenSource {
    // MARK: - Constants

    /// End-of-file marker value.
    public static let EOF: Int = -1
    /// Mode a lexer starts in; `reset()` restores `_mode` to this value.
    public static let DEFAULT_MODE: Int = 0
    /// Value `more()` stores in `_type`; `nextToken()` keeps matching
    /// while `_type` equals it.
    public static let MORE: Int = -2
    /// Value `skip()` stores in `_type`; `nextToken()` discards the match
    /// and starts over when `_type` equals it.
    public static let SKIP: Int = -3

    /// Same value as `CommonToken.DEFAULT_CHANNEL`.
    public static let DEFAULT_TOKEN_CHANNEL = CommonToken.DEFAULT_CHANNEL
    /// Same value as `CommonToken.HIDDEN_CHANNEL`.
    public static let HIDDEN = CommonToken.HIDDEN_CHANNEL
    /// Same value as `Character.MIN_VALUE`.
    public static let MIN_CHAR_VALUE = Character.MIN_VALUE
    /// Same value as `Character.MAX_VALUE`.
    public static let MAX_CHAR_VALUE = Character.MAX_VALUE

    // MARK: - Input and token construction

    /// The character stream tokens are matched against; may be nil.
    public var _input: CharStream?
    /// Source/stream pair handed to the token factory as its first
    /// argument whenever a token is created.
    internal var _tokenFactorySourcePair: TokenSourceAndStream

    ///
    /// Factory that `emit()` and `emitEOF()` use to build token objects.
    ///
    internal var _factory = CommonTokenFactory.DEFAULT

    // MARK: - State of the token being matched

    ///
    /// The token produced by the current `nextToken()` call.  Every lexer
    /// rule works towards creating a token object, and several rules may
    /// cooperate on one token, which is why this lives in an instance
    /// variable.  `nextToken()` returns it once the rule(s) have matched.
    /// A subclass that emits several tokens per call should leave the
    /// last matched token (or anything non-nil) here so that the automatic
    /// emit in `nextToken()` does not produce one more token.
    ///
    public var _token: Token?

    ///
    /// Character index in the stream at which the current token starts.
    /// Recorded at the start of `nextToken()`; needed, for example, to
    /// obtain the text of the current token.
    ///
    public var _tokenStartCharIndex: Int = -1

    ///
    /// Line containing the first character of the current token.
    ///
    public var _tokenStartLine: Int = 0

    ///
    /// Position within its line of the first character of the current token.
    ///
    public var _tokenStartCharPositionInLine: Int = 0

    ///
    /// Set once EOF has been seen on the char stream; the next token will
    /// then be EOF.  With a rule such as `DONE : EOF ;` the caller sees
    /// DONE followed by EOF.
    ///
    public var _hitEOF: Bool = false

    ///
    /// Channel number of the current token.
    ///
    public var _channel: Int = 0

    ///
    /// Token type of the current token.
    ///
    public var _type: Int = 0

    /// Modes saved by `pushMode(_:)` and restored by `popMode()`.
    public final var _modeStack = Stack<Int>()
    /// The mode passed to the interpreter when matching.
    public var _mode: Int = Lexer.DEFAULT_MODE

    ///
    /// Optional replacement for the current token's text, overriding what
    /// is in the input char buffer.  Assign it directly or call `setText()`.
    ///
    public var _text: String?

    ///
    /// Creates a lexer with no input stream attached.
    ///
    public override init() {
        _tokenFactorySourcePair = TokenSourceAndStream()
        super.init()
        // `self` can only be handed out after the superclass is initialized,
        // so the pair is pointed back at this lexer as the last step.
        _tokenFactorySourcePair.tokenSource = self
    }

    ///
    /// Creates a lexer that reads from `input`.
    ///
    /// - Parameter input: The character stream to tokenize.
    ///
    public required init(_ input: CharStream) {
        _tokenFactorySourcePair = TokenSourceAndStream()
        _input = input
        super.init()
        // Wire up the source/stream pair now that `self` is usable.
        _tokenFactorySourcePair.stream = input
        _tokenFactorySourcePair.tokenSource = self
    }

    ///
    /// Returns the lexer to its initial state: rewinds the input (when one
    /// is attached), clears all per-token state, restores the default mode
    /// and resets the interpreter.
    ///
    /// - Throws: Whatever seeking the input stream throws.
    ///
    open func reset() throws {
        // Rewind first; if seeking fails, no lexer state has been touched.
        try _input?.seek(0)

        // Forget everything about the token that was in progress.
        _token = nil
        _text = nil
        _type = CommonToken.INVALID_TYPE
        _channel = CommonToken.DEFAULT_CHANNEL
        _tokenStartCharIndex = -1
        _tokenStartLine = -1
        _tokenStartCharPositionInLine = -1

        // Back to the default mode with an empty mode stack.
        _hitEOF = false
        _modeStack.clear()
        _mode = Lexer.DEFAULT_MODE

        getInterpreter().reset()
    }

    ///
    /// Return a token from this source; i.e., match a token on the char
    /// stream.
    ///
    /// Matching repeats until a rule yields something other than
    /// `Lexer.SKIP`; a `Lexer.MORE` result keeps matching into the same
    /// token.  A `LexerNoViableAltException` is reported to the error
    /// listeners, recovered from via `recover(_:)`, and treated as a skip.
    ///
    /// - Returns: The matched token, or an EOF token once the end of the
    ///   input has been seen.
    /// - Throws: `ANTLRError.illegalState` when no input stream is set;
    ///   otherwise any error raised while matching, looking ahead or
    ///   recovering, including recognition errors that are not
    ///   `LexerNoViableAltException`.
    ///
    open func nextToken() throws -> Token {
        guard let _input = _input else {
            throw ANTLRError.illegalState(msg: "nextToken requires a non-null input stream.")
        }

        // Mark start location in char stream so unbuffered streams are
        // guaranteed at least have text of current token
        let tokenStartMarker = _input.mark()
        defer {
            // make sure we release marker after match or
            // unbuffered char stream will keep buffering
            try! _input.release(tokenStartMarker)
        }

        outer:
        while true {
            if _hitEOF {
                emitEOF()
                return _token!
            }

            // Start a fresh token at the current input position.
            _token = nil
            _channel = CommonToken.DEFAULT_CHANNEL
            _tokenStartCharIndex = _input.index()
            _tokenStartCharPositionInLine = getInterpreter().getCharPositionInLine()
            _tokenStartLine = getInterpreter().getLine()
            _text = nil
            repeat {
                _type = CommonToken.INVALID_TYPE
                let ttype: Int
                do {
                    ttype = try getInterpreter().match(_input, _mode)
                }
                catch ANTLRException.recognition(let e as LexerNoViableAltException) {
                    // Only a lexer no-viable-alternative error is recoverable
                    // here; any other error propagates to the caller instead
                    // of trapping on a forced cast.
                    notifyListeners(e, recognizer: self)
                    try recover(e)
                    ttype = Lexer.SKIP
                }
                if try _input.LA(1) == BufferedTokenStream.EOF {
                    _hitEOF = true
                }
                // A lexer action may already have set the type (e.g. via
                // skip() or more()); only fall back to the matched type
                // when it has not.
                if _type == CommonToken.INVALID_TYPE {
                    _type = ttype
                }
                if _type == Lexer.SKIP {
                    continue outer
                }
            } while _type == Lexer.MORE

            if _token == nil {
                emit()
            }
            return _token!
        }
    }

    ///
    /// Tells the lexer not to create a token for the current lexer rule
    /// and to look for another token instead.  It does so by setting the
    /// current type to `Lexer.SKIP`; `nextToken()` starts matching afresh
    /// whenever a rule finishes with that type.  (When no token has been
    /// set by the end of a rule, `nextToken()` creates and emits one.)
    ///
    open func skip() {
        self._type = Lexer.SKIP
    }

    ///
    /// Sets the current type to `Lexer.MORE`, which makes `nextToken()`
    /// run another match before emitting a token.
    ///
    open func more() {
        self._type = Lexer.MORE
    }

    ///
    /// Switches the lexer to mode `m` without touching the mode stack.
    ///
    /// - Parameter m: The mode to make current.
    ///
    open func mode(_ m: Int) {
        self._mode = m
    }

    ///
    /// Saves the current mode on the mode stack and switches to mode `m`.
    /// Prints a trace line first when `LexerATNSimulator.debug` is set.
    ///
    /// - Parameter m: The mode to enter.
    ///
    open func pushMode(_ m: Int) {
        if LexerATNSimulator.debug {
            print("pushMode \(m)")
        }
        // Remember where we came from, then switch.
        let previousMode = _mode
        _modeStack.push(previousMode)
        mode(m)
    }
    ///
    /// Leaves the current mode and returns to the one most recently saved
    /// by `pushMode(_:)`.
    ///
    /// - Returns: The mode that is current after popping.
    /// - Throws: `ANTLRError.unsupportedOperation` when the mode stack is
    ///   empty.
    ///
    @discardableResult
    open func popMode() throws -> Int {
        guard !_modeStack.isEmpty else {
            throw ANTLRError.unsupportedOperation(msg: " EmptyStackException")
        }

        if LexerATNSimulator.debug {
            print("popMode back to \(String(describing: _modeStack.peek()))")
        }

        let restoredMode = _modeStack.pop()
        mode(restoredMode)
        return _mode
    }


    ///
    /// Replaces the factory used to create token objects.
    ///
    /// - Parameter factory: The factory `emit()` and `emitEOF()` will use
    ///   from now on.
    ///
    open override func setTokenFactory(_ factory: TokenFactory) {
        _factory = factory
    }


    ///
    /// Returns the factory currently used to create token objects.
    ///
    open override func getTokenFactory() -> TokenFactory {
        return self._factory
    }

    ///
    /// Sets the char stream and resets the lexer.
    ///
    /// - Parameter input: The new input.  It is stored only when it is a
    ///   `CharStream`; otherwise the lexer ends up with no input.
    /// - Throws: Whatever `reset()` throws.
    ///
    open override func setInputStream(_ input: IntStream) throws {
        // Detach the old stream before resetting, so reset() finds no
        // stream to rewind.
        _input = nil
        _tokenFactorySourcePair = makeTokenSourceAndStream()
        try reset()

        // Attach the new stream and rebuild the source/stream pair for it.
        _input = input as? CharStream
        _tokenFactorySourcePair = makeTokenSourceAndStream()
    }


    ///
    /// Returns the name of the underlying input source.
    ///
    /// A lexer may legitimately have no input (it was created with
    /// `init()`, or `setInputStream(_:)` was given a stream that is not a
    /// `CharStream`).  In that case the placeholder `"<unknown>"` is
    /// returned instead of trapping on a force-unwrap.
    ///
    /// - Returns: The input stream's source name, or `"<unknown>"` when no
    ///   input stream is attached.
    ///
    open func getSourceName() -> String {
        return _input?.getSourceName() ?? "<unknown>"
    }


    ///
    /// Returns the char stream this lexer reads from, or nil when none is
    /// attached.
    ///
    open func getInputStream() -> CharStream? {
        return self._input
    }

    ///
    /// Records `token` as the token produced by the current match.  For
    /// efficiency only one token per `nextToken()` invocation is supported
    /// by default.  To emit several, subclass and override this method,
    /// `nextToken` and `getToken` so that tokens are pushed onto and
    /// pulled from a list instead of the single variable used here.
    ///
    /// - Parameter token: The token to hand back from `nextToken()`.
    ///
    open func emit(_ token: Token) {
        _token = token
    }

    ///
    /// The standard way a token is emitted automatically at the outermost
    /// lexical rule.  The token spans the char buffer from the recorded
    /// start index up to the character before the current index.  A text
    /// override stored in `_text` is passed on as the token's text.
    /// Override this method to emit custom Token objects, or provide a new
    /// factory.
    ///
    /// - Returns: The token that was created and emitted.
    ///
    @discardableResult
    open func emit() -> Token {
        let created = _factory.create(
            _tokenFactorySourcePair,
            _type,
            _text,
            _channel,
            _tokenStartCharIndex,
            getCharIndex() - 1,
            _tokenStartLine,
            _tokenStartCharPositionInLine)
        emit(created)
        return created
    }

    ///
    /// Creates an EOF token on the default channel at the current input
    /// position and emits it.  Its stop index is one less than its start
    /// index.
    ///
    /// - Returns: The EOF token that was emitted.
    ///
    @discardableResult
    open func emitEOF() -> Token {
        let column = getCharPositionInLine()
        let lineNumber = getLine()
        let position = _input!.index()
        let eofToken = _factory.create(_tokenFactorySourcePair, CommonToken.EOF, nil, CommonToken.DEFAULT_CHANNEL, position, position - 1, lineNumber, column)
        emit(eofToken)
        return eofToken
    }


    ///
    /// Returns the current line as tracked by the interpreter.
    ///
    open func getLine() -> Int {
        let interpreter = getInterpreter()
        return interpreter.getLine()
    }


    ///
    /// Returns the current character position within the line as tracked
    /// by the interpreter.
    ///
    open func getCharPositionInLine() -> Int {
        let interpreter = getInterpreter()
        return interpreter.getCharPositionInLine()
    }

    ///
    /// Overrides the line tracked by the interpreter.
    ///
    /// - Parameter line: The new line value.
    ///
    open func setLine(_ line: Int) {
        let interpreter = getInterpreter()
        interpreter.setLine(line)
    }

    ///
    /// Overrides the character position in line tracked by the interpreter.
    ///
    /// - Parameter charPositionInLine: The new position value.
    ///
    open func setCharPositionInLine(_ charPositionInLine: Int) {
        let interpreter = getInterpreter()
        interpreter.setCharPositionInLine(charPositionInLine)
    }

    ///
    /// Returns the index of the current lookahead character, i.e. the
    /// input stream's current index.  Requires an input stream to be set.
    ///
    open func getCharIndex() -> Int {
        return self._input!.index()
    }

    ///
    /// Returns the text override if one has been set; otherwise the text
    /// matched so far for the current token, as reported by the
    /// interpreter.
    ///
    open func getText() -> String {
        if let overriddenText = _text {
            return overriddenText
        }
        return getInterpreter().getText(_input!)
    }

    ///
    /// Sets the complete text of the current token, replacing any text
    /// that was set before.
    ///
    /// - Parameter text: The text to use for the token.
    ///
    open func setText(_ text: String) {
        _text = text
    }

    ///
    /// Returns the current token; one must have been set.
    /// Override if emitting multiple tokens.
    ///
    open func getToken() -> Token {
        return self._token!
    }

    ///
    /// Replaces the current token.
    ///
    /// - Parameter _token: The token to store as the current token.
    ///
    open func setToken(_ _token: Token) {
        // The parameter shadows the property, hence the explicit `self`.
        self._token = _token
    }

    ///
    /// Sets the token type of the current token.
    ///
    /// - Parameter ttype: The new token type.
    ///
    open func setType(_ ttype: Int) {
        self._type = ttype
    }

    ///
    /// Returns the token type of the current token.
    ///
    open func getType() -> Int {
        return self._type
    }

    ///
    /// Sets the channel number of the current token.
    ///
    /// - Parameter channel: The new channel number.
    ///
    open func setChannel(_ channel: Int) {
        self._channel = channel
    }

    ///
    /// Returns the channel number of the current token.
    ///
    open func getChannel() -> Int {
        return self._channel
    }

    ///
    /// Returns the channel names.  This base implementation has none and
    /// returns nil; the method is open for overriding.
    ///
    open func getChannelNames() -> [String]? {
        let names: [String]? = nil
        return names
    }

    ///
    /// Returns the mode names.  This base implementation has none and
    /// returns nil; the method is open for overriding.
    ///
    open func getModeNames() -> [String]? {
        let names: [String]? = nil
        return names
    }

    ///
    /// Returns every Token object in the input char stream, forcing all
    /// tokens to be loaded.  The EOF token is not included.
    ///
    /// - Throws: Whatever `nextToken()` throws.
    ///
    open func getAllTokens() throws -> [Token] {
        var collected: [Token] = []
        while true {
            let next = try nextToken()
            if next.getType() == CommonToken.EOF {
                break
            }
            collected.append(next)
        }
        return collected
    }

    ///
    /// Recovers from a failed match by consuming one character so that
    /// matching can be retried, unless the input is already at EOF.
    ///
    /// - Parameter e: The error being recovered from (not inspected here).
    /// - Throws: Whatever the lookahead or the consume throws.
    ///
    open func recover(_ e: LexerNoViableAltException) throws {
        let stream = _input!
        guard try stream.LA(1) != BufferedTokenStream.EOF else {
            return
        }
        // skip a char and try again
        try getInterpreter().consume(stream)
    }

    ///
    /// Reports a token recognition error to the registered error
    /// listeners.  The message quotes the input text from the start of the
    /// current token up to the current input index, or `<unknown>` when
    /// that text cannot be read.
    ///
    /// - Parameters:
    ///   - e: The error to report.
    ///   - recognizer: The recognizer passed on to the listeners.
    ///
    open func notifyListeners<T>(_ e: LexerNoViableAltException, recognizer: Recognizer<T>) {
        let offendingText = (try? _input!.getText(Interval.of(_tokenStartCharIndex, _input!.index()))) ?? "<unknown>"
        let message = "token recognition error at: '\(getErrorDisplay(offendingText))'"

        getErrorListenerDispatch().syntaxError(recognizer, nil, _tokenStartLine, _tokenStartCharPositionInLine, message, e)
    }

    ///
    /// Returns `s` with every character replaced by its error-display
    /// form, as produced by the `Character` overload of this method.
    ///
    /// - Parameter s: The text to make printable.
    ///
    open func getErrorDisplay(_ s: String) -> String {
        return s.map { (ch: Character) -> String in self.getErrorDisplay(ch) }.joined()
    }

    ///
    /// Returns a printable form of `c` for error messages: newline, tab
    /// and carriage return are shown as the escape sequences `\n`, `\t`
    /// and `\r`, and a character whose integer value equals
    /// `CommonToken.EOF` is shown as `<EOF>`.  Anything else is returned
    /// unchanged.
    ///
    /// NOTE(review): the EOF comparison relies on `Character.integerValue`,
    /// which is declared outside this file — confirm it can actually
    /// yield `CommonToken.EOF`.
    ///
    /// - Parameter c: The character to make printable.
    ///
    open func getErrorDisplay(_ c: Character) -> String {
        guard c.integerValue != CommonToken.EOF else {
            return "<EOF>"
        }
        if c == "\n" {
            return "\\n"
        }
        if c == "\t" {
            return "\\t"
        }
        if c == "\r" {
            return "\\r"
        }
        return String(c)
    }

    ///
    /// Returns the error-display form of `c` wrapped in single quotes.
    ///
    /// - Parameter c: The character to display.
    ///
    open func getCharErrorDisplay(_ c: Character) -> String {
        let display: String = getErrorDisplay(c)
        return "'" + display + "'"
    }

    ///
    /// Lexers can normally match any char in their vocabulary after
    /// matching a token, so this takes the easy route: it drops a single
    /// character and hopes it all works out.  More sophisticated error
    /// recovery, based on the rule invocation stack, is possible when you
    /// are in a fragment rule.
    ///
    /// - Parameter re: The error being recovered from (not inspected here).
    /// - Throws: Whatever consuming from the input stream throws.
    ///
    open func recover(_ re: AnyObject) throws {
        // TODO: Do we lose character or line position information?
        try self._input!.consume()
    }

    ///
    /// Builds a source/stream pair from this lexer and its current input
    /// stream (which may be nil).
    ///
    internal func makeTokenSourceAndStream() -> TokenSourceAndStream {
        let pair = TokenSourceAndStream(self, _input)
        return pair
    }
}