antlr/runtime/Swift/Sources/Antlr4/Lexer.swift

///
/// Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
/// Use of this file is governed by the BSD 3-clause license that
/// can be found in the LICENSE.txt file in the project root.
///
///
/// A lexer is a recognizer that draws input symbols from a character stream.
/// Lexer grammars result in a subclass of this object. A Lexer object
/// uses simplified match() and error recovery mechanisms in the interest
/// of speed.
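///
/// A rough usage sketch (the lexer class name `MyGrammarLexer` below is a
/// placeholder for whatever class the ANTLR tool generates from a grammar):
///
///     let input = ANTLRInputStream("a = 1")
///     let lexer = MyGrammarLexer(input)
///     let tokens = CommonTokenStream(lexer)
///     try tokens.fill()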
///
import Foundation
open class Lexer: Recognizer<LexerATNSimulator>, TokenSource {
public static let EOF = -1
public static let DEFAULT_MODE = 0
public static let MORE = -2
public static let SKIP = -3
public static let DEFAULT_TOKEN_CHANNEL = CommonToken.DEFAULT_CHANNEL
public static let HIDDEN = CommonToken.HIDDEN_CHANNEL
public static let MIN_CHAR_VALUE = Character.MIN_VALUE
public static let MAX_CHAR_VALUE = Character.MAX_VALUE
public var _input: CharStream?
internal var _tokenFactorySourcePair: TokenSourceAndStream
///
/// How to create token objects
///
internal var _factory = CommonTokenFactory.DEFAULT
///
/// The goal of all lexer rules/methods is to create a token object.
/// This is an instance variable as multiple rules may collaborate to
/// create a single token. nextToken will return this object after
/// matching lexer rule(s). If you subclass to allow multiple token
/// emissions, then set this to the last token to be matched or
/// something non-nil so that the auto token emit mechanism will not
/// emit another token.
///
public var _token: Token?
///
/// What character index in the stream did the current token start at?
/// Needed, for example, to get the text for the current token. Set at
/// the start of nextToken().
///
public var _tokenStartCharIndex = -1
///
/// The line on which the first character of the token resides
///
public var _tokenStartLine = 0
///
/// The character position of first character within the line
///
public var _tokenStartCharPositionInLine = 0
///
/// Once we see EOF on char stream, next token will be EOF.
/// If you have DONE : EOF ; then you see DONE EOF.
///
public var _hitEOF = false
///
/// The channel number for the current token
///
public var _channel = 0
///
/// The token type for the current token
///
public var _type = 0
public final var _modeStack = Stack<Int>()
public var _mode = Lexer.DEFAULT_MODE
///
/// You can set the text for the current token to override what is in
/// the input char buffer. Use setText() or set this instance variable directly.
///
public var _text: String?
public override init() {
self._tokenFactorySourcePair = TokenSourceAndStream()
super.init()
self._tokenFactorySourcePair.tokenSource = self
}
public required init(_ input: CharStream) {
self._input = input
self._tokenFactorySourcePair = TokenSourceAndStream()
super.init()
self._tokenFactorySourcePair.tokenSource = self
self._tokenFactorySourcePair.stream = input
}
open func reset() throws {
// reset Lexer state variables
if let _input = _input {
try _input.seek(0) // rewind the input
}
_token = nil
_type = CommonToken.INVALID_TYPE
_channel = CommonToken.DEFAULT_CHANNEL
_tokenStartCharIndex = -1
_tokenStartCharPositionInLine = -1
_tokenStartLine = -1
_text = nil
_hitEOF = false
_mode = Lexer.DEFAULT_MODE
_modeStack.clear()
getInterpreter().reset()
}
///
/// Return a token from this source; i.e., match a token on the char
/// stream.
///
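/// A typical pull loop, sketched with a placeholder generated lexer name,
/// keeps calling nextToken() until it returns the EOF token type:
///
///     let lexer = MyGrammarLexer(ANTLRInputStream("1 + 2"))
///     var t = try lexer.nextToken()
///     while t.getType() != CommonToken.EOF {
///         print("\(t.getType()) '\(t.getText() ?? "")'")
///         t = try lexer.nextToken()
///     }
///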
open func nextToken() throws -> Token {
guard let _input = _input else {
throw ANTLRError.illegalState(msg: "nextToken requires a non-null input stream.")
}
// Mark start location in char stream so unbuffered streams are
// guaranteed to have at least the text of the current token
let tokenStartMarker = _input.mark()
defer {
// make sure we release marker after match or
// unbuffered char stream will keep buffering
try! _input.release(tokenStartMarker)
}
do {
outer:
while true {
if _hitEOF {
emitEOF()
return _token!
}
_token = nil
_channel = CommonToken.DEFAULT_CHANNEL
_tokenStartCharIndex = _input.index()
_tokenStartCharPositionInLine = getInterpreter().getCharPositionInLine()
_tokenStartLine = getInterpreter().getLine()
_text = nil
repeat {
_type = CommonToken.INVALID_TYPE
var ttype: Int
do {
ttype = try getInterpreter().match(_input, _mode)
}
catch ANTLRException.recognition(let e) {
notifyListeners(e as! LexerNoViableAltException, recognizer: self)
try recover(e as! LexerNoViableAltException)
ttype = Lexer.SKIP
}
if try _input.LA(1) == BufferedTokenStream.EOF {
_hitEOF = true
}
if _type == CommonToken.INVALID_TYPE {
_type = ttype
}
if _type == Lexer.SKIP {
continue outer
}
} while _type == Lexer.MORE
if _token == nil {
emit()
}
return _token!
}
}
}
///
/// Instruct the lexer to skip creating a token for the current lexer rule
/// and look for another token. nextToken() knows to keep looking when
/// a lexer rule finishes with the type set to SKIP. Recall that
/// if _token is nil at the end of any token rule, the lexer creates one
/// for you and emits it.
///
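/// In a lexer grammar this is typically reached via the skip command; an
/// illustrative grammar fragment (not part of this runtime file) would be:
///
///     WS : [ \t\r\n]+ -> skip ;
///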
open func skip() {
_type = Lexer.SKIP
}
open func more() {
_type = Lexer.MORE
}
open func mode(_ m: Int) {
_mode = m
}
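///
/// Push the current mode onto the mode stack and switch to mode m. The mode
/// name and generated constant in this sketch are made up; generated lexers
/// expose their modes as Int constants:
///
///     // grammar side (illustrative): OPEN : '"' -> pushMode(IN_STRING) ;
///     // driving the lexer by hand from Swift:
///     lexer.pushMode(MyGrammarLexer.IN_STRING)
///     try lexer.popMode()
///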
open func pushMode(_ m: Int) {
if LexerATNSimulator.debug {
print("pushMode \(m)")
}
_modeStack.push(_mode)
mode(m)
}
@discardableResult
open func popMode() throws -> Int {
if _modeStack.isEmpty {
throw ANTLRError.unsupportedOperation(msg: "EmptyStackException")
}
if LexerATNSimulator.debug {
print("popMode back to \(String(describing: _modeStack.peek()))")
}
mode(_modeStack.pop())
return _mode
}
open override func setTokenFactory(_ factory: TokenFactory) {
self._factory = factory
}
open override func getTokenFactory() -> TokenFactory {
return _factory
}
///
/// Set the char stream and reset the lexer
///
open override func setInputStream(_ input: IntStream) throws {
self._input = nil
self._tokenFactorySourcePair = makeTokenSourceAndStream()
try reset()
self._input = input as? CharStream
self._tokenFactorySourcePair = makeTokenSourceAndStream()
}
open func getSourceName() -> String {
return _input!.getSourceName()
}
open func getInputStream() -> CharStream? {
return _input
}
///
/// By default the lexer does not support multiple emits per nextToken()
/// invocation, for efficiency reasons. Subclass and override this method,
/// nextToken(), and getToken() (to push tokens into a list and pull from
/// that list rather than using a single variable as this implementation does).
///
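/// A minimal sketch of that pattern, assuming a generated lexer named
/// MyGrammarLexer (placeholder) and ignoring edge cases such as actions that
/// call setToken() without emitting:
///
///     class MultiEmitLexer: MyGrammarLexer {
///         private var pending = [Token]()
///         override func emit(_ token: Token) {
///             super.emit(token)      // keep _token set for the base class
///             pending.append(token)  // queue it for nextToken()
///         }
///         override func nextToken() throws -> Token {
///             if pending.isEmpty {
///                 _ = try super.nextToken()  // runs rules; emit() fills the queue
///             }
///             return pending.removeFirst()
///         }
///     }
///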
open func emit(_ token: Token) {
//System.err.println("emit "+token);
self._token = token
}
///
/// The standard method called to automatically emit a token at the
/// outermost lexical rule. The token object should point into the
/// char buffer start..stop. If there is a text override in 'text',
/// use that to set the token's text. Override this method to emit
/// custom Token objects or provide a new factory.
///
@discardableResult
open func emit() -> Token {
let t = _factory.create(_tokenFactorySourcePair, _type, _text, _channel, _tokenStartCharIndex, getCharIndex() - 1, _tokenStartLine, _tokenStartCharPositionInLine)
emit(t)
return t
}
@discardableResult
open func emitEOF() -> Token {
let cpos = getCharPositionInLine()
let line = getLine()
let idx = _input!.index()
let eof = _factory.create(
_tokenFactorySourcePair,
CommonToken.EOF,
nil,
CommonToken.DEFAULT_CHANNEL,
idx,
idx - 1,
line,
cpos)
emit(eof)
return eof
}
open func getLine() -> Int {
return getInterpreter().getLine()
}
open func getCharPositionInLine() -> Int {
return getInterpreter().getCharPositionInLine()
}
open func setLine(_ line: Int) {
getInterpreter().setLine(line)
}
open func setCharPositionInLine(_ charPositionInLine: Int) {
getInterpreter().setCharPositionInLine(charPositionInLine)
}
///
/// What is the index of the current character of lookahead?
///
open func getCharIndex() -> Int {
return _input!.index()
}
///
/// Return the text matched so far for the current token or any
/// text override.
///
open func getText() -> String {
if _text != nil {
return _text!
}
return getInterpreter().getText(_input!)
}
///
/// Set the complete text of this token; it wipes any previous
/// changes to the text.
///
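/// For example, a string rule could strip its surrounding quotes in an
/// embedded action (illustrative grammar fragment, not part of this file):
///
///     STRING : '"' .*? '"' { setText(String(getText().dropFirst().dropLast())) } ;
///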
open func setText(_ text: String) {
self._text = text
}
///
/// Override if emitting multiple tokens.
///
open func getToken() -> Token {
return _token!
}
open func setToken(_ _token: Token) {
self._token = _token
}
open func setType(_ ttype: Int) {
_type = ttype
}
open func getType() -> Int {
return _type
}
open func setChannel(_ channel: Int) {
_channel = channel
}
open func getChannel() -> Int {
return _channel
}
open func getChannelNames() -> [String]? {
return nil
}
open func getModeNames() -> [String]? {
return nil
}
///
/// Return a list of all Token objects in input char stream.
/// Forces load of all tokens. Does not include EOF token.
///
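/// Sketch of tokenizing a whole string at once (placeholder lexer name):
///
///     let lexer = MyGrammarLexer(ANTLRInputStream("1 + 2"))
///     for tok in try lexer.getAllTokens() {
///         print(tok.getText() ?? "<no text>")
///     }
///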
open func getAllTokens() throws -> [Token] {
var tokens = [Token]()
var t = try nextToken()
while t.getType() != CommonToken.EOF {
tokens.append(t)
t = try nextToken()
}
return tokens
}
open func recover(_ e: LexerNoViableAltException) throws {
if try _input!.LA(1) != BufferedTokenStream.EOF {
// skip a char and try again
try getInterpreter().consume(_input!)
}
}
open func notifyListeners<T>(_ e: LexerNoViableAltException, recognizer: Recognizer<T>) {
let text: String
do {
text = try _input!.getText(Interval.of(_tokenStartCharIndex, _input!.index()))
}
catch {
text = "<unknown>"
}
let msg = "token recognition error at: '\(getErrorDisplay(text))'"
let listener = getErrorListenerDispatch()
listener.syntaxError(recognizer, nil, _tokenStartLine, _tokenStartCharPositionInLine, msg, e)
}
open func getErrorDisplay(_ s: String) -> String {
var buf = ""
for c in s {
buf += getErrorDisplay(c)
}
return buf
}
open func getErrorDisplay(_ c: Character) -> String {
if c.integerValue == CommonToken.EOF {
return "<EOF>"
}
switch c {
case "\n":
return "\\n"
case "\t":
return "\\t"
case "\r":
return "\\r"
default:
return String(c)
}
}
open func getCharErrorDisplay(_ c: Character) -> String {
let s: String = getErrorDisplay(c)
return "'\(s)'"
}
///
/// Lexers can normally match any char in their vocabulary after matching
/// a token, so do the easy thing and just kill a character and hope
/// it all works out. You can instead use the rule invocation stack
/// to do sophisticated error recovery if you are in a fragment rule.
///
open func recover(_ re: AnyObject) throws {
// TODO: Do we lose character or line position information?
try _input!.consume()
}
internal func makeTokenSourceAndStream() -> TokenSourceAndStream {
return TokenSourceAndStream(self, _input)
}
}