Merge pull request #2075 from ewanmellor/swift-unbufferedcharstream

Implement UnbufferedCharStream in the Swift runtime.
2017-10-27 08:27:45 -07:00 · 2017-10-27 08:27:45 -07:00 · 2bfaec72ee
parent 5f4b9b49da 64048caf12
commit 2bfaec72ee
7 changed files with 402 additions and 6 deletions
--- a/runtime/Swift/Sources/Antlr4/CharStream.swift
+++ b/runtime/Swift/Sources/Antlr4/CharStream.swift
@ -25,5 +25,5 @@ public protocol CharStream: IntStream {
    /// - throws: _ANTLRError.unsupportedOperation_ if the stream does not support
    /// getting the text of the specified interval
    /// 
-    func getText(_ interval: Interval) -> String
+    func getText(_ interval: Interval) throws -> String
 }
--- a/runtime/Swift/Sources/Antlr4/CommonToken.swift
+++ b/runtime/Swift/Sources/Antlr4/CommonToken.swift
@ -164,7 +164,12 @@ public class CommonToken: WritableToken {
        if let input = getInputStream() {
            let n = input.size()
            if start < n && stop < n {
-                return input.getText(Interval.of(start, stop))
+                do {
+                    return try input.getText(Interval.of(start, stop))
+                }
+                catch {
+                    return nil
+                }
            } else {
                return "<EOF>"
            }
--- a/runtime/Swift/Sources/Antlr4/CommonTokenFactory.swift
+++ b/runtime/Swift/Sources/Antlr4/CommonTokenFactory.swift
@ -75,7 +75,7 @@ public class CommonTokenFactory: TokenFactory {
            t.setText(text)
        }
        else if let cStream = source.1, copyText {
-            t.setText(cStream.getText(Interval.of(start, stop)))
+            t.setText(try! cStream.getText(Interval.of(start, stop)))
        }

        return t
--- a/runtime/Swift/Sources/Antlr4/Lexer.swift
+++ b/runtime/Swift/Sources/Antlr4/Lexer.swift
@ -395,7 +395,13 @@ open class Lexer: Recognizer<LexerATNSimulator>, TokenSource {

    open func notifyListeners<T>(_ e: LexerNoViableAltException, recognizer: Recognizer<T>) {

-        let text = _input!.getText(Interval.of(_tokenStartCharIndex, _input!.index()))
+        let text: String
+        do {
+            text = try _input!.getText(Interval.of(_tokenStartCharIndex, _input!.index()))
+        }
+        catch {
+            text = "<unknown>"
+        }
        let msg = "token recognition error at: '\(getErrorDisplay(text))'"

        let listener = getErrorListenerDispatch()
--- a/runtime/Swift/Sources/Antlr4/LexerNoViableAltException.swift
+++ b/runtime/Swift/Sources/Antlr4/LexerNoViableAltException.swift
@ -39,7 +39,7 @@ public class LexerNoViableAltException: RecognitionException, CustomStringConver
        var symbol = ""
        if let charStream = getInputStream() as? CharStream, startIndex >= 0 && startIndex < charStream.size() {
            let interval = Interval.of(startIndex, startIndex)
-            symbol = charStream.getText(interval)
+            symbol = try! charStream.getText(interval)
            symbol = Utils.escapeWhitespace(symbol, false)
        }

--- a/runtime/Swift/Sources/Antlr4/UnbufferedCharStream.swift
+++ b/runtime/Swift/Sources/Antlr4/UnbufferedCharStream.swift
@ -0,0 +1,385 @@
+/*
+ * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
+ * Use of this file is governed by the BSD 3-clause license that
+ * can be found in the LICENSE.txt file in the project root.
+ */
+
+import Foundation
+
+
+/** Do not buffer up the entire char stream. It does keep a small buffer
+ *  for efficiency and also buffers while a mark exists (set by the
+ *  lookahead prediction in parser). "Unbuffered" here refers to the fact
+ *  that it doesn't buffer all data, not that it loads each char on demand.
+ *
+ *  Before 4.7, this class used the default environment encoding to convert
+ *  bytes to UTF-16, and held the UTF-16 bytes in the buffer as chars.
+ *
+ *  As of 4.7, the class uses UTF-8 by default, and the buffer holds Unicode
+ *  code points as ints.
+ */
+open class UnbufferedCharStream: CharStream {
+    /**
+     * A moving window buffer of the data being scanned. While there's a marker,
+     * we keep adding to buffer. Otherwise, {@link #consume consume()} resets so
+     * we start filling at index 0 again.
+     */
+    internal var data: [Int]
+
+    /**
+     * The number of characters currently in {@link #data data}.
+     *
+     * <p>This is not the buffer capacity, that's {@code data.count}.</p>
+     */
+    internal var n = 0
+
+    /**
+     * 0..n-1 index into {@link #data data} of next character.
+     *
+     * <p>The {@code LA(1)} character is {@code data[p]}. If {@code p == n}, we are
+     * out of buffered characters.</p>
+     */
+    internal var p = 0
+
+    /**
+     * Count up with {@link #mark mark()} and down with
+     * {@link #release release()}. When we {@code release()} the last mark,
+     * {@code numMarkers} reaches 0 and we reset the buffer. Copy
+     * {@code data[p]..data[n-1]} to {@code data[0]..data[(n-1)-p]}.
+     */
+    internal var numMarkers = 0
+
+    /**
+     * This is the {@code LA(-1)} character for the current position.
+     */
+    internal var lastChar = -1
+
+    /**
+     * When {@code numMarkers > 0}, this is the {@code LA(-1)} character for the
+     * first character in {@link #data data}. Otherwise, this is unspecified.
+     */
+    internal var lastCharBufferStart = 0
+
+    /**
+     * Absolute character index. It's the index of the character about to be
+     * read via {@code LA(1)}. Goes from 0 to the number of characters in the
+     * entire stream, although the stream size is unknown before the end is
+     * reached.
+     */
+    internal var currentCharIndex = 0
+
+    internal let input: InputStream
+    private var unicodeIterator: UnicodeScalarStreamIterator
+
+
+    /** The name or source of this char stream. */
+    public var name: String = ""
+
+    /**
+     * Create a stream that decodes {@code input} as UTF-8, starting with a
+     * buffer of {@code bufferSize} slots. The stream is used as given: it must
+     * already be open when the first character is requested.
+     */
+    public init(_ input: InputStream, _ bufferSize: Int = 256) {
+        self.input = input
+        self.data = [Int](repeating: 0, count: bufferSize)
+        self.unicodeIterator = UnicodeScalarStreamIterator(UInt8StreamIterator(input))
+    }
+
+    /**
+     * Move past the current character.
+     *
+     * - throws: _ANTLRError.illegalState_ if the current character is EOF
+     */
+    public func consume() throws {
+        if try LA(1) == CommonToken.EOF {
+            throw ANTLRError.illegalState(msg: "cannot consume EOF")
+        }
+
+        // LA(1) synced the buffer and did not return EOF, so data[p] is valid.
+        lastChar = data[p]   // track last char for LA(-1)
+
+        if p == n - 1 && numMarkers == 0 {
+            // Nothing is marked and the buffer is used up: refill from index 0.
+            n = 0
+            p = -1 // p += 1 below will leave this at 0
+            lastCharBufferStart = lastChar
+        }
+
+        p += 1
+        currentCharIndex += 1
+        sync(1)
+    }
+
+    /**
+     * Make sure we have 'want' elements from current position {@link #p p}.
+     * {@code p+want-1} is the buffer index of the last element wanted, so
+     * everything from {@code n} up to that index still has to be read.
+     */
+    internal func sync(_ want: Int) {
+        let need = (p + want - 1) - n + 1 // how many more elements we need?
+        if need > 0 {
+            fill(need)
+        }
+    }
+
+    /**
+     * Add {@code toAdd} characters to the buffer. Returns the number of
+     * characters actually added to the buffer. If the return value is less
+     * than {@code toAdd}, then EOF was reached before {@code toAdd} characters
+     * could be added.
+     */
+    @discardableResult internal func fill(_ toAdd: Int) -> Int {
+        for i in 0 ..< toAdd {
+            if n > 0 && data[n - 1] == CommonToken.EOF {
+                return i
+            }
+
+            guard let c = nextChar() else {
+                return i
+            }
+            add(c)
+        }
+
+        // Every requested character was added.
+        return toAdd
+    }
+
+    /**
+     * Override to provide different source of characters than
+     * {@link #input input}.
+     *
+     * Returns nil once no further character can be produced, whether that is
+     * because the input ended or because reading or decoding it failed.
+     */
+    internal func nextChar() -> Int? {
+        guard let next = unicodeIterator.next() else {
+            return nil
+        }
+        return Int(next.value)
+    }
+
+    /**
+     * Append {@code c} to the buffer, doubling the buffer when it is full.
+     */
+    internal func add(_ c: Int) {
+        if n >= data.count {
+            // max(_, 1) so that a buffer created with size 0 can still grow.
+            data += [Int](repeating: 0, count: max(data.count, 1))
+        }
+        data[n] = c
+        n += 1
+    }
+
+    /**
+     * Return the character {@code i} positions ahead of the current one;
+     * {@code LA(1)} is the current character and {@code LA(-1)} the previous
+     * one. Returns {@code CommonToken.EOF} when the position is beyond the
+     * characters that could be read.
+     *
+     * - throws: _ANTLRError.indexOutOfBounds_ if the position is before the
+     *   start of the buffer
+     */
+    public func LA(_ i: Int) throws -> Int {
+        if i == -1 {
+            return lastChar // special case
+        }
+        sync(i)
+        let index = p + i - 1
+        if index < 0 {
+            throw ANTLRError.indexOutOfBounds(msg: "")
+        }
+        if index >= n {
+            return CommonToken.EOF
+        }
+        return data[index]
+    }
+
+    /**
+     * Return a marker that we can release later.
+     *
+     * <p>The specific marker value used for this class allows for some level of
+     * protection against misuse where {@code seek()} is called on a mark or
+     * {@code release()} is called in the wrong order.</p>
+     */
+    public func mark() -> Int {
+        if numMarkers == 0 {
+            lastCharBufferStart = lastChar
+        }
+
+        let mark = -numMarkers - 1
+        numMarkers += 1
+        return mark
+    }
+
+    /** Decrement number of markers, resetting buffer if we hit 0.
+     *
+     * @param marker the value returned by the most recent unreleased
+     * {@link #mark mark()}; anything else is a fatal error
+     */
+    public func release(_ marker: Int) throws {
+        let expectedMark = -numMarkers
+        if marker != expectedMark {
+            preconditionFailure("release() called with an invalid marker.")
+        }
+
+        numMarkers -= 1
+        if numMarkers == 0 && p > 0 {
+            // release buffer when we can, but don't do unnecessary work
+
+            // Copy data[p]..data[n-1] to data[0]..data[(n-1)-p], reset ptrs
+            // p is last valid char; move nothing if p==n as we have no valid char
+            //
+            // The buffer keeps its length (data.count). Array.capacity is chosen
+            // by the runtime and may differ between arrays of the same length,
+            // so it is not used here.
+            let length = data.count
+            data = Array(data[p ..< n]) + [Int](repeating: 0, count: length - (n - p))
+            n = n - p
+            p = 0
+            lastCharBufferStart = lastChar
+        }
+    }
+
+    public func index() -> Int {
+        return currentCharIndex
+    }
+
+    /** Seek to absolute character index, which might not be in the current
+     *  sliding window.  Move {@code p} to {@code index-bufferStartIndex}.
+     *
+     * - throws: _ANTLRError.illegalArgument_ if the index is before the start
+     *   of the buffer, or _ANTLRError.unsupportedOperation_ if it is at or
+     *   past the end of the buffered characters
+     */
+    public func seek(_ index_: Int) throws {
+        var index = index_
+
+        if index == currentCharIndex {
+            return
+        }
+
+        if index > currentCharIndex {
+            sync(index - currentCharIndex)
+            index = min(index, getBufferStartIndex() + n - 1)
+        }
+
+        // index == to bufferStartIndex should set p to 0
+        let i = index - getBufferStartIndex()
+        if i < 0 {
+            throw ANTLRError.illegalArgument(msg: "cannot seek to negative index \(index)")
+        }
+        else if i >= n {
+            let si = getBufferStartIndex()
+            let ei = si + n
+            let msg = "seek to index outside buffer: \(index) not in \(si)..\(ei)"
+            throw ANTLRError.unsupportedOperation(msg: msg)
+        }
+
+        p = i
+        currentCharIndex = index
+        if p == 0 {
+            lastChar = lastCharBufferStart
+        }
+        else {
+            lastChar = data[p - 1]
+        }
+    }
+
+    public func size() -> Int {
+        preconditionFailure("Unbuffered stream cannot know its size")
+    }
+
+    public func getSourceName() -> String {
+        return name
+    }
+
+    /**
+     * Return the text for the given interval of absolute character indexes,
+     * which must lie entirely within the current buffer.
+     *
+     * - throws: _ANTLRError.illegalArgument_ if the interval is malformed or
+     *   runs past the end of the stream, or if the buffer holds a value that
+     *   is not a Unicode scalar; _ANTLRError.unsupportedOperation_ if the
+     *   interval is outside the buffer
+     */
+    public func getText(_ interval: Interval) throws -> String {
+        if interval.a < 0 || interval.b < interval.a - 1 {
+            throw ANTLRError.illegalArgument(msg: "invalid interval")
+        }
+
+        let bufferStartIndex = getBufferStartIndex()
+        if n > 0 &&
+            data[n - 1] == CommonToken.EOF &&
+            interval.a + interval.length() > bufferStartIndex + n {
+            throw ANTLRError.illegalArgument(msg: "the interval extends past the end of the stream")
+        }
+
+        if interval.a < bufferStartIndex || interval.b >= bufferStartIndex + n {
+            let msg = "interval \(interval) outside buffer: \(bufferStartIndex)...\(bufferStartIndex + n - 1)"
+            throw ANTLRError.unsupportedOperation(msg: msg)
+        }
+
+        if interval.b < interval.a {
+            // The EOF token.
+            return ""
+        }
+
+        // convert from absolute to local index
+        let i = interval.a - bufferStartIndex
+        let j = interval.b - bufferStartIndex
+
+        // Convert from Int codepoints to a String. A buffered value that is not
+        // a Unicode scalar (such as an EOF marker) is reported, not unwrapped.
+        let characters: [Character] = try data[i ... j].map { codePoint in
+            guard let scalar = Unicode.Scalar(codePoint) else {
+                throw ANTLRError.illegalArgument(msg: "invalid code point \(codePoint)")
+            }
+            return Character(scalar)
+        }
+        return String(characters)
+    }
+
+    internal func getBufferStartIndex() -> Int {
+        return currentCharIndex - p
+    }
+}
+
+
+/**
+ * Iterates over the bytes of an InputStream, reading them in chunks of
+ * {@code bufferSize} bytes. The stream must already be open.
+ */
+fileprivate struct UInt8StreamIterator: IteratorProtocol {
+    private static let bufferSize = 1024
+
+    private let stream: InputStream
+    private var buffer = [UInt8](repeating: 0, count: UInt8StreamIterator.bufferSize)
+    private var buffGen: IndexingIterator<ArraySlice<UInt8>>
+
+    /** Set once the stream has reported an error; next() returns nil from then on. */
+    var hasErrorOccurred = false
+
+
+    init(_ stream: InputStream) {
+        self.stream = stream
+        self.buffGen = buffer[0..<0].makeIterator()
+    }
+
+    /**
+     * Return the next byte, or nil at the end of the stream or after a read
+     * error. The two are told apart by {@code hasErrorOccurred}.
+     */
+    mutating func next() -> UInt8? {
+        if let result = buffGen.next() {
+            return result
+        }
+
+        if hasErrorOccurred {
+            return nil
+        }
+
+        switch stream.streamStatus {
+        case .notOpen, .writing, .closed:
+            preconditionFailure()
+        case .atEnd:
+            return nil
+        case .error:
+            hasErrorOccurred = true
+            return nil
+        case .opening, .open, .reading:
+            break
+        }
+
+        // Ask for at most buffer.count bytes: that is the number of elements
+        // the array actually holds, whereas its capacity may be larger.
+        let count = stream.read(&buffer, maxLength: buffer.count)
+        if count < 0 {
+            hasErrorOccurred = true
+            return nil
+        }
+        if count == 0 {
+            // Nothing more to read: this is the end of the stream, not an error.
+            return nil
+        }
+
+        buffGen = buffer.prefix(count).makeIterator()
+        return buffGen.next()
+    }
+}
+
+
+/**
+ * Decodes the bytes produced by a UInt8StreamIterator as UTF-8, yielding one
+ * Unicode scalar at a time.
+ */
+fileprivate struct UnicodeScalarStreamIterator: IteratorProtocol {
+    private var streamIterator: UInt8StreamIterator
+    private var codec = Unicode.UTF8()
+
+    /** Set once reading the bytes or decoding them has failed. */
+    var hasErrorOccurred = false
+
+    init(_ streamIterator: UInt8StreamIterator) {
+        self.streamIterator = streamIterator
+    }
+
+    /**
+     * Return the next scalar, or nil when the input is exhausted or an error
+     * has occurred; {@code hasErrorOccurred} tells the two apart.
+     */
+    mutating func next() -> Unicode.Scalar? {
+        if streamIterator.hasErrorOccurred {
+            hasErrorOccurred = true
+            return nil
+        }
+
+        switch codec.decode(&streamIterator) {
+        case .scalarValue(let scalar):
+            return scalar
+        case .emptyInput:
+            // The bytes may have run out because the byte iterator flagged an
+            // error rather than because the stream ended, so pick up its flag
+            // now instead of on the following call.
+            if streamIterator.hasErrorOccurred {
+                hasErrorOccurred = true
+            }
+            return nil
+        case .error:
+            hasErrorOccurred = true
+            return nil
+        }
+    }
+}
--- a/runtime/Swift/Sources/Antlr4/atn/LexerATNSimulator.swift
+++ b/runtime/Swift/Sources/Antlr4/atn/LexerATNSimulator.swift
@ -736,7 +736,7 @@ open class LexerATNSimulator: ATNSimulator {

    public func getText(_ input: CharStream) -> String {
        // index is first lookahead char, don't include.
-        return input.getText(Interval.of(startIndex, input.index() - 1))
+        return try! input.getText(Interval.of(startIndex, input.index() - 1))
    }

    public func getLine() -> Int {