From 6e2bbcdb42bdee6d3b08b06c17f5ab9e49dd0056 Mon Sep 17 00:00:00 2001
From: parrt <parrt@antlr.org>
Date: Sat, 29 Oct 2011 11:08:40 -0800
Subject: [PATCH] got EOF in lexer

[git-p4: depot-paths = "//depot/code/antlr4/main/": change = 9223]
---
 .../v4/runtime/atn/LexerATNSimulator.java     |  27 +++--
 .../antlr/v4/automata/LexerATNFactory.java    |  16 ++-
 tool/src/org/antlr/v4/parse/ANTLRParser.g     |  11 +-
 .../antlr/v4/test/TestATNDeserialization.java |  28 +++++
 .../v4/test/TestATNLexerInterpreter.java      |  89 +++++++++++---
 .../antlr/v4/test/TestATNSerialization.java   | 111 ++++++++++++++++++
 6 files changed, 242 insertions(+), 40 deletions(-)
diff --git a/runtime/Java/src/org/antlr/v4/runtime/atn/LexerATNSimulator.java b/runtime/Java/src/org/antlr/v4/runtime/atn/LexerATNSimulator.java
index 2f23bf8dd..abc85141a 100644
--- a/runtime/Java/src/org/antlr/v4/runtime/atn/LexerATNSimulator.java
+++ b/runtime/Java/src/org/antlr/v4/runtime/atn/LexerATNSimulator.java
@@ -210,7 +210,7 @@ public class LexerATNSimulator extends ATNSimulator {
 				// we got nowhere on t, don't throw out this knowledge; it'd
 				// cause a failover from DFA later.  Don't track EOF edges
 				// from stop states, though.
-				if ( t!=Token.EOF ) addDFAEdge(from, t, ERROR);
+				if ( t!=CharStream.EOF ) addDFAEdge(from, t, ERROR);
 				break;
 			}
 
@@ -223,12 +223,17 @@ public class LexerATNSimulator extends ATNSimulator {
 										   input.index()+", reach="+reach+
 										   ", prevAccept="+prevAccept+", prevIndex="+prevAcceptIndex);
 					}
-					if ( input.index() > prevAcceptIndex ) {
+					int index = input.index();
+					if ( index > prevAcceptIndex ) {
 						// will favor prev accept at same index so "int" is keyword not ID
 						prevAccept = c;
-						prevAcceptIndex = input.index();
+						if ( t == CharStream.EOF ) {
+							// later we seek to prevAcceptIndex+1, undo that effect for EOF
+							index--;
+						}
+						prevAcceptIndex = index;
 						if ( debug ) {
-							System.out.println("mark "+c+" @ index="+input.index());
+							System.out.println("mark "+c+" @ index="+index);
 						}
 					}
 
@@ -243,10 +248,11 @@ public class LexerATNSimulator extends ATNSimulator {
 			}
 
 			consume(input, t);
-			addDFAEdge(closure, t, reach);
+			if ( t!=CharStream.EOF ) addDFAEdge(closure, t, reach);
 			t = input.LA(1);
 
 			// swap to avoid reallocating space
+			// TODO: faster to reallocate?
 			OrderedHashSet<ATNConfig> tmp = reach;
 			reach = closure;
 			closure = tmp;
@@ -256,7 +262,7 @@ public class LexerATNSimulator extends ATNSimulator {
 		if ( prevAccept==null ) {
 			if ( t==Token.EOF ) {
 				System.out.println("EOF in token at input index "+input.index());
-				return Token.EOF;
+				//return Token.EOF;
 			}
 //					System.out.println("no viable token at input "+getTokenName(input.LA(1))+", index "+input.index());
 			throw new LexerNoViableAltException(recog, input, closure); // TODO: closure is empty
@@ -296,11 +302,10 @@ public class LexerATNSimulator extends ATNSimulator {
 		else if ( trans instanceof SetTransition ) {
 			SetTransition st = (SetTransition)trans;
 			boolean not = trans instanceof NotSetTransition;
-			if ( !not && st.set.contains(t) || not && !st.set.contains(t) ) {
-//				if ( st.set.toString().equals("0") ) {
-//					System.out.println("eh?");
-//				}
-				if ( debug ) System.out.println("match set "+st.set.toString(true));
+			if ( (!not && st.set.contains(t)) ||
+				 (not && !st.set.contains(t) && t!=Token.EOF) ) // ~set never matches EOF
+			{
+				if ( debug ) System.out.println("match "+(not?"~":"")+"set "+st.set.toString(true));
 				return st.target;
 			}
 		}
diff --git a/tool/src/org/antlr/v4/automata/LexerATNFactory.java b/tool/src/org/antlr/v4/automata/LexerATNFactory.java
index b7abf837a..780aa990c 100644
--- a/tool/src/org/antlr/v4/automata/LexerATNFactory.java
+++ b/tool/src/org/antlr/v4/automata/LexerATNFactory.java
@@ -32,13 +32,11 @@ package org.antlr.v4.automata;
 import org.antlr.runtime.Token;
 import org.antlr.v4.misc.CharSupport;
 import org.antlr.v4.parse.ANTLRParser;
+import org.antlr.v4.runtime.CharStream;
 import org.antlr.v4.runtime.atn.*;
 import org.antlr.v4.runtime.misc.IntervalSet;
-import org.antlr.v4.tool.LexerGrammar;
-import org.antlr.v4.tool.Rule;
-import org.antlr.v4.tool.ast.ActionAST;
-import org.antlr.v4.tool.ast.GrammarAST;
-import org.antlr.v4.tool.ast.TerminalAST;
+import org.antlr.v4.tool.*;
+import org.antlr.v4.tool.ast.*;
 
 import java.util.List;
 
@@ -122,6 +120,7 @@ public class LexerATNFactory extends ParserATNFactory {
 			}
 		}
 		if ( invert ) {
+			// TODO: in a lexer this complement should range over chars, not token types
 			IntervalSet notSet = (IntervalSet)set.complement(Token.MIN_TOKEN_TYPE, g.getMaxTokenType());
 			left.addTransition(new NotSetTransition(set, notSet, right));
 		}
@@ -157,6 +156,13 @@ public class LexerATNFactory extends ParserATNFactory {
 
 	@Override
 	public Handle tokenRef(TerminalAST node) {
+		// A lexer reference to EOF becomes an atom transition on CharStream.EOF
+		if ( node.getText().equals("EOF") ) {
+			ATNState eofStart = newState(node);
+			ATNState eofStop = newState(node);
+			eofStart.addTransition(new AtomTransition(CharStream.EOF, eofStop));
+			return new Handle(eofStart, eofStop);
+		}
 		return _ruleRef(node);
 	}
 }
diff --git a/tool/src/org/antlr/v4/parse/ANTLRParser.g b/tool/src/org/antlr/v4/parse/ANTLRParser.g
index cc76cbd63..d1515dbb3 100644
--- a/tool/src/org/antlr/v4/parse/ANTLRParser.g
+++ b/tool/src/org/antlr/v4/parse/ANTLRParser.g
@@ -763,16 +763,7 @@ blockSet
 	boolean ebnf = false;
 }
     :	LPAREN setElement (OR setElement)* RPAREN
-/*		{
-		t = input.LT(1);
-		ebnf = t!=null && (t.getType()==QUESTION || t.getType()==STAR || t.getType()==PLUS);
-	    }
-	    */
-		-> ^(BLOCK<BlockAST>[$LPAREN,"BLOCK"] ^(ALT setElement)+ )
-/*
-		-> {ebnf}?	^(BLOCK<BlockAST>[$LPAREN,"BLOCK"] ^(ALT ^(SET[$LPAREN,"SET"] setElement+ )))
-		-> 			^(SET[$LPAREN,"SET"] setElement+ )
-*/
+		-> ^(SET<SetAST>[$LPAREN,"SET"] setElement+ )
     ;
 
 setElement
diff --git a/tool/test/org/antlr/v4/test/TestATNDeserialization.java b/tool/test/org/antlr/v4/test/TestATNDeserialization.java
index 04979db95..2e840c737 100644
--- a/tool/test/org/antlr/v4/test/TestATNDeserialization.java
+++ b/tool/test/org/antlr/v4/test/TestATNDeserialization.java
@@ -14,6 +14,20 @@ public class TestATNDeserialization extends BaseTest {
 		checkDeserializationIsStable(g);
 	}
 
+	@Test public void testEOF() throws Exception {
+		Grammar g = new Grammar(
+			"parser grammar T;\n"+
+			"a : EOF ;");
+		checkDeserializationIsStable(g);
+	}
+
+	@Test public void testEOFInSet() throws Exception {
+		Grammar g = new Grammar(
+			"parser grammar T;\n"+
+			"a : (EOF|A) ;");
+		checkDeserializationIsStable(g);
+	}
+
 	@Test public void testNot() throws Exception {
 		Grammar g = new Grammar(
 			"parser grammar T;\n"+
@@ -67,6 +81,20 @@ public class TestATNDeserialization extends BaseTest {
 		checkDeserializationIsStable(lg);
 	}
 
+	@Test public void testLexerEOF() throws Exception {
+		LexerGrammar lg = new LexerGrammar(
+			"lexer grammar L;\n"+
+			"A : 'a' EOF ;\n");
+		checkDeserializationIsStable(lg);
+	}
+
+	@Test public void testLexerEOFInSet() throws Exception {
+		LexerGrammar lg = new LexerGrammar(
+			"lexer grammar L;\n"+
+			"A : 'a' (EOF|'\\n') ;\n"); // escaped so the grammar holds the \n escape, not a raw line break
+		checkDeserializationIsStable(lg);
+	}
+
 	@Test public void testLexerRange() throws Exception {
 		LexerGrammar lg = new LexerGrammar(
 			"lexer grammar L;\n"+
diff --git a/tool/test/org/antlr/v4/test/TestATNLexerInterpreter.java b/tool/test/org/antlr/v4/test/TestATNLexerInterpreter.java
index a0723f5a0..91d36e7e5 100644
--- a/tool/test/org/antlr/v4/test/TestATNLexerInterpreter.java
+++ b/tool/test/org/antlr/v4/test/TestATNLexerInterpreter.java
@@ -4,7 +4,7 @@ import org.antlr.v4.misc.Utils;
 import org.antlr.v4.runtime.*;
 import org.antlr.v4.runtime.atn.*;
 import org.antlr.v4.tool.*;
-import org.junit.*;
+import org.junit.Test;
 
 import java.util.List;
 
@@ -14,7 +14,7 @@ import java.util.List;
  * several rules and even within a rule. However, that conflicts
  * with the notion of non-greedy, which by definition tries to match
  * the fewest possible. During ATN construction, non-greedy loops
- * have their entry and exit branches reversed so that the ATM
+ * have their entry and exit branches reversed so that the ATN
  * simulator will see the exit branch 1st, giving it a priority. The
  * 1st path to the stop state kills any other paths for that rule
  * that begin with the wildcard. In general, this does everything we
@@ -51,17 +51,17 @@ public class TestATNLexerInterpreter extends BaseTest {
 		checkLexerMatches(lg, "xyz", "A, EOF");
 	}
 
-	@Test public void testWildOnEnd() throws Exception {
+	@Test public void testWildOnEndFirstAlt() throws Exception {
 		LexerGrammar lg = new LexerGrammar(
 			"lexer grammar L;\n"+
-			"A : 'xy' .\n" + // should not pursue '.' since xy already hit stop
+			"A : 'xy' .\n" + // should pursue '.' since xyz hits stop first, before 2nd alt
 			"  | 'xy'\n" +
 			"  ;\n");
 		checkLexerMatches(lg, "xy", "A, EOF");
 		checkLexerMatches(lg, "xyz", "A, EOF");
 	}
 
-	@Test public void testWildOnEndLast() throws Exception {
+	@Test public void testWildOnEndLastAlt() throws Exception {
 		LexerGrammar lg = new LexerGrammar(
 			"lexer grammar L;\n"+
 			"A : 'xy'\n" +
@@ -83,6 +83,15 @@ public class TestATNLexerInterpreter extends BaseTest {
 		assertEquals("NoViableAltException('q')", e.toString());
 	}
 
+	@Test public void testWildcardNonQuirkWhenSplitBetweenTwoRules() throws Exception {
+		LexerGrammar lg = new LexerGrammar(
+			"lexer grammar L;\n"+
+			"A : 'xy' ;\n" +
+			"B : 'xy' . 'z' ;\n");
+		checkLexerMatches(lg, "xy", "A, EOF");
+		checkLexerMatches(lg, "xyz", "B, EOF");
+	}
+
 	@Test public void testLexerLoops() throws Exception {
 		LexerGrammar lg = new LexerGrammar(
 			"lexer grammar L;\n"+
@@ -135,7 +144,7 @@ public class TestATNLexerInterpreter extends BaseTest {
 		checkLexerMatches(lg, "/* ick */\n/* /*nested*/ */", expecting);
 	}
 
-	@Ignore public void testLexerWildcardNonGreedyLoopByDefault() throws Exception {
+	@Test public void testLexerWildcardNonGreedyLoopByDefault() throws Exception {
 		LexerGrammar lg = new LexerGrammar(
 			"lexer grammar L;\n"+
 			"CMT : '//' .* '\\n' ;\n");
@@ -143,25 +152,23 @@ public class TestATNLexerInterpreter extends BaseTest {
 		checkLexerMatches(lg, "//x\n//y\n", expecting);
 	}
 
-	// should not work. no priority within a single rule. the subrule won't work. need modes
-	@Ignore
-	public void testLexerEscapeInString() throws Exception {
+	@Test public void testLexerEscapeInString() throws Exception {
 		LexerGrammar lg = new LexerGrammar(
 			"lexer grammar L;\n"+
-			"STR : '\"' ('\\\\' '\"' | .)* '\"' ;\n"); // STR : '"' ('\\' '"' | .)* '"'
-		checkLexerMatches(lg, "\"a\\\"b\"", "STR, EOF");
-		checkLexerMatches(lg, "\"a\"", "STR, EOF");
+			"STR : '[' ('~' ']' | .)* ']' ;\n");
+		checkLexerMatches(lg, "[a~]b]", "STR, EOF");
+		checkLexerMatches(lg, "[a]", "STR, EOF");
 	}
 
 	@Test public void testLexerWildcardNonGreedyPlusLoopByDefault() throws Exception {
 		LexerGrammar lg = new LexerGrammar(
 			"lexer grammar L;\n"+
-			"CMT : '//' (options {greedy=false;}:.)+ '\\n' ;\n");
+			"CMT : '//' .+ '\\n' ;\n");
 		String expecting = "CMT, CMT, EOF";
 		checkLexerMatches(lg, "//x\n//y\n", expecting);
 	}
 
-	// does not fail since ('*/')? cant match and have rule succeed
+	// does not fail since ('*/')? can't match and have rule succeed
 	@Test public void testLexerGreedyOptionalShouldWorkAsWeExpect() throws Exception {
 		LexerGrammar lg = new LexerGrammar(
 			"lexer grammar L;\n"+
@@ -179,6 +186,60 @@ public class TestATNLexerInterpreter extends BaseTest {
 		checkLexerMatches(lg, "<a><x>", expecting);
 	}
 
+	@Test public void testEOFAtEndOfLineComment() throws Exception {
+		LexerGrammar lg = new LexerGrammar(
+			"lexer grammar L;\n"+
+			"CMT : '//' ~('\\n')* ;\n"); // escaped so the grammar holds the \n escape, not a raw line break
+		String expecting = "CMT, EOF";
+		checkLexerMatches(lg, "//x", expecting);
+	}
+
+	@Test public void testEOFAtEndOfLineComment2() throws Exception {
+		LexerGrammar lg = new LexerGrammar(
+			"lexer grammar L;\n"+
+			"CMT : '//' ~('\\n'|'\\r')* ;\n"); // escaped so the grammar holds \n and \r escapes, not raw chars
+		String expecting = "CMT, EOF";
+		checkLexerMatches(lg, "//x", expecting);
+	}
+
+	/** Only positive sets like (EOF|'\n') can match EOF; wildcards and ~foo sets do not.
+	 *  EOF matches but does not advance cursor.
+	 */
+	@Test public void testEOFInSetAtEndOfLineComment() throws Exception {
+		LexerGrammar lg = new LexerGrammar(
+			"lexer grammar L;\n"+
+			"CMT : '//' .* (EOF|'\\n') ;\n"); // escaped so the grammar holds the \n escape, not a raw line break
+		String expecting = "CMT, EOF";
+		checkLexerMatches(lg, "//", expecting);
+	}
+
+	@Test public void testEOFSuffixInSecondRule() throws Exception {
+		LexerGrammar lg = new LexerGrammar(
+			"lexer grammar L;\n"+
+			"A : 'a' ;\n"+ // shorter than 'a' EOF, despite EOF being 0 width
+			"B : 'a' EOF ;\n");
+		String expecting = "B, EOF";
+		checkLexerMatches(lg, "a", expecting);
+	}
+
+	@Test public void testEOFSuffixInFirstRule() throws Exception {
+		LexerGrammar lg = new LexerGrammar(
+			"lexer grammar L;\n"+
+			"A : 'a' EOF ;\n"+
+			"B : 'a';\n");
+		String expecting = "A, EOF";
+		checkLexerMatches(lg, "a", expecting);
+	}
+
+	@Test public void testEOFByItself() throws Exception {
+		LexerGrammar lg = new LexerGrammar(
+			"lexer grammar L;\n"+
+			"DONE : EOF ;\n"+
+			"A : 'a';\n");
+		String expecting = "A, DONE, EOF";
+		checkLexerMatches(lg, "a", expecting);
+	}
+
 	protected LexerRecognitionExeption checkLexerMatches(LexerGrammar lg, String inputString, String expecting) {
 		ATN atn = createATN(lg);
 		CharStream input = new ANTLRStringStream(inputString);
diff --git a/tool/test/org/antlr/v4/test/TestATNSerialization.java b/tool/test/org/antlr/v4/test/TestATNSerialization.java
index 07721bf21..8a7b5d142 100644
--- a/tool/test/org/antlr/v4/test/TestATNSerialization.java
+++ b/tool/test/org/antlr/v4/test/TestATNSerialization.java
@@ -31,6 +31,53 @@ public class TestATNSerialization extends BaseTest {
 		assertEquals(expecting, result);
 	}
 
+	@Test public void testEOF() throws Exception {
+		Grammar g = new Grammar(
+			"parser grammar T;\n"+
+			"a : A EOF ;");
+		String expecting =
+			"max type 3\n" +
+			"0:RULE_START 0\n" +
+			"1:RULE_STOP 0\n" +
+			"2:BASIC 0\n" +
+			"3:BASIC 0\n" +
+			"4:BASIC 0\n" +
+			"5:BASIC 0\n" +
+			"6:BASIC 0\n" +
+			"rule 0:0 0,0\n" +
+			"0->2 EPSILON 0,0,0\n" +
+			"1->6 ATOM -1,0,0\n" +
+			"2->3 ATOM 3,0,0\n" +
+			"3->4 EPSILON 0,0,0\n" +
+			"4->5 ATOM -1,0,0\n" +
+			"5->1 EPSILON 0,0,0\n";
+		ATN atn = createATN(g);
+		String result = ATNSerializer.getDecoded(g, atn);
+		assertEquals(expecting, result);
+	}
+
+	@Test public void testEOFInSet() throws Exception {
+		Grammar g = new Grammar(
+			"parser grammar T;\n"+
+			"a : (A|EOF) ;");
+		String expecting =
+			"max type 3\n" +
+			"0:RULE_START 0\n" +
+			"1:RULE_STOP 0\n" +
+			"2:BASIC 0\n" +
+			"3:BASIC 0\n" +
+			"4:BASIC 0\n" +
+			"rule 0:0 0,0\n" +
+			"0:EOF..EOF, A..A\n" +
+			"0->2 EPSILON 0,0,0\n" +
+			"1->4 ATOM -1,0,0\n" +
+			"2->3 SET 0,0,0\n" +
+			"3->1 EPSILON 0,0,0\n";
+		ATN atn = createATN(g);
+		String result = ATNSerializer.getDecoded(g, atn);
+		assertEquals(expecting, result);
+	}
+
 	@Test public void testNot() throws Exception {
 		Grammar g = new Grammar(
 			"parser grammar T;\n"+
@@ -283,6 +330,70 @@ public class TestATNSerialization extends BaseTest {
 		assertEquals(expecting, result);
 	}
 
+	@Test public void testLexerEOF() throws Exception {
+		LexerGrammar lg = new LexerGrammar(
+			"lexer grammar L;\n"+
+			"INT : 'a' EOF ;\n");
+		String expecting =
+			"max type 3\n" +
+			"0:TOKEN_START -1\n" +
+			"1:RULE_START 0\n" +
+			"2:RULE_STOP 0\n" +
+			"3:BASIC 0\n" +
+			"4:BASIC 0\n" +
+			"5:BASIC 0\n" +
+			"6:BASIC 0\n" +
+			"rule 0:1 3,-1\n" +
+			"mode 0:0\n" +
+			"0->1 EPSILON 0,0,0\n" +
+			"1->3 EPSILON 0,0,0\n" +
+			"3->4 ATOM 97,0,0\n" +
+			"4->5 EPSILON 0,0,0\n" +
+			"5->6 ATOM -1,0,0\n" +
+			"6->2 EPSILON 0,0,0\n" +
+			"0:0 1\n";
+		ATN atn = createATN(lg);
+		String result = ATNSerializer.getDecoded(lg, atn);
+		assertEquals(expecting, result);
+	}
+
+	@Test public void testLexerEOFInSet() throws Exception {
+		LexerGrammar lg = new LexerGrammar(
+			"lexer grammar L;\n"+
+			"INT : 'a' (EOF|'\\n') ;\n"); // escaped so the grammar holds the \n escape, not a raw line break
+		String expecting =
+			"max type 3\n" +
+			"0:TOKEN_START -1\n" +
+			"1:RULE_START 0\n" +
+			"2:RULE_STOP 0\n" +
+			"3:BASIC 0\n" +
+			"4:BASIC 0\n" +
+			"5:BASIC 0\n" +
+			"6:BASIC 0\n" +
+			"7:BASIC 0\n" +
+			"8:BASIC 0\n" +
+			"9:BLOCK_START 0\n" +
+			"10:BLOCK_END 0\n" +
+			"rule 0:1 3,-1\n" +
+			"mode 0:0\n" +
+			"0->1 EPSILON 0,0,0\n" +
+			"1->3 EPSILON 0,0,0\n" +
+			"3->4 ATOM 97,0,0\n" +
+			"4->9 EPSILON 0,0,0\n" +
+			"5->6 ATOM -1,0,0\n" +
+			"6->10 EPSILON 0,0,0\n" +
+			"7->8 ATOM 10,0,0\n" +
+			"8->10 EPSILON 0,0,0\n" +
+			"9->5 EPSILON 0,0,0\n" +
+			"9->7 EPSILON 0,0,0\n" +
+			"10->2 EPSILON 0,0,0\n" +
+			"0:0 1\n" +
+			"1:9 1\n";
+		ATN atn = createATN(lg);
+		String result = ATNSerializer.getDecoded(lg, atn);
+		assertEquals(expecting, result);
+	}
+
 	@Test public void testLexerLoops() throws Exception {
 		LexerGrammar lg = new LexerGrammar(
 			"lexer grammar L;\n"+