From 6e2bbcdb42bdee6d3b08b06c17f5ab9e49dd0056 Mon Sep 17 00:00:00 2001 From: parrt Date: Sat, 29 Oct 2011 11:08:40 -0800 Subject: [PATCH] got EOF in lexer [git-p4: depot-paths = "//depot/code/antlr4/main/": change = 9223] --- .../v4/runtime/atn/LexerATNSimulator.java | 27 +++-- .../antlr/v4/automata/LexerATNFactory.java | 16 ++- tool/src/org/antlr/v4/parse/ANTLRParser.g | 11 +- .../antlr/v4/test/TestATNDeserialization.java | 28 +++++ .../v4/test/TestATNLexerInterpreter.java | 89 +++++++++++--- .../antlr/v4/test/TestATNSerialization.java | 111 ++++++++++++++++++ 6 files changed, 242 insertions(+), 40 deletions(-) diff --git a/runtime/Java/src/org/antlr/v4/runtime/atn/LexerATNSimulator.java b/runtime/Java/src/org/antlr/v4/runtime/atn/LexerATNSimulator.java index 2f23bf8dd..abc85141a 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/atn/LexerATNSimulator.java +++ b/runtime/Java/src/org/antlr/v4/runtime/atn/LexerATNSimulator.java @@ -210,7 +210,7 @@ public class LexerATNSimulator extends ATNSimulator { // we got nowhere on t, don't throw out this knowledge; it'd // cause a failover from DFA later. Don't track EOF edges // from stop states, though. - if ( t!=Token.EOF ) addDFAEdge(from, t, ERROR); + if ( t!=CharStream.EOF ) addDFAEdge(from, t, ERROR); break; } @@ -223,12 +223,17 @@ public class LexerATNSimulator extends ATNSimulator { input.index()+", reach="+reach+ ", prevAccept="+prevAccept+", prevIndex="+prevAcceptIndex); } - if ( input.index() > prevAcceptIndex ) { + int index = input.index(); + if ( index > prevAcceptIndex ) { // will favor prev accept at same index so "int" is keyword not ID prevAccept = c; - prevAcceptIndex = input.index(); + if ( t == CharStream.EOF ) { + // later we seek to prevAcceptIndex+1, undo that effect for EOF + index--; + } + prevAcceptIndex = index; if ( debug ) { - System.out.println("mark "+c+" @ index="+input.index()); + System.out.println("mark "+c+" @ index="+index); } } @@ -243,10 +248,11 @@ public class LexerATNSimulator extends ATNSimulator { } consume(input, t); - addDFAEdge(closure, t, reach); + if ( t!=CharStream.EOF ) addDFAEdge(closure, t, reach); t = input.LA(1); // swap to avoid reallocating space + // TODO: faster to reallocate? OrderedHashSet tmp = reach; reach = closure; closure = tmp; @@ -256,7 +262,7 @@ public class LexerATNSimulator extends ATNSimulator { if ( prevAccept==null ) { if ( t==Token.EOF ) { System.out.println("EOF in token at input index "+input.index()); - return Token.EOF; + //return Token.EOF; } // System.out.println("no viable token at input "+getTokenName(input.LA(1))+", index "+input.index()); throw new LexerNoViableAltException(recog, input, closure); // TODO: closure is empty @@ -296,11 +302,10 @@ public class LexerATNSimulator extends ATNSimulator { else if ( trans instanceof SetTransition ) { SetTransition st = (SetTransition)trans; boolean not = trans instanceof NotSetTransition; - if ( !not && st.set.contains(t) || not && !st.set.contains(t) ) { -// if ( st.set.toString().equals("0") ) { -// System.out.println("eh?"); -// } - if ( debug ) System.out.println("match set "+st.set.toString(true)); + if ( (!not && st.set.contains(t)) || + (not && !st.set.contains(t) && t!=Token.EOF) ) // ~set doesn't not match EOF + { + if ( debug ) System.out.println("match "+(not?"~":"")+"set "+st.set.toString(true)); return st.target; } } diff --git a/tool/src/org/antlr/v4/automata/LexerATNFactory.java b/tool/src/org/antlr/v4/automata/LexerATNFactory.java index b7abf837a..780aa990c 100644 --- a/tool/src/org/antlr/v4/automata/LexerATNFactory.java +++ b/tool/src/org/antlr/v4/automata/LexerATNFactory.java @@ -32,13 +32,11 @@ package org.antlr.v4.automata; import org.antlr.runtime.Token; import org.antlr.v4.misc.CharSupport; import org.antlr.v4.parse.ANTLRParser; +import org.antlr.v4.runtime.CharStream; import org.antlr.v4.runtime.atn.*; import org.antlr.v4.runtime.misc.IntervalSet; -import org.antlr.v4.tool.LexerGrammar; -import org.antlr.v4.tool.Rule; -import org.antlr.v4.tool.ast.ActionAST; -import org.antlr.v4.tool.ast.GrammarAST; -import org.antlr.v4.tool.ast.TerminalAST; +import org.antlr.v4.tool.*; +import org.antlr.v4.tool.ast.*; import java.util.List; @@ -122,6 +120,7 @@ public class LexerATNFactory extends ParserATNFactory { } } if ( invert ) { + // TODO: what? should be chars not token types IntervalSet notSet = (IntervalSet)set.complement(Token.MIN_TOKEN_TYPE, g.getMaxTokenType()); left.addTransition(new NotSetTransition(set, notSet, right)); } @@ -157,6 +156,13 @@ public class LexerATNFactory extends ParserATNFactory { @Override public Handle tokenRef(TerminalAST node) { + // Ref to EOF in lexer yields char transition on -1 + if ( node.getText().equals("EOF") ) { + ATNState left = newState(node); + ATNState right = newState(node); + left.addTransition(new AtomTransition(CharStream.EOF, right)); + return new Handle(left, right); + } return _ruleRef(node); } } diff --git a/tool/src/org/antlr/v4/parse/ANTLRParser.g b/tool/src/org/antlr/v4/parse/ANTLRParser.g index cc76cbd63..d1515dbb3 100644 --- a/tool/src/org/antlr/v4/parse/ANTLRParser.g +++ b/tool/src/org/antlr/v4/parse/ANTLRParser.g @@ -763,16 +763,7 @@ blockSet boolean ebnf = false; } : LPAREN setElement (OR setElement)* RPAREN -/* { - t = input.LT(1); - ebnf = t!=null && (t.getType()==QUESTION || t.getType()==STAR || t.getType()==PLUS); - } - */ - -> ^(BLOCK[$LPAREN,"BLOCK"] ^(ALT setElement)+ ) -/* - -> {ebnf}? ^(BLOCK[$LPAREN,"BLOCK"] ^(ALT ^(SET[$LPAREN,"SET"] setElement+ ))) - -> ^(SET[$LPAREN,"SET"] setElement+ ) -*/ + -> ^(SET[$LPAREN,"SET"] setElement+ ) ; setElement diff --git a/tool/test/org/antlr/v4/test/TestATNDeserialization.java b/tool/test/org/antlr/v4/test/TestATNDeserialization.java index 04979db95..2e840c737 100644 --- a/tool/test/org/antlr/v4/test/TestATNDeserialization.java +++ b/tool/test/org/antlr/v4/test/TestATNDeserialization.java @@ -14,6 +14,20 @@ public class TestATNDeserialization extends BaseTest { checkDeserializationIsStable(g); } + @Test public void testEOF() throws Exception { + Grammar g = new Grammar( + "parser grammar T;\n"+ + "a : EOF ;"); + checkDeserializationIsStable(g); + } + + @Test public void testEOFInSet() throws Exception { + Grammar g = new Grammar( + "parser grammar T;\n"+ + "a : (EOF|A) ;"); + checkDeserializationIsStable(g); + } + @Test public void testNot() throws Exception { Grammar g = new Grammar( "parser grammar T;\n"+ @@ -67,6 +81,20 @@ public class TestATNDeserialization extends BaseTest { checkDeserializationIsStable(lg); } + @Test public void testLexerEOF() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "A : 'a' EOF ;\n"); + checkDeserializationIsStable(lg); + } + + @Test public void testLexerEOFInSet() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "A : 'a' (EOF|'\n') ;\n"); + checkDeserializationIsStable(lg); + } + @Test public void testLexerRange() throws Exception { LexerGrammar lg = new LexerGrammar( "lexer grammar L;\n"+ diff --git a/tool/test/org/antlr/v4/test/TestATNLexerInterpreter.java b/tool/test/org/antlr/v4/test/TestATNLexerInterpreter.java index a0723f5a0..91d36e7e5 100644 --- a/tool/test/org/antlr/v4/test/TestATNLexerInterpreter.java +++ b/tool/test/org/antlr/v4/test/TestATNLexerInterpreter.java @@ -4,7 +4,7 @@ import org.antlr.v4.misc.Utils; import org.antlr.v4.runtime.*; import org.antlr.v4.runtime.atn.*; import org.antlr.v4.tool.*; -import org.junit.*; +import org.junit.Test; import java.util.List; @@ -14,7 +14,7 @@ import java.util.List; * several rules and even within a rule. However, that conflicts * with the notion of non-greedy, which by definition tries to match * the fewest possible. During ATN construction, non-greedy loops - * have their entry and exit branches reversed so that the ATM + * have their entry and exit branches reversed so that the ATN * simulator will see the exit branch 1st, giving it a priority. The * 1st path to the stop state kills any other paths for that rule * that begin with the wildcard. In general, this does everything we @@ -51,17 +51,17 @@ public class TestATNLexerInterpreter extends BaseTest { checkLexerMatches(lg, "xyz", "A, EOF"); } - @Test public void testWildOnEnd() throws Exception { + @Test public void testWildOnEndFirstAlt() throws Exception { LexerGrammar lg = new LexerGrammar( "lexer grammar L;\n"+ - "A : 'xy' .\n" + // should not pursue '.' since xy already hit stop + "A : 'xy' .\n" + // should pursue '.' since xyz hits stop first, before 2nd alt " | 'xy'\n" + " ;\n"); checkLexerMatches(lg, "xy", "A, EOF"); checkLexerMatches(lg, "xyz", "A, EOF"); } - @Test public void testWildOnEndLast() throws Exception { + @Test public void testWildOnEndLastAlt() throws Exception { LexerGrammar lg = new LexerGrammar( "lexer grammar L;\n"+ "A : 'xy'\n" + @@ -83,6 +83,15 @@ public class TestATNLexerInterpreter extends BaseTest { assertEquals("NoViableAltException('q')", e.toString()); } + @Test public void testWildcardNonQuirkWhenSplitBetweenTwoRules() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "A : 'xy' ;\n" + + "B : 'xy' . 'z' ;\n"); + checkLexerMatches(lg, "xy", "A, EOF"); + checkLexerMatches(lg, "xyz", "B, EOF"); + } + @Test public void testLexerLoops() throws Exception { LexerGrammar lg = new LexerGrammar( "lexer grammar L;\n"+ @@ -135,7 +144,7 @@ public class TestATNLexerInterpreter extends BaseTest { checkLexerMatches(lg, "/* ick */\n/* /*nested*/ */", expecting); } - @Ignore public void testLexerWildcardNonGreedyLoopByDefault() throws Exception { + @Test public void testLexerWildcardNonGreedyLoopByDefault() throws Exception { LexerGrammar lg = new LexerGrammar( "lexer grammar L;\n"+ "CMT : '//' .* '\\n' ;\n"); @@ -143,25 +152,23 @@ public class TestATNLexerInterpreter extends BaseTest { checkLexerMatches(lg, "//x\n//y\n", expecting); } - // should not work. no priority within a single rule. the subrule won't work. need modes - @Ignore - public void testLexerEscapeInString() throws Exception { + @Test public void testLexerEscapeInString() throws Exception { LexerGrammar lg = new LexerGrammar( "lexer grammar L;\n"+ - "STR : '\"' ('\\\\' '\"' | .)* '\"' ;\n"); // STR : '"' ('\\' '"' | .)* '"' - checkLexerMatches(lg, "\"a\\\"b\"", "STR, EOF"); - checkLexerMatches(lg, "\"a\"", "STR, EOF"); + "STR : '[' ('~' ']' | .)* ']' ;\n"); + checkLexerMatches(lg, "[a~]b]", "STR, EOF"); + checkLexerMatches(lg, "[a]", "STR, EOF"); } @Test public void testLexerWildcardNonGreedyPlusLoopByDefault() throws Exception { LexerGrammar lg = new LexerGrammar( "lexer grammar L;\n"+ - "CMT : '//' (options {greedy=false;}:.)+ '\\n' ;\n"); + "CMT : '//' .+ '\\n' ;\n"); String expecting = "CMT, CMT, EOF"; checkLexerMatches(lg, "//x\n//y\n", expecting); } - // does not fail since ('*/')? cant match and have rule succeed + // does not fail since ('*/')? can't match and have rule succeed @Test public void testLexerGreedyOptionalShouldWorkAsWeExpect() throws Exception { LexerGrammar lg = new LexerGrammar( "lexer grammar L;\n"+ @@ -179,6 +186,60 @@ public class TestATNLexerInterpreter extends BaseTest { checkLexerMatches(lg, "", expecting); } + @Test public void testEOFAtEndOfLineComment() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "CMT : '//' ~('\n')* ;\n"); + String expecting = "CMT, EOF"; + checkLexerMatches(lg, "//x", expecting); + } + + @Test public void testEOFAtEndOfLineComment2() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "CMT : '//' ~('\n'|'\r')* ;\n"); + String expecting = "CMT, EOF"; + checkLexerMatches(lg, "//x", expecting); + } + + /** only positive sets like (EOF|'\n') can match EOF and not in wildcard or ~foo sets + * EOF matches but does not advance cursor. + */ + @Test public void testEOFInSetAtEndOfLineComment() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "CMT : '//' .* (EOF|'\n') ;\n"); + String expecting = "CMT, EOF"; + checkLexerMatches(lg, "//", expecting); + } + + @Test public void testEOFSuffixInSecondRule() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "A : 'a' ;\n"+ // shorter than 'a' EOF, despite EOF being 0 width + "B : 'a' EOF ;\n"); + String expecting = "B, EOF"; + checkLexerMatches(lg, "a", expecting); + } + + @Test public void testEOFSuffixInFirstRule() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "A : 'a' EOF ;\n"+ + "B : 'a';\n"); + String expecting = "A, EOF"; + checkLexerMatches(lg, "a", expecting); + } + + @Test public void testEOFByItself() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "DONE : EOF ;\n"+ + "A : 'a';\n"); + String expecting = "A, DONE, EOF"; + checkLexerMatches(lg, "a", expecting); + } + protected LexerRecognitionExeption checkLexerMatches(LexerGrammar lg, String inputString, String expecting) { ATN atn = createATN(lg); CharStream input = new ANTLRStringStream(inputString); diff --git a/tool/test/org/antlr/v4/test/TestATNSerialization.java b/tool/test/org/antlr/v4/test/TestATNSerialization.java index 07721bf21..8a7b5d142 100644 --- a/tool/test/org/antlr/v4/test/TestATNSerialization.java +++ b/tool/test/org/antlr/v4/test/TestATNSerialization.java @@ -31,6 +31,53 @@ public class TestATNSerialization extends BaseTest { assertEquals(expecting, result); } + @Test public void testEOF() throws Exception { + Grammar g = new Grammar( + "parser grammar T;\n"+ + "a : A EOF ;"); + String expecting = + "max type 3\n" + + "0:RULE_START 0\n" + + "1:RULE_STOP 0\n" + + "2:BASIC 0\n" + + "3:BASIC 0\n" + + "4:BASIC 0\n" + + "5:BASIC 0\n" + + "6:BASIC 0\n" + + "rule 0:0 0,0\n" + + "0->2 EPSILON 0,0,0\n" + + "1->6 ATOM -1,0,0\n" + + "2->3 ATOM 3,0,0\n" + + "3->4 EPSILON 0,0,0\n" + + "4->5 ATOM -1,0,0\n" + + "5->1 EPSILON 0,0,0\n"; + ATN atn = createATN(g); + String result = ATNSerializer.getDecoded(g, atn); + assertEquals(expecting, result); + } + + @Test public void testEOFInSet() throws Exception { + Grammar g = new Grammar( + "parser grammar T;\n"+ + "a : (A|EOF) ;"); + String expecting = + "max type 3\n" + + "0:RULE_START 0\n" + + "1:RULE_STOP 0\n" + + "2:BASIC 0\n" + + "3:BASIC 0\n" + + "4:BASIC 0\n" + + "rule 0:0 0,0\n" + + "0:EOF..EOF, A..A\n" + + "0->2 EPSILON 0,0,0\n" + + "1->4 ATOM -1,0,0\n" + + "2->3 SET 0,0,0\n" + + "3->1 EPSILON 0,0,0\n"; + ATN atn = createATN(g); + String result = ATNSerializer.getDecoded(g, atn); + assertEquals(expecting, result); + } + @Test public void testNot() throws Exception { Grammar g = new Grammar( "parser grammar T;\n"+ @@ -283,6 +330,70 @@ public class TestATNSerialization extends BaseTest { assertEquals(expecting, result); } + @Test public void testLexerEOF() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "INT : 'a' EOF ;\n"); + String expecting = + "max type 3\n" + + "0:TOKEN_START -1\n" + + "1:RULE_START 0\n" + + "2:RULE_STOP 0\n" + + "3:BASIC 0\n" + + "4:BASIC 0\n" + + "5:BASIC 0\n" + + "6:BASIC 0\n" + + "rule 0:1 3,-1\n" + + "mode 0:0\n" + + "0->1 EPSILON 0,0,0\n" + + "1->3 EPSILON 0,0,0\n" + + "3->4 ATOM 97,0,0\n" + + "4->5 EPSILON 0,0,0\n" + + "5->6 ATOM -1,0,0\n" + + "6->2 EPSILON 0,0,0\n" + + "0:0 1\n"; + ATN atn = createATN(lg); + String result = ATNSerializer.getDecoded(lg, atn); + assertEquals(expecting, result); + } + + @Test public void testLexerEOFInSet() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "INT : 'a' (EOF|'\n') ;\n"); + String expecting = + "max type 3\n" + + "0:TOKEN_START -1\n" + + "1:RULE_START 0\n" + + "2:RULE_STOP 0\n" + + "3:BASIC 0\n" + + "4:BASIC 0\n" + + "5:BASIC 0\n" + + "6:BASIC 0\n" + + "7:BASIC 0\n" + + "8:BASIC 0\n" + + "9:BLOCK_START 0\n" + + "10:BLOCK_END 0\n" + + "rule 0:1 3,-1\n" + + "mode 0:0\n" + + "0->1 EPSILON 0,0,0\n" + + "1->3 EPSILON 0,0,0\n" + + "3->4 ATOM 97,0,0\n" + + "4->9 EPSILON 0,0,0\n" + + "5->6 ATOM -1,0,0\n" + + "6->10 EPSILON 0,0,0\n" + + "7->8 ATOM 10,0,0\n" + + "8->10 EPSILON 0,0,0\n" + + "9->5 EPSILON 0,0,0\n" + + "9->7 EPSILON 0,0,0\n" + + "10->2 EPSILON 0,0,0\n" + + "0:0 1\n" + + "1:9 1\n"; + ATN atn = createATN(lg); + String result = ATNSerializer.getDecoded(lg, atn); + assertEquals(expecting, result); + } + @Test public void testLexerLoops() throws Exception { LexerGrammar lg = new LexerGrammar( "lexer grammar L;\n"+