got EOF in lexer

[git-p4: depot-paths = "//depot/code/antlr4/main/": change = 9223]
This commit is contained in:
parrt 2011-10-29 11:08:40 -08:00
parent 6576abf65f
commit 6e2bbcdb42
6 changed files with 242 additions and 40 deletions

View File

@ -210,7 +210,7 @@ public class LexerATNSimulator extends ATNSimulator {
// we got nowhere on t, don't throw out this knowledge; it'd
// cause a failover from DFA later. Don't track EOF edges
// from stop states, though.
if ( t!=Token.EOF ) addDFAEdge(from, t, ERROR);
if ( t!=CharStream.EOF ) addDFAEdge(from, t, ERROR);
break;
}
@ -223,12 +223,17 @@ public class LexerATNSimulator extends ATNSimulator {
input.index()+", reach="+reach+
", prevAccept="+prevAccept+", prevIndex="+prevAcceptIndex);
}
if ( input.index() > prevAcceptIndex ) {
int index = input.index();
if ( index > prevAcceptIndex ) {
// will favor prev accept at same index so "int" is keyword not ID
prevAccept = c;
prevAcceptIndex = input.index();
if ( t == CharStream.EOF ) {
// later we seek to prevAcceptIndex+1, undo that effect for EOF
index--;
}
prevAcceptIndex = index;
if ( debug ) {
System.out.println("mark "+c+" @ index="+input.index());
System.out.println("mark "+c+" @ index="+index);
}
}
@ -243,10 +248,11 @@ public class LexerATNSimulator extends ATNSimulator {
}
consume(input, t);
addDFAEdge(closure, t, reach);
if ( t!=CharStream.EOF ) addDFAEdge(closure, t, reach);
t = input.LA(1);
// swap to avoid reallocating space
// TODO: faster to reallocate?
OrderedHashSet<ATNConfig> tmp = reach;
reach = closure;
closure = tmp;
@ -256,7 +262,7 @@ public class LexerATNSimulator extends ATNSimulator {
if ( prevAccept==null ) {
if ( t==Token.EOF ) {
System.out.println("EOF in token at input index "+input.index());
return Token.EOF;
//return Token.EOF;
}
// System.out.println("no viable token at input "+getTokenName(input.LA(1))+", index "+input.index());
throw new LexerNoViableAltException(recog, input, closure); // TODO: closure is empty
@ -296,11 +302,10 @@ public class LexerATNSimulator extends ATNSimulator {
else if ( trans instanceof SetTransition ) {
SetTransition st = (SetTransition)trans;
boolean not = trans instanceof NotSetTransition;
if ( !not && st.set.contains(t) || not && !st.set.contains(t) ) {
// if ( st.set.toString().equals("0") ) {
// System.out.println("eh?");
// }
if ( debug ) System.out.println("match set "+st.set.toString(true));
if ( (!not && st.set.contains(t)) ||
(not && !st.set.contains(t) && t!=Token.EOF) ) // ~set does not match EOF
{
if ( debug ) System.out.println("match "+(not?"~":"")+"set "+st.set.toString(true));
return st.target;
}
}

View File

@ -32,13 +32,11 @@ package org.antlr.v4.automata;
import org.antlr.runtime.Token;
import org.antlr.v4.misc.CharSupport;
import org.antlr.v4.parse.ANTLRParser;
import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.atn.*;
import org.antlr.v4.runtime.misc.IntervalSet;
import org.antlr.v4.tool.LexerGrammar;
import org.antlr.v4.tool.Rule;
import org.antlr.v4.tool.ast.ActionAST;
import org.antlr.v4.tool.ast.GrammarAST;
import org.antlr.v4.tool.ast.TerminalAST;
import org.antlr.v4.tool.*;
import org.antlr.v4.tool.ast.*;
import java.util.List;
@ -122,6 +120,7 @@ public class LexerATNFactory extends ParserATNFactory {
}
}
if ( invert ) {
// TODO: suspicious -- the complement below ranges over token types, but a lexer set should range over chars
IntervalSet notSet = (IntervalSet)set.complement(Token.MIN_TOKEN_TYPE, g.getMaxTokenType());
left.addTransition(new NotSetTransition(set, notSet, right));
}
@ -157,6 +156,13 @@ public class LexerATNFactory extends ParserATNFactory {
/**
 * Builds the handle for a token reference appearing in a lexer rule.
 * A reference whose text is exactly "EOF" becomes two fresh states joined by
 * an atom transition on CharStream.EOF (the original note describes this as a
 * char transition on -1); every other reference is delegated to _ruleRef(node).
 *
 * @param node the terminal AST node being referenced
 * @return the handle spanning the states created for the reference
 */
@Override
public Handle tokenRef(TerminalAST node) {
    boolean refersToEof = node.getText().equals("EOF");
    if ( !refersToEof ) {
        return _ruleRef(node);
    }
    // Ref to EOF in lexer yields char transition on -1
    ATNState start = newState(node);
    ATNState stop = newState(node);
    start.addTransition(new AtomTransition(CharStream.EOF, stop));
    return new Handle(start, stop);
}
}

View File

@ -763,16 +763,7 @@ blockSet
boolean ebnf = false;
}
: LPAREN setElement (OR setElement)* RPAREN
/* {
t = input.LT(1);
ebnf = t!=null && (t.getType()==QUESTION || t.getType()==STAR || t.getType()==PLUS);
}
*/
-> ^(BLOCK<BlockAST>[$LPAREN,"BLOCK"] ^(ALT setElement)+ )
/*
-> {ebnf}? ^(BLOCK<BlockAST>[$LPAREN,"BLOCK"] ^(ALT ^(SET[$LPAREN,"SET"] setElement+ )))
-> ^(SET[$LPAREN,"SET"] setElement+ )
*/
-> ^(SET<SetAST>[$LPAREN,"SET"] setElement+ )
;
setElement

View File

@ -14,6 +14,20 @@ public class TestATNDeserialization extends BaseTest {
checkDeserializationIsStable(g);
}
/** Passes the parser grammar {@code a : EOF ;} to checkDeserializationIsStable. */
@Test public void testEOF() throws Exception {
    String grammarText =
        "parser grammar T;\n"+
        "a : EOF ;";
    Grammar parserGrammar = new Grammar(grammarText);
    checkDeserializationIsStable(parserGrammar);
}
/** Passes the parser grammar {@code a : (EOF|A) ;} to checkDeserializationIsStable. */
@Test public void testEOFInSet() throws Exception {
    String grammarText =
        "parser grammar T;\n"+
        "a : (EOF|A) ;";
    Grammar parserGrammar = new Grammar(grammarText);
    checkDeserializationIsStable(parserGrammar);
}
@Test public void testNot() throws Exception {
Grammar g = new Grammar(
"parser grammar T;\n"+
@ -67,6 +81,20 @@ public class TestATNDeserialization extends BaseTest {
checkDeserializationIsStable(lg);
}
/** Passes a lexer grammar whose rule A is 'a' followed by EOF to checkDeserializationIsStable. */
@Test public void testLexerEOF() throws Exception {
    String grammarText =
        "lexer grammar L;\n"+
        "A : 'a' EOF ;\n";
    LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
    checkDeserializationIsStable(lexerGrammar);
}
/** Passes a lexer grammar whose rule A ends in a subrule containing EOF to checkDeserializationIsStable. */
@Test public void testLexerEOFInSet() throws Exception {
    String grammarText =
        "lexer grammar L;\n"+
        "A : 'a' (EOF|'\n') ;\n";
    LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
    checkDeserializationIsStable(lexerGrammar);
}
@Test public void testLexerRange() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+

View File

@ -4,7 +4,7 @@ import org.antlr.v4.misc.Utils;
import org.antlr.v4.runtime.*;
import org.antlr.v4.runtime.atn.*;
import org.antlr.v4.tool.*;
import org.junit.*;
import org.junit.Test;
import java.util.List;
@ -14,7 +14,7 @@ import java.util.List;
* several rules and even within a rule. However, that conflicts
* with the notion of non-greedy, which by definition tries to match
* the fewest possible. During ATN construction, non-greedy loops
* have their entry and exit branches reversed so that the ATM
* have their entry and exit branches reversed so that the ATN
* simulator will see the exit branch 1st, giving it a priority. The
* 1st path to the stop state kills any other paths for that rule
* that begin with the wildcard. In general, this does everything we
@ -51,17 +51,17 @@ public class TestATNLexerInterpreter extends BaseTest {
checkLexerMatches(lg, "xyz", "A, EOF");
}
@Test public void testWildOnEnd() throws Exception {
@Test public void testWildOnEndFirstAlt() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"A : 'xy' .\n" + // should not pursue '.' since xy already hit stop
"A : 'xy' .\n" + // should pursue '.' since xyz hits stop first, before 2nd alt
" | 'xy'\n" +
" ;\n");
checkLexerMatches(lg, "xy", "A, EOF");
checkLexerMatches(lg, "xyz", "A, EOF");
}
@Test public void testWildOnEndLast() throws Exception {
@Test public void testWildOnEndLastAlt() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"A : 'xy'\n" +
@ -83,6 +83,15 @@ public class TestATNLexerInterpreter extends BaseTest {
assertEquals("NoViableAltException('q')", e.toString());
}
/**
 * Two rules share the prefix 'xy'; only B continues with a wildcard and 'z'.
 * Expects "xy" to produce "A, EOF" and "xyz" to produce "B, EOF".
 */
@Test public void testWildcardNonQuirkWhenSplitBetweenTwoRules() throws Exception {
    String grammarText =
        "lexer grammar L;\n"+
        "A : 'xy' ;\n" +
        "B : 'xy' . 'z' ;\n";
    LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
    checkLexerMatches(lexerGrammar, "xy", "A, EOF");
    checkLexerMatches(lexerGrammar, "xyz", "B, EOF");
}
@Test public void testLexerLoops() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
@ -135,7 +144,7 @@ public class TestATNLexerInterpreter extends BaseTest {
checkLexerMatches(lg, "/* ick */\n/* /*nested*/ */", expecting);
}
@Ignore public void testLexerWildcardNonGreedyLoopByDefault() throws Exception {
@Test public void testLexerWildcardNonGreedyLoopByDefault() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"CMT : '//' .* '\\n' ;\n");
@ -143,25 +152,23 @@ public class TestATNLexerInterpreter extends BaseTest {
checkLexerMatches(lg, "//x\n//y\n", expecting);
}
// should not work. no priority within a single rule. the subrule won't work. need modes
@Ignore
public void testLexerEscapeInString() throws Exception {
@Test public void testLexerEscapeInString() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"STR : '\"' ('\\\\' '\"' | .)* '\"' ;\n"); // STR : '"' ('\\' '"' | .)* '"'
checkLexerMatches(lg, "\"a\\\"b\"", "STR, EOF");
checkLexerMatches(lg, "\"a\"", "STR, EOF");
"STR : '[' ('~' ']' | .)* ']' ;\n");
checkLexerMatches(lg, "[a~]b]", "STR, EOF");
checkLexerMatches(lg, "[a]", "STR, EOF");
}
@Test public void testLexerWildcardNonGreedyPlusLoopByDefault() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"CMT : '//' (options {greedy=false;}:.)+ '\\n' ;\n");
"CMT : '//' .+ '\\n' ;\n");
String expecting = "CMT, CMT, EOF";
checkLexerMatches(lg, "//x\n//y\n", expecting);
}
// does not fail since ('*/')? cant match and have rule succeed
// does not fail since ('*/')? can't match and have rule succeed
@Test public void testLexerGreedyOptionalShouldWorkAsWeExpect() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
@ -179,6 +186,60 @@ public class TestATNLexerInterpreter extends BaseTest {
checkLexerMatches(lg, "<a><x>", expecting);
}
/** Expects the input "//x", which ends without a newline, to lex as "CMT, EOF". */
@Test public void testEOFAtEndOfLineComment() throws Exception {
    String grammarText =
        "lexer grammar L;\n"+
        "CMT : '//' ~('\n')* ;\n";
    LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
    checkLexerMatches(lexerGrammar, "//x", "CMT, EOF");
}
/** Same input as the single-character variant, but the negated set holds two alternatives. */
@Test public void testEOFAtEndOfLineComment2() throws Exception {
    String grammarText =
        "lexer grammar L;\n"+
        "CMT : '//' ~('\n'|'\r')* ;\n";
    LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
    checkLexerMatches(lexerGrammar, "//x", "CMT, EOF");
}
/**
 * Only positive sets such as (EOF|'\n') can match EOF; wildcards and ~foo
 * sets cannot. EOF matches but does not advance the cursor.
 * Expects the input "//" to lex as "CMT, EOF".
 */
@Test public void testEOFInSetAtEndOfLineComment() throws Exception {
    String grammarText =
        "lexer grammar L;\n"+
        "CMT : '//' .* (EOF|'\n') ;\n";
    LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
    checkLexerMatches(lexerGrammar, "//", "CMT, EOF");
}
/**
 * Rule A ('a') is listed before rule B ('a' EOF).
 * Expects the input "a" to lex as "B, EOF".
 */
@Test public void testEOFSuffixInSecondRule() throws Exception {
    String grammarText =
        "lexer grammar L;\n"+
        "A : 'a' ;\n"+ // counts as shorter than 'a' EOF even though EOF is zero width
        "B : 'a' EOF ;\n";
    LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
    checkLexerMatches(lexerGrammar, "a", "B, EOF");
}
/**
 * Rule A ('a' EOF) is listed before rule B ('a').
 * Expects the input "a" to lex as "A, EOF".
 */
@Test public void testEOFSuffixInFirstRule() throws Exception {
    String grammarText =
        "lexer grammar L;\n"+
        "A : 'a' EOF ;\n"+
        "B : 'a';\n";
    LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
    checkLexerMatches(lexerGrammar, "a", "A, EOF");
}
/**
 * Rule DONE consists solely of EOF.
 * Expects the input "a" to lex as "A, DONE, EOF".
 */
@Test public void testEOFByItself() throws Exception {
    String grammarText =
        "lexer grammar L;\n"+
        "DONE : EOF ;\n"+
        "A : 'a';\n";
    LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
    checkLexerMatches(lexerGrammar, "a", "A, DONE, EOF");
}
protected LexerRecognitionExeption checkLexerMatches(LexerGrammar lg, String inputString, String expecting) {
ATN atn = createATN(lg);
CharStream input = new ANTLRStringStream(inputString);

View File

@ -31,6 +31,53 @@ public class TestATNSerialization extends BaseTest {
assertEquals(expecting, result);
}
/**
 * Compares the decoded serialization of the ATN for parser rule
 * {@code a : A EOF ;} against the expected dump, in which both EOF edges
 * appear as ATOM transitions on -1.
 */
@Test public void testEOF() throws Exception {
    Grammar parserGrammar = new Grammar(
        "parser grammar T;\n"+
        "a : A EOF ;");
    String expectedDump =
        "max type 3\n" +
        "0:RULE_START 0\n" +
        "1:RULE_STOP 0\n" +
        "2:BASIC 0\n" +
        "3:BASIC 0\n" +
        "4:BASIC 0\n" +
        "5:BASIC 0\n" +
        "6:BASIC 0\n" +
        "rule 0:0 0,0\n" +
        "0->2 EPSILON 0,0,0\n" +
        "1->6 ATOM -1,0,0\n" +
        "2->3 ATOM 3,0,0\n" +
        "3->4 EPSILON 0,0,0\n" +
        "4->5 ATOM -1,0,0\n" +
        "5->1 EPSILON 0,0,0\n";
    ATN builtAtn = createATN(parserGrammar);
    assertEquals(expectedDump, ATNSerializer.getDecoded(parserGrammar, builtAtn));
}
/**
 * Compares the decoded serialization of the ATN for parser rule
 * {@code a : (A|EOF) ;} against the expected dump, which lists one set
 * {@code EOF..EOF, A..A} and a single SET transition.
 */
@Test public void testEOFInSet() throws Exception {
    Grammar parserGrammar = new Grammar(
        "parser grammar T;\n"+
        "a : (A|EOF) ;");
    String expectedDump =
        "max type 3\n" +
        "0:RULE_START 0\n" +
        "1:RULE_STOP 0\n" +
        "2:BASIC 0\n" +
        "3:BASIC 0\n" +
        "4:BASIC 0\n" +
        "rule 0:0 0,0\n" +
        "0:EOF..EOF, A..A\n" +
        "0->2 EPSILON 0,0,0\n" +
        "1->4 ATOM -1,0,0\n" +
        "2->3 SET 0,0,0\n" +
        "3->1 EPSILON 0,0,0\n";
    ATN builtAtn = createATN(parserGrammar);
    assertEquals(expectedDump, ATNSerializer.getDecoded(parserGrammar, builtAtn));
}
@Test public void testNot() throws Exception {
Grammar g = new Grammar(
"parser grammar T;\n"+
@ -283,6 +330,70 @@ public class TestATNSerialization extends BaseTest {
assertEquals(expecting, result);
}
/**
 * Compares the decoded serialization of the ATN for lexer rule
 * {@code INT : 'a' EOF ;} against the expected dump, in which the EOF
 * reference appears as an ATOM transition on -1.
 */
@Test public void testLexerEOF() throws Exception {
    LexerGrammar lexerGrammar = new LexerGrammar(
        "lexer grammar L;\n"+
        "INT : 'a' EOF ;\n");
    String expectedDump =
        "max type 3\n" +
        "0:TOKEN_START -1\n" +
        "1:RULE_START 0\n" +
        "2:RULE_STOP 0\n" +
        "3:BASIC 0\n" +
        "4:BASIC 0\n" +
        "5:BASIC 0\n" +
        "6:BASIC 0\n" +
        "rule 0:1 3,-1\n" +
        "mode 0:0\n" +
        "0->1 EPSILON 0,0,0\n" +
        "1->3 EPSILON 0,0,0\n" +
        "3->4 ATOM 97,0,0\n" +
        "4->5 EPSILON 0,0,0\n" +
        "5->6 ATOM -1,0,0\n" +
        "6->2 EPSILON 0,0,0\n" +
        "0:0 1\n";
    ATN builtAtn = createATN(lexerGrammar);
    assertEquals(expectedDump, ATNSerializer.getDecoded(lexerGrammar, builtAtn));
}
/**
 * Compares the decoded serialization of the ATN for a lexer rule that ends
 * in a subrule with EOF as one alternative against the expected dump, in
 * which the subrule is a block with separate ATOM edges on -1 and 10.
 */
@Test public void testLexerEOFInSet() throws Exception {
    LexerGrammar lexerGrammar = new LexerGrammar(
        "lexer grammar L;\n"+
        "INT : 'a' (EOF|'\n') ;\n");
    String expectedDump =
        "max type 3\n" +
        "0:TOKEN_START -1\n" +
        "1:RULE_START 0\n" +
        "2:RULE_STOP 0\n" +
        "3:BASIC 0\n" +
        "4:BASIC 0\n" +
        "5:BASIC 0\n" +
        "6:BASIC 0\n" +
        "7:BASIC 0\n" +
        "8:BASIC 0\n" +
        "9:BLOCK_START 0\n" +
        "10:BLOCK_END 0\n" +
        "rule 0:1 3,-1\n" +
        "mode 0:0\n" +
        "0->1 EPSILON 0,0,0\n" +
        "1->3 EPSILON 0,0,0\n" +
        "3->4 ATOM 97,0,0\n" +
        "4->9 EPSILON 0,0,0\n" +
        "5->6 ATOM -1,0,0\n" +
        "6->10 EPSILON 0,0,0\n" +
        "7->8 ATOM 10,0,0\n" +
        "8->10 EPSILON 0,0,0\n" +
        "9->5 EPSILON 0,0,0\n" +
        "9->7 EPSILON 0,0,0\n" +
        "10->2 EPSILON 0,0,0\n" +
        "0:0 1\n" +
        "1:9 1\n";
    ATN builtAtn = createATN(lexerGrammar);
    assertEquals(expectedDump, ATNSerializer.getDecoded(lexerGrammar, builtAtn));
}
@Test public void testLexerLoops() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+