got EOF in lexer

[git-p4: depot-paths = "//depot/code/antlr4/main/": change = 9223]
This commit is contained in:
parrt 2011-10-29 11:08:40 -08:00
parent 6576abf65f
commit 6e2bbcdb42
6 changed files with 242 additions and 40 deletions

View File

@ -210,7 +210,7 @@ public class LexerATNSimulator extends ATNSimulator {
// we got nowhere on t, don't throw out this knowledge; it'd // we got nowhere on t, don't throw out this knowledge; it'd
// cause a failover from DFA later. Don't track EOF edges // cause a failover from DFA later. Don't track EOF edges
// from stop states, though. // from stop states, though.
if ( t!=Token.EOF ) addDFAEdge(from, t, ERROR); if ( t!=CharStream.EOF ) addDFAEdge(from, t, ERROR);
break; break;
} }
@ -223,12 +223,17 @@ public class LexerATNSimulator extends ATNSimulator {
input.index()+", reach="+reach+ input.index()+", reach="+reach+
", prevAccept="+prevAccept+", prevIndex="+prevAcceptIndex); ", prevAccept="+prevAccept+", prevIndex="+prevAcceptIndex);
} }
if ( input.index() > prevAcceptIndex ) { int index = input.index();
if ( index > prevAcceptIndex ) {
// will favor prev accept at same index so "int" is keyword not ID // will favor prev accept at same index so "int" is keyword not ID
prevAccept = c; prevAccept = c;
prevAcceptIndex = input.index(); if ( t == CharStream.EOF ) {
// later we seek to prevAcceptIndex+1, undo that effect for EOF
index--;
}
prevAcceptIndex = index;
if ( debug ) { if ( debug ) {
System.out.println("mark "+c+" @ index="+input.index()); System.out.println("mark "+c+" @ index="+index);
} }
} }
@ -243,10 +248,11 @@ public class LexerATNSimulator extends ATNSimulator {
} }
consume(input, t); consume(input, t);
addDFAEdge(closure, t, reach); if ( t!=CharStream.EOF ) addDFAEdge(closure, t, reach);
t = input.LA(1); t = input.LA(1);
// swap to avoid reallocating space // swap to avoid reallocating space
// TODO: faster to reallocate?
OrderedHashSet<ATNConfig> tmp = reach; OrderedHashSet<ATNConfig> tmp = reach;
reach = closure; reach = closure;
closure = tmp; closure = tmp;
@ -256,7 +262,7 @@ public class LexerATNSimulator extends ATNSimulator {
if ( prevAccept==null ) { if ( prevAccept==null ) {
if ( t==Token.EOF ) { if ( t==Token.EOF ) {
System.out.println("EOF in token at input index "+input.index()); System.out.println("EOF in token at input index "+input.index());
return Token.EOF; //return Token.EOF;
} }
// System.out.println("no viable token at input "+getTokenName(input.LA(1))+", index "+input.index()); // System.out.println("no viable token at input "+getTokenName(input.LA(1))+", index "+input.index());
throw new LexerNoViableAltException(recog, input, closure); // TODO: closure is empty throw new LexerNoViableAltException(recog, input, closure); // TODO: closure is empty
@ -296,11 +302,10 @@ public class LexerATNSimulator extends ATNSimulator {
else if ( trans instanceof SetTransition ) { else if ( trans instanceof SetTransition ) {
SetTransition st = (SetTransition)trans; SetTransition st = (SetTransition)trans;
boolean not = trans instanceof NotSetTransition; boolean not = trans instanceof NotSetTransition;
if ( !not && st.set.contains(t) || not && !st.set.contains(t) ) { if ( (!not && st.set.contains(t)) ||
// if ( st.set.toString().equals("0") ) { (not && !st.set.contains(t) && t!=Token.EOF) ) // ~set does not match EOF
// System.out.println("eh?"); {
// } if ( debug ) System.out.println("match "+(not?"~":"")+"set "+st.set.toString(true));
if ( debug ) System.out.println("match set "+st.set.toString(true));
return st.target; return st.target;
} }
} }

View File

@ -32,13 +32,11 @@ package org.antlr.v4.automata;
import org.antlr.runtime.Token; import org.antlr.runtime.Token;
import org.antlr.v4.misc.CharSupport; import org.antlr.v4.misc.CharSupport;
import org.antlr.v4.parse.ANTLRParser; import org.antlr.v4.parse.ANTLRParser;
import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.atn.*; import org.antlr.v4.runtime.atn.*;
import org.antlr.v4.runtime.misc.IntervalSet; import org.antlr.v4.runtime.misc.IntervalSet;
import org.antlr.v4.tool.LexerGrammar; import org.antlr.v4.tool.*;
import org.antlr.v4.tool.Rule; import org.antlr.v4.tool.ast.*;
import org.antlr.v4.tool.ast.ActionAST;
import org.antlr.v4.tool.ast.GrammarAST;
import org.antlr.v4.tool.ast.TerminalAST;
import java.util.List; import java.util.List;
@ -122,6 +120,7 @@ public class LexerATNFactory extends ParserATNFactory {
} }
} }
if ( invert ) { if ( invert ) {
// TODO: what? should be chars not token types
IntervalSet notSet = (IntervalSet)set.complement(Token.MIN_TOKEN_TYPE, g.getMaxTokenType()); IntervalSet notSet = (IntervalSet)set.complement(Token.MIN_TOKEN_TYPE, g.getMaxTokenType());
left.addTransition(new NotSetTransition(set, notSet, right)); left.addTransition(new NotSetTransition(set, notSet, right));
} }
@ -157,6 +156,13 @@ public class LexerATNFactory extends ParserATNFactory {
@Override @Override
public Handle tokenRef(TerminalAST node) { public Handle tokenRef(TerminalAST node) {
// Ref to EOF in lexer yields char transition on -1
if ( node.getText().equals("EOF") ) {
ATNState left = newState(node);
ATNState right = newState(node);
left.addTransition(new AtomTransition(CharStream.EOF, right));
return new Handle(left, right);
}
return _ruleRef(node); return _ruleRef(node);
} }
} }

View File

@ -763,16 +763,7 @@ blockSet
boolean ebnf = false; boolean ebnf = false;
} }
: LPAREN setElement (OR setElement)* RPAREN : LPAREN setElement (OR setElement)* RPAREN
/* { -> ^(SET<SetAST>[$LPAREN,"SET"] setElement+ )
t = input.LT(1);
ebnf = t!=null && (t.getType()==QUESTION || t.getType()==STAR || t.getType()==PLUS);
}
*/
-> ^(BLOCK<BlockAST>[$LPAREN,"BLOCK"] ^(ALT setElement)+ )
/*
-> {ebnf}? ^(BLOCK<BlockAST>[$LPAREN,"BLOCK"] ^(ALT ^(SET[$LPAREN,"SET"] setElement+ )))
-> ^(SET[$LPAREN,"SET"] setElement+ )
*/
; ;
setElement setElement

View File

@ -14,6 +14,20 @@ public class TestATNDeserialization extends BaseTest {
checkDeserializationIsStable(g); checkDeserializationIsStable(g);
} }
@Test public void testEOF() throws Exception {
Grammar g = new Grammar(
"parser grammar T;\n"+
"a : EOF ;");
checkDeserializationIsStable(g);
}
@Test public void testEOFInSet() throws Exception {
Grammar g = new Grammar(
"parser grammar T;\n"+
"a : (EOF|A) ;");
checkDeserializationIsStable(g);
}
@Test public void testNot() throws Exception { @Test public void testNot() throws Exception {
Grammar g = new Grammar( Grammar g = new Grammar(
"parser grammar T;\n"+ "parser grammar T;\n"+
@ -67,6 +81,20 @@ public class TestATNDeserialization extends BaseTest {
checkDeserializationIsStable(lg); checkDeserializationIsStable(lg);
} }
@Test public void testLexerEOF() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"A : 'a' EOF ;\n");
checkDeserializationIsStable(lg);
}
@Test public void testLexerEOFInSet() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"A : 'a' (EOF|'\n') ;\n");
checkDeserializationIsStable(lg);
}
@Test public void testLexerRange() throws Exception { @Test public void testLexerRange() throws Exception {
LexerGrammar lg = new LexerGrammar( LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+ "lexer grammar L;\n"+

View File

@ -4,7 +4,7 @@ import org.antlr.v4.misc.Utils;
import org.antlr.v4.runtime.*; import org.antlr.v4.runtime.*;
import org.antlr.v4.runtime.atn.*; import org.antlr.v4.runtime.atn.*;
import org.antlr.v4.tool.*; import org.antlr.v4.tool.*;
import org.junit.*; import org.junit.Test;
import java.util.List; import java.util.List;
@ -14,7 +14,7 @@ import java.util.List;
* several rules and even within a rule. However, that conflicts * several rules and even within a rule. However, that conflicts
* with the notion of non-greedy, which by definition tries to match * with the notion of non-greedy, which by definition tries to match
* the fewest possible. During ATN construction, non-greedy loops * the fewest possible. During ATN construction, non-greedy loops
* have their entry and exit branches reversed so that the ATM * have their entry and exit branches reversed so that the ATN
* simulator will see the exit branch 1st, giving it a priority. The * simulator will see the exit branch 1st, giving it a priority. The
* 1st path to the stop state kills any other paths for that rule * 1st path to the stop state kills any other paths for that rule
* that begin with the wildcard. In general, this does everything we * that begin with the wildcard. In general, this does everything we
@ -51,17 +51,17 @@ public class TestATNLexerInterpreter extends BaseTest {
checkLexerMatches(lg, "xyz", "A, EOF"); checkLexerMatches(lg, "xyz", "A, EOF");
} }
@Test public void testWildOnEnd() throws Exception { @Test public void testWildOnEndFirstAlt() throws Exception {
LexerGrammar lg = new LexerGrammar( LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+ "lexer grammar L;\n"+
"A : 'xy' .\n" + // should not pursue '.' since xy already hit stop "A : 'xy' .\n" + // should pursue '.' since xyz hits stop first, before 2nd alt
" | 'xy'\n" + " | 'xy'\n" +
" ;\n"); " ;\n");
checkLexerMatches(lg, "xy", "A, EOF"); checkLexerMatches(lg, "xy", "A, EOF");
checkLexerMatches(lg, "xyz", "A, EOF"); checkLexerMatches(lg, "xyz", "A, EOF");
} }
@Test public void testWildOnEndLast() throws Exception { @Test public void testWildOnEndLastAlt() throws Exception {
LexerGrammar lg = new LexerGrammar( LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+ "lexer grammar L;\n"+
"A : 'xy'\n" + "A : 'xy'\n" +
@ -83,6 +83,15 @@ public class TestATNLexerInterpreter extends BaseTest {
assertEquals("NoViableAltException('q')", e.toString()); assertEquals("NoViableAltException('q')", e.toString());
} }
@Test public void testWildcardNonQuirkWhenSplitBetweenTwoRules() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"A : 'xy' ;\n" +
"B : 'xy' . 'z' ;\n");
checkLexerMatches(lg, "xy", "A, EOF");
checkLexerMatches(lg, "xyz", "B, EOF");
}
@Test public void testLexerLoops() throws Exception { @Test public void testLexerLoops() throws Exception {
LexerGrammar lg = new LexerGrammar( LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+ "lexer grammar L;\n"+
@ -135,7 +144,7 @@ public class TestATNLexerInterpreter extends BaseTest {
checkLexerMatches(lg, "/* ick */\n/* /*nested*/ */", expecting); checkLexerMatches(lg, "/* ick */\n/* /*nested*/ */", expecting);
} }
@Ignore public void testLexerWildcardNonGreedyLoopByDefault() throws Exception { @Test public void testLexerWildcardNonGreedyLoopByDefault() throws Exception {
LexerGrammar lg = new LexerGrammar( LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+ "lexer grammar L;\n"+
"CMT : '//' .* '\\n' ;\n"); "CMT : '//' .* '\\n' ;\n");
@ -143,25 +152,23 @@ public class TestATNLexerInterpreter extends BaseTest {
checkLexerMatches(lg, "//x\n//y\n", expecting); checkLexerMatches(lg, "//x\n//y\n", expecting);
} }
// should not work. no priority within a single rule. the subrule won't work. need modes @Test public void testLexerEscapeInString() throws Exception {
@Ignore
public void testLexerEscapeInString() throws Exception {
LexerGrammar lg = new LexerGrammar( LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+ "lexer grammar L;\n"+
"STR : '\"' ('\\\\' '\"' | .)* '\"' ;\n"); // STR : '"' ('\\' '"' | .)* '"' "STR : '[' ('~' ']' | .)* ']' ;\n");
checkLexerMatches(lg, "\"a\\\"b\"", "STR, EOF"); checkLexerMatches(lg, "[a~]b]", "STR, EOF");
checkLexerMatches(lg, "\"a\"", "STR, EOF"); checkLexerMatches(lg, "[a]", "STR, EOF");
} }
@Test public void testLexerWildcardNonGreedyPlusLoopByDefault() throws Exception { @Test public void testLexerWildcardNonGreedyPlusLoopByDefault() throws Exception {
LexerGrammar lg = new LexerGrammar( LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+ "lexer grammar L;\n"+
"CMT : '//' (options {greedy=false;}:.)+ '\\n' ;\n"); "CMT : '//' .+ '\\n' ;\n");
String expecting = "CMT, CMT, EOF"; String expecting = "CMT, CMT, EOF";
checkLexerMatches(lg, "//x\n//y\n", expecting); checkLexerMatches(lg, "//x\n//y\n", expecting);
} }
// does not fail since ('*/')? cant match and have rule succeed // does not fail since ('*/')? can't match and have rule succeed
@Test public void testLexerGreedyOptionalShouldWorkAsWeExpect() throws Exception { @Test public void testLexerGreedyOptionalShouldWorkAsWeExpect() throws Exception {
LexerGrammar lg = new LexerGrammar( LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+ "lexer grammar L;\n"+
@ -179,6 +186,60 @@ public class TestATNLexerInterpreter extends BaseTest {
checkLexerMatches(lg, "<a><x>", expecting); checkLexerMatches(lg, "<a><x>", expecting);
} }
@Test public void testEOFAtEndOfLineComment() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"CMT : '//' ~('\n')* ;\n");
String expecting = "CMT, EOF";
checkLexerMatches(lg, "//x", expecting);
}
@Test public void testEOFAtEndOfLineComment2() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"CMT : '//' ~('\n'|'\r')* ;\n");
String expecting = "CMT, EOF";
checkLexerMatches(lg, "//x", expecting);
}
/** Only positive sets like (EOF|'\n') can match EOF; wildcard and ~foo sets cannot.
* EOF matches but does not advance the cursor.
*/
@Test public void testEOFInSetAtEndOfLineComment() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"CMT : '//' .* (EOF|'\n') ;\n");
String expecting = "CMT, EOF";
checkLexerMatches(lg, "//", expecting);
}
@Test public void testEOFSuffixInSecondRule() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"A : 'a' ;\n"+ // shorter than 'a' EOF, despite EOF being 0 width
"B : 'a' EOF ;\n");
String expecting = "B, EOF";
checkLexerMatches(lg, "a", expecting);
}
@Test public void testEOFSuffixInFirstRule() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"A : 'a' EOF ;\n"+
"B : 'a';\n");
String expecting = "A, EOF";
checkLexerMatches(lg, "a", expecting);
}
@Test public void testEOFByItself() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"DONE : EOF ;\n"+
"A : 'a';\n");
String expecting = "A, DONE, EOF";
checkLexerMatches(lg, "a", expecting);
}
protected LexerRecognitionExeption checkLexerMatches(LexerGrammar lg, String inputString, String expecting) { protected LexerRecognitionExeption checkLexerMatches(LexerGrammar lg, String inputString, String expecting) {
ATN atn = createATN(lg); ATN atn = createATN(lg);
CharStream input = new ANTLRStringStream(inputString); CharStream input = new ANTLRStringStream(inputString);

View File

@ -31,6 +31,53 @@ public class TestATNSerialization extends BaseTest {
assertEquals(expecting, result); assertEquals(expecting, result);
} }
@Test public void testEOF() throws Exception {
Grammar g = new Grammar(
"parser grammar T;\n"+
"a : A EOF ;");
String expecting =
"max type 3\n" +
"0:RULE_START 0\n" +
"1:RULE_STOP 0\n" +
"2:BASIC 0\n" +
"3:BASIC 0\n" +
"4:BASIC 0\n" +
"5:BASIC 0\n" +
"6:BASIC 0\n" +
"rule 0:0 0,0\n" +
"0->2 EPSILON 0,0,0\n" +
"1->6 ATOM -1,0,0\n" +
"2->3 ATOM 3,0,0\n" +
"3->4 EPSILON 0,0,0\n" +
"4->5 ATOM -1,0,0\n" +
"5->1 EPSILON 0,0,0\n";
ATN atn = createATN(g);
String result = ATNSerializer.getDecoded(g, atn);
assertEquals(expecting, result);
}
@Test public void testEOFInSet() throws Exception {
Grammar g = new Grammar(
"parser grammar T;\n"+
"a : (A|EOF) ;");
String expecting =
"max type 3\n" +
"0:RULE_START 0\n" +
"1:RULE_STOP 0\n" +
"2:BASIC 0\n" +
"3:BASIC 0\n" +
"4:BASIC 0\n" +
"rule 0:0 0,0\n" +
"0:EOF..EOF, A..A\n" +
"0->2 EPSILON 0,0,0\n" +
"1->4 ATOM -1,0,0\n" +
"2->3 SET 0,0,0\n" +
"3->1 EPSILON 0,0,0\n";
ATN atn = createATN(g);
String result = ATNSerializer.getDecoded(g, atn);
assertEquals(expecting, result);
}
@Test public void testNot() throws Exception { @Test public void testNot() throws Exception {
Grammar g = new Grammar( Grammar g = new Grammar(
"parser grammar T;\n"+ "parser grammar T;\n"+
@ -283,6 +330,70 @@ public class TestATNSerialization extends BaseTest {
assertEquals(expecting, result); assertEquals(expecting, result);
} }
@Test public void testLexerEOF() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"INT : 'a' EOF ;\n");
String expecting =
"max type 3\n" +
"0:TOKEN_START -1\n" +
"1:RULE_START 0\n" +
"2:RULE_STOP 0\n" +
"3:BASIC 0\n" +
"4:BASIC 0\n" +
"5:BASIC 0\n" +
"6:BASIC 0\n" +
"rule 0:1 3,-1\n" +
"mode 0:0\n" +
"0->1 EPSILON 0,0,0\n" +
"1->3 EPSILON 0,0,0\n" +
"3->4 ATOM 97,0,0\n" +
"4->5 EPSILON 0,0,0\n" +
"5->6 ATOM -1,0,0\n" +
"6->2 EPSILON 0,0,0\n" +
"0:0 1\n";
ATN atn = createATN(lg);
String result = ATNSerializer.getDecoded(lg, atn);
assertEquals(expecting, result);
}
@Test public void testLexerEOFInSet() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"INT : 'a' (EOF|'\n') ;\n");
String expecting =
"max type 3\n" +
"0:TOKEN_START -1\n" +
"1:RULE_START 0\n" +
"2:RULE_STOP 0\n" +
"3:BASIC 0\n" +
"4:BASIC 0\n" +
"5:BASIC 0\n" +
"6:BASIC 0\n" +
"7:BASIC 0\n" +
"8:BASIC 0\n" +
"9:BLOCK_START 0\n" +
"10:BLOCK_END 0\n" +
"rule 0:1 3,-1\n" +
"mode 0:0\n" +
"0->1 EPSILON 0,0,0\n" +
"1->3 EPSILON 0,0,0\n" +
"3->4 ATOM 97,0,0\n" +
"4->9 EPSILON 0,0,0\n" +
"5->6 ATOM -1,0,0\n" +
"6->10 EPSILON 0,0,0\n" +
"7->8 ATOM 10,0,0\n" +
"8->10 EPSILON 0,0,0\n" +
"9->5 EPSILON 0,0,0\n" +
"9->7 EPSILON 0,0,0\n" +
"10->2 EPSILON 0,0,0\n" +
"0:0 1\n" +
"1:9 1\n";
ATN atn = createATN(lg);
String result = ATNSerializer.getDecoded(lg, atn);
assertEquals(expecting, result);
}
@Test public void testLexerLoops() throws Exception { @Test public void testLexerLoops() throws Exception {
LexerGrammar lg = new LexerGrammar( LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+ "lexer grammar L;\n"+