forked from jasder/antlr
got EOF in lexer
[git-p4: depot-paths = "//depot/code/antlr4/main/": change = 9223]
This commit is contained in:
parent
6576abf65f
commit
6e2bbcdb42
|
@ -210,7 +210,7 @@ public class LexerATNSimulator extends ATNSimulator {
|
|||
// we got nowhere on t, don't throw out this knowledge; it'd
|
||||
// cause a failover from DFA later. Don't track EOF edges
|
||||
// from stop states, though.
|
||||
if ( t!=Token.EOF ) addDFAEdge(from, t, ERROR);
|
||||
if ( t!=CharStream.EOF ) addDFAEdge(from, t, ERROR);
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -223,12 +223,17 @@ public class LexerATNSimulator extends ATNSimulator {
|
|||
input.index()+", reach="+reach+
|
||||
", prevAccept="+prevAccept+", prevIndex="+prevAcceptIndex);
|
||||
}
|
||||
if ( input.index() > prevAcceptIndex ) {
|
||||
int index = input.index();
|
||||
if ( index > prevAcceptIndex ) {
|
||||
// will favor prev accept at same index so "int" is keyword not ID
|
||||
prevAccept = c;
|
||||
prevAcceptIndex = input.index();
|
||||
if ( t == CharStream.EOF ) {
|
||||
// later we seek to prevAcceptIndex+1, undo that effect for EOF
|
||||
index--;
|
||||
}
|
||||
prevAcceptIndex = index;
|
||||
if ( debug ) {
|
||||
System.out.println("mark "+c+" @ index="+input.index());
|
||||
System.out.println("mark "+c+" @ index="+index);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -243,10 +248,11 @@ public class LexerATNSimulator extends ATNSimulator {
|
|||
}
|
||||
|
||||
consume(input, t);
|
||||
addDFAEdge(closure, t, reach);
|
||||
if ( t!=CharStream.EOF ) addDFAEdge(closure, t, reach);
|
||||
t = input.LA(1);
|
||||
|
||||
// swap to avoid reallocating space
|
||||
// TODO: faster to reallocate?
|
||||
OrderedHashSet<ATNConfig> tmp = reach;
|
||||
reach = closure;
|
||||
closure = tmp;
|
||||
|
@ -256,7 +262,7 @@ public class LexerATNSimulator extends ATNSimulator {
|
|||
if ( prevAccept==null ) {
|
||||
if ( t==Token.EOF ) {
|
||||
System.out.println("EOF in token at input index "+input.index());
|
||||
return Token.EOF;
|
||||
//return Token.EOF;
|
||||
}
|
||||
// System.out.println("no viable token at input "+getTokenName(input.LA(1))+", index "+input.index());
|
||||
throw new LexerNoViableAltException(recog, input, closure); // TODO: closure is empty
|
||||
|
@ -296,11 +302,10 @@ public class LexerATNSimulator extends ATNSimulator {
|
|||
else if ( trans instanceof SetTransition ) {
|
||||
SetTransition st = (SetTransition)trans;
|
||||
boolean not = trans instanceof NotSetTransition;
|
||||
if ( !not && st.set.contains(t) || not && !st.set.contains(t) ) {
|
||||
// if ( st.set.toString().equals("0") ) {
|
||||
// System.out.println("eh?");
|
||||
// }
|
||||
if ( debug ) System.out.println("match set "+st.set.toString(true));
|
||||
if ( (!not && st.set.contains(t)) ||
|
||||
(not && !st.set.contains(t) && t!=Token.EOF) ) // ~set does not match EOF
|
||||
{
|
||||
if ( debug ) System.out.println("match "+(not?"~":"")+"set "+st.set.toString(true));
|
||||
return st.target;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -32,13 +32,11 @@ package org.antlr.v4.automata;
|
|||
import org.antlr.runtime.Token;
|
||||
import org.antlr.v4.misc.CharSupport;
|
||||
import org.antlr.v4.parse.ANTLRParser;
|
||||
import org.antlr.v4.runtime.CharStream;
|
||||
import org.antlr.v4.runtime.atn.*;
|
||||
import org.antlr.v4.runtime.misc.IntervalSet;
|
||||
import org.antlr.v4.tool.LexerGrammar;
|
||||
import org.antlr.v4.tool.Rule;
|
||||
import org.antlr.v4.tool.ast.ActionAST;
|
||||
import org.antlr.v4.tool.ast.GrammarAST;
|
||||
import org.antlr.v4.tool.ast.TerminalAST;
|
||||
import org.antlr.v4.tool.*;
|
||||
import org.antlr.v4.tool.ast.*;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
|
@ -122,6 +120,7 @@ public class LexerATNFactory extends ParserATNFactory {
|
|||
}
|
||||
}
|
||||
if ( invert ) {
|
||||
// TODO: what? should be chars not token types
|
||||
IntervalSet notSet = (IntervalSet)set.complement(Token.MIN_TOKEN_TYPE, g.getMaxTokenType());
|
||||
left.addTransition(new NotSetTransition(set, notSet, right));
|
||||
}
|
||||
|
@ -157,6 +156,13 @@ public class LexerATNFactory extends ParserATNFactory {
|
|||
|
||||
@Override
|
||||
public Handle tokenRef(TerminalAST node) {
|
||||
// Ref to EOF in lexer yields char transition on -1
|
||||
if ( node.getText().equals("EOF") ) {
|
||||
ATNState left = newState(node);
|
||||
ATNState right = newState(node);
|
||||
left.addTransition(new AtomTransition(CharStream.EOF, right));
|
||||
return new Handle(left, right);
|
||||
}
|
||||
return _ruleRef(node);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -763,16 +763,7 @@ blockSet
|
|||
boolean ebnf = false;
|
||||
}
|
||||
: LPAREN setElement (OR setElement)* RPAREN
|
||||
/* {
|
||||
t = input.LT(1);
|
||||
ebnf = t!=null && (t.getType()==QUESTION || t.getType()==STAR || t.getType()==PLUS);
|
||||
}
|
||||
*/
|
||||
-> ^(BLOCK<BlockAST>[$LPAREN,"BLOCK"] ^(ALT setElement)+ )
|
||||
/*
|
||||
-> {ebnf}? ^(BLOCK<BlockAST>[$LPAREN,"BLOCK"] ^(ALT ^(SET[$LPAREN,"SET"] setElement+ )))
|
||||
-> ^(SET[$LPAREN,"SET"] setElement+ )
|
||||
*/
|
||||
-> ^(SET<SetAST>[$LPAREN,"SET"] setElement+ )
|
||||
;
|
||||
|
||||
setElement
|
||||
|
|
|
@ -14,6 +14,20 @@ public class TestATNDeserialization extends BaseTest {
|
|||
checkDeserializationIsStable(g);
|
||||
}
|
||||
|
||||
@Test public void testEOF() throws Exception {
|
||||
Grammar g = new Grammar(
|
||||
"parser grammar T;\n"+
|
||||
"a : EOF ;");
|
||||
checkDeserializationIsStable(g);
|
||||
}
|
||||
|
||||
@Test public void testEOFInSet() throws Exception {
|
||||
Grammar g = new Grammar(
|
||||
"parser grammar T;\n"+
|
||||
"a : (EOF|A) ;");
|
||||
checkDeserializationIsStable(g);
|
||||
}
|
||||
|
||||
@Test public void testNot() throws Exception {
|
||||
Grammar g = new Grammar(
|
||||
"parser grammar T;\n"+
|
||||
|
@ -67,6 +81,20 @@ public class TestATNDeserialization extends BaseTest {
|
|||
checkDeserializationIsStable(lg);
|
||||
}
|
||||
|
||||
@Test public void testLexerEOF() throws Exception {
|
||||
LexerGrammar lg = new LexerGrammar(
|
||||
"lexer grammar L;\n"+
|
||||
"A : 'a' EOF ;\n");
|
||||
checkDeserializationIsStable(lg);
|
||||
}
|
||||
|
||||
@Test public void testLexerEOFInSet() throws Exception {
|
||||
LexerGrammar lg = new LexerGrammar(
|
||||
"lexer grammar L;\n"+
|
||||
"A : 'a' (EOF|'\n') ;\n");
|
||||
checkDeserializationIsStable(lg);
|
||||
}
|
||||
|
||||
@Test public void testLexerRange() throws Exception {
|
||||
LexerGrammar lg = new LexerGrammar(
|
||||
"lexer grammar L;\n"+
|
||||
|
|
|
@ -4,7 +4,7 @@ import org.antlr.v4.misc.Utils;
|
|||
import org.antlr.v4.runtime.*;
|
||||
import org.antlr.v4.runtime.atn.*;
|
||||
import org.antlr.v4.tool.*;
|
||||
import org.junit.*;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
|
@ -14,7 +14,7 @@ import java.util.List;
|
|||
* several rules and even within a rule. However, that conflicts
|
||||
* with the notion of non-greedy, which by definition tries to match
|
||||
* the fewest possible. During ATN construction, non-greedy loops
|
||||
* have their entry and exit branches reversed so that the ATM
|
||||
* have their entry and exit branches reversed so that the ATN
|
||||
* simulator will see the exit branch 1st, giving it a priority. The
|
||||
* 1st path to the stop state kills any other paths for that rule
|
||||
* that begin with the wildcard. In general, this does everything we
|
||||
|
@ -51,17 +51,17 @@ public class TestATNLexerInterpreter extends BaseTest {
|
|||
checkLexerMatches(lg, "xyz", "A, EOF");
|
||||
}
|
||||
|
||||
@Test public void testWildOnEnd() throws Exception {
|
||||
@Test public void testWildOnEndFirstAlt() throws Exception {
|
||||
LexerGrammar lg = new LexerGrammar(
|
||||
"lexer grammar L;\n"+
|
||||
"A : 'xy' .\n" + // should not pursue '.' since xy already hit stop
|
||||
"A : 'xy' .\n" + // should pursue '.' since xyz hits stop first, before 2nd alt
|
||||
" | 'xy'\n" +
|
||||
" ;\n");
|
||||
checkLexerMatches(lg, "xy", "A, EOF");
|
||||
checkLexerMatches(lg, "xyz", "A, EOF");
|
||||
}
|
||||
|
||||
@Test public void testWildOnEndLast() throws Exception {
|
||||
@Test public void testWildOnEndLastAlt() throws Exception {
|
||||
LexerGrammar lg = new LexerGrammar(
|
||||
"lexer grammar L;\n"+
|
||||
"A : 'xy'\n" +
|
||||
|
@ -83,6 +83,15 @@ public class TestATNLexerInterpreter extends BaseTest {
|
|||
assertEquals("NoViableAltException('q')", e.toString());
|
||||
}
|
||||
|
||||
@Test public void testWildcardNonQuirkWhenSplitBetweenTwoRules() throws Exception {
|
||||
LexerGrammar lg = new LexerGrammar(
|
||||
"lexer grammar L;\n"+
|
||||
"A : 'xy' ;\n" +
|
||||
"B : 'xy' . 'z' ;\n");
|
||||
checkLexerMatches(lg, "xy", "A, EOF");
|
||||
checkLexerMatches(lg, "xyz", "B, EOF");
|
||||
}
|
||||
|
||||
@Test public void testLexerLoops() throws Exception {
|
||||
LexerGrammar lg = new LexerGrammar(
|
||||
"lexer grammar L;\n"+
|
||||
|
@ -135,7 +144,7 @@ public class TestATNLexerInterpreter extends BaseTest {
|
|||
checkLexerMatches(lg, "/* ick */\n/* /*nested*/ */", expecting);
|
||||
}
|
||||
|
||||
@Ignore public void testLexerWildcardNonGreedyLoopByDefault() throws Exception {
|
||||
@Test public void testLexerWildcardNonGreedyLoopByDefault() throws Exception {
|
||||
LexerGrammar lg = new LexerGrammar(
|
||||
"lexer grammar L;\n"+
|
||||
"CMT : '//' .* '\\n' ;\n");
|
||||
|
@ -143,25 +152,23 @@ public class TestATNLexerInterpreter extends BaseTest {
|
|||
checkLexerMatches(lg, "//x\n//y\n", expecting);
|
||||
}
|
||||
|
||||
// should not work. no priority within a single rule. the subrule won't work. need modes
|
||||
@Ignore
|
||||
public void testLexerEscapeInString() throws Exception {
|
||||
@Test public void testLexerEscapeInString() throws Exception {
|
||||
LexerGrammar lg = new LexerGrammar(
|
||||
"lexer grammar L;\n"+
|
||||
"STR : '\"' ('\\\\' '\"' | .)* '\"' ;\n"); // STR : '"' ('\\' '"' | .)* '"'
|
||||
checkLexerMatches(lg, "\"a\\\"b\"", "STR, EOF");
|
||||
checkLexerMatches(lg, "\"a\"", "STR, EOF");
|
||||
"STR : '[' ('~' ']' | .)* ']' ;\n");
|
||||
checkLexerMatches(lg, "[a~]b]", "STR, EOF");
|
||||
checkLexerMatches(lg, "[a]", "STR, EOF");
|
||||
}
|
||||
|
||||
@Test public void testLexerWildcardNonGreedyPlusLoopByDefault() throws Exception {
|
||||
LexerGrammar lg = new LexerGrammar(
|
||||
"lexer grammar L;\n"+
|
||||
"CMT : '//' (options {greedy=false;}:.)+ '\\n' ;\n");
|
||||
"CMT : '//' .+ '\\n' ;\n");
|
||||
String expecting = "CMT, CMT, EOF";
|
||||
checkLexerMatches(lg, "//x\n//y\n", expecting);
|
||||
}
|
||||
|
||||
// does not fail since ('*/')? cant match and have rule succeed
|
||||
// does not fail since ('*/')? can't match and have rule succeed
|
||||
@Test public void testLexerGreedyOptionalShouldWorkAsWeExpect() throws Exception {
|
||||
LexerGrammar lg = new LexerGrammar(
|
||||
"lexer grammar L;\n"+
|
||||
|
@ -179,6 +186,60 @@ public class TestATNLexerInterpreter extends BaseTest {
|
|||
checkLexerMatches(lg, "<a><x>", expecting);
|
||||
}
|
||||
|
||||
@Test public void testEOFAtEndOfLineComment() throws Exception {
|
||||
LexerGrammar lg = new LexerGrammar(
|
||||
"lexer grammar L;\n"+
|
||||
"CMT : '//' ~('\n')* ;\n");
|
||||
String expecting = "CMT, EOF";
|
||||
checkLexerMatches(lg, "//x", expecting);
|
||||
}
|
||||
|
||||
@Test public void testEOFAtEndOfLineComment2() throws Exception {
|
||||
LexerGrammar lg = new LexerGrammar(
|
||||
"lexer grammar L;\n"+
|
||||
"CMT : '//' ~('\n'|'\r')* ;\n");
|
||||
String expecting = "CMT, EOF";
|
||||
checkLexerMatches(lg, "//x", expecting);
|
||||
}
|
||||
|
||||
/** Only positive sets like (EOF|'\n') can match EOF; wildcards and ~foo sets never match it.
|
||||
* EOF matches but does not advance cursor.
|
||||
*/
|
||||
@Test public void testEOFInSetAtEndOfLineComment() throws Exception {
|
||||
LexerGrammar lg = new LexerGrammar(
|
||||
"lexer grammar L;\n"+
|
||||
"CMT : '//' .* (EOF|'\n') ;\n");
|
||||
String expecting = "CMT, EOF";
|
||||
checkLexerMatches(lg, "//", expecting);
|
||||
}
|
||||
|
||||
@Test public void testEOFSuffixInSecondRule() throws Exception {
|
||||
LexerGrammar lg = new LexerGrammar(
|
||||
"lexer grammar L;\n"+
|
||||
"A : 'a' ;\n"+ // shorter than 'a' EOF, despite EOF being 0 width
|
||||
"B : 'a' EOF ;\n");
|
||||
String expecting = "B, EOF";
|
||||
checkLexerMatches(lg, "a", expecting);
|
||||
}
|
||||
|
||||
@Test public void testEOFSuffixInFirstRule() throws Exception {
|
||||
LexerGrammar lg = new LexerGrammar(
|
||||
"lexer grammar L;\n"+
|
||||
"A : 'a' EOF ;\n"+
|
||||
"B : 'a';\n");
|
||||
String expecting = "A, EOF";
|
||||
checkLexerMatches(lg, "a", expecting);
|
||||
}
|
||||
|
||||
@Test public void testEOFByItself() throws Exception {
|
||||
LexerGrammar lg = new LexerGrammar(
|
||||
"lexer grammar L;\n"+
|
||||
"DONE : EOF ;\n"+
|
||||
"A : 'a';\n");
|
||||
String expecting = "A, DONE, EOF";
|
||||
checkLexerMatches(lg, "a", expecting);
|
||||
}
|
||||
|
||||
protected LexerRecognitionExeption checkLexerMatches(LexerGrammar lg, String inputString, String expecting) {
|
||||
ATN atn = createATN(lg);
|
||||
CharStream input = new ANTLRStringStream(inputString);
|
||||
|
|
|
@ -31,6 +31,53 @@ public class TestATNSerialization extends BaseTest {
|
|||
assertEquals(expecting, result);
|
||||
}
|
||||
|
||||
@Test public void testEOF() throws Exception {
|
||||
Grammar g = new Grammar(
|
||||
"parser grammar T;\n"+
|
||||
"a : A EOF ;");
|
||||
String expecting =
|
||||
"max type 3\n" +
|
||||
"0:RULE_START 0\n" +
|
||||
"1:RULE_STOP 0\n" +
|
||||
"2:BASIC 0\n" +
|
||||
"3:BASIC 0\n" +
|
||||
"4:BASIC 0\n" +
|
||||
"5:BASIC 0\n" +
|
||||
"6:BASIC 0\n" +
|
||||
"rule 0:0 0,0\n" +
|
||||
"0->2 EPSILON 0,0,0\n" +
|
||||
"1->6 ATOM -1,0,0\n" +
|
||||
"2->3 ATOM 3,0,0\n" +
|
||||
"3->4 EPSILON 0,0,0\n" +
|
||||
"4->5 ATOM -1,0,0\n" +
|
||||
"5->1 EPSILON 0,0,0\n";
|
||||
ATN atn = createATN(g);
|
||||
String result = ATNSerializer.getDecoded(g, atn);
|
||||
assertEquals(expecting, result);
|
||||
}
|
||||
|
||||
@Test public void testEOFInSet() throws Exception {
|
||||
Grammar g = new Grammar(
|
||||
"parser grammar T;\n"+
|
||||
"a : (A|EOF) ;");
|
||||
String expecting =
|
||||
"max type 3\n" +
|
||||
"0:RULE_START 0\n" +
|
||||
"1:RULE_STOP 0\n" +
|
||||
"2:BASIC 0\n" +
|
||||
"3:BASIC 0\n" +
|
||||
"4:BASIC 0\n" +
|
||||
"rule 0:0 0,0\n" +
|
||||
"0:EOF..EOF, A..A\n" +
|
||||
"0->2 EPSILON 0,0,0\n" +
|
||||
"1->4 ATOM -1,0,0\n" +
|
||||
"2->3 SET 0,0,0\n" +
|
||||
"3->1 EPSILON 0,0,0\n";
|
||||
ATN atn = createATN(g);
|
||||
String result = ATNSerializer.getDecoded(g, atn);
|
||||
assertEquals(expecting, result);
|
||||
}
|
||||
|
||||
@Test public void testNot() throws Exception {
|
||||
Grammar g = new Grammar(
|
||||
"parser grammar T;\n"+
|
||||
|
@ -283,6 +330,70 @@ public class TestATNSerialization extends BaseTest {
|
|||
assertEquals(expecting, result);
|
||||
}
|
||||
|
||||
@Test public void testLexerEOF() throws Exception {
|
||||
LexerGrammar lg = new LexerGrammar(
|
||||
"lexer grammar L;\n"+
|
||||
"INT : 'a' EOF ;\n");
|
||||
String expecting =
|
||||
"max type 3\n" +
|
||||
"0:TOKEN_START -1\n" +
|
||||
"1:RULE_START 0\n" +
|
||||
"2:RULE_STOP 0\n" +
|
||||
"3:BASIC 0\n" +
|
||||
"4:BASIC 0\n" +
|
||||
"5:BASIC 0\n" +
|
||||
"6:BASIC 0\n" +
|
||||
"rule 0:1 3,-1\n" +
|
||||
"mode 0:0\n" +
|
||||
"0->1 EPSILON 0,0,0\n" +
|
||||
"1->3 EPSILON 0,0,0\n" +
|
||||
"3->4 ATOM 97,0,0\n" +
|
||||
"4->5 EPSILON 0,0,0\n" +
|
||||
"5->6 ATOM -1,0,0\n" +
|
||||
"6->2 EPSILON 0,0,0\n" +
|
||||
"0:0 1\n";
|
||||
ATN atn = createATN(lg);
|
||||
String result = ATNSerializer.getDecoded(lg, atn);
|
||||
assertEquals(expecting, result);
|
||||
}
|
||||
|
||||
@Test public void testLexerEOFInSet() throws Exception {
|
||||
LexerGrammar lg = new LexerGrammar(
|
||||
"lexer grammar L;\n"+
|
||||
"INT : 'a' (EOF|'\n') ;\n");
|
||||
String expecting =
|
||||
"max type 3\n" +
|
||||
"0:TOKEN_START -1\n" +
|
||||
"1:RULE_START 0\n" +
|
||||
"2:RULE_STOP 0\n" +
|
||||
"3:BASIC 0\n" +
|
||||
"4:BASIC 0\n" +
|
||||
"5:BASIC 0\n" +
|
||||
"6:BASIC 0\n" +
|
||||
"7:BASIC 0\n" +
|
||||
"8:BASIC 0\n" +
|
||||
"9:BLOCK_START 0\n" +
|
||||
"10:BLOCK_END 0\n" +
|
||||
"rule 0:1 3,-1\n" +
|
||||
"mode 0:0\n" +
|
||||
"0->1 EPSILON 0,0,0\n" +
|
||||
"1->3 EPSILON 0,0,0\n" +
|
||||
"3->4 ATOM 97,0,0\n" +
|
||||
"4->9 EPSILON 0,0,0\n" +
|
||||
"5->6 ATOM -1,0,0\n" +
|
||||
"6->10 EPSILON 0,0,0\n" +
|
||||
"7->8 ATOM 10,0,0\n" +
|
||||
"8->10 EPSILON 0,0,0\n" +
|
||||
"9->5 EPSILON 0,0,0\n" +
|
||||
"9->7 EPSILON 0,0,0\n" +
|
||||
"10->2 EPSILON 0,0,0\n" +
|
||||
"0:0 1\n" +
|
||||
"1:9 1\n";
|
||||
ATN atn = createATN(lg);
|
||||
String result = ATNSerializer.getDecoded(lg, atn);
|
||||
assertEquals(expecting, result);
|
||||
}
|
||||
|
||||
@Test public void testLexerLoops() throws Exception {
|
||||
LexerGrammar lg = new LexerGrammar(
|
||||
"lexer grammar L;\n"+
|
||||
|
|
Loading…
Reference in New Issue