got lexer nongreedy loops going

[git-p4: depot-paths = "//depot/code/antlr4/main/": change = 9018]
parrt 2011-08-07 11:19:34 -08:00
parent a9782118ac
commit f44c49a8b8
7 changed files with 207 additions and 47 deletions

View File

@ -50,6 +50,6 @@ public class LexerNoViableAltException extends LexerRecognitionExeption {
}
public String toString() {
return "NoViableAltException('"+(char)c+"'";
return "NoViableAltException('"+(char)c+"')";
}
}

View File

@ -184,6 +184,21 @@ public class LexerATNSimulator extends ATNSimulator {
prevAccept = c;
prevAcceptIndex = input.index();
}
// if we reach lexer accept state, toss out any configs in rest
// of configs work list associated with this rule (config.alt);
// that rule is done. this is how we cut off nongreedy .+ loops.
deleteConfigsForAlt(closure, ci, c.alt);
// int j=ci+1;
// while ( j<closure.size() ) {
// ATNConfig c2 = closure.get(j);
// if ( c2.alt == c.alt ) {
// System.out.println("kill "+c2);
// closure.remove(j);
// }
// else j++;
// }
// move to next char, looking for longer match
// (we continue processing if there are states in reach)
}
@ -275,19 +290,19 @@ public class LexerATNSimulator extends ATNSimulator {
return null;
}
/* TODO: use if we need nongreedy
public void deleteConfigsForAlt(OrderedHashSet<ATNConfig> closure, int ci, int alt) {
int j=ci+1;
while ( j<closure.size() ) {
ATNConfig c = closure.get(j);
if ( c.alt == alt ) {
boolean isWildcard = c.state.getClass() == ATNState.class &&
c.state.transition(0).getClass() == WildcardTransition.class;
if ( c.alt == alt && isWildcard ) {
System.out.println("kill "+c);
closure.remove(j);
}
else j++;
}
}
*/
protected OrderedHashSet<ATNConfig> computeStartState(IntStream input,
ATNState p)
@ -337,31 +352,36 @@ public class LexerATNSimulator extends ATNSimulator {
ATNState p = config.state;
for (int i=0; i<p.getNumberOfTransitions(); i++) {
Transition t = p.transition(i);
ATNConfig c = null;
if ( t.getClass() == RuleTransition.class ) {
RuleContext newContext =
new RuleContext(config.context, p.stateNumber, t.target.stateNumber);
c = new ATNConfig(config, t.target, newContext);
}
else if ( t.getClass() == PredicateTransition.class ) {
PredicateTransition pt = (PredicateTransition)t;
if ( recog.sempred(null, pt.ruleIndex, pt.predIndex) ) {
c = new ATNConfig(config, t.target);
c.traversedPredicate = true;
}
}
// ignore actions; just exec one per rule upon accept
else if ( t.getClass() == ActionTransition.class ) {
c = new ATNConfig(config, t.target);
}
// TODO: forced actions?
else if ( t.isEpsilon() ) {
c = new ATNConfig(config, t.target);
}
ATNConfig c = getEpsilonTarget(config, t);
if ( c!=null ) closure(c, configs);
}
}
public ATNConfig getEpsilonTarget(ATNConfig config, Transition t) {
ATNState p = config.state;
ATNConfig c = null;
if ( t.getClass() == RuleTransition.class ) {
RuleContext newContext =
new RuleContext(config.context, p.stateNumber, t.target.stateNumber);
c = new ATNConfig(config, t.target, newContext);
}
else if ( t.getClass() == PredicateTransition.class ) {
PredicateTransition pt = (PredicateTransition)t;
if ( recog.sempred(null, pt.ruleIndex, pt.predIndex) ) {
c = new ATNConfig(config, t.target);
c.traversedPredicate = true;
}
}
// ignore actions; just exec one per rule upon accept
else if ( t.getClass() == ActionTransition.class ) {
c = new ATNConfig(config, t.target);
}
else if ( t.isEpsilon() ) {
c = new ATNConfig(config, t.target);
}
return c;
}
protected void addDFAEdge(OrderedHashSet<ATNConfig> p,
int t,
OrderedHashSet<ATNConfig> q)
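
The cutoff described in the closure comment above -- once a rule alternative reaches its accept state, any remaining wildcard configurations for that alternative are thrown away -- can be shown in isolation. This is a minimal sketch only: Config is a hypothetical stand-in for ATNConfig, the boolean wildcard flag stands in for "next transition is a WildcardTransition", and a plain List stands in for the OrderedHashSet work list.

import java.util.ArrayList;
import java.util.List;

class NongreedyCutoffSketch {
    static class Config {
        final int alt;          // which rule alternative this config belongs to
        final boolean wildcard; // would its next edge consume an arbitrary char?
        Config(int alt, boolean wildcard) { this.alt = alt; this.wildcard = wildcard; }
        @Override public String toString() { return "alt" + alt + (wildcard ? "(.)" : ""); }
    }

    // Once the config at index ci has reached an accept state for its rule,
    // discard any later wildcard configs for the same alt: that alt is done,
    // which is what stops a nongreedy .+ loop from extending the match.
    static void deleteConfigsForAlt(List<Config> closure, int ci, int alt) {
        int j = ci + 1;
        while (j < closure.size()) {
            Config c = closure.get(j);
            if (c.alt == alt && c.wildcard) closure.remove(j);
            else j++;
        }
    }

    public static void main(String[] args) {
        List<Config> closure = new ArrayList<Config>();
        closure.add(new Config(1, false)); // alt 1 just hit its accept state
        closure.add(new Config(1, true));  // pending wildcard work for alt 1 -> pruned
        closure.add(new Config(2, true));  // another alt's wildcard config survives
        deleteConfigsForAlt(closure, 0, 1);
        System.out.println(closure);       // prints [alt1, alt2(.)]
    }
}
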

tool/playground/Errors.g Normal file
View File

@ -0,0 +1,56 @@
grammar Errors;
stat: 'return' INT
| ID '=' expr ';'
| ID '(' expr (',' expr)* ')' ';'
/ ID .* '(' expr (',' expr)* ')' ';'
/ ID '=' .* ';' // bad assignment
/ .* ';' // bad stat
/ .* // match anything else? when to stop?
/ // match anything else?
;
catch[Exception e] { }
finally { }
// error to match might be diff than how to resynch? maybe just
// include resynch pattern on end of error alt.
/*
Traps any recog exception in anything called from rule or matched in that rule.
a : expr ';'
/ '--' ID ';' // catches any problem in expr or matching ';'
;
If no err alt matches, defaults to normal error mechanism at rule level.
report. resync.
*/
atom: '(' expr ')'
| INT
/ '(' expr // missing RP; how to resync?
/ '(' ')'
;
// do error alts affect FOLLOW sync sets? nope.
// foo -> bar says how to make resulting tree for bad alts
expr: atom ('*' atom)* ;
atom: INT ;
ID : 'a'..'z'+ ;
WS : (' '|'\n')* ;
/*
Stop .* when it sees any viable following token, even if it uses FOLLOW. So,
err alt
/ .*
would match until it sees something in FOLLOW (but not context-sensitive follow).
actually maybe it would be sensitive; just use real outer context when matching
error alts. who cares about speed.
*/

View File

@ -368,8 +368,17 @@ public class ParserATNFactory implements ATNFactory {
plusAST.atnState = blkStart;
blkStart.loopBackState = loop;
epsilon(blkEnd, loop); // blk can see loop back
epsilon(loop, blkStart); // loop back to start
epsilon(loop, end); // or exit
BlockAST blkAST = (BlockAST)plusAST.getChild(0);
if ( !g.isLexer() || isGreedy(blkAST) ) {
epsilon(loop, blkStart); // loop back to start
epsilon(loop, end); // or exit
}
else { // only lexers flip entry/exit branches for nongreedy
// if not greedy, priority to exit branch; make it first
epsilon(loop, end); // exit
epsilon(loop, blkStart); // loop back to start
}
return new Handle(blkStart, end);
}
@ -395,8 +404,16 @@ public class ParserATNFactory implements ATNFactory {
ATNState end = newState(ATNState.class, starAST);
StarLoopbackState loop = (StarLoopbackState)newState(StarLoopbackState.class, starAST);
epsilon(entry, blkStart); // loop enter edge (alt 1)
epsilon(entry, end); // bypass loop edge (alt 2)
BlockAST blkAST = (BlockAST)starAST.getChild(0);
if ( !g.isLexer() || isGreedy(blkAST) ) {
epsilon(entry, blkStart); // loop enter edge (alt 1)
epsilon(entry, end); // bypass loop edge (alt 2)
}
else { // only lexers flip entry/exit branches for nongreedy
// if not greedy, priority to exit branch; make it first
epsilon(entry, end); // bypass loop edge (alt 1)
epsilon(entry, blkStart); // loop enter edge (alt 2)
}
epsilon(blkEnd, loop); // block end hits loop back
epsilon(loop, entry); // loop back to entry/exit decision
@ -494,13 +511,12 @@ public class ParserATNFactory implements ATNFactory {
public ATNState newState() { return newState(null); }
public boolean isGreedy(BlockAST blkAST) {
return true;
// boolean greedy = true;
// String greedyOption = blkAST.getOption("greedy");
// if ( blockHasWildcardAlt(blkAST) || greedyOption!=null&&greedyOption.equals("false") ) {
// greedy = false;
// }
// return greedy;
boolean greedy = true;
String greedyOption = blkAST.getOption("greedy");
if ( blockHasWildcardAlt(blkAST) || greedyOption!=null&&greedyOption.equals("false") ) {
greedy = false;
}
return greedy;
}
// (BLOCK (ALT .)) or (BLOCK (ALT 'a') (ALT .))
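
The only difference between the greedy and nongreedy wiring above is the order in which the two epsilon edges hang off the decision state: per this change (and the test-file comment later in this commit), the simulator sees the exit branch first and gives it priority. Below is a minimal standalone sketch of that ordering, where Node is a hypothetical stand-in rather than the real ATNState/epsilon machinery.

import java.util.ArrayList;
import java.util.List;

class LoopEdgeOrderSketch {
    static class Node {
        final String name;
        final List<Node> epsilon = new ArrayList<Node>();
        Node(String name) { this.name = name; }
    }

    // Mirrors the (A)* decision wiring: greedy loops try to enter the block
    // first; nongreedy loops put the bypass/exit edge first instead, so the
    // first path to a stop state wins.
    static void wireStarDecision(Node entry, Node blkStart, Node end, boolean greedy) {
        if (greedy) {
            entry.epsilon.add(blkStart); // loop enter edge (alt 1)
            entry.epsilon.add(end);      // bypass loop edge (alt 2)
        }
        else {
            entry.epsilon.add(end);      // bypass loop edge (alt 1)
            entry.epsilon.add(blkStart); // loop enter edge (alt 2)
        }
    }

    public static void main(String[] args) {
        Node entry = new Node("entry"), blk = new Node("blkStart"), end = new Node("end");
        wireStarDecision(entry, blk, end, false);
        // the first edge gets priority, so a nongreedy loop prefers to stop looping
        System.out.println("tried first: " + entry.epsilon.get(0).name); // prints "end"
    }
}
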

View File

@ -199,10 +199,14 @@ optionsSpec
;
option
: ^(ASSIGN ID optionValue)
@init {
boolean rule = inContext("RULE ...");
boolean block = inContext("BLOCK ...");
}
: ^(a=ASSIGN ID optionValue)
{
if ( inContext("RULE") ) ruleOption($ID, $optionValue.v);
else if ( inContext("BLOCK") ) blockOption($ID, $optionValue.v);
if ( rule ) ruleOption($ID, $optionValue.v);
else if ( block ) blockOption($ID, $optionValue.v);
else grammarOption($ID, $optionValue.v);
}
;

View File

@ -8,6 +8,19 @@ import org.junit.*;
import java.util.List;
/**
* Lexer rules are a little quirky when it comes to wildcards. The problem
* stems from the fact that we want the longest match to win among
* several rules and even within a rule. However, that conflicts
* with the notion of non-greedy, which by definition tries to match
* as few characters as possible. During ATN construction, non-greedy loops
* have their entry and exit branches reversed so that the ATN
* simulator will see the exit branch 1st, giving it priority. The
* 1st path to the stop state kills any other paths for that rule
* that begin with the wildcard. In general, this does everything we
* want, but occasionally there are some quirks as you'll see from
* the tests below.
*/
public class TestATNLexerInterpreter extends BaseTest {
@Test public void testLexerTwoRules() throws Exception {
LexerGrammar lg = new LexerGrammar(
@ -22,12 +35,54 @@ public class TestATNLexerInterpreter extends BaseTest {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"A : 'xy'\n" +
" | 'xyz'\n" +
" | 'xyz'\n" + // make sure nongreedy mech cut off doesn't kill this alt
" ;\n");
checkLexerMatches(lg, "xy", "A, EOF");
checkLexerMatches(lg, "xyz", "A, EOF");
}
@Test public void testShortLongRule2() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"A : 'xyz'\n" + // make sure nongreedy mech cut off doesn't kill this alt
" | 'xy'\n" +
" ;\n");
checkLexerMatches(lg, "xy", "A, EOF");
checkLexerMatches(lg, "xyz", "A, EOF");
}
@Test public void testWildOnEnd() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"A : 'xy' .\n" + // should not pursue '.' since xy already hit stop
" | 'xy'\n" +
" ;\n");
checkLexerMatches(lg, "xy", "A, EOF");
checkLexerMatches(lg, "xyz", "A, EOF");
}
@Test public void testWildOnEndLast() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"A : 'xy'\n" +
" | 'xy' .\n" + // should not pursue '.' since xy already hit stop
" ;\n");
checkLexerMatches(lg, "xy", "A, EOF");
LexerRecognitionExeption e = checkLexerMatches(lg, "xyz", "A, EOF");
assertEquals("NoViableAltException('z')", e.toString());
}
@Test public void testWildcardQuirk() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"A : 'xy'\n" +
" | 'xy' . 'z'\n" + // will not pursue '.' since xy already hit stop (prior alt)
" ;\n");
// checkLexerMatches(lg, "xy", "A, EOF");
LexerRecognitionExeption e = checkLexerMatches(lg, "xyqz", "A, EOF");
assertEquals("NoViableAltException('q')", e.toString());
}
@Test public void testLexerLoops() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
@ -98,15 +153,16 @@ public class TestATNLexerInterpreter extends BaseTest {
checkLexerMatches(lg, "\"a\"", "STR, EOF");
}
@Ignore public void testLexerWildcardNonGreedyPlusLoopByDefault() throws Exception {
@Test public void testLexerWildcardNonGreedyPlusLoopByDefault() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"CMT : '//' .+ '\\n' ;\n");
"CMT : '//' (options {greedy=false;}:.)+ '\\n' ;\n");
String expecting = "CMT, CMT, EOF";
checkLexerMatches(lg, "//x\n//y\n", expecting);
}
@Ignore public void testLexerGreedyOptionalShouldWorkAsWeExpect() throws Exception {
// does not fail since ('*/')? can't match and still have the rule succeed
@Test public void testLexerGreedyOptionalShouldWorkAsWeExpect() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"CMT : '/*' ('*/')? '*/' ;\n");
@ -114,7 +170,7 @@ public class TestATNLexerInterpreter extends BaseTest {
checkLexerMatches(lg, "/**/", expecting);
}
@Ignore public void testNonGreedyBetweenRules() throws Exception {
@Test public void testNonGreedyBetweenRules() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"A : '<a>' ;\n" +
@ -123,14 +179,21 @@ public class TestATNLexerInterpreter extends BaseTest {
checkLexerMatches(lg, "<a><x>", expecting);
}
protected void checkLexerMatches(LexerGrammar lg, String inputString, String expecting) {
protected LexerRecognitionExeption checkLexerMatches(LexerGrammar lg, String inputString, String expecting) {
ATN atn = createATN(lg);
CharStream input = new ANTLRStringStream(inputString);
ATNState startState = atn.modeNameToStartState.get("DEFAULT_MODE");
DOTGenerator dot = new DOTGenerator(lg);
System.out.println(dot.getDOT(startState, true));
List<String> tokenTypes = getTokenTypes(lg, atn, input, false);
List<String> tokenTypes = null;
LexerRecognitionExeption retException = null;
try {
tokenTypes = getTokenTypes(lg, atn, input, false);
}
catch (LexerRecognitionExeption lre) { retException = lre; }
if ( retException!=null ) return retException;
String result = Utils.join(tokenTypes.iterator(), ", ");
System.out.println(tokenTypes);
assertEquals(expecting, result);
@ -139,6 +202,7 @@ public class TestATNLexerInterpreter extends BaseTest {
input.seek(0);
List<String> tokenTypes2 = getTokenTypes(lg, atn, input, true);
assertEquals("interp vs adaptive types differ", tokenTypes, tokenTypes2);
return null;
}
}

View File

@ -218,7 +218,7 @@ public class TestATNSerialization extends BaseTest {
"1->8 ATOM -1,0,0\n" +
"2->6 EPSILON 0,0,0\n" +
"3->5 EPSILON 0,0,0\n" +
"4->5 RULE 2,1,-1\n" +
"4->5 RULE 2,1,0\n" +
"5->1 EPSILON 0,0,0\n" +
"6->7 ATOM 3,0,0\n" +
"7->3 EPSILON 0,0,0\n";