got lexer nongreedy loops going

[git-p4: depot-paths = "//depot/code/antlr4/main/": change = 9018]
parrt 2011-08-07 11:19:34 -08:00
parent a9782118ac
commit f44c49a8b8
7 changed files with 207 additions and 47 deletions

View File

@ -50,6 +50,6 @@ public class LexerNoViableAltException extends LexerRecognitionExeption {
}
public String toString() {
return "NoViableAltException('"+(char)c+"'";
return "NoViableAltException('"+(char)c+"')";
}
}

View File

@ -184,6 +184,21 @@ public class LexerATNSimulator extends ATNSimulator {
prevAccept = c;
prevAcceptIndex = input.index();
}
// if we reach lexer accept state, toss out any configs in rest
// of configs work list associated with this rule (config.alt);
// that rule is done. this is how we cut off nongreedy .+ loops.
deleteConfigsForAlt(closure, ci, c.alt);
// int j=ci+1;
// while ( j<closure.size() ) {
// ATNConfig c2 = closure.get(j);
// if ( c2.alt == c.alt ) {
// System.out.println("kill "+c2);
// closure.remove(j);
// }
// else j++;
// }
// move to next char, looking for longer match
// (we continue processing if there are states in reach)
}
@ -275,19 +290,19 @@ public class LexerATNSimulator extends ATNSimulator {
return null;
}
/* TODO: use if we need nongreedy
public void deleteConfigsForAlt(OrderedHashSet<ATNConfig> closure, int ci, int alt) {
int j=ci+1;
while ( j<closure.size() ) {
ATNConfig c = closure.get(j);
if ( c.alt == alt ) {
boolean isWildcard = c.state.getClass() == ATNState.class &&
c.state.transition(0).getClass() == WildcardTransition.class;
if ( c.alt == alt && isWildcard ) {
System.out.println("kill "+c);
closure.remove(j);
}
else j++;
}
}
*/
protected OrderedHashSet<ATNConfig> computeStartState(IntStream input,
ATNState p)
@ -337,31 +352,36 @@ public class LexerATNSimulator extends ATNSimulator {
ATNState p = config.state;
for (int i=0; i<p.getNumberOfTransitions(); i++) {
Transition t = p.transition(i);
ATNConfig c = null;
if ( t.getClass() == RuleTransition.class ) {
RuleContext newContext =
new RuleContext(config.context, p.stateNumber, t.target.stateNumber);
c = new ATNConfig(config, t.target, newContext);
}
else if ( t.getClass() == PredicateTransition.class ) {
PredicateTransition pt = (PredicateTransition)t;
if ( recog.sempred(null, pt.ruleIndex, pt.predIndex) ) {
c = new ATNConfig(config, t.target);
c.traversedPredicate = true;
}
}
// ignore actions; just exec one per rule upon accept
else if ( t.getClass() == ActionTransition.class ) {
c = new ATNConfig(config, t.target);
}
// TODO: forced actions?
else if ( t.isEpsilon() ) {
c = new ATNConfig(config, t.target);
}
ATNConfig c = getEpsilonTarget(config, t);
if ( c!=null ) closure(c, configs);
}
}
public ATNConfig getEpsilonTarget(ATNConfig config, Transition t) {
ATNState p = config.state;
ATNConfig c = null;
if ( t.getClass() == RuleTransition.class ) {
RuleContext newContext =
new RuleContext(config.context, p.stateNumber, t.target.stateNumber);
c = new ATNConfig(config, t.target, newContext);
}
else if ( t.getClass() == PredicateTransition.class ) {
PredicateTransition pt = (PredicateTransition)t;
if ( recog.sempred(null, pt.ruleIndex, pt.predIndex) ) {
c = new ATNConfig(config, t.target);
c.traversedPredicate = true;
}
}
// ignore actions; just exec one per rule upon accept
else if ( t.getClass() == ActionTransition.class ) {
c = new ATNConfig(config, t.target);
}
else if ( t.isEpsilon() ) {
c = new ATNConfig(config, t.target);
}
return c;
}
protected void addDFAEdge(OrderedHashSet<ATNConfig> p,
int t,
OrderedHashSet<ATNConfig> q)
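
The cutoff described in the closure comment above -- once a rule alternative reaches its accept state, any remaining wildcard configurations for that alternative are thrown away -- can be shown in isolation. This is a minimal sketch only: Config is a hypothetical stand-in for ATNConfig, the boolean wildcard flag stands in for "next transition is a WildcardTransition", and a plain List stands in for the OrderedHashSet work list.

import java.util.ArrayList;
import java.util.List;

class NongreedyCutoffSketch {
    static class Config {
        final int alt;          // which rule alternative this config belongs to
        final boolean wildcard; // would its next edge consume an arbitrary char?
        Config(int alt, boolean wildcard) { this.alt = alt; this.wildcard = wildcard; }
        @Override public String toString() { return "alt" + alt + (wildcard ? "(.)" : ""); }
    }

    // Once the config at index ci has reached an accept state for its rule,
    // discard any later wildcard configs for the same alt: that alt is done,
    // which is what stops a nongreedy .+ loop from extending the match.
    static void deleteConfigsForAlt(List<Config> closure, int ci, int alt) {
        int j = ci + 1;
        while (j < closure.size()) {
            Config c = closure.get(j);
            if (c.alt == alt && c.wildcard) closure.remove(j);
            else j++;
        }
    }

    public static void main(String[] args) {
        List<Config> closure = new ArrayList<Config>();
        closure.add(new Config(1, false)); // alt 1 just hit its accept state
        closure.add(new Config(1, true));  // pending wildcard work for alt 1 -> pruned
        closure.add(new Config(2, true));  // another alt's wildcard config survives
        deleteConfigsForAlt(closure, 0, 1);
        System.out.println(closure);       // prints [alt1, alt2(.)]
    }
}
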

tool/playground/Errors.g Normal file
View File

@ -0,0 +1,56 @@
grammar Errors;
stat: 'return' INT
| ID '=' expr ';'
| ID '(' expr (',' expr)* ')' ';'
/ ID .* '(' expr (',' expr)* ')' ';'
/ ID '=' .* ';' // bad assignment
/ .* ';' // bad stat
/ .* // match anything else? when to stop?
/ // match anything else?
;
catch[Exception e] { }
finally { }
// error to match might be diff than how to resynch? maybe just
// include resynch pattern on end of error alt.
/*
Traps any recog exception in anything called from rule or matched in that rule.
a : expr ';'
/ '--' ID ';' // catches any problem in expr or matching ';'
;
If no err alt matches, defaults to normal error mechanism at rule level.
report. resync.
*/
atom: '(' expr ')'
| INT
/ '(' expr // missing RP; how to resync?
/ '(' ')'
;
// do error alts affect FOLLOW sync sets? nope.
// foo -> bar says how to make resulting tree for bad alts
expr: atom ('*' atom)* ;
atom: INT ;
ID : 'a'..'z'+ ;
WS : (' '|'\n')* ;
/*
Stop .* when it sees any viable following token, even if it uses FOLLOW. So,
err alt
/ .*
would match until it sees something in FOLLOW (but not context-sensitive follow).
actually maybe it would be sensitive; just use real outer context when matching
error alts. who cares about speed.
*/

View File

@ -368,8 +368,17 @@ public class ParserATNFactory implements ATNFactory {
plusAST.atnState = blkStart;
blkStart.loopBackState = loop;
epsilon(blkEnd, loop); // blk can see loop back
epsilon(loop, blkStart); // loop back to start
epsilon(loop, end); // or exit
BlockAST blkAST = (BlockAST)plusAST.getChild(0);
if ( !g.isLexer() || isGreedy(blkAST) ) {
epsilon(loop, blkStart); // loop back to start
epsilon(loop, end); // or exit
}
else { // only lexers flip entry/exit branches for nongreedy
// if not greedy, priority to exit branch; make it first
epsilon(loop, end); // exit
epsilon(loop, blkStart); // loop back to start
}
return new Handle(blkStart, end);
}
@ -395,8 +404,16 @@ public class ParserATNFactory implements ATNFactory {
ATNState end = newState(ATNState.class, starAST);
StarLoopbackState loop = (StarLoopbackState)newState(StarLoopbackState.class, starAST);
epsilon(entry, blkStart); // loop enter edge (alt 1)
epsilon(entry, end); // bypass loop edge (alt 2)
BlockAST blkAST = (BlockAST)starAST.getChild(0);
if ( !g.isLexer() || isGreedy(blkAST) ) {
epsilon(entry, blkStart); // loop enter edge (alt 1)
epsilon(entry, end); // bypass loop edge (alt 2)
}
else { // only lexers flip entry/exit branches for nongreedy
// if not greedy, priority to exit branch; make it first
epsilon(entry, end); // bypass loop edge (alt 1)
epsilon(entry, blkStart); // loop enter edge (alt 2)
}
epsilon(blkEnd, loop); // block end hits loop back
epsilon(loop, entry); // loop back to entry/exit decision
@ -494,13 +511,12 @@ public class ParserATNFactory implements ATNFactory {
public ATNState newState() { return newState(null); }
public boolean isGreedy(BlockAST blkAST) {
return true;
// boolean greedy = true;
// String greedyOption = blkAST.getOption("greedy");
// if ( blockHasWildcardAlt(blkAST) || greedyOption!=null&&greedyOption.equals("false") ) {
// greedy = false;
// }
// return greedy;
boolean greedy = true;
String greedyOption = blkAST.getOption("greedy");
if ( blockHasWildcardAlt(blkAST) || greedyOption!=null&&greedyOption.equals("false") ) {
greedy = false;
}
return greedy;
}
// (BLOCK (ALT .)) or (BLOCK (ALT 'a') (ALT .))
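
The only difference between the greedy and nongreedy wiring above is the order in which the two epsilon edges hang off the decision state: per this change (and the test-file comment later in this commit), the simulator sees the exit branch first and gives it priority. Below is a minimal standalone sketch of that ordering, where Node is a hypothetical stand-in rather than the real ATNState/epsilon machinery.

import java.util.ArrayList;
import java.util.List;

class LoopEdgeOrderSketch {
    static class Node {
        final String name;
        final List<Node> epsilon = new ArrayList<Node>();
        Node(String name) { this.name = name; }
    }

    // Mirrors the (A)* decision wiring: greedy loops try to enter the block
    // first; nongreedy loops put the bypass/exit edge first instead, so the
    // first path to a stop state wins.
    static void wireStarDecision(Node entry, Node blkStart, Node end, boolean greedy) {
        if (greedy) {
            entry.epsilon.add(blkStart); // loop enter edge (alt 1)
            entry.epsilon.add(end);      // bypass loop edge (alt 2)
        }
        else {
            entry.epsilon.add(end);      // bypass loop edge (alt 1)
            entry.epsilon.add(blkStart); // loop enter edge (alt 2)
        }
    }

    public static void main(String[] args) {
        Node entry = new Node("entry"), blk = new Node("blkStart"), end = new Node("end");
        wireStarDecision(entry, blk, end, false);
        // the first edge gets priority, so a nongreedy loop prefers to stop looping
        System.out.println("tried first: " + entry.epsilon.get(0).name); // prints "end"
    }
}
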

View File

@ -199,10 +199,14 @@ optionsSpec
;
option
: ^(ASSIGN ID optionValue)
@init {
boolean rule = inContext("RULE ...");
boolean block = inContext("BLOCK ...");
}
: ^(a=ASSIGN ID optionValue)
{
if ( inContext("RULE") ) ruleOption($ID, $optionValue.v);
else if ( inContext("BLOCK") ) blockOption($ID, $optionValue.v);
if ( rule ) ruleOption($ID, $optionValue.v);
else if ( block ) blockOption($ID, $optionValue.v);
else grammarOption($ID, $optionValue.v);
}
;

View File

@ -8,6 +8,19 @@ import org.junit.*;
import java.util.List;
/**
* Lexer rules are a little quirky when it comes to wildcards. The problem
* stems from the fact that we want the longest match to win among
* several rules and even within a rule. However, that conflicts
* with the notion of non-greedy, which by definition tries to match
* as few characters as possible. During ATN construction, non-greedy loops
* have their entry and exit branches reversed so that the ATN
* simulator will see the exit branch 1st, giving it priority. The
* 1st path to the stop state kills any other paths for that rule
* that begin with the wildcard. In general, this does everything we
* want, but occasionally there are some quirks as you'll see from
* the tests below.
*/
public class TestATNLexerInterpreter extends BaseTest {
@Test public void testLexerTwoRules() throws Exception {
LexerGrammar lg = new LexerGrammar(
@ -22,12 +35,54 @@ public class TestATNLexerInterpreter extends BaseTest {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"A : 'xy'\n" +
" | 'xyz'\n" +
" | 'xyz'\n" + // make sure nongreedy mech cut off doesn't kill this alt
" ;\n");
checkLexerMatches(lg, "xy", "A, EOF");
checkLexerMatches(lg, "xyz", "A, EOF");
}
@Test public void testShortLongRule2() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"A : 'xyz'\n" + // make sure nongreedy mech cut off doesn't kill this alt
" | 'xy'\n" +
" ;\n");
checkLexerMatches(lg, "xy", "A, EOF");
checkLexerMatches(lg, "xyz", "A, EOF");
}
@Test public void testWildOnEnd() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"A : 'xy' .\n" + // should not pursue '.' since xy already hit stop
" | 'xy'\n" +
" ;\n");
checkLexerMatches(lg, "xy", "A, EOF");
checkLexerMatches(lg, "xyz", "A, EOF");
}
@Test public void testWildOnEndLast() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"A : 'xy'\n" +
" | 'xy' .\n" + // should not pursue '.' since xy already hit stop
" ;\n");
checkLexerMatches(lg, "xy", "A, EOF");
LexerRecognitionExeption e = checkLexerMatches(lg, "xyz", "A, EOF");
assertEquals("NoViableAltException('z')", e.toString());
}
@Test public void testWildcardQuirk() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"A : 'xy'\n" +
" | 'xy' . 'z'\n" + // will not pursue '.' since xy already hit stop (prior alt)
" ;\n");
// checkLexerMatches(lg, "xy", "A, EOF");
LexerRecognitionExeption e = checkLexerMatches(lg, "xyqz", "A, EOF");
assertEquals("NoViableAltException('q')", e.toString());
}
@Test public void testLexerLoops() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
@ -98,15 +153,16 @@ public class TestATNLexerInterpreter extends BaseTest {
checkLexerMatches(lg, "\"a\"", "STR, EOF");
}
@Ignore public void testLexerWildcardNonGreedyPlusLoopByDefault() throws Exception {
@Test public void testLexerWildcardNonGreedyPlusLoopByDefault() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"CMT : '//' .+ '\\n' ;\n");
"CMT : '//' (options {greedy=false;}:.)+ '\\n' ;\n");
String expecting = "CMT, CMT, EOF";
checkLexerMatches(lg, "//x\n//y\n", expecting);
}
@Ignore public void testLexerGreedyOptionalShouldWorkAsWeExpect() throws Exception {
// does not fail since ('*/')? can't match and still have the rule succeed
@Test public void testLexerGreedyOptionalShouldWorkAsWeExpect() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"CMT : '/*' ('*/')? '*/' ;\n");
@ -114,7 +170,7 @@ public class TestATNLexerInterpreter extends BaseTest {
checkLexerMatches(lg, "/**/", expecting);
}
@Ignore public void testNonGreedyBetweenRules() throws Exception {
@Test public void testNonGreedyBetweenRules() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"A : '<a>' ;\n" +
@ -123,14 +179,21 @@ public class TestATNLexerInterpreter extends BaseTest {
checkLexerMatches(lg, "<a><x>", expecting);
}
protected void checkLexerMatches(LexerGrammar lg, String inputString, String expecting) {
protected LexerRecognitionExeption checkLexerMatches(LexerGrammar lg, String inputString, String expecting) {
ATN atn = createATN(lg);
CharStream input = new ANTLRStringStream(inputString);
ATNState startState = atn.modeNameToStartState.get("DEFAULT_MODE");
DOTGenerator dot = new DOTGenerator(lg);
System.out.println(dot.getDOT(startState, true));
List<String> tokenTypes = getTokenTypes(lg, atn, input, false);
List<String> tokenTypes = null;
LexerRecognitionExeption retException = null;
try {
tokenTypes = getTokenTypes(lg, atn, input, false);
}
catch (LexerRecognitionExeption lre) { retException = lre; }
if ( retException!=null ) return retException;
String result = Utils.join(tokenTypes.iterator(), ", ");
System.out.println(tokenTypes);
assertEquals(expecting, result);
@ -139,6 +202,7 @@ public class TestATNLexerInterpreter extends BaseTest {
input.seek(0);
List<String> tokenTypes2 = getTokenTypes(lg, atn, input, true);
assertEquals("interp vs adaptive types differ", tokenTypes, tokenTypes2);
return null;
}
}

View File

@ -218,7 +218,7 @@ public class TestATNSerialization extends BaseTest {
"1->8 ATOM -1,0,0\n" +
"2->6 EPSILON 0,0,0\n" +
"3->5 EPSILON 0,0,0\n" +
"4->5 RULE 2,1,-1\n" +
"4->5 RULE 2,1,0\n" +
"5->1 EPSILON 0,0,0\n" +
"6->7 ATOM 3,0,0\n" +
"7->3 EPSILON 0,0,0\n";