Fixes #994: poor performance in left-recursive expression rules on very large input expressions; builds on @sharwell's solution, which explicitly checks for key return states in expr rules

2016-11-24 13:07:19 -08:00 · 2016-11-24 13:07:19 -08:00 · c182e3d5bf
parent fca5e458d3
commit c182e3d5bf
4 changed files with 321 additions and 58 deletions
--- a/runtime-testsuite/test/org/antlr/v4/test/runtime/descriptors/PerformanceDescriptors.java
+++ b/runtime-testsuite/test/org/antlr/v4/test/runtime/descriptors/PerformanceDescriptors.java
@ -33,7 +33,6 @@ public class PerformanceDescriptors {
 		 */
 		@CommentHasStringValue
 		public String grammar;
-
 	}

 	public static class ExpressionGrammar_1 extends ExpressionGrammar {
@ -72,4 +71,158 @@ public class PerformanceDescriptors {
 		@CommentHasStringValue
 		public String input;
 	}
+
+	/** Test for https://github.com/antlr/antlr4/issues/1398.
+	 *  Seeing through a large expression takes 5 _minutes_ on
+	 *  my fast box to complete.  After fix, it's instantaneous.
+	 */
+	public static abstract class DropLoopEntryBranchInLRRule extends BaseParserTestDescriptor {
+		public String grammarName = "Expr";
+		public String startRule = "stat";
+
+		/**
+		 grammar Expr;
+
+		 stat : expr ';'
+		      | expr '.'
+		      ;
+
+		 expr
+		 	: ID
+		 	| 'not' expr
+		 	| expr 'and' expr
+		 	| expr 'or' expr
+		    | '(' ID ')' expr
+		    | expr '?' expr ':' expr
+		    | 'between' expr 'and' expr
+		 	;
+
+		 ID: [a-zA-Z_][a-zA-Z_0-9]*;
+		 WS: [ \t\n\r\f]+ -> skip;
+		 */
+		@CommentHasStringValue
+		public String grammar;
+
+		@Override
+		public boolean ignore(String targetName) {
+			return !targetName.equals("Java");
+		}
+	}
+
+	public static class DropLoopEntryBranchInLRRule_1 extends DropLoopEntryBranchInLRRule {
+		/**
+		 X1 and X2 and X3 and X4 and X5 and X6 and X7 or
+		 X1 and X2 and X3 and X4 and X5 and X6 and X7 or
+		 X1 and X2 and X3 and X4 and X5 and X6 and X7 or
+		 X1 and X2 and X3 and X4 and X5 and X6 and X7 or
+		 X1 and X2 and X3 and X4 and X5 and X6 and X7 or
+		 X1 and X2 and X3 and X4 and X5 and X6 and X7 or
+		 X1 and X2 and X3 and X4 and X5 and X6 and X7 or
+		 X1 and X2 and X3 and X4 and X5 and X6 and X7
+		 ;
+		 */
+		@CommentHasStringValue
+		public String input;
+	}
+
+	public static class DropLoopEntryBranchInLRRule_2 extends DropLoopEntryBranchInLRRule {
+		/**
+		 X1 and X2 and X3 and X4 and X5 and X6 and X7 or
+		 X1 and X2 and X3 and X4 and X5 and X6 and X7 or
+		 X1 and X2 and X3 and X4 and X5 and X6 and X7 or
+		 X1 and X2 and X3 and X4 and X5 and X6 and X7 or
+		 X1 and X2 and X3 and X4 and X5 and X6 and X7 or
+		 X1 and X2 and X3 and X4 and X5 and X6 and X7 or
+		 X1 and X2 and X3 and X4 and X5 and X6 and X7
+		 .
+		 */ // Different in final token
+		@CommentHasStringValue
+		public String input;
+	}
+
+	public static class DropLoopEntryBranchInLRRule_3 extends DropLoopEntryBranchInLRRule {
+		/**
+		 not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
+		 not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
+		 not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
+		 not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
+		 not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
+		 not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
+		 not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
+		 not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
+		 not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
+		 not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
+		 not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
+		 not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
+		 not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
+		 not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
+		 not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
+		 not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
+		 not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
+		 not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7
+		 ;
+		 */
+		@CommentHasStringValue
+		public String input;
+	}
+
+	public static class DropLoopEntryBranchInLRRule_4 extends DropLoopEntryBranchInLRRule {
+		/**
+		 between X1 and X2 or between X3 and X4 and
+		 between X1 and X2 or between X3 and X4 and
+		 between X1 and X2 or between X3 and X4 and
+		 between X1 and X2 or between X3 and X4 and
+		 between X1 and X2 or between X3 and X4 and
+		 between X1 and X2 or between X3 and X4 and
+		 between X1 and X2 or between X3 and X4 and
+		 between X1 and X2 or between X3 and X4 and
+		 between X1 and X2 or between X3 and X4 and
+		 between X1 and X2 or between X3 and X4 and
+		 between X1 and X2 or between X3 and X4 and
+		 between X1 and X2 or between X3 and X4 and
+		 between X1 and X2 or between X3 and X4 and
+		 between X1 and X2 or between X3 and X4 and
+		 between X1 and X2 or between X3 and X4
+		 ;
+		 */
+		@CommentHasStringValue
+		public String input;
+	}
+
+	public static class DropLoopEntryBranchInLRRule_5 extends DropLoopEntryBranchInLRRule {
+		/**
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z or
+		 X ? Y : Z
+		 ;
+		 */
+		@CommentHasStringValue
+		public String input;
+	}
 }
--- a/runtime/Java/src/org/antlr/v4/runtime/atn/ParserATNSimulator.java
+++ b/runtime/Java/src/org/antlr/v4/runtime/atn/ParserATNSimulator.java
@ -58,6 +58,8 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;

+import static org.antlr.v4.runtime.atn.ATNState.BLOCK_END;
+
 /**
 * The embodiment of the adaptive LL(*), ALL(*), parsing strategy.
 *
@ -291,6 +293,8 @@ public class ParserATNSimulator extends ATNSimulator {
 	public static final boolean dfa_debug = false;
 	public static final boolean retry_debug = false;

+	public static final boolean TURN_OFF_LR_LOOP_ENTRY_BRANCH_OPT = Boolean.parseBoolean(System.getProperty("antlr.no_lr_loop_entry_opt"));
+
 	protected final Parser parser;

 	public final DFA[] decisionToDFA;
@ -788,7 +792,7 @@ public class ParserATNSimulator extends ATNSimulator {
 	protected ATNConfigSet computeReachSet(ATNConfigSet closure, int t,
 										   boolean fullCtx)
 	{
-		if ( debug ) 
+		if ( debug )
 			System.out.println("in computeReachSet, starting closure: " + closure);

 		if (mergeCache == null) {
@ -1552,59 +1556,7 @@ public class ParserATNSimulator extends ATNSimulator {
        }

 		for (int i=0; i<p.getNumberOfTransitions(); i++) {
-			// This block implements first-edge elimination of ambiguous LR
-			// alternatives as part of dynamic disambiguation during prediction.
-			// See antlr/antlr4#1398.
-			if (i == 0
-				&& p.getStateType() == ATNState.STAR_LOOP_ENTRY
-				&& ((StarLoopEntryState)p).isPrecedenceDecision
-				&& !config.context.hasEmptyPath()) {
-
-				// When suppress is true, it means the outgoing edge i==0 is
-				// ambiguous with the outgoing edge i==1, and thus the closure
-				// operation can dynamically disambiguate by suppressing this
-				// edge during the closure operation.
-				boolean suppress = true;
-
-				// Require all return states to return back to the same rule
-				// that p is in.
-				int limit = config.context.size();
-				for (int j = 0; j < limit; j++) {
-					ATNState returnState = atn.states.get(config.context.getReturnState(j));
-					suppress = suppress && returnState.ruleIndex == p.ruleIndex;
-				}
-
-				// Further check to make sure this isn't a 0-precedence entry.
-				// See antlr/antlr4#679.
-				if (suppress) {
-					RuleStopState ruleStopState = atn.ruleToStopState[p.ruleIndex];
-					for (int j = 0; j < limit; j++) {
-						for (Transition transition : ruleStopState.transitions) {
-							if (transition.getSerializationType() != Transition.EPSILON) {
-								continue;
-							}
-
-							if (((EpsilonTransition)transition).outermostPrecedenceReturn() != p.ruleIndex) {
-								continue;
-							}
-
-							int returnStateNumber = config.context.getReturnState(j);
-							suppress = returnStateNumber != transition.target.stateNumber;
-							if (!suppress) {
-								break;
-							}
-						}
-
-						if (!suppress) {
-							break;
-						}
-					}
-				}
-
-				if (suppress) {
-					continue;
-				}
-			}
+			if ( i==0 && canDropLoopEntryEdgeInLeftRecursiveRule(config) ) continue;

 			Transition t = p.transition(i);
 			boolean continueCollecting =
@ -1657,6 +1609,163 @@ public class ParserATNSimulator extends ATNSimulator {
 		}
 	}

+	/** Implements first-edge (loop entry) elimination as an optimization
+	 *  during closure operations.  See antlr/antlr4#1398.
+	 *
+	 * The optimization is to avoid adding the loop entry config when
+	 * the exit path can only lead back to the same
+	 * StarLoopEntryState after popping context at the rule end state
+	 * (traversing only epsilon edges, so we're still in closure, in
+	 * this same rule).
+	 *
+	 * We need to detect any state that can reach loop entry on
+	 * epsilon w/o exiting rule. We don't have to look at FOLLOW
+	 * links, just ensure that all stack tops for config refer to key
+	 * states in LR rule.
+	 *
+	 * To verify we are in the right situation we must first check
+	 * closure is at a StarLoopEntryState generated during LR removal.
+	 * Then we check that each stack top of context is a return state
+	 * from one of these cases:
+	 *
+	 *   1. 'not' expr, '(' type ')' expr. The return state points at loop entry state
+	 *   2. expr op expr. The return state is the block end of internal block of (...)*
+	 *   3. 'between' expr 'and' expr. The return state of 2nd expr reference.
+	 *      That state points at block end of internal block of (...)*.
+	 *   4. expr '?' expr ':' expr. The return state points at block end,
+	 *      which points at loop entry state.
+	 *
+	 * If any is true for each stack top, then closure does not add a
+	 * config to the current config set for edge[0], the loop entry branch.
+	 *
+	 *  Conditions fail if any context for the current config is:
+	 *
+	 *   a. empty (we'd fall out of expr to do a global FOLLOW which could
+	 *      even be to some weird spot in expr) or,
+	 *   b. lies outside of expr or,
+	 *   c. lies within expr but at a state not the BlockEndState
+	 *   generated during LR removal
+	 *
+	 * Do we need to evaluate predicates ever in closure for this case?
+	 *
+	 * No. Predicates, including precedence predicates, are only
+	 * evaluated when computing a DFA start state. I.e., only before
+	 * the lookahead (but not parser) consumes a token.
+	 *
+	 * There are no epsilon edges allowed in LR rule alt blocks or in
+	 * the "primary" part (ID here). If closure is in
+	 * StarLoopEntryState any lookahead operation will have consumed a
+	 * token as there are no epsilon-paths that lead to
+	 * StarLoopEntryState. We do not have to evaluate predicates
+	 * therefore if we are in the generated StarLoopEntryState of a LR
+	 * rule. Note that when making a prediction starting at that
+	 * decision point, decision d=2, compute-start-state performs
+	 * closure starting at edges[0], edges[1] emanating from
+	 * StarLoopEntryState. That means it is not performing closure on
+	 * StarLoopEntryState during compute-start-state.
+	 *
+	 * How do we know this always gives same prediction answer?
+	 *
+	 * Without predicates, loop entry and exit paths are ambiguous
+	 * upon remaining input +b (in, say, a+b). Either path leads to a
+	 * valid parse. Closure can lead to consuming + immediately or by
+	 * falling out of this call to expr back into expr and loop back
+	 * again to StarLoopEntryState to match +b. In this special case,
+	 * we choose the more efficient path, which is to take the bypass
+	 * path.
+	 *
+	 * The lookahead language has not changed because closure chooses
+	 * one path over the other. Both paths lead to consuming the same
+	 * remaining input during a lookahead operation. If the next token
+	 * is an operator, lookahead will enter the choice block with
+	 * operators. If it is not, lookahead will exit expr. Same as if
+	 * closure had chosen to enter the choice block immediately.
+	 *
+	 * Closure is examining one config (some loopentrystate, some alt,
+	 * context) which means it is considering exactly one alt. Closure
+	 * always copies the same alt to any derived configs.
+	 *
+	 * How do we know this optimization doesn't mess up precedence in
+	 * our parse trees?
+	 *
+	 * Looking through expr from left edge of stat only has to confirm
+	 * that an input, say, a+b+c; begins with any valid interpretation
+	 * of an expression. The precedence actually doesn't matter when
+	 * making a decision in stat seeing through expr. It is only when
+	 * parsing rule expr that we must use the precedence to get the
+	 * right interpretation and, hence, parse tree.
+	 */
+	protected boolean canDropLoopEntryEdgeInLeftRecursiveRule(ATNConfig config) {
+		if ( TURN_OFF_LR_LOOP_ENTRY_BRANCH_OPT ) return false; // disabled via system property antlr.no_lr_loop_entry_opt
+		ATNState p = config.state;
+		// First check that p is the StarLoopEntryState generated during
+		// left-recursion elimination (isPrecedenceDecision). For efficiency,
+		// also bail out early if the context is EMPTY or has an empty path:
+		// that would mean global FOLLOW, so we can't perform the optimization.
+		if ( p.getStateType() != ATNState.STAR_LOOP_ENTRY ||
+			 !((StarLoopEntryState)p).isPrecedenceDecision || // Are we the special loop entry/exit state?
+			 config.context.isEmpty() ||                      // If SLL wildcard
+			 config.context.hasEmptyPath())
+		{
+			return false;
+		}
+
+		// Require every return state on the stack tops to return back to
+		// the same rule that p is in.
+		int numCtxs = config.context.size();
+		for (int i = 0; i < numCtxs; i++) { // for each stack context
+			ATNState returnState = atn.states.get(config.context.getReturnState(i));
+			if ( returnState.ruleIndex != p.ruleIndex ) return false;
+		}
+
+		BlockStartState decisionStartState = (BlockStartState)p.transition(0).target; // edge 0 is the loop entry branch
+		int blockEndStateNum = decisionStartState.endState.stateNumber;
+		BlockEndState blockEndState = (BlockEndState)atn.states.get(blockEndStateNum);
+
+		// Verify that the top of each stack context leads to loop entry/exit
+		// state through epsilon edges and w/o leaving rule.
+		for (int i = 0; i < numCtxs; i++) {                           // for each stack context
+			int returnStateNumber = config.context.getReturnState(i);
+			ATNState returnState = atn.states.get(returnStateNumber);
+			// each return state must have exactly one outgoing edge, and it must be epsilon
+			if ( returnState.getNumberOfTransitions()!=1 ||
+				 !returnState.transition(0).isEpsilon() )
+			{
+				return false;
+			}
+			// Look for prefix op case like 'not' expr, '(' type ')' expr
+			ATNState returnStateTarget = returnState.transition(0).target;
+			if ( returnState.getStateType()==BLOCK_END && returnStateTarget==p ) {
+				continue;
+			}
+			// Look for 'expr op expr' or case where expr's return state is block end
+			// of (...)* internal block; the block end points to loop back
+			// which points to p but we don't need to check that
+			if ( returnState==blockEndState ) {
+				continue;
+			}
+			// Look for ternary expr ? expr : expr. The return state points at block end,
+			// which points at loop entry state
+			if ( returnStateTarget==blockEndState ) {
+				continue;
+			}
+			// Look for complex prefix 'between expr and expr' case where 2nd expr's
+			// return state points at block end state of (...)* internal block
+			if ( returnStateTarget.getStateType() == BLOCK_END &&
+				 returnStateTarget.getNumberOfTransitions()==1 &&
+				 returnStateTarget.transition(0).isEpsilon() &&
+				 returnStateTarget.transition(0).target == p )
+			{
+				continue;
+			}
+
+			// anything else does not conform, so the loop entry edge must be kept
+			return false;
+		}
+
+		return true;
+	}
+

 	public String getRuleName(int index) {
 		if ( parser!=null && index>=0 ) return parser.getRuleNames()[index];
@ -2090,4 +2199,4 @@ public class ParserATNSimulator extends ATNSimulator {
 	public Parser getParser() {
 		return parser;
 	}
-}
+}
--- a/runtime/Java/src/org/antlr/v4/runtime/atn/PredictionContext.java
+++ b/runtime/Java/src/org/antlr/v4/runtime/atn/PredictionContext.java
@ -117,12 +117,13 @@ public abstract class PredictionContext {

 	public abstract int getReturnState(int index);

-	/** This means only the {@link #EMPTY} context is in set. */
+	/** This means only the {@link #EMPTY} (wildcard? not sure) context is in set. */
 	public boolean isEmpty() {
 		return this == EMPTY;
 	}

 	public boolean hasEmptyPath() {
+		// since EMPTY_RETURN_STATE can only appear in the last position, we check last one
 		return getReturnState(size() - 1) == EMPTY_RETURN_STATE;
 	}

--- a/tool/src/org/antlr/v4/parse/LeftRecursiveRuleWalker.g
+++ b/tool/src/org/antlr/v4/parse/LeftRecursiveRuleWalker.g
@ -127,7 +127,7 @@ binary

 prefix
 	:	^(	ALT elementOptions?
-			({!((CommonTree)input.LT(1)).getText().equals(ruleName)}? element)+
+			element+
 			recurse epsilonElement*
 		 )
         {setAltAssoc((AltAST)$ALT,currentOuterAltNumber);}