Fixes #994: poor performance in left-recursive expressions with very large expression input phrases; builds on @sharwell's solution, which explicitly checks for key return states in expr rules

This commit is contained in:
parrt 2016-11-24 13:07:19 -08:00
parent fca5e458d3
commit c182e3d5bf
4 changed files with 321 additions and 58 deletions

View File

@ -33,7 +33,6 @@ public class PerformanceDescriptors {
*/
@CommentHasStringValue
public String grammar;
}
public static class ExpressionGrammar_1 extends ExpressionGrammar {
@ -72,4 +71,158 @@ public class PerformanceDescriptors {
@CommentHasStringValue
public String input;
}
/** Test for https://github.com/antlr/antlr4/issues/1398.
* Seeing through a large expression takes 5 _minutes_ on
* my fast box to complete. After fix, it's instantaneous.
*/
public static abstract class DropLoopEntryBranchInLRRule extends BaseParserTestDescriptor {
// Same name as declared by the "grammar Expr;" header in the grammar text below.
public String grammarName = "Expr";
// "stat" is the first rule of the grammar below; presumably the parse starts
// there -- confirm against BaseParserTestDescriptor.
public String startRule = "stat";
// NOTE(review): with @CommentHasStringValue the Javadoc body below appears to be
// the runtime value of this field, not documentation -- treat it as data and do
// not reword or reformat it.
/**
grammar Expr;
stat : expr ';'
| expr '.'
;
expr
: ID
| 'not' expr
| expr 'and' expr
| expr 'or' expr
| '(' ID ')' expr
| expr '?' expr ':' expr
| 'between' expr 'and' expr
;
ID: [a-zA-Z_][a-zA-Z_0-9]*;
WS: [ \t\n\r\f]+ -> skip;
*/
@CommentHasStringValue
public String grammar;
// Returns true for every target name other than "Java", i.e. "Java" is the
// only target for which this descriptor is not ignored.
@Override
public boolean ignore(String targetName) {
return !targetName.equals("Java");
}
}
/**
 * Input case for {@link DropLoopEntryBranchInLRRule}: eight lines of seven
 * identifiers joined by {@code and}, with the lines chained by a trailing
 * {@code or}, terminated by {@code ;} (the first alternative of {@code stat}).
 */
public static class DropLoopEntryBranchInLRRule_1 extends DropLoopEntryBranchInLRRule {
// NOTE(review): the Javadoc body below appears to be the value of `input`
// (via @CommentHasStringValue) -- treat it as test data, not documentation.
/**
X1 and X2 and X3 and X4 and X5 and X6 and X7 or
X1 and X2 and X3 and X4 and X5 and X6 and X7 or
X1 and X2 and X3 and X4 and X5 and X6 and X7 or
X1 and X2 and X3 and X4 and X5 and X6 and X7 or
X1 and X2 and X3 and X4 and X5 and X6 and X7 or
X1 and X2 and X3 and X4 and X5 and X6 and X7 or
X1 and X2 and X3 and X4 and X5 and X6 and X7 or
X1 and X2 and X3 and X4 and X5 and X6 and X7
;
*/
@CommentHasStringValue
public String input;
}
/**
 * Input case for {@link DropLoopEntryBranchInLRRule}: seven lines of seven
 * identifiers joined by {@code and}, with the lines chained by a trailing
 * {@code or}, terminated by {@code .} (the second alternative of {@code stat})
 * instead of {@code ;}.
 */
public static class DropLoopEntryBranchInLRRule_2 extends DropLoopEntryBranchInLRRule {
// NOTE(review): the Javadoc body below appears to be the value of `input`
// (via @CommentHasStringValue) -- treat it as test data, not documentation.
/**
X1 and X2 and X3 and X4 and X5 and X6 and X7 or
X1 and X2 and X3 and X4 and X5 and X6 and X7 or
X1 and X2 and X3 and X4 and X5 and X6 and X7 or
X1 and X2 and X3 and X4 and X5 and X6 and X7 or
X1 and X2 and X3 and X4 and X5 and X6 and X7 or
X1 and X2 and X3 and X4 and X5 and X6 and X7 or
X1 and X2 and X3 and X4 and X5 and X6 and X7
.
*/ // Different in final token
@CommentHasStringValue
public String input;
}
/**
 * Input case for {@link DropLoopEntryBranchInLRRule} exercising the prefix
 * operator alternative: eighteen lines of seven {@code not}-prefixed
 * identifiers joined by {@code and}, with the lines chained by a trailing
 * {@code or}, terminated by {@code ;}.
 */
public static class DropLoopEntryBranchInLRRule_3 extends DropLoopEntryBranchInLRRule {
// NOTE(review): the Javadoc body below appears to be the value of `input`
// (via @CommentHasStringValue) -- treat it as test data, not documentation.
/**
not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7 or
not X1 and not X2 and not X3 and not X4 and not X5 and not X6 and not X7
;
*/
@CommentHasStringValue
public String input;
}
/**
 * Input case for {@link DropLoopEntryBranchInLRRule} exercising the
 * {@code 'between' expr 'and' expr} alternative: fifteen lines of
 * {@code between X1 and X2 or between X3 and X4}, with the lines chained by a
 * trailing {@code and}, terminated by {@code ;}.
 */
public static class DropLoopEntryBranchInLRRule_4 extends DropLoopEntryBranchInLRRule {
// NOTE(review): the Javadoc body below appears to be the value of `input`
// (via @CommentHasStringValue) -- treat it as test data, not documentation.
/**
between X1 and X2 or between X3 and X4 and
between X1 and X2 or between X3 and X4 and
between X1 and X2 or between X3 and X4 and
between X1 and X2 or between X3 and X4 and
between X1 and X2 or between X3 and X4 and
between X1 and X2 or between X3 and X4 and
between X1 and X2 or between X3 and X4 and
between X1 and X2 or between X3 and X4 and
between X1 and X2 or between X3 and X4 and
between X1 and X2 or between X3 and X4 and
between X1 and X2 or between X3 and X4 and
between X1 and X2 or between X3 and X4 and
between X1 and X2 or between X3 and X4 and
between X1 and X2 or between X3 and X4 and
between X1 and X2 or between X3 and X4
;
*/
@CommentHasStringValue
public String input;
}
public static class DropLoopEntryBranchInLRRule_5 extends DropLoopEntryBranchInLRRule {
/**
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z or
X ? Y : Z
;
*/
@CommentHasStringValue
public String input;
}
}

View File

@ -58,6 +58,8 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import static org.antlr.v4.runtime.atn.ATNState.BLOCK_END;
/**
* The embodiment of the adaptive LL(*), ALL(*), parsing strategy.
*
@ -291,6 +293,8 @@ public class ParserATNSimulator extends ATNSimulator {
public static final boolean dfa_debug = false;
public static final boolean retry_debug = false;
/**
 * Switch that disables the loop-entry-branch optimization: when {@code true},
 * {@link #canDropLoopEntryEdgeInLeftRecursiveRule} returns {@code false}
 * immediately. The value is taken from the system property
 * {@code antlr.no_lr_loop_entry_opt} in this field's static initializer;
 * {@link Boolean#parseBoolean} yields {@code true} only for the string
 * "true" (ignoring case), so an unset property leaves the optimization on.
 */
public static final boolean TURN_OFF_LR_LOOP_ENTRY_BRANCH_OPT = Boolean.parseBoolean(System.getProperty("antlr.no_lr_loop_entry_opt"));
protected final Parser parser;
public final DFA[] decisionToDFA;
@ -788,7 +792,7 @@ public class ParserATNSimulator extends ATNSimulator {
protected ATNConfigSet computeReachSet(ATNConfigSet closure, int t,
boolean fullCtx)
{
if ( debug )
if ( debug )
System.out.println("in computeReachSet, starting closure: " + closure);
if (mergeCache == null) {
@ -1552,59 +1556,7 @@ public class ParserATNSimulator extends ATNSimulator {
}
for (int i=0; i<p.getNumberOfTransitions(); i++) {
// This block implements first-edge elimination of ambiguous LR
// alternatives as part of dynamic disambiguation during prediction.
// See antlr/antlr4#1398.
if (i == 0
&& p.getStateType() == ATNState.STAR_LOOP_ENTRY
&& ((StarLoopEntryState)p).isPrecedenceDecision
&& !config.context.hasEmptyPath()) {
// When suppress is true, it means the outgoing edge i==0 is
// ambiguous with the outgoing edge i==1, and thus the closure
// operation can dynamically disambiguate by suppressing this
// edge during the closure operation.
boolean suppress = true;
// Require all return states to return back to the same rule
// that p is in.
int limit = config.context.size();
for (int j = 0; j < limit; j++) {
ATNState returnState = atn.states.get(config.context.getReturnState(j));
suppress = suppress && returnState.ruleIndex == p.ruleIndex;
}
// Further check to make sure this isn't a 0-precedence entry.
// See antlr/antlr4#679.
if (suppress) {
RuleStopState ruleStopState = atn.ruleToStopState[p.ruleIndex];
for (int j = 0; j < limit; j++) {
for (Transition transition : ruleStopState.transitions) {
if (transition.getSerializationType() != Transition.EPSILON) {
continue;
}
if (((EpsilonTransition)transition).outermostPrecedenceReturn() != p.ruleIndex) {
continue;
}
int returnStateNumber = config.context.getReturnState(j);
suppress = returnStateNumber != transition.target.stateNumber;
if (!suppress) {
break;
}
}
if (!suppress) {
break;
}
}
}
if (suppress) {
continue;
}
}
if ( i==0 && canDropLoopEntryEdgeInLeftRecursiveRule(config) ) continue;
Transition t = p.transition(i);
boolean continueCollecting =
@ -1657,6 +1609,163 @@ public class ParserATNSimulator extends ATNSimulator {
}
}
/** Implements first-edge (loop entry) elimination as an optimization
* during closure operations. See antlr/antlr4#1398.
*
* The optimization is to avoid adding the loop entry config when
* the exit path can only lead back to the same
* StarLoopEntryState after popping context at the rule end state
* (traversing only epsilon edges, so we're still in closure, in
* this same rule).
*
* We need to detect any state that can reach loop entry on
* epsilon w/o exiting rule. We don't have to look at FOLLOW
* links, just ensure that all stack tops for config refer to key
* states in LR rule.
*
* To verify we are in the right situation we must first check
* closure is at a StarLoopEntryState generated during LR removal.
* Then we check that each stack top of context is a return state
* from one of these cases:
*
* 1. 'not' expr, '(' type ')' expr. The return state points at loop entry state
* 2. expr op expr. The return state is the block end of internal block of (...)*
* 3. 'between' expr 'and' expr. The return state of 2nd expr reference.
* That state points at block end of internal block of (...)*.
* 4. expr '?' expr ':' expr. The return state points at block end,
* which points at loop entry state.
*
* If any is true for each stack top, then closure does not add a
* config to the current config set for edge[0], the loop entry branch.
*
* Conditions fail if any context for the current config is:
*
* a. empty (we'd fall out of expr to do a global FOLLOW which could
* even be to some weird spot in expr) or,
* b. lies outside of expr or,
* c. lies within expr but at a state not the BlockEndState
* generated during LR removal
*
* Do we need to evaluate predicates ever in closure for this case?
*
* No. Predicates, including precedence predicates, are only
* evaluated when computing a DFA start state. I.e., only before
* the lookahead (but not parser) consumes a token.
*
* There are no epsilon edges allowed in LR rule alt blocks or in
* the "primary" part (ID here). If closure is in
* StarLoopEntryState any lookahead operation will have consumed a
* token as there are no epsilon-paths that lead to
* StarLoopEntryState. We do not have to evaluate predicates
* therefore if we are in the generated StarLoopEntryState of a LR
* rule. Note that when making a prediction starting at that
* decision point, decision d=2, compute-start-state performs
* closure starting at edges[0], edges[1] emanating from
* StarLoopEntryState. That means it is not performing closure on
* StarLoopEntryState during compute-start-state.
*
* How do we know this always gives same prediction answer?
*
* Without predicates, loop entry and exit paths are ambiguous
* upon remaining input +b (in, say, a+b). Either path leads to
* a valid parse. Closure can consume + immediately, or it can
* fall out of this call to expr back into expr and loop back
* again to StarLoopEntryState to match +b. In this special case,
* we choose the more efficient path, which is to take the bypass
* path.
*
* The lookahead language has not changed because closure chooses
* one path over the other. Both paths lead to consuming the same
* remaining input during a lookahead operation. If the next token
* is an operator, lookahead will enter the choice block with
* operators. If it is not, lookahead will exit expr. Same as if
* closure had chosen to enter the choice block immediately.
*
* Closure is examining one config (some loopentrystate, some alt,
* context) which means it is considering exactly one alt. Closure
* always copies the same alt to any derived configs.
*
* How do we know this optimization doesn't mess up precedence in
* our parse trees?
*
* Looking through expr from left edge of stat only has to confirm
* that an input, say, a+b+c; begins with any valid interpretation
* of an expression. The precedence actually doesn't matter when
* making a decision in stat seeing through expr. It is only when
* parsing rule expr that we must use the precedence to get the
* right interpretation and, hence, parse tree.
*/
protected boolean canDropLoopEntryEdgeInLeftRecursiveRule(ATNConfig config) {
	// Global kill switch: never drop the edge when the optimization is off.
	if ( TURN_OFF_LR_LOOP_ENTRY_BRANCH_OPT ) {
		return false;
	}

	final ATNState loopEntry = config.state;

	// Only the StarLoopEntryState created by left-recursion elimination
	// (the precedence decision) is a candidate.
	if ( loopEntry.getStateType() != ATNState.STAR_LOOP_ENTRY ) {
		return false;
	}
	if ( !((StarLoopEntryState)loopEntry).isPrecedenceDecision ) {
		return false;
	}

	// An empty context (SLL wildcard) or a context containing an empty path
	// would mean a global FOLLOW, which rules the optimization out.
	if ( config.context.isEmpty() || config.context.hasEmptyPath() ) {
		return false;
	}

	final int stackCount = config.context.size();

	// Pass 1: every return state has to belong to the same rule as the
	// loop entry state.
	for (int k = 0; k < stackCount; k++) {
		final ATNState ret = atn.states.get(config.context.getReturnState(k));
		if ( ret.ruleIndex != loopEntry.ruleIndex ) {
			return false;
		}
	}

	// Block end of the (...)* internal block reached through edge 0.
	final BlockStartState innerBlockStart = (BlockStartState)loopEntry.transition(0).target;
	final BlockEndState innerBlockEnd =
		(BlockEndState)atn.states.get(innerBlockStart.endState.stateNumber);

	// Pass 2: the top of every stack context must lead back to the loop
	// entry/exit state over epsilon edges only, without leaving the rule.
	for (int k = 0; k < stackCount; k++) {
		final ATNState ret = atn.states.get(config.context.getReturnState(k));

		// Each return state must have exactly one outgoing edge, an epsilon.
		final boolean singleEpsilonEdge =
			ret.getNumberOfTransitions() == 1 && ret.transition(0).isEpsilon();
		if ( !singleEpsilonEdge ) {
			return false;
		}

		final ATNState next = ret.transition(0).target;

		final boolean conforming =
			// prefix operator, e.g. 'not' expr or '(' type ')' expr:
			// a block end whose epsilon edge goes straight to the loop entry
			(ret.getStateType() == ATNState.BLOCK_END && next == loopEntry)
			// expr op expr: the return state is the block end of the (...)*
			// internal block; it reaches the loop entry via the loop-back
			// state, which does not need to be checked
			|| ret == innerBlockEnd
			// ternary expr '?' expr ':' expr: the return state points at
			// that block end
			|| next == innerBlockEnd
			// complex prefix such as 'between' expr 'and' expr: the second
			// expr's return state points at a block end whose only edge is
			// an epsilon to the loop entry
			|| (next.getStateType() == ATNState.BLOCK_END
				&& next.getNumberOfTransitions() == 1
				&& next.transition(0).isEpsilon()
				&& next.transition(0).target == loopEntry);

		// Any other shape does not conform.
		if ( !conforming ) {
			return false;
		}
	}

	return true;
}
public String getRuleName(int index) {
if ( parser!=null && index>=0 ) return parser.getRuleNames()[index];
@ -2090,4 +2199,4 @@ public class ParserATNSimulator extends ATNSimulator {
public Parser getParser() {
	// Hands back the parser reference stored in this simulator's field.
	return this.parser;
}
}
}

View File

@ -117,12 +117,13 @@ public abstract class PredictionContext {
public abstract int getReturnState(int index);
/** This means only the {@link #EMPTY} context is in set. */
/** This means only the {@link #EMPTY} (wildcard? not sure) context is in set. */
public boolean isEmpty() {
	// Reference-equality check against EMPTY.
	return EMPTY == this;
}
public boolean hasEmptyPath() {
	// EMPTY_RETURN_STATE can only sit in the final position, so looking at
	// the last return state is enough.
	final int lastIndex = size() - 1;
	final int lastReturnState = getReturnState(lastIndex);
	return lastReturnState == EMPTY_RETURN_STATE;
}

View File

@ -127,7 +127,7 @@ binary
prefix
: ^( ALT elementOptions?
({!((CommonTree)input.LT(1)).getText().equals(ruleName)}? element)+
element+
recurse epsilonElement*
)
{setAltAssoc((AltAST)$ALT,currentOuterAltNumber);}