Switching to current ANTLR revision, part 3.

- Everything from atn + dfa subfolders is updated now.
- IntervalSet changed behavior a bit which needed updated unit tests.
This commit is contained in:
Mike Lischke 2016-04-30 15:08:28 +02:00
parent 99ada0550f
commit d4ebdfa138
8 changed files with 531 additions and 499 deletions

View File

@ -17,7 +17,16 @@
using namespace antlrcpptest;
using namespace org::antlr::v4::runtime;
// Small scratch type: shows a static member function reading the private
// data member of another instance of the same class.
class A {
  size_t counter;  // private by default; this class never initializes it

public:
  // Copies a.counter into a local; the local is not used afterwards.
  static void doit(const A &a) {
    size_t snapshot = a.counter;
  }
};
int main(int argc, const char * argv[]) {
ANTLRInputStream input(L"divideŴ and conquer");
TLexer lexer(&input);
CommonTokenStream tokens(&lexer);

View File

@ -326,7 +326,7 @@ using namespace antlrcpp;
XCTAssert(set2.contains(1111));
XCTAssertFalse(set2.contains(10000));
XCTAssertEqual(set2.getSingleElement(), Token::INVALID_TYPE);
XCTAssertEqual(set2.getMinElement(), 10);
XCTAssertEqual(set2.getMinElement(), -1);
XCTAssertEqual(set2.getMaxElement(), 2000);
IntervalSet set3(set2);
@ -368,7 +368,13 @@ using namespace antlrcpp;
catch (IllegalStateException &e) {
}
set4.setReadOnly(false);
try {
set4.setReadOnly(false);
XCTFail(@"Expected exception");
}
catch (IllegalStateException &e) {
}
set4 = IntervalSet::of(12345);
XCTAssertEqual(set4.getSingleElement(), 12345);
XCTAssertEqual(set4.getMinElement(), 12345);

View File

@ -156,87 +156,66 @@ namespace atn {
static Ref<PredictionContext> mergeSingletons(Ref<SingletonPredictionContext> a,
Ref<SingletonPredictionContext> b, bool rootIsWildcard, PredictionContextMergeCache *mergeCache);
/// <summary>
/// Handle case where at least one of {@code a} or {@code b} is
/// <seealso cref="#EMPTY"/>. In the following diagrams, the symbol {@code $} is used
/// to represent <seealso cref="#EMPTY"/>.
///
/// <h2>Local-Context Merges</h2>
///
/// These local-context merge operations are used when {@code rootIsWildcard}
/// is true.
///
/// <p/>
///
/// <seealso cref="#EMPTY"/> is superset of any graph; return <seealso cref="#EMPTY"/>.<br/>
/// <embed src="images/LocalMerge_EmptyRoot.svg" type="image/svg+xml"/>
///
/// <p/>
///
/// <seealso cref="#EMPTY"/> and anything is {@code #EMPTY}, so merged parent is
/// {@code #EMPTY}; return left graph.<br/>
/// <embed src="images/LocalMerge_EmptyParent.svg" type="image/svg+xml"/>
///
/// <p/>
///
/// Special case of last merge if local context.<br/>
/// <embed src="images/LocalMerge_DiffRoots.svg" type="image/svg+xml"/>
///
/// <h2>Full-Context Merges</h2>
///
/// These full-context merge operations are used when {@code rootIsWildcard}
/// is false.
///
/// <p/>
///
/// <embed src="images/FullMerge_EmptyRoots.svg" type="image/svg+xml"/>
///
/// <p/>
///
/// Must keep all contexts; <seealso cref="#EMPTY"/> in array is a special value (and
/// null parent).<br/>
/// <embed src="images/FullMerge_EmptyRoot.svg" type="image/svg+xml"/>
///
/// <p/>
///
/// <embed src="images/FullMerge_SameRoot.svg" type="image/svg+xml"/>
/// </summary>
/// <param name="a"> the first <seealso cref="SingletonPredictionContext"/> </param>
/// <param name="b"> the second <seealso cref="SingletonPredictionContext"/> </param>
/// <param name="rootIsWildcard"> {@code true} if this is a local-context merge,
/// otherwise false to indicate a full-context merge </param>
/**
* Handle case where at least one of {@code a} or {@code b} is
* {@link #EMPTY}. In the following diagrams, the symbol {@code $} is used
* to represent {@link #EMPTY}.
*
* <h2>Local-Context Merges</h2>
*
* <p>These local-context merge operations are used when {@code rootIsWildcard}
* is true.</p>
*
* <p>{@link #EMPTY} is superset of any graph; return {@link #EMPTY}.<br>
* <embed src="images/LocalMerge_EmptyRoot.svg" type="image/svg+xml"/></p>
*
* <p>{@link #EMPTY} and anything is {@code #EMPTY}, so merged parent is
* {@code #EMPTY}; return left graph.<br>
* <embed src="images/LocalMerge_EmptyParent.svg" type="image/svg+xml"/></p>
*
* <p>Special case of last merge if local context.<br>
* <embed src="images/LocalMerge_DiffRoots.svg" type="image/svg+xml"/></p>
*
* <h2>Full-Context Merges</h2>
*
* <p>These full-context merge operations are used when {@code rootIsWildcard}
* is false.</p>
*
* <p><embed src="images/FullMerge_EmptyRoots.svg" type="image/svg+xml"/></p>
*
* <p>Must keep all contexts; {@link #EMPTY} in array is a special value (and
* null parent).<br>
* <embed src="images/FullMerge_EmptyRoot.svg" type="image/svg+xml"/></p>
*
* <p><embed src="images/FullMerge_SameRoot.svg" type="image/svg+xml"/></p>
*
* @param a the first {@link SingletonPredictionContext}
* @param b the second {@link SingletonPredictionContext}
* @param rootIsWildcard {@code true} if this is a local-context merge,
* otherwise false to indicate a full-context merge
*/
static Ref<PredictionContext> mergeRoot(Ref<SingletonPredictionContext> a,
Ref<SingletonPredictionContext> b, bool rootIsWildcard);
/// <summary>
/// Merge two <seealso cref="ArrayPredictionContext"/> instances.
///
/// <p/>
///
/// Different tops, different parents.<br/>
/// <embed src="images/ArrayMerge_DiffTopDiffPar.svg" type="image/svg+xml"/>
///
/// <p/>
///
/// Shared top, same parents.<br/>
/// <embed src="images/ArrayMerge_ShareTopSamePar.svg" type="image/svg+xml"/>
///
/// <p/>
///
/// Shared top, different parents.<br/>
/// <embed src="images/ArrayMerge_ShareTopDiffPar.svg" type="image/svg+xml"/>
///
/// <p/>
///
/// Shared top, all shared parents.<br/>
/// <embed src="images/ArrayMerge_ShareTopSharePar.svg" type="image/svg+xml"/>
///
/// <p/>
///
/// Equal tops, merge parents and reduce top to
/// <seealso cref="SingletonPredictionContext"/>.<br/>
/// <embed src="images/ArrayMerge_EqualTop.svg" type="image/svg+xml"/>
/// </summary>
/**
* Merge two {@link ArrayPredictionContext} instances.
*
* <p>Different tops, different parents.<br>
* <embed src="images/ArrayMerge_DiffTopDiffPar.svg" type="image/svg+xml"/></p>
*
* <p>Shared top, same parents.<br>
* <embed src="images/ArrayMerge_ShareTopSamePar.svg" type="image/svg+xml"/></p>
*
* <p>Shared top, different parents.<br>
* <embed src="images/ArrayMerge_ShareTopDiffPar.svg" type="image/svg+xml"/></p>
*
* <p>Shared top, all shared parents.<br>
* <embed src="images/ArrayMerge_ShareTopSharePar.svg" type="image/svg+xml"/></p>
*
* <p>Equal tops, merge parents and reduce top to
* {@link SingletonPredictionContext}.<br>
* <embed src="images/ArrayMerge_EqualTop.svg" type="image/svg+xml"/></p>
*/
static Ref<PredictionContext> mergeArrays(Ref<ArrayPredictionContext> a,
Ref<ArrayPredictionContext> b, bool rootIsWildcard, PredictionContextMergeCache *mergeCache);

View File

@ -43,6 +43,10 @@ using namespace antlrcpp;
struct AltAndContextConfigHasher
{
/**
* The hash code is only a function of the {@link ATNState#stateNumber}
* and {@link ATNConfig#context}.
*/
size_t operator () (const ATNConfig &o) const {
size_t hashCode = misc::MurmurHash::initialize(7);
hashCode = misc::MurmurHash::update(hashCode, (size_t)o.state->stateNumber);
@ -172,6 +176,14 @@ antlrcpp::BitSet PredictionModeClass::getAlts(const std::vector<antlrcpp::BitSet
return all;
}
/**
 * Get the union of all alternatives referenced by the configurations in
 * {@code configs}.
 *
 * @param configs the configuration set whose {@code configs} container is scanned
 * @return a bit set in which bit {@code config->alt} is set for every
 * configuration found in {@code configs}
 */
antlrcpp::BitSet PredictionModeClass::getAlts(Ref<ATNConfigSet> configs) {
  antlrcpp::BitSet alts;
  // Bind by const reference so each element is not copied per iteration
  // (presumably a ref-counted Ref<ATNConfig>; a copy would touch its count).
  for (const auto &config : configs->configs) {
    alts.set(config->alt);
  }
  return alts;
}
std::vector<antlrcpp::BitSet> PredictionModeClass::getConflictingAltSubsets(Ref<ATNConfigSet> configs) {
std::unordered_map<Ref<ATNConfig>, antlrcpp::BitSet, AltAndContextConfigHasher, AltAndContextConfigComparer> configToAlts;
for (auto config : configs->configs) {

View File

@ -39,171 +39,169 @@ namespace v4 {
namespace runtime {
namespace atn {
/**
* This enumeration defines the prediction modes available in ANTLR 4. The
* utility methods for analyzing configuration sets for conflicts and/or
* ambiguities are declared as static members of PredictionModeClass.
*/
enum class PredictionMode {
/// <summary>
/// Do only local context prediction (SLL style) and using
/// heuristic which almost always works but is much faster
/// than precise answer.
/// </summary>
/**
* The SLL(*) prediction mode. This prediction mode ignores the current
* parser context when making predictions. This is the fastest prediction
* mode, and provides correct results for many grammars. This prediction
* mode is more powerful than the prediction mode provided by ANTLR 3, but
* may result in syntax errors for grammar and input combinations which are
* not SLL.
*
* <p>
* When using this prediction mode, the parser will either return a correct
* parse tree (i.e. the same parse tree that would be returned with the
* {@link #LL} prediction mode), or it will report a syntax error. If a
* syntax error is encountered when using the {@link #SLL} prediction mode,
* it may be due to either an actual syntax error in the input or indicate
* that the particular combination of grammar and input requires the more
* powerful {@link #LL} prediction abilities to complete successfully.</p>
*
* <p>
* This prediction mode does not provide any guarantees for prediction
* behavior for syntactically-incorrect inputs.</p>
*/
SLL,
/// <summary>
/// Full LL(*) that always gets right answer. For speed
/// reasons, we terminate the prediction process when we know for
/// sure which alt to predict. We don't always know what
/// the ambiguity is in this mode.
/// </summary>
/**
* The LL(*) prediction mode. This prediction mode allows the current parser
* context to be used for resolving SLL conflicts that occur during
* prediction. This is the fastest prediction mode that guarantees correct
* parse results for all combinations of grammars with syntactically correct
* inputs.
*
* <p>
* When using this prediction mode, the parser will make correct decisions
* for all syntactically-correct grammar and input combinations. However, in
* cases where the grammar is truly ambiguous this prediction mode might not
* report a precise answer for <em>exactly which</em> alternatives are
* ambiguous.</p>
*
* <p>
* This prediction mode does not provide any guarantees for prediction
* behavior for syntactically-incorrect inputs.</p>
*/
LL,
/// <summary>
/// Tell the full LL prediction algorithm to pursue lookahead until
/// it has uniquely predicted an alternative without conflict or it's
/// certain that it's found an ambiguous input sequence. In this mode
/// the prediction process will continue looking for the exact
/// ambiguous sequence even if it has already figured out which
/// alternative to predict.
/// </summary>
/**
* The LL(*) prediction mode with exact ambiguity detection. In addition to
* the correctness guarantees provided by the {@link #LL} prediction mode,
* this prediction mode instructs the prediction algorithm to determine the
* complete and exact set of ambiguous alternatives for every ambiguous
* decision encountered while parsing.
*
* <p>
* This prediction mode may be used for diagnosing ambiguities during
* grammar development. Due to the performance overhead of calculating sets
* of ambiguous alternatives, this prediction mode should be avoided when
* the exact results are not necessary.</p>
*
* <p>
* This prediction mode does not provide any guarantees for prediction
* behavior for syntactically-incorrect inputs.</p>
*/
LL_EXACT_AMBIG_DETECTION
};
class PredictionModeClass {
public:
/// <summary>
/// Computes the SLL prediction termination condition.
///
/// <p/>
///
/// This method computes the SLL prediction termination condition for both of
/// the following cases.
///
/// <ul>
/// <li>The usual SLL+LL fallback upon SLL conflict</li>
/// <li>Pure SLL without LL fallback</li>
/// </ul>
///
/// <p/>
///
/// <strong>COMBINED SLL+LL PARSING</strong>
///
/// <p/>
///
/// When LL-fallback is enabled upon SLL conflict, correct predictions are
/// ensured regardless of how the termination condition is computed by this
/// method. Due to the substantially higher cost of LL prediction, the
/// prediction should only fall back to LL when the additional lookahead
/// cannot lead to a unique SLL prediction.
///
/// <p/>
///
/// Assuming combined SLL+LL parsing, an SLL configuration set with only
/// conflicting subsets should fall back to full LL, even if the
/// configuration sets don't resolve to the same alternative (e.g.
/// {@code {1,2}} and {@code {3,4}}). If there is at least one non-conflicting
/// configuration, SLL could continue with the hopes that more lookahead will
/// resolve via one of those non-conflicting configurations.
///
/// <p/>
///
/// Here's the prediction termination rule then: SLL (for SLL+LL parsing)
/// stops when it sees only conflicting configuration subsets. In contrast,
/// full LL keeps going when there is uncertainty.
///
/// <p/>
///
/// <strong>HEURISTIC</strong>
///
/// <p/>
///
/// As a heuristic, we stop prediction when we see any conflicting subset
/// unless we see a state that only has one alternative associated with it.
/// The single-alt-state thing lets prediction continue upon rules like
/// (otherwise, it would admit defeat too soon):
///
/// <p/>
///
/// {@code [12|1|[], 6|2|[], 12|2|[]]. s : (ID | ID ID?) ';' ;}
///
/// <p/>
///
/// When the ATN simulation reaches the state before {@code ';'}, it has a
/// DFA state that looks like: {@code [12|1|[], 6|2|[], 12|2|[]]}. Naturally
/// {@code 12|1|[]} and {@code 12|2|[]} conflict, but we cannot stop
/// processing this node because alternative two has another way to continue,
/// via {@code [6|2|[]]}.
///
/// <p/>
///
/// It also lets us continue for this rule:
///
/// <p/>
///
/// {@code [1|1|[], 1|2|[], 8|3|[]] a : A | A | A B ;}
///
/// <p/>
///
/// After matching input A, we reach the stop state for rule A, state 1.
/// State 8 is the state right before B. Clearly alternatives 1 and 2
/// conflict and no amount of further lookahead will separate the two.
/// However, alternative 3 will be able to continue and so we do not stop
/// working on this state. In the previous example, we're concerned with
/// states associated with the conflicting alternatives. Here alt 3 is not
/// associated with the conflicting configs, but since we can continue
/// looking for input reasonably, don't declare the state done.
///
/// <p/>
///
/// <strong>PURE SLL PARSING</strong>
///
/// <p/>
///
/// To handle pure SLL parsing, all we have to do is make sure that we
/// combine stack contexts for configurations that differ only by semantic
/// predicate. From there, we can do the usual SLL termination heuristic.
///
/// <p/>
///
/// <strong>PREDICATES IN SLL+LL PARSING</strong>
///
/// <p/>
///
/// SLL decisions don't evaluate predicates until after they reach DFA stop
/// states because they need to create the DFA cache that works in all
/// semantic situations. In contrast, full LL evaluates predicates collected
/// during start state computation so it can ignore predicates thereafter.
/// This means that SLL termination detection can totally ignore semantic
/// predicates.
///
/// <p/>
///
/// Implementation-wise, <seealso cref="ATNConfigSet"/> combines stack contexts
/// but not
/// semantic predicate contexts so we might see two configurations like the
/// following.
///
/// <p/>
///
/// {@code (s, 1, x, {}), (s, 1, x', {p})}
///
/// <p/>
///
/// Before testing these configurations against others, we have to merge
/// {@code x} and {@code x'} (without modifying the existing configurations).
/// For example, we test {@code (x+x')==x''} when looking for conflicts in
/// the following configurations.
///
/// <p/>
///
/// {@code (s, 1, x, {}), (s, 1, x', {p}), (s, 2, x'', {})}
///
/// <p/>
///
/// If the configuration set has predicates (as indicated by
/// <seealso cref="ATNConfigSet#hasSemanticContext"/>), this algorithm makes a
/// copy of
/// the configurations to strip out all of the predicates so that a standard
/// <seealso cref="ATNConfigSet"/> will merge everything ignoring predicates.
/// </summary>
/**
* Computes the SLL prediction termination condition.
*
* <p>
* This method computes the SLL prediction termination condition for both of
* the following cases.</p>
*
* <ul>
* <li>The usual SLL+LL fallback upon SLL conflict</li>
* <li>Pure SLL without LL fallback</li>
* </ul>
*
* <p><strong>COMBINED SLL+LL PARSING</strong></p>
*
* <p>When LL-fallback is enabled upon SLL conflict, correct predictions are
* ensured regardless of how the termination condition is computed by this
* method. Due to the substantially higher cost of LL prediction, the
* prediction should only fall back to LL when the additional lookahead
* cannot lead to a unique SLL prediction.</p>
*
* <p>Assuming combined SLL+LL parsing, an SLL configuration set with only
* conflicting subsets should fall back to full LL, even if the
* configuration sets don't resolve to the same alternative (e.g.
* {@code {1,2}} and {@code {3,4}}). If there is at least one non-conflicting
* configuration, SLL could continue with the hopes that more lookahead will
* resolve via one of those non-conflicting configurations.</p>
*
* <p>Here's the prediction termination rule then: SLL (for SLL+LL parsing)
* stops when it sees only conflicting configuration subsets. In contrast,
* full LL keeps going when there is uncertainty.</p>
*
* <p><strong>HEURISTIC</strong></p>
*
* <p>As a heuristic, we stop prediction when we see any conflicting subset
* unless we see a state that only has one alternative associated with it.
* The single-alt-state thing lets prediction continue upon rules like
* (otherwise, it would admit defeat too soon):</p>
*
* <p>{@code [12|1|[], 6|2|[], 12|2|[]]. s : (ID | ID ID?) ';' ;}</p>
*
* <p>When the ATN simulation reaches the state before {@code ';'}, it has a
* DFA state that looks like: {@code [12|1|[], 6|2|[], 12|2|[]]}. Naturally
* {@code 12|1|[]} and {@code 12|2|[]} conflict, but we cannot stop
* processing this node because alternative two has another way to continue,
* via {@code [6|2|[]]}.</p>
*
* <p>It also lets us continue for this rule:</p>
*
* <p>{@code [1|1|[], 1|2|[], 8|3|[]] a : A | A | A B ;}</p>
*
* <p>After matching input A, we reach the stop state for rule A, state 1.
* State 8 is the state right before B. Clearly alternatives 1 and 2
* conflict and no amount of further lookahead will separate the two.
* However, alternative 3 will be able to continue and so we do not stop
* working on this state. In the previous example, we're concerned with
* states associated with the conflicting alternatives. Here alt 3 is not
* associated with the conflicting configs, but since we can continue
* looking for input reasonably, don't declare the state done.</p>
*
* <p><strong>PURE SLL PARSING</strong></p>
*
* <p>To handle pure SLL parsing, all we have to do is make sure that we
* combine stack contexts for configurations that differ only by semantic
* predicate. From there, we can do the usual SLL termination heuristic.</p>
*
* <p><strong>PREDICATES IN SLL+LL PARSING</strong></p>
*
* <p>SLL decisions don't evaluate predicates until after they reach DFA stop
* states because they need to create the DFA cache that works in all
* semantic situations. In contrast, full LL evaluates predicates collected
* during start state computation so it can ignore predicates thereafter.
* This means that SLL termination detection can totally ignore semantic
* predicates.</p>
*
* <p>Implementation-wise, {@link ATNConfigSet} combines stack contexts but not
* semantic predicate contexts so we might see two configurations like the
* following.</p>
*
* <p>{@code (s, 1, x, {}), (s, 1, x', {p})}</p>
*
* <p>Before testing these configurations against others, we have to merge
* {@code x} and {@code x'} (without modifying the existing configurations).
* For example, we test {@code (x+x')==x''} when looking for conflicts in
* the following configurations.</p>
*
* <p>{@code (s, 1, x, {}), (s, 1, x', {p}), (s, 2, x'', {})}</p>
*
* <p>If the configuration set has predicates (as indicated by
* {@link ATNConfigSet#hasSemanticContext}), this algorithm makes a copy of
* the configurations to strip out all of the predicates so that a standard
* {@link ATNConfigSet} will merge everything ignoring predicates.</p>
*/
static bool hasSLLConflictTerminatingPrediction(PredictionMode *mode, Ref<ATNConfigSet> configs);
/// <summary>
@ -230,190 +228,147 @@ namespace atn {
/// <seealso cref="RuleStopState"/>, otherwise {@code false} </returns>
static bool allConfigsInRuleStopStates(Ref<ATNConfigSet> configs);
/// <summary>
/// Full LL prediction termination.
///
/// <p/>
///
/// Can we stop looking ahead during ATN simulation or is there some
/// uncertainty as to which alternative we will ultimately pick, after
/// consuming more input? Even if there are partial conflicts, we might know
/// that everything is going to resolve to the same minimum alternative. That
/// means we can stop since no more lookahead will change that fact. On the
/// other hand, there might be multiple conflicts that resolve to different
/// minimums. That means we need more look ahead to decide which of those
/// alternatives we should predict.
///
/// <p/>
///
/// The basic idea is to split the set of configurations {@code C}, into
/// conflicting subsets {@code (s, _, ctx, _)} and singleton subsets with
/// non-conflicting configurations. Two configurations conflict if they have
/// identical <seealso cref="ATNConfig#state"/> and <seealso
/// cref="ATNConfig#context"/> values
/// but different <seealso cref="ATNConfig#alt"/> value, e.g. {@code (s, i, ctx,
/// _)}
/// and {@code (s, j, ctx, _)} for {@code i!=j}.
///
/// <p/>
///
/// Reduce these configuration subsets to the set of possible alternatives.
/// You can compute the alternative subsets in one pass as follows:
///
/// <p/>
///
/// {@code A_s,ctx = {i | (s, i, ctx, _)}} for each configuration in
/// {@code C} holding {@code s} and {@code ctx} fixed.
///
/// <p/>
///
/// Or in pseudo-code, for each configuration {@code c} in {@code C}:
///
/// <pre>
/// map[c] U= c.<seealso cref="ATNConfig#alt alt"/> # map hash/equals uses s and
/// x, not
/// alt and not pred
/// </pre>
///
/// <p/>
///
/// The values in {@code map} are the set of {@code A_s,ctx} sets.
///
/// <p/>
///
/// If {@code |A_s,ctx|=1} then there is no conflict associated with
/// {@code s} and {@code ctx}.
///
/// <p/>
///
/// Reduce the subsets to singletons by choosing a minimum of each subset. If
/// the union of these alternative subsets is a singleton, then no amount of
/// more lookahead will help us. We will always pick that alternative. If,
/// however, there is more than one alternative, then we are uncertain which
/// alternative to predict and must continue looking for resolution. We may
/// or may not discover an ambiguity in the future, even if there are no
/// conflicting subsets this round.
///
/// <p/>
///
/// The biggest sin is to terminate early because it means we've made a
/// decision but were uncertain as to the eventual outcome. We haven't used
/// enough lookahead. On the other hand, announcing a conflict too late is no
/// big deal; you will still have the conflict. It's just inefficient. It
/// might even look until the end of file.
///
/// <p/>
///
/// No special consideration for semantic predicates is required because
/// predicates are evaluated on-the-fly for full LL prediction, ensuring that
/// no configuration contains a semantic context during the termination
/// check.
///
/// <p/>
///
/// <strong>CONFLICTING CONFIGS</strong>
///
/// <p/>
///
/// Two configurations {@code (s, i, x)} and {@code (s, j, x')}, conflict
/// when {@code i!=j} but {@code x=x'}. Because we merge all
/// {@code (s, i, _)} configurations together, that means that there are at
/// most {@code n} configurations associated with state {@code s} for
/// {@code n} possible alternatives in the decision. The merged stacks
/// complicate the comparison of configuration contexts {@code x} and
/// {@code x'}. Sam checks to see if one is a subset of the other by calling
/// merge and checking to see if the merged result is either {@code x} or
/// {@code x'}. If the {@code x} associated with lowest alternative {@code i}
/// is the superset, then {@code i} is the only possible prediction since the
/// others resolve to {@code min(i)} as well. However, if {@code x} is
/// associated with {@code j>i} then at least one stack configuration for
/// {@code j} is not in conflict with alternative {@code i}. The algorithm
/// should keep going, looking for more lookahead due to the uncertainty.
///
/// <p/>
///
/// For simplicity, I'm doing an equality check between {@code x} and
/// {@code x'} that lets the algorithm continue to consume lookahead longer
/// than necessary. The reason I like the equality is of course the
/// simplicity but also because that is the test you need to detect the
/// alternatives that are actually in conflict.
///
/// <p/>
///
/// <strong>CONTINUE/STOP RULE</strong>
///
/// <p/>
///
/// Continue if union of resolved alternative sets from non-conflicting and
/// conflicting alternative subsets has more than one alternative. We are
/// uncertain about which alternative to predict.
///
/// <p/>
///
/// The complete set of alternatives, {@code [i for (_,i,_)]}, tells us which
/// alternatives are still in the running for the amount of input we've
/// consumed at this point. The conflicting sets let us strip away
/// configurations that won't lead to more states because we resolve
/// conflicts to the configuration with a minimum alternate for the
/// conflicting set.
///
/// <p/>
///
/// <strong>CASES</strong>
///
/// <ul>
///
/// <li>no conflicts and more than 1 alternative in set =&gt; continue</li>
///
/// <li> {@code (s, 1, x)}, {@code (s, 2, x)}, {@code (s, 3, z)},
/// {@code (s', 1, y)}, {@code (s', 2, y)} yields non-conflicting set
/// {@code {3}} U conflicting sets {@code min({1,2})} U {@code min({1,2})} =
/// {@code {1,3}} =&gt; continue
/// </li>
///
/// <li>{@code (s, 1, x)}, {@code (s, 2, x)}, {@code (s', 1, y)},
/// {@code (s', 2, y)}, {@code (s'', 1, z)} yields non-conflicting set
/// {@code {1}} U conflicting sets {@code min({1,2})} U {@code min({1,2})} =
/// {@code {1}} =&gt; stop and predict 1</li>
///
/// <li>{@code (s, 1, x)}, {@code (s, 2, x)}, {@code (s', 1, y)},
/// {@code (s', 2, y)} yields conflicting, reduced sets {@code {1}} U
/// {@code {1}} = {@code {1}} =&gt; stop and predict 1, can announce
/// ambiguity {@code {1,2}}</li>
///
/// <li>{@code (s, 1, x)}, {@code (s, 2, x)}, {@code (s', 2, y)},
/// {@code (s', 3, y)} yields conflicting, reduced sets {@code {1}} U
/// {@code {2}} = {@code {1,2}} =&gt; continue</li>
///
/// <li>{@code (s, 1, x)}, {@code (s, 2, x)}, {@code (s', 3, y)},
/// {@code (s', 4, y)} yields conflicting, reduced sets {@code {1}} U
/// {@code {3}} = {@code {1,3}} =&gt; continue</li>
///
/// </ul>
///
/// <strong>EXACT AMBIGUITY DETECTION</strong>
///
/// <p/>
///
/// If all states report the same conflicting set of alternatives, then we
/// know we have the exact ambiguity set.
///
/// <p/>
///
/// <code>|A_<em>i</em>|&gt;1</code> and
/// <code>A_<em>i</em> = A_<em>j</em></code> for all <em>i</em>, <em>j</em>.
///
/// <p/>
///
/// In other words, we continue examining lookahead until all {@code A_i}
/// have more than one alternative and all {@code A_i} are the same. If
/// {@code A={{1,2}, {1,3}}}, then regular LL prediction would terminate
/// because the resolved set is {@code {1}}. To determine what the real
/// ambiguity is, we have to know whether the ambiguity is between one and
/// two or one and three so we keep going. We can only stop prediction when
/// we need exact ambiguity detection when the sets look like
/// {@code A={{1,2}}} or {@code {{1,2},{1,2}}}, etc...
/// </summary>
/**
* Full LL prediction termination.
*
* <p>Can we stop looking ahead during ATN simulation or is there some
* uncertainty as to which alternative we will ultimately pick, after
* consuming more input? Even if there are partial conflicts, we might know
* that everything is going to resolve to the same minimum alternative. That
* means we can stop since no more lookahead will change that fact. On the
* other hand, there might be multiple conflicts that resolve to different
* minimums. That means we need more look ahead to decide which of those
* alternatives we should predict.</p>
*
* <p>The basic idea is to split the set of configurations {@code C}, into
* conflicting subsets {@code (s, _, ctx, _)} and singleton subsets with
* non-conflicting configurations. Two configurations conflict if they have
* identical {@link ATNConfig#state} and {@link ATNConfig#context} values
* but different {@link ATNConfig#alt} value, e.g. {@code (s, i, ctx, _)}
* and {@code (s, j, ctx, _)} for {@code i!=j}.</p>
*
* <p>Reduce these configuration subsets to the set of possible alternatives.
* You can compute the alternative subsets in one pass as follows:</p>
*
* <p>{@code A_s,ctx = {i | (s, i, ctx, _)}} for each configuration in
* {@code C} holding {@code s} and {@code ctx} fixed.</p>
*
* <p>Or in pseudo-code, for each configuration {@code c} in {@code C}:</p>
*
* <pre>
* map[c] U= c.{@link ATNConfig#alt alt} # map hash/equals uses s and x, not
* alt and not pred
* </pre>
*
* <p>The values in {@code map} are the set of {@code A_s,ctx} sets.</p>
*
* <p>If {@code |A_s,ctx|=1} then there is no conflict associated with
* {@code s} and {@code ctx}.</p>
*
* <p>Reduce the subsets to singletons by choosing a minimum of each subset. If
* the union of these alternative subsets is a singleton, then no amount of
* more lookahead will help us. We will always pick that alternative. If,
* however, there is more than one alternative, then we are uncertain which
* alternative to predict and must continue looking for resolution. We may
* or may not discover an ambiguity in the future, even if there are no
* conflicting subsets this round.</p>
*
* <p>The biggest sin is to terminate early because it means we've made a
* decision but were uncertain as to the eventual outcome. We haven't used
* enough lookahead. On the other hand, announcing a conflict too late is no
* big deal; you will still have the conflict. It's just inefficient. It
* might even look until the end of file.</p>
*
* <p>No special consideration for semantic predicates is required because
* predicates are evaluated on-the-fly for full LL prediction, ensuring that
* no configuration contains a semantic context during the termination
* check.</p>
*
* <p><strong>CONFLICTING CONFIGS</strong></p>
*
* <p>Two configurations {@code (s, i, x)} and {@code (s, j, x')}, conflict
* when {@code i!=j} but {@code x=x'}. Because we merge all
* {@code (s, i, _)} configurations together, that means that there are at
* most {@code n} configurations associated with state {@code s} for
* {@code n} possible alternatives in the decision. The merged stacks
* complicate the comparison of configuration contexts {@code x} and
* {@code x'}. Sam checks to see if one is a subset of the other by calling
* merge and checking to see if the merged result is either {@code x} or
* {@code x'}. If the {@code x} associated with lowest alternative {@code i}
* is the superset, then {@code i} is the only possible prediction since the
* others resolve to {@code min(i)} as well. However, if {@code x} is
* associated with {@code j>i} then at least one stack configuration for
* {@code j} is not in conflict with alternative {@code i}. The algorithm
* should keep going, looking for more lookahead due to the uncertainty.</p>
*
* <p>For simplicity, I'm doing an equality check between {@code x} and
* {@code x'} that lets the algorithm continue to consume lookahead longer
* than necessary. The reason I like the equality is of course the
* simplicity but also because that is the test you need to detect the
* alternatives that are actually in conflict.</p>
*
* <p><strong>CONTINUE/STOP RULE</strong></p>
*
* <p>Continue if union of resolved alternative sets from non-conflicting and
* conflicting alternative subsets has more than one alternative. We are
* uncertain about which alternative to predict.</p>
*
* <p>The complete set of alternatives, {@code [i for (_,i,_)]}, tells us which
* alternatives are still in the running for the amount of input we've
* consumed at this point. The conflicting sets let us strip away
* configurations that won't lead to more states because we resolve
* conflicts to the configuration with a minimum alternate for the
* conflicting set.</p>
*
* <p><strong>CASES</strong></p>
*
* <ul>
*
* <li>no conflicts and more than 1 alternative in set =&gt; continue</li>
*
* <li> {@code (s, 1, x)}, {@code (s, 2, x)}, {@code (s, 3, z)},
* {@code (s', 1, y)}, {@code (s', 2, y)} yields non-conflicting set
* {@code {3}} U conflicting sets {@code min({1,2})} U {@code min({1,2})} =
* {@code {1,3}} =&gt; continue
* </li>
*
* <li>{@code (s, 1, x)}, {@code (s, 2, x)}, {@code (s', 1, y)},
* {@code (s', 2, y)}, {@code (s'', 1, z)} yields non-conflicting set
* {@code {1}} U conflicting sets {@code min({1,2})} U {@code min({1,2})} =
* {@code {1}} =&gt; stop and predict 1</li>
*
* <li>{@code (s, 1, x)}, {@code (s, 2, x)}, {@code (s', 1, y)},
* {@code (s', 2, y)} yields conflicting, reduced sets {@code {1}} U
* {@code {1}} = {@code {1}} =&gt; stop and predict 1, can announce
* ambiguity {@code {1,2}}</li>
*
* <li>{@code (s, 1, x)}, {@code (s, 2, x)}, {@code (s', 2, y)},
* {@code (s', 3, y)} yields conflicting, reduced sets {@code {1}} U
* {@code {2}} = {@code {1,2}} =&gt; continue</li>
*
* <li>{@code (s, 1, x)}, {@code (s, 2, x)}, {@code (s', 3, y)},
* {@code (s', 4, y)} yields conflicting, reduced sets {@code {1}} U
* {@code {3}} = {@code {1,3}} =&gt; continue</li>
*
* </ul>
*
* <p><strong>EXACT AMBIGUITY DETECTION</strong></p>
*
* <p>If all states report the same conflicting set of alternatives, then we
* know we have the exact ambiguity set.</p>
*
* <p><code>|A_<em>i</em>|&gt;1</code> and
* <code>A_<em>i</em> = A_<em>j</em></code> for all <em>i</em>, <em>j</em>.</p>
*
* <p>In other words, we continue examining lookahead until all {@code A_i}
* have more than one alternative and all {@code A_i} are the same. If
* {@code A={{1,2}, {1,3}}}, then regular LL prediction would terminate
* because the resolved set is {@code {1}}. To determine what the real
* ambiguity is, we have to know whether the ambiguity is between one and
* two or one and three so we keep going. We can only stop prediction when
* we need exact ambiguity detection when the sets look like
* {@code A={{1,2}}} or {@code {{1,2},{1,2}}}, etc...</p>
*/
static int resolvesToJustOneViableAlt(const std::vector<antlrcpp::BitSet> &altsets);
/// <summary>
@ -475,6 +430,9 @@ namespace atn {
/// <returns> the set of represented alternatives in {@code altsets} </returns>
static antlrcpp::BitSet getAlts(const std::vector<antlrcpp::BitSet> &altsets);
/** Get union of all alts from configs. @since 4.5.1 */
static antlrcpp::BitSet getAlts(Ref<ATNConfigSet> configs);
/// <summary>
/// This function gets the conflicting alt subsets from a configuration set.
/// For each configuration {@code c} in {@code configs}:

View File

@ -81,8 +81,15 @@ namespace atn {
public:
virtual int getSerializationType() const = 0;
/// <summary>
/// Are we epsilon, action, sempred? </summary>
/**
* Determines if the transition is an "epsilon" transition.
*
* <p>The default implementation returns {@code false}.</p>
*
* @return {@code true} if traversing this transition in the ATN does not
* consume an input symbol; otherwise, {@code false} if traversing this
* transition consumes (matches) an input symbol.
*/
virtual bool isEpsilon() const;
virtual misc::IntervalSet label() const;
virtual bool matches(int symbol, int minVocabSymbol, int maxVocabSymbol) const = 0;

View File

@ -32,14 +32,24 @@
#include "MurmurHash.h"
#include "Lexer.h"
#include "Exceptions.h"
#include "VocabularyImpl.h"
#include "IntervalSet.h"
using namespace org::antlr::v4::runtime;
using namespace org::antlr::v4::runtime::misc;
// Shared constant covering Lexer::MIN_CHAR_VALUE .. Lexer::MAX_CHAR_VALUE.
// It is built inside an immediately-invoked lambda so that it can be flagged
// read-only before it is stored in the constant.
// NOTE(review): the read-only flag only reaches the constant if the return is
// elided or the copy constructor propagates the flag - confirm.
IntervalSet const IntervalSet::COMPLETE_CHAR_SET = []() {
  IntervalSet complete = IntervalSet::of(Lexer::MIN_CHAR_VALUE, Lexer::MAX_CHAR_VALUE);
  complete.setReadOnly(true);
  return complete;
}();

// Shared empty constant, flagged read-only the same way.
IntervalSet const IntervalSet::EMPTY_SET = []() {
  IntervalSet empty;
  empty.setReadOnly(true);
  return empty;
}();
IntervalSet::IntervalSet() {
InitializeInstanceFields();
@ -50,6 +60,7 @@ IntervalSet::IntervalSet(const std::vector<Interval> &intervals) : IntervalSet()
}
// Copy constructor: delegates to the default constructor, then drops any
// intervals already present and re-adds everything from `set` via addAll().
// NOTE(review): only the intervals are visibly copied here; whether the
// read-only flag of `set` carries over is not visible from this code - confirm.
IntervalSet::IntervalSet(const IntervalSet &set) : IntervalSet() {
_intervals.clear();
addAll(set);
}
@ -162,53 +173,80 @@ IntervalSet IntervalSet::complement(int minElement, int maxElement) const {
}
/**
 * Computes the complement of this set relative to {@code vocabulary}, i.e.
 * every element of {@code vocabulary} that is not contained in this set.
 *
 * @param vocabulary the universe the complement is taken against.
 * @return {@code vocabulary - this}; a copy of EMPTY_SET if {@code vocabulary}
 * equals EMPTY_SET.
 */
IntervalSet IntervalSet::complement(const IntervalSet &vocabulary) const {
  if (vocabulary == IntervalSet::EMPTY_SET) {
    return IntervalSet::EMPTY_SET; // nothing in common with an empty vocabulary
  }

  // The complement within a vocabulary is exactly the set difference.
  return vocabulary.subtract(*this);
}
/**
 * Computes {@code this - other}: the elements of this set that are not
 * contained in {@code other}. Neither operand is modified.
 *
 * Delegates to the static two-argument overload so that the difference is
 * computed directly on the intervals and is not limited to the character range.
 */
IntervalSet IntervalSet::subtract(const IntervalSet &other) const {
  return subtract(*this, other);
}
/**
 * Computes the set difference {@code left - right}.
 *
 * Both interval lists are walked in parallel. Every interval of the result
 * (initially a copy of {@code left}) that overlaps an interval of
 * {@code right} is trimmed, split in two, or removed.
 *
 * @param left the set to subtract from.
 * @param right the set whose elements are removed.
 * @return a new set; an empty set if {@code left} is empty, a copy of
 * {@code left} if {@code right} is empty.
 */
IntervalSet IntervalSet::subtract(const IntervalSet &left, const IntervalSet &right) {
  if (left.isEmpty()) {
    return IntervalSet();
  }

  if (right.isEmpty()) {
    // Right set has no elements; just return a copy of the left set.
    return left;
  }

  IntervalSet result(left);
  size_t resultI = 0;
  size_t rightI = 0;
  while (resultI < result._intervals.size() && rightI < right._intervals.size()) {
    const Interval &resultInterval = result._intervals[resultI];
    const Interval &rightInterval = right._intervals[rightI];

    // operation: (resultInterval - rightInterval) and update indexes

    if (rightInterval.b < resultInterval.a) {
      // Right interval lies entirely before the current one: nothing to remove.
      rightI++;
      continue;
    }

    if (rightInterval.a > resultInterval.b) {
      // Right interval lies entirely after the current one: current one is final.
      resultI++;
      continue;
    }

    // The intervals overlap. Track explicitly which parts of the current
    // interval survive; a sentinel on the interval bounds would misclassify
    // remainders that start at a negative value.
    const bool hasBefore = rightInterval.a > resultInterval.a;
    const bool hasAfter = rightInterval.b < resultInterval.b;

    if (hasBefore && hasAfter) {
      // Split the current interval into two. Both pieces are built before the
      // vector is modified because the references above alias its storage.
      const Interval beforeCurrent(resultInterval.a, rightInterval.a - 1);
      const Interval afterCurrent(rightInterval.b + 1, resultInterval.b);
      result._intervals[resultI] = beforeCurrent;
      result._intervals.insert(result._intervals.begin() + resultI + 1, afterCurrent);
      resultI++;
      rightI++;
    } else if (hasBefore) {
      // Only the leading part survives; replace the current interval.
      result._intervals[resultI] = Interval(resultInterval.a, rightInterval.a - 1);
      resultI++;
    } else if (hasAfter) {
      // Only the trailing part survives; replace the current interval.
      result._intervals[resultI] = Interval(rightInterval.b + 1, resultInterval.b);
      rightI++;
    } else {
      // Fully covered: remove the current interval (thus no need to increment resultI).
      result._intervals.erase(result._intervals.begin() + resultI);
    }
  }

  // If rightI reached right._intervals.size(), no more intervals to subtract from result.
  // If resultI reached result._intervals.size(), we would be subtracting from an empty set.
  // Either way, we are done.
  return result;
}
IntervalSet IntervalSet::Or(const IntervalSet &a) const {
@ -305,17 +343,7 @@ int IntervalSet::getMinElement() const {
return Token::INVALID_TYPE;
}
for (auto &interval : _intervals) {
int a = interval.a;
int b = interval.b;
for (int v = a; v <= b; v++) {
if (v >= 0) {
return v;
}
}
}
return Token::INVALID_TYPE;
return _intervals[0].a;
}
std::vector<Interval> IntervalSet::getIntervals() const {
@ -366,7 +394,7 @@ std::wstring IntervalSet::toString(bool elemAreChar) const {
int a = interval.a;
int b = interval.b;
if (a == b) {
if (a == -1) {
if (a == EOF) {
ss << L"<EOF>";
} else if (elemAreChar) {
ss << L"'" << static_cast<wchar_t>(a) << L"'";
@ -389,6 +417,10 @@ std::wstring IntervalSet::toString(bool elemAreChar) const {
}
// Overload taking a plain list of token names: wraps the names in a
// vocabulary via VocabularyImpl::fromTokenNames() and forwards to the
// vocabulary-based toString().
std::wstring IntervalSet::toString(const std::vector<std::wstring> &tokenNames) const {
  const auto vocabulary = dfa::VocabularyImpl::fromTokenNames(tokenNames);
  return toString(vocabulary);
}
std::wstring IntervalSet::toString(Ref<dfa::Vocabulary> vocabulary) const {
if (_intervals.empty()) {
return L"{}";
}
@ -408,13 +440,13 @@ std::wstring IntervalSet::toString(const std::vector<std::wstring> &tokenNames)
ssize_t a = (ssize_t)interval.a;
ssize_t b = (ssize_t)interval.b;
if (a == b) {
ss << elementName(tokenNames, a);
ss << elementName(vocabulary, a);
} else {
for (ssize_t i = a; i <= b; i++) {
if (i > a) {
ss << L", ";
}
ss << elementName(tokenNames, i);
ss << elementName(vocabulary, i);
}
}
}
@ -426,12 +458,16 @@ std::wstring IntervalSet::toString(const std::vector<std::wstring> &tokenNames)
}
// Overload taking a plain list of token names: wraps the names in a
// vocabulary via VocabularyImpl::fromTokenNames() and forwards to the
// vocabulary-based elementName().
std::wstring IntervalSet::elementName(const std::vector<std::wstring> &tokenNames, ssize_t a) const {
  const auto vocabulary = dfa::VocabularyImpl::fromTokenNames(tokenNames);
  return elementName(vocabulary, a);
}
/**
 * Returns the printable name of a single set element.
 *
 * @param vocabulary the vocabulary consulted for ordinary elements.
 * @param a the element value.
 * @return {@code <EOF>} for EOF, {@code <EPSILON>} for Token::EPSILON,
 * otherwise the display name reported by {@code vocabulary}.
 */
std::wstring IntervalSet::elementName(Ref<dfa::Vocabulary> vocabulary, ssize_t a) const {
  if (a == EOF) {
    return L"<EOF>";
  } else if (a == Token::EPSILON) {
    return L"<EPSILON>";
  } else {
    return vocabulary->getDisplayName(a);
  }
}
@ -526,6 +562,8 @@ bool IntervalSet::isReadOnly() const {
}
// Sets the read-only flag. The flag may be set (or set again), but a set that
// is already read-only cannot be made writable: that attempt throws
// IllegalStateException.
void IntervalSet::setReadOnly(bool readonly) {
  const bool unlocking = _readonly && !readonly;
  if (unlocking) {
    throw IllegalStateException("Can't alter readonly IntervalSet");
  }

  _readonly = readonly;
}

View File

@ -39,21 +39,18 @@ namespace v4 {
namespace runtime {
namespace misc {
/// <summary>
/// A set of integers that relies on ranges being common to do
/// "run-length-encoded" like compression (if you view an IntSet like
/// a BitSet with runs of 0s and 1s). Only ranges are recorded so that
/// a few ints up near value 1000 don't cause massive bitsets, just two
/// integer intervals.
///
/// element values may be negative. Useful for sets of EPSILON and EOF.
///
/// 0..9 char range is index pair ['\u0030','\u0039'].
/// Multiple ranges are encoded with multiple index pairs. Isolated
/// elements are encoded with an index pair where both intervals are the same.
///
/// The ranges are ordered and disjoint so that 2..6 appears before 101..103.
/// </summary>
/**
* This class implements the {@link IntSet} backed by a sorted array of
* non-overlapping intervals. It is particularly efficient for representing
* large collections of numbers, where the majority of elements appear as part
* of a sequential range of numbers that are all part of the set. For example,
* the set { 1, 2, 3, 4, 7, 8 } may be represented as { [1, 4], [7, 8] }.
*
* <p>
* This class is able to represent sets containing any combination of values in
* the range {@link Integer#MIN_VALUE} to {@link Integer#MAX_VALUE}
* (inclusive).</p>
*/
class IntervalSet {
public:
static IntervalSet const COMPLETE_CHAR_SET;
@ -121,6 +118,13 @@ namespace misc {
/// anything that is in other but not in this will be ignored.
virtual IntervalSet subtract(const IntervalSet &other) const;
/**
* Compute the set difference between two interval sets. The specific
* operation is {@code left - right}. If {@code left} is empty the result is
* an empty set; if {@code right} is empty the result is a copy of {@code left}.
*/
static IntervalSet subtract(const IntervalSet &left, const IntervalSet &right);
virtual IntervalSet Or(const IntervalSet &a) const;
/// <summary>
@ -142,10 +146,20 @@ namespace misc {
/// If this set is a single integer, return it otherwise Token.INVALID_TYPE </summary>
virtual int getSingleElement() const;
/**
* Returns the maximum value contained in the set.
*
* @return the maximum value contained in the set. If the set is empty, this
* method returns {@link Token#INVALID_TYPE}.
*/
virtual int getMaxElement() const;
/// <summary>
/// Return minimum element >= 0 </summary>
/**
* Returns the minimum value contained in the set.
*
* @return the minimum value contained in the set. If the set is empty, this
* method returns {@link Token#INVALID_TYPE}.
*/
virtual int getMinElement() const;
/// <summary>
@ -160,10 +174,19 @@ namespace misc {
bool operator == (const IntervalSet &other) const;
virtual std::wstring toString() const;
virtual std::wstring toString(bool elemAreChar) const;
/**
* @deprecated Use {@link #toString(Vocabulary)} instead.
*/
virtual std::wstring toString(const std::vector<std::wstring> &tokenNames) const;
virtual std::wstring toString(Ref<dfa::Vocabulary> vocabulary) const;
protected:
/**
* @deprecated Use {@link #elementName(Vocabulary, int)} instead.
*/
virtual std::wstring elementName(const std::vector<std::wstring> &tokenNames, ssize_t a) const;
virtual std::wstring elementName(Ref<dfa::Vocabulary> vocabulary, ssize_t a) const;
public:
virtual size_t size() const;