The PDA for lexers is too slow; working the DFA back in. DFA construction was nondeterministic because it used plain sets rather than ordered sets.

[git-p4: depot-paths = "//depot/code/antlr4/main/": change = 6909]
parrt 2010-06-10 15:24:09 -08:00
parent a017bb8feb
commit c18898a917
17 changed files with 207 additions and 114 deletions
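
The central fix is swapping unordered hash sets for insertion-ordered sets along the DFA construction path (closureBusy in PredictionDFAFactory, the alt-state set in DFAState), which also lets DFASerializer stop sorting its output lines. A hypothetical, self-contained sketch of the underlying issue, not code from this commit: elements that fall back on identity hash codes iterate out of a HashSet in bucket order, which can change from run to run, while an insertion-ordered set always iterates in the order things were added.

import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Set;

// Illustrative only; the class and names below are invented for the demo.
public class OrderDemo {
    static class Config {                       // stands in for an NFA configuration
        final int state;
        Config(int state) { this.state = state; }
        @Override public String toString() { return "s" + state; }
        // no hashCode/equals override: HashSet falls back on the identity hash
    }

    public static void main(String[] args) {
        Set<Config> plain = new HashSet<Config>();
        Set<Config> ordered = new LinkedHashSet<Config>();
        for (int s : new int[] {2, 7, 3}) {
            Config c = new Config(s);
            plain.add(c);
            ordered.add(c);
        }
        // Bucket order depends on identity hash codes, which can change per run,
        // so anything numbered while iterating 'plain' is nondeterministic.
        System.out.println(plain);
        // Insertion order is stable: always [s2, s7, s3].
        System.out.println(ordered);
    }
}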

View File

@ -57,6 +57,7 @@ public class Bytecode {
public static final short SEMPRED = 14;
public static final short ACTION = 15;
public static final short NOT = 16; // not next match instr
public static final short SWITCH = 17;
/** Used for disassembly; describes instruction set */
public static Instruction[] instructions = new Instruction[] {
@ -77,6 +78,7 @@ public class Bytecode {
new Instruction("sempred", OperandType.SHORT, OperandType.SHORT), // sempred ruleIndex, predIndex
new Instruction("action", OperandType.SHORT, OperandType.SHORT), // action ruleIndex, actionIndex
new Instruction("not"),
new Instruction("switch", OperandType.SHORT),
};
public static String disassemble(byte[] code, int start, boolean operandsAreChars) {
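
The new SWITCH opcode follows the existing pattern: each opcode gets a descriptor listing its operand widths (here a single SHORT), and disassemble() walks the code array using that table. A rough, self-contained sketch of that table-driven pattern — all names are illustrative, this is not the real Bytecode/Instruction implementation:

// Sketch assuming 1-byte opcodes and 2-byte big-endian SHORT operands.
public class MiniDisassembler {
    static final String[] NAMES      = { "invalid", "match", "jmp", "switch" };
    static final int[]    N_OPERANDS = {  0,         1,       1,     1       };

    public static String disassemble(byte[] code) {
        StringBuilder buf = new StringBuilder();
        int ip = 0;
        while (ip < code.length) {
            int op = code[ip++] & 0xFF;
            buf.append(ip - 1).append(":\t").append(NAMES[op]);
            for (int i = 0; i < N_OPERANDS[op]; i++) {
                int operand = ((code[ip] & 0xFF) << 8) | (code[ip + 1] & 0xFF);
                ip += 2;
                buf.append(i == 0 ? " " : ", ").append(operand);
            }
            buf.append('\n');
        }
        return buf.toString();
    }

    public static void main(String[] args) {
        byte[] code = { 1, 0, 'a', 3, 0, 9, 2, 0, 0 }; // match 97; switch 9; jmp 0
        System.out.print(disassemble(code));
    }
}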

View File

@ -27,17 +27,23 @@ public class PDA {
public CommonToken[] labelValues;
public int nLabels;
public int[][] charToAddr;
/** If we hit an action, we'll have to rewind and do the winning rule again */
boolean bypassedAction;
boolean notNextMatch;
List<ThreadState> s0_closure;
List<ThreadState>[] closure_cache;
public PDA(byte[] code, int[] altToAddr, int nLabels) {
//System.out.println("code="+Arrays.toString(code));
this.code = code;
this.altToAddr = altToAddr;
this.nLabels = nLabels;
labelValues = new CommonToken[nLabels];
closure_cache = new ArrayList[255+1];
}
public int execThompson(IntStream input) {
@ -64,7 +70,39 @@ public class PDA {
int c = input.LA(1);
if ( c==Token.EOF ) return Token.EOF;
List<ThreadState> closure = computeStartState(ip);
// List<ThreadState> closure = null;
// int[] x = charToAddr[c];
// //System.out.println("list for "+Bytecode.quotedCharLiteral(c)+" is "+Arrays.toString(x));
// if ( closure_cache[c] != null ) {
// closure = new ArrayList<ThreadState>();
// closure.addAll(closure_cache[c]);
// }
// else {
// if ( x!=null ) {
// closure = new ArrayList<ThreadState>();
// int i = 1;
// for (int v : x) {
// //ThreadState t = new ThreadState(v, i, NFAStack.EMPTY);
// addToClosure(closure, v, i, NFAStack.EMPTY);
// //closure.add(t);
// i++;
// }
// closure_cache[c] = new ArrayList<ThreadState>();
// closure_cache[c].addAll(closure);
// //System.out.println("caching "+closure);
// }
// else {
// System.err.println("invalid char: "+Bytecode.quotedCharLiteral(c));
// }
// }
List<ThreadState> closure = null;
if ( s0_closure == null ) {
s0_closure = computeStartState(ip);
}
closure = new ArrayList<ThreadState>();
closure.addAll(s0_closure);
List<ThreadState> reach = new ArrayList<ThreadState>();
ThreadState prevAccept = new ThreadState(Integer.MAX_VALUE, -1, NFAStack.EMPTY);
ThreadState firstAccept = null;
@ -256,7 +294,7 @@ processOneChar:
// accept is just a ret if we have a stack;
// i.e., don't stop; someone called us and we need to use their
// accept, not this one
closure.add(t); // add to closure; need to execute during reach
case Bytecode.RET :
if ( context != NFAStack.EMPTY ) {
addToClosure(closure, context.returnAddr, alt, context.parent);
@ -279,6 +317,7 @@ processOneChar:
}
}
List<ThreadState> computeStartState(int ip) {
// if we're starting at a SPLIT, add closure of all SPLIT targets
// else just add closure of ip
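
The interpreter change above memoizes the start-state closure: instead of calling computeStartState(ip) for every token, it is computed once into s0_closure and copied into a fresh working list each time, while the charToAddr-based seeding stays commented out for now. A hypothetical stand-alone sketch of that memoization pattern, using invented types rather than the real PDA/ThreadState classes:

import java.util.ArrayList;
import java.util.List;

// Illustrative memoization of a start-state closure; the types are placeholders.
class ClosureCache {
    private List<Integer> s0Closure;                     // computed once, reused afterwards

    List<Integer> startClosure() {
        if (s0Closure == null) {
            s0Closure = computeStartState();             // expensive; do it only once
        }
        return new ArrayList<Integer>(s0Closure);        // fresh copy per token, safe to mutate
    }

    private List<Integer> computeStartState() {
        List<Integer> closure = new ArrayList<Integer>();
        closure.add(0);                                  // pretend these are instruction addresses
        closure.add(5);
        closure.add(12);
        return closure;
    }
}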

View File

@ -330,6 +330,9 @@ public static final byte[] <name>_code = {
public static final int[] <name>_tokenTypeToAddr = {
<model.altToAddr; separator=", ">
};
public static final int[][] <name>charToAddr = {
<model.charToAddr:{addrs | /* <i0> */ {<addrs; separator=", ">\}}; null="null", separator=",\n">
};
public final class <name>_PDA extends PDA {
<if(actions)>
@ -352,6 +355,7 @@ public final class <name>_PDA extends PDA {
<endif>
public <name>_PDA() {
super(<name>_code, <name>_tokenTypeToAddr, <model.nLabels>);
this.charToAddr = <name>charToAddr;
}
}<\n>
>>

View File

@ -20,9 +20,8 @@ public class AnalysisPipeline {
if ( lr.listOfRecursiveCycles.size()>0 ) return; // bail out
// BUILD DFA FOR EACH DECISION
// if ( g.isLexer() ) processLexer();
// else processParserOrTreeParser();
// TODO: don't do lexers for now; we can add lookahead analysis to help with NFA simulation later
if ( g.isLexer() ) processLexer();
else processParserOrTreeParser();
if ( !g.isLexer() ) processParserOrTreeParser();
}

View File

@ -75,7 +75,7 @@ public class PredictionDFAFactory {
* hence looping forever. Sensitive to the NFA state, the alt, and
* the stack context.
*/
Set<NFAConfig> closureBusy;
OrderedHashSet<NFAConfig> closureBusy;
Resolver resolver;
@ -90,7 +90,7 @@ public class PredictionDFAFactory {
}
public DFA createDFA() {
closureBusy = new HashSet<NFAConfig>();
closureBusy = new OrderedHashSet<NFAConfig>();
computeStartState();
dfa.addState(dfa.startState); // make sure dfa knows about this state
work.add(dfa.startState);
@ -117,7 +117,6 @@ public class PredictionDFAFactory {
*/
void reach(DFAState d) {
OrderedHashSet<IntervalSet> labels = DFA.getReachableLabels(d);
for (IntervalSet label : labels) {
DFAState t = reach(d, label);
if ( debug ) {

View File

@ -1,6 +1,5 @@
package org.antlr.v4.automata;
import org.antlr.v4.misc.Utils;
import org.antlr.v4.tool.Grammar;
import org.antlr.v4.tool.Rule;
@ -43,7 +42,8 @@ public class DFASerializer {
}
}
String output = buf.toString();
return Utils.sortLinesInString(output);
//return Utils.sortLinesInString(output);
return output;
}
String getStateString(DFAState s) {

View File

@ -119,7 +119,7 @@ public class DFAState {
}
public Set<NFAState> getUniqueNFAStates(int alt) {
Set<NFAState> alts = new HashSet<NFAState>();
OrderedHashSet<NFAState> alts = new OrderedHashSet<NFAState>();
for (NFAConfig c : nfaConfigs) {
if ( alt==NFA.INVALID_ALT_NUMBER || c.alt==alt ) alts.add(c.state);
}

View File

@ -5,7 +5,6 @@ import org.antlr.v4.tool.GrammarAST;
import org.antlr.v4.tool.LexerGrammar;
import org.antlr.v4.tool.Rule;
import org.antlr.v4.tool.TerminalAST;
import org.stringtemplate.v4.misc.Misc;
import java.util.List;
@ -60,7 +59,7 @@ public class LexerNFAFactory extends ParserNFAFactory {
*/
public Handle stringLiteral(TerminalAST stringLiteralAST) {
String chars = stringLiteralAST.getText();
chars = Misc.strip(chars, 1); // strip quotes
chars = CharSupport.getStringFromGrammarStringLiteral(chars);
int n = chars.length();
BasicState left = newState(stringLiteralAST);
BasicState prev = left;

View File

@ -21,6 +21,9 @@ public class CompiledPDA {
public int[] altToAddr; // either token type (in lexer) or alt num for DFA in parser
// charToAddr['a'] is list of addresses we can reach upon 'a' (only start state)
public List[] charToAddr = new List[256];
public DoubleKeyMap<Rule, String, Integer> ruleLabels = new DoubleKeyMap<Rule, String, Integer>();
public DoubleKeyMap<Rule, Token, Integer> ruleActions = new DoubleKeyMap<Rule, Token, Integer>();
public DoubleKeyMap<Rule, Token, Integer> ruleSempreds = new DoubleKeyMap<Rule, Token, Integer>();

View File

@ -30,7 +30,6 @@ public class LexerCompiler {
SplitInstr s0 = new SplitInstr(numRules - numFragmentRules);
gen.emit(s0);
for (Rule r : lg.modes.get(modeName)) { // for each rule in mode
gen.currentRule = r;
GrammarAST blk = (GrammarAST)r.ast.getFirstChildWithType(ANTLRParser.BLOCK);
@ -56,6 +55,29 @@ public class LexerCompiler {
e.printStackTrace(System.err);
}
}
// for (Rule r : lg.modes.get(modeName)) {
// if ( !r.isFragment() ) {
// LinearApproximator approx = new LinearApproximator(lg, NFA.INVALID_DECISION_NUMBER);
// IntervalSet fset = approx.FIRST(lg.nfa.ruleToStartState.get(r));
// System.out.println("first of "+r.name+"="+fset);
// for (int c : fset.toArray()) {
// if ( c>=0 && c<=255 ) {
// int a = gen.obj.ruleToAddr.get(r.name);
// List addrs = gen.obj.charToAddr[c];
// if ( addrs==null ) {
// addrs = new ArrayList();
// gen.obj.charToAddr[c] = addrs;
// }
// addrs.add(a);
// }
// }
// }
// }
// for (int c=0; c<=255; c++) {
// System.out.println(c+": "+gen.obj.charToAddr[c]);
// }
gen.compile();
gen.obj.nLabels = gen.labelIndex;
System.out.println(Bytecode.disassemble(gen.obj.code));
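
The disabled block above sketches the planned charToAddr optimization: for each non-fragment rule, take the FIRST set of its start state (via LinearApproximator) and record the rule's bytecode start address under every character 0..255 in that set, so the interpreter can seed its closure from the current character instead of the full split. A rough stand-alone illustration of building such a table — the rule names and addresses here are invented:

import java.util.ArrayList;
import java.util.List;

public class CharToAddrDemo {
    public static void main(String[] args) {
        List<Integer>[] charToAddr = new List[256];

        addRule(charToAddr, /* addr */ 10, "0123456789");   // e.g. an INT-like rule
        addRule(charToAddr, /* addr */ 42, "abcxyz");        // e.g. an ID-like rule
        addRule(charToAddr, /* addr */ 77, "a");             // a keyword sharing 'a'

        System.out.println("'a' -> " + charToAddr['a']);     // [42, 77]
        System.out.println("'5' -> " + charToAddr['5']);     // [10]
    }

    static void addRule(List<Integer>[] charToAddr, int addr, String firstChars) {
        for (char c : firstChars.toCharArray()) {
            if (c > 255) continue;                            // table only covers 0..255
            if (charToAddr[c] == null) charToAddr[c] = new ArrayList<Integer>();
            charToAddr[c].add(addr);
        }
    }
}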

View File

@ -29,6 +29,11 @@ public class OrderedHashSet<T> extends HashSet<T> {
return oldElement;
}
public boolean remove(int i) {
T o = elements.remove(i);
return super.remove(o);
}
/** Add a value to list; keep in hashtable for consistency also;
* Key is object itself. Good for say asking if a certain string is in
* a list of strings.
@ -62,7 +67,12 @@ public class OrderedHashSet<T> extends HashSet<T> {
return elements;
}
public String toString() {
@Override
public Object[] toArray() {
return elements.toArray();
}
public String toString() {
return elements.toString();
}
}
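
OrderedHashSet keeps a parallel list of elements in insertion order; the additions above expose positional removal and an ordered toArray(). A hedged usage sketch relying only on the behavior visible in this hunk (the package in the import is assumed): iteration order is insertion order on every run, which is what makes the rebuilt DFAs serialize identically each time.

import org.antlr.v4.misc.OrderedHashSet;   // assumed location of the class

public class OrderedHashSetDemo {
    public static void main(String[] args) {
        OrderedHashSet<String> set = new OrderedHashSet<String>();
        set.add("s2|1");
        set.add("s7|2");
        set.add("s3|1");
        set.add("s2|1");                    // duplicate: ignored, order unchanged

        System.out.println(set);            // [s2|1, s7|2, s3|1] -- insertion order, every run

        Object[] snapshot = set.toArray();  // ordered snapshot (added in this commit)
        System.out.println(snapshot.length);// 3

        set.remove(0);                      // positional removal (added in this commit)
        System.out.println(set);            // [s7|2, s3|1]
    }
}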

View File

@ -337,7 +337,7 @@ public class DOTGenerator {
List<Integer> altList = new ArrayList<Integer>();
altList.addAll(alts);
Collections.sort(altList);
Set configurations = ((DFAState) s).nfaConfigs;
Set<NFAConfig> configurations = ((DFAState)s).nfaConfigs;
for (int altIndex = 0; altIndex < altList.size(); altIndex++) {
Integer altI = (Integer) altList.get(altIndex);
int alt = altI.intValue();

View File

@ -389,7 +389,13 @@ public class Grammar implements AttributeResolver {
}
//System.out.println("getTokenDisplayName ttype="+ttype+", index="+index+", name="+tokenName);
return tokenName;
}
}
public List<String> getTokenDisplayNames(Collection<Integer> types) {
List<String> names = new ArrayList<String>();
for (int t : types) names.add(getTokenDisplayName(t));
return names;
}
/** What is the max char value possible for this grammar's target? Use
* unicode max if no target defined.

View File

@ -8,6 +8,7 @@ import org.junit.Test;
import java.util.List;
/** NON-OPTIMIZED DFA */
public class TestDFAConstruction extends BaseTest {
@Test public void testSimpleLinearApproxDecisionAsDFA() throws Exception {
String g =
@ -28,17 +29,41 @@ public class TestDFAConstruction extends BaseTest {
"e : L e R\n" +
" | I\n" +
" ;";
// String expecting =
// "s0-I->s2\n" +
// "s0-L->s1\n" +
// "s1-I->s2\n" +
// "s1-L->s1\n" +
// "s2-R->s3\n" +
// "s2-X->:s5=>1\n" +
// "s2-Y->:s4=>2\n" +
// "s3-R->s3\n" +
// "s3-X->:s5=>1\n" +
// "s3-Y->:s4=>2\n";
// String expecting =
// "s0-I->s1\n" +
// "s0-L->s2\n" +
// "s1-Y->:s3=>2\n" +
// "s1-X->:s4=>1\n" +
// "s2-I->s5\n" +
// "s2-L->s2\n" +
// "s5-Y->:s3=>2\n" +
// "s5-X->:s4=>1\n" +
// "s5-R->s6\n" +
// "s6-Y->:s3=>2\n" +
// "s6-X->:s4=>1\n" +
// "s6-R->s6\n";
String expecting =
"s0-I->s2\n" +
"s0-L->s1\n" +
"s1-I->s2\n" +
"s0-I->s2\n" +
"s1-L->s1\n" +
"s2-R->s3\n" +
"s2-X->:s5=>1\n" +
"s2-Y->:s4=>2\n" +
"s3-R->s3\n" +
"s3-X->:s5=>1\n" +
"s3-Y->:s4=>2\n";
"s1-I->s2\n" +
"s2-X->:s3=>1\n" +
"s2-R->s4\n" +
"s2-Y->:s5=>2\n" +
"s4-X->:s3=>1\n" +
"s4-R->s4\n" +
"s4-Y->:s5=>2\n";
checkRuleDFA(g, "a", expecting);
}
@ -54,15 +79,15 @@ public class TestDFAConstruction extends BaseTest {
" ;\n" +
"c : C | ;";
String expecting =
"s0-C->s3\n" +
"s0-D->s1\n" +
"s0-E->s2\n" +
"s1-X->:s5=>1\n" +
"s1-Y->:s4=>2\n" +
"s2-X->:s5=>1\n" +
"s2-Y->:s4=>2\n" +
"s3-D->s1\n" +
"s3-E->s2\n";
"s0-C->s1\n" +
"s0-D->s2\n" +
"s0-E->s3\n" +
"s1-D->s2\n" +
"s1-E->s3\n" +
"s2-X->:s4=>1\n" +
"s2-Y->:s5=>2\n" +
"s3-X->:s4=>1\n" +
"s3-Y->:s5=>2\n";
checkRuleDFA(g, "a", expecting);
}
@ -78,10 +103,10 @@ public class TestDFAConstruction extends BaseTest {
"q : b Q ;";
String expecting =
"s0-F->s1\n" +
"s0-X->:s3=>1\n" +
"s0-Y->:s2=>2\n" +
"s1-X->:s3=>1\n" +
"s1-Y->:s2=>2\n";
"s0-X->:s2=>1\n" +
"s0-Y->:s3=>2\n" +
"s1-X->:s2=>1\n" +
"s1-Y->:s3=>2\n";
checkRuleDFA(g, "a", expecting);
}
@ -119,27 +144,27 @@ public class TestDFAConstruction extends BaseTest {
" | I\n" +
" ;";
String expecting =
"s0-A->s1\n" +
"s0-L->s1\n" +
"s0-A->s2\n" +
"s0-I->s3\n" +
"s0-L->s2\n" +
"s1-A->s1\n" +
"s1-L->s1\n" +
"s1-A->s2\n" +
"s1-I->s3\n" +
"s1-L->s2\n" +
"s2-A->s1\n" +
"s2-L->s1\n" +
"s2-A->s2\n" +
"s2-I->s3\n" +
"s2-L->s2\n" +
"s3-B->s4\n" +
"s3-X->:s4=>1\n" +
"s3-R->s5\n" +
"s3-X->:s7=>1\n" +
"s3-Y->:s6=>2\n" +
"s4-B->s4\n" +
"s4-R->s5\n" +
"s4-X->:s7=>1\n" +
"s4-Y->:s6=>2\n" +
"s5-B->s4\n" +
"s3-B->s6\n" +
"s3-Y->:s7=>2\n" +
"s5-X->:s4=>1\n" +
"s5-R->s5\n" +
"s5-X->:s7=>1\n" +
"s5-Y->:s6=>2\n";
"s5-B->s6\n" +
"s5-Y->:s7=>2\n" +
"s6-X->:s4=>1\n" +
"s6-R->s5\n" +
"s6-B->s6\n" +
"s6-Y->:s7=>2\n";
checkRuleDFA(g, "s", expecting);
}
@ -183,10 +208,10 @@ public class TestDFAConstruction extends BaseTest {
" ;";
String expecting =
"s0-F->s1\n" +
"s0-X->:s3=>1\n" +
"s0-Y->:s2=>2\n" +
"s1-X->:s3=>1\n" +
"s1-Y->:s2=>2\n";
"s0-X->:s2=>1\n" +
"s0-Y->:s3=>2\n" +
"s1-X->:s2=>1\n" +
"s1-Y->:s3=>2\n";
List<Message> msgs = checkRuleDFA(g, "a", expecting);
System.out.println(msgs);
assertEquals(msgs.size(), 0);
@ -198,11 +223,11 @@ public class TestDFAConstruction extends BaseTest {
"s : a Y | A+ X ;\n" +
"a : A a | Q;";
String expecting =
"s0-A->s2\n" +
"s0-Q->:s1=>1\n" +
"s2-A->s2\n" +
"s2-Q->:s1=>1\n" +
"s2-X->:s3=>2\n";
"s0-A->s1\n" +
"s0-Q->:s2=>1\n" +
"s1-A->s1\n" +
"s1-Q->:s2=>1\n" +
"s1-X->:s3=>2\n";
List<Message> msgs = checkRuleDFA(g, "s", expecting);
System.out.println(msgs);
assertEquals(msgs.size(), 0);
@ -246,8 +271,8 @@ public class TestDFAConstruction extends BaseTest {
// nondeterministic from left edge
String expecting =
"s0-P->s1\n" +
"s1-EOF->:s3=>2\n" +
"s1-P->:s2=>1\n";
"s1-P->:s2=>1\n" +
"s1-EOF->:s3=>2\n";
List<Message> msgs = checkRuleDFA(g, "a", expecting);
System.out.println(msgs);
ambig(msgs, new int[] {1,2}, "P P");
@ -329,10 +354,10 @@ public class TestDFAConstruction extends BaseTest {
String expecting =
"s0-C->s1\n" +
"s1-B->s2\n" +
"s1-X->:s4=>1\n" +
"s1-Y->:s3=>2\n" +
"s2-X->:s4=>1\n" +
"s2-Y->:s3=>2\n";
"s1-X->:s3=>1\n" +
"s1-Y->:s4=>2\n" +
"s2-X->:s3=>1\n" +
"s2-Y->:s4=>2\n";
List<Message> msgs = checkRuleDFA(g, "a", expecting);
System.out.println(msgs);
assertEquals(msgs.size(), 0);
@ -367,8 +392,8 @@ public class TestDFAConstruction extends BaseTest {
assertEquals(msgs.size(), 1);
expecting =
"s0-A->:s1=>2\n" +
"s0-B->:s2=>1\n";
"s0-B->:s1=>1\n" +
"s0-A->:s2=>2\n";
msgs = checkRuleDFA(g, 1, expecting);
System.out.println(msgs);
ambig(msgs, new int[] {1,2}, "B");

View File

@ -2,7 +2,6 @@ package org.antlr.v4.test;
import org.junit.Test;
/** TODO: delete since i don't built DFA anymore for lexer */
public class TestLexerDFAConstruction extends BaseTest {
@Test public void unicode() throws Exception {
@ -11,7 +10,12 @@ public class TestLexerDFAConstruction extends BaseTest {
"A : '\\u0030'..'\\u8000'+ 'a' ;\n" + // TODO: FAILS; \\u not converted
"B : '\\u0020' ;";
String expecting =
"";
"s0-{'0'..'\\u8000'}->s1\n" +
"s0-' '->:s2=> B\n" +
"s1-'a'->:s3=> A\n" +
"s1-{'0'..'`', 'b'..'\\u8000'}->s1\n" +
":s3=> A-'a'->:s3=> A\n" +
":s3=> A-{'0'..'`', 'b'..'\\u8000'}->s1\n";
checkLexerDFA(g, expecting);
}
@ -24,20 +28,19 @@ public class TestLexerDFAConstruction extends BaseTest {
"public fragment\n" +
"DIGIT : '0'..'9' ;";
String expecting =
":s1=> INT-{'0'..'9'}->:s1=> INT\n" +
"s0-'i'->:s1=> ID\n" +
"s0-{'a'..'h', 'j'..'z'}->:s2=> ID\n" +
"s0-{'0'..'9'}->:s3=> INT\n" +
":s1=> ID-'f'->:s4=> IF ID\n" +
":s1=> ID-{'a'..'e', 'g'..'z'}->:s2=> ID\n" +
":s2=> ID-{'a'..'z'}->:s2=> ID\n" +
":s3=> ID-'f'->:s4=> IF ID\n" +
":s3=> ID-{'a'..'e', 'g'..'z'}->:s2=> ID\n" +
":s4=> IF ID-{'a'..'z'}->:s2=> ID\n" +
"s0-'i'->:s3=> ID\n" +
"s0-{'0'..'9'}->:s1=> INT\n" +
"s0-{'a'..'h', 'j'..'z'}->:s2=> ID\n";
":s3=> INT-{'0'..'9'}->:s3=> INT\n" +
":s4=> IF ID-{'a'..'z'}->:s2=> ID\n";
checkLexerDFA(g, expecting);
}
@Test public void recursiveMatchingTwoAlts() throws Exception {
// ambig with ACTION; accept state will try both after matching
// since one is recursive
// TODO: recursion requires NFA
String g =
"lexer grammar L3;\n" +
"SPECIAL : '{{}}' ;\n" +
@ -46,25 +49,7 @@ public class TestLexerDFAConstruction extends BaseTest {
"FOO : ACTION ;\n" +
"LCURLY : '{' ;";
String expecting =
":s1=> LCURLY-'x'->s4\n" +
":s1=> LCURLY-'{'->s3\n" +
":s1=> LCURLY-'}'->:s2=> ACTION\n" +
"s0-'{'->:s1=> LCURLY\n" +
"s3-'x'->s6\n" +
"s3-'}'->s5\n" +
"s4-'x'->s4\n" +
"s4-'{'->s7\n" +
"s4-'}'->:s2=> ACTION\n" +
"s5-'x'->s4\n" +
"s5-'{'->s7\n" +
"s5-'}'->:s8=> SPECIAL ACTION\n" + // order meaningful here: SPECIAL ACTION
"s6-'x'->s6\n" +
"s6-'}'->s9\n" +
"s7-'x'->s6\n" +
"s7-'}'->s9\n" +
"s9-'x'->s4\n" +
"s9-'{'->s7\n" +
"s9-'}'->:s2=> ACTION\n";
"";
checkLexerDFA(g, expecting);
}

View File

@ -28,8 +28,8 @@ public class TestLinearApproximateLookahead extends BaseTest {
"b : c | C ;\n" +
"c : D ;";
String expecting =
"s0-B->:s2=>2\n" +
"s0-{D, C}->:s1=>1\n";
"s0-{D, C}->:s1=>1\n" +
"s0-B->:s2=>2\n";
checkRule(g, "a", expecting);
}
@ -52,8 +52,8 @@ public class TestLinearApproximateLookahead extends BaseTest {
"a : b B | X b C ;\n" +
"b : A | ;";
String expecting =
"s0-X->:s2=>2\n" +
"s0-{A, B}->:s1=>1\n";
"s0-{A, B}->:s1=>1\n" +
"s0-X->:s2=>2\n";
checkRule(g, "a", expecting);
}

View File

@ -16,8 +16,8 @@ public class TestPredicatedDFAConstruction extends BaseTest {
" ;";
String expecting =
"s0-ID->s1\n" +
"s1-true->:s3=>2\n" +
"s1-{p1}?->:s2=>1\n";
"s1-{p1}?->:s2=>1\n" +
"s1-true->:s3=>2\n";
checkRuleDFA(g, "a", expecting);
}
@ -51,13 +51,13 @@ public class TestPredicatedDFAConstruction extends BaseTest {
"\n" +
"expr : ID;";
String expecting =
"s0-';'->:s2=>3\n" +
"s0-ID->s1\n" +
"s1-ID->s3\n" +
"s3-';'->s5\n" +
"s0-';'->:s1=>3\n" +
"s0-ID->s2\n" +
"s2-ID->s3\n" +
"s3-ID->:s4=>1\n" +
"s5-{CALL}?->:s7=>2\n" +
"s5-{IF}?->:s6=>1\n";
"s3-';'->s5\n" +
"s5-{IF}?->:s6=>1\n" +
"s5-{CALL}?->:s7=>2\n";
List<Message> msgs = checkRuleDFA(g, "stat", expecting);
System.err.println(msgs);
}
@ -76,7 +76,7 @@ public class TestPredicatedDFAConstruction extends BaseTest {
String expecting =
"s0-ID->s1\n" +
"s1-SEMI->s2\n" +
"s2-({while}?||{for}?||{do}?)->:s3=>1\n" +
"s2-({while}?||{do}?||{for}?)->:s3=>1\n" +
"s2-true->:s4=>2\n";
checkRuleDFA(g, "a", expecting);
}