got DFA interp working from parser grammar + unit tests

[git-p4: depot-paths = "//depot/code/antlr4/main/": change = 6902]
This commit is contained in:
parrt 2010-05-28 12:17:17 -08:00
parent 231758b0de
commit de380d2fd1
6 changed files with 279 additions and 154 deletions

View File

@ -3,6 +3,7 @@ package org.antlr.v4.runtime.pda;
import org.antlr.runtime.CharStream;
import org.antlr.runtime.IntStream;
import org.antlr.runtime.Token;
import org.antlr.runtime.TokenStream;
import org.antlr.v4.runtime.CommonToken;
import java.util.ArrayList;
@ -121,10 +122,6 @@ processOneChar:
addToClosure(reach, ip, alt, context);
}
break;
case Bytecode.SET :
System.err.println("not impl");
notNextMatch = false;
break;
case Bytecode.LABEL : // lexers only
int labelIndex = getShort(code, ip);
labelValues[labelIndex] =
@ -225,7 +222,9 @@ processOneChar:
switch (opcode) {
case Bytecode.NOT : // see thru NOT but include in closure so we exec during reach
closure.add(t); // add to closure; need to execute during reach
addToClosure(closure, ip, alt, context);
// add NOT and next instruction since reach only looks at
// what's in closure (it doesn't jump to ip after NOT)
addToClosure(closure, ip, alt, context);
break;
case Bytecode.JMP :
addToClosure(closure, getShort(code, ip), alt, context);
@ -302,6 +301,7 @@ processOneChar:
// The code below is faster but cannot handle SAVE, CALL/RET, or predicates.
/*
public int execThompson_no_stack(CharStream input, int ip) {
int c = input.LA(1);
if ( c==Token.EOF ) return Token.EOF;
@ -441,7 +441,7 @@ processOneChar:
break;
}
}
*/
// subclass needs to override these if there are sempreds or actions in lexer rules
public boolean sempred(int ruleIndex, int actionIndex) {
@ -456,10 +456,91 @@ processOneChar:
System.out.println(instr);
}
/**
 * Debug aid: print the disassembly of the instruction located at {@code ip}
 * in {@code code} to standard output.
 *
 * @param ip address of the instruction to disassemble
 */
void traceDFA(int ip) {
    System.out.println(Bytecode.disassembleInstruction(code, ip, false));
}
/**
 * Read two consecutive bytes from {@code memory} as one unsigned 16-bit
 * value, high-order byte first.
 *
 * @param memory byte array to read from
 * @param index  position of the high-order byte; the low-order byte is at {@code index+1}
 * @return a value in the range 0..0xFFFF
 */
public static int getShort(byte[] memory, int index) {
    // mask each byte so a set high bit is not sign-extended into the int
    int high = memory[index] & 0xFF;
    int low = memory[index + 1] & 0xFF;
    return (high << 8) | low;
}
/**
 * One pending unit of work for the non-recursive interpreter: an instruction
 * address together with the input-stream marker to rewind to before resuming
 * execution at that address.
 */
public static class Context {
    /** Address of the instruction at which to resume. */
    public int ip;

    /** Marker identifying the input position to restore. */
    public int inputMarker;

    /**
     * @param ip          instruction address to resume at
     * @param inputMarker input marker to rewind to before resuming
     */
    public Context(int ip, int inputMarker) {
        this.inputMarker = inputMarker;
        this.ip = ip;
    }
}
/**
 * Execute the bytecode in {@code code} against {@code input} without
 * recursion, using an explicit work list that is treated as a stack
 * (depth-first search with backtracking).
 *
 * Each SPLIT pushes every alternative except the first onto the work list,
 * together with a marker for the current input position, and continues with
 * the first alternative. When a match instruction fails, the next pending
 * alternative is popped and the input is rewound to its marker.
 *
 * The first ACCEPT reached wins, so this yields the first match rather than
 * the longest one (PEG-like behavior).
 *
 * @param input stream whose {@code LA(1)} value is compared against the
 *              match/range operands
 * @param ip    address of the first instruction to execute
 * @return the 16-bit operand of the first ACCEPT instruction reached, or 0
 *         if every alternative fails or runs off the end of the code
 * @throws RuntimeException if an unrecognized opcode is encountered
 */
public int execNoRecursion(TokenStream input, int ip) {
    System.out.println("execNoRecursion @"+ip);
    List<Context> work = new ArrayList<Context>();
    work.add(new Context(ip, input.mark()));
    workLoop:
    while ( work.size()>0 ) {
        Context ctx = work.remove(work.size()-1); // treat like stack
        ip = ctx.ip;
        input.rewind(ctx.inputMarker);
        while ( ip < code.length ) {
            int c = input.LA(1);
            traceDFA(ip);
            short opcode = code[ip];
            ip++; // move to next instruction or first byte of operand
            switch (opcode) {
                case Bytecode.MATCH8 :
                    // Mask the operand: a raw byte is sign-extended when compared
                    // with an int, so operands >= 128 could never match and 0xFF
                    // (-1) would wrongly equal an end-of-input lookahead of -1.
                    if ( c != (code[ip]&0xFF) ) continue workLoop;
                    ip++;
                    input.consume();
                    break;
                case Bytecode.MATCH16 :
                    if ( c != getShort(code, ip) ) continue workLoop;
                    ip += 2;
                    input.consume();
                    break;
                case Bytecode.RANGE8 :
                    // Same unsigned treatment for both range bounds.
                    if ( c<(code[ip]&0xFF) || c>(code[ip+1]&0xFF) ) continue workLoop;
                    ip += 2;
                    input.consume();
                    break;
                case Bytecode.RANGE16 :
                    if ( c<getShort(code, ip) || c>getShort(code, ip+2) ) continue workLoop;
                    ip += 4;
                    input.consume();
                    break;
                case Bytecode.ACCEPT :
                    int ruleIndex = getShort(code, ip);
                    ip += 2;
                    System.out.println("accept "+ruleIndex);
                    // returning gives first match not longest; i.e., like PEG
                    return ruleIndex;
                case Bytecode.JMP :
                    int target = getShort(code, ip);
                    ip = target;
                    continue;
                case Bytecode.SPLIT :
                    int nopnds = getShort(code, ip);
                    ip += 2;
                    // Push alternatives n..2 so that, the list being a stack,
                    // alternative 2 is the next one tried after a failure.
                    for (int i=nopnds-1; i>=1; i--) {
                        int addr = getShort(code, ip+i*2);
                        //System.out.println("try alt "+i+" at "+addr);
                        work.add(new Context(addr, input.mark()));
                    }
                    // try first alternative (w/o adding to work list)
                    int addr = getShort(code, ip);
                    ip = addr;
                    //System.out.println("try alt "+nopnds+" at "+addr);
                    continue;
                default :
                    throw new RuntimeException("invalid instruction @ "+ip+": "+opcode);
            }
        }
    }
    return 0;
}
/*
public int exec(CharStream input, String ruleName) {
return exec(input, ruleToAddr.get(ruleName));
@ -527,80 +608,6 @@ processOneChar:
return 0;
}
public static class Context {
public int ip;
public int inputMarker;
public Context(int ip, int inputMarker) {
this.ip = ip;
this.inputMarker = inputMarker;
}
}
public int execNoRecursion(CharStream input, int ip) {
List<Context> work = new ArrayList<Context>();
work.add(new Context(ip, input.mark()));
workLoop:
while ( work.size()>0 ) {
Context ctx = work.remove(work.size()-1); // treat like stack
ip = ctx.ip;
input.rewind(ctx.inputMarker);
while ( ip < code.length ) {
int c = input.LA(1);
trace(ip);
short opcode = code[ip];
ip++; // move to next instruction or first byte of operand
switch (opcode) {
case Bytecode.MATCH8 :
if ( c != code[ip] ) continue workLoop;
ip++;
input.consume();
break;
case Bytecode.MATCH16 :
if ( c != getShort(code, ip) ) continue workLoop;
ip += 2;
input.consume();
break;
case Bytecode.RANGE8 :
if ( c<code[ip] || c>code[ip+1] ) continue workLoop;
ip += 2;
input.consume();
break;
case Bytecode.RANGE16 :
if ( c<getShort(code, ip) || c>getShort(code, ip+2) ) continue workLoop;
ip += 4;
input.consume();
break;
case Bytecode.ACCEPT :
int ruleIndex = getShort(code, ip);
ip += 2;
System.out.println("accept "+ruleIndex);
// returning gives first match not longest; i.e., like PEG
return ruleIndex;
case Bytecode.JMP :
int target = getShort(code, ip);
ip = target;
continue;
case Bytecode.SPLIT :
int nopnds = getShort(code, ip);
ip += 2;
// add split addresses to work queue in reverse order ('cept first one)
for (int i=nopnds-1; i>=1; i--) {
int addr = getShort(code, ip+i*2);
//System.out.println("try alt "+i+" at "+addr);
work.add(new Context(addr, input.mark()));
}
// try first alternative (w/o adding to work list)
int addr = getShort(code, ip);
ip = addr;
//System.out.println("try alt "+nopnds+" at "+addr);
continue;
default :
throw new RuntimeException("invalid instruction @ "+ip+": "+opcode);
}
}
}
return 0;
}
*/
}

View File

@ -4,6 +4,7 @@ import org.antlr.v4.automata.DFA;
import org.antlr.v4.automata.DFAState;
import org.antlr.v4.automata.Edge;
import org.antlr.v4.codegen.pda.*;
import org.antlr.v4.runtime.pda.Bytecode;
/** */
public class DFACompiler {
@ -20,6 +21,8 @@ public class DFACompiler {
public CompiledPDA compile() {
walk();
gen.compile();
System.out.println("DFA: ");
System.out.println(Bytecode.disassemble(gen.obj.code,false));
return gen.obj;
}
@ -30,7 +33,6 @@ public class DFACompiler {
// walk code, update jump targets.
for (Instr I : gen.obj.instrs) {
System.out.println("instr "+I);
if ( I instanceof JumpInstr) {
JumpInstr J = (JumpInstr)I;
J.target = stateToAddr[J.target];

View File

@ -28,16 +28,17 @@
package org.antlr.v4.test;
import org.antlr.runtime.CommonTokenStream;
import org.antlr.runtime.RecognitionException;
import org.antlr.runtime.Token;
import org.antlr.runtime.TokenSource;
import org.antlr.runtime.*;
import org.antlr.v4.Tool;
import org.antlr.v4.analysis.DFAMinimizer;
import org.antlr.v4.analysis.LexerNFAToDFAConverter;
import org.antlr.v4.analysis.PredictionDFAFactory;
import org.antlr.v4.automata.DFA;
import org.antlr.v4.automata.*;
import org.antlr.v4.codegen.CompiledPDA;
import org.antlr.v4.codegen.LexerCompiler;
import org.antlr.v4.misc.Utils;
import org.antlr.v4.runtime.pda.PDA;
import org.antlr.v4.semantics.SemanticPipeline;
import org.antlr.v4.tool.*;
import org.junit.After;
@ -133,6 +134,37 @@ public abstract class BaseTest {
dfa.minimized = dmin.minimize();
}
/**
 * Compile the default mode of lexer grammar {@code g} and wrap the compiled
 * code in a {@link PDA}.
 *
 * @param g lexer grammar to compile
 * @return a PDA built from the compiled code, alt-to-address table and label count
 */
PDA getLexerPDA(LexerGrammar g) {
    // The returned NFA is not used here; the call is kept as-is
    // (presumably needed for its effect on g -- TODO confirm).
    createNFA(g);
    LexerCompiler compiler = new LexerCompiler(g);
    CompiledPDA compiled = compiler.compileMode(LexerGrammar.DEFAULT_MODE_NAME);
    return new PDA(compiled.code, compiled.altToAddr, compiled.nLabels);
}
/**
 * Translate a comma-separated list of token names into the token types that
 * {@code g} assigns to them. Spaces in {@code expecting} are removed before
 * splitting on commas.
 *
 * @param g         grammar used to look up each token name
 * @param expecting comma-separated token names; may be null or blank
 * @return the token types in the order listed; empty if {@code expecting}
 *         is null or blank
 */
List<Integer> getTypesFromString(Grammar g, String expecting) {
    List<Integer> types = new ArrayList<Integer>();
    if ( expecting==null || expecting.trim().equals("") ) {
        return types;
    }
    String[] names = expecting.replace(" ", "").split(",");
    for (int i = 0; i < names.length; i++) {
        int type = g.getTokenType(names[i]);
        types.add(type);
    }
    return types;
}
/**
 * Run {@code lexerPDA.execThompson} repeatedly over {@code input} and collect
 * the token type returned by each call, up to and including the first
 * {@code Token.EOF}.
 *
 * @param input    text to tokenize
 * @param lexerPDA PDA used to match one token per call
 * @return the token types in match order; the last element is {@code Token.EOF}
 */
List<Integer> getTokenTypes(String input, PDA lexerPDA) {
    ANTLRStringStream chars = new ANTLRStringStream(input);
    List<Integer> types = new ArrayList<Integer>();
    while ( true ) {
        int type = lexerPDA.execThompson(chars);
        types.add(type);
        if ( type==Token.EOF ) break; // EOF itself is recorded before stopping
    }
    return types;
}
List<Message> checkRuleDFA(String gtext, String ruleName, String expecting)
throws Exception
{

View File

@ -0,0 +1,98 @@
package org.antlr.v4.test;
import org.antlr.runtime.ANTLRStringStream;
import org.antlr.runtime.CharStream;
import org.antlr.runtime.CommonTokenStream;
import org.antlr.runtime.Token;
import org.antlr.v4.automata.DFA;
import org.antlr.v4.automata.DecisionState;
import org.antlr.v4.automata.NFA;
import org.antlr.v4.codegen.CompiledPDA;
import org.antlr.v4.codegen.DFACompiler;
import org.antlr.v4.runtime.Lexer;
import org.antlr.v4.runtime.pda.PDA;
import org.antlr.v4.tool.Grammar;
import org.antlr.v4.tool.LexerGrammar;
import org.junit.Test;
import java.util.List;
/**
 * Tests that compile a parser decision's DFA to bytecode, tokenize an input
 * string with an interpreted lexer, and check which alternative the DFA
 * interpreter predicts for the resulting token stream.
 */
public class TestDFAInterp extends BaseTest {
    /** Lexer whose only mode is driven by the PDA handed to the constructor. */
    public static class InterpLexer extends Lexer {
        public InterpLexer(CharStream input, PDA pda) {
            super(input);
            modeToPDA = new PDA[] { pda };
        }
    }

    @Test public void testSimpleLL1Decision() throws Exception {
        LexerGrammar lg = new LexerGrammar(
            "lexer grammar L;\n" +
            "ID : 'a'..'z'+ ;\n" +
            "INT : '0'..'9'+ ;\n");
        Grammar g = new Grammar(
            "parser grammar P;\n" +
            "a : ID | INT ;\n"
        );
        checkDFAMatches(g, lg, 0, "ab", 1);
        checkDFAMatches(g, lg, 0, "32", 2);
    }

    @Test public void testArbCommonPrefix() throws Exception {
        LexerGrammar lg = new LexerGrammar(
            "lexer grammar L;\n" +
            "SEMI : ';' ;\n" +
            "DOT : '.' ;\n" +
            "WS : ' ' ;\n" +
            "ID : 'a'..'z'+ ;\n" +
            "INT : '0'..'9'+ ;\n");
        Grammar g = new Grammar(
            "parser grammar P;\n" +
            "tokens { WS; }\n" +
            "a : ID+ SEMI\n" +
            " | ID+ DOT\n" +
            " ;\n"
        );
        checkDFAMatches(g, lg, 2, "a b c ;", 1);
        checkDFAMatches(g, lg, 2, "a b c .", 2);
    }

    /**
     * Compile the DFA for {@code decision} in {@code g}, tokenize
     * {@code input} with a lexer interpreted from {@code lg}, move WS tokens
     * to the hidden channel, and run the compiled DFA over the tokens.
     *
     * @return the value returned by {@code execNoRecursion} started at address 0
     */
    int interp(Grammar g, LexerGrammar lg, int decision, String input) {
        // compile the decision's DFA to bytecode
        NFA nfa = createNFA(g);
        DecisionState decisionState = nfa.decisionToNFAState.get(decision);
        DFA dfa = createDFA(g, decisionState);
        CompiledPDA compiled = new DFACompiler(dfa).compile();
        PDA pda = new PDA(compiled.code, compiled.altToAddr, compiled.nLabels);

        // build the interpreted lexer, sharing the parser grammar's vocabulary
        lg.importVocab(g);
        PDA lexerPDA = getLexerPDA(lg);
        CommonTokenStream tokens =
            new CommonTokenStream(new InterpLexer(new ANTLRStringStream(input), lexerPDA));
        tokens.fill();

        List<Token> list = tokens.getTokens();
        for (int i = 0; i < list.size(); i++) { // hide WS
            Token t = list.get(i);
            if ( t.getType()==g.getTokenType("WS") ) {
                t.setChannel(Token.HIDDEN_CHANNEL);
            }
        }
        System.out.println("tokens="+ list);
        return pda.execNoRecursion(tokens, 0);
    }

    /** Assert that interpreting {@code decision} over {@code input} yields {@code expecting}. */
    void checkDFAMatches(Grammar g, LexerGrammar lg, int decision,
                         String input, int expecting) {
        assertEquals(expecting, interp(g, lg, decision, input));
    }
}

View File

@ -2,6 +2,7 @@ package org.antlr.v4.test;
import org.antlr.v4.automata.DFA;
import org.antlr.v4.automata.DecisionState;
import org.antlr.v4.automata.Edge;
import org.antlr.v4.automata.NFA;
import org.antlr.v4.codegen.CompiledPDA;
import org.antlr.v4.codegen.DFACompiler;
@ -21,24 +22,52 @@ public class TestDFAtoPDABytecodeGeneration extends BaseTest {
"0007:\tset 0\n" +
"0010:\tjmp 13\n" +
"0013:\taccept 1\n" +
"0016:\tmatch8 5\n" +
"0016:\tmatch8 4\n" +
"0018:\tjmp 21\n" +
"0021:\taccept 2\n";
checkBytecode(g, 0, expecting);
}
/**
 * Builds the DFA for decision 0 of {@code a : A | B ;}, redirects the start
 * state's first edge to the target of its second edge so both edges lead to
 * the same state, and checks the disassembly of the compiled bytecode.
 */
@Test public void testAorBToSameState() throws Exception {
    Grammar g = new Grammar(
        "parser grammar T;\n"+
        "a : A | B ;");
    String expecting =
        "0000:\tsplit 7, 15\n" +
        "0007:\tmatch8 4\n" +
        "0009:\tjmp 12\n" +
        "0012:\taccept 2\n" +
        "0015:\tmatch8 5\n" +
        "0017:\tjmp 12\n";

    NFA nfa = createNFA(g);
    DecisionState decisionState = nfa.decisionToNFAState.get(0);
    DFA dfa = createDFA(g, decisionState);

    // make S0 go to S1 on both A and B (pinch alts back to single state)
    Edge first = dfa.states.get(0).edge(0);
    Edge second = dfa.states.get(0).edge(1);
    first.target = second.target;
    System.out.print("altered DFA="+dfa);

    CompiledPDA compiled = new DFACompiler(dfa).compile();
    PDA pda = new PDA(compiled.code, compiled.altToAddr, compiled.nLabels);
    assertEquals(expecting, Bytecode.disassemble(pda.code, false));
}
@Test public void testAorB() throws Exception {
Grammar g = new Grammar(
"parser grammar T;\n"+
"a : A | B ;");
String expecting =
"0000:\tsplit 7, 15\n" +
"0007:\tmatch8 5\n" +
"0007:\tmatch8 4\n" +
"0009:\tjmp 12\n" +
"0012:\taccept 2\n" +
"0015:\tmatch8 4\n" +
"0012:\taccept 1\n" +
"0015:\tmatch8 5\n" +
"0017:\tjmp 20\n" +
"0020:\taccept 1\n";
"0020:\taccept 2\n";
checkBytecode(g, 0, expecting);
}
@ -82,10 +111,6 @@ public class TestDFAtoPDABytecodeGeneration extends BaseTest {
NFA nfa = createNFA(g);
DecisionState blk = nfa.decisionToNFAState.get(decision);
DFA dfa = createDFA(g, blk);
// Edge e0 = dfa.states.get(1).edge(0);
// Edge e1 = dfa.states.get(1).edge(1);
// e0.target = e1.target;
// System.out.print("altered DFA="+dfa);
DFACompiler comp = new DFACompiler(dfa);
CompiledPDA obj = comp.compile();
PDA pda = new PDA(obj.code, obj.altToAddr, obj.nLabels);

View File

@ -1,13 +1,7 @@
package org.antlr.v4.test;
import org.antlr.runtime.ANTLRStringStream;
import org.antlr.runtime.Token;
import org.antlr.v4.Tool;
import org.antlr.v4.codegen.CompiledPDA;
import org.antlr.v4.codegen.LexerCompiler;
import org.antlr.v4.runtime.pda.PDA;
import org.antlr.v4.semantics.SemanticPipeline;
import org.antlr.v4.tool.Grammar;
import org.antlr.v4.tool.LexerGrammar;
import org.junit.Test;
@ -191,74 +185,41 @@ public class TestPDABytecodeInterp extends BaseTest {
}
void checkMatches(LexerGrammar g, String input, String expecting) {
if ( g.ast!=null && !g.ast.hasErrors ) {
System.out.println(g.ast.toStringTree());
Tool antlr = new Tool();
SemanticPipeline sem = new SemanticPipeline(g);
sem.process();
if ( g.getImportedGrammars()!=null ) { // process imported grammars (if any)
for (Grammar imp : g.getImportedGrammars()) {
antlr.process(imp);
}
}
}
PDA pda = getLexerPDA(g);
List<Integer> expectingTokenTypes = new ArrayList<Integer>();
if ( expecting!=null && !expecting.trim().equals("") ) {
for (String tname : expecting.replace(" ", "").split(",")) {
int ttype = g.getTokenType(tname);
expectingTokenTypes.add(ttype);
}
}
List<Integer> expectingTokenTypes = getTypesFromString(g, expecting);
LexerCompiler comp = new LexerCompiler(g);
CompiledPDA obj = comp.compileMode(LexerGrammar.DEFAULT_MODE_NAME);
PDA PDA = new PDA(obj.code, obj.altToAddr, obj.nLabels);
ANTLRStringStream in = new ANTLRStringStream(input);
List<Integer> tokenTypes = new ArrayList<Integer>();
int ttype = 0;
do {
ttype = PDA.execThompson(in);
tokenTypes.add(ttype);
} while ( ttype!= Token.EOF );
List<Integer> tokenTypes = getTokenTypes(input, pda);
assertEquals(expectingTokenTypes, tokenTypes);
}
void checkLabels(LexerGrammar g, String input, String expecting,
String expectingTokens)
{
if ( g.ast!=null && !g.ast.hasErrors ) {
System.out.println(g.ast.toStringTree());
Tool antlr = new Tool();
SemanticPipeline sem = new SemanticPipeline(g);
sem.process();
if ( g.getImportedGrammars()!=null ) { // process imported grammars (if any)
for (Grammar imp : g.getImportedGrammars()) {
antlr.process(imp);
}
}
}
List<Integer> expectingTokenTypes = new ArrayList<Integer>();
if ( expecting!=null && !expecting.trim().equals("") ) {
for (String tname : expecting.replace(" ", "").split(",")) {
int ttype = g.getTokenType(tname);
expectingTokenTypes.add(ttype);
}
}
LexerCompiler comp = new LexerCompiler(g);
CompiledPDA obj = comp.compileMode(LexerGrammar.DEFAULT_MODE_NAME);
PDA PDA = new PDA(obj.code, obj.altToAddr, obj.nLabels);
PDA pda = getLexerPDA(g);
List<Integer> expectingTokenTypes = getTypesFromString(g, expecting);
ANTLRStringStream in = new ANTLRStringStream(input);
List<Integer> tokenTypes = new ArrayList<Integer>();
int ttype = PDA.execThompson(in);
int ttype = pda.execThompson(in);
tokenTypes.add(ttype);
assertEquals(expectingTokenTypes, tokenTypes);
if ( expectingTokens!=null ) {
assertEquals(expectingTokens, Arrays.toString(PDA.labelValues));
assertEquals(expectingTokens, Arrays.toString(pda.labelValues));
}
}
// List<Token> getTokens(String input, PDA lexerPDA) {
// ANTLRStringStream in = new ANTLRStringStream(input);
// List<Token> tokens = new ArrayList<Token>();
// int ttype = 0;
// do {
// ttype = lexerPDA.execThompson(in);
// tokens.add(new CommonToken(ttype,""));
// } while ( ttype!= Token.EOF );
// return tokens;
// }
}