Got the DFA interpreter working from a parser grammar; added unit tests

[git-p4: depot-paths = "//depot/code/antlr4/main/": change = 6902]
2010-05-28 12:17:17 -08:00 · 2010-05-28 12:17:17 -08:00 · de380d2fd1
parent 231758b0de
commit de380d2fd1
6 changed files with 279 additions and 154 deletions
--- a/runtime/Java/src/org/antlr/v4/runtime/pda/PDA.java
+++ b/runtime/Java/src/org/antlr/v4/runtime/pda/PDA.java
@ -3,6 +3,7 @@ package org.antlr.v4.runtime.pda;
 import org.antlr.runtime.CharStream;
 import org.antlr.runtime.IntStream;
 import org.antlr.runtime.Token;
+import org.antlr.runtime.TokenStream;
 import org.antlr.v4.runtime.CommonToken;

 import java.util.ArrayList;
@ -121,10 +122,6 @@ processOneChar:
 							addToClosure(reach, ip, alt, context);
 						}
 						break;
-					case Bytecode.SET :
-						System.err.println("not impl");
-						notNextMatch = false;
-						break;
 					case Bytecode.LABEL : // lexers only
 						int labelIndex = getShort(code, ip);
 						labelValues[labelIndex] =
@ -225,7 +222,9 @@ processOneChar:
 		switch (opcode) {
 			case Bytecode.NOT : // see thru NOT but include in closure so we exec during reach
 				closure.add(t);	// add to closure; need to execute during reach
-				addToClosure(closure, ip, alt, context);				
+				// add NOT and next instruction since reach only looks at
+				// what's in closure (it doesn't jump to ip after NOT)
+				addToClosure(closure, ip, alt, context);
 				break;
 			case Bytecode.JMP :
 				addToClosure(closure, getShort(code, ip), alt, context);
@ -302,6 +301,7 @@ processOneChar:

 	// this stuff below can't do SAVE nor CALL/RET but faster.  (nor preds)
 	
+/*
 	public int execThompson_no_stack(CharStream input, int ip) {
 		int c = input.LA(1);
 		if ( c==Token.EOF ) return Token.EOF;
@ -441,7 +441,7 @@ processOneChar:
 				break;
 		}
 	}
-
+*/
 	// subclass needs to override these if there are sempreds or actions in lexer rules

 	public boolean sempred(int ruleIndex, int actionIndex) {
@ -456,10 +456,91 @@ processOneChar:
 		System.out.println(instr);
 	}

+	void traceDFA(int ip) {
+		String instr = Bytecode.disassembleInstruction(code, ip, false);
+		System.out.println(instr);
+	}
+
 	public static int getShort(byte[] memory, int index) {
 		return (memory[index]&0xFF) <<(8*1) | (memory[index+1]&0xFF); // prevent sign extension with mask
 	}

+	public static class Context {
+		public int ip;
+		public int inputMarker;
+		public Context(int ip, int inputMarker) {
+			this.ip = ip;
+			this.inputMarker = inputMarker;
+		}
+	}
+
+	public int execNoRecursion(TokenStream input, int ip) {
+		System.out.println("execNoRecursion @"+ip);
+		List<Context> work = new ArrayList<Context>();
+		work.add(new Context(ip, input.mark()));
+workLoop:
+		while ( work.size()>0 ) {
+			Context ctx = work.remove(work.size()-1); // treat like stack
+			ip = ctx.ip;
+			input.rewind(ctx.inputMarker);
+			while ( ip < code.length ) {
+				int c = input.LA(1);
+				traceDFA(ip);
+				short opcode = code[ip];
+				ip++; // move to next instruction or first byte of operand
+				switch (opcode) {
+					case Bytecode.MATCH8 :
+						if ( c != code[ip] ) continue workLoop;
+						ip++;
+						input.consume();
+						break;
+					case Bytecode.MATCH16 :
+						if ( c != getShort(code, ip) ) continue workLoop;
+						ip += 2;
+						input.consume();
+						break;
+					case Bytecode.RANGE8 :
+						if ( c<code[ip] || c>code[ip+1] ) continue workLoop;
+						ip += 2;
+						input.consume();
+						break;
+					case Bytecode.RANGE16 :
+						if ( c<getShort(code, ip) || c>getShort(code, ip+2) ) continue workLoop;
+						ip += 4;
+						input.consume();
+						break;
+					case Bytecode.ACCEPT :
+						int ruleIndex = getShort(code, ip);
+						ip += 2;
+						System.out.println("accept "+ruleIndex);
+						// returning gives first match not longest; i.e., like PEG
+						return ruleIndex;
+					case Bytecode.JMP :
+						int target = getShort(code, ip);
+						ip = target;
+						continue;
+					case Bytecode.SPLIT :
+						int nopnds = getShort(code, ip);
+						ip += 2;
+						// add split addresses to work queue in reverse order ('cept first one)
+						for (int i=nopnds-1; i>=1; i--) {
+							int addr = getShort(code, ip+i*2);
+							//System.out.println("try alt "+i+" at "+addr);
+							work.add(new Context(addr, input.mark()));
+						}
+						// try first alternative (w/o adding to work list)
+						int addr = getShort(code, ip);
+						ip = addr;
+						//System.out.println("try alt "+nopnds+" at "+addr);
+						continue;
+					default :
+						throw new RuntimeException("invalid instruction @ "+ip+": "+opcode);
+				}
+			}
+		}
+		return 0;
+	}
+
 /*
 	public int exec(CharStream input, String ruleName) {
 		return exec(input, ruleToAddr.get(ruleName));
@ -527,80 +608,6 @@ processOneChar:
 		return 0;
 	}

-	public static class Context {
-		public int ip;
-		public int inputMarker;
-		public Context(int ip, int inputMarker) {
-			this.ip = ip;
-			this.inputMarker = inputMarker;
-		}
-	}
-
-	public int execNoRecursion(CharStream input, int ip) {
-		List<Context> work = new ArrayList<Context>();
-		work.add(new Context(ip, input.mark()));
-workLoop:
-		while ( work.size()>0 ) {
-			Context ctx = work.remove(work.size()-1); // treat like stack
-			ip = ctx.ip;
-			input.rewind(ctx.inputMarker);
-			while ( ip < code.length ) {
-				int c = input.LA(1);
-				trace(ip);
-				short opcode = code[ip];
-				ip++; // move to next instruction or first byte of operand
-				switch (opcode) {
-					case Bytecode.MATCH8 :
-						if ( c != code[ip] ) continue workLoop;
-						ip++;
-						input.consume();
-						break;
-					case Bytecode.MATCH16 :
-						if ( c != getShort(code, ip) ) continue workLoop;
-						ip += 2;
-						input.consume();
-						break;
-					case Bytecode.RANGE8 :
-						if ( c<code[ip] || c>code[ip+1] ) continue workLoop;
-						ip += 2;
-						input.consume();
-						break;
-					case Bytecode.RANGE16 :
-						if ( c<getShort(code, ip) || c>getShort(code, ip+2) ) continue workLoop;
-						ip += 4;
-						input.consume();
-						break;
-					case Bytecode.ACCEPT :
-						int ruleIndex = getShort(code, ip);
-						ip += 2;
-						System.out.println("accept "+ruleIndex);
-						// returning gives first match not longest; i.e., like PEG
-						return ruleIndex;
-					case Bytecode.JMP :
-						int target = getShort(code, ip);
-						ip = target;
-						continue;
-					case Bytecode.SPLIT :
-						int nopnds = getShort(code, ip);
-						ip += 2;
-						// add split addresses to work queue in reverse order ('cept first one)
-						for (int i=nopnds-1; i>=1; i--) {
-							int addr = getShort(code, ip+i*2);
-							//System.out.println("try alt "+i+" at "+addr);
-							work.add(new Context(addr, input.mark()));
-						}
-						// try first alternative (w/o adding to work list)
-						int addr = getShort(code, ip);
-						ip = addr;
-						//System.out.println("try alt "+nopnds+" at "+addr);
-						continue;
-					default :
-						throw new RuntimeException("invalid instruction @ "+ip+": "+opcode);
-				}
-			}
-		}
-		return 0;
-	}
 */
 	
 }
--- a/tool/src/org/antlr/v4/codegen/DFACompiler.java
+++ b/tool/src/org/antlr/v4/codegen/DFACompiler.java
@ -4,6 +4,7 @@ import org.antlr.v4.automata.DFA;
 import org.antlr.v4.automata.DFAState;
 import org.antlr.v4.automata.Edge;
 import org.antlr.v4.codegen.pda.*;
+import org.antlr.v4.runtime.pda.Bytecode;

 /** */
 public class DFACompiler {
@ -20,6 +21,8 @@ public class DFACompiler {
 	public CompiledPDA compile() {
 		walk();
 		gen.compile();
+		System.out.println("DFA: ");
+		System.out.println(Bytecode.disassemble(gen.obj.code,false));		
 		return gen.obj;
 	}

@ -30,7 +33,6 @@ public class DFACompiler {

 		// walk code, update jump targets.
 		for (Instr I : gen.obj.instrs) {
-			System.out.println("instr "+I);
 			if ( I instanceof JumpInstr) {
 				JumpInstr J = (JumpInstr)I;
 				J.target = stateToAddr[J.target];
--- a/tool/test/org/antlr/v4/test/BaseTest.java
+++ b/tool/test/org/antlr/v4/test/BaseTest.java
@ -28,16 +28,17 @@
 package org.antlr.v4.test;


-import org.antlr.runtime.CommonTokenStream;
-import org.antlr.runtime.RecognitionException;
-import org.antlr.runtime.Token;
-import org.antlr.runtime.TokenSource;
+import org.antlr.runtime.*;
 import org.antlr.v4.Tool;
 import org.antlr.v4.analysis.DFAMinimizer;
 import org.antlr.v4.analysis.LexerNFAToDFAConverter;
 import org.antlr.v4.analysis.PredictionDFAFactory;
+import org.antlr.v4.automata.DFA;
 import org.antlr.v4.automata.*;
+import org.antlr.v4.codegen.CompiledPDA;
+import org.antlr.v4.codegen.LexerCompiler;
 import org.antlr.v4.misc.Utils;
+import org.antlr.v4.runtime.pda.PDA;
 import org.antlr.v4.semantics.SemanticPipeline;
 import org.antlr.v4.tool.*;
 import org.junit.After;
@ -133,6 +134,37 @@ public abstract class BaseTest {
 		dfa.minimized = dmin.minimize();
 	}

+	PDA getLexerPDA(LexerGrammar g) {
+		NFA nfa = createNFA(g);
+
+		LexerCompiler comp = new LexerCompiler(g);
+		CompiledPDA obj = comp.compileMode(LexerGrammar.DEFAULT_MODE_NAME);
+		PDA PDA = new PDA(obj.code, obj.altToAddr, obj.nLabels);
+		return PDA;
+	}
+
+	List<Integer> getTypesFromString(Grammar g, String expecting) {
+		List<Integer> expectingTokenTypes = new ArrayList<Integer>();
+		if ( expecting!=null && !expecting.trim().equals("") ) {
+			for (String tname : expecting.replace(" ", "").split(",")) {
+				int ttype = g.getTokenType(tname);
+				expectingTokenTypes.add(ttype);
+			}
+		}
+		return expectingTokenTypes;
+	}
+
+	List<Integer> getTokenTypes(String input, PDA lexerPDA) {
+		ANTLRStringStream in = new ANTLRStringStream(input);
+		List<Integer> tokenTypes = new ArrayList<Integer>();
+		int ttype = 0;
+		do {
+			ttype = lexerPDA.execThompson(in);
+			tokenTypes.add(ttype);
+		} while ( ttype!= Token.EOF );
+		return tokenTypes;
+	}
+	
 	List<Message> checkRuleDFA(String gtext, String ruleName, String expecting)
 		throws Exception
 	{
--- a/tool/test/org/antlr/v4/test/TestDFAInterp.java
+++ b/tool/test/org/antlr/v4/test/TestDFAInterp.java
@ -0,0 +1,98 @@
+package org.antlr.v4.test;
+
+import org.antlr.runtime.ANTLRStringStream;
+import org.antlr.runtime.CharStream;
+import org.antlr.runtime.CommonTokenStream;
+import org.antlr.runtime.Token;
+import org.antlr.v4.automata.DFA;
+import org.antlr.v4.automata.DecisionState;
+import org.antlr.v4.automata.NFA;
+import org.antlr.v4.codegen.CompiledPDA;
+import org.antlr.v4.codegen.DFACompiler;
+import org.antlr.v4.runtime.Lexer;
+import org.antlr.v4.runtime.pda.PDA;
+import org.antlr.v4.tool.Grammar;
+import org.antlr.v4.tool.LexerGrammar;
+import org.junit.Test;
+
+import java.util.List;
+
+/** */
+public class TestDFAInterp extends BaseTest {
+
+	public static class InterpLexer extends Lexer {
+		public InterpLexer(CharStream input, PDA pda) {
+			super(input);
+			modeToPDA = new PDA[] { pda };
+		}
+	}
+
+	@Test public void testSimpleLL1Decision() throws Exception {
+		LexerGrammar lg = new LexerGrammar(
+			"lexer grammar L;\n" +
+			"ID  : 'a'..'z'+ ;\n" +
+			"INT : '0'..'9'+ ;\n");
+
+		Grammar g = new Grammar(
+			"parser grammar P;\n" +
+			"a : ID | INT ;\n"
+		);
+		int expecting = 1;
+		checkDFAMatches(g, lg, 0, "ab", expecting);
+
+		expecting = 2;
+		checkDFAMatches(g, lg, 0, "32", expecting);
+	}
+
+	@Test public void testArbCommonPrefix() throws Exception {
+		LexerGrammar lg = new LexerGrammar(
+			"lexer grammar L;\n" +
+			"SEMI  : ';' ;\n" +
+			"DOT   : '.' ;\n" +
+			"WS    : ' ' ;\n" +
+			"ID    : 'a'..'z'+ ;\n" +
+			"INT  : '0'..'9'+ ;\n");
+
+		Grammar g = new Grammar(
+			"parser grammar P;\n" +
+			"tokens { WS; }\n" +
+			"a : ID+ SEMI\n" +
+			"  | ID+ DOT\n" +
+			"  ;\n"
+		);
+		int expecting = 1;
+		checkDFAMatches(g, lg, 2, "a b c ;", expecting);
+
+		expecting = 2;
+		checkDFAMatches(g, lg, 2, "a b c .", expecting);
+	}
+
+	int interp(Grammar g, LexerGrammar lg, int decision, String input) {
+		NFA nfa = createNFA(g);
+		DecisionState blk = nfa.decisionToNFAState.get(decision);
+		DFA dfa = createDFA(g, blk);
+		DFACompiler comp = new DFACompiler(dfa);
+		CompiledPDA obj = comp.compile();
+		PDA pda = new PDA(obj.code, obj.altToAddr, obj.nLabels);
+
+		lg.importVocab(g);
+		PDA lexerPDA = getLexerPDA(lg);
+		Lexer lexer = new InterpLexer(new ANTLRStringStream(input), lexerPDA);
+
+		CommonTokenStream tokens = new CommonTokenStream(lexer);
+		tokens.fill();
+		List<Token> list = tokens.getTokens();
+		for (Token t : list) {// hide WS
+			if ( t.getType()==g.getTokenType("WS") ) t.setChannel(Token.HIDDEN_CHANNEL);
+		}
+		System.out.println("tokens="+ list);
+		int alt = pda.execNoRecursion(tokens, 0);
+		return alt;
+	}
+
+	void checkDFAMatches(Grammar g, LexerGrammar lg, int decision,
+						 String input, int expecting) {
+		int result = interp(g, lg, decision, input);
+		assertEquals(expecting, result);
+	}
+}
--- a/tool/test/org/antlr/v4/test/TestDFAtoPDABytecodeGeneration.java
+++ b/tool/test/org/antlr/v4/test/TestDFAtoPDABytecodeGeneration.java
@ -2,6 +2,7 @@ package org.antlr.v4.test;

 import org.antlr.v4.automata.DFA;
 import org.antlr.v4.automata.DecisionState;
+import org.antlr.v4.automata.Edge;
 import org.antlr.v4.automata.NFA;
 import org.antlr.v4.codegen.CompiledPDA;
 import org.antlr.v4.codegen.DFACompiler;
@ -21,24 +22,52 @@ public class TestDFAtoPDABytecodeGeneration extends BaseTest {
 			"0007:\tset           0\n" +
 			"0010:\tjmp           13\n" +
 			"0013:\taccept        1\n" +
-			"0016:\tmatch8        5\n" +
+			"0016:\tmatch8        4\n" +
 			"0018:\tjmp           21\n" +
 			"0021:\taccept        2\n";
 		checkBytecode(g, 0, expecting);
 	}

+	@Test public void testAorBToSameState() throws Exception {
+		Grammar g = new Grammar(
+			"parser grammar T;\n"+
+			"a : A | B ;");
+		String expecting =
+			"0000:\tsplit         7, 15\n" +
+			"0007:\tmatch8        4\n" +
+			"0009:\tjmp           12\n" +
+			"0012:\taccept        2\n" +
+			"0015:\tmatch8        5\n" +
+			"0017:\tjmp           12\n";
+
+		NFA nfa = createNFA(g);
+		DecisionState blk = nfa.decisionToNFAState.get(0);
+		DFA dfa = createDFA(g, blk);
+
+		// make S0 go to S1 on both A and B (pinch alts back to single state)
+		Edge e0 = dfa.states.get(0).edge(0);
+		Edge e1 = dfa.states.get(0).edge(1);
+		e0.target = e1.target;
+		System.out.print("altered DFA="+dfa);
+
+		DFACompiler comp = new DFACompiler(dfa);
+		CompiledPDA obj = comp.compile();
+		PDA pda = new PDA(obj.code, obj.altToAddr, obj.nLabels);
+		assertEquals(expecting, Bytecode.disassemble(pda.code, false));
+	}
+
 	@Test public void testAorB() throws Exception {
 		Grammar g = new Grammar(
 			"parser grammar T;\n"+
 			"a : A | B ;");
 		String expecting =
 			"0000:\tsplit         7, 15\n" +
-			"0007:\tmatch8        5\n" +
+			"0007:\tmatch8        4\n" +
 			"0009:\tjmp           12\n" +
-			"0012:\taccept        2\n" +
-			"0015:\tmatch8        4\n" +
+			"0012:\taccept        1\n" +
+			"0015:\tmatch8        5\n" +
 			"0017:\tjmp           20\n" +
-			"0020:\taccept        1\n";
+			"0020:\taccept        2\n";
 		checkBytecode(g, 0, expecting);
 	}

@ -82,10 +111,6 @@ public class TestDFAtoPDABytecodeGeneration extends BaseTest {
 		NFA nfa = createNFA(g);
 		DecisionState blk = nfa.decisionToNFAState.get(decision);
 		DFA dfa = createDFA(g, blk);
-//		Edge e0 = dfa.states.get(1).edge(0);
-//		Edge e1 = dfa.states.get(1).edge(1);
-//		e0.target = e1.target;
-//		System.out.print("altered DFA="+dfa);
 		DFACompiler comp = new DFACompiler(dfa);
 		CompiledPDA obj = comp.compile();
 		PDA pda = new PDA(obj.code, obj.altToAddr, obj.nLabels);
--- a/tool/test/org/antlr/v4/test/TestPDABytecodeInterp.java
+++ b/tool/test/org/antlr/v4/test/TestPDABytecodeInterp.java
@ -1,13 +1,7 @@
 package org.antlr.v4.test;

 import org.antlr.runtime.ANTLRStringStream;
-import org.antlr.runtime.Token;
-import org.antlr.v4.Tool;
-import org.antlr.v4.codegen.CompiledPDA;
-import org.antlr.v4.codegen.LexerCompiler;
 import org.antlr.v4.runtime.pda.PDA;
-import org.antlr.v4.semantics.SemanticPipeline;
-import org.antlr.v4.tool.Grammar;
 import org.antlr.v4.tool.LexerGrammar;
 import org.junit.Test;

@ -191,74 +185,41 @@ public class TestPDABytecodeInterp extends BaseTest {
 	}

 	void checkMatches(LexerGrammar g, String input, String expecting) {
-		if ( g.ast!=null && !g.ast.hasErrors ) {
-			System.out.println(g.ast.toStringTree());
-			Tool antlr = new Tool();
-			SemanticPipeline sem = new SemanticPipeline(g);
-			sem.process();
-			if ( g.getImportedGrammars()!=null ) { // process imported grammars (if any)
-				for (Grammar imp : g.getImportedGrammars()) {
-					antlr.process(imp);
-				}
-			}
-		}
+		PDA pda = getLexerPDA(g);

-		List<Integer> expectingTokenTypes = new ArrayList<Integer>();
-		if ( expecting!=null && !expecting.trim().equals("") ) {
-			for (String tname : expecting.replace(" ", "").split(",")) {
-				int ttype = g.getTokenType(tname);
-				expectingTokenTypes.add(ttype);
-			}
-		}
+		List<Integer> expectingTokenTypes = getTypesFromString(g, expecting);

-		LexerCompiler comp = new LexerCompiler(g);
-		CompiledPDA obj = comp.compileMode(LexerGrammar.DEFAULT_MODE_NAME);
-		PDA PDA = new PDA(obj.code, obj.altToAddr, obj.nLabels);
-
-		ANTLRStringStream in = new ANTLRStringStream(input);
-		List<Integer> tokenTypes = new ArrayList<Integer>();
-		int ttype = 0;
-		do {
-			ttype = PDA.execThompson(in);
-			tokenTypes.add(ttype);
-		} while ( ttype!= Token.EOF );
+		List<Integer> tokenTypes = getTokenTypes(input, pda);
 		assertEquals(expectingTokenTypes, tokenTypes);
 	}

 	void checkLabels(LexerGrammar g, String input, String expecting,
 					  String expectingTokens)
 	{
-		if ( g.ast!=null && !g.ast.hasErrors ) {
-			System.out.println(g.ast.toStringTree());
-			Tool antlr = new Tool();
-			SemanticPipeline sem = new SemanticPipeline(g);
-			sem.process();
-			if ( g.getImportedGrammars()!=null ) { // process imported grammars (if any)
-				for (Grammar imp : g.getImportedGrammars()) {
-					antlr.process(imp);
-				}
-			}
-		}
-
-		List<Integer> expectingTokenTypes = new ArrayList<Integer>();
-		if ( expecting!=null && !expecting.trim().equals("") ) {
-			for (String tname : expecting.replace(" ", "").split(",")) {
-				int ttype = g.getTokenType(tname);
-				expectingTokenTypes.add(ttype);
-			}
-		}
-
-		LexerCompiler comp = new LexerCompiler(g);
-		CompiledPDA obj = comp.compileMode(LexerGrammar.DEFAULT_MODE_NAME);
-		PDA PDA = new PDA(obj.code, obj.altToAddr, obj.nLabels);
+		PDA pda = getLexerPDA(g);
+		List<Integer> expectingTokenTypes = getTypesFromString(g, expecting);
 		ANTLRStringStream in = new ANTLRStringStream(input);
 		List<Integer> tokenTypes = new ArrayList<Integer>();
-		int ttype = PDA.execThompson(in);
+		int ttype = pda.execThompson(in);
 		tokenTypes.add(ttype);
 		assertEquals(expectingTokenTypes, tokenTypes);

 		if ( expectingTokens!=null ) {
-			assertEquals(expectingTokens, Arrays.toString(PDA.labelValues));
+			assertEquals(expectingTokens, Arrays.toString(pda.labelValues));
 		}
 	}
+
+	
+
+//	List<Token> getTokens(String input, PDA lexerPDA) {
+//		ANTLRStringStream in = new ANTLRStringStream(input);
+//		List<Token> tokens = new ArrayList<Token>();
+//		int ttype = 0;
+//		do {
+//			ttype = lexerPDA.execThompson(in);
+//			tokens.add(new CommonToken(ttype,""));
+//		} while ( ttype!= Token.EOF );
+//		return tokens;
+//	}
+
 }