added in NFA VM prototype

[git-p4: depot-paths = "//depot/code/antlr4/main/": change = 6820]
2010-04-22 13:07:16 -08:00 · 2010-04-22 13:07:16 -08:00 · 3015778202
parent 41c0225adf
commit 3015778202
8 changed files with 2476 additions and 2 deletions
--- a/runtime/Java/src/org/antlr/v4/runtime/nfa/Bytecode.java
+++ b/runtime/Java/src/org/antlr/v4/runtime/nfa/Bytecode.java
@ -0,0 +1,125 @@
+package org.antlr.v4.runtime.nfa;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Instruction set of the NFA virtual machine plus a disassembler for it.
+ *
+ * <p>A program is a plain byte[]: each instruction is a one-byte opcode
+ * followed by its operands, each {@link #OPND_SIZE_IN_BYTES} bytes wide and
+ * stored high byte first (see {@link #getShort}).
+ */
+public class Bytecode {
+	public static final int MAX_OPNDS = 3; // Or single opnd indicating variable number
+	public static final int OPND_SIZE_IN_BYTES = 2;
+	public enum OperandType { NONE, CHAR, ADDR, INT, VARARGS }
+
+	/** Static description of one opcode: its mnemonic and operand types. */
+	public static class Instruction {
+		String name; // E.g., "load_str", "new"
+		OperandType[] type = new OperandType[MAX_OPNDS];
+		int n = 0; // number of operand slots in use
+		public Instruction(String name) {
+			this(name,OperandType.NONE,OperandType.NONE,OperandType.NONE); n=0;
+		}
+		public Instruction(String name, OperandType a) {
+			this(name,a,OperandType.NONE,OperandType.NONE); n=1;
+		}
+		public Instruction(String name, OperandType a, OperandType b) {
+			this(name,a,b,OperandType.NONE); n=2;
+		}
+		public Instruction(String name, OperandType a, OperandType b, OperandType c) {
+			this.name = name;
+			type[0] = a;
+			type[1] = b;
+			type[2] = c;
+			n = MAX_OPNDS; // the shorter constructors overwrite this after delegating
+		}
+	}
+
+	// don't use enum for efficiency; don't want code block to
+	// be an array of objects (Bytecode[]). We want it to be byte[].
+
+	// INSTRUCTION BYTECODES (byte is signed; use a short to keep 0..255)
+	public static final short ACCEPT	= 1;
+	public static final short JMP		= 2;
+	public static final short SPLIT		= 3;
+	public static final short MATCH		= 4;
+	public static final short RANGE		= 5;
+
+	/** Used for disassembly; describes instruction set */
+	public static Instruction[] instructions = new Instruction[] {
+		null, // <INVALID>
+		new Instruction("accept", OperandType.INT), // index is the opcode
+		new Instruction("jmp", OperandType.ADDR),
+		new Instruction("split", OperandType.VARARGS),
+		new Instruction("match", OperandType.CHAR),
+		new Instruction("range", OperandType.CHAR, OperandType.CHAR)
+	};
+
+	/** Disassemble a whole program, one instruction per line. */
+	public static String disassemble(byte[] code) {
+		StringBuilder buf = new StringBuilder();
+		int i=0;
+		while (i<code.length) {
+			i = disassembleInstruction(buf, code, i);
+			buf.append('\n');
+		}
+		return buf.toString();
+	}
+
+	/** Disassemble the single instruction starting at address ip. */
+	public static String disassembleInstruction(byte[] code, int ip) {
+		StringBuilder buf = new StringBuilder();
+		disassembleInstruction(buf, code, ip);
+		return buf.toString();
+	}
+
+	/**
+	 * Append the text form of the instruction at address ip to buf.
+	 *
+	 * @return the address of the byte following the instruction
+	 * @throws IllegalArgumentException if ip is outside code or the byte at
+	 *         ip is not a known opcode
+	 */
+	public static int disassembleInstruction(StringBuilder buf, byte[] code, int ip) {
+		// Validate the address BEFORE touching code[ip]; otherwise the check
+		// is dead code and an ArrayIndexOutOfBoundsException escapes instead.
+		if ( ip<0 || ip>=code.length ) {
+			throw new IllegalArgumentException("ip out of range: "+ip);
+		}
+		int opcode = code[ip] & 0xFF; // bytes are signed; opcodes are 0..255
+		Bytecode.Instruction I =
+			opcode<Bytecode.instructions.length ? Bytecode.instructions[opcode] : null;
+		if ( I==null ) {
+			throw new IllegalArgumentException("no such instruction "+opcode+
+				" at address "+ip);
+		}
+		String instrName = I.name;
+		buf.append( String.format("%04d:\t%-14s", ip, instrName) );
+		ip++;
+		if ( I.n==0 ) {
+			buf.append("  ");
+			return ip;
+		}
+		List<String> operands = new ArrayList<String>();
+		for (int i=0; i<I.n; i++) {
+			int opnd = getShort(code, ip);
+			ip += Bytecode.OPND_SIZE_IN_BYTES;
+			switch ( I.type[i] ) {
+				case CHAR :
+					operands.add("'"+(char)opnd+"'");
+					break;
+				case VARARGS : // get n (opnd) operands
+					int n = opnd;
+					// operands.add(String.valueOf(n)); don't show n in varargs
+					for (int j=0; j<n; j++) {
+						operands.add(String.valueOf(getShort(code, ip)));
+						ip += OPND_SIZE_IN_BYTES;
+					}
+					break;
+				case INT :
+				case ADDR :
+				default:
+					operands.add(String.valueOf(opnd));
+					break;
+			}
+		}
+		for (int i = 0; i < operands.size(); i++) {
+			String s = operands.get(i);
+			if ( i>0 ) buf.append(", ");
+			buf.append( s );
+		}
+		return ip;
+	}
+
+	/** Read the unsigned 16-bit value stored high byte first at index. */
+	public static int getShort(byte[] memory, int index) {
+		int b1 = memory[index++]&0xFF; // mask off sign-extended bits
+		int b2 = memory[index++]&0xFF;
+		int word = b1<<(8*1) | b2;
+		return word;
+	}
+}
--- a/runtime/Java/src/org/antlr/v4/runtime/nfa/Interpreter.java
+++ b/runtime/Java/src/org/antlr/v4/runtime/nfa/Interpreter.java
@ -0,0 +1,98 @@
+package org.antlr.v4.runtime.nfa;
+
+import org.antlr.runtime.CharStream;
+
+/**
+ * Recursive backtracking interpreter for the NFA bytecode described by
+ * {@link Bytecode}; modeled on http://swtch.com/~rsc/regexp/regexp2.html
+ */
+public class Interpreter {
+	byte[] code;
+	public Interpreter(byte[] code) { this.code = code; }
+
+	/*
+	for(;;){
+        switch(pc->opcode){
+        case Char:
+            if(*sp != pc->c)
+                return 0;
+            pc++;
+            sp++;
+            continue;
+        case Match:
+            return 1;
+        case Jmp:
+            pc = pc->x;
+            continue;
+        case Split:
+            if(recursiveloop(pc->x, sp))
+                return 1;
+            pc = pc->y;
+            continue;
+        }
+        assert(0);
+        return -1; 
+    }
+	 */
+	/**
+	 * Run the program starting at address ip against input.
+	 *
+	 * @return the operand of the ACCEPT instruction reached, or 0 when the
+	 *         input does not match or execution runs off the end of the code.
+	 *         SPLIT treats only results greater than 0 as success, so an
+	 *         ACCEPT operand of 0 cannot be told apart from failure.
+	 */
+	public int exec(CharStream input, int ip) {
+		while ( ip < code.length ) {
+			int c = input.LA(1);
+			trace(ip);
+			int opcode = code[ip] & 0xFF; // bytes are signed; opcodes are 0..255
+			ip++; // move to next instruction or first byte of operand
+			switch (opcode) {
+				case Bytecode.MATCH :
+					int o = getShort(code, ip);
+					ip += Bytecode.OPND_SIZE_IN_BYTES;
+					if ( c != o ) return 0;
+					input.consume();
+					break;
+				case Bytecode.RANGE :
+					int from = getShort(code, ip);
+					ip += Bytecode.OPND_SIZE_IN_BYTES;
+					int to = getShort(code, ip);
+					ip += Bytecode.OPND_SIZE_IN_BYTES;
+					if ( c<from || c>to ) return 0;
+					input.consume();
+					break;
+				case Bytecode.ACCEPT :
+					int ruleIndex = getShort(code, ip);
+					ip += Bytecode.OPND_SIZE_IN_BYTES;
+					System.out.println("accept "+ruleIndex);
+					return ruleIndex;
+				case Bytecode.JMP :
+					int target = getShort(code, ip);
+					ip = target;
+					continue;
+				case Bytecode.SPLIT :
+					int nopnds = getShort(code, ip);
+					ip += Bytecode.OPND_SIZE_IN_BYTES;
+					// try every alternative but the last recursively, restoring
+					// the input position after each failed attempt
+					for (int i=1; i<=nopnds-1; i++) {
+						int addr = getShort(code, ip);
+						ip += Bytecode.OPND_SIZE_IN_BYTES;
+						System.out.println("try alt "+i+" at "+addr);
+						int m = input.mark();
+						int r = exec(input, addr);
+						if ( r>0 ) { input.release(m); return r; }
+						input.rewind(m);
+					}
+					// try final alternative (w/o recursion)
+					int addr = getShort(code, ip);
+					ip = addr;
+					System.out.println("try alt "+nopnds+" at "+addr);
+					continue;
+				default :
+					// ip already points past the opcode; report the opcode's own address
+					throw new RuntimeException("invalid instruction @ "+(ip-1)+": "+opcode);
+			}
+		}
+		return 0;
+	}
+
+	/** Print the disassembly of the instruction at ip to stdout. */
+	void trace(int ip) {
+		String instr = Bytecode.disassembleInstruction(code, ip);
+		System.out.println(instr);
+	}
+
+	/** Read the unsigned 16-bit operand stored high byte first at index. */
+	public static int getShort(byte[] memory, int index) {
+		// single decoding routine shared with the disassembler
+		return Bytecode.getShort(memory, index);
+	}
+}
--- a/tool/src/org/antlr/v4/analysis/AnalysisPipeline.java
+++ b/tool/src/org/antlr/v4/analysis/AnalysisPipeline.java
@ -20,8 +20,10 @@ public class AnalysisPipeline {
 		if ( lr.listOfRecursiveCycles.size()>0 ) return; // bail out

 		// BUILD DFA FOR EACH DECISION
-		if ( g.isLexer() ) processLexer();
-		else processParserOrTreeParser();
+//		if ( g.isLexer() ) processLexer();
+//		else processParserOrTreeParser();
+		// TODO: don't do lexers for now; we can add lookahead analysis to help with NFA simulation later
+		if ( !g.isLexer() ) processParserOrTreeParser();
 	}

 	void processLexer() {
--- a/tool/src/org/antlr/v4/codegen/NFABytecodeGenerator.java
+++ b/tool/src/org/antlr/v4/codegen/NFABytecodeGenerator.java
@ -0,0 +1,180 @@
+package org.antlr.v4.codegen;
+
+import org.antlr.runtime.RecognizerSharedState;
+import org.antlr.runtime.Token;
+import org.antlr.runtime.tree.TreeNodeStream;
+import org.antlr.v4.runtime.nfa.Bytecode;
+import org.antlr.v4.runtime.tree.TreeParser;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Base class of the NFABytecodeTriggers tree grammar: collects NFA VM
+ * instructions as the grammar walks a rule's AST and serializes them to a
+ * byte[] program. See http://swtch.com/~rsc/regexp/regexp2.html
+ */
+public class NFABytecodeGenerator extends TreeParser {
+	/** One instruction awaiting serialization; addr is assigned by emit(). */
+	public abstract static class Instr {
+		public short opcode;
+		public int addr;
+		public int nBytes; // encoded size: opcode byte plus operands
+		public Instr(short opcode, int nBytes) { this.opcode = opcode; this.nBytes = nBytes; }
+		/** Write the opcode byte at addr; subclasses append their operands. */
+		public void write(byte[] code) { code[addr] = (byte)opcode;	}
+	}
+
+	/** match c */
+	public static class MatchInstr extends Instr {
+		Token token;
+		char c;
+		public MatchInstr(Token t, char c) {
+			super(Bytecode.MATCH, 1+Bytecode.OPND_SIZE_IN_BYTES);
+			this.token = t;
+			this.c = c;
+		}
+		@Override
+		public void write(byte[] code) {
+			super.write(code);
+			writeShort(code, addr+1, (short)c);
+		}
+
+		@Override
+		public String toString() {
+			return addr+":MatchInstr{" +
+				   "c=" + c +
+				   '}';
+		}
+	}
+
+	/** range start, stop */
+	public static class RangeInstr extends Instr {
+		Token a, b;
+		char start, stop;
+		public RangeInstr(Token a, Token b) {
+			super(Bytecode.RANGE, 1+2*Bytecode.OPND_SIZE_IN_BYTES);
+			this.a = a;
+			this.b = b;
+			start = (char)Target.getCharValueFromGrammarCharLiteral(a.getText());
+			stop = (char)Target.getCharValueFromGrammarCharLiteral(b.getText());
+		}
+		@Override
+		public void write(byte[] code) {
+			super.write(code);
+			writeShort(code, addr+1, (short)start);
+			writeShort(code, addr+1+Bytecode.OPND_SIZE_IN_BYTES, (short)stop);
+		}
+
+		@Override
+		public String toString() {
+			return addr+":RangeInstr{"+start+".."+stop+"}";
+		}
+	}
+
+	/** accept ruleIndex */
+	public static class AcceptInstr extends Instr {
+		int ruleIndex;
+		public AcceptInstr(int ruleIndex) {
+			super(Bytecode.ACCEPT, 1+Bytecode.OPND_SIZE_IN_BYTES);
+			this.ruleIndex = ruleIndex;
+		}
+		@Override
+		public void write(byte[] code) {
+			super.write(code);
+			writeShort(code, addr+1, (short)ruleIndex);
+		}
+		@Override
+		public String toString() { return addr+":AcceptInstr "+ruleIndex; }
+	}
+
+	/** jmp target; target is filled in by the grammar once it is known. */
+	public static class JumpInstr extends Instr {
+		int target;
+		public JumpInstr() { super(Bytecode.JMP, 1+Bytecode.OPND_SIZE_IN_BYTES); }
+		@Override
+		public void write(byte[] code) {
+			super.write(code);
+			writeShort(code, addr+1, (short)target);
+		}
+
+		@Override
+		public String toString() {
+			return addr+":JumpInstr{" +
+				   "target=" + target +
+				   '}';
+		}
+	}
+
+	/** split n, addr1..addrn; encoded as a count followed by n addresses. */
+	public static class SplitInstr extends Instr {
+		List<Integer> addrs = new ArrayList<Integer>();
+		public SplitInstr(int nAlts) {
+			// opcode + operand count + one address per alternative
+			super(Bytecode.SPLIT, 1+Bytecode.OPND_SIZE_IN_BYTES*(1+nAlts));
+		}
+		@Override
+		public void write(byte[] code) {
+			super.write(code);
+			int a = addr + 1;
+			writeShort(code, a, (short)addrs.size());
+			a += Bytecode.OPND_SIZE_IN_BYTES;
+			for (int x : addrs) {
+				writeShort(code, a, (short)x);
+				a += Bytecode.OPND_SIZE_IN_BYTES;
+			}
+		}
+
+		@Override
+		public String toString() {
+			return addr+":SplitInstr{" +
+				   "addrs=" + addrs +
+				   '}';
+		}
+	}
+
+	public List<Instr> instrs = new ArrayList<Instr>();
+	public int ip = 0; // where to write next
+
+	public NFABytecodeGenerator(TreeNodeStream input) {
+		super(input);
+	}
+
+	public NFABytecodeGenerator(TreeNodeStream input, RecognizerSharedState state) {
+		super(input, state);
+	}
+
+	/** Assign I the next free address, advance ip past it and record it. */
+	public void emit(Instr I) {
+		I.addr = ip;
+		ip += I.nBytes;
+		instrs.add(I);
+	}
+
+	/** Emit one match instruction per character of string literal t. */
+	public void emitString(Token t) {
+		String chars = Target.getStringFromGrammarStringLiteral(t.getText());
+		for (char c : chars.toCharArray()) {
+			emit(new MatchInstr(t, c));
+		}
+	}
+
+	/**
+	 * Serialize every emitted instruction into a fresh byte array.
+	 *
+	 * @return the program; an empty array when nothing has been emitted
+	 */
+	public byte[] getCode() {
+		if ( instrs.isEmpty() ) return new byte[0]; // nothing emitted yet
+		Instr last = instrs.get(instrs.size() - 1);
+		int size = last.addr + last.nBytes;
+		byte[] code = new byte[size];
+		for (Instr I : instrs) {
+			I.write(code);
+		}
+		return code;
+	}
+
+	/** Write value at index into a byte array highest to lowest byte,
+	 *  left to right.
+	 */
+	public static void writeShort(byte[] memory, int index, short value) {
+		memory[index+0] = (byte)((value>>(8*1))&0xFF);
+		memory[index+1] = (byte)(value&0xFF);
+	}
+
+	/* CODE TO GENERATE NFA BYTECODES
+			// testing code gen concept
+			GrammarASTAdaptor adaptor = new GrammarASTAdaptor();
+			for (Rule r : lg.modes.get(modeName)) {
+				GrammarAST blk = (GrammarAST)r.ast.getFirstChildWithType(ANTLRParser.BLOCK);
+				CommonTreeNodeStream nodes = new CommonTreeNodeStream(adaptor,blk);
+				NFABytecodeTriggers gen = new NFABytecodeTriggers(nodes);
+				try {
+					gen.block();
+					gen.emit(new NFABytecodeGenerator.AcceptInstr(r.index));
+					System.out.println("code=\n"+gen.instrs);
+					byte[] code = gen.getCode();
+					System.out.println(Bytecode.disassemble(code));
+					Interpreter in = new Interpreter(code);
+					String s = "i";
+					ANTLRStringStream input = new ANTLRStringStream(s);
+					int rule = in.exec(input, 0);
+					System.out.println(s+" matched rule "+rule+" leaving off at index="+input.index());
+				}
+				catch (Exception e){
+					e.printStackTrace(System.err);
+				}
+			}
+
+	 */
+
+}
--- a/tool/src/org/antlr/v4/codegen/NFABytecodeTriggers.g
+++ b/tool/src/org/antlr/v4/codegen/NFABytecodeTriggers.g
@ -0,0 +1,165 @@
+tree grammar NFABytecodeTriggers;
+options {
+	language     = Java;
+	tokenVocab   = ANTLRParser;
+	ASTLabelType = GrammarAST;
+	superClass   = NFABytecodeGenerator;
+}
+
+@header {
+package org.antlr.v4.codegen;
+import org.antlr.v4.tool.GrammarAST;
+}
+
+/*
+e1 | e2 | e3:
+	split 3, L1, L2, L3
+L1:	e1
+	jmp END
+L2:	e2
+	jmp END
+L3:	e3
+END:
+
+A block with a single alternative emits no split and no jumps.
+Jump targets are unknown while the alternatives are being walked, so the
+jumps are collected and patched to END after the last alternative.
+*/
+block
+    :	^(	BLOCK (^(OPTIONS .+))?
+    		{
+    		// alternatives are the children of BLOCK from the current node on
+    		// (any OPTIONS subtree has already been matched)
+    		GrammarAST firstAlt = (GrammarAST)input.LT(1);
+    		int i = firstAlt.getChildIndex();    		
+			int nAlts = $start.getChildCount() - i;
+    		System.out.println("alts "+nAlts);
+    		List<JumpInstr> jumps = new ArrayList<JumpInstr>();
+    		SplitInstr S = null;
+    		if ( nAlts>1 ) {
+	    		S = new SplitInstr(nAlts);
+	    		emit(S);
+	    		S.addrs.add(ip); // first alternative starts right after the split
+    		}
+    		int alt = 1;
+    		}
+    		(	alternative
+    			{
+    			// every alternative but the last ends with a jump to END;
+    			// the next alternative starts right after that jump
+    			if ( alt < nAlts ) {
+	    			JumpInstr J = new JumpInstr();
+	    			jumps.add(J);
+	    			emit(J);
+	    			S.addrs.add(ip);
+    			}
+    			alt++;
+    			}
+    		)+
+    		{
+    		int END = ip;
+    		for (JumpInstr J : jumps) J.target = END;
+    		}
+    	)
+    ;
+
+// Walks one alternative. No instruction is emitted here; any code comes
+// from the nested element rules. EPSILON alternatives emit nothing.
+alternative
+    :	^(ALT_REWRITE a=alternative .)	
+    |	^(ALT EPSILON)					
+    |   ^(ALT (e=element )+)    									
+    ;
+
+// Dispatches on the kind of element. ACTION, SEMPRED and GATED_SEMPRED
+// nodes are matched but have no action attached, so they emit no bytecode.
+element
+	:	labeledElement				
+	|	atom						
+	|	ebnf						
+	|   ACTION						
+	|   SEMPRED						
+	|	GATED_SEMPRED				
+	|	treeSpec					
+	;
+	
+// Labels (x=..., x+=...) have no action here; only the labeled atom or
+// block contributes bytecode.
+labeledElement
+	:	^(ASSIGN ID atom)			
+	|	^(ASSIGN ID block)			
+	|	^(PLUS_ASSIGN ID atom)		
+	|	^(PLUS_ASSIGN ID block)		
+	;
+
+// Tree pattern: walks the child elements in order; no action of its own.
+treeSpec
+    : ^(TREE_BEGIN  (e=element )+)	
+    ;
+
+/*
+Code shapes emitted for the EBNF operators (derived from the actions below):
+
+e?:		split 2, L1, L2
+	L1:	e
+	L2:
+
+e*:	L0:	split 2, L1, L2
+	L1:	e
+		jmp L0
+	L2:
+
+e+:	L1:	e
+		split 2, L1, L2
+	L2:
+*/
+ebnf
+	:	^(astBlockSuffix block)		
+	|	{
+	   	SplitInstr S = new SplitInstr(2);
+		emit(S);
+   		S.addrs.add(ip); // enter the block
+		}
+		^(OPTIONAL block)			
+		{
+   		S.addrs.add(ip); // or skip past it
+		}
+	|	{
+		int start=ip; // address of the split; loop target
+	   	SplitInstr S = new SplitInstr(2);
+		emit(S);
+   		S.addrs.add(ip);
+		}
+		^(CLOSURE block)			
+		{
+	    JumpInstr J = new JumpInstr();
+	    emit(J);
+	    J.target = start;
+	    S.addrs.add(ip); // exit address, right after the back jump
+		}
+	|	{int start=ip;} ^(POSITIVE_CLOSURE block)
+		{
+		// split is emitted after the block: loop back to start or fall through
+   		SplitInstr S = new SplitInstr(2);
+		emit(S);
+		int stop = ip;
+   		S.addrs.add(start);
+   		S.addrs.add(stop);
+		}
+	| 	block						
+    ;
+
+// Suffix operators that may wrap a block; matched only, nothing emitted.
+astBlockSuffix
+    : ROOT
+    | IMPLIES
+    | BANG
+    ;
+
+// No actions here; bytecode comes only from the nested range and terminal
+// rules. WILDCARD and rule references currently emit nothing.
+atom
+	:	^(ROOT range)			
+	|	^(BANG range)			
+	|	^(ROOT notSet)			
+	|	^(BANG notSet)			
+	|	notSet					
+	|	range					
+	|	^(DOT ID terminal)		
+	|	^(DOT ID ruleref)		
+    |	^(WILDCARD .)			
+    |	WILDCARD				
+    |   terminal				
+    |   ruleref					
+    ;
+
+// NOTE(review): there is no action for NOT, so the nested terminal/block
+// emits its instructions as if it were not negated -- looks unimplemented;
+// confirm before relying on ~x in lexer rules.
+notSet
+    : ^(NOT terminal)		
+    | ^(NOT block)			
+    ;
+
+// Rule references are matched but emit no bytecode.
+ruleref
+    :	^(ROOT ^(RULE_REF ARG_ACTION?))	
+    |	^(BANG ^(RULE_REF ARG_ACTION?))	
+    |	^(RULE_REF ARG_ACTION?)			
+    ;
+
+// 'a'..'z' => a single range instruction built from the two literals.
+range
+    :	^(RANGE a=STRING_LITERAL b=STRING_LITERAL)
+    	{emit(new RangeInstr($a.token, $b.token));}
+    ;
+
+// String literals are passed to emitString() to generate their match
+// instructions. Token references are matched but emit nothing.
+terminal
+    :  ^(STRING_LITERAL .)			{emitString($STRING_LITERAL.token);}
+    |	STRING_LITERAL				{emitString($STRING_LITERAL.token);}
+    |	^(TOKEN_REF ARG_ACTION .)	
+    |	^(TOKEN_REF .)				
+    |	TOKEN_REF					
+    |	^(ROOT terminal)			
+    |	^(BANG terminal)			
+    ;
--- a/tool/src/org/antlr/v4/codegen/NFABytecodeTriggers.java
+++ b/tool/src/org/antlr/v4/codegen/NFABytecodeTriggers.java
--- a/tool/src/org/antlr/v4/codegen/NFABytecodeTriggers.tokens
+++ b/tool/src/org/antlr/v4/codegen/NFABytecodeTriggers.tokens
@ -0,0 +1,99 @@
+COMBINED=91
+LT=44
+STAR=49
+BACKTRACK_SEMPRED=96
+DOUBLE_ANGLE_STRING_LITERAL=11
+FORCED_ACTION=5
+ARGLIST=89
+ALTLIST=86
+NOT=61
+SEMPRED=4
+ACTION=16
+TOKEN_REF=63
+RULEMODIFIERS=75
+ST_RESULT=100
+RPAREN=42
+RET=90
+IMPORT=22
+STRING_LITERAL=68
+ARG=88
+ARG_ACTION=14
+DOUBLE_QUOTE_STRING_LITERAL=10
+COMMENT=9
+ACTION_CHAR_LITERAL=13
+GRAMMAR=27
+RULEACTIONS=76
+WSCHARS=66
+INITACTION=92
+ALT_REWRITE=102
+IMPLIES=43
+RULE=73
+RBRACE=62
+ACTION_ESC=17
+PRIVATE=30
+SRC=7
+THROWS=32
+CHAR_RANGE=83
+INT=65
+EPSILON=84
+LIST=98
+COLONCOLON=38
+WSNLCHARS=18
+WS=71
+LEXER=24
+OR=52
+GT=45
+CATCH=33
+CLOSURE=80
+PARSER=25
+DOLLAR=54
+PROTECTED=28
+ELEMENT_OPTIONS=99
+NESTED_ACTION=15
+FRAGMENT=23
+ID=87
+TREE_BEGIN=59
+LPAREN=41
+AT=60
+ESC_SEQ=67
+ALT=85
+TREE=26
+SCOPE=21
+ETC=57
+COMMA=39
+WILDCARD=97
+DOC_COMMENT=6
+PLUS=50
+REWRITE_BLOCK=78
+DOT=55
+MODE=36
+RETURNS=31
+RULES=74
+RARROW=58
+UNICODE_ESC=70
+HEX_DIGIT=69
+RANGE=56
+TOKENS=20
+RESULT=101
+GATED_SEMPRED=94
+BANG=48
+ACTION_STRING_LITERAL=12
+ROOT=53
+SEMI=40
+RULE_REF=64
+NLCHARS=8
+OPTIONAL=79
+SYNPRED=82
+COLON=37
+QUESTION=47
+FINALLY=34
+TEMPLATE=35
+LABEL=93
+SYN_SEMPRED=95
+ERRCHAR=72
+BLOCK=77
+ASSIGN=46
+PLUS_ASSIGN=51
+PUBLIC=29
+POSITIVE_CLOSURE=81
+OPTIONS=19
--- a/tool/src/org/antlr/v4/codegen/Target.java
+++ b/tool/src/org/antlr/v4/codegen/Target.java
@ -65,6 +65,35 @@ public class Target {
 		}
 	}

+	/** Convert a quoted grammar string literal into the string it denotes.
+	 *  The first and last characters of literal are taken to be the quotes
+	 *  and are dropped. Inside, a backslash followed by 'u' and four hex
+	 *  digits yields that UTF-16 code unit; any other escaped char is looked
+	 *  up in ANTLRLiteralEscapedCharValue, and yields itself when the table
+	 *  has no entry for it.
+	 *
+	 *  @throws NumberFormatException if the four characters after a unicode
+	 *          escape prefix are not hex digits
+	 */
+	public static String getStringFromGrammarStringLiteral(String literal) {
+		StringBuilder buf = new StringBuilder();
+		int last = literal.length()-1; // index of the closing quote
+		int i = 1; // skip first quote
+		while ( i < last ) { // scan all but last quote
+			char c = literal.charAt(i);
+			if ( c!='\\' || i+1>=last ) {
+				// ordinary char, or a stray backslash right before the closing quote
+				buf.append(c);
+				i++;
+				continue;
+			}
+			char escChar = literal.charAt(i+1);
+			if ( escChar=='u' && i+5<last ) { // '\u1234'
+				// hex digits sit at i+2..i+5, relative to THIS escape
+				String unicodeChars = literal.substring(i+2, i+6);
+				buf.append((char)Integer.parseInt(unicodeChars, 16));
+				i += 6; // backslash, 'u' and four digits
+				continue;
+			}
+			// escChar may lie beyond the table; treat that like "no entry"
+			int charVal = escChar<ANTLRLiteralEscapedCharValue.length ?
+				ANTLRLiteralEscapedCharValue[escChar] : 0;
+			if ( charVal==0 ) buf.append(escChar); // Unnecessary escapes like '\{' should just yield {
+			else buf.append((char)charVal);
+			i += 2; // consume the backslash AND the escaped char
+		}
+		return buf.toString();
+	}
+
+
 	/** Return a string representing the escaped char for code c.  E.g., If c
 	 *  has value 0x100, you will get "\u0100".  ASCII gets the usual
 	 *  char (non-hex) representation.  Control characters are spit out