added in NFA VM prototype

[git-p4: depot-paths = "//depot/code/antlr4/main/": change = 6820]
2010-04-22 13:07:16 -08:00 · 2010-04-22 13:07:16 -08:00 · 3015778202
parent 41c0225adf
commit 3015778202
8 changed files with 2476 additions and 2 deletions
--- a/runtime/Java/src/org/antlr/v4/runtime/nfa/Bytecode.java
+++ b/runtime/Java/src/org/antlr/v4/runtime/nfa/Bytecode.java
@ -0,0 +1,125 @@
 package org.antlr.v4.runtime.nfa;
 import java.util.ArrayList;
 import java.util.List;
 /** Instruction set description and disassembler for the NFA VM bytecode.
  *
  *  Code lives in a byte[]: a one-byte opcode followed by operands of
  *  OPND_SIZE_IN_BYTES bytes each, stored high byte first (see getShort()).
  */
 public class Bytecode {
 	public static final int MAX_OPNDS = 3; // Or single opnd indicating variable number
 	public static final int OPND_SIZE_IN_BYTES = 2;
 	public enum OperandType { NONE, CHAR, ADDR, INT, VARARGS }
 	/** Describes one instruction for the disassembler: its mnemonic, the
 	 *  type of each operand slot and how many slots (n) are in use.
 	 */
 	public static class Instruction {
 		String name; // E.g., "load_str", "new"
 		OperandType[] type = new OperandType[MAX_OPNDS];
 		int n = 0;
 		// The shorter constructors delegate to the 3-operand one (which sets
 		// n=MAX_OPNDS) and then overwrite n with the real operand count.
 		public Instruction(String name) {
 			this(name,OperandType.NONE,OperandType.NONE,OperandType.NONE); n=0;
 		}
 		public Instruction(String name, OperandType a) {
 			this(name,a,OperandType.NONE,OperandType.NONE); n=1;
 		}
 		public Instruction(String name, OperandType a, OperandType b) {
 			this(name,a,b,OperandType.NONE); n=2;
 		}
 		public Instruction(String name, OperandType a, OperandType b, OperandType c) {
 			this.name = name;
 			type[0] = a;
 			type[1] = b;
 			type[2] = c;
 			n = MAX_OPNDS;
 		}
 	}
 	// don't use enum for efficiency; don't want code block to
 	// be an array of objects (Bytecode[]). We want it to be byte[].
 	// INSTRUCTION BYTECODES (byte is signed; use a short to keep 0..255)
 	public static final short ACCEPT	= 1;
 	public static final short JMP		= 2;
 	public static final short SPLIT		= 3;
 	public static final short MATCH		= 4;
 	public static final short RANGE		= 5;
 	/** Used for disassembly; describes instruction set */
 	public static Instruction[] instructions = new Instruction[] {
 		null, // <INVALID>
 		new Instruction("accept", OperandType.INT), // index is the opcode
 		new Instruction("jmp", OperandType.ADDR),
 		new Instruction("split", OperandType.VARARGS),
 		new Instruction("match", OperandType.CHAR),
 		new Instruction("range", OperandType.CHAR, OperandType.CHAR)
 	};
 	/** Disassemble every instruction in code, one per line (each line is
 	 *  terminated by '\n').
 	 *
 	 *  @throws IllegalArgumentException if an unknown opcode is encountered
 	 */
 	public static String disassemble(byte[] code) {
 		StringBuilder buf = new StringBuilder();
 		int i=0;
 		while (i<code.length) {
 			i = disassembleInstruction(buf, code, i);
 			buf.append('\n');
 		}
 		return buf.toString();
 	}
 	/** Return the text of the single instruction starting at address ip. */
 	public static String disassembleInstruction(byte[] code, int ip) {
 		StringBuilder buf = new StringBuilder();
 		disassembleInstruction(buf, code, ip);
 		return buf.toString();
 	}
 	/** Append the text of the instruction at address ip to buf.
 	 *
 	 *  @return the address of the byte following the instruction
 	 *  @throws IllegalArgumentException if ip is outside code or the byte at
 	 *          ip is not a known opcode
 	 */
 	public static int disassembleInstruction(StringBuilder buf, byte[] code, int ip) {
 		// Validate ip BEFORE touching code[ip]; otherwise the array access
 		// fails first and this check can never fire.
 		if ( ip<0 || ip>=code.length ) {
 			throw new IllegalArgumentException("ip out of range: "+ip);
 		}
 		int opcode = code[ip]&0xFF; // byte is signed; opcodes are 0..255
 		Bytecode.Instruction I =
 			opcode<Bytecode.instructions.length ? Bytecode.instructions[opcode] : null;
 		if ( I==null ) {
 			throw new IllegalArgumentException("no such instruction "+opcode+
 				" at address "+ip);
 		}
 		String instrName = I.name;
 		buf.append( String.format("%04d:\t%-14s", ip, instrName) );
 		ip++;
 		if ( I.n==0 ) {
 			buf.append("  ");
 			return ip;
 		}
 		List<String> operands = new ArrayList<String>();
 		for (int i=0; i<I.n; i++) {
 			int opnd = getShort(code, ip);
 			ip += Bytecode.OPND_SIZE_IN_BYTES;
 			switch ( I.type[i] ) {
 				case CHAR :
 					operands.add("'"+(char)opnd+"'");
 					break;
 				case VARARGS : // get n (opnd) operands
 					int n = opnd;
 					// operands.add(String.valueOf(n)); don't show n in varargs
 					for (int j=0; j<n; j++) {
 						operands.add(String.valueOf(getShort(code, ip)));
 						ip += OPND_SIZE_IN_BYTES;
 					}
 					break;
 				case INT :
 				case ADDR :
 				default:
 					operands.add(String.valueOf(opnd));
 					break;
 			}
 		}
 		for (int i = 0; i < operands.size(); i++) {
 			String s = operands.get(i);
 			if ( i>0 ) buf.append(", ");
 			buf.append( s );
 		}
 		return ip;
 	}
 	/** Read the unsigned 16-bit value stored high byte first at
 	 *  memory[index..index+1].
 	 */
 	public static int getShort(byte[] memory, int index) {
 		int b1 = memory[index++]&0xFF; // mask off sign-extended bits
 		int b2 = memory[index++]&0xFF;
 		int word = b1<<(8*1) | b2;
 		return word;
 	}
 }
--- a/runtime/Java/src/org/antlr/v4/runtime/nfa/Interpreter.java
+++ b/runtime/Java/src/org/antlr/v4/runtime/nfa/Interpreter.java
@ -0,0 +1,98 @@
 package org.antlr.v4.runtime.nfa;
 import org.antlr.runtime.CharStream;
 /** Recursive backtracking interpreter for the NFA VM bytecode described by
  *  {@link Bytecode}.  Modeled on the "recursiveloop" implementation in
  *  http://swtch.com/~rsc/regexp/regexp2.html
  */
 public class Interpreter {
 	byte[] code;
 	public Interpreter(byte[] code) { this.code = code; }
 	/*
 	Reference C loop from the article above:
 	for(;;){
 		switch(pc->opcode){
 		case Char:
 			if(*sp != pc->c)
 				return 0;
 			pc++;
 			sp++;
 			continue;
 		case Match:
 			return 1;
 		case Jmp:
 			pc = pc->x;
 			continue;
 		case Split:
 			if(recursiveloop(pc->x, sp))
 				return 1;
 			pc = pc->y;
 			continue;
 		}
 		assert(0);
 		return -1;
 	}
 	 */
 	/** Execute the code starting at address ip against input.
 	 *
 	 *  @return the operand of the ACCEPT instruction that was reached, or 0
 	 *          if a MATCH/RANGE test fails or ip runs off the end of code.
 	 *          NOTE(review): a SPLIT alternative only counts as successful
 	 *          when it returns a value greater than 0, so an ACCEPT operand
 	 *          of 0 is indistinguishable from failure.
 	 *  @throws RuntimeException if an unknown opcode is encountered
 	 */
 	public int exec(CharStream input, int ip) {
 		while ( ip < code.length ) {
 			int c = input.LA(1);
 			trace(ip);
 			int opcodeAddr = ip; // remember where the opcode lives for error reporting
 			short opcode = (short)(code[ip]&0xFF); // byte is signed; keep 0..255
 			ip++; // move to next instruction or first byte of operand
 			switch (opcode) {
 				case Bytecode.MATCH :
 					int o = getShort(code, ip);
 					ip += Bytecode.OPND_SIZE_IN_BYTES;
 					if ( c != o ) return 0;
 					input.consume();
 					break;
 				case Bytecode.RANGE :
 					int from = getShort(code, ip);
 					ip += Bytecode.OPND_SIZE_IN_BYTES;
 					int to = getShort(code, ip);
 					ip += Bytecode.OPND_SIZE_IN_BYTES;
 					if ( c<from || c>to ) return 0;
 					input.consume();
 					break;
 				case Bytecode.ACCEPT :
 					int ruleIndex = getShort(code, ip);
 					ip += Bytecode.OPND_SIZE_IN_BYTES;
 					System.out.println("accept "+ruleIndex);
 					return ruleIndex;
 				case Bytecode.JMP :
 					int target = getShort(code, ip);
 					ip = target;
 					continue;
 				case Bytecode.SPLIT :
 					int nopnds = getShort(code, ip);
 					ip += Bytecode.OPND_SIZE_IN_BYTES;
 					// Try all but the last alternative recursively, restoring
 					// the input position whenever one of them fails.
 					for (int i=1; i<=nopnds-1; i++) {
 						int addr = getShort(code, ip);
 						ip += Bytecode.OPND_SIZE_IN_BYTES;
 						System.out.println("try alt "+i+" at "+addr);
 						int m = input.mark();
 						int r = exec(input, addr);
 						if ( r>0 ) { input.release(m); return r; }
 						input.rewind(m);
 					}
 					// try final alternative (w/o recursion)
 					int addr = getShort(code, ip);
 					ip = addr;
 					System.out.println("try alt "+nopnds+" at "+addr);
 					continue;
 				default :
 					// report the opcode's own address, not the already-advanced ip
 					throw new RuntimeException("invalid instruction @ "+opcodeAddr+": "+opcode);
 			}
 		}
 		return 0;
 	}
 	/** Print the disassembly of the instruction at ip to System.out. */
 	void trace(int ip) {
 		String instr = Bytecode.disassembleInstruction(code, ip);
 		System.out.println(instr);
 	}
 	/** Read the unsigned 16-bit operand stored high byte first at
 	 *  memory[index..index+1]; same encoding as Bytecode.getShort().
 	 */
 	public static int getShort(byte[] memory, int index) {
 		return Bytecode.getShort(memory, index);
 	}
 }
--- a/tool/src/org/antlr/v4/analysis/AnalysisPipeline.java
+++ b/tool/src/org/antlr/v4/analysis/AnalysisPipeline.java
@ -20,8 +20,10 @@ public class AnalysisPipeline {
 		if ( lr.listOfRecursiveCycles.size()>0 ) return; // bail out
 		// BUILD DFA FOR EACH DECISION
-		if ( g.isLexer() ) processLexer();
+//		if ( g.isLexer() ) processLexer();
-		else processParserOrTreeParser();
+//		else processParserOrTreeParser();
 		// TODO: don't do lexers for now; we can add lookahead analysis to help with NFA simulation later
 		if ( !g.isLexer() ) processParserOrTreeParser();
 	}
 	void processLexer() {
--- a/tool/src/org/antlr/v4/codegen/NFABytecodeGenerator.java
+++ b/tool/src/org/antlr/v4/codegen/NFABytecodeGenerator.java
@ -0,0 +1,180 @@
 package org.antlr.v4.codegen;
 import org.antlr.runtime.RecognizerSharedState;
 import org.antlr.runtime.Token;
 import org.antlr.runtime.tree.TreeNodeStream;
 import org.antlr.v4.runtime.nfa.Bytecode;
 import org.antlr.v4.runtime.tree.TreeParser;
 import java.util.ArrayList;
 import java.util.List;
 /** Base class of the NFABytecodeTriggers tree grammar: collects Instr
  *  objects as the grammar AST is walked and serializes them into the byte[]
  *  format described by {@link Bytecode}.
  *  See http://swtch.com/~rsc/regexp/regexp2.html
  */
 public class NFABytecodeGenerator extends TreeParser {
 	/** One instruction awaiting serialization.  addr is assigned by emit();
 	 *  nBytes is the total encoded size (opcode byte plus operands).
 	 */
 	public abstract static class Instr {
 		public short opcode;
 		public int addr;
 		public int nBytes;
 		public Instr(short opcode, int nBytes) { this.opcode = opcode; this.nBytes = nBytes; }
 		/** Write the opcode byte at addr; subclasses append their operands. */
 		public void write(byte[] code) { code[addr] = (byte)opcode;	}
 	}
 	/** match c */
 	public static class MatchInstr extends Instr {
 		Token token;
 		char c;
 		public MatchInstr(Token t, char c) {
 			super(Bytecode.MATCH, 1+Bytecode.OPND_SIZE_IN_BYTES);
 			this.token = t;
 			this.c = c;
 		}
 		@Override
 		public void write(byte[] code) {
 			super.write(code);
 			writeShort(code, addr+1, (short)c);
 		}
 		@Override
 		public String toString() {
 			return addr+":MatchInstr{" +
 				   "c=" + c +
 				   '}';
 		}
 	}
 	/** range start, stop; both bounds come from grammar char literals. */
 	public static class RangeInstr extends Instr {
 		Token a, b;
 		char start, stop;
 		public RangeInstr(Token a, Token b) {
 			super(Bytecode.RANGE, 1+2*Bytecode.OPND_SIZE_IN_BYTES);
 			this.a = a;
 			this.b = b;
 			start = (char)Target.getCharValueFromGrammarCharLiteral(a.getText());
 			stop = (char)Target.getCharValueFromGrammarCharLiteral(b.getText());
 		}
 		@Override
 		public void write(byte[] code) {
 			super.write(code);
 			writeShort(code, addr+1, (short)start);
 			writeShort(code, addr+1+Bytecode.OPND_SIZE_IN_BYTES, (short)stop);
 		}
 		@Override
 		public String toString() {
 			return addr+":RangeInstr{"+start+".."+stop+"}";
 		}
 	}
 	/** accept ruleIndex */
 	public static class AcceptInstr extends Instr {
 		int ruleIndex;
 		public AcceptInstr(int ruleIndex) {
 			super(Bytecode.ACCEPT, 1+Bytecode.OPND_SIZE_IN_BYTES);
 			this.ruleIndex = ruleIndex;
 		}
 		@Override
 		public void write(byte[] code) {
 			super.write(code);
 			writeShort(code, addr+1, (short)ruleIndex);
 		}
 		@Override
 		public String toString() { return addr+":AcceptInstr "+ruleIndex; }
 	}
 	/** jmp target; target is filled in by the grammar actions after emit(). */
 	public static class JumpInstr extends Instr {
 		int target;
 		public JumpInstr() { super(Bytecode.JMP, 1+Bytecode.OPND_SIZE_IN_BYTES); }
 		@Override
 		public void write(byte[] code) {
 			super.write(code);
 			writeShort(code, addr+1, (short)target);
 		}
 		@Override
 		public String toString() {
 			return addr+":JumpInstr{" +
 				   "target=" + target +
 				   '}';
 		}
 	}
 	/** split n, addr1..addrn.  Space is reserved for nAlts addresses at
 	 *  construction time; the addresses themselves are added to addrs later.
 	 */
 	public static class SplitInstr extends Instr {
 		List<Integer> addrs = new ArrayList<Integer>();
 		public SplitInstr(int nAlts) {
 			super(Bytecode.SPLIT,
 				  1+Bytecode.OPND_SIZE_IN_BYTES+nAlts*Bytecode.OPND_SIZE_IN_BYTES);
 		}
 		/** @throws IllegalStateException if the number of recorded addresses
 		 *          differs from the nAlts the instruction was sized for;
 		 *          writing them anyway would overwrite (or leave a gap
 		 *          before) the instruction that follows.
 		 */
 		@Override
 		public void write(byte[] code) {
 			int reserved =
 				(nBytes-1-Bytecode.OPND_SIZE_IN_BYTES)/Bytecode.OPND_SIZE_IN_BYTES;
 			if ( addrs.size()!=reserved ) {
 				throw new IllegalStateException("split at "+addr+" sized for "+reserved+
 					" addresses but has "+addrs.size());
 			}
 			super.write(code);
 			int a = addr + 1;
 			writeShort(code, a, (short)addrs.size());
 			a += Bytecode.OPND_SIZE_IN_BYTES;
 			for (int x : addrs) {
 				writeShort(code, a, (short)x);
 				a += Bytecode.OPND_SIZE_IN_BYTES;
 			}
 		}
 		@Override
 		public String toString() {
 			return addr+":SplitInstr{" +
 				   "addrs=" + addrs +
 				   '}';
 		}
 	}
 	public List<Instr> instrs = new ArrayList<Instr>();
 	public int ip = 0; // where to write next
 	public NFABytecodeGenerator(TreeNodeStream input) {
 		super(input);
 	}
 	public NFABytecodeGenerator(TreeNodeStream input, RecognizerSharedState state) {
 		super(input, state);
 	}
 	/** Assign I the next free address, advance ip past it and record it. */
 	public void emit(Instr I) {
 		I.addr = ip;
 		ip += I.nBytes;
 		instrs.add(I);
 	}
 	/** Emit one MatchInstr per character of the string literal in t. */
 	public void emitString(Token t) {
 		String chars = Target.getStringFromGrammarStringLiteral(t.getText());
 		for (char c : chars.toCharArray()) {
 			emit(new MatchInstr(t, c));
 		}
 	}
 	/** Serialize all emitted instructions into a new byte array.
 	 *
 	 *  @return the code; an empty array when nothing has been emitted
 	 */
 	public byte[] getCode() {
 		if ( instrs.isEmpty() ) return new byte[0]; // nothing emitted yet
 		Instr last = instrs.get(instrs.size() - 1);
 		int size = last.addr + last.nBytes;
 		byte[] code = new byte[size];
 		for (Instr I : instrs) {
 			I.write(code);
 		}
 		return code;
 	}
 	/** Write value at index into a byte array highest to lowest byte,
 	 *  left to right.
 	 */
 	public static void writeShort(byte[] memory, int index, short value) {
 		memory[index+0] = (byte)((value>>(8*1))&0xFF);
 		memory[index+1] = (byte)(value&0xFF);
 	}
 	/* CODE TO GENERATE NFA BYTECODES
 			// testing code gen concept
 			GrammarASTAdaptor adaptor = new GrammarASTAdaptor();
 			for (Rule r : lg.modes.get(modeName)) {
 				GrammarAST blk = (GrammarAST)r.ast.getFirstChildWithType(ANTLRParser.BLOCK);
 				CommonTreeNodeStream nodes = new CommonTreeNodeStream(adaptor,blk);
 				NFABytecodeTriggers gen = new NFABytecodeTriggers(nodes);
 				try {
 					gen.block();
 					gen.emit(new NFABytecodeGenerator.AcceptInstr(r.index));
 					System.out.println("code=\n"+gen.instrs);
 					byte[] code = gen.getCode();
 					System.out.println(Bytecode.disassemble(code));
 					Interpreter in = new Interpreter(code);
 					String s = "i";
 					ANTLRStringStream input = new ANTLRStringStream(s);
 					int rule = in.exec(input, 0);
 					System.out.println(s+" matched rule "+rule+" leaving off at index="+input.index());
 				}
 				catch (Exception e){
 					e.printStackTrace(System.err);
 				}
 			}
 	 */
 }
--- a/tool/src/org/antlr/v4/codegen/NFABytecodeTriggers.g
+++ b/tool/src/org/antlr/v4/codegen/NFABytecodeTriggers.g
@ -0,0 +1,165 @@
 tree grammar NFABytecodeTriggers;
 options {
 	language     = Java;
 	tokenVocab   = ANTLRParser;
 	ASTLabelType = GrammarAST;
 	superClass   = NFABytecodeGenerator;
 }
@header {
 package org.antlr.v4.codegen;
 import org.antlr.v4.tool.GrammarAST;
 }
 /*
 e1 | e2 | e3:
 	split 3, L1, L2, L3
 L1:	e1
 	jmp END
 L2:	e2
 	jmp END
 L3:	e3
 END:
 */
 // Generate code for a BLOCK subtree.  With more than one alternative a
 // split instruction holding one address per alternative is emitted first;
 // every alternative except the last is followed by a jmp whose target is
 // patched to the address just past the whole block once all alternatives
 // have been walked.  A single-alternative block emits no split and no jmp.
 block
    :	^(	BLOCK (^(OPTIONS .+))?
    		{
    		// the node about to be matched is the first alternative; its child
    		// index tells how many leading non-alternative children to skip
    		GrammarAST firstAlt = (GrammarAST)input.LT(1);
    		int i = firstAlt.getChildIndex();    		
 			int nAlts = $start.getChildCount() - i;
    		System.out.println("alts "+nAlts);
    		List<JumpInstr> jumps = new ArrayList<JumpInstr>();
    		SplitInstr S = null;
    		if ( nAlts>1 ) {
 	    		S = new SplitInstr(nAlts);
 	    		emit(S);
 	    		S.addrs.add(ip); // first alternative starts right after the split
    		}
    		int alt = 1;
    		}
    		(	alternative
    			{
    			// S is non-null here: alt < nAlts implies nAlts > 1
    			if ( alt < nAlts ) {
 	    			JumpInstr J = new JumpInstr();
 	    			jumps.add(J);
 	    			emit(J);
 	    			S.addrs.add(ip); // next alternative starts after this jmp
    			}
    			alt++;
    			}
    		)+
    		{
    		// back-patch every jmp to the first address after the block
    		int END = ip;
    		for (JumpInstr J : jumps) J.target = END;
    		}
    	)
    ;
 // Walks one alternative.  This rule has no actions of its own; any
 // instructions come from the element rules it invokes.
 alternative
    :	^(ALT_REWRITE a=alternative .)	
    |	^(ALT EPSILON)					
    |   ^(ALT (e=element )+)    									
    ;
 // Dispatches on the kind of element.  ACTION, SEMPRED and GATED_SEMPRED
 // nodes are matched but have no action attached, so nothing is emitted
 // for them.
 element
 	:	labeledElement				
 	|	atom						
 	|	ebnf						
 	|   ACTION						
 	|   SEMPRED						
 	|	GATED_SEMPRED				
 	|	treeSpec					
 	;
 // Labeled atom or block (x=... or x+=...).  The label ID itself has no
 // action here; only the nested atom/block rule can emit instructions.
 labeledElement
 	:	^(ASSIGN ID atom)			
 	|	^(ASSIGN ID block)			
 	|	^(PLUS_ASSIGN ID atom)		
 	|	^(PLUS_ASSIGN ID block)		
 	;
 // Tree pattern: walks the contained elements in order; no action of its own.
 treeSpec
    : ^(TREE_BEGIN  (e=element )+)	
    ;
 // Generate code for EBNF subrules.  Layouts produced by the actions below
 // (e is the code emitted for the nested block):
 //
 //   e?  :      split L1, L2
 //          L1: e
 //          L2:
 //
 //   e*  :  L0: split L1, L2
 //          L1: e
 //              jmp L0
 //          L2:
 //
 //   e+  :  L1: e
 //              split L1, L2
 //          L2:
 //
 // The astBlockSuffix and plain block alternatives add nothing of their own.
 ebnf
 	:	^(astBlockSuffix block)		
 	|	{
 	   	SplitInstr S = new SplitInstr(2);
 		emit(S);
   		S.addrs.add(ip);
 		}
 		^(OPTIONAL block)			
 		{
   		S.addrs.add(ip);
 		}
 	|	{
 		int start=ip;
 	   	SplitInstr S = new SplitInstr(2);
 		emit(S);
   		S.addrs.add(ip);
 		}
 		^(CLOSURE block)			
 		{
 	    JumpInstr J = new JumpInstr();
 	    emit(J);
 	    J.target = start;
 	    S.addrs.add(ip);
 		}
 	|	{int start=ip;} ^(POSITIVE_CLOSURE block)
 		{
   		SplitInstr S = new SplitInstr(2);
 		emit(S);
 		int stop = ip;
   		S.addrs.add(start);
   		S.addrs.add(stop);
 		}
 	| 	block						
    ;
 // Suffix operator token that can be the root of a block subtree (see the
 // first alternative of ebnf); matched only, nothing is emitted.
 astBlockSuffix
    : ROOT
    | IMPLIES
    | BANG
    ;
 // Atomic element.  No alternative here carries an action; instructions are
 // emitted only by the range and terminal rules reached from here.  The
 // WILDCARD alternatives and ruleref therefore generate no code.
 atom
 	:	^(ROOT range)			
 	|	^(BANG range)			
 	|	^(ROOT notSet)			
 	|	^(BANG notSet)			
 	|	notSet					
 	|	range					
 	|	^(DOT ID terminal)		
 	|	^(DOT ID ruleref)		
    |	^(WILDCARD .)			
    |	WILDCARD				
    |   terminal				
    |   ruleref					
    ;
 // Negated terminal or block.  There is no action for the NOT itself.
 // NOTE(review): the nested terminal/block rules still run their emit
 // actions, so a negated string literal appears to generate the same match
 // instructions as the literal itself -- confirm whether that is intended.
 notSet
    : ^(NOT terminal)		
    | ^(NOT block)			
    ;
 // Rule reference, optionally under ROOT/BANG and with an optional
 // ARG_ACTION; matched only, no instruction is emitted.
 ruleref
    :	^(ROOT ^(RULE_REF ARG_ACTION?))	
    |	^(BANG ^(RULE_REF ARG_ACTION?))	
    |	^(RULE_REF ARG_ACTION?)			
    ;
 // Character range a..b: emits one RangeInstr built from the two literal
 // tokens.
 range
    :	^(RANGE a=STRING_LITERAL b=STRING_LITERAL)
    	{emit(new RangeInstr($a.token, $b.token));}
    ;
 // Terminal.  The two STRING_LITERAL alternatives pass the literal's token
 // to emitString(); the TOKEN_REF alternatives have no action and emit
 // nothing.  ROOT/BANG wrappers just recurse into the wrapped terminal.
 terminal
    :  ^(STRING_LITERAL .)			{emitString($STRING_LITERAL.token);}
    |	STRING_LITERAL				{emitString($STRING_LITERAL.token);}
    |	^(TOKEN_REF ARG_ACTION .)	
    |	^(TOKEN_REF .)				
    |	TOKEN_REF					
    |	^(ROOT terminal)			
    |	^(BANG terminal)			
    ;
--- a/tool/src/org/antlr/v4/codegen/NFABytecodeTriggers.java
+++ b/tool/src/org/antlr/v4/codegen/NFABytecodeTriggers.java
--- a/tool/src/org/antlr/v4/codegen/NFABytecodeTriggers.tokens
+++ b/tool/src/org/antlr/v4/codegen/NFABytecodeTriggers.tokens
@ -0,0 +1,99 @@
 COMBINED=91
 LT=44
 STAR=49
 BACKTRACK_SEMPRED=96
 DOUBLE_ANGLE_STRING_LITERAL=11
 FORCED_ACTION=5
 ARGLIST=89
 ALTLIST=86
 NOT=61
 SEMPRED=4
 ACTION=16
 TOKEN_REF=63
 RULEMODIFIERS=75
 ST_RESULT=100
 RPAREN=42
 RET=90
 IMPORT=22
 STRING_LITERAL=68
 ARG=88
 ARG_ACTION=14
 DOUBLE_QUOTE_STRING_LITERAL=10
 COMMENT=9
 ACTION_CHAR_LITERAL=13
 GRAMMAR=27
 RULEACTIONS=76
 WSCHARS=66
 INITACTION=92
 ALT_REWRITE=102
 IMPLIES=43
 RULE=73
 RBRACE=62
 ACTION_ESC=17
 PRIVATE=30
 SRC=7
 THROWS=32
 CHAR_RANGE=83
 INT=65
 EPSILON=84
 LIST=98
 COLONCOLON=38
 WSNLCHARS=18
 WS=71
 LEXER=24
 OR=52
 GT=45
 CATCH=33
 CLOSURE=80
 PARSER=25
 DOLLAR=54
 PROTECTED=28
 ELEMENT_OPTIONS=99
 NESTED_ACTION=15
 FRAGMENT=23
 ID=87
 TREE_BEGIN=59
 LPAREN=41
 AT=60
 ESC_SEQ=67
 ALT=85
 TREE=26
 SCOPE=21
 ETC=57
 COMMA=39
 WILDCARD=97
 DOC_COMMENT=6
 PLUS=50
 REWRITE_BLOCK=78
 DOT=55
 MODE=36
 RETURNS=31
 RULES=74
 RARROW=58
 UNICODE_ESC=70
 HEX_DIGIT=69
 RANGE=56
 TOKENS=20
 RESULT=101
 GATED_SEMPRED=94
 BANG=48
 ACTION_STRING_LITERAL=12
 ROOT=53
 SEMI=40
 RULE_REF=64
 NLCHARS=8
 OPTIONAL=79
 SYNPRED=82
 COLON=37
 QUESTION=47
 FINALLY=34
 TEMPLATE=35
 LABEL=93
 SYN_SEMPRED=95
 ERRCHAR=72
 BLOCK=77
 ASSIGN=46
 PLUS_ASSIGN=51
 PUBLIC=29
 POSITIVE_CLOSURE=81
 OPTIONS=19
--- a/tool/src/org/antlr/v4/codegen/Target.java
+++ b/tool/src/org/antlr/v4/codegen/Target.java
@ -65,6 +65,35 @@ public class Target {
 		}
 	}
 	/** Convert a quoted grammar string literal into the characters it
 	 *  denotes.  The first and last characters of literal are taken to be
 	 *  the quotes and are skipped.  Two escape forms are recognized:
 	 *  backslash-u followed by up to four hex digits, and backslash
 	 *  followed by a single character, which is translated through
 	 *  ANTLRLiteralEscapedCharValue (a zero or missing entry means the
 	 *  character stands for itself).
 	 *
 	 *  @param literal the literal text including its surrounding quotes
 	 *  @return the unescaped contents of the literal
 	 *  @throws NumberFormatException if a unicode escape is not followed
 	 *          by hex digits
 	 */
 	public static String getStringFromGrammarStringLiteral(String literal) {
 		StringBuilder buf = new StringBuilder();
 		int last = literal.length()-1; // index of the closing quote
 		int i = 1; // skip first quote
 		while ( i < last ) { // scan all but last quote
 			char c = literal.charAt(i);
 			i++;
 			if ( c!='\\' ) {
 				buf.append(c);
 				continue;
 			}
 			// i <= last here, so reading the escaped char is always in range
 			char escChar = literal.charAt(i);
 			i++; // consume the escaped char so it is not scanned again
 			if ( escChar=='u' ) {
 				// take the hex digits at the current position (not a fixed
 				// offset) and skip past them afterwards
 				int end = Math.min(i+4, last);
 				String unicodeChars = i<end ? literal.substring(i, end) : "";
 				buf.append((char)Integer.parseInt(unicodeChars, 16));
 				i = end;
 			}
 			else {
 				// chars beyond the table have no special escape meaning
 				int charVal = escChar<ANTLRLiteralEscapedCharValue.length ?
 					ANTLRLiteralEscapedCharValue[escChar] : 0;
 				if ( charVal==0 ) buf.append(escChar); // Unnecessary escapes like '\{' should just yield {
 				else buf.append((char)charVal);
 			}
 		}
 		return buf.toString();
 	}
 	/** Return a string representing the escaped char for code c.  E.g., If c
 	 *  has value 0x100, you will get "\u0100".  ASCII gets the usual
 	 *  char (non-hex) representation.  Control characters are spit out