added in NFA VM prototype

[git-p4: depot-paths = "//depot/code/antlr4/main/": change = 6820]
This commit is contained in:
parrt 2010-04-22 13:07:16 -08:00
parent 41c0225adf
commit 3015778202
8 changed files with 2476 additions and 2 deletions

View File

@ -0,0 +1,125 @@
package org.antlr.v4.runtime.nfa;
import java.util.ArrayList;
import java.util.List;
/** Instruction set of the NFA virtual machine plus a disassembler for it.
 *
 *  Code is a flat byte[]: every instruction is a one-byte opcode followed by
 *  its operands. Each operand occupies OPND_SIZE_IN_BYTES bytes and is stored
 *  high byte first (see {@link #getShort}).
 */
public class Bytecode {
    public static final int MAX_OPNDS = 3; // Or single opnd indicating variable number
    public static final int OPND_SIZE_IN_BYTES = 2;

    /** Kinds of operand an instruction can carry; drives how the
     *  disassembler prints each operand.
     */
    public enum OperandType { NONE, CHAR, ADDR, INT, VARARGS }

    /** Describes one opcode: its mnemonic, operand types and operand count. */
    public static class Instruction {
        String name; // E.g., "load_str", "new"
        OperandType[] type = new OperandType[MAX_OPNDS];
        int n = 0;   // number of operands actually used

        public Instruction(String name) {
            this(name,OperandType.NONE,OperandType.NONE,OperandType.NONE);
            n = 0;
        }

        public Instruction(String name, OperandType a) {
            this(name,a,OperandType.NONE,OperandType.NONE);
            n = 1;
        }

        public Instruction(String name, OperandType a, OperandType b) {
            this(name,a,b,OperandType.NONE);
            n = 2;
        }

        public Instruction(String name, OperandType a, OperandType b, OperandType c) {
            this.name = name;
            type[0] = a;
            type[1] = b;
            type[2] = c;
            n = MAX_OPNDS;
        }
    }

    // don't use enum for efficiency; don't want code block to
    // be an array of objects (Bytecode[]). We want it to be byte[].

    // INSTRUCTION BYTECODES (byte is signed; use a short to keep 0..255)
    public static final short ACCEPT = 1;
    public static final short JMP = 2;
    public static final short SPLIT = 3;
    public static final short MATCH = 4;
    public static final short RANGE = 5;

    /** Used for disassembly; describes instruction set */
    public static Instruction[] instructions = new Instruction[] {
        null, // <INVALID>
        new Instruction("accept", OperandType.INT), // index is the opcode
        new Instruction("jmp", OperandType.ADDR),
        new Instruction("split", OperandType.VARARGS),
        new Instruction("match", OperandType.CHAR),
        new Instruction("range", OperandType.CHAR, OperandType.CHAR)
    };

    /** Disassemble an entire code array, one instruction per line.
     *
     *  @param code the bytecode to print
     *  @return the listing; every instruction is followed by a newline
     *  @throws IllegalArgumentException if an unknown opcode is encountered
     */
    public static String disassemble(byte[] code) {
        StringBuilder buf = new StringBuilder();
        int i=0;
        while (i<code.length) {
            i = disassembleInstruction(buf, code, i);
            buf.append('\n');
        }
        return buf.toString();
    }

    /** Disassemble the single instruction starting at {@code ip}.
     *
     *  @return the textual form of that instruction (no trailing newline)
     *  @throws IllegalArgumentException if {@code ip} is outside the code
     *          array or the opcode there is unknown
     */
    public static String disassembleInstruction(byte[] code, int ip) {
        StringBuilder buf = new StringBuilder();
        disassembleInstruction(buf, code, ip);
        return buf.toString();
    }

    /** Append the textual form of the instruction at {@code ip} to {@code buf}.
     *
     *  @param buf  receives "address, mnemonic, operands"
     *  @param code the bytecode
     *  @param ip   address of the opcode byte
     *  @return the address of the byte following this instruction
     *  @throws IllegalArgumentException if {@code ip} is outside the code
     *          array or the opcode there has no entry in {@link #instructions}
     */
    public static int disassembleInstruction(StringBuilder buf, byte[] code, int ip) {
        // Validate the address before touching the array; otherwise an
        // ArrayIndexOutOfBoundsException would pre-empt this check.
        if ( ip<0 || ip>=code.length ) {
            throw new IllegalArgumentException("ip out of range: "+ip);
        }
        int opcode = code[ip] & 0xFF; // bytes are signed; opcodes live in 0..255
        Instruction I = opcode<instructions.length ? instructions[opcode] : null;
        if ( I==null ) {
            throw new IllegalArgumentException("no such instruction "+opcode+
                                               " at address "+ip);
        }
        buf.append( String.format("%04d:\t%-14s", ip, I.name) );
        ip++;
        if ( I.n==0 ) {
            buf.append(" ");
            return ip;
        }
        List<String> operands = new ArrayList<String>();
        for (int i=0; i<I.n; i++) {
            int opnd = getShort(code, ip);
            ip += OPND_SIZE_IN_BYTES;
            switch ( I.type[i] ) {
                case CHAR :
                    operands.add("'"+(char)opnd+"'");
                    break;
                case VARARGS : // opnd is a count; that many operands follow
                    int n = opnd;
                    // the count itself is deliberately not shown
                    for (int j=0; j<n; j++) {
                        operands.add(String.valueOf(getShort(code, ip)));
                        ip += OPND_SIZE_IN_BYTES;
                    }
                    break;
                case INT :
                case ADDR :
                default:
                    operands.add(String.valueOf(opnd));
                    break;
            }
        }
        for (int i = 0; i < operands.size(); i++) {
            if ( i>0 ) buf.append(", ");
            buf.append( operands.get(i) );
        }
        return ip;
    }

    /** Read an unsigned 16-bit value stored high byte first.
     *
     *  @param memory the byte array to read from
     *  @param index  position of the high byte; the low byte is at index+1
     *  @return the value in 0..65535
     */
    public static int getShort(byte[] memory, int index) {
        int b1 = memory[index++]&0xFF; // mask off sign-extended bits
        int b2 = memory[index++]&0xFF;
        int word = b1<<(8*1) | b2;
        return word;
    }
}

View File

@ -0,0 +1,98 @@
package org.antlr.v4.runtime.nfa;
import org.antlr.runtime.CharStream;
/** Recursive, backtracking interpreter for NFA bytecode, following the
 *  virtual-machine approach at http://swtch.com/~rsc/regexp/regexp2.html
 */
public class Interpreter {
    byte[] code;

    public Interpreter(byte[] code) { this.code = code; }

    /* Reference loop from the article cited above:
    for(;;){
    switch(pc->opcode){
    case Char:
    if(*sp != pc->c)
    return 0;
    pc++;
    sp++;
    continue;
    case Match:
    return 1;
    case Jmp:
    pc = pc->x;
    continue;
    case Split:
    if(recursiveloop(pc->x, sp))
    return 1;
    pc = pc->y;
    continue;
    }
    assert(0);
    return -1;
    }
    */

    /** Run the bytecode starting at address {@code ip} against {@code input}.
     *
     *  Every alternative of a split except the last is tried through a
     *  recursive call bracketed by mark/rewind on the input; the last one
     *  continues in the current loop.
     *
     *  @param input the character stream to match against
     *  @param ip    address of the first instruction to execute
     *  @return the operand of the accept instruction that was reached, or 0
     *          when a match/range test fails or execution runs off the end
     *          of the code. A rule index of 0 is therefore indistinguishable
     *          from failure.
     *  @throws RuntimeException on an opcode this loop does not handle
     */
    public int exec(CharStream input, int ip) {
        final int opndSize = Bytecode.OPND_SIZE_IN_BYTES;
        while ( ip < code.length ) {
            int lookahead = input.LA(1);
            trace(ip);
            short opcode = code[ip];
            ip++; // now at the next instruction or the first operand byte
            switch (opcode) {
                case Bytecode.MATCH : {
                    int expected = getShort(code, ip);
                    ip += opndSize;
                    if ( lookahead != expected ) return 0;
                    input.consume();
                    break;
                }
                case Bytecode.RANGE : {
                    int lo = getShort(code, ip);
                    ip += opndSize;
                    int hi = getShort(code, ip);
                    ip += opndSize;
                    boolean inRange = lookahead >= lo && lookahead <= hi;
                    if ( !inRange ) return 0;
                    input.consume();
                    break;
                }
                case Bytecode.ACCEPT : {
                    int acceptedRule = getShort(code, ip);
                    System.out.println("accept "+acceptedRule);
                    return acceptedRule;
                }
                case Bytecode.JMP : {
                    ip = getShort(code, ip);
                    break;
                }
                case Bytecode.SPLIT : {
                    int nAlts = getShort(code, ip);
                    ip += opndSize;
                    // all alternatives but the last: recurse, undo on failure
                    for (int alt = 1; alt < nAlts; alt++) {
                        int altAddr = getShort(code, ip);
                        ip += opndSize;
                        System.out.println("try alt "+alt+" at "+altAddr);
                        int marker = input.mark();
                        int result = exec(input, altAddr);
                        if ( result > 0 ) {
                            input.release(marker);
                            return result;
                        }
                        input.rewind(marker);
                    }
                    // final alternative needs no recursion; just jump to it
                    int lastAddr = getShort(code, ip);
                    System.out.println("try alt "+nAlts+" at "+lastAddr);
                    ip = lastAddr;
                    break;
                }
                default :
                    throw new RuntimeException("invalid instruction @ "+ip+": "+opcode);
            }
        }
        return 0;
    }

    /** Print the disassembly of the instruction at {@code ip} to stdout. */
    void trace(int ip) {
        System.out.println(Bytecode.disassembleInstruction(code, ip));
    }

    /** Read an unsigned 16-bit operand stored high byte first.
     *
     *  @param memory the byte array to read from
     *  @param index  position of the high byte
     *  @return the value in 0..65535
     */
    public static int getShort(byte[] memory, int index) {
        int high = memory[index] & 0xFF;   // mask off sign-extended bits
        int low = memory[index + 1] & 0xFF;
        return (high << 8) | low;
    }
}

View File

@ -20,8 +20,10 @@ public class AnalysisPipeline {
if ( lr.listOfRecursiveCycles.size()>0 ) return; // bail out
// BUILD DFA FOR EACH DECISION
if ( g.isLexer() ) processLexer();
else processParserOrTreeParser();
// if ( g.isLexer() ) processLexer();
// else processParserOrTreeParser();
// TODO: don't do lexers for now; we can add lookahead analysis to help with NFA simulation later
if ( !g.isLexer() ) processParserOrTreeParser();
}
void processLexer() {

View File

@ -0,0 +1,180 @@
package org.antlr.v4.codegen;
import org.antlr.runtime.RecognizerSharedState;
import org.antlr.runtime.Token;
import org.antlr.runtime.tree.TreeNodeStream;
import org.antlr.v4.runtime.nfa.Bytecode;
import org.antlr.v4.runtime.tree.TreeParser;
import java.util.ArrayList;
import java.util.List;
/** Base class of the NFABytecodeTriggers tree grammar: collects NFA VM
 *  instructions while the grammar AST is walked and serializes them into a
 *  byte[]. The instruction model follows
 *  http://swtch.com/~rsc/regexp/regexp2.html
 */
public class NFABytecodeGenerator extends TreeParser {
    /** One emitted instruction. {@code addr} is assigned by
     *  {@link NFABytecodeGenerator#emit}; {@code nBytes} is the opcode byte
     *  plus all operand bytes.
     */
    public abstract static class Instr {
        public short opcode;
        public int addr;
        public int nBytes;
        public Instr(short opcode, int nBytes) { this.opcode = opcode; this.nBytes = nBytes; }
        /** Write this instruction into {@code code} at {@code addr}.
         *  The base implementation writes only the opcode byte.
         */
        public void write(byte[] code) { code[addr] = (byte)opcode; }
    }

    /** match c: succeed only if the current input char is c. */
    public static class MatchInstr extends Instr {
        Token token;
        char c;
        public MatchInstr(Token t, char c) {
            super(Bytecode.MATCH, 1+Bytecode.OPND_SIZE_IN_BYTES);
            this.token = t;
            this.c = c;
        }
        @Override
        public void write(byte[] code) {
            super.write(code);
            writeShort(code, addr+1, (short)c);
        }
        @Override
        public String toString() {
            return addr+":MatchInstr{" +
                   "c=" + c +
                   '}';
        }
    }

    /** range start, stop: succeed only if start <= current input char <= stop. */
    public static class RangeInstr extends Instr {
        Token a, b;
        char start, stop;
        public RangeInstr(Token a, Token b) {
            super(Bytecode.RANGE, 1+2*Bytecode.OPND_SIZE_IN_BYTES);
            this.a = a;
            this.b = b;
            start = (char)Target.getCharValueFromGrammarCharLiteral(a.getText());
            stop = (char)Target.getCharValueFromGrammarCharLiteral(b.getText());
        }
        @Override
        public void write(byte[] code) {
            super.write(code);
            writeShort(code, addr+1, (short)start);
            writeShort(code, addr+1+Bytecode.OPND_SIZE_IN_BYTES, (short)stop);
        }
        @Override
        public String toString() {
            return addr+":RangeInstr{"+start+".."+stop+"}";
        }
    }

    /** accept ruleIndex: report a match for the given rule. */
    public static class AcceptInstr extends Instr {
        int ruleIndex;
        public AcceptInstr(int ruleIndex) {
            super(Bytecode.ACCEPT, 1+Bytecode.OPND_SIZE_IN_BYTES);
            this.ruleIndex = ruleIndex;
        }
        @Override
        public void write(byte[] code) {
            super.write(code);
            writeShort(code, addr+1, (short)ruleIndex);
        }
        @Override
        public String toString() { return addr+":AcceptInstr "+ruleIndex; }
    }

    /** jmp target: the target is filled in after construction, once known. */
    public static class JumpInstr extends Instr {
        int target;
        public JumpInstr() { super(Bytecode.JMP, 1+Bytecode.OPND_SIZE_IN_BYTES); }
        @Override
        public void write(byte[] code) {
            super.write(code);
            writeShort(code, addr+1, (short)target);
        }
        @Override
        public String toString() {
            return addr+":JumpInstr{" +
                   "target=" + target +
                   '}';
        }
    }

    /** split n, a1..an: an operand count followed by that many addresses.
     *  The size is fixed from {@code nAlts} at construction, so exactly that
     *  many entries must end up in {@code addrs} before {@link #write}.
     */
    public static class SplitInstr extends Instr {
        List<Integer> addrs = new ArrayList<Integer>();
        public SplitInstr(int nAlts) {
            // opcode + count operand + one address operand per alternative
            super(Bytecode.SPLIT,
                  1+Bytecode.OPND_SIZE_IN_BYTES+nAlts*Bytecode.OPND_SIZE_IN_BYTES);
        }
        @Override
        public void write(byte[] code) {
            super.write(code);
            int a = addr + 1;
            writeShort(code, a, (short)addrs.size());
            a += Bytecode.OPND_SIZE_IN_BYTES;
            for (int x : addrs) {
                writeShort(code, a, (short)x);
                a += Bytecode.OPND_SIZE_IN_BYTES;
            }
        }
        @Override
        public String toString() {
            return addr+":SplitInstr{" +
                   "addrs=" + addrs +
                   '}';
        }
    }

    /** Instructions in emission order. */
    public List<Instr> instrs = new ArrayList<Instr>();
    public int ip = 0; // where to write next

    public NFABytecodeGenerator(TreeNodeStream input) {
        super(input);
    }

    public NFABytecodeGenerator(TreeNodeStream input, RecognizerSharedState state) {
        super(input, state);
    }

    /** Append {@code I} to the program: assign it the current address and
     *  advance {@code ip} by its size.
     */
    public void emit(Instr I) {
        I.addr = ip;
        ip += I.nBytes;
        instrs.add(I);
    }

    /** Emit one match instruction per character of the string literal
     *  carried by token {@code t}.
     */
    public void emitString(Token t) {
        String chars = Target.getStringFromGrammarStringLiteral(t.getText());
        for (char c : chars.toCharArray()) {
            emit(new MatchInstr(t, c));
        }
    }

    /** Serialize all emitted instructions.
     *
     *  @return a new array sized to end right after the last instruction;
     *          empty when nothing has been emitted
     */
    public byte[] getCode() {
        if ( instrs.isEmpty() ) {
            // nothing emitted yet; instrs.get(-1) would throw
            return new byte[0];
        }
        Instr last = instrs.get(instrs.size() - 1);
        int size = last.addr + last.nBytes;
        byte[] code = new byte[size];
        for (Instr I : instrs) {
            I.write(code);
        }
        return code;
    }

    /** Write value at index into a byte array highest to lowest byte,
     *  left to right.
     */
    public static void writeShort(byte[] memory, int index, short value) {
        memory[index+0] = (byte)((value>>(8*1))&0xFF);
        memory[index+1] = (byte)(value&0xFF);
    }

    /* CODE TO GENERATE NFA BYTECODES
    // testing code gen concept
    GrammarASTAdaptor adaptor = new GrammarASTAdaptor();
    for (Rule r : lg.modes.get(modeName)) {
    GrammarAST blk = (GrammarAST)r.ast.getFirstChildWithType(ANTLRParser.BLOCK);
    CommonTreeNodeStream nodes = new CommonTreeNodeStream(adaptor,blk);
    NFABytecodeTriggers gen = new NFABytecodeTriggers(nodes);
    try {
    gen.block();
    gen.emit(new NFABytecodeGenerator.AcceptInstr(r.index));
    System.out.println("code=\n"+gen.instrs);
    byte[] code = gen.getCode();
    System.out.println(Bytecode.disassemble(code));
    Interpreter in = new Interpreter(code);
    String s = "i";
    ANTLRStringStream input = new ANTLRStringStream(s);
    int rule = in.exec(input, 0);
    System.out.println(s+" matched rule "+rule+" leaving off at index="+input.index());
    }
    catch (Exception e){
    e.printStackTrace(System.err);
    }
    }
    */
}

View File

@ -0,0 +1,165 @@
tree grammar NFABytecodeTriggers;
options {
language = Java;
tokenVocab = ANTLRParser;
ASTLabelType = GrammarAST;
superClass = NFABytecodeGenerator;
}
@header {
package org.antlr.v4.codegen;
import org.antlr.v4.tool.GrammarAST;
}
/*
Code template for a block with several alternatives, e1 | e2 | e3:
split 3, L1, L2, L3
L1: e1
jmp END
L2: e2
jmp END
L3: e3
END:
A block with a single alternative gets no split and no jumps.
*/
// Walks a BLOCK subtree, emitting the split up front, a jump after every
// alternative except the last, and finally patching all jumps to END.
block
: ^( BLOCK (^(OPTIONS .+))?
{
// LT(1) is the first alternative node; every child of BLOCK from that
// index onward is counted as an alternative.
GrammarAST firstAlt = (GrammarAST)input.LT(1);
int i = firstAlt.getChildIndex();
int nAlts = $start.getChildCount() - i;
System.out.println("alts "+nAlts);
List<JumpInstr> jumps = new ArrayList<JumpInstr>();
SplitInstr S = null;
if ( nAlts>1 ) {
S = new SplitInstr(nAlts);
emit(S);
// the first alternative starts right after the split
S.addrs.add(ip);
}
int alt = 1;
}
( alternative
{
// alt < nAlts implies nAlts > 1, so S was created above
if ( alt < nAlts ) {
JumpInstr J = new JumpInstr();
jumps.add(J);
emit(J);
// the next alternative starts right after this jump
S.addrs.add(ip);
}
alt++;
}
)+
{
// every pending jump lands just past the last alternative
int END = ip;
for (JumpInstr J : jumps) J.target = END;
}
)
;
// One alternative of a block. No action here: any bytecode comes from the
// element rules invoked below.
alternative
: ^(ALT_REWRITE a=alternative .)
| ^(ALT EPSILON)
| ^(ALT (e=element )+)
;
// Dispatches on the kind of element. ACTION, SEMPRED and GATED_SEMPRED nodes
// are matched but have no action attached, so they emit nothing.
element
: labeledElement
| atom
| ebnf
| ACTION
| SEMPRED
| GATED_SEMPRED
| treeSpec
;
// A labeled atom or block (x=... or x+=...). The label ID is matched and
// otherwise unused; code comes from the nested atom or block.
labeledElement
: ^(ASSIGN ID atom)
| ^(ASSIGN ID block)
| ^(PLUS_ASSIGN ID atom)
| ^(PLUS_ASSIGN ID block)
;
// A tree pattern: walks its elements in order; no action of its own.
treeSpec
: ^(TREE_BEGIN (e=element )+)
;
// EBNF subrules. Code templates, as emitted by the actions below:
//
//   optional (OPTIONAL block):      split L1, L2
//                               L1: block
//                               L2:
//
//   closure (CLOSURE block): start: split L1, L2
//                               L1: block
//                                   jmp start
//                               L2:
//
//   positive closure:        start: block
//                                   split start, stop
//                             stop:
//
// A suffixed block and a plain block add nothing beyond the block itself.
ebnf
: ^(astBlockSuffix block)
| {
SplitInstr S = new SplitInstr(2);
emit(S);
// first target: the block, which starts right after the split
S.addrs.add(ip);
}
^(OPTIONAL block)
{
// second target: skip the block entirely
S.addrs.add(ip);
}
| {
// remember the address of the split so the loop can jump back to it
int start=ip;
SplitInstr S = new SplitInstr(2);
emit(S);
S.addrs.add(ip);
}
^(CLOSURE block)
{
JumpInstr J = new JumpInstr();
emit(J);
J.target = start;
// exit target: the address right after the back jump
S.addrs.add(ip);
}
| {int start=ip;} ^(POSITIVE_CLOSURE block)
{
SplitInstr S = new SplitInstr(2);
emit(S);
// stop is the address right after the split, i.e. the loop exit
int stop = ip;
S.addrs.add(start);
S.addrs.add(stop);
}
| block
;
// Operator node that can wrap a block in ebnf; matched only, emits nothing.
astBlockSuffix
: ROOT
| IMPLIES
| BANG
;
// Atomic elements. This rule has no actions; bytecode is emitted only by the
// range and terminal rules it invokes. The DOT and WILDCARD forms with no
// such subrule therefore emit nothing at this point.
atom
: ^(ROOT range)
| ^(BANG range)
| ^(ROOT notSet)
| ^(BANG notSet)
| notSet
| range
| ^(DOT ID terminal)
| ^(DOT ID ruleref)
| ^(WILDCARD .)
| WILDCARD
| terminal
| ruleref
;
// Negated terminal or block. No action is attached to NOT, so the nested
// terminal/block emits its ordinary code.
// NOTE(review): the negation itself does not appear to be encoded in the
// bytecode yet -- confirm whether that is intended for this prototype.
notSet
: ^(NOT terminal)
| ^(NOT block)
;
// Reference to another rule, optionally wrapped in ROOT/BANG and optionally
// carrying an argument action. Matched only; no bytecode is emitted.
ruleref
: ^(ROOT ^(RULE_REF ARG_ACTION?))
| ^(BANG ^(RULE_REF ARG_ACTION?))
| ^(RULE_REF ARG_ACTION?)
;
// Character range a..b: emits a single range instruction built from the two
// literal tokens.
range
: ^(RANGE a=STRING_LITERAL b=STRING_LITERAL)
{emit(new RangeInstr($a.token, $b.token));}
;
// Terminals. A string literal is passed to emitString; the TOKEN_REF
// alternatives have no action and emit nothing. ROOT/BANG wrappers simply
// recurse into the wrapped terminal.
terminal
: ^(STRING_LITERAL .) {emitString($STRING_LITERAL.token);}
| STRING_LITERAL {emitString($STRING_LITERAL.token);}
| ^(TOKEN_REF ARG_ACTION .)
| ^(TOKEN_REF .)
| TOKEN_REF
| ^(ROOT terminal)
| ^(BANG terminal)
;

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,99 @@
COMBINED=91
LT=44
STAR=49
BACKTRACK_SEMPRED=96
DOUBLE_ANGLE_STRING_LITERAL=11
FORCED_ACTION=5
ARGLIST=89
ALTLIST=86
NOT=61
SEMPRED=4
ACTION=16
TOKEN_REF=63
RULEMODIFIERS=75
ST_RESULT=100
RPAREN=42
RET=90
IMPORT=22
STRING_LITERAL=68
ARG=88
ARG_ACTION=14
DOUBLE_QUOTE_STRING_LITERAL=10
COMMENT=9
ACTION_CHAR_LITERAL=13
GRAMMAR=27
RULEACTIONS=76
WSCHARS=66
INITACTION=92
ALT_REWRITE=102
IMPLIES=43
RULE=73
RBRACE=62
ACTION_ESC=17
PRIVATE=30
SRC=7
THROWS=32
CHAR_RANGE=83
INT=65
EPSILON=84
LIST=98
COLONCOLON=38
WSNLCHARS=18
WS=71
LEXER=24
OR=52
GT=45
CATCH=33
CLOSURE=80
PARSER=25
DOLLAR=54
PROTECTED=28
ELEMENT_OPTIONS=99
NESTED_ACTION=15
FRAGMENT=23
ID=87
TREE_BEGIN=59
LPAREN=41
AT=60
ESC_SEQ=67
ALT=85
TREE=26
SCOPE=21
ETC=57
COMMA=39
WILDCARD=97
DOC_COMMENT=6
PLUS=50
REWRITE_BLOCK=78
DOT=55
MODE=36
RETURNS=31
RULES=74
RARROW=58
UNICODE_ESC=70
HEX_DIGIT=69
RANGE=56
TOKENS=20
RESULT=101
GATED_SEMPRED=94
BANG=48
ACTION_STRING_LITERAL=12
ROOT=53
SEMI=40
RULE_REF=64
NLCHARS=8
OPTIONAL=79
SYNPRED=82
COLON=37
QUESTION=47
FINALLY=34
TEMPLATE=35
LABEL=93
SYN_SEMPRED=95
ERRCHAR=72
BLOCK=77
ASSIGN=46
PLUS_ASSIGN=51
PUBLIC=29
POSITIVE_CLOSURE=81
OPTIONS=19

View File

@ -65,6 +65,35 @@ public class Target {
}
}
/** Convert a quoted grammar string literal into the string it denotes.
 *
 *  The first and last characters of {@code literal} are taken to be the
 *  enclosing quotes and are dropped. Inside, a backslash starts an escape:
 *  a unicode escape (backslash, 'u', four hex digits) yields that code unit;
 *  any other escaped character is looked up in ANTLRLiteralEscapedCharValue,
 *  and when the table has no entry for it (value 0, or the character lies
 *  beyond the table) the character itself is used, so unnecessary escapes
 *  like '\{' just yield {.
 *
 *  @param literal the literal text including its quotes
 *  @return the unescaped contents; empty if the literal has no contents
 *  @throws NumberFormatException if a unicode escape is truncated or its
 *          digits are not valid hexadecimal
 */
public static String getStringFromGrammarStringLiteral(String literal) {
    StringBuilder buf = new StringBuilder();
    int last = literal.length() - 1; // index of the closing quote
    int i = 1; // skip first quote
    while ( i < last ) { // scan all but last quote
        char c = literal.charAt(i);
        if ( c != '\\' ) {
            buf.append(c);
            i++;
            continue;
        }
        i++; // step over the backslash
        if ( i >= last ) {
            // dangling backslash right before the closing quote: keep it
            // rather than swallowing the quote as an escaped character
            buf.append('\\');
            break;
        }
        char escChar = literal.charAt(i);
        if ( escChar == 'u' ) {
            // the four hex digits follow the 'u' at the current position,
            // wherever the escape sits inside the literal
            int hexStart = i + 1;
            int hexEnd = hexStart + 4;
            if ( hexEnd > last ) {
                throw new NumberFormatException(
                    "truncated unicode escape in literal: " + literal);
            }
            String unicodeChars = literal.substring(hexStart, hexEnd);
            buf.append((char)Integer.parseInt(unicodeChars, 16));
            i = hexEnd; // consume 'u' and the digits
        }
        else {
            int charVal = 0;
            if ( escChar < ANTLRLiteralEscapedCharValue.length ) {
                charVal = ANTLRLiteralEscapedCharValue[escChar];
            }
            if ( charVal==0 ) buf.append(escChar); // Unnecessary escapes like '\{' should just yield {
            else buf.append((char)charVal);
            i++; // consume the escaped character so it is not appended again
        }
    }
    return buf.toString();
}
/** Return a string representing the escaped char for code c. E.g., If c
* has value 0x100, you will get "\u0100". ASCII gets the usual
* char (non-hex) representation. Control characters are spit out