more work on NFA

[git-p4: depot-paths = "//depot/code/antlr4/main/": change = 6735]
This commit is contained in:
parrt 2010-03-04 15:56:47 -08:00
parent bf08801022
commit d7bda19fd1
17 changed files with 771 additions and 618 deletions

View File

@ -27,13 +27,12 @@
*/
package org.antlr.v4.runtime.tree;
import org.antlr.runtime.BitSet;
import org.antlr.runtime.Token;
import org.antlr.runtime.tree.BaseTree;
import org.antlr.runtime.tree.Tree;
import org.antlr.v4.runtime.tree.gui.ASTViewer;
import java.util.BitSet;
/** A tree node that is a wrapper for a Token object. After the 3.0 release
* while building tree rewrite stuff, it became clear that computing
* parent and child index is very difficult and cumbersome. Better to
@ -183,7 +182,7 @@ public class CommonTree extends BaseTree {
// TODO: move to basetree when i settle on how runtime works
// TODO: don't include this node!!
/** include this node */
// TODO: reuse other method
public CommonTree getFirstDescendantWithType(int type) {
if ( getType()==type ) return this;
if ( children==null ) return null;
@ -196,12 +195,13 @@ public class CommonTree extends BaseTree {
return null;
}
// TODO: don't include this node!!
public CommonTree getFirstDescendantWithType(BitSet types) {
if ( types.get(getType()) ) return this;
if ( types.member(getType()) ) return this;
if ( children==null ) return null;
for (Object c : children) {
CommonTree t = (CommonTree)c;
if ( types.get(t.getType()) ) return t;
if ( types.member(t.getType()) ) return t;
CommonTree d = t.getFirstDescendantWithType(types);
if ( d!=null ) return d;
}

View File

@ -369,13 +369,23 @@ public class Tool {
}
// BUILD NFA FROM AST
NFAFactory factory = new ParserNFAFactory(g);
if ( g.getType()==ANTLRParser.LEXER ) factory = new LexerNFAFactory(g);
GrammarAST rules = (GrammarAST)g.ast.getFirstChildWithType(ANTLRParser.RULES);
List<GrammarAST> kids = rules.getChildren();
for (GrammarAST n : kids) {
if ( n.getType()!=ANTLRParser.RULE ) continue;
GrammarASTAdaptor adaptor = new GrammarASTAdaptor();
BufferedTreeNodeStream nodes =
new BufferedTreeNodeStream(adaptor,g.ast);
NFAFactory fac = new ParserNFAFactory(g);
if ( g.getType()==ANTLRParser.LEXER ) fac = new LexerNFAFactory(g);
NFABuilder nfaBuilder = new NFABuilder(nodes,fac);
nfaBuilder.downup(g.ast);
new BufferedTreeNodeStream(adaptor,n);
NFABuilder b = new NFABuilder(nodes,factory);
try {
b.rule();
}
catch (RecognitionException re) {
}
}
// PERFORM GRAMMAR ANALYSIS ON NFA: BUILD DECISION DFAs

View File

@ -1,6 +1,7 @@
package org.antlr.v4.automata;
import org.antlr.v4.misc.IntervalSet;
import org.antlr.v4.tool.Grammar;
/** */
public class AtomTransition extends Transition {
@ -33,4 +34,8 @@ public class AtomTransition extends Transition {
/** Compare by label value: negative, zero or positive as this label is
 *  below, equal to or above the other one. {@code o} is cast to
 *  AtomTransition without a type check.
 */
public int compareTo(Object o) {
    AtomTransition other = (AtomTransition)o;
    int delta = this.label - other.label;
    return delta;
}
/** Render this transition's label as the display name that grammar
 *  {@code g} reports for it via getTokenDisplayName.
 */
public String toString(Grammar g) {
    String displayName = g.getTokenDisplayName(label);
    return displayName;
}
}

View File

@ -11,4 +11,22 @@ public class BasicState extends NFAState {
public Transition incidentTransition;
public BasicState(NFA nfa) { super(nfa); }
/** A basic state holds a single transition slot: report 1 once it is
 *  filled, 0 while it is still empty.
 */
@Override
public int getNumberOfTransitions() {
    return transition==null ? 0 : 1;
}
/** Install {@code e} as this state's only transition.
 *
 *  @throws IllegalArgumentException if a transition was already set
 */
@Override
public void addTransition(Transition e) {
    if ( transition==null ) {
        transition = e;
        return;
    }
    throw new IllegalArgumentException("only one transition");
}
/** Return the single transition of this state (possibly null if none
 *  has been added yet).
 *
 *  Only indexes above zero are rejected; a negative index is not
 *  checked and yields the same transition as index 0.
 *
 *  @throws IllegalArgumentException if {@code i} is greater than 0
 */
@Override
public Transition transition(int i) {
    if ( i<=0 ) {
        return transition;
    }
    throw new IllegalArgumentException("only one transition");
}
}

View File

@ -1,5 +1,7 @@
package org.antlr.v4.automata;
import org.antlr.v4.tool.Grammar;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
@ -11,21 +13,61 @@ import java.util.Set;
public class FASerializer {
List<State> work;
Set<State> marked;
Grammar g;
State start;
public String serialize(State s) {
if ( s==null ) return null;
public FASerializer(Grammar g, State start) {
this.g = g;
this.start = start;
}
public String toString() {
if ( start==null ) return null;
work = new ArrayList<State>();
marked = new HashSet<State>();
work.add(s);
work.add(start);
StringBuilder buf = new StringBuilder();
State s = null;
while ( work.size()>0 ) {
s = work.remove(work.size()-1); // pop
System.out.println(s);
marked.add(s);
// add targets
s = work.remove(0);
int n = s.getNumberOfTransitions();
for (int i=0; i<n; i++) work.add( s.transition(i).target );
//System.out.println("visit "+getStateString(s)+"; edges="+n);
marked.add(s);
for (int i=0; i<n; i++) {
Transition t = s.transition(i);
work.add( t.target );
buf.append(getStateString(s));
if ( t instanceof EpsilonTransition ) {
buf.append("->"+getStateString(t.target)+'\n');
}
return "";
else if ( t instanceof RuleTransition ) {
buf.append("->"+getStateString(t.target)+'\n');
}
else {
AtomTransition a = (AtomTransition)t;
buf.append("-"+a.toString(g)+"->"+getStateString(t.target)+'\n');
}
}
}
return buf.toString();
}
/** Build the label used for state {@code s} in the serialized output.
 *
 *  The default is ".s" followed by the state number; states of the
 *  special types tested below get a descriptive prefix instead.
 *  The tests are independent (not an else-if chain), so when a state
 *  matches more than one of them the LAST matching line decides the label.
 */
String getStateString(State s) {
    int n = s.stateNumber;
    String stateStr = ".s"+n;
    if ( s instanceof PlusBlockStartState ) stateStr = "PlusBlockStart_"+n;
    if ( s instanceof StarBlockStartState ) stateStr = "StarBlockStart_"+n;
    // NOTE(review): if the Plus/Star start states are subclasses of
    // BlockStartState (not visible here), this test overrides the two
    // labels above -- confirm the intended order.
    if ( s instanceof BlockStartState ) stateStr = "BlockStart_"+n;
    if ( s instanceof BlockEndState ) stateStr = "BlockEnd_"+n;
    if ( s instanceof RuleStartState ) stateStr = "RuleStart_"+n;
    // NOTE(review): unlike the other prefixes this one has no trailing
    // underscore; kept as-is so the serialized text does not change.
    if ( s instanceof RuleStopState ) stateStr = "RuleStop"+n;
    if ( s instanceof LoopbackState ) stateStr = "LoopBack_"+n;
    return stateStr;
}
}

View File

@ -57,8 +57,8 @@ public abstract class Label implements /*Comparable, */ Cloneable {
*/
public static final int MIN_ATOM_VALUE = EOT;
// public static final int MIN_CHAR_VALUE = '\u0000';
// public static final int MAX_CHAR_VALUE = '\uFFFE';
public static final int MIN_CHAR_VALUE = '\u0000';
public static final int MAX_CHAR_VALUE = '\uFFFE';
/** End of rule token type; imaginary token type used only for
* local, partial FOLLOW sets to indicate that the local FOLLOW

View File

@ -9,6 +9,13 @@ import java.util.List;
public class NFA {
public Grammar g;
public List<NFAState> states = new ArrayList<NFAState>();
/** Each subrule/rule is a decision point and we must track them so we
* can go back later and build DFA predictors for them. This includes
* all the rules, subrules, optional blocks, ()+, ()* etc...
*/
protected List<NFAState> decisionToNFAState = new ArrayList<NFAState>();
int stateNumber = 0;
public NFA(Grammar g) { this.g = g; }
@ -17,4 +24,9 @@ public class NFA {
states.add(state);
state.stateNumber = stateNumber++;
}
/** Append {@code s} to decisionToNFAState and return the zero-based
 *  index at which it was stored.
 */
public int defineDecisionState(NFAState s) {
    int decision = decisionToNFAState.size(); // index the new entry will occupy
    decisionToNFAState.add(s);
    return decision;
}
}

View File

@ -1,6 +0,0 @@
package org.antlr.v4.automata;
/** Start state of an optional block. Adds no state or behavior of its
 *  own beyond what BlockStartState provides.
 */
public class OptionalBlockStartState extends BlockStartState {
    public OptionalBlockStartState(NFA nfa) {
        super(nfa);
    }
}

View File

@ -2,11 +2,9 @@ package org.antlr.v4.automata;
import org.antlr.v4.misc.IntSet;
import org.antlr.v4.tool.Grammar;
import org.antlr.v4.tool.GrammarAST;
import org.antlr.v4.tool.Rule;
import org.antlr.v4.tool.TerminalAST;
import org.antlr.v4.tool.*;
import java.lang.reflect.Constructor;
import java.util.Collection;
import java.util.List;
@ -43,6 +41,20 @@ public class ParserNFAFactory implements NFAFactory {
this.currentRule = g.getRule(name);
}
/** Reflectively create a state of class {@code nodeType} through its
 *  one-argument (NFA) constructor, attach {@code node} as its AST and
 *  register it with the NFA.
 *
 *  @return the new state, or null when anything in that sequence threw;
 *          the failure is reported through ErrorManager.internalError
 */
public NFAState newState(Class nodeType, GrammarAST node) {
    NFAState result = null;
    try {
        Constructor ctor = nodeType.getConstructor(NFA.class);
        NFAState fresh = (NFAState)ctor.newInstance(nfa);
        fresh.ast = node;
        nfa.addState(fresh);
        // only publish the state once every step above succeeded
        result = fresh;
    }
    catch (Exception e) {
        ErrorManager.internalError("can't create NFA node: "+nodeType.getName(), e);
    }
    return result;
}
public BasicState newState(GrammarAST node) {
BasicState n = new BasicState(nfa);
n.ast = node;
@ -157,18 +169,16 @@ public class ParserNFAFactory implements NFAFactory {
/** From A|B|..|Z alternative block build
*
* o->o-A->o->o (last NFAState is blockEndNFAState pointed to by all alts)
* o->o-A->o->o (last NFAState is BlockEndState pointed to by all alts)
* | ^
* o->o-B->o--|
* |->o-B->o--|
* | |
* ... |
* | |
* o->o-Z->o--|
* |->o-Z->o--|
*
* So every alternative gets begin NFAState connected by epsilon
* and every alt right side points at a block end NFAState. There is a
* new NFAState in the NFAState in the handle for each alt plus one for the
* end NFAState.
* So start node points at every alternative with epsilon transition
* and every alt right side points at a block end NFAState.
*
* Special case: only one alternative: don't make a block with alt
* begin/end.
@ -176,11 +186,23 @@ public class ParserNFAFactory implements NFAFactory {
* Special case: if just a list of tokens/chars/sets, then collapse
* to a single edge'd o-set->o graph.
*
* Set alt number (1..n) in the left-Transition NFAState.
* TODO: Set alt number (1..n) in the states?
*/
public Handle block(List<Handle> alts) {
public Handle block(GrammarAST blkAST, List<Handle> alts) {
System.out.println("block: "+alts);
return null;
if ( alts.size()==1 ) return alts.get(0);
BlockStartState start = (BlockStartState)newState(BlockStartState.class, blkAST);
BlockEndState end = (BlockEndState)newState(BlockEndState.class, blkAST);
for (Handle alt : alts) {
epsilon(start, alt.left);
epsilon(alt.right, end);
}
Handle h = new Handle(start, end);
FASerializer ser = new FASerializer(g, h.left);
nfa.defineDecisionState(start);
System.out.println(blkAST.toStringTree()+":\n"+ser);
return h;
}
public Handle alt(List<Handle> els) {
@ -197,13 +219,27 @@ public class ParserNFAFactory implements NFAFactory {
*
* or, if A is a block, just add an empty alt to the end of the block
*/
public Handle optional(Handle A) {
OptionalBlockStartState left = new OptionalBlockStartState(nfa);
BlockEndState right = new BlockEndState(nfa);
epsilon(left, A.left);
epsilon(A.right, right);
epsilon(left, right);
return new Handle(left, right);
public Handle optional(GrammarAST optAST, Handle blk) {
if ( blk.left instanceof BlockStartState ) {
epsilon(blk.left, blk.right);
FASerializer ser = new FASerializer(g, blk.left);
System.out.println(optAST.toStringTree()+":\n"+ser);
return blk;
}
// construct block
BlockStartState start = (BlockStartState)newState(BlockStartState.class, optAST);
BlockEndState end = (BlockEndState)newState(BlockEndState.class, optAST);
epsilon(start, blk.left);
epsilon(blk.right, end);
epsilon(start, end);
nfa.defineDecisionState(start);
Handle h = new Handle(start, end);
FASerializer ser = new FASerializer(g, h.left);
System.out.println(optAST.toStringTree()+":\n"+ser);
return h;
}
/** From (A)+ build
@ -219,7 +255,7 @@ public class ParserNFAFactory implements NFAFactory {
* During analysis we'll call the follow link (transition 1) alt n+1 for
* an n-alt A block.
*/
public Handle plus(Handle A) { return null; }
public Handle plus(GrammarAST plusAST, Handle blk) { return null; }
/** From (A)* build
*
@ -251,7 +287,7 @@ public class ParserNFAFactory implements NFAFactory {
* is sufficient to let me make an appropriate enter, exit, loop
* determination. See codegen.g
*/
public Handle star(Handle A) { return null; }
public Handle star(GrammarAST starAST, Handle blk) { return null; }
/** Build an atom with all possible values in its label.
 *  Currently a stub: no states are built and null is always returned.
 */
public Handle wildcard(GrammarAST associatedAST) {
    return null;
}

View File

@ -28,4 +28,9 @@ public abstract class State {
if ( o instanceof State ) return this == (State)o;
return false;
}
/** A state prints as its state number. */
@Override
public String toString() {
    String text = String.valueOf(stateNumber);
    return text;
}
}

View File

@ -0,0 +1,65 @@
package org.antlr.v4.codegen;
import org.antlr.analysis.Label;
import org.antlr.tool.ErrorManager;
/** Static lookup tables and a helper for converting between characters
 *  and their ANTLR literal escape sequences.
 */
public class Target {
    /** When converting ANTLR char and string literals, here is the
     *  value set of escape chars. Indexed by the character that follows
     *  the backslash; the entry is the character value it denotes.
     */
    public static int ANTLRLiteralEscapedCharValue[] = new int[255];

    /** Given a char, we need to be able to show as an ANTLR literal.
     *  Indexed by character value; the entry is the escape sequence text
     *  (without surrounding quotes), or null when no escape is defined.
     */
    public static String ANTLRLiteralCharValueEscape[] = new String[255];

    static {
        ANTLRLiteralEscapedCharValue['n'] = '\n';
        ANTLRLiteralEscapedCharValue['r'] = '\r';
        ANTLRLiteralEscapedCharValue['t'] = '\t';
        ANTLRLiteralEscapedCharValue['b'] = '\b';
        ANTLRLiteralEscapedCharValue['f'] = '\f';
        ANTLRLiteralEscapedCharValue['\\'] = '\\';
        ANTLRLiteralEscapedCharValue['\''] = '\'';
        ANTLRLiteralEscapedCharValue['"'] = '"';
        ANTLRLiteralCharValueEscape['\n'] = "\\n";
        ANTLRLiteralCharValueEscape['\r'] = "\\r";
        ANTLRLiteralCharValueEscape['\t'] = "\\t";
        ANTLRLiteralCharValueEscape['\b'] = "\\b";
        ANTLRLiteralCharValueEscape['\f'] = "\\f";
        ANTLRLiteralCharValueEscape['\\'] = "\\\\";
        ANTLRLiteralCharValueEscape['\''] = "\\'";
    }

    /** Return a string representing the escaped char for code c. E.g., If c
     *  has value 0x100, you will get "\u0100". ASCII gets the usual
     *  char (non-hex) representation. Control characters are spit out
     *  as unicode. While this is specially set up for returning Java strings,
     *  it can be used by any language target that has the same syntax. :)
     *
     *  @param c the character code to render
     *  @return the single-quoted literal for {@code c}; "'<INVALID>'" (after
     *          reporting an internal error) when c is below Label.MIN_CHAR_VALUE
     */
    public static String getANTLRCharLiteralForChar(int c) {
        if ( c<Label.MIN_CHAR_VALUE ) {
            ErrorManager.internalError("invalid char value "+c);
            return "'<INVALID>'";
        }
        // Characters with a table entry (newline, tab, backslash, single
        // quote, ...) are emitted as their escape sequence. Backslash and
        // single quote are covered here, so no separate special cases are
        // needed further down.
        if ( c<ANTLRLiteralCharValueEscape.length && ANTLRLiteralCharValueEscape[c]!=null ) {
            return '\''+ANTLRLiteralCharValueEscape[c]+'\'';
        }
        // Printable basic-latin characters are shown verbatim.
        // NOTE(review): the (char) cast keeps only the low 16 bits, so this
        // presumably relies on callers passing values that fit in a char.
        if ( Character.UnicodeBlock.of((char)c)==Character.UnicodeBlock.BASIC_LATIN &&
             !Character.isISOControl((char)c) ) {
            return '\''+Character.toString((char)c)+'\'';
        }
        // turn on the bit above max "\uFFFF" value so that we pad with zeros
        // then only take last 4 digits
        String hex = Integer.toHexString(c|0x10000).toUpperCase().substring(1,5);
        String unicodeStr = "'\\u"+hex+"'";
        return unicodeStr;
    }
}

View File

@ -29,7 +29,7 @@ options {
language = Java;
tokenVocab = ANTLRParser;
ASTLabelType = GrammarAST;
filter = true;
// filter = true;
}
// Include the copyright in this source and also the generated source
@ -84,13 +84,13 @@ bottomup
;
rule returns [NFAFactory.Handle p]
: ^(RULE name=ID .+) {factory.setCurrentRuleName($name.text);}
: ^(RULE name=ID ~BLOCK* block) {factory.setCurrentRuleName($name.text);}
;
block returns [NFAFactory.Handle p]
@init {List<NFAFactory.Handle> alts = new ArrayList<NFAFactory.Handle>();}
: ^(BLOCK ~ALT* (a=alternative {alts.add($a.p);})+)
{factory.block(alts);}
{$p = factory.block($BLOCK, alts);}
;
alternative returns [NFAFactory.Handle p]
@ -125,9 +125,9 @@ treeSpec returns [NFAFactory.Handle p]
ebnf returns [NFAFactory.Handle p]
: ^(astBlockSuffix block) {$p = $block.p;}
| ^(OPTIONAL block) {$p = factory.optional($block.p);}
| ^(CLOSURE block) {$p = factory.star($block.p);}
| ^(POSITIVE_CLOSURE block) {$p = factory.plus($block.p);}
| ^(OPTIONAL block) {$p = factory.optional($start, $block.p);}
| ^(CLOSURE block) {$p = factory.star($start, $block.p);}
| ^(POSITIVE_CLOSURE block) {$p = factory.plus($start, $block.p);}
| block {$p = $block.p;}
;

File diff suppressed because it is too large Load Diff

View File

@ -4,6 +4,7 @@ import org.antlr.runtime.*;
import org.antlr.runtime.tree.TreeWizard;
import org.antlr.v4.Tool;
import org.antlr.v4.automata.Label;
import org.antlr.v4.codegen.Target;
import org.antlr.v4.parse.ANTLRLexer;
import org.antlr.v4.parse.ANTLRParser;
import org.antlr.v4.parse.GrammarASTAdaptor;
@ -65,6 +66,8 @@ public class Grammar implements AttributeResolver {
* field will have entries both mapped to 35.
*/
public Map<String, Integer> stringLiteralToTypeMap = new LinkedHashMap<String, Integer>();
/** Reverse index for stringLiteralToTypeMap */
public Vector<String> typeToStringLiteralList = new Vector<String>();
/** Map a token type to its token name.
* Must subtract MIN_TOKEN_TYPE from index.
@ -82,6 +85,8 @@ public class Grammar implements AttributeResolver {
public Map<String, String> options;
public Map<String, AttributeDict> scopes = new LinkedHashMap<String, AttributeDict>();
public static final String AUTO_GENERATED_TOKEN_NAME_PREFIX = "T__";
public Grammar(Tool tool, GrammarRootAST ast) {
if ( ast==null ) throw new IllegalArgumentException("can't pass null tree");
@ -284,7 +289,7 @@ public class Grammar implements AttributeResolver {
public String getStringLiteralLexerRuleName(String lit) {
int ttype = getTokenType(lit);
return "T__"+ttype;
return AUTO_GENERATED_TOKEN_NAME_PREFIX +ttype;
}
/** Return grammar directly imported by this grammar */
@ -308,6 +313,44 @@ public class Grammar implements AttributeResolver {
return i;
}
/** Given a token type, get a meaningful name for it such as the ID
 *  or string literal. If this is a lexer and the ttype is in the
 *  char vocabulary, compute an ANTLR-valid (possibly escaped) char literal.
 *
 *  Types whose slot lies beyond the end of typeToTokenList are shown as
 *  the plain number. A name that starts with the auto-generated prefix
 *  is replaced by the entry of typeToStringLiteralList for that type.
 */
public String getTokenDisplayName(int ttype) {
    // inside any target's char range and is lexer grammar?
    boolean lexerChar =
        getType()==ANTLRParser.LEXER &&
        ttype >= Label.MIN_CHAR_VALUE && ttype <= Label.MAX_CHAR_VALUE;
    if ( lexerChar ) {
        return Target.getANTLRCharLiteralForChar(ttype);
    }

    // faux label? those occupy the slots in front of the real tokens
    if ( ttype<0 ) {
        return typeToTokenList.get(Label.NUM_FAUX_LABELS+ttype);
    }

    // normalize to 0..n-1, then jump over the faux tokens
    int slot = (ttype-1) + Label.NUM_FAUX_LABELS;
    if ( slot>=typeToTokenList.size() ) {
        return String.valueOf(ttype);
    }

    String name = typeToTokenList.get(slot);
    boolean autoGenerated =
        name!=null && name.startsWith(AUTO_GENERATED_TOKEN_NAME_PREFIX);
    if ( autoGenerated ) {
        // NOTE(review): assumes the reverse index has an entry at ttype
        name = typeToStringLiteralList.get(ttype);
    }
    return name;
}
/** Return a new unique integer in the token type space */
public int getNewTokenType() {
maxTokenType++;
@ -333,6 +376,12 @@ public class Grammar implements AttributeResolver {
if ( !stringLiteralToTypeMap.containsKey(lit) ) {
int ttype = getNewTokenType();
stringLiteralToTypeMap.put(lit, ttype);
// track in reverse index too
if ( ttype>=typeToStringLiteralList.size() ) {
typeToStringLiteralList.setSize(ttype+1);
}
typeToStringLiteralList.set(ttype, text);
setTokenForType(ttype, lit);
return ttype;
}

View File

@ -1,11 +1,16 @@
package org.antlr.v4.tool;
import org.antlr.runtime.BitSet;
import org.antlr.runtime.CommonToken;
import org.antlr.runtime.Token;
import org.antlr.runtime.tree.Tree;
import org.antlr.v4.parse.ANTLRParser;
import org.antlr.v4.runtime.tree.CommonTree;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
public class GrammarAST extends CommonTree {
public GrammarAST() {;}
public GrammarAST(Token t) { super(t); }
@ -18,6 +23,23 @@ public class GrammarAST extends CommonTree {
t.setText(text);
}
/** Convenience overload: collect the nodes whose type equals {@code ttype}
 *  by delegating to the BitSet variant with a one-element set.
 */
public List<GrammarAST> getNodesWithType(int ttype) {
    BitSet singleType = BitSet.of(ttype);
    return getNodesWithType(singleType);
}
/** Collect every node of the subtree rooted at this node (this node
 *  included) whose token type is a member of {@code types}.
 *
 *  The tree is walked with an explicit work list, so nodes are visited
 *  and returned in breadth-first order.
 *
 *  @param types set of token types to look for
 *  @return the matching nodes; empty if there are none
 */
public List<GrammarAST> getNodesWithType(BitSet types) {
    List<GrammarAST> nodes = new ArrayList<GrammarAST>();
    List<GrammarAST> work = new LinkedList<GrammarAST>();
    work.add(this);
    while ( !work.isEmpty() ) {
        GrammarAST t = work.remove(0);
        // record the node being visited, not the root of the walk
        if ( types.member(t.getType()) ) nodes.add(t);
        // continue with the visited node's own children; leaves have none
        if ( t.children!=null ) {
            for (Object c : t.children) {
                work.add((GrammarAST)c);
            }
        }
    }
    return nodes;
}
@Override
public Tree dupNode() {
return new GrammarAST(this);

View File

@ -1,15 +1,14 @@
package org.antlr.v4.tool;
import org.antlr.runtime.BitSet;
import org.antlr.v4.parse.ANTLRParser;
import java.util.BitSet;
public class LabelElementPair {
public static final BitSet tokenTypeForTokens = new BitSet();
static {
tokenTypeForTokens.set(ANTLRParser.TOKEN_REF);
tokenTypeForTokens.set(ANTLRParser.STRING_LITERAL);
tokenTypeForTokens.set(ANTLRParser.WILDCARD);
tokenTypeForTokens.add(ANTLRParser.TOKEN_REF);
tokenTypeForTokens.add(ANTLRParser.STRING_LITERAL);
tokenTypeForTokens.add(ANTLRParser.WILDCARD);
}
public GrammarAST label;