From 2ddeb7c769b798731b0cc2d538fd55a0c101119b Mon Sep 17 00:00:00 2001 From: parrt Date: Fri, 17 Jun 2011 15:42:21 -0800 Subject: [PATCH] adding new files [git-p4: depot-paths = "//depot/code/antlr4/main/": change = 8658] --- .../org/antlr/v4/runtime/BaseRecognizer.java | 3 +- .../v4/runtime/LexerNoViableAltException.java | 2 +- .../v4/runtime/NoViableAltException.java | 2 +- .../src/org/antlr/v4/runtime/atn/ATN.java | 1 - .../org/antlr/v4/runtime/atn/ATNConfig.java | 124 +++ .../antlr/v4/runtime/atn/ATNInterpreter.java | 3 +- .../org/antlr/v4/runtime/atn/ATNStack.java | 75 -- .../v4/runtime/atn/LexerInterpreter.java | 1 - .../v4/runtime/atn/ParserInterpreter.java | 5 +- .../src/org/antlr/v4/runtime/dfa/DFA.java | 42 - .../org/antlr/v4/runtime/dfa/DFAState.java | 2 +- .../org/antlr/v4/runtime/misc/LABitSet.java | 219 ---- .../v4/runtime/misc/LookaheadStream.java | 2 - .../org/antlr/v4/runtime/tree/CommonTree.java | 9 +- tool/src/org/antlr/v4/Tool.java | 350 ++++++- .../src/org/antlr/v4/automata/ATNFactory.java | 189 ++++ .../src/org/antlr/v4/automata/ATNPrinter.java | 85 ++ .../org/antlr/v4/automata/ATNSerializer.java | 218 ++++ .../antlr/v4/automata/LexerATNFactory.java | 90 ++ .../antlr/v4/automata/ParserATNFactory.java | 506 +++++++++ tool/src/org/antlr/v4/misc/CharSupport.java | 129 +++ tool/src/org/antlr/v4/misc/DoubleKeyMap.java | 55 + tool/src/org/antlr/v4/misc/Interval.java | 142 +++ tool/src/org/antlr/v4/misc/IntervalSet.java | 536 ++++++++++ .../src/org/antlr/v4/misc/OrderedHashMap.java | 30 + .../src/org/antlr/v4/misc/OrderedHashSet.java | 88 ++ tool/src/org/antlr/v4/parse/ANTLRLexer.g | 714 +++++++++++++ tool/src/org/antlr/v4/parse/ANTLRParser.g | 964 ++++++++++++++++++ tool/src/org/antlr/v4/parse/ASTVerifier.g | 431 ++++++++ tool/src/org/antlr/v4/parse/ATNBuilder.g | 176 ++++ tool/src/org/antlr/v4/parse/ActionSplitter.g | 172 ++++ .../v4/parse/ActionSplitterListener.java | 27 + .../org/antlr/v4/parse/GrammarASTAdaptor.java | 46 + 
.../v4/parse/ResyncToEndOfRuleBlock.java | 7 + tool/src/org/antlr/v4/parse/ScopeParser.java | 237 +++++ .../org/antlr/v4/parse/TokenVocabParser.java | 142 +++ .../org/antlr/v4/parse/ToolANTLRParser.java | 47 + .../org/antlr/v4/parse/v4ParserException.java | 16 + tool/src/org/antlr/v4/tool/ActionAST.java | 29 + tool/src/org/antlr/v4/tool/AltAST.java | 21 + tool/src/org/antlr/v4/tool/Alternative.java | 136 +++ tool/src/org/antlr/v4/tool/Attribute.java | 37 + tool/src/org/antlr/v4/tool/AttributeDict.java | 86 ++ .../org/antlr/v4/tool/AttributeResolver.java | 40 + tool/src/org/antlr/v4/tool/BlockAST.java | 28 + tool/src/org/antlr/v4/tool/Grammar.java | 630 +++++++++++- tool/src/org/antlr/v4/tool/GrammarAST.java | 99 ++ .../src/org/antlr/v4/tool/GrammarRootAST.java | 33 + .../org/antlr/v4/tool/LabelElementPair.java | 48 + tool/src/org/antlr/v4/tool/LabelType.java | 15 + tool/src/org/antlr/v4/tool/LexerGrammar.java | 36 + tool/src/org/antlr/v4/tool/PredAST.java | 15 + tool/src/org/antlr/v4/tool/Rule.java | 246 +++++ tool/src/org/antlr/v4/tool/RuleAST.java | 29 + tool/src/org/antlr/v4/tool/TerminalAST.java | 19 + 55 files changed, 7051 insertions(+), 383 deletions(-) create mode 100644 runtime/Java/src/org/antlr/v4/runtime/atn/ATNConfig.java delete mode 100644 runtime/Java/src/org/antlr/v4/runtime/atn/ATNStack.java delete mode 100644 runtime/Java/src/org/antlr/v4/runtime/misc/LABitSet.java create mode 100644 tool/src/org/antlr/v4/automata/ATNFactory.java create mode 100644 tool/src/org/antlr/v4/automata/ATNPrinter.java create mode 100644 tool/src/org/antlr/v4/automata/ATNSerializer.java create mode 100644 tool/src/org/antlr/v4/automata/LexerATNFactory.java create mode 100644 tool/src/org/antlr/v4/automata/ParserATNFactory.java create mode 100644 tool/src/org/antlr/v4/misc/CharSupport.java create mode 100644 tool/src/org/antlr/v4/misc/DoubleKeyMap.java create mode 100644 tool/src/org/antlr/v4/misc/Interval.java create mode 100644 tool/src/org/antlr/v4/misc/IntervalSet.java 
create mode 100644 tool/src/org/antlr/v4/misc/OrderedHashMap.java create mode 100644 tool/src/org/antlr/v4/misc/OrderedHashSet.java create mode 100644 tool/src/org/antlr/v4/parse/ANTLRLexer.g create mode 100644 tool/src/org/antlr/v4/parse/ANTLRParser.g create mode 100644 tool/src/org/antlr/v4/parse/ASTVerifier.g create mode 100644 tool/src/org/antlr/v4/parse/ATNBuilder.g create mode 100644 tool/src/org/antlr/v4/parse/ActionSplitter.g create mode 100644 tool/src/org/antlr/v4/parse/ActionSplitterListener.java create mode 100644 tool/src/org/antlr/v4/parse/GrammarASTAdaptor.java create mode 100644 tool/src/org/antlr/v4/parse/ResyncToEndOfRuleBlock.java create mode 100644 tool/src/org/antlr/v4/parse/ScopeParser.java create mode 100644 tool/src/org/antlr/v4/parse/TokenVocabParser.java create mode 100644 tool/src/org/antlr/v4/parse/ToolANTLRParser.java create mode 100644 tool/src/org/antlr/v4/parse/v4ParserException.java create mode 100644 tool/src/org/antlr/v4/tool/ActionAST.java create mode 100644 tool/src/org/antlr/v4/tool/AltAST.java create mode 100644 tool/src/org/antlr/v4/tool/Alternative.java create mode 100644 tool/src/org/antlr/v4/tool/Attribute.java create mode 100644 tool/src/org/antlr/v4/tool/AttributeDict.java create mode 100644 tool/src/org/antlr/v4/tool/AttributeResolver.java create mode 100644 tool/src/org/antlr/v4/tool/BlockAST.java create mode 100644 tool/src/org/antlr/v4/tool/GrammarAST.java create mode 100644 tool/src/org/antlr/v4/tool/GrammarRootAST.java create mode 100644 tool/src/org/antlr/v4/tool/LabelElementPair.java create mode 100644 tool/src/org/antlr/v4/tool/LabelType.java create mode 100644 tool/src/org/antlr/v4/tool/LexerGrammar.java create mode 100644 tool/src/org/antlr/v4/tool/PredAST.java create mode 100644 tool/src/org/antlr/v4/tool/Rule.java create mode 100644 tool/src/org/antlr/v4/tool/RuleAST.java create mode 100644 tool/src/org/antlr/v4/tool/TerminalAST.java diff --git a/runtime/Java/src/org/antlr/v4/runtime/BaseRecognizer.java 
b/runtime/Java/src/org/antlr/v4/runtime/BaseRecognizer.java index 83ab0d1a1..92169efc4 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/BaseRecognizer.java +++ b/runtime/Java/src/org/antlr/v4/runtime/BaseRecognizer.java @@ -27,9 +27,8 @@ */ package org.antlr.v4.runtime; -import org.antlr.v4.analysis.ATNConfig; import org.antlr.v4.misc.*; -import org.antlr.v4.runtime.atn.ParserInterpreter; +import org.antlr.v4.runtime.atn.*; import java.util.*; diff --git a/runtime/Java/src/org/antlr/v4/runtime/LexerNoViableAltException.java b/runtime/Java/src/org/antlr/v4/runtime/LexerNoViableAltException.java index 98a6746ed..2a5fd88bb 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/LexerNoViableAltException.java +++ b/runtime/Java/src/org/antlr/v4/runtime/LexerNoViableAltException.java @@ -1,7 +1,7 @@ package org.antlr.v4.runtime; -import org.antlr.v4.analysis.ATNConfig; import org.antlr.v4.misc.OrderedHashSet; +import org.antlr.v4.runtime.atn.ATNConfig; public class LexerNoViableAltException extends LexerRecognitionExeption { /** Prediction began at what input index? */ diff --git a/runtime/Java/src/org/antlr/v4/runtime/NoViableAltException.java b/runtime/Java/src/org/antlr/v4/runtime/NoViableAltException.java index 5ce905745..521801e93 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/NoViableAltException.java +++ b/runtime/Java/src/org/antlr/v4/runtime/NoViableAltException.java @@ -27,8 +27,8 @@ */ package org.antlr.v4.runtime; -import org.antlr.v4.analysis.ATNConfig; import org.antlr.v4.misc.OrderedHashSet; +import org.antlr.v4.runtime.atn.ATNConfig; public class NoViableAltException extends RecognitionException { /** Prediction began at what input index? 
*/ diff --git a/runtime/Java/src/org/antlr/v4/runtime/atn/ATN.java b/runtime/Java/src/org/antlr/v4/runtime/atn/ATN.java index 8700c4e7c..ae905a160 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/atn/ATN.java +++ b/runtime/Java/src/org/antlr/v4/runtime/atn/ATN.java @@ -1,6 +1,5 @@ package org.antlr.v4.runtime.atn; -import org.antlr.v4.analysis.LL1Analyzer; import org.antlr.v4.automata.ATNSerializer; import org.antlr.v4.misc.*; import org.antlr.v4.runtime.RuleContext; diff --git a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNConfig.java b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNConfig.java new file mode 100644 index 000000000..b23865958 --- /dev/null +++ b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNConfig.java @@ -0,0 +1,124 @@ +package org.antlr.v4.runtime.atn; + +import org.antlr.v4.runtime.*; + +/** An ATN state, predicted alt, and syntactic/semantic context. + * The syntactic context is a pointer into the rule invocation + * chain used to arrive at the state. The semantic context is + * the unordered set semantic predicates encountered before reaching + * an ATN state. + */ +public class ATNConfig { + /** The ATN state associated with this configuration */ + public ATNState state; + + /** What alt (or lexer rule) is predicted by this configuration */ + public int alt; + + /** The stack of invoking states leading to the rule/states associated + * wit this config. + */ + public RuleContext context; + + /** + Indicates that we have reached this ATN configuration after + traversing a predicate transition. This is important because we + cannot cache DFA states derived from such configurations + otherwise predicates would not get executed again (DFAs don't + have predicated edges in v4). + */ + public boolean traversedPredicate; + + /** + Indicates that we have reached this ATN configuration after + traversing a non-force action transition. We do not execute + predicates after such actions because the predicates could be + functions of the side effects. 
Force actions must be either side + effect free or automatically undone as the parse continues. + */ + public boolean traversedAction; + + public ATNConfig(ATNState state, + int alt, + RuleContext context) + { + this.state = state; + this.alt = alt; + this.context = context; + } + + public ATNConfig(ATNConfig c) { + this.state = c.state; + this.alt = c.alt; + this.context = c.context; + this.traversedPredicate = c.traversedPredicate; + this.traversedAction = c.traversedAction; + } + + public ATNConfig(ATNConfig c, ATNState state) { + this(c); + this.state = state; + } + + public ATNConfig(ATNConfig c, ATNState state, RuleContext context) { + this(c); + this.state = state; + this.context = context; + } + + public ATNConfig(ATNConfig c, RuleContext context) { + this(c); + this.context = context; + } + + /** An ATN configuration is equal to another if both have + * the same state, they predict the same alternative, and + * syntactic/semantic contexts are the same. + */ + public boolean equals(Object o) { + if ( o==null ) return false; + if ( this==o ) return true; + ATNConfig other = (ATNConfig)o; + return this.state.stateNumber==other.state.stateNumber && + this.alt==other.alt && + (this.context==other.context || + this.context.equals(other.context)); + } + + public int hashCode() { + if ( state==null ) { + System.out.println("eh?"); + } + int h = state.stateNumber + alt; + if ( context!=null ) h += context.hashCode(); + return h; + } + + public String toString() { + return toString(null, true); + } + + public String toString(Recognizer recog, boolean showAlt) { + StringBuffer buf = new StringBuffer(); + if ( state.ruleIndex>0 ) { + if ( recog!=null ) buf.append(recog.getRuleNames()[state.ruleIndex]+":"); + else buf.append(state.ruleIndex+":"); + } + buf.append(state); + if ( showAlt ) { + buf.append("|"); + buf.append(alt); + } + if ( context!=null ) { + buf.append("|"); + buf.append(context); + } +// if (isAccept) { +// buf.append("|=>"+alt); +// } +// if ( 
context.approximated ) { +// buf.append("|approx"); +// } + return buf.toString(); + } +} diff --git a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNInterpreter.java b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNInterpreter.java index 91a7af973..f4330345b 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNInterpreter.java +++ b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNInterpreter.java @@ -1,10 +1,9 @@ package org.antlr.v4.runtime.atn; -import org.antlr.v4.analysis.ATNConfig; import org.antlr.v4.misc.*; import org.antlr.v4.parse.ANTLRParser; import org.antlr.v4.runtime.dfa.*; -import org.antlr.v4.tool.*; +import org.antlr.v4.tool.Grammar; import java.util.*; diff --git a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNStack.java b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNStack.java deleted file mode 100644 index 0059885a7..000000000 --- a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNStack.java +++ /dev/null @@ -1,75 +0,0 @@ -package org.antlr.v4.runtime.atn; - -/** Identical to ANTLR's static grammar analysis ATNContext object */ -public class ATNStack { - public static final ATNStack EMPTY = new ATNStack(null, -1); - - public ATNStack parent; - - /** The ATN state following state that invoked another rule's start state - * is recorded on the rule invocation context stack. - */ - public int returnAddr; - - /** Computing the hashCode is very expensive and ATN.addToClosure() - * uses it to track when it's seen a state|ctx before to avoid - * infinite loops. As we add new contexts, record the hash code - * as this + parent.cachedHashCode. Avoids walking - * up the tree for every hashCode(). Note that this caching works - * because a context is a monotonically growing tree of context nodes - * and nothing on the stack is ever modified...ctx just grows - * or shrinks. 
- */ - protected int cachedHashCode; - - public ATNStack(ATNStack parent, int returnAddr) { - this.parent = parent; - this.returnAddr = returnAddr; - if ( returnAddr >= 0 ) { - this.cachedHashCode = returnAddr; - } - if ( parent!=null ) { - this.cachedHashCode += parent.cachedHashCode; - } - } - - public int hashCode() { return cachedHashCode; } - - /** Two contexts are equals() if both have - * same call stack; walk upwards to the root. - * Recall that the root sentinel node has no parent. - * Note that you may be comparing contextsv in different alt trees. - */ - public boolean equals(Object o) { - ATNStack other = ((ATNStack)o); - if ( this.cachedHashCode != other.cachedHashCode ) { - return false; // can't be same if hash is different - } - if ( this==other ) return true; - - // System.out.println("comparing "+this+" with "+other); - ATNStack sp = this; - while ( sp.parent!=null && other.parent!=null ) { - if ( sp.returnAddr != other.returnAddr) return false; - sp = sp.parent; - other = other.parent; - } - if ( !(sp.parent==null && other.parent==null) ) { - return false; // both pointers must be at their roots after walk - } - return true; - } - - public String toString() { - StringBuffer buf = new StringBuffer(); - ATNStack sp = this; - buf.append("["); - while ( sp.parent!=null ) { - buf.append(sp.returnAddr); - buf.append(" "); - sp = sp.parent; - } - buf.append("$]"); - return buf.toString(); - } -} diff --git a/runtime/Java/src/org/antlr/v4/runtime/atn/LexerInterpreter.java b/runtime/Java/src/org/antlr/v4/runtime/atn/LexerInterpreter.java index 4d4901817..6b579d893 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/atn/LexerInterpreter.java +++ b/runtime/Java/src/org/antlr/v4/runtime/atn/LexerInterpreter.java @@ -1,6 +1,5 @@ package org.antlr.v4.runtime.atn; -import org.antlr.v4.analysis.ATNConfig; import org.antlr.v4.misc.OrderedHashSet; import org.antlr.v4.runtime.*; import org.antlr.v4.runtime.dfa.*; diff --git 
a/runtime/Java/src/org/antlr/v4/runtime/atn/ParserInterpreter.java b/runtime/Java/src/org/antlr/v4/runtime/atn/ParserInterpreter.java index 6fff01453..0312aa278 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/atn/ParserInterpreter.java +++ b/runtime/Java/src/org/antlr/v4/runtime/atn/ParserInterpreter.java @@ -1,11 +1,8 @@ package org.antlr.v4.runtime.atn; -import org.antlr.runtime.CharStream; -import org.antlr.v4.analysis.ATNConfig; import org.antlr.v4.misc.*; import org.antlr.v4.runtime.*; import org.antlr.v4.runtime.dfa.*; -import org.antlr.v4.tool.DOTGenerator; import org.stringtemplate.v4.misc.MultiMap; import java.util.*; @@ -37,7 +34,7 @@ public class ParserInterpreter extends ATNInterpreter { this.parser = parser; ctxToDFAs = new HashMap(); decisionToDFA = new DFA[atn.getNumberOfDecisions()+1]; - DOTGenerator dot = new DOTGenerator(null); +// DOTGenerator dot = new DOTGenerator(null); // System.out.println(dot.getDOT(atn.rules.get(0), parser.getRuleNames())); // System.out.println(dot.getDOT(atn.rules.get(1), parser.getRuleNames())); } diff --git a/runtime/Java/src/org/antlr/v4/runtime/dfa/DFA.java b/runtime/Java/src/org/antlr/v4/runtime/dfa/DFA.java index bd3f5e0f1..816ce0095 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/dfa/DFA.java +++ b/runtime/Java/src/org/antlr/v4/runtime/dfa/DFA.java @@ -38,7 +38,6 @@ public class DFA { public Map states = new LinkedHashMap(); public DFAState s0; public int decision; -// public int maxTokenType; /** From which ATN state did we create this DFA? 
*/ public ATNState atnStartState; @@ -49,47 +48,6 @@ public class DFA { public boolean conflict; public DFA(ATNState atnStartState) { this.atnStartState = atnStartState; } -// public DFA(int maxTokenType) { this.maxTokenType = maxTokenType; } - -/* - public void addAll(Collection states) { - for (DFAState p : states) { - //addDFAEdge(p, t, q); - } - } - - public void addDFAEdge(OrderedHashSet p, - int t, - OrderedHashSet q) - { -// System.out.println("MOVE "+p+" -> "+q+" upon "+getTokenName(t)); - DFAState from = addDFAState(p); - DFAState to = addDFAState(q); - addDFAEdge(from, t, to); - } - - public void addDFAEdge(DFAState p, int t, DFAState q) { - if ( p.edges==null ) { - p.edges = new DFAState[maxTokenType+1]; // TODO: make adaptive - } - p.edges[t] = q; // connect - } - - protected DFAState addDFAState(OrderedHashSet configs) { - DFAState proposed = new DFAState(configs); - DFAState existing = states.get(proposed); - DFAState p; - if ( existing!=null ) p = existing; - else { - proposed.stateNumber = states.size(); - proposed.configs = new OrderedHashSet(); - proposed.configs.addAll(configs); - states.put(proposed, proposed); - p = proposed; - } - return p; - } - */ public String toString() { return toString(null); } diff --git a/runtime/Java/src/org/antlr/v4/runtime/dfa/DFAState.java b/runtime/Java/src/org/antlr/v4/runtime/dfa/DFAState.java index 41fc86e86..aae40c652 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/dfa/DFAState.java +++ b/runtime/Java/src/org/antlr/v4/runtime/dfa/DFAState.java @@ -1,8 +1,8 @@ package org.antlr.v4.runtime.dfa; -import org.antlr.v4.analysis.ATNConfig; import org.antlr.v4.misc.OrderedHashSet; import org.antlr.v4.runtime.RuleContext; +import org.antlr.v4.runtime.atn.ATNConfig; import java.util.*; diff --git a/runtime/Java/src/org/antlr/v4/runtime/misc/LABitSet.java b/runtime/Java/src/org/antlr/v4/runtime/misc/LABitSet.java deleted file mode 100644 index 43aa84be9..000000000 --- 
a/runtime/Java/src/org/antlr/v4/runtime/misc/LABitSet.java +++ /dev/null @@ -1,219 +0,0 @@ -package org.antlr.v4.runtime.misc; - -import org.antlr.v4.runtime.Token; - -/** */ -public class LABitSet implements Cloneable { - public final static int BITS = 64; // number of bits / long - public final static int LOG_BITS = 6; // 2^6 == 64 - - /* We will often need to do a mod operator (i mod nbits). Its - * turns out that, for powers of two, this mod operation is - * same as (i & (nbits-1)). Since mod is slow, we use a - * precomputed mod mask to do the mod instead. - */ - public final static int MOD_MASK = BITS - 1; - - public static final LABitSet EOF_SET = LABitSet.of(Token.EOF); - - /** The actual data bits */ - public long bits[]; - - public boolean EOF; // is EOF in set (-1)? - - /** Construct a bitset of size one word (64 bits) */ - public LABitSet() { - this(BITS); - } - - /** Construct a bitset given the size - * @param nbits The size of the bitset in bits - */ - public LABitSet(int nbits) { - bits = new long[((nbits - 1) >> LOG_BITS) + 1]; - } - - /** Construction from a static array of longs */ - public LABitSet(long[] bits_) { - if ( bits_==null || bits_.length==0 ) bits = new long[1]; - else bits = bits_; - } - - /** Construction from a static array of longs */ - public LABitSet(long[] bits_, boolean EOF) { - this(bits_); - this.EOF = EOF; - } - - public static LABitSet of(int el) { - LABitSet s = new LABitSet(el + 1); - s.add(el); - return s; - } - - /** or this element into this set (grow as necessary to accommodate) */ - public void add(int el) { - //System.out.println("add("+el+")"); - if ( el==Token.EOF ) { EOF = true; return; } - int n = wordNumber(el); - //System.out.println("word number is "+n); - //System.out.println("bits.length "+bits.length); - if (n >= bits.length) { - growToInclude(el); - } - bits[n] |= bitMask(el); - } - - public boolean member(int el) { - if ( el == Token.EOF ) return EOF; - int n = wordNumber(el); - if (n >= bits.length) 
return false; - return (bits[n] & bitMask(el)) != 0; - } - - /** return this | a in a new set */ - public LABitSet or(LABitSet a) { - if ( a==null ) { - return this; - } - LABitSet s = (LABitSet)this.clone(); - s.orInPlace((LABitSet)a); - return s; - } - - public void orInPlace(LABitSet a) { - if ( a==null ) { - return; - } - // If this is smaller than a, grow this first - if (a.bits.length > bits.length) { - setSize(a.bits.length); - } - int min = Math.min(bits.length, a.bits.length); - for (int i = min - 1; i >= 0; i--) { - bits[i] |= a.bits[i]; - } - EOF = EOF | a.EOF; - } - - // remove this element from this set - public void remove(int el) { - if ( el==Token.EOF ) { EOF = false; return; } - int n = wordNumber(el); - if (n >= bits.length) { - throw new IllegalArgumentException(el+" is outside set range of "+bits.length+" words"); - } - bits[n] &= ~bitMask(el); - } - - public Object clone() { - LABitSet s; - try { - s = (LABitSet)super.clone(); - s.bits = new long[bits.length]; - System.arraycopy(bits, 0, s.bits, 0, bits.length); - s.EOF = EOF; - return s; - } - catch (CloneNotSupportedException e) { - e.printStackTrace(System.err); - } - return null; - } - - /** - * Sets the size of a set. - * @param nwords how many words the new set should be - */ - void setSize(int nwords) { - long newbits[] = new long[nwords]; - int n = Math.min(nwords, bits.length); - System.arraycopy(bits, 0, newbits, 0, n); - bits = newbits; - } - - /** Get the first element you find and return it. 
*/ - public int getSingleElement() { - for (int i = 0; i < (bits.length << LOG_BITS); i++) { - if (member(i)) { - return i; - } - } - return Token.INVALID_TYPE; - } - - /** Transform a bit set into a string by formatting each element as an integer - * separator The string to put in between elements - * @return A commma-separated list of values - */ - public String toString() { - StringBuffer buf = new StringBuffer(); - String separator = ","; - boolean havePrintedAnElement = false; - buf.append('{'); - if ( EOF ) { buf.append("EOF"); havePrintedAnElement=true; } - - for (int i = 0; i < (bits.length << LOG_BITS); i++) { - if (member(i)) { - if ( havePrintedAnElement ) { - buf.append(separator); - } - buf.append(i); - havePrintedAnElement = true; - } - } - buf.append('}'); - return buf.toString(); - } - -// /**Create a string representation where instead of integer elements, the -// * ith element of vocabulary is displayed instead. Vocabulary is a Vector -// * of Strings. -// * separator The string to put in between elements -// * @return A commma-separated list of character constants. -// */ -// public String toString(String separator, List vocabulary) { -// String str = ""; -// for (int i = 0; i < (bits.length << LOG_BITS); i++) { -// if (member(i)) { -// if (str.length() > 0) { -// str += separator; -// } -// if (i >= vocabulary.size()) { -// str += "'" + (char)i + "'"; -// } -// else if (vocabulary.get(i) == null) { -// str += "'" + (char)i + "'"; -// } -// else { -// str += (String)vocabulary.get(i); -// } -// } -// } -// return str; -// } - - /** - * Grows the set to a larger number of bits. 
- * @param bit element that must fit in set - */ - public void growToInclude(int bit) { - int newSize = Math.max(bits.length << 1, numWordsToHold(bit)); - long newbits[] = new long[newSize]; - System.arraycopy(bits, 0, newbits, 0, bits.length); - bits = newbits; - } - - static long bitMask(int bitNumber) { - int bitPosition = bitNumber & MOD_MASK; // bitNumber mod BITS - return 1L << bitPosition; - } - - static int numWordsToHold(int el) { - return (el >> LOG_BITS) + 1; - } - - static int wordNumber(int bit) { - return bit >> LOG_BITS; // bit / BITS - } -} diff --git a/runtime/Java/src/org/antlr/v4/runtime/misc/LookaheadStream.java b/runtime/Java/src/org/antlr/v4/runtime/misc/LookaheadStream.java index 995aba26e..517d0990a 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/misc/LookaheadStream.java +++ b/runtime/Java/src/org/antlr/v4/runtime/misc/LookaheadStream.java @@ -27,8 +27,6 @@ */ package org.antlr.v4.runtime.misc; -import org.antlr.runtime.misc.FastQueue; - import java.util.NoSuchElementException; /** A lookahead queue that knows how to mark/release locations diff --git a/runtime/Java/src/org/antlr/v4/runtime/tree/CommonTree.java b/runtime/Java/src/org/antlr/v4/runtime/tree/CommonTree.java index 388bfef0d..ef4fc43de 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/tree/CommonTree.java +++ b/runtime/Java/src/org/antlr/v4/runtime/tree/CommonTree.java @@ -27,10 +27,11 @@ */ package org.antlr.v4.runtime.tree; -import org.antlr.runtime.BitSet; import org.antlr.v4.runtime.Token; import org.antlr.v4.runtime.tree.gui.ASTViewer; +import java.util.Set; + /** A tree node that is wrapper for a Token object. After 3.0 release * while building tree rewrite stuff, it became clear that computing * parent and child index is very difficult and cumbersome. Better to @@ -194,12 +195,12 @@ public class CommonTree extends BaseTree { } // TODO: don't include this node!! 
- public CommonTree getFirstDescendantWithType(BitSet types) { - if ( types.member(getType()) ) return this; + public CommonTree getFirstDescendantWithType(Set types) { + if ( types.contains(getType()) ) return this; if ( children==null ) return null; for (Object c : children) { CommonTree t = (CommonTree)c; - if ( types.member(t.getType()) ) return t; + if ( types.contains(t.getType()) ) return t; CommonTree d = t.getFirstDescendantWithType(types); if ( d!=null ) return d; } diff --git a/tool/src/org/antlr/v4/Tool.java b/tool/src/org/antlr/v4/Tool.java index 667c07328..84b82d204 100644 --- a/tool/src/org/antlr/v4/Tool.java +++ b/tool/src/org/antlr/v4/Tool.java @@ -1,28 +1,35 @@ package org.antlr.v4; +import org.antlr.runtime.*; +import org.antlr.tool.DOTGenerator; +import org.antlr.v4.parse.*; import org.antlr.v4.tool.*; +import java.io.IOException; +import java.lang.reflect.Field; import java.util.*; public class Tool { public String VERSION = "4.0-"+new Date(); - public static enum OptionArgType { NONE, STRING, INT } + public static enum OptionArgType { NONE, STRING } public static class Option { + String fieldName; String name; OptionArgType argType; Object defaultArgValue; String description; - public Option(String name, String description) { - this(name, OptionArgType.NONE, null, description); + public Option(String fieldName, String name, String description) { + this(fieldName, name, OptionArgType.NONE, null, description); } - public Option(String name, OptionArgType argType, String description) { - this(name, argType, null, description); + public Option(String fieldName, String name, OptionArgType argType, String description) { + this(fieldName, name, argType, null, description); } - public Option(String name, OptionArgType argType, Object defaultArgValue, String description) { + public Option(String fieldName, String name, OptionArgType argType, Object defaultArgValue, String description) { + this.fieldName = fieldName; this.name = name; this.argType = 
argType; this.defaultArgValue = defaultArgValue; @@ -30,29 +37,42 @@ public class Tool { } } + // fields set by option manager + + public String outputDirectory = "."; + public String libDirectory = "."; + public boolean report = false; + public boolean printGrammar = false; + public boolean debug = false; + public boolean profile = false; + public boolean trace = false; + public boolean generate_ATN_dot = false; + public String msgFormat = "antlr"; + public boolean saveLexer = false; + public boolean launch_ST_inspector = false; + public static Option[] optionDefs = { - new Option("o", OptionArgType.STRING, ".", "specify output directory where all output is generated"), - new Option("fo", OptionArgType.STRING, "same as -o but force even files with relative paths to dir"), - new Option("lib", "specify location of .token files"), - new Option("report", "print out a report about the grammar(s) processed"), - new Option("print", "print out the grammar without actions"), - new Option("debug", "generate a parser that emits debugging events"), - new Option("profile", "generate a parser that computes profiling information"), - new Option("atn", "generate rule augmented transition networks"), - new Option("message-format", OptionArgType.STRING, "specify output style for messages"), - new Option("version", "print the version of ANTLR and exit"), - new Option("savelexer", "save temp lexer file created for combined grammars"), - new Option("dbgST", "launch StringTemplate visualizer on generated code"), + new Option("outputDirectory", "-o", OptionArgType.STRING, ".", "specify output directory where all output is generated"), + new Option("libDirectory", "-lib", OptionArgType.STRING, ".", "specify location of .token files"), + new Option("report", "-report", "print out a report about the grammar(s) processed"), + new Option("printGrammar", "-print", "print out the grammar without actions"), + new Option("debug", "-debug", "generate a parser that emits debugging events"), + new 
Option("profile", "-profile", "generate a parser that computes profiling information"), + new Option("trace", "-trace", "generate a recognizer that traces rule entry/exit"), + new Option("generate_ATN_dot", "-atn", "generate rule augmented transition networks"), + new Option("msgFormat", "-message-format", OptionArgType.STRING, "antlr", "specify output style for messages"), + new Option("saveLexer", "-savelexer", "save temp lexer file created for combined grammars"), + new Option("launch_ST_inspector", "-dbgST", "launch StringTemplate visualizer on generated code"), }; - protected Map options = new HashMap(); + public final String[] args; - protected String[] args; + protected List grammarFiles = new ArrayList(); public ErrorManager errMgr = new ErrorManager(this); List listeners = - Collections.synchronizedList(new ArrayList()); + Collections.synchronizedList(new ArrayList()); /** Track separately so if someone adds a listener, it's the only one * instead of it and the default stderr listener. 
@@ -61,7 +81,8 @@ public class Tool { public static void main(String[] args) { Tool antlr = new Tool(args); - antlr.help(); + if ( args.length == 0 ) { antlr.help(); antlr.exit(0); } + antlr.processGrammarsOnCommandLine(); if (antlr.errMgr.getNumErrors() > 0) { @@ -79,18 +100,294 @@ public class Tool { public Tool(String[] args) { this.args = args; + parseArgs(); + } + + protected void parseArgs() { + int i=0; + while ( args!=null && i0 ) return; + + if ( g.getImportedGrammars()!=null ) { // process imported grammars (if any) + for (Grammar imp : g.getImportedGrammars()) { + processNonCombinedGrammar(imp); + } + } + + // BUILD ATN FROM AST + ATNFactory factory = new ParserATNFactory(g); + if ( g.isLexer() ) factory = new LexerATNFactory((LexerGrammar)g); + g.atn = factory.createATN(); + + if ( generate_ATN_dot ) generateATNs(g); + + // PERFORM GRAMMAR ANALYSIS ON ATN: BUILD DECISION DFAs + AnalysisPipeline anal = new AnalysisPipeline(g); + anal.process(); + + //if ( generate_DFA_dot ) generateDFAs(g); + + if ( g.tool.getNumErrors()>0 ) return; + + // GENERATE CODE + CodeGenPipeline gen = new CodeGenPipeline(g); + gen.process(); + } + + public Grammar createGrammar(GrammarRootAST ast) { + if ( ast.grammarType==ANTLRParser.LEXER ) return new LexerGrammar(this, ast); + else return new Grammar(this, ast); + } + + public GrammarAST load(String fileName) { + ANTLRFileStream in = null; + try { + in = new ANTLRFileStream(fileName); + } + catch (IOException ioe) { + errMgr.toolError(ErrorType.CANNOT_OPEN_FILE, fileName, ioe); + } + return load(in); + } + + public GrammarAST loadFromString(String grammar) { + return load(new ANTLRStringStream(grammar)); + } + + public GrammarAST load(CharStream in) { + try { + ANTLRLexer lexer = new ANTLRLexer(in); + CommonTokenStream tokens = new CommonTokenStream(lexer); + ToolANTLRParser p = new ToolANTLRParser(tokens, this); + p.setTreeAdaptor(new GrammarASTAdaptor(in)); + ParserRuleReturnScope r = p.grammarSpec(); + GrammarAST root = 
(GrammarAST) r.getTree(); + if ( root instanceof GrammarRootAST ) { + ((GrammarRootAST)root).hasErrors = p.getNumberOfSyntaxErrors()>0; + } + return root; + } + catch (RecognitionException re) { + // TODO: do we gen errors now? + errMgr.internalError("can't generate this message at moment; antlr recovers"); + } + return null; + } + + /** Build lexer grammar from combined grammar that looks like: + * + * (COMBINED_GRAMMAR A + * (tokens { X (= Y 'y')) + * (OPTIONS (= x 'y')) + * (scope Blort { int x; }) + * (@ members {foo}) + * (@ lexer header {package jj;}) + * (RULES (RULE .+))) + * + * Move rules and actions to new tree, don't dup. Split AST apart. + * We'll have this Grammar share token symbols later; don't generate + * tokenVocab or tokens{} section. + * + * Side-effects: it removes children from GRAMMAR & RULES nodes + * in combined AST. Careful: nodes are shared between + * trees after this call. + */ + public GrammarRootAST extractImplicitLexer(Grammar combinedGrammar) { + GrammarRootAST combinedAST = combinedGrammar.ast; + //System.out.println("before="+combinedAST.toStringTree()); + GrammarASTAdaptor adaptor = new GrammarASTAdaptor(combinedAST.token.getInputStream()); + List elements = combinedAST.getChildren(); + + // MAKE A GRAMMAR ROOT and ID + String lexerName = combinedAST.getChild(0).getText()+"Lexer"; + GrammarRootAST lexerAST = + new GrammarRootAST(new CommonToken(ANTLRParser.GRAMMAR,"LEXER_GRAMMAR")); + lexerAST.grammarType = ANTLRParser.LEXER; + lexerAST.token.setInputStream(combinedAST.token.getInputStream()); + lexerAST.addChild((org.antlr.v4.tool.GrammarAST)adaptor.create(ANTLRParser.ID, lexerName)); + + // MOVE OPTIONS + org.antlr.v4.tool.GrammarAST optionsRoot = + (org.antlr.v4.tool.GrammarAST)combinedAST.getFirstChildWithType(ANTLRParser.OPTIONS); + if ( optionsRoot!=null ) { + org.antlr.v4.tool.GrammarAST lexerOptionsRoot = (org.antlr.v4.tool.GrammarAST)adaptor.dupNode(optionsRoot); + lexerAST.addChild(lexerOptionsRoot); + List options = 
optionsRoot.getChildren(); + for (org.antlr.v4.tool.GrammarAST o : options) { + String optionName = o.getChild(0).getText(); + if ( !Grammar.doNotCopyOptionsToLexer.contains(optionName) ) { + lexerOptionsRoot.addChild(o); + } + } + } + + // MOVE lexer:: actions + List actionsWeMoved = new ArrayList(); + for (org.antlr.v4.tool.GrammarAST e : elements) { + if ( e.getType()==ANTLRParser.AT ) { + if ( e.getChild(0).getText().equals("lexer") ) { + lexerAST.addChild(e); + actionsWeMoved.add(e); + } + } + } + elements.removeAll(actionsWeMoved); + org.antlr.v4.tool.GrammarAST combinedRulesRoot = + (org.antlr.v4.tool.GrammarAST)combinedAST.getFirstChildWithType(ANTLRParser.RULES); + if ( combinedRulesRoot==null ) return lexerAST; + + // MOVE lexer rules + + org.antlr.v4.tool.GrammarAST lexerRulesRoot = + (org.antlr.v4.tool.GrammarAST)adaptor.create(ANTLRParser.RULES, "RULES"); + lexerAST.addChild(lexerRulesRoot); + List rulesWeMoved = new ArrayList(); + List rules = combinedRulesRoot.getChildren(); + for (GrammarASTWithOptions r : rules) { + String ruleName = r.getChild(0).getText(); + if ( Character.isUpperCase(ruleName.charAt(0)) ) { + lexerRulesRoot.addChild(r); + rulesWeMoved.add(r); + } + } + int nLexicalRules = rulesWeMoved.size(); + rules.removeAll(rulesWeMoved); + + // Will track 'if' from IF : 'if' ; rules to avoid defining new token for 'if' + Map litAliases = + Grammar.getStringLiteralAliasesFromLexerRules(lexerAST); + + if ( nLexicalRules==0 && (litAliases==null||litAliases.size()==0) && + combinedGrammar.stringLiteralToTypeMap.size()==0 ) + { + // no rules, tokens{}, or 'literals' in grammar + return null; + } + + // add strings from combined grammar (and imported grammars) into to lexer + for (String lit : combinedGrammar.stringLiteralToTypeMap.keySet()) { + if ( litAliases!=null && litAliases.containsKey(lit) ) continue; // already has rule + // create for each literal: (RULE (BLOCK (ALT )) + String rname = combinedGrammar.getStringLiteralLexerRuleName(lit); 
+ // can't use wizard; need special node types + org.antlr.v4.tool.GrammarAST litRule = new RuleAST(ANTLRParser.RULE); + BlockAST blk = new BlockAST(ANTLRParser.BLOCK); + AltAST alt = new AltAST(ANTLRParser.ALT); + TerminalAST slit = new TerminalAST(new org.antlr.runtime.CommonToken(ANTLRParser.STRING_LITERAL, lit)); + alt.addChild(slit); + blk.addChild(alt); + CommonToken idToken = new CommonToken(ANTLRParser.ID, rname); + litRule.addChild(new TerminalAST(idToken)); + litRule.addChild(blk); + lexerRulesRoot.addChild(litRule); + +// (GrammarAST) +// wiz.create("(RULE ID["+rname+"] (BLOCK (ALT STRING_LITERAL["+lit+"])))"); + } + + System.out.println("after ="+combinedAST.toStringTree()); + System.out.println("lexer ="+lexerAST.toStringTree()); + return lexerAST; + } + + public void generateATNs(Grammar g) { + DOTGenerator dotGenerator = new DOTGenerator(g); + List grammars = new ArrayList(); + grammars.add(g); + List imported = g.getAllImportedGrammars(); + if ( imported!=null ) grammars.addAll(imported); + for (Grammar ig : grammars) { + for (Rule r : ig.rules.values()) { + try { + String dot = dotGenerator.getDOT(g.atn.ruleToStartState.get(r)); + if (dot != null) { + writeDOTFile(g, r, dot); + } + } catch (IOException ioe) { + errMgr.toolError(ErrorType.CANNOT_WRITE_FILE, ioe); + } + } + } + } public void help() { info("ANTLR Parser Generator Version " + new Tool().VERSION); for (Option o : optionDefs) { String name = o.name + (o.argType!=OptionArgType.NONE? 
" ___" : ""); - String s = String.format(" -%-19s %s", name, o.description); + String s = String.format(" %-19s %s", name, o.description); info(s); } } @@ -124,10 +421,9 @@ public class Tool { for (ANTLRToolListener l : listeners) l.warning(msg); } - - public void version() { - info("ANTLR Parser Generator Version " + new Tool().VERSION); - } + public void version() { + info("ANTLR Parser Generator Version " + new Tool().VERSION); + } public void exit(int e) { System.exit(e); } diff --git a/tool/src/org/antlr/v4/automata/ATNFactory.java b/tool/src/org/antlr/v4/automata/ATNFactory.java new file mode 100644 index 000000000..f27b91efe --- /dev/null +++ b/tool/src/org/antlr/v4/automata/ATNFactory.java @@ -0,0 +1,189 @@ +package org.antlr.v4.automata; + +import org.antlr.v4.misc.IntervalSet; +import org.antlr.v4.runtime.atn.*; +import org.antlr.v4.tool.*; + +import java.util.List; + +public interface ATNFactory { + /** A pair of states pointing to the left/right (start and end) states of a + * state submachine. Used to build ATNs. + */ + public static class Handle { + public ATNState left; + public ATNState right; + + public Handle(ATNState left, ATNState right) { + this.left = left; + this.right = right; + } + + @Override + public String toString() { + return "("+left+","+right+")"; + } + } + + ATN createATN(); + + void setCurrentRuleName(String name); + + Handle rule(GrammarAST ruleAST, String name, Handle blk); + + ATNState newState(); + + Handle label(Handle t); + + Handle listLabel(Handle t); + + Handle tokenRef(TerminalAST node); + + /** From set build single edge graph o->o-set->o. To conform to + * what an alt block looks like, must have extra state on left. + */ + Handle set(IntervalSet set, GrammarAST associatedAST); + + Handle tree(List els); + + Handle range(GrammarAST a, GrammarAST b); + + Handle not(GrammarAST a); + + /** For a non-lexer, just build a simple token reference atom. + * For a lexer, a string is a sequence of char to match. 
That is, + * "fog" is treated as 'f' 'o' 'g' not as a single transition in + * the DFA. Machine== o-'f'->o-'o'->o-'g'->o and has n+1 states + * for n characters. + */ + Handle stringLiteral(TerminalAST stringLiteralAST); + + /** For reference to rule r, build + * + * o-e->(r) o + * + * where (r) is the start of rule r and the trailing o is not linked + * to from rule ref state directly (it's done thru the transition(0) + * RuleClosureTransition. + * + * If the rule r is just a list of tokens, it's block will be just + * a set on an edge o->o->o-set->o->o->o, could inline it rather than doing + * the rule reference, but i'm not doing this yet as I'm not sure + * it would help much in the ATN->DFA construction. + * + * TODO add to codegen: collapse alt blks that are sets into single matchSet + * @param node + */ + Handle ruleRef(GrammarAST node); + + /** From an empty alternative build Grip o-e->o */ + Handle epsilon(GrammarAST node); + + /** Build what amounts to an epsilon transition with a semantic + * predicate action. The pred is a pointer into the AST of + * the SEMPRED token. + */ + Handle sempred(PredAST pred); + Handle gated_sempred(GrammarAST pred); + + /** Build what amounts to an epsilon transition with an action. + * The action goes into ATN though it is ignored during analysis. + * It slows things down a bit, but I must ignore predicates after + * having seen an action (5-5-2008). + */ + Handle action(ActionAST action); + + Handle alt(List els); + + /** From A|B|..|Z alternative block build + * + * o->o-A->o->o (last ATNState is blockEndATNState pointed to by all alts) + * | ^ + * o->o-B->o--| + * | | + * ... | + * | | + * o->o-Z->o--| + * + * So every alternative gets begin ATNState connected by epsilon + * and every alt right side points at a block end ATNState. There is a + * new ATNState in the ATNState in the Grip for each alt plus one for the + * end ATNState. + * + * Special case: only one alternative: don't make a block with alt + * begin/end. 
+ * + * Special case: if just a list of tokens/chars/sets, then collapse + * to a single edge'd o-set->o graph. + * + * Set alt number (1..n) in the left-Transition ATNState. + */ + Handle block(BlockAST blockAST, GrammarAST ebnfRoot, List alternativeGrips); + + Handle notBlock(GrammarAST blockAST, List terminals); + + /** From (A)? build either: + * + * o--A->o + * | ^ + * o---->| + * + * or, if A is a block, just add an empty alt to the end of the block + */ + Handle optional(GrammarAST optAST, Handle blk); + + /** From (A)+ build + * + * |---| (Transition 2 from A.right points at alt 1) + * v | (follow of loop is Transition 1) + * o->o-A-o->o + * + * Meaning that the last ATNState in A points back to A's left Transition ATNState + * and we add a new begin/end ATNState. A can be single alternative or + * multiple. + * + * During analysis we'll call the follow link (transition 1) alt n+1 for + * an n-alt A block. + */ + Handle plus(GrammarAST plusAST, Handle blk); + + /** From (A)* build + * + * |---| + * v | + * o->o-A-o--o (Transition 2 from block end points at alt 1; follow is Transition 1) + * | ^ + * o---------| (optional branch is 2nd alt of optional block containing A+) + * + * Meaning that the last (end) ATNState in A points back to A's + * left side ATNState and we add 3 new ATNStates (the + * optional branch is built just like an optional subrule). + * See the Aplus() method for more on the loop back Transition. + * The new node on right edge is set to RIGHT_EDGE_OF_CLOSURE so we + * can detect nested (A*)* loops and insert an extra node. Previously, + * two blocks shared same EOB node. + * + * There are 2 or 3 decision points in a A*. If A is not a block (i.e., + * it only has one alt), then there are two decisions: the optional bypass + * and then loopback. If A is a block of alts, then there are three + * decisions: bypass, loopback, and A's decision point. 
+ * + * Note that the optional bypass must be outside the loop as (A|B)* is + * not the same thing as (A|B|)+. + * + * This is an accurate ATN representation of the meaning of (A)*, but + * for generating code, I don't need a DFA for the optional branch by + * virtue of how I generate code. The exit-loopback-branch decision + * is sufficient to let me make an appropriate enter, exit, loop + * determination. See codegen.g + */ + Handle star(GrammarAST starAST, Handle blk); + + /** Build an atom with all possible values in its label */ + Handle wildcard(GrammarAST associatedAST); + + /** Build a subrule matching ^(. .*) (any tree or node). Let's use + * (^(. .+) | .) to be safe. + */ + Handle wildcardTree(GrammarAST associatedAST); +} diff --git a/tool/src/org/antlr/v4/automata/ATNPrinter.java b/tool/src/org/antlr/v4/automata/ATNPrinter.java new file mode 100644 index 000000000..b58ae0830 --- /dev/null +++ b/tool/src/org/antlr/v4/automata/ATNPrinter.java @@ -0,0 +1,85 @@ +package org.antlr.v4.automata; + +import org.antlr.v4.runtime.atn.*; +import org.antlr.v4.tool.Grammar; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** An ATN walker that knows how to dump them to serialized strings. 
*/ +public class ATNPrinter { + List work; + Set marked; + Grammar g; + ATNState start; + + public ATNPrinter(Grammar g, ATNState start) { + this.g = g; + this.start = start; + } + + public String toString() { + if ( start==null ) return null; + marked = new HashSet(); + + work = new ArrayList(); + work.add(start); + + StringBuilder buf = new StringBuilder(); + ATNState s = null; + + while ( work.size()>0 ) { + s = work.remove(0); + if ( marked.contains(s) ) continue; + int n = s.getNumberOfTransitions(); + //System.out.println("visit "+getATNStateString(s)+"; edges="+n); + marked.add(s); + for (int i=0; i"+ getStateString(t.target)+'\n'); + } + else if ( t instanceof RuleTransition ) { + buf.append("->"+ getStateString(t.target)+'\n'); + } + else if ( t instanceof ActionTransition ) { + ActionTransition a = (ActionTransition)t; + buf.append("-"+a.actionAST.getText()+"->"+ getStateString(t.target)+'\n'); + } + else if ( t instanceof AtomTransition ) { + AtomTransition a = (AtomTransition)t; + buf.append("-"+a.toString(g)+"->"+ getStateString(t.target)+'\n'); + } + else { + buf.append("-"+t.toString(g)+"->"+ getStateString(t.target)+'\n'); + } + } + } + return buf.toString(); + } + + String getStateString(ATNState s) { + if ( s==null ) { + System.out.println("s==null"); + } + int n = s.stateNumber; + String stateStr = "s"+n; + if ( s instanceof StarBlockStartState ) stateStr = "StarBlockStart_"+n; + else if ( s instanceof PlusBlockStartState ) stateStr = "PlusBlockStart_"+n; + else if ( s instanceof StarBlockStartState ) stateStr = "StarBlockStart_"+n; + else if ( s instanceof BlockStartState) stateStr = "BlockStart_"+n; + else if ( s instanceof BlockEndState ) stateStr = "BlockEnd_"+n; + else if ( s instanceof RuleStartState) stateStr = "RuleStart_"+s.rule.name+"_"+n; + else if ( s instanceof RuleStopState ) stateStr = "RuleStop_"+s.rule.name+"_"+n; + else if ( s instanceof PlusLoopbackState) stateStr = "PlusLoopBack_"+n; + else if ( s instanceof StarLoopbackState) 
stateStr = "StarLoopBack_"+n; + return stateStr; + } +} diff --git a/tool/src/org/antlr/v4/automata/ATNSerializer.java b/tool/src/org/antlr/v4/automata/ATNSerializer.java new file mode 100644 index 000000000..d0f11d933 --- /dev/null +++ b/tool/src/org/antlr/v4/automata/ATNSerializer.java @@ -0,0 +1,218 @@ +package org.antlr.v4.automata; + +import org.antlr.v4.misc.*; +import org.antlr.v4.runtime.atn.*; +import org.antlr.v4.tool.Rule; + +import java.util.*; + +public class ATNSerializer { + public ATN atn; + public List sets = new ArrayList(); + + public ATNSerializer(ATN atn) { this.atn = atn; } + + /** Serialize state descriptors, edge descriptors, and decision->state map + * into list of ints: + * + * grammar-type, (ANTLRParser.LEXER, ...) + * max token type, + * num states, + * state-0-type ruleIndex, state-1-type ruleIndex, ... + * num rules, + * rule-1-start-state rule-1-args, rule-2-start-state rule-2-args, ... + * (args are token type,actionIndex in lexer else 0,0) + * num modes, + * mode-0-start-state, mode-1-start-state, ... (parser has 0 modes) + * num sets + * set-0-interval-count intervals, set-1-interval-count intervals, ... + * num total edges, + * src, trg, edge-type, edge arg1, optional edge arg2 (present always), ... + * num decisions, + * decision-0-start-state, decision-1-start-state, ... + * + * Convenient to pack into unsigned shorts to make as Java string. 
+ */ + public List serialize() { + List data = new ArrayList(); + data.add(atn.g.getType()); + data.add(atn.g.getMaxTokenType()); + data.add(atn.states.size()); + int nedges = 0; + // dump states, count edges and collect sets while doing so + for (ATNState s : atn.states) { + data.add(ATNState.serializationTypes.get(s.getClass())); + if ( s.rule!=null ) data.add(s.rule.index); + else data.add(s.ruleIndex); + nedges += s.getNumberOfTransitions(); + for (int i=0; i0 ) { + for (ATNState modeStartState : atn.modeToStartState) { + data.add(modeStartState.stateNumber); + } + } + int nsets = sets.size(); + data.add(nsets); + for (IntervalSet set : sets) { + data.add(set.getIntervals().size()); + for (Interval I : set.getIntervals()) { + data.add(I.a); + data.add(I.b); + } + } + data.add(nedges); + int setIndex = 0; + for (ATNState s : atn.states) { + for (int i=0; i1 ) buf.append(", "); + buf.append(getTokenName(ATNInterpreter.toInt(data[p]))+".."+getTokenName(ATNInterpreter.toInt(data[p+1]))); + p += 2; + } + buf.append("\n"); + } + int nedges = ATNInterpreter.toInt(data[p++]); + for (int i=1; i<=nedges; i++) { + int src = ATNInterpreter.toInt(data[p]); + int trg = ATNInterpreter.toInt(data[p+1]); + int ttype = ATNInterpreter.toInt(data[p+2]); + int arg1 = ATNInterpreter.toInt(data[p+3]); + int arg2 = ATNInterpreter.toInt(data[p+4]); + buf.append(src+"->"+trg+ + " "+Transition.serializationNames[ttype]+ + " "+arg1+","+arg2+ + "\n"); + p += 5; + } + int ndecisions = ATNInterpreter.toInt(data[p++]); + for (int i=1; i<=ndecisions; i++) { + int s = ATNInterpreter.toInt(data[p++]); + buf.append((i-1)+":"+s+"\n"); + } + return buf.toString(); + } + + public String getTokenName(int t) { + if ( t==-1 ) return "EOF"; + if ( atn.g!=null ) return atn.g.getTokenDisplayName(t); + return String.valueOf(t); + } +} diff --git a/tool/src/org/antlr/v4/automata/LexerATNFactory.java b/tool/src/org/antlr/v4/automata/LexerATNFactory.java new file mode 100644 index 000000000..9f0a2505d --- 
/dev/null +++ b/tool/src/org/antlr/v4/automata/LexerATNFactory.java @@ -0,0 +1,90 @@ +package org.antlr.v4.automata; + +import org.antlr.v4.misc.CharSupport; +import org.antlr.v4.runtime.atn.*; +import org.antlr.v4.tool.*; + +import java.util.List; + +public class LexerATNFactory extends ParserATNFactory { + public LexerATNFactory(LexerGrammar g) { super(g); } + + public ATN createATN() { + // BUILD ALL START STATES (ONE PER MODE) + for (String modeName : ((LexerGrammar)g).modes.keySet()) { + // create s0, start state; implied Tokens rule node + TokensStartState startState = + (TokensStartState)newState(TokensStartState.class, null); + atn.modeNameToStartState.put(modeName, startState); + atn.modeToStartState.add(startState); + atn.defineDecisionState(startState); + } + + // CREATE ATN FOR EACH RULE + _createATN(g.rules.values()); + + // LINK MODE START STATE TO EACH TOKEN RULE + for (String modeName : ((LexerGrammar)g).modes.keySet()) { + List rules = ((LexerGrammar)g).modes.get(modeName); + TokensStartState startState = atn.modeNameToStartState.get(modeName); + for (Rule r : rules) { + if ( !r.isFragment() ) { + RuleStartState s = atn.ruleToStartState.get(r); + epsilon(startState, s); + } + } + } + + return atn; + } + + @Override + public Handle action(ActionAST action) { +// Handle h = super.action(action); +// ActionTransition a = (ActionTransition)h.left.transition(0); +// a.actionIndex = g.actions.get(action); +// return h; + // no actions in lexer ATN; just one on end and we exec via action number + ATNState x = newState(action); + return new Handle(x, x); // return just one blank state + } + + @Override + public Handle range(GrammarAST a, GrammarAST b) { + ATNState left = newState(a); + ATNState right = newState(b); + int t1 = CharSupport.getCharValueFromGrammarCharLiteral(a.getText()); + int t2 = CharSupport.getCharValueFromGrammarCharLiteral(b.getText()); + left.transition = new RangeTransition(t1, t2, right); + a.atnState = left; + b.atnState = left; + 
return new Handle(left, right); + } + + /** For a lexer, a string is a sequence of char to match. That is, + * "fog" is treated as 'f' 'o' 'g' not as a single transition in + * the DFA. Machine== o-'f'->o-'o'->o-'g'->o and has n+1 states + * for n characters. + */ + @Override + public Handle stringLiteral(TerminalAST stringLiteralAST) { + String chars = stringLiteralAST.getText(); + chars = CharSupport.getStringFromGrammarStringLiteral(chars); + int n = chars.length(); + ATNState left = newState(stringLiteralAST); + ATNState prev = left; + ATNState right = null; + for (int i=0; i rules) { + createRuleStartAndStopATNStates(); + + GrammarASTAdaptor adaptor = new GrammarASTAdaptor(); + for (Rule r : rules) { + // find rule's block + GrammarAST blk = (GrammarAST)r.ast.getFirstChildWithType(ANTLRParser.BLOCK); + CommonTreeNodeStream nodes = new CommonTreeNodeStream(adaptor,blk); + ATNBuilder b = new ATNBuilder(nodes,this); + try { + setCurrentRuleName(r.name); + Handle h = b.block(null); + rule(r.ast, r.name, h); + } + catch (RecognitionException re) { + ErrorManager.fatalInternalError("bad grammar AST structure", re); + } + } + } + + public void setCurrentRuleName(String name) { + this.currentRule = g.getRule(name); + } + + /* start->ruleblock->end */ + public Handle rule(GrammarAST ruleAST, String name, Handle blk) { + Rule r = g.getRule(name); + RuleStartState start = atn.ruleToStartState.get(r); + epsilon(start, blk.left); + RuleStopState stop = atn.ruleToStopState.get(r); + epsilon(blk.right, stop); + Handle h = new Handle(start, stop); +// FASerializer ser = new FASerializer(g, h.left); +// System.out.println(ruleAST.toStringTree()+":\n"+ser); + ruleAST.atnState = start; + return h; + } + + /** From label A build Graph o-A->o */ + public Handle tokenRef(TerminalAST node) { + ATNState left = newState(node); + ATNState right = newState(node); + int ttype = g.getTokenType(node.getText()); + left.transition = new AtomTransition(ttype, right); + 
right.incidentTransition = left.transition; + node.atnState = left; + return new Handle(left, right); + } + + /** From set build single edge graph o->o-set->o. To conform to + * what an alt block looks like, must have extra state on left. + */ + public Handle set(IntervalSet set, GrammarAST associatedAST) { + ATNState left = newState(associatedAST); + ATNState right = newState(associatedAST); + left.transition = new SetTransition(associatedAST, set, right); + right.incidentTransition = left.transition; + associatedAST.atnState = left; + return new Handle(left, right); + } + + public Handle tree(List els) { + return null; + } + + /** Not valid for non-lexers */ + public Handle range(GrammarAST a, GrammarAST b) { throw new UnsupportedOperationException(); } + + /** ~atom only */ + public Handle not(GrammarAST node) { + ATNState left = newState(node); + ATNState right = newState(node); + int ttype = getTokenType((GrammarAST) node.getChild(0)); + left.transition = new NotAtomTransition(ttype, right); + right.incidentTransition = left.transition; + node.atnState = left; + return new Handle(left, right); + } + + protected int getTokenType(GrammarAST atom) { + int ttype; + if ( g.isLexer() ) { + ttype = CharSupport.getCharValueFromGrammarCharLiteral(atom.getText()); + } + else { + ttype = g.getTokenType(atom.getText()); + } + return ttype; + } + + /** For a non-lexer, just build a simple token reference atom. */ + public Handle stringLiteral(TerminalAST stringLiteralAST) { + return tokenRef(stringLiteralAST); + } + + /** For reference to rule r, build + * + * o->(r) o + * + * where (r) is the start of rule r and the trailing o is not linked + * to from rule ref state directly (uses followState). 
+ */ + public Handle ruleRef(GrammarAST node) { + Handle h = _ruleRef(node); + Rule r = g.getRule(node.getText()); + addFollowLink(r, h.right); + return h; + } + + public Handle _ruleRef(GrammarAST node) { + Rule r = g.getRule(node.getText()); + RuleStartState start = atn.ruleToStartState.get(r); + ATNState left = newState(node); + ATNState right = newState(node); + RuleTransition call = new RuleTransition(r, start, right); + left.addTransition(call); + + node.atnState = left; + return new Handle(left, right); + } + + public void addFollowLink(Rule r, ATNState right) { + // add follow edge from end of invoked rule + RuleStopState stop = atn.ruleToStopState.get(r); + epsilon(stop, right); + } + + /** From an empty alternative build o-e->o */ + public Handle epsilon(GrammarAST node) { + ATNState left = newState(node); + ATNState right = newState(node); + epsilon(left, right); + node.atnState = left; + return new Handle(left, right); + } + + /** Build what amounts to an epsilon transition with a semantic + * predicate action. The pred is a pointer into the AST of + * the SEMPRED token. + */ + public Handle sempred(PredAST pred) { + //System.out.println("sempred: "+ pred); + ATNState left = newState(pred); + ATNState right = newState(pred); + PredicateTransition p = new PredicateTransition(pred, right); + p.ruleIndex = currentRule.index; + p.predIndex = g.sempreds.get(pred); + left.transition = p; + pred.atnState = left; + return new Handle(left, right); + } + + public Handle gated_sempred(GrammarAST pred) { + ATNState left = newState(pred); + ATNState right = newState(pred); + left.transition = new PredicateTransition(pred, right); + pred.atnState = left; + return new Handle(left, right); + } + + /** Build what amounts to an epsilon transition with an action. + * The action goes into ATN though it is ignored during analysis. + * It slows things down a bit, but I must ignore predicates after + * having seen an action (5-5-2008). 
+ */ + public Handle action(ActionAST action) { + //System.out.println("action: "+action); + ATNState left = newState(action); + ATNState right = newState(action); + ActionTransition a = new ActionTransition(action, right); + a.ruleIndex = currentRule.index; + if ( action.getType()==ANTLRParser.FORCED_ACTION ) { + a.actionIndex = g.actions.get(action); + } + left.transition = a; + action.atnState = left; + return new Handle(left, right); + } + + /** From A|B|..|Z alternative block build + * + * o->o-A->o->o (last ATNState is BlockEndState pointed to by all alts) + * | ^ + * |->o-B->o--| + * | | + * ... | + * | | + * |->o-Z->o--| + * + * So start node points at every alternative with epsilon transition + * and every alt right side points at a block end ATNState. + * + * Special case: only one alternative: don't make a block with alt + * begin/end. + * + * Special case: if just a list of tokens/chars/sets, then collapse + * to a single edge'd o-set->o graph. + * + * TODO: Set alt number (1..n) in the states? 
+ */ + public Handle block(BlockAST blkAST, GrammarAST ebnfRoot, List alts) { + if ( ebnfRoot==null ) { + if ( alts.size()==1 ) { + Handle h = alts.get(0); + blkAST.atnState = h.left; + return h; + } + BlockStartState start = (BlockStartState)newState(BlockStartState.class, blkAST); + return makeBlock(start, blkAST, alts); + } + switch ( ebnfRoot.getType() ) { + case ANTLRParser.OPTIONAL : + BlockStartState start = (BlockStartState)newState(BlockStartState.class, blkAST); + Handle h = makeBlock(start, blkAST, alts); + return optional(ebnfRoot, h); + case ANTLRParser.CLOSURE : + BlockStartState star = (StarBlockStartState)newState(StarBlockStartState.class, ebnfRoot); + h = makeBlock(star, blkAST, alts); + return star(ebnfRoot, h); + case ANTLRParser.POSITIVE_CLOSURE : + PlusBlockStartState plus = (PlusBlockStartState)newState(PlusBlockStartState.class, ebnfRoot); + h = makeBlock(plus, blkAST, alts); + return plus(ebnfRoot, h); + } + return null; + } + + protected Handle makeBlock(BlockStartState start, GrammarAST blkAST, List alts) { + BlockEndState end = (BlockEndState)newState(BlockEndState.class, blkAST); + start.endState = end; + for (Handle alt : alts) { + epsilon(start, alt.left); + epsilon(alt.right, end); + } + atn.defineDecisionState(start); + Handle h = new Handle(start, end); +// FASerializer ser = new FASerializer(g, h.left); +// System.out.println(blkAST.toStringTree()+":\n"+ser); + blkAST.atnState = start; + return h; + } + + public Handle notBlock(GrammarAST notAST, List terminals) { + // assume list of atoms + IntervalSet notSet = new IntervalSet(); + for (GrammarAST elemAST : terminals) { + if ( elemAST.getType()==ANTLRParser.RANGE ) { + GrammarAST from = (GrammarAST)elemAST.getChild(0); + GrammarAST to = (GrammarAST)elemAST.getChild(1); + notSet.add(getTokenType(from), getTokenType(to)); + } + else { + notSet.add(getTokenType(elemAST)); + } + } + + ATNState left = newState(notAST); + ATNState right = newState(notAST); + left.transition = new 
NotSetTransition(notAST, notSet, right); + right.incidentTransition = left.transition; + notAST.atnState = left; + return new Handle(left, right); + } + + public Handle alt(List els) { + Handle prev = null; + for (Handle el : els) { // hook up elements + if ( prev!=null ) epsilon(prev.right, el.left); + prev = el; + } + Handle first = els.get(0); + Handle last = els.get(els.size()-1); + if ( first==null || last==null ) { + g.tool.errMgr.toolError(ErrorType.INTERNAL_ERROR, "alt Handle has first|last == null"); + } + return new Handle(first.left, last.right); + } + + /** From (A)? build either: + * + * o--A->o + * | ^ + * o---->| + * + * or, if A is a block, just add an empty alt to the end of the block + */ + public Handle optional(GrammarAST optAST, Handle blk) { + // TODO: no such thing as nongreedy ()? so give error + BlockStartState blkStart = (BlockStartState)blk.left; + epsilon(blkStart, blk.right); + optAST.atnState = blk.left; + return blk; + } + + /** From (blk)+ build + * + * |---------| + * v | + * o->o-A-o->o->o->o loop back points at start of all alts + * | ^ + * |->o-B-o--| + * + * Meaning that the last ATNState in A blk points to loop back node, + * which points back to block start. We add start/end nodes to + * outside. 
+ */ + public Handle plus(GrammarAST plusAST, Handle blk) { + PlusBlockStartState start = (PlusBlockStartState)blk.left; + atn.defineDecisionState(start); // we don't use in code gen though + plusAST.atnState = start; + PlusLoopbackState loop = (PlusLoopbackState)newState(PlusLoopbackState.class, plusAST); + ATNState end = (ATNState)newState(ATNState.class, plusAST); + start.loopBackState = loop; + epsilon(blk.right, loop); + BlockAST blkAST = (BlockAST)plusAST.getChild(0); + // if not greedy, priority to exit branch; make it first + if ( !isGreedy(blkAST) ) epsilon(loop, end); + // connect loop back to all alt left edges + for (Transition trans : start.transitions) { + epsilon(loop, trans.target); + } + // if greedy, last alt of decisions is exit branch + if ( isGreedy(blkAST) ) epsilon(loop, end); + atn.defineDecisionState(loop); + return new Handle(start, end); + } + + /** From (blk)* build + * + * |----------| + * v | + * o-[blk]-o->o o + * | ^ + * o-------------| (optional branch is nth alt of StarBlockStartState) + * + * There 1 decision point in a A*. + * + * Note that the optional bypass must jump outside the loop as (A|B)* is + * not the same thing as (A|B|)+. 
+ */ + public Handle star(GrammarAST starAST, Handle elem) { + BlockAST blkAST = (BlockAST)starAST.getChild(0); + + StarBlockStartState blkStart = (StarBlockStartState)elem.left; + BlockEndState blkEnd = (BlockEndState)elem.right; + + StarLoopbackState loop = (StarLoopbackState)newState(StarLoopbackState.class, starAST); + ATNState end = (ATNState)newState(ATNState.class, starAST); + // If greedy, exit alt is last, else exit is first + if ( isGreedy(blkAST) ) { + epsilon(blkStart, end); // bypass edge + } + else { + blkStart.addTransitionFirst(new EpsilonTransition(end)); + } + epsilon(loop, blkStart); + epsilon(blkEnd, loop); + starAST.atnState = blkStart; + return new Handle(blkStart, end); + } + + /** Build an atom with all possible values in its label */ + public Handle wildcard(GrammarAST node) { + ATNState left = newState(node); + ATNState right = newState(node); + int ttype = g.getTokenType(node.getText()); + left.transition = new WildcardTransition(right); + right.incidentTransition = left.transition; + node.atnState = left; + return new Handle(left, right); + } + + /** Build a subrule matching ^(. .*) (any tree or node). Let's use + * (^(. .+) | .) to be safe. + */ + public Handle wildcardTree(GrammarAST associatedAST) { throw new UnsupportedOperationException(); } + + void epsilon(ATNState a, ATNState b) { + if ( a!=null ) a.addTransition(new EpsilonTransition(b)); + } + + /** Define all the rule begin/end ATNStates to solve forward reference + * issues. 
+ */ + void createRuleStartAndStopATNStates() { + for (Rule r : g.rules.values()) { + RuleStartState start = (RuleStartState)newState(RuleStartState.class, r.ast); + RuleStopState stop = (RuleStopState)newState(RuleStopState.class, r.ast); + start.stopState = stop; + start.setRule(r); + stop.setRule(r); + atn.ruleToStartState.put(r, start); + atn.rules.add(start); + atn.ruleToStopState.put(r, stop); + } + } + + /** add an EOF transition to any rule end ATNState that points to nothing + * (i.e., for all those rules not invoked by another rule). These + * are start symbols then. + * + * Return the number of grammar entry points; i.e., how many rules are + * not invoked by another rule (they can only be invoked from outside). + * These are the start rules. + */ + public int addEOFTransitionToStartRules() { + int n = 0; + ATNState eofTarget = newState(null); // one unique EOF target for all rules + for (Rule r : g.rules.values()) { + ATNState stop = atn.ruleToStopState.get(r); + if ( stop.getNumberOfTransitions()>0 ) continue; + n++; + Transition t = new AtomTransition(Token.EOF, eofTarget); + stop.addTransition(t); + } + return n; + } + + public Handle label(Handle t) { + return t; + } + + public Handle listLabel(Handle t) { + return t; + } + + public ATNState newState(Class nodeType, GrammarAST node) { + try { + Constructor ctor = nodeType.getConstructor(); + ATNState s = (ATNState)ctor.newInstance(); + s.ast = node; + s.setRule(currentRule); + atn.addState(s); + return s; + } + catch (Exception e) { + ErrorManager.internalError("can't create ATN node: "+nodeType.getName(), e); + } + return null; + } + + public ATNState newState(GrammarAST node) { + ATNState n = new ATNState(); + n.setRule(currentRule); + n.ast = node; + atn.addState(n); + return n; + } + + public ATNState newState() { return newState(null); } + + public boolean isGreedy(BlockAST blkAST) { + boolean greedy = true; + String greedyOption = blkAST.getOption("greedy"); + if ( blockHasWildcardAlt(blkAST) 
|| greedyOption!=null&&greedyOption.equals("false") ) { + greedy = false; + } + return greedy; + } + + // (BLOCK (ALT .)) or (BLOCK (ALT 'a') (ALT .)) + public static boolean blockHasWildcardAlt(GrammarAST block) { + for (Object alt : block.getChildren()) { + if ( !(alt instanceof AltAST) ) continue; + AltAST altAST = (AltAST)alt; + if ( altAST.getChildCount()==1 ) { + Tree e = altAST.getChild(0); + if ( e.getType()==ANTLRParser.WILDCARD ) { + return true; + } + } + } + return false; + } +} diff --git a/tool/src/org/antlr/v4/misc/CharSupport.java b/tool/src/org/antlr/v4/misc/CharSupport.java new file mode 100644 index 000000000..5bcecfedb --- /dev/null +++ b/tool/src/org/antlr/v4/misc/CharSupport.java @@ -0,0 +1,129 @@ +package org.antlr.v4.misc; + +import org.antlr.v4.tool.Grammar; + +/** */ +public class CharSupport { + /** When converting ANTLR char and string literals, here is the + * value set of escape chars. + */ + public static int ANTLRLiteralEscapedCharValue[] = new int[255]; + + /** Given a char, we need to be able to show as an ANTLR literal. + */ + public static String ANTLRLiteralCharValueEscape[] = new String[255]; + + static { + ANTLRLiteralEscapedCharValue['n'] = '\n'; + ANTLRLiteralEscapedCharValue['r'] = '\r'; + ANTLRLiteralEscapedCharValue['t'] = '\t'; + ANTLRLiteralEscapedCharValue['b'] = '\b'; + ANTLRLiteralEscapedCharValue['f'] = '\f'; + ANTLRLiteralEscapedCharValue['\\'] = '\\'; + ANTLRLiteralEscapedCharValue['\''] = '\''; + ANTLRLiteralEscapedCharValue['"'] = '"'; + ANTLRLiteralCharValueEscape['\n'] = "\\n"; + ANTLRLiteralCharValueEscape['\r'] = "\\r"; + ANTLRLiteralCharValueEscape['\t'] = "\\t"; + ANTLRLiteralCharValueEscape['\b'] = "\\b"; + ANTLRLiteralCharValueEscape['\f'] = "\\f"; + ANTLRLiteralCharValueEscape['\\'] = "\\\\"; + ANTLRLiteralCharValueEscape['\''] = "\\'"; + } + + /** Return a string representing the escaped char for code c. E.g., If c + * has value 0x100, you will get "\u0100". 
ASCII gets the usual + * char (non-hex) representation. Control characters are spit out + * as unicode. While this is specially set up for returning Java strings, + * it can be used by any language target that has the same syntax. :) + */ + public static String getANTLRCharLiteralForChar(int c) { + if ( c< Grammar.MIN_CHAR_VALUE ) { + return "''"; + } + if ( c { + Map> data = new LinkedHashMap>(); + + public Value put(Key1 k1, Key2 k2, Value v) { + Map data2 = data.get(k1); + Value prev = null; + if ( data2==null ) { + data2 = new LinkedHashMap(); + data.put(k1, data2); + } + else { + prev = data2.get(k2); + } + data2.put(k2, v); + return prev; + } + + public Value get(Key1 k1, Key2 k2) { + Map data2 = data.get(k1); + if ( data2==null ) return null; + return data2.get(k2); + } + + public Map get(Key1 k1) { return data.get(k1); } + + /** Get all values associated with primary key */ + public Collection values(Key1 k1) { + Map data2 = data.get(k1); + if ( data2==null ) return null; + return data2.values(); + } + + /** get all primary keys */ + public Set keySet() { + return data.keySet(); + } + + /** get all secondary keys associated with a primary key */ + public Set keySet(Key1 k1) { + Map data2 = data.get(k1); + if ( data2==null ) return null; + return data2.keySet(); + } +} diff --git a/tool/src/org/antlr/v4/misc/Interval.java b/tool/src/org/antlr/v4/misc/Interval.java new file mode 100644 index 000000000..979ea2e63 --- /dev/null +++ b/tool/src/org/antlr/v4/misc/Interval.java @@ -0,0 +1,142 @@ +/* + [The "BSD license"] + Copyright (c) 2005-2009 Terence Parr + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + 3. The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +package org.antlr.v4.misc; + +/** An immutable inclusive interval a..b */ +public class Interval { + public static final int INTERVAL_POOL_MAX_VALUE = 1000; + + static Interval[] cache = new Interval[INTERVAL_POOL_MAX_VALUE+1]; + + public int a; + public int b; + + public static int creates = 0; + public static int misses = 0; + public static int hits = 0; + public static int outOfRange = 0; + + public Interval(int a, int b) { this.a=a; this.b=b; } + + /** Interval objects are used readonly so share all with the + * same single value a==b up to some max size. Use an array as a perfect hash. + * Return shared object for 0..INTERVAL_POOL_MAX_VALUE or a new + * Interval object with a..a in it. On Java.g, 218623 IntervalSets + * have a..a (set with 1 element). 
+ */ + public static Interval create(int a, int b) { + //return new Interval(a,b); + // cache just a..a + if ( a!=b || a<0 || a>INTERVAL_POOL_MAX_VALUE ) { + return new Interval(a,b); + } + if ( cache[a]==null ) { + cache[a] = new Interval(a,a); + } + return cache[a]; + } + + public boolean equals(Object o) { + if ( o==null ) { + return false; + } + Interval other = (Interval)o; + return this.a==other.a && this.b==other.b; + } + + /** Does this start completely before other? Disjoint */ + public boolean startsBeforeDisjoint(Interval other) { + return this.a=other.a; + } + + /** Does this.a start after other.b? May or may not be disjoint */ + public boolean startsAfter(Interval other) { return this.a>other.a; } + + /** Does this start completely after other? Disjoint */ + public boolean startsAfterDisjoint(Interval other) { + return this.a>other.b; + } + + /** Does this start after other? NonDisjoint */ + public boolean startsAfterNonDisjoint(Interval other) { + return this.a>other.a && this.a<=other.b; // this.b>=other.b implied + } + + /** Are both ranges disjoint? I.e., no overlap? */ + public boolean disjoint(Interval other) { + return startsBeforeDisjoint(other) || startsAfterDisjoint(other); + } + + /** Are two intervals adjacent such as 0..41 and 42..42? 
*/ + public boolean adjacent(Interval other) { + return this.a == other.b+1 || this.b == other.a-1; + } + + public boolean properlyContains(Interval other) { + return other.a >= this.a && other.b <= this.b; + } + + /** Return the interval computed from combining this and other */ + public Interval union(Interval other) { + return Interval.create(Math.min(a,other.a), Math.max(b,other.b)); + } + + /** Return the interval in common between this and o */ + public Interval intersection(Interval other) { + return Interval.create(Math.max(a,other.a), Math.min(b,other.b)); + } + + /** Return the interval with elements from this not in other; + * other must not be totally enclosed (properly contained) + * within this, which would result in two disjoint intervals + * instead of the single one returned by this method. + */ + public Interval differenceNotProperlyContained(Interval other) { + Interval diff = null; + // other.a to left of this.a (or same) + if ( other.startsBeforeNonDisjoint(this) ) { + diff = Interval.create(Math.max(this.a,other.b+1), + this.b); + } + + // other.a to right of this.a + else if ( other.startsAfterNonDisjoint(this) ) { + diff = Interval.create(this.a, other.a-1); + } + return diff; + } + + public String toString() { + return a+".."+b; + } +} diff --git a/tool/src/org/antlr/v4/misc/IntervalSet.java b/tool/src/org/antlr/v4/misc/IntervalSet.java new file mode 100644 index 000000000..c774573f6 --- /dev/null +++ b/tool/src/org/antlr/v4/misc/IntervalSet.java @@ -0,0 +1,536 @@ +/* + [The "BSD license"] + Copyright (c) 2005-2009 Terence Parr + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + 3. The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +package org.antlr.v4.misc; + +import org.antlr.v4.runtime.Token; +import org.antlr.v4.tool.Grammar; + +import java.util.*; + +/** A set of integers that relies on ranges being common to do + * "run-length-encoded" like compression (if you view an IntSet like + * a BitSet with runs of 0s and 1s). Only ranges are recorded so that + * a few ints up near value 1000 don't cause massive bitsets, just two + * integer intervals. + * + * element values may be negative. Useful for sets of EPSILON and EOF. + * + * 0..9 char range is index pair ['\u0030','\u0039']. + * Multiple ranges are encoded with multiple index pairs. Isolated + * elements are encoded with an index pair where both intervals are the same. + * + * The ranges are ordered and disjoint so that 2..6 appears before 101..103. 
+ */ +public class IntervalSet implements IntSet { + public static final IntervalSet COMPLETE_SET = IntervalSet.of(0, Grammar.MAX_CHAR_VALUE); + public static final IntervalSet EMPTY_SET = new IntervalSet(); + + /** The list of sorted, disjoint intervals. */ + protected List intervals; + + /** Create a set with no elements */ + public IntervalSet() { + intervals = new ArrayList(2); // most sets are 1 or 2 elements + } + + public IntervalSet(List intervals) { + this.intervals = intervals; + } + + public IntervalSet(IntervalSet set) { + this(); + addAll(set); + } + + /** Create a set with a single element, el. */ + public static IntervalSet of(int a) { + IntervalSet s = new IntervalSet(); + s.add(a); + return s; + } + + /** Create a set with all ints within range [a..b] (inclusive) */ + public static IntervalSet of(int a, int b) { + IntervalSet s = new IntervalSet(); + s.add(a,b); + return s; + } + + public void clear() { + intervals.clear(); + } + + /** Add a single element to the set. An isolated element is stored + * as a range el..el. + */ + public void add(int el) { + add(el,el); + } + + /** Add interval; i.e., add all integers from a to b to set. 
+ * If b 0 ) { + IntervalSet s = IntervalSet.of(0, first.a-1); + IntervalSet a = (IntervalSet)s.and(vocabularyIS); + compl.addAll(a); + } + for (int i=1; i1 ) { + buf.append("{"); + } + Iterator iter = this.intervals.iterator(); + while (iter.hasNext()) { + Interval I = (Interval) iter.next(); + int a = I.a; + int b = I.b; + if ( a==b ) { + if ( g!=null ) { + buf.append(g.getTokenDisplayName(a)); + } + else { + buf.append(a); + } + } + else { + if ( g!=null ) { + if ( !g.isLexer() ) { + for (int i=a; i<=b; i++) { + if ( i>a ) buf.append(", "); + buf.append(g.getTokenDisplayName(i)); + } + } + else { + buf.append(g.getTokenDisplayName(a)+".."+g.getTokenDisplayName(b)); + } + } + else { + buf.append(a+".."+b); + } + } + if ( iter.hasNext() ) { + buf.append(", "); + } + } + if ( this.size()>1 ) { + buf.append("}"); + } + return buf.toString(); + } + + public int size() { + int n = 0; + int numIntervals = intervals.size(); + if ( numIntervals==1 ) { + Interval firstInterval = this.intervals.get(0); + return firstInterval.b-firstInterval.a+1; + } + for (int i = 0; i < numIntervals; i++) { + Interval I = (Interval) intervals.get(i); + n += (I.b-I.a+1); + } + return n; + } + + public List toList() { + List values = new ArrayList(); + int n = intervals.size(); + for (int i = 0; i < n; i++) { + Interval I = (Interval) intervals.get(i); + int a = I.a; + int b = I.b; + for (int v=a; v<=b; v++) { + values.add(Utils.integer(v)); + } + } + return values; + } + + /** Get the ith element of ordered set. Used only by RandomPhrase so + * don't bother to implement if you're not doing that for a new + * ANTLR code gen target. 
+ */ + public int get(int i) { + int n = intervals.size(); + int index = 0; + for (int j = 0; j < n; j++) { + Interval I = (Interval) intervals.get(j); + int a = I.a; + int b = I.b; + for (int v=a; v<=b; v++) { + if ( index==i ) { + return v; + } + index++; + } + } + return -1; + } + + public int[] toArray() { + int[] values = new int[size()]; + int n = intervals.size(); + int j = 0; + for (int i = 0; i < n; i++) { + Interval I = (Interval) intervals.get(i); + int a = I.a; + int b = I.b; + for (int v=a; v<=b; v++) { + values[j] = v; + j++; + } + } + return values; + } + + public void remove(int el) { + throw new NoSuchMethodError("IntervalSet.remove() unimplemented"); + } +} diff --git a/tool/src/org/antlr/v4/misc/OrderedHashMap.java b/tool/src/org/antlr/v4/misc/OrderedHashMap.java new file mode 100644 index 000000000..4de8c7dc6 --- /dev/null +++ b/tool/src/org/antlr/v4/misc/OrderedHashMap.java @@ -0,0 +1,30 @@ +package org.antlr.v4.misc; + +import java.util.*; + +/** I need the get-element-i functionality so I'm subclassing + * LinkedHashMap. + */ +public class OrderedHashMap extends LinkedHashMap { + /** Track the elements as they are added to the set */ + protected List elements = new ArrayList(); + + public K getKey(int i) { return elements.get(i); } + + @Override + public V put(K key, V value) { + elements.add(key); + return super.put(key, value); + } + + @Override + public V remove(Object key) { + throw new UnsupportedOperationException(); + } + + @Override + public void clear() { + elements.clear(); + super.clear(); + } +} diff --git a/tool/src/org/antlr/v4/misc/OrderedHashSet.java b/tool/src/org/antlr/v4/misc/OrderedHashSet.java new file mode 100644 index 000000000..8421c9db5 --- /dev/null +++ b/tool/src/org/antlr/v4/misc/OrderedHashSet.java @@ -0,0 +1,88 @@ +package org.antlr.v4.misc; + +import java.util.*; + +/** A HashMap that remembers the order that the elements were added. + * You can alter the ith element with set(i,value) too :) Unique list. 
+ * I need the replace/set-element-i functionality so I'm subclassing + * OrderedHashSet. + */ +public class OrderedHashSet extends LinkedHashSet { + /** Track the elements as they are added to the set */ + protected List elements = new ArrayList(); + + public T get(int i) { + return elements.get(i); + } + + /** Replace an existing value with a new value; updates the element + * list and the hash table, but not the key as that has not changed. + */ + public T set(int i, T value) { + T oldElement = elements.get(i); + elements.set(i,value); // update list + super.remove(oldElement); // now update the set: remove/add + super.add(value); + return oldElement; + } + + public boolean remove(int i) { + T o = elements.remove(i); + return super.remove(o); + } + + /** Add a value to list; keep in hashtable for consistency also; + * Key is object itself. Good for say asking if a certain string is in + * a list of strings. + */ + public boolean add(T value) { + boolean result = super.add(value); + if ( result ) { // only track if new element not in set + elements.add(value); + } + return result; + } + + public boolean remove(Object o) { + throw new UnsupportedOperationException(); + } + + public void clear() { + elements.clear(); + super.clear(); + } + + @Override + public int hashCode() { + return elements.hashCode(); + } + + @Override + public boolean equals(Object o) { +// System.out.print("equals " + this + ", " + o+" = "); + boolean same = elements!=null && elements.equals(((OrderedHashSet)o).elements); +// System.out.println(same); + return same; + } + + @Override + public Iterator iterator() { + return elements.iterator(); + } + + /** Return the List holding list of table elements. Note that you are + * NOT getting a copy so don't write to the list. 
+ */ + public List elements() { + return elements; + } + + @Override + public Object[] toArray() { + return elements.toArray(); + } + + public String toString() { + return elements.toString(); + } +} diff --git a/tool/src/org/antlr/v4/parse/ANTLRLexer.g b/tool/src/org/antlr/v4/parse/ANTLRLexer.g new file mode 100644 index 000000000..7c150a571 --- /dev/null +++ b/tool/src/org/antlr/v4/parse/ANTLRLexer.g @@ -0,0 +1,714 @@ +// File : A3Lexer.g +// Author : Jim Idle (jimi@temporal-wave.com) +// Copyright : Free BSD - See @header clause below +// Version : First implemented as part of ANTLR 3.2 this is the self +// hosting ANTLR 3 Lexer. +// +// Description +// ----------- +// This is the definitive lexer grammar for parsing ANTLR V3.x.x grammars. All other +// gramnmars are derived from this grammar via source code control integration (perforce) +// or by the gdiff tool. +// +// This grammar and its associated grmmmars A3Parser.g and A3Walker.g exhibit the following +// traits, which are recommended for all production quality grammars: +// +// 1) They are separate grammars, not composite grammars; +// 2) They implement all supporting methods in a superclass (at least this is recommended +// for language targets that support inheritence; +// 3) All errors are pushed as far down the parsing chain as possible, which means +// that the lexer tries to defer error reporting to the parser, and the parser +// tries to defer error reporting to a semantic phase consisting of a single +// walk of the AST. The reason for this is that the error messages produced +// from later phases of the parse will generally have better context and so +// be more useful to the end user. Consider the message: "Syntax error at 'options'" +// vs: "You cannot specify two options{} sections in a single grammar file". +// 4) The lexer is 'programmed' to catch common mistakes such as unterminated literals +// and report them specifically and not just issue confusing lexer mismatch errors. 
+// + +/** Read in an ANTLR grammar and build an AST. Try not to do + * any actions, just build the tree. + * + * The phases are: + * + * A3Lexer.g (this file) + * A3Parser.g + * A3Verify.g (derived from A3Walker.g) + * assign.types.g + * define.g + * buildnfa.g + * antlr.print.g (optional) + * codegen.g + * + * Terence Parr + * University of San Francisco + * 2005 + * Jim Idle (this v3 grammar) + * Temporal Wave LLC + * 2009 + */ +lexer grammar ANTLRLexer; + +// ============================================================================== +// Note that while this grammar does not care about order of constructs +// that don't really matter, such as options before @header etc, it must first +// be parsed by the original v2 parser, before it replaces it. That parser does +// care about order of structures. Hence we are constrained by the v2 parser +// for at least the first bootstrap release that causes this parser to replace +// the v2 version. +// ============================================================================== + +// ------- +// Options +// +// V3 option directives to tell the tool what we are asking of it for this +// grammar. +// +options { + + // Target language is Java, which is the default but being specific + // here as this grammar is also meant as a good example grammar for + // for users. + // + language = Java; + + // The super class that this lexer should expect to inherit from, and + // which contains any and all support routines for the lexer. This is + // commented out in this baseline (definitive or normative grammar) + // - see the ANTLR tool implementation for hints on how to use the super + // class + // + //superclass = AbstractA3Lexer; +} + +tokens { SEMPRED; FORCED_ACTION; } + +// Include the copyright in this source and also the generated source +// +@lexer::header { +/* + [The "BSD licence"] + Copyright (c) 2005-2009 Terence Parr + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + 3. The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +package org.antlr.v4.parse; +} + +// +=====================+ +// | Lexer specification | +// +=====================+ + +// -------- +// Comments +// +// ANTLR comments can be multi or single line and we don't care +// which particularly. However we also accept Javadoc style comments +// of the form: /** ... */ and we do take care to distinguish those +// from ordinary multi-line comments +// Note how we guide the lexical PATH because we want to issue a decriptive +// error message in case of a standalone '/' character, which makes no +// sense in ANTLR source code. 
We alo trap unterminated multi-line comments +// +fragment DOC_COMMENT : ; +COMMENT +@init { + + // Record the start line and offsets as if we need to report an + // unterminated comment, then we want to show the start of the comment + // we think is broken, not the end, where people will have to try and work + // it out themselves. + // + int startLine = $line; + int offset = getCharPositionInLine(); +} + : // Eat the first character only, then see if we have a comment + // or something silly. + // + '/' // Comment introducer + + ( + // Single line comment, possibly with embedded src/line directives + // in a similar style to the C pre-processor, allowing generated + // code to refer the programmer back to the original source code + // in case of error. + // + '/' + ( + (' $ANTLR')=> ' $ANTLR' SRC + | ~(NLCHARS)* + ) + + | // Multi-line comment, which may be a documentation comment + // if it starts /** (note that we protect against accidentaly + // recognizing a comment /**/ as a documentation comment + // + '*' ( + { input.LA(2) != '/'}?=> '*' { $type = DOC_COMMENT; } + | { true }?=> // Required to cover all alts with predicates + ) + + // Should we support embedded multiline comments here? + // + ( + // Pick out end of multiline comment and exit the loop + // if we find it. + // + { !(input.LA(1) == '*' && input.LA(2) == '/') }? + + // Anything else other than the non-greedy match of + // the comment close sequence + // + . + )* + ( + // Look for the comment terminator, but if it is accidentally + // unterminated, then we will hit EOF, which will trigger the + // epsilon alt and hence we can issue an error message relative + // to the start of the unterminated multi-line comment + // + '*/' + + | // Unterminated comment! 
+ // + { + // ErrorManager.msg(Msg.UNTERMINATED_DOC_COMMENT, startLine, offset, $pos, startLine, offset, $pos, (Object)null); + } + ) + + | // There was nothing that made sense following the opening '/' and so + // we issue an error regarding the malformed comment + // + { + // TODO: Insert error message relative to comment start + // + } + ) + { + // Unless we had a documentation comment, then we do not wish to + // pass the comments in to the parser. If you are writing a formatter + // then you will want to preserve the comments off channel, but could + // just skip and save token space if not. + // + if ($type != DOC_COMMENT) { + + $channel=2; // Comments are on channel 2 + } + } + ; + + +DOUBLE_QUOTE_STRING_LITERAL + : '"' (('\\')=>'\\' . | ~'"' )* '"' + ; + +DOUBLE_ANGLE_STRING_LITERAL + : '<<' (options {greedy=false;} : . )* '>>' + ; + +// -------------- +// Argument specs +// +// Certain argument lists, such as those specifying call parameters +// to a rule invocation, or input parameters to a rule specification +// are contained within square brackets. In the lexer we consume them +// all at once and sort them out later in the grammar analysis. +// +ARG_ACTION +@init +{ + StringBuffer theText = new StringBuffer(); +} + : '[' + ( + ('\\')=>'\\' + ( + (']')=>']' + { + // We do not include the \ character itself when picking up an escaped ] + // + theText.append(']'); + } + | c=. 
+ { + // We DO include the \ character when finding any other escape + // + theText.append('\\'); + theText.append((char)$c); + } + ) + + | ('"')=>as=ACTION_STRING_LITERAL + { + // Append the embedded string literal test + // + theText.append($as.text); + } + + | ('\'')=>ac=ACTION_CHAR_LITERAL + { + // Append the embedded chracter literal text + // + theText.append($ac.text); + } + + | c=~']' + { + // Whatever else we found in the scan + // + theText.append((char)$c); + } + )* + + ']' + { + // Set the token text to our gathered string + // + setText(theText.toString()); + } + ; + +// ------- +// Actions +// +// Other than making sure to distinguish between { and } embedded +// within what we have assumed to be literals in the action code, the +// job of the lexer is merely to gather the code within the action +// (delimited by {}) and pass it to the parser as a single token. +// Note the special case of the {{ }} action, which is a forced +// action, that the generated code will execute regardless of +// backtracking (predicate) level. +// We know that this token will be asked for its text somewhere +// in the upcoming parse, so setting the text here to exclude +// the delimiting {} is no additional overhead. +// +ACTION + : NESTED_ACTION ('?' {$type = SEMPRED;} )? + { + // Note that because of the sempred detection above, we + // will not see {{ action }}? as a forced action, but as a semantic + // predicate. + if ( $text.startsWith("{{") && $text.endsWith("}}") ) { + // Switch types to a forced action + $type = FORCED_ACTION; + } + } + ; + +// ---------------- +// Action structure +// +// Many language targets use {} as block delimiters and so we +// must recursively match {} delimited blocks to balance the +// braces. Additionally, we must make some assumptions about +// literal string representation in the target language. We assume +// that they are delimited by ' or " and so consume these +// in their own alts so as not to inadvertantly match {}. 
+// This rule calls itself on matching a { +// +fragment +NESTED_ACTION +@init { + + // Record the start line and offsets as if we need to report an + // unterminated block, then we want to show the start of the comment + // we think is broken, not the end, where people will have to try and work + // it out themselves. + // + int startLine = getLine(); + int offset = getCharPositionInLine(); +} + + : // Action and other blocks start with opening { + // + '{' + ( + // And now we can match one of a number of embedded + // elements within the action until we find a + // } that balances the opening {. If we do not find + // the balanced } then we will hit EOF and can issue + // an error message about the brace that we belive to + // be mismatched. This won't be foolproof but we will + // be able to at least report an error against the + // opening brace that we feel is in error and this will + // guide the user to the correction as best we can. + // + + + // An embedded {} block + // + NESTED_ACTION + + | // What appears to be a literal + // + ACTION_CHAR_LITERAL + + | // We have assumed that the target language has C/Java + // type comments. + // + COMMENT + + | // What appears to be a literal + // + ACTION_STRING_LITERAL + + | // What appears to be an escape sequence + // + ACTION_ESC + + | // Some other single character that is not + // handled above + // + ~('\\'|'"'|'\''|'/'|'{'|'}') + + )* + + ( + // Correctly balanced closing brace + // + '}' + + | // Looks like have an imblanced {} block, report + // with respect to the opening brace. + // + { + // TODO: Report imbalanced {} + System.out.println("Block starting at line " + startLine + " offset " + (offset+1) + " contains imbalanced {} or is missing a }"); + } + ) + ; + + +// Keywords +// -------- +// keywords used to specify ANTLR v3 grammars. 
Keywords may not be used as
+// labels for rules or in any other context where they would be ambiguous
+// with the keyword vs some other identifier
+// OPTIONS and TOKENS must also consume the opening brace that captures
+// their option block, as this is the easiest way to parse it separate
+// to an ACTION block, despite it using the same {} delimiters.
+//
+OPTIONS : 'options' WSNLCHARS* '{' ;
+TOKENS : 'tokens' WSNLCHARS* '{' ;
+
+SCOPE : 'scope' ;
+IMPORT : 'import' ;
+FRAGMENT : 'fragment' ;
+LEXER : 'lexer' ;
+PARSER : 'parser' ;
+TREE : 'tree' ;
+GRAMMAR : 'grammar' ;
+PROTECTED : 'protected' ;
+PUBLIC : 'public' ;
+PRIVATE : 'private' ;
+RETURNS : 'returns' ;
+THROWS : 'throws' ;
+CATCH : 'catch' ;
+FINALLY : 'finally' ;
+TEMPLATE : 'template' ;
+MODE : 'mode' ;
+
+// -----------
+// Punctuation
+//
+// Character sequences used as separators, delimiters, operators, etc
+//
+COLON : ':' ;
+COLONCOLON : '::' ;
+COMMA : ',' ;
+SEMI : ';' ;
+LPAREN : '(' ;
+RPAREN : ')' ;
+IMPLIES : '=>' ;
+LT : '<' ;
+GT : '>' ;
+ASSIGN : '=' ;
+QUESTION : '?' ;
+BANG : '!' ;
+STAR : '*' ;
+PLUS : '+' ;
+PLUS_ASSIGN : '+=' ;
+OR : '|' ;
+ROOT : '^' ;
+DOLLAR : '$' ;
+DOT : '.' ; // can be WILDCARD or DOT in qid or imported rule ref
+RANGE : '..' ;
+ETC : '...' ;
+RARROW : '->' ;
+TREE_BEGIN : '^(' ;
+AT : '@' ;
+NOT : '~' ;
+RBRACE : '}' ;
+
+// ---------------
+// Token reference
+//
+// The names of all tokens must start with an upper case letter and so
+// the lexer can distinguish them directly.
+//
+TOKEN_REF
+ : ('A'..'Z') ('A'..'Z' | 'a'..'z' | '0'..'9' | '_')*
+ ;
+
+// --------------
+// Rule reference
+//
+// The names of all rules must start with a lower case letter
+// so the lexer can distinguish them directly.
The parser takes
+// care of the case such as id=rulename
+//
+RULE_REF
+ : ('a'..'z') ('A'..'Z' | 'a'..'z' | '0'..'9' | '_')*
+ ;
+
+
+// ----------------------------
+// Literals embedded in actions
+//
+// Note that we have made the assumption that the language used within
+// actions uses the fairly standard " and ' delimiters for literals and
+// that within these literals, characters are escaped using the \ character.
+// There are some languages which do not conform to this in all cases, such
+// as by using /string/ and so on. We will have to deal with such cases
+// if they come up in targets.
+//
+
+// Within actions, or other structures that are not part of the ANTLR
+// syntax, we may encounter literal characters. Within these, we do
+// not want to inadvertently match things like '}' and so we eat them
+// specifically. While this rule is called CHAR it allows for the fact that
+// some languages may use/allow ' as the string delimiter.
+//
+fragment
+ACTION_CHAR_LITERAL
+ : '\'' (('\\')=>ACTION_ESC | ~'\'' )* '\''
+ ;
+
+// Within actions, or other structures that are not part of the ANTLR
+// syntax, we may encounter literal strings. Within these, we do
+// not want to inadvertently match things like '}' and so we eat them
+// specifically.
+//
+fragment
+ACTION_STRING_LITERAL
+ : '"' (('\\')=>ACTION_ESC | ~'"')* '"'
+ ;
+
+// Within literal strings and characters that are not part of the ANTLR
+// syntax, we must allow for escaped character sequences so that we do not
+// inadvertently recognize the end of a string or character when the terminating
+// delimiter has been escaped.
+//
+fragment
+ACTION_ESC
+ : '\\' .
+ ;
+
+// -------
+// Integer
+//
+// Obviously (I hope) match an arbitrarily long sequence of digits.
+//
+INT : ('0'..'9')+
+ ;
+
+// -----------
+// Source spec
+//
+// A fragment rule for picking up information about an originating
+// file from which the grammar we are parsing has been generated.
This allows +// ANTLR to report errors against the originating file and not the generated +// file. +// +fragment +SRC : 'src' WSCHARS+ file=ACTION_STRING_LITERAL WSCHARS+ line=INT + { + // TODO: Add target specific code to change the source file name and current line number + // + } + ; + +// -------------- +// Literal string +// +// ANTLR makes no disticintion between a single character literal and a +// multi-character string. All literals are single quote delimited and +// may contain unicode escape sequences of the form \uxxxx, where x +// is a valid hexadecimal number (as per Java basically). +STRING_LITERAL +@init { + int len = 0; +} + : '\'' ( ( ESC_SEQ | ~('\\'|'\'') ) {len++;} )* '\'' + ; + +// A valid hex digit specification +// +fragment +HEX_DIGIT : ('0'..'9'|'a'..'f'|'A'..'F') ; + +// Any kind of escaped character that we can embed within ANTLR +// literal strings. +// +fragment +ESC_SEQ + : '\\' + ( + // The standard escaped character set such as tab, newline, + // etc. + // + 'b'|'t'|'n'|'f'|'r'|'\"'|'\''|'\\' + + | // A Java style Unicode escape sequence + // + UNICODE_ESC + + | // An illegal escape seqeunce + // + { + // TODO: Issue error message + // + } + ) + ; + +fragment +UNICODE_ESC + +@init { + + // Flag to tell us whether we have a valid number of + // hex digits in the escape sequence + // + int hCount = 0; +} + : 'u' // Leadin for unicode escape sequence + + // We now require 4 hex digits. Note though + // that we accept any number of characters + // and issue an error if we do not get 4. We cannot + // use an inifinite count such as + because this + // might consume too many, so we lay out the lexical + // options and issue an error at the invalid paths. 
+ // + ( + ( + HEX_DIGIT { hCount++; } + ( + HEX_DIGIT { hCount++; } + ( + HEX_DIGIT { hCount++; } + ( + // Four valid hex digits, we are good + // + HEX_DIGIT { hCount++; } + + | // Three valid digits + ) + + | // Two valid digits + ) + + | // One valid digit + ) + ) + | // No valid hex digits at all + ) + + // Now check the digit count and issue an error if we need to + // + { + if (hCount != 4) { + + // TODO: Issue error message + } + } + ; + +// ---------- +// Whitespace +// +// Characters and character constructs that are of no import +// to the parser and are used to make the grammar easier to read +// for humans. +// +WS + : ( + ' ' + | '\t' + | '\r' + | '\n' + | '\f' + )+ + { + + $channel=2; + } + ; + +// A fragment rule for use in recognizing end of line in +// rules like COMMENT. +// +fragment +NLCHARS + : '\n' | '\r' + ; + +// A fragment rule for recognizing traditional whitespace +// characters within lexer rules. +// +fragment +WSCHARS + : ' ' | '\t' | '\f' + ; + +// A fragment rule for recognizing both traditional whitespace and +// end of line markers, when we don't care to distinguish but don't +// want any action code going on. +// +fragment +WSNLCHARS + : ' ' | '\t' | '\f' | '\n' | '\r' + ; + +// ----------------- +// Illegal Character +// +// This is an illegal character trap which is always the last rule in the +// lexer specification. It matches a single character of any value and being +// the last rule in the file will match when no other rule knows what to do +// about the character. It is reported as an error but is not passed on to the +// parser. This means that the parser to deal with the gramamr file anyway +// but we will not try to analyse or code generate from a file with lexical +// errors. +// +ERRCHAR + : . 
+ { + // TODO: Issue error message + // + skip(); + } + ; diff --git a/tool/src/org/antlr/v4/parse/ANTLRParser.g b/tool/src/org/antlr/v4/parse/ANTLRParser.g new file mode 100644 index 000000000..f097af925 --- /dev/null +++ b/tool/src/org/antlr/v4/parse/ANTLRParser.g @@ -0,0 +1,964 @@ +/* + [The "BSD license"] + Copyright (c) 2010 Jim Idle, Terence Parr + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + 3. The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/** The definitive ANTLR v3 grammar to parse ANTLR v4 grammars. + * The grammar builds ASTs that are sniffed by subsequent stages. 
+ */ +parser grammar ANTLRParser; + +options { + // Target language is Java, which is the default but being specific + // here as this grammar is also meant as a good example grammar for + // for users. + language = Java; + + // The output of this grammar is going to be an AST upon which + // we run a semantic checking phase, then the rest of the analysis + // including final code generation. + output = AST; + + // The vocabulary (tokens and their int token types) we are using + // for the parser. This is generated by the lexer. The vocab will be extended + // to include the imaginary tokens below. + tokenVocab = ANTLRLexer; + + ASTLabelType = GrammarAST; +} + +// Imaginary Tokens +// +// Imaginary tokens do not exist as far as the lexer is concerned, and it cannot +// generate them. However we sometimes need additional 'tokens' to use as root +// nodes for the AST we are generating. The tokens section is where we +// specify any such tokens +tokens { + LEXER; + RULE; + RULES; + RULEMODIFIERS; + RULEACTIONS; + BLOCK; + REWRITE_BLOCK; + OPTIONAL; + CLOSURE; + POSITIVE_CLOSURE; + SYNPRED; + RANGE; + CHAR_RANGE; + EPSILON; + ALT; + ALTLIST; + ID; + ARG; + ARGLIST; + RET; + COMBINED; + INITACTION; + LABEL; // $x used in rewrite rules + TEMPLATE; + GATED_SEMPRED; // {p}? => + SYN_SEMPRED; // (...) => it's a manually-specified synpred converted to sempred + BACKTRACK_SEMPRED; // auto backtracking mode syn pred converted to sempred + WILDCARD; + // A generic node indicating a list of something when we don't + // really need to distinguish what we have a list of as the AST + // will 'kinow' by context. + // + LIST; + ELEMENT_OPTIONS; // TOKEN + ST_RESULT; // distinguish between ST and tree rewrites + RESULT; + ALT_REWRITE; // indicate ALT is rewritten +} + +// Include the copyright in this source and also the generated source +// +@header { +/* + [The "BSD licence"] + Copyright (c) 2005-2009 Terence Parr + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + 3. The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +package org.antlr.v4.parse; + +import org.antlr.v4.tool.*; +} + +@members { +Stack paraphrases = new Stack(); +} + +// The main entry point for parsing a V3 grammar from top to toe. This is +// the method call from whence to obtain the AST for the parse. +// +grammarSpec + : + // The grammar itself can have a documenation comment, which is the + // first terminal in the file. + // + DOC_COMMENT? + + // Next we should see the type and name of the grammar file that + // we are about to parse. 
+ // + grammarType id SEMI + + // There now follows zero or more declaration sections that should + // be given to us before the rules are declared + // +// A number of things can be declared/stated before the grammar rules +// 'proper' are parsed. These include grammar imports (delegate), grammar +// options, imaginary token declarations, global scope declarations, +// and actions such as @header. In this rule we allow any number of +// these constructs in any order so that the grammar author is not +// constrained by some arbitrary order of declarations that nobody +// can remember. In the next phase of the parse, we verify that these +// constructs are valid, not repeated and so on. + sync ( prequelConstruct sync )* + + // We should now see at least one ANTLR EBNF style rule + // declaration. If the rules are missing we will let the + // semantic verification phase tell the user about it. + // + rules + + mode* + + // And we force ANTLR to process everything it finds in the input + // stream by specifying hte need to match End Of File before the + // parse is complete. + // + EOF + + // Having parsed everything in the file and accumulated the relevant + // subtrees, we can now rewrite everything into the main AST form + // that our tree walkers are expecting. + // + + -> ^(grammarType // The grammar type is our root AST node + id // We need to identify the grammar of course + DOC_COMMENT? 
// We may or may not have a global documentation comment for the file + prequelConstruct* // The set of declarations we accumulated + rules // And of course, we need the set of rules we discovered + mode* + ) + ; + +grammarType +@after { + if ( $t!=null ) ((GrammarRootAST)$tree).grammarType = $t.type; + else ((GrammarRootAST)$tree).grammarType=COMBINED; +} + : ( t=LEXER g=GRAMMAR -> GRAMMAR[$g, "LEXER_GRAMMAR"] + | // A standalone parser specification + t=PARSER g=GRAMMAR -> GRAMMAR[$g, "PARSER_GRAMMAR"] + + | // A standalone tree parser specification + t=TREE g=GRAMMAR -> GRAMMAR[$g, "TREE_GRAMMAR"] + + // A combined lexer and parser specification + | g=GRAMMAR -> GRAMMAR[$g, "COMBINED_GRAMMAR"] + ) + ; + +// This is the list of all constructs that can be declared before +// the set of rules that compose the grammar, and is invoked 0..n +// times by the grammarPrequel rule. +prequelConstruct + : // A list of options that affect analysis and/or code generation + optionsSpec + + | // A list of grammars to which this grammar will delegate certain + // parts of the parsing sequence - a set of imported grammars + delegateGrammars + + | // The declaration of any token types we need that are not already + // specified by a preceeding grammar, such as when a parser declares + // imaginary tokens with which to construct the AST, or a rewriting + // tree parser adds further imaginary tokens to ones defined in a prior + // {tree} parser. + tokensSpec + + | // A declaration of a scope that may be used in multiple rules within + // the grammar spec, rather than being delcared and therefore associated + // with, a specific rule. + attrScope + + | // A declaration of language target implemented constructs. All such + // action sections start with '@' and are given to the language target's + // StringTemplate group. For instance @parser::header and @lexer::header + // are gathered here. 
+ action + ; + +// A list of options that affect analysis and/or code generation +optionsSpec + : OPTIONS (option SEMI)* RBRACE -> ^(OPTIONS[$OPTIONS, "OPTIONS"] option+) + ; + +option + : id ASSIGN^ optionValue + ; + +// ------------ +// Option Value +// +// The actual value of an option - Doh! +// +optionValue + : // If the option value is a single word that conforms to the + // lexical rules of token or rule names, then the user may skip quotes + // and so on. Many option values meet this description + // + qid + + | // The value is a long string + // + STRING_LITERAL + + | // The value was an integer number + // + INT + + | // Asterisk, used for things like k=* + // + STAR + ; + +// A list of grammars to which this grammar will delegate certain +// parts of the parsing sequence - a set of imported grammars +delegateGrammars + : IMPORT delegateGrammar (COMMA delegateGrammar)* SEMI -> ^(IMPORT delegateGrammar+) + ; + +// A possibly named grammar file that should be imported to this gramamr +// and delgated to for the rules it specifies +delegateGrammar + : id ASSIGN^ id + | id + ; + +/** The declaration of any token types we need that are not already + * specified by a preceeding grammar, such as when a parser declares + * imaginary tokens with which to construct the AST, or a rewriting + * tree parser adds further imaginary tokens to ones defined in a prior + * {tree} parser. + */ +tokensSpec + : TOKENS tokenSpec+ RBRACE -> ^(TOKENS tokenSpec+) + ; + +tokenSpec + : id + ( ASSIGN STRING_LITERAL -> ^(ASSIGN id STRING_LITERAL) + | -> id + ) + SEMI + | RULE_REF // INVALID! (an error alt) + ; + +// A declaration of a scope that may be used in multiple rules within +// the grammar spec, rather than being declared within and therefore associated +// with, a specific rule. +attrScope + : SCOPE id ACTION -> ^(SCOPE id ACTION) + ; + +// A declaration of a language target specifc section, +// such as @header, @includes and so on. 
We do not verify these +// sections, they are just passed on to the language target. +/** Match stuff like @parser::members {int i;} */ +action + : AT (actionScopeName COLONCOLON)? id ACTION -> ^(AT actionScopeName? id ACTION) + ; + +/** Sometimes the scope names will collide with keywords; allow them as + * ids for action scopes. + */ +actionScopeName + : id + | LEXER -> ID[$LEXER] + | PARSER -> ID[$PARSER] + ; + +mode: MODE id SEMI sync (rule sync)+ -> ^(MODE id rule+) ; + +rules + : sync (rule sync)* + // Rewrite with an enclosing node as this is good for counting + // the number of rules and an easy marker for the walker to detect + // that there are no rules. + ->^(RULES rule*) + ; + +sync +@init { + BitSet followSet = computeErrorRecoverySet(); + if ( input.LA(1)!=Token.EOF && !followSet.member(input.LA(1)) ) { + reportError(new NoViableAltException("",0,0,input)); + beginResync(); + consumeUntil(input, followSet); + endResync(); + } +} : + ; + +// The specification of an EBNF rule in ANTLR style, with all the +// rule level parameters, declarations, actions, rewrite specs and so +// on. +// +// Note that here we allow any number of rule declaration sections (such +// as scope, returns, etc) in any order and we let the upcoming semantic +// verification of the AST determine if things are repeated or if a +// particular functional element is not valid in the context of the +// grammar type, such as using returns in lexer rules and so on. +rule +@init { paraphrases.push("matching a rule"); } +@after { paraphrases.pop(); } + : // A rule may start with an optional documentation comment + DOC_COMMENT? + + // Following the documentation, we can declare a rule to be + // public, private and so on. This is only valid for some + // language targets of course but the target will ignore these + // modifiers if they make no sense in that language. + ruleModifiers? + + // Next comes the rule name. 
Here we do not distinguish between + // parser or lexer rules, the semantic verification phase will + // reject any rules that make no sense, such as lexer rules in + // a pure parser or tree parser. + id + + // Immediately following the rulename, there may be a specification + // of input parameters for the rule. We do not do anything with the + // parameters here except gather them for future phases such as + // semantic verifcation, type assignment etc. We require that + // the input parameters are the next syntactically significant element + // following the rule id. + ARG_ACTION? + + ruleReturns? + + // Now, before the rule specification itself, which is introduced + // with a COLON, we may have zero or more configuration sections. + // As usual we just accept anything that is syntactically valid for + // one form of the rule or another and let the semantic verification + // phase throw out anything that is invalid. +// At the rule level, a programmer may specify a number of sections, such +// as scope declarations, rule return elements, @ sections (which may be +// language target specific) and so on. We allow any number of these in any +// order here and as usual rely onthe semantic verification phase to reject +// anything invalid using its addinotal context information. Here we are +// context free and just accept anything that is a syntactically correct +// construct. +// + rulePrequels + + COLON + + // The rule is, at the top level, just a list of alts, with + // finer grained structure defined within the alts. + ruleBlock + + SEMI + + exceptionGroup + + -> ^( RULE id DOC_COMMENT? ruleModifiers? ARG_ACTION? + ruleReturns? rulePrequels? ruleBlock exceptionGroup* + ) + ; + +// Many language targets support exceptions and the rule will +// generally be able to throw the language target equivalent +// of a recognition exception. 
The grammar programmer can
+// specify a list of exceptions to catch or a generic catch all
+// and the target language code generation template is
+// responsible for generating code that makes sense.
+exceptionGroup
+ : exceptionHandler* finallyClause?
+ ;
+
+// Specifies a handler for a particular type of exception
+// thrown by a rule
+exceptionHandler
+ : CATCH ARG_ACTION ACTION -> ^(CATCH ARG_ACTION ACTION)
+ ;
+
+// Specifies a block of code to run after the rule and any
+// exception blocks have executed.
+finallyClause
+ : FINALLY ACTION -> ^(FINALLY ACTION)
+ ;
+
+rulePrequels
+@init { paraphrases.push("matching rule preamble"); }
+@after { paraphrases.pop(); }
+ : sync (rulePrequel sync)* -> rulePrequel*
+ ;
+
+ // An individual rule level configuration as referenced by the ruleActions
+// rule above.
+//
+rulePrequel
+ : throwsSpec
+ | ruleScopeSpec
+ | optionsSpec
+ | ruleAction
+ ;
+
+// A rule can return elements that it constructs as it executes.
+// The return values are specified in a 'returns' prequel element,
+// which contains COMMA separated declarations, where the declaration
+// is target language specific. Here we see the returns declaration
+// as a single lexical action element, to be processed later.
+//
+ruleReturns
+ : RETURNS^ ARG_ACTION
+ ;
+
+// --------------
+// Exception spec
+//
+// Some target languages, such as Java and C# support exceptions
+// and they are specified as a prequel element for each rule that
+// wishes to throw its own exception type. Note that the name of the
+// exception is just a single word, so the header section of the grammar
+// must specify the correct import statements (or language equivalent).
+// Target languages that do not support exceptions just safely ignore
+// them.
+//
+throwsSpec
+ : THROWS qid (COMMA qid)* -> ^(THROWS qid+)
+ ;
+
+// As well as supporting globally specified scopes, ANTLR supports rule
+// level scopes, which are tracked in a rule specific stack.
Rule specific +// scopes are specified at this level, and globally specified scopes +// are merely referenced here. +ruleScopeSpec + : SCOPE ACTION -> ^(SCOPE ACTION) + | SCOPE id (COMMA id)* SEMI -> ^(SCOPE id+) + ; + +// @ Sections are generally target language specific things +// such as local variable declarations, code to run before the +// rule starts and so on. Fir instance most targets support the +// @init {} section where declarations and code can be placed +// to run before the rule is entered. The C target also has +// an @declarations {} section, where local variables are declared +// in order that the generated code is C89 copmliant. +// +/** Match stuff like @init {int i;} */ +ruleAction + : AT id ACTION -> ^(AT id ACTION) + ; + +// A set of access modifiers that may be applied to rule declarations +// and which may or may not mean something to the target language. +// Note that the parser allows any number of these in any order and the +// semantic pass will throw out invalid combinations. +// +ruleModifiers + : ruleModifier+ -> ^(RULEMODIFIERS ruleModifier+) + ; + +// An individual access modifier for a rule. The 'fragment' modifier +// is an internal indication for lexer rules that they do not match +// from the input but are like subroutines for other lexer rules to +// reuse for certain lexical patterns. The other modifiers are passed +// to the code generation templates and may be ignored by the template +// if they are of no use in that language. +ruleModifier + : PUBLIC + | PRIVATE + | PROTECTED + | FRAGMENT + ; + +altList + : alternative (OR alternative)* -> alternative+ + ; + +// A set of alts, rewritten as a BLOCK for generic processing +// in tree walkers. Used by the rule 'rule' so that the list of +// alts for a rule appears as a BLOCK containing the alts and +// can be processed by the generic BLOCK rule. 
Note that we +// use a separate rule so that the BLOCK node has start and stop +// boundaries set correctly by rule post processing of rewrites. +ruleBlock +@init {Token colon = input.LT(-1);} + : altList -> ^(BLOCK[colon,"BLOCK"] altList) + ; + catch [ResyncToEndOfRuleBlock e] { + // just resyncing; ignore error + retval.tree = (GrammarAST)adaptor.errorNode(input, retval.start, input.LT(-1), null); + } + +// An individual alt with an optional rewrite clause for the +// elements of the alt. +alternative +@init { paraphrases.push("matching alternative"); } +@after { paraphrases.pop(); } + : elements + ( rewrite -> ^(ALT_REWRITE elements rewrite) + | -> elements + ) + | rewrite -> ^(ALT_REWRITE ^(ALT EPSILON) rewrite) // empty alt with rewrite + | -> ^(ALT EPSILON) // empty alt + ; + +elements + : e+=element+ -> ^(ALT $e+) + ; + +element +@init { + paraphrases.push("looking for rule element"); + int m = input.mark(); +} +@after { paraphrases.pop(); } + : labeledElement + ( ebnfSuffix -> ^( ebnfSuffix ^(BLOCK[$labeledElement.start,"BLOCK"] ^(ALT labeledElement ) )) + | -> labeledElement + ) + | atom + ( ebnfSuffix -> ^( ebnfSuffix ^(BLOCK[$atom.start,"BLOCK"] ^(ALT atom) ) ) + | -> atom + ) + | ebnf + | ACTION + | FORCED_ACTION + | SEMPRED + ( IMPLIES -> GATED_SEMPRED[$SEMPRED] + | -> SEMPRED + ) + | treeSpec + ( ebnfSuffix -> ^( ebnfSuffix ^(BLOCK[$treeSpec.start,"BLOCK"] ^(ALT treeSpec ) ) ) + | -> treeSpec + ) + ; + catch [RecognitionException re] { + retval.tree = (GrammarAST)adaptor.errorNode(input, retval.start, input.LT(-1), re); + int ttype = input.get(input.range()).getType(); + // look for anything that really belongs at the start of the rule minus the initial ID + if ( ttype==COLON || ttype==RETURNS || ttype==CATCH || ttype==FINALLY || ttype==AT ) { + RecognitionException missingSemi = + new v4ParserException("unterminated rule (missing ';') detected at '"+ + input.LT(1).getText()+" "+input.LT(2).getText()+"'", input); + reportError(missingSemi); + if ( 
ttype==CATCH || ttype==FINALLY ) { + input.seek(input.range()); // ignore what's before rule trailer stuff + } + if ( ttype==RETURNS || ttype==AT ) { // scan back looking for ID of rule header + int p = input.index(); + Token t = input.get(p); + while ( t.getType()!=RULE_REF && t.getType()!=TOKEN_REF ) { + p--; + t = input.get(p); + } + input.seek(p); + } + throw new ResyncToEndOfRuleBlock(); // make sure it goes back to rule block level to recover + } + reportError(re); + recover(input,re); +/* + input.rewind(m); + final List subset = input.get(input.index(), input.range()); + System.out.println("failed to match as element: '"+subset); + CommonTokenStream ns = new CommonTokenStream( + new TokenSource() { + int i = 0; + public Token nextToken() { + if ( i>=subset.size() ) return Token.EOF_TOKEN; + return (Token)subset.get(i++); + } + public String getSourceName() { return null; } + }); + ANTLRParser errorParser = new ANTLRParser(ns); + errorParser.setTreeAdaptor(this.adaptor); + errorParser.element_errors(re); + retval.tree = (GrammarAST)adaptor.errorNode(input, retval.start, input.LT(-1), re); + */ + } + +/* +element_errors[RecognitionException origError] +options {backtrack=true;} +@init { +int m = input.mark(); +//state.backtracking++; +} +@after { +//state.backtracking--; +} + : ( DOC_COMMENT? ruleModifiers? id ARG_ACTION? ruleReturns? rulePrequel* COLON + | exceptionGroup + ) + {reportError(missingSemi); recover(input,null);} + ; + catch [RecognitionException ignore] { + input.rewind(m); + input.consume(); // kill at least one token + reportError(origError); + BitSet followSet = computeErrorRecoverySet(); + beginResync(); + consumeUntil(input, followSet); + endResync(); + } +*/ + +labeledElement : id (ASSIGN^|PLUS_ASSIGN^) (atom|block) ; + +// Tree specifying alt +// Tree grammars need to have alts that describe a tree structure they +// will walk of course. 
Alts for trees therefore start with ^( XXX, which +// says we will see a root node of XXX then DOWN etc +treeSpec + : TREE_BEGIN + // Only a subset of elements are allowed to be a root node. However + // we allow any element to appear here and reject silly ones later + // when we walk the AST. + element + // After the tree root we get the usual suspects, + // all members of the element set + element+ + RPAREN + -> ^(TREE_BEGIN element+) + ; + +// A block of gramamr structure optionally followed by standard EBNF +// notation, or ANTLR specific notation. I.E. ? + ^ and so on +ebnf + : block + // And now we see if we have any of the optional suffixs and rewrite + // the AST for this rule accordingly + // + ( blockSuffixe -> ^(blockSuffixe block) + | -> block + ) + ; + +// The standard EBNF suffixes with additional components that make +// sense only to ANTLR, in the context of a grammar block. +blockSuffixe + : ebnfSuffix // Standard EBNF + + // ANTLR Specific Suffixes + | ROOT + | IMPLIES // We will change this to syn/sem pred in the next phase + | BANG + ; + +ebnfSuffix +@init { + Token op = input.LT(1); +} + : QUESTION -> OPTIONAL[op] + | STAR -> CLOSURE[op] + | PLUS -> POSITIVE_CLOSURE[op] + ; + +atom: // Qualified reference delegate.rule. This must be + // lexically contiguous (no spaces either side of the DOT) + // otherwise it is two references with a wildcard in between + // and not a qualified reference. + { + input.LT(1).getCharPositionInLine()+input.LT(1).getText().length()== + input.LT(2).getCharPositionInLine() && + input.LT(2).getCharPositionInLine()+1==input.LT(3).getCharPositionInLine() + }? + id DOT ruleref -> ^(DOT id ruleref) + | range (ROOT^ | BANG^)? // Range x..y - only valid in lexers + | terminal (ROOT^ | BANG^)? + | ruleref + | notSet (ROOT^|BANG^)? + | // Wildcard '.' 
means any character in a lexer, any + // token in parser and any token or node in a tree parser + // Because the terminal rule is allowed to be the node + // specification for the start of a tree rule, we must + // later check that wildcard was not used for that. + DOT elementOptions? -> ^(WILDCARD[$DOT] elementOptions?) + ; + catch [RecognitionException re] { throw re; } // pass upwards to element + +// -------------------- +// Inverted element set +// +// A set of characters (in a lexer) or terminal tokens, if a parser +// that are then used to create the inverse set of them. +// +notSet + : NOT terminal -> ^(NOT terminal) + | NOT blockSet -> ^(NOT blockSet) + ; + +blockSet + : LPAREN + setElement (OR setElement)* + RPAREN + -> ^(BLOCK[$LPAREN,"BLOCK"] setElement+ ) + ; + +setElement + : range + | terminal + ; + +// ------------- +// Grammar Block +// +// Anywhere where an element is valid, the grammar may start a new block +// of alts by surrounding that block with ( ). A new block may also have a set +// of options, which apply only to that block. +// +block + : LPAREN + // A new blocked altlist may have a set of options set sepcifically + // for it. + ( optionsSpec? ra+=ruleAction* COLON )? + altList + RPAREN + -> ^(BLOCK[$LPAREN,"BLOCK"] optionsSpec? $ra* altList ) + ; + +// ---------------- +// Parser rule ref +// +// Reference to a parser rule with optional arguments and optional +// directive to become the root node or ignore the tree produced +// +ruleref + : RULE_REF ARG_ACTION? + ( (op=ROOT|op=BANG) -> ^($op ^(RULE_REF ARG_ACTION?)) + | -> ^(RULE_REF ARG_ACTION?) + ) + ; + catch [RecognitionException re] { throw re; } // pass upwards to element + +// --------------- +// Character Range +// +// Specifies a range of characters. Valid for lexer rules only, but +// we do not check that here, the tree walkers shoudl do that. 
+// Note also that the parser also allows through more than just +// character literals so that we can produce a much nicer semantic +// error about any abuse of the .. operator. +// +range + : STRING_LITERAL RANGE^ STRING_LITERAL + ; + +terminal + : // Args are only valid for lexer rules + TOKEN_REF ARG_ACTION? elementOptions? -> ^(TOKEN_REF ARG_ACTION? elementOptions?) + | STRING_LITERAL elementOptions? -> ^(STRING_LITERAL elementOptions?) + ; + +// Terminals may be adorned with certain options when +// reference in the grammar: TOK<,,,> +elementOptions + : LT elementOption (COMMA elementOption)* GT -> ^(ELEMENT_OPTIONS elementOption+) + ; + +// WHen used with elements we can specify what the tree node type can +// be and also assign settings of various options (which we do not check here) +elementOption + : // This format indicates the default node option + qid + + | // This format indicates option assignment + id ASSIGN^ (qid | STRING_LITERAL) + ; + +rewrite + : predicatedRewrite* nakedRewrite -> predicatedRewrite* nakedRewrite + ; + +predicatedRewrite + : RARROW SEMPRED rewriteAlt + -> {$rewriteAlt.isTemplate}? ^(ST_RESULT[$RARROW] SEMPRED rewriteAlt) + -> ^(RESULT[$RARROW] SEMPRED rewriteAlt) + ; + +nakedRewrite + : RARROW rewriteAlt -> {$rewriteAlt.isTemplate}? ^(ST_RESULT[$RARROW] rewriteAlt) + -> ^(RESULT[$RARROW] rewriteAlt) + ; + +// distinguish between ST and tree rewrites; for ETC/EPSILON and trees, +// rule altAndRewrite makes REWRITE root. for ST, we use ST_REWRITE +rewriteAlt returns [boolean isTemplate] +options {backtrack=true;} + : // try to parse a template rewrite + rewriteTemplate {$isTemplate=true;} + + | // If we are not building templates, then we must be + // building ASTs or have rewrites in a grammar that does not + // have output=AST; options. 
If that is the case, we will issue + // errors/warnings in the next phase, so we just eat them here + rewriteTreeAlt + + | ETC + + | /* empty rewrite */ -> EPSILON + ; + +rewriteTreeAlt + : rewriteTreeElement+ -> ^(ALT rewriteTreeElement+) + ; + +rewriteTreeElement + : rewriteTreeAtom + | rewriteTreeAtom ebnfSuffix -> ^( ebnfSuffix ^(REWRITE_BLOCK ^(ALT rewriteTreeAtom)) ) + | rewriteTree + ( ebnfSuffix + -> ^(ebnfSuffix ^(REWRITE_BLOCK ^(ALT rewriteTree)) ) + | -> rewriteTree + ) + | rewriteTreeEbnf + ; + +rewriteTreeAtom + : TOKEN_REF elementOptions? ARG_ACTION? -> ^(TOKEN_REF elementOptions? ARG_ACTION?) // for imaginary nodes + | RULE_REF + | STRING_LITERAL elementOptions? -> ^(STRING_LITERAL elementOptions?) + | DOLLAR id -> LABEL[$DOLLAR,$id.text] // reference to a label in a rewrite rule + | ACTION + ; + +rewriteTreeEbnf +@init { + Token firstToken = input.LT(1); +} +@after { + $rewriteTreeEbnf.tree.getToken().setLine(firstToken.getLine()); + $rewriteTreeEbnf.tree.getToken().setCharPositionInLine(firstToken.getCharPositionInLine()); +} + : lp=LPAREN rewriteTreeAlt RPAREN ebnfSuffix -> ^(ebnfSuffix ^(REWRITE_BLOCK[$lp] rewriteTreeAlt)) + ; + +rewriteTree + : TREE_BEGIN rewriteTreeAtom rewriteTreeElement* RPAREN + -> ^(TREE_BEGIN rewriteTreeAtom rewriteTreeElement* ) + ; + +/** Build a tree for a template rewrite: + ^(TEMPLATE (ID|ACTION) ^(ARGLIST ^(ARG ID ACTION) ...) ) + ID can be "template" keyword. If first child is ACTION then it's + an indirect template ref + + -> foo(a={...}, b={...}) + -> ({string-e})(a={...}, b={...}) // e evaluates to template name + -> {%{$ID.text}} // create literal template from string (done in ActionTranslator) + -> {st-expr} // st-expr evaluates to ST + */ +rewriteTemplate + : // -> template(a={...},...) "..." inline template + TEMPLATE LPAREN rewriteTemplateArgs RPAREN + ( str=DOUBLE_QUOTE_STRING_LITERAL | str=DOUBLE_ANGLE_STRING_LITERAL ) + -> ^(TEMPLATE[$TEMPLATE,"TEMPLATE"] rewriteTemplateArgs? 
$str) + + | // -> foo(a={...}, ...) + rewriteTemplateRef + + | // -> ({expr})(a={...}, ...) + rewriteIndirectTemplateHead + + | // -> {...} + ACTION + ; + +/** -> foo(a={...}, ...) */ +rewriteTemplateRef + : id LPAREN rewriteTemplateArgs RPAREN + -> ^(TEMPLATE[$LPAREN,"TEMPLATE"] id rewriteTemplateArgs?) + ; + +/** -> ({expr})(a={...}, ...) */ +rewriteIndirectTemplateHead + : lp=LPAREN ACTION RPAREN LPAREN rewriteTemplateArgs RPAREN + -> ^(TEMPLATE[$lp,"TEMPLATE"] ACTION rewriteTemplateArgs?) + ; + +rewriteTemplateArgs + : rewriteTemplateArg (COMMA rewriteTemplateArg)* + -> ^(ARGLIST rewriteTemplateArg+) + | + ; + +rewriteTemplateArg + : id ASSIGN ACTION -> ^(ARG[$ASSIGN] id ACTION) + ; + +// The name of the grammar, and indeed some other grammar elements may +// come through to the parser looking like a rule reference or a token +// reference, hence this rule is used to pick up whichever it is and rewrite +// it as a generic ID token. +id +@init { paraphrases.push("looking for an identifier"); } +@after { paraphrases.pop(); } + : RULE_REF ->ID[$RULE_REF] + | TOKEN_REF ->ID[$TOKEN_REF] + | TEMPLATE ->ID[$TEMPLATE] // keyword + ; + +qid +@init { paraphrases.push("looking for a qualified identifier"); } +@after { paraphrases.pop(); } + : id (DOT id)* -> ID[$qid.start, $text] + ; + +alternativeEntry : alternative EOF ; // allow gunit to call alternative and see EOF afterwards +elementEntry : element EOF ; +ruleEntry : rule EOF ; +blockEntry : block EOF ; diff --git a/tool/src/org/antlr/v4/parse/ASTVerifier.g b/tool/src/org/antlr/v4/parse/ASTVerifier.g new file mode 100644 index 000000000..001c2cf47 --- /dev/null +++ b/tool/src/org/antlr/v4/parse/ASTVerifier.g @@ -0,0 +1,431 @@ +/* + [The "BSD license"] + Copyright (c) 2010 Terence Parr + All rights reserved. + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + 1. 
Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + 3. The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/** The definitive ANTLR v3 tree grammar to parse ANTLR v4 grammars. + * Parses trees created in ANTLRParser.g. + */ +tree grammar ASTVerifier; +options { + language = Java; + tokenVocab = ANTLRParser; + ASTLabelType = GrammarAST; +} + +// Include the copyright in this source and also the generated source +@header { +/* + [The "BSD license"] + Copyright (c) 2005-2009 Terence Parr + All rights reserved. + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + 3. The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +package org.antlr.v4.parse; +import org.antlr.v4.tool.*; +import org.antlr.v4.runtime.tree.CommonTree; // use updated v4 one not v3 +} + +@members { +public String getErrorMessage(RecognitionException e, + String[] tokenNames) +{ + List stack = getRuleInvocationStack(e, this.getClass().getName()); + String msg = null; + String inputContext = + input.LT(-3) == null ? "" : ((Tree)input.LT(-3)).getText()+" "+ + input.LT(-2) == null ? "" : ((Tree)input.LT(-2)).getText()+" "+ + input.LT(-1) == null ? "" : ((Tree)input.LT(-1)).getText()+" >>>"+ + input.LT(1) == null ? "" : ((Tree)input.LT(1)).getText()+"<<< "+ + input.LT(2) == null ? "" : ((Tree)input.LT(2)).getText()+" "+ + input.LT(3) == null ? 
"" : ((Tree)input.LT(3)).getText(); + if ( e instanceof NoViableAltException ) { + NoViableAltException nvae = (NoViableAltException)e; + msg = " no viable alt; token="+e.token+ + " (decision="+nvae.decisionNumber+ + " state "+nvae.stateNumber+")"+ + " decision=<<"+nvae.grammarDecisionDescription+">>"; + } + else { + msg = super.getErrorMessage(e, tokenNames); + } + return stack+" "+msg+"\ncontext=..."+inputContext+"..."; +} +public String getTokenErrorDisplay(Token t) { + return t.toString(); +} +public void traceIn(String ruleName, int ruleIndex) { + System.out.print("enter "+ruleName+" "+ + ((GrammarAST)input.LT(1)).token+" "+ + ((GrammarAST)input.LT(2)).token+" "+ + ((GrammarAST)input.LT(3)).token+" "+ + ((GrammarAST)input.LT(4)).token); + if ( state.backtracking>0 ) { + System.out.print(" backtracking="+state.backtracking); + } + System.out.println(); +} + protected void mismatch(IntStream input, int ttype, BitSet follow) + throws RecognitionException { + throw new MismatchedTokenException(ttype, input); + } + public void recoverFromMismatchedToken(IntStream input, + RecognitionException e, BitSet follow) + throws RecognitionException + + { + throw e; + } +} + +// Alter code generation so catch-clauses get replace with // this action. +@rulecatch { catch (RecognitionException e) { +throw e; +} +} + +grammarSpec + : ^(GRAMMAR ID DOC_COMMENT? prequelConstruct* rules mode*) + ; + +prequelConstruct + : optionsSpec + | delegateGrammars + | tokensSpec + | attrScope + | action + ; + +optionsSpec + : ^(OPTIONS option*) + ; + +option + : ^(ASSIGN ID optionValue) + ; + +optionValue returns [String v] +@init {$v = $start.token.getText();} + : ID + | STRING_LITERAL + | INT + | STAR + ; + +delegateGrammars + : ^(IMPORT delegateGrammar+) + ; + +delegateGrammar + : ^(ASSIGN ID ID) + | ID + ; + +tokensSpec + : ^(TOKENS tokenSpec+) + ; + +tokenSpec + : ^(ASSIGN ID STRING_LITERAL) + | ID + ; + +attrScope + : ^(SCOPE ID ACTION) + ; + +action + : ^(AT ID? 
ID ACTION) + ; + +rules + : ^(RULES rule*) + ; + +mode: ^( MODE ID rule+ ) ; + +rule: ^( RULE ID DOC_COMMENT? ruleModifiers? ARG_ACTION? + ruleReturns? rulePrequel* altListAsBlock exceptionGroup + ) + ; + +exceptionGroup + : exceptionHandler* finallyClause? + ; + +exceptionHandler + : ^(CATCH ARG_ACTION ACTION) + ; + +finallyClause + : ^(FINALLY ACTION) + ; + +rulePrequel + : throwsSpec + | ruleScopeSpec + | optionsSpec + | ruleAction + ; + +ruleReturns + : ^(RETURNS ARG_ACTION) + ; +throwsSpec + : ^(THROWS ID+) + ; + +ruleScopeSpec + : ^(SCOPE ACTION) + | ^(SCOPE ID+) + ; + +ruleAction + : ^(AT ID ACTION) + ; + +ruleModifiers + : ^(RULEMODIFIERS ruleModifier+) + ; + +ruleModifier + : PUBLIC + | PRIVATE + | PROTECTED + | FRAGMENT + ; + +altList + : alternative+ + ; + +altListAsBlock + : ^(BLOCK altList) + ; + +alternative + : ^(ALT_REWRITE alternative rewrite) + | ^(ALT EPSILON) + | elements + ; + +elements + : ^(ALT element+) + ; + +element + : labeledElement + | atom + | ebnf + | ACTION + | FORCED_ACTION + | SEMPRED + | GATED_SEMPRED + | treeSpec + ; + +labeledElement + : ^(ASSIGN ID atom) + | ^(ASSIGN ID block) + | ^(PLUS_ASSIGN ID atom) + | ^(PLUS_ASSIGN ID block) + ; + +treeSpec + : ^(TREE_BEGIN element+) + ; + +ebnf: ^(blockSuffix block) + | block + ; + +blockSuffix + : ebnfSuffix + | ROOT + | IMPLIES + | BANG + ; + +ebnfSuffix + : OPTIONAL + | CLOSURE + | POSITIVE_CLOSURE + ; + +atom: ^(ROOT range) + | ^(BANG range) + | ^(ROOT notSet) + | ^(BANG notSet) + | notSet + | ^(ROOT terminal) + | ^(BANG terminal) + | range + | ^(DOT ID terminal) + | ^(DOT ID ruleref) + | ^(WILDCARD elementOptions) + | WILDCARD + | terminal + | ruleref + ; + +notSet + : ^(NOT setElement) + | ^(NOT blockSet) + ; + +blockSet + : ^(BLOCK setElement+) + ; + +setElement + : STRING_LITERAL + | TOKEN_REF + | ^(RANGE STRING_LITERAL STRING_LITERAL) + ; + +block + : ^(BLOCK optionsSpec? ruleAction* ACTION? 
altList) + ; + +ruleref + : ^(ROOT ^(RULE_REF ARG_ACTION?)) + | ^(BANG ^(RULE_REF ARG_ACTION?)) + | ^(RULE_REF ARG_ACTION?) + ; + +range + : ^(RANGE STRING_LITERAL STRING_LITERAL) + ; + +terminal + : ^(STRING_LITERAL elementOptions) + | STRING_LITERAL + | ^(TOKEN_REF ARG_ACTION elementOptions) + | ^(TOKEN_REF ARG_ACTION) + | ^(TOKEN_REF elementOptions) + | TOKEN_REF + ; + +elementOptions + : ^(ELEMENT_OPTIONS elementOption+) + ; + +elementOption + : ID + | ^(ASSIGN ID ID) + | ^(ASSIGN ID STRING_LITERAL) + ; + +rewrite + : predicatedRewrite* nakedRewrite + ; + +predicatedRewrite + : ^(ST_RESULT SEMPRED rewriteAlt) + | ^(RESULT SEMPRED rewriteAlt) + ; + +nakedRewrite + : ^(ST_RESULT rewriteAlt) + | ^(RESULT rewriteAlt) + ; + +rewriteAlt + : rewriteTemplate + | rewriteTreeAlt + | ETC + | EPSILON + ; + +rewriteTreeAlt + : ^(ALT rewriteTreeElement+) + ; + +rewriteTreeElement + : rewriteTreeAtom + | rewriteTree + | rewriteTreeEbnf + ; + +rewriteTreeAtom + : ^(TOKEN_REF elementOptions ARG_ACTION) + | ^(TOKEN_REF elementOptions) + | ^(TOKEN_REF ARG_ACTION) + | TOKEN_REF + | RULE_REF + | ^(STRING_LITERAL elementOptions) + | STRING_LITERAL + | LABEL + | ACTION + ; + +rewriteTreeEbnf + : ^(ebnfSuffix ^(REWRITE_BLOCK rewriteTreeAlt)) + ; +rewriteTree + : ^(TREE_BEGIN rewriteTreeAtom rewriteTreeElement* ) + ; + +rewriteTemplate + : ^(TEMPLATE rewriteTemplateArgs? DOUBLE_QUOTE_STRING_LITERAL) + | ^(TEMPLATE rewriteTemplateArgs? DOUBLE_ANGLE_STRING_LITERAL) + | rewriteTemplateRef + | rewriteIndirectTemplateHead + | ACTION + ; + +rewriteTemplateRef + : ^(TEMPLATE ID rewriteTemplateArgs?) + ; + +rewriteIndirectTemplateHead + : ^(TEMPLATE ACTION rewriteTemplateArgs?) 
+ ; + +rewriteTemplateArgs + : ^(ARGLIST rewriteTemplateArg+) + ; + +rewriteTemplateArg + : ^(ARG ID ACTION) + ; diff --git a/tool/src/org/antlr/v4/parse/ATNBuilder.g b/tool/src/org/antlr/v4/parse/ATNBuilder.g new file mode 100644 index 000000000..b5d7b2255 --- /dev/null +++ b/tool/src/org/antlr/v4/parse/ATNBuilder.g @@ -0,0 +1,176 @@ +/* + [The "BSD license"] + Copyright (c) 2010 Terence Parr + All rights reserved. + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + 3. The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +tree grammar ATNBuilder; +options { + language = Java; + tokenVocab = ANTLRParser; + ASTLabelType = GrammarAST; +// filter = true; +} + +// Include the copyright in this source and also the generated source +@header { +/* + [The "BSD license"] + Copyright (c) 2010 Terence Parr + All rights reserved. + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + 3. The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +package org.antlr.v4.parse; +import org.antlr.v4.tool.*; +import org.antlr.v4.automata.ATNFactory; +import org.antlr.v4.runtime.tree.CommonTree; // use updated v4 one not v3 +} + +@members { + ATNFactory factory; + public ATNBuilder(TreeNodeStream input, ATNFactory factory) { + this(input); + this.factory = factory; + } +} + +block[GrammarAST ebnfRoot] returns [ATNFactory.Handle p] +@init {List alts = new ArrayList();} + : ^(BLOCK (^(OPTIONS .+))? (a=alternative {alts.add($a.p);})+) + {$p = factory.block((BlockAST)$BLOCK, ebnfRoot, alts);} + ; + +alternative returns [ATNFactory.Handle p] +@init {List els = new ArrayList();} + : ^(ALT_REWRITE a=alternative .) {$p = $a.p;} + | ^(ALT EPSILON) {$p = factory.epsilon($EPSILON);} + | ^(ALT (e=element {els.add($e.p);})+) + {$p = factory.alt(els);} + ; + +element returns [ATNFactory.Handle p] + : labeledElement {$p = $labeledElement.p;} + | atom {$p = $atom.p;} + | ebnf {$p = $ebnf.p;} + | ACTION {$p = factory.action((ActionAST)$ACTION);} + | FORCED_ACTION {$p = factory.action((ActionAST)$FORCED_ACTION);} + | SEMPRED {$p = factory.sempred((PredAST)$SEMPRED);} + | GATED_SEMPRED {$p = factory.gated_sempred($GATED_SEMPRED);} + | treeSpec {$p = $treeSpec.p;} + ; + +labeledElement returns [ATNFactory.Handle p] + : ^(ASSIGN ID atom) {$p = factory.label($atom.p);} + | ^(ASSIGN ID block[null]) {$p = factory.label($block.p);} + | ^(PLUS_ASSIGN ID atom) {$p = factory.listLabel($atom.p);} + | ^(PLUS_ASSIGN ID block[null]) {$p = factory.listLabel($block.p);} + ; + +treeSpec returns [ATNFactory.Handle p] +@init {List els = new ArrayList();} + : ^(TREE_BEGIN (e=element {els.add($e.p);})+) {$p = factory.tree(els);} + ; + +ebnf returns [ATNFactory.Handle p] + : ^(astBlockSuffix block[null]) {$p = $block.p;} + | ^(OPTIONAL block[$start]) {$p = $block.p;} + | ^(CLOSURE block[$start]) {$p = $block.p;} + | ^(POSITIVE_CLOSURE block[$start]) {$p = $block.p;} + | block[null] {$p = $block.p;} + ; + +astBlockSuffix + : ROOT + | IMPLIES + | 
BANG + ; + +atom returns [ATNFactory.Handle p] + : ^(ROOT range) {$p = $range.p;} + | ^(BANG range) {$p = $range.p;} + | ^(ROOT notSet) {$p = $notSet.p;} + | ^(BANG notSet) {$p = $notSet.p;} + | notSet {$p = $notSet.p;} + | range {$p = $range.p;} + | ^(DOT ID terminal) {$p = $terminal.p;} + | ^(DOT ID ruleref) {$p = $ruleref.p;} + | ^(WILDCARD .) {$p = factory.wildcard($start);} + | WILDCARD {$p = factory.wildcard($start);} + | terminal {$p = $terminal.p;} + | ruleref {$p = $ruleref.p;} + ; + +notSet returns [ATNFactory.Handle p] + : ^(NOT setElement) {$p = factory.not($NOT);} + | ^(NOT blockSet) {$p = factory.notBlock($NOT, $blockSet.alts);} + ; + +blockSet returns [List alts] +@init {$alts = new ArrayList();} + : ^(BLOCK (t=setElement {$alts.add($t.start);})+) + ; + +setElement + : STRING_LITERAL + | TOKEN_REF + | ^(RANGE STRING_LITERAL STRING_LITERAL) + ; + +ruleref returns [ATNFactory.Handle p] + : ^(ROOT ^(RULE_REF ARG_ACTION?)) {$p = factory.ruleRef($RULE_REF);} + | ^(BANG ^(RULE_REF ARG_ACTION?)) {$p = factory.ruleRef($RULE_REF);} + | ^(RULE_REF ARG_ACTION?) {$p = factory.ruleRef($RULE_REF);} + ; + +range returns [ATNFactory.Handle p] + : ^(RANGE a=STRING_LITERAL b=STRING_LITERAL) {$p = factory.range($a,$b);} + ; + +terminal returns [ATNFactory.Handle p] + : ^(STRING_LITERAL .) {$p = factory.stringLiteral((TerminalAST)$start);} + | STRING_LITERAL {$p = factory.stringLiteral((TerminalAST)$start);} + | ^(TOKEN_REF ARG_ACTION .) {$p = factory.tokenRef((TerminalAST)$start);} + | ^(TOKEN_REF .) 
{$p = factory.tokenRef((TerminalAST)$start);} + | TOKEN_REF {$p = factory.tokenRef((TerminalAST)$start);} + | ^(ROOT t=terminal) {$p = $t.p;} + | ^(BANG t=terminal) {$p = $t.p;} + ; diff --git a/tool/src/org/antlr/v4/parse/ActionSplitter.g b/tool/src/org/antlr/v4/parse/ActionSplitter.g new file mode 100644 index 000000000..068b6a969 --- /dev/null +++ b/tool/src/org/antlr/v4/parse/ActionSplitter.g @@ -0,0 +1,172 @@ +lexer grammar ActionSplitter; + +options { filter=true; } + +@header { +package org.antlr.v4.parse; +import org.antlr.v4.tool.*; +} + +@members { +ActionSplitterListener delegate; + +public ActionSplitter(CharStream input, ActionSplitterListener delegate) { + this(input, new RecognizerSharedState()); + this.delegate = delegate; +} + +public void emit(Token token) { + super.emit(token); + +} + +/** force filtering (and return tokens). triggers all above actions. */ +public List getActionTokens() { + List chunks = new ArrayList(); + Token t = nextToken(); + while ( t.getType()!=Token.EOF ) { + chunks.add(t); + t = nextToken(); + } + return chunks; +} +} + +// ignore comments right away + +COMMENT + : '/*' ( options {greedy=false;} : . )* '*/' {delegate.text($text);} + ; + +LINE_COMMENT + : '//' ~('\n'|'\r')* '\r'? '\n' {delegate.text($text);} + ; + +ESC + : '\\$' {delegate.text("$");} + | '\\%' {delegate.text("\%");} + ; + +SET_QUALIFIED_ATTR + : '$' x=ID '.' y=ID WS? '=' expr=ATTR_VALUE_EXPR ';' + {delegate.setQualifiedAttr($text, $x, $y, $expr);} + ; + +QUALIFIED_ATTR + : '$' x=ID '.' y=ID {input.LA(1)!='('}? {delegate.qualifiedAttr($text, $x, $y);} + ; + +SET_DYNAMIC_SCOPE_ATTR + : '$' x=ID '::' y=ID WS? 
'=' expr=ATTR_VALUE_EXPR ';' + {delegate.setDynamicScopeAttr($text, $x, $y, $expr);} + ; + +DYNAMIC_SCOPE_ATTR + : '$' x=ID '::' y=ID {delegate.dynamicScopeAttr($text, $x, $y);} + ; + +/** To access deeper (than top of stack) scopes, use the notation: + * + * $x[-1]::y previous (just under top of stack) + * $x[-i]::y top of stack - i where the '-' MUST BE PRESENT; + * i.e., i cannot simply be negative without the '-' sign! + * $x[i]::y absolute index i (0..size-1) + * $x[0]::y is the absolute 0 indexed element (bottom of the stack) + */ +SET_DYNAMIC_NEGATIVE_INDEXED_SCOPE_ATTR + : '$' x=ID '[' '-' index=SCOPE_INDEX_EXPR ']' '::' y=ID + WS? '=' expr=ATTR_VALUE_EXPR ';' + {delegate.setDynamicNegativeIndexedScopeAttr($text, $x, $y, $index, $expr);} + ; + +DYNAMIC_NEGATIVE_INDEXED_SCOPE_ATTR + : '$' x=ID '[' '-' index=SCOPE_INDEX_EXPR ']' '::' y=ID + {delegate.dynamicNegativeIndexedScopeAttr($text, $x, $y, $index);} + ; + +SET_DYNAMIC_ABSOLUTE_INDEXED_SCOPE_ATTR + : '$' x=ID '[' index=SCOPE_INDEX_EXPR ']' '::' y=ID + WS? '=' expr=ATTR_VALUE_EXPR ';' + {delegate.setDynamicAbsoluteIndexedScopeAttr($text, $x, $y, $index, $expr);} + ; + +DYNAMIC_ABSOLUTE_INDEXED_SCOPE_ATTR + : '$' x=ID '[' index=SCOPE_INDEX_EXPR ']' '::' y=ID + {delegate.dynamicAbsoluteIndexedScopeAttr($text, $x, $y, $index);} + ; + +SET_ATTR + : '$' x=ID WS? '=' expr=ATTR_VALUE_EXPR ';' {delegate.setAttr($text, $x, $expr);} + ; + +ATTR + : '$' x=ID {delegate.attr($text, $x);} + ; + +/** %foo(a={},b={},...) ctor */ +TEMPLATE_INSTANCE + : '%' ID '(' ( WS? ARG (',' WS? ARG)* WS? )? ')' + ; + +/** %({name-expr})(a={},...) indirect template ctor reference */ +INDIRECT_TEMPLATE_INSTANCE + : '%' '(' ACTION ')' '(' ( WS? ARG (',' WS? ARG)* WS? )? ')' + ; + +/** %{expr}.y = z; template attribute y of StringTemplate-typed expr to z */ +SET_EXPR_ATTRIBUTE + : '%' a=ACTION '.' ID WS? 
'=' expr=ATTR_VALUE_EXPR ';' + ; + +/* %x.y = z; set template attribute y of x (always set never get attr) + * to z [languages like python without ';' must still use the + * ';' which the code generator is free to remove during code gen] + */ +SET_ATTRIBUTE + : '%' x=ID '.' y=ID WS? '=' expr=ATTR_VALUE_EXPR ';' + ; + +/** %{string-expr} anonymous template from string expr */ +TEMPLATE_EXPR + : '%' a=ACTION + ; + +UNKNOWN_SYNTAX +@after {delegate.unknownSyntax(emit());} + : '%' (ID|'.'|'('|')'|','|'{'|'}'|'"')* + ; + +// Anything else is just random text +TEXT +@after {delegate.text($text);} + : ~('$'|'%') // can't do (...)+ here since it gobbles \$, \% + ; + +fragment +ACTION + : '{' ('\\}'|~'}')* '}' + ; + +fragment +ARG : ID '=' ACTION + ; + +fragment +ID : ('a'..'z'|'A'..'Z'|'_') ('a'..'z'|'A'..'Z'|'0'..'9'|'_')* + ; + +/** Don't allow an = as first char to prevent $x == 3; kind of stuff. */ +fragment +ATTR_VALUE_EXPR + : ~'=' (~';')* + ; + +fragment +SCOPE_INDEX_EXPR + : ('\\]'|~']')+ + ; + +fragment +WS : (' '|'\t'|'\n'|'\r')+ + ; + diff --git a/tool/src/org/antlr/v4/parse/ActionSplitterListener.java b/tool/src/org/antlr/v4/parse/ActionSplitterListener.java new file mode 100644 index 000000000..82de1795f --- /dev/null +++ b/tool/src/org/antlr/v4/parse/ActionSplitterListener.java @@ -0,0 +1,27 @@ +package org.antlr.v4.parse; + +import org.antlr.runtime.Token; + +/** */ +public interface ActionSplitterListener { + void setQualifiedAttr(String expr, Token x, Token y, Token rhs); + void qualifiedAttr(String expr, Token x, Token y); + void setAttr(String expr, Token x, Token rhs); + void attr(String expr, Token x); + + void setDynamicScopeAttr(String expr, Token x, Token y, Token rhs); + void dynamicScopeAttr(String expr, Token x, Token y); + void setDynamicNegativeIndexedScopeAttr(String expr, Token x, Token y, Token index, Token rhs); + void dynamicNegativeIndexedScopeAttr(String expr, Token x, Token y, Token index); + void 
setDynamicAbsoluteIndexedScopeAttr(String expr, Token x, Token y, Token index, Token rhs); + void dynamicAbsoluteIndexedScopeAttr(String expr, Token x, Token y, Token index); + + void templateInstance(String expr); + void indirectTemplateInstance(String expr); + void setExprAttribute(String expr); // TODO: rename + void setSTAttribute(String expr); + void templateExpr(String expr); + + void unknownSyntax(Token t); + void text(String text); +} diff --git a/tool/src/org/antlr/v4/parse/GrammarASTAdaptor.java b/tool/src/org/antlr/v4/parse/GrammarASTAdaptor.java new file mode 100644 index 000000000..75b51d356 --- /dev/null +++ b/tool/src/org/antlr/v4/parse/GrammarASTAdaptor.java @@ -0,0 +1,46 @@ +package org.antlr.v4.parse; + +import org.antlr.runtime.*; +import org.antlr.runtime.tree.CommonTreeAdaptor; +import org.antlr.v4.tool.*; + +public class GrammarASTAdaptor extends CommonTreeAdaptor { + org.antlr.runtime.CharStream input; // where we can find chars ref'd by tokens in tree + public GrammarASTAdaptor() { ; } + public GrammarASTAdaptor(org.antlr.runtime.CharStream input) { this.input = input; } + + public Object create(Token token) { + return new GrammarAST(token); + } + + @Override + /** Make sure even imaginary nodes know the input stream */ + public Object create(int tokenType, String text) { + GrammarAST t = null; + if ( tokenType==ANTLRParser.RULE ) { + // needed by TreeWizard to make RULE tree + t = new GrammarASTWithOptions(new CommonToken(tokenType, text)); + } + else if ( tokenType==ANTLRParser.STRING_LITERAL ) { + // implicit lexer construction done with wizard; needs this node type + // whereas grammar ANTLRParser.g can use token option to spec node type + t = new TerminalAST(new CommonToken(tokenType, text)); + } + else { + t = (GrammarAST)super.create(tokenType, text); + } + ((CommonToken)t.token).setInputStream(input); + return t; + } + + public Object dupNode(Object t) { + if ( t==null ) return null; + return ((GrammarAST)t).dupNode(); 
//create(((GrammarAST)t).token); + } + + public Object errorNode(org.antlr.runtime.TokenStream input, org.antlr.runtime.Token start, org.antlr.runtime.Token stop, + org.antlr.runtime.RecognitionException e) + { + return new GrammarASTErrorNode(input, start, stop, e); + } +} diff --git a/tool/src/org/antlr/v4/parse/ResyncToEndOfRuleBlock.java b/tool/src/org/antlr/v4/parse/ResyncToEndOfRuleBlock.java new file mode 100644 index 000000000..10497ab7c --- /dev/null +++ b/tool/src/org/antlr/v4/parse/ResyncToEndOfRuleBlock.java @@ -0,0 +1,7 @@ +package org.antlr.v4.parse; + +/** Used to throw us out of deeply nested element back to end of a rule's + * alt list. Note it's not under RecognitionException. + */ +public class ResyncToEndOfRuleBlock extends RuntimeException { +} diff --git a/tool/src/org/antlr/v4/parse/ScopeParser.java b/tool/src/org/antlr/v4/parse/ScopeParser.java new file mode 100644 index 000000000..d9f46ef77 --- /dev/null +++ b/tool/src/org/antlr/v4/parse/ScopeParser.java @@ -0,0 +1,237 @@ +package org.antlr.v4.parse; + +import org.antlr.tool.ErrorManager; +import org.antlr.v4.tool.*; + +import java.util.*; + +/** Parse args, return values, and dynamic scopes. + * + * rule[arg1, arg2, ..., argN] returns [ret1, ..., retN] + * scope { decl1; decl2; ... declN; } + * + * The ',' and ';' are significant. Use \, and \; to use within + * types if necessary like [Map foo, int y]. + * + * arg, ret, and decl are target language dependent. Java/C#/C/C++ would + * use "int i" but ruby/python would use "i". + */ +public class ScopeParser { + /** Given an arg or retval scope definition list like + * + * Map, int[] j3, char *foo32[3] + * + * or + * + * int i=3, j=a[34]+20 + * + * convert to an attribute scope. 
+ */ + public static AttributeDict parseTypeList(String s) { return parse(s, ','); } + + public static AttributeDict parseDynamicScope(String s) { + // ignore outer {..} if present + s = s.trim(); + if ( s.startsWith("{") ) { + int lastCurly = s.lastIndexOf('}'); + s = s.substring(1, lastCurly); + } + return parse(s, ';'); + } + + public static AttributeDict parse(String s, char separator) { + int i = 0; + int n = s.length(); + AttributeDict dict = new AttributeDict(); + while ( i0 ) { + // everything after the '=' is the init value + attr.initValue = decl.substring(equalsIndex+1,decl.length()); + rightEdgeOfDeclarator = equalsIndex-1; + } + // walk backwards looking for start of an ID + for (int i=rightEdgeOfDeclarator; i>=0; i--) { + // if we haven't found the end yet, keep going + if ( !inID && Character.isLetterOrDigit(decl.charAt(i)) ) { + inID = true; + } + else if ( inID && + !(Character.isLetterOrDigit(decl.charAt(i))|| + decl.charAt(i)=='_') ) { + start = i+1; + break; + } + } + if ( start<0 && inID ) { + start = 0; + } + if ( start<0 ) { + ErrorManager.error(ErrorManager.MSG_CANNOT_FIND_ATTRIBUTE_NAME_IN_DECL,decl); + } + // walk forwards looking for end of an ID + int stop=-1; + for (int i=start; i<=rightEdgeOfDeclarator; i++) { + // if we haven't found the end yet, keep going + if ( !(Character.isLetterOrDigit(decl.charAt(i))|| + decl.charAt(i)=='_') ) + { + stop = i; + break; + } + if ( i==rightEdgeOfDeclarator ) { + stop = i+1; + } + } + + // the name is the last ID + attr.name = decl.substring(start,stop); + + // the type is the decl minus the ID (could be empty) + attr.type = decl.substring(0,start); + if ( stop<=rightEdgeOfDeclarator ) { + attr.type += decl.substring(stop,rightEdgeOfDeclarator+1); + } + attr.type = attr.type.trim(); + if ( attr.type.length()==0 ) { + attr.type = null; + } + + attr.decl = decl; + return attr; + } + + /** Given an argument list like + * + * x, (*a).foo(21,33), 3.2+1, '\n', + * "a,oo\nick", {bl, "fdkj"eck}, ["cat\n,", 
x, 43] + * + * convert to a list of attributes. Allow nested square brackets etc... + * Set separatorChar to ';' or ',' or whatever you want. + */ + public static List splitArgumentList(String s, int separatorChar) { + List args = new ArrayList(); + _splitArgumentList(s, 0, -1, separatorChar, args); + return args; + } + + + public static int _splitArgumentList(String actionText, + int start, + int targetChar, + int separatorChar, + List args) + { + if ( actionText==null ) { + return -1; + } + actionText = actionText.replaceAll("//.*\n", ""); + int n = actionText.length(); + //System.out.println("actionText@"+start+"->"+(char)targetChar+"="+actionText.substring(start,n)); + int p = start; + int last = p; + while ( p',p+1)>=p ) { + // do we see a matching '>' ahead? if so, hope it's a generic + // and not less followed by expr with greater than + p = _splitArgumentList(actionText,p+1,'>',separatorChar,args); + } + else { + p++; // treat as normal char + } + break; + case '[' : + p = _splitArgumentList(actionText,p+1,']',separatorChar,args); + break; + default : + if ( c==separatorChar && targetChar==-1 ) { + String arg = actionText.substring(last, p); + //System.out.println("arg="+arg); + args.add(arg.trim()); + last = p+1; + } + p++; + break; + } + } + if ( targetChar==-1 && p<=n ) { + String arg = actionText.substring(last, p).trim(); + //System.out.println("arg="+arg); + if ( arg.length()>0 ) { + args.add(arg.trim()); + } + } + p++; + return p; + } + +} diff --git a/tool/src/org/antlr/v4/parse/TokenVocabParser.java b/tool/src/org/antlr/v4/parse/TokenVocabParser.java new file mode 100644 index 000000000..3fea4f8cb --- /dev/null +++ b/tool/src/org/antlr/v4/parse/TokenVocabParser.java @@ -0,0 +1,142 @@ +package org.antlr.v4.parse; + +import org.antlr.codegen.CodeGenerator; +import org.antlr.misc.Utils; +import org.antlr.tool.ErrorManager; +import org.antlr.v4.Tool; + +import java.io.*; +import java.util.*; + +/** */ +public class TokenVocabParser { + Tool tool; + 
String vocabName; + + public TokenVocabParser(Tool tool, String vocabName) { + this.tool = tool; + this.vocabName = vocabName; + } + + /** Load a vocab file .tokens and return mapping. */ + public Map load() { + Map tokens = new LinkedHashMap(); + int maxTokenType = -1; + File fullFile = getImportedVocabFile(); + try { + FileReader fr = new FileReader(fullFile); + BufferedReader br = new BufferedReader(fr); + StreamTokenizer tokenizer = new StreamTokenizer(br); + tokenizer.parseNumbers(); + tokenizer.wordChars('_', '_'); + tokenizer.eolIsSignificant(true); + tokenizer.slashSlashComments(true); + tokenizer.slashStarComments(true); + tokenizer.ordinaryChar('='); + tokenizer.quoteChar('\''); + tokenizer.whitespaceChars(' ',' '); + tokenizer.whitespaceChars('\t','\t'); + int lineNum = 1; + int token = tokenizer.nextToken(); + while (token != StreamTokenizer.TT_EOF) { + String tokenID; + if ( token == StreamTokenizer.TT_WORD ) { + tokenID = tokenizer.sval; + } + else if ( token == '\'' ) { + tokenID = "'"+tokenizer.sval+"'"; + } + else { + ErrorManager.error(ErrorManager.MSG_TOKENS_FILE_SYNTAX_ERROR, + vocabName+ CodeGenerator.VOCAB_FILE_EXTENSION, + Utils.integer(lineNum)); + while ( tokenizer.nextToken() != StreamTokenizer.TT_EOL ) {;} + token = tokenizer.nextToken(); + continue; + } + token = tokenizer.nextToken(); + if ( token != '=' ) { + ErrorManager.error(ErrorManager.MSG_TOKENS_FILE_SYNTAX_ERROR, + vocabName+CodeGenerator.VOCAB_FILE_EXTENSION, + Utils.integer(lineNum)); + while ( tokenizer.nextToken() != StreamTokenizer.TT_EOL ) {;} + token = tokenizer.nextToken(); + continue; + } + token = tokenizer.nextToken(); // skip '=' + if ( token != StreamTokenizer.TT_NUMBER ) { + ErrorManager.error(ErrorManager.MSG_TOKENS_FILE_SYNTAX_ERROR, + vocabName+CodeGenerator.VOCAB_FILE_EXTENSION, + Utils.integer(lineNum)); + while ( tokenizer.nextToken() != StreamTokenizer.TT_EOL ) {;} + token = tokenizer.nextToken(); + continue; + } + int tokenType = (int)tokenizer.nval; + 
token = tokenizer.nextToken(); + System.out.println("import "+tokenID+"="+tokenType); + tokens.put(tokenID, tokenType); + maxTokenType = Math.max(maxTokenType,tokenType); + lineNum++; + if ( token != StreamTokenizer.TT_EOL ) { + ErrorManager.error(ErrorManager.MSG_TOKENS_FILE_SYNTAX_ERROR, + vocabName+CodeGenerator.VOCAB_FILE_EXTENSION, + Utils.integer(lineNum)); + while ( tokenizer.nextToken() != StreamTokenizer.TT_EOL ) {;} + token = tokenizer.nextToken(); + continue; + } + token = tokenizer.nextToken(); // skip newline + } + br.close(); + } + catch (FileNotFoundException fnfe) { + ErrorManager.error(ErrorManager.MSG_CANNOT_FIND_TOKENS_FILE, + fullFile); + } + catch (IOException ioe) { + ErrorManager.error(ErrorManager.MSG_ERROR_READING_TOKENS_FILE, + fullFile, + ioe); + } + catch (Exception e) { + ErrorManager.error(ErrorManager.MSG_ERROR_READING_TOKENS_FILE, + fullFile, + e); + } + return tokens; + } + + /** Return a File descriptor for vocab file. Look in library or + * in -o output path. antlr -o foo T.g U.g where U needs T.tokens + * won't work unless we look in foo too. If we do not find the + * file in the lib directory then must assume that the .tokens file + * is going to be generated as part of this build and we have defined + * .tokens files so that they ALWAYS are generated in the base output + * directory, which means the current directory for the command line tool if there + * was no output directory specified. + */ + public File getImportedVocabFile() { + + File f = new File(tool.getLibraryDirectory(), + File.separator + + vocabName + + CodeGenerator.VOCAB_FILE_EXTENSION); + if (f.exists()) { + return f; + } + + // We did not find the vocab file in the lib directory, so we need + // to look for it in the output directory which is where .tokens + // files are generated (in the base, not relative to the input + // location.) 
+ // + if (tool.haveOutputDir) { + f = new File(tool.getOutputDirectory(), vocabName + CodeGenerator.VOCAB_FILE_EXTENSION); + } + else { + f = new File(vocabName + CodeGenerator.VOCAB_FILE_EXTENSION); + } + return f; + } +} diff --git a/tool/src/org/antlr/v4/parse/ToolANTLRParser.java b/tool/src/org/antlr/v4/parse/ToolANTLRParser.java new file mode 100644 index 000000000..347d1c88c --- /dev/null +++ b/tool/src/org/antlr/v4/parse/ToolANTLRParser.java @@ -0,0 +1,47 @@ +package org.antlr.v4.parse; + +import org.antlr.runtime.*; +import org.antlr.v4.Tool; +import org.antlr.v4.tool.ErrorType; + +/** Override error handling for use with ANTLR tool itself; leaves + * nothing in grammar associated with Tool so others can use in IDEs, ... + */ +public class ToolANTLRParser extends ANTLRParser { + public Tool tool; + + public ToolANTLRParser(TokenStream input, Tool tool) { + super(input); + this.tool = tool; + } + + public void displayRecognitionError(String[] tokenNames, + RecognitionException e) + { + String msg = getParserErrorMessage(this, e); + if ( paraphrases.size()>0 ) { + String paraphrase = (String)paraphrases.peek(); + msg = msg+" while "+paraphrase; + } + // List stack = getRuleInvocationStack(e, this.getClass().getName()); + // msg += ", rule stack = "+stack; + tool.errMgr.syntaxError(ErrorType.SYNTAX_ERROR, getSourceName(), e.token, e, msg); + } + + public String getParserErrorMessage(Parser parser, RecognitionException e) { + String msg = null; + if ( e instanceof NoViableAltException) { + String name = parser.getTokenErrorDisplay(e.token); + msg = name+" came as a complete surprise to me"; + } + else if ( e instanceof v4ParserException) { + msg = ((v4ParserException)e).msg; + } + else { + msg = parser.getErrorMessage(e, parser.getTokenNames()); + } + return msg; + } + + +} diff --git a/tool/src/org/antlr/v4/parse/v4ParserException.java b/tool/src/org/antlr/v4/parse/v4ParserException.java new file mode 100644 index 000000000..073867a42 --- /dev/null +++ 
b/tool/src/org/antlr/v4/parse/v4ParserException.java @@ -0,0 +1,16 @@ +package org.antlr.v4.parse; + +import org.antlr.runtime.*; + +/** */ +public class v4ParserException extends RecognitionException { + public String msg; + /** Used for remote debugger deserialization */ + public v4ParserException() {;} + + public v4ParserException(String msg, IntStream input) { + super(input); + this.msg = msg; + } + +} diff --git a/tool/src/org/antlr/v4/tool/ActionAST.java b/tool/src/org/antlr/v4/tool/ActionAST.java new file mode 100644 index 000000000..b1b0f72c3 --- /dev/null +++ b/tool/src/org/antlr/v4/tool/ActionAST.java @@ -0,0 +1,29 @@ +package org.antlr.v4.tool; + +import org.antlr.runtime.Token; +import org.antlr.runtime.tree.Tree; + +import java.util.List; + +public class ActionAST extends GrammarAST { + // Alt, rule, grammar space + public AttributeResolver resolver; + public List chunks; // useful for ANTLR IDE developers + /** In which alt does this node live? */ +// public Alternative alt; + + + public ActionAST(GrammarAST node) { + super(node); + this.resolver = ((ActionAST)node).resolver; + this.chunks = ((ActionAST)node).chunks; + } + + public ActionAST(Token t) { super(t); } + public ActionAST(int type) { super(type); } + public ActionAST(int type, Token t) { super(type, t); } + + @Override + public Tree dupNode() { return new ActionAST(this); } + +} diff --git a/tool/src/org/antlr/v4/tool/AltAST.java b/tool/src/org/antlr/v4/tool/AltAST.java new file mode 100644 index 000000000..f777a7ea2 --- /dev/null +++ b/tool/src/org/antlr/v4/tool/AltAST.java @@ -0,0 +1,21 @@ +package org.antlr.v4.tool; + +import org.antlr.runtime.Token; +import org.antlr.runtime.tree.Tree; + +/** An ALT or ALT_REWRITE node (left of ->) */ +public class AltAST extends GrammarAST { + public Alternative alt; + + public AltAST(GrammarAST node) { + super(node); + this.alt = ((AltAST)node).alt; + } + + public AltAST(Token t) { super(t); } + public AltAST(int type) { super(type); } + public 
AltAST(int type, Token t) { super(type, t); } + + @Override + public Tree dupNode() { return new AltAST(this); } +} diff --git a/tool/src/org/antlr/v4/tool/Alternative.java b/tool/src/org/antlr/v4/tool/Alternative.java new file mode 100644 index 000000000..b1b65e968 --- /dev/null +++ b/tool/src/org/antlr/v4/tool/Alternative.java @@ -0,0 +1,136 @@ +package org.antlr.v4.tool; + + +import org.stringtemplate.v4.misc.MultiMap; + +import java.util.ArrayList; +import java.util.List; + +/** Record use/def information about an outermost alternative in a subrule + * or rule of a grammar. + */ +public class Alternative implements AttributeResolver { + Rule rule; + + public AltAST ast; + + // token IDs, string literals in this alt + public MultiMap tokenRefs = new MultiMap(); + + // does not include labels + public MultiMap tokenRefsInActions = new MultiMap(); + + // all rule refs in this alt + public MultiMap ruleRefs = new MultiMap(); + + // does not include labels + public MultiMap ruleRefsInActions = new MultiMap(); + + /** A list of all LabelElementPair attached to tokens like id=ID, ids+=ID */ + public MultiMap labelDefs = new MultiMap(); + + // track all token, rule, label refs in rewrite (right of ->) + public List rewriteElements = new ArrayList(); + + /** Track all executable actions other than named actions like @init + * and catch/finally (not in an alt). Also tracks predicates, rewrite actions. + * We need to examine these actions before code generation so + * that we can detect refs to $rule.attr etc... 
+ */ + public List actions = new ArrayList(); + + public Alternative(Rule r) { this.rule = r; } + + public boolean resolvesToToken(String x, ActionAST node) { + if ( tokenRefs.get(x)!=null ) return true; + LabelElementPair anyLabelDef = getAnyLabelDef(x); + if ( anyLabelDef!=null && anyLabelDef.type==LabelType.TOKEN_LABEL ) return true; + return false; + } + +// public String getTokenLabel(String x, ActionAST node) { +// LabelElementPair anyLabelDef = getAnyLabelDef(x); +// if ( anyLabelDef!=null ) return anyLabelDef.label.getText(); +// if ( tokenRefs.get(x)!=null ) { +// +// } +// LabelElementPair anyLabelDef = getAnyLabelDef(x); +// if ( anyLabelDef!=null && anyLabelDef.type==LabelType.TOKEN_LABEL ) return true; +// return false; +// } + + public boolean resolvesToAttributeDict(String x, ActionAST node) { + if ( resolvesToToken(x, node) ) return true; + if ( x.equals(rule.name) ) return true; // $r for action in rule r, $r is a dict + if ( rule!=null && rule.scope!=null ) return true; + if ( rule.g.scopes.get(x)!=null ) return true; + return false; + } + + /** $x Attribute: rule arguments, return values, predefined rule prop. + */ + public Attribute resolveToAttribute(String x, ActionAST node) { + return rule.resolveToAttribute(x, node); // reuse that code + } + + /** $x.y, x can be surrounding rule, token/rule/label ref. y is visible + * attr in that dictionary. Can't see args on rule refs. + */ + public Attribute resolveToAttribute(String x, String y, ActionAST node) { + if ( rule.name.equals(x) ) { // x is this rule? + return rule.resolveToAttribute(x, y, node); + } + if ( tokenRefs.get(x)!=null ) { // token ref in this alt? + return rule.getPredefinedScope(LabelType.TOKEN_LABEL).get(y); + } + if ( ruleRefs.get(x)!=null ) { // rule ref in this alt? 
+ // look up rule, ask it to resolve y (must be retval or predefined) + return rule.g.getRule(x).resolveRetvalOrProperty(y); + } + LabelElementPair anyLabelDef = getAnyLabelDef(x); + if ( anyLabelDef!=null && anyLabelDef.type==LabelType.RULE_LABEL ) { + return rule.g.getRule(anyLabelDef.element.getText()).resolveRetvalOrProperty(y); + } + else if ( anyLabelDef!=null ) { + return rule.getPredefinedScope(anyLabelDef.type).get(y); + } + return null; + } + + public AttributeDict resolveToDynamicScope(String x, ActionAST node) { + Rule r = resolveToRule(x); + if ( r!=null && r.scope !=null ) return r.scope; + return rule.resolveToDynamicScope(x, node); + } + + public boolean resolvesToLabel(String x, ActionAST node) { + LabelElementPair anyLabelDef = getAnyLabelDef(x); + return anyLabelDef!=null && + (anyLabelDef.type==LabelType.TOKEN_LABEL || + anyLabelDef.type==LabelType.RULE_LABEL); + } + + public boolean resolvesToListLabel(String x, ActionAST node) { + LabelElementPair anyLabelDef = getAnyLabelDef(x); + return anyLabelDef!=null && + (anyLabelDef.type==LabelType.RULE_LIST_LABEL || + anyLabelDef.type==LabelType.TOKEN_LIST_LABEL); + } + + public LabelElementPair getAnyLabelDef(String x) { + List labels = labelDefs.get(x); + if ( labels!=null ) return labels.get(0); + return null; + } + + /** x can be ruleref or rule label. 
*/ + public Rule resolveToRule(String x) { + if ( ruleRefs.get(x)!=null ) return rule.g.getRule(x); + LabelElementPair anyLabelDef = getAnyLabelDef(x); + if ( anyLabelDef!=null && anyLabelDef.type==LabelType.RULE_LABEL ) { + return rule.g.getRule(anyLabelDef.element.getText()); + } + if ( x.equals(rule.name) ) return rule; + return null; + } +} diff --git a/tool/src/org/antlr/v4/tool/Attribute.java b/tool/src/org/antlr/v4/tool/Attribute.java new file mode 100644 index 000000000..f758ba1f3 --- /dev/null +++ b/tool/src/org/antlr/v4/tool/Attribute.java @@ -0,0 +1,37 @@ +package org.antlr.v4.tool; + +/** Track the names of attributes define in arg lists, return values, + * scope blocks etc... + */ +public class Attribute { + /** The entire declaration such as "String foo;" */ + public String decl; + + /** The type; might be empty such as for Python which has no static typing */ + public String type; + + /** The name of the attribute "foo" */ + public String name; + + /** The optional attribute intialization expression */ + public String initValue; + + /** Who contains us? */ + public AttributeDict dict; + + public Attribute() {;} + + public Attribute(String name) { this(name,null); } + + public Attribute(String name, String decl) { + this.name = name; + this.decl = decl; + } + + public String toString() { + if ( initValue!=null ) { + return type+" "+name+"="+initValue; + } + return type+" "+name; + } +} \ No newline at end of file diff --git a/tool/src/org/antlr/v4/tool/AttributeDict.java b/tool/src/org/antlr/v4/tool/AttributeDict.java new file mode 100644 index 000000000..6a61dae34 --- /dev/null +++ b/tool/src/org/antlr/v4/tool/AttributeDict.java @@ -0,0 +1,86 @@ +package org.antlr.v4.tool; + +import java.util.*; + +/** Track the attributes within retval, arg lists etc... + * + * Each rule has potentially 3 scopes: return values, + * parameters, and an implicitly-named scope (i.e., a scope defined in a rule). 
+ * Implicitly-defined scopes are named after the rule; rules and scopes then + * must live in the same name space--no collisions allowed. + */ +public class AttributeDict { + public String name; + public GrammarAST ast; + public DictType type; + + /** All token scopes (token labels) share the same fixed scope of + * of predefined attributes. I keep this out of the runtime.Token + * object to avoid a runtime type leakage. + */ + public static AttributeDict predefinedTokenDict = new AttributeDict(DictType.TOKEN) {{ + add(new Attribute("text")); + add(new Attribute("type")); + add(new Attribute("line")); + add(new Attribute("index")); + add(new Attribute("pos")); + add(new Attribute("channel")); + add(new Attribute("tree")); + add(new Attribute("int")); + }}; + + public static enum DictType { + ARG, RET, TOKEN, + PREDEFINED_RULE, PREDEFINED_TREE_RULE, PREDEFINED_LEXER_RULE, + GLOBAL_SCOPE, // scope symbols { ...} + RULE_SCOPE; // scope { int i; int j; } + } + + /** The list of Attribute objects */ + + public LinkedHashMap attributes = + new LinkedHashMap(); + + public AttributeDict() {;} + public AttributeDict(DictType type) { this.type = type; } + + public Attribute add(Attribute a) { a.dict = this; return attributes.put(a.name, a); } + public Attribute get(String name) { return attributes.get(name); } + + public String getName() { +// if ( isParameterScope ) { +// return name+"_parameter"; +// } +// else if ( isReturnScope ) { +// return name+"_return"; +// } + return name; + } + + public int size() { return attributes==null?0:attributes.size(); } + + /** Return the set of keys that collide from + * this and other. 
+ */ + public Set intersection(AttributeDict other) { + if ( other==null || other.size()==0 || size()==0 ) { + return null; + } + Set inter = new HashSet(); + Set thisKeys = attributes.keySet(); + for (Iterator it = thisKeys.iterator(); it.hasNext();) { + String key = (String) it.next(); + if ( other.attributes.get(key)!=null ) { + inter.add(key); + } + } + if ( inter.size()==0 ) { + return null; + } + return inter; + } + + public String toString() { + return getName()+":"+attributes; + } +} diff --git a/tool/src/org/antlr/v4/tool/AttributeResolver.java b/tool/src/org/antlr/v4/tool/AttributeResolver.java new file mode 100644 index 000000000..9452cb980 --- /dev/null +++ b/tool/src/org/antlr/v4/tool/AttributeResolver.java @@ -0,0 +1,40 @@ +package org.antlr.v4.tool; + +/** Grammars, rules, and alternatives all have symbols visible to + * actions. To evaluate attr exprs, ask action for its resolver + * then ask resolver to look up various symbols. Depending on the context, + * some symbols are available at some aren't. + * + * Alternative level: + * + * $x Attribute: rule arguments, return values, predefined rule prop. + * AttributeDict: references to tokens and token labels in the + * current alt (including any elements within subrules contained + * in that outermost alt). x can be rule with scope or a global scope. + * List label: x is a token/rule list label. + * $x.y Attribute: x is surrounding rule, rule/token/label ref + * $s::y Attribute: s is any rule with scope or global scope; y is prop within + * + * Rule level: + * + * $x Attribute: rule arguments, return values, predefined rule prop. + * AttributeDict: references to token labels in *any* alt. x can + * be any rule with scope or global scope. + * List label: x is a token/rule list label. 
+ * $x.y Attribute: x is surrounding rule, label ref (in any alts) + * $s::y Attribute: s is any rule with scope or global scope; y is prop within + * + * Grammar level: + * + * $s AttributeDict: s is a global scope + * $s::y Attribute: s is a global scope; y is prop within + */ +public interface AttributeResolver { + public boolean resolvesToListLabel(String x, ActionAST node); + public boolean resolvesToLabel(String x, ActionAST node); + public boolean resolvesToAttributeDict(String x, ActionAST node); + public boolean resolvesToToken(String x, ActionAST node); + public Attribute resolveToAttribute(String x, ActionAST node); + public Attribute resolveToAttribute(String x, String y, ActionAST node); + public AttributeDict resolveToDynamicScope(String x, ActionAST node); +} diff --git a/tool/src/org/antlr/v4/tool/BlockAST.java b/tool/src/org/antlr/v4/tool/BlockAST.java new file mode 100644 index 000000000..5d0f9d307 --- /dev/null +++ b/tool/src/org/antlr/v4/tool/BlockAST.java @@ -0,0 +1,28 @@ +package org.antlr.v4.tool; + +import org.antlr.runtime.Token; +import org.antlr.runtime.tree.Tree; + +import java.util.*; + +public class BlockAST extends GrammarASTWithOptions { + // TODO: maybe I need a Subrule object like Rule so these options mov to that? + /** What are the default options for a subrule? 
*/ + public static final Map defaultBlockOptions = + new HashMap() {{put("greedy","true");}}; + + public static final Map defaultLexerBlockOptions = + new HashMap() {{put("greedy","true");}}; + + public BlockAST(GrammarAST node) { + super(node); + } + + public BlockAST(Token t) { super(t); } + public BlockAST(int type) { super(type); } + public BlockAST(int type, Token t) { super(type, t); } + public BlockAST(int type, Token t, String text) { super(type,t,text); } + + @Override + public Tree dupNode() { return new BlockAST(this); } +} diff --git a/tool/src/org/antlr/v4/tool/Grammar.java b/tool/src/org/antlr/v4/tool/Grammar.java index a0f52a0a8..0de1d624e 100644 --- a/tool/src/org/antlr/v4/tool/Grammar.java +++ b/tool/src/org/antlr/v4/tool/Grammar.java @@ -1,4 +1,632 @@ package org.antlr.v4.tool; -public class Grammar { +import org.antlr.runtime.*; +import org.antlr.runtime.tree.TreeWizard; +import org.antlr.v4.Tool; +import org.antlr.v4.misc.*; +import org.antlr.v4.parse.*; +import org.antlr.v4.runtime.Token; +import org.antlr.v4.runtime.atn.ATN; +import org.antlr.v4.runtime.dfa.DFA; + +import java.util.*; + +public class Grammar implements AttributeResolver { + public static final Set doNotCopyOptionsToLexer = + new HashSet() { + { + add("output"); add("ASTLabelType"); add("superClass"); + add("k"); add("backtrack"); add("memoize"); add("rewrite"); + } + }; + + public static Map grammarAndLabelRefTypeToScope = + new HashMap() {{ + put("lexer:RULE_LABEL", Rule.predefinedLexerRulePropertiesDict); + put("lexer:LEXER_STRING_LABEL", Rule.predefinedLexerRulePropertiesDict); + put("lexer:TOKEN_LABEL", AttributeDict.predefinedTokenDict); + put("parser:RULE_LABEL", Rule.predefinedRulePropertiesDict); + put("parser:TOKEN_LABEL", AttributeDict.predefinedTokenDict); + put("tree:RULE_LABEL", Rule.predefinedTreeRulePropertiesDict); + put("tree:TOKEN_LABEL", AttributeDict.predefinedTokenDict); + put("tree:WILDCARD_TREE_LABEL", AttributeDict.predefinedTokenDict); + 
put("combined:RULE_LABEL", Rule.predefinedRulePropertiesDict); + put("combined:TOKEN_LABEL", AttributeDict.predefinedTokenDict); + }}; + public static final int MIN_CHAR_VALUE = '\u0000'; + public static final int MAX_CHAR_VALUE = '\uFFFE'; + + public String name; + public GrammarRootAST ast; + public String text; // testing only + public String fileName; + + /** Was this created from a COMBINED grammar? */ + public Grammar implicitLexer; + public Grammar implicitLexerOwner; + + /** If we're imported, who imported us? If null, implies grammar is root */ + public Grammar parent; + public List importedGrammars; + + /** All rules defined in this specific grammar, not imported. Also does + * not include lexical rules if combined. + */ + public OrderedHashMap rules = new OrderedHashMap(); + int ruleNumber = 1; + + /** The ATN that represents the grammar with edges labelled with tokens + * or epsilon. It is more suitable to analysis than an AST representation. + */ + public ATN atn; + + public Map decisionDFAs = new HashMap(); + + public Vector decisionLOOK; + + public Tool tool; + + /** Token names and literal tokens like "void" are uniquely indexed. + * with -1 implying EOF. Characters are different; they go from + * -1 (EOF) to \uFFFE. For example, 0 could be a binary byte you + * want to lexer. Labels of DFA/ATN transitions can be both tokens + * and characters. I use negative numbers for bookkeeping labels + * like EPSILON. Char/String literals and token types overlap in the same + * space, however. + */ + int maxTokenType = Token.MIN_TOKEN_TYPE-1; + + /** Map token like ID (but not literals like "while") to its token type */ + public Map tokenNameToTypeMap = new LinkedHashMap(); + + /** Map token literals like "while" to its token type. It may be that + * WHILE="while"=35, in which case both tokenIDToTypeMap and this + * field will have entries both mapped to 35. 
+ */ + public Map stringLiteralToTypeMap = new LinkedHashMap(); + /** Reverse index for stringLiteralToTypeMap. Indexed with raw token type. + * 0 is invalid. */ + public Vector typeToStringLiteralList = new Vector(); + + /** Map a token type to its token name. Indexed with raw token type. + * 0 is invalid. + */ + public Vector typeToTokenList = new Vector(); + + /** Map a name to an action. + * The code generator will use this to fill holes in the output files. + * I track the AST node for the action in case I need the line number + * for errors. + */ + public Map namedActions = new HashMap(); + + + /** Tracks all forced actions in all alternatives of all rules. + * Or if lexer all rules period. Doesn't track sempreds. + * maps tree node to action index. + */ + public LinkedHashMap actions = new LinkedHashMap(); + + /** All sempreds found in grammar; maps tree node to sempred index */ + public LinkedHashMap sempreds = new LinkedHashMap(); + + public Map scopes = new LinkedHashMap(); + public static final String AUTO_GENERATED_TOKEN_NAME_PREFIX = "T__"; + + public Grammar(Tool tool, GrammarRootAST ast) { + if ( ast==null ) throw new IllegalArgumentException("can't pass null tree"); + this.tool = tool; + this.ast = ast; + this.name = ((GrammarAST)ast.getChild(0)).getText(); + initTokenSymbolTables(); + } + + /** For testing */ + public Grammar(String grammarText) throws org.antlr.runtime.RecognitionException { + this("", grammarText, null); + } + + /** For testing */ + public Grammar(String grammarText, ANTLRToolListener listener) + throws org.antlr.runtime.RecognitionException + { + this("", grammarText, listener); + } + + /** For testing; only builds trees; no sem anal */ + public Grammar(String fileName, String grammarText, ANTLRToolListener listener) + throws org.antlr.runtime.RecognitionException + { + this.text = grammarText; + this.fileName = fileName; + this.tool = new Tool(); + this.tool.addListener(listener); + org.antlr.runtime.ANTLRStringStream in = new 
org.antlr.runtime.ANTLRStringStream(grammarText); + in.name = fileName; + ANTLRLexer lexer = new ANTLRLexer(in); + CommonTokenStream tokens = new CommonTokenStream(lexer); + ToolANTLRParser p = new ToolANTLRParser(tokens,tool); + p.setTreeAdaptor(new GrammarASTAdaptor(in)); + ParserRuleReturnScope r = p.grammarSpec(); + if ( r.getTree() instanceof GrammarRootAST ) { + this.ast = (GrammarRootAST)r.getTree(); + this.ast.hasErrors = p.getNumberOfSyntaxErrors()>0; + this.name = ((GrammarAST)ast.getChild(0)).getText(); + } + initTokenSymbolTables(); + } + + protected void initTokenSymbolTables() { + if ( isTreeGrammar() ) { + typeToTokenList.setSize(Token.UP + 1); + typeToTokenList.set(Token.DOWN, "DOWN"); + typeToTokenList.set(Token.UP, "UP"); + tokenNameToTypeMap.put("DOWN", Token.DOWN); + tokenNameToTypeMap.put("UP", Token.UP); + } + tokenNameToTypeMap.put("EOF", Token.EOF); + } + + public void loadImportedGrammars() { + if ( ast==null ) return; + GrammarAST i = (GrammarAST)ast.getFirstChildWithType(ANTLRParser.IMPORT); + if ( i==null ) return; + importedGrammars = new ArrayList(); + for (Object c : i.getChildren()) { + GrammarAST t = (GrammarAST)c; + String importedGrammarName = null; + if ( t.getType()==ANTLRParser.ASSIGN ) { + importedGrammarName = t.getChild(1).getText(); + System.out.println("import "+ importedGrammarName); + } + else if ( t.getType()==ANTLRParser.ID ) { + importedGrammarName = t.getText(); + System.out.println("import "+t.getText()); + } + try { + GrammarAST root = tool.load(importedGrammarName+".g"); + if ( root instanceof GrammarASTErrorNode ) return; // came back as error node + GrammarRootAST ast = (GrammarRootAST)root; + Grammar g = tool.createGrammar(ast); + g.fileName = importedGrammarName+".g"; + g.parent = this; + importedGrammars.add(g); + } + catch (Exception e) { + System.err.println("can't load grammar "+importedGrammarName); + } + } + } + + public void defineAction(GrammarAST atAST) { + if ( atAST.getChildCount()==2 ) { + String 
name = atAST.getChild(0).getText(); + namedActions.put(name, (ActionAST)atAST.getChild(1)); + } + else { + String scope = atAST.getChild(0).getText(); + if ( scope.equals(getTypeString()) ) { + String name = atAST.getChild(1).getText(); + namedActions.put(name, (ActionAST)atAST.getChild(2)); + } + } + } + + public void defineRule(Rule r) { + if ( rules.get(r.name)!=null ) return; + rules.put(r.name, r); + r.index = ruleNumber++; + } + +// public int getNumRules() { +// int n = rules.size(); +// List imports = getAllImportedGrammars(); +// if ( imports!=null ) { +// for (Grammar g : imports) n += g.getNumRules(); +// } +// return n; +// } + + public Rule getRule(String name) { + Rule r = rules.get(name); + if ( r!=null ) return r; + List imports = getAllImportedGrammars(); + if ( imports==null ) return null; + for (Grammar g : imports) { + r = g.rules.get(name); + if ( r!=null ) return r; + } + return null; + } + + public Rule getRule(String grammarName, String ruleName) { + if ( grammarName!=null ) { // scope override + Grammar g = getImportedGrammar(grammarName); + if ( g ==null ) { + return null; + } + return g.rules.get(ruleName); + } + return getRule(ruleName); + } + + public void defineScope(AttributeDict s) { scopes.put(s.getName(), s); } + + /** Get list of all imports from all grammars in the delegate subtree of g. + * The grammars are in import tree preorder. Don't include ourselves + * in list as we're not a delegate of ourselves. 
+ */ + public List getAllImportedGrammars() { + if ( importedGrammars==null ) return null; + List delegates = new ArrayList(); + for (int i = 0; i < importedGrammars.size(); i++) { + Grammar d = importedGrammars.get(i); + delegates.add(d); + List ds = d.getAllImportedGrammars(); + if ( ds!=null ) delegates.addAll( ds ); + } + return delegates; + } + + public List getImportedGrammars() { return importedGrammars; } + + /** Get delegates below direct delegates of g + public List getIndirectDelegates(Grammar g) { + List direct = getDirectDelegates(g); + List delegates = getDelegates(g); + delegates.removeAll(direct); + return delegates; + } +*/ + + /** Return list of imported grammars from root down to our parent. + * Order is [root, ..., this.parent]. (us not included). + */ + public List getGrammarAncestors() { + Grammar root = getOutermostGrammar(); + if ( this==root ) return null; + List grammars = new ArrayList(); + // walk backwards to root, collecting grammars + Grammar p = this.parent; + while ( p!=null ) { + grammars.add(0, p); // add to head so in order later + p = p.parent; + } + return grammars; + } + + /** Return the grammar that imported us and our parents. Return this + * if we're root. + */ + public Grammar getOutermostGrammar() { + if ( parent==null ) return this; + return parent.getOutermostGrammar(); + } + + /** Get the name of the generated recognizer; may or may not be same + * as grammar name. + * Recognizer is TParser and TLexer from T if combined, else + * just use T regardless of grammar type. 
+ */ + public String getRecognizerName() { + String suffix = ""; + List grammarsFromRootToMe = getOutermostGrammar().getGrammarAncestors(); + String qualifiedName = name; + if ( grammarsFromRootToMe!=null ) { + StringBuffer buf = new StringBuffer(); + for (Grammar g : grammarsFromRootToMe) { + buf.append(g.name); + buf.append('_'); + } + buf.append(name); + qualifiedName = buf.toString(); + } + if ( isCombined() || (isLexer() && implicitLexer!=null) ) + { + suffix = Grammar.getGrammarTypeToFileNameSuffix(getType()); + } + return qualifiedName+suffix; + } + + public String getStringLiteralLexerRuleName(String lit) { + int ttype = getTokenType(lit); + return AUTO_GENERATED_TOKEN_NAME_PREFIX +ttype; + } + + /** Return grammar directly imported by this grammar */ + public Grammar getImportedGrammar(String name) { + for (Grammar g : importedGrammars) { + if ( g.name.equals(name) ) return g; + } + return null; + } + + public int getTokenType(String token) { + Integer I = null; + if ( token.charAt(0)=='\'') { + I = stringLiteralToTypeMap.get(token); + } + else { // must be a label like ID + I = tokenNameToTypeMap.get(token); + } + int i = (I!=null)?I.intValue(): Token.INVALID_TYPE; + //System.out.println("grammar type "+type+" "+tokenName+"->"+i); + return i; + } + + /** Given a token type, get a meaningful name for it such as the ID + * or string literal. If this is a lexer and the ttype is in the + * char vocabulary, compute an ANTLR-valid (possibly escaped) char literal. + */ + public String getTokenDisplayName(int ttype) { + String tokenName = null; + int index=0; + // inside any target's char range and is lexer grammar? 
+ if ( isLexer() && + ttype >= MIN_CHAR_VALUE && ttype <= MAX_CHAR_VALUE ) + { + return CharSupport.getANTLRCharLiteralForChar(ttype); + } + else if ( ttype==Token.EOF ) { + tokenName = "EOF"; + } + else { + if ( ttype"; + } + } + + public String getOption(String key) { + if ( ast.options==null ) return null; + return ast.options.get(key); + } + + public String getOption(String key, String defaultValue) { + if ( ast.options==null ) return defaultValue; + String v = ast.options.get(key); + if ( v!=null ) return v; + return defaultValue; + } + + public static Map getStringLiteralAliasesFromLexerRules(GrammarRootAST ast) { + GrammarAST combinedRulesRoot = + (GrammarAST)ast.getFirstChildWithType(ANTLRParser.RULES); + if ( combinedRulesRoot==null ) return null; + + List ruleNodes = combinedRulesRoot.getChildren(); + if ( ruleNodes==null || ruleNodes.size()==0 ) return null; + GrammarASTAdaptor adaptor = new GrammarASTAdaptor(ruleNodes.get(0).token.getInputStream()); + TreeWizard wiz = new TreeWizard(adaptor,ANTLRParser.tokenNames); + Map lexerRuleToStringLiteral = new HashMap(); + + for (GrammarASTWithOptions r : ruleNodes) { + String ruleName = r.getChild(0).getText(); + if ( Character.isUpperCase(ruleName.charAt(0)) ) { + Map nodes = new HashMap(); + boolean isLitRule = + wiz.parse(r, "(RULE %name:ID (BLOCK (ALT %lit:STRING_LITERAL)))", nodes); + if ( isLitRule ) { + GrammarAST litNode = (GrammarAST)nodes.get("lit"); + GrammarAST nameNode = (GrammarAST)nodes.get("name"); + lexerRuleToStringLiteral.put(litNode.getText(), nameNode.getText()); + } + } + } + return lexerRuleToStringLiteral; + } + + public void setLookaheadDFA(int decision, DFA lookaheadDFA) { + decisionDFAs.put(Utils.integer(decision), lookaheadDFA); + } } diff --git a/tool/src/org/antlr/v4/tool/GrammarAST.java b/tool/src/org/antlr/v4/tool/GrammarAST.java new file mode 100644 index 000000000..4b84ad595 --- /dev/null +++ b/tool/src/org/antlr/v4/tool/GrammarAST.java @@ -0,0 +1,99 @@ +package 
org.antlr.v4.tool; + +import org.antlr.runtime.*; +import org.antlr.runtime.tree.*; +import org.antlr.v4.misc.IntervalSet; +import org.antlr.v4.parse.ANTLRParser; +import org.antlr.v4.runtime.atn.ATNState; + +import java.util.*; + +public class GrammarAST extends CommonTree { + /** If we build an ATN, we make AST node point at left edge of ATN construct */ + public ATNState atnState; + + public GrammarAST() {;} + public GrammarAST(Token t) { super(t); } + public GrammarAST(GrammarAST node) { super(node); } + public GrammarAST(int type) { super(new CommonToken(type, ANTLRParser.tokenNames[type])); } + public GrammarAST(int type, Token t) { + this(new CommonToken(type, t.getText())); + token.setInputStream(t.getInputStream()); + token.setLine(t.getLine()); + token.setCharPositionInLine(t.getCharPositionInLine()); + } + public GrammarAST(int type, Token t, String text) { + this(new CommonToken(type, text)); + token.setInputStream(t.getInputStream()); + token.setLine(t.getLine()); + token.setCharPositionInLine(t.getCharPositionInLine()); + } + + public List getNodesWithType(int ttype) { + return getNodesWithType(IntervalSet.of(ttype)); + } + + public List getNodesWithType(IntervalSet types) { + List nodes = new ArrayList(); + List work = new LinkedList(); + work.add(this); + GrammarAST t = null; + while ( work.size()>0 ) { + t = work.remove(0); + if ( types.member(t.getType()) ) nodes.add(t); + if ( t.children!=null ) work.addAll(t.children); + } + return nodes; + } + + public AltAST getOutermostAltNode() { + if ( this instanceof AltAST && parent.parent instanceof RuleAST ) { + return (AltAST)this; + } + if ( parent!=null ) return ((GrammarAST)parent).getOutermostAltNode(); + return null; + } + + // TODO: move to basetree when i settle on how runtime works + // TODO: don't include this node!! 
+ // TODO: reuse other method + public CommonTree getFirstDescendantWithType(int type) { + if ( getType()==type ) return this; + if ( children==null ) return null; + for (Object c : children) { + GrammarAST t = (GrammarAST)c; + if ( t.getType()==type ) return t; + CommonTree d = t.getFirstDescendantWithType(type); + if ( d!=null ) return d; + } + return null; + } + + // TODO: don't include this node!! + public CommonTree getFirstDescendantWithType(org.antlr.runtime.BitSet types) { + if ( types.member(getType()) ) return this; + if ( children==null ) return null; + for (Object c : children) { + GrammarAST t = (GrammarAST)c; + if ( types.member(t.getType()) ) return t; + CommonTree d = t.getFirstDescendantWithType(types); + if ( d!=null ) return d; + } + return null; + } + +// @Override +// public boolean equals(Object obj) { +// return super.equals(obj); +// } + + @Override + public Tree dupNode() { + return new GrammarAST(this); + } + + @Override + public String toString() { + return super.toString(); + } +} diff --git a/tool/src/org/antlr/v4/tool/GrammarRootAST.java b/tool/src/org/antlr/v4/tool/GrammarRootAST.java new file mode 100644 index 000000000..500e7efa2 --- /dev/null +++ b/tool/src/org/antlr/v4/tool/GrammarRootAST.java @@ -0,0 +1,33 @@ +package org.antlr.v4.tool; + +import org.antlr.runtime.Token; +import org.antlr.runtime.tree.Tree; + +import java.util.*; + +public class GrammarRootAST extends GrammarASTWithOptions { + public static final Map defaultOptions = + new HashMap() { + { + put("language","Java"); + } + }; + public int grammarType; // LEXER, PARSER, TREE, GRAMMAR (combined) + public boolean hasErrors; + + public GrammarRootAST(GrammarAST node) { + super(node); + this.grammarType = ((GrammarRootAST)node).grammarType; + this.hasErrors = ((GrammarRootAST)node).hasErrors; + } + + @Override + public Tree dupNode() { return new GrammarRootAST(this); } + + public GrammarRootAST(int type) { super(type); } + public GrammarRootAST(Token t) { super(t); } + 
public GrammarRootAST(int type, Token t) { super(type, t); } + public GrammarRootAST(int type, Token t, String text) { + super(type,t,text); + } +} diff --git a/tool/src/org/antlr/v4/tool/LabelElementPair.java b/tool/src/org/antlr/v4/tool/LabelElementPair.java new file mode 100644 index 000000000..e76171e35 --- /dev/null +++ b/tool/src/org/antlr/v4/tool/LabelElementPair.java @@ -0,0 +1,48 @@ +package org.antlr.v4.tool; + +import org.antlr.runtime.BitSet; +import org.antlr.v4.parse.ANTLRParser; + +public class LabelElementPair { + public static final BitSet tokenTypeForTokens = new BitSet(); + static { + tokenTypeForTokens.add(ANTLRParser.TOKEN_REF); + tokenTypeForTokens.add(ANTLRParser.STRING_LITERAL); + tokenTypeForTokens.add(ANTLRParser.WILDCARD); + } + + public GrammarAST label; + public GrammarAST element; + public LabelType type; + + public LabelElementPair(Grammar g, GrammarAST label, GrammarAST element, int labelOp) { + this.label = label; + this.element = element; + // compute general case for label type + if ( element.getFirstDescendantWithType(tokenTypeForTokens)!=null ) { + if ( labelOp==ANTLRParser.ASSIGN ) type = LabelType.TOKEN_LABEL; + else type = LabelType.TOKEN_LIST_LABEL; + } + else if ( element.getFirstDescendantWithType(ANTLRParser.RULE_REF)!=null ) { + if ( labelOp==ANTLRParser.ASSIGN ) type = LabelType.RULE_LABEL; + else type = LabelType.RULE_LIST_LABEL; + } + + // now reset if lexer and string + if ( g.isLexer() ) { + if ( element.getFirstDescendantWithType(ANTLRParser.STRING_LITERAL)!=null ) { + if ( labelOp==ANTLRParser.ASSIGN ) type = LabelType.LEXER_STRING_LABEL; + } + } + else if ( g.isTreeGrammar() ) { + if ( element.getFirstDescendantWithType(ANTLRParser.WILDCARD)!=null ) { + if ( labelOp==ANTLRParser.ASSIGN ) type = LabelType.WILDCARD_TREE_LABEL; + else type = LabelType.WILDCARD_TREE_LIST_LABEL; + } + } + } + + public String toString() { + return label.getText()+" "+type+" "+element.toString(); + } +} diff --git 
a/tool/src/org/antlr/v4/tool/LabelType.java b/tool/src/org/antlr/v4/tool/LabelType.java new file mode 100644 index 000000000..e60367211 --- /dev/null +++ b/tool/src/org/antlr/v4/tool/LabelType.java @@ -0,0 +1,15 @@ +package org.antlr.v4.tool; + +/** the various kinds of labels. t=type, id=ID, types+=type ids+=ID */ +public enum LabelType { + RULE_LABEL, + TOKEN_LABEL, + RULE_LIST_LABEL, + TOKEN_LIST_LABEL, + LEXER_STRING_LABEL, // used in lexer for x='a' + SUBRULE_LABEL, // x=(...) + SUBRULE_LIST_LABEL, // x+=(...) + WILDCARD_TREE_LABEL, // Used in tree grammar x=. + WILDCARD_TREE_LIST_LABEL // Used in tree grammar x+=. + ; +} diff --git a/tool/src/org/antlr/v4/tool/LexerGrammar.java b/tool/src/org/antlr/v4/tool/LexerGrammar.java new file mode 100644 index 000000000..b6430e57d --- /dev/null +++ b/tool/src/org/antlr/v4/tool/LexerGrammar.java @@ -0,0 +1,36 @@ +package org.antlr.v4.tool; + +import org.antlr.misc.MultiMap; +import org.antlr.runtime.RecognitionException; +import org.antlr.tool.Rule; +import org.antlr.v4.Tool; + +/** */ +public class LexerGrammar extends Grammar { + public static final String DEFAULT_MODE_NAME = "DEFAULT_MODE"; + + public MultiMap modes = new MultiMap(); + + public LexerGrammar(Tool tool, GrammarRootAST ast) { + super(tool, ast); + } + + public LexerGrammar(String grammarText) throws RecognitionException { + super(grammarText); + } + + public LexerGrammar(String grammarText, ANTLRToolListener listener) throws RecognitionException { + super(grammarText, listener); + } + + public LexerGrammar(String fileName, String grammarText, ANTLRToolListener listener) throws RecognitionException { + super(fileName, grammarText, listener); + } + + + @Override + public void defineRule(Rule r) { + super.defineRule(r); + modes.map(r.mode, r); + } +} diff --git a/tool/src/org/antlr/v4/tool/PredAST.java b/tool/src/org/antlr/v4/tool/PredAST.java new file mode 100644 index 000000000..d496736e2 --- /dev/null +++ b/tool/src/org/antlr/v4/tool/PredAST.java @@ 
-0,0 +1,15 @@ +package org.antlr.v4.tool; + +import org.antlr.runtime.Token; + +public class PredAST extends ActionAST { + public PredAST(GrammarAST node) { + super(node); + this.resolver = ((ActionAST)node).resolver; + this.chunks = ((ActionAST)node).chunks; + } + + public PredAST(Token t) { super(t); } + public PredAST(int type) { super(type); } + public PredAST(int type, Token t) { super(type, t); } +} diff --git a/tool/src/org/antlr/v4/tool/Rule.java b/tool/src/org/antlr/v4/tool/Rule.java new file mode 100644 index 000000000..d0ed842e9 --- /dev/null +++ b/tool/src/org/antlr/v4/tool/Rule.java @@ -0,0 +1,246 @@ +package org.antlr.v4.tool; + +import org.antlr.runtime.Token; +import org.antlr.v4.parse.ANTLRParser; +import org.stringtemplate.v4.misc.MultiMap; + +import java.util.*; + +public class Rule implements AttributeResolver { + /** Rule refs have a predefined set of attributes as well as + * the return values and args. + */ + public static AttributeDict predefinedRulePropertiesDict = + new AttributeDict(AttributeDict.DictType.PREDEFINED_RULE) {{ + add(new Attribute("text")); + add(new Attribute("start")); + add(new Attribute("stop")); + add(new Attribute("tree")); + add(new Attribute("st")); + }}; + + public static AttributeDict predefinedTreeRulePropertiesDict = + new AttributeDict(AttributeDict.DictType.PREDEFINED_TREE_RULE) {{ + add(new Attribute("text")); + add(new Attribute("start")); // note: no stop; not meaningful + add(new Attribute("tree")); + add(new Attribute("st")); + }}; + + public static AttributeDict predefinedLexerRulePropertiesDict = + new AttributeDict(AttributeDict.DictType.PREDEFINED_LEXER_RULE) {{ + add(new Attribute("text")); + add(new Attribute("type")); + add(new Attribute("line")); + add(new Attribute("index")); + add(new Attribute("pos")); + add(new Attribute("channel")); + add(new Attribute("start")); + add(new Attribute("stop")); + add(new Attribute("int")); + }}; + + public String name; + public List modifiers; + + public RuleAST 
ast; + public AttributeDict args; + public AttributeDict retvals; + public AttributeDict scope; // scope { int i; } + + /** A list of scope names used by this rule */ + public List useScopes; + + public Grammar g; + + /** If we're in a lexer grammar, we might be in a mode */ + public String mode; + + /** Map a name to an action for this rule like @init {...}. + * The code generator will use this to fill holes in the rule template. + * I track the AST node for the action in case I need the line number + * for errors. + */ + public Map namedActions = + new HashMap(); + + /** Track exception handler actions (exception type is prev child); + * don't track finally action + */ + public List exceptionActions = new ArrayList(); + + public ActionAST finallyAction; + + public int numberOfAlts; + + public boolean isStartRule = true; // nobody calls us + + public Alternative[] alt; + + /** All rules have unique index 1..n */ + public int index; + + public int actionIndex; // if lexer + + public Rule(Grammar g, String name, RuleAST ast, int numberOfAlts) { + this.g = g; + this.name = name; + this.ast = ast; + this.numberOfAlts = numberOfAlts; + alt = new Alternative[numberOfAlts+1]; // 1..n + for (int i=1; i<=numberOfAlts; i++) alt[i] = new Alternative(this); + } + + public void defineActionInAlt(int currentAlt, ActionAST actionAST) { + alt[currentAlt].actions.add(actionAST); + if ( g.isLexer() || actionAST.getType()== ANTLRParser.FORCED_ACTION ) { + actionIndex = g.actions.size() + 1; + g.actions.put(actionAST, actionIndex); + } + } + + public void definePredicateInAlt(int currentAlt, PredAST predAST) { + alt[currentAlt].actions.add(predAST); + g.sempreds.put(predAST, g.sempreds.size() + 1); + } + + public Attribute resolveRetvalOrProperty(String y) { + if ( retvals!=null ) { + Attribute a = retvals.get(y); + if ( a!=null ) return a; + } + AttributeDict d = getPredefinedScope(LabelType.RULE_LABEL); + return d.get(y); + } + + public Set getTokenRefs() { + Set refs = new 
HashSet(); + for (int i=1; i<=numberOfAlts; i++) { + refs.addAll(alt[i].tokenRefs.keySet()); + } + return refs; + } + + public Set getLabelNames() { + Set refs = new HashSet(); + for (int i=1; i<=numberOfAlts; i++) { + refs.addAll(alt[i].labelDefs.keySet()); + } + return refs; + } + + // TODO: called frequently; make it more efficient + public MultiMap getLabelDefs() { + MultiMap defs = + new MultiMap(); + for (int i=1; i<=numberOfAlts; i++) { + for (List pairs : alt[i].labelDefs.values()) { + for (LabelElementPair p : pairs) { + defs.map(p.label.getText(), p); + } + } + } + return defs; + } + + /** $x Attribute: rule arguments, return values, predefined rule prop. + */ + public Attribute resolveToAttribute(String x, ActionAST node) { + if ( args!=null ) { + Attribute a = args.get(x); if ( a!=null ) return a; + } + if ( retvals!=null ) { + Attribute a = retvals.get(x); if ( a!=null ) return a; + } + AttributeDict properties = getPredefinedScope(LabelType.RULE_LABEL); + return properties.get(x); + } + + /** $x.y Attribute: x is surrounding rule, label ref (in any alts) */ + public Attribute resolveToAttribute(String x, String y, ActionAST node) { + if ( this.name.equals(x) ) { // x is this rule? 
+ return resolveToAttribute(y, node); + } + LabelElementPair anyLabelDef = getAnyLabelDef(x); + if ( anyLabelDef!=null ) { + if ( anyLabelDef.type==LabelType.RULE_LABEL ) { + return g.getRule(anyLabelDef.element.getText()).resolveRetvalOrProperty(y); + } + else { + return getPredefinedScope(anyLabelDef.type).get(y); + } + } + return null; + + } + + public AttributeDict resolveToDynamicScope(String x, ActionAST node) { + Rule r = resolveToRule(x); + if ( r!=null && r.scope!=null ) return r.scope; + return g.scopes.get(x); + } + + public boolean resolvesToLabel(String x, ActionAST node) { + return false; + } + + public boolean resolvesToListLabel(String x, ActionAST node) { + LabelElementPair anyLabelDef = getAnyLabelDef(x); + return anyLabelDef!=null && + (anyLabelDef.type==LabelType.RULE_LIST_LABEL || + anyLabelDef.type==LabelType.TOKEN_LIST_LABEL); + } + + public boolean resolvesToToken(String x, ActionAST node) { + LabelElementPair anyLabelDef = getAnyLabelDef(x); + if ( anyLabelDef!=null && anyLabelDef.type==LabelType.TOKEN_LABEL ) return true; + return false; + } + + public boolean resolvesToAttributeDict(String x, ActionAST node) { + if ( resolvesToToken(x, node) ) return true; + if ( x.equals(name) ) return true; // $r for action in rule r, $r is a dict + if ( scope!=null ) return true; + if ( g.scopes.get(x)!=null ) return true; + return false; + } + + public Rule resolveToRule(String x) { + if ( x.equals(this.name) ) return this; + LabelElementPair anyLabelDef = getAnyLabelDef(x); + if ( anyLabelDef!=null && anyLabelDef.type==LabelType.RULE_LABEL ) { + return g.getRule(anyLabelDef.element.getText()); + } + return g.getRule(x); + } + + public LabelElementPair getAnyLabelDef(String x) { + List labels = getLabelDefs().get(x); + if ( labels!=null ) return labels.get(0); + return null; + } + + public AttributeDict getPredefinedScope(LabelType ltype) { + String grammarLabelKey = g.getTypeString() + ":" + ltype; + return 
Grammar.grammarAndLabelRefTypeToScope.get(grammarLabelKey); + } + + public boolean isFragment() { + if ( modifiers==null ) return false; + for (GrammarAST a : modifiers) { + if ( a.getText().equals("fragment") ) return true; + } + return false; + } + + @Override + public String toString() { + StringBuilder buf = new StringBuilder(); + buf.append("Rule{name="+name); + if ( args!=null ) buf.append(", args=" + args); + if ( retvals!=null ) buf.append(", retvals=" + retvals); + if ( scope!=null ) buf.append(", scope=" + scope); + buf.append("}"); + return buf.toString(); + } +} diff --git a/tool/src/org/antlr/v4/tool/RuleAST.java b/tool/src/org/antlr/v4/tool/RuleAST.java new file mode 100644 index 000000000..90789f4b1 --- /dev/null +++ b/tool/src/org/antlr/v4/tool/RuleAST.java @@ -0,0 +1,29 @@ +package org.antlr.v4.tool; + +import org.antlr.runtime.Token; +import org.antlr.runtime.tree.Tree; +import org.antlr.v4.parse.ANTLRParser; + +public class RuleAST extends GrammarASTWithOptions { + public RuleAST(GrammarAST node) { + super(node); + } + + public RuleAST(Token t) { super(t); } + public RuleAST(int type) { super(type); } + + @Override + public Tree dupNode() { return new RuleAST(this); } + + public ActionAST getLexerAction() { + Tree blk = getFirstChildWithType(ANTLRParser.BLOCK); + if ( blk.getChildCount()==1 ) { + Tree onlyAlt = blk.getChild(0); + Tree lastChild = onlyAlt.getChild(onlyAlt.getChildCount()-1); + if ( lastChild.getType()==ANTLRParser.ACTION ) { + return (ActionAST)lastChild; + } + } + return null; + } +} diff --git a/tool/src/org/antlr/v4/tool/TerminalAST.java b/tool/src/org/antlr/v4/tool/TerminalAST.java new file mode 100644 index 000000000..ecb800d49 --- /dev/null +++ b/tool/src/org/antlr/v4/tool/TerminalAST.java @@ -0,0 +1,19 @@ +package org.antlr.v4.tool; + +import org.antlr.runtime.Token; +import org.antlr.runtime.tree.Tree; + +public class TerminalAST extends GrammarASTWithOptions { + public static final String defaultTokenOption = "node"; + + 
public TerminalAST(GrammarAST node) { + super(node); + } + + public TerminalAST(Token t) { super(t); } + public TerminalAST(int type) { super(type); } + public TerminalAST(int type, Token t) { super(type, t); } + + @Override + public Tree dupNode() { return new TerminalAST(this); } +}