Merge pull request #274 from sharwell/fix-267

Support character U+FFFF
This commit is contained in:
Sam Harwell 2013-06-01 20:08:46 -07:00
commit eeda06b698
5 changed files with 140 additions and 55 deletions

View File

@ -35,23 +35,6 @@ import org.antlr.v4.runtime.misc.NotNull;
/** A source of characters for an ANTLR lexer. */
public interface CharStream extends IntStream {
/**
* The minimum allowed value for a character in a {@code CharStream}.
*/
public static final int MIN_CHAR = Character.MIN_VALUE;
/**
* The maximum allowed value for a character in a {@code CharStream}.
* <p/>
* This value is {@code Character.MAX_VALUE - 1}, which reserves the value
* {@code Character.MAX_VALUE} for special use within an implementing class.
* For some implementations, the data buffers required for supporting the
* marked ranges of {@link IntStream} are stored as {@code char[]} instead
* of {@code int[]}, with {@code Character.MAX_VALUE} being used instead of
* {@code -1} to mark the end of the stream internally.
*/
public static final int MAX_CHAR = Character.MAX_VALUE-1;
/**
* This method returns the text for a range of characters within this input
* stream. This method is guaranteed to not throw an exception if the

View File

@ -30,6 +30,7 @@
package org.antlr.v4.runtime.atn;
import org.antlr.v4.runtime.Token;
import org.antlr.v4.runtime.dfa.DFAState;
import org.antlr.v4.runtime.misc.IntervalSet;
import org.antlr.v4.runtime.misc.NotNull;
@ -56,7 +57,7 @@ public abstract class ATNSimulator {
/* WARNING: DO NOT MERGE THIS LINE. If UUIDs differ during a merge,
* resolve the conflict by generating a new ID!
*/
SERIALIZED_UUID = UUID.fromString("065C46D6-8859-4FD7-A158-83E693BF2B52");
SERIALIZED_UUID = UUID.fromString("33761B2D-78BB-4A43-8B0B-4F5BEE8AACF3");
}
/** Must distinguish between missing edge and edge we know leads nowhere */
@ -124,7 +125,6 @@ public abstract class ATNSimulator {
data[i] = (char)(data[i] - 2);
}
List<IntervalSet> sets = new ArrayList<IntervalSet>();
int p = 0;
int version = toInt(data[p++]);
if (version != SERIALIZED_VERSION) {
@ -149,7 +149,7 @@ public abstract class ATNSimulator {
List<Pair<LoopEndState, Integer>> loopBackStateNumbers = new ArrayList<Pair<LoopEndState, Integer>>();
List<Pair<BlockStartState, Integer>> endStateNumbers = new ArrayList<Pair<BlockStartState, Integer>>();
int nstates = toInt(data[p++]);
for (int i=1; i<=nstates; i++) {
for (int i=0; i<nstates; i++) {
int stype = toInt(data[p++]);
// ignore bad type of states
if ( stype==ATNState.INVALID_TYPE ) {
@ -158,6 +158,10 @@ public abstract class ATNSimulator {
}
int ruleIndex = toInt(data[p++]);
if (ruleIndex == Character.MAX_VALUE) {
ruleIndex = -1;
}
ATNState s = stateFactory(stype, ruleIndex);
if ( stype == ATNState.LOOP_END ) { // special case
int loopBackStateNumber = toInt(data[p++]);
@ -200,8 +204,16 @@ public abstract class ATNSimulator {
atn.ruleToStartState[i] = startState;
if ( atn.grammarType == ATNType.LEXER ) {
int tokenType = toInt(data[p++]);
if (tokenType == 0xFFFF) {
tokenType = Token.EOF;
}
atn.ruleToTokenType[i] = tokenType;
int actionIndex = toInt(data[p++]);
if (actionIndex == 0xFFFF) {
actionIndex = -1;
}
atn.ruleToActionIndex[i] = actionIndex;
}
}
@ -229,13 +241,20 @@ public abstract class ATNSimulator {
//
// SETS
//
List<IntervalSet> sets = new ArrayList<IntervalSet>();
int nsets = toInt(data[p++]);
for (int i=1; i<=nsets; i++) {
for (int i=0; i<nsets; i++) {
int nintervals = toInt(data[p]);
p++;
IntervalSet set = new IntervalSet();
sets.add(set);
for (int j=1; j<=nintervals; j++) {
boolean containsEof = toInt(data[p++]) != 0;
if (containsEof) {
set.add(-1);
}
for (int j=0; j<nintervals; j++) {
set.add(toInt(data[p]), toInt(data[p + 1]));
p += 2;
}
@ -245,7 +264,7 @@ public abstract class ATNSimulator {
// EDGES
//
int nedges = toInt(data[p++]);
for (int i=1; i<=nedges; i++) {
for (int i=0; i<nedges; i++) {
int src = toInt(data[p]);
int trg = toInt(data[p+1]);
int ttype = toInt(data[p+2]);
@ -398,7 +417,7 @@ public abstract class ATNSimulator {
}
/**
 * Widens one serialized ATN data element to an {@code int}.
 * <p>
 * The value is returned unchanged, so {@code U+FFFF} yields {@code 65535}
 * and is never folded into {@code -1}. Code that reads a field which may
 * carry a sentinel compares the result against {@code Character.MAX_VALUE}
 * (or {@code 0xFFFF}) itself and substitutes the real value there.
 *
 * @param c the serialized element
 * @return the numeric value of {@code c}, always in the range 0..65535
 */
public static int toInt(char c) {
// Plain char-to-int widening; no special case for 65535.
return c;
}
public static int toInt32(char[] data, int offset) {
@ -425,14 +444,26 @@ public abstract class ATNSimulator {
ATNState target = atn.states.get(trg);
switch (type) {
case Transition.EPSILON : return new EpsilonTransition(target);
case Transition.RANGE : return new RangeTransition(target, arg1, arg2);
case Transition.RANGE :
if (arg3 != 0) {
return new RangeTransition(target, Token.EOF, arg2);
}
else {
return new RangeTransition(target, arg1, arg2);
}
case Transition.RULE :
RuleTransition rt = new RuleTransition((RuleStartState)atn.states.get(arg1), arg2, target);
return rt;
case Transition.PREDICATE :
PredicateTransition pt = new PredicateTransition(target, arg1, arg2, arg3 != 0);
return pt;
case Transition.ATOM : return new AtomTransition(target, arg1);
case Transition.ATOM :
if (arg3 != 0) {
return new AtomTransition(target, Token.EOF);
}
else {
return new AtomTransition(target, arg1);
}
case Transition.ACTION :
ActionTransition a = new ActionTransition(target, arg1, arg2, arg3 != 0);
return a;

View File

@ -32,6 +32,7 @@ package org.antlr.v4.automata;
import org.antlr.v4.misc.Utils;
import org.antlr.v4.parse.ANTLRParser;
import org.antlr.v4.runtime.Token;
import org.antlr.v4.runtime.atn.ATN;
import org.antlr.v4.runtime.atn.ATNSimulator;
import org.antlr.v4.runtime.atn.ATNState;
@ -54,15 +55,15 @@ import org.antlr.v4.tool.Rule;
import java.io.InvalidClassException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.UUID;
public class ATNSerializer {
public Grammar g;
public ATN atn;
public List<IntervalSet> sets = new ArrayList<IntervalSet>();
public ATNSerializer(Grammar g, ATN atn) {
this.g = g;
@ -113,6 +114,9 @@ public class ATNSerializer {
data.add(g.getMaxTokenType());
int nedges = 0;
Map<IntervalSet, Integer> setIndices = new HashMap<IntervalSet, Integer>();
List<IntervalSet> sets = new ArrayList<IntervalSet>();
// dump states, count edges and collect sets while doing so
IntegerList nonGreedyStates = new IntegerList();
data.add(atn.states.size());
@ -128,7 +132,14 @@ public class ATNSerializer {
}
data.add(stateType);
data.add(s.ruleIndex);
if (s.ruleIndex == -1) {
data.add(Character.MAX_VALUE);
}
else {
data.add(s.ruleIndex);
}
if ( s.getStateType() == ATNState.LOOP_END ) {
data.add(((LoopEndState)s).loopBackState.stateNumber);
}
@ -146,7 +157,10 @@ public class ATNSerializer {
int edgeType = Transition.serializationTypes.get(t.getClass());
if ( edgeType == Transition.SET || edgeType == Transition.NOT_SET ) {
SetTransition st = (SetTransition)t;
sets.add(st.set);
if (!setIndices.containsKey(st.set)) {
sets.add(st.set);
setIndices.put(st.set, sets.size() - 1);
}
}
}
}
@ -163,10 +177,20 @@ public class ATNSerializer {
ATNState ruleStartState = atn.ruleToStartState[r];
data.add(ruleStartState.stateNumber);
if ( g.isLexer() ) {
data.add(atn.ruleToTokenType[r]);
if (atn.ruleToTokenType[r] == Token.EOF) {
data.add(Character.MAX_VALUE);
}
else {
data.add(atn.ruleToTokenType[r]);
}
String ruleName = g.rules.getKey(r);
Rule rule = g.getRule(ruleName);
data.add(rule.actionIndex);
if (rule.actionIndex == -1) {
data.add(Character.MAX_VALUE);
}
else {
data.add(rule.actionIndex);
}
}
}
@ -181,15 +205,33 @@ public class ATNSerializer {
int nsets = sets.size();
data.add(nsets);
for (IntervalSet set : sets) {
data.add(set.getIntervals().size());
boolean containsEof = set.contains(Token.EOF);
if (containsEof && set.getIntervals().get(0).b == Token.EOF) {
data.add(set.getIntervals().size() - 1);
}
else {
data.add(set.getIntervals().size());
}
data.add(containsEof ? 1 : 0);
for (Interval I : set.getIntervals()) {
data.add(I.a);
if (I.a == Token.EOF) {
if (I.b == Token.EOF) {
continue;
}
else {
data.add(0);
}
}
else {
data.add(I.a);
}
data.add(I.b);
}
}
data.add(nedges);
int setIndex = 0;
for (ATNState s : atn.states) {
if ( s==null ) {
// might be optimized away
@ -228,25 +270,40 @@ public class ATNSerializer {
case Transition.RANGE :
arg1 = ((RangeTransition)t).from;
arg2 = ((RangeTransition)t).to;
if (arg1 == Token.EOF) {
arg1 = 0;
arg3 = 1;
}
break;
case Transition.ATOM :
arg1 = ((AtomTransition)t).label;
if (arg1 == Token.EOF) {
arg1 = 0;
arg3 = 1;
}
break;
case Transition.ACTION :
ActionTransition at = (ActionTransition)t;
arg1 = at.ruleIndex;
arg2 = at.actionIndex;
if (arg2 == -1) {
arg2 = 0xFFFF;
}
arg3 = at.isCtxDependent ? 1 : 0 ;
break;
case Transition.SET :
arg1 = setIndex++;
arg1 = setIndices.get(((SetTransition)t).set);
break;
case Transition.NOT_SET :
arg1 = setIndex++;
arg1 = setIndices.get(((SetTransition)t).set);
break;
case Transition.WILDCARD :
break;
}
data.add(src);
data.add(trg);
data.add(edgeType);
@ -263,15 +320,11 @@ public class ATNSerializer {
// don't adjust the first value since that's the version number
for (int i = 1; i < data.size(); i++) {
if (data.get(i) < -1 || data.get(i) > 0xFFFE) {
if (data.get(i) < Character.MIN_VALUE || data.get(i) > Character.MAX_VALUE) {
throw new UnsupportedOperationException("Serialized ATN data element out of range.");
}
int value = (data.get(i) + 2) & 0xFFFF;
if (value == 0xFFFF) {
value = -1;
}
data.set(i, value);
}
@ -304,10 +357,14 @@ public class ATNSerializer {
int maxType = ATNSimulator.toInt(data[p++]);
buf.append("max type ").append(maxType).append("\n");
int nstates = ATNSimulator.toInt(data[p++]);
for (int i=1; i<=nstates; i++) {
for (int i=0; i<nstates; i++) {
int stype = ATNSimulator.toInt(data[p++]);
if ( stype==ATNState.INVALID_TYPE ) continue; // ignore bad type of states
int ruleIndex = ATNSimulator.toInt(data[p++]);
if (ruleIndex == Character.MAX_VALUE) {
ruleIndex = -1;
}
String arg = "";
if ( stype == ATNState.LOOP_END ) {
int loopBackStateNumber = ATNSimulator.toInt(data[p++]);
@ -317,7 +374,7 @@ public class ATNSerializer {
int endStateNumber = ATNSimulator.toInt(data[p++]);
arg = " "+endStateNumber;
}
buf.append(i - 1).append(":")
buf.append(i).append(":")
.append(ATNState.serializationNames.get(stype)).append(" ")
.append(ruleIndex).append(arg).append("\n");
}
@ -331,6 +388,9 @@ public class ATNSerializer {
if ( g.isLexer() ) {
int arg1 = ATNSimulator.toInt(data[p++]);
int arg2 = ATNSimulator.toInt(data[p++]);
if (arg2 == Character.MAX_VALUE) {
arg2 = -1;
}
buf.append("rule ").append(i).append(":").append(s).append(" ").append(arg1).append(",").append(arg2).append('\n');
}
else {
@ -343,18 +403,26 @@ public class ATNSerializer {
buf.append("mode ").append(i).append(":").append(s).append('\n');
}
int nsets = ATNSimulator.toInt(data[p++]);
for (int i=1; i<=nsets; i++) {
for (int i=0; i<nsets; i++) {
int nintervals = ATNSimulator.toInt(data[p++]);
buf.append(i-1).append(":");
for (int j=1; j<=nintervals; j++) {
if ( j>1 ) buf.append(", ");
buf.append(i).append(":");
boolean containsEof = data[p++] != 0;
if (containsEof) {
buf.append(getTokenName(Token.EOF));
}
for (int j=0; j<nintervals; j++) {
if ( containsEof || j>0 ) {
buf.append(", ");
}
buf.append(getTokenName(ATNSimulator.toInt(data[p]))).append("..").append(getTokenName(ATNSimulator.toInt(data[p + 1])));
p += 2;
}
buf.append("\n");
}
int nedges = ATNSimulator.toInt(data[p++]);
for (int i=1; i<=nedges; i++) {
for (int i=0; i<nedges; i++) {
int src = ATNSimulator.toInt(data[p]);
int trg = ATNSimulator.toInt(data[p + 1]);
int ttype = ATNSimulator.toInt(data[p + 2]);
@ -368,9 +436,9 @@ public class ATNSerializer {
p += 6;
}
int ndecisions = ATNSimulator.toInt(data[p++]);
for (int i=1; i<=ndecisions; i++) {
for (int i=0; i<ndecisions; i++) {
int s = ATNSimulator.toInt(data[p++]);
buf.append(i-1).append(":").append(s).append("\n");
buf.append(i).append(":").append(s).append("\n");
}
return buf.toString();
}

View File

@ -285,6 +285,9 @@ public class Grammar implements AttributeResolver {
/**
 * Seeds the token symbol tables with their built-in entries: registers the
 * name {@code "EOF"} under {@code Token.EOF} in {@code tokenNameToTypeMap}
 * and appends a {@code null} placeholder element to {@code typeToTokenList}.
 */
protected void initTokenSymbolTables() {
tokenNameToTypeMap.put("EOF", Token.EOF);
// reserve a spot for the INVALID token
// NOTE(review): presumably the null entry occupies the list slot for the
// invalid token type so later entries line up with their type numbers --
// confirm against how typeToTokenList is indexed elsewhere.
typeToTokenList.add(null);
}
public void loadImportedGrammars() {

View File

@ -77,7 +77,7 @@ public class TestATNSerialization extends BaseTest {
"rule 0:0\n" +
"0->2 EPSILON 0,0,0\n" +
"2->3 ATOM 1,0,0\n" +
"3->4 ATOM -1,0,0\n" +
"3->4 ATOM 0,0,1\n" +
"4->1 EPSILON 0,0,0\n";
ATN atn = createATN(g, true);
String result = ATNSerializer.getDecoded(g, atn);
@ -96,7 +96,7 @@ public class TestATNSerialization extends BaseTest {
"3:BASIC 0\n" +
"4:BASIC 0\n" +
"rule 0:0\n" +
"0:EOF..EOF, A..A\n" +
"0:EOF, A..A\n" +
"0->2 EPSILON 0,0,0\n" +
"2->3 SET 0,0,0\n" +
"3->1 EPSILON 0,0,0\n";
@ -347,7 +347,7 @@ public class TestATNSerialization extends BaseTest {
"0->1 EPSILON 0,0,0\n" +
"1->3 EPSILON 0,0,0\n" +
"3->4 ATOM 97,0,0\n" +
"4->5 ATOM -1,0,0\n" +
"4->5 ATOM 0,0,1\n" +
"5->2 EPSILON 0,0,0\n" +
"0:0\n";
ATN atn = createATN(lg, true);
@ -370,7 +370,7 @@ public class TestATNSerialization extends BaseTest {
"6:BLOCK_END 0\n" +
"rule 0:1 1,-1\n" +
"mode 0:0\n" +
"0:EOF..EOF, '\\n'..'\\n'\n" +
"0:EOF, '\\n'..'\\n'\n" +
"0->1 EPSILON 0,0,0\n" +
"1->3 EPSILON 0,0,0\n" +
"3->5 ATOM 97,0,0\n" +