Implement new extended Unicode escape \u{10ABCD}. Bump UUID. Add lots more tests.

This commit is contained in:
Ben Hamilton 2017-02-17 13:35:00 -08:00
parent ce09abb480
commit fd4246cf3f
25 changed files with 1361 additions and 209 deletions

View File

@ -402,4 +402,282 @@ public class SetsDescriptors {
public String grammar; public String grammar;
} }
// Lexer-set test: a BMP (code points at or below U+FFFF) alphabet that
// appears as raw Unicode characters in the grammar text. Javac decodes
// the escapes in the string initializers below, so the grammar compiler
// receives the raw characters themselves.
public static class UnicodeUnescapedBMPSet extends BaseParserTestDescriptor {
public String input = "a\u00E4\u3042\u4E9Cc";
public String output = "a\u00E4\u3042\u4E9Cc\n"; // matched input echoed with a trailing newline
public String errors = null; // no recognition errors expected
public String startRule = "a";
public String grammarName = "T";
/**
grammar T;
a : LETTERS {<InputText():writeln()>} ;
// These are actually not escaped -- Java passes the
// raw unescaped Unicode values to the grammar compiler.
LETTERS : ('a'|'\u00E4'|'\u4E9C'|'\u3042')* 'c';
*/
@CommentHasStringValue
public String grammar;
}
// Lexer-range test: a BMP character range (U+00E0..U+00E5) written as
// raw Unicode characters in the grammar. The input exercises several
// code points inside the range before the terminating 'd'.
public static class UnicodeUnescapedBMPRangeSet extends BaseParserTestDescriptor {
public String input = "a\u00E1\u00E4\u00E1\u00E2\u00E5d";
public String output = "a\u00E1\u00E4\u00E1\u00E2\u00E5d\n"; // matched input echoed with a trailing newline
public String errors = null; // no recognition errors expected
public String startRule = "a";
public String grammarName = "T";
/**
grammar T;
a : LETTERS* 'd' {<InputText():writeln()>} ;
// These are actually not escaped -- Java passes the
// raw unescaped Unicode values to the grammar compiler.
LETTERS : ('a'|'\u00E0'..'\u00E5');
*/
@CommentHasStringValue
public String grammar;
}
// Same alphabet as UnicodeUnescapedBMPSet, but the grammar spells each
// set member with a backslash-u escape. The Java literal uses a double
// backslash so the escape sequence itself (not the decoded character)
// reaches the grammar compiler.
public static class UnicodeEscapedBMPSet extends BaseParserTestDescriptor {
public String input = "a\u00E4\u3042\u4E9Cc";
public String output = "a\u00E4\u3042\u4E9Cc\n"; // matched input echoed with a trailing newline
public String errors = null; // no recognition errors expected
public String startRule = "a";
public String grammarName = "T";
/**
grammar T;
a : LETTERS {<InputText():writeln()>} ;
// Note the double-backslash to avoid Java passing
// unescaped values as part of the grammar.
LETTERS : ('a'|'\\u00E4'|'\\u4E9C'|'\\u3042')* 'c';
*/
@CommentHasStringValue
public String grammar;
}
// Range counterpart of UnicodeEscapedBMPSet: the grammar range
// U+00E0..U+00E5 is written with backslash-u escapes (double backslash
// in the Java literal so the escape survives into the grammar text).
public static class UnicodeEscapedBMPRangeSet extends BaseParserTestDescriptor {
public String input = "a\u00E1\u00E4\u00E1\u00E2\u00E5d";
public String output = "a\u00E1\u00E4\u00E1\u00E2\u00E5d\n"; // matched input echoed with a trailing newline
public String errors = null; // no recognition errors expected
public String startRule = "a";
public String grammarName = "T";
/**
grammar T;
a : LETTERS* 'd' {<InputText():writeln()>} ;
// Note the double-backslash to avoid Java passing
// unescaped values as part of the grammar.
LETTERS : ('a'|'\\u00E0'..'\\u00E5');
*/
@CommentHasStringValue
public String grammar;
}
// TODO(bhamiltoncx): This needs to be an error, the V3
// runtime used by the tool doesn't really understand unescaped code points >
// U+FFFF.
// public static class UnicodeUnescapedSMPSet extends BaseParserTestDescriptor {
// public String input = new StringBuilder()
// .append("a")
// .appendCodePoint(0x1D5C2)
// .appendCodePoint(0x1D5CE)
// .appendCodePoint(0x1D5BA)
// .append("c")
// .toString();
// public String output = new StringBuilder()
// .append("a")
// .appendCodePoint(0x1D5C2)
// .appendCodePoint(0x1D5CE)
// .appendCodePoint(0x1D5BA)
// .append("c\n")
// .toString();
// public String errors = null;
// public String startRule = "a";
// public String grammarName = "T";
// /**
// grammar T;
// a : LETTERS {<InputText():writeln()>} ;
// // These are actually not escaped -- Java passes the
// // raw unescaped Unicode values to the grammar compiler.
// //
// // Each sequence is the UTF-16 encoding of a raw Unicode
// // SMP code point.
// LETTERS : ('a'|'\uD835\uDDBA'|'\uD835\uDDBE'|'\uD835\uDDC2'|'\uD835\uDDC8'|'\uD835\uDDCE')* 'c';
// */
// @CommentHasStringValue
// public String grammar;
// }
// Lexer-set test for escaped SMP (above U+FFFF) code points using the
// extended brace escape syntax in the grammar. Input and output are
// assembled with appendCodePoint because SMP characters require
// surrogate pairs in Java strings.
public static class UnicodeEscapedSMPSet extends BaseParserTestDescriptor {
public String input = new StringBuilder()
.append("a")
.appendCodePoint(0x1D5C2)
.appendCodePoint(0x1D5CE)
.appendCodePoint(0x1D5BA)
.append("c")
.toString();
public String output = new StringBuilder()
.append("a")
.appendCodePoint(0x1D5C2)
.appendCodePoint(0x1D5CE)
.appendCodePoint(0x1D5BA)
.append("c\n")
.toString();
public String errors = null; // no recognition errors expected
public String startRule = "a";
public String grammarName = "T";
/**
grammar T;
a : LETTERS {<InputText():writeln()>} ;
// Note the double-backslash to avoid Java passing
// unescaped values as part of the grammar.
LETTERS : ('a'|'\\u{1D5BA}'|'\\u{1D5BE}'|'\\u{1D5C2}'|'\\u{1D5C8}'|'\\u{1D5CE}')* 'c';
*/
@CommentHasStringValue
public String grammar;
}
// Turns out Tool.java uses ANTLR 3's runtime, which means it can't use
// CodePointCharStream to understand unescaped code points > U+FFFF.
//
// TODO(bhamiltoncx): This needs to be an error, since we don't currently plan
// to port Tool.java to use ANTLR 4's runtime.
// public static class UnicodeUnescapedSMPRangeSet extends BaseParserTestDescriptor {
// public String input = new StringBuilder()
// .append("a")
// .appendCodePoint(0x1D5C2)
// .appendCodePoint(0x1D5CE)
// .appendCodePoint(0x1D5BA)
// .append("d")
// .toString();
// public String output = new StringBuilder()
// .append("a")
// .appendCodePoint(0x1D5C2)
// .appendCodePoint(0x1D5CE)
// .appendCodePoint(0x1D5BA)
// .append("d\n")
// .toString();
// public String errors = null;
// public String startRule = "a";
// public String grammarName = "T";
// /**
// grammar T;
// a : LETTERS* 'd' {<InputText():writeln()>} ;
// // These are actually not escaped -- Java passes the
// // raw unescaped Unicode values to the grammar compiler.
// LETTERS : ('a'|'\uD83D\uDE00'..'\uD83E\uDD43');
// */
// @CommentHasStringValue
// public String grammar;
// }
// Lexer-range test for escaped SMP code points: the grammar range
// covers U+1F600..U+1F943 and the input uses three code points inside
// that range (U+1F609, U+1F942, U+1F700), built with appendCodePoint
// since they need surrogate pairs in Java strings.
public static class UnicodeEscapedSMPRangeSet extends BaseParserTestDescriptor {
public String input = new StringBuilder()
.append("a")
.appendCodePoint(0x1F609)
.appendCodePoint(0x1F942)
.appendCodePoint(0x1F700)
.append("d")
.toString();
public String output = new StringBuilder()
.append("a")
.appendCodePoint(0x1F609)
.appendCodePoint(0x1F942)
.appendCodePoint(0x1F700)
.append("d\n")
.toString();
public String errors = null; // no recognition errors expected
public String startRule = "a";
public String grammarName = "T";
/**
grammar T;
a : LETTERS* 'd' {<InputText():writeln()>} ;
// Note the double-backslash to avoid Java passing
// unescaped values as part of the grammar.
LETTERS : ('a'|'\\u{1F600}'..'\\u{1F943}');
*/
@CommentHasStringValue
public String grammar;
}
// Negative test for the escaped SMP range U+1F600..U+1F943: U+1F5FF
// and U+1F944 lie immediately outside it, so each must trigger a
// token recognition error and be absent from the echoed output.
public static class UnicodeEscapedSMPRangeSetMismatch extends BaseParserTestDescriptor {
// Test the code points just before and just after the range.
public String input = new StringBuilder()
.append("a")
.appendCodePoint(0x1F5FF)
.appendCodePoint(0x1F944)
.append("d")
.toString();
public String output = "ad\n"; // the two out-of-range code points are dropped
public String errors = new StringBuilder()
.append("line 1:1 token recognition error at: '")
.appendCodePoint(0x1F5FF)
.append("'\n")
.append("line 1:2 token recognition error at: '")
.appendCodePoint(0x1F944)
.append("'\n")
.toString();
public String startRule = "a";
public String grammarName = "T";
/**
grammar T;
a : LETTERS* 'd' {<InputText():writeln()>} ;
// Note the double-backslash to avoid Java passing
// unescaped values as part of the grammar.
LETTERS : ('a'|'\\u{1F600}'..'\\u{1F943}');
*/
@CommentHasStringValue
public String grammar;
}
// A negated BMP set (~'b') must also match SMP code points: every
// character between 'a' and 'c' in the input is an SMP code point,
// written here as a UTF-16 surrogate pair in the Java literal.
public static class UnicodeNegatedBMPSetIncludesSMPCodePoints extends BaseParserTestDescriptor {
public String input = "a\uD83D\uDE33\uD83D\uDE21\uD83D\uDE1D\uD83E\uDD13c";
public String output = "a\uD83D\uDE33\uD83D\uDE21\uD83D\uDE1D\uD83E\uDD13c\n"; // matched input echoed with a trailing newline
public String errors = null; // no recognition errors expected
public String startRule = "a";
public String grammarName = "T";
/**
grammar T;
a : LETTERS {<InputText():writeln()>} ;
LETTERS : 'a' ~('b')+ 'c';
*/
@CommentHasStringValue
public String grammar;
}
// A negated SMP range (everything outside U+1F600..U+1F943) must still
// match ordinary BMP code points such as the 'b' in "abc".
public static class UnicodeNegatedSMPSetIncludesBMPCodePoints extends BaseParserTestDescriptor {
public String input = "abc";
public String output = "abc\n"; // matched input echoed with a trailing newline
public String errors = null; // no recognition errors expected
public String startRule = "a";
public String grammarName = "T";
/**
grammar T;
a : LETTERS {<InputText():writeln()>} ;
LETTERS : 'a' ~('\\u{1F600}'..'\\u{1F943}')+ 'c';
*/
@CommentHasStringValue
public String grammar;
}
} }

View File

@ -22,6 +22,18 @@ namespace Antlr4.Runtime.Atn
/// <remarks>This is the earliest supported serialized UUID.</remarks> /// <remarks>This is the earliest supported serialized UUID.</remarks>
private static readonly Guid BaseSerializedUuid; private static readonly Guid BaseSerializedUuid;
/// <summary>
/// This UUID indicates the serialized ATN contains two sets of
/// IntervalSets, where the second set's values are encoded as
/// 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
/// </summary>
/// <remarks>
/// This UUID indicates the serialized ATN contains two sets of
/// IntervalSets, where the second set's values are encoded as
/// 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
/// </remarks>
private static readonly Guid AddedUnicodeSmp;
/// <summary> /// <summary>
/// This list contains all of the currently supported UUIDs, ordered by when /// This list contains all of the currently supported UUIDs, ordered by when
/// the feature first appeared in this branch. /// the feature first appeared in this branch.
@ -39,14 +51,18 @@ namespace Antlr4.Runtime.Atn
static ATNDeserializer() static ATNDeserializer()
{ {
BaseSerializedUuid = new Guid("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E"); BaseSerializedUuid = new Guid("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E");
AddedUnicodeSmp = new Guid("59627784-3BE5-417A-B9EB-8131A7286089");
SupportedUuids = new List<Guid>(); SupportedUuids = new List<Guid>();
SupportedUuids.Add(BaseSerializedUuid); SupportedUuids.Add(BaseSerializedUuid);
SerializedUuid = BaseSerializedUuid; SupportedUuids.Add(AddedUnicodeSmp);
SerializedUuid = AddedUnicodeSmp;
} }
[NotNull] [NotNull]
private readonly ATNDeserializationOptions deserializationOptions; private readonly ATNDeserializationOptions deserializationOptions;
private Guid uuid;
public ATNDeserializer() public ATNDeserializer()
: this(ATNDeserializationOptions.Default) : this(ATNDeserializationOptions.Default)
{ {
@ -115,7 +131,11 @@ namespace Antlr4.Runtime.Atn
ReadStates (atn); ReadStates (atn);
ReadRules (atn); ReadRules (atn);
ReadModes (atn); ReadModes (atn);
IList<IntervalSet> sets = ReadSets (atn); IList<IntervalSet> sets = new List<IntervalSet>();
ReadSets (atn, sets, this.ReadInt);
if (IsFeatureSupported(AddedUnicodeSmp, uuid)) {
ReadSets (atn, sets, this.ReadInt32);
}
ReadEdges (atn, sets); ReadEdges (atn, sets);
ReadDecisions (atn); ReadDecisions (atn);
ReadLexerActions (atn); ReadLexerActions (atn);
@ -378,12 +398,11 @@ namespace Antlr4.Runtime.Atn
} }
} }
protected internal virtual IList<IntervalSet> ReadSets(ATN atn) protected internal virtual void ReadSets(ATN atn, IList<IntervalSet> sets, Func<int> readUnicode)
{ {
// //
// SETS // SETS
// //
IList<IntervalSet> sets = new List<IntervalSet>();
int nsets = ReadInt(); int nsets = ReadInt();
for (int i_8 = 0; i_8 < nsets; i_8++) for (int i_8 = 0; i_8 < nsets; i_8++)
{ {
@ -397,10 +416,9 @@ namespace Antlr4.Runtime.Atn
} }
for (int j = 0; j < nintervals; j++) for (int j = 0; j < nintervals; j++)
{ {
set.Add(ReadInt(), ReadInt()); set.Add(readUnicode(), readUnicode());
} }
} }
return sets;
} }
protected internal virtual void ReadModes(ATN atn) protected internal virtual void ReadModes(ATN atn)
@ -530,7 +548,7 @@ namespace Antlr4.Runtime.Atn
protected internal virtual void CheckUUID() protected internal virtual void CheckUUID()
{ {
Guid uuid = ReadUUID(); uuid = ReadUUID();
if (!SupportedUuids.Contains(uuid)) if (!SupportedUuids.Contains(uuid))
{ {
string reason = string.Format(CultureInfo.CurrentCulture, "Could not deserialize ATN with UUID {0} (expected {1} or a legacy UUID).", uuid, SerializedUuid); string reason = string.Format(CultureInfo.CurrentCulture, "Could not deserialize ATN with UUID {0} (expected {1} or a legacy UUID).", uuid, SerializedUuid);

View File

@ -57,6 +57,51 @@ using namespace antlrcpp;
const size_t ATNDeserializer::SERIALIZED_VERSION = 3; const size_t ATNDeserializer::SERIALIZED_VERSION = 3;
namespace {

// Reassembles a 32-bit value stored as two consecutive 16-bit words in
// the serialized ATN data, low word first.
uint32_t deserializeInt32(const std::vector<uint16_t>& data, size_t offset) {
return (uint32_t)data[offset] | ((uint32_t)data[offset + 1] << 16);
}

// Reads one code point stored as a single 16-bit word; advances p by 1.
ssize_t readUnicodeInt(const std::vector<uint16_t>& data, int& p) {
return static_cast<ssize_t>(data[p++]);
}

// Reads one code point stored as two 16-bit words (see
// deserializeInt32); advances p by 2.
ssize_t readUnicodeInt32(const std::vector<uint16_t>& data, int& p) {
auto result = deserializeInt32(data, p);
p += 2;
return static_cast<ssize_t>(result);
}

// We templatize this on the function type so the optimizer can inline
// the 16- or 32-bit readUnicodeInt/readUnicodeInt32 as needed.
//
// Serialized layout: a set count, then per set an interval count, an
// EOF-containment flag, and the [a, b] bounds of each interval read
// via readUnicode. Deserialized sets are appended to `sets`.
template <typename F>
void deserializeSets(
const std::vector<uint16_t>& data,
int& p,
std::vector<misc::IntervalSet>& sets,
F readUnicode) {
int nsets = data[p++];
for (int i = 0; i < nsets; i++) {
int nintervals = data[p++];
misc::IntervalSet set;
bool containsEof = data[p++] != 0;
if (containsEof) {
set.add(-1); // EOF is represented as -1 in IntervalSet
}
for (int j = 0; j < nintervals; j++) {
auto a = readUnicode(data, p);
auto b = readUnicode(data, p);
set.add(a, b);
}
sets.push_back(set);
}
}

}  // namespace
ATNDeserializer::ATNDeserializer(): ATNDeserializer(ATNDeserializationOptions::getDefaultOptions()) { ATNDeserializer::ATNDeserializer(): ATNDeserializer(ATNDeserializationOptions::getDefaultOptions()) {
} }
@ -75,8 +120,12 @@ Guid ATNDeserializer::ADDED_LEXER_ACTIONS() {
return Guid("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E"); return Guid("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E");
} }
// UUID identifying serialized ATNs that carry a second block of
// IntervalSets encoded as 32-bit values for code points above U+FFFF.
Guid ATNDeserializer::ADDED_UNICODE_SMP() {
return Guid("59627784-3BE5-417A-B9EB-8131A7286089");
}
Guid ATNDeserializer::SERIALIZED_UUID() { Guid ATNDeserializer::SERIALIZED_UUID() {
return ADDED_LEXER_ACTIONS(); return ADDED_UNICODE_SMP();
} }
Guid ATNDeserializer::BASE_SERIALIZED_UUID() { Guid ATNDeserializer::BASE_SERIALIZED_UUID() {
@ -84,7 +133,7 @@ Guid ATNDeserializer::BASE_SERIALIZED_UUID() {
} }
std::vector<Guid>& ATNDeserializer::SUPPORTED_UUIDS() { std::vector<Guid>& ATNDeserializer::SUPPORTED_UUIDS() {
static std::vector<Guid> singleton = { BASE_SERIALIZED_UUID(), ADDED_PRECEDENCE_TRANSITIONS(), ADDED_LEXER_ACTIONS() }; static std::vector<Guid> singleton = { BASE_SERIALIZED_UUID(), ADDED_PRECEDENCE_TRANSITIONS(), ADDED_LEXER_ACTIONS(), ADDED_UNICODE_SMP() };
return singleton; return singleton;
} }
@ -239,21 +288,14 @@ ATN ATNDeserializer::deserialize(const std::vector<uint16_t>& input) {
// SETS // SETS
// //
std::vector<misc::IntervalSet> sets; std::vector<misc::IntervalSet> sets;
int nsets = data[p++];
for (int i = 0; i < nsets; i++) {
int nintervals = data[p++];
misc::IntervalSet set;
bool containsEof = data[p++] != 0; // First, deserialize sets with 16-bit arguments <= U+FFFF.
if (containsEof) { deserializeSets(data, p, sets, readUnicodeInt);
set.add(-1);
}
for (int j = 0; j < nintervals; j++) { // Next, if the ATN was serialized with the Unicode SMP feature,
set.add(data[p], data[p + 1], true); // deserialize sets with 32-bit arguments <= U+10FFFF.
p += 2; if (isFeatureSupported(ADDED_UNICODE_SMP(), uuid)) {
} deserializeSets(data, p, sets, readUnicodeInt32);
sets.push_back(set);
} }
// //

View File

@ -67,6 +67,13 @@ namespace atn {
*/ */
static Guid ADDED_LEXER_ACTIONS(); static Guid ADDED_LEXER_ACTIONS();
/**
* This UUID indicates the serialized ATN contains two sets of
* IntervalSets, where the second set's values are encoded as
* 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
*/
static Guid ADDED_UNICODE_SMP();
/// This list contains all of the currently supported UUIDs, ordered by when /// This list contains all of the currently supported UUIDs, ordered by when
/// the feature first appeared in this branch. /// the feature first appeared in this branch.
static std::vector<Guid>& SUPPORTED_UUIDS(); static std::vector<Guid>& SUPPORTED_UUIDS();

View File

@ -24,14 +24,7 @@ Interval::Interval() : Interval((ssize_t)-1, -2) { // Need an explicit cast here
Interval::Interval(size_t a_, size_t b_) : Interval(symbolToNumeric(a_), symbolToNumeric(b_)) { Interval::Interval(size_t a_, size_t b_) : Interval(symbolToNumeric(a_), symbolToNumeric(b_)) {
} }
Interval::Interval(ssize_t a_, ssize_t b_, bool autoExtend) { Interval::Interval(ssize_t a_, ssize_t b_) : a(a_), b(b_) {
a = a_;
b = b_;
// XXX: temporary hack to make the full Unicode range available.
if (autoExtend && b == 0xFFFF) {
b = 0x10FFFF;
}
} }
size_t Interval::length() const { size_t Interval::length() const {

View File

@ -27,7 +27,7 @@ namespace misc {
Interval(); Interval();
explicit Interval(size_t a_, size_t b_); // For unsigned -> signed mappings. explicit Interval(size_t a_, size_t b_); // For unsigned -> signed mappings.
Interval(ssize_t a_, ssize_t b_, bool autoExtend = false); // Automatically extend a value of 0xFFFF to 0x10FFFF. Interval(ssize_t a_, ssize_t b_);
virtual ~Interval() {}; virtual ~Interval() {};
/// return number of elements between a and b inclusively. x..x is length 1. /// return number of elements between a and b inclusively. x..x is length 1.

View File

@ -50,8 +50,8 @@ IntervalSet IntervalSet::of(ssize_t a) {
return IntervalSet({ Interval(a, a) }); return IntervalSet({ Interval(a, a) });
} }
IntervalSet IntervalSet::of(ssize_t a, ssize_t b, bool autoExtend) { IntervalSet IntervalSet::of(ssize_t a, ssize_t b) {
return IntervalSet({ Interval(a, b, autoExtend) }); return IntervalSet({ Interval(a, b) });
} }
void IntervalSet::clear() { void IntervalSet::clear() {
@ -68,8 +68,8 @@ void IntervalSet::add(ssize_t el) {
add(el, el); add(el, el);
} }
void IntervalSet::add(ssize_t a, ssize_t b, bool autoExtend) { void IntervalSet::add(ssize_t a, ssize_t b) {
add(Interval(a, b, autoExtend)); add(Interval(a, b));
} }
void IntervalSet::add(const Interval &addition) { void IntervalSet::add(const Interval &addition) {

View File

@ -44,7 +44,7 @@ namespace misc {
static IntervalSet of(ssize_t a); static IntervalSet of(ssize_t a);
/// Create a set with all ints within range [a..b] (inclusive) /// Create a set with all ints within range [a..b] (inclusive)
static IntervalSet of(ssize_t a, ssize_t b, bool autoExtend = false); static IntervalSet of(ssize_t a, ssize_t b);
virtual void clear(); virtual void clear();
@ -58,7 +58,7 @@ namespace misc {
/// If overlap, combine ranges. For example, /// If overlap, combine ranges. For example,
/// If this is {1..5, 10..20}, adding 6..7 yields /// If this is {1..5, 10..20}, adding 6..7 yields
/// {1..5, 6..7, 10..20}. Adding 4..8 yields {1..8, 10..20}. /// {1..5, 6..7, 10..20}. Adding 4..8 yields {1..8, 10..20}.
virtual void add(ssize_t a, ssize_t b, bool autoExtend = false); virtual void add(ssize_t a, ssize_t b);
public: public:
/// combine all sets in the array returned the or'd value /// combine all sets in the array returned the or'd value

View File

@ -15,15 +15,16 @@ import (
// This is the earliest supported serialized UUID. // This is the earliest supported serialized UUID.
// stick to serialized version for now, we don't need a UUID instance // stick to serialized version for now, we don't need a UUID instance
var BaseSerializedUUID = "AADB8D7E-AEEF-4415-AD2B-8204D6CF042E" var BaseSerializedUUID = "AADB8D7E-AEEF-4415-AD2B-8204D6CF042E"
var AddedUnicodeSMP = "59627784-3BE5-417A-B9EB-8131A7286089"
// This list contains all of the currently supported UUIDs, ordered by when // This list contains all of the currently supported UUIDs, ordered by when
// the feature first appeared in this branch. // the feature first appeared in this branch.
var SupportedUUIDs = []string{BaseSerializedUUID} var SupportedUUIDs = []string{BaseSerializedUUID, AddedUnicodeSMP}
var SerializedVersion = 3 var SerializedVersion = 3
// This is the current serialized UUID. // This is the current serialized UUID.
var SerializedUUID = BaseSerializedUUID var SerializedUUID = AddedUnicodeSMP
type LoopEndStateIntPair struct { type LoopEndStateIntPair struct {
item0 *LoopEndState item0 *LoopEndState
@ -91,7 +92,15 @@ func (a *ATNDeserializer) DeserializeFromUInt16(data []uint16) *ATN {
a.readRules(atn) a.readRules(atn)
a.readModes(atn) a.readModes(atn)
sets := a.readSets(atn) sets := make([]*IntervalSet, 0)
// First, deserialize sets with 16-bit arguments <= U+FFFF.
sets = a.readSets(atn, sets, a.readInt)
// Next, if the ATN was serialized with the Unicode SMP feature,
// deserialize sets with 32-bit arguments <= U+10FFFF.
if (a.isFeatureSupported(AddedUnicodeSMP, a.uuid)) {
sets = a.readSets(atn, sets, a.readInt32)
}
a.readEdges(atn, sets) a.readEdges(atn, sets)
a.readDecisions(atn) a.readDecisions(atn)
@ -266,8 +275,7 @@ func (a *ATNDeserializer) readModes(atn *ATN) {
} }
} }
func (a *ATNDeserializer) readSets(atn *ATN) []*IntervalSet { func (a *ATNDeserializer) readSets(atn *ATN, sets []*IntervalSet, readUnicode func() int) []*IntervalSet {
sets := make([]*IntervalSet, 0)
m := a.readInt() m := a.readInt()
for i := 0; i < m; i++ { for i := 0; i < m; i++ {
@ -283,8 +291,8 @@ func (a *ATNDeserializer) readSets(atn *ATN) []*IntervalSet {
} }
for j := 0; j < n; j++ { for j := 0; j < n; j++ {
i1 := a.readInt() i1 := readUnicode()
i2 := a.readInt() i2 := readUnicode()
iset.addRange(i1, i2) iset.addRange(i1, i2)
} }
@ -642,6 +650,12 @@ func (a *ATNDeserializer) readInt() int {
return int(v) return int(v)
} }
// readInt32 reads a 32-bit value that was serialized as two
// consecutive 16-bit words, low word first.
func (a *ATNDeserializer) readInt32() int {
	lo := a.readInt()
	hi := a.readInt()
	return lo | (hi << 16)
}
//TODO //TODO
//func (a *ATNDeserializer) readLong() int64 { //func (a *ATNDeserializer) readLong() int64 {
// panic("Not implemented") // panic("Not implemented")

View File

@ -44,6 +44,12 @@ public class ATNDeserializer {
* {@link LexerAction} instances. * {@link LexerAction} instances.
*/ */
private static final UUID ADDED_LEXER_ACTIONS; private static final UUID ADDED_LEXER_ACTIONS;
/**
* This UUID indicates the serialized ATN contains two sets of
* IntervalSets, where the second set's values are encoded as
* 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
*/
private static final UUID ADDED_UNICODE_SMP;
/** /**
* This list contains all of the currently supported UUIDs, ordered by when * This list contains all of the currently supported UUIDs, ordered by when
* the feature first appeared in this branch. * the feature first appeared in this branch.
@ -61,15 +67,58 @@ public class ATNDeserializer {
BASE_SERIALIZED_UUID = UUID.fromString("33761B2D-78BB-4A43-8B0B-4F5BEE8AACF3"); BASE_SERIALIZED_UUID = UUID.fromString("33761B2D-78BB-4A43-8B0B-4F5BEE8AACF3");
ADDED_PRECEDENCE_TRANSITIONS = UUID.fromString("1DA0C57D-6C06-438A-9B27-10BCB3CE0F61"); ADDED_PRECEDENCE_TRANSITIONS = UUID.fromString("1DA0C57D-6C06-438A-9B27-10BCB3CE0F61");
ADDED_LEXER_ACTIONS = UUID.fromString("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E"); ADDED_LEXER_ACTIONS = UUID.fromString("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E");
ADDED_UNICODE_SMP = UUID.fromString("59627784-3BE5-417A-B9EB-8131A7286089");
SUPPORTED_UUIDS = new ArrayList<UUID>(); SUPPORTED_UUIDS = new ArrayList<UUID>();
SUPPORTED_UUIDS.add(BASE_SERIALIZED_UUID); SUPPORTED_UUIDS.add(BASE_SERIALIZED_UUID);
SUPPORTED_UUIDS.add(ADDED_PRECEDENCE_TRANSITIONS); SUPPORTED_UUIDS.add(ADDED_PRECEDENCE_TRANSITIONS);
SUPPORTED_UUIDS.add(ADDED_LEXER_ACTIONS); SUPPORTED_UUIDS.add(ADDED_LEXER_ACTIONS);
SUPPORTED_UUIDS.add(ADDED_UNICODE_SMP);
SERIALIZED_UUID = ADDED_LEXER_ACTIONS; SERIALIZED_UUID = ADDED_UNICODE_SMP;
} }
// Strategy object for reading one serialized Unicode code point out of
// the char[] ATN data, in either 16-bit (BMP) or 32-bit (SMP) form.
interface UnicodeDeserializer {
// Wrapper for readInt() or readInt32()
int readUnicode(char[] data, int p);

// Work around Java not allowing mutation of captured variables
// by returning amount by which to increment p after each read
int size();
}
// Selects the on-disk code point width: one 16-bit char for BMP data
// (U+0000..U+FFFF) or two chars forming a 32-bit value for SMP data.
enum UnicodeDeserializingMode {
UNICODE_BMP,
UNICODE_SMP
}
/**
 * Returns a {@link UnicodeDeserializer} for the given mode: BMP mode
 * reads one 16-bit char per code point, SMP mode reads two chars
 * assembled into a 32-bit value.
 */
static UnicodeDeserializer getUnicodeDeserializer(UnicodeDeserializingMode mode) {
	switch (mode) {
	case UNICODE_BMP:
		// One 16-bit char per code point; caller advances p by 1.
		return new UnicodeDeserializer() {
			@Override
			public int readUnicode(char[] data, int p) {
				return toInt(data[p]);
			}

			@Override
			public int size() {
				return 1;
			}
		};
	default:
		// Two chars per code point; caller advances p by 2.
		return new UnicodeDeserializer() {
			@Override
			public int readUnicode(char[] data, int p) {
				return toInt32(data, p);
			}

			@Override
			public int size() {
				return 2;
			}
		};
	}
}
private final ATNDeserializationOptions deserializationOptions; private final ATNDeserializationOptions deserializationOptions;
@ -98,7 +147,7 @@ public class ATNDeserializer {
* serialized ATN at or after the feature identified by {@code feature} was * serialized ATN at or after the feature identified by {@code feature} was
* introduced; otherwise, {@code false}. * introduced; otherwise, {@code false}.
*/ */
protected boolean isFeatureSupported(UUID feature, UUID actualUuid) { static protected boolean isFeatureSupported(UUID feature, UUID actualUuid) {
int featureIndex = SUPPORTED_UUIDS.indexOf(feature); int featureIndex = SUPPORTED_UUIDS.indexOf(feature);
if (featureIndex < 0) { if (featureIndex < 0) {
return false; return false;
@ -258,22 +307,14 @@ public class ATNDeserializer {
// SETS // SETS
// //
List<IntervalSet> sets = new ArrayList<IntervalSet>(); List<IntervalSet> sets = new ArrayList<IntervalSet>();
int nsets = toInt(data[p++]);
for (int i=0; i<nsets; i++) {
int nintervals = toInt(data[p]);
p++;
IntervalSet set = new IntervalSet();
sets.add(set);
boolean containsEof = toInt(data[p++]) != 0; // First, read all sets with 16-bit Unicode code points <= U+FFFF.
if (containsEof) { p = deserializeSets(data, p, sets, getUnicodeDeserializer(UnicodeDeserializingMode.UNICODE_BMP));
set.add(-1);
}
for (int j=0; j<nintervals; j++) { // Next, if the ATN was serialized with the Unicode SMP feature,
set.add(toInt(data[p]), toInt(data[p + 1])); // deserialize sets with 32-bit arguments <= U+10FFFF.
p += 2; if (isFeatureSupported(ADDED_UNICODE_SMP, uuid)) {
} p = deserializeSets(data, p, sets, getUnicodeDeserializer(UnicodeDeserializingMode.UNICODE_SMP));
} }
// //
@ -510,6 +551,30 @@ public class ATNDeserializer {
return atn; return atn;
} }
/**
 * Reads a run of serialized {@code IntervalSet}s from {@code data}
 * starting at index {@code p}, appending each to {@code sets}.
 * Per-set layout: interval count, EOF-containment flag, then the
 * [lo, hi] bounds of each interval read via {@code unicodeDeserializer}.
 *
 * @return the index just past the last value consumed
 */
private int deserializeSets(char[] data, int p, List<IntervalSet> sets, UnicodeDeserializer unicodeDeserializer) {
	int setCount = toInt(data[p++]);
	while (setCount-- > 0) {
		int intervalCount = toInt(data[p++]);
		IntervalSet set = new IntervalSet();
		sets.add(set);
		// A flag word records whether EOF (-1) belongs to the set.
		if (toInt(data[p++]) != 0) {
			set.add(-1);
		}
		for (int interval = 0; interval < intervalCount; interval++) {
			int lo = unicodeDeserializer.readUnicode(data, p);
			p += unicodeDeserializer.size();
			int hi = unicodeDeserializer.readUnicode(data, p);
			p += unicodeDeserializer.size();
			set.add(lo, hi);
		}
	}
	return p;
}
/** /**
* Analyze the {@link StarLoopEntryState} states in the specified ATN to set * Analyze the {@link StarLoopEntryState} states in the specified ATN to set
* the {@link StarLoopEntryState#isPrecedenceDecision} field to the * the {@link StarLoopEntryState#isPrecedenceDecision} field to the

View File

@ -14,8 +14,10 @@ import org.antlr.v4.runtime.misc.Utils;
import java.io.InvalidClassException; import java.io.InvalidClassException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.LinkedHashMap;
import java.util.Locale; import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.UUID; import java.util.UUID;
@ -24,6 +26,10 @@ public class ATNSerializer {
public ATN atn; public ATN atn;
private List<String> tokenNames; private List<String> tokenNames;
// Strategy object for writing one Unicode code point into the
// serialized IntegerList, in either 16-bit (BMP) or 32-bit (SMP) form.
private interface CodePointSerializer {
void serializeCodePoint(IntegerList data, int cp);
}
public ATNSerializer(ATN atn) { public ATNSerializer(ATN atn) {
assert atn.grammarType != null; assert atn.grammarType != null;
this.atn = atn; this.atn = atn;
@ -47,9 +53,11 @@ public class ATNSerializer {
* (args are token type,actionIndex in lexer else 0,0) * (args are token type,actionIndex in lexer else 0,0)
* num modes, * num modes,
* mode-0-start-state, mode-1-start-state, ... (parser has 0 modes) * mode-0-start-state, mode-1-start-state, ... (parser has 0 modes)
* num sets * num unicode-bmp-sets
* set-0-interval-count intervals, set-1-interval-count intervals, ... * bmp-set-0-interval-count intervals, bmp-set-1-interval-count intervals, ...
* num total edges, * num unicode-smp-sets
* smp-set-0-interval-count intervals, smp-set-1-interval-count intervals, ...
* num total edges,
* src, trg, edge-type, edge arg1, optional edge arg2 (present always), ... * src, trg, edge-type, edge arg1, optional edge arg2 (present always), ...
* num decisions, * num decisions,
* decision-0-start-state, decision-1-start-state, ... * decision-0-start-state, decision-1-start-state, ...
@ -66,8 +74,10 @@ public class ATNSerializer {
data.add(atn.maxTokenType); data.add(atn.maxTokenType);
int nedges = 0; int nedges = 0;
Map<IntervalSet, Integer> setIndices = new HashMap<IntervalSet, Integer>(); // Note that we use a LinkedHashMap as a set to
List<IntervalSet> sets = new ArrayList<IntervalSet>(); // maintain insertion order while deduplicating
// entries with the same key.
Map<IntervalSet, Boolean> sets = new LinkedHashMap<>();
// dump states, count edges and collect sets while doing so // dump states, count edges and collect sets while doing so
IntegerList nonGreedyStates = new IntegerList(); IntegerList nonGreedyStates = new IntegerList();
@ -114,10 +124,7 @@ public class ATNSerializer {
int edgeType = Transition.serializationTypes.get(t.getClass()); int edgeType = Transition.serializationTypes.get(t.getClass());
if ( edgeType == Transition.SET || edgeType == Transition.NOT_SET ) { if ( edgeType == Transition.SET || edgeType == Transition.NOT_SET ) {
SetTransition st = (SetTransition)t; SetTransition st = (SetTransition)t;
if (!setIndices.containsKey(st.set)) { sets.put(st.set, true);
sets.add(st.set);
setIndices.put(st.set, sets.size() - 1);
}
} }
} }
} }
@ -156,34 +163,40 @@ public class ATNSerializer {
data.add(modeStartState.stateNumber); data.add(modeStartState.stateNumber);
} }
} }
List<IntervalSet> bmpSets = new ArrayList<>();
int nsets = sets.size(); List<IntervalSet> smpSets = new ArrayList<>();
data.add(nsets); for (IntervalSet set : sets.keySet()) {
for (IntervalSet set : sets) { if (set.getMaxElement() <= Character.MAX_VALUE) {
boolean containsEof = set.contains(Token.EOF); bmpSets.add(set);
if (containsEof && set.getIntervals().get(0).b == Token.EOF) { } else {
data.add(set.getIntervals().size() - 1); smpSets.add(set);
} }
else { }
data.add(set.getIntervals().size()); serializeSets(
} data,
bmpSets,
data.add(containsEof ? 1 : 0); new CodePointSerializer() {
for (Interval I : set.getIntervals()) { @Override
if (I.a == Token.EOF) { public void serializeCodePoint(IntegerList data, int cp) {
if (I.b == Token.EOF) { data.add(cp);
continue;
}
else {
data.add(0);
}
} }
else { });
data.add(I.a); serializeSets(
data,
smpSets,
new CodePointSerializer() {
@Override
public void serializeCodePoint(IntegerList data, int cp) {
serializeInt(data, cp);
} }
});
data.add(I.b); Map<IntervalSet, Integer> setIndices = new HashMap<>();
} int setIndex = 0;
for (IntervalSet bmpSet : bmpSets) {
setIndices.put(bmpSet, setIndex++);
}
for (IntervalSet smpSet : smpSets) {
setIndices.put(smpSet, setIndex++);
} }
data.add(nedges); data.add(nedges);
@ -359,6 +372,42 @@ public class ATNSerializer {
return data; return data;
} }
/**
 * Serializes each {@code IntervalSet} in {@code sets} into {@code data}.
 * Per-set layout: interval count, an EOF-containment flag, then the
 * [a, b] bounds of each interval written via {@code codePointSerializer}.
 *
 * EOF (-1) is never written as an interval bound: an interval that is
 * exactly [EOF, EOF] is skipped entirely (the flag already records it),
 * and an interval starting at EOF has its lower bound written as 0.
 */
private static void serializeSets(
IntegerList data,
Collection<IntervalSet> sets,
CodePointSerializer codePointSerializer)
{
int nSets = sets.size();
data.add(nSets);

for (IntervalSet set : sets) {
boolean containsEof = set.contains(Token.EOF);
// If the first interval is exactly [EOF, EOF] it is skipped in the
// loop below, so do not count it.
if (containsEof && set.getIntervals().get(0).b == Token.EOF) {
data.add(set.getIntervals().size() - 1);
}
else {
data.add(set.getIntervals().size());
}

data.add(containsEof ? 1 : 0);
for (Interval I : set.getIntervals()) {
if (I.a == Token.EOF) {
if (I.b == Token.EOF) {
// Pure-EOF interval: fully represented by the flag above.
continue;
}
else {
// Clamp the EOF lower bound to 0 for serialization.
codePointSerializer.serializeCodePoint(data, 0);
}
}
else {
codePointSerializer.serializeCodePoint(data, I.a);
}

codePointSerializer.serializeCodePoint(data, I.b);
}
}
}
public String decode(char[] data) { public String decode(char[] data) {
data = data.clone(); data = data.clone();
// don't adjust the first value since that's the version number // don't adjust the first value since that's the version number
@ -437,25 +486,10 @@ public class ATNSerializer {
int s = ATNDeserializer.toInt(data[p++]); int s = ATNDeserializer.toInt(data[p++]);
buf.append("mode ").append(i).append(":").append(s).append('\n'); buf.append("mode ").append(i).append(":").append(s).append('\n');
} }
int nsets = ATNDeserializer.toInt(data[p++]); int numBMPSets = ATNDeserializer.toInt(data[p++]);
for (int i=0; i<nsets; i++) { p = appendSets(buf, data, p, numBMPSets, 0, ATNDeserializer.getUnicodeDeserializer(ATNDeserializer.UnicodeDeserializingMode.UNICODE_BMP));
int nintervals = ATNDeserializer.toInt(data[p++]); int numSMPSets = ATNDeserializer.toInt(data[p++]);
buf.append(i).append(":"); p = appendSets(buf, data, p, numSMPSets, numBMPSets, ATNDeserializer.getUnicodeDeserializer(ATNDeserializer.UnicodeDeserializingMode.UNICODE_SMP));
boolean containsEof = data[p++] != 0;
if (containsEof) {
buf.append(getTokenName(Token.EOF));
}
for (int j=0; j<nintervals; j++) {
if ( containsEof || j>0 ) {
buf.append(", ");
}
buf.append(getTokenName(ATNDeserializer.toInt(data[p]))).append("..").append(getTokenName(ATNDeserializer.toInt(data[p + 1])));
p += 2;
}
buf.append("\n");
}
int nedges = ATNDeserializer.toInt(data[p++]); int nedges = ATNDeserializer.toInt(data[p++]);
for (int i=0; i<nedges; i++) { for (int i=0; i<nedges; i++) {
int src = ATNDeserializer.toInt(data[p]); int src = ATNDeserializer.toInt(data[p]);
@ -491,6 +525,31 @@ public class ATNSerializer {
return buf.toString(); return buf.toString();
} }
private int appendSets(StringBuilder buf, char[] data, int p, int nsets, int setIndexOffset, ATNDeserializer.UnicodeDeserializer unicodeDeserializer) {
for (int i=0; i<nsets; i++) {
int nintervals = ATNDeserializer.toInt(data[p++]);
buf.append(i+setIndexOffset).append(":");
boolean containsEof = data[p++] != 0;
if (containsEof) {
buf.append(getTokenName(Token.EOF));
}
for (int j=0; j<nintervals; j++) {
if ( containsEof || j>0 ) {
buf.append(", ");
}
int a = unicodeDeserializer.readUnicode(data, p);
p += unicodeDeserializer.size();
int b = unicodeDeserializer.readUnicode(data, p);
p += unicodeDeserializer.size();
buf.append(getTokenName(a)).append("..").append(getTokenName(b));
}
buf.append("\n");
}
return p;
}
public String getTokenName(int t) { public String getTokenName(int t) {
if ( t==-1 ) return "EOF"; if ( t==-1 ) return "EOF";

View File

@ -51,14 +51,21 @@ var LexerModeAction = LexerActions.LexerModeAction;
// stick to serialized version for now, we don't need a UUID instance // stick to serialized version for now, we don't need a UUID instance
var BASE_SERIALIZED_UUID = "AADB8D7E-AEEF-4415-AD2B-8204D6CF042E"; var BASE_SERIALIZED_UUID = "AADB8D7E-AEEF-4415-AD2B-8204D6CF042E";
//
// This UUID indicates the serialized ATN contains two sets of
// IntervalSets, where the second set's values are encoded as
// 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
//
var ADDED_UNICODE_SMP = "59627784-3BE5-417A-B9EB-8131A7286089";
// This list contains all of the currently supported UUIDs, ordered by when // This list contains all of the currently supported UUIDs, ordered by when
// the feature first appeared in this branch. // the feature first appeared in this branch.
var SUPPORTED_UUIDS = [ BASE_SERIALIZED_UUID ]; var SUPPORTED_UUIDS = [ BASE_SERIALIZED_UUID, ADDED_UNICODE_SMP ];
var SERIALIZED_VERSION = 3; var SERIALIZED_VERSION = 3;
// This is the current serialized UUID. // This is the current serialized UUID.
var SERIALIZED_UUID = BASE_SERIALIZED_UUID; var SERIALIZED_UUID = ADDED_UNICODE_SMP;
function initArray( length, value) { function initArray( length, value) {
var tmp = []; var tmp = [];
@ -91,11 +98,11 @@ function ATNDeserializer (options) {
// introduced; otherwise, {@code false}. // introduced; otherwise, {@code false}.
ATNDeserializer.prototype.isFeatureSupported = function(feature, actualUuid) { ATNDeserializer.prototype.isFeatureSupported = function(feature, actualUuid) {
var idx1 = SUPPORTED_UUIDS.index(feature); var idx1 = SUPPORTED_UUIDS.indexOf(feature);
if (idx1<0) { if (idx1<0) {
return false; return false;
} }
var idx2 = SUPPORTED_UUIDS.index(actualUuid); var idx2 = SUPPORTED_UUIDS.indexOf(actualUuid);
return idx2 >= idx1; return idx2 >= idx1;
}; };
@ -107,7 +114,14 @@ ATNDeserializer.prototype.deserialize = function(data) {
this.readStates(atn); this.readStates(atn);
this.readRules(atn); this.readRules(atn);
this.readModes(atn); this.readModes(atn);
var sets = this.readSets(atn); var sets = [];
// First, deserialize sets with 16-bit arguments <= U+FFFF.
this.readSets(atn, sets, this.readInt.bind(this));
// Next, if the ATN was serialized with the Unicode SMP feature,
// deserialize sets with 32-bit arguments <= U+10FFFF.
if (this.isFeatureSupported(ADDED_UNICODE_SMP, this.uuid)) {
this.readSets(atn, sets, this.readInt32.bind(this));
}
this.readEdges(atn, sets); this.readEdges(atn, sets);
this.readDecisions(atn); this.readDecisions(atn);
this.readLexerActions(atn); this.readLexerActions(atn);
@ -244,8 +258,7 @@ ATNDeserializer.prototype.readModes = function(atn) {
} }
}; };
ATNDeserializer.prototype.readSets = function(atn) { ATNDeserializer.prototype.readSets = function(atn, sets, readUnicode) {
var sets = [];
var m = this.readInt(); var m = this.readInt();
for (var i=0; i<m; i++) { for (var i=0; i<m; i++) {
var iset = new IntervalSet(); var iset = new IntervalSet();
@ -256,12 +269,11 @@ ATNDeserializer.prototype.readSets = function(atn) {
iset.addOne(-1); iset.addOne(-1);
} }
for (var j=0; j<n; j++) { for (var j=0; j<n; j++) {
var i1 = this.readInt(); var i1 = readUnicode();
var i2 = this.readInt(); var i2 = readUnicode();
iset.addRange(i1, i2); iset.addRange(i1, i2);
} }
} }
return sets;
}; };
ATNDeserializer.prototype.readEdges = function(atn, sets) { ATNDeserializer.prototype.readEdges = function(atn, sets) {

View File

@ -278,7 +278,7 @@ class Lexer(Recognizer, TokenSource):
start = self._tokenStartCharIndex start = self._tokenStartCharIndex
stop = self._input.index stop = self._input.index
text = self._input.getText(start, stop) text = self._input.getText(start, stop)
msg = "token recognition error at: '" + self.getErrorDisplay(text) + "'" msg = u"token recognition error at: '" + self.getErrorDisplay(text) + u"'"
listener = self.getErrorListenerDispatch() listener = self.getErrorListenerDispatch()
listener.syntaxError(self, None, self._tokenStartLine, self._tokenStartColumn, msg, e) listener.syntaxError(self, None, self._tokenStartLine, self._tokenStartColumn, msg, e)
@ -291,17 +291,17 @@ class Lexer(Recognizer, TokenSource):
def getErrorDisplayForChar(self, c): def getErrorDisplayForChar(self, c):
if ord(c[0])==Token.EOF: if ord(c[0])==Token.EOF:
return "<EOF>" return "<EOF>"
elif c=='\n': elif c==u'\n':
return "\\n" return u"\\n"
elif c=='\t': elif c==u'\t':
return "\\t" return u"\\t"
elif c=='\r': elif c==u'\r':
return "\\r" return u"\\r"
else: else:
return unicode(c) return c
def getCharErrorDisplay(self, c): def getCharErrorDisplay(self, c):
return "'" + self.getErrorDisplayForChar(c) + "'" return u"'" + self.getErrorDisplayForChar(c) + u"'"
# Lexers can normally match any char in it's vocabulary after matching # Lexers can normally match any char in it's vocabulary after matching
# a token, so do the easy thing and just kill a character and hope # a token, so do the easy thing and just kill a character and hope

View File

@ -13,14 +13,19 @@ from antlr4.atn.ATNDeserializationOptions import ATNDeserializationOptions
# This is the earliest supported serialized UUID. # This is the earliest supported serialized UUID.
BASE_SERIALIZED_UUID = UUID("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E") BASE_SERIALIZED_UUID = UUID("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E")
# This UUID indicates the serialized ATN contains two sets of
# IntervalSets, where the second set's values are encoded as
# 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
ADDED_UNICODE_SMP = UUID("59627784-3BE5-417A-B9EB-8131A7286089")
# This list contains all of the currently supported UUIDs, ordered by when # This list contains all of the currently supported UUIDs, ordered by when
# the feature first appeared in this branch. # the feature first appeared in this branch.
SUPPORTED_UUIDS = [ BASE_SERIALIZED_UUID ] SUPPORTED_UUIDS = [ BASE_SERIALIZED_UUID, ADDED_UNICODE_SMP ]
SERIALIZED_VERSION = 3 SERIALIZED_VERSION = 3
# This is the current serialized UUID. # This is the current serialized UUID.
SERIALIZED_UUID = BASE_SERIALIZED_UUID SERIALIZED_UUID = ADDED_UNICODE_SMP
class ATNDeserializer (object): class ATNDeserializer (object):
@ -59,7 +64,13 @@ class ATNDeserializer (object):
self.readStates(atn) self.readStates(atn)
self.readRules(atn) self.readRules(atn)
self.readModes(atn) self.readModes(atn)
sets = self.readSets(atn) sets = []
# First, read all sets with 16-bit Unicode code points <= U+FFFF.
self.readSets(atn, sets, self.readInt)
# Next, if the ATN was serialized with the Unicode SMP feature,
# deserialize sets with 32-bit arguments <= U+10FFFF.
if self.isFeatureSupported(ADDED_UNICODE_SMP, self.uuid):
self.readSets(atn, sets, self.readInt32)
self.readEdges(atn, sets) self.readEdges(atn, sets)
self.readDecisions(atn) self.readDecisions(atn)
self.readLexerActions(atn) self.readLexerActions(atn)
@ -170,8 +181,7 @@ class ATNDeserializer (object):
s = self.readInt() s = self.readInt()
atn.modeToStartState.append(atn.states[s]) atn.modeToStartState.append(atn.states[s])
def readSets(self, atn): def readSets(self, atn, sets, readUnicode):
sets = []
m = self.readInt() m = self.readInt()
for i in range(0, m): for i in range(0, m):
iset = IntervalSet() iset = IntervalSet()
@ -181,10 +191,9 @@ class ATNDeserializer (object):
if containsEof!=0: if containsEof!=0:
iset.addOne(-1) iset.addOne(-1)
for j in range(0, n): for j in range(0, n):
i1 = self.readInt() i1 = readUnicode()
i2 = self.readInt() i2 = readUnicode()
iset.addRange(Interval(i1, i2 + 1)) # range upper limit is exclusive iset.addRange(Interval(i1, i2 + 1)) # range upper limit is exclusive
return sets
def readEdges(self, atn, sets): def readEdges(self, atn, sets):
nedges = self.readInt() nedges = self.readInt()

View File

@ -4,6 +4,7 @@
#/ #/
from uuid import UUID from uuid import UUID
from io import StringIO from io import StringIO
from typing import Callable
from antlr4.Token import Token from antlr4.Token import Token
from antlr4.atn.ATN import ATN from antlr4.atn.ATN import ATN
from antlr4.atn.ATNType import ATNType from antlr4.atn.ATNType import ATNType
@ -15,14 +16,19 @@ from antlr4.atn.ATNDeserializationOptions import ATNDeserializationOptions
# This is the earliest supported serialized UUID. # This is the earliest supported serialized UUID.
BASE_SERIALIZED_UUID = UUID("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E") BASE_SERIALIZED_UUID = UUID("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E")
# This UUID indicates the serialized ATN contains two sets of
# IntervalSets, where the second set's values are encoded as
# 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
ADDED_UNICODE_SMP = UUID("59627784-3BE5-417A-B9EB-8131A7286089")
# This list contains all of the currently supported UUIDs, ordered by when # This list contains all of the currently supported UUIDs, ordered by when
# the feature first appeared in this branch. # the feature first appeared in this branch.
SUPPORTED_UUIDS = [ BASE_SERIALIZED_UUID ] SUPPORTED_UUIDS = [ BASE_SERIALIZED_UUID, ADDED_UNICODE_SMP ]
SERIALIZED_VERSION = 3 SERIALIZED_VERSION = 3
# This is the current serialized UUID. # This is the current serialized UUID.
SERIALIZED_UUID = BASE_SERIALIZED_UUID SERIALIZED_UUID = ADDED_UNICODE_SMP
class ATNDeserializer (object): class ATNDeserializer (object):
@ -58,7 +64,13 @@ class ATNDeserializer (object):
self.readStates(atn) self.readStates(atn)
self.readRules(atn) self.readRules(atn)
self.readModes(atn) self.readModes(atn)
sets = self.readSets(atn) sets = []
# First, read all sets with 16-bit Unicode code points <= U+FFFF.
self.readSets(atn, sets, self.readInt)
# Next, if the ATN was serialized with the Unicode SMP feature,
# deserialize sets with 32-bit arguments <= U+10FFFF.
if self.isFeatureSupported(ADDED_UNICODE_SMP, self.uuid):
self.readSets(atn, sets, self.readInt32)
self.readEdges(atn, sets) self.readEdges(atn, sets)
self.readDecisions(atn) self.readDecisions(atn)
self.readLexerActions(atn) self.readLexerActions(atn)
@ -170,8 +182,7 @@ class ATNDeserializer (object):
s = self.readInt() s = self.readInt()
atn.modeToStartState.append(atn.states[s]) atn.modeToStartState.append(atn.states[s])
def readSets(self, atn:ATN): def readSets(self, atn:ATN, sets:list, readUnicode:Callable[[], int]):
sets = []
m = self.readInt() m = self.readInt()
for i in range(0, m): for i in range(0, m):
iset = IntervalSet() iset = IntervalSet()
@ -181,10 +192,9 @@ class ATNDeserializer (object):
if containsEof!=0: if containsEof!=0:
iset.addOne(-1) iset.addOne(-1)
for j in range(0, n): for j in range(0, n):
i1 = self.readInt() i1 = readUnicode()
i2 = self.readInt() i2 = readUnicode()
iset.addRange(range(i1, i2 + 1)) # range upper limit is exclusive iset.addRange(range(i1, i2 + 1)) # range upper limit is exclusive
return sets
def readEdges(self, atn:ATN, sets:list): def readEdges(self, atn:ATN, sets:list):
nedges = self.readInt() nedges = self.readInt()

View File

@ -26,21 +26,30 @@ public class ATNDeserializer {
/// for the addition of lexer actions encoded as a sequence of /// for the addition of lexer actions encoded as a sequence of
/// {@link org.antlr.v4.runtime.atn.LexerAction} instances. /// {@link org.antlr.v4.runtime.atn.LexerAction} instances.
private static let ADDED_LEXER_ACTIONS: UUID = UUID(uuidString: "AADB8D7E-AEEF-4415-AD2B-8204D6CF042E")! private static let ADDED_LEXER_ACTIONS: UUID = UUID(uuidString: "AADB8D7E-AEEF-4415-AD2B-8204D6CF042E")!
/// This list contains all of the currently supported UUIDs, ordered by when
/// the feature first appeared in this branch. /// This UUID indicates the serialized ATN contains two sets of
/// IntervalSets, where the second set's values are encoded as
/// 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
private static let ADDED_UNICODE_SMP: UUID = UUID(uuidString: "59627784-3BE5-417A-B9EB-8131A7286089")!
/**
* This list contains all of the currently supported UUIDs, ordered by when
* the feature first appeared in this branch.
*/
private static let SUPPORTED_UUIDS: Array<UUID> = { private static let SUPPORTED_UUIDS: Array<UUID> = {
var suuid = Array<UUID>() var suuid = Array<UUID>()
suuid.append(ATNDeserializer.BASE_SERIALIZED_UUID) suuid.append(ATNDeserializer.BASE_SERIALIZED_UUID)
suuid.append(ATNDeserializer.ADDED_PRECEDENCE_TRANSITIONS) suuid.append(ATNDeserializer.ADDED_PRECEDENCE_TRANSITIONS)
suuid.append(ATNDeserializer.ADDED_LEXER_ACTIONS) suuid.append(ATNDeserializer.ADDED_LEXER_ACTIONS)
suuid.append(ATNDeserializer.ADDED_UNICODE_SMP)
return suuid return suuid
}() }()
/// This is the current serialized UUID. /// This is the current serialized UUID.
public static let SERIALIZED_UUID: UUID = { public static let SERIALIZED_UUID: UUID = {
// SERIALIZED_UUID = ADDED_LEXER_ACTIONS; // SERIALIZED_UUID = ADDED_UNICODE_SMP;
return UUID(uuidString: "AADB8D7E-AEEF-4415-AD2B-8204D6CF042E")! return UUID(uuidString: "59627784-3BE5-417A-B9EB-8131A7286089")!
}() }()
@ -245,24 +254,14 @@ public class ATNDeserializer {
// SETS // SETS
// //
var sets: Array<IntervalSet> = Array<IntervalSet>() var sets: Array<IntervalSet> = Array<IntervalSet>()
let nsets: Int = toInt(data[p])
p += 1
for _ in 0..<nsets {
let nintervals: Int = toInt(data[p])
p += 1
let set: IntervalSet = try IntervalSet()
sets.append(set)
let containsEof: Bool = toInt(data[p]) != 0 // First, deserialize sets with 16-bit arguments <= U+FFFF.
p += 1 try readSets(data, &p, &sets, readUnicodeInt)
if containsEof {
try set.add(-1)
}
for _ in 0..<nintervals { // Next, if the ATN was serialized with the Unicode SMP feature,
try set.add(toInt(data[p]), toInt(data[p + 1])) // deserialize sets with 32-bit arguments <= U+10FFFF.
p += 2 if isFeatureSupported(ATNDeserializer.ADDED_UNICODE_SMP, uuid) {
} try readSets(data, &p, &sets, readUnicodeInt32)
} }
// //
@ -521,6 +520,39 @@ public class ATNDeserializer {
return atn return atn
} }
private func readUnicodeInt(_ data: [Character], _ p: inout Int) -> Int {
let result: Int = toInt(data[p])
p += 1
return result
}
private func readUnicodeInt32(_ data: [Character], _ p: inout Int) -> Int {
let result: Int = toInt32(data, p)
p += 2
return result
}
private func readSets(_ data: [Character], _ p: inout Int, _ sets: inout Array<IntervalSet>, _ readUnicode: ([Character], inout Int) -> Int) throws {
let nsets: Int = toInt(data[p])
p += 1
for _ in 0..<nsets {
let nintervals: Int = toInt(data[p])
p += 1
let set: IntervalSet = try IntervalSet()
sets.append(set)
let containsEof: Bool = toInt(data[p]) != 0
p += 1
if containsEof {
try set.add(-1)
}
for _ in 0..<nintervals {
try set.add(readUnicode(data, &p), readUnicode(data, &p))
}
}
}
public func deserializeFromJson(_ jsonStr: String) -> ATN { public func deserializeFromJson(_ jsonStr: String) -> ATN {
// let jsonStr = Utils.readFile2String(jsonFileName) // let jsonStr = Utils.readFile2String(jsonFileName)
guard !jsonStr.isEmpty else { guard !jsonStr.isEmpty else {

View File

@ -6,8 +6,8 @@
package org.antlr.v4.test.tool; package org.antlr.v4.test.tool;
import org.antlr.v4.runtime.ANTLRInputStream;
import org.antlr.v4.runtime.CharStream; import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.CharStreams;
import org.antlr.v4.runtime.atn.ATN; import org.antlr.v4.runtime.atn.ATN;
import org.antlr.v4.runtime.atn.ATNState; import org.antlr.v4.runtime.atn.ATNState;
import org.antlr.v4.runtime.misc.Utils; import org.antlr.v4.runtime.misc.Utils;
@ -121,6 +121,94 @@ public class TestATNLexerInterpreter extends BaseJavaToolTest {
checkLexerMatches(lg, "c", expecting); checkLexerMatches(lg, "c", expecting);
} }
@Test public void testLexerSetUnicodeBMP() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"ID : ('\u611B'|'\u611C')\n ;");
String expecting = "ID, EOF";
checkLexerMatches(lg, "\u611B", expecting);
}
@Test public void testLexerNotSetUnicodeBMP() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"ID : ~('\u611B'|'\u611C')\n ;");
String expecting = "ID, EOF";
checkLexerMatches(lg, "\u611D", expecting);
}
@Test public void testLexerNotSetUnicodeBMPMatchesSMP() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"ID : ~('\u611B'|'\u611C')\n ;");
String expecting = "ID, EOF";
checkLexerMatches(lg, new StringBuilder().appendCodePoint(0x1F4A9).toString(), expecting);
}
@Test public void testLexerSetUnicodeSMP() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"ID : ('\\u{1F4A9}'|'\\u{1F4AA}')\n ;");
String expecting = "ID, EOF";
checkLexerMatches(lg, new StringBuilder().appendCodePoint(0x1F4A9).toString(), expecting);
}
@Test public void testLexerNotBMPSetMatchesUnicodeSMP() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"ID : ~('a'|'b')\n ;");
String expecting = "ID, EOF";
checkLexerMatches(lg, new StringBuilder().appendCodePoint(0x1F4A9).toString(), expecting);
}
@Test public void testLexerNotBMPSetMatchesBMP() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"ID : ~('a'|'b')\n ;");
String expecting = "ID, EOF";
checkLexerMatches(lg, "\u611B", expecting);
}
@Test public void testLexerNotBMPSetMatchesSMP() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"ID : ~('a'|'b')\n ;");
String expecting = "ID, EOF";
checkLexerMatches(lg, new StringBuilder().appendCodePoint(0x1F4A9).toString(), expecting);
}
@Test public void testLexerNotSMPSetMatchesBMP() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"ID : ~('\\u{1F4A9}'|'\\u{1F4AA}')\n ;");
String expecting = "ID, EOF";
checkLexerMatches(lg, "\u611B", expecting);
}
@Test public void testLexerNotSMPSetMatchesSMP() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"ID : ~('\\u{1F4A9}'|'\\u{1F4AA}')\n ;");
String expecting = "ID, EOF";
checkLexerMatches(lg, new StringBuilder().appendCodePoint(0x1D7C0).toString(), expecting);
}
@Test public void testLexerRangeUnicodeSMP() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"ID : ('\\u{1F4A9}'..'\\u{1F4B0}')\n ;");
String expecting = "ID, EOF";
checkLexerMatches(lg, new StringBuilder().appendCodePoint(0x1F4AF).toString(), expecting);
}
@Test public void testLexerRangeUnicodeBMPToSMP() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"ID : ('\\u611B'..'\\u{1F4B0}')\n ;");
String expecting = "ID, EOF";
checkLexerMatches(lg, new StringBuilder().appendCodePoint(0x12001).toString(), expecting);
}
@Test public void testLexerKeywordIDAmbiguity() throws Exception { @Test public void testLexerKeywordIDAmbiguity() throws Exception {
LexerGrammar lg = new LexerGrammar( LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+ "lexer grammar L;\n"+
@ -293,7 +381,7 @@ public class TestATNLexerInterpreter extends BaseJavaToolTest {
protected void checkLexerMatches(LexerGrammar lg, String inputString, String expecting) { protected void checkLexerMatches(LexerGrammar lg, String inputString, String expecting) {
ATN atn = createATN(lg, true); ATN atn = createATN(lg, true);
CharStream input = new ANTLRInputStream(inputString); CharStream input = CharStreams.createWithString(inputString);
ATNState startState = atn.modeNameToStartState.get("DEFAULT_MODE"); ATNState startState = atn.modeNameToStartState.get("DEFAULT_MODE");
DOTGenerator dot = new DOTGenerator(lg); DOTGenerator dot = new DOTGenerator(lg);
// System.out.println(dot.getDOT(startState, true)); // System.out.println(dot.getDOT(startState, true));

View File

@ -291,6 +291,113 @@ public class TestATNSerialization extends BaseJavaToolTest {
assertEquals(expecting, result); assertEquals(expecting, result);
} }
@Test public void testLexerUnicodeSMPLiteralSerializedToSet() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"INT : '\\u{1F4A9}' ;");
String expecting =
"max type 1\n" +
"0:TOKEN_START -1\n" +
"1:RULE_START 0\n" +
"2:RULE_STOP 0\n" +
"3:BASIC 0\n" +
"4:BASIC 0\n" +
"rule 0:1 1\n" +
"mode 0:0\n" +
"0:128169..128169\n" +
"0->1 EPSILON 0,0,0\n" +
"1->3 EPSILON 0,0,0\n" +
"3->4 SET 0,0,0\n" +
"4->2 EPSILON 0,0,0\n" +
"0:0\n";
ATN atn = createATN(lg, true);
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
assertEquals(expecting, result);
}
@Test public void testLexerUnicodeSMPRangeSerializedToSet() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"INT : ('a'..'\\u{1F4A9}') ;");
String expecting =
"max type 1\n" +
"0:TOKEN_START -1\n" +
"1:RULE_START 0\n" +
"2:RULE_STOP 0\n" +
"3:BASIC 0\n" +
"4:BASIC 0\n" +
"rule 0:1 1\n" +
"mode 0:0\n" +
"0:'a'..128169\n" +
"0->1 EPSILON 0,0,0\n" +
"1->3 EPSILON 0,0,0\n" +
"3->4 SET 0,0,0\n" +
"4->2 EPSILON 0,0,0\n" +
"0:0\n";
ATN atn = createATN(lg, true);
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
assertEquals(expecting, result);
}
@Test public void testLexerUnicodeSMPSetSerializedAfterBMPSet() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"SMP : ('\\u{1F4A9}' | '\\u{1F4AF}') ;\n"+
"BMP : ('a' | 'x') ;");
String expecting =
"max type 2\n" +
"0:TOKEN_START -1\n" +
"1:RULE_START 0\n" +
"2:RULE_STOP 0\n" +
"3:RULE_START 1\n" +
"4:RULE_STOP 1\n" +
"5:BASIC 0\n" +
"6:BASIC 0\n" +
"7:BASIC 1\n" +
"8:BASIC 1\n" +
"rule 0:1 1\n" +
"rule 1:3 2\n" +
"mode 0:0\n" +
"0:'a'..'a', 'x'..'x'\n" +
"1:128169..128169, 128175..128175\n" +
"0->1 EPSILON 0,0,0\n" +
"0->3 EPSILON 0,0,0\n" +
"1->5 EPSILON 0,0,0\n" +
"3->7 EPSILON 0,0,0\n" +
"5->6 SET 1,0,0\n" +
"6->2 EPSILON 0,0,0\n" +
"7->8 SET 0,0,0\n" +
"8->4 EPSILON 0,0,0\n" +
"0:0\n";
ATN atn = createATN(lg, true);
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
assertEquals(expecting, result);
}
@Test public void testLexerNotLiteral() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"INT : ~'a' ;");
String expecting =
"max type 1\n" +
"0:TOKEN_START -1\n" +
"1:RULE_START 0\n" +
"2:RULE_STOP 0\n" +
"3:BASIC 0\n" +
"4:BASIC 0\n" +
"rule 0:1 1\n" +
"mode 0:0\n" +
"0:'a'..'a'\n" +
"0->1 EPSILON 0,0,0\n" +
"1->3 EPSILON 0,0,0\n" +
"3->4 NOT_SET 0,0,0\n" +
"4->2 EPSILON 0,0,0\n" +
"0:0\n";
ATN atn = createATN(lg, true);
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
assertEquals(expecting, result);
}
@Test public void testLexerRange() throws Exception { @Test public void testLexerRange() throws Exception {
LexerGrammar lg = new LexerGrammar( LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+ "lexer grammar L;\n"+
@ -518,6 +625,222 @@ public class TestATNSerialization extends BaseJavaToolTest {
assertEquals(expecting, result); assertEquals(expecting, result);
} }
@Test public void testLexerUnicodeUnescapedBMPNotSet() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"ID : ~('\u4E9C'|'\u4E9D')\n ;");
String expecting =
"max type 1\n" +
"0:TOKEN_START -1\n" +
"1:RULE_START 0\n" +
"2:RULE_STOP 0\n" +
"3:BASIC 0\n" +
"4:BASIC 0\n" +
"rule 0:1 1\n" +
"mode 0:0\n" +
"0:'\\u4E9C'..'\\u4E9D'\n" +
"0->1 EPSILON 0,0,0\n" +
"1->3 EPSILON 0,0,0\n" +
"3->4 NOT_SET 0,0,0\n" +
"4->2 EPSILON 0,0,0\n" +
"0:0\n";
ATN atn = createATN(lg, true);
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
assertEquals(expecting, result);
}
@Test public void testLexerUnicodeUnescapedBMPSetWithRange() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"ID : ('\u4E9C'|'\u4E9D'|'\u6C5F'|'\u305F'..'\u307B')\n ;");
String expecting =
"max type 1\n" +
"0:TOKEN_START -1\n" +
"1:RULE_START 0\n" +
"2:RULE_STOP 0\n" +
"3:BASIC 0\n" +
"4:BASIC 0\n" +
"rule 0:1 1\n" +
"mode 0:0\n" +
"0:'\\u305F'..'\\u307B', '\\u4E9C'..'\\u4E9D', '\\u6C5F'..'\\u6C5F'\n" +
"0->1 EPSILON 0,0,0\n" +
"1->3 EPSILON 0,0,0\n" +
"3->4 SET 0,0,0\n" +
"4->2 EPSILON 0,0,0\n" +
"0:0\n";
ATN atn = createATN(lg, true);
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
assertEquals(expecting, result);
}
@Test public void testLexerUnicodeUnescapedBMPNotSetWithRange() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"ID : ~('\u4E9C'|'\u4E9D'|'\u6C5F'|'\u305F'..'\u307B')\n ;");
String expecting =
"max type 1\n" +
"0:TOKEN_START -1\n" +
"1:RULE_START 0\n" +
"2:RULE_STOP 0\n" +
"3:BASIC 0\n" +
"4:BASIC 0\n" +
"rule 0:1 1\n" +
"mode 0:0\n" +
"0:'\\u305F'..'\\u307B', '\\u4E9C'..'\\u4E9D', '\\u6C5F'..'\\u6C5F'\n" +
"0->1 EPSILON 0,0,0\n" +
"1->3 EPSILON 0,0,0\n" +
"3->4 NOT_SET 0,0,0\n" +
"4->2 EPSILON 0,0,0\n" +
"0:0\n";
ATN atn = createATN(lg, true);
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
assertEquals(expecting, result);
}
@Test public void testLexerUnicodeEscapedBMPNotSet() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"ID : ~('\\u4E9C'|'\\u4E9D')\n ;");
String expecting =
"max type 1\n" +
"0:TOKEN_START -1\n" +
"1:RULE_START 0\n" +
"2:RULE_STOP 0\n" +
"3:BASIC 0\n" +
"4:BASIC 0\n" +
"rule 0:1 1\n" +
"mode 0:0\n" +
"0:'\\u4E9C'..'\\u4E9D'\n" +
"0->1 EPSILON 0,0,0\n" +
"1->3 EPSILON 0,0,0\n" +
"3->4 NOT_SET 0,0,0\n" +
"4->2 EPSILON 0,0,0\n" +
"0:0\n";
ATN atn = createATN(lg, true);
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
assertEquals(expecting, result);
}
@Test public void testLexerUnicodeEscapedBMPSetWithRange() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"ID : ('\\u4E9C'|'\\u4E9D'|'\\u6C5F'|'\\u305F'..'\\u307B')\n ;");
String expecting =
"max type 1\n" +
"0:TOKEN_START -1\n" +
"1:RULE_START 0\n" +
"2:RULE_STOP 0\n" +
"3:BASIC 0\n" +
"4:BASIC 0\n" +
"rule 0:1 1\n" +
"mode 0:0\n" +
"0:'\\u305F'..'\\u307B', '\\u4E9C'..'\\u4E9D', '\\u6C5F'..'\\u6C5F'\n" +
"0->1 EPSILON 0,0,0\n" +
"1->3 EPSILON 0,0,0\n" +
"3->4 SET 0,0,0\n" +
"4->2 EPSILON 0,0,0\n" +
"0:0\n";
ATN atn = createATN(lg, true);
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
assertEquals(expecting, result);
}
@Test public void testLexerUnicodeEscapedBMPNotSetWithRange() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"ID : ~('\\u4E9C'|'\\u4E9D'|'\\u6C5F'|'\\u305F'..'\\u307B')\n ;");
String expecting =
"max type 1\n" +
"0:TOKEN_START -1\n" +
"1:RULE_START 0\n" +
"2:RULE_STOP 0\n" +
"3:BASIC 0\n" +
"4:BASIC 0\n" +
"rule 0:1 1\n" +
"mode 0:0\n" +
"0:'\\u305F'..'\\u307B', '\\u4E9C'..'\\u4E9D', '\\u6C5F'..'\\u6C5F'\n" +
"0->1 EPSILON 0,0,0\n" +
"1->3 EPSILON 0,0,0\n" +
"3->4 NOT_SET 0,0,0\n" +
"4->2 EPSILON 0,0,0\n" +
"0:0\n";
ATN atn = createATN(lg, true);
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
assertEquals(expecting, result);
}
@Test public void testLexerUnicodeEscapedSMPNotSet() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"ID : ~('\\u{1F4A9}'|'\\u{1F4AA}')\n ;");
String expecting =
"max type 1\n" +
"0:TOKEN_START -1\n" +
"1:RULE_START 0\n" +
"2:RULE_STOP 0\n" +
"3:BASIC 0\n" +
"4:BASIC 0\n" +
"rule 0:1 1\n" +
"mode 0:0\n" +
"0:128169..128170\n" +
"0->1 EPSILON 0,0,0\n" +
"1->3 EPSILON 0,0,0\n" +
"3->4 NOT_SET 0,0,0\n" +
"4->2 EPSILON 0,0,0\n" +
"0:0\n";
ATN atn = createATN(lg, true);
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
assertEquals(expecting, result);
}
@Test public void testLexerUnicodeEscapedSMPSetWithRange() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"ID : ('\\u{1F4A9}'|'\\u{1F4AA}'|'\\u{1F441}'|'\\u{1D40F}'..'\\u{1D413}')\n ;");
String expecting =
"max type 1\n" +
"0:TOKEN_START -1\n" +
"1:RULE_START 0\n" +
"2:RULE_STOP 0\n" +
"3:BASIC 0\n" +
"4:BASIC 0\n" +
"rule 0:1 1\n" +
"mode 0:0\n" +
"0:119823..119827, 128065..128065, 128169..128170\n" +
"0->1 EPSILON 0,0,0\n" +
"1->3 EPSILON 0,0,0\n" +
"3->4 SET 0,0,0\n" +
"4->2 EPSILON 0,0,0\n" +
"0:0\n";
ATN atn = createATN(lg, true);
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
assertEquals(expecting, result);
}
@Test public void testLexerUnicodeEscapedSMPNotSetWithRange() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+
"ID : ~('\\u{1F4A9}'|'\\u{1F4AA}'|'\\u{1F441}'|'\\u{1D40F}'..'\\u{1D413}')\n ;");
String expecting =
"max type 1\n" +
"0:TOKEN_START -1\n" +
"1:RULE_START 0\n" +
"2:RULE_STOP 0\n" +
"3:BASIC 0\n" +
"4:BASIC 0\n" +
"rule 0:1 1\n" +
"mode 0:0\n" +
"0:119823..119827, 128065..128065, 128169..128170\n" +
"0->1 EPSILON 0,0,0\n" +
"1->3 EPSILON 0,0,0\n" +
"3->4 NOT_SET 0,0,0\n" +
"4->2 EPSILON 0,0,0\n" +
"0:0\n";
ATN atn = createATN(lg, true);
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
assertEquals(expecting, result);
}
@Test public void testLexerWildcardWithMode() throws Exception { @Test public void testLexerWildcardWithMode() throws Exception {
LexerGrammar lg = new LexerGrammar( LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n"+ "lexer grammar L;\n"+

View File

@ -141,6 +141,24 @@ public class TestTokenTypeAssignment extends BaseJavaToolTest {
assertEquals("'\\n'", literals.toArray()[0]); assertEquals("'\\n'", literals.toArray()[0]);
} }
@Test public void testParserCharLiteralWithBasicUnicodeEscape() throws Exception {
Grammar g = new Grammar(
"grammar t;\n"+
"a : '\\uABCD';\n");
Set<?> literals = g.stringLiteralToTypeMap.keySet();
// must store literals how they appear in the antlr grammar
assertEquals("'\\uABCD'", literals.toArray()[0]);
}
@Test public void testParserCharLiteralWithExtendedUnicodeEscape() throws Exception {
Grammar g = new Grammar(
"grammar t;\n"+
"a : '\\u{1ABCD}';\n");
Set<?> literals = g.stringLiteralToTypeMap.keySet();
// must store literals how they appear in the antlr grammar
assertEquals("'\\u{1ABCD}'", literals.toArray()[0]);
}
protected void checkSymbols(Grammar g, protected void checkSymbols(Grammar g,
String rulesStr, String rulesStr,
String allValidTokensStr) String allValidTokensStr)

View File

@ -0,0 +1,131 @@
/*
* Copyright (c) 2012-2016 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
package org.antlr.v4.test.tool;
import org.antlr.v4.gui.Trees;
import org.antlr.v4.runtime.CharStreams;
import org.antlr.v4.runtime.CommonTokenStream;
import org.antlr.v4.runtime.LexerInterpreter;
import org.antlr.v4.runtime.tree.ParseTree;
import org.antlr.v4.tool.Grammar;
import org.antlr.v4.tool.GrammarParserInterpreter;
import org.junit.Before;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
public class TestUnicodeGrammar extends BaseJavaToolTest {
	/** Raw BMP escapes (\\uXXXX) in a lexer literal should match BMP input. */
	@Test
	public void unicodeBMPLiteralInGrammar() throws Exception {
		String grammarText =
			"grammar Unicode;\n" +
			"r : 'hello' WORLD;\n" +
			"WORLD : ('world' | '\\u4E16\\u754C' | '\\u1000\\u1019\\u1039\\u1018\\u102C' );\n" +
			"WS : [ \\t\\r\\n]+ -> skip;\n";
		String inputText = "hello \u4E16\u754C";
		String expectedTree = "(r:1 " + inputText + ")";
		assertEquals(expectedTree,
			parseTreeForGrammarWithInput(grammarText, "r", inputText));
	}

	// TODO: This test cannot pass unless we change either the grammar
	// parser to decode surrogate pair literals to code points (which
	// would break existing clients) or to treat them as an
	// alternative:
	//
	// '\\uD83C\\uDF0D' -> ('\\u{1F30E}' | '\\uD83C\\uDF0D')
	//
	// but I worry that might cause parse ambiguity if we're not careful.
	//@Test
	public void unicodeSurrogatePairLiteralInGrammar() throws Exception {
		String grammarText =
			"grammar Unicode;\n" +
			"r : 'hello' WORLD;\n" +
			"WORLD : ('\\uD83C\\uDF0D' | '\\uD83C\\uDF0E' | '\\uD83C\\uDF0F' );\n" +
			"WS : [ \\t\\r\\n]+ -> skip;\n";
		String inputText = "hello " + new String(Character.toChars(0x1F30E));
		String expectedTree = "(r:1 " + inputText + ")";
		assertEquals(expectedTree,
			parseTreeForGrammarWithInput(grammarText, "r", inputText));
	}

	/** Extended \\u{XXXXXX} escapes should match SMP code points in input. */
	@Test
	public void unicodeSMPLiteralInGrammar() throws Exception {
		String grammarText =
			"grammar Unicode;\n" +
			"r : 'hello' WORLD;\n" +
			"WORLD : ('\\u{1F30D}' | '\\u{1F30E}' | '\\u{1F30F}' );\n" +
			"WS : [ \\t\\r\\n]+ -> skip;\n";
		String inputText = "hello " + new String(Character.toChars(0x1F30E));
		String expectedTree = "(r:1 " + inputText + ")";
		assertEquals(expectedTree,
			parseTreeForGrammarWithInput(grammarText, "r", inputText));
	}

	/** A range of extended escapes should match code points inside it. */
	@Test
	public void unicodeSMPRangeInGrammar() throws Exception {
		String grammarText =
			"grammar Unicode;\n" +
			"r : 'hello' WORLD;\n" +
			"WORLD : ('\\u{1F30D}'..'\\u{1F30F}' );\n" +
			"WS : [ \\t\\r\\n]+ -> skip;\n";
		String inputText = "hello " + new String(Character.toChars(0x1F30E));
		String expectedTree = "(r:1 " + inputText + ")";
		assertEquals(expectedTree,
			parseTreeForGrammarWithInput(grammarText, "r", inputText));
	}

	/** A lone (dangling) surrogate in the input should still be matchable. */
	@Test
	public void matchingDanglingSurrogateInInput() throws Exception {
		String grammarText =
			"grammar Unicode;\n" +
			"r : 'hello' WORLD;\n" +
			"WORLD : ('\\uD83C' | '\\uD83D' | '\\uD83E' );\n" +
			"WS : [ \\t\\r\\n]+ -> skip;\n";
		String inputText = "hello \uD83C";
		String expectedTree = "(r:1 " + inputText + ")";
		assertEquals(expectedTree,
			parseTreeForGrammarWithInput(grammarText, "r", inputText));
	}

	/**
	 * Compiles {@code grammarText}, lexes and parses {@code inputText}
	 * starting at {@code rootRule}, and returns the parse tree rendered
	 * as a string for comparison in the tests above.
	 */
	private static String parseTreeForGrammarWithInput(
			String grammarText,
			String rootRule,
			String inputText) throws Exception {
		Grammar grammar = new Grammar(grammarText);
		LexerInterpreter lexEngine = grammar.createLexerInterpreter(
			CharStreams.createWithString(inputText));
		GrammarParserInterpreter parser =
			grammar.createGrammarParserInterpreter(new CommonTokenStream(lexEngine));
		ParseTree parseTree = parser.parse(grammar.rules.get(rootRule).index);
		InterpreterTreeTextProvider nodeTextProvider =
			new InterpreterTreeTextProvider(grammar.getRuleNames());
		return Trees.toStringTree(parseTree, nodeTextProvider);
	}
}

View File

@ -288,7 +288,7 @@ public partial class <csIdentifier.(parser.name)> : <superClass; null="Parser">
public override string[] RuleNames { get { return ruleNames; } } public override string[] RuleNames { get { return ruleNames; } }
public override string SerializedAtn { get { return _serializedATN; } } public override string SerializedAtn { get { return new string(_serializedATN); } }
static <csIdentifier.(parser.name)>() { static <csIdentifier.(parser.name)>() {
decisionToDFA = new DFA[_ATN.NumberOfDecisions]; decisionToDFA = new DFA[_ATN.NumberOfDecisions];
@ -1023,7 +1023,7 @@ public partial class <csIdentifier.(lexer.name)> : <superClass; null="Lexer"> {
public override string[] ModeNames { get { return modeNames; } } public override string[] ModeNames { get { return modeNames; } }
public override string SerializedAtn { get { return _serializedATN; } } public override string SerializedAtn { get { return new string(_serializedATN); } }
static <csIdentifier.(lexer.name)>() { static <csIdentifier.(lexer.name)>() {
decisionToDFA = new DFA[_ATN.NumberOfDecisions]; decisionToDFA = new DFA[_ATN.NumberOfDecisions];
@ -1038,16 +1038,12 @@ public partial class <csIdentifier.(lexer.name)> : <superClass; null="Lexer"> {
SerializedATN(model) ::= << SerializedATN(model) ::= <<
private static string _serializedATN = _serializeATN(); private static char[] _serializedATN = {
private static string _serializeATN() <model.serialized; separator=", ", wrap>,
{ };
StringBuilder sb = new StringBuilder();
sb.Append("<model.serialized; wrap={");<\n><\t>sb.Append("}>");
return sb.ToString();
}
public static readonly ATN _ATN = public static readonly ATN _ATN =
new ATNDeserializer().Deserialize(_serializedATN.ToCharArray()); new ATNDeserializer().Deserialize(_serializedATN);
>> >>

View File

@ -192,23 +192,23 @@ atn::ATN <lexer.name>::_atn;
std::vector\<uint16_t> <lexer.name>::_serializedATN; std::vector\<uint16_t> <lexer.name>::_serializedATN;
std::vector\<std::string> <lexer.name>::_ruleNames = { std::vector\<std::string> <lexer.name>::_ruleNames = {
<lexer.ruleNames: {r | "<r>"}; separator = ", ", wrap, anchor> <lexer.ruleNames: {r | u8"<r>"}; separator = ", ", wrap, anchor>
}; };
std::vector\<std::string> <lexer.name>::_channelNames = { std::vector\<std::string> <lexer.name>::_channelNames = {
"DEFAULT_TOKEN_CHANNEL", "HIDDEN"<if (lexer.channels)>, <lexer.channels: {c | "<c>"}; separator = ", ", wrap, anchor><endif> "DEFAULT_TOKEN_CHANNEL", "HIDDEN"<if (lexer.channels)>, <lexer.channels: {c | u8"<c>"}; separator = ", ", wrap, anchor><endif>
}; };
std::vector\<std::string> <lexer.name>::_modeNames = { std::vector\<std::string> <lexer.name>::_modeNames = {
<lexer.modes: {m | "<m>"}; separator = ", ", wrap, anchor> <lexer.modes: {m | u8"<m>"}; separator = ", ", wrap, anchor>
}; };
std::vector\<std::string> <lexer.name>::_literalNames = { std::vector\<std::string> <lexer.name>::_literalNames = {
<lexer.literalNames: {t | <t>}; null = "\"\"", separator = ", ", wrap, anchor> <lexer.literalNames: {t | u8<t>}; null = "\"\"", separator = ", ", wrap, anchor>
}; };
std::vector\<std::string> <lexer.name>::_symbolicNames = { std::vector\<std::string> <lexer.name>::_symbolicNames = {
<lexer.symbolicNames: {t | <t>}; null = "\"\"", separator = ", ", wrap, anchor> <lexer.symbolicNames: {t | u8<t>}; null = "\"\"", separator = ", ", wrap, anchor>
}; };
dfa::Vocabulary <lexer.name>::_vocabulary(_literalNames, _symbolicNames); dfa::Vocabulary <lexer.name>::_vocabulary(_literalNames, _symbolicNames);

View File

@ -46,7 +46,7 @@ public class CSharpTarget extends Target {
formatted = String.format("\\x%X", v & 0xFFFF); formatted = String.format("\\x%X", v & 0xFFFF);
} }
return formatted; return "'" + formatted + "'";
} }
@Override @Override

View File

@ -45,10 +45,9 @@ public class CharSupport {
} }
/** Return a string representing the escaped char for code c. E.g., If c /** Return a string representing the escaped char for code c. E.g., If c
* has value 0x100, you will get "\u0100". ASCII gets the usual * has value 0x100, you will get "\\u0100". ASCII gets the usual
* char (non-hex) representation. Control characters are spit out * char (non-hex) representation. Non-ASCII characters are spit out
* as unicode. While this is specially set up for returning Java strings, * as \\uXXXX or \\u{XXXXXX} escapes.
* it can be used by any language target that has the same syntax. :)
*/ */
public static String getANTLRCharLiteralForChar(int c) { public static String getANTLRCharLiteralForChar(int c) {
if ( c< Lexer.MIN_CHAR_VALUE ) { if ( c< Lexer.MIN_CHAR_VALUE ) {
@ -67,11 +66,11 @@ public class CharSupport {
} }
return '\''+Character.toString((char)c)+'\''; return '\''+Character.toString((char)c)+'\'';
} }
// turn on the bit above max "\uFFFF" value so that we pad with zeros if (c <= 0xFFFF) {
// then only take last 4 digits return String.format("\\u%04X", c);
String hex = Integer.toHexString(c|0x10000).toUpperCase().substring(1,5); } else {
String unicodeStr = "'\\u"+hex+"'"; return String.format("\\u{%06X}", c);
return unicodeStr; }
} }
/** Given a literal like (the 3 char sequence with single quotes) 'a', /** Given a literal like (the 3 char sequence with single quotes) 'a',
@ -92,11 +91,25 @@ public class CharSupport {
if ( literal.charAt(i) == '\\' ) { if ( literal.charAt(i) == '\\' ) {
end = i+2; end = i+2;
if ( i+1 < n && literal.charAt(i+1) == 'u' ) { if ( i+1 < n && literal.charAt(i+1) == 'u' ) {
for (end = i + 2; end < i + 6; end++) { if ( i+2 < n && literal.charAt(i+2) == '{' ) { // extended escape sequence
if ( end>n ) return null; // invalid escape sequence. end = i + 3;
char charAt = literal.charAt(end); while (true) {
if (!Character.isDigit(charAt) && !(charAt >= 'a' && charAt <= 'f') && !(charAt >= 'A' && charAt <= 'F')) { if ( end + 1 > n ) return null; // invalid escape sequence.
return null; // invalid escape sequence. char charAt = literal.charAt(end++);
if (charAt == '}') {
break;
}
if (!Character.isDigit(charAt) && !(charAt >= 'a' && charAt <= 'f') && !(charAt >= 'A' && charAt <= 'F')) {
return null; // invalid escape sequence.
}
}
} else {
for (end = i + 2; end < i + 6; end++) {
if ( end>n ) return null; // invalid escape sequence.
char charAt = literal.charAt(end);
if (!Character.isDigit(charAt) && !(charAt >= 'a' && charAt <= 'f') && !(charAt >= 'A' && charAt <= 'F')) {
return null; // invalid escape sequence.
}
} }
} }
} }
@ -107,13 +120,13 @@ public class CharSupport {
if ( c==-1 ) { if ( c==-1 ) {
return null; // invalid escape sequence. return null; // invalid escape sequence.
} }
else buf.append((char)c); else buf.appendCodePoint(c);
i = end; i = end;
} }
return buf.toString(); return buf.toString();
} }
/** Given char x or \t or \u1234 return the char value; /** Given char x or \\t or \\u1234 return the char value;
* Unnecessary escapes like '\{' yield -1. * Unnecessary escapes like '\{' yield -1.
*/ */
public static int getCharValueFromCharInGrammarLiteral(String cstr) { public static int getCharValueFromCharInGrammarLiteral(String cstr) {
@ -130,9 +143,31 @@ public class CharSupport {
if ( charVal==0 ) return -1; if ( charVal==0 ) return -1;
return charVal; return charVal;
case 6: case 6:
// '\u1234' // '\\u1234' or '\\u{12}'
if ( !cstr.startsWith("\\u") ) return -1; if ( !cstr.startsWith("\\u") ) return -1;
String unicodeChars = cstr.substring(2, cstr.length()); int startOff;
int endOff;
if ( cstr.charAt(2) == '{' ) {
startOff = 3;
endOff = cstr.indexOf('}');
} else {
startOff = 2;
endOff = cstr.length();
}
return parseHexValue(cstr, startOff, endOff);
default:
if ( cstr.startsWith("\\u{") ) {
return parseHexValue(cstr, 3, cstr.indexOf('}'));
}
return -1;
}
}
private static int parseHexValue(String cstr, int startOff, int endOff) {
if (startOff < 0 || endOff < 0) {
return -1;
}
String unicodeChars = cstr.substring(startOff, endOff);
int result = -1; int result = -1;
try { try {
result = Integer.parseInt(unicodeChars, 16); result = Integer.parseInt(unicodeChars, 16);
@ -140,9 +175,6 @@ public class CharSupport {
catch (NumberFormatException e) { catch (NumberFormatException e) {
} }
return result; return result;
default:
return -1;
}
} }
public static String capitalize(String s) { public static String capitalize(String s) {

View File

@ -615,8 +615,8 @@ SRC : 'src' WSCHARS+ file=ACTION_STRING_LITERAL WSCHARS+ line=INT
// //
// ANTLR makes no distinction between a single character literal and a	// ANTLR makes no distinction between a single character literal and a
// multi-character string. All literals are single quote delimited and // multi-character string. All literals are single quote delimited and
// may contain unicode escape sequences of the form \uxxxx, where x // may contain unicode escape sequences of the form \uxxxx or \u{xxxxxx},
// is a valid hexadecimal number (as per Java basically). // where x is a valid hexadecimal number.
STRING_LITERAL STRING_LITERAL
: '\'' ( ( ESC_SEQ | ~('\\'|'\''|'\r'|'\n') ) )* : '\'' ( ( ESC_SEQ | ~('\\'|'\''|'\r'|'\n') ) )*
( '\'' ( '\''
@ -652,6 +652,10 @@ ESC_SEQ
// //
UNICODE_ESC UNICODE_ESC
| // A Swift/Hack style Unicode escape sequence
//
UNICODE_EXTENDED_ESC
| // An illegal escape sequence	| // An illegal escape sequence
// //
{ {
@ -720,6 +724,27 @@ UNICODE_ESC
} }
; ;
// Matches the Swift/Hack-style extended Unicode escape body 'u{HHHHHH}'
// (the leading backslash is consumed by the enclosing ESC_SEQ rule).
// Accepts any number of hex digits syntactically, then reports an error
// from the embedded action when more than 6 digits were written, since
// the largest Unicode code point (U+10FFFF) needs at most 6 hex digits.
fragment
UNICODE_EXTENDED_ESC
	: 'u{' // Leadin for unicode extended escape sequence
	  HEX_DIGIT+ // One or more hexadecimal digits
	  '}' // Leadout for unicode extended escape sequence
	  // Now check the digit count and issue an error if we need to
	  {
	  	// NOTE(review): this subtracts 6 for the fixed chars (\, u, {, })
	  	// plus two more; it assumes the escape begins at
	  	// state.tokenStartCharIndex -- TODO confirm this holds when the
	  	// escape is not at the start of the enclosing token.
	  	int numDigits = getCharIndex()-state.tokenStartCharIndex-6;
	  	if (numDigits > 6) {
	  		// Build a token spanning the offending escape so the error
	  		// message points at the right line/column.
	  		Token t = new CommonToken(input, state.type, state.channel, state.tokenStartCharIndex, getCharIndex()-1);
	  		t.setText(t.getText());
	  		t.setLine(input.getLine());
	  		t.setCharPositionInLine(input.getCharPositionInLine()-numDigits);
	  		grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, t);
	  	}
	  }
	;
// ---------- // ----------
// Whitespace // Whitespace
// //