forked from jasder/antlr
Implement new extended Unicode escape \u{10ABCD}. Bump UUID. Add lots more tests.
This commit is contained in:
parent
ce09abb480
commit
fd4246cf3f
|
@ -402,4 +402,282 @@ public class SetsDescriptors {
|
||||||
public String grammar;
|
public String grammar;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Test descriptor: a lexer set listing individual BMP characters (U+00E4, U+4E9C, U+3042)
// written with single-backslash Unicode escapes. Per the note inside the grammar text, Java
// replaces those escapes with the raw characters before the grammar compiler sees them.
// The expected output is the input followed by a newline, with no errors.
// NOTE: the Javadoc on `grammar` is the grammar text itself (@CommentHasStringValue);
// do not edit it as if it were documentation.
public static class UnicodeUnescapedBMPSet extends BaseParserTestDescriptor {
|
||||||
|
public String input = "a\u00E4\u3042\u4E9Cc";
|
||||||
|
public String output = "a\u00E4\u3042\u4E9Cc\n";
|
||||||
|
public String errors = null;
|
||||||
|
public String startRule = "a";
|
||||||
|
public String grammarName = "T";
|
||||||
|
|
||||||
|
/**
|
||||||
|
grammar T;
|
||||||
|
a : LETTERS {<InputText():writeln()>} ;
|
||||||
|
// These are actually not escaped -- Java passes the
|
||||||
|
// raw unescaped Unicode values to the grammar compiler.
|
||||||
|
LETTERS : ('a'|'\u00E4'|'\u4E9C'|'\u3042')* 'c';
|
||||||
|
*/
|
||||||
|
@CommentHasStringValue
|
||||||
|
public String grammar;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test descriptor: a lexer range over BMP characters U+00E0..U+00E5 (plus 'a'), written with
// single-backslash Unicode escapes that, per the note inside the grammar text, reach the
// grammar compiler as raw characters. Rule `a` accepts any number of LETTERS tokens followed
// by 'd'. The expected output is the input followed by a newline, with no errors.
// NOTE: the Javadoc on `grammar` is the grammar text itself (@CommentHasStringValue).
public static class UnicodeUnescapedBMPRangeSet extends BaseParserTestDescriptor {
|
||||||
|
public String input = "a\u00E1\u00E4\u00E1\u00E2\u00E5d";
|
||||||
|
public String output = "a\u00E1\u00E4\u00E1\u00E2\u00E5d\n";
|
||||||
|
public String errors = null;
|
||||||
|
public String startRule = "a";
|
||||||
|
public String grammarName = "T";
|
||||||
|
|
||||||
|
/**
|
||||||
|
grammar T;
|
||||||
|
a : LETTERS* 'd' {<InputText():writeln()>} ;
|
||||||
|
// These are actually not escaped -- Java passes the
|
||||||
|
// raw unescaped Unicode values to the grammar compiler.
|
||||||
|
LETTERS : ('a'|'\u00E0'..'\u00E5');
|
||||||
|
*/
|
||||||
|
@CommentHasStringValue
|
||||||
|
public String grammar;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test descriptor: same character set as the unescaped BMP set test, but the grammar text
// uses a doubled backslash so that (per the note inside the grammar) the escape sequence
// itself, not the raw character, is handed to the grammar compiler.
// The expected output is the input followed by a newline, with no errors.
// NOTE: the Javadoc on `grammar` is the grammar text itself (@CommentHasStringValue).
public static class UnicodeEscapedBMPSet extends BaseParserTestDescriptor {
|
||||||
|
public String input = "a\u00E4\u3042\u4E9Cc";
|
||||||
|
public String output = "a\u00E4\u3042\u4E9Cc\n";
|
||||||
|
public String errors = null;
|
||||||
|
public String startRule = "a";
|
||||||
|
public String grammarName = "T";
|
||||||
|
|
||||||
|
/**
|
||||||
|
grammar T;
|
||||||
|
a : LETTERS {<InputText():writeln()>} ;
|
||||||
|
// Note the double-backslash to avoid Java passing
|
||||||
|
// unescaped values as part of the grammar.
|
||||||
|
LETTERS : ('a'|'\\u00E4'|'\\u4E9C'|'\\u3042')* 'c';
|
||||||
|
*/
|
||||||
|
@CommentHasStringValue
|
||||||
|
public String grammar;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test descriptor: a lexer range U+00E0..U+00E5 (plus 'a') whose endpoints are written with
// a doubled backslash so that (per the note inside the grammar) the escape sequences reach
// the grammar compiler intact. Rule `a` accepts any number of LETTERS tokens followed by 'd'.
// The expected output is the input followed by a newline, with no errors.
// NOTE: the Javadoc on `grammar` is the grammar text itself (@CommentHasStringValue).
public static class UnicodeEscapedBMPRangeSet extends BaseParserTestDescriptor {
|
||||||
|
public String input = "a\u00E1\u00E4\u00E1\u00E2\u00E5d";
|
||||||
|
public String output = "a\u00E1\u00E4\u00E1\u00E2\u00E5d\n";
|
||||||
|
public String errors = null;
|
||||||
|
public String startRule = "a";
|
||||||
|
public String grammarName = "T";
|
||||||
|
|
||||||
|
/**
|
||||||
|
grammar T;
|
||||||
|
a : LETTERS* 'd' {<InputText():writeln()>} ;
|
||||||
|
// Note the double-backslash to avoid Java passing
|
||||||
|
// unescaped values as part of the grammar.
|
||||||
|
LETTERS : ('a'|'\\u00E0'..'\\u00E5');
|
||||||
|
*/
|
||||||
|
@CommentHasStringValue
|
||||||
|
public String grammar;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO(bhamiltoncx): This needs to be an error, the V3
|
||||||
|
// runtime used by the tool doesn't really understand unescaped code points >
|
||||||
|
// U+FFFF.
|
||||||
|
// public static class UnicodeUnescapedSMPSet extends BaseParserTestDescriptor {
|
||||||
|
// public String input = new StringBuilder()
|
||||||
|
// .append("a")
|
||||||
|
// .appendCodePoint(0x1D5C2)
|
||||||
|
// .appendCodePoint(0x1D5CE)
|
||||||
|
// .appendCodePoint(0x1D5BA)
|
||||||
|
// .append("c")
|
||||||
|
// .toString();
|
||||||
|
// public String output = new StringBuilder()
|
||||||
|
// .append("a")
|
||||||
|
// .appendCodePoint(0x1D5C2)
|
||||||
|
// .appendCodePoint(0x1D5CE)
|
||||||
|
// .appendCodePoint(0x1D5BA)
|
||||||
|
// .append("c\n")
|
||||||
|
// .toString();
|
||||||
|
// public String errors = null;
|
||||||
|
// public String startRule = "a";
|
||||||
|
// public String grammarName = "T";
|
||||||
|
|
||||||
|
// /**
|
||||||
|
// grammar T;
|
||||||
|
// a : LETTERS {<InputText():writeln()>} ;
|
||||||
|
// // These are actually not escaped -- Java passes the
|
||||||
|
// // raw unescaped Unicode values to the grammar compiler.
|
||||||
|
// //
|
||||||
|
// // Each sequence is the UTF-16 encoding of a raw Unicode
|
||||||
|
// // SMP code point.
|
||||||
|
// LETTERS : ('a'|'\uD835\uDDBA'|'\uD835\uDDBE'|'\uD835\uDDC2'|'\uD835\uDDC8'|'\uD835\uDDCE')* 'c';
|
||||||
|
// */
|
||||||
|
// @CommentHasStringValue
|
||||||
|
// public String grammar;
|
||||||
|
|
||||||
|
// }
|
||||||
|
|
||||||
|
// Test descriptor: a lexer set listing code points above U+FFFF (U+1D5BA, U+1D5BE, U+1D5C2,
// U+1D5C8, U+1D5CE) using the brace form of the Unicode escape, with a doubled backslash so
// the escape text reaches the grammar compiler. The input is 'a', then U+1D5C2, U+1D5CE,
// U+1D5BA, then 'c'; the expected output is the same text plus a newline, with no errors.
// Input and output are built with appendCodePoint, so each supplementary code point is
// stored in the String as a surrogate pair.
// NOTE: the Javadoc on `grammar` is the grammar text itself (@CommentHasStringValue).
public static class UnicodeEscapedSMPSet extends BaseParserTestDescriptor {
|
||||||
|
public String input = new StringBuilder()
|
||||||
|
.append("a")
|
||||||
|
.appendCodePoint(0x1D5C2)
|
||||||
|
.appendCodePoint(0x1D5CE)
|
||||||
|
.appendCodePoint(0x1D5BA)
|
||||||
|
.append("c")
|
||||||
|
.toString();
|
||||||
|
public String output = new StringBuilder()
|
||||||
|
.append("a")
|
||||||
|
.appendCodePoint(0x1D5C2)
|
||||||
|
.appendCodePoint(0x1D5CE)
|
||||||
|
.appendCodePoint(0x1D5BA)
|
||||||
|
.append("c\n")
|
||||||
|
.toString();
|
||||||
|
public String errors = null;
|
||||||
|
public String startRule = "a";
|
||||||
|
public String grammarName = "T";
|
||||||
|
|
||||||
|
/**
|
||||||
|
grammar T;
|
||||||
|
a : LETTERS {<InputText():writeln()>} ;
|
||||||
|
// Note the double-backslash to avoid Java passing
|
||||||
|
// unescaped values as part of the grammar.
|
||||||
|
LETTERS : ('a'|'\\u{1D5BA}'|'\\u{1D5BE}'|'\\u{1D5C2}'|'\\u{1D5C8}'|'\\u{1D5CE}')* 'c';
|
||||||
|
*/
|
||||||
|
@CommentHasStringValue
|
||||||
|
public String grammar;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// Turns out Tool.java uses ANTLR 3's runtime, which means it can't use
|
||||||
|
// CodePointCharStream to understand unescaped code points > U+FFFF.
|
||||||
|
//
|
||||||
|
// TODO(bhamiltoncx): This needs to be an error, since we don't currently plan
|
||||||
|
// to port Tool.java to use ANTLR 4's runtime.
|
||||||
|
|
||||||
|
// public static class UnicodeUnescapedSMPRangeSet extends BaseParserTestDescriptor {
|
||||||
|
// public String input = new StringBuilder()
|
||||||
|
// .append("a")
|
||||||
|
// .appendCodePoint(0x1D5C2)
|
||||||
|
// .appendCodePoint(0x1D5CE)
|
||||||
|
// .appendCodePoint(0x1D5BA)
|
||||||
|
// .append("d")
|
||||||
|
// .toString();
|
||||||
|
// public String output = new StringBuilder()
|
||||||
|
// .append("a")
|
||||||
|
// .appendCodePoint(0x1D5C2)
|
||||||
|
// .appendCodePoint(0x1D5CE)
|
||||||
|
// .appendCodePoint(0x1D5BA)
|
||||||
|
// .append("d\n")
|
||||||
|
// .toString();
|
||||||
|
// public String errors = null;
|
||||||
|
// public String startRule = "a";
|
||||||
|
// public String grammarName = "T";
|
||||||
|
|
||||||
|
// /**
|
||||||
|
// grammar T;
|
||||||
|
// a : LETTERS* 'd' {<InputText():writeln()>} ;
|
||||||
|
// // These are actually not escaped -- Java passes the
|
||||||
|
// // raw unescaped Unicode values to the grammar compiler.
|
||||||
|
// LETTERS : ('a'|'\uD83D\uDE00'..'\uD83E\uDD43');
|
||||||
|
// */
|
||||||
|
// @CommentHasStringValue
|
||||||
|
// public String grammar;
|
||||||
|
|
||||||
|
// }
|
||||||
|
|
||||||
|
// Test descriptor: a lexer range U+1F600..U+1F943 (plus 'a') written with brace-form Unicode
// escapes. The input is 'a', then U+1F609, U+1F942 and U+1F700 (all inside the range),
// then 'd'; the expected output is the same text plus a newline, with no errors.
// NOTE: the Javadoc on `grammar` is the grammar text itself (@CommentHasStringValue).
public static class UnicodeEscapedSMPRangeSet extends BaseParserTestDescriptor {
|
||||||
|
public String input = new StringBuilder()
|
||||||
|
.append("a")
|
||||||
|
.appendCodePoint(0x1F609)
|
||||||
|
.appendCodePoint(0x1F942)
|
||||||
|
.appendCodePoint(0x1F700)
|
||||||
|
.append("d")
|
||||||
|
.toString();
|
||||||
|
public String output = new StringBuilder()
|
||||||
|
.append("a")
|
||||||
|
.appendCodePoint(0x1F609)
|
||||||
|
.appendCodePoint(0x1F942)
|
||||||
|
.appendCodePoint(0x1F700)
|
||||||
|
.append("d\n")
|
||||||
|
.toString();
|
||||||
|
public String errors = null;
|
||||||
|
public String startRule = "a";
|
||||||
|
public String grammarName = "T";
|
||||||
|
|
||||||
|
/**
|
||||||
|
grammar T;
|
||||||
|
a : LETTERS* 'd' {<InputText():writeln()>} ;
|
||||||
|
// Note the double-backslash to avoid Java passing
|
||||||
|
// unescaped values as part of the grammar.
|
||||||
|
LETTERS : ('a'|'\\u{1F600}'..'\\u{1F943}');
|
||||||
|
*/
|
||||||
|
@CommentHasStringValue
|
||||||
|
public String grammar;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test descriptor: negative counterpart of the SMP range test. The grammar range is
// U+1F600..U+1F943; the input places U+1F5FF (one below the range) and U+1F944 (one above)
// between 'a' and 'd'. Both are expected to be reported as token recognition errors, at
// columns 1 and 2 respectively (each supplementary code point advancing the column by one),
// leaving "ad" plus a newline as the output.
// NOTE: the Javadoc on `grammar` is the grammar text itself (@CommentHasStringValue).
public static class UnicodeEscapedSMPRangeSetMismatch extends BaseParserTestDescriptor {
|
||||||
|
// Test the code points just before and just after the range.
|
||||||
|
public String input = new StringBuilder()
|
||||||
|
.append("a")
|
||||||
|
.appendCodePoint(0x1F5FF)
|
||||||
|
.appendCodePoint(0x1F944)
|
||||||
|
.append("d")
|
||||||
|
.toString();
|
||||||
|
public String output = "ad\n";
|
||||||
|
public String errors = new StringBuilder()
|
||||||
|
.append("line 1:1 token recognition error at: '")
|
||||||
|
.appendCodePoint(0x1F5FF)
|
||||||
|
.append("'\n")
|
||||||
|
.append("line 1:2 token recognition error at: '")
|
||||||
|
.appendCodePoint(0x1F944)
|
||||||
|
.append("'\n")
|
||||||
|
.toString();
|
||||||
|
public String startRule = "a";
|
||||||
|
public String grammarName = "T";
|
||||||
|
|
||||||
|
/**
|
||||||
|
grammar T;
|
||||||
|
a : LETTERS* 'd' {<InputText():writeln()>} ;
|
||||||
|
// Note the double-backslash to avoid Java passing
|
||||||
|
// unescaped values as part of the grammar.
|
||||||
|
LETTERS : ('a'|'\\u{1F600}'..'\\u{1F943}');
|
||||||
|
*/
|
||||||
|
@CommentHasStringValue
|
||||||
|
public String grammar;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test descriptor: a negated set over a BMP character, ~('b'), must also match code points
// above U+FFFF. The input is 'a', four supplementary code points (each written as a
// surrogate pair in the Java string literal), then 'c'; the expected output is the same
// text plus a newline, with no errors.
// NOTE: the Javadoc on `grammar` is the grammar text itself (@CommentHasStringValue).
public static class UnicodeNegatedBMPSetIncludesSMPCodePoints extends BaseParserTestDescriptor {
|
||||||
|
public String input = "a\uD83D\uDE33\uD83D\uDE21\uD83D\uDE1D\uD83E\uDD13c";
|
||||||
|
public String output = "a\uD83D\uDE33\uD83D\uDE21\uD83D\uDE1D\uD83E\uDD13c\n";
|
||||||
|
public String errors = null;
|
||||||
|
public String startRule = "a";
|
||||||
|
public String grammarName = "T";
|
||||||
|
|
||||||
|
/**
|
||||||
|
grammar T;
|
||||||
|
a : LETTERS {<InputText():writeln()>} ;
|
||||||
|
LETTERS : 'a' ~('b')+ 'c';
|
||||||
|
*/
|
||||||
|
@CommentHasStringValue
|
||||||
|
public String grammar;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test descriptor: a negated range over supplementary code points, ~(U+1F600..U+1F943),
// must still match ordinary BMP characters. The input "abc" is expected to be echoed
// followed by a newline, with no errors.
// NOTE: the Javadoc on `grammar` is the grammar text itself (@CommentHasStringValue).
public static class UnicodeNegatedSMPSetIncludesBMPCodePoints extends BaseParserTestDescriptor {
|
||||||
|
public String input = "abc";
|
||||||
|
public String output = "abc\n";
|
||||||
|
public String errors = null;
|
||||||
|
public String startRule = "a";
|
||||||
|
public String grammarName = "T";
|
||||||
|
|
||||||
|
/**
|
||||||
|
grammar T;
|
||||||
|
a : LETTERS {<InputText():writeln()>} ;
|
||||||
|
LETTERS : 'a' ~('\\u{1F600}'..'\\u{1F943}')+ 'c';
|
||||||
|
*/
|
||||||
|
@CommentHasStringValue
|
||||||
|
public String grammar;
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,6 +22,18 @@ namespace Antlr4.Runtime.Atn
|
||||||
/// <remarks>This is the earliest supported serialized UUID.</remarks>
|
/// <remarks>This is the earliest supported serialized UUID.</remarks>
|
||||||
private static readonly Guid BaseSerializedUuid;
|
private static readonly Guid BaseSerializedUuid;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// This UUID indicates the serialized ATN contains two sets of
|
||||||
|
/// IntervalSets, where the second set's values are encoded as
|
||||||
|
/// 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
|
||||||
|
/// </summary>
|
||||||
|
/// <remarks>
|
||||||
|
/// This UUID indicates the serialized ATN contains two sets of
|
||||||
|
/// IntervalSets, where the second set's values are encoded as
|
||||||
|
/// 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
|
||||||
|
/// </remarks>
|
||||||
|
private static readonly Guid AddedUnicodeSmp;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// This list contains all of the currently supported UUIDs, ordered by when
|
/// This list contains all of the currently supported UUIDs, ordered by when
|
||||||
/// the feature first appeared in this branch.
|
/// the feature first appeared in this branch.
|
||||||
|
@ -39,14 +51,18 @@ namespace Antlr4.Runtime.Atn
|
||||||
static ATNDeserializer()
|
static ATNDeserializer()
|
||||||
{
|
{
|
||||||
BaseSerializedUuid = new Guid("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E");
|
BaseSerializedUuid = new Guid("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E");
|
||||||
|
AddedUnicodeSmp = new Guid("59627784-3BE5-417A-B9EB-8131A7286089");
|
||||||
SupportedUuids = new List<Guid>();
|
SupportedUuids = new List<Guid>();
|
||||||
SupportedUuids.Add(BaseSerializedUuid);
|
SupportedUuids.Add(BaseSerializedUuid);
|
||||||
SerializedUuid = BaseSerializedUuid;
|
SupportedUuids.Add(AddedUnicodeSmp);
|
||||||
|
SerializedUuid = AddedUnicodeSmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
[NotNull]
|
[NotNull]
|
||||||
private readonly ATNDeserializationOptions deserializationOptions;
|
private readonly ATNDeserializationOptions deserializationOptions;
|
||||||
|
|
||||||
|
private Guid uuid;
|
||||||
|
|
||||||
public ATNDeserializer()
|
public ATNDeserializer()
|
||||||
: this(ATNDeserializationOptions.Default)
|
: this(ATNDeserializationOptions.Default)
|
||||||
{
|
{
|
||||||
|
@ -115,7 +131,11 @@ namespace Antlr4.Runtime.Atn
|
||||||
ReadStates (atn);
|
ReadStates (atn);
|
||||||
ReadRules (atn);
|
ReadRules (atn);
|
||||||
ReadModes (atn);
|
ReadModes (atn);
|
||||||
IList<IntervalSet> sets = ReadSets (atn);
|
IList<IntervalSet> sets = new List<IntervalSet>();
|
||||||
|
ReadSets (atn, sets, this.ReadInt);
|
||||||
|
if (IsFeatureSupported(AddedUnicodeSmp, uuid)) {
|
||||||
|
ReadSets (atn, sets, this.ReadInt32);
|
||||||
|
}
|
||||||
ReadEdges (atn, sets);
|
ReadEdges (atn, sets);
|
||||||
ReadDecisions (atn);
|
ReadDecisions (atn);
|
||||||
ReadLexerActions (atn);
|
ReadLexerActions (atn);
|
||||||
|
@ -378,12 +398,11 @@ namespace Antlr4.Runtime.Atn
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected internal virtual IList<IntervalSet> ReadSets(ATN atn)
|
protected internal virtual void ReadSets(ATN atn, IList<IntervalSet> sets, Func<int> readUnicode)
|
||||||
{
|
{
|
||||||
//
|
//
|
||||||
// SETS
|
// SETS
|
||||||
//
|
//
|
||||||
IList<IntervalSet> sets = new List<IntervalSet>();
|
|
||||||
int nsets = ReadInt();
|
int nsets = ReadInt();
|
||||||
for (int i_8 = 0; i_8 < nsets; i_8++)
|
for (int i_8 = 0; i_8 < nsets; i_8++)
|
||||||
{
|
{
|
||||||
|
@ -397,10 +416,9 @@ namespace Antlr4.Runtime.Atn
|
||||||
}
|
}
|
||||||
for (int j = 0; j < nintervals; j++)
|
for (int j = 0; j < nintervals; j++)
|
||||||
{
|
{
|
||||||
set.Add(ReadInt(), ReadInt());
|
set.Add(readUnicode(), readUnicode());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return sets;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
protected internal virtual void ReadModes(ATN atn)
|
protected internal virtual void ReadModes(ATN atn)
|
||||||
|
@ -530,7 +548,7 @@ namespace Antlr4.Runtime.Atn
|
||||||
|
|
||||||
protected internal virtual void CheckUUID()
|
protected internal virtual void CheckUUID()
|
||||||
{
|
{
|
||||||
Guid uuid = ReadUUID();
|
uuid = ReadUUID();
|
||||||
if (!SupportedUuids.Contains(uuid))
|
if (!SupportedUuids.Contains(uuid))
|
||||||
{
|
{
|
||||||
string reason = string.Format(CultureInfo.CurrentCulture, "Could not deserialize ATN with UUID {0} (expected {1} or a legacy UUID).", uuid, SerializedUuid);
|
string reason = string.Format(CultureInfo.CurrentCulture, "Could not deserialize ATN with UUID {0} (expected {1} or a legacy UUID).", uuid, SerializedUuid);
|
||||||
|
|
|
@ -57,6 +57,51 @@ using namespace antlrcpp;
|
||||||
|
|
||||||
const size_t ATNDeserializer::SERIALIZED_VERSION = 3;
|
const size_t ATNDeserializer::SERIALIZED_VERSION = 3;
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
// Combines two consecutive 16-bit words into one 32-bit value:
// data[offset] supplies the low 16 bits and data[offset + 1] the high 16 bits.
// No bounds check is done here; the caller must ensure offset + 1 is a valid index.
uint32_t deserializeInt32(const std::vector<uint16_t>& data, size_t offset) {
|
||||||
|
return (uint32_t)data[offset] | ((uint32_t)data[offset + 1] << 16);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reads a single 16-bit word at index p, widens it to ssize_t, and advances p by one.
// Used as the reader for sets whose values fit in 16 bits.
ssize_t readUnicodeInt(const std::vector<uint16_t>& data, int& p) {
|
||||||
|
return static_cast<ssize_t>(data[p++]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reads a 32-bit value stored as two 16-bit words starting at index p (low word first,
// as decoded by deserializeInt32), advances p by two, and returns the value as ssize_t.
ssize_t readUnicodeInt32(const std::vector<uint16_t>& data, int& p) {
|
||||||
|
auto result = deserializeInt32(data, p);
|
||||||
|
p += 2;
|
||||||
|
return static_cast<ssize_t>(result);
|
||||||
|
}
|
||||||
|
|
||||||
|
// We templatize this on the function type so the optimizer can inline
|
||||||
|
// the 16- or 32-bit readUnicodeInt/readUnicodeInt32 as needed.
//
// Reads one group of interval sets from `data`, starting at index `p`, and appends each
// set to `sets`. Layout consumed, in order:
//   - one word: number of sets
//   - per set: one word for the interval count, one word for the contains-EOF flag,
//     then that many (a, b) pairs, each endpoint read through `readUnicode`
// `p` is passed by reference and is left pointing just past the last word consumed.
// `readUnicode(data, p)` must return one endpoint and advance `p` itself.
|
||||||
|
template <typename F>
|
||||||
|
void deserializeSets(
|
||||||
|
const std::vector<uint16_t>& data,
|
||||||
|
int& p,
|
||||||
|
std::vector<misc::IntervalSet>& sets,
|
||||||
|
F readUnicode) {
|
||||||
|
int nsets = data[p++];
|
||||||
|
for (int i = 0; i < nsets; i++) {
|
||||||
|
int nintervals = data[p++];
|
||||||
|
misc::IntervalSet set;
|
||||||
|
|
||||||
|
bool containsEof = data[p++] != 0;
|
||||||
|
if (containsEof) {
|
||||||
|
// A non-zero flag means the set contains -1 (presumably the EOF symbol, given the flag name).
set.add(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int j = 0; j < nintervals; j++) {
|
||||||
|
// Read the endpoints in two separate statements so the order of the reads is fixed.
auto a = readUnicode(data, p);
|
||||||
|
auto b = readUnicode(data, p);
|
||||||
|
set.add(a, b);
|
||||||
|
}
|
||||||
|
sets.push_back(set);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
ATNDeserializer::ATNDeserializer(): ATNDeserializer(ATNDeserializationOptions::getDefaultOptions()) {
|
ATNDeserializer::ATNDeserializer(): ATNDeserializer(ATNDeserializationOptions::getDefaultOptions()) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -75,8 +120,12 @@ Guid ATNDeserializer::ADDED_LEXER_ACTIONS() {
|
||||||
return Guid("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E");
|
return Guid("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns the UUID that marks a serialized ATN as carrying a second group of interval sets
// whose values are encoded as 32-bit integers (code points up to U+10FFFF).
// The same UUID string appears in the C#, Go and Java deserializers in this change.
Guid ATNDeserializer::ADDED_UNICODE_SMP() {
|
||||||
|
return Guid("59627784-3BE5-417A-B9EB-8131A7286089");
|
||||||
|
}
|
||||||
|
|
||||||
Guid ATNDeserializer::SERIALIZED_UUID() {
|
Guid ATNDeserializer::SERIALIZED_UUID() {
|
||||||
return ADDED_LEXER_ACTIONS();
|
return ADDED_UNICODE_SMP();
|
||||||
}
|
}
|
||||||
|
|
||||||
Guid ATNDeserializer::BASE_SERIALIZED_UUID() {
|
Guid ATNDeserializer::BASE_SERIALIZED_UUID() {
|
||||||
|
@ -84,7 +133,7 @@ Guid ATNDeserializer::BASE_SERIALIZED_UUID() {
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<Guid>& ATNDeserializer::SUPPORTED_UUIDS() {
|
std::vector<Guid>& ATNDeserializer::SUPPORTED_UUIDS() {
|
||||||
static std::vector<Guid> singleton = { BASE_SERIALIZED_UUID(), ADDED_PRECEDENCE_TRANSITIONS(), ADDED_LEXER_ACTIONS() };
|
static std::vector<Guid> singleton = { BASE_SERIALIZED_UUID(), ADDED_PRECEDENCE_TRANSITIONS(), ADDED_LEXER_ACTIONS(), ADDED_UNICODE_SMP() };
|
||||||
return singleton;
|
return singleton;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -239,21 +288,14 @@ ATN ATNDeserializer::deserialize(const std::vector<uint16_t>& input) {
|
||||||
// SETS
|
// SETS
|
||||||
//
|
//
|
||||||
std::vector<misc::IntervalSet> sets;
|
std::vector<misc::IntervalSet> sets;
|
||||||
int nsets = data[p++];
|
|
||||||
for (int i = 0; i < nsets; i++) {
|
|
||||||
int nintervals = data[p++];
|
|
||||||
misc::IntervalSet set;
|
|
||||||
|
|
||||||
bool containsEof = data[p++] != 0;
|
// First, deserialize sets with 16-bit arguments <= U+FFFF.
|
||||||
if (containsEof) {
|
deserializeSets(data, p, sets, readUnicodeInt);
|
||||||
set.add(-1);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int j = 0; j < nintervals; j++) {
|
// Next, if the ATN was serialized with the Unicode SMP feature,
|
||||||
set.add(data[p], data[p + 1], true);
|
// deserialize sets with 32-bit arguments <= U+10FFFF.
|
||||||
p += 2;
|
if (isFeatureSupported(ADDED_UNICODE_SMP(), uuid)) {
|
||||||
}
|
deserializeSets(data, p, sets, readUnicodeInt32);
|
||||||
sets.push_back(set);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
|
|
|
@ -67,6 +67,13 @@ namespace atn {
|
||||||
*/
|
*/
|
||||||
static Guid ADDED_LEXER_ACTIONS();
|
static Guid ADDED_LEXER_ACTIONS();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This UUID indicates the serialized ATN contains two sets of
|
||||||
|
* IntervalSets, where the second set's values are encoded as
|
||||||
|
* 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
|
||||||
|
*/
|
||||||
|
static Guid ADDED_UNICODE_SMP();
|
||||||
|
|
||||||
/// This list contains all of the currently supported UUIDs, ordered by when
|
/// This list contains all of the currently supported UUIDs, ordered by when
|
||||||
/// the feature first appeared in this branch.
|
/// the feature first appeared in this branch.
|
||||||
static std::vector<Guid>& SUPPORTED_UUIDS();
|
static std::vector<Guid>& SUPPORTED_UUIDS();
|
||||||
|
|
|
@ -24,14 +24,7 @@ Interval::Interval() : Interval((ssize_t)-1, -2) { // Need an explicit cast here
|
||||||
Interval::Interval(size_t a_, size_t b_) : Interval(symbolToNumeric(a_), symbolToNumeric(b_)) {
|
Interval::Interval(size_t a_, size_t b_) : Interval(symbolToNumeric(a_), symbolToNumeric(b_)) {
|
||||||
}
|
}
|
||||||
|
|
||||||
Interval::Interval(ssize_t a_, ssize_t b_, bool autoExtend) {
|
Interval::Interval(ssize_t a_, ssize_t b_) : a(a_), b(b_) {
|
||||||
a = a_;
|
|
||||||
b = b_;
|
|
||||||
|
|
||||||
// XXX: temporary hack to make the full Unicode range available.
|
|
||||||
if (autoExtend && b == 0xFFFF) {
|
|
||||||
b = 0x10FFFF;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t Interval::length() const {
|
size_t Interval::length() const {
|
||||||
|
|
|
@ -27,7 +27,7 @@ namespace misc {
|
||||||
|
|
||||||
Interval();
|
Interval();
|
||||||
explicit Interval(size_t a_, size_t b_); // For unsigned -> signed mappings.
|
explicit Interval(size_t a_, size_t b_); // For unsigned -> signed mappings.
|
||||||
Interval(ssize_t a_, ssize_t b_, bool autoExtend = false); // Automatically extend a value of 0xFFFF to 0x10FFFF.
|
Interval(ssize_t a_, ssize_t b_);
|
||||||
virtual ~Interval() {};
|
virtual ~Interval() {};
|
||||||
|
|
||||||
/// return number of elements between a and b inclusively. x..x is length 1.
|
/// return number of elements between a and b inclusively. x..x is length 1.
|
||||||
|
|
|
@ -50,8 +50,8 @@ IntervalSet IntervalSet::of(ssize_t a) {
|
||||||
return IntervalSet({ Interval(a, a) });
|
return IntervalSet({ Interval(a, a) });
|
||||||
}
|
}
|
||||||
|
|
||||||
IntervalSet IntervalSet::of(ssize_t a, ssize_t b, bool autoExtend) {
|
IntervalSet IntervalSet::of(ssize_t a, ssize_t b) {
|
||||||
return IntervalSet({ Interval(a, b, autoExtend) });
|
return IntervalSet({ Interval(a, b) });
|
||||||
}
|
}
|
||||||
|
|
||||||
void IntervalSet::clear() {
|
void IntervalSet::clear() {
|
||||||
|
@ -68,8 +68,8 @@ void IntervalSet::add(ssize_t el) {
|
||||||
add(el, el);
|
add(el, el);
|
||||||
}
|
}
|
||||||
|
|
||||||
void IntervalSet::add(ssize_t a, ssize_t b, bool autoExtend) {
|
void IntervalSet::add(ssize_t a, ssize_t b) {
|
||||||
add(Interval(a, b, autoExtend));
|
add(Interval(a, b));
|
||||||
}
|
}
|
||||||
|
|
||||||
void IntervalSet::add(const Interval &addition) {
|
void IntervalSet::add(const Interval &addition) {
|
||||||
|
|
|
@ -44,7 +44,7 @@ namespace misc {
|
||||||
static IntervalSet of(ssize_t a);
|
static IntervalSet of(ssize_t a);
|
||||||
|
|
||||||
/// Create a set with all ints within range [a..b] (inclusive)
|
/// Create a set with all ints within range [a..b] (inclusive)
|
||||||
static IntervalSet of(ssize_t a, ssize_t b, bool autoExtend = false);
|
static IntervalSet of(ssize_t a, ssize_t b);
|
||||||
|
|
||||||
virtual void clear();
|
virtual void clear();
|
||||||
|
|
||||||
|
@ -58,7 +58,7 @@ namespace misc {
|
||||||
/// If overlap, combine ranges. For example,
|
/// If overlap, combine ranges. For example,
|
||||||
/// If this is {1..5, 10..20}, adding 6..7 yields
|
/// If this is {1..5, 10..20}, adding 6..7 yields
|
||||||
/// {1..5, 6..7, 10..20}. Adding 4..8 yields {1..8, 10..20}.
|
/// {1..5, 6..7, 10..20}. Adding 4..8 yields {1..8, 10..20}.
|
||||||
virtual void add(ssize_t a, ssize_t b, bool autoExtend = false);
|
virtual void add(ssize_t a, ssize_t b);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
/// Combine all sets in the array and return the OR'd value.
|
/// Combine all sets in the array and return the OR'd value.
|
||||||
|
|
|
@ -15,15 +15,16 @@ import (
|
||||||
// This is the earliest supported serialized UUID.
|
// This is the earliest supported serialized UUID.
|
||||||
// stick to serialized version for now, we don't need a UUID instance
|
// stick to serialized version for now, we don't need a UUID instance
|
||||||
var BaseSerializedUUID = "AADB8D7E-AEEF-4415-AD2B-8204D6CF042E"
|
var BaseSerializedUUID = "AADB8D7E-AEEF-4415-AD2B-8204D6CF042E"
|
||||||
|
var AddedUnicodeSMP = "59627784-3BE5-417A-B9EB-8131A7286089"
|
||||||
|
|
||||||
// This list contains all of the currently supported UUIDs, ordered by when
|
// This list contains all of the currently supported UUIDs, ordered by when
|
||||||
// the feature first appeared in this branch.
|
// the feature first appeared in this branch.
|
||||||
var SupportedUUIDs = []string{BaseSerializedUUID}
|
var SupportedUUIDs = []string{BaseSerializedUUID, AddedUnicodeSMP}
|
||||||
|
|
||||||
var SerializedVersion = 3
|
var SerializedVersion = 3
|
||||||
|
|
||||||
// This is the current serialized UUID.
|
// This is the current serialized UUID.
|
||||||
var SerializedUUID = BaseSerializedUUID
|
var SerializedUUID = AddedUnicodeSMP
|
||||||
|
|
||||||
type LoopEndStateIntPair struct {
|
type LoopEndStateIntPair struct {
|
||||||
item0 *LoopEndState
|
item0 *LoopEndState
|
||||||
|
@ -91,7 +92,15 @@ func (a *ATNDeserializer) DeserializeFromUInt16(data []uint16) *ATN {
|
||||||
a.readRules(atn)
|
a.readRules(atn)
|
||||||
a.readModes(atn)
|
a.readModes(atn)
|
||||||
|
|
||||||
sets := a.readSets(atn)
|
sets := make([]*IntervalSet, 0)
|
||||||
|
|
||||||
|
// First, deserialize sets with 16-bit arguments <= U+FFFF.
|
||||||
|
sets = a.readSets(atn, sets, a.readInt)
|
||||||
|
// Next, if the ATN was serialized with the Unicode SMP feature,
|
||||||
|
// deserialize sets with 32-bit arguments <= U+10FFFF.
|
||||||
|
if (a.isFeatureSupported(AddedUnicodeSMP, a.uuid)) {
|
||||||
|
sets = a.readSets(atn, sets, a.readInt32)
|
||||||
|
}
|
||||||
|
|
||||||
a.readEdges(atn, sets)
|
a.readEdges(atn, sets)
|
||||||
a.readDecisions(atn)
|
a.readDecisions(atn)
|
||||||
|
@ -266,8 +275,7 @@ func (a *ATNDeserializer) readModes(atn *ATN) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *ATNDeserializer) readSets(atn *ATN) []*IntervalSet {
|
func (a *ATNDeserializer) readSets(atn *ATN, sets []*IntervalSet, readUnicode func() int) []*IntervalSet {
|
||||||
sets := make([]*IntervalSet, 0)
|
|
||||||
m := a.readInt()
|
m := a.readInt()
|
||||||
|
|
||||||
for i := 0; i < m; i++ {
|
for i := 0; i < m; i++ {
|
||||||
|
@ -283,8 +291,8 @@ func (a *ATNDeserializer) readSets(atn *ATN) []*IntervalSet {
|
||||||
}
|
}
|
||||||
|
|
||||||
for j := 0; j < n; j++ {
|
for j := 0; j < n; j++ {
|
||||||
i1 := a.readInt()
|
i1 := readUnicode()
|
||||||
i2 := a.readInt()
|
i2 := readUnicode()
|
||||||
|
|
||||||
iset.addRange(i1, i2)
|
iset.addRange(i1, i2)
|
||||||
}
|
}
|
||||||
|
@ -642,6 +650,12 @@ func (a *ATNDeserializer) readInt() int {
|
||||||
return int(v)
|
return int(v)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// readInt32 reads a 32-bit value stored as two consecutive readInt values:
// the first read supplies the low 16 bits and the second the high 16 bits.
// It is passed to readSets as the reader for sets with 32-bit values.
func (a *ATNDeserializer) readInt32() int {
|
||||||
|
var low = a.readInt()
|
||||||
|
var high = a.readInt()
|
||||||
|
return low | (high << 16)
|
||||||
|
}
|
||||||
|
|
||||||
//TODO
|
//TODO
|
||||||
//func (a *ATNDeserializer) readLong() int64 {
|
//func (a *ATNDeserializer) readLong() int64 {
|
||||||
// panic("Not implemented")
|
// panic("Not implemented")
|
||||||
|
|
|
@ -44,6 +44,12 @@ public class ATNDeserializer {
|
||||||
* {@link LexerAction} instances.
|
* {@link LexerAction} instances.
|
||||||
*/
|
*/
|
||||||
private static final UUID ADDED_LEXER_ACTIONS;
|
private static final UUID ADDED_LEXER_ACTIONS;
|
||||||
|
/**
|
||||||
|
* This UUID indicates the serialized ATN contains two sets of
|
||||||
|
* IntervalSets, where the second set's values are encoded as
|
||||||
|
* 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
|
||||||
|
*/
|
||||||
|
private static final UUID ADDED_UNICODE_SMP;
|
||||||
/**
|
/**
|
||||||
* This list contains all of the currently supported UUIDs, ordered by when
|
* This list contains all of the currently supported UUIDs, ordered by when
|
||||||
* the feature first appeared in this branch.
|
* the feature first appeared in this branch.
|
||||||
|
@ -61,15 +67,58 @@ public class ATNDeserializer {
|
||||||
BASE_SERIALIZED_UUID = UUID.fromString("33761B2D-78BB-4A43-8B0B-4F5BEE8AACF3");
|
BASE_SERIALIZED_UUID = UUID.fromString("33761B2D-78BB-4A43-8B0B-4F5BEE8AACF3");
|
||||||
ADDED_PRECEDENCE_TRANSITIONS = UUID.fromString("1DA0C57D-6C06-438A-9B27-10BCB3CE0F61");
|
ADDED_PRECEDENCE_TRANSITIONS = UUID.fromString("1DA0C57D-6C06-438A-9B27-10BCB3CE0F61");
|
||||||
ADDED_LEXER_ACTIONS = UUID.fromString("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E");
|
ADDED_LEXER_ACTIONS = UUID.fromString("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E");
|
||||||
|
ADDED_UNICODE_SMP = UUID.fromString("59627784-3BE5-417A-B9EB-8131A7286089");
|
||||||
|
|
||||||
SUPPORTED_UUIDS = new ArrayList<UUID>();
|
SUPPORTED_UUIDS = new ArrayList<UUID>();
|
||||||
SUPPORTED_UUIDS.add(BASE_SERIALIZED_UUID);
|
SUPPORTED_UUIDS.add(BASE_SERIALIZED_UUID);
|
||||||
SUPPORTED_UUIDS.add(ADDED_PRECEDENCE_TRANSITIONS);
|
SUPPORTED_UUIDS.add(ADDED_PRECEDENCE_TRANSITIONS);
|
||||||
SUPPORTED_UUIDS.add(ADDED_LEXER_ACTIONS);
|
SUPPORTED_UUIDS.add(ADDED_LEXER_ACTIONS);
|
||||||
|
SUPPORTED_UUIDS.add(ADDED_UNICODE_SMP);
|
||||||
|
|
||||||
SERIALIZED_UUID = ADDED_LEXER_ACTIONS;
|
SERIALIZED_UUID = ADDED_UNICODE_SMP;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Strategy for reading one Unicode value out of the serialized ATN char array.
// Implementations are created by getUnicodeDeserializer.
interface UnicodeDeserializer {
|
||||||
|
// Wrapper for toInt() or toInt32(): returns the value stored at data[p]
// (and, for the 32-bit form, the following element). Does not advance p.
|
||||||
|
int readUnicode(char[] data, int p);
|
||||||
|

|
||||||
|
// Work around Java not allowing mutation of captured variables
|
||||||
|
// by returning amount by which to increment p after each read
// (1 for the 16-bit reader, 2 for the 32-bit reader).
|
||||||
|
int size();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Selects which UnicodeDeserializer getUnicodeDeserializer returns.
enum UnicodeDeserializingMode {
|
||||||
|
// Values are read with toInt (one char each).
UNICODE_BMP,
|
||||||
|
// Values are read with toInt32 (two chars each).
UNICODE_SMP
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns a new UnicodeDeserializer for the given mode.
// UNICODE_BMP yields a reader that decodes one char via toInt and reports size() == 1;
// any other mode falls through to the reader that decodes via toInt32 and reports
// size() == 2. A fresh anonymous instance is created on every call.
static UnicodeDeserializer getUnicodeDeserializer(UnicodeDeserializingMode mode) {
|
||||||
|
if (mode == UnicodeDeserializingMode.UNICODE_BMP) {
|
||||||
|
return new UnicodeDeserializer() {
|
||||||
|
@Override
|
||||||
|
public int readUnicode(char[] data, int p) {
|
||||||
|
return toInt(data[p]);
|
||||||
|
}
|
||||||
|

|
||||||
|
@Override
|
||||||
|
public int size() {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
} else {
|
||||||
|
return new UnicodeDeserializer() {
|
||||||
|
@Override
|
||||||
|
public int readUnicode(char[] data, int p) {
|
||||||
|
return toInt32(data, p);
|
||||||
|
}
|
||||||
|

|
||||||
|
@Override
|
||||||
|
public int size() {
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private final ATNDeserializationOptions deserializationOptions;
|
private final ATNDeserializationOptions deserializationOptions;
|
||||||
|
|
||||||
|
@ -98,7 +147,7 @@ public class ATNDeserializer {
|
||||||
* serialized ATN at or after the feature identified by {@code feature} was
|
* serialized ATN at or after the feature identified by {@code feature} was
|
||||||
* introduced; otherwise, {@code false}.
|
* introduced; otherwise, {@code false}.
|
||||||
*/
|
*/
|
||||||
protected boolean isFeatureSupported(UUID feature, UUID actualUuid) {
|
static protected boolean isFeatureSupported(UUID feature, UUID actualUuid) {
|
||||||
int featureIndex = SUPPORTED_UUIDS.indexOf(feature);
|
int featureIndex = SUPPORTED_UUIDS.indexOf(feature);
|
||||||
if (featureIndex < 0) {
|
if (featureIndex < 0) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -258,22 +307,14 @@ public class ATNDeserializer {
|
||||||
// SETS
|
// SETS
|
||||||
//
|
//
|
||||||
List<IntervalSet> sets = new ArrayList<IntervalSet>();
|
List<IntervalSet> sets = new ArrayList<IntervalSet>();
|
||||||
int nsets = toInt(data[p++]);
|
|
||||||
for (int i=0; i<nsets; i++) {
|
|
||||||
int nintervals = toInt(data[p]);
|
|
||||||
p++;
|
|
||||||
IntervalSet set = new IntervalSet();
|
|
||||||
sets.add(set);
|
|
||||||
|
|
||||||
boolean containsEof = toInt(data[p++]) != 0;
|
// First, read all sets with 16-bit Unicode code points <= U+FFFF.
|
||||||
if (containsEof) {
|
p = deserializeSets(data, p, sets, getUnicodeDeserializer(UnicodeDeserializingMode.UNICODE_BMP));
|
||||||
set.add(-1);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int j=0; j<nintervals; j++) {
|
// Next, if the ATN was serialized with the Unicode SMP feature,
|
||||||
set.add(toInt(data[p]), toInt(data[p + 1]));
|
// deserialize sets with 32-bit arguments <= U+10FFFF.
|
||||||
p += 2;
|
if (isFeatureSupported(ADDED_UNICODE_SMP, uuid)) {
|
||||||
}
|
p = deserializeSets(data, p, sets, getUnicodeDeserializer(UnicodeDeserializingMode.UNICODE_SMP));
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
|
@ -510,6 +551,30 @@ public class ATNDeserializer {
|
||||||
return atn;
|
return atn;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private int deserializeSets(char[] data, int p, List<IntervalSet> sets, UnicodeDeserializer unicodeDeserializer) {
|
||||||
|
int nsets = toInt(data[p++]);
|
||||||
|
for (int i=0; i<nsets; i++) {
|
||||||
|
int nintervals = toInt(data[p]);
|
||||||
|
p++;
|
||||||
|
IntervalSet set = new IntervalSet();
|
||||||
|
sets.add(set);
|
||||||
|
|
||||||
|
boolean containsEof = toInt(data[p++]) != 0;
|
||||||
|
if (containsEof) {
|
||||||
|
set.add(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int j=0; j<nintervals; j++) {
|
||||||
|
int a = unicodeDeserializer.readUnicode(data, p);
|
||||||
|
p += unicodeDeserializer.size();
|
||||||
|
int b = unicodeDeserializer.readUnicode(data, p);
|
||||||
|
p += unicodeDeserializer.size();
|
||||||
|
set.add(a, b);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return p;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Analyze the {@link StarLoopEntryState} states in the specified ATN to set
|
* Analyze the {@link StarLoopEntryState} states in the specified ATN to set
|
||||||
* the {@link StarLoopEntryState#isPrecedenceDecision} field to the
|
* the {@link StarLoopEntryState#isPrecedenceDecision} field to the
|
||||||
|
|
|
@ -14,8 +14,10 @@ import org.antlr.v4.runtime.misc.Utils;
|
||||||
|
|
||||||
import java.io.InvalidClassException;
|
import java.io.InvalidClassException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.LinkedHashMap;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
|
@ -24,6 +26,10 @@ public class ATNSerializer {
|
||||||
public ATN atn;
|
public ATN atn;
|
||||||
private List<String> tokenNames;
|
private List<String> tokenNames;
|
||||||
|
|
||||||
|
private interface CodePointSerializer {
|
||||||
|
void serializeCodePoint(IntegerList data, int cp);
|
||||||
|
}
|
||||||
|
|
||||||
public ATNSerializer(ATN atn) {
|
public ATNSerializer(ATN atn) {
|
||||||
assert atn.grammarType != null;
|
assert atn.grammarType != null;
|
||||||
this.atn = atn;
|
this.atn = atn;
|
||||||
|
@ -47,9 +53,11 @@ public class ATNSerializer {
|
||||||
* (args are token type,actionIndex in lexer else 0,0)
|
* (args are token type,actionIndex in lexer else 0,0)
|
||||||
* num modes,
|
* num modes,
|
||||||
* mode-0-start-state, mode-1-start-state, ... (parser has 0 modes)
|
* mode-0-start-state, mode-1-start-state, ... (parser has 0 modes)
|
||||||
* num sets
|
* num unicode-bmp-sets
|
||||||
* set-0-interval-count intervals, set-1-interval-count intervals, ...
|
* bmp-set-0-interval-count intervals, bmp-set-1-interval-count intervals, ...
|
||||||
* num total edges,
|
* num unicode-smp-sets
|
||||||
|
* smp-set-0-interval-count intervals, smp-set-1-interval-count intervals, ...
|
||||||
|
* num total edges,
|
||||||
* src, trg, edge-type, edge arg1, optional edge arg2 (present always), ...
|
* src, trg, edge-type, edge arg1, optional edge arg2 (present always), ...
|
||||||
* num decisions,
|
* num decisions,
|
||||||
* decision-0-start-state, decision-1-start-state, ...
|
* decision-0-start-state, decision-1-start-state, ...
|
||||||
|
@ -66,8 +74,10 @@ public class ATNSerializer {
|
||||||
data.add(atn.maxTokenType);
|
data.add(atn.maxTokenType);
|
||||||
int nedges = 0;
|
int nedges = 0;
|
||||||
|
|
||||||
Map<IntervalSet, Integer> setIndices = new HashMap<IntervalSet, Integer>();
|
// Note that we use a LinkedHashMap as a set to
|
||||||
List<IntervalSet> sets = new ArrayList<IntervalSet>();
|
// maintain insertion order while deduplicating
|
||||||
|
// entries with the same key.
|
||||||
|
Map<IntervalSet, Boolean> sets = new LinkedHashMap<>();
|
||||||
|
|
||||||
// dump states, count edges and collect sets while doing so
|
// dump states, count edges and collect sets while doing so
|
||||||
IntegerList nonGreedyStates = new IntegerList();
|
IntegerList nonGreedyStates = new IntegerList();
|
||||||
|
@ -114,10 +124,7 @@ public class ATNSerializer {
|
||||||
int edgeType = Transition.serializationTypes.get(t.getClass());
|
int edgeType = Transition.serializationTypes.get(t.getClass());
|
||||||
if ( edgeType == Transition.SET || edgeType == Transition.NOT_SET ) {
|
if ( edgeType == Transition.SET || edgeType == Transition.NOT_SET ) {
|
||||||
SetTransition st = (SetTransition)t;
|
SetTransition st = (SetTransition)t;
|
||||||
if (!setIndices.containsKey(st.set)) {
|
sets.put(st.set, true);
|
||||||
sets.add(st.set);
|
|
||||||
setIndices.put(st.set, sets.size() - 1);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -156,34 +163,40 @@ public class ATNSerializer {
|
||||||
data.add(modeStartState.stateNumber);
|
data.add(modeStartState.stateNumber);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
List<IntervalSet> bmpSets = new ArrayList<>();
|
||||||
int nsets = sets.size();
|
List<IntervalSet> smpSets = new ArrayList<>();
|
||||||
data.add(nsets);
|
for (IntervalSet set : sets.keySet()) {
|
||||||
for (IntervalSet set : sets) {
|
if (set.getMaxElement() <= Character.MAX_VALUE) {
|
||||||
boolean containsEof = set.contains(Token.EOF);
|
bmpSets.add(set);
|
||||||
if (containsEof && set.getIntervals().get(0).b == Token.EOF) {
|
} else {
|
||||||
data.add(set.getIntervals().size() - 1);
|
smpSets.add(set);
|
||||||
}
|
}
|
||||||
else {
|
}
|
||||||
data.add(set.getIntervals().size());
|
serializeSets(
|
||||||
}
|
data,
|
||||||
|
bmpSets,
|
||||||
data.add(containsEof ? 1 : 0);
|
new CodePointSerializer() {
|
||||||
for (Interval I : set.getIntervals()) {
|
@Override
|
||||||
if (I.a == Token.EOF) {
|
public void serializeCodePoint(IntegerList data, int cp) {
|
||||||
if (I.b == Token.EOF) {
|
data.add(cp);
|
||||||
continue;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
data.add(0);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else {
|
});
|
||||||
data.add(I.a);
|
serializeSets(
|
||||||
|
data,
|
||||||
|
smpSets,
|
||||||
|
new CodePointSerializer() {
|
||||||
|
@Override
|
||||||
|
public void serializeCodePoint(IntegerList data, int cp) {
|
||||||
|
serializeInt(data, cp);
|
||||||
}
|
}
|
||||||
|
});
|
||||||
data.add(I.b);
|
Map<IntervalSet, Integer> setIndices = new HashMap<>();
|
||||||
}
|
int setIndex = 0;
|
||||||
|
for (IntervalSet bmpSet : bmpSets) {
|
||||||
|
setIndices.put(bmpSet, setIndex++);
|
||||||
|
}
|
||||||
|
for (IntervalSet smpSet : smpSets) {
|
||||||
|
setIndices.put(smpSet, setIndex++);
|
||||||
}
|
}
|
||||||
|
|
||||||
data.add(nedges);
|
data.add(nedges);
|
||||||
|
@ -359,6 +372,42 @@ public class ATNSerializer {
|
||||||
return data;
|
return data;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static void serializeSets(
|
||||||
|
IntegerList data,
|
||||||
|
Collection<IntervalSet> sets,
|
||||||
|
CodePointSerializer codePointSerializer)
|
||||||
|
{
|
||||||
|
int nSets = sets.size();
|
||||||
|
data.add(nSets);
|
||||||
|
|
||||||
|
for (IntervalSet set : sets) {
|
||||||
|
boolean containsEof = set.contains(Token.EOF);
|
||||||
|
if (containsEof && set.getIntervals().get(0).b == Token.EOF) {
|
||||||
|
data.add(set.getIntervals().size() - 1);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
data.add(set.getIntervals().size());
|
||||||
|
}
|
||||||
|
|
||||||
|
data.add(containsEof ? 1 : 0);
|
||||||
|
for (Interval I : set.getIntervals()) {
|
||||||
|
if (I.a == Token.EOF) {
|
||||||
|
if (I.b == Token.EOF) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
codePointSerializer.serializeCodePoint(data, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
codePointSerializer.serializeCodePoint(data, I.a);
|
||||||
|
}
|
||||||
|
|
||||||
|
codePointSerializer.serializeCodePoint(data, I.b);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public String decode(char[] data) {
|
public String decode(char[] data) {
|
||||||
data = data.clone();
|
data = data.clone();
|
||||||
// don't adjust the first value since that's the version number
|
// don't adjust the first value since that's the version number
|
||||||
|
@ -437,25 +486,10 @@ public class ATNSerializer {
|
||||||
int s = ATNDeserializer.toInt(data[p++]);
|
int s = ATNDeserializer.toInt(data[p++]);
|
||||||
buf.append("mode ").append(i).append(":").append(s).append('\n');
|
buf.append("mode ").append(i).append(":").append(s).append('\n');
|
||||||
}
|
}
|
||||||
int nsets = ATNDeserializer.toInt(data[p++]);
|
int numBMPSets = ATNDeserializer.toInt(data[p++]);
|
||||||
for (int i=0; i<nsets; i++) {
|
p = appendSets(buf, data, p, numBMPSets, 0, ATNDeserializer.getUnicodeDeserializer(ATNDeserializer.UnicodeDeserializingMode.UNICODE_BMP));
|
||||||
int nintervals = ATNDeserializer.toInt(data[p++]);
|
int numSMPSets = ATNDeserializer.toInt(data[p++]);
|
||||||
buf.append(i).append(":");
|
p = appendSets(buf, data, p, numSMPSets, numBMPSets, ATNDeserializer.getUnicodeDeserializer(ATNDeserializer.UnicodeDeserializingMode.UNICODE_SMP));
|
||||||
boolean containsEof = data[p++] != 0;
|
|
||||||
if (containsEof) {
|
|
||||||
buf.append(getTokenName(Token.EOF));
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int j=0; j<nintervals; j++) {
|
|
||||||
if ( containsEof || j>0 ) {
|
|
||||||
buf.append(", ");
|
|
||||||
}
|
|
||||||
|
|
||||||
buf.append(getTokenName(ATNDeserializer.toInt(data[p]))).append("..").append(getTokenName(ATNDeserializer.toInt(data[p + 1])));
|
|
||||||
p += 2;
|
|
||||||
}
|
|
||||||
buf.append("\n");
|
|
||||||
}
|
|
||||||
int nedges = ATNDeserializer.toInt(data[p++]);
|
int nedges = ATNDeserializer.toInt(data[p++]);
|
||||||
for (int i=0; i<nedges; i++) {
|
for (int i=0; i<nedges; i++) {
|
||||||
int src = ATNDeserializer.toInt(data[p]);
|
int src = ATNDeserializer.toInt(data[p]);
|
||||||
|
@ -491,6 +525,31 @@ public class ATNSerializer {
|
||||||
return buf.toString();
|
return buf.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private int appendSets(StringBuilder buf, char[] data, int p, int nsets, int setIndexOffset, ATNDeserializer.UnicodeDeserializer unicodeDeserializer) {
|
||||||
|
for (int i=0; i<nsets; i++) {
|
||||||
|
int nintervals = ATNDeserializer.toInt(data[p++]);
|
||||||
|
buf.append(i+setIndexOffset).append(":");
|
||||||
|
boolean containsEof = data[p++] != 0;
|
||||||
|
if (containsEof) {
|
||||||
|
buf.append(getTokenName(Token.EOF));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int j=0; j<nintervals; j++) {
|
||||||
|
if ( containsEof || j>0 ) {
|
||||||
|
buf.append(", ");
|
||||||
|
}
|
||||||
|
|
||||||
|
int a = unicodeDeserializer.readUnicode(data, p);
|
||||||
|
p += unicodeDeserializer.size();
|
||||||
|
int b = unicodeDeserializer.readUnicode(data, p);
|
||||||
|
p += unicodeDeserializer.size();
|
||||||
|
buf.append(getTokenName(a)).append("..").append(getTokenName(b));
|
||||||
|
}
|
||||||
|
buf.append("\n");
|
||||||
|
}
|
||||||
|
return p;
|
||||||
|
}
|
||||||
|
|
||||||
public String getTokenName(int t) {
|
public String getTokenName(int t) {
|
||||||
if ( t==-1 ) return "EOF";
|
if ( t==-1 ) return "EOF";
|
||||||
|
|
||||||
|
|
|
@ -51,14 +51,21 @@ var LexerModeAction = LexerActions.LexerModeAction;
|
||||||
// stick to serialized version for now, we don't need a UUID instance
|
// stick to serialized version for now, we don't need a UUID instance
|
||||||
var BASE_SERIALIZED_UUID = "AADB8D7E-AEEF-4415-AD2B-8204D6CF042E";
|
var BASE_SERIALIZED_UUID = "AADB8D7E-AEEF-4415-AD2B-8204D6CF042E";
|
||||||
|
|
||||||
|
//
|
||||||
|
// This UUID indicates the serialized ATN contains two sets of
|
||||||
|
// IntervalSets, where the second set's values are encoded as
|
||||||
|
// 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
|
||||||
|
//
|
||||||
|
var ADDED_UNICODE_SMP = "59627784-3BE5-417A-B9EB-8131A7286089";
|
||||||
|
|
||||||
// This list contains all of the currently supported UUIDs, ordered by when
|
// This list contains all of the currently supported UUIDs, ordered by when
|
||||||
// the feature first appeared in this branch.
|
// the feature first appeared in this branch.
|
||||||
var SUPPORTED_UUIDS = [ BASE_SERIALIZED_UUID ];
|
var SUPPORTED_UUIDS = [ BASE_SERIALIZED_UUID, ADDED_UNICODE_SMP ];
|
||||||
|
|
||||||
var SERIALIZED_VERSION = 3;
|
var SERIALIZED_VERSION = 3;
|
||||||
|
|
||||||
// This is the current serialized UUID.
|
// This is the current serialized UUID.
|
||||||
var SERIALIZED_UUID = BASE_SERIALIZED_UUID;
|
var SERIALIZED_UUID = ADDED_UNICODE_SMP;
|
||||||
|
|
||||||
function initArray( length, value) {
|
function initArray( length, value) {
|
||||||
var tmp = [];
|
var tmp = [];
|
||||||
|
@ -91,11 +98,11 @@ function ATNDeserializer (options) {
|
||||||
// introduced; otherwise, {@code false}.
|
// introduced; otherwise, {@code false}.
|
||||||
|
|
||||||
ATNDeserializer.prototype.isFeatureSupported = function(feature, actualUuid) {
|
ATNDeserializer.prototype.isFeatureSupported = function(feature, actualUuid) {
|
||||||
var idx1 = SUPPORTED_UUIDS.index(feature);
|
var idx1 = SUPPORTED_UUIDS.indexOf(feature);
|
||||||
if (idx1<0) {
|
if (idx1<0) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
var idx2 = SUPPORTED_UUIDS.index(actualUuid);
|
var idx2 = SUPPORTED_UUIDS.indexOf(actualUuid);
|
||||||
return idx2 >= idx1;
|
return idx2 >= idx1;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -107,7 +114,14 @@ ATNDeserializer.prototype.deserialize = function(data) {
|
||||||
this.readStates(atn);
|
this.readStates(atn);
|
||||||
this.readRules(atn);
|
this.readRules(atn);
|
||||||
this.readModes(atn);
|
this.readModes(atn);
|
||||||
var sets = this.readSets(atn);
|
var sets = [];
|
||||||
|
// First, deserialize sets with 16-bit arguments <= U+FFFF.
|
||||||
|
this.readSets(atn, sets, this.readInt.bind(this));
|
||||||
|
// Next, if the ATN was serialized with the Unicode SMP feature,
|
||||||
|
// deserialize sets with 32-bit arguments <= U+10FFFF.
|
||||||
|
if (this.isFeatureSupported(ADDED_UNICODE_SMP, this.uuid)) {
|
||||||
|
this.readSets(atn, sets, this.readInt32.bind(this));
|
||||||
|
}
|
||||||
this.readEdges(atn, sets);
|
this.readEdges(atn, sets);
|
||||||
this.readDecisions(atn);
|
this.readDecisions(atn);
|
||||||
this.readLexerActions(atn);
|
this.readLexerActions(atn);
|
||||||
|
@ -244,8 +258,7 @@ ATNDeserializer.prototype.readModes = function(atn) {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
ATNDeserializer.prototype.readSets = function(atn) {
|
ATNDeserializer.prototype.readSets = function(atn, sets, readUnicode) {
|
||||||
var sets = [];
|
|
||||||
var m = this.readInt();
|
var m = this.readInt();
|
||||||
for (var i=0; i<m; i++) {
|
for (var i=0; i<m; i++) {
|
||||||
var iset = new IntervalSet();
|
var iset = new IntervalSet();
|
||||||
|
@ -256,12 +269,11 @@ ATNDeserializer.prototype.readSets = function(atn) {
|
||||||
iset.addOne(-1);
|
iset.addOne(-1);
|
||||||
}
|
}
|
||||||
for (var j=0; j<n; j++) {
|
for (var j=0; j<n; j++) {
|
||||||
var i1 = this.readInt();
|
var i1 = readUnicode();
|
||||||
var i2 = this.readInt();
|
var i2 = readUnicode();
|
||||||
iset.addRange(i1, i2);
|
iset.addRange(i1, i2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return sets;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
ATNDeserializer.prototype.readEdges = function(atn, sets) {
|
ATNDeserializer.prototype.readEdges = function(atn, sets) {
|
||||||
|
|
|
@ -278,7 +278,7 @@ class Lexer(Recognizer, TokenSource):
|
||||||
start = self._tokenStartCharIndex
|
start = self._tokenStartCharIndex
|
||||||
stop = self._input.index
|
stop = self._input.index
|
||||||
text = self._input.getText(start, stop)
|
text = self._input.getText(start, stop)
|
||||||
msg = "token recognition error at: '" + self.getErrorDisplay(text) + "'"
|
msg = u"token recognition error at: '" + self.getErrorDisplay(text) + u"'"
|
||||||
listener = self.getErrorListenerDispatch()
|
listener = self.getErrorListenerDispatch()
|
||||||
listener.syntaxError(self, None, self._tokenStartLine, self._tokenStartColumn, msg, e)
|
listener.syntaxError(self, None, self._tokenStartLine, self._tokenStartColumn, msg, e)
|
||||||
|
|
||||||
|
@ -291,17 +291,17 @@ class Lexer(Recognizer, TokenSource):
|
||||||
def getErrorDisplayForChar(self, c):
|
def getErrorDisplayForChar(self, c):
|
||||||
if ord(c[0])==Token.EOF:
|
if ord(c[0])==Token.EOF:
|
||||||
return "<EOF>"
|
return "<EOF>"
|
||||||
elif c=='\n':
|
elif c==u'\n':
|
||||||
return "\\n"
|
return u"\\n"
|
||||||
elif c=='\t':
|
elif c==u'\t':
|
||||||
return "\\t"
|
return u"\\t"
|
||||||
elif c=='\r':
|
elif c==u'\r':
|
||||||
return "\\r"
|
return u"\\r"
|
||||||
else:
|
else:
|
||||||
return unicode(c)
|
return c
|
||||||
|
|
||||||
def getCharErrorDisplay(self, c):
|
def getCharErrorDisplay(self, c):
|
||||||
return "'" + self.getErrorDisplayForChar(c) + "'"
|
return u"'" + self.getErrorDisplayForChar(c) + u"'"
|
||||||
|
|
||||||
# Lexers can normally match any char in its vocabulary after matching
|
# Lexers can normally match any char in its vocabulary after matching
|
||||||
# a token, so do the easy thing and just kill a character and hope
|
# a token, so do the easy thing and just kill a character and hope
|
||||||
|
|
|
@ -13,14 +13,19 @@ from antlr4.atn.ATNDeserializationOptions import ATNDeserializationOptions
|
||||||
# This is the earliest supported serialized UUID.
|
# This is the earliest supported serialized UUID.
|
||||||
BASE_SERIALIZED_UUID = UUID("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E")
|
BASE_SERIALIZED_UUID = UUID("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E")
|
||||||
|
|
||||||
|
# This UUID indicates the serialized ATN contains two sets of
|
||||||
|
# IntervalSets, where the second set's values are encoded as
|
||||||
|
# 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
|
||||||
|
ADDED_UNICODE_SMP = UUID("59627784-3BE5-417A-B9EB-8131A7286089")
|
||||||
|
|
||||||
# This list contains all of the currently supported UUIDs, ordered by when
|
# This list contains all of the currently supported UUIDs, ordered by when
|
||||||
# the feature first appeared in this branch.
|
# the feature first appeared in this branch.
|
||||||
SUPPORTED_UUIDS = [ BASE_SERIALIZED_UUID ]
|
SUPPORTED_UUIDS = [ BASE_SERIALIZED_UUID, ADDED_UNICODE_SMP ]
|
||||||
|
|
||||||
SERIALIZED_VERSION = 3
|
SERIALIZED_VERSION = 3
|
||||||
|
|
||||||
# This is the current serialized UUID.
|
# This is the current serialized UUID.
|
||||||
SERIALIZED_UUID = BASE_SERIALIZED_UUID
|
SERIALIZED_UUID = ADDED_UNICODE_SMP
|
||||||
|
|
||||||
class ATNDeserializer (object):
|
class ATNDeserializer (object):
|
||||||
|
|
||||||
|
@ -59,7 +64,13 @@ class ATNDeserializer (object):
|
||||||
self.readStates(atn)
|
self.readStates(atn)
|
||||||
self.readRules(atn)
|
self.readRules(atn)
|
||||||
self.readModes(atn)
|
self.readModes(atn)
|
||||||
sets = self.readSets(atn)
|
sets = []
|
||||||
|
# First, read all sets with 16-bit Unicode code points <= U+FFFF.
|
||||||
|
self.readSets(atn, sets, self.readInt)
|
||||||
|
# Next, if the ATN was serialized with the Unicode SMP feature,
|
||||||
|
# deserialize sets with 32-bit arguments <= U+10FFFF.
|
||||||
|
if self.isFeatureSupported(ADDED_UNICODE_SMP, self.uuid):
|
||||||
|
self.readSets(atn, sets, self.readInt32)
|
||||||
self.readEdges(atn, sets)
|
self.readEdges(atn, sets)
|
||||||
self.readDecisions(atn)
|
self.readDecisions(atn)
|
||||||
self.readLexerActions(atn)
|
self.readLexerActions(atn)
|
||||||
|
@ -170,8 +181,7 @@ class ATNDeserializer (object):
|
||||||
s = self.readInt()
|
s = self.readInt()
|
||||||
atn.modeToStartState.append(atn.states[s])
|
atn.modeToStartState.append(atn.states[s])
|
||||||
|
|
||||||
def readSets(self, atn):
|
def readSets(self, atn, sets, readUnicode):
|
||||||
sets = []
|
|
||||||
m = self.readInt()
|
m = self.readInt()
|
||||||
for i in range(0, m):
|
for i in range(0, m):
|
||||||
iset = IntervalSet()
|
iset = IntervalSet()
|
||||||
|
@ -181,10 +191,9 @@ class ATNDeserializer (object):
|
||||||
if containsEof!=0:
|
if containsEof!=0:
|
||||||
iset.addOne(-1)
|
iset.addOne(-1)
|
||||||
for j in range(0, n):
|
for j in range(0, n):
|
||||||
i1 = self.readInt()
|
i1 = readUnicode()
|
||||||
i2 = self.readInt()
|
i2 = readUnicode()
|
||||||
iset.addRange(Interval(i1, i2 + 1)) # range upper limit is exclusive
|
iset.addRange(Interval(i1, i2 + 1)) # range upper limit is exclusive
|
||||||
return sets
|
|
||||||
|
|
||||||
def readEdges(self, atn, sets):
|
def readEdges(self, atn, sets):
|
||||||
nedges = self.readInt()
|
nedges = self.readInt()
|
||||||
|
|
|
@ -4,6 +4,7 @@
|
||||||
#/
|
#/
|
||||||
from uuid import UUID
|
from uuid import UUID
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
|
from typing import Callable
|
||||||
from antlr4.Token import Token
|
from antlr4.Token import Token
|
||||||
from antlr4.atn.ATN import ATN
|
from antlr4.atn.ATN import ATN
|
||||||
from antlr4.atn.ATNType import ATNType
|
from antlr4.atn.ATNType import ATNType
|
||||||
|
@ -15,14 +16,19 @@ from antlr4.atn.ATNDeserializationOptions import ATNDeserializationOptions
|
||||||
# This is the earliest supported serialized UUID.
|
# This is the earliest supported serialized UUID.
|
||||||
BASE_SERIALIZED_UUID = UUID("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E")
|
BASE_SERIALIZED_UUID = UUID("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E")
|
||||||
|
|
||||||
|
# This UUID indicates the serialized ATN contains two sets of
|
||||||
|
# IntervalSets, where the second set's values are encoded as
|
||||||
|
# 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
|
||||||
|
ADDED_UNICODE_SMP = UUID("59627784-3BE5-417A-B9EB-8131A7286089")
|
||||||
|
|
||||||
# This list contains all of the currently supported UUIDs, ordered by when
|
# This list contains all of the currently supported UUIDs, ordered by when
|
||||||
# the feature first appeared in this branch.
|
# the feature first appeared in this branch.
|
||||||
SUPPORTED_UUIDS = [ BASE_SERIALIZED_UUID ]
|
SUPPORTED_UUIDS = [ BASE_SERIALIZED_UUID, ADDED_UNICODE_SMP ]
|
||||||
|
|
||||||
SERIALIZED_VERSION = 3
|
SERIALIZED_VERSION = 3
|
||||||
|
|
||||||
# This is the current serialized UUID.
|
# This is the current serialized UUID.
|
||||||
SERIALIZED_UUID = BASE_SERIALIZED_UUID
|
SERIALIZED_UUID = ADDED_UNICODE_SMP
|
||||||
|
|
||||||
class ATNDeserializer (object):
|
class ATNDeserializer (object):
|
||||||
|
|
||||||
|
@ -58,7 +64,13 @@ class ATNDeserializer (object):
|
||||||
self.readStates(atn)
|
self.readStates(atn)
|
||||||
self.readRules(atn)
|
self.readRules(atn)
|
||||||
self.readModes(atn)
|
self.readModes(atn)
|
||||||
sets = self.readSets(atn)
|
sets = []
|
||||||
|
# First, read all sets with 16-bit Unicode code points <= U+FFFF.
|
||||||
|
self.readSets(atn, sets, self.readInt)
|
||||||
|
# Next, if the ATN was serialized with the Unicode SMP feature,
|
||||||
|
# deserialize sets with 32-bit arguments <= U+10FFFF.
|
||||||
|
if self.isFeatureSupported(ADDED_UNICODE_SMP, self.uuid):
|
||||||
|
self.readSets(atn, sets, self.readInt32)
|
||||||
self.readEdges(atn, sets)
|
self.readEdges(atn, sets)
|
||||||
self.readDecisions(atn)
|
self.readDecisions(atn)
|
||||||
self.readLexerActions(atn)
|
self.readLexerActions(atn)
|
||||||
|
@ -170,8 +182,7 @@ class ATNDeserializer (object):
|
||||||
s = self.readInt()
|
s = self.readInt()
|
||||||
atn.modeToStartState.append(atn.states[s])
|
atn.modeToStartState.append(atn.states[s])
|
||||||
|
|
||||||
def readSets(self, atn:ATN):
|
def readSets(self, atn:ATN, sets:list, readUnicode:Callable[[], int]):
|
||||||
sets = []
|
|
||||||
m = self.readInt()
|
m = self.readInt()
|
||||||
for i in range(0, m):
|
for i in range(0, m):
|
||||||
iset = IntervalSet()
|
iset = IntervalSet()
|
||||||
|
@ -181,10 +192,9 @@ class ATNDeserializer (object):
|
||||||
if containsEof!=0:
|
if containsEof!=0:
|
||||||
iset.addOne(-1)
|
iset.addOne(-1)
|
||||||
for j in range(0, n):
|
for j in range(0, n):
|
||||||
i1 = self.readInt()
|
i1 = readUnicode()
|
||||||
i2 = self.readInt()
|
i2 = readUnicode()
|
||||||
iset.addRange(range(i1, i2 + 1)) # range upper limit is exclusive
|
iset.addRange(range(i1, i2 + 1)) # range upper limit is exclusive
|
||||||
return sets
|
|
||||||
|
|
||||||
def readEdges(self, atn:ATN, sets:list):
|
def readEdges(self, atn:ATN, sets:list):
|
||||||
nedges = self.readInt()
|
nedges = self.readInt()
|
||||||
|
|
|
@ -26,21 +26,30 @@ public class ATNDeserializer {
|
||||||
/// for the addition of lexer actions encoded as a sequence of
|
/// for the addition of lexer actions encoded as a sequence of
|
||||||
/// {@link org.antlr.v4.runtime.atn.LexerAction} instances.
|
/// {@link org.antlr.v4.runtime.atn.LexerAction} instances.
|
||||||
private static let ADDED_LEXER_ACTIONS: UUID = UUID(uuidString: "AADB8D7E-AEEF-4415-AD2B-8204D6CF042E")!
|
private static let ADDED_LEXER_ACTIONS: UUID = UUID(uuidString: "AADB8D7E-AEEF-4415-AD2B-8204D6CF042E")!
|
||||||
/// This list contains all of the currently supported UUIDs, ordered by when
|
|
||||||
/// the feature first appeared in this branch.
|
/// This UUID indicates the serialized ATN contains two sets of
|
||||||
|
/// IntervalSets, where the second set's values are encoded as
|
||||||
|
/// 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
|
||||||
|
private static let ADDED_UNICODE_SMP: UUID = UUID(uuidString: "59627784-3BE5-417A-B9EB-8131A7286089")!
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This list contains all of the currently supported UUIDs, ordered by when
|
||||||
|
* the feature first appeared in this branch.
|
||||||
|
*/
|
||||||
private static let SUPPORTED_UUIDS: Array<UUID> = {
|
private static let SUPPORTED_UUIDS: Array<UUID> = {
|
||||||
var suuid = Array<UUID>()
|
var suuid = Array<UUID>()
|
||||||
suuid.append(ATNDeserializer.BASE_SERIALIZED_UUID)
|
suuid.append(ATNDeserializer.BASE_SERIALIZED_UUID)
|
||||||
suuid.append(ATNDeserializer.ADDED_PRECEDENCE_TRANSITIONS)
|
suuid.append(ATNDeserializer.ADDED_PRECEDENCE_TRANSITIONS)
|
||||||
suuid.append(ATNDeserializer.ADDED_LEXER_ACTIONS)
|
suuid.append(ATNDeserializer.ADDED_LEXER_ACTIONS)
|
||||||
|
suuid.append(ATNDeserializer.ADDED_UNICODE_SMP)
|
||||||
return suuid
|
return suuid
|
||||||
|
|
||||||
}()
|
}()
|
||||||
|
|
||||||
/// This is the current serialized UUID.
|
/// This is the current serialized UUID.
|
||||||
public static let SERIALIZED_UUID: UUID = {
|
public static let SERIALIZED_UUID: UUID = {
|
||||||
// SERIALIZED_UUID = ADDED_LEXER_ACTIONS;
|
// SERIALIZED_UUID = ADDED_UNICODE_SMP;
|
||||||
return UUID(uuidString: "AADB8D7E-AEEF-4415-AD2B-8204D6CF042E")!
|
return UUID(uuidString: "59627784-3BE5-417A-B9EB-8131A7286089")!
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
|
||||||
|
@ -245,24 +254,14 @@ public class ATNDeserializer {
|
||||||
// SETS
|
// SETS
|
||||||
//
|
//
|
||||||
var sets: Array<IntervalSet> = Array<IntervalSet>()
|
var sets: Array<IntervalSet> = Array<IntervalSet>()
|
||||||
let nsets: Int = toInt(data[p])
|
|
||||||
p += 1
|
|
||||||
for _ in 0..<nsets {
|
|
||||||
let nintervals: Int = toInt(data[p])
|
|
||||||
p += 1
|
|
||||||
let set: IntervalSet = try IntervalSet()
|
|
||||||
sets.append(set)
|
|
||||||
|
|
||||||
let containsEof: Bool = toInt(data[p]) != 0
|
// First, deserialize sets with 16-bit arguments <= U+FFFF.
|
||||||
p += 1
|
try readSets(data, &p, &sets, readUnicodeInt)
|
||||||
if containsEof {
|
|
||||||
try set.add(-1)
|
|
||||||
}
|
|
||||||
|
|
||||||
for _ in 0..<nintervals {
|
// Next, if the ATN was serialized with the Unicode SMP feature,
|
||||||
try set.add(toInt(data[p]), toInt(data[p + 1]))
|
// deserialize sets with 32-bit arguments <= U+10FFFF.
|
||||||
p += 2
|
if isFeatureSupported(ATNDeserializer.ADDED_UNICODE_SMP, uuid) {
|
||||||
}
|
try readSets(data, &p, &sets, readUnicodeInt32)
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
|
@ -521,6 +520,39 @@ public class ATNDeserializer {
|
||||||
return atn
|
return atn
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private func readUnicodeInt(_ data: [Character], _ p: inout Int) -> Int {
|
||||||
|
let result: Int = toInt(data[p])
|
||||||
|
p += 1
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
private func readUnicodeInt32(_ data: [Character], _ p: inout Int) -> Int {
|
||||||
|
let result: Int = toInt32(data, p)
|
||||||
|
p += 2
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
private func readSets(_ data: [Character], _ p: inout Int, _ sets: inout Array<IntervalSet>, _ readUnicode: ([Character], inout Int) -> Int) throws {
|
||||||
|
let nsets: Int = toInt(data[p])
|
||||||
|
p += 1
|
||||||
|
for _ in 0..<nsets {
|
||||||
|
let nintervals: Int = toInt(data[p])
|
||||||
|
p += 1
|
||||||
|
let set: IntervalSet = try IntervalSet()
|
||||||
|
sets.append(set)
|
||||||
|
|
||||||
|
let containsEof: Bool = toInt(data[p]) != 0
|
||||||
|
p += 1
|
||||||
|
if containsEof {
|
||||||
|
try set.add(-1)
|
||||||
|
}
|
||||||
|
|
||||||
|
for _ in 0..<nintervals {
|
||||||
|
try set.add(readUnicode(data, &p), readUnicode(data, &p))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public func deserializeFromJson(_ jsonStr: String) -> ATN {
|
public func deserializeFromJson(_ jsonStr: String) -> ATN {
|
||||||
// let jsonStr = Utils.readFile2String(jsonFileName)
|
// let jsonStr = Utils.readFile2String(jsonFileName)
|
||||||
guard !jsonStr.isEmpty else {
|
guard !jsonStr.isEmpty else {
|
||||||
|
|
|
@ -6,8 +6,8 @@
|
||||||
|
|
||||||
package org.antlr.v4.test.tool;
|
package org.antlr.v4.test.tool;
|
||||||
|
|
||||||
import org.antlr.v4.runtime.ANTLRInputStream;
|
|
||||||
import org.antlr.v4.runtime.CharStream;
|
import org.antlr.v4.runtime.CharStream;
|
||||||
|
import org.antlr.v4.runtime.CharStreams;
|
||||||
import org.antlr.v4.runtime.atn.ATN;
|
import org.antlr.v4.runtime.atn.ATN;
|
||||||
import org.antlr.v4.runtime.atn.ATNState;
|
import org.antlr.v4.runtime.atn.ATNState;
|
||||||
import org.antlr.v4.runtime.misc.Utils;
|
import org.antlr.v4.runtime.misc.Utils;
|
||||||
|
@ -121,6 +121,94 @@ public class TestATNLexerInterpreter extends BaseJavaToolTest {
|
||||||
checkLexerMatches(lg, "c", expecting);
|
checkLexerMatches(lg, "c", expecting);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test public void testLexerSetUnicodeBMP() throws Exception {
|
||||||
|
LexerGrammar lg = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"ID : ('\u611B'|'\u611C')\n ;");
|
||||||
|
String expecting = "ID, EOF";
|
||||||
|
checkLexerMatches(lg, "\u611B", expecting);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testLexerNotSetUnicodeBMP() throws Exception {
|
||||||
|
LexerGrammar lg = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"ID : ~('\u611B'|'\u611C')\n ;");
|
||||||
|
String expecting = "ID, EOF";
|
||||||
|
checkLexerMatches(lg, "\u611D", expecting);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testLexerNotSetUnicodeBMPMatchesSMP() throws Exception {
|
||||||
|
LexerGrammar lg = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"ID : ~('\u611B'|'\u611C')\n ;");
|
||||||
|
String expecting = "ID, EOF";
|
||||||
|
checkLexerMatches(lg, new StringBuilder().appendCodePoint(0x1F4A9).toString(), expecting);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testLexerSetUnicodeSMP() throws Exception {
|
||||||
|
LexerGrammar lg = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"ID : ('\\u{1F4A9}'|'\\u{1F4AA}')\n ;");
|
||||||
|
String expecting = "ID, EOF";
|
||||||
|
checkLexerMatches(lg, new StringBuilder().appendCodePoint(0x1F4A9).toString(), expecting);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testLexerNotBMPSetMatchesUnicodeSMP() throws Exception {
|
||||||
|
LexerGrammar lg = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"ID : ~('a'|'b')\n ;");
|
||||||
|
String expecting = "ID, EOF";
|
||||||
|
checkLexerMatches(lg, new StringBuilder().appendCodePoint(0x1F4A9).toString(), expecting);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testLexerNotBMPSetMatchesBMP() throws Exception {
|
||||||
|
LexerGrammar lg = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"ID : ~('a'|'b')\n ;");
|
||||||
|
String expecting = "ID, EOF";
|
||||||
|
checkLexerMatches(lg, "\u611B", expecting);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testLexerNotBMPSetMatchesSMP() throws Exception {
|
||||||
|
LexerGrammar lg = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"ID : ~('a'|'b')\n ;");
|
||||||
|
String expecting = "ID, EOF";
|
||||||
|
checkLexerMatches(lg, new StringBuilder().appendCodePoint(0x1F4A9).toString(), expecting);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testLexerNotSMPSetMatchesBMP() throws Exception {
|
||||||
|
LexerGrammar lg = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"ID : ~('\\u{1F4A9}'|'\\u{1F4AA}')\n ;");
|
||||||
|
String expecting = "ID, EOF";
|
||||||
|
checkLexerMatches(lg, "\u611B", expecting);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testLexerNotSMPSetMatchesSMP() throws Exception {
|
||||||
|
LexerGrammar lg = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"ID : ~('\\u{1F4A9}'|'\\u{1F4AA}')\n ;");
|
||||||
|
String expecting = "ID, EOF";
|
||||||
|
checkLexerMatches(lg, new StringBuilder().appendCodePoint(0x1D7C0).toString(), expecting);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testLexerRangeUnicodeSMP() throws Exception {
|
||||||
|
LexerGrammar lg = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"ID : ('\\u{1F4A9}'..'\\u{1F4B0}')\n ;");
|
||||||
|
String expecting = "ID, EOF";
|
||||||
|
checkLexerMatches(lg, new StringBuilder().appendCodePoint(0x1F4AF).toString(), expecting);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testLexerRangeUnicodeBMPToSMP() throws Exception {
|
||||||
|
LexerGrammar lg = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"ID : ('\\u611B'..'\\u{1F4B0}')\n ;");
|
||||||
|
String expecting = "ID, EOF";
|
||||||
|
checkLexerMatches(lg, new StringBuilder().appendCodePoint(0x12001).toString(), expecting);
|
||||||
|
}
|
||||||
|
|
||||||
@Test public void testLexerKeywordIDAmbiguity() throws Exception {
|
@Test public void testLexerKeywordIDAmbiguity() throws Exception {
|
||||||
LexerGrammar lg = new LexerGrammar(
|
LexerGrammar lg = new LexerGrammar(
|
||||||
"lexer grammar L;\n"+
|
"lexer grammar L;\n"+
|
||||||
|
@ -293,7 +381,7 @@ public class TestATNLexerInterpreter extends BaseJavaToolTest {
|
||||||
|
|
||||||
protected void checkLexerMatches(LexerGrammar lg, String inputString, String expecting) {
|
protected void checkLexerMatches(LexerGrammar lg, String inputString, String expecting) {
|
||||||
ATN atn = createATN(lg, true);
|
ATN atn = createATN(lg, true);
|
||||||
CharStream input = new ANTLRInputStream(inputString);
|
CharStream input = CharStreams.createWithString(inputString);
|
||||||
ATNState startState = atn.modeNameToStartState.get("DEFAULT_MODE");
|
ATNState startState = atn.modeNameToStartState.get("DEFAULT_MODE");
|
||||||
DOTGenerator dot = new DOTGenerator(lg);
|
DOTGenerator dot = new DOTGenerator(lg);
|
||||||
// System.out.println(dot.getDOT(startState, true));
|
// System.out.println(dot.getDOT(startState, true));
|
||||||
|
|
|
@ -291,6 +291,113 @@ public class TestATNSerialization extends BaseJavaToolTest {
|
||||||
assertEquals(expecting, result);
|
assertEquals(expecting, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test public void testLexerUnicodeSMPLiteralSerializedToSet() throws Exception {
|
||||||
|
LexerGrammar lg = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"INT : '\\u{1F4A9}' ;");
|
||||||
|
String expecting =
|
||||||
|
"max type 1\n" +
|
||||||
|
"0:TOKEN_START -1\n" +
|
||||||
|
"1:RULE_START 0\n" +
|
||||||
|
"2:RULE_STOP 0\n" +
|
||||||
|
"3:BASIC 0\n" +
|
||||||
|
"4:BASIC 0\n" +
|
||||||
|
"rule 0:1 1\n" +
|
||||||
|
"mode 0:0\n" +
|
||||||
|
"0:128169..128169\n" +
|
||||||
|
"0->1 EPSILON 0,0,0\n" +
|
||||||
|
"1->3 EPSILON 0,0,0\n" +
|
||||||
|
"3->4 SET 0,0,0\n" +
|
||||||
|
"4->2 EPSILON 0,0,0\n" +
|
||||||
|
"0:0\n";
|
||||||
|
ATN atn = createATN(lg, true);
|
||||||
|
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
|
||||||
|
assertEquals(expecting, result);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testLexerUnicodeSMPRangeSerializedToSet() throws Exception {
|
||||||
|
LexerGrammar lg = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"INT : ('a'..'\\u{1F4A9}') ;");
|
||||||
|
String expecting =
|
||||||
|
"max type 1\n" +
|
||||||
|
"0:TOKEN_START -1\n" +
|
||||||
|
"1:RULE_START 0\n" +
|
||||||
|
"2:RULE_STOP 0\n" +
|
||||||
|
"3:BASIC 0\n" +
|
||||||
|
"4:BASIC 0\n" +
|
||||||
|
"rule 0:1 1\n" +
|
||||||
|
"mode 0:0\n" +
|
||||||
|
"0:'a'..128169\n" +
|
||||||
|
"0->1 EPSILON 0,0,0\n" +
|
||||||
|
"1->3 EPSILON 0,0,0\n" +
|
||||||
|
"3->4 SET 0,0,0\n" +
|
||||||
|
"4->2 EPSILON 0,0,0\n" +
|
||||||
|
"0:0\n";
|
||||||
|
ATN atn = createATN(lg, true);
|
||||||
|
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
|
||||||
|
assertEquals(expecting, result);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testLexerUnicodeSMPSetSerializedAfterBMPSet() throws Exception {
|
||||||
|
LexerGrammar lg = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"SMP : ('\\u{1F4A9}' | '\\u{1F4AF}') ;\n"+
|
||||||
|
"BMP : ('a' | 'x') ;");
|
||||||
|
String expecting =
|
||||||
|
"max type 2\n" +
|
||||||
|
"0:TOKEN_START -1\n" +
|
||||||
|
"1:RULE_START 0\n" +
|
||||||
|
"2:RULE_STOP 0\n" +
|
||||||
|
"3:RULE_START 1\n" +
|
||||||
|
"4:RULE_STOP 1\n" +
|
||||||
|
"5:BASIC 0\n" +
|
||||||
|
"6:BASIC 0\n" +
|
||||||
|
"7:BASIC 1\n" +
|
||||||
|
"8:BASIC 1\n" +
|
||||||
|
"rule 0:1 1\n" +
|
||||||
|
"rule 1:3 2\n" +
|
||||||
|
"mode 0:0\n" +
|
||||||
|
"0:'a'..'a', 'x'..'x'\n" +
|
||||||
|
"1:128169..128169, 128175..128175\n" +
|
||||||
|
"0->1 EPSILON 0,0,0\n" +
|
||||||
|
"0->3 EPSILON 0,0,0\n" +
|
||||||
|
"1->5 EPSILON 0,0,0\n" +
|
||||||
|
"3->7 EPSILON 0,0,0\n" +
|
||||||
|
"5->6 SET 1,0,0\n" +
|
||||||
|
"6->2 EPSILON 0,0,0\n" +
|
||||||
|
"7->8 SET 0,0,0\n" +
|
||||||
|
"8->4 EPSILON 0,0,0\n" +
|
||||||
|
"0:0\n";
|
||||||
|
ATN atn = createATN(lg, true);
|
||||||
|
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
|
||||||
|
assertEquals(expecting, result);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testLexerNotLiteral() throws Exception {
|
||||||
|
LexerGrammar lg = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"INT : ~'a' ;");
|
||||||
|
String expecting =
|
||||||
|
"max type 1\n" +
|
||||||
|
"0:TOKEN_START -1\n" +
|
||||||
|
"1:RULE_START 0\n" +
|
||||||
|
"2:RULE_STOP 0\n" +
|
||||||
|
"3:BASIC 0\n" +
|
||||||
|
"4:BASIC 0\n" +
|
||||||
|
"rule 0:1 1\n" +
|
||||||
|
"mode 0:0\n" +
|
||||||
|
"0:'a'..'a'\n" +
|
||||||
|
"0->1 EPSILON 0,0,0\n" +
|
||||||
|
"1->3 EPSILON 0,0,0\n" +
|
||||||
|
"3->4 NOT_SET 0,0,0\n" +
|
||||||
|
"4->2 EPSILON 0,0,0\n" +
|
||||||
|
"0:0\n";
|
||||||
|
ATN atn = createATN(lg, true);
|
||||||
|
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
|
||||||
|
assertEquals(expecting, result);
|
||||||
|
}
|
||||||
|
|
||||||
@Test public void testLexerRange() throws Exception {
|
@Test public void testLexerRange() throws Exception {
|
||||||
LexerGrammar lg = new LexerGrammar(
|
LexerGrammar lg = new LexerGrammar(
|
||||||
"lexer grammar L;\n"+
|
"lexer grammar L;\n"+
|
||||||
|
@ -518,6 +625,222 @@ public class TestATNSerialization extends BaseJavaToolTest {
|
||||||
assertEquals(expecting, result);
|
assertEquals(expecting, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test public void testLexerUnicodeUnescapedBMPNotSet() throws Exception {
|
||||||
|
LexerGrammar lg = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"ID : ~('\u4E9C'|'\u4E9D')\n ;");
|
||||||
|
String expecting =
|
||||||
|
"max type 1\n" +
|
||||||
|
"0:TOKEN_START -1\n" +
|
||||||
|
"1:RULE_START 0\n" +
|
||||||
|
"2:RULE_STOP 0\n" +
|
||||||
|
"3:BASIC 0\n" +
|
||||||
|
"4:BASIC 0\n" +
|
||||||
|
"rule 0:1 1\n" +
|
||||||
|
"mode 0:0\n" +
|
||||||
|
"0:'\\u4E9C'..'\\u4E9D'\n" +
|
||||||
|
"0->1 EPSILON 0,0,0\n" +
|
||||||
|
"1->3 EPSILON 0,0,0\n" +
|
||||||
|
"3->4 NOT_SET 0,0,0\n" +
|
||||||
|
"4->2 EPSILON 0,0,0\n" +
|
||||||
|
"0:0\n";
|
||||||
|
ATN atn = createATN(lg, true);
|
||||||
|
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
|
||||||
|
assertEquals(expecting, result);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testLexerUnicodeUnescapedBMPSetWithRange() throws Exception {
|
||||||
|
LexerGrammar lg = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"ID : ('\u4E9C'|'\u4E9D'|'\u6C5F'|'\u305F'..'\u307B')\n ;");
|
||||||
|
String expecting =
|
||||||
|
"max type 1\n" +
|
||||||
|
"0:TOKEN_START -1\n" +
|
||||||
|
"1:RULE_START 0\n" +
|
||||||
|
"2:RULE_STOP 0\n" +
|
||||||
|
"3:BASIC 0\n" +
|
||||||
|
"4:BASIC 0\n" +
|
||||||
|
"rule 0:1 1\n" +
|
||||||
|
"mode 0:0\n" +
|
||||||
|
"0:'\\u305F'..'\\u307B', '\\u4E9C'..'\\u4E9D', '\\u6C5F'..'\\u6C5F'\n" +
|
||||||
|
"0->1 EPSILON 0,0,0\n" +
|
||||||
|
"1->3 EPSILON 0,0,0\n" +
|
||||||
|
"3->4 SET 0,0,0\n" +
|
||||||
|
"4->2 EPSILON 0,0,0\n" +
|
||||||
|
"0:0\n";
|
||||||
|
ATN atn = createATN(lg, true);
|
||||||
|
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
|
||||||
|
assertEquals(expecting, result);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testLexerUnicodeUnescapedBMPNotSetWithRange() throws Exception {
|
||||||
|
LexerGrammar lg = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"ID : ~('\u4E9C'|'\u4E9D'|'\u6C5F'|'\u305F'..'\u307B')\n ;");
|
||||||
|
String expecting =
|
||||||
|
"max type 1\n" +
|
||||||
|
"0:TOKEN_START -1\n" +
|
||||||
|
"1:RULE_START 0\n" +
|
||||||
|
"2:RULE_STOP 0\n" +
|
||||||
|
"3:BASIC 0\n" +
|
||||||
|
"4:BASIC 0\n" +
|
||||||
|
"rule 0:1 1\n" +
|
||||||
|
"mode 0:0\n" +
|
||||||
|
"0:'\\u305F'..'\\u307B', '\\u4E9C'..'\\u4E9D', '\\u6C5F'..'\\u6C5F'\n" +
|
||||||
|
"0->1 EPSILON 0,0,0\n" +
|
||||||
|
"1->3 EPSILON 0,0,0\n" +
|
||||||
|
"3->4 NOT_SET 0,0,0\n" +
|
||||||
|
"4->2 EPSILON 0,0,0\n" +
|
||||||
|
"0:0\n";
|
||||||
|
ATN atn = createATN(lg, true);
|
||||||
|
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
|
||||||
|
assertEquals(expecting, result);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testLexerUnicodeEscapedBMPNotSet() throws Exception {
|
||||||
|
LexerGrammar lg = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"ID : ~('\\u4E9C'|'\\u4E9D')\n ;");
|
||||||
|
String expecting =
|
||||||
|
"max type 1\n" +
|
||||||
|
"0:TOKEN_START -1\n" +
|
||||||
|
"1:RULE_START 0\n" +
|
||||||
|
"2:RULE_STOP 0\n" +
|
||||||
|
"3:BASIC 0\n" +
|
||||||
|
"4:BASIC 0\n" +
|
||||||
|
"rule 0:1 1\n" +
|
||||||
|
"mode 0:0\n" +
|
||||||
|
"0:'\\u4E9C'..'\\u4E9D'\n" +
|
||||||
|
"0->1 EPSILON 0,0,0\n" +
|
||||||
|
"1->3 EPSILON 0,0,0\n" +
|
||||||
|
"3->4 NOT_SET 0,0,0\n" +
|
||||||
|
"4->2 EPSILON 0,0,0\n" +
|
||||||
|
"0:0\n";
|
||||||
|
ATN atn = createATN(lg, true);
|
||||||
|
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
|
||||||
|
assertEquals(expecting, result);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testLexerUnicodeEscapedBMPSetWithRange() throws Exception {
|
||||||
|
LexerGrammar lg = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"ID : ('\\u4E9C'|'\\u4E9D'|'\\u6C5F'|'\\u305F'..'\\u307B')\n ;");
|
||||||
|
String expecting =
|
||||||
|
"max type 1\n" +
|
||||||
|
"0:TOKEN_START -1\n" +
|
||||||
|
"1:RULE_START 0\n" +
|
||||||
|
"2:RULE_STOP 0\n" +
|
||||||
|
"3:BASIC 0\n" +
|
||||||
|
"4:BASIC 0\n" +
|
||||||
|
"rule 0:1 1\n" +
|
||||||
|
"mode 0:0\n" +
|
||||||
|
"0:'\\u305F'..'\\u307B', '\\u4E9C'..'\\u4E9D', '\\u6C5F'..'\\u6C5F'\n" +
|
||||||
|
"0->1 EPSILON 0,0,0\n" +
|
||||||
|
"1->3 EPSILON 0,0,0\n" +
|
||||||
|
"3->4 SET 0,0,0\n" +
|
||||||
|
"4->2 EPSILON 0,0,0\n" +
|
||||||
|
"0:0\n";
|
||||||
|
ATN atn = createATN(lg, true);
|
||||||
|
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
|
||||||
|
assertEquals(expecting, result);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testLexerUnicodeEscapedBMPNotSetWithRange() throws Exception {
|
||||||
|
LexerGrammar lg = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"ID : ~('\\u4E9C'|'\\u4E9D'|'\\u6C5F'|'\\u305F'..'\\u307B')\n ;");
|
||||||
|
String expecting =
|
||||||
|
"max type 1\n" +
|
||||||
|
"0:TOKEN_START -1\n" +
|
||||||
|
"1:RULE_START 0\n" +
|
||||||
|
"2:RULE_STOP 0\n" +
|
||||||
|
"3:BASIC 0\n" +
|
||||||
|
"4:BASIC 0\n" +
|
||||||
|
"rule 0:1 1\n" +
|
||||||
|
"mode 0:0\n" +
|
||||||
|
"0:'\\u305F'..'\\u307B', '\\u4E9C'..'\\u4E9D', '\\u6C5F'..'\\u6C5F'\n" +
|
||||||
|
"0->1 EPSILON 0,0,0\n" +
|
||||||
|
"1->3 EPSILON 0,0,0\n" +
|
||||||
|
"3->4 NOT_SET 0,0,0\n" +
|
||||||
|
"4->2 EPSILON 0,0,0\n" +
|
||||||
|
"0:0\n";
|
||||||
|
ATN atn = createATN(lg, true);
|
||||||
|
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
|
||||||
|
assertEquals(expecting, result);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testLexerUnicodeEscapedSMPNotSet() throws Exception {
|
||||||
|
LexerGrammar lg = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"ID : ~('\\u{1F4A9}'|'\\u{1F4AA}')\n ;");
|
||||||
|
String expecting =
|
||||||
|
"max type 1\n" +
|
||||||
|
"0:TOKEN_START -1\n" +
|
||||||
|
"1:RULE_START 0\n" +
|
||||||
|
"2:RULE_STOP 0\n" +
|
||||||
|
"3:BASIC 0\n" +
|
||||||
|
"4:BASIC 0\n" +
|
||||||
|
"rule 0:1 1\n" +
|
||||||
|
"mode 0:0\n" +
|
||||||
|
"0:128169..128170\n" +
|
||||||
|
"0->1 EPSILON 0,0,0\n" +
|
||||||
|
"1->3 EPSILON 0,0,0\n" +
|
||||||
|
"3->4 NOT_SET 0,0,0\n" +
|
||||||
|
"4->2 EPSILON 0,0,0\n" +
|
||||||
|
"0:0\n";
|
||||||
|
ATN atn = createATN(lg, true);
|
||||||
|
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
|
||||||
|
assertEquals(expecting, result);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testLexerUnicodeEscapedSMPSetWithRange() throws Exception {
|
||||||
|
LexerGrammar lg = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"ID : ('\\u{1F4A9}'|'\\u{1F4AA}'|'\\u{1F441}'|'\\u{1D40F}'..'\\u{1D413}')\n ;");
|
||||||
|
String expecting =
|
||||||
|
"max type 1\n" +
|
||||||
|
"0:TOKEN_START -1\n" +
|
||||||
|
"1:RULE_START 0\n" +
|
||||||
|
"2:RULE_STOP 0\n" +
|
||||||
|
"3:BASIC 0\n" +
|
||||||
|
"4:BASIC 0\n" +
|
||||||
|
"rule 0:1 1\n" +
|
||||||
|
"mode 0:0\n" +
|
||||||
|
"0:119823..119827, 128065..128065, 128169..128170\n" +
|
||||||
|
"0->1 EPSILON 0,0,0\n" +
|
||||||
|
"1->3 EPSILON 0,0,0\n" +
|
||||||
|
"3->4 SET 0,0,0\n" +
|
||||||
|
"4->2 EPSILON 0,0,0\n" +
|
||||||
|
"0:0\n";
|
||||||
|
ATN atn = createATN(lg, true);
|
||||||
|
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
|
||||||
|
assertEquals(expecting, result);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testLexerUnicodeEscapedSMPNotSetWithRange() throws Exception {
|
||||||
|
LexerGrammar lg = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"ID : ~('\\u{1F4A9}'|'\\u{1F4AA}'|'\\u{1F441}'|'\\u{1D40F}'..'\\u{1D413}')\n ;");
|
||||||
|
String expecting =
|
||||||
|
"max type 1\n" +
|
||||||
|
"0:TOKEN_START -1\n" +
|
||||||
|
"1:RULE_START 0\n" +
|
||||||
|
"2:RULE_STOP 0\n" +
|
||||||
|
"3:BASIC 0\n" +
|
||||||
|
"4:BASIC 0\n" +
|
||||||
|
"rule 0:1 1\n" +
|
||||||
|
"mode 0:0\n" +
|
||||||
|
"0:119823..119827, 128065..128065, 128169..128170\n" +
|
||||||
|
"0->1 EPSILON 0,0,0\n" +
|
||||||
|
"1->3 EPSILON 0,0,0\n" +
|
||||||
|
"3->4 NOT_SET 0,0,0\n" +
|
||||||
|
"4->2 EPSILON 0,0,0\n" +
|
||||||
|
"0:0\n";
|
||||||
|
ATN atn = createATN(lg, true);
|
||||||
|
String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames()));
|
||||||
|
assertEquals(expecting, result);
|
||||||
|
}
|
||||||
|
|
||||||
@Test public void testLexerWildcardWithMode() throws Exception {
|
@Test public void testLexerWildcardWithMode() throws Exception {
|
||||||
LexerGrammar lg = new LexerGrammar(
|
LexerGrammar lg = new LexerGrammar(
|
||||||
"lexer grammar L;\n"+
|
"lexer grammar L;\n"+
|
||||||
|
|
|
@ -141,6 +141,24 @@ public class TestTokenTypeAssignment extends BaseJavaToolTest {
|
||||||
assertEquals("'\\n'", literals.toArray()[0]);
|
assertEquals("'\\n'", literals.toArray()[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test public void testParserCharLiteralWithBasicUnicodeEscape() throws Exception {
|
||||||
|
Grammar g = new Grammar(
|
||||||
|
"grammar t;\n"+
|
||||||
|
"a : '\\uABCD';\n");
|
||||||
|
Set<?> literals = g.stringLiteralToTypeMap.keySet();
|
||||||
|
// must store literals how they appear in the antlr grammar
|
||||||
|
assertEquals("'\\uABCD'", literals.toArray()[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testParserCharLiteralWithExtendedUnicodeEscape() throws Exception {
|
||||||
|
Grammar g = new Grammar(
|
||||||
|
"grammar t;\n"+
|
||||||
|
"a : '\\u{1ABCD}';\n");
|
||||||
|
Set<?> literals = g.stringLiteralToTypeMap.keySet();
|
||||||
|
// must store literals how they appear in the antlr grammar
|
||||||
|
assertEquals("'\\u{1ABCD}'", literals.toArray()[0]);
|
||||||
|
}
|
||||||
|
|
||||||
protected void checkSymbols(Grammar g,
|
protected void checkSymbols(Grammar g,
|
||||||
String rulesStr,
|
String rulesStr,
|
||||||
String allValidTokensStr)
|
String allValidTokensStr)
|
||||||
|
|
|
@ -0,0 +1,131 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012-2016 The ANTLR Project. All rights reserved.
|
||||||
|
* Use of this file is governed by the BSD 3-clause license that
|
||||||
|
* can be found in the LICENSE.txt file in the project root.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.antlr.v4.test.tool;
|
||||||
|
|
||||||
|
import org.antlr.v4.gui.Trees;
|
||||||
|
import org.antlr.v4.runtime.CharStreams;
|
||||||
|
import org.antlr.v4.runtime.CommonTokenStream;
|
||||||
|
import org.antlr.v4.runtime.LexerInterpreter;
|
||||||
|
import org.antlr.v4.runtime.tree.ParseTree;
|
||||||
|
import org.antlr.v4.tool.Grammar;
|
||||||
|
import org.antlr.v4.tool.GrammarParserInterpreter;
|
||||||
|
|
||||||
|
import org.junit.Before;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
|
||||||
|
public class TestUnicodeGrammar extends BaseJavaToolTest {
|
||||||
|
@Test
|
||||||
|
public void unicodeBMPLiteralInGrammar() throws Exception {
|
||||||
|
String grammarText =
|
||||||
|
"grammar Unicode;\n" +
|
||||||
|
"r : 'hello' WORLD;\n" +
|
||||||
|
"WORLD : ('world' | '\\u4E16\\u754C' | '\\u1000\\u1019\\u1039\\u1018\\u102C' );\n" +
|
||||||
|
"WS : [ \\t\\r\\n]+ -> skip;\n";
|
||||||
|
String inputText = "hello \u4E16\u754C";
|
||||||
|
assertEquals(
|
||||||
|
"(r:1 " + inputText + ")",
|
||||||
|
parseTreeForGrammarWithInput(
|
||||||
|
grammarText,
|
||||||
|
"r",
|
||||||
|
inputText));
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: This test cannot pass unless we change either the grammar
|
||||||
|
// parser to decode surrogate pair literals to code points (which
|
||||||
|
// would break existing clients) or to treat them as an
|
||||||
|
// alternative:
|
||||||
|
//
|
||||||
|
// '\\uD83C\\uDF0D' -> ('\\u{1F30E}' | '\\uD83C\\uDF0D')
|
||||||
|
//
|
||||||
|
// but I worry that might cause parse ambiguity if we're not careful.
|
||||||
|
//@Test
|
||||||
|
public void unicodeSurrogatePairLiteralInGrammar() throws Exception {
|
||||||
|
String grammarText =
|
||||||
|
"grammar Unicode;\n" +
|
||||||
|
"r : 'hello' WORLD;\n" +
|
||||||
|
"WORLD : ('\\uD83C\\uDF0D' | '\\uD83C\\uDF0E' | '\\uD83C\\uDF0F' );\n" +
|
||||||
|
"WS : [ \\t\\r\\n]+ -> skip;\n";
|
||||||
|
String inputText = new StringBuilder("hello ")
|
||||||
|
.appendCodePoint(0x1F30E)
|
||||||
|
.toString();
|
||||||
|
assertEquals(
|
||||||
|
"(r:1 " + inputText + ")",
|
||||||
|
parseTreeForGrammarWithInput(
|
||||||
|
grammarText,
|
||||||
|
"r",
|
||||||
|
inputText));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void unicodeSMPLiteralInGrammar() throws Exception {
|
||||||
|
String grammarText =
|
||||||
|
"grammar Unicode;\n" +
|
||||||
|
"r : 'hello' WORLD;\n" +
|
||||||
|
"WORLD : ('\\u{1F30D}' | '\\u{1F30E}' | '\\u{1F30F}' );\n" +
|
||||||
|
"WS : [ \\t\\r\\n]+ -> skip;\n";
|
||||||
|
String inputText = new StringBuilder("hello ")
|
||||||
|
.appendCodePoint(0x1F30E)
|
||||||
|
.toString();
|
||||||
|
assertEquals(
|
||||||
|
"(r:1 " + inputText + ")",
|
||||||
|
parseTreeForGrammarWithInput(
|
||||||
|
grammarText,
|
||||||
|
"r",
|
||||||
|
inputText));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void unicodeSMPRangeInGrammar() throws Exception {
|
||||||
|
String grammarText =
|
||||||
|
"grammar Unicode;\n" +
|
||||||
|
"r : 'hello' WORLD;\n" +
|
||||||
|
"WORLD : ('\\u{1F30D}'..'\\u{1F30F}' );\n" +
|
||||||
|
"WS : [ \\t\\r\\n]+ -> skip;\n";
|
||||||
|
String inputText = new StringBuilder("hello ")
|
||||||
|
.appendCodePoint(0x1F30E)
|
||||||
|
.toString();
|
||||||
|
assertEquals(
|
||||||
|
"(r:1 " + inputText + ")",
|
||||||
|
parseTreeForGrammarWithInput(
|
||||||
|
grammarText,
|
||||||
|
"r",
|
||||||
|
inputText));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void matchingDanglingSurrogateInInput() throws Exception {
|
||||||
|
String grammarText =
|
||||||
|
"grammar Unicode;\n" +
|
||||||
|
"r : 'hello' WORLD;\n" +
|
||||||
|
"WORLD : ('\\uD83C' | '\\uD83D' | '\\uD83E' );\n" +
|
||||||
|
"WS : [ \\t\\r\\n]+ -> skip;\n";
|
||||||
|
String inputText = "hello \uD83C";
|
||||||
|
assertEquals(
|
||||||
|
"(r:1 " + inputText + ")",
|
||||||
|
parseTreeForGrammarWithInput(
|
||||||
|
grammarText,
|
||||||
|
"r",
|
||||||
|
inputText));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String parseTreeForGrammarWithInput(
|
||||||
|
String grammarText,
|
||||||
|
String rootRule,
|
||||||
|
String inputText) throws Exception {
|
||||||
|
Grammar grammar = new Grammar(grammarText);
|
||||||
|
LexerInterpreter lexEngine = grammar.createLexerInterpreter(
|
||||||
|
CharStreams.createWithString(inputText));
|
||||||
|
CommonTokenStream tokens = new CommonTokenStream(lexEngine);
|
||||||
|
GrammarParserInterpreter parser = grammar.createGrammarParserInterpreter(tokens);
|
||||||
|
ParseTree parseTree = parser.parse(grammar.rules.get(rootRule).index);
|
||||||
|
InterpreterTreeTextProvider nodeTextProvider =
|
||||||
|
new InterpreterTreeTextProvider(grammar.getRuleNames());
|
||||||
|
return Trees.toStringTree(parseTree, nodeTextProvider);
|
||||||
|
}
|
||||||
|
}
|
|
@ -288,7 +288,7 @@ public partial class <csIdentifier.(parser.name)> : <superClass; null="Parser">
|
||||||
|
|
||||||
public override string[] RuleNames { get { return ruleNames; } }
|
public override string[] RuleNames { get { return ruleNames; } }
|
||||||
|
|
||||||
public override string SerializedAtn { get { return _serializedATN; } }
|
public override string SerializedAtn { get { return new string(_serializedATN); } }
|
||||||
|
|
||||||
static <csIdentifier.(parser.name)>() {
|
static <csIdentifier.(parser.name)>() {
|
||||||
decisionToDFA = new DFA[_ATN.NumberOfDecisions];
|
decisionToDFA = new DFA[_ATN.NumberOfDecisions];
|
||||||
|
@ -1023,7 +1023,7 @@ public partial class <csIdentifier.(lexer.name)> : <superClass; null="Lexer"> {
|
||||||
|
|
||||||
public override string[] ModeNames { get { return modeNames; } }
|
public override string[] ModeNames { get { return modeNames; } }
|
||||||
|
|
||||||
public override string SerializedAtn { get { return _serializedATN; } }
|
public override string SerializedAtn { get { return new string(_serializedATN); } }
|
||||||
|
|
||||||
static <csIdentifier.(lexer.name)>() {
|
static <csIdentifier.(lexer.name)>() {
|
||||||
decisionToDFA = new DFA[_ATN.NumberOfDecisions];
|
decisionToDFA = new DFA[_ATN.NumberOfDecisions];
|
||||||
|
@ -1038,16 +1038,12 @@ public partial class <csIdentifier.(lexer.name)> : <superClass; null="Lexer"> {
|
||||||
|
|
||||||
|
|
||||||
SerializedATN(model) ::= <<
|
SerializedATN(model) ::= <<
|
||||||
private static string _serializedATN = _serializeATN();
|
private static char[] _serializedATN = {
|
||||||
private static string _serializeATN()
|
<model.serialized; separator=", ", wrap>,
|
||||||
{
|
};
|
||||||
StringBuilder sb = new StringBuilder();
|
|
||||||
sb.Append("<model.serialized; wrap={");<\n><\t>sb.Append("}>");
|
|
||||||
return sb.ToString();
|
|
||||||
}
|
|
||||||
|
|
||||||
public static readonly ATN _ATN =
|
public static readonly ATN _ATN =
|
||||||
new ATNDeserializer().Deserialize(_serializedATN.ToCharArray());
|
new ATNDeserializer().Deserialize(_serializedATN);
|
||||||
|
|
||||||
|
|
||||||
>>
|
>>
|
||||||
|
|
|
@ -192,23 +192,23 @@ atn::ATN <lexer.name>::_atn;
|
||||||
std::vector\<uint16_t> <lexer.name>::_serializedATN;
|
std::vector\<uint16_t> <lexer.name>::_serializedATN;
|
||||||
|
|
||||||
std::vector\<std::string> <lexer.name>::_ruleNames = {
|
std::vector\<std::string> <lexer.name>::_ruleNames = {
|
||||||
<lexer.ruleNames: {r | "<r>"}; separator = ", ", wrap, anchor>
|
<lexer.ruleNames: {r | u8"<r>"}; separator = ", ", wrap, anchor>
|
||||||
};
|
};
|
||||||
|
|
||||||
std::vector\<std::string> <lexer.name>::_channelNames = {
|
std::vector\<std::string> <lexer.name>::_channelNames = {
|
||||||
"DEFAULT_TOKEN_CHANNEL", "HIDDEN"<if (lexer.channels)>, <lexer.channels: {c | "<c>"}; separator = ", ", wrap, anchor><endif>
|
"DEFAULT_TOKEN_CHANNEL", "HIDDEN"<if (lexer.channels)>, <lexer.channels: {c | u8"<c>"}; separator = ", ", wrap, anchor><endif>
|
||||||
};
|
};
|
||||||
|
|
||||||
std::vector\<std::string> <lexer.name>::_modeNames = {
|
std::vector\<std::string> <lexer.name>::_modeNames = {
|
||||||
<lexer.modes: {m | "<m>"}; separator = ", ", wrap, anchor>
|
<lexer.modes: {m | u8"<m>"}; separator = ", ", wrap, anchor>
|
||||||
};
|
};
|
||||||
|
|
||||||
std::vector\<std::string> <lexer.name>::_literalNames = {
|
std::vector\<std::string> <lexer.name>::_literalNames = {
|
||||||
<lexer.literalNames: {t | <t>}; null = "\"\"", separator = ", ", wrap, anchor>
|
<lexer.literalNames: {t | u8<t>}; null = "\"\"", separator = ", ", wrap, anchor>
|
||||||
};
|
};
|
||||||
|
|
||||||
std::vector\<std::string> <lexer.name>::_symbolicNames = {
|
std::vector\<std::string> <lexer.name>::_symbolicNames = {
|
||||||
<lexer.symbolicNames: {t | <t>}; null = "\"\"", separator = ", ", wrap, anchor>
|
<lexer.symbolicNames: {t | u8<t>}; null = "\"\"", separator = ", ", wrap, anchor>
|
||||||
};
|
};
|
||||||
|
|
||||||
dfa::Vocabulary <lexer.name>::_vocabulary(_literalNames, _symbolicNames);
|
dfa::Vocabulary <lexer.name>::_vocabulary(_literalNames, _symbolicNames);
|
||||||
|
|
|
@ -46,7 +46,7 @@ public class CSharpTarget extends Target {
|
||||||
formatted = String.format("\\x%X", v & 0xFFFF);
|
formatted = String.format("\\x%X", v & 0xFFFF);
|
||||||
}
|
}
|
||||||
|
|
||||||
return formatted;
|
return "'" + formatted + "'";
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -45,10 +45,9 @@ public class CharSupport {
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Return a string representing the escaped char for code c. E.g., If c
|
/** Return a string representing the escaped char for code c. E.g., If c
|
||||||
* has value 0x100, you will get "\u0100". ASCII gets the usual
|
* has value 0x100, you will get "\\u0100". ASCII gets the usual
|
||||||
* char (non-hex) representation. Control characters are spit out
|
* char (non-hex) representation. Non-ASCII characters are spit out
|
||||||
* as unicode. While this is specially set up for returning Java strings,
|
* as \\uXXXX or \\u{XXXXXX} escapes.
|
||||||
* it can be used by any language target that has the same syntax. :)
|
|
||||||
*/
|
*/
|
||||||
public static String getANTLRCharLiteralForChar(int c) {
|
public static String getANTLRCharLiteralForChar(int c) {
|
||||||
if ( c< Lexer.MIN_CHAR_VALUE ) {
|
if ( c< Lexer.MIN_CHAR_VALUE ) {
|
||||||
|
@ -67,11 +66,11 @@ public class CharSupport {
|
||||||
}
|
}
|
||||||
return '\''+Character.toString((char)c)+'\'';
|
return '\''+Character.toString((char)c)+'\'';
|
||||||
}
|
}
|
||||||
// turn on the bit above max "\uFFFF" value so that we pad with zeros
|
if (c <= 0xFFFF) {
|
||||||
// then only take last 4 digits
|
return String.format("\\u%04X", c);
|
||||||
String hex = Integer.toHexString(c|0x10000).toUpperCase().substring(1,5);
|
} else {
|
||||||
String unicodeStr = "'\\u"+hex+"'";
|
return String.format("\\u{%06X}", c);
|
||||||
return unicodeStr;
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Given a literal like (the 3 char sequence with single quotes) 'a',
|
/** Given a literal like (the 3 char sequence with single quotes) 'a',
|
||||||
|
@ -92,11 +91,25 @@ public class CharSupport {
|
||||||
if ( literal.charAt(i) == '\\' ) {
|
if ( literal.charAt(i) == '\\' ) {
|
||||||
end = i+2;
|
end = i+2;
|
||||||
if ( i+1 < n && literal.charAt(i+1) == 'u' ) {
|
if ( i+1 < n && literal.charAt(i+1) == 'u' ) {
|
||||||
for (end = i + 2; end < i + 6; end++) {
|
if ( i+2 < n && literal.charAt(i+2) == '{' ) { // extended escape sequence
|
||||||
if ( end>n ) return null; // invalid escape sequence.
|
end = i + 3;
|
||||||
char charAt = literal.charAt(end);
|
while (true) {
|
||||||
if (!Character.isDigit(charAt) && !(charAt >= 'a' && charAt <= 'f') && !(charAt >= 'A' && charAt <= 'F')) {
|
if ( end + 1 > n ) return null; // invalid escape sequence.
|
||||||
return null; // invalid escape sequence.
|
char charAt = literal.charAt(end++);
|
||||||
|
if (charAt == '}') {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (!Character.isDigit(charAt) && !(charAt >= 'a' && charAt <= 'f') && !(charAt >= 'A' && charAt <= 'F')) {
|
||||||
|
return null; // invalid escape sequence.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (end = i + 2; end < i + 6; end++) {
|
||||||
|
if ( end>n ) return null; // invalid escape sequence.
|
||||||
|
char charAt = literal.charAt(end);
|
||||||
|
if (!Character.isDigit(charAt) && !(charAt >= 'a' && charAt <= 'f') && !(charAt >= 'A' && charAt <= 'F')) {
|
||||||
|
return null; // invalid escape sequence.
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -107,13 +120,13 @@ public class CharSupport {
|
||||||
if ( c==-1 ) {
|
if ( c==-1 ) {
|
||||||
return null; // invalid escape sequence.
|
return null; // invalid escape sequence.
|
||||||
}
|
}
|
||||||
else buf.append((char)c);
|
else buf.appendCodePoint(c);
|
||||||
i = end;
|
i = end;
|
||||||
}
|
}
|
||||||
return buf.toString();
|
return buf.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Given char x or \t or \u1234 return the char value;
|
/** Given char x or \\t or \\u1234 return the char value;
|
||||||
* Unnecessary escapes like '\{' yield -1.
|
* Unnecessary escapes like '\{' yield -1.
|
||||||
*/
|
*/
|
||||||
public static int getCharValueFromCharInGrammarLiteral(String cstr) {
|
public static int getCharValueFromCharInGrammarLiteral(String cstr) {
|
||||||
|
@ -130,9 +143,31 @@ public class CharSupport {
|
||||||
if ( charVal==0 ) return -1;
|
if ( charVal==0 ) return -1;
|
||||||
return charVal;
|
return charVal;
|
||||||
case 6:
|
case 6:
|
||||||
// '\u1234'
|
// '\\u1234' or '\\u{12}'
|
||||||
if ( !cstr.startsWith("\\u") ) return -1;
|
if ( !cstr.startsWith("\\u") ) return -1;
|
||||||
String unicodeChars = cstr.substring(2, cstr.length());
|
int startOff;
|
||||||
|
int endOff;
|
||||||
|
if ( cstr.charAt(2) == '{' ) {
|
||||||
|
startOff = 3;
|
||||||
|
endOff = cstr.indexOf('}');
|
||||||
|
} else {
|
||||||
|
startOff = 2;
|
||||||
|
endOff = cstr.length();
|
||||||
|
}
|
||||||
|
return parseHexValue(cstr, startOff, endOff);
|
||||||
|
default:
|
||||||
|
if ( cstr.startsWith("\\u{") ) {
|
||||||
|
return parseHexValue(cstr, 3, cstr.indexOf('}'));
|
||||||
|
}
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int parseHexValue(String cstr, int startOff, int endOff) {
|
||||||
|
if (startOff < 0 || endOff < 0) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
String unicodeChars = cstr.substring(startOff, endOff);
|
||||||
int result = -1;
|
int result = -1;
|
||||||
try {
|
try {
|
||||||
result = Integer.parseInt(unicodeChars, 16);
|
result = Integer.parseInt(unicodeChars, 16);
|
||||||
|
@ -140,9 +175,6 @@ public class CharSupport {
|
||||||
catch (NumberFormatException e) {
|
catch (NumberFormatException e) {
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
default:
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String capitalize(String s) {
|
public static String capitalize(String s) {
|
||||||
|
|
|
@ -615,8 +615,8 @@ SRC : 'src' WSCHARS+ file=ACTION_STRING_LITERAL WSCHARS+ line=INT
|
||||||
//
|
//
|
||||||
// ANTLR makes no distinction between a single character literal and a
|
// ANTLR makes no distinction between a single character literal and a
|
||||||
// multi-character string. All literals are single quote delimited and
|
// multi-character string. All literals are single quote delimited and
|
||||||
// may contain unicode escape sequences of the form \uxxxx, where x
|
// may contain unicode escape sequences of the form \uxxxx or \u{xxxxxx},
|
||||||
// is a valid hexadecimal number (as per Java basically).
|
// where x is a valid hexadecimal number.
|
||||||
STRING_LITERAL
|
STRING_LITERAL
|
||||||
: '\'' ( ( ESC_SEQ | ~('\\'|'\''|'\r'|'\n') ) )*
|
: '\'' ( ( ESC_SEQ | ~('\\'|'\''|'\r'|'\n') ) )*
|
||||||
( '\''
|
( '\''
|
||||||
|
@ -652,6 +652,10 @@ ESC_SEQ
|
||||||
//
|
//
|
||||||
UNICODE_ESC
|
UNICODE_ESC
|
||||||
|
|
||||||
|
| // A Swift/Hack style Unicode escape sequence
|
||||||
|
//
|
||||||
|
UNICODE_EXTENDED_ESC
|
||||||
|
|
||||||
| // An illegal escape sequence
|
| // An illegal escape sequence
|
||||||
//
|
//
|
||||||
{
|
{
|
||||||
|
@ -720,6 +724,27 @@ UNICODE_ESC
|
||||||
}
|
}
|
||||||
;
|
;
|
||||||
|
|
||||||
|
fragment
|
||||||
|
UNICODE_EXTENDED_ESC
|
||||||
|
: 'u{' // Leadin for unicode extended escape sequence
|
||||||
|
|
||||||
|
HEX_DIGIT+ // One or more hexadecimal digits
|
||||||
|
|
||||||
|
'}' // Leadout for unicode extended escape sequence
|
||||||
|
|
||||||
|
// Now check the digit count and issue an error if we need to
|
||||||
|
{
|
||||||
|
int numDigits = getCharIndex()-state.tokenStartCharIndex-6;
|
||||||
|
if (numDigits > 6) {
|
||||||
|
Token t = new CommonToken(input, state.type, state.channel, state.tokenStartCharIndex, getCharIndex()-1);
|
||||||
|
t.setText(t.getText());
|
||||||
|
t.setLine(input.getLine());
|
||||||
|
t.setCharPositionInLine(input.getCharPositionInLine()-numDigits);
|
||||||
|
grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, t);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
;
|
||||||
|
|
||||||
// ----------
|
// ----------
|
||||||
// Whitespace
|
// Whitespace
|
||||||
//
|
//
|
||||||
|
|
Loading…
Reference in New Issue