From fd4246cf3f90f0e4ee961b8254a8c2220baf0c16 Mon Sep 17 00:00:00 2001 From: Ben Hamilton Date: Fri, 17 Feb 2017 13:35:00 -0800 Subject: [PATCH] Implement new extended Unicode escape \u{10ABCD}. Bump UUID. Add lots more tests. --- .../runtime/descriptors/SetsDescriptors.java | 278 +++++++++++++++ .../Antlr4.Runtime/Atn/ATNDeserializer.cs | 32 +- .../Cpp/runtime/src/atn/ATNDeserializer.cpp | 72 +++- runtime/Cpp/runtime/src/atn/ATNDeserializer.h | 7 + runtime/Cpp/runtime/src/misc/Interval.cpp | 9 +- runtime/Cpp/runtime/src/misc/Interval.h | 2 +- runtime/Cpp/runtime/src/misc/IntervalSet.cpp | 8 +- runtime/Cpp/runtime/src/misc/IntervalSet.h | 4 +- runtime/Go/antlr/atn_deserializer.go | 28 +- .../antlr/v4/runtime/atn/ATNDeserializer.java | 97 +++++- .../antlr/v4/runtime/atn/ATNSerializer.java | 165 ++++++--- .../src/antlr4/atn/ATNDeserializer.js | 32 +- runtime/Python2/src/antlr4/Lexer.py | 18 +- .../Python2/src/antlr4/atn/ATNDeserializer.py | 25 +- .../Python3/src/antlr4/atn/ATNDeserializer.py | 26 +- .../Sources/Antlr4/atn/ATNDeserializer.swift | 72 ++-- .../v4/test/tool/TestATNLexerInterpreter.java | 92 ++++- .../v4/test/tool/TestATNSerialization.java | 323 ++++++++++++++++++ .../v4/test/tool/TestTokenTypeAssignment.java | 18 + .../v4/test/tool/TestUnicodeGrammar.java | 131 +++++++ .../tool/templates/codegen/CSharp/CSharp.stg | 16 +- .../v4/tool/templates/codegen/Cpp/Cpp.stg | 10 +- .../antlr/v4/codegen/target/CSharpTarget.java | 2 +- tool/src/org/antlr/v4/misc/CharSupport.java | 74 ++-- tool/src/org/antlr/v4/parse/ANTLRLexer.g | 29 +- 25 files changed, 1361 insertions(+), 209 deletions(-) create mode 100644 tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeGrammar.java diff --git a/runtime-testsuite/test/org/antlr/v4/test/runtime/descriptors/SetsDescriptors.java b/runtime-testsuite/test/org/antlr/v4/test/runtime/descriptors/SetsDescriptors.java index 8519077e3..6fe3059cf 100644 --- 
a/runtime-testsuite/test/org/antlr/v4/test/runtime/descriptors/SetsDescriptors.java +++ b/runtime-testsuite/test/org/antlr/v4/test/runtime/descriptors/SetsDescriptors.java @@ -402,4 +402,282 @@ public class SetsDescriptors { public String grammar; } + + public static class UnicodeUnescapedBMPSet extends BaseParserTestDescriptor { + public String input = "a\u00E4\u3042\u4E9Cc"; + public String output = "a\u00E4\u3042\u4E9Cc\n"; + public String errors = null; + public String startRule = "a"; + public String grammarName = "T"; + + /** + grammar T; + a : LETTERS {} ; + // These are actually not escaped -- Java passes the + // raw unescaped Unicode values to the grammar compiler. + LETTERS : ('a'|'\u00E4'|'\u4E9C'|'\u3042')* 'c'; + */ + @CommentHasStringValue + public String grammar; + + } + + public static class UnicodeUnescapedBMPRangeSet extends BaseParserTestDescriptor { + public String input = "a\u00E1\u00E4\u00E1\u00E2\u00E5d"; + public String output = "a\u00E1\u00E4\u00E1\u00E2\u00E5d\n"; + public String errors = null; + public String startRule = "a"; + public String grammarName = "T"; + + /** + grammar T; + a : LETTERS* 'd' {} ; + // These are actually not escaped -- Java passes the + // raw unescaped Unicode values to the grammar compiler. + LETTERS : ('a'|'\u00E0'..'\u00E5'); + */ + @CommentHasStringValue + public String grammar; + + } + + public static class UnicodeEscapedBMPSet extends BaseParserTestDescriptor { + public String input = "a\u00E4\u3042\u4E9Cc"; + public String output = "a\u00E4\u3042\u4E9Cc\n"; + public String errors = null; + public String startRule = "a"; + public String grammarName = "T"; + + /** + grammar T; + a : LETTERS {} ; + // Note the double-backslash to avoid Java passing + // unescaped values as part of the grammar. 
+ LETTERS : ('a'|'\\u00E4'|'\\u4E9C'|'\\u3042')* 'c'; + */ + @CommentHasStringValue + public String grammar; + + } + + public static class UnicodeEscapedBMPRangeSet extends BaseParserTestDescriptor { + public String input = "a\u00E1\u00E4\u00E1\u00E2\u00E5d"; + public String output = "a\u00E1\u00E4\u00E1\u00E2\u00E5d\n"; + public String errors = null; + public String startRule = "a"; + public String grammarName = "T"; + + /** + grammar T; + a : LETTERS* 'd' {} ; + // Note the double-backslash to avoid Java passing + // unescaped values as part of the grammar. + LETTERS : ('a'|'\\u00E0'..'\\u00E5'); + */ + @CommentHasStringValue + public String grammar; + + } + + // TODO(bhamiltoncx): This needs to be an error, the V3 + // runtime used by the tool doesn't really understand unescaped code points > + // U+FFFF. + // public static class UnicodeUnescapedSMPSet extends BaseParserTestDescriptor { + // public String input = new StringBuilder() + // .append("a") + // .appendCodePoint(0x1D5C2) + // .appendCodePoint(0x1D5CE) + // .appendCodePoint(0x1D5BA) + // .append("c") + // .toString(); + // public String output = new StringBuilder() + // .append("a") + // .appendCodePoint(0x1D5C2) + // .appendCodePoint(0x1D5CE) + // .appendCodePoint(0x1D5BA) + // .append("c\n") + // .toString(); + // public String errors = null; + // public String startRule = "a"; + // public String grammarName = "T"; + + // /** + // grammar T; + // a : LETTERS {} ; + // // These are actually not escaped -- Java passes the + // // raw unescaped Unicode values to the grammar compiler. + // // + // // Each sequence is the UTF-16 encoding of a raw Unicode + // // SMP code point. 
+ // LETTERS : ('a'|'\uD835\uDDBA'|'\uD835\uDDBE'|'\uD835\uDDC2'|'\uD835\uDDC8'|'\uD835\uDDCE')* 'c'; + // */ + // @CommentHasStringValue + // public String grammar; + + // } + + public static class UnicodeEscapedSMPSet extends BaseParserTestDescriptor { + public String input = new StringBuilder() + .append("a") + .appendCodePoint(0x1D5C2) + .appendCodePoint(0x1D5CE) + .appendCodePoint(0x1D5BA) + .append("c") + .toString(); + public String output = new StringBuilder() + .append("a") + .appendCodePoint(0x1D5C2) + .appendCodePoint(0x1D5CE) + .appendCodePoint(0x1D5BA) + .append("c\n") + .toString(); + public String errors = null; + public String startRule = "a"; + public String grammarName = "T"; + + /** + grammar T; + a : LETTERS {} ; + // Note the double-backslash to avoid Java passing + // unescaped values as part of the grammar. + LETTERS : ('a'|'\\u{1D5BA}'|'\\u{1D5BE}'|'\\u{1D5C2}'|'\\u{1D5C8}'|'\\u{1D5CE}')* 'c'; + */ + @CommentHasStringValue + public String grammar; + + } + + // Turns out Tool.java uses ANTLR 3's runtime, which means it can't use + // CodePointCharStream to understand unescaped code points > U+FFFF. + // + // TODO(bhamiltoncx): This needs to be an error, since we don't currently plan + // to port Tool.java to use ANTLR 4's runtime. 
+ + // public static class UnicodeUnescapedSMPRangeSet extends BaseParserTestDescriptor { + // public String input = new StringBuilder() + // .append("a") + // .appendCodePoint(0x1D5C2) + // .appendCodePoint(0x1D5CE) + // .appendCodePoint(0x1D5BA) + // .append("d") + // .toString(); + // public String output = new StringBuilder() + // .append("a") + // .appendCodePoint(0x1D5C2) + // .appendCodePoint(0x1D5CE) + // .appendCodePoint(0x1D5BA) + // .append("d\n") + // .toString(); + // public String errors = null; + // public String startRule = "a"; + // public String grammarName = "T"; + + // /** + // grammar T; + // a : LETTERS* 'd' {} ; + // // These are actually not escaped -- Java passes the + // // raw unescaped Unicode values to the grammar compiler. + // LETTERS : ('a'|'\uD83D\uDE00'..'\uD83E\uDD43'); + // */ + // @CommentHasStringValue + // public String grammar; + + // } + + public static class UnicodeEscapedSMPRangeSet extends BaseParserTestDescriptor { + public String input = new StringBuilder() + .append("a") + .appendCodePoint(0x1F609) + .appendCodePoint(0x1F942) + .appendCodePoint(0x1F700) + .append("d") + .toString(); + public String output = new StringBuilder() + .append("a") + .appendCodePoint(0x1F609) + .appendCodePoint(0x1F942) + .appendCodePoint(0x1F700) + .append("d\n") + .toString(); + public String errors = null; + public String startRule = "a"; + public String grammarName = "T"; + + /** + grammar T; + a : LETTERS* 'd' {} ; + // Note the double-backslash to avoid Java passing + // unescaped values as part of the grammar. + LETTERS : ('a'|'\\u{1F600}'..'\\u{1F943}'); + */ + @CommentHasStringValue + public String grammar; + + } + + public static class UnicodeEscapedSMPRangeSetMismatch extends BaseParserTestDescriptor { + // Test the code points just before and just after the range. 
+ public String input = new StringBuilder() + .append("a") + .appendCodePoint(0x1F5FF) + .appendCodePoint(0x1F944) + .append("d") + .toString(); + public String output = "ad\n"; + public String errors = new StringBuilder() + .append("line 1:1 token recognition error at: '") + .appendCodePoint(0x1F5FF) + .append("'\n") + .append("line 1:2 token recognition error at: '") + .appendCodePoint(0x1F944) + .append("'\n") + .toString(); + public String startRule = "a"; + public String grammarName = "T"; + + /** + grammar T; + a : LETTERS* 'd' {} ; + // Note the double-backslash to avoid Java passing + // unescaped values as part of the grammar. + LETTERS : ('a'|'\\u{1F600}'..'\\u{1F943}'); + */ + @CommentHasStringValue + public String grammar; + + } + + public static class UnicodeNegatedBMPSetIncludesSMPCodePoints extends BaseParserTestDescriptor { + public String input = "a\uD83D\uDE33\uD83D\uDE21\uD83D\uDE1D\uD83E\uDD13c"; + public String output = "a\uD83D\uDE33\uD83D\uDE21\uD83D\uDE1D\uD83E\uDD13c\n"; + public String errors = null; + public String startRule = "a"; + public String grammarName = "T"; + + /** + grammar T; + a : LETTERS {} ; + LETTERS : 'a' ~('b')+ 'c'; + */ + @CommentHasStringValue + public String grammar; + + } + + public static class UnicodeNegatedSMPSetIncludesBMPCodePoints extends BaseParserTestDescriptor { + public String input = "abc"; + public String output = "abc\n"; + public String errors = null; + public String startRule = "a"; + public String grammarName = "T"; + + /** + grammar T; + a : LETTERS {} ; + LETTERS : 'a' ~('\\u{1F600}'..'\\u{1F943}')+ 'c'; + */ + @CommentHasStringValue + public String grammar; + + } } diff --git a/runtime/CSharp/runtime/CSharp/Antlr4.Runtime/Atn/ATNDeserializer.cs b/runtime/CSharp/runtime/CSharp/Antlr4.Runtime/Atn/ATNDeserializer.cs index ad1804ccb..871862804 100644 --- a/runtime/CSharp/runtime/CSharp/Antlr4.Runtime/Atn/ATNDeserializer.cs +++ b/runtime/CSharp/runtime/CSharp/Antlr4.Runtime/Atn/ATNDeserializer.cs @@ 
-22,6 +22,18 @@ namespace Antlr4.Runtime.Atn /// This is the earliest supported serialized UUID. private static readonly Guid BaseSerializedUuid; + /// + /// This UUID indicates the serialized ATN contains two sets of + /// IntervalSets, where the second set's values are encoded as + /// 32-bit integers to support the full Unicode SMP range up to U+10FFFF. + /// + /// + /// This UUID indicates the serialized ATN contains two sets of + /// IntervalSets, where the second set's values are encoded as + /// 32-bit integers to support the full Unicode SMP range up to U+10FFFF. + /// + private static readonly Guid AddedUnicodeSmp; + /// /// This list contains all of the currently supported UUIDs, ordered by when /// the feature first appeared in this branch. @@ -39,14 +51,18 @@ namespace Antlr4.Runtime.Atn static ATNDeserializer() { BaseSerializedUuid = new Guid("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E"); + AddedUnicodeSmp = new Guid("59627784-3BE5-417A-B9EB-8131A7286089"); SupportedUuids = new List(); SupportedUuids.Add(BaseSerializedUuid); - SerializedUuid = BaseSerializedUuid; + SupportedUuids.Add(AddedUnicodeSmp); + SerializedUuid = AddedUnicodeSmp; } [NotNull] private readonly ATNDeserializationOptions deserializationOptions; + private Guid uuid; + public ATNDeserializer() : this(ATNDeserializationOptions.Default) { @@ -115,7 +131,11 @@ namespace Antlr4.Runtime.Atn ReadStates (atn); ReadRules (atn); ReadModes (atn); - IList sets = ReadSets (atn); + IList sets = new List(); + ReadSets (atn, sets, this.ReadInt); + if (IsFeatureSupported(AddedUnicodeSmp, uuid)) { + ReadSets (atn, sets, this.ReadInt32); + } ReadEdges (atn, sets); ReadDecisions (atn); ReadLexerActions (atn); @@ -378,12 +398,11 @@ namespace Antlr4.Runtime.Atn } } - protected internal virtual IList ReadSets(ATN atn) + protected internal virtual void ReadSets(ATN atn, IList sets, Func readUnicode) { // // SETS // - IList sets = new List(); int nsets = ReadInt(); for (int i_8 = 0; i_8 < nsets; i_8++) { @@ 
-397,10 +416,9 @@ namespace Antlr4.Runtime.Atn } for (int j = 0; j < nintervals; j++) { - set.Add(ReadInt(), ReadInt()); + set.Add(readUnicode(), readUnicode()); } } - return sets; } protected internal virtual void ReadModes(ATN atn) @@ -530,7 +548,7 @@ namespace Antlr4.Runtime.Atn protected internal virtual void CheckUUID() { - Guid uuid = ReadUUID(); + uuid = ReadUUID(); if (!SupportedUuids.Contains(uuid)) { string reason = string.Format(CultureInfo.CurrentCulture, "Could not deserialize ATN with UUID {0} (expected {1} or a legacy UUID).", uuid, SerializedUuid); diff --git a/runtime/Cpp/runtime/src/atn/ATNDeserializer.cpp b/runtime/Cpp/runtime/src/atn/ATNDeserializer.cpp index 1d7a3f562..7fe7bee33 100755 --- a/runtime/Cpp/runtime/src/atn/ATNDeserializer.cpp +++ b/runtime/Cpp/runtime/src/atn/ATNDeserializer.cpp @@ -57,6 +57,51 @@ using namespace antlrcpp; const size_t ATNDeserializer::SERIALIZED_VERSION = 3; +namespace { + +uint32_t deserializeInt32(const std::vector& data, size_t offset) { + return (uint32_t)data[offset] | ((uint32_t)data[offset + 1] << 16); +} + +ssize_t readUnicodeInt(const std::vector& data, int& p) { + return static_cast(data[p++]); +} + +ssize_t readUnicodeInt32(const std::vector& data, int& p) { + auto result = deserializeInt32(data, p); + p += 2; + return static_cast(result); +} + +// We templatize this on the function type so the optimizer can inline +// the 16- or 32-bit readUnicodeInt/readUnicodeInt32 as needed. 
+template +void deserializeSets( + const std::vector& data, + int& p, + std::vector& sets, + F readUnicode) { + int nsets = data[p++]; + for (int i = 0; i < nsets; i++) { + int nintervals = data[p++]; + misc::IntervalSet set; + + bool containsEof = data[p++] != 0; + if (containsEof) { + set.add(-1); + } + + for (int j = 0; j < nintervals; j++) { + auto a = readUnicode(data, p); + auto b = readUnicode(data, p); + set.add(a, b); + } + sets.push_back(set); + } +} + +} + ATNDeserializer::ATNDeserializer(): ATNDeserializer(ATNDeserializationOptions::getDefaultOptions()) { } @@ -75,8 +120,12 @@ Guid ATNDeserializer::ADDED_LEXER_ACTIONS() { return Guid("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E"); } +Guid ATNDeserializer::ADDED_UNICODE_SMP() { + return Guid("59627784-3BE5-417A-B9EB-8131A7286089"); +} + Guid ATNDeserializer::SERIALIZED_UUID() { - return ADDED_LEXER_ACTIONS(); + return ADDED_UNICODE_SMP(); } Guid ATNDeserializer::BASE_SERIALIZED_UUID() { @@ -84,7 +133,7 @@ Guid ATNDeserializer::BASE_SERIALIZED_UUID() { } std::vector& ATNDeserializer::SUPPORTED_UUIDS() { - static std::vector singleton = { BASE_SERIALIZED_UUID(), ADDED_PRECEDENCE_TRANSITIONS(), ADDED_LEXER_ACTIONS() }; + static std::vector singleton = { BASE_SERIALIZED_UUID(), ADDED_PRECEDENCE_TRANSITIONS(), ADDED_LEXER_ACTIONS(), ADDED_UNICODE_SMP() }; return singleton; } @@ -239,21 +288,14 @@ ATN ATNDeserializer::deserialize(const std::vector& input) { // SETS // std::vector sets; - int nsets = data[p++]; - for (int i = 0; i < nsets; i++) { - int nintervals = data[p++]; - misc::IntervalSet set; - bool containsEof = data[p++] != 0; - if (containsEof) { - set.add(-1); - } + // First, deserialize sets with 16-bit arguments <= U+FFFF. 
+ deserializeSets(data, p, sets, readUnicodeInt); - for (int j = 0; j < nintervals; j++) { - set.add(data[p], data[p + 1], true); - p += 2; - } - sets.push_back(set); + // Next, if the ATN was serialized with the Unicode SMP feature, + // deserialize sets with 32-bit arguments <= U+10FFFF. + if (isFeatureSupported(ADDED_UNICODE_SMP(), uuid)) { + deserializeSets(data, p, sets, readUnicodeInt32); } // diff --git a/runtime/Cpp/runtime/src/atn/ATNDeserializer.h b/runtime/Cpp/runtime/src/atn/ATNDeserializer.h index c2b0156f7..686688135 100755 --- a/runtime/Cpp/runtime/src/atn/ATNDeserializer.h +++ b/runtime/Cpp/runtime/src/atn/ATNDeserializer.h @@ -67,6 +67,13 @@ namespace atn { */ static Guid ADDED_LEXER_ACTIONS(); + /** + * This UUID indicates the serialized ATN contains two sets of + * IntervalSets, where the second set's values are encoded as + * 32-bit integers to support the full Unicode SMP range up to U+10FFFF. + */ + static Guid ADDED_UNICODE_SMP(); + /// This list contains all of the currently supported UUIDs, ordered by when /// the feature first appeared in this branch. static std::vector& SUPPORTED_UUIDS(); diff --git a/runtime/Cpp/runtime/src/misc/Interval.cpp b/runtime/Cpp/runtime/src/misc/Interval.cpp index eb45027e7..ae4a518a5 100755 --- a/runtime/Cpp/runtime/src/misc/Interval.cpp +++ b/runtime/Cpp/runtime/src/misc/Interval.cpp @@ -24,14 +24,7 @@ Interval::Interval() : Interval((ssize_t)-1, -2) { // Need an explicit cast here Interval::Interval(size_t a_, size_t b_) : Interval(symbolToNumeric(a_), symbolToNumeric(b_)) { } -Interval::Interval(ssize_t a_, ssize_t b_, bool autoExtend) { - a = a_; - b = b_; - - // XXX: temporary hack to make the full Unicode range available. 
- if (autoExtend && b == 0xFFFF) { - b = 0x10FFFF; - } +Interval::Interval(ssize_t a_, ssize_t b_) : a(a_), b(b_) { } size_t Interval::length() const { diff --git a/runtime/Cpp/runtime/src/misc/Interval.h b/runtime/Cpp/runtime/src/misc/Interval.h index 3f8015ef7..e88814376 100755 --- a/runtime/Cpp/runtime/src/misc/Interval.h +++ b/runtime/Cpp/runtime/src/misc/Interval.h @@ -27,7 +27,7 @@ namespace misc { Interval(); explicit Interval(size_t a_, size_t b_); // For unsigned -> signed mappings. - Interval(ssize_t a_, ssize_t b_, bool autoExtend = false); // Automatically extend a value of 0xFFFF to 0x10FFFF. + Interval(ssize_t a_, ssize_t b_); virtual ~Interval() {}; /// return number of elements between a and b inclusively. x..x is length 1. diff --git a/runtime/Cpp/runtime/src/misc/IntervalSet.cpp b/runtime/Cpp/runtime/src/misc/IntervalSet.cpp index 00c87f653..d694a8aa1 100755 --- a/runtime/Cpp/runtime/src/misc/IntervalSet.cpp +++ b/runtime/Cpp/runtime/src/misc/IntervalSet.cpp @@ -50,8 +50,8 @@ IntervalSet IntervalSet::of(ssize_t a) { return IntervalSet({ Interval(a, a) }); } -IntervalSet IntervalSet::of(ssize_t a, ssize_t b, bool autoExtend) { - return IntervalSet({ Interval(a, b, autoExtend) }); +IntervalSet IntervalSet::of(ssize_t a, ssize_t b) { + return IntervalSet({ Interval(a, b) }); } void IntervalSet::clear() { @@ -68,8 +68,8 @@ void IntervalSet::add(ssize_t el) { add(el, el); } -void IntervalSet::add(ssize_t a, ssize_t b, bool autoExtend) { - add(Interval(a, b, autoExtend)); +void IntervalSet::add(ssize_t a, ssize_t b) { + add(Interval(a, b)); } void IntervalSet::add(const Interval &addition) { diff --git a/runtime/Cpp/runtime/src/misc/IntervalSet.h b/runtime/Cpp/runtime/src/misc/IntervalSet.h index fc2bd3477..e574d58ca 100755 --- a/runtime/Cpp/runtime/src/misc/IntervalSet.h +++ b/runtime/Cpp/runtime/src/misc/IntervalSet.h @@ -44,7 +44,7 @@ namespace misc { static IntervalSet of(ssize_t a); /// Create a set with all ints within range [a..b] (inclusive) - 
static IntervalSet of(ssize_t a, ssize_t b, bool autoExtend = false); + static IntervalSet of(ssize_t a, ssize_t b); virtual void clear(); @@ -58,7 +58,7 @@ namespace misc { /// If overlap, combine ranges. For example, /// If this is {1..5, 10..20}, adding 6..7 yields /// {1..5, 6..7, 10..20}. Adding 4..8 yields {1..8, 10..20}. - virtual void add(ssize_t a, ssize_t b, bool autoExtend = false); + virtual void add(ssize_t a, ssize_t b); public: /// combine all sets in the array returned the or'd value diff --git a/runtime/Go/antlr/atn_deserializer.go b/runtime/Go/antlr/atn_deserializer.go index 2fc4d9ff3..2a71ca237 100644 --- a/runtime/Go/antlr/atn_deserializer.go +++ b/runtime/Go/antlr/atn_deserializer.go @@ -15,15 +15,16 @@ import ( // This is the earliest supported serialized UUID. // stick to serialized version for now, we don't need a UUID instance var BaseSerializedUUID = "AADB8D7E-AEEF-4415-AD2B-8204D6CF042E" +var AddedUnicodeSMP = "59627784-3BE5-417A-B9EB-8131A7286089" // This list contains all of the currently supported UUIDs, ordered by when // the feature first appeared in this branch. -var SupportedUUIDs = []string{BaseSerializedUUID} +var SupportedUUIDs = []string{BaseSerializedUUID, AddedUnicodeSMP} var SerializedVersion = 3 // This is the current serialized UUID. -var SerializedUUID = BaseSerializedUUID +var SerializedUUID = AddedUnicodeSMP type LoopEndStateIntPair struct { item0 *LoopEndState @@ -91,7 +92,15 @@ func (a *ATNDeserializer) DeserializeFromUInt16(data []uint16) *ATN { a.readRules(atn) a.readModes(atn) - sets := a.readSets(atn) + sets := make([]*IntervalSet, 0) + + // First, deserialize sets with 16-bit arguments <= U+FFFF. + sets = a.readSets(atn, sets, a.readInt) + // Next, if the ATN was serialized with the Unicode SMP feature, + // deserialize sets with 32-bit arguments <= U+10FFFF. 
+ if (a.isFeatureSupported(AddedUnicodeSMP, a.uuid)) { + sets = a.readSets(atn, sets, a.readInt32) + } a.readEdges(atn, sets) a.readDecisions(atn) @@ -266,8 +275,7 @@ func (a *ATNDeserializer) readModes(atn *ATN) { } } -func (a *ATNDeserializer) readSets(atn *ATN) []*IntervalSet { - sets := make([]*IntervalSet, 0) +func (a *ATNDeserializer) readSets(atn *ATN, sets []*IntervalSet, readUnicode func() int) []*IntervalSet { m := a.readInt() for i := 0; i < m; i++ { @@ -283,8 +291,8 @@ func (a *ATNDeserializer) readSets(atn *ATN) []*IntervalSet { } for j := 0; j < n; j++ { - i1 := a.readInt() - i2 := a.readInt() + i1 := readUnicode() + i2 := readUnicode() iset.addRange(i1, i2) } @@ -642,6 +650,12 @@ func (a *ATNDeserializer) readInt() int { return int(v) } +func (a *ATNDeserializer) readInt32() int { + var low = a.readInt() + var high = a.readInt() + return low | (high << 16) +} + //TODO //func (a *ATNDeserializer) readLong() int64 { // panic("Not implemented") diff --git a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDeserializer.java b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDeserializer.java index b7ac14205..87963ee92 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDeserializer.java +++ b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDeserializer.java @@ -44,6 +44,12 @@ public class ATNDeserializer { * {@link LexerAction} instances. */ private static final UUID ADDED_LEXER_ACTIONS; + /** + * This UUID indicates the serialized ATN contains two sets of + * IntervalSets, where the second set's values are encoded as + * 32-bit integers to support the full Unicode SMP range up to U+10FFFF. + */ + private static final UUID ADDED_UNICODE_SMP; /** * This list contains all of the currently supported UUIDs, ordered by when * the feature first appeared in this branch. 
@@ -61,15 +67,58 @@ public class ATNDeserializer { BASE_SERIALIZED_UUID = UUID.fromString("33761B2D-78BB-4A43-8B0B-4F5BEE8AACF3"); ADDED_PRECEDENCE_TRANSITIONS = UUID.fromString("1DA0C57D-6C06-438A-9B27-10BCB3CE0F61"); ADDED_LEXER_ACTIONS = UUID.fromString("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E"); + ADDED_UNICODE_SMP = UUID.fromString("59627784-3BE5-417A-B9EB-8131A7286089"); SUPPORTED_UUIDS = new ArrayList(); SUPPORTED_UUIDS.add(BASE_SERIALIZED_UUID); SUPPORTED_UUIDS.add(ADDED_PRECEDENCE_TRANSITIONS); SUPPORTED_UUIDS.add(ADDED_LEXER_ACTIONS); + SUPPORTED_UUIDS.add(ADDED_UNICODE_SMP); - SERIALIZED_UUID = ADDED_LEXER_ACTIONS; + SERIALIZED_UUID = ADDED_UNICODE_SMP; } + interface UnicodeDeserializer { + // Wrapper for readInt() or readInt32() + int readUnicode(char[] data, int p); + + // Work around Java not allowing mutation of captured variables + // by returning amount by which to increment p after each read + int size(); + } + + enum UnicodeDeserializingMode { + UNICODE_BMP, + UNICODE_SMP + } + + static UnicodeDeserializer getUnicodeDeserializer(UnicodeDeserializingMode mode) { + if (mode == UnicodeDeserializingMode.UNICODE_BMP) { + return new UnicodeDeserializer() { + @Override + public int readUnicode(char[] data, int p) { + return toInt(data[p]); + } + + @Override + public int size() { + return 1; + } + }; + } else { + return new UnicodeDeserializer() { + @Override + public int readUnicode(char[] data, int p) { + return toInt32(data, p); + } + + @Override + public int size() { + return 2; + } + }; + } + } private final ATNDeserializationOptions deserializationOptions; @@ -98,7 +147,7 @@ public class ATNDeserializer { * serialized ATN at or after the feature identified by {@code feature} was * introduced; otherwise, {@code false}. 
*/ - protected boolean isFeatureSupported(UUID feature, UUID actualUuid) { + static protected boolean isFeatureSupported(UUID feature, UUID actualUuid) { int featureIndex = SUPPORTED_UUIDS.indexOf(feature); if (featureIndex < 0) { return false; @@ -258,22 +307,14 @@ public class ATNDeserializer { // SETS // List sets = new ArrayList(); - int nsets = toInt(data[p++]); - for (int i=0; i sets, UnicodeDeserializer unicodeDeserializer) { + int nsets = toInt(data[p++]); + for (int i=0; i tokenNames; + private interface CodePointSerializer { + void serializeCodePoint(IntegerList data, int cp); + } + public ATNSerializer(ATN atn) { assert atn.grammarType != null; this.atn = atn; @@ -47,9 +53,11 @@ public class ATNSerializer { * (args are token type,actionIndex in lexer else 0,0) * num modes, * mode-0-start-state, mode-1-start-state, ... (parser has 0 modes) - * num sets - * set-0-interval-count intervals, set-1-interval-count intervals, ... - * num total edges, + * num unicode-bmp-sets + * bmp-set-0-interval-count intervals, bmp-set-1-interval-count intervals, ... + * num unicode-smp-sets + * smp-set-0-interval-count intervals, smp-set-1-interval-count intervals, ... + * num total edges, * src, trg, edge-type, edge arg1, optional edge arg2 (present always), ... * num decisions, * decision-0-start-state, decision-1-start-state, ... @@ -66,8 +74,10 @@ public class ATNSerializer { data.add(atn.maxTokenType); int nedges = 0; - Map setIndices = new HashMap(); - List sets = new ArrayList(); + // Note that we use a LinkedHashMap as a set to + // maintain insertion order while deduplicating + // entries with the same key. 
+ Map sets = new LinkedHashMap<>(); // dump states, count edges and collect sets while doing so IntegerList nonGreedyStates = new IntegerList(); @@ -114,10 +124,7 @@ public class ATNSerializer { int edgeType = Transition.serializationTypes.get(t.getClass()); if ( edgeType == Transition.SET || edgeType == Transition.NOT_SET ) { SetTransition st = (SetTransition)t; - if (!setIndices.containsKey(st.set)) { - sets.add(st.set); - setIndices.put(st.set, sets.size() - 1); - } + sets.put(st.set, true); } } } @@ -156,34 +163,40 @@ public class ATNSerializer { data.add(modeStartState.stateNumber); } } - - int nsets = sets.size(); - data.add(nsets); - for (IntervalSet set : sets) { - boolean containsEof = set.contains(Token.EOF); - if (containsEof && set.getIntervals().get(0).b == Token.EOF) { - data.add(set.getIntervals().size() - 1); + List bmpSets = new ArrayList<>(); + List smpSets = new ArrayList<>(); + for (IntervalSet set : sets.keySet()) { + if (set.getMaxElement() <= Character.MAX_VALUE) { + bmpSets.add(set); + } else { + smpSets.add(set); } - else { - data.add(set.getIntervals().size()); - } - - data.add(containsEof ? 
1 : 0); - for (Interval I : set.getIntervals()) { - if (I.a == Token.EOF) { - if (I.b == Token.EOF) { - continue; - } - else { - data.add(0); - } + } + serializeSets( + data, + bmpSets, + new CodePointSerializer() { + @Override + public void serializeCodePoint(IntegerList data, int cp) { + data.add(cp); } - else { - data.add(I.a); + }); + serializeSets( + data, + smpSets, + new CodePointSerializer() { + @Override + public void serializeCodePoint(IntegerList data, int cp) { + serializeInt(data, cp); } - - data.add(I.b); - } + }); + Map setIndices = new HashMap<>(); + int setIndex = 0; + for (IntervalSet bmpSet : bmpSets) { + setIndices.put(bmpSet, setIndex++); + } + for (IntervalSet smpSet : smpSets) { + setIndices.put(smpSet, setIndex++); } data.add(nedges); @@ -359,6 +372,42 @@ public class ATNSerializer { return data; } + private static void serializeSets( + IntegerList data, + Collection sets, + CodePointSerializer codePointSerializer) + { + int nSets = sets.size(); + data.add(nSets); + + for (IntervalSet set : sets) { + boolean containsEof = set.contains(Token.EOF); + if (containsEof && set.getIntervals().get(0).b == Token.EOF) { + data.add(set.getIntervals().size() - 1); + } + else { + data.add(set.getIntervals().size()); + } + + data.add(containsEof ? 
1 : 0); + for (Interval I : set.getIntervals()) { + if (I.a == Token.EOF) { + if (I.b == Token.EOF) { + continue; + } + else { + codePointSerializer.serializeCodePoint(data, 0); + } + } + else { + codePointSerializer.serializeCodePoint(data, I.a); + } + + codePointSerializer.serializeCodePoint(data, I.b); + } + } + } + public String decode(char[] data) { data = data.clone(); // don't adjust the first value since that's the version number @@ -437,25 +486,10 @@ public class ATNSerializer { int s = ATNDeserializer.toInt(data[p++]); buf.append("mode ").append(i).append(":").append(s).append('\n'); } - int nsets = ATNDeserializer.toInt(data[p++]); - for (int i=0; i0 ) { - buf.append(", "); - } - - buf.append(getTokenName(ATNDeserializer.toInt(data[p]))).append("..").append(getTokenName(ATNDeserializer.toInt(data[p + 1]))); - p += 2; - } - buf.append("\n"); - } + int numBMPSets = ATNDeserializer.toInt(data[p++]); + p = appendSets(buf, data, p, numBMPSets, 0, ATNDeserializer.getUnicodeDeserializer(ATNDeserializer.UnicodeDeserializingMode.UNICODE_BMP)); + int numSMPSets = ATNDeserializer.toInt(data[p++]); + p = appendSets(buf, data, p, numSMPSets, numBMPSets, ATNDeserializer.getUnicodeDeserializer(ATNDeserializer.UnicodeDeserializingMode.UNICODE_SMP)); int nedges = ATNDeserializer.toInt(data[p++]); for (int i=0; i0 ) { + buf.append(", "); + } + + int a = unicodeDeserializer.readUnicode(data, p); + p += unicodeDeserializer.size(); + int b = unicodeDeserializer.readUnicode(data, p); + p += unicodeDeserializer.size(); + buf.append(getTokenName(a)).append("..").append(getTokenName(b)); + } + buf.append("\n"); + } + return p; + } + public String getTokenName(int t) { if ( t==-1 ) return "EOF"; diff --git a/runtime/JavaScript/src/antlr4/atn/ATNDeserializer.js b/runtime/JavaScript/src/antlr4/atn/ATNDeserializer.js index cdedf34de..7ee665aed 100644 --- a/runtime/JavaScript/src/antlr4/atn/ATNDeserializer.js +++ b/runtime/JavaScript/src/antlr4/atn/ATNDeserializer.js @@ -51,14 +51,21 
@@ var LexerModeAction = LexerActions.LexerModeAction; // stick to serialized version for now, we don't need a UUID instance var BASE_SERIALIZED_UUID = "AADB8D7E-AEEF-4415-AD2B-8204D6CF042E"; +// +// This UUID indicates the serialized ATN contains two sets of +// IntervalSets, where the second set's values are encoded as +// 32-bit integers to support the full Unicode SMP range up to U+10FFFF. +// +var ADDED_UNICODE_SMP = "59627784-3BE5-417A-B9EB-8131A7286089"; + // This list contains all of the currently supported UUIDs, ordered by when // the feature first appeared in this branch. -var SUPPORTED_UUIDS = [ BASE_SERIALIZED_UUID ]; +var SUPPORTED_UUIDS = [ BASE_SERIALIZED_UUID, ADDED_UNICODE_SMP ]; var SERIALIZED_VERSION = 3; // This is the current serialized UUID. -var SERIALIZED_UUID = BASE_SERIALIZED_UUID; +var SERIALIZED_UUID = ADDED_UNICODE_SMP; function initArray( length, value) { var tmp = []; @@ -91,11 +98,11 @@ function ATNDeserializer (options) { // introduced; otherwise, {@code false}. ATNDeserializer.prototype.isFeatureSupported = function(feature, actualUuid) { - var idx1 = SUPPORTED_UUIDS.index(feature); + var idx1 = SUPPORTED_UUIDS.indexOf(feature); if (idx1<0) { return false; } - var idx2 = SUPPORTED_UUIDS.index(actualUuid); + var idx2 = SUPPORTED_UUIDS.indexOf(actualUuid); return idx2 >= idx1; }; @@ -107,7 +114,14 @@ ATNDeserializer.prototype.deserialize = function(data) { this.readStates(atn); this.readRules(atn); this.readModes(atn); - var sets = this.readSets(atn); + var sets = []; + // First, deserialize sets with 16-bit arguments <= U+FFFF. + this.readSets(atn, sets, this.readInt.bind(this)); + // Next, if the ATN was serialized with the Unicode SMP feature, + // deserialize sets with 32-bit arguments <= U+10FFFF. 
+ if (this.isFeatureSupported(ADDED_UNICODE_SMP, this.uuid)) { + this.readSets(atn, sets, this.readInt32.bind(this)); + } this.readEdges(atn, sets); this.readDecisions(atn); this.readLexerActions(atn); @@ -244,8 +258,7 @@ ATNDeserializer.prototype.readModes = function(atn) { } }; -ATNDeserializer.prototype.readSets = function(atn) { - var sets = []; +ATNDeserializer.prototype.readSets = function(atn, sets, readUnicode) { var m = this.readInt(); for (var i=0; i" - elif c=='\n': - return "\\n" - elif c=='\t': - return "\\t" - elif c=='\r': - return "\\r" + elif c==u'\n': + return u"\\n" + elif c==u'\t': + return u"\\t" + elif c==u'\r': + return u"\\r" else: - return unicode(c) + return c def getCharErrorDisplay(self, c): - return "'" + self.getErrorDisplayForChar(c) + "'" + return u"'" + self.getErrorDisplayForChar(c) + u"'" # Lexers can normally match any char in it's vocabulary after matching # a token, so do the easy thing and just kill a character and hope diff --git a/runtime/Python2/src/antlr4/atn/ATNDeserializer.py b/runtime/Python2/src/antlr4/atn/ATNDeserializer.py index 2d554cd99..2ad34054d 100644 --- a/runtime/Python2/src/antlr4/atn/ATNDeserializer.py +++ b/runtime/Python2/src/antlr4/atn/ATNDeserializer.py @@ -13,14 +13,19 @@ from antlr4.atn.ATNDeserializationOptions import ATNDeserializationOptions # This is the earliest supported serialized UUID. BASE_SERIALIZED_UUID = UUID("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E") +# This UUID indicates the serialized ATN contains two sets of +# IntervalSets, where the second set's values are encoded as +# 32-bit integers to support the full Unicode SMP range up to U+10FFFF. +ADDED_UNICODE_SMP = UUID("59627784-3BE5-417A-B9EB-8131A7286089") + # This list contains all of the currently supported UUIDs, ordered by when # the feature first appeared in this branch. 
-SUPPORTED_UUIDS = [ BASE_SERIALIZED_UUID ] +SUPPORTED_UUIDS = [ BASE_SERIALIZED_UUID, ADDED_UNICODE_SMP ] SERIALIZED_VERSION = 3 # This is the current serialized UUID. -SERIALIZED_UUID = BASE_SERIALIZED_UUID +SERIALIZED_UUID = ADDED_UNICODE_SMP class ATNDeserializer (object): @@ -59,7 +64,13 @@ class ATNDeserializer (object): self.readStates(atn) self.readRules(atn) self.readModes(atn) - sets = self.readSets(atn) + sets = [] + # First, read all sets with 16-bit Unicode code points <= U+FFFF. + self.readSets(atn, sets, self.readInt) + # Next, if the ATN was serialized with the Unicode SMP feature, + # deserialize sets with 32-bit arguments <= U+10FFFF. + if self.isFeatureSupported(ADDED_UNICODE_SMP, self.uuid): + self.readSets(atn, sets, self.readInt32) self.readEdges(atn, sets) self.readDecisions(atn) self.readLexerActions(atn) @@ -170,8 +181,7 @@ class ATNDeserializer (object): s = self.readInt() atn.modeToStartState.append(atn.states[s]) - def readSets(self, atn): - sets = [] + def readSets(self, atn, sets, readUnicode): m = self.readInt() for i in range(0, m): iset = IntervalSet() @@ -181,10 +191,9 @@ class ATNDeserializer (object): if containsEof!=0: iset.addOne(-1) for j in range(0, n): - i1 = self.readInt() - i2 = self.readInt() + i1 = readUnicode() + i2 = readUnicode() iset.addRange(Interval(i1, i2 + 1)) # range upper limit is exclusive - return sets def readEdges(self, atn, sets): nedges = self.readInt() diff --git a/runtime/Python3/src/antlr4/atn/ATNDeserializer.py b/runtime/Python3/src/antlr4/atn/ATNDeserializer.py index 8fc63bcab..29adb15be 100644 --- a/runtime/Python3/src/antlr4/atn/ATNDeserializer.py +++ b/runtime/Python3/src/antlr4/atn/ATNDeserializer.py @@ -4,6 +4,7 @@ #/ from uuid import UUID from io import StringIO +from typing import Callable from antlr4.Token import Token from antlr4.atn.ATN import ATN from antlr4.atn.ATNType import ATNType @@ -15,14 +16,19 @@ from antlr4.atn.ATNDeserializationOptions import ATNDeserializationOptions # This is 
the earliest supported serialized UUID. BASE_SERIALIZED_UUID = UUID("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E") +# This UUID indicates the serialized ATN contains two sets of +# IntervalSets, where the second set's values are encoded as +# 32-bit integers to support the full Unicode SMP range up to U+10FFFF. +ADDED_UNICODE_SMP = UUID("59627784-3BE5-417A-B9EB-8131A7286089") + # This list contains all of the currently supported UUIDs, ordered by when # the feature first appeared in this branch. -SUPPORTED_UUIDS = [ BASE_SERIALIZED_UUID ] +SUPPORTED_UUIDS = [ BASE_SERIALIZED_UUID, ADDED_UNICODE_SMP ] SERIALIZED_VERSION = 3 # This is the current serialized UUID. -SERIALIZED_UUID = BASE_SERIALIZED_UUID +SERIALIZED_UUID = ADDED_UNICODE_SMP class ATNDeserializer (object): @@ -58,7 +64,13 @@ class ATNDeserializer (object): self.readStates(atn) self.readRules(atn) self.readModes(atn) - sets = self.readSets(atn) + sets = [] + # First, read all sets with 16-bit Unicode code points <= U+FFFF. + self.readSets(atn, sets, self.readInt) + # Next, if the ATN was serialized with the Unicode SMP feature, + # deserialize sets with 32-bit arguments <= U+10FFFF. 
+ if self.isFeatureSupported(ADDED_UNICODE_SMP, self.uuid): + self.readSets(atn, sets, self.readInt32) self.readEdges(atn, sets) self.readDecisions(atn) self.readLexerActions(atn) @@ -170,8 +182,7 @@ class ATNDeserializer (object): s = self.readInt() atn.modeToStartState.append(atn.states[s]) - def readSets(self, atn:ATN): - sets = [] + def readSets(self, atn:ATN, sets:list, readUnicode:Callable[[], int]): m = self.readInt() for i in range(0, m): iset = IntervalSet() @@ -181,10 +192,9 @@ class ATNDeserializer (object): if containsEof!=0: iset.addOne(-1) for j in range(0, n): - i1 = self.readInt() - i2 = self.readInt() + i1 = readUnicode() + i2 = readUnicode() iset.addRange(range(i1, i2 + 1)) # range upper limit is exclusive - return sets def readEdges(self, atn:ATN, sets:list): nedges = self.readInt() diff --git a/runtime/Swift/Sources/Antlr4/atn/ATNDeserializer.swift b/runtime/Swift/Sources/Antlr4/atn/ATNDeserializer.swift index dce134e3b..df6698f56 100644 --- a/runtime/Swift/Sources/Antlr4/atn/ATNDeserializer.swift +++ b/runtime/Swift/Sources/Antlr4/atn/ATNDeserializer.swift @@ -26,21 +26,30 @@ public class ATNDeserializer { /// for the addition of lexer actions encoded as a sequence of /// {@link org.antlr.v4.runtime.atn.LexerAction} instances. private static let ADDED_LEXER_ACTIONS: UUID = UUID(uuidString: "AADB8D7E-AEEF-4415-AD2B-8204D6CF042E")! - /// This list contains all of the currently supported UUIDs, ordered by when - /// the feature first appeared in this branch. + + /// This UUID indicates the serialized ATN contains two sets of + /// IntervalSets, where the second set's values are encoded as + /// 32-bit integers to support the full Unicode SMP range up to U+10FFFF. + private static let ADDED_UNICODE_SMP: UUID = UUID(uuidString: "59627784-3BE5-417A-B9EB-8131A7286089")! + + /** + * This list contains all of the currently supported UUIDs, ordered by when + * the feature first appeared in this branch. 
+ */ private static let SUPPORTED_UUIDS: Array = { var suuid = Array() suuid.append(ATNDeserializer.BASE_SERIALIZED_UUID) suuid.append(ATNDeserializer.ADDED_PRECEDENCE_TRANSITIONS) suuid.append(ATNDeserializer.ADDED_LEXER_ACTIONS) + suuid.append(ATNDeserializer.ADDED_UNICODE_SMP) return suuid }() /// This is the current serialized UUID. public static let SERIALIZED_UUID: UUID = { - // SERIALIZED_UUID = ADDED_LEXER_ACTIONS; - return UUID(uuidString: "AADB8D7E-AEEF-4415-AD2B-8204D6CF042E")! + // SERIALIZED_UUID = ADDED_UNICODE_SMP; + return UUID(uuidString: "59627784-3BE5-417A-B9EB-8131A7286089")! }() @@ -245,24 +254,14 @@ public class ATNDeserializer { // SETS // var sets: Array = Array() - let nsets: Int = toInt(data[p]) - p += 1 - for _ in 0.. Int { + let result: Int = toInt(data[p]) + p += 1 + return result + } + + private func readUnicodeInt32(_ data: [Character], _ p: inout Int) -> Int { + let result: Int = toInt32(data, p) + p += 2 + return result + } + + private func readSets(_ data: [Character], _ p: inout Int, _ sets: inout Array, _ readUnicode: ([Character], inout Int) -> Int) throws { + let nsets: Int = toInt(data[p]) + p += 1 + for _ in 0.. 
ATN { // let jsonStr = Utils.readFile2String(jsonFileName) guard !jsonStr.isEmpty else { diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestATNLexerInterpreter.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestATNLexerInterpreter.java index 0324d393d..966416470 100644 --- a/tool-testsuite/test/org/antlr/v4/test/tool/TestATNLexerInterpreter.java +++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestATNLexerInterpreter.java @@ -6,8 +6,8 @@ package org.antlr.v4.test.tool; -import org.antlr.v4.runtime.ANTLRInputStream; import org.antlr.v4.runtime.CharStream; +import org.antlr.v4.runtime.CharStreams; import org.antlr.v4.runtime.atn.ATN; import org.antlr.v4.runtime.atn.ATNState; import org.antlr.v4.runtime.misc.Utils; @@ -121,6 +121,94 @@ public class TestATNLexerInterpreter extends BaseJavaToolTest { checkLexerMatches(lg, "c", expecting); } + @Test public void testLexerSetUnicodeBMP() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "ID : ('\u611B'|'\u611C')\n ;"); + String expecting = "ID, EOF"; + checkLexerMatches(lg, "\u611B", expecting); + } + + @Test public void testLexerNotSetUnicodeBMP() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "ID : ~('\u611B'|'\u611C')\n ;"); + String expecting = "ID, EOF"; + checkLexerMatches(lg, "\u611D", expecting); + } + + @Test public void testLexerNotSetUnicodeBMPMatchesSMP() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "ID : ~('\u611B'|'\u611C')\n ;"); + String expecting = "ID, EOF"; + checkLexerMatches(lg, new StringBuilder().appendCodePoint(0x1F4A9).toString(), expecting); + } + + @Test public void testLexerSetUnicodeSMP() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "ID : ('\\u{1F4A9}'|'\\u{1F4AA}')\n ;"); + String expecting = "ID, EOF"; + checkLexerMatches(lg, new StringBuilder().appendCodePoint(0x1F4A9).toString(), expecting); + } + + @Test public void 
testLexerNotBMPSetMatchesUnicodeSMP() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "ID : ~('a'|'b')\n ;"); + String expecting = "ID, EOF"; + checkLexerMatches(lg, new StringBuilder().appendCodePoint(0x1F4A9).toString(), expecting); + } + + @Test public void testLexerNotBMPSetMatchesBMP() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "ID : ~('a'|'b')\n ;"); + String expecting = "ID, EOF"; + checkLexerMatches(lg, "\u611B", expecting); + } + + @Test public void testLexerNotBMPSetMatchesSMP() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "ID : ~('a'|'b')\n ;"); + String expecting = "ID, EOF"; + checkLexerMatches(lg, new StringBuilder().appendCodePoint(0x1F4A9).toString(), expecting); + } + + @Test public void testLexerNotSMPSetMatchesBMP() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "ID : ~('\\u{1F4A9}'|'\\u{1F4AA}')\n ;"); + String expecting = "ID, EOF"; + checkLexerMatches(lg, "\u611B", expecting); + } + + @Test public void testLexerNotSMPSetMatchesSMP() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "ID : ~('\\u{1F4A9}'|'\\u{1F4AA}')\n ;"); + String expecting = "ID, EOF"; + checkLexerMatches(lg, new StringBuilder().appendCodePoint(0x1D7C0).toString(), expecting); + } + + @Test public void testLexerRangeUnicodeSMP() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "ID : ('\\u{1F4A9}'..'\\u{1F4B0}')\n ;"); + String expecting = "ID, EOF"; + checkLexerMatches(lg, new StringBuilder().appendCodePoint(0x1F4AF).toString(), expecting); + } + + @Test public void testLexerRangeUnicodeBMPToSMP() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "ID : ('\\u611B'..'\\u{1F4B0}')\n ;"); + String expecting = "ID, EOF"; + checkLexerMatches(lg, new StringBuilder().appendCodePoint(0x12001).toString(), expecting); + } + @Test public void 
testLexerKeywordIDAmbiguity() throws Exception { LexerGrammar lg = new LexerGrammar( "lexer grammar L;\n"+ @@ -293,7 +381,7 @@ public class TestATNLexerInterpreter extends BaseJavaToolTest { protected void checkLexerMatches(LexerGrammar lg, String inputString, String expecting) { ATN atn = createATN(lg, true); - CharStream input = new ANTLRInputStream(inputString); + CharStream input = CharStreams.createWithString(inputString); ATNState startState = atn.modeNameToStartState.get("DEFAULT_MODE"); DOTGenerator dot = new DOTGenerator(lg); // System.out.println(dot.getDOT(startState, true)); diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestATNSerialization.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestATNSerialization.java index 5e2b57ea1..dcb8eeec7 100644 --- a/tool-testsuite/test/org/antlr/v4/test/tool/TestATNSerialization.java +++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestATNSerialization.java @@ -291,6 +291,113 @@ public class TestATNSerialization extends BaseJavaToolTest { assertEquals(expecting, result); } + @Test public void testLexerUnicodeSMPLiteralSerializedToSet() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "INT : '\\u{1F4A9}' ;"); + String expecting = + "max type 1\n" + + "0:TOKEN_START -1\n" + + "1:RULE_START 0\n" + + "2:RULE_STOP 0\n" + + "3:BASIC 0\n" + + "4:BASIC 0\n" + + "rule 0:1 1\n" + + "mode 0:0\n" + + "0:128169..128169\n" + + "0->1 EPSILON 0,0,0\n" + + "1->3 EPSILON 0,0,0\n" + + "3->4 SET 0,0,0\n" + + "4->2 EPSILON 0,0,0\n" + + "0:0\n"; + ATN atn = createATN(lg, true); + String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames())); + assertEquals(expecting, result); + } + + @Test public void testLexerUnicodeSMPRangeSerializedToSet() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "INT : ('a'..'\\u{1F4A9}') ;"); + String expecting = + "max type 1\n" + + "0:TOKEN_START -1\n" + + "1:RULE_START 0\n" + + "2:RULE_STOP 0\n" + + "3:BASIC 
0\n" + + "4:BASIC 0\n" + + "rule 0:1 1\n" + + "mode 0:0\n" + + "0:'a'..128169\n" + + "0->1 EPSILON 0,0,0\n" + + "1->3 EPSILON 0,0,0\n" + + "3->4 SET 0,0,0\n" + + "4->2 EPSILON 0,0,0\n" + + "0:0\n"; + ATN atn = createATN(lg, true); + String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames())); + assertEquals(expecting, result); + } + + @Test public void testLexerUnicodeSMPSetSerializedAfterBMPSet() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "SMP : ('\\u{1F4A9}' | '\\u{1F4AF}') ;\n"+ + "BMP : ('a' | 'x') ;"); + String expecting = + "max type 2\n" + + "0:TOKEN_START -1\n" + + "1:RULE_START 0\n" + + "2:RULE_STOP 0\n" + + "3:RULE_START 1\n" + + "4:RULE_STOP 1\n" + + "5:BASIC 0\n" + + "6:BASIC 0\n" + + "7:BASIC 1\n" + + "8:BASIC 1\n" + + "rule 0:1 1\n" + + "rule 1:3 2\n" + + "mode 0:0\n" + + "0:'a'..'a', 'x'..'x'\n" + + "1:128169..128169, 128175..128175\n" + + "0->1 EPSILON 0,0,0\n" + + "0->3 EPSILON 0,0,0\n" + + "1->5 EPSILON 0,0,0\n" + + "3->7 EPSILON 0,0,0\n" + + "5->6 SET 1,0,0\n" + + "6->2 EPSILON 0,0,0\n" + + "7->8 SET 0,0,0\n" + + "8->4 EPSILON 0,0,0\n" + + "0:0\n"; + ATN atn = createATN(lg, true); + String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames())); + assertEquals(expecting, result); + } + + @Test public void testLexerNotLiteral() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "INT : ~'a' ;"); + String expecting = + "max type 1\n" + + "0:TOKEN_START -1\n" + + "1:RULE_START 0\n" + + "2:RULE_STOP 0\n" + + "3:BASIC 0\n" + + "4:BASIC 0\n" + + "rule 0:1 1\n" + + "mode 0:0\n" + + "0:'a'..'a'\n" + + "0->1 EPSILON 0,0,0\n" + + "1->3 EPSILON 0,0,0\n" + + "3->4 NOT_SET 0,0,0\n" + + "4->2 EPSILON 0,0,0\n" + + "0:0\n"; + ATN atn = createATN(lg, true); + String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames())); + assertEquals(expecting, result); + } + @Test public void testLexerRange() throws Exception { LexerGrammar lg = new 
LexerGrammar( "lexer grammar L;\n"+ @@ -518,6 +625,222 @@ public class TestATNSerialization extends BaseJavaToolTest { assertEquals(expecting, result); } + @Test public void testLexerUnicodeUnescapedBMPNotSet() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "ID : ~('\u4E9C'|'\u4E9D')\n ;"); + String expecting = + "max type 1\n" + + "0:TOKEN_START -1\n" + + "1:RULE_START 0\n" + + "2:RULE_STOP 0\n" + + "3:BASIC 0\n" + + "4:BASIC 0\n" + + "rule 0:1 1\n" + + "mode 0:0\n" + + "0:'\\u4E9C'..'\\u4E9D'\n" + + "0->1 EPSILON 0,0,0\n" + + "1->3 EPSILON 0,0,0\n" + + "3->4 NOT_SET 0,0,0\n" + + "4->2 EPSILON 0,0,0\n" + + "0:0\n"; + ATN atn = createATN(lg, true); + String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames())); + assertEquals(expecting, result); + } + + @Test public void testLexerUnicodeUnescapedBMPSetWithRange() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "ID : ('\u4E9C'|'\u4E9D'|'\u6C5F'|'\u305F'..'\u307B')\n ;"); + String expecting = + "max type 1\n" + + "0:TOKEN_START -1\n" + + "1:RULE_START 0\n" + + "2:RULE_STOP 0\n" + + "3:BASIC 0\n" + + "4:BASIC 0\n" + + "rule 0:1 1\n" + + "mode 0:0\n" + + "0:'\\u305F'..'\\u307B', '\\u4E9C'..'\\u4E9D', '\\u6C5F'..'\\u6C5F'\n" + + "0->1 EPSILON 0,0,0\n" + + "1->3 EPSILON 0,0,0\n" + + "3->4 SET 0,0,0\n" + + "4->2 EPSILON 0,0,0\n" + + "0:0\n"; + ATN atn = createATN(lg, true); + String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames())); + assertEquals(expecting, result); + } + + @Test public void testLexerUnicodeUnescapedBMPNotSetWithRange() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "ID : ~('\u4E9C'|'\u4E9D'|'\u6C5F'|'\u305F'..'\u307B')\n ;"); + String expecting = + "max type 1\n" + + "0:TOKEN_START -1\n" + + "1:RULE_START 0\n" + + "2:RULE_STOP 0\n" + + "3:BASIC 0\n" + + "4:BASIC 0\n" + + "rule 0:1 1\n" + + "mode 0:0\n" + + "0:'\\u305F'..'\\u307B', '\\u4E9C'..'\\u4E9D', 
'\\u6C5F'..'\\u6C5F'\n" + + "0->1 EPSILON 0,0,0\n" + + "1->3 EPSILON 0,0,0\n" + + "3->4 NOT_SET 0,0,0\n" + + "4->2 EPSILON 0,0,0\n" + + "0:0\n"; + ATN atn = createATN(lg, true); + String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames())); + assertEquals(expecting, result); + } + + @Test public void testLexerUnicodeEscapedBMPNotSet() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "ID : ~('\\u4E9C'|'\\u4E9D')\n ;"); + String expecting = + "max type 1\n" + + "0:TOKEN_START -1\n" + + "1:RULE_START 0\n" + + "2:RULE_STOP 0\n" + + "3:BASIC 0\n" + + "4:BASIC 0\n" + + "rule 0:1 1\n" + + "mode 0:0\n" + + "0:'\\u4E9C'..'\\u4E9D'\n" + + "0->1 EPSILON 0,0,0\n" + + "1->3 EPSILON 0,0,0\n" + + "3->4 NOT_SET 0,0,0\n" + + "4->2 EPSILON 0,0,0\n" + + "0:0\n"; + ATN atn = createATN(lg, true); + String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames())); + assertEquals(expecting, result); + } + + @Test public void testLexerUnicodeEscapedBMPSetWithRange() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "ID : ('\\u4E9C'|'\\u4E9D'|'\\u6C5F'|'\\u305F'..'\\u307B')\n ;"); + String expecting = + "max type 1\n" + + "0:TOKEN_START -1\n" + + "1:RULE_START 0\n" + + "2:RULE_STOP 0\n" + + "3:BASIC 0\n" + + "4:BASIC 0\n" + + "rule 0:1 1\n" + + "mode 0:0\n" + + "0:'\\u305F'..'\\u307B', '\\u4E9C'..'\\u4E9D', '\\u6C5F'..'\\u6C5F'\n" + + "0->1 EPSILON 0,0,0\n" + + "1->3 EPSILON 0,0,0\n" + + "3->4 SET 0,0,0\n" + + "4->2 EPSILON 0,0,0\n" + + "0:0\n"; + ATN atn = createATN(lg, true); + String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames())); + assertEquals(expecting, result); + } + + @Test public void testLexerUnicodeEscapedBMPNotSetWithRange() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "ID : ~('\\u4E9C'|'\\u4E9D'|'\\u6C5F'|'\\u305F'..'\\u307B')\n ;"); + String expecting = + "max type 1\n" + + "0:TOKEN_START -1\n" + + 
"1:RULE_START 0\n" + + "2:RULE_STOP 0\n" + + "3:BASIC 0\n" + + "4:BASIC 0\n" + + "rule 0:1 1\n" + + "mode 0:0\n" + + "0:'\\u305F'..'\\u307B', '\\u4E9C'..'\\u4E9D', '\\u6C5F'..'\\u6C5F'\n" + + "0->1 EPSILON 0,0,0\n" + + "1->3 EPSILON 0,0,0\n" + + "3->4 NOT_SET 0,0,0\n" + + "4->2 EPSILON 0,0,0\n" + + "0:0\n"; + ATN atn = createATN(lg, true); + String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames())); + assertEquals(expecting, result); + } + + @Test public void testLexerUnicodeEscapedSMPNotSet() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "ID : ~('\\u{1F4A9}'|'\\u{1F4AA}')\n ;"); + String expecting = + "max type 1\n" + + "0:TOKEN_START -1\n" + + "1:RULE_START 0\n" + + "2:RULE_STOP 0\n" + + "3:BASIC 0\n" + + "4:BASIC 0\n" + + "rule 0:1 1\n" + + "mode 0:0\n" + + "0:128169..128170\n" + + "0->1 EPSILON 0,0,0\n" + + "1->3 EPSILON 0,0,0\n" + + "3->4 NOT_SET 0,0,0\n" + + "4->2 EPSILON 0,0,0\n" + + "0:0\n"; + ATN atn = createATN(lg, true); + String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames())); + assertEquals(expecting, result); + } + + @Test public void testLexerUnicodeEscapedSMPSetWithRange() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n"+ + "ID : ('\\u{1F4A9}'|'\\u{1F4AA}'|'\\u{1F441}'|'\\u{1D40F}'..'\\u{1D413}')\n ;"); + String expecting = + "max type 1\n" + + "0:TOKEN_START -1\n" + + "1:RULE_START 0\n" + + "2:RULE_STOP 0\n" + + "3:BASIC 0\n" + + "4:BASIC 0\n" + + "rule 0:1 1\n" + + "mode 0:0\n" + + "0:119823..119827, 128065..128065, 128169..128170\n" + + "0->1 EPSILON 0,0,0\n" + + "1->3 EPSILON 0,0,0\n" + + "3->4 SET 0,0,0\n" + + "4->2 EPSILON 0,0,0\n" + + "0:0\n"; + ATN atn = createATN(lg, true); + String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames())); + assertEquals(expecting, result); + } + + @Test public void testLexerUnicodeEscapedSMPNotSetWithRange() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer 
grammar L;\n"+ + "ID : ~('\\u{1F4A9}'|'\\u{1F4AA}'|'\\u{1F441}'|'\\u{1D40F}'..'\\u{1D413}')\n ;"); + String expecting = + "max type 1\n" + + "0:TOKEN_START -1\n" + + "1:RULE_START 0\n" + + "2:RULE_STOP 0\n" + + "3:BASIC 0\n" + + "4:BASIC 0\n" + + "rule 0:1 1\n" + + "mode 0:0\n" + + "0:119823..119827, 128065..128065, 128169..128170\n" + + "0->1 EPSILON 0,0,0\n" + + "1->3 EPSILON 0,0,0\n" + + "3->4 NOT_SET 0,0,0\n" + + "4->2 EPSILON 0,0,0\n" + + "0:0\n"; + ATN atn = createATN(lg, true); + String result = ATNSerializer.getDecoded(atn, Arrays.asList(lg.getTokenNames())); + assertEquals(expecting, result); + } + @Test public void testLexerWildcardWithMode() throws Exception { LexerGrammar lg = new LexerGrammar( "lexer grammar L;\n"+ diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestTokenTypeAssignment.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestTokenTypeAssignment.java index c338eec54..91cb9fc00 100644 --- a/tool-testsuite/test/org/antlr/v4/test/tool/TestTokenTypeAssignment.java +++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestTokenTypeAssignment.java @@ -141,6 +141,24 @@ public class TestTokenTypeAssignment extends BaseJavaToolTest { assertEquals("'\\n'", literals.toArray()[0]); } + @Test public void testParserCharLiteralWithBasicUnicodeEscape() throws Exception { + Grammar g = new Grammar( + "grammar t;\n"+ + "a : '\\uABCD';\n"); + Set literals = g.stringLiteralToTypeMap.keySet(); + // must store literals how they appear in the antlr grammar + assertEquals("'\\uABCD'", literals.toArray()[0]); + } + + @Test public void testParserCharLiteralWithExtendedUnicodeEscape() throws Exception { + Grammar g = new Grammar( + "grammar t;\n"+ + "a : '\\u{1ABCD}';\n"); + Set literals = g.stringLiteralToTypeMap.keySet(); + // must store literals how they appear in the antlr grammar + assertEquals("'\\u{1ABCD}'", literals.toArray()[0]); + } + protected void checkSymbols(Grammar g, String rulesStr, String allValidTokensStr) diff --git 
a/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeGrammar.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeGrammar.java new file mode 100644 index 000000000..d517b0481 --- /dev/null +++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeGrammar.java @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2012-2016 The ANTLR Project. All rights reserved. + * Use of this file is governed by the BSD 3-clause license that + * can be found in the LICENSE.txt file in the project root. + */ + +package org.antlr.v4.test.tool; + +import org.antlr.v4.gui.Trees; +import org.antlr.v4.runtime.CharStreams; +import org.antlr.v4.runtime.CommonTokenStream; +import org.antlr.v4.runtime.LexerInterpreter; +import org.antlr.v4.runtime.tree.ParseTree; +import org.antlr.v4.tool.Grammar; +import org.antlr.v4.tool.GrammarParserInterpreter; + +import org.junit.Before; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +public class TestUnicodeGrammar extends BaseJavaToolTest { + @Test + public void unicodeBMPLiteralInGrammar() throws Exception { + String grammarText = + "grammar Unicode;\n" + + "r : 'hello' WORLD;\n" + + "WORLD : ('world' | '\\u4E16\\u754C' | '\\u1000\\u1019\\u1039\\u1018\\u102C' );\n" + + "WS : [ \\t\\r\\n]+ -> skip;\n"; + String inputText = "hello \u4E16\u754C"; + assertEquals( + "(r:1 " + inputText + ")", + parseTreeForGrammarWithInput( + grammarText, + "r", + inputText)); + } + + // TODO: This test cannot pass unless we change either the grammar + // parser to decode surrogate pair literals to code points (which + // would break existing clients) or to treat them as an + // alternative: + // + // '\\uD83C\\uDF0D' -> ('\\u{1F30E}' | '\\uD83C\\uDF0D') + // + // but I worry that might cause parse ambiguity if we're not careful. 
+ //@Test + public void unicodeSurrogatePairLiteralInGrammar() throws Exception { + String grammarText = + "grammar Unicode;\n" + + "r : 'hello' WORLD;\n" + + "WORLD : ('\\uD83C\\uDF0D' | '\\uD83C\\uDF0E' | '\\uD83C\\uDF0F' );\n" + + "WS : [ \\t\\r\\n]+ -> skip;\n"; + String inputText = new StringBuilder("hello ") + .appendCodePoint(0x1F30E) + .toString(); + assertEquals( + "(r:1 " + inputText + ")", + parseTreeForGrammarWithInput( + grammarText, + "r", + inputText)); + } + + @Test + public void unicodeSMPLiteralInGrammar() throws Exception { + String grammarText = + "grammar Unicode;\n" + + "r : 'hello' WORLD;\n" + + "WORLD : ('\\u{1F30D}' | '\\u{1F30E}' | '\\u{1F30F}' );\n" + + "WS : [ \\t\\r\\n]+ -> skip;\n"; + String inputText = new StringBuilder("hello ") + .appendCodePoint(0x1F30E) + .toString(); + assertEquals( + "(r:1 " + inputText + ")", + parseTreeForGrammarWithInput( + grammarText, + "r", + inputText)); + } + + @Test + public void unicodeSMPRangeInGrammar() throws Exception { + String grammarText = + "grammar Unicode;\n" + + "r : 'hello' WORLD;\n" + + "WORLD : ('\\u{1F30D}'..'\\u{1F30F}' );\n" + + "WS : [ \\t\\r\\n]+ -> skip;\n"; + String inputText = new StringBuilder("hello ") + .appendCodePoint(0x1F30E) + .toString(); + assertEquals( + "(r:1 " + inputText + ")", + parseTreeForGrammarWithInput( + grammarText, + "r", + inputText)); + } + + @Test + public void matchingDanglingSurrogateInInput() throws Exception { + String grammarText = + "grammar Unicode;\n" + + "r : 'hello' WORLD;\n" + + "WORLD : ('\\uD83C' | '\\uD83D' | '\\uD83E' );\n" + + "WS : [ \\t\\r\\n]+ -> skip;\n"; + String inputText = "hello \uD83C"; + assertEquals( + "(r:1 " + inputText + ")", + parseTreeForGrammarWithInput( + grammarText, + "r", + inputText)); + } + + private static String parseTreeForGrammarWithInput( + String grammarText, + String rootRule, + String inputText) throws Exception { + Grammar grammar = new Grammar(grammarText); + LexerInterpreter lexEngine = 
grammar.createLexerInterpreter( + CharStreams.createWithString(inputText)); + CommonTokenStream tokens = new CommonTokenStream(lexEngine); + GrammarParserInterpreter parser = grammar.createGrammarParserInterpreter(tokens); + ParseTree parseTree = parser.parse(grammar.rules.get(rootRule).index); + InterpreterTreeTextProvider nodeTextProvider = + new InterpreterTreeTextProvider(grammar.getRuleNames()); + return Trees.toStringTree(parseTree, nodeTextProvider); + } +} diff --git a/tool/resources/org/antlr/v4/tool/templates/codegen/CSharp/CSharp.stg b/tool/resources/org/antlr/v4/tool/templates/codegen/CSharp/CSharp.stg index b22fc2e64..ce6768531 100644 --- a/tool/resources/org/antlr/v4/tool/templates/codegen/CSharp/CSharp.stg +++ b/tool/resources/org/antlr/v4/tool/templates/codegen/CSharp/CSharp.stg @@ -288,7 +288,7 @@ public partial class : public override string[] RuleNames { get { return ruleNames; } } - public override string SerializedAtn { get { return _serializedATN; } } + public override string SerializedAtn { get { return new string(_serializedATN); } } static () { decisionToDFA = new DFA[_ATN.NumberOfDecisions]; @@ -1023,7 +1023,7 @@ public partial class : { public override string[] ModeNames { get { return modeNames; } } - public override string SerializedAtn { get { return _serializedATN; } } + public override string SerializedAtn { get { return new string(_serializedATN); } } static () { decisionToDFA = new DFA[_ATN.NumberOfDecisions]; @@ -1038,16 +1038,12 @@ public partial class : { SerializedATN(model) ::= << -private static string _serializedATN = _serializeATN(); -private static string _serializeATN() -{ - StringBuilder sb = new StringBuilder(); - sb.Append("<\t>sb.Append("}>"); - return sb.ToString(); -} +private static char[] _serializedATN = { + , +}; public static readonly ATN _ATN = - new ATNDeserializer().Deserialize(_serializedATN.ToCharArray()); + new ATNDeserializer().Deserialize(_serializedATN); >> diff --git 
a/tool/resources/org/antlr/v4/tool/templates/codegen/Cpp/Cpp.stg b/tool/resources/org/antlr/v4/tool/templates/codegen/Cpp/Cpp.stg index d244380f2..6342b660e 100644 --- a/tool/resources/org/antlr/v4/tool/templates/codegen/Cpp/Cpp.stg +++ b/tool/resources/org/antlr/v4/tool/templates/codegen/Cpp/Cpp.stg @@ -192,23 +192,23 @@ atn::ATN ::_atn; std::vector\ ::_serializedATN; std::vector\ ::_ruleNames = { - "}; separator = ", ", wrap, anchor> + "}; separator = ", ", wrap, anchor> }; std::vector\ ::_channelNames = { - "DEFAULT_TOKEN_CHANNEL", "HIDDEN", "}; separator = ", ", wrap, anchor> + "DEFAULT_TOKEN_CHANNEL", "HIDDEN", "}; separator = ", ", wrap, anchor> }; std::vector\ ::_modeNames = { - "}; separator = ", ", wrap, anchor> + "}; separator = ", ", wrap, anchor> }; std::vector\ ::_literalNames = { - }; null = "\"\"", separator = ", ", wrap, anchor> + }; null = "\"\"", separator = ", ", wrap, anchor> }; std::vector\ ::_symbolicNames = { - }; null = "\"\"", separator = ", ", wrap, anchor> + }; null = "\"\"", separator = ", ", wrap, anchor> }; dfa::Vocabulary ::_vocabulary(_literalNames, _symbolicNames); diff --git a/tool/src/org/antlr/v4/codegen/target/CSharpTarget.java b/tool/src/org/antlr/v4/codegen/target/CSharpTarget.java index bc96243af..3ee0b89ef 100644 --- a/tool/src/org/antlr/v4/codegen/target/CSharpTarget.java +++ b/tool/src/org/antlr/v4/codegen/target/CSharpTarget.java @@ -46,7 +46,7 @@ public class CSharpTarget extends Target { formatted = String.format("\\x%X", v & 0xFFFF); } - return formatted; + return "'" + formatted + "'"; } @Override diff --git a/tool/src/org/antlr/v4/misc/CharSupport.java b/tool/src/org/antlr/v4/misc/CharSupport.java index e81e95807..ab8184835 100644 --- a/tool/src/org/antlr/v4/misc/CharSupport.java +++ b/tool/src/org/antlr/v4/misc/CharSupport.java @@ -45,10 +45,9 @@ public class CharSupport { } /** Return a string representing the escaped char for code c. E.g., If c - * has value 0x100, you will get "\u0100". 
ASCII gets the usual - * char (non-hex) representation. Control characters are spit out - * as unicode. While this is specially set up for returning Java strings, - * it can be used by any language target that has the same syntax. :) + * has value 0x100, you will get "\\u0100". ASCII gets the usual + * char (non-hex) representation. Non-ASCII characters are spit out + * as \\uXXXX or \\u{XXXXXX} escapes. */ public static String getANTLRCharLiteralForChar(int c) { if ( c< Lexer.MIN_CHAR_VALUE ) { @@ -67,11 +66,11 @@ public class CharSupport { } return '\''+Character.toString((char)c)+'\''; } - // turn on the bit above max "\uFFFF" value so that we pad with zeros - // then only take last 4 digits - String hex = Integer.toHexString(c|0x10000).toUpperCase().substring(1,5); - String unicodeStr = "'\\u"+hex+"'"; - return unicodeStr; + if (c <= 0xFFFF) { + return String.format("\\u%04X", c); + } else { + return String.format("\\u{%06X}", c); + } } /** Given a literal like (the 3 char sequence with single quotes) 'a', @@ -92,11 +91,25 @@ public class CharSupport { if ( literal.charAt(i) == '\\' ) { end = i+2; if ( i+1 < n && literal.charAt(i+1) == 'u' ) { - for (end = i + 2; end < i + 6; end++) { - if ( end>n ) return null; // invalid escape sequence. - char charAt = literal.charAt(end); - if (!Character.isDigit(charAt) && !(charAt >= 'a' && charAt <= 'f') && !(charAt >= 'A' && charAt <= 'F')) { - return null; // invalid escape sequence. + if ( i+2 < n && literal.charAt(i+2) == '{' ) { // extended escape sequence + end = i + 3; + while (true) { + if ( end + 1 > n ) return null; // invalid escape sequence. + char charAt = literal.charAt(end++); + if (charAt == '}') { + break; + } + if (!Character.isDigit(charAt) && !(charAt >= 'a' && charAt <= 'f') && !(charAt >= 'A' && charAt <= 'F')) { + return null; // invalid escape sequence. + } + } + } else { + for (end = i + 2; end < i + 6; end++) { + if ( end>n ) return null; // invalid escape sequence. 
+ char charAt = literal.charAt(end); + if (!Character.isDigit(charAt) && !(charAt >= 'a' && charAt <= 'f') && !(charAt >= 'A' && charAt <= 'F')) { + return null; // invalid escape sequence. + } } } } @@ -107,13 +120,13 @@ public class CharSupport { if ( c==-1 ) { return null; // invalid escape sequence. } - else buf.append((char)c); + else buf.appendCodePoint(c); i = end; } return buf.toString(); } - /** Given char x or \t or \u1234 return the char value; + /** Given char x or \\t or \\u1234 return the char value; * Unnecessary escapes like '\{' yield -1. */ public static int getCharValueFromCharInGrammarLiteral(String cstr) { @@ -130,9 +143,31 @@ public class CharSupport { if ( charVal==0 ) return -1; return charVal; case 6: - // '\u1234' + // '\\u1234' or '\\u{12}' if ( !cstr.startsWith("\\u") ) return -1; - String unicodeChars = cstr.substring(2, cstr.length()); + int startOff; + int endOff; + if ( cstr.charAt(2) == '{' ) { + startOff = 3; + endOff = cstr.indexOf('}'); + } else { + startOff = 2; + endOff = cstr.length(); + } + return parseHexValue(cstr, startOff, endOff); + default: + if ( cstr.startsWith("\\u{") ) { + return parseHexValue(cstr, 3, cstr.indexOf('}')); + } + return -1; + } + } + + private static int parseHexValue(String cstr, int startOff, int endOff) { + if (startOff < 0 || endOff < 0) { + return -1; + } + String unicodeChars = cstr.substring(startOff, endOff); int result = -1; try { result = Integer.parseInt(unicodeChars, 16); @@ -140,9 +175,6 @@ public class CharSupport { catch (NumberFormatException e) { } return result; - default: - return -1; - } } public static String capitalize(String s) { diff --git a/tool/src/org/antlr/v4/parse/ANTLRLexer.g b/tool/src/org/antlr/v4/parse/ANTLRLexer.g index 7d61c1105..939f3ad4f 100644 --- a/tool/src/org/antlr/v4/parse/ANTLRLexer.g +++ b/tool/src/org/antlr/v4/parse/ANTLRLexer.g @@ -615,8 +615,8 @@ SRC : 'src' WSCHARS+ file=ACTION_STRING_LITERAL WSCHARS+ line=INT // // ANTLR makes no disticintion between a 
single character literal and a // multi-character string. All literals are single quote delimited and -// may contain unicode escape sequences of the form \uxxxx, where x -// is a valid hexadecimal number (as per Java basically). +// may contain unicode escape sequences of the form \uxxxx or \u{xxxxxx}, +// where x is a valid hexadecimal digit. STRING_LITERAL : '\'' ( ( ESC_SEQ | ~('\\'|'\''|'\r'|'\n') ) )* ( '\'' @@ -652,6 +652,10 @@ ESC_SEQ // UNICODE_ESC + | // A Swift/Hack style Unicode escape sequence + // + UNICODE_EXTENDED_ESC + | // An illegal escape seqeunce // { @@ -720,6 +724,27 @@ UNICODE_ESC } ; +fragment +UNICODE_EXTENDED_ESC + : 'u{' // Leadin for unicode extended escape sequence + + HEX_DIGIT+ // One or more hexadecimal digits + + '}' // Leadout for unicode extended escape sequence + + // Now check the digit count and issue an error if we need to + { + int numDigits = getCharIndex()-state.tokenStartCharIndex-6; + if (numDigits > 6) { + Token t = new CommonToken(input, state.type, state.channel, state.tokenStartCharIndex, getCharIndex()-1); + t.setText(t.getText()); + t.setLine(input.getLine()); + t.setCharPositionInLine(input.getCharPositionInLine()-numDigits); + grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, t); + } + } + ; + // ---------- // Whitespace //