Implement new extended Unicode escape \u{10ABCD}. Bump UUID. Add lots more tests.

2017-02-17 13:35:00 -08:00 · 2017-02-17 13:35:00 -08:00 · fd4246cf3f
parent ce09abb480
commit fd4246cf3f
25 changed files with 1361 additions and 209 deletions
--- a/runtime-testsuite/test/org/antlr/v4/test/runtime/descriptors/SetsDescriptors.java
+++ b/runtime-testsuite/test/org/antlr/v4/test/runtime/descriptors/SetsDescriptors.java
@ -402,4 +402,282 @@ public class SetsDescriptors {
 		public String grammar;
 	}
 	/**
 	 * Parser test descriptor for a lexer set made of individual BMP characters.
 	 * The Unicode escapes in the grammar text below use a single backslash, so
 	 * (as the note inside the grammar says) the Java compiler translates them
 	 * and the grammar compiler receives the raw characters.
 	 *
 	 * NOTE(review): the doc comment on {@code grammar} appears to be the grammar
 	 * source itself, captured through {@code @CommentHasStringValue}; treat it
 	 * as data rather than documentation -- confirm before editing it.
 	 */
 	public static class UnicodeUnescapedBMPSet extends BaseParserTestDescriptor {
 		// 'a', U+00E4, U+3042, U+4E9C, then the terminating 'c'.
 		public String input = "a\u00E4\u3042\u4E9Cc";
 		// Expected output: the input text plus a trailing newline.
 		public String output = "a\u00E4\u3042\u4E9Cc\n";
 		// NOTE(review): null presumably means no error output is expected -- confirm against the harness.
 		public String errors = null;
 		public String startRule = "a";
 		public String grammarName = "T";
 		/**
 		 grammar T;
 		 a : LETTERS {<InputText():writeln()>} ;
 		 // These are actually not escaped -- Java passes the
 		 // raw unescaped Unicode values to the grammar compiler.
 		 LETTERS : ('a'|'\u00E4'|'\u4E9C'|'\u3042')* 'c';
 		 */
 		@CommentHasStringValue
 		public String grammar;
 	}
 	/**
 	 * Parser test descriptor for a lexer range U+00E0..U+00E5 whose endpoints
 	 * are written with single-backslash escapes, so (per the note inside the
 	 * grammar) the grammar compiler receives the raw characters.
 	 *
 	 * NOTE(review): the doc comment on {@code grammar} appears to be the grammar
 	 * source itself, captured through {@code @CommentHasStringValue}; treat it
 	 * as data rather than documentation -- confirm before editing it.
 	 */
 	public static class UnicodeUnescapedBMPRangeSet extends BaseParserTestDescriptor {
 		// 'a', then U+00E1, U+00E4, U+00E1, U+00E2, U+00E5 (all inside the range), then 'd'.
 		public String input = "a\u00E1\u00E4\u00E1\u00E2\u00E5d";
 		// Expected output: the input text plus a trailing newline.
 		public String output = "a\u00E1\u00E4\u00E1\u00E2\u00E5d\n";
 		// NOTE(review): null presumably means no error output is expected -- confirm against the harness.
 		public String errors = null;
 		public String startRule = "a";
 		public String grammarName = "T";
 		/**
 		 grammar T;
 		 a : LETTERS* 'd' {<InputText():writeln()>} ;
 		 // These are actually not escaped -- Java passes the
 		 // raw unescaped Unicode values to the grammar compiler.
 		 LETTERS : ('a'|'\u00E0'..'\u00E5');
 		 */
 		@CommentHasStringValue
 		public String grammar;
 	}
 	/**
 	 * Parser test descriptor for a lexer set of individual BMP characters whose
 	 * escapes are written with a doubled backslash, so the escape sequences
 	 * themselves (not the raw characters) reach the grammar.
 	 *
 	 * NOTE(review): the doc comment on {@code grammar} appears to be the grammar
 	 * source itself, captured through {@code @CommentHasStringValue}; treat it
 	 * as data rather than documentation -- confirm before editing it.
 	 */
 	public static class UnicodeEscapedBMPSet extends BaseParserTestDescriptor {
 		// 'a', U+00E4, U+3042, U+4E9C, then the terminating 'c'.
 		public String input = "a\u00E4\u3042\u4E9Cc";
 		// Expected output: the input text plus a trailing newline.
 		public String output = "a\u00E4\u3042\u4E9Cc\n";
 		// NOTE(review): null presumably means no error output is expected -- confirm against the harness.
 		public String errors = null;
 		public String startRule = "a";
 		public String grammarName = "T";
 		/**
 		 grammar T;
 		 a : LETTERS {<InputText():writeln()>} ;
 		 // Note the double-backslash to avoid Java passing
 		 // unescaped values as part of the grammar.
 		 LETTERS : ('a'|'\\u00E4'|'\\u4E9C'|'\\u3042')* 'c';
 		 */
 		@CommentHasStringValue
 		public String grammar;
 	}
 	/**
 	 * Parser test descriptor for a lexer range U+00E0..U+00E5 whose endpoints
 	 * are written with a doubled backslash, so the escape sequences themselves
 	 * (not the raw characters) reach the grammar.
 	 *
 	 * NOTE(review): the doc comment on {@code grammar} appears to be the grammar
 	 * source itself, captured through {@code @CommentHasStringValue}; treat it
 	 * as data rather than documentation -- confirm before editing it.
 	 */
 	public static class UnicodeEscapedBMPRangeSet extends BaseParserTestDescriptor {
 		// 'a', then U+00E1, U+00E4, U+00E1, U+00E2, U+00E5 (all inside the range), then 'd'.
 		public String input = "a\u00E1\u00E4\u00E1\u00E2\u00E5d";
 		// Expected output: the input text plus a trailing newline.
 		public String output = "a\u00E1\u00E4\u00E1\u00E2\u00E5d\n";
 		// NOTE(review): null presumably means no error output is expected -- confirm against the harness.
 		public String errors = null;
 		public String startRule = "a";
 		public String grammarName = "T";
 		/**
 		 grammar T;
 		 a : LETTERS* 'd' {<InputText():writeln()>} ;
 		 // Note the double-backslash to avoid Java passing
 		 // unescaped values as part of the grammar.
 		 LETTERS : ('a'|'\\u00E0'..'\\u00E5');
 		 */
 		@CommentHasStringValue
 		public String grammar;
 	}
 	// TODO(bhamiltoncx): This needs to be an error; the V3
 	// runtime used by the tool doesn't really understand unescaped code points >
 	// U+FFFF.
 	// public static class UnicodeUnescapedSMPSet extends BaseParserTestDescriptor {
 	//	public String input = new StringBuilder()
 	//			.append("a")
 	//			.appendCodePoint(0x1D5C2)
 	//			.appendCodePoint(0x1D5CE)
 	//			.appendCodePoint(0x1D5BA)
 	//			.append("c")
 	//			.toString();
 	//	public String output = new StringBuilder()
 	//			.append("a")
 	//			.appendCodePoint(0x1D5C2)
 	//			.appendCodePoint(0x1D5CE)
 	//			.appendCodePoint(0x1D5BA)
 	//			.append("c\n")
 	//			.toString();
 	//	public String errors = null;
 	//	public String startRule = "a";
 	//	public String grammarName = "T";
 	//	/**
 	//	 grammar T;
 	//	 a : LETTERS  {<InputText():writeln()>} ;
 	//	 // These are actually not escaped -- Java passes the
 	//	 // raw unescaped Unicode values to the grammar compiler.
 	//	 //
 	//	 // Each sequence is the UTF-16 encoding of a raw Unicode
 	//	 // SMP code point.
 	//	 LETTERS : ('a'|'\uD835\uDDBA'|'\uD835\uDDBE'|'\uD835\uDDC2'|'\uD835\uDDC8'|'\uD835\uDDCE')* 'c';
 	//	 */
 	//	@CommentHasStringValue
 	//	public String grammar;
 	// }
 	/**
 	 * Parser test descriptor for a lexer set of individual supplementary-plane
 	 * code points (above U+FFFF), written in the grammar with the extended
 	 * brace-delimited escape form and a doubled backslash.
 	 *
 	 * NOTE(review): the doc comment on {@code grammar} appears to be the grammar
 	 * source itself, captured through {@code @CommentHasStringValue}; treat it
 	 * as data rather than documentation -- confirm before editing it.
 	 */
 	public static class UnicodeEscapedSMPSet extends BaseParserTestDescriptor {
 		// 'a', then U+1D5C2, U+1D5CE, U+1D5BA (each listed in the grammar's set), then 'c'.
 		// appendCodePoint is used because these code points do not fit in one Java char.
 		public String input = new StringBuilder()
 				.append("a")
 				.appendCodePoint(0x1D5C2)
 				.appendCodePoint(0x1D5CE)
 				.appendCodePoint(0x1D5BA)
 				.append("c")
 				.toString();
 		// Expected output: the same text as the input plus a trailing newline.
 		public String output = new StringBuilder()
 				.append("a")
 				.appendCodePoint(0x1D5C2)
 				.appendCodePoint(0x1D5CE)
 				.appendCodePoint(0x1D5BA)
 				.append("c\n")
 				.toString();
 		// NOTE(review): null presumably means no error output is expected -- confirm against the harness.
 		public String errors = null;
 		public String startRule = "a";
 		public String grammarName = "T";
 		/**
 		 grammar T;
 		 a : LETTERS  {<InputText():writeln()>} ;
 		 // Note the double-backslash to avoid Java passing
 		 // unescaped values as part of the grammar.
 		 LETTERS : ('a'|'\\u{1D5BA}'|'\\u{1D5BE}'|'\\u{1D5C2}'|'\\u{1D5C8}'|'\\u{1D5CE}')* 'c';
 		 */
 		@CommentHasStringValue
 		public String grammar;
 	}
 	// Turns out Tool.java uses ANTLR 3's runtime, which means it can't use
 	// CodePointCharStream to understand unescaped code points > U+FFFF.
 	//
 	// TODO(bhamiltoncx): This needs to be an error, since we don't currently plan
 	// to port Tool.java to use ANTLR 4's runtime.
 	// public static class UnicodeUnescapedSMPRangeSet extends BaseParserTestDescriptor {
 	//	public String input = new StringBuilder()
 	//			.append("a")
 	//			.appendCodePoint(0x1D5C2)
 	//			.appendCodePoint(0x1D5CE)
 	//			.appendCodePoint(0x1D5BA)
 	//			.append("d")
 	//			.toString();
 	//	public String output = new StringBuilder()
 	//			.append("a")
 	//			.appendCodePoint(0x1D5C2)
 	//			.appendCodePoint(0x1D5CE)
 	//			.appendCodePoint(0x1D5BA)
 	//			.append("d\n")
 	//			.toString();
 	//	public String errors = null;
 	//	public String startRule = "a";
 	//	public String grammarName = "T";
 	//	/**
 	//	 grammar T;
 	//	 a : LETTERS* 'd' {<InputText():writeln()>} ;
 	//	 // These are actually not escaped -- Java passes the
 	//	 // raw unescaped Unicode values to the grammar compiler.
 	//	 LETTERS : ('a'|'\uD83D\uDE00'..'\uD83E\uDD43');
 	//	 */
 	//	@CommentHasStringValue
 	//	public String grammar;
 	// }
 	/**
 	 * Parser test descriptor for a lexer range U+1F600..U+1F943 in the
 	 * supplementary planes, written in the grammar with the extended
 	 * brace-delimited escape form and a doubled backslash.
 	 *
 	 * NOTE(review): the doc comment on {@code grammar} appears to be the grammar
 	 * source itself, captured through {@code @CommentHasStringValue}; treat it
 	 * as data rather than documentation -- confirm before editing it.
 	 */
 	public static class UnicodeEscapedSMPRangeSet extends BaseParserTestDescriptor {
 		// 'a', then U+1F609, U+1F942, U+1F700 (all inside the range), then 'd'.
 		public String input = new StringBuilder()
 				.append("a")
 				.appendCodePoint(0x1F609)
 				.appendCodePoint(0x1F942)
 				.appendCodePoint(0x1F700)
 				.append("d")
 				.toString();
 		// Expected output: the same text as the input plus a trailing newline.
 		public String output = new StringBuilder()
 				.append("a")
 				.appendCodePoint(0x1F609)
 				.appendCodePoint(0x1F942)
 				.appendCodePoint(0x1F700)
 				.append("d\n")
 				.toString();
 		// NOTE(review): null presumably means no error output is expected -- confirm against the harness.
 		public String errors = null;
 		public String startRule = "a";
 		public String grammarName = "T";
 		/**
 		 grammar T;
 		 a : LETTERS* 'd' {<InputText():writeln()>} ;
 		 // Note the double-backslash to avoid Java passing
 		 // unescaped values as part of the grammar.
 		 LETTERS : ('a'|'\\u{1F600}'..'\\u{1F943}');
 		 */
 		@CommentHasStringValue
 		public String grammar;
 	}
 	/**
 	 * Negative counterpart of the SMP range test: the input contains the code
 	 * points immediately outside the range U+1F600..U+1F943, and the expected
 	 * result is one token recognition error per offending code point.
 	 *
 	 * NOTE(review): the doc comment on {@code grammar} appears to be the grammar
 	 * source itself, captured through {@code @CommentHasStringValue}; treat it
 	 * as data rather than documentation -- confirm before editing it.
 	 */
 	public static class UnicodeEscapedSMPRangeSetMismatch extends BaseParserTestDescriptor {
 		// Test the code points just before and just after the range.
 		// U+1F5FF is one below the lower bound; U+1F944 is one above the upper bound.
 		public String input = new StringBuilder()
 				.append("a")
 				.appendCodePoint(0x1F5FF)
 				.appendCodePoint(0x1F944)
 				.append("d")
 				.toString();
 		// Only the matched 'a' and 'd' are expected in the output.
 		public String output = "ad\n";
 		// The expected columns are 1 and 2, i.e. each supplementary code point
 		// advances the column by one even though it occupies two Java chars.
 		public String errors = new StringBuilder()
 				.append("line 1:1 token recognition error at: '")
 				.appendCodePoint(0x1F5FF)
 				.append("'\n")
 				.append("line 1:2 token recognition error at: '")
 				.appendCodePoint(0x1F944)
 				.append("'\n")
 				.toString();
 		public String startRule = "a";
 		public String grammarName = "T";
 		/**
 		 grammar T;
 		 a : LETTERS* 'd' {<InputText():writeln()>} ;
 		 // Note the double-backslash to avoid Java passing
 		 // unescaped values as part of the grammar.
 		 LETTERS : ('a'|'\\u{1F600}'..'\\u{1F943}');
 		 */
 		@CommentHasStringValue
 		public String grammar;
 	}
 	/**
 	 * Parser test descriptor checking that the complement of a BMP-only set
 	 * (everything except 'b') also matches supplementary-plane code points.
 	 *
 	 * NOTE(review): the doc comment on {@code grammar} appears to be the grammar
 	 * source itself, captured through {@code @CommentHasStringValue}; treat it
 	 * as data rather than documentation -- confirm before editing it.
 	 */
 	public static class UnicodeNegatedBMPSetIncludesSMPCodePoints extends BaseParserTestDescriptor {
 		// 'a', then U+1F633, U+1F621, U+1F61D, U+1F913 spelled as UTF-16 surrogate pairs, then 'c'.
 		public String input = "a\uD83D\uDE33\uD83D\uDE21\uD83D\uDE1D\uD83E\uDD13c";
 		// Expected output: the input text plus a trailing newline.
 		public String output = "a\uD83D\uDE33\uD83D\uDE21\uD83D\uDE1D\uD83E\uDD13c\n";
 		// NOTE(review): null presumably means no error output is expected -- confirm against the harness.
 		public String errors = null;
 		public String startRule = "a";
 		public String grammarName = "T";
 		/**
 		 grammar T;
 		 a : LETTERS {<InputText():writeln()>} ;
 		 LETTERS : 'a' ~('b')+ 'c';
 		 */
 		@CommentHasStringValue
 		public String grammar;
 	}
 	/**
 	 * Parser test descriptor checking that the complement of a
 	 * supplementary-plane range (U+1F600..U+1F943) matches ordinary BMP
 	 * characters such as the 'b' in the input.
 	 *
 	 * NOTE(review): the doc comment on {@code grammar} appears to be the grammar
 	 * source itself, captured through {@code @CommentHasStringValue}; treat it
 	 * as data rather than documentation -- confirm before editing it.
 	 */
 	public static class UnicodeNegatedSMPSetIncludesBMPCodePoints extends BaseParserTestDescriptor {
 		public String input = "abc";
 		// Expected output: the input text plus a trailing newline.
 		public String output = "abc\n";
 		// NOTE(review): null presumably means no error output is expected -- confirm against the harness.
 		public String errors = null;
 		public String startRule = "a";
 		public String grammarName = "T";
 		/**
 		 grammar T;
 		 a : LETTERS {<InputText():writeln()>} ;
 		 LETTERS : 'a' ~('\\u{1F600}'..'\\u{1F943}')+ 'c';
 		 */
 		@CommentHasStringValue
 		public String grammar;
 	}
 }
--- a/runtime/CSharp/runtime/CSharp/Antlr4.Runtime/Atn/ATNDeserializer.cs
+++ b/runtime/CSharp/runtime/CSharp/Antlr4.Runtime/Atn/ATNDeserializer.cs
@ -22,6 +22,18 @@ namespace Antlr4.Runtime.Atn
        /// <remarks>This is the earliest supported serialized UUID.</remarks>
        private static readonly Guid BaseSerializedUuid;
        /// <summary>
        /// This UUID indicates the serialized ATN contains two sets of
        /// IntervalSets, where the second set's values are encoded as
        /// 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
        /// </summary>
        /// <remarks>
        /// This UUID indicates the serialized ATN contains two sets of
        /// IntervalSets, where the second set's values are encoded as
        /// 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
        /// </remarks>
        private static readonly Guid AddedUnicodeSmp;
        /// <summary>
        /// This list contains all of the currently supported UUIDs, ordered by when
        /// the feature first appeared in this branch.
@ -39,14 +51,18 @@ namespace Antlr4.Runtime.Atn
        static ATNDeserializer()
        {
 			BaseSerializedUuid = new Guid("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E");
 			AddedUnicodeSmp = new Guid("59627784-3BE5-417A-B9EB-8131A7286089");
            SupportedUuids = new List<Guid>();
            SupportedUuids.Add(BaseSerializedUuid);
-			SerializedUuid = BaseSerializedUuid;
+            SupportedUuids.Add(AddedUnicodeSmp);
 			SerializedUuid = AddedUnicodeSmp;
        }
        [NotNull]
        private readonly ATNDeserializationOptions deserializationOptions;
        private Guid uuid;
        public ATNDeserializer()
            : this(ATNDeserializationOptions.Default)
        {
@ -115,7 +131,11 @@ namespace Antlr4.Runtime.Atn
 			ReadStates (atn);
 			ReadRules (atn);
 			ReadModes (atn);
-			IList<IntervalSet> sets = ReadSets (atn);
+			IList<IntervalSet> sets = new List<IntervalSet>();
 			ReadSets (atn, sets, this.ReadInt);
 			if (IsFeatureSupported(AddedUnicodeSmp, uuid)) {
 				ReadSets (atn, sets, this.ReadInt32);
 			}
 			ReadEdges (atn, sets);
 			ReadDecisions (atn);
 			ReadLexerActions (atn);
@ -378,12 +398,11 @@ namespace Antlr4.Runtime.Atn
 			}
 		}
-		protected internal virtual IList<IntervalSet> ReadSets(ATN atn)
+		protected internal virtual void ReadSets(ATN atn, IList<IntervalSet> sets, Func<int> readUnicode)
 		{
 			//
 			// SETS
 			//
 			IList<IntervalSet> sets = new List<IntervalSet>();
 			int nsets = ReadInt();
 			for (int i_8 = 0; i_8 < nsets; i_8++)
 			{
@ -397,10 +416,9 @@ namespace Antlr4.Runtime.Atn
 				}
 				for (int j = 0; j < nintervals; j++)
 				{
-					set.Add(ReadInt(), ReadInt());
+					set.Add(readUnicode(), readUnicode());
 				}
 			}
 			return sets;
 		}
 		protected internal virtual void ReadModes(ATN atn)
@ -530,7 +548,7 @@ namespace Antlr4.Runtime.Atn
 		protected internal virtual void CheckUUID()
 		{
-			Guid uuid = ReadUUID();
+			uuid = ReadUUID();
 			if (!SupportedUuids.Contains(uuid))
 			{
 				string reason = string.Format(CultureInfo.CurrentCulture, "Could not deserialize ATN with UUID {0} (expected {1} or a legacy UUID).", uuid, SerializedUuid);
--- a/runtime/Cpp/runtime/src/atn/ATNDeserializer.cpp
+++ b/runtime/Cpp/runtime/src/atn/ATNDeserializer.cpp
@ -57,6 +57,51 @@ using namespace antlrcpp;
 const size_t ATNDeserializer::SERIALIZED_VERSION = 3;
 namespace {
 // Rebuilds a 32-bit value from two consecutive 16-bit words of the
 // serialized data, least-significant word first.
 uint32_t deserializeInt32(const std::vector<uint16_t>& data, size_t offset) {
  const uint32_t lowWord = data[offset];
  const uint32_t highWord = data[offset + 1];
  return lowWord | (highWord << 16);
 }
 // Reads a single 16-bit word at position p and advances p by one.
 ssize_t readUnicodeInt(const std::vector<uint16_t>& data, int& p) {
  const uint16_t word = data[p];
  ++p;
  return static_cast<ssize_t>(word);
 }
 // Reads a 32-bit value stored as two words at position p and advances p
 // past both of them.
 ssize_t readUnicodeInt32(const std::vector<uint16_t>& data, int& p) {
  const uint32_t value = deserializeInt32(data, p);
  p += 2;
  return static_cast<ssize_t>(value);
 }
 // Appends one group of serialized interval sets to `sets`, advancing p.
 // Layout read here: set count, then per set an interval count, an EOF flag,
 // and that many (low, high) pairs decoded through `readUnicode`.
 //
 // The reader is a template parameter so the optimizer can inline the
 // 16- or 32-bit variant as needed.
 template <typename F>
 void deserializeSets(
  const std::vector<uint16_t>& data,
  int& p,
  std::vector<misc::IntervalSet>& sets,
  F readUnicode) {
  const int setCount = data[p++];
  for (int setIndex = 0; setIndex < setCount; ++setIndex) {
    const int intervalCount = data[p++];
    misc::IntervalSet current;
    // A non-zero flag word means the set also contains EOF (-1).
    const bool hasEof = data[p++] != 0;
    if (hasEof) {
      current.add(-1);
    }
    for (int remaining = intervalCount; remaining > 0; --remaining) {
      // Two separate statements keep the low/high read order well defined.
      auto low = readUnicode(data, p);
      auto high = readUnicode(data, p);
      current.add(low, high);
    }
    sets.push_back(current);
  }
 }
 }
 ATNDeserializer::ATNDeserializer(): ATNDeserializer(ATNDeserializationOptions::getDefaultOptions()) {
 }
@ -75,8 +120,12 @@ Guid ATNDeserializer::ADDED_LEXER_ACTIONS() {
  return Guid("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E");
 }
 // Returns the UUID constructed from the literal below, identifying the
 // Unicode SMP serialization feature (a second group of interval sets whose
 // values are 32-bit).
 Guid ATNDeserializer::ADDED_UNICODE_SMP() {
  return Guid("59627784-3BE5-417A-B9EB-8131A7286089");
 }
 Guid ATNDeserializer::SERIALIZED_UUID() {
-  return ADDED_LEXER_ACTIONS();
+  return ADDED_UNICODE_SMP();
 }
 Guid ATNDeserializer::BASE_SERIALIZED_UUID() {
@ -84,7 +133,7 @@ Guid ATNDeserializer::BASE_SERIALIZED_UUID() {
 }
 std::vector<Guid>& ATNDeserializer::SUPPORTED_UUIDS() {
-  static std::vector<Guid> singleton = { BASE_SERIALIZED_UUID(), ADDED_PRECEDENCE_TRANSITIONS(), ADDED_LEXER_ACTIONS() };
+  static std::vector<Guid> singleton = { BASE_SERIALIZED_UUID(), ADDED_PRECEDENCE_TRANSITIONS(), ADDED_LEXER_ACTIONS(), ADDED_UNICODE_SMP() };
  return singleton;
 }
@ -239,21 +288,14 @@ ATN ATNDeserializer::deserialize(const std::vector<uint16_t>& input) {
  // SETS
  //
  std::vector<misc::IntervalSet> sets;
  int nsets = data[p++];
  for (int i = 0; i < nsets; i++) {
    int nintervals = data[p++];
    misc::IntervalSet set;
-    bool containsEof = data[p++] != 0;
+  // First, deserialize sets with 16-bit arguments <= U+FFFF.
-    if (containsEof) {
+  deserializeSets(data, p, sets, readUnicodeInt);
      set.add(-1);
    }
-    for (int j = 0; j < nintervals; j++) {
+  // Next, if the ATN was serialized with the Unicode SMP feature,
-      set.add(data[p], data[p + 1], true);
+  // deserialize sets with 32-bit arguments <= U+10FFFF.
-      p += 2;
+  if (isFeatureSupported(ADDED_UNICODE_SMP(), uuid)) {
-    }
+    deserializeSets(data, p, sets, readUnicodeInt32);
    sets.push_back(set);
  }
  //
--- a/runtime/Cpp/runtime/src/atn/ATNDeserializer.h
+++ b/runtime/Cpp/runtime/src/atn/ATNDeserializer.h
@ -67,6 +67,13 @@ namespace atn {
     */
    static Guid ADDED_LEXER_ACTIONS();
    /**
     * This UUID indicates the serialized ATN contains two sets of
     * IntervalSets, where the second set's values are encoded as
     * 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
     */
    static Guid ADDED_UNICODE_SMP();
    /// This list contains all of the currently supported UUIDs, ordered by when
    /// the feature first appeared in this branch.
    static std::vector<Guid>& SUPPORTED_UUIDS();
--- a/runtime/Cpp/runtime/src/misc/Interval.cpp
+++ b/runtime/Cpp/runtime/src/misc/Interval.cpp
@ -24,14 +24,7 @@ Interval::Interval() : Interval((ssize_t)-1, -2) { // Need an explicit cast here
 Interval::Interval(size_t a_, size_t b_) : Interval(symbolToNumeric(a_), symbolToNumeric(b_)) {
 }
-Interval::Interval(ssize_t a_, ssize_t b_, bool autoExtend) {
+Interval::Interval(ssize_t a_, ssize_t b_) : a(a_), b(b_) {
  a = a_;
  b = b_;
  // XXX: temporary hack to make the full Unicode range available.
  if (autoExtend && b == 0xFFFF) {
    b = 0x10FFFF;
  }
 }
 size_t Interval::length() const {
--- a/runtime/Cpp/runtime/src/misc/Interval.h
+++ b/runtime/Cpp/runtime/src/misc/Interval.h
@ -27,7 +27,7 @@ namespace misc {
    Interval();
    explicit Interval(size_t a_, size_t b_); // For unsigned -> signed mappings.
-    Interval(ssize_t a_, ssize_t b_, bool autoExtend = false); // Automatically extend a value of 0xFFFF to 0x10FFFF.
+    Interval(ssize_t a_, ssize_t b_);
    virtual ~Interval() {};
    /// return number of elements between a and b inclusively. x..x is length 1.
--- a/runtime/Cpp/runtime/src/misc/IntervalSet.cpp
+++ b/runtime/Cpp/runtime/src/misc/IntervalSet.cpp
@ -50,8 +50,8 @@ IntervalSet IntervalSet::of(ssize_t a) {
  return IntervalSet({ Interval(a, a) });
 }
-IntervalSet IntervalSet::of(ssize_t a, ssize_t b, bool autoExtend) {
+IntervalSet IntervalSet::of(ssize_t a, ssize_t b) {
-  return IntervalSet({ Interval(a, b, autoExtend) });
+  return IntervalSet({ Interval(a, b) });
 }
 void IntervalSet::clear() {
@ -68,8 +68,8 @@ void IntervalSet::add(ssize_t el) {
  add(el, el);
 }
-void IntervalSet::add(ssize_t a, ssize_t b, bool autoExtend) {
+void IntervalSet::add(ssize_t a, ssize_t b) {
-  add(Interval(a, b, autoExtend));
+  add(Interval(a, b));
 }
 void IntervalSet::add(const Interval &addition) {
--- a/runtime/Cpp/runtime/src/misc/IntervalSet.h
+++ b/runtime/Cpp/runtime/src/misc/IntervalSet.h
@ -44,7 +44,7 @@ namespace misc {
    static IntervalSet of(ssize_t a);
    /// Create a set with all ints within range [a..b] (inclusive)
-    static IntervalSet of(ssize_t a, ssize_t b, bool autoExtend = false);
+    static IntervalSet of(ssize_t a, ssize_t b);
    virtual void clear();
@ -58,7 +58,7 @@ namespace misc {
    /// If overlap, combine ranges.  For example,
    /// If this is {1..5, 10..20}, adding 6..7 yields
    /// {1..5, 6..7, 10..20}.  Adding 4..8 yields {1..8, 10..20}.
-    virtual void add(ssize_t a, ssize_t b, bool autoExtend = false);
+    virtual void add(ssize_t a, ssize_t b);
  public:
    /// combine all sets in the array returned the or'd value
--- a/runtime/Go/antlr/atn_deserializer.go
+++ b/runtime/Go/antlr/atn_deserializer.go
@ -15,15 +15,16 @@ import (
 // This is the earliest supported serialized UUID.
 // stick to serialized version for now, we don't need a UUID instance
 var BaseSerializedUUID = "AADB8D7E-AEEF-4415-AD2B-8204D6CF042E"
 var AddedUnicodeSMP = "59627784-3BE5-417A-B9EB-8131A7286089"
 // This list contains all of the currently supported UUIDs, ordered by when
 // the feature first appeared in this branch.
-var SupportedUUIDs = []string{BaseSerializedUUID}
+var SupportedUUIDs = []string{BaseSerializedUUID, AddedUnicodeSMP}
 var SerializedVersion = 3
 // This is the current serialized UUID.
-var SerializedUUID = BaseSerializedUUID
+var SerializedUUID = AddedUnicodeSMP
 type LoopEndStateIntPair struct {
 	item0 *LoopEndState
@ -91,7 +92,15 @@ func (a *ATNDeserializer) DeserializeFromUInt16(data []uint16) *ATN {
 	a.readRules(atn)
 	a.readModes(atn)
-	sets := a.readSets(atn)
+	sets := make([]*IntervalSet, 0)
 	// First, deserialize sets with 16-bit arguments <= U+FFFF.
 	sets = a.readSets(atn, sets, a.readInt)
 	// Next, if the ATN was serialized with the Unicode SMP feature,
 	// deserialize sets with 32-bit arguments <= U+10FFFF.
 	if (a.isFeatureSupported(AddedUnicodeSMP, a.uuid)) {
 		sets = a.readSets(atn, sets, a.readInt32)
 	}
 	a.readEdges(atn, sets)
 	a.readDecisions(atn)
@ -266,8 +275,7 @@ func (a *ATNDeserializer) readModes(atn *ATN) {
 	}
 }
-func (a *ATNDeserializer) readSets(atn *ATN) []*IntervalSet {
+func (a *ATNDeserializer) readSets(atn *ATN, sets []*IntervalSet, readUnicode func() int) []*IntervalSet {
 	sets := make([]*IntervalSet, 0)
 	m := a.readInt()
 	for i := 0; i < m; i++ {
@ -283,8 +291,8 @@ func (a *ATNDeserializer) readSets(atn *ATN) []*IntervalSet {
 		}
 		for j := 0; j < n; j++ {
-			i1 := a.readInt()
+			i1 := readUnicode()
-			i2 := a.readInt()
+			i2 := readUnicode()
 			iset.addRange(i1, i2)
 		}
@ -642,6 +650,12 @@ func (a *ATNDeserializer) readInt() int {
 	return int(v)
 }
 // readInt32 reads two consecutive values via readInt and combines them into
 // one integer, treating the first as the low 16 bits and the second as the
 // high bits.
 func (a *ATNDeserializer) readInt32() int {
 	lowWord := a.readInt()
 	highWord := a.readInt()
 	return (highWord << 16) | lowWord
 }
 //TODO
 //func (a *ATNDeserializer) readLong() int64 {
 //    panic("Not implemented")
--- a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDeserializer.java
+++ b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDeserializer.java
@ -44,6 +44,12 @@ public class ATNDeserializer {
 	 * {@link LexerAction} instances.
 	 */
 	private static final UUID ADDED_LEXER_ACTIONS;
 	/**
 	 * This UUID indicates the serialized ATN contains two sets of
 	 * IntervalSets, where the second set's values are encoded as
 	 * 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
 	 */
 	private static final UUID ADDED_UNICODE_SMP;
 	/**
 	 * This list contains all of the currently supported UUIDs, ordered by when
 	 * the feature first appeared in this branch.
@ -61,15 +67,58 @@ public class ATNDeserializer {
 		BASE_SERIALIZED_UUID = UUID.fromString("33761B2D-78BB-4A43-8B0B-4F5BEE8AACF3");
 		ADDED_PRECEDENCE_TRANSITIONS = UUID.fromString("1DA0C57D-6C06-438A-9B27-10BCB3CE0F61");
 		ADDED_LEXER_ACTIONS = UUID.fromString("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E");
 		ADDED_UNICODE_SMP = UUID.fromString("59627784-3BE5-417A-B9EB-8131A7286089");
 		SUPPORTED_UUIDS = new ArrayList<UUID>();
 		SUPPORTED_UUIDS.add(BASE_SERIALIZED_UUID);
 		SUPPORTED_UUIDS.add(ADDED_PRECEDENCE_TRANSITIONS);
 		SUPPORTED_UUIDS.add(ADDED_LEXER_ACTIONS);
 		SUPPORTED_UUIDS.add(ADDED_UNICODE_SMP);
-		SERIALIZED_UUID = ADDED_LEXER_ACTIONS;
+		SERIALIZED_UUID = ADDED_UNICODE_SMP;
 	}
 	/**
 	 * Strategy for decoding one Unicode code point from the serialized
 	 * {@code char[]} data, so the same set-reading loop can handle both the
 	 * 16-bit and the 32-bit encodings.
 	 */
 	interface UnicodeDeserializer {
 		// Decodes the code point stored at index p of data. The implementations
 		// in this file delegate to toInt() (one char) or toInt32() (two chars).
 		// Does not advance p; see size().
 		int readUnicode(char[] data, int p);
 		// Work around Java not allowing mutation of captured variables
 		// by returning amount by which to increment p after each read
 		int size();
 	}
 	/** Selects which code point encoding {@code getUnicodeDeserializer} should decode. */
 	enum UnicodeDeserializingMode {
 		// One char per code point (values read with toInt).
 		UNICODE_BMP,
 		// Two chars per code point (values read with toInt32).
 		UNICODE_SMP
 	}
 	/**
 	 * Creates the code point reader matching {@code mode}.
 	 *
 	 * @param mode {@code UNICODE_BMP} selects the one-char reader; any other
 	 *             value selects the two-char reader
 	 * @return a new reader whose {@code size()} is the number of chars consumed
 	 *         per code point (1 or 2)
 	 */
 	static UnicodeDeserializer getUnicodeDeserializer(UnicodeDeserializingMode mode) {
 		final UnicodeDeserializer reader;
 		if (mode == UnicodeDeserializingMode.UNICODE_BMP) {
 			// 16-bit form: each code point occupies a single char.
 			reader = new UnicodeDeserializer() {
 				@Override
 				public int size() {
 					return 1;
 				}
 				@Override
 				public int readUnicode(char[] data, int p) {
 					return toInt(data[p]);
 				}
 			};
 		}
 		else {
 			// 32-bit form: each code point spans two consecutive chars.
 			reader = new UnicodeDeserializer() {
 				@Override
 				public int size() {
 					return 2;
 				}
 				@Override
 				public int readUnicode(char[] data, int p) {
 					return toInt32(data, p);
 				}
 			};
 		}
 		return reader;
 	}
 	private final ATNDeserializationOptions deserializationOptions;
@ -98,7 +147,7 @@ public class ATNDeserializer {
 	 * serialized ATN at or after the feature identified by {@code feature} was
 	 * introduced; otherwise, {@code false}.
 	 */
-	protected boolean isFeatureSupported(UUID feature, UUID actualUuid) {
+	static protected boolean isFeatureSupported(UUID feature, UUID actualUuid) {
 		int featureIndex = SUPPORTED_UUIDS.indexOf(feature);
 		if (featureIndex < 0) {
 			return false;
@ -258,22 +307,14 @@ public class ATNDeserializer {
 		// SETS
 		//
 		List<IntervalSet> sets = new ArrayList<IntervalSet>();
 		int nsets = toInt(data[p++]);
 		for (int i=0; i<nsets; i++) {
 			int nintervals = toInt(data[p]);
 			p++;
 			IntervalSet set = new IntervalSet();
 			sets.add(set);
-			boolean containsEof = toInt(data[p++]) != 0;
+		// First, read all sets with 16-bit Unicode code points <= U+FFFF.
-			if (containsEof) {
+		p = deserializeSets(data, p, sets, getUnicodeDeserializer(UnicodeDeserializingMode.UNICODE_BMP));
 				set.add(-1);
 			}
-			for (int j=0; j<nintervals; j++) {
+		// Next, if the ATN was serialized with the Unicode SMP feature,
-				set.add(toInt(data[p]), toInt(data[p + 1]));
+		// deserialize sets with 32-bit arguments <= U+10FFFF.
-				p += 2;
+		if (isFeatureSupported(ADDED_UNICODE_SMP, uuid)) {
-			}
+			p = deserializeSets(data, p, sets, getUnicodeDeserializer(UnicodeDeserializingMode.UNICODE_SMP));
 		}
 		//
@ -510,6 +551,30 @@ public class ATNDeserializer {
 		return atn;
 	}
 	/**
 	 * Reads one group of serialized interval sets starting at {@code p} and
 	 * appends them to {@code sets}.
 	 *
 	 * Layout read here: set count, then per set an interval count, an EOF
 	 * flag, and that many (low, high) pairs decoded by
 	 * {@code unicodeDeserializer}.
 	 *
 	 * @param data serialized ATN data
 	 * @param p index of the set count
 	 * @param sets list that receives the decoded sets, in order
 	 * @param unicodeDeserializer reader for the interval endpoints
 	 * @return the index just past the last value consumed
 	 */
 	private int deserializeSets(char[] data, int p, List<IntervalSet> sets, UnicodeDeserializer unicodeDeserializer) {
 		int cursor = p;
 		final int setCount = toInt(data[cursor]);
 		cursor++;
 		for (int setIndex = 0; setIndex < setCount; setIndex++) {
 			final int intervalCount = toInt(data[cursor++]);
 			final IntervalSet current = new IntervalSet();
 			sets.add(current);
 			// A non-zero flag means the set also contains EOF (-1).
 			final int eofFlag = toInt(data[cursor++]);
 			if (eofFlag != 0) {
 				current.add(-1);
 			}
 			for (int remaining = intervalCount; remaining > 0; remaining--) {
 				final int low = unicodeDeserializer.readUnicode(data, cursor);
 				cursor += unicodeDeserializer.size();
 				final int high = unicodeDeserializer.readUnicode(data, cursor);
 				cursor += unicodeDeserializer.size();
 				current.add(low, high);
 			}
 		}
 		return cursor;
 	}
 	/**
 	 * Analyze the {@link StarLoopEntryState} states in the specified ATN to set
 	 * the {@link StarLoopEntryState#isPrecedenceDecision} field to the
--- a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNSerializer.java
+++ b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNSerializer.java
@ -14,8 +14,10 @@ import org.antlr.v4.runtime.misc.Utils;
 import java.io.InvalidClassException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.List;
 import java.util.LinkedHashMap;
 import java.util.Locale;
 import java.util.Map;
 import java.util.UUID;
@ -24,6 +26,10 @@ public class ATNSerializer {
 	public ATN atn;
 	private List<String> tokenNames;
 	/**
 	 * Strategy for writing one code point into the serialized data, so the
 	 * same set-writing loop can emit either encoding.
 	 */
 	private interface CodePointSerializer {
 		// Appends the encoding of cp to data. The implementations in this file
 		// either add cp as a single value or delegate to serializeInt(data, cp).
 		void serializeCodePoint(IntegerList data, int cp);
 	}
 	public ATNSerializer(ATN atn) {
 		assert atn.grammarType != null;
 		this.atn = atn;
@ -47,9 +53,11 @@ public class ATNSerializer {
 	 *  	(args are token type,actionIndex in lexer else 0,0)
 	 *      num modes,
 	 *      mode-0-start-state, mode-1-start-state, ... (parser has 0 modes)
-	 *      num sets
+	 *      num unicode-bmp-sets
-	 *      set-0-interval-count intervals, set-1-interval-count intervals, ...
+	 *      bmp-set-0-interval-count intervals, bmp-set-1-interval-count intervals, ...
-	 *  	num total edges,
+	 *      num unicode-smp-sets
 	 *      smp-set-0-interval-count intervals, smp-set-1-interval-count intervals, ...
 	 *	num total edges,
 	 *      src, trg, edge-type, edge arg1, optional edge arg2 (present always), ...
 	 *      num decisions,
 	 *      decision-0-start-state, decision-1-start-state, ...
@ -66,8 +74,10 @@ public class ATNSerializer {
 		data.add(atn.maxTokenType);
 		int nedges = 0;
-		Map<IntervalSet, Integer> setIndices = new HashMap<IntervalSet, Integer>();
+		// Note that we use a LinkedHashMap as a set to
-		List<IntervalSet> sets = new ArrayList<IntervalSet>();
+		// maintain insertion order while deduplicating
 		// entries with the same key.
 		Map<IntervalSet, Boolean> sets = new LinkedHashMap<>();
 		// dump states, count edges and collect sets while doing so
 		IntegerList nonGreedyStates = new IntegerList();
@ -114,10 +124,7 @@ public class ATNSerializer {
 				int edgeType = Transition.serializationTypes.get(t.getClass());
 				if ( edgeType == Transition.SET || edgeType == Transition.NOT_SET ) {
 					SetTransition st = (SetTransition)t;
-					if (!setIndices.containsKey(st.set)) {
+					sets.put(st.set, true);
 						sets.add(st.set);
 						setIndices.put(st.set, sets.size() - 1);
 					}
 				}
 			}
 		}
@ -156,34 +163,40 @@ public class ATNSerializer {
 				data.add(modeStartState.stateNumber);
 			}
 		}
-
+		List<IntervalSet> bmpSets = new ArrayList<>();
-		int nsets = sets.size();
+		List<IntervalSet> smpSets = new ArrayList<>();
-		data.add(nsets);
+		for (IntervalSet set : sets.keySet()) {
-		for (IntervalSet set : sets) {
+			if (set.getMaxElement() <= Character.MAX_VALUE) {
-			boolean containsEof = set.contains(Token.EOF);
+				bmpSets.add(set);
-			if (containsEof && set.getIntervals().get(0).b == Token.EOF) {
+			} else {
-				data.add(set.getIntervals().size() - 1);
+				smpSets.add(set);
 			}
-			else {
+		}
-				data.add(set.getIntervals().size());
+		serializeSets(
-			}
+			data,
-
+			bmpSets,
-			data.add(containsEof ? 1 : 0);
+			new CodePointSerializer() {
-			for (Interval I : set.getIntervals()) {
+				@Override
-				if (I.a == Token.EOF) {
+				public void serializeCodePoint(IntegerList data, int cp) {
-					if (I.b == Token.EOF) {
+					data.add(cp);
 						continue;
 					}
 					else {
 						data.add(0);
 					}
 				}
-				else {
+			});
-					data.add(I.a);
+		serializeSets(
 			data,
 			smpSets,
 			new CodePointSerializer() {
 				@Override
 				public void serializeCodePoint(IntegerList data, int cp) {
 					serializeInt(data, cp);
 				}
-
+			});
-				data.add(I.b);
+		Map<IntervalSet, Integer> setIndices = new HashMap<>();
-			}
+		int setIndex = 0;
 		for (IntervalSet bmpSet : bmpSets) {
 			setIndices.put(bmpSet, setIndex++);
 		}
 		for (IntervalSet smpSet : smpSets) {
 			setIndices.put(smpSet, setIndex++);
 		}
 		data.add(nedges);
@ -359,6 +372,42 @@ public class ATNSerializer {
 		return data;
 	}
 	/**
 	 * Writes one group of interval sets to {@code data}.
 	 *
 	 * Layout written here: the set count, then per set the number of
 	 * serialized intervals, an EOF flag (1 or 0), and a (low, high) pair for
 	 * each serialized interval, encoded through {@code codePointSerializer}.
 	 *
 	 * @param data output list
 	 * @param sets sets to write, in iteration order
 	 * @param codePointSerializer encoder for the interval endpoints
 	 */
 	private static void serializeSets(
 			IntegerList data,
 			Collection<IntervalSet> sets,
 			CodePointSerializer codePointSerializer)
 	{
 		data.add(sets.size());
 		for (IntervalSet current : sets) {
 			final boolean hasEof = current.contains(Token.EOF);
 			final List<Interval> intervals = current.getIntervals();
 			// An interval that is exactly EOF..EOF is carried by the flag alone,
 			// so it is not counted among the serialized intervals.
 			int serializedCount = intervals.size();
 			if (hasEof && intervals.get(0).b == Token.EOF) {
 				serializedCount--;
 			}
 			data.add(serializedCount);
 			data.add(hasEof ? 1 : 0);
 			for (Interval range : intervals) {
 				final boolean startsAtEof = range.a == Token.EOF;
 				if (startsAtEof && range.b == Token.EOF) {
 					continue;
 				}
 				// An interval that begins at EOF but extends further is written
 				// with 0 as its lower bound.
 				final int low = startsAtEof ? 0 : range.a;
 				codePointSerializer.serializeCodePoint(data, low);
 				codePointSerializer.serializeCodePoint(data, range.b);
 			}
 		}
 	}
 	public String decode(char[] data) {
 		data = data.clone();
 		// don't adjust the first value since that's the version number
@ -437,25 +486,10 @@ public class ATNSerializer {
 			int s = ATNDeserializer.toInt(data[p++]);
 			buf.append("mode ").append(i).append(":").append(s).append('\n');
 		}
-		int nsets = ATNDeserializer.toInt(data[p++]);
+		int numBMPSets = ATNDeserializer.toInt(data[p++]);
-		for (int i=0; i<nsets; i++) {
+		p = appendSets(buf, data, p, numBMPSets, 0, ATNDeserializer.getUnicodeDeserializer(ATNDeserializer.UnicodeDeserializingMode.UNICODE_BMP));
-			int nintervals = ATNDeserializer.toInt(data[p++]);
+		int numSMPSets = ATNDeserializer.toInt(data[p++]);
-			buf.append(i).append(":");
+		p = appendSets(buf, data, p, numSMPSets, numBMPSets, ATNDeserializer.getUnicodeDeserializer(ATNDeserializer.UnicodeDeserializingMode.UNICODE_SMP));
 			boolean containsEof = data[p++] != 0;
 			if (containsEof) {
 				buf.append(getTokenName(Token.EOF));
 			}
 			for (int j=0; j<nintervals; j++) {
 				if ( containsEof || j>0 ) {
 					buf.append(", ");
 				}
 				buf.append(getTokenName(ATNDeserializer.toInt(data[p]))).append("..").append(getTokenName(ATNDeserializer.toInt(data[p + 1])));
 				p += 2;
 			}
 			buf.append("\n");
 		}
 		int nedges = ATNDeserializer.toInt(data[p++]);
 		for (int i=0; i<nedges; i++) {
 			int src = ATNDeserializer.toInt(data[p]);
@ -491,6 +525,31 @@ public class ATNSerializer {
 		return buf.toString();
 	}
 	/**
 	 * Appends one text line per serialized set to {@code buf}, reading the sets
 	 * from {@code data} starting at index {@code p}.
 	 *
 	 * <p>Each set is read as an interval count, a contains-EOF flag, and then
 	 * that many (low, high) pairs decoded through {@code unicodeDeserializer}.
 	 * The line is labelled with the set's position plus {@code setIndexOffset}.
 	 *
 	 * @return the index in {@code data} just past the last value consumed
 	 */
 	private int appendSets(StringBuilder buf, char[] data, int p, int nsets, int setIndexOffset, ATNDeserializer.UnicodeDeserializer unicodeDeserializer) {
 		for (int setNumber = 0; setNumber < nsets; setNumber++) {
 			int intervalCount = ATNDeserializer.toInt(data[p++]);
 			buf.append(setNumber + setIndexOffset).append(":");
 			boolean hasEof = data[p++] != 0;
 			if (hasEof) {
 				buf.append(getTokenName(Token.EOF));
 			}
 			// A separator precedes every interval except the first item on the line.
 			boolean needSeparator = hasEof;
 			for (int remaining = intervalCount; remaining > 0; remaining--) {
 				if (needSeparator) {
 					buf.append(", ");
 				}
 				needSeparator = true;
 				int lower = unicodeDeserializer.readUnicode(data, p);
 				p += unicodeDeserializer.size();
 				int upper = unicodeDeserializer.readUnicode(data, p);
 				p += unicodeDeserializer.size();
 				buf.append(getTokenName(lower));
 				buf.append("..");
 				buf.append(getTokenName(upper));
 			}
 			buf.append("\n");
 		}
 		return p;
 	}
 	public String getTokenName(int t) {
 		if ( t==-1 ) return "EOF";
--- a/runtime/JavaScript/src/antlr4/atn/ATNDeserializer.js
+++ b/runtime/JavaScript/src/antlr4/atn/ATNDeserializer.js
@ -51,14 +51,21 @@ var LexerModeAction = LexerActions.LexerModeAction;
 // stick to serialized version for now, we don't need a UUID instance
 var BASE_SERIALIZED_UUID = "AADB8D7E-AEEF-4415-AD2B-8204D6CF042E";
 //
 // This UUID indicates the serialized ATN contains two sets of
 // IntervalSets, where the second set's values are encoded as
 // 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
 //
 var ADDED_UNICODE_SMP = "59627784-3BE5-417A-B9EB-8131A7286089";
 // This list contains all of the currently supported UUIDs, ordered by when
 // the feature first appeared in this branch.
-var SUPPORTED_UUIDS = [ BASE_SERIALIZED_UUID ];
+var SUPPORTED_UUIDS = [ BASE_SERIALIZED_UUID, ADDED_UNICODE_SMP ];
 var SERIALIZED_VERSION = 3;
 // This is the current serialized UUID.
-var SERIALIZED_UUID = BASE_SERIALIZED_UUID;
+var SERIALIZED_UUID = ADDED_UNICODE_SMP;
 function initArray( length, value) {
 	var tmp = [];
@ -91,11 +98,11 @@ function ATNDeserializer (options) {
 // introduced; otherwise, {@code false}.
 ATNDeserializer.prototype.isFeatureSupported = function(feature, actualUuid) {
-    var idx1 = SUPPORTED_UUIDS.index(feature);
+    var idx1 = SUPPORTED_UUIDS.indexOf(feature);
    if (idx1<0) {
        return false;
    }
-    var idx2 = SUPPORTED_UUIDS.index(actualUuid);
+    var idx2 = SUPPORTED_UUIDS.indexOf(actualUuid);
    return idx2 >= idx1;
 };
@ -107,7 +114,14 @@ ATNDeserializer.prototype.deserialize = function(data) {
    this.readStates(atn);
    this.readRules(atn);
    this.readModes(atn);
-    var sets = this.readSets(atn);
+    var sets = [];
    // First, deserialize sets with 16-bit arguments <= U+FFFF.
    this.readSets(atn, sets, this.readInt.bind(this));
    // Next, if the ATN was serialized with the Unicode SMP feature,
    // deserialize sets with 32-bit arguments <= U+10FFFF.
    if (this.isFeatureSupported(ADDED_UNICODE_SMP, this.uuid)) {
        this.readSets(atn, sets, this.readInt32.bind(this));
    }
    this.readEdges(atn, sets);
    this.readDecisions(atn);
    this.readLexerActions(atn);
@ -244,8 +258,7 @@ ATNDeserializer.prototype.readModes = function(atn) {
    }
 };
-ATNDeserializer.prototype.readSets = function(atn) {
+ATNDeserializer.prototype.readSets = function(atn, sets, readUnicode) {
    var sets = [];
    var m = this.readInt();
    for (var i=0; i<m; i++) {
        var iset = new IntervalSet();
@ -256,12 +269,11 @@ ATNDeserializer.prototype.readSets = function(atn) {
            iset.addOne(-1);
        }
        for (var j=0; j<n; j++) {
-            var i1 = this.readInt();
+            var i1 = readUnicode();
-            var i2 = this.readInt();
+            var i2 = readUnicode();
            iset.addRange(i1, i2);
        }
    }
    return sets;
 };
 ATNDeserializer.prototype.readEdges = function(atn, sets) {
--- a/runtime/Python2/src/antlr4/Lexer.py
+++ b/runtime/Python2/src/antlr4/Lexer.py
@ -278,7 +278,7 @@ class Lexer(Recognizer, TokenSource):
        start = self._tokenStartCharIndex
        stop = self._input.index
        text = self._input.getText(start, stop)
-        msg = "token recognition error at: '" + self.getErrorDisplay(text) + "'"
+        msg = u"token recognition error at: '" + self.getErrorDisplay(text) + u"'"
        listener = self.getErrorListenerDispatch()
        listener.syntaxError(self, None, self._tokenStartLine, self._tokenStartColumn, msg, e)
@ -291,17 +291,17 @@ class Lexer(Recognizer, TokenSource):
    def getErrorDisplayForChar(self, c):
        if ord(c[0])==Token.EOF:
            return "<EOF>"
-        elif c=='\n':
+        elif c==u'\n':
-            return "\\n"
+            return u"\\n"
-        elif c=='\t':
+        elif c==u'\t':
-            return "\\t"
+            return u"\\t"
-        elif c=='\r':
+        elif c==u'\r':
-            return "\\r"
+            return u"\\r"
        else:
-            return unicode(c)
+            return c
    def getCharErrorDisplay(self, c):
-        return "'" + self.getErrorDisplayForChar(c) + "'"
+        return u"'" + self.getErrorDisplayForChar(c) + u"'"
    # Lexers can normally match any char in its vocabulary after matching
    #  a token, so do the easy thing and just kill a character and hope
--- a/runtime/Python2/src/antlr4/atn/ATNDeserializer.py
+++ b/runtime/Python2/src/antlr4/atn/ATNDeserializer.py
@ -13,14 +13,19 @@ from antlr4.atn.ATNDeserializationOptions import ATNDeserializationOptions
 # This is the earliest supported serialized UUID.
 BASE_SERIALIZED_UUID = UUID("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E")
 # This UUID indicates the serialized ATN contains two sets of
 # IntervalSets, where the second set's values are encoded as
 # 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
 ADDED_UNICODE_SMP = UUID("59627784-3BE5-417A-B9EB-8131A7286089")
 # This list contains all of the currently supported UUIDs, ordered by when
 # the feature first appeared in this branch.
-SUPPORTED_UUIDS = [ BASE_SERIALIZED_UUID ]
+SUPPORTED_UUIDS = [ BASE_SERIALIZED_UUID, ADDED_UNICODE_SMP ]
 SERIALIZED_VERSION = 3
 # This is the current serialized UUID.
-SERIALIZED_UUID = BASE_SERIALIZED_UUID
+SERIALIZED_UUID = ADDED_UNICODE_SMP
 class ATNDeserializer (object):
@ -59,7 +64,13 @@ class ATNDeserializer (object):
        self.readStates(atn)
        self.readRules(atn)
        self.readModes(atn)
-        sets = self.readSets(atn)
+        sets = []
        # First, read all sets with 16-bit Unicode code points <= U+FFFF.
        self.readSets(atn, sets, self.readInt)
        # Next, if the ATN was serialized with the Unicode SMP feature,
        # deserialize sets with 32-bit arguments <= U+10FFFF.
        if self.isFeatureSupported(ADDED_UNICODE_SMP, self.uuid):
            self.readSets(atn, sets, self.readInt32)
        self.readEdges(atn, sets)
        self.readDecisions(atn)
        self.readLexerActions(atn)
@ -170,8 +181,7 @@ class ATNDeserializer (object):
            s = self.readInt()
            atn.modeToStartState.append(atn.states[s])
-    def readSets(self, atn):
+    def readSets(self, atn, sets, readUnicode):
        sets = []
        m = self.readInt()
        for i in range(0, m):
            iset = IntervalSet()
@ -181,10 +191,9 @@ class ATNDeserializer (object):
            if containsEof!=0:
                iset.addOne(-1)
            for j in range(0, n):
-                i1 = self.readInt()
+                i1 = readUnicode()
-                i2 = self.readInt()
+                i2 = readUnicode()
                iset.addRange(Interval(i1, i2 + 1)) # range upper limit is exclusive
        return sets
    def readEdges(self, atn, sets):
        nedges = self.readInt()
--- a/runtime/Python3/src/antlr4/atn/ATNDeserializer.py
+++ b/runtime/Python3/src/antlr4/atn/ATNDeserializer.py
@ -4,6 +4,7 @@
 #/
 from uuid import UUID
 from io import StringIO
 from typing import Callable
 from antlr4.Token import Token
 from antlr4.atn.ATN import ATN
 from antlr4.atn.ATNType import ATNType
@ -15,14 +16,19 @@ from antlr4.atn.ATNDeserializationOptions import ATNDeserializationOptions
 # This is the earliest supported serialized UUID.
 BASE_SERIALIZED_UUID = UUID("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E")
 # This UUID indicates the serialized ATN contains two sets of
 # IntervalSets, where the second set's values are encoded as
 # 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
 ADDED_UNICODE_SMP = UUID("59627784-3BE5-417A-B9EB-8131A7286089")
 # This list contains all of the currently supported UUIDs, ordered by when
 # the feature first appeared in this branch.
-SUPPORTED_UUIDS = [ BASE_SERIALIZED_UUID ]
+SUPPORTED_UUIDS = [ BASE_SERIALIZED_UUID, ADDED_UNICODE_SMP ]
 SERIALIZED_VERSION = 3
 # This is the current serialized UUID.
-SERIALIZED_UUID = BASE_SERIALIZED_UUID
+SERIALIZED_UUID = ADDED_UNICODE_SMP
 class ATNDeserializer (object):
@ -58,7 +64,13 @@ class ATNDeserializer (object):
        self.readStates(atn)
        self.readRules(atn)
        self.readModes(atn)
-        sets = self.readSets(atn)
+        sets = []
        # First, read all sets with 16-bit Unicode code points <= U+FFFF.
        self.readSets(atn, sets, self.readInt)
        # Next, if the ATN was serialized with the Unicode SMP feature,
        # deserialize sets with 32-bit arguments <= U+10FFFF.
        if self.isFeatureSupported(ADDED_UNICODE_SMP, self.uuid):
            self.readSets(atn, sets, self.readInt32)
        self.readEdges(atn, sets)
        self.readDecisions(atn)
        self.readLexerActions(atn)
@ -170,8 +182,7 @@ class ATNDeserializer (object):
            s = self.readInt()
            atn.modeToStartState.append(atn.states[s])
-    def readSets(self, atn:ATN):
+    def readSets(self, atn:ATN, sets:list, readUnicode:Callable[[], int]):
        sets = []
        m = self.readInt()
        for i in range(0, m):
            iset = IntervalSet()
@ -181,10 +192,9 @@ class ATNDeserializer (object):
            if containsEof!=0:
                iset.addOne(-1)
            for j in range(0, n):
-                i1 = self.readInt()
+                i1 = readUnicode()
-                i2 = self.readInt()
+                i2 = readUnicode()
                iset.addRange(range(i1, i2 + 1)) # range upper limit is exclusive
        return sets
    def readEdges(self, atn:ATN, sets:list):
        nedges = self.readInt()
--- a/runtime/Swift/Sources/Antlr4/atn/ATNDeserializer.swift
+++ b/runtime/Swift/Sources/Antlr4/atn/ATNDeserializer.swift
@ -26,21 +26,30 @@ public class ATNDeserializer {
    /// for the addition of lexer actions encoded as a sequence of
    /// {@link org.antlr.v4.runtime.atn.LexerAction} instances.
    private static let ADDED_LEXER_ACTIONS: UUID = UUID(uuidString: "AADB8D7E-AEEF-4415-AD2B-8204D6CF042E")!
-    /// This list contains all of the currently supported UUIDs, ordered by when
+
-    /// the feature first appeared in this branch.
+    /// This UUID indicates the serialized ATN contains two sets of
    /// IntervalSets, where the second set's values are encoded as
    /// 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
    private static let ADDED_UNICODE_SMP: UUID = UUID(uuidString: "59627784-3BE5-417A-B9EB-8131A7286089")!
    /**
    * This list contains all of the currently supported UUIDs, ordered by when
    * the feature first appeared in this branch.
    */
    private static let SUPPORTED_UUIDS: Array<UUID> = {
        var suuid = Array<UUID>()
        suuid.append(ATNDeserializer.BASE_SERIALIZED_UUID)
        suuid.append(ATNDeserializer.ADDED_PRECEDENCE_TRANSITIONS)
        suuid.append(ATNDeserializer.ADDED_LEXER_ACTIONS)
        suuid.append(ATNDeserializer.ADDED_UNICODE_SMP)
        return suuid
    }()
    /// This is the current serialized UUID.
    public static let SERIALIZED_UUID: UUID = {
-        // SERIALIZED_UUID = ADDED_LEXER_ACTIONS;
+        // SERIALIZED_UUID = ADDED_UNICODE_SMP;
-        return UUID(uuidString: "AADB8D7E-AEEF-4415-AD2B-8204D6CF042E")!
+        return UUID(uuidString: "59627784-3BE5-417A-B9EB-8131A7286089")!
    }()
@ -245,24 +254,14 @@ public class ATNDeserializer {
        // SETS
        //
        var sets: Array<IntervalSet> = Array<IntervalSet>()
        let nsets: Int = toInt(data[p])
        p += 1
        for _ in 0..<nsets {
            let nintervals: Int = toInt(data[p])
            p += 1
            let set: IntervalSet = try IntervalSet()
            sets.append(set)
-            let containsEof: Bool = toInt(data[p]) != 0
+        // First, deserialize sets with 16-bit arguments <= U+FFFF.
-            p += 1
+        try readSets(data, &p, &sets, readUnicodeInt)
            if containsEof {
                try set.add(-1)
            }
-            for _ in 0..<nintervals {
+        // Next, if the ATN was serialized with the Unicode SMP feature,
-                try set.add(toInt(data[p]), toInt(data[p + 1]))
+        // deserialize sets with 32-bit arguments <= U+10FFFF.
-                p += 2
+        if isFeatureSupported(ATNDeserializer.ADDED_UNICODE_SMP, uuid) {
-            }
+            try readSets(data, &p, &sets, readUnicodeInt32)
        }
        //
@ -521,6 +520,39 @@ public class ATNDeserializer {
        return atn
    }
    /// Decodes the single element at `data[p]` with `toInt` and moves `p`
    /// forward by one position.
    private func readUnicodeInt(_ data: [Character], _ p: inout Int) -> Int {
        // The increment runs after the return value has been computed.
        defer { p += 1 }
        return toInt(data[p])
    }
    /// Decodes a value with `toInt32(data, p)` and moves `p` forward by two
    /// positions.
    private func readUnicodeInt32(_ data: [Character], _ p: inout Int) -> Int {
        // The increment runs after the return value has been computed.
        defer { p += 2 }
        return toInt32(data, p)
    }
    /// Reads a count-prefixed run of interval sets from `data`, starting at `p`,
    /// and appends each one to `sets`.
    ///
    /// Every set is stored as an interval count, a contains-EOF flag, and then
    /// that many (low, high) pairs. Each bound is decoded by `readUnicode`,
    /// which also advances `p`. On return `p` is just past the last value read.
    private func readSets(_ data: [Character], _ p: inout Int, _ sets: inout Array<IntervalSet>, _ readUnicode: ([Character], inout Int) -> Int) throws {
        let setCount = toInt(data[p])
        p += 1
        for _ in 0..<setCount {
            let intervalCount = toInt(data[p])
            p += 1
            let intervals = try IntervalSet()
            sets.append(intervals)

            let eofFlag = toInt(data[p])
            p += 1
            if eofFlag != 0 {
                try intervals.add(-1)
            }

            for _ in 0..<intervalCount {
                // The lower bound is read first, then the upper bound.
                let lower = readUnicode(data, &p)
                let upper = readUnicode(data, &p)
                try intervals.add(lower, upper)
            }
        }
    }
    public func deserializeFromJson(_ jsonStr: String) -> ATN {
        // let jsonStr = Utils.readFile2String(jsonFileName)
        guard !jsonStr.isEmpty else {
--- a/tool-testsuite/test/org/antlr/v4/test/tool/TestATNLexerInterpreter.java
+++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestATNLexerInterpreter.java
@ -6,8 +6,8 @@
 package org.antlr.v4.test.tool;
 import org.antlr.v4.runtime.ANTLRInputStream;
 import org.antlr.v4.runtime.CharStream;
 import org.antlr.v4.runtime.CharStreams;
 import org.antlr.v4.runtime.atn.ATN;
 import org.antlr.v4.runtime.atn.ATNState;
 import org.antlr.v4.runtime.misc.Utils;
@ -121,6 +121,94 @@ public class TestATNLexerInterpreter extends BaseJavaToolTest {
 		checkLexerMatches(lg, "c", expecting);
 	}
 	/** A set of two BMP literals matches an input made of the first literal. */
 	@Test public void testLexerSetUnicodeBMP() throws Exception {
 		String grammarText =
 			"lexer grammar L;\n"+
 			"ID : ('\u611B'|'\u611C')\n ;";
 		checkLexerMatches(new LexerGrammar(grammarText), "\u611B", "ID, EOF");
 	}
 	/** A negated set of two BMP literals matches a BMP character outside the set. */
 	@Test public void testLexerNotSetUnicodeBMP() throws Exception {
 		String grammarText =
 			"lexer grammar L;\n"+
 			"ID : ~('\u611B'|'\u611C')\n ;";
 		checkLexerMatches(new LexerGrammar(grammarText), "\u611D", "ID, EOF");
 	}
 		@Test public void testLexerNotSetUnicodeBMPMatchesSMP() throws Exception {
 		// A negated set of two BMP literals is expected to accept the code point U+1F4A9.
 		String grammarText =
 			"lexer grammar L;\n"+
 			"ID : ~('\u611B'|'\u611C')\n ;";
 		String smpInput = new String(Character.toChars(0x1F4A9));
 		checkLexerMatches(new LexerGrammar(grammarText), smpInput, "ID, EOF");
 	}
 	/** A set of two extended-escape literals matches the code point U+1F4A9. */
 	@Test public void testLexerSetUnicodeSMP() throws Exception {
 		String grammarText =
 			"lexer grammar L;\n"+
 			"ID : ('\\u{1F4A9}'|'\\u{1F4AA}')\n ;";
 		String smpInput = new String(Character.toChars(0x1F4A9));
 		checkLexerMatches(new LexerGrammar(grammarText), smpInput, "ID, EOF");
 	}
 	/** A negated set of ASCII literals matches the code point U+1F4A9. */
 	@Test public void testLexerNotBMPSetMatchesUnicodeSMP() throws Exception {
 		String grammarText =
 			"lexer grammar L;\n"+
 			"ID : ~('a'|'b')\n ;";
 		String smpInput = new String(Character.toChars(0x1F4A9));
 		checkLexerMatches(new LexerGrammar(grammarText), smpInput, "ID, EOF");
 	}
 	/** A negated set of ASCII literals matches a non-ASCII BMP character. */
 	@Test public void testLexerNotBMPSetMatchesBMP() throws Exception {
 		String grammarText =
 			"lexer grammar L;\n"+
 			"ID : ~('a'|'b')\n ;";
 		checkLexerMatches(new LexerGrammar(grammarText), "\u611B", "ID, EOF");
 	}
 	/**
 	 * A negated set of ASCII literals matches the code point U+1F4A9.
 	 * NOTE(review): same grammar and input as testLexerNotBMPSetMatchesUnicodeSMP.
 	 */
 	@Test public void testLexerNotBMPSetMatchesSMP() throws Exception {
 		String grammarText =
 			"lexer grammar L;\n"+
 			"ID : ~('a'|'b')\n ;";
 		String smpInput = new String(Character.toChars(0x1F4A9));
 		checkLexerMatches(new LexerGrammar(grammarText), smpInput, "ID, EOF");
 	}
 	/** A negated set of two extended-escape literals matches a BMP character. */
 	@Test public void testLexerNotSMPSetMatchesBMP() throws Exception {
 		String grammarText =
 			"lexer grammar L;\n"+
 			"ID : ~('\\u{1F4A9}'|'\\u{1F4AA}')\n ;";
 		checkLexerMatches(new LexerGrammar(grammarText), "\u611B", "ID, EOF");
 	}
 	/** A negated set of two extended-escape literals matches U+1D7C0, which is outside the set. */
 	@Test public void testLexerNotSMPSetMatchesSMP() throws Exception {
 		String grammarText =
 			"lexer grammar L;\n"+
 			"ID : ~('\\u{1F4A9}'|'\\u{1F4AA}')\n ;";
 		String smpInput = new String(Character.toChars(0x1D7C0));
 		checkLexerMatches(new LexerGrammar(grammarText), smpInput, "ID, EOF");
 	}
 	/** A range between two extended-escape literals matches U+1F4AF, which lies inside it. */
 	@Test public void testLexerRangeUnicodeSMP() throws Exception {
 		String grammarText =
 			"lexer grammar L;\n"+
 			"ID : ('\\u{1F4A9}'..'\\u{1F4B0}')\n ;";
 		String smpInput = new String(Character.toChars(0x1F4AF));
 		checkLexerMatches(new LexerGrammar(grammarText), smpInput, "ID, EOF");
 	}
 	/** A range from a BMP literal to an extended-escape literal matches U+12001, which lies inside it. */
 	@Test public void testLexerRangeUnicodeBMPToSMP() throws Exception {
 		String grammarText =
 			"lexer grammar L;\n"+
 			"ID : ('\\u611B'..'\\u{1F4B0}')\n ;";
 		String smpInput = new String(Character.toChars(0x12001));
 		checkLexerMatches(new LexerGrammar(grammarText), smpInput, "ID, EOF");
 	}
 	@Test public void testLexerKeywordIDAmbiguity() throws Exception {
 		LexerGrammar lg = new LexerGrammar(
 			"lexer grammar L;\n"+
@ -293,7 +381,7 @@ public class TestATNLexerInterpreter extends BaseJavaToolTest {
 	protected void checkLexerMatches(LexerGrammar lg, String inputString, String expecting) {
 		ATN atn = createATN(lg, true);
-		CharStream input = new ANTLRInputStream(inputString);
+		CharStream input = CharStreams.createWithString(inputString);
 		ATNState startState = atn.modeNameToStartState.get("DEFAULT_MODE");
 		DOTGenerator dot = new DOTGenerator(lg);
 //		System.out.println(dot.getDOT(startState, true));
--- a/tool-testsuite/test/org/antlr/v4/test/tool/TestATNSerialization.java
+++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestATNSerialization.java
@ -291,6 +291,113 @@ public class TestATNSerialization extends BaseJavaToolTest {
 		assertEquals(expecting, result);
 	}
 	/**
 	 * A single extended-escape literal (code point 128169) is expected to decode
 	 * as a one-interval set reached through a SET transition.
 	 */
 	@Test public void testLexerUnicodeSMPLiteralSerializedToSet() throws Exception {
 		LexerGrammar grammar = new LexerGrammar(
 			"lexer grammar L;\n"+
 			"INT : '\\u{1F4A9}' ;");
 		ATN atn = createATN(grammar, true);
 		String actual = ATNSerializer.getDecoded(atn, Arrays.asList(grammar.getTokenNames()));
 		String expected =
 			"max type 1\n" +
 			"0:TOKEN_START -1\n" +
 			"1:RULE_START 0\n" +
 			"2:RULE_STOP 0\n" +
 			"3:BASIC 0\n" +
 			"4:BASIC 0\n" +
 			"rule 0:1 1\n" +
 			"mode 0:0\n" +
 			"0:128169..128169\n" +
 			"0->1 EPSILON 0,0,0\n" +
 			"1->3 EPSILON 0,0,0\n" +
 			"3->4 SET 0,0,0\n" +
 			"4->2 EPSILON 0,0,0\n" +
 			"0:0\n";
 		assertEquals(expected, actual);
 	}
 	/**
 	 * A range from 'a' up to an extended-escape literal is expected to decode as
 	 * a single interval 'a'..128169 reached through a SET transition.
 	 */
 	@Test public void testLexerUnicodeSMPRangeSerializedToSet() throws Exception {
 		LexerGrammar grammar = new LexerGrammar(
 			"lexer grammar L;\n"+
 			"INT : ('a'..'\\u{1F4A9}') ;");
 		ATN atn = createATN(grammar, true);
 		String actual = ATNSerializer.getDecoded(atn, Arrays.asList(grammar.getTokenNames()));
 		String expected =
 			"max type 1\n" +
 			"0:TOKEN_START -1\n" +
 			"1:RULE_START 0\n" +
 			"2:RULE_STOP 0\n" +
 			"3:BASIC 0\n" +
 			"4:BASIC 0\n" +
 			"rule 0:1 1\n" +
 			"mode 0:0\n" +
 			"0:'a'..128169\n" +
 			"0->1 EPSILON 0,0,0\n" +
 			"1->3 EPSILON 0,0,0\n" +
 			"3->4 SET 0,0,0\n" +
 			"4->2 EPSILON 0,0,0\n" +
 			"0:0\n";
 		assertEquals(expected, actual);
 	}
 	/**
 	 * Although the SMP rule is declared first, the decoded output is expected to
 	 * list the BMP set as set 0 and the SMP set as set 1, with the transitions
 	 * referring to those indexes.
 	 */
 	@Test public void testLexerUnicodeSMPSetSerializedAfterBMPSet() throws Exception {
 		LexerGrammar grammar = new LexerGrammar(
 			"lexer grammar L;\n"+
 			"SMP : ('\\u{1F4A9}' | '\\u{1F4AF}') ;\n"+
 			"BMP : ('a' | 'x') ;");
 		ATN atn = createATN(grammar, true);
 		String actual = ATNSerializer.getDecoded(atn, Arrays.asList(grammar.getTokenNames()));
 		String expected =
 			"max type 2\n" +
 			"0:TOKEN_START -1\n" +
 			"1:RULE_START 0\n" +
 			"2:RULE_STOP 0\n" +
 			"3:RULE_START 1\n" +
 			"4:RULE_STOP 1\n" +
 			"5:BASIC 0\n" +
 			"6:BASIC 0\n" +
 			"7:BASIC 1\n" +
 			"8:BASIC 1\n" +
 			"rule 0:1 1\n" +
 			"rule 1:3 2\n" +
 			"mode 0:0\n" +
 			"0:'a'..'a', 'x'..'x'\n" +
 			"1:128169..128169, 128175..128175\n" +
 			"0->1 EPSILON 0,0,0\n" +
 			"0->3 EPSILON 0,0,0\n" +
 			"1->5 EPSILON 0,0,0\n" +
 			"3->7 EPSILON 0,0,0\n" +
 			"5->6 SET 1,0,0\n" +
 			"6->2 EPSILON 0,0,0\n" +
 			"7->8 SET 0,0,0\n" +
 			"8->4 EPSILON 0,0,0\n" +
 			"0:0\n";
 		assertEquals(expected, actual);
 	}
 	/**
 	 * A negated single literal is expected to decode as the set 'a'..'a'
 	 * reached through a NOT_SET transition.
 	 */
 	@Test public void testLexerNotLiteral() throws Exception {
 		LexerGrammar grammar = new LexerGrammar(
 			"lexer grammar L;\n"+
 			"INT : ~'a' ;");
 		ATN atn = createATN(grammar, true);
 		String actual = ATNSerializer.getDecoded(atn, Arrays.asList(grammar.getTokenNames()));
 		String expected =
 			"max type 1\n" +
 			"0:TOKEN_START -1\n" +
 			"1:RULE_START 0\n" +
 			"2:RULE_STOP 0\n" +
 			"3:BASIC 0\n" +
 			"4:BASIC 0\n" +
 			"rule 0:1 1\n" +
 			"mode 0:0\n" +
 			"0:'a'..'a'\n" +
 			"0->1 EPSILON 0,0,0\n" +
 			"1->3 EPSILON 0,0,0\n" +
 			"3->4 NOT_SET 0,0,0\n" +
 			"4->2 EPSILON 0,0,0\n" +
 			"0:0\n";
 		assertEquals(expected, actual);
 	}
 	@Test public void testLexerRange() throws Exception {
 		LexerGrammar lg = new LexerGrammar(
 			"lexer grammar L;\n"+
@ -518,6 +625,222 @@ public class TestATNSerialization extends BaseJavaToolTest {
 		assertEquals(expecting, result);
 	}
 	/**
 	 * A negated set of two adjacent BMP characters, given to the grammar as raw
 	 * characters, is expected to decode as one merged interval behind a NOT_SET
 	 * transition.
 	 */
 	@Test public void testLexerUnicodeUnescapedBMPNotSet() throws Exception {
 		LexerGrammar grammar = new LexerGrammar(
 			"lexer grammar L;\n"+
 			"ID : ~('\u4E9C'|'\u4E9D')\n ;");
 		ATN atn = createATN(grammar, true);
 		String actual = ATNSerializer.getDecoded(atn, Arrays.asList(grammar.getTokenNames()));
 		String expected =
 			"max type 1\n" +
 			"0:TOKEN_START -1\n" +
 			"1:RULE_START 0\n" +
 			"2:RULE_STOP 0\n" +
 			"3:BASIC 0\n" +
 			"4:BASIC 0\n" +
 			"rule 0:1 1\n" +
 			"mode 0:0\n" +
 			"0:'\\u4E9C'..'\\u4E9D'\n" +
 			"0->1 EPSILON 0,0,0\n" +
 			"1->3 EPSILON 0,0,0\n" +
 			"3->4 NOT_SET 0,0,0\n" +
 			"4->2 EPSILON 0,0,0\n" +
 			"0:0\n";
 		assertEquals(expected, actual);
 	}
 	/**
 	 * A set mixing raw BMP characters and a raw BMP range is expected to decode
 	 * as three intervals in ascending order behind a SET transition.
 	 */
 	@Test public void testLexerUnicodeUnescapedBMPSetWithRange() throws Exception {
 		LexerGrammar grammar = new LexerGrammar(
 			"lexer grammar L;\n"+
 			"ID : ('\u4E9C'|'\u4E9D'|'\u6C5F'|'\u305F'..'\u307B')\n ;");
 		ATN atn = createATN(grammar, true);
 		String actual = ATNSerializer.getDecoded(atn, Arrays.asList(grammar.getTokenNames()));
 		String expected =
 			"max type 1\n" +
 			"0:TOKEN_START -1\n" +
 			"1:RULE_START 0\n" +
 			"2:RULE_STOP 0\n" +
 			"3:BASIC 0\n" +
 			"4:BASIC 0\n" +
 			"rule 0:1 1\n" +
 			"mode 0:0\n" +
 			"0:'\\u305F'..'\\u307B', '\\u4E9C'..'\\u4E9D', '\\u6C5F'..'\\u6C5F'\n" +
 			"0->1 EPSILON 0,0,0\n" +
 			"1->3 EPSILON 0,0,0\n" +
 			"3->4 SET 0,0,0\n" +
 			"4->2 EPSILON 0,0,0\n" +
 			"0:0\n";
 		assertEquals(expected, actual);
 	}
 	/**
 	 * A negated set mixing raw BMP characters and a raw BMP range is expected to
 	 * decode as three intervals in ascending order behind a NOT_SET transition.
 	 */
 	@Test public void testLexerUnicodeUnescapedBMPNotSetWithRange() throws Exception {
 		LexerGrammar grammar = new LexerGrammar(
 			"lexer grammar L;\n"+
 			"ID : ~('\u4E9C'|'\u4E9D'|'\u6C5F'|'\u305F'..'\u307B')\n ;");
 		ATN atn = createATN(grammar, true);
 		String actual = ATNSerializer.getDecoded(atn, Arrays.asList(grammar.getTokenNames()));
 		String expected =
 			"max type 1\n" +
 			"0:TOKEN_START -1\n" +
 			"1:RULE_START 0\n" +
 			"2:RULE_STOP 0\n" +
 			"3:BASIC 0\n" +
 			"4:BASIC 0\n" +
 			"rule 0:1 1\n" +
 			"mode 0:0\n" +
 			"0:'\\u305F'..'\\u307B', '\\u4E9C'..'\\u4E9D', '\\u6C5F'..'\\u6C5F'\n" +
 			"0->1 EPSILON 0,0,0\n" +
 			"1->3 EPSILON 0,0,0\n" +
 			"3->4 NOT_SET 0,0,0\n" +
 			"4->2 EPSILON 0,0,0\n" +
 			"0:0\n";
 		assertEquals(expected, actual);
 	}
 	/**
 	 * A negated set of two adjacent BMP characters, written in the grammar as
 	 * four-digit escapes, is expected to decode as one merged interval behind a
 	 * NOT_SET transition.
 	 */
 	@Test public void testLexerUnicodeEscapedBMPNotSet() throws Exception {
 		LexerGrammar grammar = new LexerGrammar(
 			"lexer grammar L;\n"+
 			"ID : ~('\\u4E9C'|'\\u4E9D')\n ;");
 		ATN atn = createATN(grammar, true);
 		String actual = ATNSerializer.getDecoded(atn, Arrays.asList(grammar.getTokenNames()));
 		String expected =
 			"max type 1\n" +
 			"0:TOKEN_START -1\n" +
 			"1:RULE_START 0\n" +
 			"2:RULE_STOP 0\n" +
 			"3:BASIC 0\n" +
 			"4:BASIC 0\n" +
 			"rule 0:1 1\n" +
 			"mode 0:0\n" +
 			"0:'\\u4E9C'..'\\u4E9D'\n" +
 			"0->1 EPSILON 0,0,0\n" +
 			"1->3 EPSILON 0,0,0\n" +
 			"3->4 NOT_SET 0,0,0\n" +
 			"4->2 EPSILON 0,0,0\n" +
 			"0:0\n";
 		assertEquals(expected, actual);
 	}
 	/**
 	 * A set mixing four-digit-escaped BMP characters and an escaped BMP range is
 	 * expected to decode as three intervals in ascending order behind a SET
 	 * transition.
 	 */
 	@Test public void testLexerUnicodeEscapedBMPSetWithRange() throws Exception {
 		LexerGrammar grammar = new LexerGrammar(
 			"lexer grammar L;\n"+
 			"ID : ('\\u4E9C'|'\\u4E9D'|'\\u6C5F'|'\\u305F'..'\\u307B')\n ;");
 		ATN atn = createATN(grammar, true);
 		String actual = ATNSerializer.getDecoded(atn, Arrays.asList(grammar.getTokenNames()));
 		String expected =
 			"max type 1\n" +
 			"0:TOKEN_START -1\n" +
 			"1:RULE_START 0\n" +
 			"2:RULE_STOP 0\n" +
 			"3:BASIC 0\n" +
 			"4:BASIC 0\n" +
 			"rule 0:1 1\n" +
 			"mode 0:0\n" +
 			"0:'\\u305F'..'\\u307B', '\\u4E9C'..'\\u4E9D', '\\u6C5F'..'\\u6C5F'\n" +
 			"0->1 EPSILON 0,0,0\n" +
 			"1->3 EPSILON 0,0,0\n" +
 			"3->4 SET 0,0,0\n" +
 			"4->2 EPSILON 0,0,0\n" +
 			"0:0\n";
 		assertEquals(expected, actual);
 	}
 	/**
 	 * A negated set mixing four-digit-escaped BMP characters and an escaped BMP
 	 * range is expected to decode as three intervals in ascending order behind a
 	 * NOT_SET transition.
 	 */
 	@Test public void testLexerUnicodeEscapedBMPNotSetWithRange() throws Exception {
 		LexerGrammar grammar = new LexerGrammar(
 			"lexer grammar L;\n"+
 			"ID : ~('\\u4E9C'|'\\u4E9D'|'\\u6C5F'|'\\u305F'..'\\u307B')\n ;");
 		ATN atn = createATN(grammar, true);
 		String actual = ATNSerializer.getDecoded(atn, Arrays.asList(grammar.getTokenNames()));
 		String expected =
 			"max type 1\n" +
 			"0:TOKEN_START -1\n" +
 			"1:RULE_START 0\n" +
 			"2:RULE_STOP 0\n" +
 			"3:BASIC 0\n" +
 			"4:BASIC 0\n" +
 			"rule 0:1 1\n" +
 			"mode 0:0\n" +
 			"0:'\\u305F'..'\\u307B', '\\u4E9C'..'\\u4E9D', '\\u6C5F'..'\\u6C5F'\n" +
 			"0->1 EPSILON 0,0,0\n" +
 			"1->3 EPSILON 0,0,0\n" +
 			"3->4 NOT_SET 0,0,0\n" +
 			"4->2 EPSILON 0,0,0\n" +
 			"0:0\n";
 		assertEquals(expected, actual);
 	}
 	/**
 	 * A negated set of two adjacent extended-escape literals is expected to
 	 * decode as the single interval 128169..128170 behind a NOT_SET transition.
 	 */
 	@Test public void testLexerUnicodeEscapedSMPNotSet() throws Exception {
 		LexerGrammar grammar = new LexerGrammar(
 			"lexer grammar L;\n"+
 			"ID : ~('\\u{1F4A9}'|'\\u{1F4AA}')\n ;");
 		ATN atn = createATN(grammar, true);
 		String actual = ATNSerializer.getDecoded(atn, Arrays.asList(grammar.getTokenNames()));
 		String expected =
 			"max type 1\n" +
 			"0:TOKEN_START -1\n" +
 			"1:RULE_START 0\n" +
 			"2:RULE_STOP 0\n" +
 			"3:BASIC 0\n" +
 			"4:BASIC 0\n" +
 			"rule 0:1 1\n" +
 			"mode 0:0\n" +
 			"0:128169..128170\n" +
 			"0->1 EPSILON 0,0,0\n" +
 			"1->3 EPSILON 0,0,0\n" +
 			"3->4 NOT_SET 0,0,0\n" +
 			"4->2 EPSILON 0,0,0\n" +
 			"0:0\n";
 		assertEquals(expected, actual);
 	}
 	/**
 	 * A set mixing extended-escape literals and an extended-escape range is
 	 * expected to decode as three intervals in ascending order behind a SET
 	 * transition.
 	 */
 	@Test public void testLexerUnicodeEscapedSMPSetWithRange() throws Exception {
 		LexerGrammar grammar = new LexerGrammar(
 			"lexer grammar L;\n"+
 			"ID : ('\\u{1F4A9}'|'\\u{1F4AA}'|'\\u{1F441}'|'\\u{1D40F}'..'\\u{1D413}')\n ;");
 		ATN atn = createATN(grammar, true);
 		String actual = ATNSerializer.getDecoded(atn, Arrays.asList(grammar.getTokenNames()));
 		String expected =
 			"max type 1\n" +
 			"0:TOKEN_START -1\n" +
 			"1:RULE_START 0\n" +
 			"2:RULE_STOP 0\n" +
 			"3:BASIC 0\n" +
 			"4:BASIC 0\n" +
 			"rule 0:1 1\n" +
 			"mode 0:0\n" +
 			"0:119823..119827, 128065..128065, 128169..128170\n" +
 			"0->1 EPSILON 0,0,0\n" +
 			"1->3 EPSILON 0,0,0\n" +
 			"3->4 SET 0,0,0\n" +
 			"4->2 EPSILON 0,0,0\n" +
 			"0:0\n";
 		assertEquals(expected, actual);
 	}
 	/**
 	 * A negated set mixing extended-escape literals and an extended-escape range
 	 * is expected to decode as three intervals in ascending order behind a
 	 * NOT_SET transition.
 	 */
 	@Test public void testLexerUnicodeEscapedSMPNotSetWithRange() throws Exception {
 		LexerGrammar grammar = new LexerGrammar(
 			"lexer grammar L;\n"+
 			"ID : ~('\\u{1F4A9}'|'\\u{1F4AA}'|'\\u{1F441}'|'\\u{1D40F}'..'\\u{1D413}')\n ;");
 		ATN atn = createATN(grammar, true);
 		String actual = ATNSerializer.getDecoded(atn, Arrays.asList(grammar.getTokenNames()));
 		String expected =
 			"max type 1\n" +
 			"0:TOKEN_START -1\n" +
 			"1:RULE_START 0\n" +
 			"2:RULE_STOP 0\n" +
 			"3:BASIC 0\n" +
 			"4:BASIC 0\n" +
 			"rule 0:1 1\n" +
 			"mode 0:0\n" +
 			"0:119823..119827, 128065..128065, 128169..128170\n" +
 			"0->1 EPSILON 0,0,0\n" +
 			"1->3 EPSILON 0,0,0\n" +
 			"3->4 NOT_SET 0,0,0\n" +
 			"4->2 EPSILON 0,0,0\n" +
 			"0:0\n";
 		assertEquals(expected, actual);
 	}
 	@Test public void testLexerWildcardWithMode() throws Exception {
 		LexerGrammar lg = new LexerGrammar(
 			"lexer grammar L;\n"+
--- a/tool-testsuite/test/org/antlr/v4/test/tool/TestTokenTypeAssignment.java
+++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestTokenTypeAssignment.java
@ -141,6 +141,24 @@ public class TestTokenTypeAssignment extends BaseJavaToolTest {
 		assertEquals("'\\n'", literals.toArray()[0]);
 	}
 	/** A parser literal written with a four-digit escape is keyed by its grammar spelling. */
 	@Test public void testParserCharLiteralWithBasicUnicodeEscape() throws Exception {
 		String grammarText =
 				"grammar t;\n"+
 				"a : '\\uABCD';\n";
 		Grammar g = new Grammar(grammarText);
 		Object[] storedLiterals = g.stringLiteralToTypeMap.keySet().toArray();
 		// must store literals how they appear in the antlr grammar
 		assertEquals("'\\uABCD'", storedLiterals[0]);
 	}
 	/** A parser literal written with a braced extended escape is keyed by its grammar spelling. */
 	@Test public void testParserCharLiteralWithExtendedUnicodeEscape() throws Exception {
 		String grammarText =
 				"grammar t;\n"+
 				"a : '\\u{1ABCD}';\n";
 		Grammar g = new Grammar(grammarText);
 		Object[] storedLiterals = g.stringLiteralToTypeMap.keySet().toArray();
 		// must store literals how they appear in the antlr grammar
 		assertEquals("'\\u{1ABCD}'", storedLiterals[0]);
 	}
 	protected void checkSymbols(Grammar g,
 								String rulesStr,
 								String allValidTokensStr)
--- a/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeGrammar.java
+++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeGrammar.java
@ -0,0 +1,131 @@
 /*
 * Copyright (c) 2012-2016 The ANTLR Project. All rights reserved.
 * Use of this file is governed by the BSD 3-clause license that
 * can be found in the LICENSE.txt file in the project root.
 */
 package org.antlr.v4.test.tool;
 import org.antlr.v4.gui.Trees;
 import org.antlr.v4.runtime.CharStreams;
 import org.antlr.v4.runtime.CommonTokenStream;
 import org.antlr.v4.runtime.LexerInterpreter;
 import org.antlr.v4.runtime.tree.ParseTree;
 import org.antlr.v4.tool.Grammar;
 import org.antlr.v4.tool.GrammarParserInterpreter;
 import org.junit.Before;
 import org.junit.Test;
 import static org.junit.Assert.assertEquals;
 /**
  * End-to-end checks for Unicode literals in grammars.  Each test builds a
  * grammar from text, runs the lexer and parser interpreters over an input
  * string, and compares the string form of the resulting parse tree.
  */
 public class TestUnicodeGrammar extends BaseJavaToolTest {
 	/** BMP characters written as 4-hex-digit escapes inside string literals. */
 	@Test
 	public void unicodeBMPLiteralInGrammar() throws Exception {
 		String grammarText =
 			"grammar Unicode;\n" +
 			"r : 'hello' WORLD;\n" +
 			"WORLD : ('world' | '\\u4E16\\u754C' | '\\u1000\\u1019\\u1039\\u1018\\u102C' );\n" +
 			"WS : [ \\t\\r\\n]+ -> skip;\n";
 		assertParsesAsHelloWorld(grammarText, "hello \u4E16\u754C");
 	}
 	// TODO: This test cannot pass unless we change either the grammar
 	// parser to decode surrogate pair literals to code points (which
 	// would break existing clients) or to treat them as an
 	// alternative:
 	//
 	// '\\uD83C\\uDF0D' -> ('\\u{1F30D}' | '\\uD83C\\uDF0D')
 	//
 	// but I worry that might cause parse ambiguity if we're not careful.
 	//@Test
 	public void unicodeSurrogatePairLiteralInGrammar() throws Exception {
 		String grammarText =
 			"grammar Unicode;\n" +
 			"r : 'hello' WORLD;\n" +
 			"WORLD : ('\\uD83C\\uDF0D' | '\\uD83C\\uDF0E' | '\\uD83C\\uDF0F' );\n" +
 			"WS : [ \\t\\r\\n]+ -> skip;\n";
 		assertParsesAsHelloWorld(grammarText, helloPlusCodePoint(0x1F30E));
 	}
 	/** Supplementary-plane characters written as braced escapes, one per alternative. */
 	@Test
 	public void unicodeSMPLiteralInGrammar() throws Exception {
 		String grammarText =
 			"grammar Unicode;\n" +
 			"r : 'hello' WORLD;\n" +
 			"WORLD : ('\\u{1F30D}' | '\\u{1F30E}' | '\\u{1F30F}' );\n" +
 			"WS : [ \\t\\r\\n]+ -> skip;\n";
 		assertParsesAsHelloWorld(grammarText, helloPlusCodePoint(0x1F30E));
 	}
 	/** A character range whose bounds are both braced supplementary-plane escapes. */
 	@Test
 	public void unicodeSMPRangeInGrammar() throws Exception {
 		String grammarText =
 			"grammar Unicode;\n" +
 			"r : 'hello' WORLD;\n" +
 			"WORLD : ('\\u{1F30D}'..'\\u{1F30F}' );\n" +
 			"WS : [ \\t\\r\\n]+ -> skip;\n";
 		assertParsesAsHelloWorld(grammarText, helloPlusCodePoint(0x1F30E));
 	}
 	/** A lone high surrogate in the input is matched by a lone-surrogate literal. */
 	@Test
 	public void matchingDanglingSurrogateInInput() throws Exception {
 		String grammarText =
 			"grammar Unicode;\n" +
 			"r : 'hello' WORLD;\n" +
 			"WORLD : ('\\uD83C' | '\\uD83D' | '\\uD83E' );\n" +
 			"WS : [ \\t\\r\\n]+ -> skip;\n";
 		assertParsesAsHelloWorld(grammarText, "hello \uD83C");
 	}
 	/** Returns the text "hello " followed by the single character for {@code codePoint}. */
 	private static String helloPlusCodePoint(int codePoint) {
 		return new StringBuilder("hello ")
 				.appendCodePoint(codePoint)
 				.toString();
 	}
 	/**
 	 * Parses {@code inputText} with {@code grammarText} starting at rule
 	 * {@code r} and asserts that the tree's string form is
 	 * {@code "(r:1 " + inputText + ")"}.
 	 */
 	private static void assertParsesAsHelloWorld(
 			String grammarText,
 			String inputText) throws Exception {
 		assertEquals(
 				"(r:1 " + inputText + ")",
 				parseTreeForGrammarWithInput(
 						grammarText,
 						"r",
 						inputText));
 	}
 	/**
 	 * Builds a grammar from {@code grammarText}, lexes and parses
 	 * {@code inputText} with the grammar's interpreters beginning at
 	 * {@code rootRule}, and returns the parse tree rendered as a string.
 	 *
 	 * @param grammarText complete combined-grammar source
 	 * @param rootRule name of the parser rule to start from
 	 * @param inputText text to lex and parse
 	 * @return string form of the parse tree, with node text supplied by an
 	 *         {@code InterpreterTreeTextProvider} built from the grammar's rule names
 	 */
 	private static String parseTreeForGrammarWithInput(
 			String grammarText,
 			String rootRule,
 			String inputText) throws Exception {
 		Grammar grammar = new Grammar(grammarText);
 		LexerInterpreter lexEngine = grammar.createLexerInterpreter(
 				CharStreams.createWithString(inputText));
 		CommonTokenStream tokens = new CommonTokenStream(lexEngine);
 		GrammarParserInterpreter parser = grammar.createGrammarParserInterpreter(tokens);
 		ParseTree parseTree = parser.parse(grammar.rules.get(rootRule).index);
 		InterpreterTreeTextProvider nodeTextProvider =
 				new InterpreterTreeTextProvider(grammar.getRuleNames());
 		return Trees.toStringTree(parseTree, nodeTextProvider);
 	}
 }
--- a/tool/resources/org/antlr/v4/tool/templates/codegen/CSharp/CSharp.stg
+++ b/tool/resources/org/antlr/v4/tool/templates/codegen/CSharp/CSharp.stg
@ -288,7 +288,7 @@ public partial class <csIdentifier.(parser.name)> : <superClass; null="Parser">
 	public override string[] RuleNames { get { return ruleNames; } }
-	public override string SerializedAtn { get { return _serializedATN; } }
+	public override string SerializedAtn { get { return new string(_serializedATN); } }
 	static <csIdentifier.(parser.name)>() {
 		decisionToDFA = new DFA[_ATN.NumberOfDecisions];
@ -1023,7 +1023,7 @@ public partial class <csIdentifier.(lexer.name)> : <superClass; null="Lexer"> {
 	public override string[] ModeNames { get { return modeNames; } }
-	public override string SerializedAtn { get { return _serializedATN; } }
+	public override string SerializedAtn { get { return new string(_serializedATN); } }
 	static <csIdentifier.(lexer.name)>() {
 		decisionToDFA = new DFA[_ATN.NumberOfDecisions];
@ -1038,16 +1038,12 @@ public partial class <csIdentifier.(lexer.name)> : <superClass; null="Lexer"> {
 SerializedATN(model) ::= <<
-private static string _serializedATN = _serializeATN();
+private static char[] _serializedATN = {
-private static string _serializeATN()
+	<model.serialized; separator=", ", wrap>,
-{
+};
    StringBuilder sb = new StringBuilder();
    sb.Append("<model.serialized; wrap={");<\n><\t>sb.Append("}>");
    return sb.ToString();
 }
 public static readonly ATN _ATN =
-	new ATNDeserializer().Deserialize(_serializedATN.ToCharArray());
+	new ATNDeserializer().Deserialize(_serializedATN);
 >>
--- a/tool/resources/org/antlr/v4/tool/templates/codegen/Cpp/Cpp.stg
+++ b/tool/resources/org/antlr/v4/tool/templates/codegen/Cpp/Cpp.stg
@ -192,23 +192,23 @@ atn::ATN <lexer.name>::_atn;
 std::vector\<uint16_t> <lexer.name>::_serializedATN;
 std::vector\<std::string> <lexer.name>::_ruleNames = {
-  <lexer.ruleNames: {r | "<r>"}; separator = ", ", wrap, anchor>
+  <lexer.ruleNames: {r | u8"<r>"}; separator = ", ", wrap, anchor>
 };
 std::vector\<std::string> <lexer.name>::_channelNames = {
-  "DEFAULT_TOKEN_CHANNEL", "HIDDEN"<if (lexer.channels)>, <lexer.channels: {c | "<c>"}; separator = ", ", wrap, anchor><endif>
+  "DEFAULT_TOKEN_CHANNEL", "HIDDEN"<if (lexer.channels)>, <lexer.channels: {c | u8"<c>"}; separator = ", ", wrap, anchor><endif>
 };
 std::vector\<std::string> <lexer.name>::_modeNames = {
-  <lexer.modes: {m | "<m>"}; separator = ", ", wrap, anchor>
+  <lexer.modes: {m | u8"<m>"}; separator = ", ", wrap, anchor>
 };
 std::vector\<std::string> <lexer.name>::_literalNames = {
-  <lexer.literalNames: {t | <t>}; null = "\"\"", separator = ", ", wrap, anchor>
+  <lexer.literalNames: {t | u8<t>}; null = "\"\"", separator = ", ", wrap, anchor>
 };
 std::vector\<std::string> <lexer.name>::_symbolicNames = {
-  <lexer.symbolicNames: {t | <t>}; null = "\"\"", separator = ", ", wrap, anchor>
+  <lexer.symbolicNames: {t | u8<t>}; null = "\"\"", separator = ", ", wrap, anchor>
 };
 dfa::Vocabulary <lexer.name>::_vocabulary(_literalNames, _symbolicNames);
--- a/tool/src/org/antlr/v4/codegen/target/CSharpTarget.java
+++ b/tool/src/org/antlr/v4/codegen/target/CSharpTarget.java
@ -46,7 +46,7 @@ public class CSharpTarget extends Target {
 			formatted = String.format("\\x%X", v & 0xFFFF);
 		}
-		return formatted;
+		return "'" + formatted + "'";
 	}
 	@Override
--- a/tool/src/org/antlr/v4/misc/CharSupport.java
+++ b/tool/src/org/antlr/v4/misc/CharSupport.java
@ -45,10 +45,9 @@ public class CharSupport {
 	}
 	/** Return a string representing the escaped char for code c.  E.g., If c
-	 *  has value 0x100, you will get "\u0100".  ASCII gets the usual
+	 *  has value 0x100, you will get "\\u0100".  ASCII gets the usual
-	 *  char (non-hex) representation.  Control characters are spit out
+	 *  char (non-hex) representation.  Non-ASCII characters are spit out
-	 *  as unicode.  While this is specially set up for returning Java strings,
+	 *  as \\uXXXX or \\u{XXXXXX} escapes.
 	 *  The result can be used by any language target that has the same syntax. :)
 	 */
 	public static String getANTLRCharLiteralForChar(int c) {
 		if ( c< Lexer.MIN_CHAR_VALUE ) {
@ -67,11 +66,11 @@ public class CharSupport {
 			}
 			return '\''+Character.toString((char)c)+'\'';
 		}
-		// turn on the bit above max "\uFFFF" value so that we pad with zeros
+		if (c <= 0xFFFF) {
-		// then only take last 4 digits
+			return String.format("\\u%04X", c);
-		String hex = Integer.toHexString(c|0x10000).toUpperCase().substring(1,5);
+		} else {
-		String unicodeStr = "'\\u"+hex+"'";
+			return String.format("\\u{%06X}", c);
-		return unicodeStr;
+		}
 	}
 	/** Given a literal like (the 3 char sequence with single quotes) 'a',
@ -92,11 +91,25 @@ public class CharSupport {
 			if ( literal.charAt(i) == '\\' ) {
 				end = i+2;
 				if ( i+1 < n && literal.charAt(i+1) == 'u' ) {
-					for (end = i + 2; end < i + 6; end++) {
+					if ( i+2 < n && literal.charAt(i+2) == '{' ) { // extended escape sequence
-						if ( end>n ) return null; // invalid escape sequence.
+						end = i + 3;
-						char charAt = literal.charAt(end);
+						while (true) {
-						if (!Character.isDigit(charAt) && !(charAt >= 'a' && charAt <= 'f') && !(charAt >= 'A' && charAt <= 'F')) {
+							if ( end + 1 > n ) return null; // invalid escape sequence.
-							return null; // invalid escape sequence.
+							char charAt = literal.charAt(end++);
 							if (charAt == '}') {
 								break;
 							}
 							if (!Character.isDigit(charAt) && !(charAt >= 'a' && charAt <= 'f') && !(charAt >= 'A' && charAt <= 'F')) {
 								return null; // invalid escape sequence.
 							}
 						}
 					} else {
 						for (end = i + 2; end < i + 6; end++) {
 							if ( end>n ) return null; // invalid escape sequence.
 							char charAt = literal.charAt(end);
 							if (!Character.isDigit(charAt) && !(charAt >= 'a' && charAt <= 'f') && !(charAt >= 'A' && charAt <= 'F')) {
 								return null; // invalid escape sequence.
 							}
 						}
 					}
 				}
@ -107,13 +120,13 @@ public class CharSupport {
 			if ( c==-1 ) {
 				return null; // invalid escape sequence.
 			}
-			else buf.append((char)c);
+			else buf.appendCodePoint(c);
 			i = end;
 		}
 		return buf.toString();
 	}
-	/** Given char x or \t or \u1234 return the char value;
+	/** Given char x or \\t or \\u1234 return the char value;
 	 *  Unnecessary escapes like '\{' yield -1.
 	 */
 	public static int getCharValueFromCharInGrammarLiteral(String cstr) {
@ -130,9 +143,31 @@ public class CharSupport {
 				if ( charVal==0 ) return -1;
 				return charVal;
 			case 6:
-				// '\u1234'
+				// '\\u1234' or '\\u{12}'
 				if ( !cstr.startsWith("\\u") ) return -1;
-				String unicodeChars = cstr.substring(2, cstr.length());
+				int startOff;
 				int endOff;
 				if ( cstr.charAt(2) == '{' ) {
 					startOff = 3;
 					endOff = cstr.indexOf('}');
 				} else {
 					startOff = 2;
 					endOff = cstr.length();
 				}
 				return parseHexValue(cstr, startOff, endOff);
 			default:
 				if ( cstr.startsWith("\\u{") ) {
 					return parseHexValue(cstr, 3, cstr.indexOf('}'));
 				}
 				return -1;
 		}
 	}
 	private static int parseHexValue(String cstr, int startOff, int endOff) {
 		if (startOff < 0 || endOff < 0) {
 			return -1;
 		}
 		String unicodeChars = cstr.substring(startOff, endOff);
 				int result = -1;
 				try {
 					result = Integer.parseInt(unicodeChars, 16);
@ -140,9 +175,6 @@ public class CharSupport {
 				catch (NumberFormatException e) {
 				}
 				return result;
 			default:
 				return -1;
 		}
 	}
 	public static String capitalize(String s) {
--- a/tool/src/org/antlr/v4/parse/ANTLRLexer.g
+++ b/tool/src/org/antlr/v4/parse/ANTLRLexer.g
@ -615,8 +615,8 @@ SRC : 'src' WSCHARS+ file=ACTION_STRING_LITERAL WSCHARS+ line=INT
 //
 // ANTLR makes no distinction between a single character literal and a
 // multi-character string. All literals are single quote delimited and
-// may contain unicode escape sequences of the form \uxxxx, where x
+// may contain unicode escape sequences of the form \uxxxx or \u{xxxxxx},
-// is a valid hexadecimal number (as per Java basically).
+// where x is a valid hexadecimal number.
 STRING_LITERAL
    :  '\'' ( ( ESC_SEQ | ~('\\'|'\''|'\r'|'\n') ) )*
       (    '\''
@ -652,6 +652,10 @@ ESC_SEQ
    	      //
    	      UNICODE_ESC
            | // A Swift/Hack style Unicode escape sequence
              //
              UNICODE_EXTENDED_ESC
    	    | // An illegal escape sequence
    	      //
    	      {
@ -720,6 +724,27 @@ UNICODE_ESC
    	}
    ;
 // Extended Unicode escape body: the characters u and an opening brace, one
 // or more hex digits, then a closing brace.  When the computed digit count
 // exceeds 6, an INVALID_ESCAPE_SEQUENCE grammar error is reported on a token
 // spanning from the current token start to the last character consumed.
 fragment
 UNICODE_EXTENDED_ESC
    :   'u{' // Leadin for unicode extended escape sequence
        HEX_DIGIT+ // One or more hexadecimal digits
        '}' // Leadout for unicode extended escape sequence
        // Now check the digit count and issue an error if we need to
        // NOTE(review): numDigits is derived from the distance to the start of
        // the token being matched minus a fixed 6 characters; presumably this is
        // only exact when the escape sits at a fixed offset from the token
        // start -- confirm against escapes that appear later in a literal.
        {
            int numDigits = getCharIndex()-state.tokenStartCharIndex-6;
            if (numDigits > 6) {
                Token t = new CommonToken(input, state.type, state.channel, state.tokenStartCharIndex, getCharIndex()-1);
                t.setText(t.getText());
                t.setLine(input.getLine());
                t.setCharPositionInLine(input.getCharPositionInLine()-numDigits);
                grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, t);
            }
        }
    ;
 // ----------
 // Whitespace
 //