diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeEscapes.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeEscapes.java new file mode 100644 index 000000000..279246256 --- /dev/null +++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeEscapes.java @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved. + * Use of this file is governed by the BSD 3-clause license that + * can be found in the LICENSE.txt file in the project root. + */ + +package org.antlr.v4.test.tool; + +import org.antlr.v4.codegen.UnicodeEscapes; + +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +public class TestUnicodeEscapes { + @Test + public void latinJavaEscape() { + StringBuilder sb = new StringBuilder(); + UnicodeEscapes.appendJavaStyleEscapedCodePoint(0x0061, sb); + assertEquals("\\u0061", sb.toString()); + } + + @Test + public void latinPythonEscape() { + StringBuilder sb = new StringBuilder(); + UnicodeEscapes.appendPythonStyleEscapedCodePoint(0x0061, sb); + assertEquals("\\u0061", sb.toString()); + } + + @Test + public void latinSwiftEscape() { + StringBuilder sb = new StringBuilder(); + UnicodeEscapes.appendSwiftStyleEscapedCodePoint(0x0061, sb); + assertEquals("\\u{0061}", sb.toString()); + } + + @Test + public void bmpJavaEscape() { + StringBuilder sb = new StringBuilder(); + UnicodeEscapes.appendJavaStyleEscapedCodePoint(0xABCD, sb); + assertEquals("\\uABCD", sb.toString()); + } + + @Test + public void bmpPythonEscape() { + StringBuilder sb = new StringBuilder(); + UnicodeEscapes.appendPythonStyleEscapedCodePoint(0xABCD, sb); + assertEquals("\\uABCD", sb.toString()); + } + + @Test + public void bmpSwiftEscape() { + StringBuilder sb = new StringBuilder(); + UnicodeEscapes.appendSwiftStyleEscapedCodePoint(0xABCD, sb); + assertEquals("\\u{ABCD}", sb.toString()); + } + + @Test + public void smpJavaEscape() { + StringBuilder sb = new StringBuilder(); + UnicodeEscapes.appendJavaStyleEscapedCodePoint(0x1F4A9, sb); + assertEquals("\\uD83D\\uDCA9", sb.toString()); + } + + @Test + public void smpPythonEscape() { + StringBuilder sb = new StringBuilder(); + UnicodeEscapes.appendPythonStyleEscapedCodePoint(0x1F4A9, sb); + assertEquals("\\U0001F4A9", sb.toString()); + } + + @Test + public void smpSwiftEscape() { + StringBuilder sb = new StringBuilder(); + UnicodeEscapes.appendSwiftStyleEscapedCodePoint(0x1F4A9, sb); + assertEquals("\\u{1F4A9}", sb.toString()); + } +} diff --git a/tool/src/org/antlr/v4/codegen/Target.java b/tool/src/org/antlr/v4/codegen/Target.java index 9a2fc5de3..8555f34ef 100644 --- a/tool/src/org/antlr/v4/codegen/Target.java +++ b/tool/src/org/antlr/v4/codegen/Target.java @@ -9,6 +9,7 @@ package org.antlr.v4.codegen; import org.antlr.v4.Tool; import org.antlr.v4.codegen.model.RuleFunction; import org.antlr.v4.codegen.model.SerializedATN; +import org.antlr.v4.misc.CharSupport; import org.antlr.v4.misc.Utils; import org.antlr.v4.parse.ANTLRParser; import org.antlr.v4.runtime.RuntimeMetaData; @@ -146,17 +147,22 @@ public abstract class Target { if ( quoted ) { buf.append('"'); } - for (int i=0; i= 0x7F; // DEL and beyond (keeps source code 7-bit US-ASCII) + } + /** Assume 16-bit char */ public String encodeIntAsCharEscape(int v) { if (v < Character.MIN_VALUE || v > Character.MAX_VALUE) { diff --git a/tool/src/org/antlr/v4/codegen/UnicodeEscapes.java b/tool/src/org/antlr/v4/codegen/UnicodeEscapes.java new file mode 100644 index 000000000..565ee2750 --- /dev/null +++ b/tool/src/org/antlr/v4/codegen/UnicodeEscapes.java @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2012-2016 The ANTLR Project. All rights reserved. + * Use of this file is governed by the BSD 3-clause license that + * can be found in the LICENSE.txt file in the project root. + */ + +package org.antlr.v4.codegen; + +/** + * Utility class to escape Unicode code points using various + * languages' syntaxes. + */ +public abstract class UnicodeEscapes { + static public void appendJavaStyleEscapedCodePoint(int codePoint, StringBuilder sb) { + if (Character.isSupplementaryCodePoint(codePoint)) { + // char is not an 'integral' type, so we have to explicitly convert + // to int before passing to the %X formatter or else it throws. + sb.append(String.format("\\u%04X", (int)Character.highSurrogate(codePoint))); + sb.append(String.format("\\u%04X", (int)Character.lowSurrogate(codePoint))); + } else { + sb.append(String.format("\\u%04X", codePoint)); + } + } + + static public void appendPythonStyleEscapedCodePoint(int codePoint, StringBuilder sb) { + if (Character.isSupplementaryCodePoint(codePoint)) { + sb.append(String.format("\\U%08X", codePoint)); + } else { + sb.append(String.format("\\u%04X", codePoint)); + } + } + + static public void appendSwiftStyleEscapedCodePoint(int codePoint, StringBuilder sb) { + sb.append(String.format("\\u{%04X}", codePoint)); + } +} diff --git a/tool/src/org/antlr/v4/codegen/target/CSharpTarget.java b/tool/src/org/antlr/v4/codegen/target/CSharpTarget.java index c14c88723..3ee0b89ef 100644 --- a/tool/src/org/antlr/v4/codegen/target/CSharpTarget.java +++ b/tool/src/org/antlr/v4/codegen/target/CSharpTarget.java @@ -7,6 +7,7 @@ package org.antlr.v4.codegen.target; import org.antlr.v4.codegen.CodeGenerator; import org.antlr.v4.codegen.Target; +import org.antlr.v4.codegen.UnicodeEscapes; import org.antlr.v4.tool.ErrorType; import org.antlr.v4.tool.ast.GrammarAST; import org.stringtemplate.v4.NumberRenderer; @@ -36,78 +37,16 @@ public class CSharpTarget extends Target { throw new IllegalArgumentException(String.format("Cannot encode the specified value: %d", v)); } + String formatted; if (v >= 0 && v < targetCharValueEscape.length && targetCharValueEscape[v] != null) { - return targetCharValueEscape[v]; + formatted = targetCharValueEscape[v]; + } else if (v >= 0x20 && v < 127 && (v < '0' || v > '9') && (v < 'a' || v > 'f') && (v < 'A' || v > 'F')) { + formatted = Character.toString((char)v); + } else { + formatted = String.format("\\x%X", v & 0xFFFF); } - if (v >= 0x20 && v < 127 && (v < '0' || v > '9') && (v < 'a' || v > 'f') && (v < 'A' || v > 'F')) { - return String.valueOf((char)v); - } - - return String.format("\\x%X", v & 0xFFFF); - } - - @Override - public String getTargetStringLiteralFromANTLRStringLiteral( - CodeGenerator generator, - String literal, boolean addQuotes) - { - StringBuilder sb = new StringBuilder(); - String is = literal; - - if ( addQuotes ) sb.append('"'); - - for (int i = 1; i < is.length() -1; i++) { - if (is.charAt(i) == '\\') { - // Anything escaped is what it is! We assume that - // people know how to escape characters correctly. However - // we catch anything that does not need an escape in Java (which - // is what the default implementation is dealing with and remove - // the escape. The C target does this for instance. - // - switch (is.charAt(i+1)) { - // Pass through any escapes that Java also needs - // - case '"': - case 'n': - case 'r': - case 't': - case 'b': - case 'f': - case '\\': - // Pass the escape through - sb.append('\\'); - break; - - case 'u': // Assume unnnn - // Pass the escape through as double \\ - // so that Java leaves as \u0000 string not char - sb.append('\\'); - sb.append('\\'); - break; - - default: - // Remove the escape by virtue of not adding it here - // Thus \' becomes ' and so on - break; - } - - // Go past the \ character - i++; - } else { - // Characters that don't need \ in ANTLR 'strings' but do in Java - if (is.charAt(i) == '"') { - // We need to escape " in Java - sb.append('\\'); - } - } - // Add in the next character, which may have been escaped - sb.append(is.charAt(i)); - } - - if ( addQuotes ) sb.append('"'); - - return sb.toString(); + return "'" + formatted + "'"; } @Override @@ -150,4 +89,9 @@ public class CSharpTarget extends Target { return result; } + @Override + protected void appendUnicodeEscapedCodePoint(int codePoint, StringBuilder sb) { + // C# and Python share the same escaping style. + UnicodeEscapes.appendPythonStyleEscapedCodePoint(codePoint, sb); + } } diff --git a/tool/src/org/antlr/v4/codegen/target/CppTarget.java b/tool/src/org/antlr/v4/codegen/target/CppTarget.java index 2b274f6c1..162bf67db 100644 --- a/tool/src/org/antlr/v4/codegen/target/CppTarget.java +++ b/tool/src/org/antlr/v4/codegen/target/CppTarget.java @@ -6,6 +6,7 @@ package org.antlr.v4.codegen.target; +import org.antlr.v4.codegen.UnicodeEscapes; import org.antlr.v4.codegen.CodeGenerator; import org.antlr.v4.codegen.Target; import org.antlr.v4.tool.ErrorType; @@ -68,81 +69,6 @@ public class CppTarget extends Target { badWords.add("parserRule"); } - /** - * {@inheritDoc} - *

- * For C++, this is the translation {@code 'a\n"'} → {@code "a\n\""}. - * Expect single quotes around the incoming literal. Just flip the quotes - * and replace double quotes with {@code \"}. - *

- * Note that we have decided to allow people to use '\"' without penalty, so - * we must build the target string in a loop as {@link String#replace} - * cannot handle both {@code \"} and {@code "} without a lot of messing - * around. - */ - @Override - public String getTargetStringLiteralFromANTLRStringLiteral( - CodeGenerator generator, - String literal, boolean addQuotes) - { - StringBuilder sb = new StringBuilder(); - String is = literal; - - if ( addQuotes ) sb.append('"'); - - for (int i = 1; i < is.length() -1; i++) { - if (is.charAt(i) == '\\') { - // Anything escaped is what it is! We assume that - // people know how to escape characters correctly. However - // we catch anything that does not need an escape in Java (which - // is what the default implementation is dealing with and remove - // the escape. The C target does this for instance. - // - switch (is.charAt(i+1)) { - // Pass through any escapes that Java also needs - // - case '"': - case 'n': - case 'r': - case 't': - case 'b': - case 'f': - case '\\': - // Pass the escape through - sb.append('\\'); - break; - - case 'u': // Assume unnnn - // Pass the escape through as double \\ - // so that Java leaves as \u0000 string not char - sb.append('\\'); - sb.append('\\'); - break; - - default: - // Remove the escape by virtue of not adding it here - // Thus \' becomes ' and so on - break; - } - - // Go past the \ character - i++; - } else { - // Characters that don't need \ in ANTLR 'strings' but do in Java - if (is.charAt(i) == '"') { - // We need to escape " in Java - sb.append('\\'); - } - } - // Add in the next character, which may have been escaped - sb.append(is.charAt(i)); - } - - if ( addQuotes ) sb.append('"'); - - return sb.toString(); - } - @Override public String encodeIntAsCharEscape(int v) { return "0x" + Integer.toHexString(v) + ", "; @@ -232,4 +158,10 @@ public class CppTarget extends Target { return result; } + + @Override + protected void appendUnicodeEscapedCodePoint(int codePoint, StringBuilder sb) { + // C99 and Python share the same escaping style. + UnicodeEscapes.appendPythonStyleEscapedCodePoint(codePoint, sb); + } } diff --git a/tool/src/org/antlr/v4/codegen/target/GoTarget.java b/tool/src/org/antlr/v4/codegen/target/GoTarget.java index 1f4c373f8..491092dd5 100644 --- a/tool/src/org/antlr/v4/codegen/target/GoTarget.java +++ b/tool/src/org/antlr/v4/codegen/target/GoTarget.java @@ -8,6 +8,7 @@ package org.antlr.v4.codegen.target; import org.antlr.v4.codegen.CodeGenerator; import org.antlr.v4.codegen.Target; +import org.antlr.v4.codegen.UnicodeEscapes; import org.antlr.v4.parse.ANTLRParser; import org.antlr.v4.tool.Grammar; import org.antlr.v4.tool.ast.GrammarAST; @@ -214,5 +215,10 @@ public class GoTarget extends Target { } } -} + @Override + protected void appendUnicodeEscapedCodePoint(int codePoint, StringBuilder sb) { + // Go and Python share the same escaping style. + UnicodeEscapes.appendPythonStyleEscapedCodePoint(codePoint, sb); + } +} diff --git a/tool/src/org/antlr/v4/codegen/target/JavaScriptTarget.java b/tool/src/org/antlr/v4/codegen/target/JavaScriptTarget.java index 3830b2b9e..5d218742e 100644 --- a/tool/src/org/antlr/v4/codegen/target/JavaScriptTarget.java +++ b/tool/src/org/antlr/v4/codegen/target/JavaScriptTarget.java @@ -8,6 +8,7 @@ package org.antlr.v4.codegen.target; import org.antlr.v4.codegen.CodeGenerator; import org.antlr.v4.codegen.Target; +import org.antlr.v4.codegen.UnicodeEscapes; import org.antlr.v4.tool.ast.GrammarAST; import org.stringtemplate.v4.STGroup; import org.stringtemplate.v4.StringRenderer; @@ -67,81 +68,6 @@ public class JavaScriptTarget extends Target { badWords.add("parserRule"); } - /** - * {@inheritDoc} - *

- * For Java, this is the translation {@code 'a\n"'} → {@code "a\n\""}. - * Expect single quotes around the incoming literal. Just flip the quotes - * and replace double quotes with {@code \"}. - *

- * Note that we have decided to allow people to use '\"' without penalty, so - * we must build the target string in a loop as {@link String#replace} - * cannot handle both {@code \"} and {@code "} without a lot of messing - * around. - */ - @Override - public String getTargetStringLiteralFromANTLRStringLiteral( - CodeGenerator generator, - String literal, boolean addQuotes) - { - StringBuilder sb = new StringBuilder(); - String is = literal; - - if ( addQuotes ) sb.append('"'); - - for (int i = 1; i < is.length() -1; i++) { - if (is.charAt(i) == '\\') { - // Anything escaped is what it is! We assume that - // people know how to escape characters correctly. However - // we catch anything that does not need an escape in Java (which - // is what the default implementation is dealing with and remove - // the escape. The C target does this for instance. - // - switch (is.charAt(i+1)) { - // Pass through any escapes that Java also needs - // - case '"': - case 'n': - case 'r': - case 't': - case 'b': - case 'f': - case '\\': - // Pass the escape through - sb.append('\\'); - break; - - case 'u': // Assume unnnn - // Pass the escape through as double \\ - // so that Java leaves as \u0000 string not char - sb.append('\\'); - sb.append('\\'); - break; - - default: - // Remove the escape by virtue of not adding it here - // Thus \' becomes ' and so on - break; - } - - // Go past the \ character - i++; - } else { - // Characters that don't need \ in ANTLR 'strings' but do in Java - if (is.charAt(i) == '"') { - // We need to escape " in Java - sb.append('\\'); - } - } - // Add in the next character, which may have been escaped - sb.append(is.charAt(i)); - } - - if ( addQuotes ) sb.append('"'); - - return sb.toString(); - } - @Override public String encodeIntAsCharEscape(int v) { if (v < Character.MIN_VALUE || v > Character.MAX_VALUE) { @@ -210,4 +136,10 @@ public class JavaScriptTarget extends Target { public boolean supportsOverloadedMethods() { return false; } + + @Override + protected void appendUnicodeEscapedCodePoint(int codePoint, StringBuilder sb) { + // JavaScript and Java share the same escaping style. + UnicodeEscapes.appendJavaStyleEscapedCodePoint(codePoint, sb); + } } diff --git a/tool/src/org/antlr/v4/codegen/target/JavaTarget.java b/tool/src/org/antlr/v4/codegen/target/JavaTarget.java index a7cb9003f..8dda59034 100644 --- a/tool/src/org/antlr/v4/codegen/target/JavaTarget.java +++ b/tool/src/org/antlr/v4/codegen/target/JavaTarget.java @@ -9,6 +9,7 @@ package org.antlr.v4.codegen.target; import org.antlr.v4.Tool; import org.antlr.v4.codegen.CodeGenerator; import org.antlr.v4.codegen.Target; +import org.antlr.v4.codegen.UnicodeEscapes; import org.antlr.v4.tool.ast.GrammarAST; import org.stringtemplate.v4.STGroup; import org.stringtemplate.v4.StringRenderer; @@ -99,4 +100,9 @@ public class JavaTarget extends Target { } } + + @Override + protected void appendUnicodeEscapedCodePoint(int codePoint, StringBuilder sb) { + UnicodeEscapes.appendJavaStyleEscapedCodePoint(codePoint, sb); + } } diff --git a/tool/src/org/antlr/v4/codegen/target/Python2Target.java b/tool/src/org/antlr/v4/codegen/target/Python2Target.java index cb203ac5a..6f53af6b3 100644 --- a/tool/src/org/antlr/v4/codegen/target/Python2Target.java +++ b/tool/src/org/antlr/v4/codegen/target/Python2Target.java @@ -8,6 +8,7 @@ package org.antlr.v4.codegen.target; import org.antlr.v4.codegen.CodeGenerator; import org.antlr.v4.codegen.Target; +import org.antlr.v4.codegen.UnicodeEscapes; import org.antlr.v4.tool.ast.GrammarAST; import org.stringtemplate.v4.STGroup; import org.stringtemplate.v4.StringRenderer; @@ -109,4 +110,9 @@ public class Python2Target extends Target { badWords.add("rule"); badWords.add("parserRule"); } + + @Override + protected void appendUnicodeEscapedCodePoint(int codePoint, StringBuilder sb) { + UnicodeEscapes.appendPythonStyleEscapedCodePoint(codePoint, sb); + } } diff --git a/tool/src/org/antlr/v4/codegen/target/Python3Target.java b/tool/src/org/antlr/v4/codegen/target/Python3Target.java index 598b2d3b9..85ab6c958 100644 --- a/tool/src/org/antlr/v4/codegen/target/Python3Target.java +++ b/tool/src/org/antlr/v4/codegen/target/Python3Target.java @@ -8,6 +8,7 @@ package org.antlr.v4.codegen.target; import org.antlr.v4.codegen.CodeGenerator; import org.antlr.v4.codegen.Target; +import org.antlr.v4.codegen.UnicodeEscapes; import org.antlr.v4.tool.ast.GrammarAST; import org.stringtemplate.v4.STGroup; import org.stringtemplate.v4.StringRenderer; @@ -115,5 +116,8 @@ public class Python3Target extends Target { badWords.add("parserRule"); } - + @Override + protected void appendUnicodeEscapedCodePoint(int codePoint, StringBuilder sb) { + UnicodeEscapes.appendPythonStyleEscapedCodePoint(codePoint, sb); + } } diff --git a/tool/src/org/antlr/v4/codegen/target/SwiftTarget.java b/tool/src/org/antlr/v4/codegen/target/SwiftTarget.java index b96d910ec..f38450fb3 100644 --- a/tool/src/org/antlr/v4/codegen/target/SwiftTarget.java +++ b/tool/src/org/antlr/v4/codegen/target/SwiftTarget.java @@ -8,6 +8,7 @@ package org.antlr.v4.codegen.target; import org.antlr.v4.codegen.CodeGenerator; import org.antlr.v4.codegen.Target; +import org.antlr.v4.codegen.UnicodeEscapes; import org.antlr.v4.runtime.Token; import org.antlr.v4.runtime.atn.ATN; import org.antlr.v4.runtime.atn.ATNDeserializer; @@ -550,4 +551,9 @@ public class SwiftTarget extends Target { } } + + @Override + protected void appendUnicodeEscapedCodePoint(int codePoint, StringBuilder sb) { + UnicodeEscapes.appendSwiftStyleEscapedCodePoint(codePoint, sb); + } }