Emit language-specific Unicode escapes when generating code containing non-ASCII Unicode values

This commit is contained in:
Ben Hamilton 2017-02-10 14:00:33 -08:00
parent 182f3c4647
commit 0049d6d9ae
11 changed files with 235 additions and 245 deletions

View File

@ -0,0 +1,78 @@
/*
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
package org.antlr.v4.test.tool;
import org.antlr.v4.codegen.UnicodeEscapes;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
/**
 * Verifies the escape text that {@link UnicodeEscapes} appends for three
 * sample code points: a Latin letter (0x0061), another BMP value (0xABCD)
 * and a supplementary-plane value (0x1F4A9), in each of the Java, Python
 * and Swift escape styles.
 */
public class TestUnicodeEscapes {
    @Test
    public void latinJavaEscape() {
        assertEquals("\\u0061", javaEscape(0x0061));
    }

    @Test
    public void latinPythonEscape() {
        assertEquals("\\u0061", pythonEscape(0x0061));
    }

    @Test
    public void latinSwiftEscape() {
        assertEquals("\\u{0061}", swiftEscape(0x0061));
    }

    @Test
    public void bmpJavaEscape() {
        assertEquals("\\uABCD", javaEscape(0xABCD));
    }

    @Test
    public void bmpPythonEscape() {
        assertEquals("\\uABCD", pythonEscape(0xABCD));
    }

    @Test
    public void bmpSwiftEscape() {
        assertEquals("\\u{ABCD}", swiftEscape(0xABCD));
    }

    @Test
    public void smpJavaEscape() {
        // The Java style is expected to emit a surrogate pair.
        assertEquals("\\uD83D\\uDCA9", javaEscape(0x1F4A9));
    }

    @Test
    public void smpPythonEscape() {
        assertEquals("\\U0001F4A9", pythonEscape(0x1F4A9));
    }

    @Test
    public void smpSwiftEscape() {
        assertEquals("\\u{1F4A9}", swiftEscape(0x1F4A9));
    }

    /** Returns what the Java-style escaper appends to an empty builder. */
    private static String javaEscape(int codePoint) {
        StringBuilder out = new StringBuilder();
        UnicodeEscapes.appendJavaStyleEscapedCodePoint(codePoint, out);
        return out.toString();
    }

    /** Returns what the Python-style escaper appends to an empty builder. */
    private static String pythonEscape(int codePoint) {
        StringBuilder out = new StringBuilder();
        UnicodeEscapes.appendPythonStyleEscapedCodePoint(codePoint, out);
        return out.toString();
    }

    /** Returns what the Swift-style escaper appends to an empty builder. */
    private static String swiftEscape(int codePoint) {
        StringBuilder out = new StringBuilder();
        UnicodeEscapes.appendSwiftStyleEscapedCodePoint(codePoint, out);
        return out.toString();
    }
}

View File

@ -9,6 +9,7 @@ package org.antlr.v4.codegen;
import org.antlr.v4.Tool;
import org.antlr.v4.codegen.model.RuleFunction;
import org.antlr.v4.codegen.model.SerializedATN;
import org.antlr.v4.misc.CharSupport;
import org.antlr.v4.misc.Utils;
import org.antlr.v4.parse.ANTLRParser;
import org.antlr.v4.runtime.RuntimeMetaData;
@ -146,17 +147,22 @@ public abstract class Target {
if ( quoted ) {
buf.append('"');
}
for (int i=0; i<s.length(); i++) {
int c = s.charAt(i);
for (int i=0; i<s.length(); ) {
int c = s.codePointAt(i);
if ( c!='\'' && // don't escape single quotes in strings for java
c<targetCharValueEscape.length &&
targetCharValueEscape[c]!=null )
{
buf.append(targetCharValueEscape[c]);
}
else {
buf.append((char)c);
else if (shouldUseUnicodeEscapeForCodePointInDoubleQuotedString(c)) {
appendUnicodeEscapedCodePoint(i, buf);
}
else
{
buf.appendCodePoint(c);
}
i += Character.charCount(c);
}
if ( quoted ) {
buf.append('"');
@ -164,6 +170,12 @@ public abstract class Target {
return buf.toString();
}
/**
* Escape the Unicode code point appropriately for this language
* and append the escaped value to {@code sb}.
*
* @param codePoint the Unicode code point to escape
* @param sb the builder that receives the escaped text
*/
abstract protected void appendUnicodeEscapedCodePoint(int codePoint, StringBuilder sb);
/**
* Convenience overload: delegates to
* {@link #getTargetStringLiteralFromString(String, boolean)} with
* {@code true} as the second argument.
*
* @param s the string to convert
* @return whatever the two-argument overload returns for {@code (s, true)}
*/
public String getTargetStringLiteralFromString(String s) {
return getTargetStringLiteralFromString(s, true);
}
@ -194,15 +206,19 @@ public abstract class Target {
if ( addQuotes ) sb.append('"');
for (int i = 1; i < is.length() -1; i++) {
if (is.charAt(i) == '\\') {
for (int i = 1; i < is.length() -1; ) {
int codePoint = is.codePointAt(i);
int toAdvance = Character.charCount(codePoint);
if (codePoint == '\\') {
// Anything escaped is what it is! We assume that
// people know how to escape characters correctly. However
// we catch anything that does not need an escape in Java (which
// is what the default implementation is dealing with and remove
// the escape. The C target does this for instance.
//
switch (is.charAt(i+1)) {
int escapedCodePoint = is.codePointAt(i+toAdvance);
toAdvance++;
switch (escapedCodePoint) {
// Pass through any escapes that Java also needs
//
case '"':
@ -214,32 +230,43 @@ public abstract class Target {
case '\\':
// Pass the escape through
sb.append('\\');
sb.appendCodePoint(escapedCodePoint);
break;
case 'u': // Assume unnnn
// Pass the escape through as double \\
// so that Java leaves as \u0000 string not char
sb.append('\\');
sb.append('\\');
break;
default:
// Remove the escape by virtue of not adding it here
// Thus \' becomes ' and so on
break;
case 'u': // Either unnnn or u{nnnnnn}
if (is.charAt(i+toAdvance) == '{') {
while (is.charAt(i+toAdvance) != '}') {
toAdvance++;
}
// Go past the \ character
i++;
toAdvance++;
} else {
// Characters that don't need \ in ANTLR 'strings' but do in Java
if (is.charAt(i) == '"') {
// We need to escape " in Java
sb.append('\\');
toAdvance += 4;
}
String fullEscape = is.substring(i, i + toAdvance);
appendUnicodeEscapedCodePoint(
CharSupport.getCharValueFromCharInGrammarLiteral(fullEscape),
sb);
break;
default:
if (shouldUseUnicodeEscapeForCodePointInDoubleQuotedString(escapedCodePoint)) {
appendUnicodeEscapedCodePoint(escapedCodePoint, sb);
} else {
sb.appendCodePoint(escapedCodePoint);
}
break;
}
} else {
if (codePoint == 0x22) {
// ANTLR doesn't escape " in literal strings,
// but every other language needs to do so.
sb.append("\\\"");
} else if (shouldUseUnicodeEscapeForCodePointInDoubleQuotedString(codePoint)) {
appendUnicodeEscapedCodePoint(codePoint, sb);
} else {
sb.appendCodePoint(codePoint);
}
}
// Add in the next character, which may have been escaped
sb.append(is.charAt(i));
i += toAdvance;
}
if ( addQuotes ) sb.append('"');
@ -247,6 +274,19 @@ public abstract class Target {
return sb.toString();
}
/**
 * Tells whether {@code codePoint} should be emitted as a Unicode escape:
 * true for the backslash and for everything outside the range
 * 0x20 (inclusive) to 0x7F (exclusive), false otherwise.
 *
 * @param codePoint the code point to classify; must not be 0x0A or 0x22
 * @return {@code true} if the code point should be written as an escape
 */
private static boolean shouldUseUnicodeEscapeForCodePointInDoubleQuotedString(int codePoint) {
    // Callers must never hand us 0x0A (newline) or 0x22 (double-quote):
    // Java treats \\u000A as a literal newline and \\u0022 as a literal
    // double-quote, so a Unicode escape would not help for those two.
    assert codePoint != 0x0A && codePoint != 0x22;

    if (codePoint == 0x5C) {
        // backslash
        return true;
    }
    // Space up to but not including DEL stays as-is, which keeps the
    // generated source 7-bit US-ASCII; everything else gets escaped.
    boolean plainAscii = codePoint >= 0x20 && codePoint < 0x7F;
    return !plainAscii;
}
/** Assume 16-bit char */
public String encodeIntAsCharEscape(int v) {
if (v < Character.MIN_VALUE || v > Character.MAX_VALUE) {

View File

@ -0,0 +1,36 @@
/*
* Copyright (c) 2012-2016 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
package org.antlr.v4.codegen;
/**
 * Static helpers that write a Unicode code point as an escape sequence in
 * the syntax of several target languages, appending the result to a
 * caller-supplied {@link StringBuilder}.
 */
public abstract class UnicodeEscapes {
    /**
     * Appends {@code codePoint} in Java style: {@code "\\uXXXX"} with at
     * least four upper-case hex digits. A supplementary code point is
     * written as two such escapes, one per UTF-16 surrogate.
     *
     * @param codePoint the code point to escape
     * @param sb the builder that receives the escape
     */
    static public void appendJavaStyleEscapedCodePoint(int codePoint, StringBuilder sb) {
        if (!Character.isSupplementaryCodePoint(codePoint)) {
            appendUpperHex(sb.append("\\u"), codePoint, 4);
            return;
        }
        // Outside the BMP: emit the high surrogate, then the low surrogate.
        appendUpperHex(sb.append("\\u"), Character.highSurrogate(codePoint), 4);
        appendUpperHex(sb.append("\\u"), Character.lowSurrogate(codePoint), 4);
    }

    /**
     * Appends {@code codePoint} in Python style: {@code "\\UXXXXXXXX"}
     * (eight hex digits) for a supplementary code point, otherwise
     * {@code "\\uXXXX"} (at least four hex digits).
     *
     * @param codePoint the code point to escape
     * @param sb the builder that receives the escape
     */
    static public void appendPythonStyleEscapedCodePoint(int codePoint, StringBuilder sb) {
        boolean supplementary = Character.isSupplementaryCodePoint(codePoint);
        sb.append(supplementary ? "\\U" : "\\u");
        appendUpperHex(sb, codePoint, supplementary ? 8 : 4);
    }

    /**
     * Appends {@code codePoint} in Swift style: {@code "\\u{XXXX}"} with at
     * least four upper-case hex digits between the braces.
     *
     * @param codePoint the code point to escape
     * @param sb the builder that receives the escape
     */
    static public void appendSwiftStyleEscapedCodePoint(int codePoint, StringBuilder sb) {
        sb.append("\\u{");
        appendUpperHex(sb, codePoint, 4);
        sb.append('}');
    }

    /**
     * Appends {@code value} as upper-case hexadecimal (unsigned
     * interpretation), left-padded with zeros to {@code minDigits} digits.
     */
    private static void appendUpperHex(StringBuilder sb, int value, int minDigits) {
        String digits = Integer.toHexString(value);
        for (int pad = minDigits - digits.length(); pad > 0; pad--) {
            sb.append('0');
        }
        for (int k = 0; k < digits.length(); k++) {
            // Character.toUpperCase(char) does not depend on the default locale.
            sb.append(Character.toUpperCase(digits.charAt(k)));
        }
    }
}

View File

@ -7,6 +7,7 @@ package org.antlr.v4.codegen.target;
import org.antlr.v4.codegen.CodeGenerator;
import org.antlr.v4.codegen.Target;
import org.antlr.v4.codegen.UnicodeEscapes;
import org.antlr.v4.tool.ErrorType;
import org.antlr.v4.tool.ast.GrammarAST;
import org.stringtemplate.v4.NumberRenderer;
@ -36,78 +37,16 @@ public class CSharpTarget extends Target {
throw new IllegalArgumentException(String.format("Cannot encode the specified value: %d", v));
}
String formatted;
if (v >= 0 && v < targetCharValueEscape.length && targetCharValueEscape[v] != null) {
return targetCharValueEscape[v];
}
if (v >= 0x20 && v < 127 && (v < '0' || v > '9') && (v < 'a' || v > 'f') && (v < 'A' || v > 'F')) {
return String.valueOf((char)v);
}
return String.format("\\x%X", v & 0xFFFF);
}
@Override
public String getTargetStringLiteralFromANTLRStringLiteral(
CodeGenerator generator,
String literal, boolean addQuotes)
{
StringBuilder sb = new StringBuilder();
String is = literal;
if ( addQuotes ) sb.append('"');
for (int i = 1; i < is.length() -1; i++) {
if (is.charAt(i) == '\\') {
// Anything escaped is what it is! We assume that
// people know how to escape characters correctly. However
// we catch anything that does not need an escape in Java (which
// is what the default implementation is dealing with and remove
// the escape. The C target does this for instance.
//
switch (is.charAt(i+1)) {
// Pass through any escapes that Java also needs
//
case '"':
case 'n':
case 'r':
case 't':
case 'b':
case 'f':
case '\\':
// Pass the escape through
sb.append('\\');
break;
case 'u': // Assume unnnn
// Pass the escape through as double \\
// so that Java leaves as \u0000 string not char
sb.append('\\');
sb.append('\\');
break;
default:
// Remove the escape by virtue of not adding it here
// Thus \' becomes ' and so on
break;
}
// Go past the \ character
i++;
formatted = targetCharValueEscape[v];
} else if (v >= 0x20 && v < 127 && (v < '0' || v > '9') && (v < 'a' || v > 'f') && (v < 'A' || v > 'F')) {
formatted = Character.toString((char)v);
} else {
// Characters that don't need \ in ANTLR 'strings' but do in Java
if (is.charAt(i) == '"') {
// We need to escape " in Java
sb.append('\\');
}
}
// Add in the next character, which may have been escaped
sb.append(is.charAt(i));
formatted = String.format("\\x%X", v & 0xFFFF);
}
if ( addQuotes ) sb.append('"');
return sb.toString();
return "'" + formatted + "'";
}
@Override
@ -150,4 +89,9 @@ public class CSharpTarget extends Target {
return result;
}
/**
* Appends the escape for {@code codePoint} to {@code sb} by delegating to
* {@link UnicodeEscapes#appendPythonStyleEscapedCodePoint}.
*/
@Override
protected void appendUnicodeEscapedCodePoint(int codePoint, StringBuilder sb) {
// C# and Python share the same escaping style.
UnicodeEscapes.appendPythonStyleEscapedCodePoint(codePoint, sb);
}
}

View File

@ -6,6 +6,7 @@
package org.antlr.v4.codegen.target;
import org.antlr.v4.codegen.UnicodeEscapes;
import org.antlr.v4.codegen.CodeGenerator;
import org.antlr.v4.codegen.Target;
import org.antlr.v4.tool.ErrorType;
@ -68,81 +69,6 @@ public class CppTarget extends Target {
badWords.add("parserRule");
}
/**
* {@inheritDoc}
* <p/>
* For C++, this is the translation {@code 'a\n"'} &rarr; {@code "a\n\""}.
* Expect single quotes around the incoming literal. Just flip the quotes
* and replace double quotes with {@code \"}.
* <p/>
* Note that we have decided to allow people to use '\"' without penalty, so
* we must build the target string in a loop as {@link String#replace}
* cannot handle both {@code \"} and {@code "} without a lot of messing
* around.
*/
@Override
public String getTargetStringLiteralFromANTLRStringLiteral(
CodeGenerator generator,
String literal, boolean addQuotes)
{
StringBuilder sb = new StringBuilder();
String is = literal;
if ( addQuotes ) sb.append('"');
for (int i = 1; i < is.length() -1; i++) {
if (is.charAt(i) == '\\') {
// Anything escaped is what it is! We assume that
// people know how to escape characters correctly. However
// we catch anything that does not need an escape in Java (which
// is what the default implementation is dealing with and remove
// the escape. The C target does this for instance.
//
switch (is.charAt(i+1)) {
// Pass through any escapes that Java also needs
//
case '"':
case 'n':
case 'r':
case 't':
case 'b':
case 'f':
case '\\':
// Pass the escape through
sb.append('\\');
break;
case 'u': // Assume unnnn
// Pass the escape through as double \\
// so that Java leaves as \u0000 string not char
sb.append('\\');
sb.append('\\');
break;
default:
// Remove the escape by virtue of not adding it here
// Thus \' becomes ' and so on
break;
}
// Go past the \ character
i++;
} else {
// Characters that don't need \ in ANTLR 'strings' but do in Java
if (is.charAt(i) == '"') {
// We need to escape " in Java
sb.append('\\');
}
}
// Add in the next character, which may have been escaped
sb.append(is.charAt(i));
}
if ( addQuotes ) sb.append('"');
return sb.toString();
}
@Override
public String encodeIntAsCharEscape(int v) {
return "0x" + Integer.toHexString(v) + ", ";
@ -232,4 +158,10 @@ public class CppTarget extends Target {
return result;
}
/**
* Appends the escape for {@code codePoint} to {@code sb} by delegating to
* {@link UnicodeEscapes#appendPythonStyleEscapedCodePoint}.
*/
@Override
protected void appendUnicodeEscapedCodePoint(int codePoint, StringBuilder sb) {
// C99 and Python share the same escaping style.
UnicodeEscapes.appendPythonStyleEscapedCodePoint(codePoint, sb);
}
}

View File

@ -8,6 +8,7 @@ package org.antlr.v4.codegen.target;
import org.antlr.v4.codegen.CodeGenerator;
import org.antlr.v4.codegen.Target;
import org.antlr.v4.codegen.UnicodeEscapes;
import org.antlr.v4.parse.ANTLRParser;
import org.antlr.v4.tool.Grammar;
import org.antlr.v4.tool.ast.GrammarAST;
@ -214,5 +215,10 @@ public class GoTarget extends Target {
}
}
}
/**
* Appends the escape for {@code codePoint} to {@code sb} by delegating to
* {@link UnicodeEscapes#appendPythonStyleEscapedCodePoint}.
*/
@Override
protected void appendUnicodeEscapedCodePoint(int codePoint, StringBuilder sb) {
// Go and Python share the same escaping style.
UnicodeEscapes.appendPythonStyleEscapedCodePoint(codePoint, sb);
}
}

View File

@ -8,6 +8,7 @@ package org.antlr.v4.codegen.target;
import org.antlr.v4.codegen.CodeGenerator;
import org.antlr.v4.codegen.Target;
import org.antlr.v4.codegen.UnicodeEscapes;
import org.antlr.v4.tool.ast.GrammarAST;
import org.stringtemplate.v4.STGroup;
import org.stringtemplate.v4.StringRenderer;
@ -67,81 +68,6 @@ public class JavaScriptTarget extends Target {
badWords.add("parserRule");
}
/**
* {@inheritDoc}
* <p>
* For Java, this is the translation {@code 'a\n"'} &rarr; {@code "a\n\""}.
* Expect single quotes around the incoming literal. Just flip the quotes
* and replace double quotes with {@code \"}.
* <p>
* Note that we have decided to allow people to use '\"' without penalty, so
* we must build the target string in a loop as {@link String#replace}
* cannot handle both {@code \"} and {@code "} without a lot of messing
* around.
*/
@Override
public String getTargetStringLiteralFromANTLRStringLiteral(
CodeGenerator generator,
String literal, boolean addQuotes)
{
StringBuilder sb = new StringBuilder();
String is = literal;
if ( addQuotes ) sb.append('"');
for (int i = 1; i < is.length() -1; i++) {
if (is.charAt(i) == '\\') {
// Anything escaped is what it is! We assume that
// people know how to escape characters correctly. However
// we catch anything that does not need an escape in Java (which
// is what the default implementation is dealing with and remove
// the escape. The C target does this for instance.
//
switch (is.charAt(i+1)) {
// Pass through any escapes that Java also needs
//
case '"':
case 'n':
case 'r':
case 't':
case 'b':
case 'f':
case '\\':
// Pass the escape through
sb.append('\\');
break;
case 'u': // Assume unnnn
// Pass the escape through as double \\
// so that Java leaves as \u0000 string not char
sb.append('\\');
sb.append('\\');
break;
default:
// Remove the escape by virtue of not adding it here
// Thus \' becomes ' and so on
break;
}
// Go past the \ character
i++;
} else {
// Characters that don't need \ in ANTLR 'strings' but do in Java
if (is.charAt(i) == '"') {
// We need to escape " in Java
sb.append('\\');
}
}
// Add in the next character, which may have been escaped
sb.append(is.charAt(i));
}
if ( addQuotes ) sb.append('"');
return sb.toString();
}
@Override
public String encodeIntAsCharEscape(int v) {
if (v < Character.MIN_VALUE || v > Character.MAX_VALUE) {
@ -210,4 +136,10 @@ public class JavaScriptTarget extends Target {
public boolean supportsOverloadedMethods() {
return false;
}
/**
* Appends the escape for {@code codePoint} to {@code sb} by delegating to
* {@link UnicodeEscapes#appendJavaStyleEscapedCodePoint}, which writes a
* supplementary code point as a pair of surrogate escapes.
*/
@Override
protected void appendUnicodeEscapedCodePoint(int codePoint, StringBuilder sb) {
// JavaScript and Java share the same escaping style.
UnicodeEscapes.appendJavaStyleEscapedCodePoint(codePoint, sb);
}
}

View File

@ -9,6 +9,7 @@ package org.antlr.v4.codegen.target;
import org.antlr.v4.Tool;
import org.antlr.v4.codegen.CodeGenerator;
import org.antlr.v4.codegen.Target;
import org.antlr.v4.codegen.UnicodeEscapes;
import org.antlr.v4.tool.ast.GrammarAST;
import org.stringtemplate.v4.STGroup;
import org.stringtemplate.v4.StringRenderer;
@ -99,4 +100,9 @@ public class JavaTarget extends Target {
}
}
/**
* Appends the escape for {@code codePoint} to {@code sb} by delegating to
* {@link UnicodeEscapes#appendJavaStyleEscapedCodePoint}, which writes a
* supplementary code point as a pair of surrogate escapes.
*/
@Override
protected void appendUnicodeEscapedCodePoint(int codePoint, StringBuilder sb) {
UnicodeEscapes.appendJavaStyleEscapedCodePoint(codePoint, sb);
}
}

View File

@ -8,6 +8,7 @@ package org.antlr.v4.codegen.target;
import org.antlr.v4.codegen.CodeGenerator;
import org.antlr.v4.codegen.Target;
import org.antlr.v4.codegen.UnicodeEscapes;
import org.antlr.v4.tool.ast.GrammarAST;
import org.stringtemplate.v4.STGroup;
import org.stringtemplate.v4.StringRenderer;
@ -109,4 +110,9 @@ public class Python2Target extends Target {
badWords.add("rule");
badWords.add("parserRule");
}
/**
* Appends the escape for {@code codePoint} to {@code sb} by delegating to
* {@link UnicodeEscapes#appendPythonStyleEscapedCodePoint}, which uses the
* eight-digit {@code \U} form for supplementary code points.
*/
@Override
protected void appendUnicodeEscapedCodePoint(int codePoint, StringBuilder sb) {
UnicodeEscapes.appendPythonStyleEscapedCodePoint(codePoint, sb);
}
}

View File

@ -8,6 +8,7 @@ package org.antlr.v4.codegen.target;
import org.antlr.v4.codegen.CodeGenerator;
import org.antlr.v4.codegen.Target;
import org.antlr.v4.codegen.UnicodeEscapes;
import org.antlr.v4.tool.ast.GrammarAST;
import org.stringtemplate.v4.STGroup;
import org.stringtemplate.v4.StringRenderer;
@ -115,5 +116,8 @@ public class Python3Target extends Target {
badWords.add("parserRule");
}
/**
* Appends the escape for {@code codePoint} to {@code sb} by delegating to
* {@link UnicodeEscapes#appendPythonStyleEscapedCodePoint}, which uses the
* eight-digit {@code \U} form for supplementary code points.
*/
@Override
protected void appendUnicodeEscapedCodePoint(int codePoint, StringBuilder sb) {
UnicodeEscapes.appendPythonStyleEscapedCodePoint(codePoint, sb);
}
}

View File

@ -8,6 +8,7 @@ package org.antlr.v4.codegen.target;
import org.antlr.v4.codegen.CodeGenerator;
import org.antlr.v4.codegen.Target;
import org.antlr.v4.codegen.UnicodeEscapes;
import org.antlr.v4.runtime.Token;
import org.antlr.v4.runtime.atn.ATN;
import org.antlr.v4.runtime.atn.ATNDeserializer;
@ -550,4 +551,9 @@ public class SwiftTarget extends Target {
}
}
/**
* Appends the escape for {@code codePoint} to {@code sb} by delegating to
* {@link UnicodeEscapes#appendSwiftStyleEscapedCodePoint}, which wraps the
* hex digits in braces.
*/
@Override
protected void appendUnicodeEscapedCodePoint(int codePoint, StringBuilder sb) {
UnicodeEscapes.appendSwiftStyleEscapedCodePoint(codePoint, sb);
}
}