got unicode and escape sequences into literals for bytecodes

[git-p4: depot-paths = "//depot/code/antlr4/main/": change = 6904]
2010-06-08 11:49:31 -08:00 · 2010-06-08 11:49:31 -08:00 · 98bddc4446
parent b9a5cf8691
commit 98bddc4446
5 changed files with 86 additions and 7 deletions
--- a/runtime/Java/src/org/antlr/v4/runtime/pda/Bytecode.java
+++ b/runtime/Java/src/org/antlr/v4/runtime/pda/Bytecode.java
@ -133,15 +133,15 @@ public class Bytecode {
 					case NONE:
 						break;
 					case BYTE:
-						if ( operandsAreChars ) operands.add("'"+(char)code[ip]+"'");
+						if ( operandsAreChars ) operands.add(quotedCharLiteral(code[ip]));
 						else operands.add(String.valueOf(code[ip]));
 						break;
 					case CHAR :
-						if ( operandsAreChars ) operands.add("'"+(char)getShort(code, ip)+"'");
+						if ( operandsAreChars ) operands.add(quotedCharLiteral(getShort(code, ip)));
 						else operands.add(String.valueOf(getShort(code, ip)));
 						break;
 					case INT :
-						if ( operandsAreChars ) operands.add("'"+(char)getInt(code, ip)+"'");
+						if ( operandsAreChars ) operands.add(quotedCharLiteral(getInt(code, ip)));
 						else operands.add(String.valueOf(getInt(code, ip)));
 					case SHORT :
 					case ADDR :
@ -173,5 +173,45 @@ public class Bytecode {
 		int b1 = memory[index++]&0xFF; // mask off sign-extended bits
 		int b2 = memory[index++]&0xFF;
 		return b1<<(8*1) | b2;
-	}	
+	}
+
+	/** Escape text for a char code inside a quoted literal; null when the code has no entry. */
+	public static String[] LiteralCharValueEscape = new String[255];
+	static {
+		LiteralCharValueEscape['\''] = "\\'";
+		LiteralCharValueEscape['\\'] = "\\\\";
+		LiteralCharValueEscape['\b'] = "\\b";
+		LiteralCharValueEscape['\f'] = "\\f";
+		LiteralCharValueEscape['\n'] = "\\n";
+		LiteralCharValueEscape['\r'] = "\\r";
+		LiteralCharValueEscape['\t'] = "\\t";
+	}
+	
+	/** Return a single-quoted literal for char code c.  Codes that have an
+	 *  entry in LiteralCharValueEscape use that escape text.  Other printable
+	 *  ASCII chars are emitted as-is.  Everything else, including control
+	 *  chars without an entry, becomes a 4-digit uppercase unicode escape;
+	 *  e.g., if c has value 0x100, you will get '\u0100'.  Only the low 16
+	 *  bits of c are used, so a negative c (such as a sign-extended byte
+	 *  operand) cannot index outside the escape table.
+	 */
+	public static String quotedCharLiteral(int c) {
+		// the casts below already view c as a 16-bit char; masking up front
+		// keeps the table index and the hex digits consistent with them
+		c &= 0xFFFF;
+		if ( c<LiteralCharValueEscape.length && LiteralCharValueEscape[c]!=null ) {
+			return '\''+LiteralCharValueEscape[c]+'\'';
+		}
+		if ( Character.UnicodeBlock.of((char)c)==Character.UnicodeBlock.BASIC_LATIN &&
+			 !Character.isISOControl((char)c) ) {
+			// backslash and single quote never get here; the table has them
+			return '\''+Character.toString((char)c)+'\'';
+		}
+		// turn on the bit above max "\uFFFF" value so that we pad with zeros
+		// then drop that leading digit to leave exactly 4 hex digits
+		String hex = Integer.toHexString(c|0x10000).toUpperCase().substring(1,5);
+		return "'\\u"+hex+"'";
+	}
+
+	
 }
--- a/tool/src/org/antlr/v4/codegen/Target.java
+++ b/tool/src/org/antlr/v4/codegen/Target.java
@ -92,6 +92,7 @@ public class Target {
 	 *  to an appropriate target char literal.
 	 *
 	 *  Expect single quotes around the incoming literal.
+	 *  TODO: unused and should call CharSupport.getANTLRCharLiteralForChar anyway
 	 */
 	public String getTargetCharLiteralCharValue(int c) {
 		StringBuffer buf = new StringBuffer();
--- a/tool/src/org/antlr/v4/misc/CharSupport.java
+++ b/tool/src/org/antlr/v4/misc/CharSupport.java
@ -104,14 +104,17 @@ public class CharSupport {
 					i++;
 					if ( literal.charAt(i)=='u' ) { // '\u1234'
 						i++;
-						String unicodeChars = literal.substring(3,literal.length()-1);
-						buf.append((char)Integer.parseInt(unicodeChars, 16));
+						String unicodeChars = literal.substring(i,i+4);
+						int h = Integer.parseInt(unicodeChars, 16);
+						buf.append((char)h);
+						i += 4;
 					}
 					else {
 						char escChar = literal.charAt(i);
 						int charVal = ANTLRLiteralEscapedCharValue[escChar];
 						if ( charVal==0 ) buf.append(escChar); // Unnecessary escapes like '\{' should just yield {
 						else buf.append((char)charVal);
+						i++;
 					}
 					break;
 				default :
@ -121,6 +124,6 @@ public class CharSupport {
 			}
 		}
 		return buf.toString();
-	}	
+	}

 }
--- a/tool/test/org/antlr/v4/test/TestPDABytecodeGeneration.java
+++ b/tool/test/org/antlr/v4/test/TestPDABytecodeGeneration.java
@ -11,6 +11,25 @@ import org.antlr.v4.tool.LexerGrammar;
 import org.junit.Test;

 public class TestPDABytecodeGeneration extends BaseTest {
+	@Test public void unicode() throws Exception {
+		String grammarText =
+			"lexer grammar L;\n" +
+			"A : '\\u0030'..'\\u8000'+ 'a' ;\n" +
+			"B : '\\u0020' | '\\n';";
+		String expectedListing =
+			"0000:\tsplit         7, 24\n" +
+			"0007:\trange16       '0', '\\u8000'\n" +
+			"0012:\tsplit         7, 19\n" +
+			"0019:\tmatch8        'a'\n" +
+			"0021:\taccept        4\n" +
+			"0024:\tsplit         31, 36\n" +
+			"0031:\tmatch8        ' '\n" +
+			"0033:\tjmp           38\n" +
+			"0036:\tmatch8        '\\n'\n" +
+			"0038:\taccept        5\n";
+		checkBytecode(new LexerGrammar(grammarText), expectedListing);
+	}
+
 	@Test public void testString() throws Exception {
 		LexerGrammar g = new LexerGrammar(
 			"lexer grammar L;\n"+
--- a/tool/test/org/antlr/v4/test/TestPDABytecodeInterp.java
+++ b/tool/test/org/antlr/v4/test/TestPDABytecodeInterp.java
@ -19,6 +19,22 @@ public class TestPDABytecodeInterp extends BaseTest {
 		checkMatches(g, "abab", expecting);
 	}

+	/** The unicode escape for U+0020 in a rule matches each space of the input. */
+	@Test public void testUnicode() throws Exception {
+		String grammarText = "lexer grammar L;\n"+
+			"A : '\\u0020' ;";
+		LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
+		checkMatches(lexerGrammar, "  ", "A, A, EOF");
+	}
+
+	@Test public void testEscapes() throws Exception {
+		// doubled backslashes put the escape sequences themselves into the
+		// grammar text, rather than a raw tab and newline
+		LexerGrammar g = new LexerGrammar(
+			"lexer grammar L;\n"+"WS : '\\t'|'\\n'|' ' ;");
+		checkMatches(g, " \t\n\n ", "WS, WS, WS, WS, WS, EOF");
+	}
+
 	@Test public void testNotChar() throws Exception {
 		LexerGrammar g = new LexerGrammar(
 			"lexer grammar L;\n"+