got unicode and escape sequences into literals for bytecodes

[git-p4: depot-paths = "//depot/code/antlr4/main/": change = 6904]
This commit is contained in:
parrt 2010-06-08 11:49:31 -08:00
parent b9a5cf8691
commit 98bddc4446
5 changed files with 86 additions and 7 deletions

View File

@ -133,15 +133,15 @@ public class Bytecode {
case NONE:
break;
case BYTE:
if ( operandsAreChars ) operands.add("'"+(char)code[ip]+"'");
if ( operandsAreChars ) operands.add(quotedCharLiteral(code[ip]));
else operands.add(String.valueOf(code[ip]));
break;
case CHAR :
if ( operandsAreChars ) operands.add("'"+(char)getShort(code, ip)+"'");
if ( operandsAreChars ) operands.add(quotedCharLiteral(getShort(code, ip)));
else operands.add(String.valueOf(getShort(code, ip)));
break;
case INT :
if ( operandsAreChars ) operands.add("'"+(char)getInt(code, ip)+"'");
if ( operandsAreChars ) operands.add(quotedCharLiteral(getInt(code, ip)));
else operands.add(String.valueOf(getInt(code, ip)));
case SHORT :
case ADDR :
@ -173,5 +173,45 @@ public class Bytecode {
int b1 = memory[index++]&0xFF; // mask off sign-extended bits
int b2 = memory[index++]&0xFF;
return b1<<(8*1) | b2;
}
}
/** Escape text for the characters that have a dedicated backslash escape,
 *  indexed by char code.  Entries for every other code are left null.
 */
public static String LiteralCharValueEscape[] = new String[255];
static {
    LiteralCharValueEscape['\n'] = "\\n";
    LiteralCharValueEscape['\r'] = "\\r";
    LiteralCharValueEscape['\t'] = "\\t";
    LiteralCharValueEscape['\b'] = "\\b";
    LiteralCharValueEscape['\f'] = "\\f";
    LiteralCharValueEscape['\\'] = "\\\\";
    LiteralCharValueEscape['\''] = "\\'";
}

/** Return a single-quoted, escaped literal for char code c.
 *
 *  Characters listed in LiteralCharValueEscape (newline, carriage return,
 *  tab, backspace, form feed, backslash, single quote) use their short
 *  backslash escape.  Any other printable BASIC_LATIN character is emitted
 *  as itself.  Everything else, including the remaining control characters,
 *  is emitted as a backslash-u escape with exactly four upper-case hex
 *  digits; e.g., 0x100 yields '\u0100'.
 *
 *  Only the low 16 bits of c are used, matching a cast to char.  That keeps
 *  a negative value (such as a sign-extended byte operand) from indexing
 *  the escape table out of bounds, and keeps the hex digits in step with
 *  the character that the printable check looks at.
 *
 *  @param c the char code to render; bits above the low 16 are ignored
 *  @return the quoted literal text, never null
 */
public static String quotedCharLiteral(int c) {
    int ch = c & 0xFFFF;
    String shortEscape =
        ch<LiteralCharValueEscape.length ? LiteralCharValueEscape[ch] : null;
    if ( shortEscape!=null ) {
        return '\''+shortEscape+'\'';
    }
    // Backslash and single quote never reach this point: the table above
    // already handled them, so printable ASCII can be emitted verbatim.
    if ( Character.UnicodeBlock.of((char)ch)==Character.UnicodeBlock.BASIC_LATIN &&
         !Character.isISOControl((char)ch) )
    {
        return '\''+Character.toString((char)ch)+'\'';
    }
    // Turn on the bit just above 0xFFFF so the hex string is always five
    // digits long, then drop that leading digit to get a zero-padded value.
    String hex = Integer.toHexString(ch|0x10000).toUpperCase().substring(1,5);
    return "'\\u"+hex+"'";
}
}

View File

@ -92,6 +92,7 @@ public class Target {
* to an appropriate target char literal.
*
* Expect single quotes around the incoming literal.
* TODO: unused and should call CharSupport.getANTLRCharLiteralForChar anyway
*/
public String getTargetCharLiteralCharValue(int c) {
StringBuffer buf = new StringBuffer();

View File

@ -104,14 +104,17 @@ public class CharSupport {
i++;
if ( literal.charAt(i)=='u' ) { // '\u1234'
i++;
String unicodeChars = literal.substring(3,literal.length()-1);
buf.append((char)Integer.parseInt(unicodeChars, 16));
String unicodeChars = literal.substring(i,i+4);
int h = Integer.parseInt(unicodeChars, 16);
buf.append((char)h);
i += 4;
}
else {
char escChar = literal.charAt(i);
int charVal = ANTLRLiteralEscapedCharValue[escChar];
if ( charVal==0 ) buf.append(escChar); // Unnecessary escapes like '\{' should just yield {
else buf.append((char)charVal);
i++;
}
break;
default :
@ -121,6 +124,6 @@ public class CharSupport {
}
}
return buf.toString();
}
}
}

View File

@ -11,6 +11,25 @@ import org.antlr.v4.tool.LexerGrammar;
import org.junit.Test;
public class TestPDABytecodeGeneration extends BaseTest {
/** Checks the bytecode listing generated for a lexer grammar whose
 *  literals are written with backslash-u and backslash-n escapes.
 *  In the expected listing, 0x30 and 0x20 appear as plain quoted
 *  characters, while U+8000 and newline appear in escaped form.
 */
@Test public void unicode() throws Exception {
    final String grammarText =
        "lexer grammar L;\n" +
        "A : '\\u0030'..'\\u8000'+ 'a' ;\n" +
        "B : '\\u0020' | '\\n';";
    final String expectedListing =
        "0000:\tsplit 7, 24\n" +
        "0007:\trange16 '0', '\\u8000'\n" +
        "0012:\tsplit 7, 19\n" +
        "0019:\tmatch8 'a'\n" +
        "0021:\taccept 4\n" +
        "0024:\tsplit 31, 36\n" +
        "0031:\tmatch8 ' '\n" +
        "0033:\tjmp 38\n" +
        "0036:\tmatch8 '\\n'\n" +
        "0038:\taccept 5\n";
    LexerGrammar lexer = new LexerGrammar(grammarText);
    checkBytecode(lexer, expectedListing);
}
@Test public void testString() throws Exception {
LexerGrammar g = new LexerGrammar(
"lexer grammar L;\n"+

View File

@ -19,6 +19,22 @@ public class TestPDABytecodeInterp extends BaseTest {
checkMatches(g, "abab", expecting);
}
/** Matches input against a rule whose only alternative is the space
 *  character written as a backslash-u escape.
 */
@Test public void testUnicode() throws Exception {
    final String grammarText =
        "lexer grammar L;\n"+
        "A : '\\u0020' ;"; // space
    // NOTE(review): the expected output lists two A tokens while the input
    // literal here holds one space -- confirm against the checked-in file.
    final String input = " ";
    final String expectedTokens = "A, A, EOF";
    LexerGrammar lexer = new LexerGrammar(grammarText);
    checkMatches(lexer, input, expectedTokens);
}
/** Matches tab, newline and space input against a single WS rule. */
@Test public void testEscapes() throws Exception {
    // The \t and \n below are Java escapes, so the grammar text handed to
    // LexerGrammar contains a raw tab and a raw newline between the quotes.
    // NOTE(review): if the intent is to exercise ANTLR's own escape
    // decoding, the backslashes would need doubling -- confirm.
    final String grammarText =
        "lexer grammar L;\n"+
        "WS : '\t'|'\n'|' ' ;";
    final String input = " \t\n\n ";
    final String expectedTokens = "WS, WS, WS, WS, WS, EOF";
    LexerGrammar lexer = new LexerGrammar(grammarText);
    checkMatches(lexer, input, expectedTokens);
}
@Test public void testNotChar() throws Exception {
LexerGrammar g = new LexerGrammar(
"lexer grammar L;\n"+