forked from jasder/antlr
got unicode and escape sequences into literals for bytecodes
[git-p4: depot-paths = "//depot/code/antlr4/main/": change = 6904]
This commit is contained in:
parent
b9a5cf8691
commit
98bddc4446
|
@ -133,15 +133,15 @@ public class Bytecode {
|
|||
case NONE:
|
||||
break;
|
||||
case BYTE:
|
||||
if ( operandsAreChars ) operands.add("'"+(char)code[ip]+"'");
|
||||
if ( operandsAreChars ) operands.add(quotedCharLiteral(code[ip]));
|
||||
else operands.add(String.valueOf(code[ip]));
|
||||
break;
|
||||
case CHAR :
|
||||
if ( operandsAreChars ) operands.add("'"+(char)getShort(code, ip)+"'");
|
||||
if ( operandsAreChars ) operands.add(quotedCharLiteral(getShort(code, ip)));
|
||||
else operands.add(String.valueOf(getShort(code, ip)));
|
||||
break;
|
||||
case INT :
|
||||
if ( operandsAreChars ) operands.add("'"+(char)getInt(code, ip)+"'");
|
||||
if ( operandsAreChars ) operands.add(quotedCharLiteral(getInt(code, ip)));
|
||||
else operands.add(String.valueOf(getInt(code, ip)));
|
||||
case SHORT :
|
||||
case ADDR :
|
||||
|
@ -173,5 +173,45 @@ public class Bytecode {
|
|||
int b1 = memory[index++]&0xFF; // mask off sign-extended bits
|
||||
int b2 = memory[index++]&0xFF;
|
||||
return b1<<(8*1) | b2;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Backslash escape text indexed by character code. Only the entries
 * assigned in the static initializer below are non-null.
 */
public static String LiteralCharValueEscape[] = new String[255];

static {
    LiteralCharValueEscape['\n'] = "\\n";
    LiteralCharValueEscape['\r'] = "\\r";
    LiteralCharValueEscape['\t'] = "\\t";
    LiteralCharValueEscape['\b'] = "\\b";
    LiteralCharValueEscape['\f'] = "\\f";
    LiteralCharValueEscape['\\'] = "\\\\";
    LiteralCharValueEscape['\''] = "\\'";
}

/** Return a single-quoted literal for char code c. Only the low 16 bits
 *  of c are used, so a sign-extended (negative) operand is treated the
 *  same way a plain (char) cast would treat it.
 *
 *  Characters with an entry in LiteralCharValueEscape use that escape,
 *  e.g. newline gives '\n' and backslash gives '\\'. Any other printable
 *  BASIC_LATIN character is shown as itself, e.g. 'a'. Everything else,
 *  including control characters without a short escape, is shown as four
 *  upper-case hex digits: 0x100 gives '\u0100'.
 *
 *  @param c the character code to render
 *  @return the quoted, escaped literal text
 */
public static String quotedCharLiteral(int c) {
    // Narrow first: a negative c must never be used as a table index, and
    // the \\uXXXX form below can only express 16 bits anyway.
    char ch = (char)c;
    if ( ch<LiteralCharValueEscape.length && LiteralCharValueEscape[ch]!=null ) {
        return "'"+LiteralCharValueEscape[ch]+"'";
    }
    // Backslash and single quote were already handled by the table above,
    // so anything printable that reaches this point needs no escaping.
    if ( Character.UnicodeBlock.of(ch)==Character.UnicodeBlock.BASIC_LATIN &&
         !Character.isISOControl(ch) ) {
        return "'"+ch+"'";
    }
    // Turn on the bit just above 0xFFFF so the hex string is always
    // "1XXXX"; dropping the leading 1 leaves four zero-padded digits.
    String hex = Integer.toHexString(ch|0x10000).toUpperCase().substring(1);
    return "'\\u"+hex+"'";
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -92,6 +92,7 @@ public class Target {
|
|||
* to an appropriate target char literal.
|
||||
*
|
||||
* Expect single quotes around the incoming literal.
|
||||
* TODO: unused and should call CharSupport.getANTLRCharLiteralForChar anyway
|
||||
*/
|
||||
public String getTargetCharLiteralCharValue(int c) {
|
||||
StringBuffer buf = new StringBuffer();
|
||||
|
|
|
@ -104,14 +104,17 @@ public class CharSupport {
|
|||
i++;
|
||||
if ( literal.charAt(i)=='u' ) { // '\u1234'
|
||||
i++;
|
||||
String unicodeChars = literal.substring(3,literal.length()-1);
|
||||
buf.append((char)Integer.parseInt(unicodeChars, 16));
|
||||
String unicodeChars = literal.substring(i,i+4);
|
||||
int h = Integer.parseInt(unicodeChars, 16);
|
||||
buf.append((char)h);
|
||||
i += 4;
|
||||
}
|
||||
else {
|
||||
char escChar = literal.charAt(i);
|
||||
int charVal = ANTLRLiteralEscapedCharValue[escChar];
|
||||
if ( charVal==0 ) buf.append(escChar); // Unnecessary escapes like '\{' should just yield {
|
||||
else buf.append((char)charVal);
|
||||
i++;
|
||||
}
|
||||
break;
|
||||
default :
|
||||
|
@ -121,6 +124,6 @@ public class CharSupport {
|
|||
}
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -11,6 +11,25 @@ import org.antlr.v4.tool.LexerGrammar;
|
|||
import org.junit.Test;
|
||||
|
||||
public class TestPDABytecodeGeneration extends BaseTest {
|
||||
@Test public void unicode() throws Exception {
|
||||
LexerGrammar g = new LexerGrammar(
|
||||
"lexer grammar L;\n" +
|
||||
"A : '\\u0030'..'\\u8000'+ 'a' ;\n" +
|
||||
"B : '\\u0020' | '\\n';");
|
||||
String expecting =
|
||||
"0000:\tsplit 7, 24\n" +
|
||||
"0007:\trange16 '0', '\\u8000'\n" +
|
||||
"0012:\tsplit 7, 19\n" +
|
||||
"0019:\tmatch8 'a'\n" +
|
||||
"0021:\taccept 4\n" +
|
||||
"0024:\tsplit 31, 36\n" +
|
||||
"0031:\tmatch8 ' '\n" +
|
||||
"0033:\tjmp 38\n" +
|
||||
"0036:\tmatch8 '\\n'\n" +
|
||||
"0038:\taccept 5\n";
|
||||
checkBytecode(g, expecting);
|
||||
}
|
||||
|
||||
@Test public void testString() throws Exception {
|
||||
LexerGrammar g = new LexerGrammar(
|
||||
"lexer grammar L;\n"+
|
||||
|
|
|
@ -19,6 +19,22 @@ public class TestPDABytecodeInterp extends BaseTest {
|
|||
checkMatches(g, "abab", expecting);
|
||||
}
|
||||
|
||||
@Test public void testUnicode() throws Exception {
|
||||
LexerGrammar g = new LexerGrammar(
|
||||
"lexer grammar L;\n"+
|
||||
"A : '\\u0020' ;"); // space
|
||||
String expecting = "A, A, EOF";
|
||||
checkMatches(g, " ", expecting);
|
||||
}
|
||||
|
||||
@Test public void testEscapes() throws Exception {
|
||||
LexerGrammar g = new LexerGrammar(
|
||||
"lexer grammar L;\n"+
|
||||
"WS : '\t'|'\n'|' ' ;");
|
||||
String expecting = "WS, WS, WS, WS, WS, EOF";
|
||||
checkMatches(g, " \t\n\n ", expecting);
|
||||
}
|
||||
|
||||
@Test public void testNotChar() throws Exception {
|
||||
LexerGrammar g = new LexerGrammar(
|
||||
"lexer grammar L;\n"+
|
||||
|
|
Loading…
Reference in New Issue