forked from jasder/antlr
got unicode and escape sequences into literals for bytecodes
[git-p4: depot-paths = "//depot/code/antlr4/main/": change = 6904]
This commit is contained in:
parent
b9a5cf8691
commit
98bddc4446
|
@ -133,15 +133,15 @@ public class Bytecode {
|
||||||
case NONE:
|
case NONE:
|
||||||
break;
|
break;
|
||||||
case BYTE:
|
case BYTE:
|
||||||
if ( operandsAreChars ) operands.add("'"+(char)code[ip]+"'");
|
if ( operandsAreChars ) operands.add(quotedCharLiteral(code[ip]));
|
||||||
else operands.add(String.valueOf(code[ip]));
|
else operands.add(String.valueOf(code[ip]));
|
||||||
break;
|
break;
|
||||||
case CHAR :
|
case CHAR :
|
||||||
if ( operandsAreChars ) operands.add("'"+(char)getShort(code, ip)+"'");
|
if ( operandsAreChars ) operands.add(quotedCharLiteral(getShort(code, ip)));
|
||||||
else operands.add(String.valueOf(getShort(code, ip)));
|
else operands.add(String.valueOf(getShort(code, ip)));
|
||||||
break;
|
break;
|
||||||
case INT :
|
case INT :
|
||||||
if ( operandsAreChars ) operands.add("'"+(char)getInt(code, ip)+"'");
|
if ( operandsAreChars ) operands.add(quotedCharLiteral(getInt(code, ip)));
|
||||||
else operands.add(String.valueOf(getInt(code, ip)));
|
else operands.add(String.valueOf(getInt(code, ip)));
|
||||||
case SHORT :
|
case SHORT :
|
||||||
case ADDR :
|
case ADDR :
|
||||||
|
@ -173,5 +173,45 @@ public class Bytecode {
|
||||||
int b1 = memory[index++]&0xFF; // mask off sign-extended bits
|
int b1 = memory[index++]&0xFF; // mask off sign-extended bits
|
||||||
int b2 = memory[index++]&0xFF;
|
int b2 = memory[index++]&0xFF;
|
||||||
return b1<<(8*1) | b2;
|
return b1<<(8*1) | b2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Backslash-escape text for chars that need one inside a quoted char
 *  literal, indexed by char value. Entries are null for chars that have
 *  no special escape.
 */
public static String[] LiteralCharValueEscape = new String[255];
static {
    LiteralCharValueEscape['\n'] = "\\n";
    LiteralCharValueEscape['\r'] = "\\r";
    LiteralCharValueEscape['\t'] = "\\t";
    LiteralCharValueEscape['\b'] = "\\b";
    LiteralCharValueEscape['\f'] = "\\f";
    LiteralCharValueEscape['\\'] = "\\\\";
    LiteralCharValueEscape['\''] = "\\'";
}

/** Return a single-quoted string representing the escaped char for code c.
 *  E.g., if c has value 0x100, you will get "'\u0100'". Chars with an
 *  entry in LiteralCharValueEscape get that escape (newline gives "'\n'").
 *  Printable ASCII gets the usual char (non-hex) representation. Everything
 *  else, including control characters, is spit out as a 4-digit upper-case
 *  unicode escape.
 *
 *  Only the low 16 bits of c are used, exactly as a (char) cast would do.
 *  This keeps negative values (e.g. an operand read from a signed byte)
 *  from being used as an array index.
 *
 *  @param c the char code to render
 *  @return the quoted literal text; never null
 */
public static String quotedCharLiteral(int c) {
    // Reduce to one UTF-16 code unit up front so that every step below
    // (table lookup, printable check, hex formatting) sees the same value.
    int u = c & 0xFFFF;
    if ( u<LiteralCharValueEscape.length && LiteralCharValueEscape[u]!=null ) {
        // covers backslash and single quote as well as \n, \r, \t, \b, \f
        return '\''+LiteralCharValueEscape[u]+'\'';
    }
    char ch = (char)u;
    if ( Character.UnicodeBlock.of(ch)==Character.UnicodeBlock.BASIC_LATIN &&
         !Character.isISOControl(ch) ) {
        return '\''+Character.toString(ch)+'\'';
    }
    // turn on the bit above max "\uFFFF" value so that we pad with zeros
    // then only take last 4 digits
    String hex = Integer.toHexString(u|0x10000).toUpperCase().substring(1,5);
    return "'\\u"+hex+"'";
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -92,6 +92,7 @@ public class Target {
|
||||||
* to an appropriate target char literal.
|
* to an appropriate target char literal.
|
||||||
*
|
*
|
||||||
* Expect single quotes around the incoming literal.
|
* Expect single quotes around the incoming literal.
|
||||||
|
* TODO: unused and should call CharSupport.getANTLRCharLiteralForChar anyway
|
||||||
*/
|
*/
|
||||||
public String getTargetCharLiteralCharValue(int c) {
|
public String getTargetCharLiteralCharValue(int c) {
|
||||||
StringBuffer buf = new StringBuffer();
|
StringBuffer buf = new StringBuffer();
|
||||||
|
|
|
@ -104,14 +104,17 @@ public class CharSupport {
|
||||||
i++;
|
i++;
|
||||||
if ( literal.charAt(i)=='u' ) { // '\u1234'
|
if ( literal.charAt(i)=='u' ) { // '\u1234'
|
||||||
i++;
|
i++;
|
||||||
String unicodeChars = literal.substring(3,literal.length()-1);
|
String unicodeChars = literal.substring(i,i+4);
|
||||||
buf.append((char)Integer.parseInt(unicodeChars, 16));
|
int h = Integer.parseInt(unicodeChars, 16);
|
||||||
|
buf.append((char)h);
|
||||||
|
i += 4;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
char escChar = literal.charAt(i);
|
char escChar = literal.charAt(i);
|
||||||
int charVal = ANTLRLiteralEscapedCharValue[escChar];
|
int charVal = ANTLRLiteralEscapedCharValue[escChar];
|
||||||
if ( charVal==0 ) buf.append(escChar); // Unnecessary escapes like '\{' should just yield {
|
if ( charVal==0 ) buf.append(escChar); // Unnecessary escapes like '\{' should just yield {
|
||||||
else buf.append((char)charVal);
|
else buf.append((char)charVal);
|
||||||
|
i++;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
default :
|
default :
|
||||||
|
@ -121,6 +124,6 @@ public class CharSupport {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return buf.toString();
|
return buf.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,6 +11,25 @@ import org.antlr.v4.tool.LexerGrammar;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
public class TestPDABytecodeGeneration extends BaseTest {
|
public class TestPDABytecodeGeneration extends BaseTest {
|
||||||
|
@Test public void unicode() throws Exception {
|
||||||
|
LexerGrammar g = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n" +
|
||||||
|
"A : '\\u0030'..'\\u8000'+ 'a' ;\n" +
|
||||||
|
"B : '\\u0020' | '\\n';");
|
||||||
|
String expecting =
|
||||||
|
"0000:\tsplit 7, 24\n" +
|
||||||
|
"0007:\trange16 '0', '\\u8000'\n" +
|
||||||
|
"0012:\tsplit 7, 19\n" +
|
||||||
|
"0019:\tmatch8 'a'\n" +
|
||||||
|
"0021:\taccept 4\n" +
|
||||||
|
"0024:\tsplit 31, 36\n" +
|
||||||
|
"0031:\tmatch8 ' '\n" +
|
||||||
|
"0033:\tjmp 38\n" +
|
||||||
|
"0036:\tmatch8 '\\n'\n" +
|
||||||
|
"0038:\taccept 5\n";
|
||||||
|
checkBytecode(g, expecting);
|
||||||
|
}
|
||||||
|
|
||||||
@Test public void testString() throws Exception {
|
@Test public void testString() throws Exception {
|
||||||
LexerGrammar g = new LexerGrammar(
|
LexerGrammar g = new LexerGrammar(
|
||||||
"lexer grammar L;\n"+
|
"lexer grammar L;\n"+
|
||||||
|
|
|
@ -19,6 +19,22 @@ public class TestPDABytecodeInterp extends BaseTest {
|
||||||
checkMatches(g, "abab", expecting);
|
checkMatches(g, "abab", expecting);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test public void testUnicode() throws Exception {
|
||||||
|
LexerGrammar g = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"A : '\\u0020' ;"); // space
|
||||||
|
String expecting = "A, A, EOF";
|
||||||
|
checkMatches(g, " ", expecting);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testEscapes() throws Exception {
|
||||||
|
LexerGrammar g = new LexerGrammar(
|
||||||
|
"lexer grammar L;\n"+
|
||||||
|
"WS : '\t'|'\n'|' ' ;");
|
||||||
|
String expecting = "WS, WS, WS, WS, WS, EOF";
|
||||||
|
checkMatches(g, " \t\n\n ", expecting);
|
||||||
|
}
|
||||||
|
|
||||||
@Test public void testNotChar() throws Exception {
|
@Test public void testNotChar() throws Exception {
|
||||||
LexerGrammar g = new LexerGrammar(
|
LexerGrammar g = new LexerGrammar(
|
||||||
"lexer grammar L;\n"+
|
"lexer grammar L;\n"+
|
||||||
|
|
Loading…
Reference in New Issue