Change UnbufferedCharStream to use 32-bit Unicode code points and 32-bit buffer
This commit is contained in:
parent
1f0db00cd0
commit
8108b342ad
|
@ -25,7 +25,7 @@ public class UnbufferedCharStream implements CharStream {
|
|||
* we keep adding to buffer. Otherwise, {@link #consume consume()} resets so
|
||||
* we start filling at index 0 again.
|
||||
*/
|
||||
protected char[] data;
|
||||
protected int[] data;
|
||||
|
||||
/**
|
||||
* The number of characters currently in {@link #data data}.
|
||||
|
@ -82,7 +82,7 @@ public class UnbufferedCharStream implements CharStream {
|
|||
/** Useful for subclasses that pull char from other than this.input. */
|
||||
public UnbufferedCharStream(int bufferSize) {
    // Start with an empty buffer.
    n = 0;
    // The buffer stores 32-bit code points, so it is allocated as int[];
    // bufferSize is the initial capacity in code points.
    data = new int[bufferSize];
}
|
||||
|
||||
public UnbufferedCharStream(InputStream input) {
|
||||
|
@ -145,13 +145,36 @@ public class UnbufferedCharStream implements CharStream {
|
|||
*/
|
||||
protected int fill(int n) {
|
||||
for (int i=0; i<n; i++) {
|
||||
if (this.n > 0 && data[this.n - 1] == (char)IntStream.EOF) {
|
||||
if (this.n > 0 && data[this.n - 1] == IntStream.EOF) {
|
||||
return i;
|
||||
}
|
||||
|
||||
try {
|
||||
int c = nextChar();
|
||||
add(c);
|
||||
if (c > Character.MAX_VALUE || c == IntStream.EOF) {
|
||||
add(c);
|
||||
} else {
|
||||
char ch = (char) c;
|
||||
if (Character.isLowSurrogate(ch)) {
|
||||
throw new RuntimeException("Invalid UTF-16 (low surrogate with no preceding high surrogate)");
|
||||
} else if (Character.isHighSurrogate(ch)) {
|
||||
int lowSurrogate = nextChar();
|
||||
if (lowSurrogate > Character.MAX_VALUE) {
|
||||
throw new RuntimeException("Invalid UTF-16 (high surrogate followed by code point > U+FFFF");
|
||||
} else if (lowSurrogate == IntStream.EOF) {
|
||||
throw new RuntimeException("Invalid UTF-16 (dangling high surrogate at end of file)");
|
||||
} else {
|
||||
char lowSurrogateChar = (char) lowSurrogate;
|
||||
if (Character.isLowSurrogate(lowSurrogateChar)) {
|
||||
add(Character.toCodePoint(ch, lowSurrogateChar));
|
||||
} else {
|
||||
throw new RuntimeException("Invalid UTF-16 (dangling high surrogate");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
add(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (IOException ioe) {
|
||||
throw new RuntimeException(ioe);
|
||||
|
@ -173,7 +196,7 @@ public class UnbufferedCharStream implements CharStream {
|
|||
if ( n>=data.length ) {
|
||||
data = Arrays.copyOf(data, data.length * 2);
|
||||
}
|
||||
data[n++] = (char)c;
|
||||
data[n++] = c;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -183,8 +206,8 @@ public class UnbufferedCharStream implements CharStream {
|
|||
int index = p + i - 1;
|
||||
if ( index < 0 ) throw new IndexOutOfBoundsException();
|
||||
if ( index >= n ) return IntStream.EOF;
|
||||
char c = data[index];
|
||||
if ( c==(char)IntStream.EOF ) return IntStream.EOF;
|
||||
int c = data[index];
|
||||
if ( c==IntStream.EOF ) return IntStream.EOF;
|
||||
return c;
|
||||
}
|
||||
|
||||
|
|
|
@ -313,6 +313,30 @@ public class TestUnbufferedCharStream extends BaseJavaToolTest {
|
|||
assertEquals(expecting, tokens.getTokens().toString());
|
||||
}
|
||||
|
||||
@Test public void testUnicodeSMP() throws Exception {
|
||||
TestingUnbufferedCharStream input = createStream("\uD83C\uDF0E");
|
||||
assertEquals(0x1F30E, input.LA(1));
|
||||
assertEquals("\uD83C\uDF0E", input.getBuffer());
|
||||
input.consume();
|
||||
assertEquals(IntStream.EOF, input.LA(1));
|
||||
assertEquals("\uFFFF", input.getBuffer());
|
||||
}
|
||||
|
||||
@Test(expected = RuntimeException.class)
|
||||
public void testDanglingHighSurrogateAtEOFThrows() throws Exception {
|
||||
createStream("\uD83C");
|
||||
}
|
||||
|
||||
@Test(expected = RuntimeException.class)
|
||||
public void testDanglingHighSurrogateThrows() throws Exception {
|
||||
createStream("\uD83C\u0123");
|
||||
}
|
||||
|
||||
@Test(expected = RuntimeException.class)
|
||||
public void testDanglingLowSurrogateThrows() throws Exception {
|
||||
createStream("\uDF0E");
|
||||
}
|
||||
|
||||
protected static TestingUnbufferedCharStream createStream(String text) {
|
||||
return new TestingUnbufferedCharStream(new StringReader(text));
|
||||
}
|
||||
|
@ -336,7 +360,13 @@ public class TestUnbufferedCharStream extends BaseJavaToolTest {
|
|||
*/
|
||||
public String getRemainingBuffer() {
|
||||
if ( n==0 ) return "";
|
||||
return new String(data,p,n-p);
|
||||
int len = n;
|
||||
if (data[len-1] == IntStream.EOF) {
|
||||
// Don't pass -1 to new String().
|
||||
return new String(data,p,len-p-1) + "\uFFFF";
|
||||
} else {
|
||||
return new String(data,p,len-p);
|
||||
}
|
||||
}
|
||||
|
||||
/** For testing. What's in moving window buffer into data stream.
|
||||
|
@ -344,7 +374,14 @@ public class TestUnbufferedCharStream extends BaseJavaToolTest {
|
|||
*/
|
||||
public String getBuffer() {
|
||||
if ( n==0 ) return "";
|
||||
return new String(data,0,n);
|
||||
int len = n;
|
||||
// Don't pass -1 to new String().
|
||||
if (data[len-1] == IntStream.EOF) {
|
||||
// Don't pass -1 to new String().
|
||||
return new String(data,0,len-1) + "\uFFFF";
|
||||
} else {
|
||||
return new String(data,0,len);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue