diff --git a/runtime/Java/src/org/antlr/v4/runtime/misc/Utils.java b/runtime/Java/src/org/antlr/v4/runtime/misc/Utils.java
index bd299cd73..dad10e7c2 100644
--- a/runtime/Java/src/org/antlr/v4/runtime/misc/Utils.java
+++ b/runtime/Java/src/org/antlr/v4/runtime/misc/Utils.java
@@ -53,6 +53,27 @@ public class Utils {
 		return buf.toString();
 	}
 
+	/**
+	 * Concatenates the string form of every element of {@code array}, inserting
+	 * {@code separator} between adjacent elements (never before the first or
+	 * after the last).
+	 *
+	 * @param array the elements to join, appended in index order
+	 * @param separator the text placed between consecutive elements
+	 * @return the joined text; the empty string when {@code array} is empty
+	 */
+	public static <T> String join(T[] array, String separator) {
+		StringBuilder builder = new StringBuilder();
+		for (int i = 0; i < array.length; i++) {
+			builder.append(array[i]);
+			if (i < array.length - 1) {
+				builder.append(separator);
+			}
+		}
+
+		return builder.toString();
+	}
+
 	public static int numNonnull(Object[] data) {
 		int n = 0;
 		if ( data == null ) return n;
diff --git a/tool/resources/org/antlr/v4/tool/templates/codegen/Java/Java.stg b/tool/resources/org/antlr/v4/tool/templates/codegen/Java/Java.stg
index 4dbb6d942..9fc48557d 100644
--- a/tool/resources/org/antlr/v4/tool/templates/codegen/Java/Java.stg
+++ b/tool/resources/org/antlr/v4/tool/templates/codegen/Java/Java.stg
@@ -817,10 +817,23 @@ public class extends {
 }
 >>
 
-
 SerializedATN(model) ::= <<
+<if(rest(model.segments))>
+<! requires segmented representation !>
+private static final int _serializedATNSegments = <length(model.segments)>;
+<model.segments:{segment|private static final String _serializedATNSegment<i0> =
+	"<segment; wrap={"+<\n><\t>"}>";}; separator="\n">
+public static final String _serializedATN = Utils.join(
+	new String[] {
+		<model.segments:{segment | _serializedATNSegment<i0>}; separator=",\n">
+	},
+	""
+);
+<else>
+<! only one segment, can be inlined !>
 public static final String _serializedATN =
 	"<model.serialized; wrap={"+<\n><\t>"}>";
+<endif>
 public static final ATN _ATN =
 	ATNSimulator.deserialize(_serializedATN.toCharArray());
 static {
diff --git a/tool/src/org/antlr/v4/automata/ATNSerializer.java b/tool/src/org/antlr/v4/automata/ATNSerializer.java
index b30a77801..3a2371829 100644
--- a/tool/src/org/antlr/v4/automata/ATNSerializer.java
+++ b/tool/src/org/antlr/v4/automata/ATNSerializer.java
@@ -263,7 +263,10 @@ public class ATNSerializer {
 
 		// don't adjust the first value since that's the version number
 		for (int i = 1; i < data.size(); i++) {
-			assert data.get(i) >= -1 && data.get(i) < 0xFFFF;
+			if (data.get(i) < -1 || data.get(i) > 0xFFFE) {
+				throw new UnsupportedOperationException("Serialized ATN data element out of range.");
+			}
+
 			int value = (data.get(i) + 2) & 0xFFFF;
 			if (value == 0xFFFF) {
 				value = -1;
diff --git a/tool/src/org/antlr/v4/codegen/JavaTarget.java b/tool/src/org/antlr/v4/codegen/JavaTarget.java
index 174487fd9..25203819f 100644
--- a/tool/src/org/antlr/v4/codegen/JavaTarget.java
+++ b/tool/src/org/antlr/v4/codegen/JavaTarget.java
@@ -163,6 +163,13 @@ public class JavaTarget extends Target {
 		return "\\u"+hex;
 	}
 
+	@Override
+	public int getSerializedATNSegmentLimit() {
+		// 65535 is the class file format byte limit for a UTF-8 encoded string literal
+		// 3 is the maximum number of bytes it takes to encode a value in the range 0-0xFFFF
+		return 65535 / 3;
+	}
+
 	@Override
 	protected boolean visibleGrammarSymbolCausesIssueInGeneratedCode(GrammarAST idNode) {
 		return getBadWords().contains(idNode.getText());
diff --git a/tool/src/org/antlr/v4/codegen/Target.java b/tool/src/org/antlr/v4/codegen/Target.java
index 66e5ed34a..ed862f2e8 100644
--- a/tool/src/org/antlr/v4/codegen/Target.java
+++ b/tool/src/org/antlr/v4/codegen/Target.java
@@ -31,6 +31,7 @@
 package org.antlr.v4.codegen;
 
 import org.antlr.v4.codegen.model.RuleFunction;
+import org.antlr.v4.codegen.model.SerializedATN;
 import org.antlr.v4.misc.Utils;
 import org.antlr.v4.parse.ANTLRParser;
 import org.antlr.v4.runtime.Token;
@@ -287,6 +288,18 @@ public abstract class Target {
 		return getTokenTypeAsTargetLabel(getCodeGenerator().g, ttype);
 	}
 
+	/**
+	 * Gets the maximum number of 16-bit unsigned integers that can be encoded
+	 * in a single segment of the serialized ATN.
+	 *
+	 * @see SerializedATN#getSegments
+	 *
+	 * @return the serialized ATN segment limit
+	 */
+	public int getSerializedATNSegmentLimit() {
+		return Integer.MAX_VALUE;
+	}
+
 	public boolean grammarSymbolCausesIssueInGeneratedCode(GrammarAST idNode) {
 		switch (idNode.getParent().getType()) {
 			case ANTLRParser.ASSIGN:
diff --git a/tool/src/org/antlr/v4/codegen/model/SerializedATN.java b/tool/src/org/antlr/v4/codegen/model/SerializedATN.java
index df7fe2320..7345e53c1 100644
--- a/tool/src/org/antlr/v4/codegen/model/SerializedATN.java
+++ b/tool/src/org/antlr/v4/codegen/model/SerializedATN.java
@@ -51,4 +51,22 @@ public class SerializedATN extends OutputModelObject {
 		}
 //		System.out.println(ATNSerializer.getDecoded(factory.getGrammar(), atn));
 	}
+
+	/**
+	 * Splits {@code serialized} into consecutive chunks whose length does not
+	 * exceed the target's {@code getSerializedATNSegmentLimit()}, keeping the
+	 * elements in their original order.
+	 *
+	 * @return one array per chunk; no chunks when {@code serialized} is empty
+	 */
+	public String[][] getSegments() {
+		List<String[]> segments = new ArrayList<String[]>();
+		int segmentLimit = factory.getGenerator().getTarget().getSerializedATNSegmentLimit();
+		for (int i = 0; i < serialized.size(); i += segmentLimit) {
+			List<String> currentSegment = serialized.subList(i, Math.min(i + segmentLimit, serialized.size()));
+			segments.add(currentSegment.toArray(new String[currentSegment.size()]));
+		}
+
+		return segments.toArray(new String[segments.size()][]);
+	}
 }
diff --git a/tool/test/org/antlr/v4/test/TestLexerExec.java b/tool/test/org/antlr/v4/test/TestLexerExec.java
index d7617c4c8..cbba4ec9e 100644
--- a/tool/test/org/antlr/v4/test/TestLexerExec.java
+++ b/tool/test/org/antlr/v4/test/TestLexerExec.java
@@ -693,6 +693,28 @@ public class TestLexerExec extends BaseTest {
 		assertEquals(expecting, found);
 	}
 
+	/**
+	 * This is a regression test for antlr/antlr4#76 "Serialized ATN strings
+	 * should be split when longer than 2^16 bytes (class file limitation)"
+	 * https://github.com/antlr/antlr4/issues/76
+	 */
+	@Test
+	public void testLargeLexer() throws Exception {
+		StringBuilder grammar = new StringBuilder();
+		grammar.append("lexer grammar L;\n");
+		grammar.append("WS : [ \\t\\r\\n]+ -> skip;\n");
+		for (int i = 0; i < 4000; i++) {
+			grammar.append("KW").append(i).append(" : '").append("KW").append(i).append("';\n");
+		}
+
+		String input = "KW400";
+		String found = execLexer("L.g4", grammar.toString(), "L", input);
+		String expecting =
+			"[@0,0:4='KW400',<402>,1:0]\n" +
+			"[@1,5:4='',<-1>,1:5]\n";
+		assertEquals(expecting, found);
+	}
+
 	protected String load(String fileName, @Nullable String encoding)
 		throws IOException
 	{