diff --git a/tool/resources/org/antlr/v4/tool/templates/codegen/Java/Java.stg b/tool/resources/org/antlr/v4/tool/templates/codegen/Java/Java.stg index 258557724..93ce70477 100644 --- a/tool/resources/org/antlr/v4/tool/templates/codegen/Java/Java.stg +++ b/tool/resources/org/antlr/v4/tool/templates/codegen/Java/Java.stg @@ -1,40 +1,45 @@ // args must be , -parserFile(f, parser) ::= << +ParserFile(f, parser, dfaDefs, bitSetDefs) ::= << // $ANTLR ANTLRVersion> generatedTimestamp> import org.antlr.runtime.*; >> -parser(p,funcs,dfaDefs) ::= << +Parser(p, funcs) ::= << public class { - - + + + } >> -DFA(dfa) ::= << +DFADef(dfa) ::= << // define >> -parserFunction(f,code) ::= << +BitSetDef(b) ::= << +// define +>> + +RuleFunction(f,code) ::= << }>void () { } >> -codeBlock(c, ops) ::= << - +CodeBlock(c, ops) ::= << + >> -switch(c, alts) ::= << +LL1Choice(c, alts) ::= << switch ( input.LA(1) ) { - + } >> -matchToken(m) ::= << -match(); +MatchToken(m) ::= << +match(, ); >> codeFileExtension() ::= ".java" diff --git a/tool/src/org/antlr/v4/automata/LexerNFAFactory.java b/tool/src/org/antlr/v4/automata/LexerNFAFactory.java index 6c1e5fd42..137e59027 100644 --- a/tool/src/org/antlr/v4/automata/LexerNFAFactory.java +++ b/tool/src/org/antlr/v4/automata/LexerNFAFactory.java @@ -1,6 +1,6 @@ package org.antlr.v4.automata; -import org.antlr.v4.codegen.Target; +import org.antlr.v4.misc.CharSupport; import org.antlr.v4.tool.GrammarAST; import org.antlr.v4.tool.LexerGrammar; import org.antlr.v4.tool.Rule; @@ -45,8 +45,8 @@ public class LexerNFAFactory extends ParserNFAFactory { public Handle range(GrammarAST a, GrammarAST b) { BasicState left = newState(a); BasicState right = newState(b); - int t1 = Target.getCharValueFromGrammarCharLiteral(a.getText()); - int t2 = Target.getCharValueFromGrammarCharLiteral(b.getText()); + int t1 = CharSupport.getCharValueFromGrammarCharLiteral(a.getText()); + int t2 = CharSupport.getCharValueFromGrammarCharLiteral(b.getText()); left.transition = new RangeTransition(t1, t2, right); a.nfaState = left; b.nfaState = left; diff --git a/tool/src/org/antlr/v4/automata/ParserNFAFactory.java b/tool/src/org/antlr/v4/automata/ParserNFAFactory.java index bcf9922c5..7be7372bc 100644 --- a/tool/src/org/antlr/v4/automata/ParserNFAFactory.java +++ b/tool/src/org/antlr/v4/automata/ParserNFAFactory.java @@ -3,7 +3,7 @@ package org.antlr.v4.automata; import org.antlr.runtime.RecognitionException; import org.antlr.runtime.tree.CommonTreeNodeStream; -import org.antlr.v4.codegen.Target; +import org.antlr.v4.misc.CharSupport; import org.antlr.v4.misc.IntervalSet; import org.antlr.v4.parse.ANTLRParser; import org.antlr.v4.parse.GrammarASTAdaptor; @@ -103,7 +103,7 @@ public class ParserNFAFactory implements NFAFactory { GrammarAST ast = A.left.ast; int ttype = 0; if ( g.isLexer() ) { - ttype = Target.getCharValueFromGrammarCharLiteral(ast.getText()); + ttype = CharSupport.getCharValueFromGrammarCharLiteral(ast.getText()); } else { ttype = g.getTokenType(ast.getText()); diff --git a/tool/src/org/antlr/v4/automata/RangeTransition.java b/tool/src/org/antlr/v4/automata/RangeTransition.java index ecf30fb69..e701dbcaf 100644 --- a/tool/src/org/antlr/v4/automata/RangeTransition.java +++ b/tool/src/org/antlr/v4/automata/RangeTransition.java @@ -1,6 +1,6 @@ package org.antlr.v4.automata; -import org.antlr.v4.codegen.Target; +import org.antlr.v4.misc.CharSupport; import org.antlr.v4.misc.IntervalSet; public class RangeTransition extends Transition { @@ -21,7 +21,7 @@ public class RangeTransition extends Transition { @Override public String toString() { - return Target.getANTLRCharLiteralForChar(from)+".."+ - Target.getANTLRCharLiteralForChar(to); + return CharSupport.getANTLRCharLiteralForChar(from)+".."+ + CharSupport.getANTLRCharLiteralForChar(to); } } diff --git a/tool/src/org/antlr/v4/codegen/CodeGenerator.java b/tool/src/org/antlr/v4/codegen/CodeGenerator.java index 2ac163b70..4e2b7fd1b 100644 --- a/tool/src/org/antlr/v4/codegen/CodeGenerator.java +++ b/tool/src/org/antlr/v4/codegen/CodeGenerator.java @@ -1,8 +1,13 @@ package org.antlr.v4.codegen; +import org.antlr.v4.codegen.src.BitSetDef; import org.antlr.v4.codegen.src.OutputModelObject; +import org.antlr.v4.codegen.src.ParserFile; +import org.antlr.v4.misc.IntSet; +import org.antlr.v4.parse.ANTLRParser; import org.antlr.v4.tool.ErrorType; import org.antlr.v4.tool.Grammar; +import org.antlr.v4.tool.GrammarAST; import org.stringtemplate.v4.*; import java.io.IOException; @@ -19,6 +24,7 @@ public abstract class CodeGenerator { public Grammar g; public Target target; public STGroup templates; + public ParserFile outputModel; public int lineWidth = 72; @@ -72,8 +78,7 @@ public abstract class CodeGenerator { public void write() { OutputModelObject root = buildOutputModel(); - OutputModelWalker walker = new OutputModelWalker(g.tool, templates, - ParserGenerator.modelToTemplateMap); + OutputModelWalker walker = new OutputModelWalker(g.tool, templates); ST outputFileST = walker.walk(root); // WRITE FILES @@ -130,4 +135,16 @@ public abstract class CodeGenerator { // } return g.name+VOCAB_FILE_EXTENSION; } + + public BitSetDef defineBitSet(GrammarAST ast, IntSet follow) { + String inRuleName = ast.nfaState.rule.name; + String elementName = ast.getText(); // assume rule ref + if ( ast.getType() == ANTLRParser.TOKEN_REF ) { + target.getTokenTypeAsTargetLabel(g, ast.getType() ); + } + String name = "FOLLOW_"+elementName+"_in_"+inRuleName+ast.token.getTokenIndex(); + BitSetDef b = new BitSetDef(this, name, follow); + outputModel.bitSetDefs.add(b); + return b; + } } diff --git a/tool/src/org/antlr/v4/codegen/NFABytecodeGenerator.java b/tool/src/org/antlr/v4/codegen/NFABytecodeGenerator.java index b1b934c79..115aff9cf 100644 --- a/tool/src/org/antlr/v4/codegen/NFABytecodeGenerator.java +++ b/tool/src/org/antlr/v4/codegen/NFABytecodeGenerator.java @@ -5,6 +5,7 @@ import org.antlr.runtime.Token; import org.antlr.runtime.tree.CommonTreeNodeStream; import org.antlr.runtime.tree.TreeNodeStream; import org.antlr.v4.codegen.nfa.*; +import org.antlr.v4.misc.CharSupport; import org.antlr.v4.misc.DoubleKeyMap; import org.antlr.v4.parse.ANTLRParser; import org.antlr.v4.parse.GrammarASTAdaptor; @@ -89,7 +90,7 @@ public class NFABytecodeGenerator extends TreeParser { } public void emitString(Token t) { - String chars = Target.getStringFromGrammarStringLiteral(t.getText()); + String chars = CharSupport.getStringFromGrammarStringLiteral(t.getText()); for (char c : chars.toCharArray()) { emit(new MatchInstr(t, c)); } diff --git a/tool/src/org/antlr/v4/codegen/OutputModelWalker.java b/tool/src/org/antlr/v4/codegen/OutputModelWalker.java index 803dd403a..adf6ed8fa 100644 --- a/tool/src/org/antlr/v4/codegen/OutputModelWalker.java +++ b/tool/src/org/antlr/v4/codegen/OutputModelWalker.java @@ -15,20 +15,19 @@ import java.util.*; public class OutputModelWalker { Tool tool; STGroup templates; - Map modelToTemplateMap; + //Map modelToTemplateMap; public OutputModelWalker(Tool tool, - STGroup templates, - Map modelToTemplateMap) + STGroup templates) { this.tool = tool; this.templates = templates; - this.modelToTemplateMap = modelToTemplateMap; + //this.modelToTemplateMap = modelToTemplateMap; } public ST walk(OutputModelObject omo) { // CREATE TEMPLATE FOR THIS OUTPUT OBJECT - String templateName = modelToTemplateMap.get(omo.getClass()); + String templateName = omo.getClass().getSimpleName(); if ( templateName == null ) { tool.errMgr.toolError(ErrorType.NO_MODEL_TO_TEMPLATE_MAPPING, omo.getClass().getSimpleName()); return new BlankST(); diff --git a/tool/src/org/antlr/v4/codegen/ParserGenerator.java b/tool/src/org/antlr/v4/codegen/ParserGenerator.java index 8410f81c1..c73e454bd 100644 --- a/tool/src/org/antlr/v4/codegen/ParserGenerator.java +++ b/tool/src/org/antlr/v4/codegen/ParserGenerator.java @@ -1,30 +1,31 @@ package org.antlr.v4.codegen; -import org.antlr.v4.codegen.src.*; +import org.antlr.v4.codegen.src.OutputModelObject; +import org.antlr.v4.codegen.src.Parser; +import org.antlr.v4.codegen.src.ParserFile; import org.antlr.v4.tool.Grammar; -import java.util.HashMap; -import java.util.Map; - - /** */ public class ParserGenerator extends CodeGenerator { - public static final Map modelToTemplateMap = new HashMap() {{ - put(ParserFile.class, "parserFile"); - put(Parser.class, "parser"); - put(RuleFunction.class, "parserFunction"); - put(DFADef.class, "DFA"); - put(CodeBlock.class, "codeBlock"); - put(LL1Choice.class, "switch"); - put(MatchToken.class, "matchToken"); - }}; +// public static final Map modelToTemplateMap = new HashMap() {{ +// put(ParserFile.class, "parserFile"); +// put(Parser.class, "parser"); +// put(RuleFunction.class, "parserFunction"); +// put(DFADef.class, "DFA"); +// put(CodeBlock.class, "codeBlock"); +// put(LL1Choice.class, "switch"); +// put(MatchToken.class, "matchToken"); +// }}; public ParserGenerator(Grammar g) { super(g); } public OutputModelObject buildOutputModel() { - Parser p = new Parser(this); - return new ParserFile(this, p, getRecognizerFileName()); + ParserFile pf = new ParserFile(this, getRecognizerFileName()); + outputModel = pf; + pf.parser = new Parser(this, pf); // side-effect: fills pf dfa and bitset defs + // at this point, model is built + return outputModel; } } diff --git a/tool/src/org/antlr/v4/codegen/Target.java b/tool/src/org/antlr/v4/codegen/Target.java index 62eda7d0b..3ec6a0a29 100644 --- a/tool/src/org/antlr/v4/codegen/Target.java +++ b/tool/src/org/antlr/v4/codegen/Target.java @@ -1,6 +1,7 @@ package org.antlr.v4.codegen; import org.antlr.v4.automata.Label; +import org.antlr.v4.parse.ANTLRParser; import org.antlr.v4.tool.Grammar; import org.stringtemplate.v4.ST; @@ -8,31 +9,28 @@ import java.io.IOException; /** */ public class Target { - /** When converting ANTLR char and string literals, here is the - * value set of escape chars. + /** For pure strings of Java 16-bit unicode char, how can we display + * it in the target language as a literal. Useful for dumping + * predicates and such that may refer to chars that need to be escaped + * when represented as strings. Also, templates need to be escaped so + * that the target language can hold them as a string. + * + * I have defined (via the constructor) the set of typical escapes, + * but your Target subclass is free to alter the translated chars or + * add more definitions. This is nonstatic so each target can have + * a different set in memory at same time. */ - public static int ANTLRLiteralEscapedCharValue[] = new int[255]; + protected String[] targetCharValueEscape = new String[255]; - /** Given a char, we need to be able to show as an ANTLR literal. - */ - public static String ANTLRLiteralCharValueEscape[] = new String[255]; - - static { - ANTLRLiteralEscapedCharValue['n'] = '\n'; - ANTLRLiteralEscapedCharValue['r'] = '\r'; - ANTLRLiteralEscapedCharValue['t'] = '\t'; - ANTLRLiteralEscapedCharValue['b'] = '\b'; - ANTLRLiteralEscapedCharValue['f'] = '\f'; - ANTLRLiteralEscapedCharValue['\\'] = '\\'; - ANTLRLiteralEscapedCharValue['\''] = '\''; - ANTLRLiteralEscapedCharValue['"'] = '"'; - ANTLRLiteralCharValueEscape['\n'] = "\\n"; - ANTLRLiteralCharValueEscape['\r'] = "\\r"; - ANTLRLiteralCharValueEscape['\t'] = "\\t"; - ANTLRLiteralCharValueEscape['\b'] = "\\b"; - ANTLRLiteralCharValueEscape['\f'] = "\\f"; - ANTLRLiteralCharValueEscape['\\'] = "\\\\"; - ANTLRLiteralCharValueEscape['\''] = "\\'"; + public Target() { + targetCharValueEscape['\n'] = "\\n"; + targetCharValueEscape['\r'] = "\\r"; + targetCharValueEscape['\t'] = "\\t"; + targetCharValueEscape['\b'] = "\\b"; + targetCharValueEscape['\f'] = "\\f"; + targetCharValueEscape['\\'] = "\\\\"; + targetCharValueEscape['\''] = "\\'"; + targetCharValueEscape['"'] = "\\\""; } protected void genRecognizerFile(CodeGenerator generator, @@ -52,96 +50,66 @@ public class Target { { // no header file by default } - - /** Given a literal like (the 3 char sequence with single quotes) 'a', - * return the int value of 'a'. Convert escape sequences here also. + + /** Get a meaningful name for a token type useful during code generation. + * Literals without associated names are converted to the string equivalent + * of their integer values. Used to generate x==ID and x==34 type comparisons + * etc... Essentially we are looking for the most obvious way to refer + * to a token type in the generated code. If in the lexer, return the + * char literal translated to the target language. For example, ttype=10 + * will yield '\n' from the getTokenDisplayName method. That must + * be converted to the target languages literals. For most C-derived + * languages no translation is needed. */ - public static int getCharValueFromGrammarCharLiteral(String literal) { - switch ( literal.length() ) { - case 3 : - // 'x' - return literal.charAt(1); // no escape char - case 4 : - // '\x' (antlr lexer will catch invalid char) - if ( Character.isDigit(literal.charAt(2)) ) { -// ErrorManager.error(ErrorManager.MSG_SYNTAX_ERROR, -// "invalid char literal: "+literal); - return -1; - } - int escChar = literal.charAt(2); - int charVal = ANTLRLiteralEscapedCharValue[escChar]; - if ( charVal==0 ) { - // Unnecessary escapes like '\{' should just yield { - return escChar; - } - return charVal; - case 8 : - // '\u1234' - String unicodeChars = literal.substring(3,literal.length()-1); - return Integer.parseInt(unicodeChars, 16); - default : -// ErrorManager.error(ErrorManager.MSG_SYNTAX_ERROR, -// "invalid char literal: "+literal); - return -1; + public String getTokenTypeAsTargetLabel(Grammar g, int ttype) { + if ( g.getType() == ANTLRParser.LEXER ) { +// String name = g.getTokenDisplayName(ttype); +// return getTargetCharLiteralFromANTLRCharLiteral(this,name); } + String name = g.getTokenDisplayName(ttype); + // If name is a literal, return the token type instead + if ( name.charAt(0)=='\'' ) { + return String.valueOf(ttype); + } + return name; } - public static String getStringFromGrammarStringLiteral(String literal) { - StringBuilder buf = new StringBuilder(); - int n = literal.length(); - int i = 1; // skip first quote - while ( i < (n-1) ) { // scan all but last quote - switch ( literal.charAt(i) ) { - case '\\' : - i++; - if ( literal.charAt(i)=='u' ) { // '\u1234' - i++; - String unicodeChars = literal.substring(3,literal.length()-1); - buf.append((char)Integer.parseInt(unicodeChars, 16)); - } - else { - char escChar = literal.charAt(i); - int charVal = ANTLRLiteralEscapedCharValue[escChar]; - if ( charVal==0 ) buf.append(escChar); // Unnecessary escapes like '\{' should just yield { - else buf.append((char)charVal); - } - break; - default : - buf.append(literal.charAt(i)); - i++; - break; - } + /** Convert from an ANTLR char literal found in a grammar file to + * an equivalent char literal in the target language. For most + * languages, this means leaving 'x' as 'x'. Actually, we need + * to escape '\u000A' so that it doesn't get converted to \n by + * the compiler. Convert the literal to the char value and then + * to an appropriate target char literal. + * + * Expect single quotes around the incoming literal. + */ + public String getTargetCharLiteralCharValue(int c) { + StringBuffer buf = new StringBuffer(); + buf.append('\''); + if ( c'"; - } - if ( c getChildren() { - return new ArrayList() {{ add("funcs"); add("dfaDefs"); }}; + return new ArrayList() {{ add("funcs"); }}; } } diff --git a/tool/src/org/antlr/v4/codegen/src/ParserFile.java b/tool/src/org/antlr/v4/codegen/src/ParserFile.java index 822c5625d..5499f0e70 100644 --- a/tool/src/org/antlr/v4/codegen/src/ParserFile.java +++ b/tool/src/org/antlr/v4/codegen/src/ParserFile.java @@ -9,15 +9,20 @@ import java.util.List; public class ParserFile extends OutputModelObject { public String fileName; public Parser parser; + public List dfaDefs = new ArrayList(); + public List bitSetDefs = new ArrayList(); - public ParserFile(CodeGenerator gen, Parser p, String fileName) { + public ParserFile(CodeGenerator gen, String fileName) { this.gen = gen; - parser = p; this.fileName = fileName; } @Override public List getChildren() { - return new ArrayList() {{ add("parser"); }}; + return new ArrayList() {{ + add("parser"); + add("dfaDefs"); + add("bitSetDefs"); + }}; } } diff --git a/tool/src/org/antlr/v4/misc/CharSupport.java b/tool/src/org/antlr/v4/misc/CharSupport.java new file mode 100644 index 000000000..b5306c0bb --- /dev/null +++ b/tool/src/org/antlr/v4/misc/CharSupport.java @@ -0,0 +1,126 @@ +package org.antlr.v4.misc; + +import org.antlr.v4.automata.Label; + +/** */ +public class CharSupport { + /** When converting ANTLR char and string literals, here is the + * value set of escape chars. + */ + public static int ANTLRLiteralEscapedCharValue[] = new int[255]; + + /** Given a char, we need to be able to show as an ANTLR literal. + */ + public static String ANTLRLiteralCharValueEscape[] = new String[255]; + + static { + ANTLRLiteralEscapedCharValue['n'] = '\n'; + ANTLRLiteralEscapedCharValue['r'] = '\r'; + ANTLRLiteralEscapedCharValue['t'] = '\t'; + ANTLRLiteralEscapedCharValue['b'] = '\b'; + ANTLRLiteralEscapedCharValue['f'] = '\f'; + ANTLRLiteralEscapedCharValue['\\'] = '\\'; + ANTLRLiteralEscapedCharValue['\''] = '\''; + ANTLRLiteralEscapedCharValue['"'] = '"'; + ANTLRLiteralCharValueEscape['\n'] = "\\n"; + ANTLRLiteralCharValueEscape['\r'] = "\\r"; + ANTLRLiteralCharValueEscape['\t'] = "\\t"; + ANTLRLiteralCharValueEscape['\b'] = "\\b"; + ANTLRLiteralCharValueEscape['\f'] = "\\f"; + ANTLRLiteralCharValueEscape['\\'] = "\\\\"; + ANTLRLiteralCharValueEscape['\''] = "\\'"; + } + + /** Return a string representing the escaped char for code c. E.g., If c + * has value 0x100, you will get "\u0100". ASCII gets the usual + * char (non-hex) representation. Control characters are spit out + * as unicode. While this is specially set up for returning Java strings, + * it can be used by any language target that has the same syntax. :) + */ + public static String getANTLRCharLiteralForChar(int c) { + if ( c< Label.MIN_CHAR_VALUE ) { + return "''"; + } + if ( c= Label.MIN_CHAR_VALUE && ttype <= Label.MAX_CHAR_VALUE ) { - return Target.getANTLRCharLiteralForChar(ttype); + return CharSupport.getANTLRCharLiteralForChar(ttype); } // faux label? else if ( ttype<0 ) {