From 5c3c8d6e7a95e5e60a99219c02ff1bae3fcc14d6 Mon Sep 17 00:00:00 2001 From: parrt Date: Tue, 3 Jan 2012 10:58:01 -0800 Subject: [PATCH] Added TokenFactory, CommonTokenFactory. Update the parser in the lexer with methods to set the factory. Alter the default error strategy and the lexer to use the factory. The parser's set token factory method updates the token source, usually the lexer, and the error handling strategy. I had to add the set token factory method to token source as well to make all of this work. [git-p4: depot-paths = "//depot/code/antlr4/main/": change = 9789] --- .../antlr/v4/runtime/ANTLRErrorStrategy.java | 34 ++++------ .../antlr/v4/runtime/CommonTokenFactory.java | 55 ++++++++++++++++ .../v4/runtime/DefaultErrorStrategy.java | 28 ++++---- .../Java/src/org/antlr/v4/runtime/Lexer.java | 66 +++++++++---------- .../Java/src/org/antlr/v4/runtime/Parser.java | 7 ++ .../src/org/antlr/v4/runtime/Recognizer.java | 2 + .../Java/src/org/antlr/v4/runtime/Token.java | 2 +- .../org/antlr/v4/runtime/TokenFactory.java | 47 +++++++++++++ .../src/org/antlr/v4/runtime/TokenSource.java | 3 + .../v4/tool/interp/LexerInterpreter.java | 6 ++ .../antlr/v4/test/TestCommonTokenStream.java | 4 ++ 11 files changed, 185 insertions(+), 69 deletions(-) create mode 100644 runtime/Java/src/org/antlr/v4/runtime/CommonTokenFactory.java create mode 100644 runtime/Java/src/org/antlr/v4/runtime/TokenFactory.java diff --git a/runtime/Java/src/org/antlr/v4/runtime/ANTLRErrorStrategy.java b/runtime/Java/src/org/antlr/v4/runtime/ANTLRErrorStrategy.java index 5d07747c5..5d3a515de 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/ANTLRErrorStrategy.java +++ b/runtime/Java/src/org/antlr/v4/runtime/ANTLRErrorStrategy.java @@ -31,10 +31,8 @@ import org.antlr.v4.runtime.misc.OrderedHashSet; * TODO: what to do about lexers */ public interface ANTLRErrorStrategy { - /** Report any kind of RecognitionException. 
*/ - void reportError(@NotNull Parser recognizer, - @Nullable RecognitionException e) - throws RecognitionException; + /** To create missing tokens, we need a factory */ + public void setTokenFactory(TokenFactory factory); /** When matching elements within alternative, use this method * to recover. The default implementation uses single token @@ -109,13 +107,18 @@ public interface ANTLRErrorStrategy { /** Reset the error handler. Call this when the parser * matches a valid token (indicating no longer in recovery mode) - * and from its own reset method. - */ - void endErrorCondition(@NotNull Parser recognizer); + * and from its own reset method. + */ + void endErrorCondition(@NotNull Parser recognizer); - /** Called when the parser detects a true ambiguity: an input sequence can be matched - * literally by two or more pass through the grammar. ANTLR resolves the ambiguity in - * favor of the alternative appearing first in the grammar. The start and stop index are + /** Report any kind of RecognitionException. */ + void reportError(@NotNull Parser recognizer, + @Nullable RecognitionException e) + throws RecognitionException; + + /** Called when the parser detects a true ambiguity: an input sequence can be matched + * literally by two or more pass through the grammar. ANTLR resolves the ambiguity in + * favor of the alternative appearing first in the grammar. The start and stop index are * zero-based absolute indices into the token stream. ambigAlts is a set of alternative numbers * that can match the input sequence. This method is only called when we are parsing with * full context. @@ -124,17 +127,6 @@ public interface ANTLRErrorStrategy { DFA dfa, int startIndex, int stopIndex, @NotNull IntervalSet ambigAlts, @NotNull OrderedHashSet configs); - /** Called by the parser when it detects an input sequence that can be matched by two paths - * through the grammar. 
The difference between this and the reportAmbiguity method lies in - * the difference between Strong LL parsing and LL parsing. If we are not parsing with context, - * we can't be sure if a conflict is an ambiguity or simply a weakness in the Strong LL parsing - * strategy. If we are parsing with full context, this method is never called. - */ -// void reportConflict(@NotNull BaseRecognizer recognizer, -// int startIndex, int stopIndex, @NotNull IntervalSet ambigAlts, -// @NotNull OrderedHashSet configs); - - void reportAttemptingFullContext(@NotNull Parser recognizer, @NotNull DFA dfa, int startIndex, int stopIndex, diff --git a/runtime/Java/src/org/antlr/v4/runtime/CommonTokenFactory.java b/runtime/Java/src/org/antlr/v4/runtime/CommonTokenFactory.java new file mode 100644 index 000000000..e36a9632c --- /dev/null +++ b/runtime/Java/src/org/antlr/v4/runtime/CommonTokenFactory.java @@ -0,0 +1,55 @@ +/* + [The "BSD license"] + Copyright (c) 2012 Terence Parr + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + 3. The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.antlr.v4.runtime; + +public class CommonTokenFactory implements TokenFactory { + public static final TokenFactory DEFAULT = new CommonTokenFactory(); + + @Override + public CommonToken create(TokenSource source, int type, String text, + int channel, int start, int stop, + int line, int charPositionInLine) + { + CommonToken t = new CommonToken(source, type, channel, start, stop); + t.setLine(line); + t.setCharPositionInLine(charPositionInLine); + if ( text!=null ) { + t.setText(text); + t.setStartIndex(-1); + t.setStopIndex(-1); + } + return t; + } + + @Override + public CommonToken create(int type, String text) { + return new CommonToken(type, text); + } +} diff --git a/runtime/Java/src/org/antlr/v4/runtime/DefaultErrorStrategy.java b/runtime/Java/src/org/antlr/v4/runtime/DefaultErrorStrategy.java index bab64fb6e..8de3ce476 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/DefaultErrorStrategy.java +++ b/runtime/Java/src/org/antlr/v4/runtime/DefaultErrorStrategy.java @@ -39,6 +39,9 @@ import org.antlr.v4.runtime.misc.OrderedHashSet; * and tree parsers. */ public class DefaultErrorStrategy implements ANTLRErrorStrategy { + /** How to create token objects */ + protected TokenFactory _factory = CommonTokenFactory.DEFAULT; + /** This is true after we see an error and before having successfully * matched a token. Prevents generation of more than one error message * per error. 
@@ -55,6 +58,11 @@ public class DefaultErrorStrategy implements ANTLRErrorStrategy { protected IntervalSet lastErrorStates; + @Override + public void setTokenFactory(TokenFactory factory) { + this._factory = factory; + } + @Override public void beginErrorCondition(Parser recognizer) { errorRecoveryMode = true; @@ -354,26 +362,20 @@ public class DefaultErrorStrategy implements ANTLRErrorStrategy { */ protected Token getMissingSymbol(Parser recognizer) { Token currentSymbol = recognizer.getCurrentToken(); - if (!(currentSymbol instanceof Token)) { - throw new UnsupportedOperationException("This error strategy only supports Token symbols."); - } - IntervalSet expecting = getExpectedTokens(recognizer); int expectedTokenType = expecting.getMinElement(); // get any element String tokenText; if ( expectedTokenType== Token.EOF ) tokenText = ""; else tokenText = ""; - CommonToken t = new CommonToken(expectedTokenType, tokenText); - Token current = (Token)currentSymbol; + Token current = currentSymbol; if ( current.getType() == Token.EOF ) { - current = ((TokenStream)recognizer.getInputStream()).LT(-1); + current = recognizer.getInputStream().LT(-1); } - t.line = current.getLine(); - t.charPositionInLine = current.getCharPositionInLine(); - t.channel = Token.DEFAULT_CHANNEL; - t.source = current.getTokenSource(); - t.index = -1; // indicate we conjured this up because it has no index - return (Token)t; + return + _factory.create(current.getTokenSource(), expectedTokenType, tokenText, + Token.DEFAULT_CHANNEL, + -1, -1, + current.getLine(), current.getCharPositionInLine()); } public IntervalSet getExpectedTokens(Parser recognizer) { diff --git a/runtime/Java/src/org/antlr/v4/runtime/Lexer.java b/runtime/Java/src/org/antlr/v4/runtime/Lexer.java index c07634eee..a9b3c531d 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/Lexer.java +++ b/runtime/Java/src/org/antlr/v4/runtime/Lexer.java @@ -50,7 +50,10 @@ public abstract class Lexer extends Recognizer public static final int 
MIN_CHAR_VALUE = '\u0000'; public static final int MAX_CHAR_VALUE = '\uFFFE'; - public CharStream input; + public CharStream _input; + + /** How to create token objects */ + protected TokenFactory _factory = CommonTokenFactory.DEFAULT; /** The goal of all lexer rules/methods is to create a token object. * This is an instance variable as multiple rules may collaborate to @@ -94,13 +97,13 @@ public abstract class Lexer extends Recognizer public String text; public Lexer(CharStream input) { - this.input = input; + this._input = input; } public void reset() { // wack Lexer state variables - if ( input!=null ) { - input.seek(0); // rewind the input + if ( _input !=null ) { + _input.seek(0); // rewind the input } token = null; type = Token.INVALID_TYPE; @@ -124,13 +127,13 @@ public abstract class Lexer extends Recognizer */ @Override public Token nextToken() { - if ( hitEOF ) return emitEOF(); + if ( hitEOF ) return anEOF(); outer: while (true) { token = null; channel = Token.DEFAULT_CHANNEL; - tokenStartCharIndex = input.index(); + tokenStartCharIndex = _input.index(); tokenStartCharPositionInLine = getInterpreter().getCharPositionInLine(); tokenStartLine = getInterpreter().getLine(); text = null; @@ -141,14 +144,14 @@ public abstract class Lexer extends Recognizer // " at index "+input.index()); int ttype; try { - ttype = getInterpreter().match(input, mode); + ttype = getInterpreter().match(_input, mode); } catch (LexerNoViableAltException e) { notifyListeners(e); // report error recover(e); ttype = SKIP; } - if ( input.LA(1)==CharStream.EOF ) { + if ( _input.LA(1)==CharStream.EOF ) { hitEOF = true; } if ( type == Token.INVALID_TYPE ) type = ttype; @@ -195,22 +198,27 @@ public abstract class Lexer extends Recognizer return mode; } + @Override + public void setTokenFactory(TokenFactory factory) { + this._factory = factory; + } + /** Set the char stream and reset the lexer */ @Override public void setInputStream(IntStream input) { - this.input = null; + this._input = 
null; reset(); - this.input = (CharStream)input; + this._input = (CharStream)input; } @Override public String getSourceName() { - return input.getSourceName(); + return _input.getSourceName(); } @Override public CharStream getInputStream() { - return input; + return _input; } /** Currently does not support multiple emits per nextToken invocation @@ -228,35 +236,25 @@ public abstract class Lexer extends Recognizer * outermost lexical rule. The token object should point into the * char buffer start..stop. If there is a text override in 'text', * use that to set the token's text. Override this method to emit - * custom Token objects. - * - * If you are building trees, then you should also override - * Parser or TreeParser.getMissingSymbol(). + * custom Token objects or provide a new factory. */ public Token emit() { - WritableToken t = new CommonToken(this, type, - channel, tokenStartCharIndex, - getCharIndex()-1); - t.setLine(tokenStartLine); - if ( text!=null ) t.setText(text); - t.setCharPositionInLine(tokenStartCharPositionInLine); + Token t = _factory.create(this, type, text, channel, tokenStartCharIndex, getCharIndex()-1, + tokenStartLine, tokenStartCharPositionInLine); emit(t); return t; } - public Token emitEOF() { - WritableToken eof = new CommonToken(this,Token.EOF, - Token.DEFAULT_CHANNEL, - input.index(),input.index()-1); - eof.setLine(getLine()); + public Token anEOF() { + int cpos = getCharPositionInLine(); // The character position for EOF is one beyond the position of // the previous token's last character - int cpos = getCharPositionInLine(); if ( token!=null ) { int n = token.getStopIndex() - token.getStartIndex() + 1; cpos = token.getCharPositionInLine()+n; } - eof.setCharPositionInLine(cpos); + Token eof = _factory.create(this, Token.EOF, null, channel, _input.index(), _input.index()-1, + getLine(), cpos); return eof; } @@ -272,7 +270,7 @@ public abstract class Lexer extends Recognizer /** What is the index of the current character of lookahead? 
*/ public int getCharIndex() { - return input.index(); + return _input.index(); } /** Return the text matched so far for the current token or any @@ -282,7 +280,7 @@ public abstract class Lexer extends Recognizer if ( text!=null ) { return text; } - return getInterpreter().getText(input); + return getInterpreter().getText(_input); // return ((CharStream)input).substring(tokenStartCharIndex,getCharIndex()-1); } @@ -318,12 +316,12 @@ public abstract class Lexer extends Recognizer } public void recover(LexerNoViableAltException e) { - getInterpreter().consume(input); // skip a char and try again + getInterpreter().consume(_input); // skip a char and try again } public void notifyListeners(LexerNoViableAltException e) { String msg = "token recognition error at: '"+ - input.substring(tokenStartCharIndex,input.index())+"'"; + _input.substring(tokenStartCharIndex, _input.index())+"'"; ANTLRErrorListener[] listeners = getListeners(); if ( listeners.length == 0 ) { System.err.println("line "+tokenStartLine+":"+ @@ -364,6 +362,6 @@ public abstract class Lexer extends Recognizer //System.out.println("consuming char "+(char)input.LA(1)+" during recovery"); //re.printStackTrace(); // TODO: Do we lose character or line position information? 
- input.consume(); + _input.consume(); } } diff --git a/runtime/Java/src/org/antlr/v4/runtime/Parser.java b/runtime/Java/src/org/antlr/v4/runtime/Parser.java index 348f9b424..3156ca6d5 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/Parser.java +++ b/runtime/Java/src/org/antlr/v4/runtime/Parser.java @@ -159,6 +159,13 @@ public abstract class Parser extends Recognizer factory) { + _input.getTokenSource().setTokenFactory(factory); + _errHandler.setTokenFactory(factory); + } + @Override public TokenStream getInputStream() { return getTokenStream(); } diff --git a/runtime/Java/src/org/antlr/v4/runtime/Recognizer.java b/runtime/Java/src/org/antlr/v4/runtime/Recognizer.java index 1a2d351a1..7f848fde4 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/Recognizer.java +++ b/runtime/Java/src/org/antlr/v4/runtime/Recognizer.java @@ -134,4 +134,6 @@ public abstract class Recognizer { public abstract IntStream getInputStream(); public abstract void setInputStream(IntStream input); + + public abstract void setTokenFactory(TokenFactory input); } diff --git a/runtime/Java/src/org/antlr/v4/runtime/Token.java b/runtime/Java/src/org/antlr/v4/runtime/Token.java index 52ca12020..a04188047 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/Token.java +++ b/runtime/Java/src/org/antlr/v4/runtime/Token.java @@ -35,7 +35,7 @@ package org.antlr.v4.runtime; */ public interface Token { public static final int INVALID_TYPE = 0; - public static final Token INVALID_TOKEN = new CommonToken(INVALID_TYPE); +// public static final Token INVALID_TOKEN = new CommonToken(INVALID_TYPE); public static final int MIN_TOKEN_TYPE = 1; /** During lookahead operations, this "token" signifies we hit rule end ATN state diff --git a/runtime/Java/src/org/antlr/v4/runtime/TokenFactory.java b/runtime/Java/src/org/antlr/v4/runtime/TokenFactory.java new file mode 100644 index 000000000..dbc8de43a --- /dev/null +++ b/runtime/Java/src/org/antlr/v4/runtime/TokenFactory.java @@ -0,0 +1,47 @@ +/* + [The "BSD license"] + 
Copyright (c) 2012 Terence Parr + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + 3. The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.antlr.v4.runtime; + +/** The default mechanism for creating tokens. It's used by default in Lexer and + * the error handling strategy (to create missing tokens). Notifying the parser + * of a new factory means that it notifies its token source and error strategy. + */ +public interface TokenFactory { + /** This is the method used to create tokens in the lexer and in the + * error handling strategy. If text!=null, then the start and stop positions + * are wiped to -1 and the text override is set in the CommonToken. 
+ */ + Symbol create(TokenSource source, int type, String text, + int channel, int start, int stop, + int line, int charPositionInLine); + + /** Generically useful */ + Symbol create(int type, String text); +} diff --git a/runtime/Java/src/org/antlr/v4/runtime/TokenSource.java b/runtime/Java/src/org/antlr/v4/runtime/TokenSource.java index b1117de67..5ff2e9dfd 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/TokenSource.java +++ b/runtime/Java/src/org/antlr/v4/runtime/TokenSource.java @@ -62,4 +62,7 @@ public interface TokenSource { * ask lexers input stream. */ public String getSourceName(); + + /** Optional method that lets users set factory in lexer or other source */ + public void setTokenFactory(TokenFactory factory); } diff --git a/tool/src/org/antlr/v4/tool/interp/LexerInterpreter.java b/tool/src/org/antlr/v4/tool/interp/LexerInterpreter.java index 6a613cdfd..9bad5fea4 100644 --- a/tool/src/org/antlr/v4/tool/interp/LexerInterpreter.java +++ b/tool/src/org/antlr/v4/tool/interp/LexerInterpreter.java @@ -60,6 +60,11 @@ public class LexerInterpreter implements TokenSource { public String getSourceName() { return g.name; } + @Override + public void setTokenFactory(TokenFactory factory) { + // TODO: use TokenFactory + } + public int getCharPositionInLine() { return 0; } @@ -79,6 +84,7 @@ public class LexerInterpreter implements TokenSource { int tokenStartLine = interp.getLine(); int ttype = interp.match(input, Lexer.DEFAULT_MODE); int stop = input.index()-1; + // TODO: use TokenFactory WritableToken t = new CommonToken(this, ttype, Token.DEFAULT_CHANNEL, start, stop); t.setLine(tokenStartLine); t.setCharPositionInLine(tokenStartCharPositionInLine); diff --git a/tool/test/org/antlr/v4/test/TestCommonTokenStream.java b/tool/test/org/antlr/v4/test/TestCommonTokenStream.java index dcb0d0ae9..636ca98dc 100644 --- a/tool/test/org/antlr/v4/test/TestCommonTokenStream.java +++ b/tool/test/org/antlr/v4/test/TestCommonTokenStream.java @@ -209,6 +209,10 @@ public class 
TestCommonTokenStream extends BaseTest { public CharStream getInputStream() { return null; } + + @Override + public void setTokenFactory(TokenFactory factory) { + } }; CommonTokenStream tokens = new CommonTokenStream(lexer);