forked from jasder/antlr
Added TokenFactory and CommonTokenFactory. Updated the parser and the lexer with methods to set the factory. Altered the default error strategy and the lexer to use the factory. The parser's set-token-factory method updates the token source (usually the lexer) and the error handling strategy. I had to add the set-token-factory method to the token source as well to make all of this work.
[git-p4: depot-paths = "//depot/code/antlr4/main/": change = 9789]
This commit is contained in:
parent
3aeeb2b277
commit
5c3c8d6e7a
|
@ -31,10 +31,8 @@ import org.antlr.v4.runtime.misc.OrderedHashSet;
|
|||
* TODO: what to do about lexers
|
||||
*/
|
||||
public interface ANTLRErrorStrategy {
|
||||
/** Report any kind of RecognitionException. */
|
||||
void reportError(@NotNull Parser recognizer,
|
||||
@Nullable RecognitionException e)
|
||||
throws RecognitionException;
|
||||
/** To create missing tokens, we need a factory */
|
||||
public void setTokenFactory(TokenFactory<?> factory);
|
||||
|
||||
/** When matching elements within alternative, use this method
|
||||
* to recover. The default implementation uses single token
|
||||
|
@ -113,6 +111,11 @@ public interface ANTLRErrorStrategy {
|
|||
*/
|
||||
void endErrorCondition(@NotNull Parser recognizer);
|
||||
|
||||
/** Report any kind of RecognitionException. */
|
||||
void reportError(@NotNull Parser recognizer,
|
||||
@Nullable RecognitionException e)
|
||||
throws RecognitionException;
|
||||
|
||||
/** Called when the parser detects a true ambiguity: an input sequence can be matched
|
||||
* literally by two or more paths through the grammar. ANTLR resolves the ambiguity in
|
||||
* favor of the alternative appearing first in the grammar. The start and stop index are
|
||||
|
@ -124,17 +127,6 @@ public interface ANTLRErrorStrategy {
|
|||
DFA dfa, int startIndex, int stopIndex, @NotNull IntervalSet ambigAlts,
|
||||
@NotNull OrderedHashSet<ATNConfig> configs);
|
||||
|
||||
/** Called by the parser when it detects an input sequence that can be matched by two paths
|
||||
* through the grammar. The difference between this and the reportAmbiguity method lies in
|
||||
* the difference between Strong LL parsing and LL parsing. If we are not parsing with context,
|
||||
* we can't be sure if a conflict is an ambiguity or simply a weakness in the Strong LL parsing
|
||||
* strategy. If we are parsing with full context, this method is never called.
|
||||
*/
|
||||
// void reportConflict(@NotNull BaseRecognizer recognizer,
|
||||
// int startIndex, int stopIndex, @NotNull IntervalSet ambigAlts,
|
||||
// @NotNull OrderedHashSet<ATNConfig> configs);
|
||||
|
||||
|
||||
void reportAttemptingFullContext(@NotNull Parser recognizer,
|
||||
@NotNull DFA dfa,
|
||||
int startIndex, int stopIndex,
|
||||
|
|
|
@ -0,0 +1,55 @@
|
|||
/*
|
||||
[The "BSD license"]
|
||||
Copyright (c) 2012 Terence Parr
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
3. The name of the author may not be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
||||
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
||||
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
||||
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
package org.antlr.v4.runtime;
|
||||
|
||||
public class CommonTokenFactory implements TokenFactory<CommonToken> {
|
||||
public static final TokenFactory<CommonToken> DEFAULT = new CommonTokenFactory();
|
||||
|
||||
@Override
|
||||
public CommonToken create(TokenSource source, int type, String text,
|
||||
int channel, int start, int stop,
|
||||
int line, int charPositionInLine)
|
||||
{
|
||||
CommonToken t = new CommonToken(source, type, channel, start, stop);
|
||||
t.setLine(line);
|
||||
t.setCharPositionInLine(charPositionInLine);
|
||||
if ( text!=null ) {
|
||||
t.setText(text);
|
||||
t.setStartIndex(-1);
|
||||
t.setStopIndex(-1);
|
||||
}
|
||||
return t;
|
||||
}
|
||||
|
||||
@Override
|
||||
public CommonToken create(int type, String text) {
|
||||
return new CommonToken(type, text);
|
||||
}
|
||||
}
|
|
@ -39,6 +39,9 @@ import org.antlr.v4.runtime.misc.OrderedHashSet;
|
|||
* and tree parsers.
|
||||
*/
|
||||
public class DefaultErrorStrategy implements ANTLRErrorStrategy {
|
||||
/** How to create token objects */
|
||||
protected TokenFactory<?> _factory = CommonTokenFactory.DEFAULT;
|
||||
|
||||
/** This is true after we see an error and before having successfully
|
||||
* matched a token. Prevents generation of more than one error message
|
||||
* per error.
|
||||
|
@ -55,6 +58,11 @@ public class DefaultErrorStrategy implements ANTLRErrorStrategy {
|
|||
|
||||
protected IntervalSet lastErrorStates;
|
||||
|
||||
@Override
|
||||
public void setTokenFactory(TokenFactory<?> factory) {
|
||||
this._factory = factory;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void beginErrorCondition(Parser recognizer) {
|
||||
errorRecoveryMode = true;
|
||||
|
@ -354,26 +362,20 @@ public class DefaultErrorStrategy implements ANTLRErrorStrategy {
|
|||
*/
|
||||
protected Token getMissingSymbol(Parser recognizer) {
|
||||
Token currentSymbol = recognizer.getCurrentToken();
|
||||
if (!(currentSymbol instanceof Token)) {
|
||||
throw new UnsupportedOperationException("This error strategy only supports Token symbols.");
|
||||
}
|
||||
|
||||
IntervalSet expecting = getExpectedTokens(recognizer);
|
||||
int expectedTokenType = expecting.getMinElement(); // get any element
|
||||
String tokenText;
|
||||
if ( expectedTokenType== Token.EOF ) tokenText = "<missing EOF>";
|
||||
else tokenText = "<missing "+recognizer.getTokenNames()[expectedTokenType]+">";
|
||||
CommonToken t = new CommonToken(expectedTokenType, tokenText);
|
||||
Token current = (Token)currentSymbol;
|
||||
Token current = currentSymbol;
|
||||
if ( current.getType() == Token.EOF ) {
|
||||
current = ((TokenStream)recognizer.getInputStream()).LT(-1);
|
||||
current = recognizer.getInputStream().LT(-1);
|
||||
}
|
||||
t.line = current.getLine();
|
||||
t.charPositionInLine = current.getCharPositionInLine();
|
||||
t.channel = Token.DEFAULT_CHANNEL;
|
||||
t.source = current.getTokenSource();
|
||||
t.index = -1; // indicate we conjured this up because it has no index
|
||||
return (Token)t;
|
||||
return
|
||||
_factory.create(current.getTokenSource(), expectedTokenType, tokenText,
|
||||
Token.DEFAULT_CHANNEL,
|
||||
-1, -1,
|
||||
current.getLine(), current.getCharPositionInLine());
|
||||
}
|
||||
|
||||
public IntervalSet getExpectedTokens(Parser recognizer) {
|
||||
|
|
|
@ -50,7 +50,10 @@ public abstract class Lexer extends Recognizer<Integer, LexerATNSimulator>
|
|||
public static final int MIN_CHAR_VALUE = '\u0000';
|
||||
public static final int MAX_CHAR_VALUE = '\uFFFE';
|
||||
|
||||
public CharStream input;
|
||||
public CharStream _input;
|
||||
|
||||
/** How to create token objects */
|
||||
protected TokenFactory<?> _factory = CommonTokenFactory.DEFAULT;
|
||||
|
||||
/** The goal of all lexer rules/methods is to create a token object.
|
||||
* This is an instance variable as multiple rules may collaborate to
|
||||
|
@ -94,13 +97,13 @@ public abstract class Lexer extends Recognizer<Integer, LexerATNSimulator>
|
|||
public String text;
|
||||
|
||||
public Lexer(CharStream input) {
|
||||
this.input = input;
|
||||
this._input = input;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
// wack Lexer state variables
|
||||
if ( input!=null ) {
|
||||
input.seek(0); // rewind the input
|
||||
if ( _input !=null ) {
|
||||
_input.seek(0); // rewind the input
|
||||
}
|
||||
token = null;
|
||||
type = Token.INVALID_TYPE;
|
||||
|
@ -124,13 +127,13 @@ public abstract class Lexer extends Recognizer<Integer, LexerATNSimulator>
|
|||
*/
|
||||
@Override
|
||||
public Token nextToken() {
|
||||
if ( hitEOF ) return emitEOF();
|
||||
if ( hitEOF ) return anEOF();
|
||||
|
||||
outer:
|
||||
while (true) {
|
||||
token = null;
|
||||
channel = Token.DEFAULT_CHANNEL;
|
||||
tokenStartCharIndex = input.index();
|
||||
tokenStartCharIndex = _input.index();
|
||||
tokenStartCharPositionInLine = getInterpreter().getCharPositionInLine();
|
||||
tokenStartLine = getInterpreter().getLine();
|
||||
text = null;
|
||||
|
@ -141,14 +144,14 @@ public abstract class Lexer extends Recognizer<Integer, LexerATNSimulator>
|
|||
// " at index "+input.index());
|
||||
int ttype;
|
||||
try {
|
||||
ttype = getInterpreter().match(input, mode);
|
||||
ttype = getInterpreter().match(_input, mode);
|
||||
}
|
||||
catch (LexerNoViableAltException e) {
|
||||
notifyListeners(e); // report error
|
||||
recover(e);
|
||||
ttype = SKIP;
|
||||
}
|
||||
if ( input.LA(1)==CharStream.EOF ) {
|
||||
if ( _input.LA(1)==CharStream.EOF ) {
|
||||
hitEOF = true;
|
||||
}
|
||||
if ( type == Token.INVALID_TYPE ) type = ttype;
|
||||
|
@ -195,22 +198,27 @@ public abstract class Lexer extends Recognizer<Integer, LexerATNSimulator>
|
|||
return mode;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setTokenFactory(TokenFactory<?> factory) {
|
||||
this._factory = factory;
|
||||
}
|
||||
|
||||
/** Set the char stream and reset the lexer */
|
||||
@Override
|
||||
public void setInputStream(IntStream input) {
|
||||
this.input = null;
|
||||
this._input = null;
|
||||
reset();
|
||||
this.input = (CharStream)input;
|
||||
this._input = (CharStream)input;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getSourceName() {
|
||||
return input.getSourceName();
|
||||
return _input.getSourceName();
|
||||
}
|
||||
|
||||
@Override
|
||||
public CharStream getInputStream() {
|
||||
return input;
|
||||
return _input;
|
||||
}
|
||||
|
||||
/** Currently does not support multiple emits per nextToken invocation
|
||||
|
@ -228,35 +236,25 @@ public abstract class Lexer extends Recognizer<Integer, LexerATNSimulator>
|
|||
* outermost lexical rule. The token object should point into the
|
||||
* char buffer start..stop. If there is a text override in 'text',
|
||||
* use that to set the token's text. Override this method to emit
|
||||
* custom Token objects.
|
||||
*
|
||||
* If you are building trees, then you should also override
|
||||
* Parser or TreeParser.getMissingSymbol().
|
||||
* custom Token objects or provide a new factory.
|
||||
*/
|
||||
public Token emit() {
|
||||
WritableToken t = new CommonToken(this, type,
|
||||
channel, tokenStartCharIndex,
|
||||
getCharIndex()-1);
|
||||
t.setLine(tokenStartLine);
|
||||
if ( text!=null ) t.setText(text);
|
||||
t.setCharPositionInLine(tokenStartCharPositionInLine);
|
||||
Token t = _factory.create(this, type, text, channel, tokenStartCharIndex, getCharIndex()-1,
|
||||
tokenStartLine, tokenStartCharPositionInLine);
|
||||
emit(t);
|
||||
return t;
|
||||
}
|
||||
|
||||
public Token emitEOF() {
|
||||
WritableToken eof = new CommonToken(this,Token.EOF,
|
||||
Token.DEFAULT_CHANNEL,
|
||||
input.index(),input.index()-1);
|
||||
eof.setLine(getLine());
|
||||
public Token anEOF() {
|
||||
int cpos = getCharPositionInLine();
|
||||
// The character position for EOF is one beyond the position of
|
||||
// the previous token's last character
|
||||
int cpos = getCharPositionInLine();
|
||||
if ( token!=null ) {
|
||||
int n = token.getStopIndex() - token.getStartIndex() + 1;
|
||||
cpos = token.getCharPositionInLine()+n;
|
||||
}
|
||||
eof.setCharPositionInLine(cpos);
|
||||
Token eof = _factory.create(this, Token.EOF, null, channel, _input.index(), _input.index()-1,
|
||||
getLine(), cpos);
|
||||
return eof;
|
||||
}
|
||||
|
||||
|
@ -272,7 +270,7 @@ public abstract class Lexer extends Recognizer<Integer, LexerATNSimulator>
|
|||
|
||||
/** What is the index of the current character of lookahead? */
|
||||
public int getCharIndex() {
|
||||
return input.index();
|
||||
return _input.index();
|
||||
}
|
||||
|
||||
/** Return the text matched so far for the current token or any
|
||||
|
@ -282,7 +280,7 @@ public abstract class Lexer extends Recognizer<Integer, LexerATNSimulator>
|
|||
if ( text!=null ) {
|
||||
return text;
|
||||
}
|
||||
return getInterpreter().getText(input);
|
||||
return getInterpreter().getText(_input);
|
||||
// return ((CharStream)input).substring(tokenStartCharIndex,getCharIndex()-1);
|
||||
}
|
||||
|
||||
|
@ -318,12 +316,12 @@ public abstract class Lexer extends Recognizer<Integer, LexerATNSimulator>
|
|||
}
|
||||
|
||||
public void recover(LexerNoViableAltException e) {
|
||||
getInterpreter().consume(input); // skip a char and try again
|
||||
getInterpreter().consume(_input); // skip a char and try again
|
||||
}
|
||||
|
||||
public void notifyListeners(LexerNoViableAltException e) {
|
||||
String msg = "token recognition error at: '"+
|
||||
input.substring(tokenStartCharIndex,input.index())+"'";
|
||||
_input.substring(tokenStartCharIndex, _input.index())+"'";
|
||||
ANTLRErrorListener<Integer>[] listeners = getListeners();
|
||||
if ( listeners.length == 0 ) {
|
||||
System.err.println("line "+tokenStartLine+":"+
|
||||
|
@ -364,6 +362,6 @@ public abstract class Lexer extends Recognizer<Integer, LexerATNSimulator>
|
|||
//System.out.println("consuming char "+(char)input.LA(1)+" during recovery");
|
||||
//re.printStackTrace();
|
||||
// TODO: Do we lose character or line position information?
|
||||
input.consume();
|
||||
_input.consume();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -159,6 +159,13 @@ public abstract class Parser extends Recognizer<Token, v2ParserATNSimulator<Toke
|
|||
return syntaxErrors;
|
||||
}
|
||||
|
||||
/** Tell our token source and error strategy about a new way to create tokens */
|
||||
@Override
|
||||
public void setTokenFactory(TokenFactory<?> factory) {
|
||||
_input.getTokenSource().setTokenFactory(factory);
|
||||
_errHandler.setTokenFactory(factory);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream getInputStream() { return getTokenStream(); }
|
||||
|
||||
|
|
|
@ -134,4 +134,6 @@ public abstract class Recognizer<Symbol, ATNInterpreter extends ATNSimulator> {
|
|||
public abstract IntStream getInputStream();
|
||||
|
||||
public abstract void setInputStream(IntStream input);
|
||||
|
||||
public abstract void setTokenFactory(TokenFactory<?> input);
|
||||
}
|
||||
|
|
|
@ -35,7 +35,7 @@ package org.antlr.v4.runtime;
|
|||
*/
|
||||
public interface Token {
|
||||
public static final int INVALID_TYPE = 0;
|
||||
public static final Token INVALID_TOKEN = new CommonToken(INVALID_TYPE);
|
||||
// public static final Token INVALID_TOKEN = new CommonToken(INVALID_TYPE);
|
||||
public static final int MIN_TOKEN_TYPE = 1;
|
||||
|
||||
/** During lookahead operations, this "token" signifies we hit rule end ATN state
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
/*
|
||||
[The "BSD license"]
|
||||
Copyright (c) 2012 Terence Parr
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
3. The name of the author may not be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
||||
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
||||
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
||||
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
package org.antlr.v4.runtime;
|
||||
|
||||
/** The default mechanism for creating tokens. It's used by default in Lexer and
|
||||
* the error handling strategy (to create missing tokens). Notifying the parser
|
||||
* of a new factory means that it notifies its token source and error strategy.
|
||||
*/
|
||||
public interface TokenFactory<Symbol extends Token> {
|
||||
/** This is the method used to create tokens in the lexer and in the
|
||||
* error handling strategy. If text!=null, then the start and stop positions
|
||||
* are wiped to -1 and the text override is set in the CommonToken.
|
||||
*/
|
||||
Symbol create(TokenSource source, int type, String text,
|
||||
int channel, int start, int stop,
|
||||
int line, int charPositionInLine);
|
||||
|
||||
/** Generically useful */
|
||||
Symbol create(int type, String text);
|
||||
}
|
|
@ -62,4 +62,7 @@ public interface TokenSource {
|
|||
* ask lexers input stream.
|
||||
*/
|
||||
public String getSourceName();
|
||||
|
||||
/** Optional method that lets users set factory in lexer or other source */
|
||||
public void setTokenFactory(TokenFactory<?> factory);
|
||||
}
|
||||
|
|
|
@ -60,6 +60,11 @@ public class LexerInterpreter implements TokenSource {
|
|||
|
||||
public String getSourceName() { return g.name; }
|
||||
|
||||
@Override
|
||||
public void setTokenFactory(TokenFactory<?> factory) {
|
||||
// TODO: use TokenFactory
|
||||
}
|
||||
|
||||
public int getCharPositionInLine() {
|
||||
return 0;
|
||||
}
|
||||
|
@ -79,6 +84,7 @@ public class LexerInterpreter implements TokenSource {
|
|||
int tokenStartLine = interp.getLine();
|
||||
int ttype = interp.match(input, Lexer.DEFAULT_MODE);
|
||||
int stop = input.index()-1;
|
||||
// TODO: use TokenFactory
|
||||
WritableToken t = new CommonToken(this, ttype, Token.DEFAULT_CHANNEL, start, stop);
|
||||
t.setLine(tokenStartLine);
|
||||
t.setCharPositionInLine(tokenStartCharPositionInLine);
|
||||
|
|
|
@ -209,6 +209,10 @@ public class TestCommonTokenStream extends BaseTest {
|
|||
public CharStream getInputStream() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setTokenFactory(TokenFactory<?> factory) {
|
||||
}
|
||||
};
|
||||
|
||||
CommonTokenStream tokens = new CommonTokenStream(lexer);
|
||||
|
|
Loading…
Reference in New Issue