Fix recursive rule bug in lexer: the lexer ATN simulator was not checking for an empty stack at rule stop states.

This commit is contained in:
Terence Parr 2012-09-23 18:04:46 -07:00
parent 1b60543207
commit 262a331a5b
6 changed files with 81 additions and 7 deletions

View File

@ -332,6 +332,22 @@ public abstract class Lexer extends Recognizer<Integer, LexerATNSimulator>
this._token = _token;
}
/**
 * Stores the given token type in this lexer's {@code _type} field.
 *
 * @param ttype the token type value to record
 */
public void setType(int ttype) {
	this._type = ttype;
}
/**
 * Returns the token type currently held in this lexer's {@code _type} field.
 *
 * @return the current value of {@code _type}
 */
public int getType() {
	return this._type;
}
/**
 * Stores the given channel number in this lexer's {@code _channel} field.
 *
 * @param channel the channel value to record
 */
public void setChannel(int channel) {
	this._channel = channel;
}
/**
 * Returns the channel number currently held in this lexer's {@code _channel} field.
 *
 * @return the current value of {@code _channel}
 */
public int getChannel() {
	return this._channel;
}
/**
 * Returns the names of this lexer's modes.
 *
 * <p>This base implementation always returns {@code null}.
 * NOTE(review): presumably generated lexers override this to supply the
 * real mode names -- confirm against the code generator templates.
 *
 * @return {@code null} in this implementation
 */
public String[] getModeNames() {
return null;
}

View File

@ -436,7 +436,7 @@ public class LexerATNSimulator extends ATNSimulator {
// that rule is done. this is how we cut off nongreedy .+ loops.
reach = deleteWildcardConfigsForAlt(reach, ci, c.alt);
// move to next char, looking for longer match
// move to next char, looking for longer match
// (we continue processing if there are states in reach)
}
}
@ -583,6 +583,13 @@ public class LexerATNSimulator extends ATNSimulator {
for (SingletonPredictionContext ctx : config.context) {
if ( !ctx.isEmpty() ) {
PredictionContext newContext = ctx.parent; // "pop" invoking state
if ( ctx.invokingState==PredictionContext.EMPTY_FULL_CTX_INVOKING_STATE ) {
// we have no context info. Don't pursue.
if ( debug ) System.out.println("FALLING off token "+
recog.getRuleNames()[config.state.ruleIndex]);
configs.add(config);
continue;
}
ATNState invokingState = atn.states.get(ctx.invokingState);
RuleTransition rt = (RuleTransition)invokingState.transition(0);
ATNState retState = rt.followState;

2
tool/playground/A-input Normal file
View File

@ -0,0 +1,2 @@
{{x}
}

View File

@ -1,6 +1,25 @@
grammar A;
lexer grammar A;
s : INT { System.out.println($start.getText());} ;
/*
For input
INT : [0-9]+ ;
WS : [ \t\n]+ -> skip ;
{{x}
}
This matches {{x} and then thinks that it can stop because it can match that
without going into the recursive call. The context for the stop state in ACTION
is (2,1,[[$, 6 $]]) so it deletes everything else associated with this token.
Seems like we should favor the first alternative, but we can't do that within
a single rule.
Weird, though, that this one works:
STRING : '"' ( '\\' '"' | . )* '"' ;
Wouldn't it also get to the end of the rule by the wild-card route?
Maybe it's a simple order of operations, or the order in which I process the
alternatives?
*/
ACTION : '{' ( ACTION | . )* '}' ;
WS : [ \r\t\n]+ -> skip ;

View File

@ -1,2 +1,6 @@
lexer grammar T;
A : 'a';
grammar T;
s : INT { System.out.println($start.getText());} ;
INT : [0-9]+ {$type = 3; String x = $text; $channel, $mode} ;
WS : [ \t\n]+ -> skip ;

View File

@ -0,0 +1,26 @@
import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.CommonTokenFactory;
import org.antlr.v4.runtime.CommonTokenStream;
import org.antlr.v4.runtime.UnbufferedCharStream;
import java.io.FileInputStream;
import java.io.InputStream;
/**
 * Playground driver for the lexer class {@code A}.
 *
 * <p>Reads characters from the file named by the first command-line argument,
 * or from standard input when no argument is given, runs them through the
 * lexer, and prints the resulting token list to standard output.
 */
public class TestA {
	/**
	 * Entry point.
	 *
	 * @param args optional; when present, {@code args[0]} is the path of the
	 *             input file. With no arguments, input is read from
	 *             {@code System.in}.
	 * @throws Exception if the input file cannot be opened, or if reading or
	 *                   lexing fails
	 */
	public static void main(String[] args) throws Exception {
		String inputFile = null;
		if ( args.length>0 ) inputFile = args[0];
		InputStream is = System.in;
		if ( inputFile!=null ) {
			is = new FileInputStream(inputFile);
		}
		try {
			CharStream input = new UnbufferedCharStream(is);
			A lex = new A(input);
			// NOTE(review): presumably the 'true' flag makes tokens carry their
			// own copy of the text, since an unbuffered stream cannot be
			// re-read later -- confirm against CommonTokenFactory.
			lex.setTokenFactory(new CommonTokenFactory(true));
			CommonTokenStream tokens = new CommonTokenStream(lex);
			tokens.fill();
			System.out.println(tokens.getTokens());
		}
		finally {
			// Close only the stream we opened ourselves; System.in is left
			// untouched.
			if ( is!=System.in ) {
				is.close();
			}
		}
	}
}