From e90b322dd42424f28ce1f0ffdaf548d07d46c18f Mon Sep 17 00:00:00 2001 From: parrt Date: Sat, 28 Jan 2012 14:36:46 -0800 Subject: [PATCH] ~[] stuff is allowed and works inside sets etc... [git-p4: depot-paths = "//depot/code/antlr4/main/": change = 9926] --- CHANGES.txt | 4 ++ .../src/org/antlr/v4/runtime/CharStream.java | 2 + .../antlr/v4/runtime/atn/ATNSimulator.java | 2 +- .../v4/runtime/atn/NotSetTransition.java | 10 +--- tool/playground/E.g | 5 +- .../antlr/v4/automata/LexerATNFactory.java | 49 ++++++------------- .../antlr/v4/automata/ParserATNFactory.java | 3 +- tool/src/org/antlr/v4/parse/ANTLRParser.g | 1 + tool/src/org/antlr/v4/parse/ATNBuilder.g | 1 + .../test/org/antlr/v4/test/TestLexerExec.java | 28 +++++++++++ 10 files changed, 58 insertions(+), 47 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 7c16d8168..3267aabd4 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,9 @@ ANTLR v4 Honey Badger early access +Jan 28, 2012 + +* ~[] stuff is allowed and works inside sets etc... + Jan 22, 2012 * Added ranges, escapes to [a-z] notation in lexer: diff --git a/runtime/Java/src/org/antlr/v4/runtime/CharStream.java b/runtime/Java/src/org/antlr/v4/runtime/CharStream.java index 8269f6d56..da3a490ff 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/CharStream.java +++ b/runtime/Java/src/org/antlr/v4/runtime/CharStream.java @@ -31,6 +31,8 @@ package org.antlr.v4.runtime; /** A source of characters for an ANTLR lexer */ public interface CharStream extends IntStream { public static final int EOF = -1; + public static final int MIN_CHAR = Character.MIN_VALUE; + public static final int MAX_CHAR = Character.MAX_VALUE-1; // FFFE is max /** For unbuffered streams, you can't use this; primarily I'm providing * a useful interface for action code. Just make sure actions don't diff --git a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNSimulator.java b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNSimulator.java index 5e0e454bb..e4515f07c 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNSimulator.java +++ b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNSimulator.java @@ -161,7 +161,7 @@ public abstract class ATNSimulator { ActionTransition a = new ActionTransition(target, arg1, arg2, arg3 != 0); return a; case Transition.SET : return new SetTransition(target, sets.get(arg1)); - case Transition.NOT_SET : return new NotSetTransition(target, sets.get(arg1), null); + case Transition.NOT_SET : return new NotSetTransition(target, sets.get(arg1)); case Transition.WILDCARD : return new WildcardTransition(target); } return null; diff --git a/runtime/Java/src/org/antlr/v4/runtime/atn/NotSetTransition.java b/runtime/Java/src/org/antlr/v4/runtime/atn/NotSetTransition.java index 470a09b7b..f4640f6c6 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/atn/NotSetTransition.java +++ b/runtime/Java/src/org/antlr/v4/runtime/atn/NotSetTransition.java @@ -29,19 +29,13 @@ package org.antlr.v4.runtime.atn; +import org.antlr.v4.runtime.misc.IntervalSet; import org.antlr.v4.runtime.misc.NotNull; import org.antlr.v4.runtime.misc.Nullable; -import org.antlr.v4.runtime.misc.IntervalSet; public class NotSetTransition extends SetTransition { - // keep both set, notSet; we can only compute at construction time - // since only then do we have grammar, which knows token set for complement. - @Nullable - public final IntervalSet notSet; - - public NotSetTransition(@NotNull ATNState target, @Nullable IntervalSet set, @Nullable IntervalSet notSet) { + public NotSetTransition(@NotNull ATNState target, @Nullable IntervalSet set) { super(target, set); - this.notSet = notSet; } @Override diff --git a/tool/playground/E.g b/tool/playground/E.g index 350c06c0b..d768df39c 100644 --- a/tool/playground/E.g +++ b/tool/playground/E.g @@ -1,4 +1,3 @@ lexer grammar E; -I : '0'..'9'+ {System.out.println("I");} ; -ID : [a-zA-Z] [a-zA-Z0-9]* ; -WS : [ \n\u000D] -> skip ; +I : ~[ab] ~[cd]* {System.out.println("I");} ; +WS : [ \n\u000D]+ -> skip ; diff --git a/tool/src/org/antlr/v4/automata/LexerATNFactory.java b/tool/src/org/antlr/v4/automata/LexerATNFactory.java index f2429ec5c..8ffc78839 100644 --- a/tool/src/org/antlr/v4/automata/LexerATNFactory.java +++ b/tool/src/org/antlr/v4/automata/LexerATNFactory.java @@ -30,7 +30,6 @@ package org.antlr.v4.automata; import org.antlr.runtime.CommonToken; -import org.antlr.runtime.Token; import org.antlr.v4.codegen.CodeGenerator; import org.antlr.v4.misc.CharSupport; import org.antlr.v4.parse.ANTLRParser; @@ -161,20 +160,21 @@ public class LexerATNFactory extends ParserATNFactory { ATNState right = newState(associatedAST); IntervalSet set = new IntervalSet(); for (GrammarAST t : alts) { - if ( t.getType()== ANTLRParser.RANGE ) { + if ( t.getType()==ANTLRParser.RANGE ) { int a = CharSupport.getCharValueFromGrammarCharLiteral(t.getChild(0).getText()); int b = CharSupport.getCharValueFromGrammarCharLiteral(t.getChild(1).getText()); set.add(a, b); } + else if ( t.getType()==ANTLRParser.LEXER_CHAR_SET ) { + set.addAll(getSetFromCharSetLiteral(t)); + } else { int c = CharSupport.getCharValueFromGrammarCharLiteral(t.getText()); set.add(c); } } if ( invert ) { - // TODO: what? should be chars not token types - IntervalSet notSet = set.complement(Token.MIN_TOKEN_TYPE, g.getMaxTokenType()); - left.addTransition(new NotSetTransition(right, set, notSet)); + left.addTransition(new NotSetTransition(right, set)); } else { left.addTransition(new SetTransition(right, set)); @@ -210,36 +210,21 @@ public class LexerATNFactory extends ParserATNFactory { public Handle charSetLiteral(GrammarAST charSetAST) { ATNState left = newState(charSetAST); ATNState right = newState(charSetAST); - String cset = '"'+charSetAST.getText()+'"'; + IntervalSet set = getSetFromCharSetLiteral(charSetAST); + left.addTransition(new SetTransition(right, set)); + charSetAST.atnState = left; + return new Handle(left, right); + } + public IntervalSet getSetFromCharSetLiteral(GrammarAST charSetAST) { + String chars = charSetAST.getText(); + chars = chars.substring(1, chars.length()-1); + String cset = '"'+ chars +'"'; IntervalSet set = new IntervalSet(); -// int n = cset.length(); -// int i = 0; -// while ( i < n ) { -// if ( (i+2)=n ) break; // ignore spurious \ on end -// if ( cset.charAt(i+1) == 'u' ) end = i+6; -// if ( end>n ) break; -// int c = CharSupport.getCharValueFromCharInGrammarLiteral(cset.substring(i,end)); -// set.add(c); -// i = end; -// } -// else { -// set.add(cset.charAt(i)); -// i++; -// } -// } // unescape all valid escape char like \n, leaving escaped dashes as '\-' // so we can avoid seeing them as '-' range ops. - String chars = CharSupport.getStringFromGrammarStringLiteral(cset); + chars = CharSupport.getStringFromGrammarStringLiteral(cset); // now make x-y become set of char int n = chars.length(); for (int i=0; i< n; i++) { @@ -257,9 +242,7 @@ public class LexerATNFactory extends ParserATNFactory { set.add(c); } } - left.addTransition(new SetTransition(right, set)); - charSetAST.atnState = left; - return new Handle(left, right); + return set; } @Override diff --git a/tool/src/org/antlr/v4/automata/ParserATNFactory.java b/tool/src/org/antlr/v4/automata/ParserATNFactory.java index b88488965..f5abd4657 100644 --- a/tool/src/org/antlr/v4/automata/ParserATNFactory.java +++ b/tool/src/org/antlr/v4/automata/ParserATNFactory.java @@ -179,8 +179,7 @@ public class ParserATNFactory implements ATNFactory { set.add(ttype); } if ( invert ) { - IntervalSet notSet = set.complement(Token.MIN_TOKEN_TYPE, g.getMaxTokenType()); - left.addTransition(new NotSetTransition(right, set, notSet)); + left.addTransition(new NotSetTransition(right, set)); } else { left.addTransition(new SetTransition(right, set)); diff --git a/tool/src/org/antlr/v4/parse/ANTLRParser.g b/tool/src/org/antlr/v4/parse/ANTLRParser.g index 8644980c7..9758da126 100644 --- a/tool/src/org/antlr/v4/parse/ANTLRParser.g +++ b/tool/src/org/antlr/v4/parse/ANTLRParser.g @@ -827,6 +827,7 @@ setElement : TOKEN_REF | STRING_LITERAL | range + | LEXER_CHAR_SET ; // ------------- diff --git a/tool/src/org/antlr/v4/parse/ATNBuilder.g b/tool/src/org/antlr/v4/parse/ATNBuilder.g index c6985daa1..8ab10147c 100644 --- a/tool/src/org/antlr/v4/parse/ATNBuilder.g +++ b/tool/src/org/antlr/v4/parse/ATNBuilder.g @@ -163,6 +163,7 @@ setElement : STRING_LITERAL | TOKEN_REF | ^(RANGE a=STRING_LITERAL b=STRING_LITERAL) + | LEXER_CHAR_SET ; atom returns [ATNFactory.Handle p] diff --git a/tool/test/org/antlr/v4/test/TestLexerExec.java b/tool/test/org/antlr/v4/test/TestLexerExec.java index 14c664ee0..4d95ab6d5 100644 --- a/tool/test/org/antlr/v4/test/TestLexerExec.java +++ b/tool/test/org/antlr/v4/test/TestLexerExec.java @@ -228,6 +228,34 @@ public class TestLexerExec extends BaseTest { assertEquals(expecting, found); } + @Test public void testCharSetNot() throws Exception { + String grammar = + "lexer grammar L;\n"+ + "I : ~[ab \n] ~[ \ncd]* {System.out.println(\"I\");} ;\n"+ + "WS : [ \\n\\u000D]+ -> skip ;"; + String found = execLexer("L.g", grammar, "L", "xaf"); + String expecting = + "I\n" + + "[@0,0:2='xaf',<3>,1:0]\n" + + "[@1,3:2='',<-1>,1:3]\n"; + assertEquals(expecting, found); + } + + @Test public void testCharSetInSet() throws Exception { + String grammar = + "lexer grammar L;\n"+ + "I : (~[ab \n]|'a') {System.out.println(\"I\");} ;\n"+ + "WS : [ \\n\\u000D]+ -> skip ;"; + String found = execLexer("L.g", grammar, "L", "a x"); + String expecting = + "I\n" + + "I\n" + + "[@0,0:0='a',<3>,1:0]\n" + + "[@1,2:2='x',<3>,1:2]\n" + + "[@2,3:2='',<-1>,1:3]\n"; + assertEquals(expecting, found); + } + @Test public void testCharSetRange() throws Exception { String grammar = "lexer grammar L;\n"+