ugh. fix "[a-z]" for real now

This commit is contained in:
Terence Parr 2012-02-02 10:41:27 -08:00
parent 5cec4721ab
commit c47c07299a
4 changed files with 55 additions and 41 deletions

View File

@ -368,6 +368,7 @@ public class Tool {
GrammarASTAdaptor adaptor = new GrammarASTAdaptor(in);
ANTLRLexer lexer = new ANTLRLexer(in);
CommonTokenStream tokens = new CommonTokenStream(lexer);
lexer.tokens = tokens;
ToolANTLRParser p = new ToolANTLRParser(tokens, this);
p.setTreeAdaptor(adaptor);
ParserRuleReturnScope r = p.grammarSpec();

View File

@ -157,6 +157,7 @@ public class LeftRecursiveRuleTransformer {
ANTLRLexer lexer = new ANTLRLexer(new ANTLRStringStream(ruleText));
GrammarASTAdaptor adaptor = new GrammarASTAdaptor();
CommonTokenStream tokens = new CommonTokenStream(lexer);
lexer.tokens = tokens;
ToolANTLRParser p = new ToolANTLRParser(tokens, tool);
p.setTreeAdaptor(adaptor);
try {

View File

@ -119,7 +119,27 @@ package org.antlr.v4.parse;
@members {
public boolean isLexer = false;
public CommonTokenStream tokens; // track stream we push to; need for context info
public boolean isLexerRule = false;
/** Walk backwards through the tokens already emitted into this.tokens,
 *  searching for the token that opened the current rule or subrule:
 *  a RULE_REF or TOKEN_REF (rule definition start) or an LPAREN
 *  (subrule start). Returns that token, or null when the stream is
 *  unavailable or no such token precedes the current position.
 */
public Token getRuleOrSubruleStartToken() {
	if ( tokens==null ) return null;
	int size = tokens.size();
	int pos = tokens.index();
	// while lexing, index() can equal size(); clamp to the last real token
	if ( pos>=size ) pos = size-1;
	for ( ; pos>=0 && pos<size; pos-- ) {
		Token candidate = tokens.get(pos);
		int ttype = candidate.getType();
		if ( ttype==LPAREN || ttype==TOKEN_REF || ttype==RULE_REF ) {
			return candidate;
		}
	}
	return null;
}
}
// --------
@ -222,8 +242,8 @@ COMMENT
ARG_OR_CHARSET
options {k=1;}
: {isLexer}?=> LEXER_CHAR_SET {$type=LEXER_CHAR_SET;}
| {!isLexer}?=> ARG_ACTION {$type=ARG_ACTION;}
: {isLexerRule}?=> LEXER_CHAR_SET {$type=LEXER_CHAR_SET;}
| {!isLexerRule}?=> ARG_ACTION {$type=ARG_ACTION;}
;
fragment
@ -407,7 +427,7 @@ TOKENS_SPEC : 'tokens' WSNLCHARS* '{' ;
IMPORT : 'import' ;
FRAGMENT : 'fragment' ;
LEXER : 'lexer' {isLexer=true;} ;
LEXER : 'lexer' ;
PARSER : 'parser' ;
GRAMMAR : 'grammar' ;
PROTECTED : 'protected' ;
@ -425,7 +445,22 @@ MODE : 'mode' ;
//
// Character sequences used as separators, delimiters, operators, etc
//
COLON : ':' ;
COLON : ':'
{
// scan backwards, looking for a RULE_REF or TOKEN_REF.
// which would indicate the start of a rule definition.
// If we see a LPAREN, then it's the start of the subrule.
// this.tokens is the token string we are pushing into, so
// just loop backwards looking for a rule definition. Then
// we set isLexerRule.
Token t = getRuleOrSubruleStartToken();
if ( t!=null ) {
if ( t.getType()==RULE_REF ) isLexerRule = false;
else if ( t.getType()==TOKEN_REF ) isLexerRule = true;
// else must be subrule; don't alter context
}
}
;
COLONCOLON : '::' ;
COMMA : ',' ;
SEMI : ';' ;
@ -449,35 +484,13 @@ POUND : '#' ;
NOT : '~' ;
RBRACE : '}' ;
/*
// ---------------
// Token reference
//
// The names of all tokens must start with an upper case letter and so
// the lexer can distinguish them directly.
//
TOKEN_REF
: ('A'..'Z') ('A'..'Z' | 'a'..'z' | '0'..'9' | '_')*
;
// --------------
// Rule reference
//
// The names of all rules must start with a lower case letter
// so the lexer can distinguish them directly. The parser takes
// care of the case such as id=rulename
//
RULE_REF
: ('a'..'z') ('A'..'Z' | 'a'..'z' | '0'..'9' | '_')*
;
*/
/** Allow unicode rule/token names */
ID : a=NameStartChar NameChar*
{
if ( Character.isUpperCase($a.text.charAt(0)) ) $type = TOKEN_REF;
else $type = RULE_REF;
};
}
;
fragment
NameChar : NameStartChar

View File

@ -57,7 +57,6 @@ options {
// nodes for the AST we are generating. The tokens section is where we
// specify any such tokens
tokens {
LEXER;
RULE;
PREC_RULE; // flip to this if we find that it's left-recursive
RULES;
@ -89,7 +88,7 @@ tokens {
LIST;
ELEMENT_OPTIONS; // TOKEN<options>
RESULT;
// lexer action stuff
LEXER_ALT_ACTION;
LEXER_ACTION_CALL; // ID(foo)
@ -340,7 +339,7 @@ sync
rule: parserRule
| lexerRule
;
// The specification of an EBNF rule in ANTLR style, with all the
// rule level parameters, declarations, actions, rewrite specs and so
// on.
@ -561,7 +560,7 @@ lexerRuleBlock
// just resyncing; ignore error
retval.tree = (GrammarAST)adaptor.errorNode(input, retval.start, input.LT(-1), null);
}
lexerAltList
: lexerAlt (OR lexerAlt)* -> lexerAlt+
;
@ -586,11 +585,11 @@ lexerElement
: labeledLexerElement
( ebnfSuffix -> ^( ebnfSuffix ^(BLOCK<BlockAST>[$labeledLexerElement.start,"BLOCK"] ^(ALT<AltAST> labeledLexerElement) ) )
| -> labeledLexerElement
)
)
| lexerAtom
( ebnfSuffix -> ^( ebnfSuffix ^(BLOCK<BlockAST>[$lexerAtom.start,"BLOCK"] ^(ALT<AltAST> lexerAtom) ) )
| -> lexerAtom
)
)
| lexerBlock
( ebnfSuffix -> ^(ebnfSuffix lexerBlock)
| -> lexerBlock
@ -624,14 +623,14 @@ lexerElement
reportError(re);
recover(input,re);
}
labeledLexerElement
: id (ass=ASSIGN|ass=PLUS_ASSIGN)
( lexerAtom -> ^($ass id lexerAtom)
| block -> ^($ass id block)
)
;
lexerBlock
: LPAREN lexerAltList RPAREN
-> ^(BLOCK<BlockAST>[$LPAREN,"BLOCK"] lexerAltList )
@ -648,7 +647,7 @@ lexerCommand
;
lexerCommandExpr
: id
: id
| INT
;
@ -755,12 +754,12 @@ ebnfSuffix
| STAR -> CLOSURE<StarBlockAST>[$start]
| PLUS -> POSITIVE_CLOSURE<PlusBlockAST>[$start]
;
lexerAtom
: range
: range
| terminal
| RULE_REF<RuleRefAST>
| notSet
| notSet
| wildcard
| LEXER_CHAR_SET
;