rebuilt XPath using ANTLR itself; added error handling; added ! operator

2013-09-13 13:53:42 -07:00 · 2013-09-13 13:53:42 -07:00 · 01082414c3
parent a86895c557
commit 01082414c3
11 changed files with 348 additions and 57 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -2,6 +2,11 @@ ANTLR v4 Honey Badger

 September 11, 2013

+* Add ! operator to XPath
+* Use ANTLR v4 XPathLexer.g4 not regex
+
+September 11, 2013
+
 * Copy lots of find node stuff from v3 GrammarAST to Trees class in runtime.
 * Add to ParseTree [BREAKING CHANGE]:
 	Collection<ParseTree> findAll(String xpath);
--- a/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPath.java
+++ b/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPath.java
@ -1,30 +1,54 @@
 package org.antlr.v4.runtime.tree.xpath;

+import org.antlr.v4.runtime.ANTLRInputStream;
+import org.antlr.v4.runtime.CommonTokenStream;
+import org.antlr.v4.runtime.LexerNoViableAltException;
 import org.antlr.v4.runtime.Parser;
 import org.antlr.v4.runtime.ParserRuleContext;
+import org.antlr.v4.runtime.Token;
 import org.antlr.v4.runtime.misc.Utils;
 import org.antlr.v4.runtime.tree.ParseTree;

+import java.io.IOException;
+import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.List;
 import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;

-/** Represent a subset of XPath path syntax for use in identifying nodes in
+/** Represent a subset of XPath XML path syntax for use in identifying nodes in
 *  parse trees.
 *
- *  Split path into words and separators / and // then walk from left to right.
- *  At each separator-word pair, find set of nodes. Next stage uses those as
- *  work list.
+ *  Split path into words and separators / and // via ANTLR itself then walk
+ *  path elements from left to right.  At each separator-word pair, find set
+ *  of nodes. Next stage uses those as work list.
 *
- *  See {@link org.antlr.v4.test.TestXPath} for descriptions.
+ *  The basic interface is ParseTree.findAll(parser, pathString). But that is
+ *  just shorthand for:
 *
+ *  XPath p = new XPath(parser, xpath);
+ *  return p.evaluate(this);
+ *
+ *  See {@link org.antlr.v4.test.TestXPath} for descriptions. In short, this allows
+ *  operators:
+ *
+ *      /         root
+ *      //        anywhere
+ *      !         invert; this must appear directly after root or anywhere operator
+ *
+ *  and path elements:
+ *
+ *      ID        token name
+ *      'string'  any string literal token from the grammar
+ *      expr      rule name
+ *      *         wildcard matching any node
+ *
+ *  Whitespace is not allowed.
 */
 public class XPath {
 	public static final String WILDCARD = "*"; // word not operator/separator
+	public static final String NOT = "!"; 	   // word for invert operator

 	protected String path;
 	protected XPathElement[] elements;
@ -34,62 +58,116 @@ public class XPath {
 		this.parser = parser;
 		this.path = path;
 		elements = split(path);
-		System.out.println(Arrays.toString(elements));
+//		System.out.println(Arrays.toString(elements));
 	}

 	// TODO: check for invalid token/rule names, bad syntax

 	public XPathElement[] split(String path) {
-		Pattern pattern = Pattern.compile("//|/|\\w+|'.+?'|\\*"); // TODO: handle escapes in strings?
-		Matcher matcher = pattern.matcher(path);
-		List<String> pathStrings = new ArrayList<String>();
-		while (matcher.find()) {
-			pathStrings.add(matcher.group());
+		ANTLRInputStream in;
+		try {
+			in = new ANTLRInputStream(new StringReader(path));
 		}
-		System.out.println("path="+path+"=>"+pathStrings);
+		catch (IOException ioe) {
+			throw new IllegalArgumentException("Could not read path: "+path, ioe);
+		}
+		XPathLexer lexer = new XPathLexer(in) {
+			public void recover(LexerNoViableAltException e) { throw e;	}
+		};
+		lexer.removeErrorListeners();
+		lexer.addErrorListener(new XPathLexerErrorListener());
+		CommonTokenStream tokenStream = new CommonTokenStream(lexer);
+		try {
+			tokenStream.fill();
+		}
+		catch (LexerNoViableAltException e) {
+			int pos = lexer.getCharPositionInLine();
+			String msg = "Invalid tokens or characters at index "+pos+" in path '"+path+"'";
+			throw new IllegalArgumentException(msg, e);
+		}
+
+		List<Token> tokens = tokenStream.getTokens();
+//		System.out.println("path="+path+"=>"+tokens);
 		List<XPathElement> elements = new ArrayList<XPathElement>();
-		int n = pathStrings.size();
+		int n = tokens.size();
 		int i=0;
+loop:
 		while ( i<n ) {
-			String el = pathStrings.get(i);
-			if ( el.startsWith("/") ) {
-				i++;
-				if ( i>=n ) {
-					System.out.println("missing element name after operator");
-				}
-				String next = pathStrings.get(i);
-				boolean anywhere = el.equals("//");
-				elements.add( getXPathElement(next, anywhere) );
-				i++;
-			}
-			else {
-				elements.add( getXPathElement(el, false) );
-				i++;
+			Token el = tokens.get(i);
+			Token next = null;
+			switch ( el.getType() ) {
+				case XPathLexer.ROOT :
+				case XPathLexer.ANYWHERE :
+					boolean anywhere = el.getType() == XPathLexer.ANYWHERE;
+					i++;
+					next = tokens.get(i);
+					boolean invert = next.getType()==XPathLexer.BANG;
+					if ( invert ) {
+						i++;
+						next = tokens.get(i);
+					}
+					XPathElement pathElement = getXPathElement(next, anywhere);
+					pathElement.invert = invert;
+					elements.add(pathElement);
+					i++;
+				//case XPathLexer.BANG :
+					break;
+
+				case XPathLexer.TOKEN_REF :
+				case XPathLexer.RULE_REF :
+				case XPathLexer.WILDCARD :
+					elements.add( getXPathElement(el, false) );
+					i++;
+					break;
+
+				case Token.EOF :
+					break loop;
+
+				default :
+					throw new IllegalArgumentException("Unknowth path element "+el);
 			}
 		}
 		return elements.toArray(new XPathElement[0]);
 	}

 	/** Convert word like * or ID or expr to a path element. anywhere is true
-	 *  if // preceds the word.
+	 *  if // precedes the word.
 	 */
-	protected XPathElement getXPathElement(String word, boolean anywhere) {
+	protected XPathElement getXPathElement(Token wordToken, boolean anywhere) {
+		if ( wordToken.getType()==Token.EOF ) {
+			throw new IllegalArgumentException("Missing path element at end of path");
+		}
+		String word = wordToken.getText();
 		Map<String, Integer> ruleIndexes = Utils.toMap(parser.getRuleNames());
 		Map<String, Integer> tokenTypes = Utils.toMap(parser.getTokenNames());
-		if ( word.equals(WILDCARD) ) {
-			return anywhere ?
-				new XPathWildcardAnywhereElement() :
-				new XPathWildcardElement();
-		}
-		else if ( word.charAt(0)=='\'' || Character.isUpperCase(word.charAt(0)) ) {
-			return anywhere ?
-				new XPathTokenAnywhereElement(word, tokenTypes.get(word)) :
-				new XPathTokenElement(word, tokenTypes.get(word));
-		}
-		else {
-			return anywhere ?
-				new XPathRuleAnywhereElement(word, ruleIndexes.get(word)) :
-				new XPathRuleElement(word, ruleIndexes.get(word));
+		Integer ttype = tokenTypes.get(word);
+		Integer ruleIndex = ruleIndexes.get(word);
+		switch ( wordToken.getType() ) {
+			case XPathLexer.WILDCARD :
+				return anywhere ?
+					new XPathWildcardAnywhereElement() :
+					new XPathWildcardElement();
+			case XPathLexer.TOKEN_REF :
+			case XPathLexer.STRING :
+				if ( ttype==null ) {
+					throw new IllegalArgumentException(word+
+													   " at index "+
+													   wordToken.getStartIndex()+
+													   " isn't a valid token name");
+				}
+				return anywhere ?
+					new XPathTokenAnywhereElement(word, ttype) :
+					new XPathTokenElement(word, ttype);
+			default :
+				if ( ruleIndex==null ) {
+					throw new IllegalArgumentException(word+
+													   " at index "+
+													   wordToken.getStartIndex()+
+													   " isn't a valid rule name");
+				}
+				return anywhere ?
+					new XPathRuleAnywhereElement(word, ruleIndex) :
+					new XPathRuleElement(word, ruleIndex);
 		}
 	}

--- a/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathElement.java
+++ b/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathElement.java
@ -5,7 +5,8 @@ import org.antlr.v4.runtime.tree.ParseTree;
 import java.util.Collection;

 public abstract class XPathElement {
-	public String nodeName;
+	protected String nodeName;
+	protected boolean invert;

 	/** Construct element like /ID or ID or "/*" etc...
 	 *  op is null if just node
@ -19,6 +20,7 @@ public abstract class XPathElement {

 	@Override
 	public String toString() {
-		return getClass().getSimpleName()+"["+nodeName+"]";
+		String inv = invert ? "!" : "";
+		return getClass().getSimpleName()+"["+inv+nodeName+"]";
 	}
 }
--- a/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathLexer.g4
+++ b/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathLexer.g4
@ -0,0 +1,68 @@
+lexer grammar XPathLexer;
+
+@header {package org.antlr.v4.runtime.tree.xpath;}
+
+tokens { TOKEN_REF, RULE_REF }
+
+// "//|/|//[!]|/\\[!]|\\w+|'.+?'|[*]"
+// TODO: handle escapes in strings?
+
+/*
+path : separator? word (separator word)* EOF ;
+
+separator
+	:	'/'  '!'
+	|	'//' '!'
+	|	'/'
+	|	'//'
+	;
+
+word:	TOKEN_REF
+	|	RULE_REF
+	|	STRING
+	|	'*'
+	;
+*/
+
+ANYWHERE : '//' ;
+ROOT	 : '/' ;
+WILDCARD : '*' ;
+BANG	 : '!' ;
+
+ID			:	NameStartChar NameChar*
+				{
+				String text = getText();
+				if ( Character.isUpperCase(text.charAt(0)) ) setType(TOKEN_REF);
+				else setType(RULE_REF);
+				}
+			;
+
+fragment
+NameChar    :   NameStartChar
+            |   '0'..'9'
+            |   '_'
+            |   '\u00B7'
+            |   '\u0300'..'\u036F'
+            |   '\u203F'..'\u2040'
+            ;
+
+fragment
+NameStartChar
+            :   'A'..'Z' | 'a'..'z'
+            |   '\u00C0'..'\u00D6'
+            |   '\u00D8'..'\u00F6'
+            |   '\u00F8'..'\u02FF'
+            |   '\u0370'..'\u037D'
+            |   '\u037F'..'\u1FFF'
+            |   '\u200C'..'\u200D'
+            |   '\u2070'..'\u218F'
+            |   '\u2C00'..'\u2FEF'
+            |   '\u3001'..'\uD7FF'
+            |   '\uF900'..'\uFDCF'
+            |   '\uFDF0'..'\uFFFD'
+            ; // ignores supplementary code points U+10000..U+EFFFF
+
+STRING : '\'' .*? '\'' ;
+
+//WS : [ \t\r\n]+ -> skip ;
+
--- a/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathLexerErrorListener.java
+++ b/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathLexerErrorListener.java
@ -0,0 +1,14 @@
+package org.antlr.v4.runtime.tree.xpath;
+
+import org.antlr.v4.runtime.BaseErrorListener;
+import org.antlr.v4.runtime.RecognitionException;
+import org.antlr.v4.runtime.Recognizer;
+
+public class XPathLexerErrorListener extends BaseErrorListener {
+	@Override
+	public void syntaxError(Recognizer<?, ?> recognizer, Object offendingSymbol,
+							int line, int charPositionInLine, String msg,
+							RecognitionException e)
+	{
+	}
+}
--- a/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathRuleElement.java
+++ b/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathRuleElement.java
@ -18,13 +18,12 @@ public class XPathRuleElement extends XPathElement {
 	public Collection<ParseTree> evaluate(ParseTree t) {
 				// return all children of t that match nodeName
 		List<ParseTree> nodes = new ArrayList<ParseTree>();
-		if ( t.getChildren()==null) {
-			System.out.println();
-		}
 		for (ParseTree c : t.getChildren()) {
 			if ( c instanceof ParserRuleContext ) {
 				ParserRuleContext ctx = (ParserRuleContext)c;
-				if ( ctx.getRuleIndex() == ruleIndex ) {
+				if ( (ctx.getRuleIndex() == ruleIndex && !invert) ||
+					 (ctx.getRuleIndex() != ruleIndex && invert) )
+				{
 					nodes.add(c);
 				}
 			}
--- a/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathTokenElement.java
+++ b/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathTokenElement.java
@ -21,7 +21,9 @@ public class XPathTokenElement extends XPathElement {
 		for (ParseTree c : t.getChildren()) {
 			if ( c instanceof TerminalNode ) {
 				TerminalNode tnode = (TerminalNode)c;
-				if ( tnode.getSymbol().getType() == tokenType ) {
+				if ( (tnode.getSymbol().getType() == tokenType && !invert) ||
+					 (tnode.getSymbol().getType() != tokenType && invert) )
+				{
 					nodes.add(c);
 				}
 			}
--- a/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathWildcardAnywhereElement.java
+++ b/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathWildcardAnywhereElement.java
@ -3,6 +3,7 @@ package org.antlr.v4.runtime.tree.xpath;
 import org.antlr.v4.runtime.tree.ParseTree;
 import org.antlr.v4.runtime.tree.Trees;

+import java.util.ArrayList;
 import java.util.Collection;

 public class XPathWildcardAnywhereElement extends XPathElement {
@ -12,6 +13,7 @@ public class XPathWildcardAnywhereElement extends XPathElement {

 	@Override
 	public Collection<ParseTree> evaluate(ParseTree t) {
+		if ( invert ) return new ArrayList<ParseTree>(); // !* is weird but valid (empty)
 		return Trees.descendants(t);
 	}
 }
--- a/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathWildcardElement.java
+++ b/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathWildcardElement.java
@ -12,6 +12,7 @@ public class XPathWildcardElement extends XPathElement {

 	@Override
 	public Collection<ParseTree> evaluate(final ParseTree t) {
+		if ( invert ) return new ArrayList<ParseTree>(); // !* is weird but valid (empty)
 		return new ArrayList<ParseTree>() {{addAll(t.getChildren());}};
 	}
 }
--- a/tool/test/org/antlr/v4/test/BaseTest.java
+++ b/tool/test/org/antlr/v4/test/BaseTest.java
@ -511,7 +511,7 @@ public abstract class BaseTest {
 			args = new Integer[] {0};
 		}
 		ParseTree result = (ParseTree)startRule.invoke(parser, args);
-		System.out.println("parse tree = "+result.toStringTree(parser));
+//		System.out.println("parse tree = "+result.toStringTree(parser));
 		return result;
 	}

--- a/tool/test/org/antlr/v4/test/TestXPath.java
+++ b/tool/test/org/antlr/v4/test/TestXPath.java
@ -12,6 +12,7 @@ import java.util.ArrayList;
 import java.util.List;

 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;

 public class TestXPath extends BaseTest {
@ -45,9 +46,10 @@ public class TestXPath extends BaseTest {
 		"NEWLINE:'\\r'? '\\n' -> skip;     // return newlines to parser (is end-statement signal)\n" +
 		"WS  :   [ \\t]+ -> skip ; // toss out whitespace\n";

-	@Test public void test() throws Exception {
+	@Test public void testValidPaths() throws Exception {
 		boolean ok =
-			rawGenerateAndBuildRecognizer("Expr.g4", grammar, "ExprParser", "ExprLexer", false);
+			rawGenerateAndBuildRecognizer("Expr.g4", grammar, "ExprParser",
+										  "ExprLexer", false);
 		assertTrue(ok);

 		String input =
@ -68,7 +70,11 @@ public class TestXPath extends BaseTest {
 			"//primary/*",		// all kids of any primary
 			"//func/*/stat",	// all stat nodes grandkids of any func node
 			"/prog/func/'def'",	// all def literal kids of func kid of prog
-			"//stat/';'"		// all ';' under any stat node
+			"//stat/';'",		// all ';' under any stat node
+			"//expr/primary/!ID",	// anything but ID under primary under any expr node
+			"//expr/!primary",	// anything but primary under any expr node
+			"//!*",				// nothing anywhere
+			"/!*",				// nothing at root
 		};
 		String expected[] = {
 			"[func, func]",
@ -85,7 +91,11 @@ public class TestXPath extends BaseTest {
 			"[3, 4, y, 1, 2, x]",
 			"[stat, stat, stat, stat]",
 			"[def, def]",
-			"[;, ;, ;, ;]"
+			"[;, ;, ;, ;]",
+			"[3, 4, 1, 2]",
+			"[expr, expr, expr, expr, expr, expr]",
+			"[]",
+			"[]",
 		};

 		for (int i=0; i<xpath.length; i++) {
@ -95,6 +105,116 @@ public class TestXPath extends BaseTest {
 		}
 	}

+	@Test public void testWeirdChar() throws Exception {
+		boolean ok =
+			rawGenerateAndBuildRecognizer("Expr.g4", grammar, "ExprParser",
+										  "ExprLexer", false);
+		assertTrue(ok);
+
+		String input =
+			"def f(x,y) { x = 3+4; y; ; }\n" +
+			"def g(x) { return 1+2*x; }\n";
+		String path = "&";
+		String expected = "Invalid tokens or characters at index 0 in path '&'";
+
+		testError(input, path, expected, "prog", "ExprParser", "ExprLexer");
+	}
+
+	@Test public void testWeirdChar2() throws Exception {
+		boolean ok =
+			rawGenerateAndBuildRecognizer("Expr.g4", grammar, "ExprParser",
+										  "ExprLexer", false);
+		assertTrue(ok);
+
+		String input =
+			"def f(x,y) { x = 3+4; y; ; }\n" +
+			"def g(x) { return 1+2*x; }\n";
+		String path = "//w&e/";
+		String expected = "Invalid tokens or characters at index 3 in path '//w&e/'";
+
+		testError(input, path, expected, "prog", "ExprParser", "ExprLexer");
+	}
+
+	@Test public void testBadSyntax() throws Exception {
+		boolean ok =
+			rawGenerateAndBuildRecognizer("Expr.g4", grammar, "ExprParser",
+										  "ExprLexer", false);
+		assertTrue(ok);
+
+		String input =
+			"def f(x,y) { x = 3+4; y; ; }\n" +
+			"def g(x) { return 1+2*x; }\n";
+		String path = "///";
+		String expected = "/ at index 2 isn't a valid rule name";
+
+		testError(input, path, expected, "prog", "ExprParser", "ExprLexer");
+	}
+
+	@Test public void testMissingWordAtEnd() throws Exception {
+		boolean ok =
+			rawGenerateAndBuildRecognizer("Expr.g4", grammar, "ExprParser",
+										  "ExprLexer", false);
+		assertTrue(ok);
+
+		String input =
+			"def f(x,y) { x = 3+4; y; ; }\n" +
+			"def g(x) { return 1+2*x; }\n";
+		String path = "//";
+		String expected = "Missing path element at end of path";
+
+		testError(input, path, expected, "prog", "ExprParser", "ExprLexer");
+	}
+
+	@Test public void testBadTokenName() throws Exception {
+		boolean ok =
+			rawGenerateAndBuildRecognizer("Expr.g4", grammar, "ExprParser",
+										  "ExprLexer", false);
+		assertTrue(ok);
+
+		String input =
+			"def f(x,y) { x = 3+4; y; ; }\n" +
+			"def g(x) { return 1+2*x; }\n";
+		String path = "//Ick";
+		String expected = "Ick at index 2 isn't a valid token name";
+
+		testError(input, path, expected, "prog", "ExprParser", "ExprLexer");
+	}
+
+	@Test public void testBadRuleName() throws Exception {
+		boolean ok =
+			rawGenerateAndBuildRecognizer("Expr.g4", grammar, "ExprParser",
+										  "ExprLexer", false);
+		assertTrue(ok);
+
+		String input =
+			"def f(x,y) { x = 3+4; y; ; }\n" +
+			"def g(x) { return 1+2*x; }\n";
+		String path = "/prog/ick";
+		String expected = "ick at index 6 isn't a valid rule name";
+
+		testError(input, path, expected, "prog", "ExprParser", "ExprLexer");
+	}
+
+	protected void testError(String input, String path, String expected,
+							 String startRuleName,
+							 String parserName, String lexerName)
+		throws Exception
+	{
+		Pair<Parser, Lexer> pl = getParserAndLexer(input, parserName, lexerName);
+		Parser parser = pl.a;
+		ParseTree tree = execStartRule(startRuleName, parser);
+
+		IllegalArgumentException e = null;
+		try {
+			tree.findAll(parser, path);
+		}
+		catch (IllegalArgumentException iae) {
+			e = iae;
+		}
+		assertNotNull(e);
+		assertEquals(expected, e.getMessage());
+	}
+
 	public List<String> getNodeStrings(String input, String xpath,
 									   String startRuleName,
 									   String parserName, String lexerName)