From 01082414c35fe676d336a7da162824e59eec834e Mon Sep 17 00:00:00 2001 From: Terence Parr Date: Fri, 13 Sep 2013 13:53:42 -0700 Subject: [PATCH] rebuilt XPath using ANTLR itself; added error handling; added ! operator --- CHANGES.txt | 5 + .../antlr/v4/runtime/tree/xpath/XPath.java | 168 +++++++++++++----- .../v4/runtime/tree/xpath/XPathElement.java | 6 +- .../antlr/v4/runtime/tree/xpath/XPathLexer.g4 | 68 +++++++ .../tree/xpath/XPathLexerErrorListener.java | 14 ++ .../runtime/tree/xpath/XPathRuleElement.java | 7 +- .../runtime/tree/xpath/XPathTokenElement.java | 4 +- .../xpath/XPathWildcardAnywhereElement.java | 2 + .../tree/xpath/XPathWildcardElement.java | 1 + tool/test/org/antlr/v4/test/BaseTest.java | 2 +- tool/test/org/antlr/v4/test/TestXPath.java | 128 ++++++++++++- 11 files changed, 348 insertions(+), 57 deletions(-) create mode 100644 runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathLexer.g4 create mode 100644 runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathLexerErrorListener.java diff --git a/CHANGES.txt b/CHANGES.txt index f98f7ca6e..b4d3fe06e 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -2,6 +2,11 @@ ANTLR v4 Honey Badger September 11, 2013 +* Add ! operator to XPath +* Use ANTLR v4 XPathLexer.g4 not regex + +September 11, 2013 + * Copy lots of find node stuff from v3 GrammarAST to Trees class in runtime. * Add to ParseTree [BREAKING CHANGE]: Collection findAll(String xpath); diff --git a/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPath.java b/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPath.java index 0fce4823d..3189cc177 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPath.java +++ b/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPath.java @@ -1,30 +1,54 @@ package org.antlr.v4.runtime.tree.xpath; +import org.antlr.v4.runtime.ANTLRInputStream; +import org.antlr.v4.runtime.CommonTokenStream; +import org.antlr.v4.runtime.LexerNoViableAltException; import org.antlr.v4.runtime.Parser; import org.antlr.v4.runtime.ParserRuleContext; +import org.antlr.v4.runtime.Token; import org.antlr.v4.runtime.misc.Utils; import org.antlr.v4.runtime.tree.ParseTree; +import java.io.IOException; +import java.io.StringReader; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.List; import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -/** Represent a subset of XPath path syntax for use in identifying nodes in +/** Represent a subset of XPath XML path syntax for use in identifying nodes in * parse trees. * - * Split path into words and separators / and // then walk from left to right. - * At each separator-word pair, find set of nodes. Next stage uses those as - * work list. + * Split path into words and separators / and // via ANTLR itself then walk + * path elements from left to right. At each separator-word pair, find set + * of nodes. Next stage uses those as work list. * - * See {@link org.antlr.v4.test.TestXPath} for descriptions. + * The basic interface is ParseTree.findAll(parser, pathString). But that is + * just shorthand for: * + * XPath p = new XPath(parser, xpath); + * return p.evaluate(this); + * + * See {@link org.antlr.v4.test.TestXPath} for descriptions. In short, this allows + * operators: + * + * / root + * // anywhere + * ! invert; this must appear directly after root or anywhere operator + * + * and path elements: + * + * ID token name + * 'string' any string literal token from the grammar + * expr rule name + * * wildcard matching any node + * + * Whitespace is not allowed. */ public class XPath { public static final String WILDCARD = "*"; // word not operator/separator + public static final String NOT = "!"; // word for invert operator protected String path; protected XPathElement[] elements; @@ -34,62 +58,116 @@ public class XPath { this.parser = parser; this.path = path; elements = split(path); - System.out.println(Arrays.toString(elements)); +// System.out.println(Arrays.toString(elements)); } // TODO: check for invalid token/rule names, bad syntax public XPathElement[] split(String path) { - Pattern pattern = Pattern.compile("//|/|\\w+|'.+?'|\\*"); // TODO: handle escapes in strings? - Matcher matcher = pattern.matcher(path); - List pathStrings = new ArrayList(); - while (matcher.find()) { - pathStrings.add(matcher.group()); + ANTLRInputStream in; + try { + in = new ANTLRInputStream(new StringReader(path)); } - System.out.println("path="+path+"=>"+pathStrings); + catch (IOException ioe) { + throw new IllegalArgumentException("Could not read path: "+path, ioe); + } + XPathLexer lexer = new XPathLexer(in) { + public void recover(LexerNoViableAltException e) { throw e; } + }; + lexer.removeErrorListeners(); + lexer.addErrorListener(new XPathLexerErrorListener()); + CommonTokenStream tokenStream = new CommonTokenStream(lexer); + try { + tokenStream.fill(); + } + catch (LexerNoViableAltException e) { + int pos = lexer.getCharPositionInLine(); + String msg = "Invalid tokens or characters at index "+pos+" in path '"+path+"'"; + throw new IllegalArgumentException(msg, e); + } + + List tokens = tokenStream.getTokens(); +// System.out.println("path="+path+"=>"+tokens); List elements = new ArrayList(); - int n = pathStrings.size(); + int n = tokens.size(); int i=0; +loop: while ( i=n ) { - System.out.println("missing element name after operator"); - } - String next = pathStrings.get(i); - boolean anywhere = el.equals("//"); - elements.add( getXPathElement(next, anywhere) ); - i++; - } - else { - elements.add( getXPathElement(el, false) ); - i++; + Token el = tokens.get(i); + Token next = null; + switch ( el.getType() ) { + case XPathLexer.ROOT : + case XPathLexer.ANYWHERE : + boolean anywhere = el.getType() == XPathLexer.ANYWHERE; + i++; + next = tokens.get(i); + boolean invert = next.getType()==XPathLexer.BANG; + if ( invert ) { + i++; + next = tokens.get(i); + } + XPathElement pathElement = getXPathElement(next, anywhere); + pathElement.invert = invert; + elements.add(pathElement); + i++; + //case XPathLexer.BANG : + break; + + case XPathLexer.TOKEN_REF : + case XPathLexer.RULE_REF : + case XPathLexer.WILDCARD : + elements.add( getXPathElement(el, false) ); + i++; + break; + + case Token.EOF : + break loop; + + default : + throw new IllegalArgumentException("Unknowth path element "+el); } } return elements.toArray(new XPathElement[0]); } /** Convert word like * or ID or expr to a path element. anywhere is true - * if // preceds the word. + * if // precedes the word. */ - protected XPathElement getXPathElement(String word, boolean anywhere) { + protected XPathElement getXPathElement(Token wordToken, boolean anywhere) { + if ( wordToken.getType()==Token.EOF ) { + throw new IllegalArgumentException("Missing path element at end of path"); + } + String word = wordToken.getText(); Map ruleIndexes = Utils.toMap(parser.getRuleNames()); Map tokenTypes = Utils.toMap(parser.getTokenNames()); - if ( word.equals(WILDCARD) ) { - return anywhere ? - new XPathWildcardAnywhereElement() : - new XPathWildcardElement(); - } - else if ( word.charAt(0)=='\'' || Character.isUpperCase(word.charAt(0)) ) { - return anywhere ? - new XPathTokenAnywhereElement(word, tokenTypes.get(word)) : - new XPathTokenElement(word, tokenTypes.get(word)); - } - else { - return anywhere ? - new XPathRuleAnywhereElement(word, ruleIndexes.get(word)) : - new XPathRuleElement(word, ruleIndexes.get(word)); + Integer ttype = tokenTypes.get(word); + Integer ruleIndex = ruleIndexes.get(word); + switch ( wordToken.getType() ) { + case XPathLexer.WILDCARD : + return anywhere ? + new XPathWildcardAnywhereElement() : + new XPathWildcardElement(); + case XPathLexer.TOKEN_REF : + case XPathLexer.STRING : + if ( ttype==null ) { + throw new IllegalArgumentException(word+ + " at index "+ + wordToken.getStartIndex()+ + " isn't a valid token name"); + } + return anywhere ? + new XPathTokenAnywhereElement(word, ttype) : + new XPathTokenElement(word, ttype); + default : + if ( ruleIndex==null ) { + throw new IllegalArgumentException(word+ + " at index "+ + wordToken.getStartIndex()+ + " isn't a valid rule name"); + } + return anywhere ? + new XPathRuleAnywhereElement(word, ruleIndex) : + new XPathRuleElement(word, ruleIndex); } } diff --git a/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathElement.java b/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathElement.java index 66e5ab495..083bfe548 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathElement.java +++ b/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathElement.java @@ -5,7 +5,8 @@ import org.antlr.v4.runtime.tree.ParseTree; import java.util.Collection; public abstract class XPathElement { - public String nodeName; + protected String nodeName; + protected boolean invert; /** Construct element like /ID or or ID or "/*" etc... * op is null if just node @@ -19,6 +20,7 @@ public abstract class XPathElement { @Override public String toString() { - return getClass().getSimpleName()+"["+nodeName+"]"; + String inv = invert ? "!" : ""; + return getClass().getSimpleName()+"["+inv+nodeName+"]"; } } diff --git a/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathLexer.g4 b/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathLexer.g4 new file mode 100644 index 000000000..9d54f5d61 --- /dev/null +++ b/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathLexer.g4 @@ -0,0 +1,68 @@ +lexer grammar XPathLexer; + +@header {package org.antlr.v4.runtime.tree.xpath;} + +tokens { TOKEN_REF, RULE_REF } + +// "//|/|//[!]|/\\[!]|\\w+|'.+?'|[*]" +// TODO: handle escapes in strings? + +/* +path : separator? word (separator word)* EOF ; + +separator + : '/' '!' + | '//' '!' + | '/' + | '//' + ; + +word: TOKEN_REF + | RULE_REF + | STRING + | '*' + ; +*/ + +ANYWHERE : '//' ; +ROOT : '/' ; +WILDCARD : '*' ; +BANG : '!' ; + +ID : NameStartChar NameChar* + { + String text = getText(); + if ( Character.isUpperCase(text.charAt(0)) ) setType(TOKEN_REF); + else setType(RULE_REF); + } + ; + +fragment +NameChar : NameStartChar + | '0'..'9' + | '_' + | '\u00B7' + | '\u0300'..'\u036F' + | '\u203F'..'\u2040' + ; + +fragment +NameStartChar + : 'A'..'Z' | 'a'..'z' + | '\u00C0'..'\u00D6' + | '\u00D8'..'\u00F6' + | '\u00F8'..'\u02FF' + | '\u0370'..'\u037D' + | '\u037F'..'\u1FFF' + | '\u200C'..'\u200D' + | '\u2070'..'\u218F' + | '\u2C00'..'\u2FEF' + | '\u3001'..'\uD7FF' + | '\uF900'..'\uFDCF' + | '\uFDF0'..'\uFFFD' + ; // ignores | ['\u10000-'\uEFFFF] ; + +STRING : '\'' .*? '\'' ; + +//WS : [ \t\r\n]+ -> skip ; + diff --git a/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathLexerErrorListener.java b/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathLexerErrorListener.java new file mode 100644 index 000000000..30f163eda --- /dev/null +++ b/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathLexerErrorListener.java @@ -0,0 +1,14 @@ +package org.antlr.v4.runtime.tree.xpath; + +import org.antlr.v4.runtime.BaseErrorListener; +import org.antlr.v4.runtime.RecognitionException; +import org.antlr.v4.runtime.Recognizer; + +public class XPathLexerErrorListener extends BaseErrorListener { + @Override + public void syntaxError(Recognizer recognizer, Object offendingSymbol, + int line, int charPositionInLine, String msg, + RecognitionException e) + { + } +} diff --git a/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathRuleElement.java b/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathRuleElement.java index f1733b544..5af5c4c5f 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathRuleElement.java +++ b/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathRuleElement.java @@ -18,13 +18,12 @@ public class XPathRuleElement extends XPathElement { public Collection evaluate(ParseTree t) { // return all children of t that match nodeName List nodes = new ArrayList(); - if ( t.getChildren()==null) { - System.out.println(); - } for (ParseTree c : t.getChildren()) { if ( c instanceof ParserRuleContext ) { ParserRuleContext ctx = (ParserRuleContext)c; - if ( ctx.getRuleIndex() == ruleIndex ) { + if ( (ctx.getRuleIndex() == ruleIndex && !invert) || + (ctx.getRuleIndex() != ruleIndex && invert) ) + { nodes.add(c); } } diff --git a/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathTokenElement.java b/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathTokenElement.java index 5adb25505..9bc98a82a 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathTokenElement.java +++ b/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathTokenElement.java @@ -21,7 +21,9 @@ public class XPathTokenElement extends XPathElement { for (ParseTree c : t.getChildren()) { if ( c instanceof TerminalNode ) { TerminalNode tnode = (TerminalNode)c; - if ( tnode.getSymbol().getType() == tokenType ) { + if ( (tnode.getSymbol().getType() == tokenType && !invert) || + (tnode.getSymbol().getType() != tokenType && invert) ) + { nodes.add(c); } } diff --git a/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathWildcardAnywhereElement.java b/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathWildcardAnywhereElement.java index 1c0fc0789..4a2af465a 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathWildcardAnywhereElement.java +++ b/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathWildcardAnywhereElement.java @@ -3,6 +3,7 @@ package org.antlr.v4.runtime.tree.xpath; import org.antlr.v4.runtime.tree.ParseTree; import org.antlr.v4.runtime.tree.Trees; +import java.util.ArrayList; import java.util.Collection; public class XPathWildcardAnywhereElement extends XPathElement { @@ -12,6 +13,7 @@ public class XPathWildcardAnywhereElement extends XPathElement { @Override public Collection evaluate(ParseTree t) { + if ( invert ) return new ArrayList(); // !* is weird but valid (empty) return Trees.descendants(t); } } diff --git a/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathWildcardElement.java b/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathWildcardElement.java index a7ea4f24e..f02897bc9 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathWildcardElement.java +++ b/runtime/Java/src/org/antlr/v4/runtime/tree/xpath/XPathWildcardElement.java @@ -12,6 +12,7 @@ public class XPathWildcardElement extends XPathElement { @Override public Collection evaluate(final ParseTree t) { + if ( invert ) return new ArrayList(); // !* is weird but valid (empty) return new ArrayList() {{addAll(t.getChildren());}}; } } diff --git a/tool/test/org/antlr/v4/test/BaseTest.java b/tool/test/org/antlr/v4/test/BaseTest.java index c02e17a36..bf09413a8 100644 --- a/tool/test/org/antlr/v4/test/BaseTest.java +++ b/tool/test/org/antlr/v4/test/BaseTest.java @@ -511,7 +511,7 @@ public abstract class BaseTest { args = new Integer[] {0}; } ParseTree result = (ParseTree)startRule.invoke(parser, args); - System.out.println("parse tree = "+result.toStringTree(parser)); +// System.out.println("parse tree = "+result.toStringTree(parser)); return result; } diff --git a/tool/test/org/antlr/v4/test/TestXPath.java b/tool/test/org/antlr/v4/test/TestXPath.java index 9ca3400a3..560e4c49c 100644 --- a/tool/test/org/antlr/v4/test/TestXPath.java +++ b/tool/test/org/antlr/v4/test/TestXPath.java @@ -12,6 +12,7 @@ import java.util.ArrayList; import java.util.List; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; public class TestXPath extends BaseTest { @@ -45,9 +46,10 @@ public class TestXPath extends BaseTest { "NEWLINE:'\\r'? '\\n' -> skip; // return newlines to parser (is end-statement signal)\n" + "WS : [ \\t]+ -> skip ; // toss out whitespace\n"; - @Test public void test() throws Exception { + @Test public void testValidPaths() throws Exception { boolean ok = - rawGenerateAndBuildRecognizer("Expr.g4", grammar, "ExprParser", "ExprLexer", false); + rawGenerateAndBuildRecognizer("Expr.g4", grammar, "ExprParser", + "ExprLexer", false); assertTrue(ok); String input = @@ -68,7 +70,11 @@ public class TestXPath extends BaseTest { "//primary/*", // all kids of any primary "//func/*/stat", // all stat nodes grandkids of any func node "/prog/func/'def'", // all def literal kids of func kid of prog - "//stat/';'" // all ';' under any stat node + "//stat/';'", // all ';' under any stat node + "//expr/primary/!ID", // anything but ID under primary under any expr node + "//expr/!primary", // anything but primary under any expr node + "//!*", // nothing anywhere + "/!*", // nothing at root }; String expected[] = { "[func, func]", @@ -85,7 +91,11 @@ public class TestXPath extends BaseTest { "[3, 4, y, 1, 2, x]", "[stat, stat, stat, stat]", "[def, def]", - "[;, ;, ;, ;]" + "[;, ;, ;, ;]", + "[3, 4, 1, 2]", + "[expr, expr, expr, expr, expr, expr]", + "[]", + "[]", }; for (int i=0; i pl = getParserAndLexer(input, parserName, lexerName); + Parser parser = pl.a; + ParseTree tree = execStartRule(startRuleName, parser); + + IllegalArgumentException e = null; + try { + tree.findAll(parser, path); + } + catch (IllegalArgumentException iae) { + e = iae; + } + assertNotNull(e); + assertEquals(expected, e.getMessage()); + } + public List getNodeStrings(String input, String xpath, String startRuleName, String parserName, String lexerName)