rebuilt XPath using ANTLR itself; added error handling; added ! operator

This commit is contained in:
Terence Parr 2013-09-13 13:53:42 -07:00
parent a86895c557
commit 01082414c3
11 changed files with 348 additions and 57 deletions

View File

@ -2,6 +2,11 @@ ANTLR v4 Honey Badger
September 11, 2013
* Add ! operator to XPath
* Use ANTLR v4 XPathLexer.g4 not regex
September 11, 2013
* Copy lots of find node stuff from v3 GrammarAST to Trees class in runtime.
* Add to ParseTree [BREAKING CHANGE]:
Collection<ParseTree> findAll(String xpath);

View File

@ -1,30 +1,54 @@
package org.antlr.v4.runtime.tree.xpath;
import org.antlr.v4.runtime.ANTLRInputStream;
import org.antlr.v4.runtime.CommonTokenStream;
import org.antlr.v4.runtime.LexerNoViableAltException;
import org.antlr.v4.runtime.Parser;
import org.antlr.v4.runtime.ParserRuleContext;
import org.antlr.v4.runtime.Token;
import org.antlr.v4.runtime.misc.Utils;
import org.antlr.v4.runtime.tree.ParseTree;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/** Represent a subset of XPath XML path syntax for use in identifying nodes in
 *  parse trees.
 *
 *  Split path into words and separators / and // via ANTLR itself, then walk
 *  the path elements from left to right. At each separator-word pair, find
 *  the set of matching nodes. The next stage uses those nodes as its work list.
 *
 *  The basic interface is ParseTree.findAll(parser, pathString). But that is
 *  just shorthand for:
 *
 *  <pre>
 *  XPath p = new XPath(parser, xpath);
 *  return p.evaluate(this);
 *  </pre>
 *
 *  See {@link org.antlr.v4.test.TestXPath} for descriptions. In short, this
 *  allows operators:
 *
 *  <pre>
 *  /         root
 *  //        anywhere
 *  !         invert; this must appear directly after root or anywhere operator
 *  </pre>
 *
 *  and path elements:
 *
 *  <pre>
 *  ID        token name
 *  'string'  any string literal token from the grammar
 *  expr      rule name
 *  *         wildcard matching any node
 *  </pre>
 *
 *  Whitespace is not allowed.
 */
public class XPath {
public static final String WILDCARD = "*"; // word not operator/separator
public static final String NOT = "!"; // word for invert operator
protected String path;
protected XPathElement[] elements;
@ -34,62 +58,116 @@ public class XPath {
this.parser = parser;
this.path = path;
elements = split(path);
System.out.println(Arrays.toString(elements));
// System.out.println(Arrays.toString(elements));
}
// TODO: check for invalid token/rule names, bad syntax
public XPathElement[] split(String path) {
Pattern pattern = Pattern.compile("//|/|\\w+|'.+?'|\\*"); // TODO: handle escapes in strings?
Matcher matcher = pattern.matcher(path);
List<String> pathStrings = new ArrayList<String>();
while (matcher.find()) {
pathStrings.add(matcher.group());
ANTLRInputStream in;
try {
in = new ANTLRInputStream(new StringReader(path));
}
System.out.println("path="+path+"=>"+pathStrings);
catch (IOException ioe) {
throw new IllegalArgumentException("Could not read path: "+path, ioe);
}
XPathLexer lexer = new XPathLexer(in) {
public void recover(LexerNoViableAltException e) { throw e; }
};
lexer.removeErrorListeners();
lexer.addErrorListener(new XPathLexerErrorListener());
CommonTokenStream tokenStream = new CommonTokenStream(lexer);
try {
tokenStream.fill();
}
catch (LexerNoViableAltException e) {
int pos = lexer.getCharPositionInLine();
String msg = "Invalid tokens or characters at index "+pos+" in path '"+path+"'";
throw new IllegalArgumentException(msg, e);
}
List<Token> tokens = tokenStream.getTokens();
// System.out.println("path="+path+"=>"+tokens);
List<XPathElement> elements = new ArrayList<XPathElement>();
int n = pathStrings.size();
int n = tokens.size();
int i=0;
loop:
while ( i<n ) {
String el = pathStrings.get(i);
if ( el.startsWith("/") ) {
i++;
if ( i>=n ) {
System.out.println("missing element name after operator");
}
String next = pathStrings.get(i);
boolean anywhere = el.equals("//");
elements.add( getXPathElement(next, anywhere) );
i++;
}
else {
elements.add( getXPathElement(el, false) );
i++;
Token el = tokens.get(i);
Token next = null;
switch ( el.getType() ) {
case XPathLexer.ROOT :
case XPathLexer.ANYWHERE :
boolean anywhere = el.getType() == XPathLexer.ANYWHERE;
i++;
next = tokens.get(i);
boolean invert = next.getType()==XPathLexer.BANG;
if ( invert ) {
i++;
next = tokens.get(i);
}
XPathElement pathElement = getXPathElement(next, anywhere);
pathElement.invert = invert;
elements.add(pathElement);
i++;
//case XPathLexer.BANG :
break;
case XPathLexer.TOKEN_REF :
case XPathLexer.RULE_REF :
case XPathLexer.WILDCARD :
elements.add( getXPathElement(el, false) );
i++;
break;
case Token.EOF :
break loop;
default :
throw new IllegalArgumentException("Unknowth path element "+el);
}
}
return elements.toArray(new XPathElement[0]);
}
/** Convert word like * or ID or expr to a path element. anywhere is true
* if // preceds the word.
* if // precedes the word.
*/
protected XPathElement getXPathElement(String word, boolean anywhere) {
protected XPathElement getXPathElement(Token wordToken, boolean anywhere) {
if ( wordToken.getType()==Token.EOF ) {
throw new IllegalArgumentException("Missing path element at end of path");
}
String word = wordToken.getText();
Map<String, Integer> ruleIndexes = Utils.toMap(parser.getRuleNames());
Map<String, Integer> tokenTypes = Utils.toMap(parser.getTokenNames());
if ( word.equals(WILDCARD) ) {
return anywhere ?
new XPathWildcardAnywhereElement() :
new XPathWildcardElement();
}
else if ( word.charAt(0)=='\'' || Character.isUpperCase(word.charAt(0)) ) {
return anywhere ?
new XPathTokenAnywhereElement(word, tokenTypes.get(word)) :
new XPathTokenElement(word, tokenTypes.get(word));
}
else {
return anywhere ?
new XPathRuleAnywhereElement(word, ruleIndexes.get(word)) :
new XPathRuleElement(word, ruleIndexes.get(word));
Integer ttype = tokenTypes.get(word);
Integer ruleIndex = ruleIndexes.get(word);
switch ( wordToken.getType() ) {
case XPathLexer.WILDCARD :
return anywhere ?
new XPathWildcardAnywhereElement() :
new XPathWildcardElement();
case XPathLexer.TOKEN_REF :
case XPathLexer.STRING :
if ( ttype==null ) {
throw new IllegalArgumentException(word+
" at index "+
wordToken.getStartIndex()+
" isn't a valid token name");
}
return anywhere ?
new XPathTokenAnywhereElement(word, ttype) :
new XPathTokenElement(word, ttype);
default :
if ( ruleIndex==null ) {
throw new IllegalArgumentException(word+
" at index "+
wordToken.getStartIndex()+
" isn't a valid rule name");
}
return anywhere ?
new XPathRuleAnywhereElement(word, ruleIndex) :
new XPathRuleElement(word, ruleIndex);
}
}

View File

@ -5,7 +5,8 @@ import org.antlr.v4.runtime.tree.ParseTree;
import java.util.Collection;
public abstract class XPathElement {
public String nodeName;
protected String nodeName;
protected boolean invert;
/** Construct element like /ID or ID or "/*" etc...
* op is null if just node
@ -19,6 +20,7 @@ public abstract class XPathElement {
@Override
public String toString() {
return getClass().getSimpleName()+"["+nodeName+"]";
String inv = invert ? "!" : "";
return getClass().getSimpleName()+"["+inv+nodeName+"]";
}
}

View File

@ -0,0 +1,68 @@
// Lexer for the XPath-like path strings handled by XPath.split.
// It produces separator tokens (ANYWHERE, ROOT), the BANG invert operator,
// WILDCARD, STRING literals, and identifiers that the ID action retypes
// as TOKEN_REF or RULE_REF.
lexer grammar XPathLexer;
@header {package org.antlr.v4.runtime.tree.xpath;}
// Token types with no rule of their own; they are assigned by the ID action.
tokens { TOKEN_REF, RULE_REF }
// Regex-style sketch of the token set:
// "//|/|//[!]|/\\[!]|\\w+|'.+?'|[*]"
// TODO: handle escapes in strings?
// Sketch of the path syntax these tokens are meant to form. It is a comment
// only; this grammar defines no parser rules.
/*
path : separator? word (separator word)* EOF ;
separator
: '/' '!'
| '//' '!'
| '/'
| '//'
;
word: TOKEN_REF
| RULE_REF
| STRING
| '*'
;
*/
// Operators and the wildcard word.
ANYWHERE : '//' ;
ROOT : '/' ;
WILDCARD : '*' ;
BANG : '!' ;
// An identifier is never emitted with type ID: the action retypes it as
// TOKEN_REF when its first character is upper case, otherwise as RULE_REF.
ID : NameStartChar NameChar*
{
String text = getText();
if ( Character.isUpperCase(text.charAt(0)) ) setType(TOKEN_REF);
else setType(RULE_REF);
}
;
// Characters allowed after the first character of an identifier.
fragment
NameChar : NameStartChar
| '0'..'9'
| '_'
| '\u00B7'
| '\u0300'..'\u036F'
| '\u203F'..'\u2040'
;
// Characters allowed as the first character of an identifier.
fragment
NameStartChar
: 'A'..'Z' | 'a'..'z'
| '\u00C0'..'\u00D6'
| '\u00D8'..'\u00F6'
| '\u00F8'..'\u02FF'
| '\u0370'..'\u037D'
| '\u037F'..'\u1FFF'
| '\u200C'..'\u200D'
| '\u2070'..'\u218F'
| '\u2C00'..'\u2FEF'
| '\u3001'..'\uD7FF'
| '\uF900'..'\uFDCF'
| '\uFDF0'..'\uFFFD'
; // ignores | ['\u10000-'\uEFFFF] ;
// Single-quoted literal matched non-greedily up to the next quote; escape
// sequences are not handled (see the TODO above).
STRING : '\'' .*? '\'' ;
// No whitespace rule is active, so whitespace in a path is not skipped.
//WS : [ \t\r\n]+ -> skip ;

View File

@ -0,0 +1,14 @@
package org.antlr.v4.runtime.tree.xpath;
import org.antlr.v4.runtime.BaseErrorListener;
import org.antlr.v4.runtime.RecognitionException;
import org.antlr.v4.runtime.Recognizer;
/**
 * Error listener whose {@code syntaxError} callback does nothing.
 *
 * {@code XPath.split} installs it on its {@code XPathLexer} after calling
 * {@code removeErrorListeners()}, and reports lexing failures itself by
 * catching {@code LexerNoViableAltException} and throwing an
 * {@code IllegalArgumentException}.
 */
public class XPathLexerErrorListener extends BaseErrorListener {
@Override
public void syntaxError(Recognizer<?, ?> recognizer, Object offendingSymbol,
int line, int charPositionInLine, String msg,
RecognitionException e)
{
// Intentionally empty: nothing is printed or recorded here.
}
}

View File

@ -18,13 +18,12 @@ public class XPathRuleElement extends XPathElement {
public Collection<ParseTree> evaluate(ParseTree t) {
// return all children of t that match nodeName
List<ParseTree> nodes = new ArrayList<ParseTree>();
if ( t.getChildren()==null) {
System.out.println();
}
for (ParseTree c : t.getChildren()) {
if ( c instanceof ParserRuleContext ) {
ParserRuleContext ctx = (ParserRuleContext)c;
if ( ctx.getRuleIndex() == ruleIndex ) {
if ( (ctx.getRuleIndex() == ruleIndex && !invert) ||
(ctx.getRuleIndex() != ruleIndex && invert) )
{
nodes.add(c);
}
}

View File

@ -21,7 +21,9 @@ public class XPathTokenElement extends XPathElement {
for (ParseTree c : t.getChildren()) {
if ( c instanceof TerminalNode ) {
TerminalNode tnode = (TerminalNode)c;
if ( tnode.getSymbol().getType() == tokenType ) {
if ( (tnode.getSymbol().getType() == tokenType && !invert) ||
(tnode.getSymbol().getType() != tokenType && invert) )
{
nodes.add(c);
}
}

View File

@ -3,6 +3,7 @@ package org.antlr.v4.runtime.tree.xpath;
import org.antlr.v4.runtime.tree.ParseTree;
import org.antlr.v4.runtime.tree.Trees;
import java.util.ArrayList;
import java.util.Collection;
public class XPathWildcardAnywhereElement extends XPathElement {
@ -12,6 +13,7 @@ public class XPathWildcardAnywhereElement extends XPathElement {
/**
 * Return every descendant of {@code t}, or an empty collection when this
 * element is inverted.
 */
@Override
public Collection<ParseTree> evaluate(ParseTree t) {
	// !* is weird but valid (empty)
	if ( !invert ) {
		return Trees.descendants(t);
	}
	return new ArrayList<ParseTree>();
}
}

View File

@ -12,6 +12,7 @@ public class XPathWildcardElement extends XPathElement {
/**
 * Return a new list holding the children of {@code t}, or an empty list
 * when this element is inverted.
 */
@Override
public Collection<ParseTree> evaluate(final ParseTree t) {
	if ( invert ) return new ArrayList<ParseTree>(); // !* is weird but valid (empty)
	// Copy constructor instead of double-brace initialization: no anonymous
	// ArrayList subclass is created and the result holds no reference to t.
	return new ArrayList<ParseTree>(t.getChildren());
}
}

View File

@ -511,7 +511,7 @@ public abstract class BaseTest {
args = new Integer[] {0};
}
ParseTree result = (ParseTree)startRule.invoke(parser, args);
System.out.println("parse tree = "+result.toStringTree(parser));
// System.out.println("parse tree = "+result.toStringTree(parser));
return result;
}

View File

@ -12,6 +12,7 @@ import java.util.ArrayList;
import java.util.List;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
public class TestXPath extends BaseTest {
@ -45,9 +46,10 @@ public class TestXPath extends BaseTest {
"NEWLINE:'\\r'? '\\n' -> skip; // return newlines to parser (is end-statement signal)\n" +
"WS : [ \\t]+ -> skip ; // toss out whitespace\n";
@Test public void test() throws Exception {
@Test public void testValidPaths() throws Exception {
boolean ok =
rawGenerateAndBuildRecognizer("Expr.g4", grammar, "ExprParser", "ExprLexer", false);
rawGenerateAndBuildRecognizer("Expr.g4", grammar, "ExprParser",
"ExprLexer", false);
assertTrue(ok);
String input =
@ -68,7 +70,11 @@ public class TestXPath extends BaseTest {
"//primary/*", // all kids of any primary
"//func/*/stat", // all stat nodes grandkids of any func node
"/prog/func/'def'", // all def literal kids of func kid of prog
"//stat/';'" // all ';' under any stat node
"//stat/';'", // all ';' under any stat node
"//expr/primary/!ID", // anything but ID under primary under any expr node
"//expr/!primary", // anything but primary under any expr node
"//!*", // nothing anywhere
"/!*", // nothing at root
};
String expected[] = {
"[func, func]",
@ -85,7 +91,11 @@ public class TestXPath extends BaseTest {
"[3, 4, y, 1, 2, x]",
"[stat, stat, stat, stat]",
"[def, def]",
"[;, ;, ;, ;]"
"[;, ;, ;, ;]",
"[3, 4, 1, 2]",
"[expr, expr, expr, expr, expr, expr]",
"[]",
"[]",
};
for (int i=0; i<xpath.length; i++) {
@ -95,6 +105,116 @@ public class TestXPath extends BaseTest {
}
}
/** A path consisting of one character the XPath lexer cannot match is reported at index 0. */
@Test
public void testWeirdChar() throws Exception {
	assertTrue(rawGenerateAndBuildRecognizer("Expr.g4", grammar, "ExprParser",
											 "ExprLexer", false));

	final String source =
		"def f(x,y) { x = 3+4; y; ; }\n" +
		"def g(x) { return 1+2*x; }\n";
	testError(source,
			  "&",
			  "Invalid tokens or characters at index 0 in path '&'",
			  "prog", "ExprParser", "ExprLexer");
}
/** An unmatched character in the middle of a path is reported at its index. */
@Test
public void testWeirdChar2() throws Exception {
	assertTrue(rawGenerateAndBuildRecognizer("Expr.g4", grammar, "ExprParser",
											 "ExprLexer", false));

	final String source =
		"def f(x,y) { x = 3+4; y; ; }\n" +
		"def g(x) { return 1+2*x; }\n";
	testError(source,
			  "//w&e/",
			  "Invalid tokens or characters at index 3 in path '//w&e/'",
			  "prog", "ExprParser", "ExprLexer");
}
/** A separator where a word is expected is reported as an invalid rule name. */
@Test
public void testBadSyntax() throws Exception {
	assertTrue(rawGenerateAndBuildRecognizer("Expr.g4", grammar, "ExprParser",
											 "ExprLexer", false));

	final String source =
		"def f(x,y) { x = 3+4; y; ; }\n" +
		"def g(x) { return 1+2*x; }\n";
	testError(source,
			  "///",
			  "/ at index 2 isn't a valid rule name",
			  "prog", "ExprParser", "ExprLexer");
}
/** A path that ends right after a separator is reported as missing its final element. */
@Test
public void testMissingWordAtEnd() throws Exception {
	assertTrue(rawGenerateAndBuildRecognizer("Expr.g4", grammar, "ExprParser",
											 "ExprLexer", false));

	final String source =
		"def f(x,y) { x = 3+4; y; ; }\n" +
		"def g(x) { return 1+2*x; }\n";
	testError(source,
			  "//",
			  "Missing path element at end of path",
			  "prog", "ExprParser", "ExprLexer");
}
/** An upper-case word that is not a token of the grammar is reported with its index. */
@Test
public void testBadTokenName() throws Exception {
	assertTrue(rawGenerateAndBuildRecognizer("Expr.g4", grammar, "ExprParser",
											 "ExprLexer", false));

	final String source =
		"def f(x,y) { x = 3+4; y; ; }\n" +
		"def g(x) { return 1+2*x; }\n";
	testError(source,
			  "//Ick",
			  "Ick at index 2 isn't a valid token name",
			  "prog", "ExprParser", "ExprLexer");
}
/** A lower-case word that is not a rule of the grammar is reported with its index. */
@Test
public void testBadRuleName() throws Exception {
	assertTrue(rawGenerateAndBuildRecognizer("Expr.g4", grammar, "ExprParser",
											 "ExprLexer", false));

	final String source =
		"def f(x,y) { x = 3+4; y; ; }\n" +
		"def g(x) { return 1+2*x; }\n";
	testError(source,
			  "/prog/ick",
			  "ick at index 6 isn't a valid rule name",
			  "prog", "ExprParser", "ExprLexer");
}
/**
 * Parse {@code input} starting at {@code startRuleName}, evaluate
 * {@code path} against the resulting tree, and assert that evaluation
 * fails with an {@link IllegalArgumentException} whose message equals
 * {@code expected}.
 *
 * @param input         text to parse
 * @param path          the (invalid) XPath expression to evaluate
 * @param expected      the exact exception message expected
 * @param startRuleName name of the parser rule to start from
 * @param parserName    name of the generated parser class
 * @param lexerName     name of the generated lexer class
 */
protected void testError(String input, String path, String expected,
						 String startRuleName,
						 String parserName, String lexerName)
	throws Exception
{
	Pair<Parser, Lexer> pl = getParserAndLexer(input, parserName, lexerName);
	Parser parser = pl.a;
	ParseTree tree = execStartRule(startRuleName, parser);

	IllegalArgumentException e = null;
	try {
		tree.findAll(parser, path);
	}
	catch (IllegalArgumentException iae) {
		e = iae;
	}
	// Name the path in the failure so a missing exception is easy to diagnose.
	assertNotNull("expected IllegalArgumentException for path '"+path+"'", e);
	assertEquals(expected, e.getMessage());
}
public List<String> getNodeStrings(String input, String xpath,
String startRuleName,
String parserName, String lexerName)