Started to work on the regex reverse-engineering phase.
git-svn-id: http://code.djangoproject.com/svn/django/trunk@7850 bcc190cf-cafb-0310-a4f2-bffc1f526a37
This commit is contained in:
parent
c88e55b4fe
commit
c2f753a8ba
|
@ -0,0 +1,123 @@
|
||||||
|
"""
|
||||||
|
Functions for reversing a regular expression (used in reverse URL resolving).
|
||||||
|
|
||||||
|
This is not, and is not intended to be, a complete reg-exp decompiler. It
|
||||||
|
should be good enough for almost all sane URLs.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from bisect import bisect
|
||||||
|
|
||||||
|
GROUP_CLASS = re.compile(r'''\((?:
|
||||||
|
(?P<positional>[^?])| # Unnamed (positional) capturing group.
|
||||||
|
\?(?:
|
||||||
|
P<(?P<named>[\w]+)>(?P<contents>.*)| # Named capturing group.
|
||||||
|
P=(?P<repeat>.+)| # Repeat of a previous named group.
|
||||||
|
(?P<grouping>:)| # Non-capturing grouping parens.
|
||||||
|
(?P<comment>\#)| # Comment group
|
||||||
|
(?P<illegal>.) # Anything else (which will be an error)
|
||||||
|
)
|
||||||
|
).*\)''', re.VERBOSE)
|
||||||
|
|
||||||
|
def normalize(pattern):
|
||||||
|
"""
|
||||||
|
Given a reg-exp pattern, normalizes it to a list of forms that suffice for
|
||||||
|
reverse matching. This does the following:
|
||||||
|
|
||||||
|
(1) For any repeating sections, keeps the minimum number of occurrences
|
||||||
|
permitted (this means zero for optional groups).
|
||||||
|
(2) If an optional group includes parameters, include one occurrence of
|
||||||
|
that group (along with the zero occurrence case from step (1)).
|
||||||
|
(3) Select the first (essentially an arbitrary) element from any character
|
||||||
|
class. Select an arbitrary character for any unordered class (e.g. '.' or
|
||||||
|
'\w') in the pattern.
|
||||||
|
(4) Take the first alternative in any '|' division, unless other
|
||||||
|
alternatives would involve different parameters.
|
||||||
|
(5) Ignore comments. Error on all other non-capturing (?...) forms (e.g.
|
||||||
|
look-ahead and look-behind matches).
|
||||||
|
|
||||||
|
Returns a list of tuples, each tuple containing (a) a pattern, (b) the
|
||||||
|
number of parameters, (c) the names of the parameters. Any unnamed
|
||||||
|
parameters are called '_0', '_1', etc.
|
||||||
|
"""
|
||||||
|
# Do a linear scan to work out the special features of this pattern. The
|
||||||
|
# idea is that we scan once here and collect all the information we need to
|
||||||
|
# make future decisions.
|
||||||
|
groups = [] # (start, end)
|
||||||
|
quantifiers = [] # start pos
|
||||||
|
ranges = [] # (start, end)
|
||||||
|
eols = [] # pos
|
||||||
|
disjunctions = [] # pos
|
||||||
|
unclosed_groups = []
|
||||||
|
unclosed_ranges = []
|
||||||
|
escaped = False
|
||||||
|
quantify = False
|
||||||
|
in_range = False
|
||||||
|
for pos, c in enumerate(pattern):
|
||||||
|
if in_range and c != ']' or (c == ']' and
|
||||||
|
unclosed_ranges[-1] == pos - 1):
|
||||||
|
continue
|
||||||
|
elif c == '[':
|
||||||
|
unclosed_ranges.append(pos)
|
||||||
|
elif c == ']':
|
||||||
|
ranges.append((unclosed_ranges.pop(), pos + 1))
|
||||||
|
in_range = False
|
||||||
|
elif c == '.':
|
||||||
|
# Treat this as a one-character long range:
|
||||||
|
ranges.append((pos, pos + 1))
|
||||||
|
elif escaped or c == '\\':
|
||||||
|
escaped = not escaped
|
||||||
|
elif c == '(':
|
||||||
|
unclosed_groups.append(pos)
|
||||||
|
elif c == ')':
|
||||||
|
groups.append((unclosed_groups.pop(), pos + 1))
|
||||||
|
elif quantify and c == '?':
|
||||||
|
quantify = False
|
||||||
|
elif c in '?*+{':
|
||||||
|
quantifiers.append(pos)
|
||||||
|
quantify = True
|
||||||
|
elif c == '$':
|
||||||
|
eols.append(pos)
|
||||||
|
elif c == '|':
|
||||||
|
disjunctions.append(pos)
|
||||||
|
|
||||||
|
# Now classify each of the parenthetical groups to work out which ones take
|
||||||
|
# parameters. Only the outer-most of a set of nested capturing groups is
|
||||||
|
# important.
|
||||||
|
groups.sort()
|
||||||
|
params = []
|
||||||
|
comments = []
|
||||||
|
last_end = 0
|
||||||
|
for start, end in groups:
|
||||||
|
if start < last_end:
|
||||||
|
# Skip over inner nested capturing groups.
|
||||||
|
continue
|
||||||
|
m = GROUP_CLASS.match(pattern, start)
|
||||||
|
if m.group('positional'):
|
||||||
|
params.append((start, end, '_%d' % len(params), start + 1))
|
||||||
|
elif m.group('named'):
|
||||||
|
params.append((start, end, m.group('named'), m.start('contents')))
|
||||||
|
elif m.group('repeat'):
|
||||||
|
params.append((start, end, m.group('repeat'), start + 1))
|
||||||
|
elif m.group('illegal'):
|
||||||
|
raise ValueError('The pattern construct %r is not valid here.'
|
||||||
|
% pattern[start:end])
|
||||||
|
elif m.group('comment'):
|
||||||
|
comments.extend([start, end])
|
||||||
|
else:
|
||||||
|
# This is a non-capturing set, so nesting prohibitions don't apply
|
||||||
|
# to any inner groups.
|
||||||
|
continue
|
||||||
|
last_end = end
|
||||||
|
|
||||||
|
# XXX: Got to here!
|
||||||
|
results = []
|
||||||
|
end = groups[0][0]
|
||||||
|
# The first bit, before the first group starts.
|
||||||
|
if end == 0:
|
||||||
|
# FIXME: don't want to handle this case just yet.
|
||||||
|
raise Exception
|
||||||
|
|
||||||
|
quant_end = bisect(quantifiers, end)
|
||||||
|
range_end = bisect(ranges, end)
|
||||||
|
dis_end = bisect(disjunctions, end)
|
Loading…
Reference in New Issue