Started to work on the regex reverse-engineering phase.
git-svn-id: http://code.djangoproject.com/svn/django/trunk@7850 bcc190cf-cafb-0310-a4f2-bffc1f526a37
This commit is contained in:
parent
c88e55b4fe
commit
c2f753a8ba
|
@ -0,0 +1,123 @@
|
|||
"""
|
||||
Functions for reversing a regular expression (used in reverse URL resolving).

This is not, and is not intended to be, a complete reg-exp decompiler. It
should be good enough for almost all sane URLs.
|
||||
"""
|
||||
|
||||
import re
|
||||
from bisect import bisect
|
||||
|
||||
# Classifies the parenthesised group that starts at the match position. At
# most one of the named sub-groups 'positional', 'named', 'repeat', 'grouping',
# 'comment' or 'illegal' is set on a successful match, which tells the caller
# what kind of group it is looking at. For a named group, the start of
# 'contents' is the position just after the closing '>' of the name.
#
# The name of a back-reference is matched with \w+ (not a greedy '.+') so that
# it stops at the group's own closing paren even when other groups follow.
GROUP_CLASS = re.compile(r'''\((?:
    (?P<positional>[^?])|       # Unnamed (positional) capturing group.
    \?(?:
        P<(?P<named>[\w]+)>(?P<contents>.*)|    # Named capturing group.
        P=(?P<repeat>\w+)|      # Repeat of a previous named group.
        (?P<grouping>:)|        # Non-capturing grouping parens.
        (?P<comment>\#)|        # Comment group
        (?P<illegal>.)          # Anything else (which will be an error)
    )
).*\)''', re.VERBOSE)
|
||||
|
||||
def _scan_pattern(pattern):
    """
    Does a single left-to-right pass over ``pattern`` and records where its
    special features are.

    Returns a tuple of five lists, each in order of appearance:

        groups        (start, end) of every closed pair of parentheses
        quantifiers   position of every '?', '*', '+' or '{' quantifier
        ranges        (start, end) of every '[...]' class and every '.'
        eols          position of every '$'
        disjunctions  position of every '|'

    All ``end`` values are exclusive. Escaped characters and the contents of
    character classes are never treated as syntax.

    Raises ValueError for a ')' that has no matching '('. An unclosed '(' is
    silently ignored.
    """
    groups = []         # (start, end)
    quantifiers = []    # start pos
    ranges = []         # (start, end)
    eols = []           # pos
    disjunctions = []   # pos
    unclosed_groups = []
    range_start = None  # Position of the '[' of the class being scanned.
    first_member = 0    # Position of the first member of that class.
    escaped = False
    quantify = False
    in_braces = False
    for pos, c in enumerate(pattern):
        # "quantify" is only meaningful for the character that immediately
        # follows a quantifier, so it is consumed on every iteration.
        was_quantify = quantify
        quantify = False
        if escaped:
            # The previous character was a backslash: this one is a literal.
            escaped = False
        elif c == '\\':
            escaped = True
        elif range_start is not None:
            # Inside a character class everything is literal. A ']' that is
            # the first member (optionally after '^') does not close it.
            if c == ']' and pos > first_member:
                ranges.append((range_start, pos + 1))
                range_start = None
        elif in_braces:
            if c == '}':
                in_braces = False
                # A '?' directly after '{m,n}' makes the repeat non-greedy.
                quantify = True
        elif c == '[':
            range_start = pos
            first_member = pos + 1
            if pattern[first_member:first_member + 1] == '^':
                first_member += 1
        elif c == '.':
            # Treat this as a one-character long range:
            ranges.append((pos, pos + 1))
        elif c == '(':
            unclosed_groups.append(pos)
        elif c == ')':
            if not unclosed_groups:
                raise ValueError('Unbalanced parenthesis at position %d in '
                        'pattern %r.' % (pos, pattern))
            groups.append((unclosed_groups.pop(), pos + 1))
        elif c == '?' and unclosed_groups and unclosed_groups[-1] == pos - 1:
            # The '?' that introduces an extension such as '(?P<name>' or
            # '(?:'. It is not a quantifier.
            pass
        elif c == '?' and was_quantify:
            # Non-greedy modifier of the quantifier just before it.
            pass
        elif c in '?*+':
            quantifiers.append(pos)
            quantify = True
        elif c == '{' and pattern.find('}', pos) != -1:
            # Without a closing brace, '{' is just a literal character.
            quantifiers.append(pos)
            in_braces = True
        elif c == '$':
            eols.append(pos)
        elif c == '|':
            disjunctions.append(pos)
    return groups, quantifiers, ranges, eols, disjunctions

def normalize(pattern):
    """
    Given a reg-exp pattern, normalizes it to a list of forms that suffice for
    reverse matching. This does the following:

    (1) For any repeating sections, keeps the minimum number of occurrences
        permitted (this means zero for optional groups).
    (2) If an optional group includes parameters, include one occurrence of
        that group (along with the zero occurrence case from step (1)).
    (3) Select the first (essentially an arbitrary) element from any character
        class. Select an arbitrary character for any unordered class (e.g. '.'
        or '\\w') in the pattern.
    (4) Take the first alternative in any '|' division, unless other
        alternatives would involve different parameters.
    (5) Ignore comments. Error on all other non-capturing (?...) forms (e.g.
        look-ahead and look-behind matches).

    Returns a list of tuples, each tuple containing (a) a pattern, (b) the
    number of parameters, (c) the names of the parameters. Any unnamed
    parameters are called '_0', '_1', etc.

    NOTE: the implementation is unfinished. Only the scanning and group
    classification stages exist so far; no normalized forms are built and the
    function currently returns None.

    Raises ValueError for an unbalanced ')' or for a parenthesised construct
    that cannot be classified or is not allowed. Raises NotImplementedError
    for a pattern that starts with a group, which is not handled yet.
    """
    # Do a linear scan to work out the special features of this pattern. The
    # idea is that we scan once here and collect all the information we need to
    # make future decisions.
    groups, quantifiers, ranges, eols, disjunctions = _scan_pattern(pattern)

    # Now classify each of the parenthetical groups to work out which ones take
    # parameters. Only the outer-most of a set of nested capturing groups is
    # important.
    groups.sort()
    params = []
    comments = []
    last_end = 0
    for start, end in groups:
        if start < last_end:
            # Skip over inner nested capturing groups.
            continue
        m = GROUP_CLASS.match(pattern, start)
        if m is None:
            # The classifier could not make sense of this group at all.
            raise ValueError('The pattern construct %r is not valid here.'
                    % pattern[start:end])
        if m.group('positional'):
            params.append((start, end, '_%d' % len(params), start + 1))
        elif m.group('named'):
            params.append((start, end, m.group('named'), m.start('contents')))
        elif m.group('repeat'):
            params.append((start, end, m.group('repeat'), start + 1))
        elif m.group('illegal'):
            raise ValueError('The pattern construct %r is not valid here.'
                    % pattern[start:end])
        elif m.group('comment'):
            comments.extend([start, end])
        else:
            # This is a non-capturing set, so nesting prohibitions don't apply
            # to any inner groups.
            continue
        last_end = end

    # XXX: Got to here! Everything below is scaffolding for the stage that
    # builds the result; none of these values are used yet.
    results = []
    # The first bit, before the first group starts. With no groups at all,
    # that is the whole pattern.
    if groups:
        end = groups[0][0]
        if end == 0:
            # FIXME: don't want to handle this case just yet.
            raise NotImplementedError(
                    'Patterns that start with a group are not handled yet.')
    else:
        end = len(pattern)

    quant_end = bisect(quantifiers, end)
    # "ranges" holds (start, end) tuples, so search with a 1-tuple: it sorts
    # before every range that starts at or after "end".
    range_end = bisect(ranges, (end,))
    dis_end = bisect(disjunctions, end)
|
Loading…
Reference in New Issue