2012-02-01 04:36:11 +08:00
|
|
|
"""
|
|
|
|
Comparing two html documents.
|
|
|
|
"""
|
2012-06-08 00:08:47 +08:00
|
|
|
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
2012-02-01 04:36:11 +08:00
|
|
|
import re
|
2012-07-21 16:00:10 +08:00
|
|
|
from django.utils.encoding import force_text
|
2012-07-20 22:16:57 +08:00
|
|
|
from django.utils.html_parser import HTMLParser, HTMLParseError
|
2012-07-20 20:22:00 +08:00
|
|
|
from django.utils import six
|
2012-08-12 18:32:08 +08:00
|
|
|
from django.utils.encoding import python_2_unicode_compatible
|
2012-02-01 04:36:11 +08:00
|
|
|
|
|
|
|
|
|
|
|
# Raw string so that ``\s`` reaches the regex engine untouched instead of
# being treated as an (invalid) string escape sequence.
WHITESPACE = re.compile(r'\s+')


def normalize_whitespace(string):
    """
    Collapse every run of whitespace in ``string`` into a single space.

    Leading and trailing whitespace is collapsed as well, not removed.
    """
    return WHITESPACE.sub(' ', string)
|
|
|
|
|
|
|
|
|
2012-08-12 18:32:08 +08:00
|
|
|
@python_2_unicode_compatible
class Element(object):
    """
    A node of the simplified DOM used to compare HTML documents.

    ``name`` is the tag name, ``attributes`` is a sorted list of
    ``(name, value)`` pairs and ``children`` is a list mixing text (strings)
    and nested elements.
    """

    def __init__(self, name, attributes):
        self.name = name
        # Sorted so the order of attributes in the markup is irrelevant.
        self.attributes = sorted(attributes)
        self.children = []

    @staticmethod
    def _normalized_attributes(attributes):
        """
        Return ``attributes`` with every valueless attribute given its own
        name as value, so that <input checked> and <input checked="checked">
        produce the same pairs.
        """
        return [
            (attr, attr if value is None else value)
            for attr, value in attributes
        ]

    def append(self, element):
        """
        Add ``element`` (text or a child node) to the children.

        Text is whitespace-normalized and merged into a directly preceding
        text child. Empty text is not stored.
        """
        if isinstance(element, six.string_types):
            element = force_text(element)
            element = normalize_whitespace(element)
            if self.children:
                if isinstance(self.children[-1], six.string_types):
                    self.children[-1] = normalize_whitespace(
                        self.children[-1] + element)
                    return
        elif self.children:
            # removing last children if it is only whitespace
            # this can result in incorrect dom representations since
            # whitespace between inline tags like <span> is significant
            if isinstance(self.children[-1], six.string_types):
                if self.children[-1].isspace():
                    self.children.pop()
        if element:
            self.children.append(element)

    def finalize(self):
        """
        Strip the text children and finalize the nested elements.

        Trailing text children that are empty once right-stripped are
        removed altogether.
        """
        children = self.children
        # Drop trailing text children until one with real content (or a
        # non-text child) is found.
        while children and isinstance(children[-1], six.string_types):
            stripped = children[-1].rstrip()
            if stripped:
                children[-1] = stripped
                break
            children.pop()
        for i, child in enumerate(children):
            if isinstance(child, six.string_types):
                children[i] = child.strip()
            elif hasattr(child, 'finalize'):
                child.finalize()

    def __eq__(self, element):
        """
        Two elements are equal when their name, attributes and children
        match. A valueless attribute equals the same attribute whose value
        is its own name.
        """
        if not hasattr(element, 'name'):
            return False
        if self.name != element.name:
            return False
        if len(self.attributes) != len(element.attributes):
            return False
        if self.attributes != element.attributes:
            # attributes without a value is same as attribute with value that
            # equals the attributes name:
            # <input checked> == <input checked="checked">
            pairs = zip(
                self._normalized_attributes(self.attributes),
                self._normalized_attributes(element.attributes),
            )
            for own, other in pairs:
                if own != other:
                    return False
        if self.children != element.children:
            return False
        return True

    def __hash__(self):
        # Hash the normalized attributes: elements that __eq__ considers
        # equal (<input checked> / <input checked="checked">) must share
        # the same hash.
        return hash(
            (self.name,) + tuple(self._normalized_attributes(self.attributes))
        )

    def __ne__(self, element):
        return not self.__eq__(element)

    def _count(self, element, count=True):
        """
        Count occurrences of ``element`` (text or node) in this subtree.

        With ``count=False`` the search stops at the first match and a
        truthy value is returned instead of the full count.
        """
        if not isinstance(element, six.string_types):
            if self == element:
                return 1
        i = 0
        for child in self.children:
            # child is text content and element is also text content, then
            # make a simple "text" in "text"
            if isinstance(child, six.string_types):
                if isinstance(element, six.string_types):
                    if count:
                        i += child.count(element)
                    elif element in child:
                        return 1
            else:
                i += child._count(element, count=count)
                if not count and i:
                    return i
        return i

    def __contains__(self, element):
        return self._count(element, count=False) > 0

    def count(self, element):
        """Return how many times ``element`` occurs in this subtree."""
        return self._count(element, count=True)

    def __getitem__(self, key):
        return self.children[key]

    def __str__(self):
        output = '<%s' % self.name
        for key, value in self.attributes:
            # Only a missing value (None) is rendered as a bare attribute;
            # an empty value is kept as attr="" because __eq__ treats the
            # two forms as different.
            if value is not None:
                output += ' %s="%s"' % (key, value)
            else:
                output += ' %s' % key
        if self.children:
            output += '>\n'
            output += ''.join(six.text_type(c) for c in self.children)
            output += '\n</%s>' % self.name
        else:
            output += ' />'
        return output

    def __repr__(self):
        return six.text_type(self)
|
2012-02-01 04:36:11 +08:00
|
|
|
|
|
|
|
|
2012-08-12 18:32:08 +08:00
|
|
|
@python_2_unicode_compatible
class RootElement(Element):
    """
    Container element created without a name (``None``) and without
    attributes; it renders as the concatenation of its children.
    """

    def __init__(self):
        super(RootElement, self).__init__(None, ())

    def __str__(self):
        # No surrounding tag is emitted: only the children are rendered.
        rendered = [six.text_type(child) for child in self.children]
        return ''.join(rendered)
|
2012-02-01 04:36:11 +08:00
|
|
|
|
|
|
|
|
|
|
|
class Parser(HTMLParser):
    """
    HTML parser that builds a tree of ``Element`` instances under
    ``self.root`` while keeping a stack of the currently open tags.
    """

    # Tags that are never pushed on the open-tags stack: they are treated
    # as closed as soon as they are opened.
    SELF_CLOSING_TAGS = ('br', 'hr', 'input', 'img', 'meta', 'spacer',
        'link', 'frame', 'base', 'col')

    def __init__(self):
        HTMLParser.__init__(self)
        self.root = RootElement()
        self.open_tags = []
        self.element_positions = {}

    def error(self, msg):
        """Raise ``HTMLParseError`` for ``msg`` at the current position."""
        raise HTMLParseError(msg, self.getpos())

    def format_position(self, position=None, element=None):
        """
        Return a 'Line X, Column Y' string for ``position``, for the
        recorded position of ``element``, or for the current parser
        position when neither is given.
        """
        if not position and element:
            position = self.element_positions[element]
        if position is None:
            position = self.getpos()
        if hasattr(position, 'lineno'):
            position = position.lineno, position.offset
        return 'Line %d, Column %d' % position

    @property
    def current(self):
        """The innermost open element, or the root when none is open."""
        if self.open_tags:
            return self.open_tags[-1]
        else:
            return self.root

    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        # Self-closing tags were never pushed on the stack, so there is
        # nothing to close for them.
        if tag not in self.SELF_CLOSING_TAGS:
            self.handle_endtag(tag)

    def handle_starttag(self, tag, attrs):
        # Special case handling of 'class' attribute, so that comparisons of DOM
        # instances are not sensitive to ordering of classes.
        normalized = []
        for name, value in attrs:
            # A valueless attribute (<div class>) is reported with a None
            # value and must be left alone. split() without argument also
            # ignores repeated or non-space whitespace between class names.
            if name == "class" and value is not None:
                value = " ".join(sorted(value.split()))
            normalized.append((name, value))

        element = Element(tag, normalized)
        self.current.append(element)
        if tag not in self.SELF_CLOSING_TAGS:
            self.open_tags.append(element)
        self.element_positions[element] = self.getpos()

    def handle_endtag(self, tag):
        """
        Close ``tag``, implicitly closing any element opened after it.
        Report an error when no open element carries that name.
        """
        while True:
            if not self.open_tags:
                self.error("Unexpected end tag `%s` (%s)" % (
                    tag, self.format_position()))
            element = self.open_tags.pop()
            if element.name == tag:
                break

    def handle_data(self, data):
        self.current.append(data)

    def handle_charref(self, name):
        # ``name`` is the part between '&#' and ';' (e.g. '62' or 'x3E'),
        # so the '#' has to be restored to reproduce the original reference.
        self.current.append('&#%s;' % name)

    def handle_entityref(self, name):
        self.current.append('&%s;' % name)
|
|
|
|
|
|
|
|
|
|
|
|
def parse_html(html):
    """
    Parse a string of *valid* HTML into a tree of Python objects that can be
    compared with another parsed document for semantic equivalence, so that
    purely syntactical differences (such as the quoting style used around
    attribute values) do not matter.

    Returns the root container, or its only child when the document boils
    down to a single element.
    """
    html_parser = Parser()
    html_parser.feed(html)
    html_parser.close()

    root = html_parser.root
    root.finalize()

    # The ROOT wrapper is only kept when it is needed: several top-level
    # nodes, no node at all, or a single text node.
    nodes = root.children
    if len(nodes) != 1:
        return root
    only_node = nodes[0]
    if isinstance(only_node, six.string_types):
        return root
    return only_node
|