2017-01-25 04:37:33 +08:00
|
|
|
"""Compare two HTML documents."""
|
2021-10-29 02:15:01 +08:00
|
|
|
import html
|
2017-02-18 08:45:34 +08:00
|
|
|
from html.parser import HTMLParser
|
2012-02-01 04:36:11 +08:00
|
|
|
|
2019-10-26 22:42:32 +08:00
|
|
|
from django.utils.regex_helper import _lazy_re_compile
|
|
|
|
|
2019-05-09 21:55:32 +08:00
|
|
|
# ASCII whitespace is U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020
# SPACE.
# https://infra.spec.whatwg.org/#ascii-whitespace
# The pattern matches a run of one or more of those characters.
ASCII_WHITESPACE = _lazy_re_compile(r"[\t\n\f\r ]+")
|
2012-02-01 04:36:11 +08:00
|
|
|
|
2021-03-19 07:43:38 +08:00
|
|
|
# Names of HTML boolean attributes. For these, normalize_attributes() treats
# a missing/empty value and a value equal to the attribute's own name as the
# same thing. Entries must be the exact attribute name (no surrounding
# whitespace), otherwise the membership test never matches.
# https://html.spec.whatwg.org/#attributes-3
BOOLEAN_ATTRIBUTES = {
    "allowfullscreen",
    "async",
    "autofocus",
    "autoplay",
    "checked",
    "controls",
    "default",
    "defer",
    "disabled",
    "formnovalidate",
    "hidden",
    "ismap",
    "itemscope",
    "loop",
    "multiple",
    "muted",
    "nomodule",
    "novalidate",
    "open",
    "playsinline",
    "readonly",
    "required",
    "reversed",
    "selected",
    # Attributes for deprecated tags.
    "truespeed",
}
|
|
|
|
|
2012-02-01 04:36:11 +08:00
|
|
|
|
|
|
|
def normalize_whitespace(string):
    """Return ``string`` with each run of ASCII whitespace replaced by " "."""
    collapsed = ASCII_WHITESPACE.sub(" ", string)
    return collapsed
|
2012-02-01 04:36:11 +08:00
|
|
|
|
|
|
|
|
2021-03-19 19:30:16 +08:00
|
|
|
def normalize_attributes(attributes):
    """
    Return a list of ``(name, value)`` pairs with canonicalized values.

    ``attributes`` is an iterable of ``(name, value)`` pairs. The result keeps
    the input order; only the values are rewritten so that equivalent
    spellings of the same attribute compare equal.
    """

    def canonical_value(name, value):
        if name == "class" and value:
            # Class order is irrelevant when comparing DOM instances, so the
            # individual class names are sorted and re-joined.
            value = " ".join(sorted(filter(None, ASCII_WHITESPACE.split(value))))
        if name not in BOOLEAN_ATTRIBUTES:
            # A non-boolean attribute without a value is an empty string.
            return "" if value is None else value
        # A boolean attribute without a value is the same as one whose value
        # equals its own name, e.g.
        #   <input checked> == <input checked="checked">
        if not value or value == name:
            return None
        return value

    return [(name, canonical_value(name, value)) for name, value in attributes]
|
|
|
|
|
|
|
|
|
2017-01-19 15:39:46 +08:00
|
|
|
class Element:
    """
    A tag in the parsed document tree.

    ``name`` is the tag name, ``attributes`` is a sorted list of
    ``(name, value)`` pairs and ``children`` is a list whose items are either
    text (``str``) or nested elements.
    """

    def __init__(self, name, attributes):
        self.name = name
        # Sorted so that attribute order does not affect equality.
        self.attributes = sorted(attributes)
        self.children = []

    def append(self, element):
        """Add a text chunk or a child element to the end of ``children``."""
        if isinstance(element, str):
            element = normalize_whitespace(element)
            if self.children and isinstance(self.children[-1], str):
                # Adjacent text chunks are merged into a single text child.
                self.children[-1] = normalize_whitespace(self.children[-1] + element)
                return
        elif self.children:
            # removing last children if it is only whitespace
            # this can result in incorrect dom representations since
            # whitespace between inline tags like <span> is significant
            previous = self.children[-1]
            if isinstance(previous, str) and previous.isspace():
                self.children.pop()
        if element:
            self.children.append(element)

    def finalize(self):
        """Strip text children in place and finalize nested elements."""
        nodes = self.children
        # Right-strip the trailing text child; while that leaves it empty,
        # drop it and repeat with the new last child.
        while nodes and isinstance(nodes[-1], str):
            nodes[-1] = nodes[-1].rstrip()
            if nodes[-1]:
                break
            nodes.pop()
        for position, node in enumerate(nodes):
            if isinstance(node, str):
                nodes[position] = node.strip()
            elif hasattr(node, "finalize"):
                node.finalize()

    def __eq__(self, element):
        """Equal when name, attributes and children all match."""
        if hasattr(element, "name") and self.name == element.name:
            if self.attributes == element.attributes:
                return self.children == element.children
        return False

    def __hash__(self):
        # Children are deliberately left out; only name and attributes are
        # hashed.
        return hash((self.name,) + tuple(self.attributes))

    def _count(self, element, count=True):
        """
        Return how many times ``element`` occurs in this subtree.

        ``element`` may be text, an element, or a root element holding a
        sequence of nodes. With ``count=False`` the search stops at the first
        hit and a non-zero number is returned.
        """
        if not isinstance(element, str) and self == element:
            return 1
        if isinstance(element, RootElement) and self.children == element.children:
            return 1
        needle_is_text = isinstance(element, str)
        # Nodes to look for as a consecutive run among self's element
        # children; empty when ``element`` is not a root element.
        sequence = element.children if isinstance(element, RootElement) else []
        total = 0
        cursor = 0
        for child in self.children:
            if isinstance(child, str):
                # child is text content and element is also text content, then
                # make a simple "text" in "text"
                if needle_is_text:
                    if count:
                        total += child.count(element)
                    elif element in child:
                        return 1
                continue
            # Look for element wholly within this child.
            total += child._count(element, count=count)
            if total and not count:
                return total
            # Also look for a sequence of element's children among self's
            # children. self.children == element.children is tested above,
            # but will fail if self has additional children. Ex: '<a/><b/>'
            # is contained in '<a/><b/><c/>'.
            if not sequence:
                continue
            if sequence[cursor] == child:
                # Start or continue match, advance index.
                cursor += 1
                if cursor == len(sequence):
                    # Match found, reset index.
                    total += 1
                    cursor = 0
            else:
                # No match, reset index.
                cursor = 0
        return total

    def __contains__(self, element):
        found = self._count(element, count=False)
        return found > 0

    def count(self, element):
        """Return the number of occurrences of ``element`` in this subtree."""
        return self._count(element, count=True)

    def __getitem__(self, key):
        return self.children[key]

    def __str__(self):
        """Render the element as markup, escaping text children."""
        pieces = ["<%s" % self.name]
        for key, value in self.attributes:
            if value is None:
                # Valueless attribute: emit the bare name.
                pieces.append(" %s" % key)
            else:
                pieces.append(' %s="%s"' % (key, value))
        if not self.children:
            pieces.append(">")
        else:
            pieces.append(">\n")
            pieces.extend(
                html.escape(node) if isinstance(node, str) else str(node)
                for node in self.children
            )
            pieces.append("\n</%s>" % self.name)
        return "".join(pieces)

    def __repr__(self):
        return str(self)
|
2012-02-01 04:36:11 +08:00
|
|
|
|
|
|
|
|
|
|
|
class RootElement(Element):
    """Nameless, attribute-less element that holds the top-level nodes."""

    def __init__(self):
        super().__init__(None, ())

    def __str__(self):
        # Only the children are rendered; text children are HTML-escaped.
        rendered = (
            html.escape(node) if isinstance(node, str) else str(node)
            for node in self.children
        )
        return "".join(rendered)
|
2012-02-01 04:36:11 +08:00
|
|
|
|
|
|
|
|
2017-02-18 08:45:34 +08:00
|
|
|
class HTMLParseError(Exception):
    """Error raised by Parser.error() when the markup cannot be parsed."""
|
|
|
|
|
|
|
|
|
2012-02-01 04:36:11 +08:00
|
|
|
class Parser(HTMLParser):
    """HTMLParser subclass that builds a tree of Element objects."""

    # Tags that never get an entry on the open-tag stack.
    # https://html.spec.whatwg.org/#void-elements
    SELF_CLOSING_TAGS = {
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link",
        "meta", "param", "source", "track", "wbr",
        # Deprecated tags
        "frame", "spacer",
    }

    def __init__(self):
        super().__init__()
        self.root = RootElement()
        # Stack of elements whose end tag has not been seen yet.
        self.open_tags = []
        # Maps each created element to the parser position recorded for it.
        self.element_positions = {}

    def error(self, msg):
        """Raise HTMLParseError carrying ``msg`` and the current position."""
        raise HTMLParseError(msg, self.getpos())

    def format_position(self, position=None, element=None):
        """
        Return a "Line N, Column M" string.

        The position comes from ``position`` if given, else from the position
        recorded for ``element``, else from the parser's current position.
        """
        if not position and element:
            position = self.element_positions[element]
        if position is None:
            position = self.getpos()
        # Accept objects exposing lineno/offset as well as plain pairs.
        pair = (
            (position.lineno, position.offset)
            if hasattr(position, "lineno")
            else position
        )
        return "Line %d, Column %d" % pair

    @property
    def current(self):
        """The innermost open element, or the root when none is open."""
        return self.open_tags[-1] if self.open_tags else self.root

    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        if tag in self.SELF_CLOSING_TAGS:
            # handle_starttag() did not push it, so there is nothing to close.
            return
        self.handle_endtag(tag)

    def handle_starttag(self, tag, attrs):
        node = Element(tag, normalize_attributes(attrs))
        self.current.append(node)
        if tag not in self.SELF_CLOSING_TAGS:
            self.open_tags.append(node)
        self.element_positions[node] = self.getpos()

    def handle_endtag(self, tag):
        """Pop open elements until one named ``tag`` has been popped."""
        while True:
            if not self.open_tags:
                self.error(
                    "Unexpected end tag `%s` (%s)" % (tag, self.format_position())
                )
            if self.open_tags.pop().name == tag:
                break

    def handle_data(self, data):
        self.current.append(data)
|
|
|
|
|
|
|
|
|
|
|
|
def parse_html(html):
    """
    Parse a string of HTML into a tree of Python objects that can be compared
    with other parsed HTML for semantic equivalence.

    Purely syntactical differences, such as which quotation marks surround an
    attribute value, do not affect the result.
    """
    parser = Parser()
    parser.feed(html)
    parser.close()
    root = parser.root
    root.finalize()
    top_level = root.children
    # Removing ROOT element if it's not necessary
    if len(top_level) == 1 and not isinstance(top_level[0], str):
        return top_level[0]
    return root
|