Fixed #30686 -- Used Python HTMLParser in utils.text.Truncator.

This commit is contained in:
David Smith 2023-01-03 20:48:06 +00:00 committed by Mariusz Felisiak
parent 70f39e46f8
commit 6ee37ada32
4 changed files with 149 additions and 125 deletions

View File

@ -2,12 +2,20 @@ import gzip
import re import re
import secrets import secrets
import unicodedata import unicodedata
from collections import deque
from gzip import GzipFile from gzip import GzipFile
from gzip import compress as gzip_compress from gzip import compress as gzip_compress
from html import escape
from html.parser import HTMLParser
from io import BytesIO from io import BytesIO
from django.core.exceptions import SuspiciousFileOperation from django.core.exceptions import SuspiciousFileOperation
from django.utils.functional import SimpleLazyObject, keep_lazy_text, lazy from django.utils.functional import (
SimpleLazyObject,
cached_property,
keep_lazy_text,
lazy,
)
from django.utils.regex_helper import _lazy_re_compile from django.utils.regex_helper import _lazy_re_compile
from django.utils.translation import gettext as _ from django.utils.translation import gettext as _
from django.utils.translation import gettext_lazy, pgettext from django.utils.translation import gettext_lazy, pgettext
@ -80,6 +88,101 @@ def add_truncation_text(text, truncate=None):
return f"{text}{truncate}" return f"{text}{truncate}"
def calculate_truncate_chars_length(length, replacement):
    """
    Return how many characters of the original text may be kept so that the
    text plus its truncation suffix stays within ``length`` characters.

    The suffix is whatever ``add_truncation_text("", replacement)`` produces.
    Combining characters in the suffix are not counted, and counting stops as
    soon as the budget reaches zero.
    """
    suffix = add_truncation_text("", replacement)
    budget = length
    for symbol in suffix:
        if unicodedata.combining(symbol):
            # Combining marks attach to the previous character and take no
            # room of their own.
            continue
        budget -= 1
        if budget == 0:
            break
    return budget
class TruncateHTMLParser(HTMLParser):
    """
    Base parser that copies HTML into ``self.output`` until a budget of text
    units is used up, then appends the truncation text and closes every
    element that is still open.

    Markup (start and end tags) is copied without being counted. What a
    "unit" of text is (a character, a word, ...) is decided by subclasses
    through ``process()``.
    """

    class TruncationCompleted(Exception):
        """Raised internally to stop parsing once the limit is reached."""

    def __init__(self, *, length, replacement, convert_charrefs=True):
        super().__init__(convert_charrefs=convert_charrefs)
        # Open non-void elements, most recently opened first.
        self.tags = deque()
        # The truncated HTML built so far.
        self.output = ""
        # Number of text units that may still be emitted.
        self.remaining = length
        self.replacement = replacement

    @cached_property
    def void_elements(self):
        # Imported on first use rather than at module level (presumably to
        # avoid a circular import with django.utils.html -- confirm).
        from django.utils.html import VOID_ELEMENTS

        return VOID_ELEMENTS

    def process(self, data):
        """
        Split ``data`` into countable units. Must be provided by subclasses.

        Return a ``(units, output)`` pair: ``len(units)`` is charged against
        ``self.remaining`` and ``output`` is the text to append to
        ``self.output``.
        """
        raise NotImplementedError(
            "subclasses of TruncateHTMLParser must provide a process() method"
        )

    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        # A self-closed non-void element (e.g. "<p/>") was pushed onto the
        # open-tag stack by handle_starttag(); close it again right away.
        if tag not in self.void_elements:
            self.handle_endtag(tag)

    def handle_starttag(self, tag, attrs):
        # Copy the tag exactly as it was written in the input.
        self.output += self.get_starttag_text()
        if tag not in self.void_elements:
            self.tags.appendleft(tag)

    def handle_endtag(self, tag):
        if tag not in self.void_elements:
            self.output += f"</{tag}>"
            try:
                # Removes the most recently opened element with this name.
                self.tags.remove(tag)
            except ValueError:
                # Stray end tag with no matching start tag: nothing to pop.
                pass

    def handle_data(self, data):
        data, output = self.process(data)
        data_len = len(data)
        if self.remaining < data_len:
            # This chunk exceeds the budget: emit the part that fits plus the
            # truncation text, then abort parsing.
            self.remaining = 0
            self.output += add_truncation_text(output, self.replacement)
            raise self.TruncationCompleted
        self.remaining -= data_len
        self.output += output

    def feed(self, data):
        try:
            super().feed(data)
        except self.TruncationCompleted:
            # Close whatever is still open, innermost element first.
            self.output += "".join(f"</{tag}>" for tag in self.tags)
            self.tags.clear()
        # Reset whether or not truncation happened. Per the HTMLParser
        # documentation, reset() drops all unprocessed data.
        self.reset()
class TruncateCharsHTMLParser(TruncateHTMLParser):
    """Truncate HTML after a number of text characters; markup isn't counted."""

    def __init__(self, *, length, replacement, convert_charrefs=True):
        # The limit as requested by the caller, before any room is reserved
        # for the truncation text.
        self.length = length
        # Text characters seen so far across all process() calls.
        self.processed_chars = 0
        super().__init__(
            length=calculate_truncate_chars_length(length, replacement),
            replacement=replacement,
            convert_charrefs=convert_charrefs,
        )

    def process(self, data):
        """
        Count ``data`` character by character.

        Return ``(data, output)`` where ``output`` is the escaped prefix of
        ``data`` that fits in ``self.remaining``. If ``data`` brings the total
        to exactly ``self.length`` and the output length then matches the raw
        input length, emit it in full and raise ``TruncationCompleted``
        instead, so no truncation text is added.
        """
        self.processed_chars += len(data)
        if (self.processed_chars == self.length) and (
            len(self.output) + len(data) == len(self.rawdata)
        ):
            # Escape exactly like the regular path below: ``data`` may hold
            # decoded character references (e.g. "&lt;" -> "<") that must not
            # reach the output as raw markup.
            self.output += escape(data)
            raise self.TruncationCompleted
        output = escape(data[: self.remaining])
        return data, output
class TruncateWordsHTMLParser(TruncateHTMLParser):
    """Truncate HTML after a number of words; markup isn't counted."""

    def process(self, data):
        """
        Split ``data`` on runs of whitespace that sit between two
        non-whitespace characters and return ``(words, output)``.

        ``output`` is the escaped, single-space-joined run of the first
        ``self.remaining`` words.
        """
        words = re.split(r"(?<=\S)\s+(?=\S)", data)
        kept = words[: self.remaining]
        return words, escape(" ".join(kept))
class Truncator(SimpleLazyObject): class Truncator(SimpleLazyObject):
""" """
An object used to truncate text, either by characters or words. An object used to truncate text, either by characters or words.
@ -108,19 +211,16 @@ class Truncator(SimpleLazyObject):
return "" return ""
text = unicodedata.normalize("NFC", self._wrapped) text = unicodedata.normalize("NFC", self._wrapped)
# Calculate the length to truncate to (max length - end_text length)
truncate_len = length
for char in add_truncation_text("", truncate):
if not unicodedata.combining(char):
truncate_len -= 1
if truncate_len == 0:
break
if html: if html:
return self._truncate_html(length, truncate, text, truncate_len, False) parser = TruncateCharsHTMLParser(length=length, replacement=truncate)
return self._text_chars(length, truncate, text, truncate_len) parser.feed(text)
parser.close()
return parser.output
return self._text_chars(length, truncate, text)
def _text_chars(self, length, truncate, text, truncate_len): def _text_chars(self, length, truncate, text):
"""Truncate a string after a certain number of chars.""" """Truncate a string after a certain number of chars."""
truncate_len = calculate_truncate_chars_length(length, truncate)
s_len = 0 s_len = 0
end_index = None end_index = None
for i, char in enumerate(text): for i, char in enumerate(text):
@ -149,7 +249,10 @@ class Truncator(SimpleLazyObject):
if length <= 0: if length <= 0:
return "" return ""
if html: if html:
return self._truncate_html(length, truncate, self._wrapped, length, True) parser = TruncateWordsHTMLParser(length=length, replacement=truncate)
parser.feed(self._wrapped)
parser.close()
return parser.output
return self._text_words(length, truncate) return self._text_words(length, truncate)
def _text_words(self, length, truncate): def _text_words(self, length, truncate):
@ -164,94 +267,6 @@ class Truncator(SimpleLazyObject):
return add_truncation_text(" ".join(words), truncate) return add_truncation_text(" ".join(words), truncate)
return " ".join(words) return " ".join(words)
def _truncate_html(self, length, truncate, text, truncate_len, words):
    """
    Truncate HTML to a certain number of chars (not counting tags and
    comments), or, if words is True, then to a certain number of words.
    Close opened tags if they were correctly closed in the given HTML.

    Preserve newlines in the HTML.

    ``length`` is the limit that decides whether truncation happens at all;
    ``truncate_len`` is the unit count at which the cut position is recorded.
    ``truncate`` is passed to ``add_truncation_text()`` to build the suffix.
    Input longer than ``self.MAX_LENGTH_HTML`` is cut to that size before it
    is scanned.
    """
    if words and length <= 0:
        return ""
    # Cap the amount of text that gets scanned and remember that it was cut.
    size_limited = False
    if len(text) > self.MAX_LENGTH_HTML:
        text = text[: self.MAX_LENGTH_HTML]
        size_limited = True
    # Elements that are never pushed onto the open-tags list.
    html4_singlets = (
        "br",
        "col",
        "link",
        "base",
        "img",
        "param",
        "area",
        "hr",
        "input",
    )
    # Count non-HTML chars/words and keep note of open tags
    pos = 0
    end_text_pos = 0
    current_len = 0
    open_tags = []
    # re_words / re_chars / re_tag are module-level patterns defined outside
    # this view; presumably group 1 of the first two captures a word/char and
    # is empty for markup -- confirm against their definitions.
    regex = re_words if words else re_chars
    # The loop runs until one unit past ``length`` has been counted, which is
    # what distinguishes "fits" from "needs truncating" below.
    while current_len <= length:
        m = regex.search(text, pos)
        if not m:
            # Checked through whole string
            break
        pos = m.end(0)
        if m[1]:
            # It's an actual non-HTML word or char
            current_len += 1
            if current_len == truncate_len:
                # Remember where the kept text ends.
                end_text_pos = pos
            continue
        # Check for tag
        tag = re_tag.match(m[0])
        if not tag or current_len >= truncate_len:
            # Don't worry about non tags or tags after our truncate point
            continue
        closing_tag, tagname, self_closing = tag.groups()
        # Element names are always case-insensitive
        tagname = tagname.lower()
        if self_closing or tagname in html4_singlets:
            pass
        elif closing_tag:
            # Check for match in open tags list
            try:
                i = open_tags.index(tagname)
            except ValueError:
                pass
            else:
                # SGML: An end tag closes, back to the matching start tag,
                # all unclosed intervening start tags with omitted end tags
                open_tags = open_tags[i + 1 :]
        else:
            # Add it to the start of the open tags list
            open_tags.insert(0, tagname)
    truncate_text = add_truncation_text("", truncate)
    if current_len <= length:
        # The scan ended without exceeding ``length``: return the text as is,
        # adding the suffix only if the input was cut by the size cap above.
        if size_limited and truncate_text:
            text += truncate_text
        return text
    out = text[:end_text_pos]
    if truncate_text:
        out += truncate_text
    # Close any tags still open
    for tag in open_tags:
        out += "</%s>" % tag
    # Return string
    return out
@keep_lazy_text @keep_lazy_text
def get_valid_filename(name): def get_valid_filename(name):

View File

@ -368,6 +368,11 @@ Miscellaneous
:meth:`~django.test.SimpleTestCase.assertInHTML` now add ``": "`` to the :meth:`~django.test.SimpleTestCase.assertInHTML` now add ``": "`` to the
``msg_prefix``. This is consistent with the behavior of other assertions. ``msg_prefix``. This is consistent with the behavior of other assertions.
* ``django.utils.text.Truncator`` used by :tfilter:`truncatechars_html` and
:tfilter:`truncatewords_html` template filters now uses
:py:class:`html.parser.HTMLParser` subclasses. This results in a more robust
and faster operation, but there may be small differences in the output.
.. _deprecated-features-5.1: .. _deprecated-features-5.1:
Features deprecated in 5.1 Features deprecated in 5.1

View File

@ -24,7 +24,7 @@ class FunctionTests(SimpleTestCase):
truncatewords_html( truncatewords_html(
'<p>one <a href="#">two - three <br>four</a> five</p>', 4 '<p>one <a href="#">two - three <br>four</a> five</p>', 4
), ),
'<p>one <a href="#">two - three …</a></p>', '<p>one <a href="#">two - three <br> …</a></p>',
) )
def test_truncate3(self): def test_truncate3(self):
@ -32,7 +32,7 @@ class FunctionTests(SimpleTestCase):
truncatewords_html( truncatewords_html(
'<p>one <a href="#">two - three <br>four</a> five</p>', 5 '<p>one <a href="#">two - three <br>four</a> five</p>', 5
), ),
'<p>one <a href="#">two - three <br>four</a></p>', '<p>one <a href="#">two - three <br>four</a></p>',
) )
def test_truncate4(self): def test_truncate4(self):
@ -53,7 +53,7 @@ class FunctionTests(SimpleTestCase):
truncatewords_html( truncatewords_html(
"<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo est&aacute;?</i>", 3 "<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo est&aacute;?</i>", 3
), ),
"<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo …</i>", "<i>Buenos días! ¿Cómo …</i>",
) )
def test_invalid_arg(self): def test_invalid_arg(self):

View File

@ -111,7 +111,7 @@ class TestUtilsText(SimpleTestCase):
truncator.chars(46, html=True), truncator.chars(46, html=True),
) )
self.assertEqual( self.assertEqual(
'<p id="par"><strong><em>The quick brown fox jumped over the lazy dog.</em>' '<p id="par"><strong><em>The quick brown fox jumped over the lazy dog</em>'
"</strong></p>", "</strong></p>",
truncator.chars(45, html=True), truncator.chars(45, html=True),
) )
@ -120,7 +120,7 @@ class TestUtilsText(SimpleTestCase):
truncator.chars(10, html=True), truncator.chars(10, html=True),
) )
self.assertEqual( self.assertEqual(
"", '<p id="par"><strong><em>…</em></strong></p>',
truncator.chars(1, html=True), truncator.chars(1, html=True),
) )
self.assertEqual("", truncator.chars(0, html=True)) self.assertEqual("", truncator.chars(0, html=True))
@ -142,18 +142,16 @@ class TestUtilsText(SimpleTestCase):
bigger_len = text.Truncator.MAX_LENGTH_HTML + 1 bigger_len = text.Truncator.MAX_LENGTH_HTML + 1
valid_html = "<p>Joel is a slug</p>" # 14 chars valid_html = "<p>Joel is a slug</p>" # 14 chars
perf_test_values = [ perf_test_values = [
("</a" + "\t" * (max_len - 6) + "//>", None), ("</a" + "\t" * (max_len - 6) + "//>", "</a>"),
("</p" + "\t" * bigger_len + "//>", "</p" + "\t" * 6 + ""), ("</p" + "\t" * bigger_len + "//>", "</p>"),
("&" * bigger_len, "&" * 9 + ""), ("&" * bigger_len, ""),
("_X<<<<<<<<<<<>", None), ("_X<<<<<<<<<<<>", "_X&lt;&lt;&lt;&lt;&lt;&lt;&lt;…"),
(valid_html * bigger_len, "<p>Joel is a…</p>"), # 10 chars (valid_html * bigger_len, "<p>Joel is a…</p>"), # 10 chars
] ]
for value, expected in perf_test_values: for value, expected in perf_test_values:
with self.subTest(value=value): with self.subTest(value=value):
truncator = text.Truncator(value) truncator = text.Truncator(value)
self.assertEqual( self.assertEqual(expected, truncator.chars(10, html=True))
expected if expected else value, truncator.chars(10, html=True)
)
def test_truncate_chars_html_with_newline_inside_tag(self): def test_truncate_chars_html_with_newline_inside_tag(self):
truncator = text.Truncator( truncator = text.Truncator(
@ -181,7 +179,7 @@ class TestUtilsText(SimpleTestCase):
"<br>The <hr/>quick <em>brown…</em>", truncator.chars(16, html=True) "<br>The <hr/>quick <em>brown…</em>", truncator.chars(16, html=True)
) )
self.assertEqual("<br>The <hr/>q…", truncator.chars(6, html=True)) self.assertEqual("<br>The <hr/>q…", truncator.chars(6, html=True))
self.assertEqual("<br>The ", truncator.chars(5, html=True)) self.assertEqual("<br>The <hr/>", truncator.chars(5, html=True))
self.assertEqual("<br>The…", truncator.chars(4, html=True)) self.assertEqual("<br>The…", truncator.chars(4, html=True))
self.assertEqual("<br>Th…", truncator.chars(3, html=True)) self.assertEqual("<br>Th…", truncator.chars(3, html=True))
@ -190,11 +188,19 @@ class TestUtilsText(SimpleTestCase):
"<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo est&aacute;?</i>" "<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo est&aacute;?</i>"
) )
self.assertEqual( self.assertEqual(
"<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo…</i>", "<i>Buenos días! ¿Cómo está?</i>",
truncator.chars(40, html=True), truncator.chars(40, html=True),
) )
self.assertEqual(
"<i>Buenos días…</i>",
truncator.chars(12, html=True),
)
self.assertEqual(
"<i>Buenos días! ¿Cómo está…</i>",
truncator.chars(24, html=True),
)
truncator = text.Truncator("<p>I &lt;3 python, what about you?</p>") truncator = text.Truncator("<p>I &lt;3 python, what about you?</p>")
self.assertEqual("<p>I &lt;3 python,…</p>", truncator.chars(16, html=True)) self.assertEqual("<p>I &lt;3 python, wh…</p>", truncator.chars(16, html=True))
def test_truncate_words(self): def test_truncate_words(self):
truncator = text.Truncator("The quick brown fox jumped over the lazy dog.") truncator = text.Truncator("The quick brown fox jumped over the lazy dog.")
@ -242,7 +248,7 @@ class TestUtilsText(SimpleTestCase):
"<p>The quick \t brown fox jumped over the lazy dog.</p>" "<p>The quick \t brown fox jumped over the lazy dog.</p>"
) )
self.assertEqual( self.assertEqual(
"<p>The quick \t brown fox…</p>", "<p>The quick brown fox…</p>",
truncator.words(4, html=True), truncator.words(4, html=True),
) )
@ -277,7 +283,7 @@ class TestUtilsText(SimpleTestCase):
"<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo est&aacute;?</i>" "<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo est&aacute;?</i>"
) )
self.assertEqual( self.assertEqual(
"<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo…</i>", "<i>Buenos días! ¿Cómo…</i>",
truncator.words(3, html=True), truncator.words(3, html=True),
) )
truncator = text.Truncator("<p>I &lt;3 python, what about you?</p>") truncator = text.Truncator("<p>I &lt;3 python, what about you?</p>")
@ -292,19 +298,17 @@ class TestUtilsText(SimpleTestCase):
bigger_len = text.Truncator.MAX_LENGTH_HTML + 1 bigger_len = text.Truncator.MAX_LENGTH_HTML + 1
valid_html = "<p>Joel is a slug</p>" # 4 words valid_html = "<p>Joel is a slug</p>" # 4 words
perf_test_values = [ perf_test_values = [
("</a" + "\t" * (max_len - 6) + "//>", None), ("</a" + "\t" * (max_len - 6) + "//>", "</a>"),
("</p" + "\t" * bigger_len + "//>", "</p" + "\t" * (max_len - 3) + ""), ("</p" + "\t" * bigger_len + "//>", "</p>"),
("&" * max_len, None), # no change ("&" * max_len, ""),
("&" * bigger_len, "&" * max_len + ""), ("&" * bigger_len, ""),
("_X<<<<<<<<<<<>", None), ("_X<<<<<<<<<<<>", "_X&lt;&lt;&lt;&lt;&lt;&lt;&lt;&lt;&lt;&lt;&lt;&gt;"),
(valid_html * bigger_len, valid_html * 12 + "<p>Joel is…</p>"), # 50 words (valid_html * bigger_len, valid_html * 12 + "<p>Joel is…</p>"), # 50 words
] ]
for value, expected in perf_test_values: for value, expected in perf_test_values:
with self.subTest(value=value): with self.subTest(value=value):
truncator = text.Truncator(value) truncator = text.Truncator(value)
self.assertEqual( self.assertEqual(expected, truncator.words(50, html=True))
expected if expected else value, truncator.words(50, html=True)
)
def test_wrap(self): def test_wrap(self):
digits = "1234 67 9" digits = "1234 67 9"