From 6ee37ada3241ed263d8d1c2901b030d964cbd161 Mon Sep 17 00:00:00 2001 From: David Smith Date: Tue, 3 Jan 2023 20:48:06 +0000 Subject: [PATCH] Fixed #30686 -- Used Python HTMLParser in utils.text.Truncator. --- django/utils/text.py | 215 ++++++++++-------- docs/releases/5.1.txt | 5 + .../filter_tests/test_truncatewords_html.py | 6 +- tests/utils_tests/test_text.py | 48 ++-- 4 files changed, 149 insertions(+), 125 deletions(-) diff --git a/django/utils/text.py b/django/utils/text.py index 374fd78f927..9560ebc6784 100644 --- a/django/utils/text.py +++ b/django/utils/text.py @@ -2,12 +2,20 @@ import gzip import re import secrets import unicodedata +from collections import deque from gzip import GzipFile from gzip import compress as gzip_compress +from html import escape +from html.parser import HTMLParser from io import BytesIO from django.core.exceptions import SuspiciousFileOperation -from django.utils.functional import SimpleLazyObject, keep_lazy_text, lazy +from django.utils.functional import ( + SimpleLazyObject, + cached_property, + keep_lazy_text, + lazy, +) from django.utils.regex_helper import _lazy_re_compile from django.utils.translation import gettext as _ from django.utils.translation import gettext_lazy, pgettext @@ -80,6 +88,101 @@ def add_truncation_text(text, truncate=None): return f"{text}{truncate}" +def calculate_truncate_chars_length(length, replacement): + truncate_len = length + for char in add_truncation_text("", replacement): + if not unicodedata.combining(char): + truncate_len -= 1 + if truncate_len == 0: + break + return truncate_len + + +class TruncateHTMLParser(HTMLParser): + class TruncationCompleted(Exception): + pass + + def __init__(self, *, length, replacement, convert_charrefs=True): + super().__init__(convert_charrefs=convert_charrefs) + self.tags = deque() + self.output = "" + self.remaining = length + self.replacement = replacement + + @cached_property + def void_elements(self): + from django.utils.html import VOID_ELEMENTS + + return VOID_ELEMENTS + + def handle_startendtag(self, tag, attrs): + self.handle_starttag(tag, attrs) + if tag not in self.void_elements: + self.handle_endtag(tag) + + def handle_starttag(self, tag, attrs): + self.output += self.get_starttag_text() + if tag not in self.void_elements: + self.tags.appendleft(tag) + + def handle_endtag(self, tag): + if tag not in self.void_elements: + self.output += f"" + try: + self.tags.remove(tag) + except ValueError: + pass + + def handle_data(self, data): + data, output = self.process(data) + data_len = len(data) + if self.remaining < data_len: + self.remaining = 0 + self.output += add_truncation_text(output, self.replacement) + raise self.TruncationCompleted + self.remaining -= data_len + self.output += output + + def feed(self, data): + try: + super().feed(data) + except self.TruncationCompleted: + self.output += "".join([f"" for tag in self.tags]) + self.tags.clear() + self.reset() + else: + # No data was handled. + self.reset() + + +class TruncateCharsHTMLParser(TruncateHTMLParser): + def __init__(self, *, length, replacement, convert_charrefs=True): + self.length = length + self.processed_chars = 0 + super().__init__( + length=calculate_truncate_chars_length(length, replacement), + replacement=replacement, + convert_charrefs=convert_charrefs, + ) + + def process(self, data): + self.processed_chars += len(data) + if (self.processed_chars == self.length) and ( + len(self.output) + len(data) == len(self.rawdata) + ): + self.output += data + raise self.TruncationCompleted + output = escape("".join(data[: self.remaining])) + return data, output + + +class TruncateWordsHTMLParser(TruncateHTMLParser): + def process(self, data): + data = re.split(r"(?<=\S)\s+(?=\S)", data) + output = escape(" ".join(data[: self.remaining])) + return data, output + + class Truncator(SimpleLazyObject): """ An object used to truncate text, either by characters or words. @@ -108,19 +211,16 @@ class Truncator(SimpleLazyObject): return "" text = unicodedata.normalize("NFC", self._wrapped) - # Calculate the length to truncate to (max length - end_text length) - truncate_len = length - for char in add_truncation_text("", truncate): - if not unicodedata.combining(char): - truncate_len -= 1 - if truncate_len == 0: - break if html: - return self._truncate_html(length, truncate, text, truncate_len, False) - return self._text_chars(length, truncate, text, truncate_len) + parser = TruncateCharsHTMLParser(length=length, replacement=truncate) + parser.feed(text) + parser.close() + return parser.output + return self._text_chars(length, truncate, text) - def _text_chars(self, length, truncate, text, truncate_len): + def _text_chars(self, length, truncate, text): """Truncate a string after a certain number of chars.""" + truncate_len = calculate_truncate_chars_length(length, truncate) s_len = 0 end_index = None for i, char in enumerate(text): @@ -149,7 +249,10 @@ class Truncator(SimpleLazyObject): if length <= 0: return "" if html: - return self._truncate_html(length, truncate, self._wrapped, length, True) + parser = TruncateWordsHTMLParser(length=length, replacement=truncate) + parser.feed(self._wrapped) + parser.close() + return parser.output return self._text_words(length, truncate) def _text_words(self, length, truncate): @@ -164,94 +267,6 @@ class Truncator(SimpleLazyObject): return add_truncation_text(" ".join(words), truncate) return " ".join(words) - def _truncate_html(self, length, truncate, text, truncate_len, words): - """ - Truncate HTML to a certain number of chars (not counting tags and - comments), or, if words is True, then to a certain number of words. - Close opened tags if they were correctly closed in the given HTML. - - Preserve newlines in the HTML. - """ - if words and length <= 0: - return "" - - size_limited = False - if len(text) > self.MAX_LENGTH_HTML: - text = text[: self.MAX_LENGTH_HTML] - size_limited = True - - html4_singlets = ( - "br", - "col", - "link", - "base", - "img", - "param", - "area", - "hr", - "input", - ) - - # Count non-HTML chars/words and keep note of open tags - pos = 0 - end_text_pos = 0 - current_len = 0 - open_tags = [] - - regex = re_words if words else re_chars - - while current_len <= length: - m = regex.search(text, pos) - if not m: - # Checked through whole string - break - pos = m.end(0) - if m[1]: - # It's an actual non-HTML word or char - current_len += 1 - if current_len == truncate_len: - end_text_pos = pos - continue - # Check for tag - tag = re_tag.match(m[0]) - if not tag or current_len >= truncate_len: - # Don't worry about non tags or tags after our truncate point - continue - closing_tag, tagname, self_closing = tag.groups() - # Element names are always case-insensitive - tagname = tagname.lower() - if self_closing or tagname in html4_singlets: - pass - elif closing_tag: - # Check for match in open tags list - try: - i = open_tags.index(tagname) - except ValueError: - pass - else: - # SGML: An end tag closes, back to the matching start tag, - # all unclosed intervening start tags with omitted end tags - open_tags = open_tags[i + 1 :] - else: - # Add it to the start of the open tags list - open_tags.insert(0, tagname) - - truncate_text = add_truncation_text("", truncate) - - if current_len <= length: - if size_limited and truncate_text: - text += truncate_text - return text - - out = text[:end_text_pos] - if truncate_text: - out += truncate_text - # Close any tags still open - for tag in open_tags: - out += "" % tag - # Return string - return out - @keep_lazy_text def get_valid_filename(name): diff --git a/docs/releases/5.1.txt b/docs/releases/5.1.txt index 701d6865325..aca1281a98a 100644 --- a/docs/releases/5.1.txt +++ b/docs/releases/5.1.txt @@ -368,6 +368,11 @@ Miscellaneous :meth:`~django.test.SimpleTestCase.assertInHTML` now add ``": "`` to the ``msg_prefix``. This is consistent with the behavior of other assertions. +* ``django.utils.text.Truncator`` used by :tfilter:`truncatechars_html` and + :tfilter:`truncatewords_html` template filters now uses + :py:class:`html.parser.HTMLParser` subclasses. This results in a more robust + and faster operation, but there may be small differences in the output. + .. _deprecated-features-5.1: Features deprecated in 5.1 diff --git a/tests/template_tests/filter_tests/test_truncatewords_html.py b/tests/template_tests/filter_tests/test_truncatewords_html.py index 32b7c81a762..0cf41d83aee 100644 --- a/tests/template_tests/filter_tests/test_truncatewords_html.py +++ b/tests/template_tests/filter_tests/test_truncatewords_html.py @@ -24,7 +24,7 @@ class FunctionTests(SimpleTestCase): truncatewords_html( '

one two - three
four
five

', 4 ), - '

one two - three …

', + '

one two - three

', ) def test_truncate3(self): @@ -32,7 +32,7 @@ class FunctionTests(SimpleTestCase): truncatewords_html( '

one two - three
four
five

', 5 ), - '

one two - three
four …

', + '

one two - three
four

', ) def test_truncate4(self): @@ -53,7 +53,7 @@ class FunctionTests(SimpleTestCase): truncatewords_html( "Buenos días! ¿Cómo está?", 3 ), - "Buenos días! ¿Cómo …", + "Buenos días! ¿Cómo …", ) def test_invalid_arg(self): diff --git a/tests/utils_tests/test_text.py b/tests/utils_tests/test_text.py index 6004712bf29..b38d8238c52 100644 --- a/tests/utils_tests/test_text.py +++ b/tests/utils_tests/test_text.py @@ -111,7 +111,7 @@ class TestUtilsText(SimpleTestCase): truncator.chars(46, html=True), ) self.assertEqual( - '

The quick brown fox jumped over the lazy dog.' + '

The quick brown fox jumped over the lazy dog…' "

", truncator.chars(45, html=True), ) @@ -120,7 +120,7 @@ class TestUtilsText(SimpleTestCase): truncator.chars(10, html=True), ) self.assertEqual( - "…", + '

', truncator.chars(1, html=True), ) self.assertEqual("", truncator.chars(0, html=True)) @@ -142,18 +142,16 @@ class TestUtilsText(SimpleTestCase): bigger_len = text.Truncator.MAX_LENGTH_HTML + 1 valid_html = "

Joel is a slug

" # 14 chars perf_test_values = [ - ("", None), - ("", "", None), + ("", ""), + ("", "

"), + ("&" * bigger_len, ""), + ("_X<<<<<<<<<<<>", "_X<<<<<<<…"), (valid_html * bigger_len, "

Joel is a…

"), # 10 chars ] for value, expected in perf_test_values: with self.subTest(value=value): truncator = text.Truncator(value) - self.assertEqual( - expected if expected else value, truncator.chars(10, html=True) - ) + self.assertEqual(expected, truncator.chars(10, html=True)) def test_truncate_chars_html_with_newline_inside_tag(self): truncator = text.Truncator( @@ -181,7 +179,7 @@ class TestUtilsText(SimpleTestCase): "
The
quick brown…", truncator.chars(16, html=True) ) self.assertEqual("
The
q…", truncator.chars(6, html=True)) - self.assertEqual("
The …", truncator.chars(5, html=True)) + self.assertEqual("
The
…", truncator.chars(5, html=True)) self.assertEqual("
The…", truncator.chars(4, html=True)) self.assertEqual("
Th…", truncator.chars(3, html=True)) @@ -190,11 +188,19 @@ class TestUtilsText(SimpleTestCase): "Buenos días! ¿Cómo está?" ) self.assertEqual( - "Buenos días! ¿Cómo…", + "Buenos días! ¿Cómo está?", truncator.chars(40, html=True), ) + self.assertEqual( + "Buenos días…", + truncator.chars(12, html=True), + ) + self.assertEqual( + "Buenos días! ¿Cómo está…", + truncator.chars(24, html=True), + ) truncator = text.Truncator("

I <3 python, what about you?

") - self.assertEqual("

I <3 python,…

", truncator.chars(16, html=True)) + self.assertEqual("

I <3 python, wh…

", truncator.chars(16, html=True)) def test_truncate_words(self): truncator = text.Truncator("The quick brown fox jumped over the lazy dog.") @@ -242,7 +248,7 @@ class TestUtilsText(SimpleTestCase): "

The quick \t brown fox jumped over the lazy dog.

" ) self.assertEqual( - "

The quick \t brown fox…

", + "

The quick brown fox…

", truncator.words(4, html=True), ) @@ -277,7 +283,7 @@ class TestUtilsText(SimpleTestCase): "Buenos días! ¿Cómo está?" ) self.assertEqual( - "Buenos días! ¿Cómo…", + "Buenos días! ¿Cómo…", truncator.words(3, html=True), ) truncator = text.Truncator("

I <3 python, what about you?

") @@ -292,19 +298,17 @@ class TestUtilsText(SimpleTestCase): bigger_len = text.Truncator.MAX_LENGTH_HTML + 1 valid_html = "

Joel is a slug

" # 4 words perf_test_values = [ - ("", None), - ("", "", None), + ("", ""), + ("", "

"), + ("&" * max_len, ""), + ("&" * bigger_len, ""), + ("_X<<<<<<<<<<<>", "_X<<<<<<<<<<<>"), (valid_html * bigger_len, valid_html * 12 + "

Joel is…

"), # 50 words ] for value, expected in perf_test_values: with self.subTest(value=value): truncator = text.Truncator(value) - self.assertEqual( - expected if expected else value, truncator.words(50, html=True) - ) + self.assertEqual(expected, truncator.words(50, html=True)) def test_wrap(self): digits = "1234 67 9"