Fixed #30686 -- Used Python HTMLParser in utils.text.Truncator.

2023-01-03 20:48:06 +00:00 · 2023-01-03 20:48:06 +00:00 · 6ee37ada32
parent 70f39e46f8
commit 6ee37ada32
4 changed files with 149 additions and 125 deletions
--- a/django/utils/text.py
+++ b/django/utils/text.py
@ -2,12 +2,20 @@ import gzip
 import re
 import secrets
 import unicodedata
+from collections import deque
 from gzip import GzipFile
 from gzip import compress as gzip_compress
+from html import escape
+from html.parser import HTMLParser
 from io import BytesIO

 from django.core.exceptions import SuspiciousFileOperation
-from django.utils.functional import SimpleLazyObject, keep_lazy_text, lazy
+from django.utils.functional import (
+    SimpleLazyObject,
+    cached_property,
+    keep_lazy_text,
+    lazy,
+)
 from django.utils.regex_helper import _lazy_re_compile
 from django.utils.translation import gettext as _
 from django.utils.translation import gettext_lazy, pgettext
@ -80,6 +88,101 @@ def add_truncation_text(text, truncate=None):
    return f"{text}{truncate}"


+def calculate_truncate_chars_length(length, replacement):
+    """
+    Return the number of characters to truncate to: ``length`` minus the
+    length of the truncation text built from ``replacement``.
+
+    The countdown stops as soon as it reaches zero, so the result is never
+    negative for a positive ``length``.
+    """
+    truncate_len = length
+    for char in add_truncation_text("", replacement):
+        # Combining characters are not counted towards the length.
+        if not unicodedata.combining(char):
+            truncate_len -= 1
+            if truncate_len == 0:
+                break
+    return truncate_len
+
+
+class TruncateHTMLParser(HTMLParser):
+    """
+    Base HTMLParser that copies the parsed markup into ``self.output`` until
+    ``length`` units of text have been emitted, then appends the truncation
+    text and closes any tags that are still open.
+
+    Subclasses must provide ``process(data)``, returning a ``(data, output)``
+    pair: ``len(data)`` is the number of units the text node consumes and
+    ``output`` is the already-limited string to append.
+    """
+
+    class TruncationCompleted(Exception):
+        # Raised from the handlers to stop parsing once the limit is reached;
+        # caught in feed().
+        pass
+
+    def __init__(self, *, length, replacement, convert_charrefs=True):
+        super().__init__(convert_charrefs=convert_charrefs)
+        # Currently open non-void tags, most recently opened first.
+        self.tags = deque()
+        # Truncated markup accumulated so far.
+        self.output = ""
+        # Units still available before truncation happens.
+        self.remaining = length
+        # Passed to add_truncation_text() when the limit is exceeded.
+        self.replacement = replacement
+
+    @cached_property
+    def void_elements(self):
+        # Imported lazily inside the property rather than at module level
+        # (presumably to avoid a circular import with django.utils.html;
+        # confirm).
+        from django.utils.html import VOID_ELEMENTS
+
+        return VOID_ELEMENTS
+
+    def handle_startendtag(self, tag, attrs):
+        # Self-closing syntax (<tag/>): emit the start tag, and for non-void
+        # elements also emit an explicit end tag.
+        self.handle_starttag(tag, attrs)
+        if tag not in self.void_elements:
+            self.handle_endtag(tag)
+
+    def handle_starttag(self, tag, attrs):
+        # Copy the start tag verbatim from the input.
+        self.output += self.get_starttag_text()
+        # Void elements never get an end tag, so they are not tracked.
+        if tag not in self.void_elements:
+            self.tags.appendleft(tag)
+
+    def handle_endtag(self, tag):
+        # End tags of void elements are dropped from the output.
+        if tag not in self.void_elements:
+            self.output += f"</{tag}>"
+            try:
+                # Remove the most recently opened matching tag.
+                self.tags.remove(tag)
+            except ValueError:
+                # End tag without a matching open tag: nothing to untrack.
+                pass
+
+    def handle_data(self, data):
+        # process() is supplied by subclasses; see the class docstring.
+        data, output = self.process(data)
+        data_len = len(data)
+        if self.remaining < data_len:
+            # This text node exceeds the budget: emit the limited output plus
+            # the truncation text and stop parsing.
+            self.remaining = 0
+            self.output += add_truncation_text(output, self.replacement)
+            raise self.TruncationCompleted
+        self.remaining -= data_len
+        self.output += output
+
+    def feed(self, data):
+        """
+        Parse ``data``, stopping early once truncation has completed.
+
+        The parser is reset afterwards in both cases; the result is read from
+        ``self.output``.
+        """
+        try:
+            super().feed(data)
+        except self.TruncationCompleted:
+            # Close the tags that are still open, innermost first.
+            self.output += "".join([f"</{tag}>" for tag in self.tags])
+            self.tags.clear()
+            self.reset()
+        else:
+            # Parsing finished without reaching the truncation point.
+            # NOTE(review): HTMLParser.reset() discards unprocessed input, so
+            # text the parser is still buffering is not flushed by a later
+            # close() -- confirm this is intended.
+            self.reset()
+
+
+class TruncateCharsHTMLParser(TruncateHTMLParser):
+    """
+    Truncate HTML by number of text characters.
+
+    The budget passed to the base class is ``length`` reduced by the length
+    of the truncation text (see calculate_truncate_chars_length()).
+    """
+
+    def __init__(self, *, length, replacement, convert_charrefs=True):
+        # The requested length, before subtracting the truncation text.
+        self.length = length
+        # Total number of text characters passed to process() so far.
+        self.processed_chars = 0
+        super().__init__(
+            length=calculate_truncate_chars_length(length, replacement),
+            replacement=replacement,
+            convert_charrefs=convert_charrefs,
+        )
+
+    def process(self, data):
+        """
+        Return ``(data, output)`` where ``output`` is the escaped prefix of
+        ``data`` that fits in the remaining budget.
+        """
+        self.processed_chars += len(data)
+        # Exactly ``length`` characters seen, and output plus this text node
+        # are as long as the raw input: emit the node in full, without
+        # truncation text, and stop. (Appears to cover text that fits exactly
+        # and ends the input -- confirm.)
+        # NOTE(review): ``data`` is appended here without escape().
+        if (self.processed_chars == self.length) and (
+            len(self.output) + len(data) == len(self.rawdata)
+        ):
+            self.output += data
+            raise self.TruncationCompleted
+        output = escape("".join(data[: self.remaining]))
+        return data, output
+
+
+class TruncateWordsHTMLParser(TruncateHTMLParser):
+    """Truncate HTML by number of words."""
+
+    def process(self, data):
+        """
+        Return ``(words, output)`` where ``words`` is ``data`` split into a
+        list of words and ``output`` is the escaped first ``self.remaining``
+        words joined by single spaces.
+        """
+        # Split only on whitespace runs that sit between two non-whitespace
+        # characters, so leading and trailing whitespace stays attached to the
+        # first and last word. Returning a list makes len() in handle_data()
+        # count words.
+        data = re.split(r"(?<=\S)\s+(?=\S)", data)
+        output = escape(" ".join(data[: self.remaining]))
+        return data, output
+
+
 class Truncator(SimpleLazyObject):
    """
    An object used to truncate text, either by characters or words.
@ -108,19 +211,16 @@ class Truncator(SimpleLazyObject):
            return ""
        text = unicodedata.normalize("NFC", self._wrapped)

-        # Calculate the length to truncate to (max length - end_text length)
-        truncate_len = length
-        for char in add_truncation_text("", truncate):
-            if not unicodedata.combining(char):
-                truncate_len -= 1
-                if truncate_len == 0:
-                    break
        if html:
-            return self._truncate_html(length, truncate, text, truncate_len, False)
-        return self._text_chars(length, truncate, text, truncate_len)
+            parser = TruncateCharsHTMLParser(length=length, replacement=truncate)
+            parser.feed(text)
+            parser.close()
+            return parser.output
+        return self._text_chars(length, truncate, text)

-    def _text_chars(self, length, truncate, text, truncate_len):
+    def _text_chars(self, length, truncate, text):
        """Truncate a string after a certain number of chars."""
+        truncate_len = calculate_truncate_chars_length(length, truncate)
        s_len = 0
        end_index = None
        for i, char in enumerate(text):
@ -149,7 +249,10 @@ class Truncator(SimpleLazyObject):
        if length <= 0:
            return ""
        if html:
-            return self._truncate_html(length, truncate, self._wrapped, length, True)
+            parser = TruncateWordsHTMLParser(length=length, replacement=truncate)
+            parser.feed(self._wrapped)
+            parser.close()
+            return parser.output
        return self._text_words(length, truncate)

    def _text_words(self, length, truncate):
@ -164,94 +267,6 @@ class Truncator(SimpleLazyObject):
            return add_truncation_text(" ".join(words), truncate)
        return " ".join(words)

-    def _truncate_html(self, length, truncate, text, truncate_len, words):
-        """
-        Truncate HTML to a certain number of chars (not counting tags and
-        comments), or, if words is True, then to a certain number of words.
-        Close opened tags if they were correctly closed in the given HTML.
-
-        Preserve newlines in the HTML.
-        """
-        if words and length <= 0:
-            return ""
-
-        size_limited = False
-        if len(text) > self.MAX_LENGTH_HTML:
-            text = text[: self.MAX_LENGTH_HTML]
-            size_limited = True
-
-        html4_singlets = (
-            "br",
-            "col",
-            "link",
-            "base",
-            "img",
-            "param",
-            "area",
-            "hr",
-            "input",
-        )
-
-        # Count non-HTML chars/words and keep note of open tags
-        pos = 0
-        end_text_pos = 0
-        current_len = 0
-        open_tags = []
-
-        regex = re_words if words else re_chars
-
-        while current_len <= length:
-            m = regex.search(text, pos)
-            if not m:
-                # Checked through whole string
-                break
-            pos = m.end(0)
-            if m[1]:
-                # It's an actual non-HTML word or char
-                current_len += 1
-                if current_len == truncate_len:
-                    end_text_pos = pos
-                continue
-            # Check for tag
-            tag = re_tag.match(m[0])
-            if not tag or current_len >= truncate_len:
-                # Don't worry about non tags or tags after our truncate point
-                continue
-            closing_tag, tagname, self_closing = tag.groups()
-            # Element names are always case-insensitive
-            tagname = tagname.lower()
-            if self_closing or tagname in html4_singlets:
-                pass
-            elif closing_tag:
-                # Check for match in open tags list
-                try:
-                    i = open_tags.index(tagname)
-                except ValueError:
-                    pass
-                else:
-                    # SGML: An end tag closes, back to the matching start tag,
-                    # all unclosed intervening start tags with omitted end tags
-                    open_tags = open_tags[i + 1 :]
-            else:
-                # Add it to the start of the open tags list
-                open_tags.insert(0, tagname)
-
-        truncate_text = add_truncation_text("", truncate)
-
-        if current_len <= length:
-            if size_limited and truncate_text:
-                text += truncate_text
-            return text
-
-        out = text[:end_text_pos]
-        if truncate_text:
-            out += truncate_text
-        # Close any tags still open
-        for tag in open_tags:
-            out += "</%s>" % tag
-        # Return string
-        return out
-

@keep_lazy_text
 def get_valid_filename(name):
--- a/docs/releases/5.1.txt
+++ b/docs/releases/5.1.txt
@ -368,6 +368,11 @@ Miscellaneous
  :meth:`~django.test.SimpleTestCase.assertInHTML` now add ``": "`` to the
  ``msg_prefix``. This is consistent with the behavior of other assertions.

+* ``django.utils.text.Truncator`` used by :tfilter:`truncatechars_html` and
+  :tfilter:`truncatewords_html` template filters now uses
+  :py:class:`html.parser.HTMLParser` subclasses. This results in a more robust
+  and faster operation, but there may be small differences in the output.
+
 .. _deprecated-features-5.1:

 Features deprecated in 5.1
--- a/tests/template_tests/filter_tests/test_truncatewords_html.py
+++ b/tests/template_tests/filter_tests/test_truncatewords_html.py
@ -24,7 +24,7 @@ class FunctionTests(SimpleTestCase):
            truncatewords_html(
                '<p>one <a href="#">two - three <br>four</a> five</p>', 4
            ),
-            '<p>one <a href="#">two - three …</a></p>',
+            '<p>one <a href="#">two - three <br> …</a></p>',
        )

    def test_truncate3(self):
@ -32,7 +32,7 @@ class FunctionTests(SimpleTestCase):
            truncatewords_html(
                '<p>one <a href="#">two - three <br>four</a> five</p>', 5
            ),
-            '<p>one <a href="#">two - three <br>four …</a></p>',
+            '<p>one <a href="#">two - three <br>four</a> …</p>',
        )

    def test_truncate4(self):
@ -53,7 +53,7 @@ class FunctionTests(SimpleTestCase):
            truncatewords_html(
                "<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo est&aacute;?</i>", 3
            ),
-            "<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo …</i>",
+            "<i>Buenos días! ¿Cómo …</i>",
        )

    def test_invalid_arg(self):
--- a/tests/utils_tests/test_text.py
+++ b/tests/utils_tests/test_text.py
@ -111,7 +111,7 @@ class TestUtilsText(SimpleTestCase):
            truncator.chars(46, html=True),
        )
        self.assertEqual(
-            '<p id="par"><strong><em>The quick brown fox jumped over the lazy dog.</em>'
+            '<p id="par"><strong><em>The quick brown fox jumped over the lazy dog…</em>'
            "</strong></p>",
            truncator.chars(45, html=True),
        )
@ -120,7 +120,7 @@ class TestUtilsText(SimpleTestCase):
            truncator.chars(10, html=True),
        )
        self.assertEqual(
-            "…",
+            '<p id="par"><strong><em>…</em></strong></p>',
            truncator.chars(1, html=True),
        )
        self.assertEqual("", truncator.chars(0, html=True))
@ -142,18 +142,16 @@ class TestUtilsText(SimpleTestCase):
        bigger_len = text.Truncator.MAX_LENGTH_HTML + 1
        valid_html = "<p>Joel is a slug</p>"  # 14 chars
        perf_test_values = [
-            ("</a" + "\t" * (max_len - 6) + "//>", None),
-            ("</p" + "\t" * bigger_len + "//>", "</p" + "\t" * 6 + "…"),
-            ("&" * bigger_len, "&" * 9 + "…"),
-            ("_X<<<<<<<<<<<>", None),
+            ("</a" + "\t" * (max_len - 6) + "//>", "</a>"),
+            ("</p" + "\t" * bigger_len + "//>", "</p>"),
+            ("&" * bigger_len, ""),
+            ("_X<<<<<<<<<<<>", "_X&lt;&lt;&lt;&lt;&lt;&lt;&lt;…"),
            (valid_html * bigger_len, "<p>Joel is a…</p>"),  # 10 chars
        ]
        for value, expected in perf_test_values:
            with self.subTest(value=value):
                truncator = text.Truncator(value)
-                self.assertEqual(
-                    expected if expected else value, truncator.chars(10, html=True)
-                )
+                self.assertEqual(expected, truncator.chars(10, html=True))

    def test_truncate_chars_html_with_newline_inside_tag(self):
        truncator = text.Truncator(
@ -181,7 +179,7 @@ class TestUtilsText(SimpleTestCase):
            "<br>The <hr/>quick <em>brown…</em>", truncator.chars(16, html=True)
        )
        self.assertEqual("<br>The <hr/>q…", truncator.chars(6, html=True))
-        self.assertEqual("<br>The …", truncator.chars(5, html=True))
+        self.assertEqual("<br>The <hr/>…", truncator.chars(5, html=True))
        self.assertEqual("<br>The…", truncator.chars(4, html=True))
        self.assertEqual("<br>Th…", truncator.chars(3, html=True))

@ -190,11 +188,19 @@ class TestUtilsText(SimpleTestCase):
            "<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo est&aacute;?</i>"
        )
        self.assertEqual(
-            "<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo…</i>",
+            "<i>Buenos días! ¿Cómo está?</i>",
            truncator.chars(40, html=True),
        )
+        self.assertEqual(
+            "<i>Buenos días…</i>",
+            truncator.chars(12, html=True),
+        )
+        self.assertEqual(
+            "<i>Buenos días! ¿Cómo está…</i>",
+            truncator.chars(24, html=True),
+        )
        truncator = text.Truncator("<p>I &lt;3 python, what about you?</p>")
-        self.assertEqual("<p>I &lt;3 python,…</p>", truncator.chars(16, html=True))
+        self.assertEqual("<p>I &lt;3 python, wh…</p>", truncator.chars(16, html=True))

    def test_truncate_words(self):
        truncator = text.Truncator("The quick brown fox jumped over the lazy dog.")
@ -242,7 +248,7 @@ class TestUtilsText(SimpleTestCase):
            "<p>The  quick \t brown fox jumped over the lazy dog.</p>"
        )
        self.assertEqual(
-            "<p>The  quick \t brown fox…</p>",
+            "<p>The quick brown fox…</p>",
            truncator.words(4, html=True),
        )

@ -277,7 +283,7 @@ class TestUtilsText(SimpleTestCase):
            "<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo est&aacute;?</i>"
        )
        self.assertEqual(
-            "<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo…</i>",
+            "<i>Buenos días! ¿Cómo…</i>",
            truncator.words(3, html=True),
        )
        truncator = text.Truncator("<p>I &lt;3 python, what about you?</p>")
@ -292,19 +298,17 @@ class TestUtilsText(SimpleTestCase):
        bigger_len = text.Truncator.MAX_LENGTH_HTML + 1
        valid_html = "<p>Joel is a slug</p>"  # 4 words
        perf_test_values = [
-            ("</a" + "\t" * (max_len - 6) + "//>", None),
-            ("</p" + "\t" * bigger_len + "//>", "</p" + "\t" * (max_len - 3) + "…"),
-            ("&" * max_len, None),  # no change
-            ("&" * bigger_len, "&" * max_len + "…"),
-            ("_X<<<<<<<<<<<>", None),
+            ("</a" + "\t" * (max_len - 6) + "//>", "</a>"),
+            ("</p" + "\t" * bigger_len + "//>", "</p>"),
+            ("&" * max_len, ""),
+            ("&" * bigger_len, ""),
+            ("_X<<<<<<<<<<<>", "_X&lt;&lt;&lt;&lt;&lt;&lt;&lt;&lt;&lt;&lt;&lt;&gt;"),
            (valid_html * bigger_len, valid_html * 12 + "<p>Joel is…</p>"),  # 50 words
        ]
        for value, expected in perf_test_values:
            with self.subTest(value=value):
                truncator = text.Truncator(value)
-                self.assertEqual(
-                    expected if expected else value, truncator.words(50, html=True)
-                )
+                self.assertEqual(expected, truncator.words(50, html=True))

    def test_wrap(self):
        digits = "1234 67 9"