Fixed #30686 -- Used Python HTMLParser in utils.text.Truncator.

2023-01-03 20:48:06 +00:00 · 2023-01-03 20:48:06 +00:00 · 6ee37ada32
parent 70f39e46f8
commit 6ee37ada32
4 changed files with 149 additions and 125 deletions
--- a/django/utils/text.py
+++ b/django/utils/text.py
@ -2,12 +2,20 @@ import gzip
 import re
 import secrets
 import unicodedata
 from collections import deque
 from gzip import GzipFile
 from gzip import compress as gzip_compress
 from html import escape
 from html.parser import HTMLParser
 from io import BytesIO
 from django.core.exceptions import SuspiciousFileOperation
-from django.utils.functional import SimpleLazyObject, keep_lazy_text, lazy
+from django.utils.functional import (
    SimpleLazyObject,
    cached_property,
    keep_lazy_text,
    lazy,
 )
 from django.utils.regex_helper import _lazy_re_compile
 from django.utils.translation import gettext as _
 from django.utils.translation import gettext_lazy, pgettext
@ -80,6 +88,101 @@ def add_truncation_text(text, truncate=None):
    return f"{text}{truncate}"
 def calculate_truncate_chars_length(length, replacement):
    """
    Return the number of characters left over from ``length`` once the
    truncation marker has been accounted for.

    The marker is whatever add_truncation_text("", replacement) produces.
    Every non-combining character in it consumes one unit of ``length``;
    combining characters are skipped. Counting stops early if the budget
    reaches exactly zero.
    """
    budget = length
    marker = add_truncation_text("", replacement)
    for symbol in marker:
        if unicodedata.combining(symbol):
            # Combining marks are not counted against the budget.
            continue
        budget -= 1
        if budget == 0:
            break
    return budget
 class TruncateHTMLParser(HTMLParser):
    """
    HTMLParser that copies markup into ``self.output`` until a budget of
    text units (``self.remaining``) is used up.

    Subclasses provide ``process(data)``, which must return a pair: the
    units to count for this chunk of text (anything supporting ``len()``)
    and the string to emit for it.
    """

    class TruncationCompleted(Exception):
        """Raised from a handler to stop parsing once truncation is done."""

    def __init__(self, *, length, replacement, convert_charrefs=True):
        super().__init__(convert_charrefs=convert_charrefs)
        self.replacement = replacement
        self.remaining = length
        self.output = ""
        # Names of open non-void elements, most recently opened first.
        self.tags = deque()

    @cached_property
    def void_elements(self):
        # Imported on first access instead of at module import time.
        from django.utils.html import VOID_ELEMENTS

        return VOID_ELEMENTS

    def handle_startendtag(self, tag, attrs):
        # <tag/> syntax: emit the start tag, then close it right away unless
        # the element is void.
        self.handle_starttag(tag, attrs)
        is_void = tag in self.void_elements
        if not is_void:
            self.handle_endtag(tag)

    def handle_starttag(self, tag, attrs):
        # The start tag is copied verbatim from the input.
        self.output += self.get_starttag_text()
        if tag in self.void_elements:
            return
        self.tags.appendleft(tag)

    def handle_endtag(self, tag):
        if tag in self.void_elements:
            return
        self.output += f"</{tag}>"
        try:
            self.tags.remove(tag)
        except ValueError:
            # End tag for an element that is not recorded as open.
            pass

    def handle_data(self, data):
        units, rendered = self.process(data)
        unit_count = len(units)
        if unit_count > self.remaining:
            # This chunk exceeds the budget: emit the text prepared by
            # process() with the truncation text and stop.
            self.remaining = 0
            self.output += add_truncation_text(rendered, self.replacement)
            raise self.TruncationCompleted
        self.output += rendered
        self.remaining -= unit_count

    def feed(self, data):
        try:
            super().feed(data)
        except self.TruncationCompleted:
            # Close every element still open at the truncation point.
            closers = [f"</{tag}>" for tag in self.tags]
            self.output += "".join(closers)
            self.tags.clear()
        # Reset the parser state both after truncation and after a feed that
        # finished without truncating.
        self.reset()
 class TruncateCharsHTMLParser(TruncateHTMLParser):
    """Truncating parser whose unit of length is a single character."""

    def __init__(self, *, length, replacement, convert_charrefs=True):
        # ``length`` is the caller's limit; the base class receives the
        # budget computed by calculate_truncate_chars_length().
        self.length = length
        self.processed_chars = 0
        chars_budget = calculate_truncate_chars_length(length, replacement)
        super().__init__(
            length=chars_budget,
            replacement=replacement,
            convert_charrefs=convert_charrefs,
        )

    def process(self, data):
        chunk_len = len(data)
        self.processed_chars += chunk_len
        hit_limit_exactly = self.processed_chars == self.length
        if hit_limit_exactly and len(self.output) + chunk_len == len(self.rawdata):
            # The text seen so far is exactly ``length`` characters and the
            # output plus this chunk is as long as the raw input: append the
            # chunk as is and stop, without any truncation text.
            self.output += data
            raise self.TruncationCompleted
        visible = "".join(data[: self.remaining])
        return data, escape(visible)
 class TruncateWordsHTMLParser(TruncateHTMLParser):
    """Truncating parser whose unit of length is a whitespace-separated word."""

    def process(self, data):
        # Split on whitespace runs that sit between two non-space characters.
        words = re.split(r"(?<=\S)\s+(?=\S)", data)
        kept = words[: self.remaining]
        # The kept words are re-joined with single spaces and HTML-escaped.
        return words, escape(" ".join(kept))
 class Truncator(SimpleLazyObject):
    """
    An object used to truncate text, either by characters or words.
@ -108,19 +211,16 @@ class Truncator(SimpleLazyObject):
            return ""
        text = unicodedata.normalize("NFC", self._wrapped)
        # Calculate the length to truncate to (max length - end_text length)
        truncate_len = length
        for char in add_truncation_text("", truncate):
            if not unicodedata.combining(char):
                truncate_len -= 1
                if truncate_len == 0:
                    break
        if html:
-            return self._truncate_html(length, truncate, text, truncate_len, False)
+            parser = TruncateCharsHTMLParser(length=length, replacement=truncate)
-        return self._text_chars(length, truncate, text, truncate_len)
+            parser.feed(text)
            parser.close()
            return parser.output
        return self._text_chars(length, truncate, text)
-    def _text_chars(self, length, truncate, text, truncate_len):
+    def _text_chars(self, length, truncate, text):
        """Truncate a string after a certain number of chars."""
        truncate_len = calculate_truncate_chars_length(length, truncate)
        s_len = 0
        end_index = None
        for i, char in enumerate(text):
@ -149,7 +249,10 @@ class Truncator(SimpleLazyObject):
        if length <= 0:
            return ""
        if html:
-            return self._truncate_html(length, truncate, self._wrapped, length, True)
+            parser = TruncateWordsHTMLParser(length=length, replacement=truncate)
            parser.feed(self._wrapped)
            parser.close()
            return parser.output
        return self._text_words(length, truncate)
    def _text_words(self, length, truncate):
@ -164,94 +267,6 @@ class Truncator(SimpleLazyObject):
            return add_truncation_text(" ".join(words), truncate)
        return " ".join(words)
    def _truncate_html(self, length, truncate, text, truncate_len, words):
        """
        Truncate HTML to a certain number of chars (not counting tags and
        comments), or, if words is True, then to a certain number of words.
        Close opened tags if they were correctly closed in the given HTML.
        Preserve newlines in the HTML.

        ``truncate_len`` is the count of chars/words after which the text is
        cut, while ``length`` is the count that must be exceeded before any
        truncation happens. Input longer than ``self.MAX_LENGTH_HTML`` is cut
        to that size before it is scanned.
        """
        if words and length <= 0:
            return ""
        # Cap the amount of text scanned; remember that the cap was applied so
        # the truncation text can still be appended further down.
        size_limited = False
        if len(text) > self.MAX_LENGTH_HTML:
            text = text[: self.MAX_LENGTH_HTML]
            size_limited = True
        # Tags that are never added to the open tags list.
        html4_singlets = (
            "br",
            "col",
            "link",
            "base",
            "img",
            "param",
            "area",
            "hr",
            "input",
        )
        # Count non-HTML chars/words and keep note of open tags
        pos = 0
        end_text_pos = 0
        current_len = 0
        open_tags = []
        regex = re_words if words else re_chars
        # Scan until one unit more than ``length`` has been counted, which
        # proves that the text really needs truncating.
        while current_len <= length:
            m = regex.search(text, pos)
            if not m:
                # Checked through whole string
                break
            pos = m.end(0)
            if m[1]:
                # It's an actual non-HTML word or char
                current_len += 1
                if current_len == truncate_len:
                    # Remember where the kept text ends.
                    end_text_pos = pos
                continue
            # Check for tag
            tag = re_tag.match(m[0])
            if not tag or current_len >= truncate_len:
                # Don't worry about non tags or tags after our truncate point
                continue
            closing_tag, tagname, self_closing = tag.groups()
            # Element names are always case-insensitive
            tagname = tagname.lower()
            if self_closing or tagname in html4_singlets:
                pass
            elif closing_tag:
                # Check for match in open tags list
                try:
                    i = open_tags.index(tagname)
                except ValueError:
                    pass
                else:
                    # SGML: An end tag closes, back to the matching start tag,
                    # all unclosed intervening start tags with omitted end tags
                    open_tags = open_tags[i + 1 :]
            else:
                # Add it to the start of the open tags list
                open_tags.insert(0, tagname)
        truncate_text = add_truncation_text("", truncate)
        if current_len <= length:
            # The scanned text fits: return it whole, adding the truncation
            # text only when the input was cut by the size cap above.
            if size_limited and truncate_text:
                text += truncate_text
            return text
        out = text[:end_text_pos]
        if truncate_text:
            out += truncate_text
        # Close any tags still open
        for tag in open_tags:
            out += "</%s>" % tag
        # Return string
        return out
@keep_lazy_text
 def get_valid_filename(name):
--- a/docs/releases/5.1.txt
+++ b/docs/releases/5.1.txt
@ -368,6 +368,11 @@ Miscellaneous
  :meth:`~django.test.SimpleTestCase.assertInHTML` now add ``": "`` to the
  ``msg_prefix``. This is consistent with the behavior of other assertions.
 * ``django.utils.text.Truncator`` used by :tfilter:`truncatechars_html` and
  :tfilter:`truncatewords_html` template filters now uses
  :py:class:`html.parser.HTMLParser` subclasses. This results in a more robust
  and faster operation, but there may be small differences in the output.
 .. _deprecated-features-5.1:
 Features deprecated in 5.1
--- a/tests/template_tests/filter_tests/test_truncatewords_html.py
+++ b/tests/template_tests/filter_tests/test_truncatewords_html.py
@ -24,7 +24,7 @@ class FunctionTests(SimpleTestCase):
            truncatewords_html(
                '<p>one <a href="#">two - three <br>four</a> five</p>', 4
            ),
-            '<p>one <a href="#">two - three …</a></p>',
+            '<p>one <a href="#">two - three <br> …</a></p>',
        )
    def test_truncate3(self):
@ -32,7 +32,7 @@ class FunctionTests(SimpleTestCase):
            truncatewords_html(
                '<p>one <a href="#">two - three <br>four</a> five</p>', 5
            ),
-            '<p>one <a href="#">two - three <br>four …</a></p>',
+            '<p>one <a href="#">two - three <br>four</a> …</p>',
        )
    def test_truncate4(self):
@ -53,7 +53,7 @@ class FunctionTests(SimpleTestCase):
            truncatewords_html(
                "<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo est&aacute;?</i>", 3
            ),
-            "<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo …</i>",
+            "<i>Buenos días! ¿Cómo …</i>",
        )
    def test_invalid_arg(self):
--- a/tests/utils_tests/test_text.py
+++ b/tests/utils_tests/test_text.py
@ -111,7 +111,7 @@ class TestUtilsText(SimpleTestCase):
            truncator.chars(46, html=True),
        )
        self.assertEqual(
-            '<p id="par"><strong><em>The quick brown fox jumped over the lazy dog.</em>'
+            '<p id="par"><strong><em>The quick brown fox jumped over the lazy dog…</em>'
            "</strong></p>",
            truncator.chars(45, html=True),
        )
@ -120,7 +120,7 @@ class TestUtilsText(SimpleTestCase):
            truncator.chars(10, html=True),
        )
        self.assertEqual(
-            "…",
+            '<p id="par"><strong><em>…</em></strong></p>',
            truncator.chars(1, html=True),
        )
        self.assertEqual("", truncator.chars(0, html=True))
@ -142,18 +142,16 @@ class TestUtilsText(SimpleTestCase):
        bigger_len = text.Truncator.MAX_LENGTH_HTML + 1
        valid_html = "<p>Joel is a slug</p>"  # 14 chars
        perf_test_values = [
-            ("</a" + "\t" * (max_len - 6) + "//>", None),
+            ("</a" + "\t" * (max_len - 6) + "//>", "</a>"),
-            ("</p" + "\t" * bigger_len + "//>", "</p" + "\t" * 6 + "…"),
+            ("</p" + "\t" * bigger_len + "//>", "</p>"),
-            ("&" * bigger_len, "&" * 9 + "…"),
+            ("&" * bigger_len, ""),
-            ("_X<<<<<<<<<<<>", None),
+            ("_X<<<<<<<<<<<>", "_X&lt;&lt;&lt;&lt;&lt;&lt;&lt;…"),
            (valid_html * bigger_len, "<p>Joel is a…</p>"),  # 10 chars
        ]
        for value, expected in perf_test_values:
            with self.subTest(value=value):
                truncator = text.Truncator(value)
-                self.assertEqual(
+                self.assertEqual(expected, truncator.chars(10, html=True))
                    expected if expected else value, truncator.chars(10, html=True)
                )
    def test_truncate_chars_html_with_newline_inside_tag(self):
        truncator = text.Truncator(
@ -181,7 +179,7 @@ class TestUtilsText(SimpleTestCase):
            "<br>The <hr/>quick <em>brown…</em>", truncator.chars(16, html=True)
        )
        self.assertEqual("<br>The <hr/>q…", truncator.chars(6, html=True))
-        self.assertEqual("<br>The …", truncator.chars(5, html=True))
+        self.assertEqual("<br>The <hr/>…", truncator.chars(5, html=True))
        self.assertEqual("<br>The…", truncator.chars(4, html=True))
        self.assertEqual("<br>Th…", truncator.chars(3, html=True))
@ -190,11 +188,19 @@ class TestUtilsText(SimpleTestCase):
            "<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo est&aacute;?</i>"
        )
        self.assertEqual(
-            "<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo…</i>",
+            "<i>Buenos días! ¿Cómo está?</i>",
            truncator.chars(40, html=True),
        )
        self.assertEqual(
            "<i>Buenos días…</i>",
            truncator.chars(12, html=True),
        )
        self.assertEqual(
            "<i>Buenos días! ¿Cómo está…</i>",
            truncator.chars(24, html=True),
        )
        truncator = text.Truncator("<p>I &lt;3 python, what about you?</p>")
-        self.assertEqual("<p>I &lt;3 python,…</p>", truncator.chars(16, html=True))
+        self.assertEqual("<p>I &lt;3 python, wh…</p>", truncator.chars(16, html=True))
    def test_truncate_words(self):
        truncator = text.Truncator("The quick brown fox jumped over the lazy dog.")
@ -242,7 +248,7 @@ class TestUtilsText(SimpleTestCase):
            "<p>The  quick \t brown fox jumped over the lazy dog.</p>"
        )
        self.assertEqual(
-            "<p>The  quick \t brown fox…</p>",
+            "<p>The quick brown fox…</p>",
            truncator.words(4, html=True),
        )
@ -277,7 +283,7 @@ class TestUtilsText(SimpleTestCase):
            "<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo est&aacute;?</i>"
        )
        self.assertEqual(
-            "<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo…</i>",
+            "<i>Buenos días! ¿Cómo…</i>",
            truncator.words(3, html=True),
        )
        truncator = text.Truncator("<p>I &lt;3 python, what about you?</p>")
@ -292,19 +298,17 @@ class TestUtilsText(SimpleTestCase):
        bigger_len = text.Truncator.MAX_LENGTH_HTML + 1
        valid_html = "<p>Joel is a slug</p>"  # 4 words
        perf_test_values = [
-            ("</a" + "\t" * (max_len - 6) + "//>", None),
+            ("</a" + "\t" * (max_len - 6) + "//>", "</a>"),
-            ("</p" + "\t" * bigger_len + "//>", "</p" + "\t" * (max_len - 3) + "…"),
+            ("</p" + "\t" * bigger_len + "//>", "</p>"),
-            ("&" * max_len, None),  # no change
+            ("&" * max_len, ""),
-            ("&" * bigger_len, "&" * max_len + "…"),
+            ("&" * bigger_len, ""),
-            ("_X<<<<<<<<<<<>", None),
+            ("_X<<<<<<<<<<<>", "_X&lt;&lt;&lt;&lt;&lt;&lt;&lt;&lt;&lt;&lt;&lt;&gt;"),
            (valid_html * bigger_len, valid_html * 12 + "<p>Joel is…</p>"),  # 50 words
        ]
        for value, expected in perf_test_values:
            with self.subTest(value=value):
                truncator = text.Truncator(value)
-                self.assertEqual(
+                self.assertEqual(expected, truncator.words(50, html=True))
                    expected if expected else value, truncator.words(50, html=True)
                )
    def test_wrap(self):
        digits = "1234 67 9"