Fixed #33195 -- Refactored urlize() based on a class.

This allows easier customization/
2021-10-14 19:27:31 +02:00 · 2021-10-14 19:27:31 +02:00 · e567670b1a
parent 4a58dfd9db
commit e567670b1a
1 changed files with 97 additions and 74 deletions
--- a/django/utils/html.py
+++ b/django/utils/html.py
@ -15,17 +15,6 @@ from django.utils.regex_helper import _lazy_re_compile
 from django.utils.safestring import SafeData, SafeString, mark_safe
 from django.utils.text import normalize_newlines
 # Configuration for urlize() function.
 TRAILING_PUNCTUATION_CHARS = '.,:;!'
 WRAPPING_PUNCTUATION = [('(', ')'), ('[', ']')]
 word_split_re = _lazy_re_compile(r'''([\s<>"']+)''')
 simple_url_re = _lazy_re_compile(r'^https?://\[?\w', re.IGNORECASE)
 simple_url_2_re = _lazy_re_compile(
    r'^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)($|/.*)$',
    re.IGNORECASE
 )
@keep_lazy(str, SafeString)
 def escape(text):
@ -229,48 +218,118 @@ def smart_urlquote(url):
    return urlunsplit((scheme, netloc, path, query, fragment))
-@keep_lazy_text
+class Urlizer:
 def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
    """
    Convert any URLs in text into clickable links.
-    Works on http://, https://, www. links, and also on links ending in one of
+    Work on http://, https://, www. links, and also on links ending in one of
    the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org).
    Links can have trailing punctuation (periods, commas, close-parens) and
    leading punctuation (opening parens) and it'll still do the right thing.
    """
    trailing_punctuation_chars = '.,:;!'
    wrapping_punctuation = [('(', ')'), ('[', ']')]
-    If trim_url_limit is not None, truncate the URLs in the link text longer
+    simple_url_re = _lazy_re_compile(r'^https?://\[?\w', re.IGNORECASE)
-    than this limit to trim_url_limit - 1 characters and append an ellipsis.
+    simple_url_2_re = _lazy_re_compile(
        r'^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)($|/.*)$',
        re.IGNORECASE
    )
    word_split_re = _lazy_re_compile(r'''([\s<>"']+)''')
    mailto_template = 'mailto:{local}@{domain}'
    url_template = '<a href="{href}"{attrs}>{url}</a>'
    def __call__(self, text, trim_url_limit=None, nofollow=False, autoescape=False):
        """
        If trim_url_limit is not None, truncate the URLs in the link text
        longer than this limit to trim_url_limit - 1 characters and append an
        ellipsis.
        If nofollow is True, give the links a rel="nofollow" attribute.
        If autoescape is True, autoescape the link text and URLs.
        """
-    safe_input = isinstance(text, SafeData)
+        self.trim_url_limit = trim_url_limit
        self.nofollow = nofollow
        self.autoescape = autoescape
        self.safe_input = isinstance(text, SafeData)
-    def trim_url(x, limit=trim_url_limit):
+        words = self.word_split_re.split(str(text))
-        if limit is None or len(x) <= limit:
+        return ''.join([
            self.handle_word(word) for word in words
        ])
    def handle_word(self, word):
        if '.' in word or '@' in word or ':' in word:
            # lead: Punctuation trimmed from the beginning of the word.
            # middle: State of the word.
            # trail: Punctuation trimmed from the end of the word.
            lead, middle, trail = self.trim_punctuation(word)
            # Make URL we want to point to.
            url = None
            nofollow_attr = ' rel="nofollow"' if self.nofollow else ''
            if self.simple_url_re.match(middle):
                url = smart_urlquote(html.unescape(middle))
            elif self.simple_url_2_re.match(middle):
                url = smart_urlquote('http://%s' % html.unescape(middle))
            elif ':' not in middle and self.is_email_simple(middle):
                local, domain = middle.rsplit('@', 1)
                try:
                    domain = punycode(domain)
                except UnicodeError:
                    return word
                url = self.mailto_template.format(local=local, domain=domain)
                nofollow_attr = ''
            # Make link.
            if url:
                trimmed = self.trim_url(middle)
                if self.autoescape and not self.safe_input:
                    lead, trail = escape(lead), escape(trail)
                    trimmed = escape(trimmed)
                middle = self.url_template.format(
                    href=escape(url),
                    attrs=nofollow_attr,
                    url=trimmed,
                )
                return mark_safe(f'{lead}{middle}{trail}')
            else:
                if self.safe_input:
                    return mark_safe(word)
                elif self.autoescape:
                    return escape(word)
        elif self.safe_input:
            return mark_safe(word)
        elif self.autoescape:
            return escape(word)
        return word
    def trim_url(self, x):
        if self.trim_url_limit is None or len(x) <= self.trim_url_limit:
            return x
-        return '%s…' % x[:max(0, limit - 1)]
+        return '%s…' % x[:max(0, self.trim_url_limit - 1)]
-    def trim_punctuation(lead, middle, trail):
+    def trim_punctuation(self, word):
        """
-        Trim trailing and wrapping punctuation from `middle`. Return the items
+        Trim trailing and wrapping punctuation from `word`. Return the items of
-        of the new state.
+        the new state.
        """
        lead, middle, trail = '', word, ''
        # Continue trimming until middle remains unchanged.
        trimmed_something = True
        while trimmed_something:
            trimmed_something = False
            # Trim wrapping punctuation.
-            for opening, closing in WRAPPING_PUNCTUATION:
+            for opening, closing in self.wrapping_punctuation:
                if middle.startswith(opening):
                    middle = middle[len(opening):]
                    lead += opening
                    trimmed_something = True
                # Keep parentheses at the end only if they're balanced.
-                if (middle.endswith(closing) and
+                if (
-                        middle.count(closing) == middle.count(opening) + 1):
+                    middle.endswith(closing) and
                    middle.count(closing) == middle.count(opening) + 1
                ):
                    middle = middle[:-len(closing)]
                    trail = closing + trail
                    trimmed_something = True
@ -278,7 +337,7 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
            # as encoded entities contain ';'). Unescape entities to avoid
            # breaking them by removing ';'.
            middle_unescaped = html.unescape(middle)
-            stripped = middle_unescaped.rstrip(TRAILING_PUNCTUATION_CHARS)
+            stripped = middle_unescaped.rstrip(self.trailing_punctuation_chars)
            if middle_unescaped != stripped:
                punctuation_count = len(middle_unescaped) - len(stripped)
                trail = middle[-punctuation_count:] + trail
@ -286,6 +345,7 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
                trimmed_something = True
        return lead, middle, trail
    @staticmethod
    def is_email_simple(value):
        """Return True if value looks like an email address."""
        # An @ must be in the middle of the value.
@ -301,50 +361,13 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
            return False
        return True
    words = word_split_re.split(str(text))
    for i, word in enumerate(words):
        if '.' in word or '@' in word or ':' in word:
            # lead: Current punctuation trimmed from the beginning of the word.
            # middle: Current state of the word.
            # trail: Current punctuation trimmed from the end of the word.
            lead, middle, trail = '', word, ''
            # Deal with punctuation.
            lead, middle, trail = trim_punctuation(lead, middle, trail)
-            # Make URL we want to point to.
+urlizer = Urlizer()
            url = None
            nofollow_attr = ' rel="nofollow"' if nofollow else ''
            if simple_url_re.match(middle):
                url = smart_urlquote(html.unescape(middle))
            elif simple_url_2_re.match(middle):
                url = smart_urlquote('http://%s' % html.unescape(middle))
            elif ':' not in middle and is_email_simple(middle):
                local, domain = middle.rsplit('@', 1)
                try:
                    domain = punycode(domain)
                except UnicodeError:
                    continue
                url = 'mailto:%s@%s' % (local, domain)
                nofollow_attr = ''
-            # Make link.
+
-            if url:
+@keep_lazy_text
-                trimmed = trim_url(middle)
+def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
-                if autoescape and not safe_input:
+    return urlizer(text, trim_url_limit=trim_url_limit, nofollow=nofollow, autoescape=autoescape)
                    lead, trail = escape(lead), escape(trail)
                    trimmed = escape(trimmed)
                middle = '<a href="%s"%s>%s</a>' % (escape(url), nofollow_attr, trimmed)
                words[i] = mark_safe('%s%s%s' % (lead, middle, trail))
            else:
                if safe_input:
                    words[i] = mark_safe(word)
                elif autoescape:
                    words[i] = escape(word)
        elif safe_input:
            words[i] = mark_safe(word)
        elif autoescape:
            words[i] = escape(word)
    return ''.join(words)
 def avoid_wrapping(value):