Fixed #33195 -- Refactored urlize() based on a class.

This allows easier customization/
2021-10-14 19:27:31 +02:00 · 2021-10-14 19:27:31 +02:00 · e567670b1a
parent 4a58dfd9db
commit e567670b1a
1 changed files with 97 additions and 74 deletions
--- a/django/utils/html.py
+++ b/django/utils/html.py
@ -15,17 +15,6 @@ from django.utils.regex_helper import _lazy_re_compile
 from django.utils.safestring import SafeData, SafeString, mark_safe
 from django.utils.text import normalize_newlines

-# Configuration for urlize() function.
-TRAILING_PUNCTUATION_CHARS = '.,:;!'
-WRAPPING_PUNCTUATION = [('(', ')'), ('[', ']')]
-
-word_split_re = _lazy_re_compile(r'''([\s<>"']+)''')
-simple_url_re = _lazy_re_compile(r'^https?://\[?\w', re.IGNORECASE)
-simple_url_2_re = _lazy_re_compile(
-    r'^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)($|/.*)$',
-    re.IGNORECASE
-)
-

@keep_lazy(str, SafeString)
 def escape(text):
@ -229,48 +218,118 @@ def smart_urlquote(url):
    return urlunsplit((scheme, netloc, path, query, fragment))


-@keep_lazy_text
-def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
+class Urlizer:
    """
    Convert any URLs in text into clickable links.

-    Works on http://, https://, www. links, and also on links ending in one of
+    Work on http://, https://, www. links, and also on links ending in one of
    the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org).
    Links can have trailing punctuation (periods, commas, close-parens) and
    leading punctuation (opening parens) and it'll still do the right thing.
-
-    If trim_url_limit is not None, truncate the URLs in the link text longer
-    than this limit to trim_url_limit - 1 characters and append an ellipsis.
-
-    If nofollow is True, give the links a rel="nofollow" attribute.
-
-    If autoescape is True, autoescape the link text and URLs.
    """
-    safe_input = isinstance(text, SafeData)
+    trailing_punctuation_chars = '.,:;!'
+    wrapping_punctuation = [('(', ')'), ('[', ']')]

-    def trim_url(x, limit=trim_url_limit):
-        if limit is None or len(x) <= limit:
+    simple_url_re = _lazy_re_compile(r'^https?://\[?\w', re.IGNORECASE)
+    simple_url_2_re = _lazy_re_compile(
+        r'^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)($|/.*)$',
+        re.IGNORECASE
+    )
+    word_split_re = _lazy_re_compile(r'''([\s<>"']+)''')
+
+    mailto_template = 'mailto:{local}@{domain}'
+    url_template = '<a href="{href}"{attrs}>{url}</a>'
+
+    def __call__(self, text, trim_url_limit=None, nofollow=False, autoescape=False):
+        """
+        If trim_url_limit is not None, truncate the URLs in the link text
+        longer than this limit to trim_url_limit - 1 characters and append an
+        ellipsis.
+
+        If nofollow is True, give the links a rel="nofollow" attribute.
+
+        If autoescape is True, autoescape the link text and URLs.
+        """
+        self.trim_url_limit = trim_url_limit
+        self.nofollow = nofollow
+        self.autoescape = autoescape
+        self.safe_input = isinstance(text, SafeData)
+
+        words = self.word_split_re.split(str(text))
+        return ''.join([
+            self.handle_word(word) for word in words
+        ])
+
+    def handle_word(self, word):
+        if '.' in word or '@' in word or ':' in word:
+            # lead: Punctuation trimmed from the beginning of the word.
+            # middle: State of the word.
+            # trail: Punctuation trimmed from the end of the word.
+            lead, middle, trail = self.trim_punctuation(word)
+            # Make URL we want to point to.
+            url = None
+            nofollow_attr = ' rel="nofollow"' if self.nofollow else ''
+            if self.simple_url_re.match(middle):
+                url = smart_urlquote(html.unescape(middle))
+            elif self.simple_url_2_re.match(middle):
+                url = smart_urlquote('http://%s' % html.unescape(middle))
+            elif ':' not in middle and self.is_email_simple(middle):
+                local, domain = middle.rsplit('@', 1)
+                try:
+                    domain = punycode(domain)
+                except UnicodeError:
+                    return word
+                url = self.mailto_template.format(local=local, domain=domain)
+                nofollow_attr = ''
+            # Make link.
+            if url:
+                trimmed = self.trim_url(middle)
+                if self.autoescape and not self.safe_input:
+                    lead, trail = escape(lead), escape(trail)
+                    trimmed = escape(trimmed)
+                middle = self.url_template.format(
+                    href=escape(url),
+                    attrs=nofollow_attr,
+                    url=trimmed,
+                )
+                return mark_safe(f'{lead}{middle}{trail}')
+            else:
+                if self.safe_input:
+                    return mark_safe(word)
+                elif self.autoescape:
+                    return escape(word)
+        elif self.safe_input:
+            return mark_safe(word)
+        elif self.autoescape:
+            return escape(word)
+        return word
+
+    def trim_url(self, x):
+        if self.trim_url_limit is None or len(x) <= self.trim_url_limit:
            return x
-        return '%s…' % x[:max(0, limit - 1)]
+        return '%s…' % x[:max(0, self.trim_url_limit - 1)]

-    def trim_punctuation(lead, middle, trail):
+    def trim_punctuation(self, word):
        """
-        Trim trailing and wrapping punctuation from `middle`. Return the items
-        of the new state.
+        Trim trailing and wrapping punctuation from `word`. Return the items of
+        the new state.
        """
+        lead, middle, trail = '', word, ''
        # Continue trimming until middle remains unchanged.
        trimmed_something = True
        while trimmed_something:
            trimmed_something = False
            # Trim wrapping punctuation.
-            for opening, closing in WRAPPING_PUNCTUATION:
+            for opening, closing in self.wrapping_punctuation:
                if middle.startswith(opening):
                    middle = middle[len(opening):]
                    lead += opening
                    trimmed_something = True
                # Keep parentheses at the end only if they're balanced.
-                if (middle.endswith(closing) and
-                        middle.count(closing) == middle.count(opening) + 1):
+                if (
+                    middle.endswith(closing) and
+                    middle.count(closing) == middle.count(opening) + 1
+                ):
                    middle = middle[:-len(closing)]
                    trail = closing + trail
                    trimmed_something = True
@ -278,7 +337,7 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
            # as encoded entities contain ';'). Unescape entities to avoid
            # breaking them by removing ';'.
            middle_unescaped = html.unescape(middle)
-            stripped = middle_unescaped.rstrip(TRAILING_PUNCTUATION_CHARS)
+            stripped = middle_unescaped.rstrip(self.trailing_punctuation_chars)
            if middle_unescaped != stripped:
                punctuation_count = len(middle_unescaped) - len(stripped)
                trail = middle[-punctuation_count:] + trail
@ -286,6 +345,7 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
                trimmed_something = True
        return lead, middle, trail

+    @staticmethod
    def is_email_simple(value):
        """Return True if value looks like an email address."""
        # An @ must be in the middle of the value.
@ -301,50 +361,13 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
            return False
        return True

-    words = word_split_re.split(str(text))
-    for i, word in enumerate(words):
-        if '.' in word or '@' in word or ':' in word:
-            # lead: Current punctuation trimmed from the beginning of the word.
-            # middle: Current state of the word.
-            # trail: Current punctuation trimmed from the end of the word.
-            lead, middle, trail = '', word, ''
-            # Deal with punctuation.
-            lead, middle, trail = trim_punctuation(lead, middle, trail)

-            # Make URL we want to point to.
-            url = None
-            nofollow_attr = ' rel="nofollow"' if nofollow else ''
-            if simple_url_re.match(middle):
-                url = smart_urlquote(html.unescape(middle))
-            elif simple_url_2_re.match(middle):
-                url = smart_urlquote('http://%s' % html.unescape(middle))
-            elif ':' not in middle and is_email_simple(middle):
-                local, domain = middle.rsplit('@', 1)
-                try:
-                    domain = punycode(domain)
-                except UnicodeError:
-                    continue
-                url = 'mailto:%s@%s' % (local, domain)
-                nofollow_attr = ''
+urlizer = Urlizer()

-            # Make link.
-            if url:
-                trimmed = trim_url(middle)
-                if autoescape and not safe_input:
-                    lead, trail = escape(lead), escape(trail)
-                    trimmed = escape(trimmed)
-                middle = '<a href="%s"%s>%s</a>' % (escape(url), nofollow_attr, trimmed)
-                words[i] = mark_safe('%s%s%s' % (lead, middle, trail))
-            else:
-                if safe_input:
-                    words[i] = mark_safe(word)
-                elif autoescape:
-                    words[i] = escape(word)
-        elif safe_input:
-            words[i] = mark_safe(word)
-        elif autoescape:
-            words[i] = escape(word)
-    return ''.join(words)
+
+@keep_lazy_text
+def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
+    return urlizer(text, trim_url_limit=trim_url_limit, nofollow=nofollow, autoescape=autoescape)


 def avoid_wrapping(value):