diff --git a/django/utils/html.py b/django/utils/html.py
index 2c8c1cc79ec..cb4757818e0 100644
--- a/django/utils/html.py
+++ b/django/utils/html.py
@@ -15,17 +15,6 @@ from django.utils.regex_helper import _lazy_re_compile
from django.utils.safestring import SafeData, SafeString, mark_safe
from django.utils.text import normalize_newlines
-# Configuration for urlize() function.
-TRAILING_PUNCTUATION_CHARS = '.,:;!'
-WRAPPING_PUNCTUATION = [('(', ')'), ('[', ']')]
-
-word_split_re = _lazy_re_compile(r'''([\s<>"']+)''')
-simple_url_re = _lazy_re_compile(r'^https?://\[?\w', re.IGNORECASE)
-simple_url_2_re = _lazy_re_compile(
- r'^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)($|/.*)$',
- re.IGNORECASE
-)
-
@keep_lazy(str, SafeString)
def escape(text):
@@ -229,48 +218,118 @@ def smart_urlquote(url):
return urlunsplit((scheme, netloc, path, query, fragment))
-@keep_lazy_text
-def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
+class Urlizer:
"""
Convert any URLs in text into clickable links.
- Works on http://, https://, www. links, and also on links ending in one of
+ Work on http://, https://, www. links, and also on links ending in one of
the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org).
Links can have trailing punctuation (periods, commas, close-parens) and
leading punctuation (opening parens) and it'll still do the right thing.
-
- If trim_url_limit is not None, truncate the URLs in the link text longer
- than this limit to trim_url_limit - 1 characters and append an ellipsis.
-
- If nofollow is True, give the links a rel="nofollow" attribute.
-
- If autoescape is True, autoescape the link text and URLs.
"""
- safe_input = isinstance(text, SafeData)
+ trailing_punctuation_chars = '.,:;!'
+ wrapping_punctuation = [('(', ')'), ('[', ']')]
- def trim_url(x, limit=trim_url_limit):
- if limit is None or len(x) <= limit:
+ simple_url_re = _lazy_re_compile(r'^https?://\[?\w', re.IGNORECASE)
+ simple_url_2_re = _lazy_re_compile(
+ r'^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)($|/.*)$',
+ re.IGNORECASE
+ )
+ word_split_re = _lazy_re_compile(r'''([\s<>"']+)''')
+
+ mailto_template = 'mailto:{local}@{domain}'
+ url_template = '{url}'
+
+ def __call__(self, text, trim_url_limit=None, nofollow=False, autoescape=False):
+ """
+ If trim_url_limit is not None, truncate the URLs in the link text
+ longer than this limit to trim_url_limit - 1 characters and append an
+ ellipsis.
+
+ If nofollow is True, give the links a rel="nofollow" attribute.
+
+ If autoescape is True, autoescape the link text and URLs.
+ """
+ self.trim_url_limit = trim_url_limit
+ self.nofollow = nofollow
+ self.autoescape = autoescape
+ self.safe_input = isinstance(text, SafeData)
+
+ words = self.word_split_re.split(str(text))
+ return ''.join([
+ self.handle_word(word) for word in words
+ ])
+
+ def handle_word(self, word):
+ if '.' in word or '@' in word or ':' in word:
+ # lead: Punctuation trimmed from the beginning of the word.
+ # middle: State of the word.
+ # trail: Punctuation trimmed from the end of the word.
+ lead, middle, trail = self.trim_punctuation(word)
+ # Make URL we want to point to.
+ url = None
+ nofollow_attr = ' rel="nofollow"' if self.nofollow else ''
+ if self.simple_url_re.match(middle):
+ url = smart_urlquote(html.unescape(middle))
+ elif self.simple_url_2_re.match(middle):
+ url = smart_urlquote('http://%s' % html.unescape(middle))
+ elif ':' not in middle and self.is_email_simple(middle):
+ local, domain = middle.rsplit('@', 1)
+ try:
+ domain = punycode(domain)
+ except UnicodeError:
+ return word
+ url = self.mailto_template.format(local=local, domain=domain)
+ nofollow_attr = ''
+ # Make link.
+ if url:
+ trimmed = self.trim_url(middle)
+ if self.autoescape and not self.safe_input:
+ lead, trail = escape(lead), escape(trail)
+ trimmed = escape(trimmed)
+ middle = self.url_template.format(
+ href=escape(url),
+ attrs=nofollow_attr,
+ url=trimmed,
+ )
+ return mark_safe(f'{lead}{middle}{trail}')
+ else:
+ if self.safe_input:
+ return mark_safe(word)
+ elif self.autoescape:
+ return escape(word)
+ elif self.safe_input:
+ return mark_safe(word)
+ elif self.autoescape:
+ return escape(word)
+ return word
+
+ def trim_url(self, x):
+ if self.trim_url_limit is None or len(x) <= self.trim_url_limit:
return x
- return '%s…' % x[:max(0, limit - 1)]
+ return '%s…' % x[:max(0, self.trim_url_limit - 1)]
- def trim_punctuation(lead, middle, trail):
+ def trim_punctuation(self, word):
"""
- Trim trailing and wrapping punctuation from `middle`. Return the items
- of the new state.
+ Trim trailing and wrapping punctuation from `word`. Return the items of
+ the new state.
"""
+ lead, middle, trail = '', word, ''
# Continue trimming until middle remains unchanged.
trimmed_something = True
while trimmed_something:
trimmed_something = False
# Trim wrapping punctuation.
- for opening, closing in WRAPPING_PUNCTUATION:
+ for opening, closing in self.wrapping_punctuation:
if middle.startswith(opening):
middle = middle[len(opening):]
lead += opening
trimmed_something = True
# Keep parentheses at the end only if they're balanced.
- if (middle.endswith(closing) and
- middle.count(closing) == middle.count(opening) + 1):
+ if (
+ middle.endswith(closing) and
+ middle.count(closing) == middle.count(opening) + 1
+ ):
middle = middle[:-len(closing)]
trail = closing + trail
trimmed_something = True
@@ -278,7 +337,7 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
# as encoded entities contain ';'). Unescape entities to avoid
# breaking them by removing ';'.
middle_unescaped = html.unescape(middle)
- stripped = middle_unescaped.rstrip(TRAILING_PUNCTUATION_CHARS)
+ stripped = middle_unescaped.rstrip(self.trailing_punctuation_chars)
if middle_unescaped != stripped:
punctuation_count = len(middle_unescaped) - len(stripped)
trail = middle[-punctuation_count:] + trail
@@ -286,6 +345,7 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
trimmed_something = True
return lead, middle, trail
+ @staticmethod
def is_email_simple(value):
"""Return True if value looks like an email address."""
# An @ must be in the middle of the value.
@@ -301,50 +361,13 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
return False
return True
- words = word_split_re.split(str(text))
- for i, word in enumerate(words):
- if '.' in word or '@' in word or ':' in word:
- # lead: Current punctuation trimmed from the beginning of the word.
- # middle: Current state of the word.
- # trail: Current punctuation trimmed from the end of the word.
- lead, middle, trail = '', word, ''
- # Deal with punctuation.
- lead, middle, trail = trim_punctuation(lead, middle, trail)
- # Make URL we want to point to.
- url = None
- nofollow_attr = ' rel="nofollow"' if nofollow else ''
- if simple_url_re.match(middle):
- url = smart_urlquote(html.unescape(middle))
- elif simple_url_2_re.match(middle):
- url = smart_urlquote('http://%s' % html.unescape(middle))
- elif ':' not in middle and is_email_simple(middle):
- local, domain = middle.rsplit('@', 1)
- try:
- domain = punycode(domain)
- except UnicodeError:
- continue
- url = 'mailto:%s@%s' % (local, domain)
- nofollow_attr = ''
+urlizer = Urlizer()
- # Make link.
- if url:
- trimmed = trim_url(middle)
- if autoescape and not safe_input:
- lead, trail = escape(lead), escape(trail)
- trimmed = escape(trimmed)
- middle = '%s' % (escape(url), nofollow_attr, trimmed)
- words[i] = mark_safe('%s%s%s' % (lead, middle, trail))
- else:
- if safe_input:
- words[i] = mark_safe(word)
- elif autoescape:
- words[i] = escape(word)
- elif safe_input:
- words[i] = mark_safe(word)
- elif autoescape:
- words[i] = escape(word)
- return ''.join(words)
+
+@keep_lazy_text
+def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
+ return urlizer(text, trim_url_limit=trim_url_limit, nofollow=nofollow, autoescape=autoescape)
def avoid_wrapping(value):