Fixed #33195 -- Refactored urlize() based on a class.

This allows easier customization.
This commit is contained in:
Claude Paroz 2021-10-14 19:27:31 +02:00 committed by Mariusz Felisiak
parent 4a58dfd9db
commit e567670b1a
1 changed files with 97 additions and 74 deletions

View File

@ -15,17 +15,6 @@ from django.utils.regex_helper import _lazy_re_compile
from django.utils.safestring import SafeData, SafeString, mark_safe from django.utils.safestring import SafeData, SafeString, mark_safe
from django.utils.text import normalize_newlines from django.utils.text import normalize_newlines
# Configuration for urlize() function.
TRAILING_PUNCTUATION_CHARS = '.,:;!'
WRAPPING_PUNCTUATION = [('(', ')'), ('[', ']')]
word_split_re = _lazy_re_compile(r'''([\s<>"']+)''')
simple_url_re = _lazy_re_compile(r'^https?://\[?\w', re.IGNORECASE)
simple_url_2_re = _lazy_re_compile(
r'^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)($|/.*)$',
re.IGNORECASE
)
@keep_lazy(str, SafeString) @keep_lazy(str, SafeString)
def escape(text): def escape(text):
@ -229,48 +218,118 @@ def smart_urlquote(url):
return urlunsplit((scheme, netloc, path, query, fragment)) return urlunsplit((scheme, netloc, path, query, fragment))
@keep_lazy_text class Urlizer:
def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
""" """
Convert any URLs in text into clickable links. Convert any URLs in text into clickable links.
Works on http://, https://, www. links, and also on links ending in one of Work on http://, https://, www. links, and also on links ending in one of
the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org). the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org).
Links can have trailing punctuation (periods, commas, close-parens) and Links can have trailing punctuation (periods, commas, close-parens) and
leading punctuation (opening parens) and it'll still do the right thing. leading punctuation (opening parens) and it'll still do the right thing.
"""
trailing_punctuation_chars = '.,:;!'
wrapping_punctuation = [('(', ')'), ('[', ']')]
If trim_url_limit is not None, truncate the URLs in the link text longer simple_url_re = _lazy_re_compile(r'^https?://\[?\w', re.IGNORECASE)
than this limit to trim_url_limit - 1 characters and append an ellipsis. simple_url_2_re = _lazy_re_compile(
r'^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)($|/.*)$',
re.IGNORECASE
)
word_split_re = _lazy_re_compile(r'''([\s<>"']+)''')
mailto_template = 'mailto:{local}@{domain}'
url_template = '<a href="{href}"{attrs}>{url}</a>'
def __call__(self, text, trim_url_limit=None, nofollow=False, autoescape=False):
"""
If trim_url_limit is not None, truncate the URLs in the link text
longer than this limit to trim_url_limit - 1 characters and append an
ellipsis.
If nofollow is True, give the links a rel="nofollow" attribute. If nofollow is True, give the links a rel="nofollow" attribute.
If autoescape is True, autoescape the link text and URLs. If autoescape is True, autoescape the link text and URLs.
""" """
safe_input = isinstance(text, SafeData) self.trim_url_limit = trim_url_limit
self.nofollow = nofollow
self.autoescape = autoescape
self.safe_input = isinstance(text, SafeData)
def trim_url(x, limit=trim_url_limit): words = self.word_split_re.split(str(text))
if limit is None or len(x) <= limit: return ''.join([
self.handle_word(word) for word in words
])
def handle_word(self, word):
    """
    Return `word` turned into an HTML anchor when it looks like a URL or an
    email address.

    Otherwise return the word marked safe (when the input was safe), escaped
    (when autoescaping is on), or unchanged. Reads the `nofollow`,
    `autoescape`, and `safe_input` attributes set on the instance by
    __call__().
    """
    # Only words containing one of these characters are link candidates.
    if any(char in word for char in '.@:'):
        # prefix: Punctuation peeled off the start of the word.
        # core: What is left of the word once punctuation is removed.
        # suffix: Punctuation peeled off the end of the word.
        prefix, core, suffix = self.trim_punctuation(word)
        # Work out the link target, if there is one.
        href = None
        rel = ' rel="nofollow"' if self.nofollow else ''
        if self.simple_url_re.match(core):
            href = smart_urlquote(html.unescape(core))
        elif self.simple_url_2_re.match(core):
            href = smart_urlquote('http://%s' % html.unescape(core))
        elif ':' not in core and self.is_email_simple(core):
            local, domain = core.rsplit('@', 1)
            try:
                domain = punycode(domain)
            except UnicodeError:
                # The domain can't be encoded: hand the word back as-is.
                return word
            href = self.mailto_template.format(local=local, domain=domain)
            # mailto: links never get rel="nofollow".
            rel = ''
        # Build the anchor.
        if href:
            label = self.trim_url(core)
            if self.autoescape and not self.safe_input:
                prefix, suffix = escape(prefix), escape(suffix)
                label = escape(label)
            anchor = self.url_template.format(
                href=escape(href),
                attrs=rel,
                url=label,
            )
            return mark_safe(f'{prefix}{anchor}{suffix}')
    # Nothing was linkified: apply the plain-word handling.
    if self.safe_input:
        return mark_safe(word)
    if self.autoescape:
        return escape(word)
    return word
def trim_url(self, x):
if self.trim_url_limit is None or len(x) <= self.trim_url_limit:
return x return x
return '%s…' % x[:max(0, limit - 1)] return '%s…' % x[:max(0, self.trim_url_limit - 1)]
def trim_punctuation(lead, middle, trail): def trim_punctuation(self, word):
""" """
Trim trailing and wrapping punctuation from `middle`. Return the items Trim trailing and wrapping punctuation from `word`. Return the items of
of the new state. the new state.
""" """
lead, middle, trail = '', word, ''
# Continue trimming until middle remains unchanged. # Continue trimming until middle remains unchanged.
trimmed_something = True trimmed_something = True
while trimmed_something: while trimmed_something:
trimmed_something = False trimmed_something = False
# Trim wrapping punctuation. # Trim wrapping punctuation.
for opening, closing in WRAPPING_PUNCTUATION: for opening, closing in self.wrapping_punctuation:
if middle.startswith(opening): if middle.startswith(opening):
middle = middle[len(opening):] middle = middle[len(opening):]
lead += opening lead += opening
trimmed_something = True trimmed_something = True
# Keep parentheses at the end only if they're balanced. # Keep parentheses at the end only if they're balanced.
if (middle.endswith(closing) and if (
middle.count(closing) == middle.count(opening) + 1): middle.endswith(closing) and
middle.count(closing) == middle.count(opening) + 1
):
middle = middle[:-len(closing)] middle = middle[:-len(closing)]
trail = closing + trail trail = closing + trail
trimmed_something = True trimmed_something = True
@ -278,7 +337,7 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
# as encoded entities contain ';'). Unescape entities to avoid # as encoded entities contain ';'). Unescape entities to avoid
# breaking them by removing ';'. # breaking them by removing ';'.
middle_unescaped = html.unescape(middle) middle_unescaped = html.unescape(middle)
stripped = middle_unescaped.rstrip(TRAILING_PUNCTUATION_CHARS) stripped = middle_unescaped.rstrip(self.trailing_punctuation_chars)
if middle_unescaped != stripped: if middle_unescaped != stripped:
punctuation_count = len(middle_unescaped) - len(stripped) punctuation_count = len(middle_unescaped) - len(stripped)
trail = middle[-punctuation_count:] + trail trail = middle[-punctuation_count:] + trail
@ -286,6 +345,7 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
trimmed_something = True trimmed_something = True
return lead, middle, trail return lead, middle, trail
@staticmethod
def is_email_simple(value): def is_email_simple(value):
"""Return True if value looks like an email address.""" """Return True if value looks like an email address."""
# An @ must be in the middle of the value. # An @ must be in the middle of the value.
@ -301,50 +361,13 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
return False return False
return True return True
words = word_split_re.split(str(text))
for i, word in enumerate(words):
if '.' in word or '@' in word or ':' in word:
# lead: Current punctuation trimmed from the beginning of the word.
# middle: Current state of the word.
# trail: Current punctuation trimmed from the end of the word.
lead, middle, trail = '', word, ''
# Deal with punctuation.
lead, middle, trail = trim_punctuation(lead, middle, trail)
# Make URL we want to point to. urlizer = Urlizer()
url = None
nofollow_attr = ' rel="nofollow"' if nofollow else ''
if simple_url_re.match(middle):
url = smart_urlquote(html.unescape(middle))
elif simple_url_2_re.match(middle):
url = smart_urlquote('http://%s' % html.unescape(middle))
elif ':' not in middle and is_email_simple(middle):
local, domain = middle.rsplit('@', 1)
try:
domain = punycode(domain)
except UnicodeError:
continue
url = 'mailto:%s@%s' % (local, domain)
nofollow_attr = ''
# Make link.
if url: @keep_lazy_text
trimmed = trim_url(middle) def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
if autoescape and not safe_input: return urlizer(text, trim_url_limit=trim_url_limit, nofollow=nofollow, autoescape=autoescape)
lead, trail = escape(lead), escape(trail)
trimmed = escape(trimmed)
middle = '<a href="%s"%s>%s</a>' % (escape(url), nofollow_attr, trimmed)
words[i] = mark_safe('%s%s%s' % (lead, middle, trail))
else:
if safe_input:
words[i] = mark_safe(word)
elif autoescape:
words[i] = escape(word)
elif safe_input:
words[i] = mark_safe(word)
elif autoescape:
words[i] = escape(word)
return ''.join(words)
def avoid_wrapping(value): def avoid_wrapping(value):