Fixed #33195 -- Refactored urlize() based on a class.

This allows easier customization.
This commit is contained in:
Claude Paroz 2021-10-14 19:27:31 +02:00 committed by Mariusz Felisiak
parent 4a58dfd9db
commit e567670b1a
1 changed files with 97 additions and 74 deletions

View File

@ -15,17 +15,6 @@ from django.utils.regex_helper import _lazy_re_compile
from django.utils.safestring import SafeData, SafeString, mark_safe
from django.utils.text import normalize_newlines
# Configuration for urlize() function.
TRAILING_PUNCTUATION_CHARS = '.,:;!'
WRAPPING_PUNCTUATION = [('(', ')'), ('[', ']')]
word_split_re = _lazy_re_compile(r'''([\s<>"']+)''')
simple_url_re = _lazy_re_compile(r'^https?://\[?\w', re.IGNORECASE)
simple_url_2_re = _lazy_re_compile(
r'^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)($|/.*)$',
re.IGNORECASE
)
@keep_lazy(str, SafeString)
def escape(text):
@ -229,48 +218,118 @@ def smart_urlquote(url):
return urlunsplit((scheme, netloc, path, query, fragment))
@keep_lazy_text
def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
class Urlizer:
"""
Convert any URLs in text into clickable links.
Works on http://, https://, www. links, and also on links ending in one of
Work on http://, https://, www. links, and also on links ending in one of
the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org).
Links can have trailing punctuation (periods, commas, close-parens) and
leading punctuation (opening parens) and it'll still do the right thing.
If trim_url_limit is not None, truncate the URLs in the link text longer
than this limit to trim_url_limit - 1 characters and append an ellipsis.
If nofollow is True, give the links a rel="nofollow" attribute.
If autoescape is True, autoescape the link text and URLs.
"""
safe_input = isinstance(text, SafeData)
trailing_punctuation_chars = '.,:;!'
wrapping_punctuation = [('(', ')'), ('[', ']')]
def trim_url(x, limit=trim_url_limit):
if limit is None or len(x) <= limit:
simple_url_re = _lazy_re_compile(r'^https?://\[?\w', re.IGNORECASE)
simple_url_2_re = _lazy_re_compile(
r'^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)($|/.*)$',
re.IGNORECASE
)
word_split_re = _lazy_re_compile(r'''([\s<>"']+)''')
mailto_template = 'mailto:{local}@{domain}'
url_template = '<a href="{href}"{attrs}>{url}</a>'
def __call__(self, text, trim_url_limit=None, nofollow=False, autoescape=False):
"""
If trim_url_limit is not None, truncate the URLs in the link text
longer than this limit to trim_url_limit - 1 characters and append an
ellipsis.
If nofollow is True, give the links a rel="nofollow" attribute.
If autoescape is True, autoescape the link text and URLs.
"""
self.trim_url_limit = trim_url_limit
self.nofollow = nofollow
self.autoescape = autoescape
self.safe_input = isinstance(text, SafeData)
words = self.word_split_re.split(str(text))
return ''.join([
self.handle_word(word) for word in words
])
def handle_word(self, word):
if '.' in word or '@' in word or ':' in word:
# lead: Punctuation trimmed from the beginning of the word.
# middle: State of the word.
# trail: Punctuation trimmed from the end of the word.
lead, middle, trail = self.trim_punctuation(word)
# Make URL we want to point to.
url = None
nofollow_attr = ' rel="nofollow"' if self.nofollow else ''
if self.simple_url_re.match(middle):
url = smart_urlquote(html.unescape(middle))
elif self.simple_url_2_re.match(middle):
url = smart_urlquote('http://%s' % html.unescape(middle))
elif ':' not in middle and self.is_email_simple(middle):
local, domain = middle.rsplit('@', 1)
try:
domain = punycode(domain)
except UnicodeError:
return word
url = self.mailto_template.format(local=local, domain=domain)
nofollow_attr = ''
# Make link.
if url:
trimmed = self.trim_url(middle)
if self.autoescape and not self.safe_input:
lead, trail = escape(lead), escape(trail)
trimmed = escape(trimmed)
middle = self.url_template.format(
href=escape(url),
attrs=nofollow_attr,
url=trimmed,
)
return mark_safe(f'{lead}{middle}{trail}')
else:
if self.safe_input:
return mark_safe(word)
elif self.autoescape:
return escape(word)
elif self.safe_input:
return mark_safe(word)
elif self.autoescape:
return escape(word)
return word
def trim_url(self, x):
if self.trim_url_limit is None or len(x) <= self.trim_url_limit:
return x
return '%s…' % x[:max(0, limit - 1)]
return '%s…' % x[:max(0, self.trim_url_limit - 1)]
def trim_punctuation(lead, middle, trail):
def trim_punctuation(self, word):
"""
Trim trailing and wrapping punctuation from `middle`. Return the items
of the new state.
Trim trailing and wrapping punctuation from `word`. Return the items of
the new state.
"""
lead, middle, trail = '', word, ''
# Continue trimming until middle remains unchanged.
trimmed_something = True
while trimmed_something:
trimmed_something = False
# Trim wrapping punctuation.
for opening, closing in WRAPPING_PUNCTUATION:
for opening, closing in self.wrapping_punctuation:
if middle.startswith(opening):
middle = middle[len(opening):]
lead += opening
trimmed_something = True
# Keep parentheses at the end only if they're balanced.
if (middle.endswith(closing) and
middle.count(closing) == middle.count(opening) + 1):
if (
middle.endswith(closing) and
middle.count(closing) == middle.count(opening) + 1
):
middle = middle[:-len(closing)]
trail = closing + trail
trimmed_something = True
@ -278,7 +337,7 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
# as encoded entities contain ';'). Unescape entities to avoid
# breaking them by removing ';'.
middle_unescaped = html.unescape(middle)
stripped = middle_unescaped.rstrip(TRAILING_PUNCTUATION_CHARS)
stripped = middle_unescaped.rstrip(self.trailing_punctuation_chars)
if middle_unescaped != stripped:
punctuation_count = len(middle_unescaped) - len(stripped)
trail = middle[-punctuation_count:] + trail
@ -286,6 +345,7 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
trimmed_something = True
return lead, middle, trail
@staticmethod
def is_email_simple(value):
"""Return True if value looks like an email address."""
# An @ must be in the middle of the value.
@ -301,50 +361,13 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
return False
return True
words = word_split_re.split(str(text))
for i, word in enumerate(words):
if '.' in word or '@' in word or ':' in word:
# lead: Current punctuation trimmed from the beginning of the word.
# middle: Current state of the word.
# trail: Current punctuation trimmed from the end of the word.
lead, middle, trail = '', word, ''
# Deal with punctuation.
lead, middle, trail = trim_punctuation(lead, middle, trail)
# Make URL we want to point to.
url = None
nofollow_attr = ' rel="nofollow"' if nofollow else ''
if simple_url_re.match(middle):
url = smart_urlquote(html.unescape(middle))
elif simple_url_2_re.match(middle):
url = smart_urlquote('http://%s' % html.unescape(middle))
elif ':' not in middle and is_email_simple(middle):
local, domain = middle.rsplit('@', 1)
try:
domain = punycode(domain)
except UnicodeError:
continue
url = 'mailto:%s@%s' % (local, domain)
nofollow_attr = ''
urlizer = Urlizer()
# Make link.
if url:
trimmed = trim_url(middle)
if autoescape and not safe_input:
lead, trail = escape(lead), escape(trail)
trimmed = escape(trimmed)
middle = '<a href="%s"%s>%s</a>' % (escape(url), nofollow_attr, trimmed)
words[i] = mark_safe('%s%s%s' % (lead, middle, trail))
else:
if safe_input:
words[i] = mark_safe(word)
elif autoescape:
words[i] = escape(word)
elif safe_input:
words[i] = mark_safe(word)
elif autoescape:
words[i] = escape(word)
return ''.join(words)
@keep_lazy_text
def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
return urlizer(text, trim_url_limit=trim_url_limit, nofollow=nofollow, autoescape=autoescape)
def avoid_wrapping(value):