Fixed #33195 -- Refactored urlize() based on a class.
This allows easier customization.
This commit is contained in:
parent
4a58dfd9db
commit
e567670b1a
|
@ -15,17 +15,6 @@ from django.utils.regex_helper import _lazy_re_compile
|
||||||
from django.utils.safestring import SafeData, SafeString, mark_safe
|
from django.utils.safestring import SafeData, SafeString, mark_safe
|
||||||
from django.utils.text import normalize_newlines
|
from django.utils.text import normalize_newlines
|
||||||
|
|
||||||
# Configuration for urlize() function.
|
|
||||||
TRAILING_PUNCTUATION_CHARS = '.,:;!'
|
|
||||||
WRAPPING_PUNCTUATION = [('(', ')'), ('[', ']')]
|
|
||||||
|
|
||||||
word_split_re = _lazy_re_compile(r'''([\s<>"']+)''')
|
|
||||||
simple_url_re = _lazy_re_compile(r'^https?://\[?\w', re.IGNORECASE)
|
|
||||||
simple_url_2_re = _lazy_re_compile(
|
|
||||||
r'^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)($|/.*)$',
|
|
||||||
re.IGNORECASE
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@keep_lazy(str, SafeString)
|
@keep_lazy(str, SafeString)
|
||||||
def escape(text):
|
def escape(text):
|
||||||
|
@ -229,48 +218,118 @@ def smart_urlquote(url):
|
||||||
return urlunsplit((scheme, netloc, path, query, fragment))
|
return urlunsplit((scheme, netloc, path, query, fragment))
|
||||||
|
|
||||||
|
|
||||||
@keep_lazy_text
|
class Urlizer:
|
||||||
def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
|
|
||||||
"""
|
"""
|
||||||
Convert any URLs in text into clickable links.
|
Convert any URLs in text into clickable links.
|
||||||
|
|
||||||
Works on http://, https://, www. links, and also on links ending in one of
|
Work on http://, https://, www. links, and also on links ending in one of
|
||||||
the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org).
|
the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org).
|
||||||
Links can have trailing punctuation (periods, commas, close-parens) and
|
Links can have trailing punctuation (periods, commas, close-parens) and
|
||||||
leading punctuation (opening parens) and it'll still do the right thing.
|
leading punctuation (opening parens) and it'll still do the right thing.
|
||||||
|
|
||||||
If trim_url_limit is not None, truncate the URLs in the link text longer
|
|
||||||
than this limit to trim_url_limit - 1 characters and append an ellipsis.
|
|
||||||
|
|
||||||
If nofollow is True, give the links a rel="nofollow" attribute.
|
|
||||||
|
|
||||||
If autoescape is True, autoescape the link text and URLs.
|
|
||||||
"""
|
"""
|
||||||
safe_input = isinstance(text, SafeData)
|
trailing_punctuation_chars = '.,:;!'
|
||||||
|
wrapping_punctuation = [('(', ')'), ('[', ']')]
|
||||||
|
|
||||||
def trim_url(x, limit=trim_url_limit):
|
simple_url_re = _lazy_re_compile(r'^https?://\[?\w', re.IGNORECASE)
|
||||||
if limit is None or len(x) <= limit:
|
simple_url_2_re = _lazy_re_compile(
|
||||||
|
r'^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)($|/.*)$',
|
||||||
|
re.IGNORECASE
|
||||||
|
)
|
||||||
|
word_split_re = _lazy_re_compile(r'''([\s<>"']+)''')
|
||||||
|
|
||||||
|
mailto_template = 'mailto:{local}@{domain}'
|
||||||
|
url_template = '<a href="{href}"{attrs}>{url}</a>'
|
||||||
|
|
||||||
|
def __call__(self, text, trim_url_limit=None, nofollow=False, autoescape=False):
|
||||||
|
"""
|
||||||
|
If trim_url_limit is not None, truncate the URLs in the link text
|
||||||
|
longer than this limit to trim_url_limit - 1 characters and append an
|
||||||
|
ellipsis.
|
||||||
|
|
||||||
|
If nofollow is True, give the links a rel="nofollow" attribute.
|
||||||
|
|
||||||
|
If autoescape is True, autoescape the link text and URLs.
|
||||||
|
"""
|
||||||
|
self.trim_url_limit = trim_url_limit
|
||||||
|
self.nofollow = nofollow
|
||||||
|
self.autoescape = autoescape
|
||||||
|
self.safe_input = isinstance(text, SafeData)
|
||||||
|
|
||||||
|
words = self.word_split_re.split(str(text))
|
||||||
|
return ''.join([
|
||||||
|
self.handle_word(word) for word in words
|
||||||
|
])
|
||||||
|
|
||||||
|
def handle_word(self, word):
|
||||||
|
if '.' in word or '@' in word or ':' in word:
|
||||||
|
# lead: Punctuation trimmed from the beginning of the word.
|
||||||
|
# middle: State of the word.
|
||||||
|
# trail: Punctuation trimmed from the end of the word.
|
||||||
|
lead, middle, trail = self.trim_punctuation(word)
|
||||||
|
# Make URL we want to point to.
|
||||||
|
url = None
|
||||||
|
nofollow_attr = ' rel="nofollow"' if self.nofollow else ''
|
||||||
|
if self.simple_url_re.match(middle):
|
||||||
|
url = smart_urlquote(html.unescape(middle))
|
||||||
|
elif self.simple_url_2_re.match(middle):
|
||||||
|
url = smart_urlquote('http://%s' % html.unescape(middle))
|
||||||
|
elif ':' not in middle and self.is_email_simple(middle):
|
||||||
|
local, domain = middle.rsplit('@', 1)
|
||||||
|
try:
|
||||||
|
domain = punycode(domain)
|
||||||
|
except UnicodeError:
|
||||||
|
return word
|
||||||
|
url = self.mailto_template.format(local=local, domain=domain)
|
||||||
|
nofollow_attr = ''
|
||||||
|
# Make link.
|
||||||
|
if url:
|
||||||
|
trimmed = self.trim_url(middle)
|
||||||
|
if self.autoescape and not self.safe_input:
|
||||||
|
lead, trail = escape(lead), escape(trail)
|
||||||
|
trimmed = escape(trimmed)
|
||||||
|
middle = self.url_template.format(
|
||||||
|
href=escape(url),
|
||||||
|
attrs=nofollow_attr,
|
||||||
|
url=trimmed,
|
||||||
|
)
|
||||||
|
return mark_safe(f'{lead}{middle}{trail}')
|
||||||
|
else:
|
||||||
|
if self.safe_input:
|
||||||
|
return mark_safe(word)
|
||||||
|
elif self.autoescape:
|
||||||
|
return escape(word)
|
||||||
|
elif self.safe_input:
|
||||||
|
return mark_safe(word)
|
||||||
|
elif self.autoescape:
|
||||||
|
return escape(word)
|
||||||
|
return word
|
||||||
|
|
||||||
|
def trim_url(self, x):
|
||||||
|
if self.trim_url_limit is None or len(x) <= self.trim_url_limit:
|
||||||
return x
|
return x
|
||||||
return '%s…' % x[:max(0, limit - 1)]
|
return '%s…' % x[:max(0, self.trim_url_limit - 1)]
|
||||||
|
|
||||||
def trim_punctuation(lead, middle, trail):
|
def trim_punctuation(self, word):
|
||||||
"""
|
"""
|
||||||
Trim trailing and wrapping punctuation from `middle`. Return the items
|
Trim trailing and wrapping punctuation from `word`. Return the items of
|
||||||
of the new state.
|
the new state.
|
||||||
"""
|
"""
|
||||||
|
lead, middle, trail = '', word, ''
|
||||||
# Continue trimming until middle remains unchanged.
|
# Continue trimming until middle remains unchanged.
|
||||||
trimmed_something = True
|
trimmed_something = True
|
||||||
while trimmed_something:
|
while trimmed_something:
|
||||||
trimmed_something = False
|
trimmed_something = False
|
||||||
# Trim wrapping punctuation.
|
# Trim wrapping punctuation.
|
||||||
for opening, closing in WRAPPING_PUNCTUATION:
|
for opening, closing in self.wrapping_punctuation:
|
||||||
if middle.startswith(opening):
|
if middle.startswith(opening):
|
||||||
middle = middle[len(opening):]
|
middle = middle[len(opening):]
|
||||||
lead += opening
|
lead += opening
|
||||||
trimmed_something = True
|
trimmed_something = True
|
||||||
# Keep parentheses at the end only if they're balanced.
|
# Keep parentheses at the end only if they're balanced.
|
||||||
if (middle.endswith(closing) and
|
if (
|
||||||
middle.count(closing) == middle.count(opening) + 1):
|
middle.endswith(closing) and
|
||||||
|
middle.count(closing) == middle.count(opening) + 1
|
||||||
|
):
|
||||||
middle = middle[:-len(closing)]
|
middle = middle[:-len(closing)]
|
||||||
trail = closing + trail
|
trail = closing + trail
|
||||||
trimmed_something = True
|
trimmed_something = True
|
||||||
|
@ -278,7 +337,7 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
|
||||||
# as encoded entities contain ';'). Unescape entities to avoid
|
# as encoded entities contain ';'). Unescape entities to avoid
|
||||||
# breaking them by removing ';'.
|
# breaking them by removing ';'.
|
||||||
middle_unescaped = html.unescape(middle)
|
middle_unescaped = html.unescape(middle)
|
||||||
stripped = middle_unescaped.rstrip(TRAILING_PUNCTUATION_CHARS)
|
stripped = middle_unescaped.rstrip(self.trailing_punctuation_chars)
|
||||||
if middle_unescaped != stripped:
|
if middle_unescaped != stripped:
|
||||||
punctuation_count = len(middle_unescaped) - len(stripped)
|
punctuation_count = len(middle_unescaped) - len(stripped)
|
||||||
trail = middle[-punctuation_count:] + trail
|
trail = middle[-punctuation_count:] + trail
|
||||||
|
@ -286,6 +345,7 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
|
||||||
trimmed_something = True
|
trimmed_something = True
|
||||||
return lead, middle, trail
|
return lead, middle, trail
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
def is_email_simple(value):
|
def is_email_simple(value):
|
||||||
"""Return True if value looks like an email address."""
|
"""Return True if value looks like an email address."""
|
||||||
# An @ must be in the middle of the value.
|
# An @ must be in the middle of the value.
|
||||||
|
@ -301,50 +361,13 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
words = word_split_re.split(str(text))
|
|
||||||
for i, word in enumerate(words):
|
|
||||||
if '.' in word or '@' in word or ':' in word:
|
|
||||||
# lead: Current punctuation trimmed from the beginning of the word.
|
|
||||||
# middle: Current state of the word.
|
|
||||||
# trail: Current punctuation trimmed from the end of the word.
|
|
||||||
lead, middle, trail = '', word, ''
|
|
||||||
# Deal with punctuation.
|
|
||||||
lead, middle, trail = trim_punctuation(lead, middle, trail)
|
|
||||||
|
|
||||||
# Make URL we want to point to.
|
urlizer = Urlizer()
|
||||||
url = None
|
|
||||||
nofollow_attr = ' rel="nofollow"' if nofollow else ''
|
|
||||||
if simple_url_re.match(middle):
|
|
||||||
url = smart_urlquote(html.unescape(middle))
|
|
||||||
elif simple_url_2_re.match(middle):
|
|
||||||
url = smart_urlquote('http://%s' % html.unescape(middle))
|
|
||||||
elif ':' not in middle and is_email_simple(middle):
|
|
||||||
local, domain = middle.rsplit('@', 1)
|
|
||||||
try:
|
|
||||||
domain = punycode(domain)
|
|
||||||
except UnicodeError:
|
|
||||||
continue
|
|
||||||
url = 'mailto:%s@%s' % (local, domain)
|
|
||||||
nofollow_attr = ''
|
|
||||||
|
|
||||||
# Make link.
|
|
||||||
if url:
|
@keep_lazy_text
|
||||||
trimmed = trim_url(middle)
|
def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
|
||||||
if autoescape and not safe_input:
|
return urlizer(text, trim_url_limit=trim_url_limit, nofollow=nofollow, autoescape=autoescape)
|
||||||
lead, trail = escape(lead), escape(trail)
|
|
||||||
trimmed = escape(trimmed)
|
|
||||||
middle = '<a href="%s"%s>%s</a>' % (escape(url), nofollow_attr, trimmed)
|
|
||||||
words[i] = mark_safe('%s%s%s' % (lead, middle, trail))
|
|
||||||
else:
|
|
||||||
if safe_input:
|
|
||||||
words[i] = mark_safe(word)
|
|
||||||
elif autoescape:
|
|
||||||
words[i] = escape(word)
|
|
||||||
elif safe_input:
|
|
||||||
words[i] = mark_safe(word)
|
|
||||||
elif autoescape:
|
|
||||||
words[i] = escape(word)
|
|
||||||
return ''.join(words)
|
|
||||||
|
|
||||||
|
|
||||||
def avoid_wrapping(value):
|
def avoid_wrapping(value):
|
||||||
|
|
Loading…
Reference in New Issue