Fixed #33195 -- Refactored urlize() based on a class.
This allows easier customization/
This commit is contained in:
parent
4a58dfd9db
commit
e567670b1a
|
@ -15,17 +15,6 @@ from django.utils.regex_helper import _lazy_re_compile
|
|||
from django.utils.safestring import SafeData, SafeString, mark_safe
|
||||
from django.utils.text import normalize_newlines
|
||||
|
||||
# Configuration for urlize() function.
|
||||
TRAILING_PUNCTUATION_CHARS = '.,:;!'
|
||||
WRAPPING_PUNCTUATION = [('(', ')'), ('[', ']')]
|
||||
|
||||
word_split_re = _lazy_re_compile(r'''([\s<>"']+)''')
|
||||
simple_url_re = _lazy_re_compile(r'^https?://\[?\w', re.IGNORECASE)
|
||||
simple_url_2_re = _lazy_re_compile(
|
||||
r'^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)($|/.*)$',
|
||||
re.IGNORECASE
|
||||
)
|
||||
|
||||
|
||||
@keep_lazy(str, SafeString)
|
||||
def escape(text):
|
||||
|
@ -229,48 +218,118 @@ def smart_urlquote(url):
|
|||
return urlunsplit((scheme, netloc, path, query, fragment))
|
||||
|
||||
|
||||
@keep_lazy_text
|
||||
def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
|
||||
class Urlizer:
|
||||
"""
|
||||
Convert any URLs in text into clickable links.
|
||||
|
||||
Works on http://, https://, www. links, and also on links ending in one of
|
||||
Work on http://, https://, www. links, and also on links ending in one of
|
||||
the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org).
|
||||
Links can have trailing punctuation (periods, commas, close-parens) and
|
||||
leading punctuation (opening parens) and it'll still do the right thing.
|
||||
|
||||
If trim_url_limit is not None, truncate the URLs in the link text longer
|
||||
than this limit to trim_url_limit - 1 characters and append an ellipsis.
|
||||
|
||||
If nofollow is True, give the links a rel="nofollow" attribute.
|
||||
|
||||
If autoescape is True, autoescape the link text and URLs.
|
||||
"""
|
||||
safe_input = isinstance(text, SafeData)
|
||||
trailing_punctuation_chars = '.,:;!'
|
||||
wrapping_punctuation = [('(', ')'), ('[', ']')]
|
||||
|
||||
def trim_url(x, limit=trim_url_limit):
|
||||
if limit is None or len(x) <= limit:
|
||||
simple_url_re = _lazy_re_compile(r'^https?://\[?\w', re.IGNORECASE)
|
||||
simple_url_2_re = _lazy_re_compile(
|
||||
r'^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)($|/.*)$',
|
||||
re.IGNORECASE
|
||||
)
|
||||
word_split_re = _lazy_re_compile(r'''([\s<>"']+)''')
|
||||
|
||||
mailto_template = 'mailto:{local}@{domain}'
|
||||
url_template = '<a href="{href}"{attrs}>{url}</a>'
|
||||
|
||||
def __call__(self, text, trim_url_limit=None, nofollow=False, autoescape=False):
|
||||
"""
|
||||
If trim_url_limit is not None, truncate the URLs in the link text
|
||||
longer than this limit to trim_url_limit - 1 characters and append an
|
||||
ellipsis.
|
||||
|
||||
If nofollow is True, give the links a rel="nofollow" attribute.
|
||||
|
||||
If autoescape is True, autoescape the link text and URLs.
|
||||
"""
|
||||
self.trim_url_limit = trim_url_limit
|
||||
self.nofollow = nofollow
|
||||
self.autoescape = autoescape
|
||||
self.safe_input = isinstance(text, SafeData)
|
||||
|
||||
words = self.word_split_re.split(str(text))
|
||||
return ''.join([
|
||||
self.handle_word(word) for word in words
|
||||
])
|
||||
|
||||
def handle_word(self, word):
|
||||
if '.' in word or '@' in word or ':' in word:
|
||||
# lead: Punctuation trimmed from the beginning of the word.
|
||||
# middle: State of the word.
|
||||
# trail: Punctuation trimmed from the end of the word.
|
||||
lead, middle, trail = self.trim_punctuation(word)
|
||||
# Make URL we want to point to.
|
||||
url = None
|
||||
nofollow_attr = ' rel="nofollow"' if self.nofollow else ''
|
||||
if self.simple_url_re.match(middle):
|
||||
url = smart_urlquote(html.unescape(middle))
|
||||
elif self.simple_url_2_re.match(middle):
|
||||
url = smart_urlquote('http://%s' % html.unescape(middle))
|
||||
elif ':' not in middle and self.is_email_simple(middle):
|
||||
local, domain = middle.rsplit('@', 1)
|
||||
try:
|
||||
domain = punycode(domain)
|
||||
except UnicodeError:
|
||||
return word
|
||||
url = self.mailto_template.format(local=local, domain=domain)
|
||||
nofollow_attr = ''
|
||||
# Make link.
|
||||
if url:
|
||||
trimmed = self.trim_url(middle)
|
||||
if self.autoescape and not self.safe_input:
|
||||
lead, trail = escape(lead), escape(trail)
|
||||
trimmed = escape(trimmed)
|
||||
middle = self.url_template.format(
|
||||
href=escape(url),
|
||||
attrs=nofollow_attr,
|
||||
url=trimmed,
|
||||
)
|
||||
return mark_safe(f'{lead}{middle}{trail}')
|
||||
else:
|
||||
if self.safe_input:
|
||||
return mark_safe(word)
|
||||
elif self.autoescape:
|
||||
return escape(word)
|
||||
elif self.safe_input:
|
||||
return mark_safe(word)
|
||||
elif self.autoescape:
|
||||
return escape(word)
|
||||
return word
|
||||
|
||||
def trim_url(self, x):
|
||||
if self.trim_url_limit is None or len(x) <= self.trim_url_limit:
|
||||
return x
|
||||
return '%s…' % x[:max(0, limit - 1)]
|
||||
return '%s…' % x[:max(0, self.trim_url_limit - 1)]
|
||||
|
||||
def trim_punctuation(lead, middle, trail):
|
||||
def trim_punctuation(self, word):
|
||||
"""
|
||||
Trim trailing and wrapping punctuation from `middle`. Return the items
|
||||
of the new state.
|
||||
Trim trailing and wrapping punctuation from `word`. Return the items of
|
||||
the new state.
|
||||
"""
|
||||
lead, middle, trail = '', word, ''
|
||||
# Continue trimming until middle remains unchanged.
|
||||
trimmed_something = True
|
||||
while trimmed_something:
|
||||
trimmed_something = False
|
||||
# Trim wrapping punctuation.
|
||||
for opening, closing in WRAPPING_PUNCTUATION:
|
||||
for opening, closing in self.wrapping_punctuation:
|
||||
if middle.startswith(opening):
|
||||
middle = middle[len(opening):]
|
||||
lead += opening
|
||||
trimmed_something = True
|
||||
# Keep parentheses at the end only if they're balanced.
|
||||
if (middle.endswith(closing) and
|
||||
middle.count(closing) == middle.count(opening) + 1):
|
||||
if (
|
||||
middle.endswith(closing) and
|
||||
middle.count(closing) == middle.count(opening) + 1
|
||||
):
|
||||
middle = middle[:-len(closing)]
|
||||
trail = closing + trail
|
||||
trimmed_something = True
|
||||
|
@ -278,7 +337,7 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
|
|||
# as encoded entities contain ';'). Unescape entities to avoid
|
||||
# breaking them by removing ';'.
|
||||
middle_unescaped = html.unescape(middle)
|
||||
stripped = middle_unescaped.rstrip(TRAILING_PUNCTUATION_CHARS)
|
||||
stripped = middle_unescaped.rstrip(self.trailing_punctuation_chars)
|
||||
if middle_unescaped != stripped:
|
||||
punctuation_count = len(middle_unescaped) - len(stripped)
|
||||
trail = middle[-punctuation_count:] + trail
|
||||
|
@ -286,6 +345,7 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
|
|||
trimmed_something = True
|
||||
return lead, middle, trail
|
||||
|
||||
@staticmethod
|
||||
def is_email_simple(value):
|
||||
"""Return True if value looks like an email address."""
|
||||
# An @ must be in the middle of the value.
|
||||
|
@ -301,50 +361,13 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
|
|||
return False
|
||||
return True
|
||||
|
||||
words = word_split_re.split(str(text))
|
||||
for i, word in enumerate(words):
|
||||
if '.' in word or '@' in word or ':' in word:
|
||||
# lead: Current punctuation trimmed from the beginning of the word.
|
||||
# middle: Current state of the word.
|
||||
# trail: Current punctuation trimmed from the end of the word.
|
||||
lead, middle, trail = '', word, ''
|
||||
# Deal with punctuation.
|
||||
lead, middle, trail = trim_punctuation(lead, middle, trail)
|
||||
|
||||
# Make URL we want to point to.
|
||||
url = None
|
||||
nofollow_attr = ' rel="nofollow"' if nofollow else ''
|
||||
if simple_url_re.match(middle):
|
||||
url = smart_urlquote(html.unescape(middle))
|
||||
elif simple_url_2_re.match(middle):
|
||||
url = smart_urlquote('http://%s' % html.unescape(middle))
|
||||
elif ':' not in middle and is_email_simple(middle):
|
||||
local, domain = middle.rsplit('@', 1)
|
||||
try:
|
||||
domain = punycode(domain)
|
||||
except UnicodeError:
|
||||
continue
|
||||
url = 'mailto:%s@%s' % (local, domain)
|
||||
nofollow_attr = ''
|
||||
urlizer = Urlizer()
|
||||
|
||||
# Make link.
|
||||
if url:
|
||||
trimmed = trim_url(middle)
|
||||
if autoescape and not safe_input:
|
||||
lead, trail = escape(lead), escape(trail)
|
||||
trimmed = escape(trimmed)
|
||||
middle = '<a href="%s"%s>%s</a>' % (escape(url), nofollow_attr, trimmed)
|
||||
words[i] = mark_safe('%s%s%s' % (lead, middle, trail))
|
||||
else:
|
||||
if safe_input:
|
||||
words[i] = mark_safe(word)
|
||||
elif autoescape:
|
||||
words[i] = escape(word)
|
||||
elif safe_input:
|
||||
words[i] = mark_safe(word)
|
||||
elif autoescape:
|
||||
words[i] = escape(word)
|
||||
return ''.join(words)
|
||||
|
||||
@keep_lazy_text
|
||||
def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
|
||||
return urlizer(text, trim_url_limit=trim_url_limit, nofollow=nofollow, autoescape=autoescape)
|
||||
|
||||
|
||||
def avoid_wrapping(value):
|
||||
|
|
Loading…
Reference in New Issue