From f21a9da4857f4877a5178ba1c80fb0f7ad328a3f Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 7 Jan 2012 18:39:14 +0000 Subject: [PATCH] Fixed #13704 -- Handled IDN properly in the urlize template filter. Thanks Claude Paroz for the initial version of the patch. git-svn-id: http://code.djangoproject.com/svn/django/trunk@17348 bcc190cf-cafb-0310-a4f2-bffc1f526a37 --- django/utils/html.py | 27 ++++++++++++++----- tests/regressiontests/defaultfilters/tests.py | 11 ++++++++ 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/django/utils/html.py b/django/utils/html.py index 4f74a7492c..ce886efba7 100644 --- a/django/utils/html.py +++ b/django/utils/html.py @@ -2,11 +2,12 @@ import re import string +import urllib +import urlparse from django.utils.safestring import SafeData, mark_safe -from django.utils.encoding import force_unicode +from django.utils.encoding import smart_str, force_unicode from django.utils.functional import allow_lazy -from django.utils.http import urlquote from django.utils.text import normalize_newlines # Configuration for urlize() function. @@ -22,7 +23,7 @@ word_split_re = re.compile(r'(\s+)') punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % \ ('|'.join([re.escape(x) for x in LEADING_PUNCTUATION]), '|'.join([re.escape(x) for x in TRAILING_PUNCTUATION]))) -simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$') +simple_email_re = re.compile(r'^\S+@\S+\.\S+$') link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+') html_gunk_re = re.compile(r'(?:
<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE) hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>
\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL) @@ -103,12 +104,22 @@ fix_ampersands = allow_lazy(fix_ampersands, unicode) def smart_urlquote(url): """Quotes an URL if it isn't already quoted.""" + # Handle IDN before quoting. + scheme, netloc, path, query, fragment = urlparse.urlsplit(url) + try: + netloc = netloc.encode('idna') # IDN -> ACE + except UnicodeError: # invalid domain part + pass + else: + url = urlparse.urlunsplit((scheme, netloc, path, query, fragment)) + # An URL is considered unquoted if it contains no % character, or if it # contains a % not followed by two hexadecimal digits. See #9655. if '%' not in url or unquoted_percents_re.search(url): # See http://bugs.python.org/issue2637 - return urlquote(url, safe='!*\'();:@&=+$,/?#[]~') - return url + url = urllib.quote(smart_str(url), safe='!*\'();:@&=+$,/?#[]~') + + return force_unicode(url) def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False): """ @@ -145,8 +156,10 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False): middle and middle[0] in string.ascii_letters + string.digits and \ (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))): url = smart_urlquote('http://%s' % middle) - elif '@' in middle and not ':' in middle and simple_email_re.match(middle): - url = 'mailto:%s' % middle + elif not ':' in middle and simple_email_re.match(middle): + local, domain = middle.rsplit('@', 1) + domain = domain.encode('idna') + url = 'mailto:%s@%s' % (local, domain) nofollow_attr = '' # Make link. if url: diff --git a/tests/regressiontests/defaultfilters/tests.py b/tests/regressiontests/defaultfilters/tests.py index 00518344a3..515840d87e 100644 --- a/tests/regressiontests/defaultfilters/tests.py +++ b/tests/regressiontests/defaultfilters/tests.py @@ -238,6 +238,7 @@ class DefaultFiltersTests(TestCase): # Check urlize with https addresses self.assertEqual(urlize('https://google.com'), u'
https://google.com') + # Check urlize doesn't overquote already quoted urls - see #9655 self.assertEqual(urlize('http://hi.baidu.com/%D6%D8%D0%C2%BF'), u'' @@ -252,6 +253,16 @@ class DefaultFiltersTests(TestCase): u'' u'http://en.wikipedia.org/wiki/Café') + # Check urlize handles IDN correctly - see #13704 + self.assertEqual(urlize('http://c✶.ws'), + u'http://c✶.ws') + self.assertEqual(urlize('www.c✶.ws'), + u'www.c✶.ws') + self.assertEqual(urlize('c✶.org'), + u'c✶.org') + self.assertEqual(urlize('info@c✶.org'), + u'info@c✶.org') + def test_wordcount(self): self.assertEqual(wordcount(''), 0) self.assertEqual(wordcount(u'oneword'), 1)