From cf11e3789b6643cf451d79d675a97c4de94542b0 Mon Sep 17 00:00:00 2001
From: Luke Plant
Date: Thu, 28 Apr 2011 14:08:53 +0000
Subject: [PATCH] Fixed #7267 - UnicodeDecodeError in clean_html
Thanks to Nikolay for the report, and gav and aaugustin for the patch.
git-svn-id: http://code.djangoproject.com/svn/django/trunk@16118 bcc190cf-cafb-0310-a4f2-bffc1f526a37
---
django/utils/html.py | 8 ++++----
tests/regressiontests/utils/html.py | 12 ++++++++++++
2 files changed, 16 insertions(+), 4 deletions(-)
diff --git a/django/utils/html.py b/django/utils/html.py
index 094bc6660da..7fda015840c 100644
--- a/django/utils/html.py
+++ b/django/utils/html.py
@@ -13,7 +13,7 @@ LEADING_PUNCTUATION = ['(', '<', '&lt;']
TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;']
# List of possible strings used for bullets in bulleted lists.
-DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;']
+DOTS = [u'&middot;', u'*', u'\u2022', u'&#149;', u'&bull;', u'&#8226;']
unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
word_split_re = re.compile(r'(\s+)')
@@ -180,13 +180,13 @@ def clean_html(text):
text = html_gunk_re.sub('', text)
# Convert hard-coded bullets into HTML unordered lists.
def replace_p_tags(match):
- s = match.group().replace('</p>', '</li>')
+ s = match.group().replace(u'</p>', u'</li>')
for d in DOTS:
- s = s.replace('<p>%s' % d, '<li>')
+ s = s.replace(u'<p>%s' % d, u'<li>')
return u'<ul>\n%s\n</ul>' % s
text = hard_coded_bullets_re.sub(replace_p_tags, text)
# Remove stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the bottom
# of the text.
- text = trailing_empty_content_re.sub('', text)
+ text = trailing_empty_content_re.sub(u'', text)
return text
clean_html = allow_lazy(clean_html, unicode)
diff --git a/tests/regressiontests/utils/html.py b/tests/regressiontests/utils/html.py
index 3acb218cd18..d8b9bde8bf9 100644
--- a/tests/regressiontests/utils/html.py
+++ b/tests/regressiontests/utils/html.py
@@ -121,3 +121,15 @@ class TestUtilsHtml(unittest.TestCase):
)
for value, output in items:
self.check_output(f, value, output)
+
+ def test_clean_html(self):
+ f = html.clean_html
+ items = (
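+ (u'<p>I <i>believe</i> in <b>semantic markup</b>!</p>', u'<p>I <em>believe</em> in <strong>semantic markup</strong>!</p>'),
+ (u'I escape & I don\'t <a href="#" target="_blank">target</a>', u'I escape &amp; I don\'t <a href="#" >target</a>'),
+ (u'<p>I kill whitespace</p><br clear="all"><p>&nbsp;</p>', u'<p>I kill whitespace</p>'),
+ # also a regression test for #7267: this used to raise an UnicodeDecodeError
+ (u'<p>* foo</p><p>* bar</p>', u'<ul>\n<li> foo</li><li> bar</li>\n</ul>'),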
+ )
+ for value, output in items:
+ self.check_output(f, value, output)