From cf11e3789b6643cf451d79d675a97c4de94542b0 Mon Sep 17 00:00:00 2001
From: Luke Plant
Date: Thu, 28 Apr 2011 14:08:53 +0000
Subject: [PATCH] Fixed #7267 - UnicodeDecodeError in clean_html

Thanks to Nikolay for the report, and gav and aaugustin for the patch.

git-svn-id: http://code.djangoproject.com/svn/django/trunk@16118 bcc190cf-cafb-0310-a4f2-bffc1f526a37
---
 django/utils/html.py                |  8 ++++----
 tests/regressiontests/utils/html.py | 12 ++++++++++++
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/django/utils/html.py b/django/utils/html.py
index 094bc6660da..7fda015840c 100644
--- a/django/utils/html.py
+++ b/django/utils/html.py
@@ -13,7 +13,7 @@ LEADING_PUNCTUATION = ['(', '<', '&lt;']
 TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;']
 
 # List of possible strings used for bullets in bulleted lists.
-DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;']
+DOTS = [u'&middot;', u'*', u'\u2022', u'&#149;', u'&bull;', u'&#8226;']
 
 unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
 word_split_re = re.compile(r'(\s+)')
@@ -180,13 +180,13 @@ def clean_html(text):
     text = html_gunk_re.sub('', text)
     # Convert hard-coded bullets into HTML unordered lists.
     def replace_p_tags(match):
-        s = match.group().replace('</p>', '</li>')
+        s = match.group().replace(u'</p>', u'</li>')
         for d in DOTS:
-            s = s.replace('<p>%s' % d, '<li>')
+            s = s.replace(u'<p>%s' % d, u'<li>')
         return u'<ul>\n%s\n</ul>' % s
     text = hard_coded_bullets_re.sub(replace_p_tags, text)
     # Remove stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the bottom
     # of the text.
-    text = trailing_empty_content_re.sub('', text)
+    text = trailing_empty_content_re.sub(u'', text)
     return text
 clean_html = allow_lazy(clean_html, unicode)
diff --git a/tests/regressiontests/utils/html.py b/tests/regressiontests/utils/html.py
index 3acb218cd18..d8b9bde8bf9 100644
--- a/tests/regressiontests/utils/html.py
+++ b/tests/regressiontests/utils/html.py
@@ -121,3 +121,15 @@ class TestUtilsHtml(unittest.TestCase):
         )
         for value, output in items:
             self.check_output(f, value, output)
+
+    def test_clean_html(self):
+        f = html.clean_html
+        items = (
+            (u'<p>I <i>believe</i> in <b>semantic markup</b>!</p>', u'<p>I <em>believe</em> in <strong>semantic markup</strong>!</p>'),
+            (u'I escape & I don\'t <a href="#" target="_blank">target</a>', u'I escape &amp; I don\'t <a href="#" >target</a>'),
+            (u'<p>I kill whitespace</p><br clear="all"><p>&nbsp;</p>', u'<p>I kill whitespace</p>'),
+            # also a regression test for #7267: this used to raise an UnicodeDecodeError
+            (u'<p>* foo</p><p>* bar</p>', u'<ul>\n<li> foo</li><li> bar</li>\n</ul>'),
+        )
+        for value, output in items:
+            self.check_output(f, value, output)