Fixed #7267 - UnicodeDecodeError in clean_html
Thanks to Nikolay for the report, and gav and aaugustin for the patch.

git-svn-id: http://code.djangoproject.com/svn/django/trunk@16118 bcc190cf-cafb-0310-a4f2-bffc1f526a37
This commit is contained in:
parent
2ac4f175ec
commit
cf11e3789b
|
@@ -13,7 +13,7 @@ LEADING_PUNCTUATION = ['(', '<', '&lt;']
|
||||||
TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;']
|
TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;']
|
||||||
|
|
||||||
# List of possible strings used for bullets in bulleted lists.
|
# List of possible strings used for bullets in bulleted lists.
|
||||||
DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;']
|
DOTS = [u'&middot;', u'*', u'\u2022', u'&#149;', u'&bull;', u'&#8226;']
|
||||||
|
|
||||||
unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
|
unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
|
||||||
word_split_re = re.compile(r'(\s+)')
|
word_split_re = re.compile(r'(\s+)')
|
||||||
|
@@ -180,13 +180,13 @@ def clean_html(text):
|
||||||
text = html_gunk_re.sub('', text)
|
text = html_gunk_re.sub('', text)
|
||||||
# Convert hard-coded bullets into HTML unordered lists.
|
# Convert hard-coded bullets into HTML unordered lists.
|
||||||
def replace_p_tags(match):
|
def replace_p_tags(match):
|
||||||
s = match.group().replace('</p>', '</li>')
|
s = match.group().replace(u'</p>', u'</li>')
|
||||||
for d in DOTS:
|
for d in DOTS:
|
||||||
s = s.replace('<p>%s' % d, '<li>')
|
s = s.replace(u'<p>%s' % d, u'<li>')
|
||||||
return u'<ul>\n%s\n</ul>' % s
|
return u'<ul>\n%s\n</ul>' % s
|
||||||
text = hard_coded_bullets_re.sub(replace_p_tags, text)
|
text = hard_coded_bullets_re.sub(replace_p_tags, text)
|
||||||
# Remove stuff like "<p> </p>", but only if it's at the bottom
|
# Remove stuff like "<p> </p>", but only if it's at the bottom
|
||||||
# of the text.
|
# of the text.
|
||||||
text = trailing_empty_content_re.sub('', text)
|
text = trailing_empty_content_re.sub(u'', text)
|
||||||
return text
|
return text
|
||||||
clean_html = allow_lazy(clean_html, unicode)
|
clean_html = allow_lazy(clean_html, unicode)
|
||||||
|
|
|
@@ -121,3 +121,15 @@ class TestUtilsHtml(unittest.TestCase):
|
||||||
)
|
)
|
||||||
for value, output in items:
|
for value, output in items:
|
||||||
self.check_output(f, value, output)
|
self.check_output(f, value, output)
|
||||||
|
|
||||||
|
def test_clean_html(self):
|
||||||
|
f = html.clean_html
|
||||||
|
items = (
|
||||||
|
(u'<p>I <i>believe</i> in <b>semantic markup</b>!</p>', u'<p>I <em>believe</em> in <strong>semantic markup</strong>!</p>'),
|
||||||
|
(u'I escape & I don\'t <a href="#" target="_blank">target</a>', u'I escape &amp; I don\'t <a href="#" >target</a>'),
|
||||||
|
(u'<p>I kill whitespace</p><br clear="all"><p> </p>', u'<p>I kill whitespace</p>'),
|
||||||
|
# also a regression test for #7267: this used to raise a UnicodeDecodeError
|
||||||
|
(u'<p>* foo</p><p>* bar</p>', u'<ul>\n<li> foo</li><li> bar</li>\n</ul>'),
|
||||||
|
)
|
||||||
|
for value, output in items:
|
||||||
|
self.check_output(f, value, output)
|
||||||
|
|
Loading…
Reference in New Issue