Fixed #19237 -- Used HTML parser to strip tags

The regex method used until now for the strip_tags utility is fast,
but subject to flaws and security issues. Consensus and good
practice lead us to use a slower but safer method.
This commit is contained in:
Claude Paroz 2013-05-22 17:29:16 +02:00
parent 01948e384f
commit dc51ec8bc2
2 changed files with 32 additions and 4 deletions

View File

@ -16,6 +16,9 @@ from django.utils.functional import allow_lazy
from django.utils import six from django.utils import six
from django.utils.text import normalize_newlines from django.utils.text import normalize_newlines
from .html_parser import HTMLParser
# Configuration for urlize() function. # Configuration for urlize() function.
TRAILING_PUNCTUATION = ['.', ',', ':', ';', '.)'] TRAILING_PUNCTUATION = ['.', ',', ':', ';', '.)']
WRAPPING_PUNCTUATION = [('(', ')'), ('<', '>'), ('[', ']'), ('&lt;', '&gt;')] WRAPPING_PUNCTUATION = [('(', ')'), ('<', '>'), ('[', ']'), ('&lt;', '&gt;')]
@ -33,7 +36,6 @@ link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE) html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL) hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z') trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
strip_tags_re = re.compile(r'</?\S([^=>]*=(\s*"[^"]*"|\s*\'[^\']*\'|\S*)|[^>])*?>', re.IGNORECASE)
def escape(text): def escape(text):
@ -116,9 +118,31 @@ def linebreaks(value, autoescape=False):
return '\n\n'.join(paras) return '\n\n'.join(paras)
linebreaks = allow_lazy(linebreaks, six.text_type) linebreaks = allow_lazy(linebreaks, six.text_type)
class MLStripper(HTMLParser):
    """HTML parser that drops markup and collects only the text content.

    Text passed to ``handle_data`` is stored unchanged. Entity and character
    references are stored in their original ``&name;`` / ``&#num;`` form
    rather than being decoded. Use ``get_data()`` to retrieve the
    accumulated text.
    """

    def __init__(self):
        # The base initializer already resets the parser state, so an
        # explicit reset() call is not needed here.
        HTMLParser.__init__(self)
        # Text fragments collected so far, in document order.
        self.fed = []

    def handle_data(self, d):
        """Record a run of text found outside of any tag."""
        self.fed.append(d)

    def handle_entityref(self, name):
        """Record a named entity reference, kept in ``&name;`` form."""
        self.fed.append('&%s;' % name)

    def handle_charref(self, name):
        """Record a numeric character reference, kept in ``&#num;`` form."""
        self.fed.append('&#%s;' % name)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)
def strip_tags(value):
    """Returns the given HTML with all tags stripped.

    The value is fed to an ``MLStripper`` parser and the text it collected
    is returned. The collected text is captured before the parser is
    closed; if closing raises, whatever is left in the parser's unprocessed
    buffer is appended verbatim so the remaining input is kept rather than
    lost.
    """
    s = MLStripper()
    s.feed(value)
    data = s.get_data()
    try:
        s.close()
    except Exception:
        # Deliberate best effort: a parse failure at close time must not
        # propagate; keep the raw, unparsed remainder of the input instead.
        data += s.rawdata
    return data
strip_tags = allow_lazy(strip_tags)
def remove_tags(html, tags): def remove_tags(html, tags):

View File

@ -5,6 +5,7 @@ import os
from django.utils import html from django.utils import html
from django.utils._os import upath from django.utils._os import upath
from django.utils.encoding import force_text
from django.utils.unittest import TestCase from django.utils.unittest import TestCase
@ -63,10 +64,12 @@ class TestUtilsHtml(TestCase):
def test_strip_tags(self): def test_strip_tags(self):
f = html.strip_tags f = html.strip_tags
items = ( items = (
('<p>See: &#39;&eacute; is an apostrophe followed by e acute</p>',
'See: &#39;&eacute; is an apostrophe followed by e acute'),
('<adf>a', 'a'), ('<adf>a', 'a'),
('</adf>a', 'a'), ('</adf>a', 'a'),
('<asdf><asdf>e', 'e'), ('<asdf><asdf>e', 'e'),
('<f', '<f'), ('hi, <f x', 'hi, <f x'),
('</fe', '</fe'), ('</fe', '</fe'),
('<x>b<y>', 'b'), ('<x>b<y>', 'b'),
('a<p onclick="alert(\'<test>\')">b</p>c', 'abc'), ('a<p onclick="alert(\'<test>\')">b</p>c', 'abc'),
@ -81,8 +84,9 @@ class TestUtilsHtml(TestCase):
for filename in ('strip_tags1.html', 'strip_tags2.txt'): for filename in ('strip_tags1.html', 'strip_tags2.txt'):
path = os.path.join(os.path.dirname(upath(__file__)), 'files', filename) path = os.path.join(os.path.dirname(upath(__file__)), 'files', filename)
with open(path, 'r') as fp: with open(path, 'r') as fp:
content = force_text(fp.read())
start = datetime.now() start = datetime.now()
stripped = html.strip_tags(fp.read()) stripped = html.strip_tags(content)
elapsed = datetime.now() - start elapsed = datetime.now() - start
self.assertEqual(elapsed.seconds, 0) self.assertEqual(elapsed.seconds, 0)
self.assertIn("Please try again.", stripped) self.assertIn("Please try again.", stripped)