Fixed #19237 -- Used HTML parser to strip tags
The regex method used until now for the strip_tags utility is fast, but subject to flaws and security issues. Consensus and good practice lead us to use a slower but safer method.
This commit is contained in:
parent
01948e384f
commit
dc51ec8bc2
|
@ -16,6 +16,9 @@ from django.utils.functional import allow_lazy
|
||||||
from django.utils import six
|
from django.utils import six
|
||||||
from django.utils.text import normalize_newlines
|
from django.utils.text import normalize_newlines
|
||||||
|
|
||||||
|
from .html_parser import HTMLParser
|
||||||
|
|
||||||
|
|
||||||
# Configuration for urlize() function.
|
# Configuration for urlize() function.
|
||||||
TRAILING_PUNCTUATION = ['.', ',', ':', ';', '.)']
|
TRAILING_PUNCTUATION = ['.', ',', ':', ';', '.)']
|
||||||
WRAPPING_PUNCTUATION = [('(', ')'), ('<', '>'), ('[', ']'), ('&lt;', '&gt;')]
|
WRAPPING_PUNCTUATION = [('(', ')'), ('<', '>'), ('[', ']'), ('&lt;', '&gt;')]
|
||||||
|
@ -33,7 +36,6 @@ link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
|
||||||
html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
|
html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
|
||||||
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
|
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
|
||||||
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
|
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
|
||||||
strip_tags_re = re.compile(r'</?\S([^=>]*=(\s*"[^"]*"|\s*\'[^\']*\'|\S*)|[^>])*?>', re.IGNORECASE)
|
|
||||||
|
|
||||||
|
|
||||||
def escape(text):
|
def escape(text):
|
||||||
|
@ -116,9 +118,31 @@ def linebreaks(value, autoescape=False):
|
||||||
return '\n\n'.join(paras)
|
return '\n\n'.join(paras)
|
||||||
linebreaks = allow_lazy(linebreaks, six.text_type)
|
linebreaks = allow_lazy(linebreaks, six.text_type)
|
||||||
|
|
||||||
|
|
||||||
|
class MLStripper(HTMLParser):
|
||||||
|
def __init__(self):
|
||||||
|
HTMLParser.__init__(self)
|
||||||
|
self.reset()
|
||||||
|
self.fed = []
|
||||||
|
def handle_data(self, d):
|
||||||
|
self.fed.append(d)
|
||||||
|
def handle_entityref(self, name):
|
||||||
|
self.fed.append('&%s;' % name)
|
||||||
|
def handle_charref(self, name):
|
||||||
|
self.fed.append('&#%s;' % name)
|
||||||
|
def get_data(self):
|
||||||
|
return ''.join(self.fed)
|
||||||
|
|
||||||
def strip_tags(value):
|
def strip_tags(value):
|
||||||
"""Returns the given HTML with all tags stripped."""
|
"""Returns the given HTML with all tags stripped."""
|
||||||
return strip_tags_re.sub('', force_text(value))
|
s = MLStripper()
|
||||||
|
s.feed(value)
|
||||||
|
data = s.get_data()
|
||||||
|
try:
|
||||||
|
res = s.close()
|
||||||
|
except Exception as e:
|
||||||
|
data += s.rawdata
|
||||||
|
return data
|
||||||
strip_tags = allow_lazy(strip_tags)
|
strip_tags = allow_lazy(strip_tags)
|
||||||
|
|
||||||
def remove_tags(html, tags):
|
def remove_tags(html, tags):
|
||||||
|
|
|
@ -5,6 +5,7 @@ import os
|
||||||
|
|
||||||
from django.utils import html
|
from django.utils import html
|
||||||
from django.utils._os import upath
|
from django.utils._os import upath
|
||||||
|
from django.utils.encoding import force_text
|
||||||
from django.utils.unittest import TestCase
|
from django.utils.unittest import TestCase
|
||||||
|
|
||||||
|
|
||||||
|
@ -63,10 +64,12 @@ class TestUtilsHtml(TestCase):
|
||||||
def test_strip_tags(self):
|
def test_strip_tags(self):
|
||||||
f = html.strip_tags
|
f = html.strip_tags
|
||||||
items = (
|
items = (
|
||||||
|
('<p>See: &#39;&eacute; is an apostrophe followed by e acute</p>',
|
||||||
|
'See: &#39;&eacute; is an apostrophe followed by e acute'),
|
||||||
('<adf>a', 'a'),
|
('<adf>a', 'a'),
|
||||||
('</adf>a', 'a'),
|
('</adf>a', 'a'),
|
||||||
('<asdf><asdf>e', 'e'),
|
('<asdf><asdf>e', 'e'),
|
||||||
('<f', '<f'),
|
('hi, <f x', 'hi, <f x'),
|
||||||
('</fe', '</fe'),
|
('</fe', '</fe'),
|
||||||
('<x>b<y>', 'b'),
|
('<x>b<y>', 'b'),
|
||||||
('a<p onclick="alert(\'<test>\')">b</p>c', 'abc'),
|
('a<p onclick="alert(\'<test>\')">b</p>c', 'abc'),
|
||||||
|
@ -81,8 +84,9 @@ class TestUtilsHtml(TestCase):
|
||||||
for filename in ('strip_tags1.html', 'strip_tags2.txt'):
|
for filename in ('strip_tags1.html', 'strip_tags2.txt'):
|
||||||
path = os.path.join(os.path.dirname(upath(__file__)), 'files', filename)
|
path = os.path.join(os.path.dirname(upath(__file__)), 'files', filename)
|
||||||
with open(path, 'r') as fp:
|
with open(path, 'r') as fp:
|
||||||
|
content = force_text(fp.read())
|
||||||
start = datetime.now()
|
start = datetime.now()
|
||||||
stripped = html.strip_tags(fp.read())
|
stripped = html.strip_tags(content)
|
||||||
elapsed = datetime.now() - start
|
elapsed = datetime.now() - start
|
||||||
self.assertEqual(elapsed.seconds, 0)
|
self.assertEqual(elapsed.seconds, 0)
|
||||||
self.assertIn("Please try again.", stripped)
|
self.assertIn("Please try again.", stripped)
|
||||||
|
|
Loading…
Reference in New Issue