From dc51ec8bc214cf60ebb99732363624c23df8005f Mon Sep 17 00:00:00 2001 From: Claude Paroz Date: Wed, 22 May 2013 17:29:16 +0200 Subject: [PATCH] Fixed #19237 -- Used HTML parser to strip tags The regex method used until now for the strip_tags utility is fast, but subject to flaws and security issues. Consensus and good practice lead us to use a slower but safer method. --- django/utils/html.py | 28 ++++++++++++++++++++++++++-- tests/utils_tests/test_html.py | 8 ++++++-- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/django/utils/html.py b/django/utils/html.py index edddc48e62..573235092d 100644 --- a/django/utils/html.py +++ b/django/utils/html.py @@ -16,6 +16,9 @@ from django.utils.functional import allow_lazy from django.utils import six from django.utils.text import normalize_newlines +from .html_parser import HTMLParser + + # Configuration for urlize() function. TRAILING_PUNCTUATION = ['.', ',', ':', ';', '.)'] WRAPPING_PUNCTUATION = [('(', ')'), ('<', '>'), ('[', ']'), ('<', '>')] @@ -33,7 +36,6 @@ link_target_attribute_re = re.compile(r'(]*?)target=[^\s>]+') html_gunk_re = re.compile(r'(?:
|<\/i>|<\/b>|<\/em>|<\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE) hard_coded_bullets_re = re.compile(r'((?:

(?:%s).*?[a-zA-Z].*?

\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL) trailing_empty_content_re = re.compile(r'(?:

(?: |\s|
)*?

\s*)+\Z') -strip_tags_re = re.compile(r']*=(\s*"[^"]*"|\s*\'[^\']*\'|\S*)|[^>])*?>', re.IGNORECASE) def escape(text): @@ -116,9 +118,31 @@ def linebreaks(value, autoescape=False): return '\n\n'.join(paras) linebreaks = allow_lazy(linebreaks, six.text_type) + +class MLStripper(HTMLParser): + def __init__(self): + HTMLParser.__init__(self) + self.reset() + self.fed = [] + def handle_data(self, d): + self.fed.append(d) + def handle_entityref(self, name): + self.fed.append('&%s;' % name) + def handle_charref(self, name): + self.fed.append('&#%s;' % name) + def get_data(self): + return ''.join(self.fed) + def strip_tags(value): """Returns the given HTML with all tags stripped.""" - return strip_tags_re.sub('', force_text(value)) + s = MLStripper() + s.feed(value) + data = s.get_data() + try: + res = s.close() + except Exception as e: + data += s.rawdata + return data strip_tags = allow_lazy(strip_tags) def remove_tags(html, tags): diff --git a/tests/utils_tests/test_html.py b/tests/utils_tests/test_html.py index 090cc32d1c..c3e9f7c878 100644 --- a/tests/utils_tests/test_html.py +++ b/tests/utils_tests/test_html.py @@ -5,6 +5,7 @@ import os from django.utils import html from django.utils._os import upath +from django.utils.encoding import force_text from django.utils.unittest import TestCase @@ -63,10 +64,12 @@ class TestUtilsHtml(TestCase): def test_strip_tags(self): f = html.strip_tags items = ( + ('

See: 'é is an apostrophe followed by e acute

', + 'See: 'é is an apostrophe followed by e acute'), ('a', 'a'), ('a', 'a'), ('e', 'e'), - ('b', 'b'), ('a

b

c', 'abc'), @@ -81,8 +84,9 @@ class TestUtilsHtml(TestCase): for filename in ('strip_tags1.html', 'strip_tags2.txt'): path = os.path.join(os.path.dirname(upath(__file__)), 'files', filename) with open(path, 'r') as fp: + content = force_text(fp.read()) start = datetime.now() - stripped = html.strip_tags(fp.read()) + stripped = html.strip_tags(content) elapsed = datetime.now() - start self.assertEqual(elapsed.seconds, 0) self.assertIn("Please try again.", stripped)