From d1503afd66ca8f2f8d3819ba8a60727e0ee66cec Mon Sep 17 00:00:00 2001 From: Claude Paroz Date: Thu, 20 Mar 2014 16:50:50 +0100 Subject: [PATCH] [1.6.x] Improved strip_tags and clarified documentation The fact that strip_tags cannot guarantee to really strip all non-safe HTML content was not clear enough. Also see: https://www.djangoproject.com/weblog/2014/mar/22/strip-tags-advisory/ Backport of 6ca6c36f8 from master. --- django/utils/html.py | 32 ++++++++++++++++++++++++++++---- docs/ref/templates/builtins.txt | 12 +++++++++++- docs/ref/utils.txt | 18 ++++++++++++------ tests/utils_tests/test_html.py | 2 ++ 4 files changed, 53 insertions(+), 11 deletions(-) diff --git a/django/utils/html.py b/django/utils/html.py index b55a2234b58..5d96f15daae 100644 --- a/django/utils/html.py +++ b/django/utils/html.py @@ -115,7 +115,10 @@ linebreaks = allow_lazy(linebreaks, six.text_type) class MLStripper(HTMLParser): def __init__(self): - HTMLParser.__init__(self) + if six.PY2: + HTMLParser.__init__(self) + else: + HTMLParser.__init__(self, strict=False) self.reset() self.fed = [] def handle_data(self, d): @@ -127,16 +130,37 @@ class MLStripper(HTMLParser): def get_data(self): return ''.join(self.fed) -def strip_tags(value): - """Returns the given HTML with all tags stripped.""" + +def _strip_once(value): + """ + Internal tag stripping utility used by strip_tags. 
+ """ s = MLStripper() try: s.feed(value) - s.close() except HTMLParseError: return value + try: + s.close() + except (HTMLParseError, UnboundLocalError) as err: + # UnboundLocalError because of http://bugs.python.org/issue17802 + # on Python 3.2, triggered by strict=False mode of HTMLParser + return s.get_data() + s.rawdata else: return s.get_data() + + +def strip_tags(value): + """Returns the given HTML with all tags stripped.""" + while True: + if not ('<' in value or '>' in value): + return value + new_value = _strip_once(value) + if new_value == value: + # _strip_once was not able to detect more tags + return value + else: + value = new_value strip_tags = allow_lazy(strip_tags) def remove_tags(html, tags): diff --git a/docs/ref/templates/builtins.txt b/docs/ref/templates/builtins.txt index 62568df2eb6..6d6c159fe3c 100644 --- a/docs/ref/templates/builtins.txt +++ b/docs/ref/templates/builtins.txt @@ -2012,7 +2012,7 @@ If ``value`` is ``10``, the output will be ``1.000000E+01``. striptags ^^^^^^^^^ -Strips all [X]HTML tags. +Makes all possible efforts to strip all [X]HTML tags. For example:: @@ -2021,6 +2021,16 @@ For example:: If ``value`` is ``"Joel a slug"``, the output will be ``"Joel is a slug"``. +.. admonition:: No safety guarantee + + Note that ``striptags`` doesn't give any guarantee about its output being + entirely HTML safe, particularly with non valid HTML input. So **NEVER** + apply the ``safe`` filter to a ``striptags`` output. + If you are looking for something more robust, you can use the ``bleach`` + Python library, notably its `clean`_ method. + +.. _clean: http://bleach.readthedocs.org/en/latest/clean.html + .. templatefilter:: time time diff --git a/docs/ref/utils.txt b/docs/ref/utils.txt index c75f38566ac..107f6fd5911 100644 --- a/docs/ref/utils.txt +++ b/docs/ref/utils.txt @@ -616,17 +616,23 @@ escaping HTML. .. 
function:: strip_tags(value) - Removes anything that looks like an html tag from the string, that is - anything contained within ``<>``. + Tries to remove anything that looks like an HTML tag from the string, that + is anything contained within ``<>``. + Absolutely NO guarantee is provided about the resulting string being entirely + HTML safe. So NEVER mark safe the result of a ``strip_tags`` call without + escaping it first, for example with :func:`~django.utils.html.escape`. For example:: strip_tags(value) - If ``value`` is ``"<b>Joel</b> <button>is</button> a <span>slug</span>"`` the - return value will be ``"Joel is a slug"``. Note that ``strip_tags`` result - may still contain unsafe HTML content, so you might use - :func:`~django.utils.html.escape` to make it a safe string. + If ``value`` is ``"<b>Joel</b> <button>is</button> a <span>slug</span>"`` + the return value will be ``"Joel is a slug"``. + + If you are looking for a more robust solution, take a look at the `bleach`_ + Python library. + + .. _bleach: https://pypi.python.org/pypi/bleach .. versionchanged:: 1.6 diff --git a/tests/utils_tests/test_html.py b/tests/utils_tests/test_html.py index ba8f29e3ae6..b4e3d28db98 100644 --- a/tests/utils_tests/test_html.py +++ b/tests/utils_tests/test_html.py @@ -80,6 +80,8 @@ class TestUtilsHtml(TestCase): ('a

b

c', 'abc'), ('de

f', 'def'), ('foobar', 'foobar'), + ('ript>test</script>', 'test'), + ('&h', 'alert()&h'), ) for value, output in items: self.check_output(f, value, output)