From d1503afd66ca8f2f8d3819ba8a60727e0ee66cec Mon Sep 17 00:00:00 2001
From: Claude Paroz <claude@2xlibre.net>
Date: Thu, 20 Mar 2014 16:50:50 +0100
Subject: [PATCH] [1.6.x] Improved strip_tags and clarified documentation

The fact that strip_tags cannot guarantee to really strip all
non-safe HTML content was not clear enough. Also see:
https://www.djangoproject.com/weblog/2014/mar/22/strip-tags-advisory/
Backport of 6ca6c36f8 from master.
---
 django/utils/html.py            | 32 ++++++++++++++++++++++++++++----
 docs/ref/templates/builtins.txt | 12 +++++++++++-
 docs/ref/utils.txt              | 18 ++++++++++++------
 tests/utils_tests/test_html.py  |  2 ++
 4 files changed, 53 insertions(+), 11 deletions(-)
diff --git a/django/utils/html.py b/django/utils/html.py
index b55a2234b58..5d96f15daae 100644
--- a/django/utils/html.py
+++ b/django/utils/html.py
@@ -115,7 +115,10 @@ linebreaks = allow_lazy(linebreaks, six.text_type)
 
 class MLStripper(HTMLParser):
     def __init__(self):
-        HTMLParser.__init__(self)
+        if six.PY2:
+            HTMLParser.__init__(self)
+        else:
+            HTMLParser.__init__(self, strict=False)
         self.reset()
         self.fed = []
     def handle_data(self, d):
@@ -127,16 +130,37 @@ class MLStripper(HTMLParser):
     def get_data(self):
         return ''.join(self.fed)
 
-def strip_tags(value):
-    """Returns the given HTML with all tags stripped."""
+
+def _strip_once(value):
+    """
+    Internal tag stripping utility used by strip_tags.
+    """
     s = MLStripper()
     try:
         s.feed(value)
-        s.close()
     except HTMLParseError:
         return value
+    try:
+        s.close()
+    except (HTMLParseError, UnboundLocalError) as err:
+        # UnboundLocalError because of http://bugs.python.org/issue17802
+        # on Python 3.2, triggered by strict=False mode of HTMLParser
+        return s.get_data() + s.rawdata
     else:
         return s.get_data()
+
+
+def strip_tags(value):
+    """Returns the given HTML with all tags stripped."""
+    while True:
+        if not ('<' in value or '>' in value):
+            return value
+        new_value = _strip_once(value)
+        if new_value == value:
+            # _strip_once was not able to detect more tags
+            return value
+        else:
+            value = new_value
 strip_tags = allow_lazy(strip_tags)
 
 def remove_tags(html, tags):
diff --git a/docs/ref/templates/builtins.txt b/docs/ref/templates/builtins.txt
index 62568df2eb6..6d6c159fe3c 100644
--- a/docs/ref/templates/builtins.txt
+++ b/docs/ref/templates/builtins.txt
@@ -2012,7 +2012,7 @@ If ``value`` is ``10``, the output will be ``1.000000E+01``.
 striptags
 ^^^^^^^^^
 
-Strips all [X]HTML tags.
+Makes all possible efforts to strip all [X]HTML tags.
 
 For example::
 
@@ -2021,6 +2021,16 @@ For example::
 If ``value`` is ``"<b>Joel</b> <button>is</button> a <span>slug</span>"``, the
 output will be ``"Joel is a slug"``.
 
+.. admonition:: No safety guarantee
+
+    Note that ``striptags`` doesn't give any guarantee about its output being
+    entirely HTML safe, particularly with invalid HTML input. So **NEVER**
+    apply the ``safe`` filter to a ``striptags`` output.
+    If you are looking for something more robust, you can use the ``bleach``
+    Python library, notably its `clean`_ method.
+
+.. _clean: http://bleach.readthedocs.org/en/latest/clean.html
+
 .. templatefilter:: time
 
 time
diff --git a/docs/ref/utils.txt b/docs/ref/utils.txt
index c75f38566ac..107f6fd5911 100644
--- a/docs/ref/utils.txt
+++ b/docs/ref/utils.txt
@@ -616,17 +616,23 @@ escaping HTML.
 
 .. function:: strip_tags(value)
 
-    Removes anything that looks like an html tag from the string, that is
-    anything contained within ``<>``.
+    Tries to remove anything that looks like an HTML tag from the string, that
+    is anything contained within ``<>``.
+    Absolutely NO guarantee is provided about the resulting string being
+    entirely HTML safe. So NEVER mark safe the result of a ``strip_tags`` call
+    without escaping it first, for example with :func:`~django.utils.html.escape`.
 
     For example::
 
         strip_tags(value)
 
-    If ``value`` is ``"<b>Joel</b> <button>is</button> a <span>slug</span>"`` the
-    return value will be ``"Joel is a slug"``. Note that ``strip_tags`` result
-    may still contain unsafe HTML content, so you might use
-    :func:`~django.utils.html.escape` to make it a safe string.
+    If ``value`` is ``"<b>Joel</b> <button>is</button> a <span>slug</span>"``
+    the return value will be ``"Joel is a slug"``.
+
+    If you are looking for a more robust solution, take a look at the `bleach`_
+    Python library.
+
+    .. _bleach: https://pypi.python.org/pypi/bleach
 
     .. versionchanged:: 1.6
 
diff --git a/tests/utils_tests/test_html.py b/tests/utils_tests/test_html.py
index ba8f29e3ae6..b4e3d28db98 100644
--- a/tests/utils_tests/test_html.py
+++ b/tests/utils_tests/test_html.py
@@ -80,6 +80,8 @@ class TestUtilsHtml(TestCase):
             ('a<p a >b</p>c', 'abc'),
             ('d<a:b c:d>e</p>f', 'def'),
             ('<strong>foo</strong><a href="http://example.com">bar</a>', 'foobar'),
+            ('<sc<!-- -->ript>test<<!-- -->/script>', 'test'),
+            ('<script>alert()</script>&h', 'alert()&h'),
         )
         for value, output in items:
             self.check_output(f, value, output)