Improved strip_tags and clarified documentation
The fact that strip_tags cannot guarantee to really strip all non-safe HTML content was not clear enough. Also see: https://www.djangoproject.com/weblog/2014/mar/22/strip-tags-advisory/
This commit is contained in:
parent
aaa2110259
commit
6ca6c36f82
|
@ -118,7 +118,10 @@ linebreaks = allow_lazy(linebreaks, six.text_type)
|
||||||
|
|
||||||
class MLStripper(HTMLParser):
|
class MLStripper(HTMLParser):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
HTMLParser.__init__(self)
|
if six.PY2:
|
||||||
|
HTMLParser.__init__(self)
|
||||||
|
else:
|
||||||
|
HTMLParser.__init__(self, strict=False)
|
||||||
self.reset()
|
self.reset()
|
||||||
self.fed = []
|
self.fed = []
|
||||||
|
|
||||||
|
@ -135,16 +138,36 @@ class MLStripper(HTMLParser):
|
||||||
return ''.join(self.fed)
|
return ''.join(self.fed)
|
||||||
|
|
||||||
|
|
||||||
def strip_tags(value):
|
def _strip_once(value):
|
||||||
"""Returns the given HTML with all tags stripped."""
|
"""
|
||||||
|
Internal tag stripping utility used by strip_tags.
|
||||||
|
"""
|
||||||
s = MLStripper()
|
s = MLStripper()
|
||||||
try:
|
try:
|
||||||
s.feed(value)
|
s.feed(value)
|
||||||
s.close()
|
|
||||||
except HTMLParseError:
|
except HTMLParseError:
|
||||||
return value
|
return value
|
||||||
|
try:
|
||||||
|
s.close()
|
||||||
|
except (HTMLParseError, UnboundLocalError) as err:
|
||||||
|
# UnboundLocalError because of http://bugs.python.org/issue17802
|
||||||
|
# on Python 3.2, triggered by strict=False mode of HTMLParser
|
||||||
|
return s.get_data() + s.rawdata
|
||||||
else:
|
else:
|
||||||
return s.get_data()
|
return s.get_data()
|
||||||
|
|
||||||
|
|
||||||
|
def strip_tags(value):
|
||||||
|
"""Returns the given HTML with all tags stripped."""
|
||||||
|
while True:
|
||||||
|
if not ('<' in value or '>' in value):
|
||||||
|
return value
|
||||||
|
new_value = _strip_once(value)
|
||||||
|
if new_value == value:
|
||||||
|
# _strip_once was not able to detect more tags
|
||||||
|
return value
|
||||||
|
else:
|
||||||
|
value = new_value
|
||||||
strip_tags = allow_lazy(strip_tags)
|
strip_tags = allow_lazy(strip_tags)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1985,7 +1985,7 @@ If ``value`` is ``10``, the output will be ``1.000000E+01``.
|
||||||
striptags
|
striptags
|
||||||
^^^^^^^^^
|
^^^^^^^^^
|
||||||
|
|
||||||
Strips all [X]HTML tags.
|
Makes all possible efforts to strip all [X]HTML tags.
|
||||||
|
|
||||||
For example::
|
For example::
|
||||||
|
|
||||||
|
@ -1994,6 +1994,16 @@ For example::
|
||||||
If ``value`` is ``"<b>Joel</b> <button>is</button> a <span>slug</span>"``, the
|
If ``value`` is ``"<b>Joel</b> <button>is</button> a <span>slug</span>"``, the
|
||||||
output will be ``"Joel is a slug"``.
|
output will be ``"Joel is a slug"``.
|
||||||
|
|
||||||
|
.. admonition:: No safety guarantee
|
||||||
|
|
||||||
|
Note that ``striptags`` doesn't give any guarantee about its output being
|
||||||
|
entirely HTML safe, particularly with invalid HTML input. So **NEVER**
|
||||||
|
apply the ``safe`` filter to a ``striptags`` output.
|
||||||
|
If you are looking for something more robust, you can use the ``bleach``
|
||||||
|
Python library, notably its `clean`_ method.
|
||||||
|
|
||||||
|
.. _clean: http://bleach.readthedocs.org/en/latest/clean.html
|
||||||
|
|
||||||
.. templatefilter:: time
|
.. templatefilter:: time
|
||||||
|
|
||||||
time
|
time
|
||||||
|
|
|
@ -595,17 +595,23 @@ escaping HTML.
|
||||||
|
|
||||||
.. function:: strip_tags(value)
|
.. function:: strip_tags(value)
|
||||||
|
|
||||||
Removes anything that looks like an html tag from the string, that is
|
Tries to remove anything that looks like an HTML tag from the string, that
|
||||||
anything contained within ``<>``.
|
is, anything contained within ``<>``.
|
||||||
|
Absolutely NO guarantee is provided about the resulting string being entirely
|
||||||
|
HTML safe. So NEVER mark safe the result of a ``strip_tags`` call without
|
||||||
|
escaping it first, for example with :func:`~django.utils.html.escape`.
|
||||||
|
|
||||||
For example::
|
For example::
|
||||||
|
|
||||||
strip_tags(value)
|
strip_tags(value)
|
||||||
|
|
||||||
If ``value`` is ``"<b>Joel</b> <button>is</button> a <span>slug</span>"``
|
If ``value`` is ``"<b>Joel</b> <button>is</button> a <span>slug</span>"``
|
||||||
the return value will be ``"Joel is a slug"``. Note that ``strip_tags``
|
the return value will be ``"Joel is a slug"``.
|
||||||
result may still contain unsafe HTML content, so you might use
|
|
||||||
:func:`~django.utils.html.escape` to make it a safe string.
|
If you are looking for a more robust solution, take a look at the `bleach`_
|
||||||
|
Python library.
|
||||||
|
|
||||||
|
.. _bleach: https://pypi.python.org/pypi/bleach
|
||||||
|
|
||||||
.. versionchanged:: 1.6
|
.. versionchanged:: 1.6
|
||||||
|
|
||||||
|
|
|
@ -80,6 +80,8 @@ class TestUtilsHtml(TestCase):
|
||||||
('a<p a >b</p>c', 'abc'),
|
('a<p a >b</p>c', 'abc'),
|
||||||
('d<a:b c:d>e</p>f', 'def'),
|
('d<a:b c:d>e</p>f', 'def'),
|
||||||
('<strong>foo</strong><a href="http://example.com">bar</a>', 'foobar'),
|
('<strong>foo</strong><a href="http://example.com">bar</a>', 'foobar'),
|
||||||
|
('<sc<!-- -->ript>test<<!-- -->/script>', 'test'),
|
||||||
|
('<script>alert()</script>&h', 'alert()&h'),
|
||||||
)
|
)
|
||||||
for value, output in items:
|
for value, output in items:
|
||||||
self.check_output(f, value, output)
|
self.check_output(f, value, output)
|
||||||
|
|
Loading…
Reference in New Issue