Improved strip_tags and clarified documentation
The fact that strip_tags cannot guarantee to really strip all non-safe HTML content was not clear enough. Also see: https://www.djangoproject.com/weblog/2014/mar/22/strip-tags-advisory/
This commit is contained in:
parent
aaa2110259
commit
6ca6c36f82
|
@ -118,7 +118,10 @@ linebreaks = allow_lazy(linebreaks, six.text_type)
|
|||
|
||||
class MLStripper(HTMLParser):
|
||||
def __init__(self):
|
||||
HTMLParser.__init__(self)
|
||||
if six.PY2:
|
||||
HTMLParser.__init__(self)
|
||||
else:
|
||||
HTMLParser.__init__(self, strict=False)
|
||||
self.reset()
|
||||
self.fed = []
|
||||
|
||||
|
@ -135,16 +138,36 @@ class MLStripper(HTMLParser):
|
|||
return ''.join(self.fed)
|
||||
|
||||
|
||||
def strip_tags(value):
|
||||
"""Returns the given HTML with all tags stripped."""
|
||||
def _strip_once(value):
|
||||
"""
|
||||
Internal tag stripping utility used by strip_tags.
|
||||
"""
|
||||
s = MLStripper()
|
||||
try:
|
||||
s.feed(value)
|
||||
s.close()
|
||||
except HTMLParseError:
|
||||
return value
|
||||
try:
|
||||
s.close()
|
||||
except (HTMLParseError, UnboundLocalError) as err:
|
||||
# UnboundLocalError because of http://bugs.python.org/issue17802
|
||||
# on Python 3.2, triggered by strict=False mode of HTMLParser
|
||||
return s.get_data() + s.rawdata
|
||||
else:
|
||||
return s.get_data()
|
||||
|
||||
|
||||
def strip_tags(value):
|
||||
"""Returns the given HTML with all tags stripped."""
|
||||
while True:
|
||||
if not ('<' in value or '>' in value):
|
||||
return value
|
||||
new_value = _strip_once(value)
|
||||
if new_value == value:
|
||||
# _strip_once was not able to detect more tags
|
||||
return value
|
||||
else:
|
||||
value = new_value
|
||||
strip_tags = allow_lazy(strip_tags)
|
||||
|
||||
|
||||
|
|
|
@ -1985,7 +1985,7 @@ If ``value`` is ``10``, the output will be ``1.000000E+01``.
|
|||
striptags
|
||||
^^^^^^^^^
|
||||
|
||||
Strips all [X]HTML tags.
|
||||
Makes all possible efforts to strip all [X]HTML tags.
|
||||
|
||||
For example::
|
||||
|
||||
|
@ -1994,6 +1994,16 @@ For example::
|
|||
If ``value`` is ``"<b>Joel</b> <button>is</button> a <span>slug</span>"``, the
|
||||
output will be ``"Joel is a slug"``.
|
||||
|
||||
.. admonition:: No safety guarantee
|
||||
|
||||
Note that ``striptags`` doesn't give any guarantee about its output being
|
||||
entirely HTML safe, particularly with invalid HTML input. So **NEVER**
|
||||
apply the ``safe`` filter to a ``striptags`` output.
|
||||
If you are looking for something more robust, you can use the ``bleach``
|
||||
Python library, notably its `clean`_ method.
|
||||
|
||||
.. _clean: http://bleach.readthedocs.org/en/latest/clean.html
|
||||
|
||||
.. templatefilter:: time
|
||||
|
||||
time
|
||||
|
|
|
@ -595,17 +595,23 @@ escaping HTML.
|
|||
|
||||
.. function:: strip_tags(value)
|
||||
|
||||
Removes anything that looks like an html tag from the string, that is
|
||||
anything contained within ``<>``.
|
||||
Tries to remove anything that looks like an HTML tag from the string, that
|
||||
is anything contained within ``<>``.
|
||||
Absolutely NO guarantee is provided about the resulting string being entirely
|
||||
HTML safe. So NEVER mark safe the result of a ``strip_tags`` call without
|
||||
escaping it first, for example with :func:`~django.utils.html.escape`.
|
||||
|
||||
For example::
|
||||
|
||||
strip_tags(value)
|
||||
|
||||
If ``value`` is ``"<b>Joel</b> <button>is</button> a <span>slug</span>"``
|
||||
the return value will be ``"Joel is a slug"``. Note that ``strip_tags``
|
||||
result may still contain unsafe HTML content, so you might use
|
||||
:func:`~django.utils.html.escape` to make it a safe string.
|
||||
the return value will be ``"Joel is a slug"``.
|
||||
|
||||
If you are looking for a more robust solution, take a look at the `bleach`_
|
||||
Python library.
|
||||
|
||||
.. _bleach: https://pypi.python.org/pypi/bleach
|
||||
|
||||
.. versionchanged:: 1.6
|
||||
|
||||
|
|
|
@ -80,6 +80,8 @@ class TestUtilsHtml(TestCase):
|
|||
('a<p a >b</p>c', 'abc'),
|
||||
('d<a:b c:d>e</p>f', 'def'),
|
||||
('<strong>foo</strong><a href="http://example.com">bar</a>', 'foobar'),
|
||||
('<sc<!-- -->ript>test<<!-- -->/script>', 'test'),
|
||||
('<script>alert()</script>&h', 'alert()&h'),
|
||||
)
|
||||
for value, output in items:
|
||||
self.check_output(f, value, output)
|
||||
|
|
Loading…
Reference in New Issue