<\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
-strip_tags_re = re.compile(r'</?\S([^=>]*=(\s*"[^"]*"|\s*\'[^\']*\'|\S*)|[^>])*?>', re.IGNORECASE)
def escape(text):
@@ -116,9 +118,31 @@ def linebreaks(value, autoescape=False):
return '\n\n'.join(paras)
linebreaks = allow_lazy(linebreaks, six.text_type)
+
+class MLStripper(HTMLParser):
+ def __init__(self):
+ HTMLParser.__init__(self)
+ self.reset()
+ self.fed = []
+ def handle_data(self, d):
+ self.fed.append(d)
+ def handle_entityref(self, name):
+ self.fed.append('&%s;' % name)
+ def handle_charref(self, name):
+ self.fed.append('&#%s;' % name)
+ def get_data(self):
+ return ''.join(self.fed)
+
def strip_tags(value):
"""Returns the given HTML with all tags stripped."""
- return strip_tags_re.sub('', force_text(value))
+ s = MLStripper()
+ s.feed(value)
+ data = s.get_data()
+ try:
+ res = s.close()
+ except Exception as e:
+ data += s.rawdata
+ return data
strip_tags = allow_lazy(strip_tags)
def remove_tags(html, tags):
diff --git a/tests/utils_tests/test_html.py b/tests/utils_tests/test_html.py
index 090cc32d1c..c3e9f7c878 100644
--- a/tests/utils_tests/test_html.py
+++ b/tests/utils_tests/test_html.py
@@ -5,6 +5,7 @@ import os
from django.utils import html
from django.utils._os import upath
+from django.utils.encoding import force_text
from django.utils.unittest import TestCase
@@ -63,10 +64,12 @@ class TestUtilsHtml(TestCase):
def test_strip_tags(self):
f = html.strip_tags
items = (
+ ('<p>See: &#39;&eacute; is an apostrophe followed by e acute</p>',
+ 'See: &#39;&eacute; is an apostrophe followed by e acute'),
('a', 'a'),
('a', 'a'),
('e', 'e'),
- ('b', 'b'),
('ab
c', 'abc'),
@@ -81,8 +84,9 @@ class TestUtilsHtml(TestCase):
for filename in ('strip_tags1.html', 'strip_tags2.txt'):
path = os.path.join(os.path.dirname(upath(__file__)), 'files', filename)
with open(path, 'r') as fp:
+ content = force_text(fp.read())
start = datetime.now()
- stripped = html.strip_tags(fp.read())
+ stripped = html.strip_tags(content)
elapsed = datetime.now() - start
self.assertEqual(elapsed.seconds, 0)
self.assertIn("Please try again.", stripped)