Fixed #19237 -- Improved strip_tags utility

The previous pattern didn't properly address cases where '>'
was present inside quoted tag content.
This commit is contained in:
Chris Khoo 2012-11-24 12:10:25 +01:00 committed by Claude Paroz
parent be64dd35fb
commit bf1871d874
2 changed files with 5 additions and 1 deletions

View File

@ -33,6 +33,7 @@ link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
strip_tags_re = re.compile(r'</?\S([^=]*=(\s*"[^"]*"|\s*\'[^\']*\'|\S*)|[^>])*?>', re.IGNORECASE)
def escape(text):
@ -117,7 +118,7 @@ linebreaks = allow_lazy(linebreaks, six.text_type)
def strip_tags(value):
"""Returns the given HTML with all tags stripped."""
return re.sub(r'<[^>]*?>', '', force_text(value))
return strip_tags_re.sub('', force_text(value))
strip_tags = allow_lazy(strip_tags)
def remove_tags(html, tags):

View File

@ -65,6 +65,9 @@ class TestUtilsHtml(unittest.TestCase):
('<f', '<f'),
('</fe', '</fe'),
('<x>b<y>', 'b'),
('a<p onclick="alert(\'<test>\')">b</p>c', 'abc'),
('a<p a >b</p>c', 'abc'),
('d<a:b c:d>e</p>f', 'def'),
)
for value, output in items:
self.check_output(f, value, output)