From 5a0b72a6eb41a66af14d6256fa382380399eabfb Mon Sep 17 00:00:00 2001 From: Malcolm Tredinnick Date: Sat, 10 Feb 2007 02:51:27 +0000 Subject: [PATCH] Fixed #2027 -- added truncatewords_html filter that respects HTML tags whilst truncating. Patch from SmileyChris. git-svn-id: http://code.djangoproject.com/svn/django/trunk@4468 bcc190cf-cafb-0310-a4f2-bffc1f526a37 --- django/template/defaultfilters.py | 16 +++++ django/utils/text.py | 60 +++++++++++++++++++ tests/regressiontests/defaultfilters/tests.py | 14 +++++ 3 files changed, 90 insertions(+) diff --git a/django/template/defaultfilters.py b/django/template/defaultfilters.py index 1d0f78ce12..b0cfcfdeb9 100644 --- a/django/template/defaultfilters.py +++ b/django/template/defaultfilters.py @@ -119,6 +119,21 @@ def truncatewords(value, arg): value = str(value) return truncate_words(value, length) +def truncatewords_html(value, arg): + """ + Truncates HTML after a certain number of words + + Argument: Number of words to truncate after + """ + from django.utils.text import truncate_html_words + try: + length = int(arg) + except ValueError: # invalid literal for int() + return value # Fail silently. + if not isinstance(value, basestring): + value = str(value) + return truncate_html_words(value, length) + def upper(value): "Converts a string into all uppercase" return value.upper() @@ -534,6 +549,7 @@ register.filter(timesince) register.filter(timeuntil) register.filter(title) register.filter(truncatewords) +register.filter(truncatewords_html) register.filter(unordered_list) register.filter(upper) register.filter(urlencode) diff --git a/django/utils/text.py b/django/utils/text.py index 217f42491b..1c1c456e2d 100644 --- a/django/utils/text.py +++ b/django/utils/text.py @@ -41,6 +41,66 @@ def truncate_words(s, num): words.append('...') return ' '.join(words) +def truncate_html_words(s, num): + """ + Truncates html to a certain number of words (not counting tags and comments). 
+ Closes opened tags if they were correctly closed in the given html. + """ + length = int(num) + if length <= 0: + return '' + html4_singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area', 'hr', 'input') + # Set up regular expressions + re_words = re.compile(r'&.*?;|<.*?>|([A-Za-z0-9][\w-]*)') + re_tag = re.compile(r'<(/)?([^ ]+?)(?: (/)| .*?)?>') + # Count non-HTML words and keep note of open tags + pos = 0 + ellipsis_pos = 0 + words = 0 + open_tags = [] + while words <= length: + m = re_words.search(s, pos) + if not m: + # Checked through whole string + break + pos = m.end(0) + if m.group(1): + # It's an actual non-HTML word + words += 1 + if words == length: + ellipsis_pos = pos + continue + # Check for tag + tag = re_tag.match(m.group(0)) + if not tag or ellipsis_pos: + # Don't worry about non tags or tags after our truncate point + continue + closing_tag, tagname, self_closing = tag.groups() + tagname = tagname.lower() # Element names are always case-insensitive + if self_closing or tagname in html4_singlets: + pass + elif closing_tag: + # Check for match in open tags list + try: + i = open_tags.index(tagname) + except ValueError: + pass + else: + # SGML: An end tag closes, back to the matching start tag, all unclosed intervening start tags with omitted end tags + open_tags = open_tags[i+1:] + else: + # Add it to the start of the open tags list + open_tags.insert(0, tagname) + if words <= length: + # Don't try to close tags if we don't need to truncate + return s + out = s[:ellipsis_pos] + ' ...' 
+ # Close any tags still open + for tag in open_tags: + out += '</%s>' % tag + # Return string + return out + def get_valid_filename(s): """ Returns the given string converted to a string that can be used for a clean diff --git a/tests/regressiontests/defaultfilters/tests.py b/tests/regressiontests/defaultfilters/tests.py index 439a40c31b..481557b782 100644 --- a/tests/regressiontests/defaultfilters/tests.py +++ b/tests/regressiontests/defaultfilters/tests.py @@ -87,6 +87,20 @@ u'\xeb' >>> truncatewords('A sentence with a few words in it', 'not a number') 'A sentence with a few words in it' +>>> truncatewords_html('

one two - three
four
five

', 0) +'' + +>>> truncatewords_html('

one two - three
four
five

', 2) +'

one two ...

' + +>>> truncatewords_html('

one two - three
four
five

', 4) +'

one two - three
four ...

' + +>>> truncatewords_html('

one two - three
four
five

', 5) +'

one two - three
four
five

' + +>>> truncatewords_html('

one two - three
four
five

', 100) +'

one two - three
four
five

' >>> upper('Mixed case input') 'MIXED CASE INPUT'