django1/django/utils/html_parser.py

from django.utils.six.moves import html_parser as _html_parser
import re

# Matches a tag name (group 1) followed by any run of whitespace or of
# slashes that do not start a "/>" terminator. Written as a raw string so
# that "\s" reaches the regex engine without relying on Python passing an
# unrecognised string escape through unchanged.
tagfind = re.compile(r'([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')

# Alias of the underlying parser module's exception class, exposed under
# this module's namespace next to the patched HTMLParser defined below.
HTMLParseError = _html_parser.HTMLParseError

class HTMLParser(_html_parser.HTMLParser):
    """
    Patched version of stdlib's HTMLParser with patch from:
    http://bugs.python.org/issue670664

    The patch remembers which CDATA element (see ``CDATA_CONTENT_ELEMENTS``)
    is currently open in ``cdata_tag``. While one is open, any end tag other
    than the matching one is handed to ``handle_data()`` as plain text
    instead of being reported as markup.

    NOTE(review): keep the class name ``HTMLParser``. ``__starttag_text`` is
    name-mangled to ``_HTMLParser__starttag_text``; presumably this is meant
    to coincide with the attribute of the identically named base class --
    verify before renaming.
    """
    def __init__(self):
        _html_parser.HTMLParser.__init__(self)
        # Lower-cased name of the CDATA element being parsed, or None when
        # the parser is not in CDATA mode.
        self.cdata_tag = None

    def set_cdata_mode(self, tag):
        """
        Enter CDATA mode for the element named ``tag``.

        Switches ``self.interesting`` to a CDATA-specific pattern and records
        the lower-cased tag name in ``self.cdata_tag``.
        """
        lowered = tag.lower()
        try:
            self.interesting = _html_parser.interesting_cdata
        except AttributeError:
            # The module attribute is missing on some Python builds; fall
            # back to a pattern that matches only this element's closing
            # tag. The name is escaped so that characters permitted in tag
            # names (e.g. ".") are matched literally rather than as regex
            # metacharacters.
            self.interesting = re.compile(
                r'</\s*%s\s*>' % re.escape(lowered), re.I)
        self.cdata_tag = lowered

    def clear_cdata_mode(self):
        """Leave CDATA mode and restore the normal ``interesting`` pattern."""
        self.interesting = _html_parser.interesting_normal
        self.cdata_tag = None

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """
        Parse the start tag that begins at index ``i`` of ``self.rawdata``.

        Dispatches to ``handle_startendtag()`` for ``<tag ... />`` and to
        ``handle_starttag()`` otherwise; in the latter case CDATA mode is
        entered when the tag is listed in ``CDATA_CONTENT_ELEMENTS``.

        Returns the index just past the tag, or the negative value returned
        by ``check_for_whole_start_tag()`` when the tag is not terminated.
        Calls ``self.error()`` when unexpected characters precede the
        closing ``>``.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i + 1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()

        while k < endpos:
            m = _html_parser.attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without "=value".
                attrvalue = None
            elif (attrvalue[:1] == '\'' == attrvalue[-1:] or
                    attrvalue[:1] == '"' == attrvalue[-1:]):
                # Strip one pair of matching surrounding quotes.
                attrvalue = attrvalue[1:-1]
            # None and the empty string are passed through without
            # unescaping.
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Only the first 20 offending characters are reported.
            self.error("junk characters in start tag: %r"
                       % (rawdata[k:endpos][:20],))
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # Patched: pass the tag so the matching end tag is known.
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """
        Parse the end tag that begins at index ``i`` of ``self.rawdata``.

        Outside CDATA mode the tag is reported through ``handle_endtag()``.
        In CDATA mode only the end tag matching ``self.cdata_tag`` is
        reported (and ends CDATA mode); any other or malformed end tag is
        forwarded verbatim to ``handle_data()``.

        Returns the index just past the closing ``>``, or -1 when no ``>``
        has been seen yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i + 2] == "</", "unexpected call to parse_endtag"
        match = _html_parser.endendtag.search(rawdata, i + 1)  # >
        if not match:
            return -1
        j = match.end()
        match = _html_parser.endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_tag is not None:
                # Patched: malformed end tag inside CDATA content is text.
                self.handle_data(rawdata[i:j])
                return j
            # NOTE(review): the code below relies on error() raising;
            # otherwise ``match`` is None when it is dereferenced.
            self.error("bad end tag: %r" % (rawdata[i:j],))
        # Patched: inside CDATA content, only the matching end tag counts.
        tag = match.group(1).strip().lower()
        if self.cdata_tag is not None and tag != self.cdata_tag:
            self.handle_data(rawdata[i:j])
            return j
        self.handle_endtag(tag)
        self.clear_cdata_mode()
        return j
[py3] Switched to Python 3-compatible imports. xrange/range will be dealt with in a separate commit due to the huge number of changes. 2012-07-20 22:16:57 +08:00			`from django.utils.six.moves import html_parser as _html_parser`
Fixed #17641 -- Work around an issue in Python distributions that remove the module attribute ('2.7.2+'). Many thanks to Ramiro Morales for finding it. git-svn-id: http://code.djangoproject.com/svn/django/trunk@17456 bcc190cf-cafb-0310-a4f2-bffc1f526a37 2012-02-05 18:29:08 +08:00			`import re`
Fixed #16921 -- Added assertHTMLEqual and assertHTMLNotEqual assertions, and converted Django tests to use them where appropriate. Thanks Greg Müllegger. git-svn-id: http://code.djangoproject.com/svn/django/trunk@17414 bcc190cf-cafb-0310-a4f2-bffc1f526a37 2012-02-01 04:36:11 +08:00
Fixed our HTMLParser patches for python 2.7.4 2012-06-17 02:49:50 +08:00			`tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')`
Fixed #16921 -- Added assertHTMLEqual and assertHTMLNotEqual assertions, and converted Django tests to use them where appropriate. Thanks Greg Müllegger. git-svn-id: http://code.djangoproject.com/svn/django/trunk@17414 bcc190cf-cafb-0310-a4f2-bffc1f526a37 2012-02-01 04:36:11 +08:00
[py3] Switched to Python 3-compatible imports. xrange/range will be dealt with in a separate commit due to the huge number of changes. 2012-07-20 22:16:57 +08:00			`HTMLParseError = _html_parser.HTMLParseError`

			`class HTMLParser(_html_parser.HTMLParser):`
Fixed #16921 -- Added assertHTMLEqual and assertHTMLNotEqual assertions, and converted Django tests to use them where appropriate. Thanks Greg Müllegger. git-svn-id: http://code.djangoproject.com/svn/django/trunk@17414 bcc190cf-cafb-0310-a4f2-bffc1f526a37 2012-02-01 04:36:11 +08:00			`"""`
			`Patched version of stdlib's HTMLParser with patch from:`
			`http://bugs.python.org/issue670664`
			`"""`
			`def __init__(self):`
[py3] Switched to Python 3-compatible imports. xrange/range will be dealt with in a separate commit due to the huge number of changes. 2012-07-20 22:16:57 +08:00			`_html_parser.HTMLParser.__init__(self)`
Fixed #16921 -- Added assertHTMLEqual and assertHTMLNotEqual assertions, and converted Django tests to use them where appropriate. Thanks Greg Müllegger. git-svn-id: http://code.djangoproject.com/svn/django/trunk@17414 bcc190cf-cafb-0310-a4f2-bffc1f526a37 2012-02-01 04:36:11 +08:00			`self.cdata_tag = None`

			`def set_cdata_mode(self, tag):`
Fixed #17641 -- Work around an issue in Python distributions that remove the module attribute ('2.7.2+'). Many thanks to Ramiro Morales for finding it. git-svn-id: http://code.djangoproject.com/svn/django/trunk@17456 bcc190cf-cafb-0310-a4f2-bffc1f526a37 2012-02-05 18:29:08 +08:00			`try:`
[py3] Switched to Python 3-compatible imports. xrange/range will be dealt with in a separate commit due to the huge number of changes. 2012-07-20 22:16:57 +08:00			`self.interesting = _html_parser.interesting_cdata`
Fixed #17641 -- Work around an issue in Python distributions that remove the module attribute ('2.7.2+'). Many thanks to Ramiro Morales for finding it. git-svn-id: http://code.djangoproject.com/svn/django/trunk@17456 bcc190cf-cafb-0310-a4f2-bffc1f526a37 2012-02-05 18:29:08 +08:00			`except AttributeError:`
			`self.interesting = re.compile(r'</\s*%s\s*>' % tag.lower(), re.I)`
Fixed #16921 -- Added assertHTMLEqual and assertHTMLNotEqual assertions, and converted Django tests to use them where appropriate. Thanks Greg Müllegger. git-svn-id: http://code.djangoproject.com/svn/django/trunk@17414 bcc190cf-cafb-0310-a4f2-bffc1f526a37 2012-02-01 04:36:11 +08:00			`self.cdata_tag = tag.lower()`

			`def clear_cdata_mode(self):`
[py3] Switched to Python 3-compatible imports. xrange/range will be dealt with in a separate commit due to the huge number of changes. 2012-07-20 22:16:57 +08:00			`self.interesting = _html_parser.interesting_normal`
Fixed #16921 -- Added assertHTMLEqual and assertHTMLNotEqual assertions, and converted Django tests to use them where appropriate. Thanks Greg Müllegger. git-svn-id: http://code.djangoproject.com/svn/django/trunk@17414 bcc190cf-cafb-0310-a4f2-bffc1f526a37 2012-02-01 04:36:11 +08:00			`self.cdata_tag = None`

			`# Internal -- handle starttag, return end or -1 if not terminated`
			`def parse_starttag(self, i):`
			`self.__starttag_text = None`
			`endpos = self.check_for_whole_start_tag(i)`
			`if endpos < 0:`
			`return endpos`
			`rawdata = self.rawdata`
			`self.__starttag_text = rawdata[i:endpos]`

			`# Now parse the data between i+1 and j into a tag and attrs`
			`attrs = []`
Fixed our HTMLParser patches for python 2.7.4 2012-06-17 02:49:50 +08:00			`match = tagfind.match(rawdata, i + 1)`
Fixed #16921 -- Added assertHTMLEqual and assertHTMLNotEqual assertions, and converted Django tests to use them where appropriate. Thanks Greg Müllegger. git-svn-id: http://code.djangoproject.com/svn/django/trunk@17414 bcc190cf-cafb-0310-a4f2-bffc1f526a37 2012-02-01 04:36:11 +08:00			`assert match, 'unexpected call to parse_starttag()'`
			`k = match.end()`
Fixed our HTMLParser patches for python 2.7.4 2012-06-17 02:49:50 +08:00			`self.lasttag = tag = match.group(1).lower()`
Fixed #16921 -- Added assertHTMLEqual and assertHTMLNotEqual assertions, and converted Django tests to use them where appropriate. Thanks Greg Müllegger. git-svn-id: http://code.djangoproject.com/svn/django/trunk@17414 bcc190cf-cafb-0310-a4f2-bffc1f526a37 2012-02-01 04:36:11 +08:00
			`while k < endpos:`
[py3] Switched to Python 3-compatible imports. xrange/range will be dealt with in a separate commit due to the huge number of changes. 2012-07-20 22:16:57 +08:00			`m = _html_parser.attrfind.match(rawdata, k)`
Fixed #16921 -- Added assertHTMLEqual and assertHTMLNotEqual assertions, and converted Django tests to use them where appropriate. Thanks Greg Müllegger. git-svn-id: http://code.djangoproject.com/svn/django/trunk@17414 bcc190cf-cafb-0310-a4f2-bffc1f526a37 2012-02-01 04:36:11 +08:00			`if not m:`
			`break`
			`attrname, rest, attrvalue = m.group(1, 2, 3)`
			`if not rest:`
			`attrvalue = None`
			`elif attrvalue[:1] == '\'' == attrvalue[-1:] or \`
			`attrvalue[:1] == '"' == attrvalue[-1:]:`
			`attrvalue = attrvalue[1:-1]`
Fixed our HTMLParser patches for python 2.7.4 2012-06-17 02:49:50 +08:00			`if attrvalue:`
Fixed #16921 -- Added assertHTMLEqual and assertHTMLNotEqual assertions, and converted Django tests to use them where appropriate. Thanks Greg Müllegger. git-svn-id: http://code.djangoproject.com/svn/django/trunk@17414 bcc190cf-cafb-0310-a4f2-bffc1f526a37 2012-02-01 04:36:11 +08:00			`attrvalue = self.unescape(attrvalue)`
			`attrs.append((attrname.lower(), attrvalue))`
			`k = m.end()`

			`end = rawdata[k:endpos].strip()`
			`if end not in (">", "/>"):`
			`lineno, offset = self.getpos()`
			`if "\n" in self.__starttag_text:`
			`lineno = lineno + self.__starttag_text.count("\n")`
			`offset = len(self.__starttag_text) \`
			`- self.__starttag_text.rfind("\n")`
			`else:`
			`offset = offset + len(self.__starttag_text)`
			`self.error("junk characters in start tag: %r"`
			`% (rawdata[k:endpos][:20],))`
			`if end.endswith('/>'):`
			`# XHTML-style empty tag: <span attr="value" />`
			`self.handle_startendtag(tag, attrs)`
			`else:`
			`self.handle_starttag(tag, attrs)`
			`if tag in self.CDATA_CONTENT_ELEMENTS:`
			`self.set_cdata_mode(tag) # <--------------------------- Changed`
			`return endpos`

			`# Internal -- parse endtag, return end or -1 if incomplete`
			`def parse_endtag(self, i):`
			`rawdata = self.rawdata`
			`assert rawdata[i:i + 2] == "</", "unexpected call to parse_endtag"`
[py3] Switched to Python 3-compatible imports. xrange/range will be dealt with in a separate commit due to the huge number of changes. 2012-07-20 22:16:57 +08:00			`match = _html_parser.endendtag.search(rawdata, i + 1) # >`
Fixed #16921 -- Added assertHTMLEqual and assertHTMLNotEqual assertions, and converted Django tests to use them where appropriate. Thanks Greg Müllegger. git-svn-id: http://code.djangoproject.com/svn/django/trunk@17414 bcc190cf-cafb-0310-a4f2-bffc1f526a37 2012-02-01 04:36:11 +08:00			`if not match:`
			`return -1`
			`j = match.end()`
[py3] Switched to Python 3-compatible imports. xrange/range will be dealt with in a separate commit due to the huge number of changes. 2012-07-20 22:16:57 +08:00			`match = _html_parser.endtagfind.match(rawdata, i) # </ + tag + >`
Fixed #16921 -- Added assertHTMLEqual and assertHTMLNotEqual assertions, and converted Django tests to use them where appropriate. Thanks Greg Müllegger. git-svn-id: http://code.djangoproject.com/svn/django/trunk@17414 bcc190cf-cafb-0310-a4f2-bffc1f526a37 2012-02-01 04:36:11 +08:00			`if not match:`
			`if self.cdata_tag is not None: # * add *`
			`self.handle_data(rawdata[i:j]) # * add *`
			`return j # * add *`
			`self.error("bad end tag: %r" % (rawdata[i:j],))`
			`# --- changed start ---------------------------------------------------`
			`tag = match.group(1).strip()`
			`if self.cdata_tag is not None:`
			`if tag.lower() != self.cdata_tag:`
			`self.handle_data(rawdata[i:j])`
			`return j`
			`# --- changed end -----------------------------------------------------`
			`self.handle_endtag(tag.lower())`
			`self.clear_cdata_mode()`
			`return j`