django1/django/utils/translation/template.py

import warnings
from io import StringIO

from django.template.base import Lexer, TokenType
from django.utils.regex_helper import _lazy_re_compile

from . import TranslatorCommentWarning, trim_whitespace

# Prefix identifying a template comment as addressed to translators:
# templatize() only forwards comments whose left-stripped text starts with it.
TRANSLATOR_COMMENT_MARK = 'Translators'

# Matches a single non-whitespace character; blankout() substitutes every
# match with a placeholder character.
dot_re = _lazy_re_compile(r'\S')


def blankout(src, char):
    """
    Return ``src`` with each non-whitespace character replaced by ``char``.

    Whitespace, including newlines, is left as is, so the line layout of the
    text is unchanged. Used in the templatize function.
    """
    masked = dot_re.sub(char, src)
    return masked


# Matches a "context" clause: leading whitespace, any text, the word
# ``context`` and a quoted string. Group 1 is the quoted string, quotes
# included.
context_re = _lazy_re_compile(r"""^\s+.*context\s+((?:"[^"]*?")|(?:'[^']*?'))\s*""")
# Matches the contents of an inline ``trans``/``translate`` tag whose message
# is a quoted literal. Group 1 is the quoted message, group 2 the whole
# optional context clause and group 3 the quoted context string.
inline_re = _lazy_re_compile(
    # Match the trans/translate 'some text' part (group 1, quotes included).
    r"""^\s*trans(?:late)?\s+((?:"[^"]*?")|(?:'[^']*?'))"""
    # Match and ignore optional filters, each with an optional argument.
    r"""(?:\s*\|\s*[^\s:]+(?::(?:[^\s'":]+|(?:"[^"]*?")|(?:'[^']*?')))?)*"""
    # Match the optional context part (groups 2 and 3).
    r"""(\s+.*context\s+((?:"[^"]*?")|(?:'[^']*?')))?\s*"""
)
# Matches the contents of an opening ``blocktrans``/``blocktranslate`` tag.
# Group 1 is the whole optional context clause and group 2 the quoted context
# string.
block_re = _lazy_re_compile(r"""^\s*blocktrans(?:late)?(\s+.*context\s+((?:"[^"]*?")|(?:'[^']*?')))?(?:\s+|$)""")
# Matches the contents of an ``endblocktrans``/``endblocktranslate`` tag.
endblock_re = _lazy_re_compile(r"""^\s*endblocktrans(?:late)?$""")
# Matches the contents of a ``plural`` tag.
plural_re = _lazy_re_compile(r"""^\s*plural$""")
# Matches a ``_("...")`` or ``_('...')`` constant. Group 1 is the quoted
# string, quotes included.
constant_re = _lazy_re_compile(r"""_\(((?:".*?")|(?:'.*?'))\)""")


def templatize(src, origin=None):
    """
    Turn a Django template into something that is understood by xgettext. It
    does so by translating the Django translation tags into standard gettext
    function invocations.

    ``src`` is the template source text. ``origin``, when truthy, is
    interpolated as the file name into warning and error messages.

    Return the converted text as a string. Translatable strings become
    ``gettext()``, ``pgettext()``, ``ngettext()``, ``npgettext()`` or ``_()``
    calls, translator comments become ``#`` comments, and the remaining
    non-whitespace template content is blanked out with placeholder letters.

    Raise SyntaxError if a translation block contains a block tag other than
    ``plural`` or its closing tag. Emit a TranslatorCommentWarning for each
    translator comment that isn't the last item on its line.
    """
    out = StringIO('')
    message_context = None
    intrans = False
    inplural = False
    trimmed = False
    singular = []
    plural = []
    incomment = False
    comment = []
    lineno_comment_map = {}
    comment_lineno_cache = None
    # Adding the u prefix allows gettext to recognize the string (#26093).
    raw_prefix = 'u'

    def join_tokens(tokens, trim=False):
        """Concatenate message parts, optionally applying trim_whitespace()."""
        message = ''.join(tokens)
        if trim:
            message = trim_whitespace(message)
        return message

    def strip_quotes(literal):
        """Strip the quote character that ``literal`` starts with, if any."""
        quote = literal[0]
        if quote == '"' or quote == "'":
            return literal.strip(quote)
        return literal

    def file_prefix():
        """Return the 'file <origin>, ' message prefix, or '' without origin."""
        return 'file %s, ' % origin if origin else ''

    for t in Lexer(src).tokenize():
        if incomment:
            # Inside {% comment %} ... {% endcomment %}: buffer everything
            # until the closing tag.
            if t.token_type == TokenType.BLOCK and t.contents == 'endcomment':
                comment_lines = ''.join(comment).splitlines(True)
                # Index of the last line starting with the translators mark;
                # that line and the ones after it are emitted verbatim.
                translators_comment_start = None
                for lineno, line in enumerate(comment_lines):
                    if line.lstrip().startswith(TRANSLATOR_COMMENT_MARK):
                        translators_comment_start = lineno
                for lineno, line in enumerate(comment_lines):
                    if translators_comment_start is not None and lineno >= translators_comment_start:
                        out.write(' # %s' % line)
                    else:
                        # Emit an empty comment line for every other line so
                        # the output has as many lines as the comment.
                        out.write(' #\n')
                incomment = False
                comment = []
            else:
                comment.append(t.contents)
        elif intrans:
            # Inside a translation block: collect the message parts until the
            # closing tag.
            if t.token_type == TokenType.BLOCK:
                if endblock_re.match(t.contents):
                    if inplural:
                        if message_context:
                            out.write(' npgettext({p}{!r}, {p}{!r}, {p}{!r},count) '.format(
                                message_context,
                                join_tokens(singular, trimmed),
                                join_tokens(plural, trimmed),
                                p=raw_prefix,
                            ))
                        else:
                            out.write(' ngettext({p}{!r}, {p}{!r}, count) '.format(
                                join_tokens(singular, trimmed),
                                join_tokens(plural, trimmed),
                                p=raw_prefix,
                            ))
                    elif message_context:
                        out.write(' pgettext({p}{!r}, {p}{!r}) '.format(
                            message_context,
                            join_tokens(singular, trimmed),
                            p=raw_prefix,
                        ))
                    else:
                        out.write(' gettext({p}{!r}) '.format(
                            join_tokens(singular, trimmed),
                            p=raw_prefix,
                        ))
                    # Write the blanked-out message parts after the call.
                    for part in singular:
                        out.write(blankout(part, 'S'))
                    # ``plural`` is only filled once a plural tag was seen, so
                    # this loop is a no-op for non-plural blocks.
                    for part in plural:
                        out.write(blankout(part, 'P'))
                    message_context = None
                    intrans = False
                    inplural = False
                    singular = []
                    plural = []
                elif plural_re.match(t.contents):
                    inplural = True
                else:
                    raise SyntaxError(
                        "Translation blocks must not include other block tags: "
                        "%s (%sline %d)" % (t.contents, file_prefix(), t.lineno)
                    )
            elif t.token_type == TokenType.VAR:
                # Variables become named %-format placeholders.
                placeholder = '%%(%s)s' % t.contents
                if inplural:
                    plural.append(placeholder)
                else:
                    singular.append(placeholder)
            elif t.token_type == TokenType.TEXT:
                # Escape literal percent signs for the %-formatted message.
                contents = t.contents.replace('%', '%%')
                if inplural:
                    plural.append(contents)
                else:
                    singular.append(contents)
        else:
            # Handle comment tokens (`{# ... #}`) plus other constructs on
            # the same line:
            if comment_lineno_cache is not None:
                cur_lineno = t.lineno + t.contents.count('\n')
                if comment_lineno_cache == cur_lineno:
                    if t.token_type != TokenType.COMMENT:
                        # Something other than a comment follows on the same
                        # line: warn about and drop the pending comments.
                        for c in lineno_comment_map[comment_lineno_cache]:
                            warn_msg = (
                                "The translator-targeted comment '%s' "
                                "(%sline %d) was ignored, because it wasn't "
                                "the last item on the line."
                            ) % (c, file_prefix(), comment_lineno_cache)
                            warnings.warn(warn_msg, TranslatorCommentWarning)
                        lineno_comment_map[comment_lineno_cache] = []
                else:
                    # The comments were the last items on their line: emit
                    # them as a single comment.
                    out.write('# %s' % ' | '.join(lineno_comment_map[comment_lineno_cache]))
                comment_lineno_cache = None

            if t.token_type == TokenType.BLOCK:
                imatch = inline_re.match(t.contents)
                bmatch = block_re.match(t.contents)
                cmatches = constant_re.findall(t.contents)
                if imatch:
                    # Inline tag with a literal message.
                    g = strip_quotes(imatch[1]).replace('%', '%%')
                    if imatch[2]:
                        # A context is provided
                        context_match = context_re.match(imatch[2])
                        out.write(' pgettext({p}{!r}, {p}{!r}) '.format(
                            strip_quotes(context_match[1]), g, p=raw_prefix
                        ))
                        message_context = None
                    else:
                        out.write(' gettext({p}{!r}) '.format(g, p=raw_prefix))
                elif bmatch:
                    # Opening tag of a translation block: emit the constants
                    # found in the tag, then start collecting the message.
                    for fmatch in cmatches:
                        out.write(' _(%s) ' % fmatch)
                    if bmatch[1]:
                        # A context is provided
                        context_match = context_re.match(bmatch[1])
                        message_context = strip_quotes(context_match[1])
                    intrans = True
                    inplural = False
                    trimmed = 'trimmed' in t.split_contents()
                    singular = []
                    plural = []
                elif cmatches:
                    for cmatch in cmatches:
                        out.write(' _(%s) ' % cmatch)
                elif t.contents == 'comment':
                    incomment = True
                else:
                    out.write(blankout(t.contents, 'B'))
            elif t.token_type == TokenType.VAR:
                parts = t.contents.split('|')
                cmatch = constant_re.match(parts[0])
                if cmatch:
                    out.write(' _(%s) ' % cmatch[1])
                for filter_expr in parts[1:]:
                    if ':_(' in filter_expr:
                        # Filter argument is a translatable constant: emit
                        # the argument part only.
                        out.write(' %s ' % filter_expr.split(':', 1)[1])
                    else:
                        out.write(blankout(filter_expr, 'F'))
            elif t.token_type == TokenType.COMMENT:
                if t.contents.lstrip().startswith(TRANSLATOR_COMMENT_MARK):
                    # Remember the comment; whether it is emitted is decided
                    # when the next token is processed.
                    lineno_comment_map.setdefault(t.lineno, []).append(t.contents)
                    comment_lineno_cache = t.lineno
            else:
                out.write(blankout(t.contents, 'X'))
    return out.getvalue()
Fixed #27034 -- Made makemessages independent of USE_I18N Thanks Tim Graham for the review. 2016-08-08 21:46:52 +08:00			`import warnings`
Refs #23919 -- Removed most of remaining six usage Thanks Tim Graham for the review. 2017-01-07 19:11:46 +08:00			`from io import StringIO`
Fixed #27034 -- Made makemessages independent of USE_I18N Thanks Tim Graham for the review. 2016-08-08 21:46:52 +08:00
Refs #32986 -- Moved TRANSLATOR_COMMENT_MARK to django.utils.translation.template. 2021-08-05 11:20:04 +08:00			`from django.template.base import Lexer, TokenType`
Fixed #30899 -- Lazily compiled import time regular expressions. 2019-10-26 22:42:32 +08:00			`from django.utils.regex_helper import _lazy_re_compile`
Fixed #27034 -- Made makemessages independent of USE_I18N Thanks Tim Graham for the review. 2016-08-08 21:46:52 +08:00
			`from . import TranslatorCommentWarning, trim_whitespace`

Refs #32986 -- Moved TRANSLATOR_COMMENT_MARK to django.utils.translation.template. 2021-08-05 11:20:04 +08:00			`TRANSLATOR_COMMENT_MARK = 'Translators'`

Fixed #30899 -- Lazily compiled import time regular expressions. 2019-10-26 22:42:32 +08:00			`dot_re = _lazy_re_compile(r'\S')`
Fixed #27034 -- Made makemessages independent of USE_I18N Thanks Tim Graham for the review. 2016-08-08 21:46:52 +08:00

			`def blankout(src, char):`
			`"""`
			`Change every non-whitespace character to the given char.`
			`Used in the templatize function.`
			`"""`
			`return dot_re.sub(char, src)`


Fixed #30899 -- Lazily compiled import time regular expressions. 2019-10-26 22:42:32 +08:00			`context_re = _lazy_re_compile(r"""^\s+.*context\s+((?:"[^"]*?")|(?:'[^']*?'))\s*""")`
			`inline_re = _lazy_re_compile(`
Fixed #30585 -- Added {% translate %} and {% blocktranslate %} template tags. 2019-06-22 00:41:01 +08:00			`# Match the trans/translate 'some text' part.`
			`r"""^\s*trans(?:late)?\s+((?:"[^"]*?")|(?:'[^']*?'))"""`
Fixed #27034 -- Made makemessages independent of USE_I18N Thanks Tim Graham for the review. 2016-08-08 21:46:52 +08:00			`# Match and ignore optional filters`
			`r"""(?:\s*\|\s*[^\s:]+(?::(?:[^\s'":]+|(?:"[^"]*?")|(?:'[^']*?')))?)*"""`
			`# Match the optional context part`
			`r"""(\s+.*context\s+((?:"[^"]*?")|(?:'[^']*?')))?\s*"""`
			`)`
Fixed #30585 -- Added {% translate %} and {% blocktranslate %} template tags. 2019-06-22 00:41:01 +08:00			`block_re = _lazy_re_compile(r"""^\s*blocktrans(?:late)?(\s+.*context\s+((?:"[^"]*?")|(?:'[^']*?')))?(?:\s+|$)""")`
			`endblock_re = _lazy_re_compile(r"""^\s*endblocktrans(?:late)?$""")`
Fixed #30899 -- Lazily compiled import time regular expressions. 2019-10-26 22:42:32 +08:00			`plural_re = _lazy_re_compile(r"""^\s*plural$""")`
			`constant_re = _lazy_re_compile(r"""_\(((?:".*?")|(?:'.*?'))\)""")`
Fixed #27034 -- Made makemessages independent of USE_I18N Thanks Tim Graham for the review. 2016-08-08 21:46:52 +08:00

Refs #27795 -- Removed force_text from templatize function 2017-02-06 16:15:25 +08:00			`def templatize(src, origin=None):`
Fixed #27034 -- Made makemessages independent of USE_I18N Thanks Tim Graham for the review. 2016-08-08 21:46:52 +08:00			`"""`
			`Turn a Django template into something that is understood by xgettext. It`
			`does so by translating the Django translation tags into standard gettext`
			`function invocations.`
			`"""`
			`out = StringIO('')`
			`message_context = None`
			`intrans = False`
			`inplural = False`
			`trimmed = False`
			`singular = []`
			`plural = []`
			`incomment = False`
			`comment = []`
			`lineno_comment_map = {}`
			`comment_lineno_cache = None`
Refs #23919, #27778 -- Removed obsolete mentions of unicode. 2017-01-21 05:04:05 +08:00			`# Adding the u prefix allows gettext to recognize the string (#26093).`
Refs #23919 -- Removed six.PY2/PY3 usage Thanks Tim Graham for the review. 2016-12-01 18:38:01 +08:00			`raw_prefix = 'u'`
Fixed #27034 -- Made makemessages independent of USE_I18N Thanks Tim Graham for the review. 2016-08-08 21:46:52 +08:00
			`def join_tokens(tokens, trim=False):`
			`message = ''.join(tokens)`
			`if trim:`
			`message = trim_whitespace(message)`
			`return message`

			`for t in Lexer(src).tokenize():`
			`if incomment:`
Replaced TOKEN_* constants by TokenType enums. Thanks Tim Graham for the review. 2018-05-10 23:51:51 +08:00			`if t.token_type == TokenType.BLOCK and t.contents == 'endcomment':`
Fixed #27034 -- Made makemessages independent of USE_I18N Thanks Tim Graham for the review. 2016-08-08 21:46:52 +08:00			`content = ''.join(comment)`
			`translators_comment_start = None`
			`for lineno, line in enumerate(content.splitlines(True)):`
			`if line.lstrip().startswith(TRANSLATOR_COMMENT_MARK):`
			`translators_comment_start = lineno`
			`for lineno, line in enumerate(content.splitlines(True)):`
			`if translators_comment_start is not None and lineno >= translators_comment_start:`
			`out.write(' # %s' % line)`
			`else:`
			`out.write(' #\n')`
			`incomment = False`
			`comment = []`
			`else:`
			`comment.append(t.contents)`
			`elif intrans:`
Replaced TOKEN_* constants by TokenType enums. Thanks Tim Graham for the review. 2018-05-10 23:51:51 +08:00			`if t.token_type == TokenType.BLOCK:`
Fixed #27034 -- Made makemessages independent of USE_I18N Thanks Tim Graham for the review. 2016-08-08 21:46:52 +08:00			`endbmatch = endblock_re.match(t.contents)`
			`pluralmatch = plural_re.match(t.contents)`
			`if endbmatch:`
			`if inplural:`
			`if message_context:`
			`out.write(' npgettext({p}{!r}, {p}{!r}, {p}{!r},count) '.format(`
			`message_context,`
			`join_tokens(singular, trimmed),`
			`join_tokens(plural, trimmed),`
			`p=raw_prefix,`
			`))`
			`else:`
			`out.write(' ngettext({p}{!r}, {p}{!r}, count) '.format(`
			`join_tokens(singular, trimmed),`
			`join_tokens(plural, trimmed),`
			`p=raw_prefix,`
			`))`
			`for part in singular:`
			`out.write(blankout(part, 'S'))`
			`for part in plural:`
			`out.write(blankout(part, 'P'))`
			`else:`
			`if message_context:`
			`out.write(' pgettext({p}{!r}, {p}{!r}) '.format(`
			`message_context,`
			`join_tokens(singular, trimmed),`
			`p=raw_prefix,`
			`))`
			`else:`
			`out.write(' gettext({p}{!r}) '.format(`
			`join_tokens(singular, trimmed),`
			`p=raw_prefix,`
			`))`
			`for part in singular:`
			`out.write(blankout(part, 'S'))`
			`message_context = None`
			`intrans = False`
			`inplural = False`
			`singular = []`
			`plural = []`
			`elif pluralmatch:`
			`inplural = True`
			`else:`
			`filemsg = ''`
			`if origin:`
			`filemsg = 'file %s, ' % origin`
			`raise SyntaxError(`
			`"Translation blocks must not include other block tags: "`
			`"%s (%sline %d)" % (t.contents, filemsg, t.lineno)`
			`)`
Replaced TOKEN_* constants by TokenType enums. Thanks Tim Graham for the review. 2018-05-10 23:51:51 +08:00			`elif t.token_type == TokenType.VAR:`
Fixed #27034 -- Made makemessages independent of USE_I18N Thanks Tim Graham for the review. 2016-08-08 21:46:52 +08:00			`if inplural:`
			`plural.append('%%(%s)s' % t.contents)`
			`else:`
			`singular.append('%%(%s)s' % t.contents)`
Replaced TOKEN_* constants by TokenType enums. Thanks Tim Graham for the review. 2018-05-10 23:51:51 +08:00			`elif t.token_type == TokenType.TEXT:`
Fixed #27034 -- Made makemessages independent of USE_I18N Thanks Tim Graham for the review. 2016-08-08 21:46:52 +08:00			`contents = t.contents.replace('%', '%%')`
			`if inplural:`
			`plural.append(contents)`
			`else:`
			`singular.append(contents)`
			`else:`
			# Handle comment tokens (`{# ... #}`) plus other constructs on
			`# the same line:`
			`if comment_lineno_cache is not None:`
			`cur_lineno = t.lineno + t.contents.count('\n')`
			`if comment_lineno_cache == cur_lineno:`
Replaced TOKEN_* constants by TokenType enums. Thanks Tim Graham for the review. 2018-05-10 23:51:51 +08:00			`if t.token_type != TokenType.COMMENT:`
Fixed #27034 -- Made makemessages independent of USE_I18N Thanks Tim Graham for the review. 2016-08-08 21:46:52 +08:00			`for c in lineno_comment_map[comment_lineno_cache]:`
			`filemsg = ''`
			`if origin:`
			`filemsg = 'file %s, ' % origin`
			`warn_msg = (`
			`"The translator-targeted comment '%s' "`
			`"(%sline %d) was ignored, because it wasn't "`
			`"the last item on the line."`
			`) % (c, filemsg, comment_lineno_cache)`
			`warnings.warn(warn_msg, TranslatorCommentWarning)`
			`lineno_comment_map[comment_lineno_cache] = []`
			`else:`
			`out.write('# %s' % ' | '.join(lineno_comment_map[comment_lineno_cache]))`
			`comment_lineno_cache = None`

Replaced TOKEN_* constants by TokenType enums. Thanks Tim Graham for the review. 2018-05-10 23:51:51 +08:00			`if t.token_type == TokenType.BLOCK:`
Fixed #27034 -- Made makemessages independent of USE_I18N Thanks Tim Graham for the review. 2016-08-08 21:46:52 +08:00			`imatch = inline_re.match(t.contents)`
			`bmatch = block_re.match(t.contents)`
			`cmatches = constant_re.findall(t.contents)`
			`if imatch:`
Refs #30116 -- Simplified regex match group access with Match.__getitem__(). The method has been available since Python 3.6. The shorter syntax is also marginally faster. 2020-05-11 04:03:39 +08:00			`g = imatch[1]`
Fixed #27034 -- Made makemessages independent of USE_I18N Thanks Tim Graham for the review. 2016-08-08 21:46:52 +08:00			`if g[0] == '"':`
			`g = g.strip('"')`
			`elif g[0] == "'":`
			`g = g.strip("'")`
			`g = g.replace('%', '%%')`
Refs #30116 -- Simplified regex match group access with Match.__getitem__(). The method has been available since Python 3.6. The shorter syntax is also marginally faster. 2020-05-11 04:03:39 +08:00			`if imatch[2]:`
Fixed #27034 -- Made makemessages independent of USE_I18N Thanks Tim Graham for the review. 2016-08-08 21:46:52 +08:00			`# A context is provided`
Refs #30116 -- Simplified regex match group access with Match.__getitem__(). The method has been available since Python 3.6. The shorter syntax is also marginally faster. 2020-05-11 04:03:39 +08:00			`context_match = context_re.match(imatch[2])`
			`message_context = context_match[1]`
Fixed #27034 -- Made makemessages independent of USE_I18N Thanks Tim Graham for the review. 2016-08-08 21:46:52 +08:00			`if message_context[0] == '"':`
			`message_context = message_context.strip('"')`
			`elif message_context[0] == "'":`
			`message_context = message_context.strip("'")`
			`out.write(' pgettext({p}{!r}, {p}{!r}) '.format(`
			`message_context, g, p=raw_prefix`
			`))`
			`message_context = None`
			`else:`
			`out.write(' gettext({p}{!r}) '.format(g, p=raw_prefix))`
			`elif bmatch:`
			`for fmatch in constant_re.findall(t.contents):`
			`out.write(' _(%s) ' % fmatch)`
Refs #30116 -- Simplified regex match group access with Match.__getitem__(). The method has been available since Python 3.6. The shorter syntax is also marginally faster. 2020-05-11 04:03:39 +08:00			`if bmatch[1]:`
Fixed #27034 -- Made makemessages independent of USE_I18N Thanks Tim Graham for the review. 2016-08-08 21:46:52 +08:00			`# A context is provided`
Refs #30116 -- Simplified regex match group access with Match.__getitem__(). The method has been available since Python 3.6. The shorter syntax is also marginally faster. 2020-05-11 04:03:39 +08:00			`context_match = context_re.match(bmatch[1])`
			`message_context = context_match[1]`
Fixed #27034 -- Made makemessages independent of USE_I18N Thanks Tim Graham for the review. 2016-08-08 21:46:52 +08:00			`if message_context[0] == '"':`
			`message_context = message_context.strip('"')`
			`elif message_context[0] == "'":`
			`message_context = message_context.strip("'")`
			`intrans = True`
			`inplural = False`
			`trimmed = 'trimmed' in t.split_contents()`
			`singular = []`
			`plural = []`
			`elif cmatches:`
			`for cmatch in cmatches:`
			`out.write(' _(%s) ' % cmatch)`
			`elif t.contents == 'comment':`
			`incomment = True`
			`else:`
			`out.write(blankout(t.contents, 'B'))`
Replaced TOKEN_* constants by TokenType enums. Thanks Tim Graham for the review. 2018-05-10 23:51:51 +08:00			`elif t.token_type == TokenType.VAR:`
Fixed #27034 -- Made makemessages independent of USE_I18N Thanks Tim Graham for the review. 2016-08-08 21:46:52 +08:00			`parts = t.contents.split('|')`
			`cmatch = constant_re.match(parts[0])`
			`if cmatch:`
Refs #30116 -- Simplified regex match group access with Match.__getitem__(). The method has been available since Python 3.6. The shorter syntax is also marginally faster. 2020-05-11 04:03:39 +08:00			`out.write(' _(%s) ' % cmatch[1])`
Fixed #27034 -- Made makemessages independent of USE_I18N Thanks Tim Graham for the review. 2016-08-08 21:46:52 +08:00			`for p in parts[1:]:`
			`if p.find(':_(') >= 0:`
			`out.write(' %s ' % p.split(':', 1)[1])`
			`else:`
			`out.write(blankout(p, 'F'))`
Replaced TOKEN_* constants by TokenType enums. Thanks Tim Graham for the review. 2018-05-10 23:51:51 +08:00			`elif t.token_type == TokenType.COMMENT:`
Fixed #27034 -- Made makemessages independent of USE_I18N Thanks Tim Graham for the review. 2016-08-08 21:46:52 +08:00			`if t.contents.lstrip().startswith(TRANSLATOR_COMMENT_MARK):`
			`lineno_comment_map.setdefault(t.lineno, []).append(t.contents)`
			`comment_lineno_cache = t.lineno`
			`else:`
			`out.write(blankout(t.contents, 'X'))`
			`return out.getvalue()`