Fixed #19508 -- Implemented uri_to_iri as per RFC.

Thanks Loic Bistuer for helping in shaping the patch and Claude Paroz
for the review.
This commit is contained in:
Anubhav Joshi 2014-07-22 17:55:22 +05:30 committed by Loic Bistuer
parent 3af5af1a61
commit 10b17a22be
9 changed files with 189 additions and 42 deletions

View File

@ -206,7 +206,6 @@ def get_path_info(environ):
""" """
path_info = get_bytes_from_wsgi(environ, 'PATH_INFO', '/') path_info = get_bytes_from_wsgi(environ, 'PATH_INFO', '/')
# It'd be better to implement URI-to-IRI decoding, see #19508.
return path_info.decode(UTF_8) return path_info.decode(UTF_8)
@ -236,7 +235,6 @@ def get_script_name(environ):
else: else:
script_name = get_bytes_from_wsgi(environ, 'SCRIPT_NAME', '') script_name = get_bytes_from_wsgi(environ, 'SCRIPT_NAME', '')
# It'd be better to implement URI-to-IRI decoding, see #19508.
return script_name.decode(UTF_8) return script_name.decode(UTF_8)
@ -251,16 +249,15 @@ def get_bytes_from_wsgi(environ, key, default):
# Under Python 3, non-ASCII values in the WSGI environ are arbitrarily # Under Python 3, non-ASCII values in the WSGI environ are arbitrarily
# decoded with ISO-8859-1. This is wrong for Django websites where UTF-8 # decoded with ISO-8859-1. This is wrong for Django websites where UTF-8
# is the default. Re-encode to recover the original bytestring. # is the default. Re-encode to recover the original bytestring.
return value if six.PY2 else value.encode(ISO_8859_1) return value.encode(ISO_8859_1) if six.PY3 else value
def get_str_from_wsgi(environ, key, default): def get_str_from_wsgi(environ, key, default):
""" """
Get a value from the WSGI environ dictionary as bytes. Get a value from the WSGI environ dictionary as str.
key and default should be str objects. Under Python 2 they may also be key and default should be str objects. Under Python 2 they may also be
unicode objects provided they only contain ASCII characters. unicode objects provided they only contain ASCII characters.
""" """
value = environ.get(str(key), str(default)) value = get_bytes_from_wsgi(environ, key, default)
# Same comment as above return value.decode(UTF_8, errors='replace') if six.PY3 else value
return value if six.PY2 else value.encode(ISO_8859_1).decode(UTF_8, errors='replace')

View File

@ -15,9 +15,11 @@ from wsgiref import simple_server
from wsgiref.util import FileWrapper # NOQA: for backwards compatibility from wsgiref.util import FileWrapper # NOQA: for backwards compatibility
from django.core.exceptions import ImproperlyConfigured from django.core.exceptions import ImproperlyConfigured
from django.core.handlers.wsgi import ISO_8859_1, UTF_8
from django.core.management.color import color_style from django.core.management.color import color_style
from django.core.wsgi import get_wsgi_application from django.core.wsgi import get_wsgi_application
from django.utils import six from django.utils import six
from django.utils.encoding import uri_to_iri
from django.utils.module_loading import import_string from django.utils.module_loading import import_string
from django.utils.six.moves import socketserver from django.utils.six.moves import socketserver
@ -117,6 +119,21 @@ class WSGIRequestHandler(simple_server.WSGIRequestHandler, object):
sys.stderr.write(msg) sys.stderr.write(msg)
def get_environ(self):
    """
    Return the WSGI environ built by the parent handler, with ``PATH_INFO``
    replaced by the request path converted from a URI to an IRI.
    """
    environ = super(WSGIRequestHandler, self).get_environ()
    # Only the path component is converted; anything from the first '?' on
    # (the query string) is dropped. partition() returns the whole string
    # when there is no '?'.
    path_only = self.path.partition('?')[0]
    path_bytes = uri_to_iri(path_only).encode(UTF_8)
    # Under Python 3, non-ASCII values in the WSGI environ are arbitrarily
    # decoded with ISO-8859-1. We replicate this behavior here.
    # Refs comment in `get_bytes_from_wsgi()`.
    if six.PY3:
        environ['PATH_INFO'] = path_bytes.decode(ISO_8859_1)
    else:
        environ['PATH_INFO'] = path_bytes
    return environ
def run(addr, port, wsgi_handler, ipv6=False, threading=False): def run(addr, port, wsgi_handler, ipv6=False, threading=False):
server_address = (addr, port) server_address = (addr, port)

View File

@ -12,7 +12,7 @@ from django.apps import apps
from django.conf import settings from django.conf import settings
from django.core import urlresolvers from django.core import urlresolvers
from django.core.handlers.base import BaseHandler from django.core.handlers.base import BaseHandler
from django.core.handlers.wsgi import WSGIRequest from django.core.handlers.wsgi import WSGIRequest, ISO_8859_1, UTF_8
from django.core.signals import (request_started, request_finished, from django.core.signals import (request_started, request_finished,
got_request_exception) got_request_exception)
from django.db import close_old_connections from django.db import close_old_connections
@ -20,11 +20,11 @@ from django.http import SimpleCookie, HttpRequest, QueryDict
from django.template import TemplateDoesNotExist from django.template import TemplateDoesNotExist
from django.test import signals from django.test import signals
from django.utils.functional import curry, SimpleLazyObject from django.utils.functional import curry, SimpleLazyObject
from django.utils.encoding import force_bytes, force_str from django.utils.encoding import force_bytes, force_str, uri_to_iri
from django.utils.http import urlencode from django.utils.http import urlencode
from django.utils.itercompat import is_iterable from django.utils.itercompat import is_iterable
from django.utils import six from django.utils import six
from django.utils.six.moves.urllib.parse import unquote, urlparse, urlsplit from django.utils.six.moves.urllib.parse import urlparse, urlsplit
from django.test.utils import ContextList from django.test.utils import ContextList
__all__ = ('Client', 'RequestFactory', 'encode_file', 'encode_multipart') __all__ = ('Client', 'RequestFactory', 'encode_file', 'encode_multipart')
@ -270,11 +270,11 @@ class RequestFactory(object):
# If there are parameters, add them # If there are parameters, add them
if parsed[3]: if parsed[3]:
path += str(";") + force_str(parsed[3]) path += str(";") + force_str(parsed[3])
path = unquote(path) path = uri_to_iri(path).encode(UTF_8)
# WSGI requires latin-1 encoded strings. See get_path_info(). # Under Python 3, non-ASCII values in the WSGI environ are arbitrarily
if six.PY3: # decoded with ISO-8859-1. We replicate this behavior here.
path = path.encode('utf-8').decode('iso-8859-1') # Refs comment in `get_bytes_from_wsgi()`.
return path return path.decode(ISO_8859_1) if six.PY3 else path
def get(self, path, data=None, secure=False, **extra): def get(self, path, data=None, secure=False, **extra):
"Construct a GET request." "Construct a GET request."

View File

@ -1,3 +1,4 @@
# -*- encoding: utf-8 -*-
from __future__ import unicode_literals from __future__ import unicode_literals
import codecs import codecs
@ -7,7 +8,9 @@ import locale
from django.utils.functional import Promise from django.utils.functional import Promise
from django.utils import six from django.utils import six
from django.utils.six.moves.urllib.parse import quote from django.utils.six.moves.urllib.parse import quote, unquote
if six.PY3:
from urllib.parse import unquote_to_bytes
class DjangoUnicodeDecodeError(UnicodeDecodeError): class DjangoUnicodeDecodeError(UnicodeDecodeError):
@ -185,7 +188,9 @@ def iri_to_uri(iri):
assuming input is either UTF-8 or unicode already, we can simplify things a assuming input is either UTF-8 or unicode already, we can simplify things a
little from the full method. little from the full method.
Returns an ASCII string containing the encoded result. Takes an IRI in UTF-8 bytes (e.g. '/I \xe2\x99\xa5 Django/') or unicode
(e.g. '/I ♥ Django/') and returns ASCII bytes containing the encoded result
(e.g. '/I%20%E2%99%A5%20Django/').
""" """
# The list of safe characters here is constructed from the "reserved" and # The list of safe characters here is constructed from the "reserved" and
# "unreserved" characters specified in sections 2.2 and 2.3 of RFC 3986: # "unreserved" characters specified in sections 2.2 and 2.3 of RFC 3986:
@ -204,6 +209,38 @@ def iri_to_uri(iri):
return quote(force_bytes(iri), safe=b"/#%[]=:;$&()+,!?*@'~") return quote(force_bytes(iri), safe=b"/#%[]=:;$&()+,!?*@'~")
def uri_to_iri(uri):
    """
    Convert a Uniform Resource Identifier (URI) into an Internationalized
    Resource Identifier (IRI).

    This is the algorithm from section 3.2 of RFC 3987.

    Takes a URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns
    unicode containing the decoded result (e.g. '/I ♥ Django/'). ``None`` is
    passed through unchanged.
    """
    if uri is None:
        return None
    raw = force_bytes(uri)
    # Percent-decode to a byte string; Python 3 has a dedicated function
    # for getting bytes rather than text back.
    if six.PY3:
        unquoted = unquote_to_bytes(raw)
    else:
        unquoted = unquote(raw)
    # Octets that do not form valid UTF-8 are percent-encoded again before
    # the final decode.
    return repercent_broken_unicode(unquoted).decode('utf-8')
def repercent_broken_unicode(path):
    """
    As per section 3.2 of RFC 3987, step three of converting a URI into an IRI,
    re-percent-encode any octet produced that is not part of a strictly legal
    UTF-8 octet sequence.

    Takes a byte string and returns a byte string that decodes cleanly as
    UTF-8: valid sequences are kept as they are and every offending run of
    octets is replaced by its percent-encoded form (a lone 0xE2 octet becomes
    ``%E2``).
    """
    fixed_parts = []
    while True:
        try:
            path.decode('utf-8')
        except UnicodeDecodeError as e:
            # Loop instead of recursing: one stack frame per broken octet
            # would let a request path with many invalid octets exhaust the
            # recursion limit. Everything before e.start is known to be
            # valid, so it is set aside and only the remainder is examined
            # on the next pass.
            repercent = quote(path[e.start:e.end], safe=b"/#%[]=:;$&()+,!?*@'~")
            # quote() output is plain ASCII, so this encode cannot fail.
            fixed_parts.append(path[:e.start] + repercent.encode('ascii'))
            path = path[e.end:]
        else:
            return b''.join(fixed_parts) + path
def filepath_to_uri(path): def filepath_to_uri(path):
"""Convert a file system path to a URI portion that is suitable for """Convert a file system path to a URI portion that is suitable for
inclusion in a URL. inclusion in a URL.

View File

@ -173,11 +173,11 @@ URL from an IRI_ -- very loosely speaking, a URI_ that can contain Unicode
characters. Quoting and converting an IRI to URI can be a little tricky, so characters. Quoting and converting an IRI to URI can be a little tricky, so
Django provides some assistance. Django provides some assistance.
* The function ``django.utils.encoding.iri_to_uri()`` implements the * The function :func:`django.utils.encoding.iri_to_uri()` implements the
conversion from IRI to URI as required by the specification (:rfc:`3987`). conversion from IRI to URI as required by the specification (:rfc:`3987#section-3.1`).
* The functions ``django.utils.http.urlquote()`` and * The functions :func:`django.utils.http.urlquote()` and
``django.utils.http.urlquote_plus()`` are versions of Python's standard :func:`django.utils.http.urlquote_plus()` are versions of Python's standard
``urllib.quote()`` and ``urllib.quote_plus()`` that work with non-ASCII ``urllib.quote()`` and ``urllib.quote_plus()`` that work with non-ASCII
characters. (The data is converted to UTF-8 prior to encoding.) characters. (The data is converted to UTF-8 prior to encoding.)
@ -213,12 +213,29 @@ you can construct your IRI without worrying about whether it contains
non-ASCII characters and then, right at the end, call ``iri_to_uri()`` on the non-ASCII characters and then, right at the end, call ``iri_to_uri()`` on the
result. result.
The ``iri_to_uri()`` function is also idempotent, which means the following is Similarly, Django provides :func:`django.utils.encoding.uri_to_iri()` which
always true:: implements the conversion from URI to IRI as per :rfc:`3987#section-3.2`.
It decodes all percent-encodings except those that don't represent a valid
UTF-8 sequence.
An example to demonstrate::
>>> uri_to_iri('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93')
'/♥♥/?utf8=✓'
>>> uri_to_iri('%A9helloworld')
'%A9helloworld'
In the first example, the UTF-8 characters and reserved characters are
unquoted. In the second, the percent-encoding remains unchanged because
``%A9`` on its own is not part of a valid UTF-8 byte sequence.
Both ``iri_to_uri()`` and ``uri_to_iri()`` functions are idempotent, which means the
following is always true::
iri_to_uri(iri_to_uri(some_string)) = iri_to_uri(some_string) iri_to_uri(iri_to_uri(some_string)) = iri_to_uri(some_string)
uri_to_iri(uri_to_iri(some_string)) = uri_to_iri(some_string)
So you can safely call it multiple times on the same IRI without risking So you can safely call it multiple times on the same URI/IRI without risking
double-quoting problems. double-quoting problems.
.. _URI: http://www.ietf.org/rfc/rfc2396.txt .. _URI: http://www.ietf.org/rfc/rfc2396.txt

View File

@ -271,7 +271,20 @@ The functions defined in this module share the following properties:
since we are assuming input is either UTF-8 or unicode already, we can since we are assuming input is either UTF-8 or unicode already, we can
simplify things a little from the full method. simplify things a little from the full method.
Returns an ASCII string containing the encoded result. Takes an IRI in UTF-8 bytes and returns ASCII bytes containing the encoded
result.
.. function:: uri_to_iri(uri)
.. versionadded:: 1.8
Converts a Uniform Resource Identifier into an Internationalized Resource
Identifier.
This is the algorithm from :rfc:`3987#section-3.2`.
Takes a URI in ASCII bytes and returns a unicode string containing the
decoded result.
.. function:: filepath_to_uri(path) .. function:: filepath_to_uri(path)

View File

@ -348,6 +348,9 @@ Requests and Responses
* The :attr:`HttpResponse.charset <django.http.HttpResponse.charset>` attribute * The :attr:`HttpResponse.charset <django.http.HttpResponse.charset>` attribute
was added. was added.
* ``WSGIRequestHandler`` now follows :rfc:`3987#section-3.2` when converting
the request URI to an IRI, using ``uri_to_iri()``.
Tests Tests
^^^^^ ^^^^^

View File

@ -161,3 +161,28 @@ class HandlerSuspiciousOpsTest(TestCase):
def test_suspiciousop_in_view_returns_400(self): def test_suspiciousop_in_view_returns_400(self):
response = self.client.get('/suspicious/') response = self.client.get('/suspicious/')
self.assertEqual(response.status_code, 400) self.assertEqual(response.status_code, 400)
@override_settings(ROOT_URLCONF='handlers.urls')
class HandlerNotFoundTest(TestCase):
    """
    Requests for unknown paths that contain broken percent-encoded UTF-8.
    """

    def test_invalid_urls(self):
        """
        Each path yields a 404 whose body contains the expected fragment:
        invalid octets stay percent-encoded (in upper-case hex) while valid
        sequences appear decoded.
        """
        # (requested path, fragment expected in the 404 response)
        cases = (
            ('~%A9helloworld', '~%A9helloworld'),
            ('d%aao%aaw%aan%aal%aao%aaa%aad%aa/', 'd%AAo%AAw%AAn%AAl%AAo%AAa%AAd%AA'),
            ('/%E2%99%E2%99%A5/', '%E2%99\u2665'),
            ('/%E2%98%8E%E2%A9%E2%99%A5/', '\u260e%E2%A9\u2665'),
        )
        for requested, fragment in cases:
            response = self.client.get(requested)
            self.assertEqual(response.status_code, 404)
            self.assertContains(response, fragment, status_code=404)

    def test_environ_path_info_type(self):
        """PATH_INFO in the generated environ is a text string."""
        request = RequestFactory().get('/%E2%A8%87%87%A5%E2%A8%A0')
        path_info = request.environ['PATH_INFO']
        self.assertIsInstance(path_info, six.text_type)

View File

@ -5,8 +5,8 @@ import unittest
import datetime import datetime
from django.utils import six from django.utils import six
from django.utils.encoding import (filepath_to_uri, force_bytes, from django.utils.encoding import (filepath_to_uri, force_bytes, force_text,
force_text, iri_to_uri, python_2_unicode_compatible) iri_to_uri, uri_to_iri)
from django.utils.http import urlquote_plus from django.utils.http import urlquote_plus
@ -40,6 +40,9 @@ class TestEncodingUtils(unittest.TestCase):
today = datetime.date.today() today = datetime.date.today()
self.assertEqual(force_bytes(today, strings_only=True), today) self.assertEqual(force_bytes(today, strings_only=True), today)
class TestRFC3987IEncodingUtils(unittest.TestCase):
def test_filepath_to_uri(self): def test_filepath_to_uri(self):
self.assertEqual(filepath_to_uri('upload\\чубака.mp4'), self.assertEqual(filepath_to_uri('upload\\чубака.mp4'),
'upload/%D1%87%D1%83%D0%B1%D0%B0%D0%BA%D0%B0.mp4') 'upload/%D1%87%D1%83%D0%B1%D0%B0%D0%BA%D0%B0.mp4')
@ -47,22 +50,57 @@ class TestEncodingUtils(unittest.TestCase):
'upload/%D1%87%D1%83%D0%B1%D0%B0%D0%BA%D0%B0.mp4') 'upload/%D1%87%D1%83%D0%B1%D0%B0%D0%BA%D0%B0.mp4')
def test_iri_to_uri(self): def test_iri_to_uri(self):
self.assertEqual(iri_to_uri('red%09ros\xe9#red'), cases = [
'red%09ros%C3%A9#red') # Valid UTF-8 sequences are encoded.
('red%09rosé#red', 'red%09ros%C3%A9#red'),
('/blog/for/Jürgen Münster/', '/blog/for/J%C3%BCrgen%20M%C3%BCnster/'),
('locations/%s' % urlquote_plus('Paris & Orléans'), 'locations/Paris+%26+Orl%C3%A9ans'),
self.assertEqual(iri_to_uri('/blog/for/J\xfcrgen M\xfcnster/'), # Reserved chars remain unescaped.
'/blog/for/J%C3%BCrgen%20M%C3%BCnster/') ('%&', '%&'),
('red&♥ros%#red', 'red&%E2%99%A5ros%#red'),
]
self.assertEqual(iri_to_uri('locations/%s' % urlquote_plus('Paris & Orl\xe9ans')), for iri, uri in cases:
'locations/Paris+%26+Orl%C3%A9ans') self.assertEqual(iri_to_uri(iri), uri)
def test_iri_to_uri_idempotent(self): # Test idempotency.
self.assertEqual(iri_to_uri(iri_to_uri('red%09ros\xe9#red')), self.assertEqual(iri_to_uri(iri_to_uri(iri)), uri)
'red%09ros%C3%A9#red')
@unittest.skipIf(six.PY3, "tests a class not defining __str__ under Python 2") def test_uri_to_iri(self):
def test_decorated_class_without_str(self): cases = [
with self.assertRaises(ValueError): # Valid UTF-8 sequences are decoded.
@python_2_unicode_compatible ('/%E2%99%A5%E2%99%A5/', '/♥♥/'),
class NoStr(object): ('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'),
pass
# Broken UTF-8 sequences remain escaped.
('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'),
('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'),
('/%E2%99%A5%E2%99%E2%99%A5/', '/♥%E2%99♥/'),
('/%E2%E2%99%A5%E2%99%A5%99/', '/%E2♥♥%99/'),
('/%E2%99%A5%E2%99%A5/?utf8=%9C%93%E2%9C%93%9C%93', '/♥♥/?utf8=%9C%93✓%9C%93'),
]
for uri, iri in cases:
self.assertEqual(uri_to_iri(uri), iri)
# Test idempotency.
self.assertEqual(uri_to_iri(uri_to_iri(uri)), iri)
def test_complementarity(self):
    """
    Round-tripping through both converters gives back the starting value:
    URI -> IRI -> URI and IRI -> URI -> IRI.
    """
    # (URI form, IRI form)
    pairs = (
        ('/blog/for/J%C3%BCrgen%20M%C3%BCnster/', '/blog/for/J\xfcrgen M\xfcnster/'),
        ('%&', '%&'),
        ('red&%E2%99%A5ros%#red', 'red&♥ros%#red'),
        ('/%E2%99%A5%E2%99%A5/', '/♥♥/'),
        ('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'),
        ('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'),
        ('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'),
        ('/%E2%99%A5%E2%99%E2%99%A5/', '/♥%E2%99♥/'),
        ('/%E2%E2%99%A5%E2%99%A5%99/', '/%E2♥♥%99/'),
        ('/%E2%99%A5%E2%99%A5/?utf8=%9C%93%E2%9C%93%9C%93', '/♥♥/?utf8=%9C%93✓%9C%93'),
    )
    for uri_form, iri_form in pairs:
        self.assertEqual(iri_to_uri(uri_to_iri(uri_form)), uri_form)
        self.assertEqual(uri_to_iri(iri_to_uri(iri_form)), iri_form)