From 10b17a22bec2eaf44c3315614aea87c127caee46 Mon Sep 17 00:00:00 2001 From: Anubhav Joshi Date: Tue, 22 Jul 2014 17:55:22 +0530 Subject: [PATCH] Fixed #19508 -- Implemented uri_to_iri as per RFC. Thanks Loic Bistuer for helping in shaping the patch and Claude Paroz for the review. --- django/core/handlers/wsgi.py | 11 ++--- django/core/servers/basehttp.py | 17 +++++++ django/test/client.py | 16 +++---- django/utils/encoding.py | 41 ++++++++++++++++- docs/ref/unicode.txt | 31 ++++++++++--- docs/ref/utils.txt | 15 ++++++- docs/releases/1.8.txt | 3 ++ tests/handlers/tests.py | 25 +++++++++++ tests/utils_tests/test_encoding.py | 72 +++++++++++++++++++++++------- 9 files changed, 189 insertions(+), 42 deletions(-) diff --git a/django/core/handlers/wsgi.py b/django/core/handlers/wsgi.py index 03138bb781..b947177bd1 100644 --- a/django/core/handlers/wsgi.py +++ b/django/core/handlers/wsgi.py @@ -206,7 +206,6 @@ def get_path_info(environ): """ path_info = get_bytes_from_wsgi(environ, 'PATH_INFO', '/') - # It'd be better to implement URI-to-IRI decoding, see #19508. return path_info.decode(UTF_8) @@ -236,7 +235,6 @@ def get_script_name(environ): else: script_name = get_bytes_from_wsgi(environ, 'SCRIPT_NAME', '') - # It'd be better to implement URI-to-IRI decoding, see #19508. return script_name.decode(UTF_8) @@ -251,16 +249,15 @@ def get_bytes_from_wsgi(environ, key, default): # Under Python 3, non-ASCII values in the WSGI environ are arbitrarily # decoded with ISO-8859-1. This is wrong for Django websites where UTF-8 # is the default. Re-encode to recover the original bytestring. - return value if six.PY2 else value.encode(ISO_8859_1) + return value.encode(ISO_8859_1) if six.PY3 else value def get_str_from_wsgi(environ, key, default): """ - Get a value from the WSGI environ dictionary as bytes. + Get a value from the WSGI environ dictionary as str. key and default should be str objects. 
Under Python 2 they may also be unicode objects provided they only contain ASCII characters. """ - value = environ.get(str(key), str(default)) - # Same comment as above - return value if six.PY2 else value.encode(ISO_8859_1).decode(UTF_8, errors='replace') + value = get_bytes_from_wsgi(environ, key, default) + return value.decode(UTF_8, errors='replace') if six.PY3 else value diff --git a/django/core/servers/basehttp.py b/django/core/servers/basehttp.py index cea1dd5057..9ba9ede43a 100644 --- a/django/core/servers/basehttp.py +++ b/django/core/servers/basehttp.py @@ -15,9 +15,11 @@ from wsgiref import simple_server from wsgiref.util import FileWrapper # NOQA: for backwards compatibility from django.core.exceptions import ImproperlyConfigured +from django.core.handlers.wsgi import ISO_8859_1, UTF_8 from django.core.management.color import color_style from django.core.wsgi import get_wsgi_application from django.utils import six +from django.utils.encoding import uri_to_iri from django.utils.module_loading import import_string from django.utils.six.moves import socketserver @@ -117,6 +119,21 @@ class WSGIRequestHandler(simple_server.WSGIRequestHandler, object): sys.stderr.write(msg) + def get_environ(self): + env = super(WSGIRequestHandler, self).get_environ() + + path = self.path + if '?' in path: + path = path.partition('?')[0] + + path = uri_to_iri(path).encode(UTF_8) + # Under Python 3, non-ASCII values in the WSGI environ are arbitrarily + # decoded with ISO-8859-1. We replicate this behavior here. + # Refs comment in `get_bytes_from_wsgi()`. 
+ env['PATH_INFO'] = path.decode(ISO_8859_1) if six.PY3 else path + + return env + def run(addr, port, wsgi_handler, ipv6=False, threading=False): server_address = (addr, port) diff --git a/django/test/client.py b/django/test/client.py index f2bbbfe77d..a3a8f21a63 100644 --- a/django/test/client.py +++ b/django/test/client.py @@ -12,7 +12,7 @@ from django.apps import apps from django.conf import settings from django.core import urlresolvers from django.core.handlers.base import BaseHandler -from django.core.handlers.wsgi import WSGIRequest +from django.core.handlers.wsgi import WSGIRequest, ISO_8859_1, UTF_8 from django.core.signals import (request_started, request_finished, got_request_exception) from django.db import close_old_connections @@ -20,11 +20,11 @@ from django.http import SimpleCookie, HttpRequest, QueryDict from django.template import TemplateDoesNotExist from django.test import signals from django.utils.functional import curry, SimpleLazyObject -from django.utils.encoding import force_bytes, force_str +from django.utils.encoding import force_bytes, force_str, uri_to_iri from django.utils.http import urlencode from django.utils.itercompat import is_iterable from django.utils import six -from django.utils.six.moves.urllib.parse import unquote, urlparse, urlsplit +from django.utils.six.moves.urllib.parse import urlparse, urlsplit from django.test.utils import ContextList __all__ = ('Client', 'RequestFactory', 'encode_file', 'encode_multipart') @@ -270,11 +270,11 @@ class RequestFactory(object): # If there are parameters, add them if parsed[3]: path += str(";") + force_str(parsed[3]) - path = unquote(path) - # WSGI requires latin-1 encoded strings. See get_path_info(). - if six.PY3: - path = path.encode('utf-8').decode('iso-8859-1') - return path + path = uri_to_iri(path).encode(UTF_8) + # Under Python 3, non-ASCII values in the WSGI environ are arbitrarily + # decoded with ISO-8859-1. We replicate this behavior here. 
+ # Refs comment in `get_bytes_from_wsgi()`. + return path.decode(ISO_8859_1) if six.PY3 else path def get(self, path, data=None, secure=False, **extra): "Construct a GET request." diff --git a/django/utils/encoding.py b/django/utils/encoding.py index beb5e54ae8..3abee09c52 100644 --- a/django/utils/encoding.py +++ b/django/utils/encoding.py @@ -1,3 +1,4 @@ +# -*- encoding: utf-8 -*- from __future__ import unicode_literals import codecs @@ -7,7 +8,9 @@ import locale from django.utils.functional import Promise from django.utils import six -from django.utils.six.moves.urllib.parse import quote +from django.utils.six.moves.urllib.parse import quote, unquote +if six.PY3: + from urllib.parse import unquote_to_bytes class DjangoUnicodeDecodeError(UnicodeDecodeError): @@ -185,7 +188,9 @@ def iri_to_uri(iri): assuming input is either UTF-8 or unicode already, we can simplify things a little from the full method. - Returns an ASCII string containing the encoded result. + Takes an IRI in UTF-8 bytes (e.g. '/I \xe2\x99\xa5 Django/') or unicode + (e.g. '/I ♥ Django/') and returns ASCII bytes containing the encoded result + (e.g. '/I%20%E2%99%A5%20Django/'). """ # The list of safe characters here is constructed from the "reserved" and # "unreserved" characters specified in sections 2.2 and 2.3 of RFC 3986: @@ -204,6 +209,38 @@ def iri_to_uri(iri): return quote(force_bytes(iri), safe=b"/#%[]=:;$&()+,!?*@'~") +def uri_to_iri(uri): + """ + Converts a Uniform Resource Identifier(URI) into an Internationalized + Resource Identifier(IRI). + + This is the algorithm from section 3.2 of RFC 3987. + + Takes an URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns + unicode containing the encoded result (e.g. '/I \xe2\x99\xa5 Django/'). 
+ """ + if uri is None: + return uri + uri = force_bytes(uri) + iri = unquote_to_bytes(uri) if six.PY3 else unquote(uri) + return repercent_broken_unicode(iri).decode('utf-8') + + +def repercent_broken_unicode(path): + """ + As per section 3.2 of RFC 3987, step three of converting a URI into an IRI, + we need to re-percent-encode any octet produced that is not part of a + strictly legal UTF-8 octet sequence. + """ + try: + path.decode('utf-8') + except UnicodeDecodeError as e: + repercent = quote(path[e.start:e.end], safe=b"/#%[]=:;$&()+,!?*@'~") + path = repercent_broken_unicode( + path[:e.start] + force_bytes(repercent) + path[e.end:]) + return path + + def filepath_to_uri(path): """Convert a file system path to a URI portion that is suitable for inclusion in a URL. diff --git a/docs/ref/unicode.txt b/docs/ref/unicode.txt index 90201d2d33..21e8c537c8 100644 --- a/docs/ref/unicode.txt +++ b/docs/ref/unicode.txt @@ -173,11 +173,11 @@ URL from an IRI_ -- very loosely speaking, a URI_ that can contain Unicode characters. Quoting and converting an IRI to URI can be a little tricky, so Django provides some assistance. -* The function ``django.utils.encoding.iri_to_uri()`` implements the - conversion from IRI to URI as required by the specification (:rfc:`3987`). +* The function :func:`django.utils.encoding.iri_to_uri()` implements the + conversion from IRI to URI as required by the specification (:rfc:`3987#section-3.1`). -* The functions ``django.utils.http.urlquote()`` and - ``django.utils.http.urlquote_plus()`` are versions of Python's standard +* The functions :func:`django.utils.http.urlquote()` and + :func:`django.utils.http.urlquote_plus()` are versions of Python's standard ``urllib.quote()`` and ``urllib.quote_plus()`` that work with non-ASCII characters. (The data is converted to UTF-8 prior to encoding.) 
@@ -213,12 +213,29 @@ you can construct your IRI without worrying about whether it contains non-ASCII characters and then, right at the end, call ``iri_to_uri()`` on the result. -The ``iri_to_uri()`` function is also idempotent, which means the following is -always true:: +Similarly, Django provides :func:`django.utils.encoding.uri_to_iri()` which +implements the conversion from URI to IRI as per :rfc:`3987#section-3.2`. +It decodes all percent-encodings except those that don't represent a valid +UTF-8 sequence. + +An example to demonstrate:: + + >>> uri_to_iri('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93') + '/♥♥/?utf8=✓' + >>> uri_to_iri('%A9helloworld') + '%A9helloworld' + +In the first example, the UTF-8 characters and reserved characters are +unquoted. In the second, the percent-encoding remains unchanged because it +lies outside the valid UTF-8 range. + +Both the ``iri_to_uri()`` and ``uri_to_iri()`` functions are idempotent, which means the +following is always true:: iri_to_uri(iri_to_uri(some_string)) = iri_to_uri(some_string) + uri_to_iri(uri_to_iri(some_string)) = uri_to_iri(some_string) -So you can safely call it multiple times on the same IRI without risking +So you can safely call either function multiple times on the same URI/IRI without risking double-quoting problems. .. _URI: http://www.ietf.org/rfc/rfc2396.txt diff --git a/docs/ref/utils.txt b/docs/ref/utils.txt index c38579cb7a..1cbc23449b 100644 --- a/docs/ref/utils.txt +++ b/docs/ref/utils.txt @@ -271,7 +271,20 @@ The functions defined in this module share the following properties: since we are assuming input is either UTF-8 or unicode already, we can simplify things a little from the full method. - Returns an ASCII string containing the encoded result. + Takes an IRI in UTF-8 bytes and returns ASCII bytes containing the encoded + result. + +.. function:: uri_to_iri(uri) + + .. versionadded:: 1.8 + + Converts a Uniform Resource Identifier into an Internationalized Resource + Identifier. 
+ + This is the algorithm from :rfc:`3987#section-3.2`. + + Takes a URI in ASCII bytes and returns a unicode string containing the + decoded result. .. function:: filepath_to_uri(path) diff --git a/docs/releases/1.8.txt b/docs/releases/1.8.txt index 7cdb6aaf77..94d09eed4f 100644 --- a/docs/releases/1.8.txt +++ b/docs/releases/1.8.txt @@ -348,6 +348,9 @@ Requests and Responses * The :attr:`HttpResponse.charset <django.http.HttpResponse.charset>` attribute was added. +* ``WSGIRequestHandler`` now follows :rfc:`3987#section-3.2` in converting URI to IRI, using + ``uri_to_iri()``. + Tests ^^^^^ diff --git a/tests/handlers/tests.py b/tests/handlers/tests.py index 689b0ed9d8..f574418ae2 100644 --- a/tests/handlers/tests.py +++ b/tests/handlers/tests.py @@ -161,3 +161,28 @@ class HandlerSuspiciousOpsTest(TestCase): def test_suspiciousop_in_view_returns_400(self): response = self.client.get('/suspicious/') self.assertEqual(response.status_code, 400) + + +@override_settings(ROOT_URLCONF='handlers.urls') +class HandlerNotFoundTest(TestCase): + + def test_invalid_urls(self): + response = self.client.get('~%A9helloworld') + self.assertEqual(response.status_code, 404) + self.assertContains(response, '~%A9helloworld', status_code=404) + + response = self.client.get('d%aao%aaw%aan%aal%aao%aaa%aad%aa/') + self.assertEqual(response.status_code, 404) + self.assertContains(response, 'd%AAo%AAw%AAn%AAl%AAo%AAa%AAd%AA', status_code=404) + + response = self.client.get('/%E2%99%E2%99%A5/') + self.assertEqual(response.status_code, 404) + self.assertContains(response, '%E2%99\u2665', status_code=404) + + response = self.client.get('/%E2%98%8E%E2%A9%E2%99%A5/') + self.assertEqual(response.status_code, 404) + self.assertContains(response, '\u260e%E2%A9\u2665', status_code=404) + + def test_environ_path_info_type(self): + environ = RequestFactory().get('/%E2%A8%87%87%A5%E2%A8%A0').environ + self.assertIsInstance(environ['PATH_INFO'], six.text_type) diff --git a/tests/utils_tests/test_encoding.py b/tests/utils_tests/test_encoding.py 
index 526fb709ce..1685c82def 100644 --- a/tests/utils_tests/test_encoding.py +++ b/tests/utils_tests/test_encoding.py @@ -5,8 +5,8 @@ import unittest import datetime from django.utils import six -from django.utils.encoding import (filepath_to_uri, force_bytes, - force_text, iri_to_uri, python_2_unicode_compatible) +from django.utils.encoding import (filepath_to_uri, force_bytes, force_text, + iri_to_uri, uri_to_iri) from django.utils.http import urlquote_plus @@ -40,6 +40,9 @@ class TestEncodingUtils(unittest.TestCase): today = datetime.date.today() self.assertEqual(force_bytes(today, strings_only=True), today) + +class TestRFC3987IEncodingUtils(unittest.TestCase): + def test_filepath_to_uri(self): self.assertEqual(filepath_to_uri('upload\\чубака.mp4'), 'upload/%D1%87%D1%83%D0%B1%D0%B0%D0%BA%D0%B0.mp4') @@ -47,22 +50,57 @@ class TestEncodingUtils(unittest.TestCase): 'upload/%D1%87%D1%83%D0%B1%D0%B0%D0%BA%D0%B0.mp4') def test_iri_to_uri(self): - self.assertEqual(iri_to_uri('red%09ros\xe9#red'), - 'red%09ros%C3%A9#red') + cases = [ + # Valid UTF-8 sequences are encoded. + ('red%09rosé#red', 'red%09ros%C3%A9#red'), + ('/blog/for/Jürgen Münster/', '/blog/for/J%C3%BCrgen%20M%C3%BCnster/'), + ('locations/%s' % urlquote_plus('Paris & Orléans'), 'locations/Paris+%26+Orl%C3%A9ans'), - self.assertEqual(iri_to_uri('/blog/for/J\xfcrgen M\xfcnster/'), - '/blog/for/J%C3%BCrgen%20M%C3%BCnster/') + # Reserved chars remain unescaped. + ('%&', '%&'), + ('red&♥ros%#red', 'red&%E2%99%A5ros%#red'), + ] - self.assertEqual(iri_to_uri('locations/%s' % urlquote_plus('Paris & Orl\xe9ans')), - 'locations/Paris+%26+Orl%C3%A9ans') + for iri, uri in cases: + self.assertEqual(iri_to_uri(iri), uri) - def test_iri_to_uri_idempotent(self): - self.assertEqual(iri_to_uri(iri_to_uri('red%09ros\xe9#red')), - 'red%09ros%C3%A9#red') + # Test idempotency. 
+ self.assertEqual(iri_to_uri(iri_to_uri(iri)), uri) - @unittest.skipIf(six.PY3, "tests a class not defining __str__ under Python 2") - def test_decorated_class_without_str(self): - with self.assertRaises(ValueError): - @python_2_unicode_compatible - class NoStr(object): - pass + def test_uri_to_iri(self): + cases = [ + # Valid UTF-8 sequences are decoded. + ('/%E2%99%A5%E2%99%A5/', '/♥♥/'), + ('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'), + + # Broken UTF-8 sequences remain escaped. + ('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'), + ('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'), + ('/%E2%99%A5%E2%99%E2%99%A5/', '/♥%E2%99♥/'), + ('/%E2%E2%99%A5%E2%99%A5%99/', '/%E2♥♥%99/'), + ('/%E2%99%A5%E2%99%A5/?utf8=%9C%93%E2%9C%93%9C%93', '/♥♥/?utf8=%9C%93✓%9C%93'), + ] + + for uri, iri in cases: + self.assertEqual(uri_to_iri(uri), iri) + + # Test idempotency. + self.assertEqual(uri_to_iri(uri_to_iri(uri)), iri) + + def test_complementarity(self): + cases = [ + ('/blog/for/J%C3%BCrgen%20M%C3%BCnster/', '/blog/for/J\xfcrgen M\xfcnster/'), + ('%&', '%&'), + ('red&%E2%99%A5ros%#red', 'red&♥ros%#red'), + ('/%E2%99%A5%E2%99%A5/', '/♥♥/'), + ('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'), + ('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'), + ('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'), + ('/%E2%99%A5%E2%99%E2%99%A5/', '/♥%E2%99♥/'), + ('/%E2%E2%99%A5%E2%99%A5%99/', '/%E2♥♥%99/'), + ('/%E2%99%A5%E2%99%A5/?utf8=%9C%93%E2%9C%93%9C%93', '/♥♥/?utf8=%9C%93✓%9C%93'), + ] + + for uri, iri in cases: + self.assertEqual(iri_to_uri(uri_to_iri(uri)), uri) + self.assertEqual(uri_to_iri(iri_to_uri(iri)), iri)