From 10b17a22bec2eaf44c3315614aea87c127caee46 Mon Sep 17 00:00:00 2001
From: Anubhav Joshi <anubhav9042@gmail.com>
Date: Tue, 22 Jul 2014 17:55:22 +0530
Subject: [PATCH] Fixed #19508 -- Implemented uri_to_iri as per RFC.

Thanks Loic Bistuer for helping in shaping the patch and Claude Paroz
for the review.
---
 django/core/handlers/wsgi.py       | 11 ++---
 django/core/servers/basehttp.py    | 17 +++++++
 django/test/client.py              | 16 +++----
 django/utils/encoding.py           | 41 ++++++++++++++++-
 docs/ref/unicode.txt               | 31 ++++++++++---
 docs/ref/utils.txt                 | 15 ++++++-
 docs/releases/1.8.txt              |  3 ++
 tests/handlers/tests.py            | 25 +++++++++++
 tests/utils_tests/test_encoding.py | 72 +++++++++++++++++++++++-------
 9 files changed, 189 insertions(+), 42 deletions(-)

diff --git a/django/core/handlers/wsgi.py b/django/core/handlers/wsgi.py
index 03138bb781..b947177bd1 100644
--- a/django/core/handlers/wsgi.py
+++ b/django/core/handlers/wsgi.py
@@ -206,7 +206,6 @@ def get_path_info(environ):
     """
     path_info = get_bytes_from_wsgi(environ, 'PATH_INFO', '/')
 
-    # It'd be better to implement URI-to-IRI decoding, see #19508.
     return path_info.decode(UTF_8)
 
 
@@ -236,7 +235,6 @@ def get_script_name(environ):
     else:
         script_name = get_bytes_from_wsgi(environ, 'SCRIPT_NAME', '')
 
-    # It'd be better to implement URI-to-IRI decoding, see #19508.
     return script_name.decode(UTF_8)
 
 
@@ -251,16 +249,15 @@ def get_bytes_from_wsgi(environ, key, default):
     # Under Python 3, non-ASCII values in the WSGI environ are arbitrarily
     # decoded with ISO-8859-1. This is wrong for Django websites where UTF-8
     # is the default. Re-encode to recover the original bytestring.
-    return value if six.PY2 else value.encode(ISO_8859_1)
+    return value.encode(ISO_8859_1) if six.PY3 else value
 
 
 def get_str_from_wsgi(environ, key, default):
     """
-    Get a value from the WSGI environ dictionary as bytes.
+    Get a value from the WSGI environ dictionary as str.
 
     key and default should be str objects. Under Python 2 they may also be
     unicode objects provided they only contain ASCII characters.
     """
-    value = environ.get(str(key), str(default))
-    # Same comment as above
-    return value if six.PY2 else value.encode(ISO_8859_1).decode(UTF_8, errors='replace')
+    value = get_bytes_from_wsgi(environ, key, default)
+    return value.decode(UTF_8, errors='replace') if six.PY3 else value
diff --git a/django/core/servers/basehttp.py b/django/core/servers/basehttp.py
index cea1dd5057..9ba9ede43a 100644
--- a/django/core/servers/basehttp.py
+++ b/django/core/servers/basehttp.py
@@ -15,9 +15,11 @@ from wsgiref import simple_server
 from wsgiref.util import FileWrapper   # NOQA: for backwards compatibility
 
 from django.core.exceptions import ImproperlyConfigured
+from django.core.handlers.wsgi import ISO_8859_1, UTF_8
 from django.core.management.color import color_style
 from django.core.wsgi import get_wsgi_application
 from django.utils import six
+from django.utils.encoding import uri_to_iri
 from django.utils.module_loading import import_string
 from django.utils.six.moves import socketserver
 
@@ -117,6 +119,21 @@ class WSGIRequestHandler(simple_server.WSGIRequestHandler, object):
 
         sys.stderr.write(msg)
 
+    def get_environ(self):
+        env = super(WSGIRequestHandler, self).get_environ()
+
+        path = self.path
+        if '?' in path:
+            path = path.partition('?')[0]
+
+        path = uri_to_iri(path).encode(UTF_8)
+        # Under Python 3, non-ASCII values in the WSGI environ are arbitrarily
+        # decoded with ISO-8859-1. We replicate this behavior here.
+        # Refs comment in `get_bytes_from_wsgi()`.
+        env['PATH_INFO'] = path.decode(ISO_8859_1) if six.PY3 else path
+
+        return env
+
 
 def run(addr, port, wsgi_handler, ipv6=False, threading=False):
     server_address = (addr, port)
diff --git a/django/test/client.py b/django/test/client.py
index f2bbbfe77d..a3a8f21a63 100644
--- a/django/test/client.py
+++ b/django/test/client.py
@@ -12,7 +12,7 @@ from django.apps import apps
 from django.conf import settings
 from django.core import urlresolvers
 from django.core.handlers.base import BaseHandler
-from django.core.handlers.wsgi import WSGIRequest
+from django.core.handlers.wsgi import WSGIRequest, ISO_8859_1, UTF_8
 from django.core.signals import (request_started, request_finished,
     got_request_exception)
 from django.db import close_old_connections
@@ -20,11 +20,11 @@ from django.http import SimpleCookie, HttpRequest, QueryDict
 from django.template import TemplateDoesNotExist
 from django.test import signals
 from django.utils.functional import curry, SimpleLazyObject
-from django.utils.encoding import force_bytes, force_str
+from django.utils.encoding import force_bytes, force_str, uri_to_iri
 from django.utils.http import urlencode
 from django.utils.itercompat import is_iterable
 from django.utils import six
-from django.utils.six.moves.urllib.parse import unquote, urlparse, urlsplit
+from django.utils.six.moves.urllib.parse import urlparse, urlsplit
 from django.test.utils import ContextList
 
 __all__ = ('Client', 'RequestFactory', 'encode_file', 'encode_multipart')
@@ -270,11 +270,11 @@ class RequestFactory(object):
         # If there are parameters, add them
         if parsed[3]:
             path += str(";") + force_str(parsed[3])
-        path = unquote(path)
-        # WSGI requires latin-1 encoded strings. See get_path_info().
-        if six.PY3:
-            path = path.encode('utf-8').decode('iso-8859-1')
-        return path
+        path = uri_to_iri(path).encode(UTF_8)
+        # Under Python 3, non-ASCII values in the WSGI environ are arbitrarily
+        # decoded with ISO-8859-1. We replicate this behavior here.
+        # Refs comment in `get_bytes_from_wsgi()`.
+        return path.decode(ISO_8859_1) if six.PY3 else path
 
     def get(self, path, data=None, secure=False, **extra):
         "Construct a GET request."
diff --git a/django/utils/encoding.py b/django/utils/encoding.py
index beb5e54ae8..3abee09c52 100644
--- a/django/utils/encoding.py
+++ b/django/utils/encoding.py
@@ -1,3 +1,4 @@
+# -*- encoding: utf-8 -*-
 from __future__ import unicode_literals
 
 import codecs
@@ -7,7 +8,9 @@ import locale
 
 from django.utils.functional import Promise
 from django.utils import six
-from django.utils.six.moves.urllib.parse import quote
+from django.utils.six.moves.urllib.parse import quote, unquote
+if six.PY3:
+    from urllib.parse import unquote_to_bytes
 
 
 class DjangoUnicodeDecodeError(UnicodeDecodeError):
@@ -185,7 +188,9 @@ def iri_to_uri(iri):
     assuming input is either UTF-8 or unicode already, we can simplify things a
     little from the full method.
 
-    Returns an ASCII string containing the encoded result.
+    Takes an IRI in UTF-8 bytes (e.g. '/I \xe2\x99\xa5 Django/') or unicode
+    (e.g. '/I ♥ Django/') and returns an ASCII string containing the encoded
+    result (e.g. '/I%20%E2%99%A5%20Django/').
     """
     # The list of safe characters here is constructed from the "reserved" and
     # "unreserved" characters specified in sections 2.2 and 2.3 of RFC 3986:
@@ -204,6 +209,38 @@ def iri_to_uri(iri):
     return quote(force_bytes(iri), safe=b"/#%[]=:;$&()+,!?*@'~")
 
 
+def uri_to_iri(uri):
+    """
+    Converts a Uniform Resource Identifier (URI) into an Internationalized
+    Resource Identifier (IRI).
+
+    This is the algorithm from section 3.2 of RFC 3987.
+
+    Takes a URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns
+    unicode containing the decoded result (e.g. '/I ♥ Django/').
+    """
+    if uri is None:
+        return uri
+    uri = force_bytes(uri)
+    iri = unquote_to_bytes(uri) if six.PY3 else unquote(uri)
+    return repercent_broken_unicode(iri).decode('utf-8')
+
+
+def repercent_broken_unicode(path):
+    """
+    As per section 3.2 of RFC 3987, step three of converting a URI into an IRI,
+    we need to re-percent-encode any octet produced that is not part of a
+    strictly legal UTF-8 octet sequence.
+    """
+    try:
+        path.decode('utf-8')
+    except UnicodeDecodeError as e:
+        repercent = quote(path[e.start:e.end], safe=b"/#%[]=:;$&()+,!?*@'~")
+        path = repercent_broken_unicode(
+            path[:e.start] + force_bytes(repercent) + path[e.end:])
+    return path
+
+
 def filepath_to_uri(path):
     """Convert a file system path to a URI portion that is suitable for
     inclusion in a URL.
diff --git a/docs/ref/unicode.txt b/docs/ref/unicode.txt
index 90201d2d33..21e8c537c8 100644
--- a/docs/ref/unicode.txt
+++ b/docs/ref/unicode.txt
@@ -173,11 +173,11 @@ URL from an IRI_ -- very loosely speaking, a URI_ that can contain Unicode
 characters. Quoting and converting an IRI to URI can be a little tricky, so
 Django provides some assistance.
 
-* The function ``django.utils.encoding.iri_to_uri()`` implements the
-  conversion from IRI to URI as required by the specification (:rfc:`3987`).
+* The function :func:`django.utils.encoding.iri_to_uri()` implements the
+  conversion from IRI to URI as required by the specification (:rfc:`3987#section-3.1`).
 
-* The functions ``django.utils.http.urlquote()`` and
-  ``django.utils.http.urlquote_plus()`` are versions of Python's standard
+* The functions :func:`django.utils.http.urlquote()` and
+  :func:`django.utils.http.urlquote_plus()` are versions of Python's standard
   ``urllib.quote()`` and ``urllib.quote_plus()`` that work with non-ASCII
   characters. (The data is converted to UTF-8 prior to encoding.)
 
@@ -213,12 +213,29 @@ you can construct your IRI without worrying about whether it contains
 non-ASCII characters and then, right at the end, call ``iri_to_uri()`` on the
 result.
 
-The ``iri_to_uri()`` function is also idempotent, which means the following is
-always true::
+Similarly, Django provides :func:`django.utils.encoding.uri_to_iri()` which
+implements the conversion from URI to IRI as per :rfc:`3987#section-3.2`.
+It decodes all percent-encodings except those that don't represent a valid
+UTF-8 sequence.
+
+An example to demonstrate::
+
+    >>> uri_to_iri('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93')
+    '/♥♥/?utf8=✓'
+    >>> uri_to_iri('%A9helloworld')
+    '%A9helloworld'
+
+In the first example, the percent-encoded UTF-8 sequences are decoded. In the
+second, the percent-encoding remains unchanged because ``%A9`` alone is not a
+valid UTF-8 byte sequence.
+
+Both ``iri_to_uri()`` and ``uri_to_iri()`` functions are idempotent, which means the
+following is always true::
 
     iri_to_uri(iri_to_uri(some_string)) = iri_to_uri(some_string)
+    uri_to_iri(uri_to_iri(some_string)) = uri_to_iri(some_string)
 
-So you can safely call it multiple times on the same IRI without risking
+So you can safely call them multiple times on the same URI/IRI without risking
 double-quoting problems.
 
 .. _URI: http://www.ietf.org/rfc/rfc2396.txt
diff --git a/docs/ref/utils.txt b/docs/ref/utils.txt
index c38579cb7a..1cbc23449b 100644
--- a/docs/ref/utils.txt
+++ b/docs/ref/utils.txt
@@ -271,7 +271,20 @@ The functions defined in this module share the following properties:
     since we are assuming input is either UTF-8 or unicode already, we can
     simplify things a little from the full method.
 
-    Returns an ASCII string containing the encoded result.
+    Takes an IRI (unicode or UTF-8 bytes) and returns an ASCII string
+    containing the encoded result.
+
+.. function:: uri_to_iri(uri)
+
+    .. versionadded:: 1.8
+
+    Converts a Uniform Resource Identifier into an Internationalized Resource
+    Identifier.
+
+    This is the algorithm from section 3.2 of :rfc:`3987#section-3.2`.
+
+    Takes a URI in ASCII bytes and returns a unicode string containing the
+    decoded result.
 
 .. function:: filepath_to_uri(path)
 
diff --git a/docs/releases/1.8.txt b/docs/releases/1.8.txt
index 7cdb6aaf77..94d09eed4f 100644
--- a/docs/releases/1.8.txt
+++ b/docs/releases/1.8.txt
@@ -348,6 +348,9 @@ Requests and Responses
 * The :attr:`HttpResponse.charset <django.http.HttpResponse.charset>` attribute
   was added.
 
+* ``WSGIRequestHandler`` now follows :rfc:`3987#section-3.2` when converting
+  URIs to IRIs, using :func:`~django.utils.encoding.uri_to_iri`.
+
 Tests
 ^^^^^
 
diff --git a/tests/handlers/tests.py b/tests/handlers/tests.py
index 689b0ed9d8..f574418ae2 100644
--- a/tests/handlers/tests.py
+++ b/tests/handlers/tests.py
@@ -161,3 +161,28 @@ class HandlerSuspiciousOpsTest(TestCase):
     def test_suspiciousop_in_view_returns_400(self):
         response = self.client.get('/suspicious/')
         self.assertEqual(response.status_code, 400)
+
+
+@override_settings(ROOT_URLCONF='handlers.urls')
+class HandlerNotFoundTest(TestCase):
+
+    def test_invalid_urls(self):
+        response = self.client.get('~%A9helloworld')
+        self.assertEqual(response.status_code, 404)
+        self.assertContains(response, '~%A9helloworld', status_code=404)
+
+        response = self.client.get('d%aao%aaw%aan%aal%aao%aaa%aad%aa/')
+        self.assertEqual(response.status_code, 404)
+        self.assertContains(response, 'd%AAo%AAw%AAn%AAl%AAo%AAa%AAd%AA', status_code=404)
+
+        response = self.client.get('/%E2%99%E2%99%A5/')
+        self.assertEqual(response.status_code, 404)
+        self.assertContains(response, '%E2%99\u2665', status_code=404)
+
+        response = self.client.get('/%E2%98%8E%E2%A9%E2%99%A5/')
+        self.assertEqual(response.status_code, 404)
+        self.assertContains(response, '\u260e%E2%A9\u2665', status_code=404)
+
+    def test_environ_path_info_type(self):
+        environ = RequestFactory().get('/%E2%A8%87%87%A5%E2%A8%A0').environ
+        self.assertIsInstance(environ['PATH_INFO'], six.text_type)
diff --git a/tests/utils_tests/test_encoding.py b/tests/utils_tests/test_encoding.py
index 526fb709ce..1685c82def 100644
--- a/tests/utils_tests/test_encoding.py
+++ b/tests/utils_tests/test_encoding.py
@@ -5,8 +5,8 @@ import unittest
 import datetime
 
 from django.utils import six
-from django.utils.encoding import (filepath_to_uri, force_bytes,
-    force_text, iri_to_uri, python_2_unicode_compatible)
+from django.utils.encoding import (filepath_to_uri, force_bytes, force_text,
+    iri_to_uri, uri_to_iri)
 from django.utils.http import urlquote_plus
 
 
@@ -40,6 +40,9 @@ class TestEncodingUtils(unittest.TestCase):
         today = datetime.date.today()
         self.assertEqual(force_bytes(today, strings_only=True), today)
 
+
+class TestRFC3987IEncodingUtils(unittest.TestCase):
+
     def test_filepath_to_uri(self):
         self.assertEqual(filepath_to_uri('upload\\чубака.mp4'),
             'upload/%D1%87%D1%83%D0%B1%D0%B0%D0%BA%D0%B0.mp4')
@@ -47,22 +50,57 @@ class TestEncodingUtils(unittest.TestCase):
             'upload/%D1%87%D1%83%D0%B1%D0%B0%D0%BA%D0%B0.mp4')
 
     def test_iri_to_uri(self):
-        self.assertEqual(iri_to_uri('red%09ros\xe9#red'),
-            'red%09ros%C3%A9#red')
+        cases = [
+            # Valid UTF-8 sequences are encoded.
+            ('red%09rosé#red', 'red%09ros%C3%A9#red'),
+            ('/blog/for/Jürgen Münster/', '/blog/for/J%C3%BCrgen%20M%C3%BCnster/'),
+            ('locations/%s' % urlquote_plus('Paris & Orléans'), 'locations/Paris+%26+Orl%C3%A9ans'),
 
-        self.assertEqual(iri_to_uri('/blog/for/J\xfcrgen M\xfcnster/'),
-            '/blog/for/J%C3%BCrgen%20M%C3%BCnster/')
+            # Reserved chars remain unescaped.
+            ('%&', '%&'),
+            ('red&♥ros%#red', 'red&%E2%99%A5ros%#red'),
+        ]
 
-        self.assertEqual(iri_to_uri('locations/%s' % urlquote_plus('Paris & Orl\xe9ans')),
-            'locations/Paris+%26+Orl%C3%A9ans')
+        for iri, uri in cases:
+            self.assertEqual(iri_to_uri(iri), uri)
 
-    def test_iri_to_uri_idempotent(self):
-        self.assertEqual(iri_to_uri(iri_to_uri('red%09ros\xe9#red')),
-            'red%09ros%C3%A9#red')
+            # Test idempotency.
+            self.assertEqual(iri_to_uri(iri_to_uri(iri)), uri)
 
-    @unittest.skipIf(six.PY3, "tests a class not defining __str__ under Python 2")
-    def test_decorated_class_without_str(self):
-        with self.assertRaises(ValueError):
-            @python_2_unicode_compatible
-            class NoStr(object):
-                pass
+    def test_uri_to_iri(self):
+        cases = [
+            # Valid UTF-8 sequences are decoded.
+            ('/%E2%99%A5%E2%99%A5/', '/♥♥/'),
+            ('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'),
+
+            # Broken UTF-8 sequences remain escaped.
+            ('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'),
+            ('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'),
+            ('/%E2%99%A5%E2%99%E2%99%A5/', '/♥%E2%99♥/'),
+            ('/%E2%E2%99%A5%E2%99%A5%99/', '/%E2♥♥%99/'),
+            ('/%E2%99%A5%E2%99%A5/?utf8=%9C%93%E2%9C%93%9C%93', '/♥♥/?utf8=%9C%93✓%9C%93'),
+        ]
+
+        for uri, iri in cases:
+            self.assertEqual(uri_to_iri(uri), iri)
+
+            # Test idempotency.
+            self.assertEqual(uri_to_iri(uri_to_iri(uri)), iri)
+
+    def test_complementarity(self):
+        cases = [
+            ('/blog/for/J%C3%BCrgen%20M%C3%BCnster/', '/blog/for/J\xfcrgen M\xfcnster/'),
+            ('%&', '%&'),
+            ('red&%E2%99%A5ros%#red', 'red&♥ros%#red'),
+            ('/%E2%99%A5%E2%99%A5/', '/♥♥/'),
+            ('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'),
+            ('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'),
+            ('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'),
+            ('/%E2%99%A5%E2%99%E2%99%A5/', '/♥%E2%99♥/'),
+            ('/%E2%E2%99%A5%E2%99%A5%99/', '/%E2♥♥%99/'),
+            ('/%E2%99%A5%E2%99%A5/?utf8=%9C%93%E2%9C%93%9C%93', '/♥♥/?utf8=%9C%93✓%9C%93'),
+        ]
+
+        for uri, iri in cases:
+            self.assertEqual(iri_to_uri(uri_to_iri(uri)), uri)
+            self.assertEqual(uri_to_iri(iri_to_uri(iri)), iri)