diff --git a/django/test/client.py b/django/test/client.py index e904085595..43eb09dd7b 100644 --- a/django/test/client.py +++ b/django/test/client.py @@ -6,7 +6,7 @@ import sys from copy import copy from importlib import import_module from io import BytesIO -from urllib.parse import urljoin, urlparse, urlsplit +from urllib.parse import unquote_to_bytes, urljoin, urlparse, urlsplit from django.conf import settings from django.core.handlers.base import BaseHandler @@ -20,7 +20,7 @@ from django.template import TemplateDoesNotExist from django.test import signals from django.test.utils import ContextList from django.urls import resolve -from django.utils.encoding import force_bytes, uri_to_iri +from django.utils.encoding import force_bytes from django.utils.functional import SimpleLazyObject, curry from django.utils.http import urlencode from django.utils.itercompat import is_iterable @@ -320,7 +320,7 @@ class RequestFactory: # If there are parameters, add them if parsed.params: path += ";" + parsed.params - path = uri_to_iri(path).encode() + path = unquote_to_bytes(path) # Replace the behavior where non-ASCII values in the WSGI environ are # arbitrarily decoded with ISO-8859-1. # Refs comment in `get_bytes_from_wsgi()`. diff --git a/django/utils/encoding.py b/django/utils/encoding.py index 63f4193701..63f915b139 100644 --- a/django/utils/encoding.py +++ b/django/utils/encoding.py @@ -2,7 +2,7 @@ import codecs import datetime import locale from decimal import Decimal -from urllib.parse import quote, unquote_to_bytes +from urllib.parse import quote from django.utils import six from django.utils.functional import Promise @@ -151,20 +151,57 @@ def iri_to_uri(iri): return quote(iri, safe="/#%[]=:;$&()+,!?*@'~") +# List of byte values that uri_to_iri() decodes from percent encoding. 
+# First, the unreserved characters from RFC 3986 (digits are not included): +_ascii_ranges = [[45, 46, 95, 126], range(65, 91), range(97, 123)] +_hextobyte = { + (fmt % char).encode(): bytes((char,)) + for ascii_range in _ascii_ranges + for char in ascii_range + for fmt in ['%02x', '%02X'] +} +# And then everything from 128 up, because bytes ≥ 128 are part of multibyte +# Unicode characters. +_hexdig = '0123456789ABCDEFabcdef' +_hextobyte.update({ + (a + b).encode(): bytes.fromhex(a + b) + for a in _hexdig[8:] for b in _hexdig +}) + + def uri_to_iri(uri): """ Converts a Uniform Resource Identifier(URI) into an Internationalized Resource Identifier(IRI). - This is the algorithm from section 3.2 of RFC 3987. + This is the algorithm from section 3.2 of RFC 3987, excluding step 4. Takes an URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns - a string containing the encoded result (e.g. '/I \xe2\x99\xa5 Django/'). + a string containing the encoded result (e.g. '/I%20♥%20Django/'). """ if uri is None: return uri uri = force_bytes(uri) - iri = unquote_to_bytes(uri) + # Fast selective unquote: First, split on '%' and then starting with the + # second block, decode the first 2 bytes if they represent a hex code to + # decode. The rest of the block is the part after '%AB', not containing + # any '%'. Add that to the output without further processing. + bits = uri.split(b'%') + if len(bits) == 1: + iri = uri + else: + parts = [bits[0]] + append = parts.append + hextobyte = _hextobyte + for item in bits[1:]: + hex = item[:2] + if hex in hextobyte: + append(hextobyte[item[:2]]) + append(item[2:]) + else: + append(b'%') + append(item) + iri = b''.join(parts) return repercent_broken_unicode(iri).decode() diff --git a/docs/ref/unicode.txt b/docs/ref/unicode.txt index b0a888e2f8..c20886775e 100644 --- a/docs/ref/unicode.txt +++ b/docs/ref/unicode.txt @@ -195,19 +195,17 @@ result. 
Similarly, Django provides :func:`django.utils.encoding.uri_to_iri()` which implements the conversion from URI to IRI as per :rfc:`3987#section-3.2`. -It decodes all percent-encodings except those that don't represent a valid -UTF-8 sequence. An example to demonstrate:: >>> uri_to_iri('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93') '/♥♥/?utf8=✓' - >>> uri_to_iri('%A9helloworld') - '%A9helloworld' + >>> uri_to_iri('%A9hello%3Fworld') + '%A9hello%3Fworld' -In the first example, the UTF-8 characters and reserved characters are -unquoted. In the second, the percent-encoding remains unchanged because it -lies outside the valid UTF-8 range. +In the first example, the UTF-8 characters are unquoted. In the second, the +percent-encodings remain unchanged because they lie outside the valid UTF-8 +range or represent a reserved character. Both ``iri_to_uri()`` and ``uri_to_iri()`` functions are idempotent, which means the following is always true:: diff --git a/tests/utils_tests/test_encoding.py b/tests/utils_tests/test_encoding.py index e3da394e77..1f5d5891ac 100644 --- a/tests/utils_tests/test_encoding.py +++ b/tests/utils_tests/test_encoding.py @@ -93,9 +93,11 @@ class TestRFC3987IEncodingUtils(unittest.TestCase): def test_uri_to_iri(self): cases = [ # Valid UTF-8 sequences are decoded. - ('/%E2%99%A5%E2%99%A5/', '/♥♥/'), + ('/%e2%89%Ab%E2%99%a5%E2%89%aB/', '/≫♥≫/'), ('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'), - + ('/%41%5a%6B/', '/AZk/'), + # Reserved and non-URL valid ASCII chars are not decoded. + ('/%25%20%02%41%7b/', '/%25%20%02A%7b/'), # Broken UTF-8 sequences remain escaped. 
('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'), ('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'), @@ -112,11 +114,12 @@ class TestRFC3987IEncodingUtils(unittest.TestCase): def test_complementarity(self): cases = [ - ('/blog/for/J%C3%BCrgen%20M%C3%BCnster/', '/blog/for/J\xfcrgen M\xfcnster/'), + ('/blog/for/J%C3%BCrgen%20M%C3%BCnster/', '/blog/for/J\xfcrgen%20M\xfcnster/'), ('%&', '%&'), ('red&%E2%99%A5ros%#red', 'red&♥ros%#red'), ('/%E2%99%A5%E2%99%A5/', '/♥♥/'), ('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'), + ('/%25%20%02%7b/', '/%25%20%02%7b/'), ('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'), ('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'), ('/%E2%99%A5%E2%99%E2%99%A5/', '/♥%E2%99♥/'),