Fixed #26005 -- Fixed some percent decoding cases in uri_to_iri().
This commit is contained in:
parent
500532c95d
commit
03281d8fe7
|
@ -6,7 +6,7 @@ import sys
|
|||
from copy import copy
|
||||
from importlib import import_module
|
||||
from io import BytesIO
|
||||
from urllib.parse import urljoin, urlparse, urlsplit
|
||||
from urllib.parse import unquote_to_bytes, urljoin, urlparse, urlsplit
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.handlers.base import BaseHandler
|
||||
|
@ -20,7 +20,7 @@ from django.template import TemplateDoesNotExist
|
|||
from django.test import signals
|
||||
from django.test.utils import ContextList
|
||||
from django.urls import resolve
|
||||
from django.utils.encoding import force_bytes, uri_to_iri
|
||||
from django.utils.encoding import force_bytes
|
||||
from django.utils.functional import SimpleLazyObject, curry
|
||||
from django.utils.http import urlencode
|
||||
from django.utils.itercompat import is_iterable
|
||||
|
@ -320,7 +320,7 @@ class RequestFactory:
|
|||
# If there are parameters, add them
|
||||
if parsed.params:
|
||||
path += ";" + parsed.params
|
||||
path = uri_to_iri(path).encode()
|
||||
path = unquote_to_bytes(path)
|
||||
# Replace the behavior where non-ASCII values in the WSGI environ are
|
||||
# arbitrarily decoded with ISO-8859-1.
|
||||
# Refs comment in `get_bytes_from_wsgi()`.
|
||||
|
|
|
@ -2,7 +2,7 @@ import codecs
|
|||
import datetime
|
||||
import locale
|
||||
from decimal import Decimal
|
||||
from urllib.parse import quote, unquote_to_bytes
|
||||
from urllib.parse import quote
|
||||
|
||||
from django.utils import six
|
||||
from django.utils.functional import Promise
|
||||
|
@ -151,20 +151,57 @@ def iri_to_uri(iri):
|
|||
return quote(iri, safe="/#%[]=:;$&()+,!?*@'~")
|
||||
|
||||
|
||||
# List of byte values that uri_to_iri() decodes from percent encoding.
|
||||
# First, the unreserved characters from RFC 3986:
|
||||
_ascii_ranges = [[45, 46, 95, 126], range(65, 91), range(97, 123)]
|
||||
_hextobyte = {
|
||||
(fmt % char).encode(): bytes((char,))
|
||||
for ascii_range in _ascii_ranges
|
||||
for char in ascii_range
|
||||
for fmt in ['%02x', '%02X']
|
||||
}
|
||||
# And then everything from 128 upwards, because bytes ≥ 128 are part of multibyte
|
||||
# unicode characters.
|
||||
_hexdig = '0123456789ABCDEFabcdef'
|
||||
_hextobyte.update({
|
||||
(a + b).encode(): bytes.fromhex(a + b)
|
||||
for a in _hexdig[8:] for b in _hexdig
|
||||
})
|
||||
|
||||
|
||||
def uri_to_iri(uri):
|
||||
"""
|
||||
Converts a Uniform Resource Identifier (URI) into an Internationalized
|
||||
Resource Identifier (IRI).
|
||||
|
||||
This is the algorithm from section 3.2 of RFC 3987.
|
||||
This is the algorithm from section 3.2 of RFC 3987, excluding step 4.
|
||||
|
||||
Takes a URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns
|
||||
a string containing the encoded result (e.g. '/I \xe2\x99\xa5 Django/').
|
||||
a string containing the encoded result (e.g. '/I%20♥%20Django/').
|
||||
"""
|
||||
if uri is None:
|
||||
return uri
|
||||
uri = force_bytes(uri)
|
||||
iri = unquote_to_bytes(uri)
|
||||
# Fast selective unquote: First, split on '%' and then starting with the
|
||||
# second block, decode the first 2 bytes if they represent a hex code to
|
||||
# decode. The rest of the block is the part after '%AB', not containing
|
||||
# any '%'. Add that to the output without further processing.
|
||||
bits = uri.split(b'%')
|
||||
if len(bits) == 1:
|
||||
iri = uri
|
||||
else:
|
||||
parts = [bits[0]]
|
||||
append = parts.append
|
||||
hextobyte = _hextobyte
|
||||
for item in bits[1:]:
|
||||
hex = item[:2]
|
||||
if hex in hextobyte:
|
||||
append(hextobyte[item[:2]])
|
||||
append(item[2:])
|
||||
else:
|
||||
append(b'%')
|
||||
append(item)
|
||||
iri = b''.join(parts)
|
||||
return repercent_broken_unicode(iri).decode()
|
||||
|
||||
|
||||
|
|
|
@ -195,19 +195,17 @@ result.
|
|||
|
||||
Similarly, Django provides :func:`django.utils.encoding.uri_to_iri()` which
|
||||
implements the conversion from URI to IRI as per :rfc:`3987#section-3.2`.
|
||||
It decodes all percent-encodings except those that don't represent a valid
|
||||
UTF-8 sequence.
|
||||
|
||||
An example to demonstrate::
|
||||
|
||||
>>> uri_to_iri('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93')
|
||||
'/♥♥/?utf8=✓'
|
||||
>>> uri_to_iri('%A9helloworld')
|
||||
'%A9helloworld'
|
||||
>>> uri_to_iri('%A9hello%3Fworld')
|
||||
'%A9hello%3Fworld'
|
||||
|
||||
In the first example, the UTF-8 characters and reserved characters are
|
||||
unquoted. In the second, the percent-encoding remains unchanged because it
|
||||
lies outside the valid UTF-8 range.
|
||||
In the first example, the UTF-8 characters are unquoted. In the second, the
|
||||
percent-encodings remain unchanged because they lie outside the valid UTF-8
|
||||
range or represent a reserved character.
|
||||
|
||||
Both ``iri_to_uri()`` and ``uri_to_iri()`` functions are idempotent, which means the
|
||||
following is always true::
|
||||
|
|
|
@ -93,9 +93,11 @@ class TestRFC3987IEncodingUtils(unittest.TestCase):
|
|||
def test_uri_to_iri(self):
|
||||
cases = [
|
||||
# Valid UTF-8 sequences are decoded.
|
||||
('/%E2%99%A5%E2%99%A5/', '/♥♥/'),
|
||||
('/%e2%89%Ab%E2%99%a5%E2%89%aB/', '/≫♥≫/'),
|
||||
('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'),
|
||||
|
||||
('/%41%5a%6B/', '/AZk/'),
|
||||
# Reserved and non-URL valid ASCII chars are not decoded.
|
||||
('/%25%20%02%41%7b/', '/%25%20%02A%7b/'),
|
||||
# Broken UTF-8 sequences remain escaped.
|
||||
('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'),
|
||||
('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'),
|
||||
|
@ -112,11 +114,12 @@ class TestRFC3987IEncodingUtils(unittest.TestCase):
|
|||
|
||||
def test_complementarity(self):
|
||||
cases = [
|
||||
('/blog/for/J%C3%BCrgen%20M%C3%BCnster/', '/blog/for/J\xfcrgen M\xfcnster/'),
|
||||
('/blog/for/J%C3%BCrgen%20M%C3%BCnster/', '/blog/for/J\xfcrgen%20M\xfcnster/'),
|
||||
('%&', '%&'),
|
||||
('red&%E2%99%A5ros%#red', 'red&♥ros%#red'),
|
||||
('/%E2%99%A5%E2%99%A5/', '/♥♥/'),
|
||||
('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'),
|
||||
('/%25%20%02%7b/', '/%25%20%02%7b/'),
|
||||
('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'),
|
||||
('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'),
|
||||
('/%E2%99%A5%E2%99%E2%99%A5/', '/♥%E2%99♥/'),
|
||||
|
|
Loading…
Reference in New Issue