Fixed #26005 -- Fixed some percent decoding cases in uri_to_iri().

This commit is contained in:
Chronial 2017-02-07 14:55:44 +01:00 committed by Tim Graham
parent 500532c95d
commit 03281d8fe7
4 changed files with 55 additions and 17 deletions

View File

@ -6,7 +6,7 @@ import sys
from copy import copy
from importlib import import_module
from io import BytesIO
from urllib.parse import urljoin, urlparse, urlsplit
from urllib.parse import unquote_to_bytes, urljoin, urlparse, urlsplit
from django.conf import settings
from django.core.handlers.base import BaseHandler
@ -20,7 +20,7 @@ from django.template import TemplateDoesNotExist
from django.test import signals
from django.test.utils import ContextList
from django.urls import resolve
from django.utils.encoding import force_bytes, uri_to_iri
from django.utils.encoding import force_bytes
from django.utils.functional import SimpleLazyObject, curry
from django.utils.http import urlencode
from django.utils.itercompat import is_iterable
@ -320,7 +320,7 @@ class RequestFactory:
# If there are parameters, add them
if parsed.params:
path += ";" + parsed.params
path = uri_to_iri(path).encode()
path = unquote_to_bytes(path)
# Replace the behavior where non-ASCII values in the WSGI environ are
# arbitrarily decoded with ISO-8859-1.
# Refs comment in `get_bytes_from_wsgi()`.

View File

@ -2,7 +2,7 @@ import codecs
import datetime
import locale
from decimal import Decimal
from urllib.parse import quote, unquote_to_bytes
from urllib.parse import quote
from django.utils import six
from django.utils.functional import Promise
@ -151,20 +151,57 @@ def iri_to_uri(iri):
return quote(iri, safe="/#%[]=:;$&()+,!?*@'~")
# List of byte values that uri_to_iri() decodes from percent encoding.
# First, the unreserved characters from RFC 3986, section 2.3:
# "-", ".", "_", "~", DIGIT and ALPHA (upper and lower case).
_ascii_ranges = [
    [45, 46, 95, 126],
    range(48, 58),
    range(65, 91),
    range(97, 123),
]
# Maps a two-character hex code (as bytes, without the leading '%') to the
# single byte it stands for. Each code is registered in both its lowercase
# and its uppercase spelling.
_hextobyte = {
    (fmt % char).encode(): bytes((char,))
    for ascii_range in _ascii_ranges
    for char in ascii_range
    for fmt in ['%02x', '%02X']
}
# And then everything from 128 upwards, because bytes ≥ 128 are part of
# multibyte Unicode characters. _hexdig[8:] is '89ABCDEFabcdef', i.e. every
# spelling of a high nibble, so mixed-case codes such as 'aB' are covered too.
_hexdig = '0123456789ABCDEFabcdef'
_hextobyte.update({
    (a + b).encode(): bytes.fromhex(a + b)
    for a in _hexdig[8:] for b in _hexdig
})
def uri_to_iri(uri):
    """
    Convert a Uniform Resource Identifier (URI) into an Internationalized
    Resource Identifier (IRI).

    This is the algorithm from section 3.2 of RFC 3987, excluding step 4.

    Take a URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and return
    a string containing the encoded result (e.g. '/I%20♥%20Django/').
    Only percent-encodings whose hex code is a key of ``_hextobyte`` are
    decoded; every other '%' sequence is copied through unchanged. If ``uri``
    is None, return None.
    """
    if uri is None:
        return uri
    uri = force_bytes(uri)
    # Fast selective unquote: First, split on '%' and then starting with the
    # second block, decode the first 2 bytes if they represent a hex code to
    # decode. The rest of the block is the part after '%AB', not containing
    # any '%'. Add that to the output without further processing.
    bits = uri.split(b'%')
    if len(bits) == 1:
        # No '%' at all, so there is nothing to decode.
        iri = uri
    else:
        parts = [bits[0]]
        # Bind the hot-loop lookups to locals once.
        append = parts.append
        hextobyte = _hextobyte
        for item in bits[1:]:
            code = item[:2]
            if code in hextobyte:
                append(hextobyte[code])
                append(item[2:])
            else:
                # Not a code we decode: restore the '%' removed by split().
                append(b'%')
                append(item)
        iri = b''.join(parts)
    # Re-escape any decoded bytes that do not form valid UTF-8 before
    # decoding the result to str.
    return repercent_broken_unicode(iri).decode()

View File

@ -195,19 +195,17 @@ result.
Similarly, Django provides :func:`django.utils.encoding.uri_to_iri()` which
implements the conversion from URI to IRI as per :rfc:`3987#section-3.2`.
It decodes all percent-encodings except those that don't represent a valid
UTF-8 sequence.
An example to demonstrate::
>>> uri_to_iri('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93')
'/♥♥/?utf8=✓'
>>> uri_to_iri('%A9helloworld')
'%A9helloworld'
>>> uri_to_iri('%A9hello%3Fworld')
'%A9hello%3Fworld'
In the first example, the UTF-8 characters and reserved characters are
unquoted. In the second, the percent-encoding remains unchanged because it
lies outside the valid UTF-8 range.
In the first example, the UTF-8 characters are unquoted. In the second, the
percent-encodings remain unchanged because they lie outside the valid UTF-8
range or represent a reserved character.
Both ``iri_to_uri()`` and ``uri_to_iri()`` functions are idempotent, which means the
following is always true::

View File

@ -93,9 +93,11 @@ class TestRFC3987IEncodingUtils(unittest.TestCase):
def test_uri_to_iri(self):
cases = [
# Valid UTF-8 sequences are decoded.
('/%E2%99%A5%E2%99%A5/', '/♥♥/'),
('/%e2%89%Ab%E2%99%a5%E2%89%aB/', '/≫♥≫/'),
('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'),
('/%41%5a%6B/', '/AZk/'),
# Reserved and non-URL-valid ASCII chars are not decoded.
('/%25%20%02%41%7b/', '/%25%20%02A%7b/'),
# Broken UTF-8 sequences remain escaped.
('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'),
('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'),
@ -112,11 +114,12 @@ class TestRFC3987IEncodingUtils(unittest.TestCase):
def test_complementarity(self):
cases = [
('/blog/for/J%C3%BCrgen%20M%C3%BCnster/', '/blog/for/J\xfcrgen M\xfcnster/'),
('/blog/for/J%C3%BCrgen%20M%C3%BCnster/', '/blog/for/J\xfcrgen%20M\xfcnster/'),
('%&', '%&'),
('red&%E2%99%A5ros%#red', 'red&♥ros%#red'),
('/%E2%99%A5%E2%99%A5/', '/♥♥/'),
('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'),
('/%25%20%02%7b/', '/%25%20%02%7b/'),
('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'),
('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'),
('/%E2%99%A5%E2%99%E2%99%A5/', '/♥%E2%99♥/'),