Fixed 35467 -- Replaced urlparse with urlsplit where appropriate.

This work should not generate any change of functionality, and
`urlsplit` is approximately 6x faster.

Most use cases of `urlparse` didn't touch the path, so they can be
converted to `urlsplit` without any issue. Most of those which do use
`.path`, simply parse the URL, mutate the querystring, then put them
back together, which is also fine (so long as urlunsplit is used).
This commit is contained in:
Jake Howard 2024-05-29 14:48:27 +01:00 committed by GitHub
parent 02dab94c7b
commit ff308a0604
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
17 changed files with 61 additions and 72 deletions

View File

@ -6,7 +6,7 @@ import warnings
from functools import partial, update_wrapper
from urllib.parse import parse_qsl
from urllib.parse import quote as urlquote
from urllib.parse import urlparse
from urllib.parse import urlsplit
from django import forms
from django.conf import settings
@ -1384,7 +1384,7 @@ class ModelAdmin(BaseModelAdmin):
)
def _get_preserved_qsl(self, request, preserved_filters):
query_string = urlparse(request.build_absolute_uri()).query
query_string = urlsplit(request.build_absolute_uri()).query
return parse_qsl(query_string.replace(preserved_filters, ""))
def response_add(self, request, obj, post_url_continue=None):

View File

@ -1,4 +1,4 @@
from urllib.parse import parse_qsl, unquote, urlparse, urlunparse
from urllib.parse import parse_qsl, unquote, urlsplit, urlunsplit
from django import template
from django.contrib.admin.utils import quote
@ -24,8 +24,8 @@ def add_preserved_filters(context, url, popup=False, to_field=None):
preserved_filters = context.get("preserved_filters")
preserved_qsl = context.get("preserved_qsl")
parsed_url = list(urlparse(url))
parsed_qs = dict(parse_qsl(parsed_url[4]))
parsed_url = list(urlsplit(url))
parsed_qs = dict(parse_qsl(parsed_url[3]))
merged_qs = {}
if preserved_qsl:
@ -66,5 +66,5 @@ def add_preserved_filters(context, url, popup=False, to_field=None):
merged_qs.update(parsed_qs)
parsed_url[4] = urlencode(merged_qs)
return urlunparse(parsed_url)
parsed_url[3] = urlencode(merged_qs)
return urlunsplit(parsed_url)

View File

@ -1,6 +1,6 @@
import asyncio
from functools import wraps
from urllib.parse import urlparse
from urllib.parse import urlsplit
from asgiref.sync import async_to_sync, sync_to_async
@ -25,8 +25,8 @@ def user_passes_test(
resolved_login_url = resolve_url(login_url or settings.LOGIN_URL)
# If the login url is the same scheme and net location then just
# use the path as the "next" url.
login_scheme, login_netloc = urlparse(resolved_login_url)[:2]
current_scheme, current_netloc = urlparse(path)[:2]
login_scheme, login_netloc = urlsplit(resolved_login_url)[:2]
current_scheme, current_netloc = urlsplit(path)[:2]
if (not login_scheme or login_scheme == current_scheme) and (
not login_netloc or login_netloc == current_netloc
):

View File

@ -1,5 +1,5 @@
from functools import partial
from urllib.parse import urlparse
from urllib.parse import urlsplit
from django.conf import settings
from django.contrib import auth
@ -74,8 +74,8 @@ class LoginRequiredMiddleware(MiddlewareMixin):
resolved_login_url = resolve_url(self.get_login_url(view_func))
# If the login url is the same scheme and net location then use the
# path as the "next" url.
login_scheme, login_netloc = urlparse(resolved_login_url)[:2]
current_scheme, current_netloc = urlparse(path)[:2]
login_scheme, login_netloc = urlsplit(resolved_login_url)[:2]
current_scheme, current_netloc = urlsplit(path)[:2]
if (not login_scheme or login_scheme == current_scheme) and (
not login_netloc or login_netloc == current_netloc
):

View File

@ -1,4 +1,4 @@
from urllib.parse import urlparse
from urllib.parse import urlsplit
from django.conf import settings
from django.contrib.auth import REDIRECT_FIELD_NAME
@ -51,8 +51,8 @@ class AccessMixin:
resolved_login_url = resolve_url(self.get_login_url())
# If the login url is the same scheme and net location then use the
# path as the "next" url.
login_scheme, login_netloc = urlparse(resolved_login_url)[:2]
current_scheme, current_netloc = urlparse(path)[:2]
login_scheme, login_netloc = urlsplit(resolved_login_url)[:2]
current_scheme, current_netloc = urlsplit(path)[:2]
if (not login_scheme or login_scheme == current_scheme) and (
not login_netloc or login_netloc == current_netloc
):

View File

@ -1,4 +1,4 @@
from urllib.parse import urlparse, urlunparse
from urllib.parse import urlsplit, urlunsplit
from django.conf import settings
@ -183,13 +183,13 @@ def redirect_to_login(next, login_url=None, redirect_field_name=REDIRECT_FIELD_N
"""
resolved_url = resolve_url(login_url or settings.LOGIN_URL)
login_url_parts = list(urlparse(resolved_url))
login_url_parts = list(urlsplit(resolved_url))
if redirect_field_name:
querystring = QueryDict(login_url_parts[4], mutable=True)
querystring = QueryDict(login_url_parts[3], mutable=True)
querystring[redirect_field_name] = next
login_url_parts[4] = querystring.urlencode(safe="/")
login_url_parts[3] = querystring.urlencode(safe="/")
return HttpResponseRedirect(urlunparse(login_url_parts))
return HttpResponseRedirect(urlunsplit(login_url_parts))
# Class-based password reset views

View File

@ -36,13 +36,13 @@ class StaticFilesHandlerMixin:
* the host is provided as part of the base_url
* the request's path isn't under the media path (or equal)
"""
return path.startswith(self.base_url[2]) and not self.base_url[1]
return path.startswith(self.base_url.path) and not self.base_url.netloc
def file_path(self, url):
"""
Return the relative path to the media file on disk for the given URL.
"""
relative_url = url.removeprefix(self.base_url[2])
relative_url = url.removeprefix(self.base_url.path)
return url2pathname(relative_url)
def serve(self, request):

View File

@ -792,13 +792,13 @@ class URLField(CharField):
def to_python(self, value):
def split_url(url):
"""
Return a list of url parts via urlparse.urlsplit(), or raise
Return a list of url parts via urlsplit(), or raise
ValidationError for some malformed URLs.
"""
try:
return list(urlsplit(url))
except ValueError:
# urlparse.urlsplit can raise a ValueError with some
# urlsplit can raise a ValueError with some
# misformatted URLs.
raise ValidationError(self.error_messages["invalid"], code="invalid")

View File

@ -9,7 +9,7 @@ import time
import warnings
from email.header import Header
from http.client import responses
from urllib.parse import urlparse
from urllib.parse import urlsplit
from asgiref.sync import async_to_sync, sync_to_async
@ -616,7 +616,7 @@ class HttpResponseRedirectBase(HttpResponse):
def __init__(self, redirect_to, *args, **kwargs):
super().__init__(*args, **kwargs)
self["Location"] = iri_to_uri(redirect_to)
parsed = urlparse(str(redirect_to))
parsed = urlsplit(str(redirect_to))
if parsed.scheme and parsed.scheme not in self.allowed_schemes:
raise DisallowedRedirect(
"Unsafe redirect to URL with protocol '%s'" % parsed.scheme

View File

@ -1,5 +1,5 @@
import re
from urllib.parse import urlparse
from urllib.parse import urlsplit
from django.conf import settings
from django.core.exceptions import PermissionDenied
@ -171,7 +171,7 @@ class BrokenLinkEmailsMiddleware(MiddlewareMixin):
# The referer is equal to the current URL, ignoring the scheme (assumed
# to be a poorly implemented bot).
parsed_referer = urlparse(referer)
parsed_referer = urlsplit(referer)
if parsed_referer.netloc in ["", domain] and parsed_referer.path == uri:
return True

View File

@ -8,7 +8,7 @@ against request forgeries from other sites.
import logging
import string
from collections import defaultdict
from urllib.parse import urlparse
from urllib.parse import urlsplit
from django.conf import settings
from django.core.exceptions import DisallowedHost, ImproperlyConfigured
@ -174,7 +174,7 @@ class CsrfViewMiddleware(MiddlewareMixin):
@cached_property
def csrf_trusted_origins_hosts(self):
return [
urlparse(origin).netloc.lstrip("*")
urlsplit(origin).netloc.lstrip("*")
for origin in settings.CSRF_TRUSTED_ORIGINS
]
@ -190,7 +190,7 @@ class CsrfViewMiddleware(MiddlewareMixin):
"""
allowed_origin_subdomains = defaultdict(list)
for parsed in (
urlparse(origin)
urlsplit(origin)
for origin in settings.CSRF_TRUSTED_ORIGINS
if "*" in origin
):
@ -284,7 +284,7 @@ class CsrfViewMiddleware(MiddlewareMixin):
if request_origin in self.allowed_origins_exact:
return True
try:
parsed_origin = urlparse(request_origin)
parsed_origin = urlsplit(request_origin)
except ValueError:
return False
request_scheme = parsed_origin.scheme
@ -300,7 +300,7 @@ class CsrfViewMiddleware(MiddlewareMixin):
raise RejectRequest(REASON_NO_REFERER)
try:
referer = urlparse(referer)
referer = urlsplit(referer)
except ValueError:
raise RejectRequest(REASON_MALFORMED_REFERER)

View File

@ -8,7 +8,7 @@ from functools import partial
from http import HTTPStatus
from importlib import import_module
from io import BytesIO, IOBase
from urllib.parse import unquote_to_bytes, urljoin, urlparse, urlsplit
from urllib.parse import unquote_to_bytes, urljoin, urlsplit
from asgiref.sync import sync_to_async
@ -458,11 +458,7 @@ class RequestFactory:
return json.dumps(data, cls=self.json_encoder) if should_encode else data
def _get_path(self, parsed):
path = parsed.path
# If there are parameters, add them
if parsed.params:
path += ";" + parsed.params
path = unquote_to_bytes(path)
path = unquote_to_bytes(parsed.path)
# Replace the behavior where non-ASCII values in the WSGI environ are
# arbitrarily decoded with ISO-8859-1.
# Refs comment in `get_bytes_from_wsgi()`.
@ -647,7 +643,7 @@ class RequestFactory:
**extra,
):
"""Construct an arbitrary HTTP request."""
parsed = urlparse(str(path)) # path can be lazy
parsed = urlsplit(str(path)) # path can be lazy
data = force_bytes(data, settings.DEFAULT_CHARSET)
r = {
"PATH_INFO": self._get_path(parsed),
@ -671,8 +667,7 @@ class RequestFactory:
# If QUERY_STRING is absent or empty, we want to extract it from the URL.
if not r.get("QUERY_STRING"):
# WSGI requires latin-1 encoded strings. See get_path_info().
query_string = parsed[4].encode().decode("iso-8859-1")
r["QUERY_STRING"] = query_string
r["QUERY_STRING"] = parsed.query.encode().decode("iso-8859-1")
return self.request(**r)
@ -748,7 +743,7 @@ class AsyncRequestFactory(RequestFactory):
**extra,
):
"""Construct an arbitrary HTTP request."""
parsed = urlparse(str(path)) # path can be lazy.
parsed = urlsplit(str(path)) # path can be lazy.
data = force_bytes(data, settings.DEFAULT_CHARSET)
s = {
"method": method,
@ -772,7 +767,7 @@ class AsyncRequestFactory(RequestFactory):
else:
# If QUERY_STRING is absent or empty, we want to extract it from
# the URL.
s["query_string"] = parsed[4]
s["query_string"] = parsed.query
if headers:
extra.update(HttpHeaders.to_asgi_names(headers))
s["headers"] += [

View File

@ -21,7 +21,7 @@ from urllib.parse import (
urljoin,
urlparse,
urlsplit,
urlunparse,
urlunsplit,
)
from urllib.request import url2pathname
@ -541,11 +541,9 @@ class SimpleTestCase(unittest.TestCase):
def normalize(url):
"""Sort the URL's query string parameters."""
url = str(url) # Coerce reverse_lazy() URLs.
scheme, netloc, path, params, query, fragment = urlparse(url)
scheme, netloc, path, query, fragment = urlsplit(url)
query_parts = sorted(parse_qsl(query))
return urlunparse(
(scheme, netloc, path, params, urlencode(query_parts), fragment)
)
return urlunsplit((scheme, netloc, path, urlencode(query_parts), fragment))
if msg_prefix:
msg_prefix += ": "
@ -1637,11 +1635,11 @@ class FSFilesHandler(WSGIHandler):
* the host is provided as part of the base_url
* the request's path isn't under the media path (or equal)
"""
return path.startswith(self.base_url[2]) and not self.base_url[1]
return path.startswith(self.base_url.path) and not self.base_url.netloc
def file_path(self, url):
"""Return the relative path to the file on disk for the given URL."""
relative_url = url.removeprefix(self.base_url[2])
relative_url = url.removeprefix(self.base_url.path)
return url2pathname(relative_url)
def get_response(self, request):

View File

@ -6,7 +6,7 @@ from datetime import datetime, timezone
from email.utils import formatdate
from urllib.parse import quote, unquote
from urllib.parse import urlencode as original_urlencode
from urllib.parse import urlparse
from urllib.parse import urlsplit
from django.utils.datastructures import MultiValueDict
from django.utils.regex_helper import _lazy_re_compile
@ -271,11 +271,11 @@ def url_has_allowed_host_and_scheme(url, allowed_hosts, require_https=False):
def _url_has_allowed_host_and_scheme(url, allowed_hosts, require_https=False):
# Chrome considers any URL with more than two slashes to be absolute, but
# urlparse is not so flexible. Treat any url with three slashes as unsafe.
# urlsplit is not so flexible. Treat any url with three slashes as unsafe.
if url.startswith("///"):
return False
try:
url_info = urlparse(url)
url_info = urlsplit(url)
except ValueError: # e.g. invalid IPv6 addresses
return False
# Forbid URLs like http:///example.com - with a scheme, but without a hostname.

View File

@ -203,7 +203,7 @@ A :class:`ResolverMatch` object can also be assigned to a triple::
One possible use of :func:`~django.urls.resolve` would be to test whether a
view would raise a ``Http404`` error before redirecting to it::
from urllib.parse import urlparse
from urllib.parse import urlsplit
from django.urls import resolve
from django.http import Http404, HttpResponseRedirect
@ -215,7 +215,7 @@ view would raise a ``Http404`` error before redirecting to it::
# modify the request and response as required, e.g. change locale
# and set corresponding locale cookie
view, args, kwargs = resolve(urlparse(next)[2])
view, args, kwargs = resolve(urlsplit(next).path)
kwargs["request"] = request
try:
view(*args, **kwargs)

View File

@ -4,7 +4,7 @@ import re
import unittest
import zoneinfo
from unittest import mock
from urllib.parse import parse_qsl, urljoin, urlparse
from urllib.parse import parse_qsl, urljoin, urlsplit
from django import forms
from django.contrib import admin
@ -357,7 +357,7 @@ class AdminViewBasicTest(AdminViewBasicTestCase):
**save_option,
},
)
parsed_url = urlparse(response.url)
parsed_url = urlsplit(response.url)
self.assertEqual(parsed_url.query, qsl)
def test_change_query_string_persists(self):
@ -386,7 +386,7 @@ class AdminViewBasicTest(AdminViewBasicTestCase):
**save_option,
},
)
parsed_url = urlparse(response.url)
parsed_url = urlsplit(response.url)
self.assertEqual(parsed_url.query, qsl)
def test_basic_edit_GET(self):
@ -8032,11 +8032,11 @@ class AdminKeepChangeListFiltersTests(TestCase):
Assert that two URLs are equal despite the ordering
of their querystring. Refs #22360.
"""
parsed_url1 = urlparse(url1)
parsed_url1 = urlsplit(url1)
path1 = parsed_url1.path
parsed_qs1 = dict(parse_qsl(parsed_url1.query))
parsed_url2 = urlparse(url2)
parsed_url2 = urlsplit(url2)
path2 = parsed_url2.path
parsed_qs2 = dict(parse_qsl(parsed_url2.query))

View File

@ -709,25 +709,21 @@ class CsrfViewMiddlewareTestMixin(CsrfFunctionTestMixin):
response = mw.process_view(req, post_form_view, (), {})
self.assertContains(response, malformed_referer_msg, status_code=403)
# missing scheme
# >>> urlparse('//example.com/')
# ParseResult(
# scheme='', netloc='example.com', path='/', params='', query='', fragment='',
# )
# >>> urlsplit('//example.com/')
# SplitResult(scheme='', netloc='example.com', path='/', query='', fragment='')
req.META["HTTP_REFERER"] = "//example.com/"
self._check_referer_rejects(mw, req)
response = mw.process_view(req, post_form_view, (), {})
self.assertContains(response, malformed_referer_msg, status_code=403)
# missing netloc
# >>> urlparse('https://')
# ParseResult(
# scheme='https', netloc='', path='', params='', query='', fragment='',
# )
# >>> urlsplit('https://')
# SplitResult(scheme='https', netloc='', path='', query='', fragment='')
req.META["HTTP_REFERER"] = "https://"
self._check_referer_rejects(mw, req)
response = mw.process_view(req, post_form_view, (), {})
self.assertContains(response, malformed_referer_msg, status_code=403)
# Invalid URL
# >>> urlparse('https://[')
# >>> urlsplit('https://[')
# ValueError: Invalid IPv6 URL
req.META["HTTP_REFERER"] = "https://["
self._check_referer_rejects(mw, req)
@ -979,7 +975,7 @@ class CsrfViewMiddlewareTestMixin(CsrfFunctionTestMixin):
@override_settings(ALLOWED_HOSTS=["www.example.com"])
def test_bad_origin_cannot_be_parsed(self):
"""
A POST request with an origin that can't be parsed by urlparse() is
A POST request with an origin that can't be parsed by urlsplit() is
rejected.
"""
req = self._get_POST_request_with_token()