Fixed 35467 -- Replaced urlparse with urlsplit where appropriate.

This work should not generate any change of functionality, and `urlsplit` is approximately 6x faster. Most use cases of `urlparse` didn't touch the path, so they can be converted to `urlsplit` without any issue. Most of those which do use `.path`, simply parse the URL, mutate the querystring, then put them back together, which is also fine (so long as urlunsplit is used).
2024-05-29 14:48:27 +01:00 · 2024-05-29 14:48:27 +01:00 · ff308a0604
parent 02dab94c7b
commit ff308a0604
17 changed files with 61 additions and 72 deletions
--- a/django/contrib/admin/options.py
+++ b/django/contrib/admin/options.py
@ -6,7 +6,7 @@ import warnings
 from functools import partial, update_wrapper
 from urllib.parse import parse_qsl
 from urllib.parse import quote as urlquote
-from urllib.parse import urlparse
+from urllib.parse import urlsplit

 from django import forms
 from django.conf import settings
@ -1384,7 +1384,7 @@ class ModelAdmin(BaseModelAdmin):
        )

    def _get_preserved_qsl(self, request, preserved_filters):
-        query_string = urlparse(request.build_absolute_uri()).query
+        query_string = urlsplit(request.build_absolute_uri()).query
        return parse_qsl(query_string.replace(preserved_filters, ""))

    def response_add(self, request, obj, post_url_continue=None):
--- a/django/contrib/admin/templatetags/admin_urls.py
+++ b/django/contrib/admin/templatetags/admin_urls.py
@ -1,4 +1,4 @@
-from urllib.parse import parse_qsl, unquote, urlparse, urlunparse
+from urllib.parse import parse_qsl, unquote, urlsplit, urlunsplit

 from django import template
 from django.contrib.admin.utils import quote
@ -24,8 +24,8 @@ def add_preserved_filters(context, url, popup=False, to_field=None):
    preserved_filters = context.get("preserved_filters")
    preserved_qsl = context.get("preserved_qsl")

-    parsed_url = list(urlparse(url))
-    parsed_qs = dict(parse_qsl(parsed_url[4]))
+    parsed_url = list(urlsplit(url))
+    parsed_qs = dict(parse_qsl(parsed_url[3]))
    merged_qs = {}

    if preserved_qsl:
@ -66,5 +66,5 @@ def add_preserved_filters(context, url, popup=False, to_field=None):

    merged_qs.update(parsed_qs)

-    parsed_url[4] = urlencode(merged_qs)
-    return urlunparse(parsed_url)
+    parsed_url[3] = urlencode(merged_qs)
+    return urlunsplit(parsed_url)
--- a/django/contrib/auth/decorators.py
+++ b/django/contrib/auth/decorators.py
@ -1,6 +1,6 @@
 import asyncio
 from functools import wraps
-from urllib.parse import urlparse
+from urllib.parse import urlsplit

 from asgiref.sync import async_to_sync, sync_to_async

@ -25,8 +25,8 @@ def user_passes_test(
            resolved_login_url = resolve_url(login_url or settings.LOGIN_URL)
            # If the login url is the same scheme and net location then just
            # use the path as the "next" url.
-            login_scheme, login_netloc = urlparse(resolved_login_url)[:2]
-            current_scheme, current_netloc = urlparse(path)[:2]
+            login_scheme, login_netloc = urlsplit(resolved_login_url)[:2]
+            current_scheme, current_netloc = urlsplit(path)[:2]
            if (not login_scheme or login_scheme == current_scheme) and (
                not login_netloc or login_netloc == current_netloc
            ):
--- a/django/contrib/auth/middleware.py
+++ b/django/contrib/auth/middleware.py
@ -1,5 +1,5 @@
 from functools import partial
-from urllib.parse import urlparse
+from urllib.parse import urlsplit

 from django.conf import settings
 from django.contrib import auth
@ -74,8 +74,8 @@ class LoginRequiredMiddleware(MiddlewareMixin):
        resolved_login_url = resolve_url(self.get_login_url(view_func))
        # If the login url is the same scheme and net location then use the
        # path as the "next" url.
-        login_scheme, login_netloc = urlparse(resolved_login_url)[:2]
-        current_scheme, current_netloc = urlparse(path)[:2]
+        login_scheme, login_netloc = urlsplit(resolved_login_url)[:2]
+        current_scheme, current_netloc = urlsplit(path)[:2]
        if (not login_scheme or login_scheme == current_scheme) and (
            not login_netloc or login_netloc == current_netloc
        ):
--- a/django/contrib/auth/mixins.py
+++ b/django/contrib/auth/mixins.py
@ -1,4 +1,4 @@
-from urllib.parse import urlparse
+from urllib.parse import urlsplit

 from django.conf import settings
 from django.contrib.auth import REDIRECT_FIELD_NAME
@ -51,8 +51,8 @@ class AccessMixin:
        resolved_login_url = resolve_url(self.get_login_url())
        # If the login url is the same scheme and net location then use the
        # path as the "next" url.
-        login_scheme, login_netloc = urlparse(resolved_login_url)[:2]
-        current_scheme, current_netloc = urlparse(path)[:2]
+        login_scheme, login_netloc = urlsplit(resolved_login_url)[:2]
+        current_scheme, current_netloc = urlsplit(path)[:2]
        if (not login_scheme or login_scheme == current_scheme) and (
            not login_netloc or login_netloc == current_netloc
        ):
--- a/django/contrib/auth/views.py
+++ b/django/contrib/auth/views.py
@ -1,4 +1,4 @@
-from urllib.parse import urlparse, urlunparse
+from urllib.parse import urlsplit, urlunsplit

 from django.conf import settings

@ -183,13 +183,13 @@ def redirect_to_login(next, login_url=None, redirect_field_name=REDIRECT_FIELD_N
    """
    resolved_url = resolve_url(login_url or settings.LOGIN_URL)

-    login_url_parts = list(urlparse(resolved_url))
+    login_url_parts = list(urlsplit(resolved_url))
    if redirect_field_name:
-        querystring = QueryDict(login_url_parts[4], mutable=True)
+        querystring = QueryDict(login_url_parts[3], mutable=True)
        querystring[redirect_field_name] = next
-        login_url_parts[4] = querystring.urlencode(safe="/")
+        login_url_parts[3] = querystring.urlencode(safe="/")

-    return HttpResponseRedirect(urlunparse(login_url_parts))
+    return HttpResponseRedirect(urlunsplit(login_url_parts))


 # Class-based password reset views
--- a/django/contrib/staticfiles/handlers.py
+++ b/django/contrib/staticfiles/handlers.py
@ -36,13 +36,13 @@ class StaticFilesHandlerMixin:
        * the host is provided as part of the base_url
        * the request's path isn't under the media path (or equal)
        """
-        return path.startswith(self.base_url[2]) and not self.base_url[1]
+        return path.startswith(self.base_url.path) and not self.base_url.netloc

    def file_path(self, url):
        """
        Return the relative path to the media file on disk for the given URL.
        """
-        relative_url = url.removeprefix(self.base_url[2])
+        relative_url = url.removeprefix(self.base_url.path)
        return url2pathname(relative_url)

    def serve(self, request):
--- a/django/forms/fields.py
+++ b/django/forms/fields.py
@ -792,13 +792,13 @@ class URLField(CharField):
    def to_python(self, value):
        def split_url(url):
            """
-            Return a list of url parts via urlparse.urlsplit(), or raise
+            Return a list of url parts via urlsplit(), or raise
            ValidationError for some malformed URLs.
            """
            try:
                return list(urlsplit(url))
            except ValueError:
-                # urlparse.urlsplit can raise a ValueError with some
+                # urlsplit can raise a ValueError with some
                # misformatted URLs.
                raise ValidationError(self.error_messages["invalid"], code="invalid")

--- a/django/http/response.py
+++ b/django/http/response.py
@ -9,7 +9,7 @@ import time
 import warnings
 from email.header import Header
 from http.client import responses
-from urllib.parse import urlparse
+from urllib.parse import urlsplit

 from asgiref.sync import async_to_sync, sync_to_async

@ -616,7 +616,7 @@ class HttpResponseRedirectBase(HttpResponse):
    def __init__(self, redirect_to, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self["Location"] = iri_to_uri(redirect_to)
-        parsed = urlparse(str(redirect_to))
+        parsed = urlsplit(str(redirect_to))
        if parsed.scheme and parsed.scheme not in self.allowed_schemes:
            raise DisallowedRedirect(
                "Unsafe redirect to URL with protocol '%s'" % parsed.scheme
--- a/django/middleware/common.py
+++ b/django/middleware/common.py
@ -1,5 +1,5 @@
 import re
-from urllib.parse import urlparse
+from urllib.parse import urlsplit

 from django.conf import settings
 from django.core.exceptions import PermissionDenied
@ -171,7 +171,7 @@ class BrokenLinkEmailsMiddleware(MiddlewareMixin):

        # The referer is equal to the current URL, ignoring the scheme (assumed
        # to be a poorly implemented bot).
-        parsed_referer = urlparse(referer)
+        parsed_referer = urlsplit(referer)
        if parsed_referer.netloc in ["", domain] and parsed_referer.path == uri:
            return True

--- a/django/middleware/csrf.py
+++ b/django/middleware/csrf.py
@ -8,7 +8,7 @@ against request forgeries from other sites.
 import logging
 import string
 from collections import defaultdict
-from urllib.parse import urlparse
+from urllib.parse import urlsplit

 from django.conf import settings
 from django.core.exceptions import DisallowedHost, ImproperlyConfigured
@ -174,7 +174,7 @@ class CsrfViewMiddleware(MiddlewareMixin):
    @cached_property
    def csrf_trusted_origins_hosts(self):
        return [
-            urlparse(origin).netloc.lstrip("*")
+            urlsplit(origin).netloc.lstrip("*")
            for origin in settings.CSRF_TRUSTED_ORIGINS
        ]

@ -190,7 +190,7 @@ class CsrfViewMiddleware(MiddlewareMixin):
        """
        allowed_origin_subdomains = defaultdict(list)
        for parsed in (
-            urlparse(origin)
+            urlsplit(origin)
            for origin in settings.CSRF_TRUSTED_ORIGINS
            if "*" in origin
        ):
@ -284,7 +284,7 @@ class CsrfViewMiddleware(MiddlewareMixin):
        if request_origin in self.allowed_origins_exact:
            return True
        try:
-            parsed_origin = urlparse(request_origin)
+            parsed_origin = urlsplit(request_origin)
        except ValueError:
            return False
        request_scheme = parsed_origin.scheme
@ -300,7 +300,7 @@ class CsrfViewMiddleware(MiddlewareMixin):
            raise RejectRequest(REASON_NO_REFERER)

        try:
-            referer = urlparse(referer)
+            referer = urlsplit(referer)
        except ValueError:
            raise RejectRequest(REASON_MALFORMED_REFERER)

--- a/django/test/client.py
+++ b/django/test/client.py
@ -8,7 +8,7 @@ from functools import partial
 from http import HTTPStatus
 from importlib import import_module
 from io import BytesIO, IOBase
-from urllib.parse import unquote_to_bytes, urljoin, urlparse, urlsplit
+from urllib.parse import unquote_to_bytes, urljoin, urlsplit

 from asgiref.sync import sync_to_async

@ -458,11 +458,7 @@ class RequestFactory:
        return json.dumps(data, cls=self.json_encoder) if should_encode else data

    def _get_path(self, parsed):
-        path = parsed.path
-        # If there are parameters, add them
-        if parsed.params:
-            path += ";" + parsed.params
-        path = unquote_to_bytes(path)
+        path = unquote_to_bytes(parsed.path)
        # Replace the behavior where non-ASCII values in the WSGI environ are
        # arbitrarily decoded with ISO-8859-1.
        # Refs comment in `get_bytes_from_wsgi()`.
@ -647,7 +643,7 @@ class RequestFactory:
        **extra,
    ):
        """Construct an arbitrary HTTP request."""
-        parsed = urlparse(str(path))  # path can be lazy
+        parsed = urlsplit(str(path))  # path can be lazy
        data = force_bytes(data, settings.DEFAULT_CHARSET)
        r = {
            "PATH_INFO": self._get_path(parsed),
@ -671,8 +667,7 @@ class RequestFactory:
        # If QUERY_STRING is absent or empty, we want to extract it from the URL.
        if not r.get("QUERY_STRING"):
            # WSGI requires latin-1 encoded strings. See get_path_info().
-            query_string = parsed[4].encode().decode("iso-8859-1")
-            r["QUERY_STRING"] = query_string
+            r["QUERY_STRING"] = parsed.query.encode().decode("iso-8859-1")
        return self.request(**r)


@ -748,7 +743,7 @@ class AsyncRequestFactory(RequestFactory):
        **extra,
    ):
        """Construct an arbitrary HTTP request."""
-        parsed = urlparse(str(path))  # path can be lazy.
+        parsed = urlsplit(str(path))  # path can be lazy.
        data = force_bytes(data, settings.DEFAULT_CHARSET)
        s = {
            "method": method,
@ -772,7 +767,7 @@ class AsyncRequestFactory(RequestFactory):
        else:
            # If QUERY_STRING is absent or empty, we want to extract it from
            # the URL.
-            s["query_string"] = parsed[4]
+            s["query_string"] = parsed.query
        if headers:
            extra.update(HttpHeaders.to_asgi_names(headers))
        s["headers"] += [
--- a/django/test/testcases.py
+++ b/django/test/testcases.py
@ -21,7 +21,7 @@ from urllib.parse import (
    urljoin,
    urlparse,
    urlsplit,
-    urlunparse,
+    urlunsplit,
 )
 from urllib.request import url2pathname

@ -541,11 +541,9 @@ class SimpleTestCase(unittest.TestCase):
        def normalize(url):
            """Sort the URL's query string parameters."""
            url = str(url)  # Coerce reverse_lazy() URLs.
-            scheme, netloc, path, params, query, fragment = urlparse(url)
+            scheme, netloc, path, query, fragment = urlsplit(url)
            query_parts = sorted(parse_qsl(query))
-            return urlunparse(
-                (scheme, netloc, path, params, urlencode(query_parts), fragment)
-            )
+            return urlunsplit((scheme, netloc, path, urlencode(query_parts), fragment))

        if msg_prefix:
            msg_prefix += ": "
@ -1637,11 +1635,11 @@ class FSFilesHandler(WSGIHandler):
        * the host is provided as part of the base_url
        * the request's path isn't under the media path (or equal)
        """
-        return path.startswith(self.base_url[2]) and not self.base_url[1]
+        return path.startswith(self.base_url.path) and not self.base_url.netloc

    def file_path(self, url):
        """Return the relative path to the file on disk for the given URL."""
-        relative_url = url.removeprefix(self.base_url[2])
+        relative_url = url.removeprefix(self.base_url.path)
        return url2pathname(relative_url)

    def get_response(self, request):
--- a/django/utils/http.py
+++ b/django/utils/http.py
@ -6,7 +6,7 @@ from datetime import datetime, timezone
 from email.utils import formatdate
 from urllib.parse import quote, unquote
 from urllib.parse import urlencode as original_urlencode
-from urllib.parse import urlparse
+from urllib.parse import urlsplit

 from django.utils.datastructures import MultiValueDict
 from django.utils.regex_helper import _lazy_re_compile
@ -271,11 +271,11 @@ def url_has_allowed_host_and_scheme(url, allowed_hosts, require_https=False):

 def _url_has_allowed_host_and_scheme(url, allowed_hosts, require_https=False):
    # Chrome considers any URL with more than two slashes to be absolute, but
-    # urlparse is not so flexible. Treat any url with three slashes as unsafe.
+    # urlsplit is not so flexible. Treat any url with three slashes as unsafe.
    if url.startswith("///"):
        return False
    try:
-        url_info = urlparse(url)
+        url_info = urlsplit(url)
    except ValueError:  # e.g. invalid IPv6 addresses
        return False
    # Forbid URLs like http:///example.com - with a scheme, but without a hostname.
--- a/docs/ref/urlresolvers.txt
+++ b/docs/ref/urlresolvers.txt
@ -203,7 +203,7 @@ A :class:`ResolverMatch` object can also be assigned to a triple::
 One possible use of :func:`~django.urls.resolve` would be to test whether a
 view would raise a ``Http404`` error before redirecting to it::

-    from urllib.parse import urlparse
+    from urllib.parse import urlsplit
    from django.urls import resolve
    from django.http import Http404, HttpResponseRedirect

@ -215,7 +215,7 @@ view would raise a ``Http404`` error before redirecting to it::
        # modify the request and response as required, e.g. change locale
        # and set corresponding locale cookie

-        view, args, kwargs = resolve(urlparse(next)[2])
+        view, args, kwargs = resolve(urlsplit(next).path)
        kwargs["request"] = request
        try:
            view(*args, **kwargs)
--- a/tests/admin_views/tests.py
+++ b/tests/admin_views/tests.py
@ -4,7 +4,7 @@ import re
 import unittest
 import zoneinfo
 from unittest import mock
-from urllib.parse import parse_qsl, urljoin, urlparse
+from urllib.parse import parse_qsl, urljoin, urlsplit

 from django import forms
 from django.contrib import admin
@ -357,7 +357,7 @@ class AdminViewBasicTest(AdminViewBasicTestCase):
                            **save_option,
                        },
                    )
-                    parsed_url = urlparse(response.url)
+                    parsed_url = urlsplit(response.url)
                    self.assertEqual(parsed_url.query, qsl)

    def test_change_query_string_persists(self):
@ -386,7 +386,7 @@ class AdminViewBasicTest(AdminViewBasicTestCase):
                            **save_option,
                        },
                    )
-                    parsed_url = urlparse(response.url)
+                    parsed_url = urlsplit(response.url)
                    self.assertEqual(parsed_url.query, qsl)

    def test_basic_edit_GET(self):
@ -8032,11 +8032,11 @@ class AdminKeepChangeListFiltersTests(TestCase):
        Assert that two URLs are equal despite the ordering
        of their querystring. Refs #22360.
        """
-        parsed_url1 = urlparse(url1)
+        parsed_url1 = urlsplit(url1)
        path1 = parsed_url1.path
        parsed_qs1 = dict(parse_qsl(parsed_url1.query))

-        parsed_url2 = urlparse(url2)
+        parsed_url2 = urlsplit(url2)
        path2 = parsed_url2.path
        parsed_qs2 = dict(parse_qsl(parsed_url2.query))

--- a/tests/csrf_tests/tests.py
+++ b/tests/csrf_tests/tests.py
@ -709,25 +709,21 @@ class CsrfViewMiddlewareTestMixin(CsrfFunctionTestMixin):
        response = mw.process_view(req, post_form_view, (), {})
        self.assertContains(response, malformed_referer_msg, status_code=403)
        # missing scheme
-        # >>> urlparse('//example.com/')
-        # ParseResult(
-        #   scheme='', netloc='example.com', path='/', params='', query='', fragment='',
-        # )
+        # >>> urlsplit('//example.com/')
+        # SplitResult(scheme='', netloc='example.com', path='/', query='', fragment='')
        req.META["HTTP_REFERER"] = "//example.com/"
        self._check_referer_rejects(mw, req)
        response = mw.process_view(req, post_form_view, (), {})
        self.assertContains(response, malformed_referer_msg, status_code=403)
        # missing netloc
-        # >>> urlparse('https://')
-        # ParseResult(
-        #   scheme='https', netloc='', path='', params='', query='', fragment='',
-        # )
+        # >>> urlsplit('https://')
+        # SplitResult(scheme='https', netloc='', path='', query='', fragment='')
        req.META["HTTP_REFERER"] = "https://"
        self._check_referer_rejects(mw, req)
        response = mw.process_view(req, post_form_view, (), {})
        self.assertContains(response, malformed_referer_msg, status_code=403)
        # Invalid URL
-        # >>> urlparse('https://[')
+        # >>> urlsplit('https://[')
        # ValueError: Invalid IPv6 URL
        req.META["HTTP_REFERER"] = "https://["
        self._check_referer_rejects(mw, req)
@ -979,7 +975,7 @@ class CsrfViewMiddlewareTestMixin(CsrfFunctionTestMixin):
    @override_settings(ALLOWED_HOSTS=["www.example.com"])
    def test_bad_origin_cannot_be_parsed(self):
        """
-        A POST request with an origin that can't be parsed by urlparse() is
+        A POST request with an origin that can't be parsed by urlsplit() is
        rejected.
        """
        req = self._get_POST_request_with_token()