440 lines
15 KiB
Python
440 lines
15 KiB
Python
import base64
|
|
import calendar
|
|
import datetime
|
|
import re
|
|
import unicodedata
|
|
import warnings
|
|
from binascii import Error as BinasciiError
|
|
from email.utils import formatdate
|
|
from urllib.parse import (
|
|
ParseResult, SplitResult, _coerce_args, _splitnetloc, _splitparams, quote,
|
|
quote_plus, scheme_chars, unquote, unquote_plus,
|
|
urlencode as original_urlencode, uses_params,
|
|
)
|
|
|
|
from django.core.exceptions import TooManyFieldsSent
|
|
from django.utils.datastructures import MultiValueDict
|
|
from django.utils.deprecation import RemovedInDjango21Warning
|
|
from django.utils.encoding import force_bytes
|
|
from django.utils.functional import keep_lazy_text
|
|
|
|
# Matches a single, complete ETag value (optionally weak), based on
# RFC 7232, Appendix C. The pattern is compiled in verbose mode, so the
# whitespace and '#' comments inside the literal are not part of the regex.
ETAG_MATCH = re.compile(r'''
    \A(      # start of string and capture group
    (?:W/)?  # optional weak indicator
    "        # opening quote
    [^"]*    # any sequence of non-quote characters
    "        # end quote
    )\Z      # end of string and capture group
''', re.X)

# Lower-case month abbreviations; the list index + 1 is the month number.
MONTHS = 'jan feb mar apr may jun jul aug sep oct nov dec'.split()

# Named-group fragments shared by the three HTTP date patterns below.
__D = r'(?P<day>\d{2})'
__D2 = r'(?P<day>[ \d]\d)'  # asctime pads single-digit days with a space
__M = r'(?P<mon>\w{3})'
__Y = r'(?P<year>\d{4})'
__Y2 = r'(?P<year>\d{2})'
__T = r'(?P<hour>\d{2}):(?P<min>\d{2}):(?P<sec>\d{2})'

# e.g. 'Sun, 06 Nov 1994 08:49:37 GMT'
RFC1123_DATE = re.compile(r'^\w{3}, %s %s %s %s GMT$' % (__D, __M, __Y, __T))
# e.g. 'Sunday, 06-Nov-94 08:49:37 GMT'
RFC850_DATE = re.compile(r'^\w{6,9}, %s-%s-%s %s GMT$' % (__D, __M, __Y2, __T))
# e.g. 'Sun Nov  6 08:49:37 1994'
ASCTIME_DATE = re.compile(r'^\w{3} %s %s %s %s$' % (__M, __D2, __T, __Y))

# Generic and sub-delimiter character sets named after RFC 3986.
RFC3986_GENDELIMS = ":/?#[]@"
RFC3986_SUBDELIMS = "!$&'()*+,;="

# Separators between fields of a query string ('&' or ';').
FIELDS_MATCH = re.compile('[&;]')
|
|
|
|
|
|
@keep_lazy_text
def urlquote(url, safe='/'):
    """
    Percent-encode ``url`` by delegating to urllib.parse.quote().

    ``safe`` lists the characters that must be left unescaped. This thin
    wrapper is kept for legacy compatibility (it handled unicode on Python 2).
    """
    quoted = quote(url, safe=safe)
    return quoted
|
|
|
|
|
|
@keep_lazy_text
def urlquote_plus(url, safe=''):
    """
    Percent-encode ``url`` by delegating to urllib.parse.quote_plus().

    ``safe`` lists the characters that must be left unescaped. This thin
    wrapper is kept for legacy compatibility (it handled unicode on Python 2).
    """
    quoted = quote_plus(url, safe=safe)
    return quoted
|
|
|
|
|
|
@keep_lazy_text
def urlunquote(quoted_url):
    """
    Decode ``quoted_url`` by delegating to urllib.parse.unquote().

    This thin wrapper is kept for legacy compatibility (it handled unicode on
    Python 2).
    """
    decoded = unquote(quoted_url)
    return decoded
|
|
|
|
|
|
@keep_lazy_text
def urlunquote_plus(quoted_url):
    """
    Decode ``quoted_url`` by delegating to urllib.parse.unquote_plus().

    This thin wrapper is kept for legacy compatibility (it handled unicode on
    Python 2).
    """
    decoded = unquote_plus(quoted_url)
    return decoded
|
|
|
|
|
|
def urlencode(query, doseq=False):
    """
    Encode ``query`` as a URL query string.

    Works like urllib.parse.urlencode() but additionally accepts a
    MultiValueDict and values that are neither str nor bytes. ``doseq`` is
    passed through to urllib.parse.urlencode() unchanged.
    """
    # Normalize the input to an iterable of (key, value) pairs.
    if isinstance(query, MultiValueDict):
        pairs = query.lists()
    elif hasattr(query, 'items'):
        pairs = query.items()
    else:
        pairs = query

    encoded_pairs = []
    for key, value in pairs:
        if not isinstance(value, (str, bytes)):
            try:
                iter(value)
            except TypeError:
                # Not iterable: hand the value over as-is.
                pass
            else:
                # Consume generators and iterators, even when doseq=True, to
                # work around https://bugs.python.org/issue31706.
                value = [
                    element if isinstance(element, bytes) else str(element)
                    for element in value
                ]
        encoded_pairs.append((key, value))
    return original_urlencode(encoded_pairs, doseq)
|
|
|
|
|
|
def cookie_date(epoch_seconds=None):
    """
    Return a date string compatible with Netscape's cookie standard.

    `epoch_seconds` is a floating point number of seconds since the epoch, in
    UTC (e.g. the value returned by time.time()). None means "now".

    The result has the form 'Wdy, DD-Mon-YYYY HH:MM:SS GMT'.
    """
    # formatdate() yields 'Wdy, DD Mon YYYY HH:MM:SS -0000'; slice out the
    # pieces so the spaces around the month can be replaced with dashes.
    rfcdate = formatdate(epoch_seconds)
    weekday_and_day = rfcdate[:7]
    month = rfcdate[8:11]
    year_and_time = rfcdate[12:25]
    return '%s-%s-%s GMT' % (weekday_and_day, month, year_and_time)
|
|
|
|
|
|
def http_date(epoch_seconds=None):
    """
    Return a date string in the RFC1123 format required by HTTP (RFC7231
    section 7.1.1.1).

    `epoch_seconds` is a floating point number of seconds since the epoch, in
    UTC (e.g. the value returned by time.time()). None means "now".

    The result has the form 'Wdy, DD Mon YYYY HH:MM:SS GMT'.
    """
    formatted = formatdate(timeval=epoch_seconds, usegmt=True)
    return formatted
|
|
|
|
|
|
def parse_http_date(date):
    """
    Parse a date format as specified by HTTP RFC7231 section 7.1.1.1.

    The three formats allowed by the RFC are accepted, even if only the first
    one is still in widespread use.

    Return an integer expressed in seconds since the epoch, in UTC.

    Raise ValueError if ``date`` matches none of the three formats or if its
    fields do not form a valid date.
    """
    # email.utils.parsedate() does the job for RFC1123 dates; unfortunately
    # RFC7231 makes it mandatory to support RFC850 dates too. So we roll
    # our own RFC-compliant parsing.
    for regex in RFC1123_DATE, RFC850_DATE, ASCTIME_DATE:
        m = regex.match(date)
        if m is not None:
            break
    else:
        raise ValueError("%r is not in a valid HTTP date format" % date)
    try:
        year = int(m.group('year'))
        if year < 100:
            # RFC7231 section 7.1.1.1: a two-digit year that appears to be
            # more than 50 years in the future must be interpreted as the
            # most recent year in the past with the same last two digits.
            # A fixed pivot (e.g. "70") drifts out of compliance over time,
            # so compare against the current year instead.
            current_year = datetime.datetime.now(datetime.timezone.utc).year
            current_century = current_year - current_year % 100
            if year - current_year % 100 > 50:
                year += current_century - 100
            else:
                year += current_century
        month = MONTHS.index(m.group('mon').lower()) + 1
        day = int(m.group('day'))
        hour = int(m.group('hour'))
        minute = int(m.group('min'))
        sec = int(m.group('sec'))
        result = datetime.datetime(year, month, day, hour, minute, sec)
        return calendar.timegm(result.utctimetuple())
    except Exception as exc:
        # Unknown month names and out-of-range fields end up here.
        raise ValueError("%r is not a valid date" % date) from exc
|
|
|
|
|
|
def parse_http_date_safe(date):
    """
    Lenient variant of parse_http_date(): return the parsed timestamp, or
    None when the input cannot be parsed for any reason.
    """
    try:
        timestamp = parse_http_date(date)
    except Exception:
        # Deliberately best-effort: any failure means "no usable date".
        return None
    return timestamp
|
|
|
|
|
|
# Base 36 functions: useful for generating compact URLs
|
|
|
|
def base36_to_int(s):
    """
    Convert the base 36 string ``s`` to an int.

    Raise ValueError if the input is longer than 13 characters or is not a
    valid base 36 number.
    """
    # To prevent overconsumption of server resources, reject any base36
    # string that is longer than 13 base36 digits (13 digits is sufficient to
    # base36-encode any 64-bit integer).
    max_digits = 13
    if len(s) <= max_digits:
        return int(s, 36)
    raise ValueError("Base36 input too large")
|
|
|
|
|
|
def int_to_base36(i):
    """
    Convert the non-negative integer ``i`` to its lower-case base36 string.

    Raise ValueError if ``i`` is negative.
    """
    alphabet = '0123456789abcdefghijklmnopqrstuvwxyz'
    if i < 0:
        raise ValueError("Negative base36 conversion input.")
    # Single-digit values (including zero) map straight onto the alphabet.
    if i < 36:
        return alphabet[i]
    # Peel off the least-significant digit on each pass, then reverse.
    digits = []
    while i != 0:
        i, remainder = divmod(i, 36)
        digits.append(alphabet[remainder])
    return ''.join(reversed(digits))
|
|
|
|
|
|
def urlsafe_base64_encode(s):
    """
    Return the URL-safe base64 encoding of the bytestring ``s`` with any
    trailing newlines and equal signs removed.
    """
    encoded = base64.urlsafe_b64encode(s)
    return encoded.rstrip(b'\n=')
|
|
|
|
|
|
def urlsafe_base64_decode(s):
    """
    Decode the URL-safe base64 string ``s``, first re-appending trailing
    equal signs that might have been stripped from it.

    Raise ValueError if the input cannot be decoded.
    """
    raw = force_bytes(s)
    try:
        padded = raw.ljust(len(raw) + len(raw) % 4, b'=')
        return base64.urlsafe_b64decode(padded)
    except (LookupError, BinasciiError) as e:
        raise ValueError(e)
|
|
|
|
|
|
def parse_etags(etag_str):
    """
    Split the value of an If-None-Match or If-Match header (RFC 7232) into
    its ETags.

    Return a list of quoted ETags, or ['*'] if the header matches all ETags.
    Comma-separated entries that are not valid ETags are dropped.
    """
    if etag_str.strip() == '*':
        return ['*']
    # Validate every comma-separated candidate on its own; keep the valid ones.
    etags = []
    for candidate in etag_str.split(','):
        match = ETAG_MATCH.match(candidate.strip())
        if match:
            etags.append(match.group(1))
    return etags
|
|
|
|
|
|
def quote_etag(etag_str):
    """
    Return ``etag_str`` unchanged if it is already a quoted ETag; otherwise
    return it wrapped in double quotes, which makes it a strong ETag.
    """
    already_quoted = ETAG_MATCH.match(etag_str)
    return etag_str if already_quoted else '"%s"' % etag_str
|
|
|
|
|
|
def is_same_domain(host, pattern):
    """
    Return ``True`` if ``host`` matches ``pattern`` exactly or via wildcard.

    A pattern that begins with a period matches the domain itself and all of
    its subdomains (e.g. ``.example.com`` matches ``example.com`` and
    ``foo.example.com``). Any other pattern must equal the host exactly.
    The pattern is lower-cased before comparing; the host is used as given.
    An empty pattern never matches.
    """
    if not pattern:
        return False

    pattern = pattern.lower()
    if pattern[0] == '.':
        # Wildcard: accept any subdomain as well as the bare domain.
        if host.endswith(pattern) or host == pattern[1:]:
            return True
    return pattern == host
|
|
|
|
|
|
def is_safe_url(url, host=None, allowed_hosts=None, require_https=False):
    """
    Return ``True`` if ``url`` is a safe redirection target, i.e. it doesn't
    point to a different host and uses a safe scheme.

    An empty (or None) url is never safe.

    ``allowed_hosts`` is the set of acceptable hosts. ``host`` is a
    deprecated way to supply one more allowed host; using it emits a
    RemovedInDjango21Warning.

    With ``require_https=True`` only 'https' is a valid scheme; by default
    both 'http' and 'https' are accepted.
    """
    if url is not None:
        url = url.strip()
    if not url:
        return False
    hosts = set() if allowed_hosts is None else allowed_hosts
    if host:
        warnings.warn(
            "The host argument is deprecated, use allowed_hosts instead.",
            RemovedInDjango21Warning,
            stacklevel=2,
        )
        # Build a new set rather than mutating the caller's allowed_hosts.
        hosts = hosts | {host}
    # Chrome treats \ completely as / in paths but it could be part of some
    # basic auth credentials so we need to check both URLs.
    if not _is_safe_url(url, hosts, require_https=require_https):
        return False
    backslash_free = url.replace('\\', '/')
    return _is_safe_url(backslash_free, hosts, require_https=require_https)
|
|
|
|
|
|
# Copied from urllib.parse.urlparse() but uses fixed urlsplit() function.
|
|
def _urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    The components are not broken up further (e.g. netloc stays a single
    string) and % escapes are not expanded."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = _urlsplit(url, scheme, allow_fragments)
    params = ''
    # Only schemes that support parameters get the ';params' part split off.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    parsed = ParseResult(scheme, netloc, path, params, query, fragment)
    return _coerce_result(parsed)
|
|
|
|
|
|
# Copied from urllib.parse.urlsplit() with
|
|
# https://github.com/python/cpython/pull/661 applied.
|
|
def _urlsplit(url, scheme='', allow_fragments=True):
|
|
"""Parse a URL into 5 components:
|
|
<scheme>://<netloc>/<path>?<query>#<fragment>
|
|
Return a 5-tuple: (scheme, netloc, path, query, fragment).
|
|
Note that we don't break the components up in smaller bits
|
|
(e.g. netloc is a single string) and we don't expand % escapes."""
|
|
url, scheme, _coerce_result = _coerce_args(url, scheme)
|
|
allow_fragments = bool(allow_fragments)
|
|
netloc = query = fragment = ''
|
|
i = url.find(':')
|
|
if i > 0:
|
|
for c in url[:i]:
|
|
if c not in scheme_chars:
|
|
break
|
|
else:
|
|
scheme, url = url[:i].lower(), url[i + 1:]
|
|
|
|
if url[:2] == '//':
|
|
netloc, url = _splitnetloc(url, 2)
|
|
if (('[' in netloc and ']' not in netloc) or
|
|
(']' in netloc and '[' not in netloc)):
|
|
raise ValueError("Invalid IPv6 URL")
|
|
if allow_fragments and '#' in url:
|
|
url, fragment = url.split('#', 1)
|
|
if '?' in url:
|
|
url, query = url.split('?', 1)
|
|
v = SplitResult(scheme, netloc, url, query, fragment)
|
|
return _coerce_result(v)
|
|
|
|
|
|
def _is_safe_url(url, allowed_hosts, require_https=False):
    """
    Return whether the non-empty string ``url`` is relative or points at one
    of ``allowed_hosts``, and uses an acceptable scheme ('https' only when
    ``require_https`` is true, otherwise 'http' or 'https').
    """
    # Chrome considers any URL with more than two slashes to be absolute, but
    # urlparse is not so flexible. Treat any url with three slashes as unsafe.
    if url.startswith('///'):
        return False
    try:
        parts = _urlparse(url)
    except ValueError:  # e.g. invalid IPv6 addresses
        return False
    netloc = parts.netloc
    scheme = parts.scheme
    # Forbid URLs like http:///example.com - with a scheme, but without a
    # hostname. In that URL, example.com is not the hostname but a path
    # component. However, Chrome will still consider example.com to be the
    # hostname, so we must not allow this syntax.
    if scheme and not netloc:
        return False
    # Forbid URLs that start with control characters. Some browsers (like
    # Chrome) ignore quite a few control characters at the start of a
    # URL and might consider the URL as scheme relative.
    if unicodedata.category(url[0]).startswith('C'):
        return False
    # Consider URLs without a scheme (e.g. //example.com/p) to be http.
    if netloc and not scheme:
        scheme = 'http'
    if netloc and netloc not in allowed_hosts:
        return False
    valid_schemes = ['https'] if require_https else ['http', 'https']
    return not scheme or scheme in valid_schemes
|
|
|
|
|
|
def limited_parse_qsl(qs, keep_blank_values=False, encoding='utf-8',
                      errors='replace', fields_limit=None):
    """
    Return a list of (name, value) tuples parsed from a query string.

    Copied from urlparse with an additional "fields_limit" argument.
    Copyright (C) 2013 Python Software Foundation (see LICENSE.python).

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings. A
        true value keeps them as blank strings; the default false value
        drops them as if they had not been included.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    fields_limit: maximum number of fields to parse; TooManyFieldsSent is
        raised when the query string holds more. None (the default) means
        no limit.
    """
    if fields_limit:
        # Splitting at most fields_limit times yields fields_limit + 1 pieces
        # exactly when the query string holds too many fields.
        pairs = FIELDS_MATCH.split(qs, fields_limit)
        if len(pairs) > fields_limit:
            raise TooManyFieldsSent(
                'The number of GET/POST parameters exceeded '
                'settings.DATA_UPLOAD_MAX_NUMBER_FIELDS.'
            )
    else:
        pairs = FIELDS_MATCH.split(qs)

    parsed = []
    for pair in pairs:
        if not pair:
            continue
        raw_name, separator, raw_value = pair.partition('=')
        # A control-name with no equal sign only counts as a blank value.
        if not separator and not keep_blank_values:
            continue
        if raw_value or keep_blank_values:
            name = unquote(raw_name.replace('+', ' '), encoding=encoding, errors=errors)
            value = unquote(raw_value.replace('+', ' '), encoding=encoding, errors=errors)
            parsed.append((name, value))
    return parsed
|