From 269e921756371bee6d35a967bc2ffe84d1ae39eb Mon Sep 17 00:00:00 2001 From: Russell Keith-Magee Date: Fri, 29 Oct 2010 16:39:25 +0000 Subject: [PATCH] Fixed #9886 -- Added a file-like interface to HttpRequest. Thanks to Ivan Sagalaev for the suggestion and patch. git-svn-id: http://code.djangoproject.com/svn/django/trunk@14394 bcc190cf-cafb-0310-a4f2-bffc1f526a37 --- django/core/handlers/modpython.py | 30 +----- django/core/handlers/wsgi.py | 127 +++++++++++++----------- django/http/__init__.py | 77 +++++++++++++- docs/ref/request-response.txt | 31 +++++- tests/regressiontests/requests/tests.py | 66 +++++++++++- 5 files changed, 236 insertions(+), 95 deletions(-) diff --git a/django/core/handlers/modpython.py b/django/core/handlers/modpython.py index 0128f1abe8e..7b25f0e11e8 100644 --- a/django/core/handlers/modpython.py +++ b/django/core/handlers/modpython.py @@ -42,6 +42,8 @@ class ModPythonRequest(http.HttpRequest): # naughty, but also pretty harmless. self.path_info = u'/' self._post_parse_error = False + self._stream = self._req + self._read_started = False def __repr__(self): # Since this is called as part of error handling, we need to be very @@ -81,26 +83,6 @@ class ModPythonRequest(http.HttpRequest): # mod_python < 3.2.10 doesn't have req.is_https(). return self._req.subprocess_env.get('HTTPS', '').lower() in ('on', '1') - def _load_post_and_files(self): - "Populates self._post and self._files" - if self.method != 'POST': - self._post, self._files = http.QueryDict('', encoding=self._encoding), datastructures.MultiValueDict() - return - - if 'content-type' in self._req.headers_in and self._req.headers_in['content-type'].startswith('multipart'): - self._raw_post_data = '' - try: - self._post, self._files = self.parse_file_upload(self.META, self._req) - except: - # See django.core.handlers.wsgi.WSGIHandler for an explanation - # of what's going on here. 
- self._post = http.QueryDict('') - self._files = datastructures.MultiValueDict() - self._post_parse_error = True - raise - else: - self._post, self._files = http.QueryDict(self.raw_post_data, encoding=self._encoding), datastructures.MultiValueDict() - def _get_request(self): if not hasattr(self, '_request'): self._request = datastructures.MergeDict(self.POST, self.GET) @@ -162,13 +144,6 @@ class ModPythonRequest(http.HttpRequest): self._meta[key] = value return self._meta - def _get_raw_post_data(self): - try: - return self._raw_post_data - except AttributeError: - self._raw_post_data = self._req.read() - return self._raw_post_data - def _get_method(self): return self.META['REQUEST_METHOD'].upper() @@ -178,7 +153,6 @@ class ModPythonRequest(http.HttpRequest): FILES = property(_get_files) META = property(_get_meta) REQUEST = property(_get_request) - raw_post_data = property(_get_raw_post_data) method = property(_get_method) class ModPythonHandler(BaseHandler): diff --git a/django/core/handlers/wsgi.py b/django/core/handlers/wsgi.py index 36c3c48dbec..7e5a6552fd0 100644 --- a/django/core/handlers/wsgi.py +++ b/django/core/handlers/wsgi.py @@ -5,6 +5,7 @@ try: from cStringIO import StringIO except ImportError: from StringIO import StringIO +import socket from django import http from django.core import signals @@ -62,20 +63,55 @@ STATUS_CODE_TEXT = { 505: 'HTTP VERSION NOT SUPPORTED', } -def safe_copyfileobj(fsrc, fdst, length=16*1024, size=0): - """ - A version of shutil.copyfileobj that will not read more than 'size' bytes. - This makes it safe from clients sending more than CONTENT_LENGTH bytes of - data in the body. - """ - if not size: - return - while size > 0: - buf = fsrc.read(min(length, size)) - if not buf: - break - fdst.write(buf) - size -= len(buf) +class LimitedStream(object): + ''' + LimitedStream wraps another stream in order to not allow reading from it + past specified amount of bytes. 
+ ''' + def __init__(self, stream, limit, buf_size=64 * 1024 * 1024): + self.stream = stream + self.remaining = limit + self.buffer = '' + self.buf_size = buf_size + + def _read_limited(self, size=None): + if size is None or size > self.remaining: + size = self.remaining + if size == 0: + return '' + result = self.stream.read(size) + self.remaining -= len(result) + return result + + def read(self, size=None): + if size is None: + result = self.buffer + self._read_limited() + self.buffer = '' + elif size < len(self.buffer): + result = self.buffer[:size] + self.buffer = self.buffer[size:] + else: # size >= len(self.buffer) + result = self.buffer + self._read_limited(size - len(self.buffer)) + self.buffer = '' + return result + + def readline(self, size=None): + while '\n' not in self.buffer or \ + (size is not None and len(self.buffer) < size): + if size: + chunk = self._read_limited(size - len(self.buffer)) + else: + chunk = self._read_limited() + if not chunk: + break + self.buffer += chunk + sio = StringIO(self.buffer) + if size: + line = sio.readline(size) + else: + line = sio.readline() + self.buffer = sio.read() + return line class WSGIRequest(http.HttpRequest): def __init__(self, environ): @@ -98,6 +134,24 @@ class WSGIRequest(http.HttpRequest): self.META['SCRIPT_NAME'] = script_name self.method = environ['REQUEST_METHOD'].upper() self._post_parse_error = False + if isinstance(self.environ['wsgi.input'], socket._fileobject): + # Under development server 'wsgi.input' is an instance of + # socket._fileobject which hangs indefinitely on reading bytes past + # available count. To prevent this it's wrapped in LimitedStream + # that doesn't read past Content-Length bytes. + # + # This is not done for other kinds of inputs (like flup's FastCGI + # streams) because they don't suffer from this problem and we can + # avoid using another wrapper with its own .read and .readline + # implementation. 
+ try: + content_length = int(self.environ.get('CONTENT_LENGTH', 0)) + except (ValueError, TypeError): + content_length = 0 + self._stream = LimitedStream(self.environ['wsgi.input'], content_length) + else: + self._stream = self.environ['wsgi.input'] + self._read_started = False def __repr__(self): # Since this is called as part of error handling, we need to be very @@ -133,30 +187,6 @@ class WSGIRequest(http.HttpRequest): return 'wsgi.url_scheme' in self.environ \ and self.environ['wsgi.url_scheme'] == 'https' - def _load_post_and_files(self): - # Populates self._post and self._files - if self.method == 'POST': - if self.environ.get('CONTENT_TYPE', '').startswith('multipart'): - self._raw_post_data = '' - try: - self._post, self._files = self.parse_file_upload(self.META, self.environ['wsgi.input']) - except: - # An error occured while parsing POST data. Since when - # formatting the error the request handler might access - # self.POST, set self._post and self._file to prevent - # attempts to parse POST data again. - self._post = http.QueryDict('') - self._files = datastructures.MultiValueDict() - # Mark that an error occured. 
This allows self.__repr__ to - # be explicit about it instead of simply representing an - # empty POST - self._post_parse_error = True - raise - else: - self._post, self._files = http.QueryDict(self.raw_post_data, encoding=self._encoding), datastructures.MultiValueDict() - else: - self._post, self._files = http.QueryDict('', encoding=self._encoding), datastructures.MultiValueDict() - def _get_request(self): if not hasattr(self, '_request'): self._request = datastructures.MergeDict(self.POST, self.GET) @@ -192,32 +222,11 @@ class WSGIRequest(http.HttpRequest): self._load_post_and_files() return self._files - def _get_raw_post_data(self): - try: - return self._raw_post_data - except AttributeError: - buf = StringIO() - try: - # CONTENT_LENGTH might be absent if POST doesn't have content at all (lighttpd) - content_length = int(self.environ.get('CONTENT_LENGTH', 0)) - except (ValueError, TypeError): - # If CONTENT_LENGTH was empty string or not an integer, don't - # error out. We've also seen None passed in here (against all - # specs, but see ticket #8259), so we handle TypeError as well. 
- content_length = 0 - if content_length > 0: - safe_copyfileobj(self.environ['wsgi.input'], buf, - size=content_length) - self._raw_post_data = buf.getvalue() - buf.close() - return self._raw_post_data - GET = property(_get_get, _set_get) POST = property(_get_post, _set_post) COOKIES = property(_get_cookies, _set_cookies) FILES = property(_get_files) REQUEST = property(_get_request) - raw_post_data = property(_get_raw_post_data) class WSGIHandler(base.BaseHandler): initLock = Lock() diff --git a/django/http/__init__.py b/django/http/__init__.py index 406c217b284..2fa683dbbbb 100644 --- a/django/http/__init__.py +++ b/django/http/__init__.py @@ -6,6 +6,10 @@ from Cookie import BaseCookie, SimpleCookie, CookieError from pprint import pformat from urllib import urlencode from urlparse import urljoin +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO try: # The mod_python version is more efficient, so try importing it first. from mod_python.util import parse_qsl @@ -132,6 +136,73 @@ class HttpRequest(object): parser = MultiPartParser(META, post_data, self.upload_handlers, self.encoding) return parser.parse() + def _get_raw_post_data(self): + if not hasattr(self, '_raw_post_data'): + if self._read_started: + raise Exception("You cannot access raw_post_data after reading from request's data stream") + self._raw_post_data = self.read() + self._stream = StringIO(self._raw_post_data) + return self._raw_post_data + raw_post_data = property(_get_raw_post_data) + + def _mark_post_parse_error(self): + self._post = QueryDict('') + self._files = MultiValueDict() + self._post_parse_error = True + + def _load_post_and_files(self): + # Populates self._post and self._files + if self.method != 'POST': + self._post, self._files = QueryDict('', encoding=self._encoding), MultiValueDict() + return + if self._read_started: + self._mark_post_parse_error() + return + + if self.META.get('CONTENT_TYPE', '').startswith('multipart'): + 
self._raw_post_data = '' + try: + self._post, self._files = self.parse_file_upload(self.META, self) + except: + # An error occurred while parsing POST data. Since when + # formatting the error the request handler might access + # self.POST, set self._post and self._files to prevent + # attempts to parse POST data again. + # Mark that an error occurred. This allows self.__repr__ to + # be explicit about it instead of simply representing an + # empty POST + self._mark_post_parse_error() + raise + else: + self._post, self._files = QueryDict(self.raw_post_data, encoding=self._encoding), MultiValueDict() + + ## File-like and iterator interface. + ## + ## Expects self._stream to be set to an appropriate source of bytes by + ## a corresponding request subclass (WSGIRequest or ModPythonRequest). + ## Also when request data has already been read by request.POST or + ## request.raw_post_data, self._stream points to a StringIO instance + ## containing that data. + + def read(self, *args, **kwargs): + self._read_started = True + return self._stream.read(*args, **kwargs) + + def readline(self, *args, **kwargs): + self._read_started = True + return self._stream.readline(*args, **kwargs) + + def xreadlines(self): + while True: + buf = self.readline() + if not buf: + break + yield buf + __iter__ = xreadlines + + def readlines(self): + return list(iter(self)) + class QueryDict(MultiValueDict): """ A specialized MultiValueDict that takes a query string when initialized. @@ -198,7 +269,7 @@ class QueryDict(MultiValueDict): for key, value in dict.items(self): dict.__setitem__(result, copy.deepcopy(key, memo), copy.deepcopy(value, memo)) return result - + def setlist(self, key, list_): self._assert_mutable() key = str_to_unicode(key, self.encoding) @@ -385,7 +456,7 @@ class HttpResponse(object): """ Sets a cookie. - ``expires`` can be a string in the correct format or a + ``expires`` can be a string in the correct format or a ``datetime.datetime`` object in UTC. 
If ``expires`` is a datetime object then ``max_age`` will be calculated. """ @@ -407,7 +478,7 @@ class HttpResponse(object): # IE requires expires, so set it if hasn't been already. if not expires: self.cookies[key]['expires'] = cookie_date(time.time() + - max_age) + max_age) if path is not None: self.cookies[key]['path'] = path if domain is not None: diff --git a/docs/ref/request-response.txt b/docs/ref/request-response.txt index 0cecebf5ad0..5a317aee066 100644 --- a/docs/ref/request-response.txt +++ b/docs/ref/request-response.txt @@ -189,8 +189,14 @@ All attributes except ``session`` should be considered read-only. .. attribute:: HttpRequest.raw_post_data - The raw HTTP POST data. This is only useful for advanced processing. Use - ``POST`` instead. + The raw HTTP POST data as a byte string. This is useful for processing + data in formats other than conventional HTML forms: binary images, + XML payloads, etc. For processing form data use ``HttpRequest.POST``. + + .. versionadded:: 1.3 + + You can also read from an HttpRequest using a file-like interface. See + :meth:`HttpRequest.read()`. .. attribute:: HttpRequest.urlconf @@ -249,6 +255,27 @@ Methods If you write your own XMLHttpRequest call (on the browser side), you'll have to set this header manually if you want ``is_ajax()`` to work. +.. method:: HttpRequest.read(size=None) +.. method:: HttpRequest.readline() +.. method:: HttpRequest.readlines() +.. method:: HttpRequest.xreadlines() +.. method:: HttpRequest.__iter__() + + .. versionadded:: 1.3 + + Methods implementing a file-like interface for reading from an + HttpRequest instance. This makes it possible to consume an incoming + request in a streaming fashion. A common use-case would be to process a + big XML payload with an iterative parser without constructing a whole + XML tree in memory. 
+ + Given this standard interface, an HttpRequest instance can be + passed directly to an XML parser such as ElementTree:: + + import xml.etree.ElementTree as ET + for element in ET.iterparse(request): + process(element) + QueryDict objects ----------------- diff --git a/tests/regressiontests/requests/tests.py b/tests/regressiontests/requests/tests.py index 81f00766c55..2087dc43e3f 100644 --- a/tests/regressiontests/requests/tests.py +++ b/tests/regressiontests/requests/tests.py @@ -1,9 +1,10 @@ from datetime import datetime, timedelta import time +from StringIO import StringIO import unittest from django.http import HttpRequest, HttpResponse, parse_cookie -from django.core.handlers.wsgi import WSGIRequest +from django.core.handlers.wsgi import WSGIRequest, LimitedStream from django.core.handlers.modpython import ModPythonRequest from django.utils.http import cookie_date @@ -17,11 +18,11 @@ class RequestsTests(unittest.TestCase): self.assertEqual(request.META.keys(), []) def test_wsgirequest(self): - request = WSGIRequest({'PATH_INFO': 'bogus', 'REQUEST_METHOD': 'bogus'}) + request = WSGIRequest({'PATH_INFO': 'bogus', 'REQUEST_METHOD': 'bogus', 'wsgi.input': StringIO('')}) self.assertEqual(request.GET.keys(), []) self.assertEqual(request.POST.keys(), []) self.assertEqual(request.COOKIES.keys(), []) - self.assertEqual(set(request.META.keys()), set(['PATH_INFO', 'REQUEST_METHOD', 'SCRIPT_NAME'])) + self.assertEqual(set(request.META.keys()), set(['PATH_INFO', 'REQUEST_METHOD', 'SCRIPT_NAME', 'wsgi.input'])) self.assertEqual(request.META['PATH_INFO'], 'bogus') self.assertEqual(request.META['REQUEST_METHOD'], 'bogus') self.assertEqual(request.META['SCRIPT_NAME'], '') @@ -88,3 +89,62 @@ class RequestsTests(unittest.TestCase): max_age_cookie = response.cookies['max_age'] self.assertEqual(max_age_cookie['max-age'], 10) self.assertEqual(max_age_cookie['expires'], cookie_date(time.time()+10)) + + def test_limited_stream(self): + # Read all of a limited stream + stream = 
LimitedStream(StringIO('test'), 2) + self.assertEqual(stream.read(), 'te') + + # Read a number of characters greater than the stream has to offer + stream = LimitedStream(StringIO('test'), 2) + self.assertEqual(stream.read(5), 'te') + + # Read sequentially from a stream + stream = LimitedStream(StringIO('12345678'), 8) + self.assertEqual(stream.read(5), '12345') + self.assertEqual(stream.read(5), '678') + + # Read lines from a stream + stream = LimitedStream(StringIO('1234\n5678\nabcd\nefgh\nijkl'), 24) + # Read a full line, unconditionally + self.assertEqual(stream.readline(), '1234\n') + # Read a number of characters less than a line + self.assertEqual(stream.readline(2), '56') + # Read the rest of the partial line + self.assertEqual(stream.readline(), '78\n') + # Read a full line, with a character limit greater than the line length + self.assertEqual(stream.readline(6), 'abcd\n') + # Read the next line, deliberately terminated at the line end + self.assertEqual(stream.readline(4), 'efgh') + # Read the next line... just the line end + self.assertEqual(stream.readline(), '\n') + # Read everything else. + self.assertEqual(stream.readline(), 'ijkl') + + def test_stream(self): + request = WSGIRequest({'REQUEST_METHOD': 'POST', 'wsgi.input': StringIO('name=value')}) + self.assertEqual(request.read(), 'name=value') + + def test_read_after_value(self): + """ + Reading from request is allowed after accessing request contents as + POST or raw_post_data. + """ + request = WSGIRequest({'REQUEST_METHOD': 'POST', 'wsgi.input': StringIO('name=value')}) + self.assertEqual(request.POST, {u'name': [u'value']}) + self.assertEqual(request.raw_post_data, 'name=value') + self.assertEqual(request.read(), 'name=value') + + def test_value_after_read(self): + """ + Construction of POST or raw_post_data is not allowed after reading + from request. 
+ """ + request = WSGIRequest({'REQUEST_METHOD': 'POST', 'wsgi.input': StringIO('name=value')}) + self.assertEqual(request.read(2), 'na') + self.assertRaises(Exception, lambda: request.raw_post_data) + self.assertEqual(request.POST, {}) + + def test_read_by_lines(self): + request = WSGIRequest({'REQUEST_METHOD': 'POST', 'wsgi.input': StringIO('name=value')}) + self.assertEqual(list(request), ['name=value'])