Fixed #9886 -- Added a file-like interface to HttpRequest. Thanks to Ivan Sagalaev for the suggestion and patch.

git-svn-id: http://code.djangoproject.com/svn/django/trunk@14394 bcc190cf-cafb-0310-a4f2-bffc1f526a37
2010-10-29 16:39:25 +00:00 · 2010-10-29 16:39:25 +00:00 · 269e921756
parent 3086b55b0e
commit 269e921756
5 changed files with 236 additions and 95 deletions
--- a/django/core/handlers/modpython.py
+++ b/django/core/handlers/modpython.py
@ -42,6 +42,8 @@ class ModPythonRequest(http.HttpRequest):
            # naughty, but also pretty harmless.
            self.path_info = u'/'
        self._post_parse_error = False
        self._stream = self._req
        self._read_started = False
    def __repr__(self):
        # Since this is called as part of error handling, we need to be very
@ -81,26 +83,6 @@ class ModPythonRequest(http.HttpRequest):
            # mod_python < 3.2.10 doesn't have req.is_https().
            return self._req.subprocess_env.get('HTTPS', '').lower() in ('on', '1')
    def _load_post_and_files(self):
        """
        Populates self._post and self._files.

        Non-POST requests get an empty QueryDict and an empty MultiValueDict.
        Multipart POST bodies are handed to parse_file_upload(); any other
        POST body is parsed as a query string taken from raw_post_data.
        """
        if self.method != 'POST':
            no_fields = http.QueryDict('', encoding=self._encoding)
            self._post, self._files = no_fields, datastructures.MultiValueDict()
            return

        headers_in = self._req.headers_in
        multipart = ('content-type' in headers_in and
                     headers_in['content-type'].startswith('multipart'))
        if not multipart:
            fields = http.QueryDict(self.raw_post_data, encoding=self._encoding)
            self._post, self._files = fields, datastructures.MultiValueDict()
            return

        # raw_post_data is set to an empty string before the upload is parsed
        # from self._req.
        self._raw_post_data = ''
        try:
            self._post, self._files = self.parse_file_upload(self.META, self._req)
        except:
            # Leave empty containers behind and flag the failure, then let the
            # original error propagate unchanged. See
            # django.core.handlers.wsgi.WSGIHandler for the reasoning.
            self._post = http.QueryDict('')
            self._files = datastructures.MultiValueDict()
            self._post_parse_error = True
            raise
    def _get_request(self):
        if not hasattr(self, '_request'):
            self._request = datastructures.MergeDict(self.POST, self.GET)
@ -162,13 +144,6 @@ class ModPythonRequest(http.HttpRequest):
                self._meta[key] = value
        return self._meta
    def _get_raw_post_data(self):
        """
        Returns the request body, reading it from self._req on first access
        and caching it in self._raw_post_data for later calls.
        """
        if not hasattr(self, '_raw_post_data'):
            self._raw_post_data = self._req.read()
        return self._raw_post_data
    def _get_method(self):
        return self.META['REQUEST_METHOD'].upper()
@ -178,7 +153,6 @@ class ModPythonRequest(http.HttpRequest):
    FILES = property(_get_files)
    META = property(_get_meta)
    REQUEST = property(_get_request)
    raw_post_data = property(_get_raw_post_data)
    method = property(_get_method)
 class ModPythonHandler(BaseHandler):
--- a/django/core/handlers/wsgi.py
+++ b/django/core/handlers/wsgi.py
@ -5,6 +5,7 @@ try:
    from cStringIO import StringIO
 except ImportError:
    from StringIO import StringIO
 import socket
 from django import http
 from django.core import signals
@ -62,20 +63,55 @@ STATUS_CODE_TEXT = {
    505: 'HTTP VERSION NOT SUPPORTED',
 }
-def safe_copyfileobj(fsrc, fdst, length=16*1024, size=0):
+class LimitedStream(object):
-    """
+    '''
-    A version of shutil.copyfileobj that will not read more than 'size' bytes.
+    LimitedStream wraps another stream in order to not allow reading from it
-    This makes it safe from clients sending more than CONTENT_LENGTH bytes of
+    past specified amount of bytes.
-    data in the body.
+    '''
-    """
+    def __init__(self, stream, limit, buf_size=64 * 1024 * 1024):
-    if not size:
+        self.stream = stream
-        return
+        self.remaining = limit
-    while size > 0:
+        self.buffer = ''
-        buf = fsrc.read(min(length, size))
+        self.buf_size = buf_size
-        if not buf:
+
-            break
+    def _read_limited(self, size=None):
-        fdst.write(buf)
+        if size is None or size > self.remaining:
-        size -= len(buf)
+            size = self.remaining
        if size == 0:
            return ''
        result = self.stream.read(size)
        self.remaining -= len(result)
        return result
    def read(self, size=None):
        """
        Returns up to ``size`` characters, or everything still allowed when
        ``size`` is None. Data left in self.buffer is served first; anything
        beyond that comes from _read_limited().
        """
        pending = self.buffer
        if size is not None and size < len(pending):
            # The buffer alone covers the request; keep its tail for later.
            self.buffer = pending[size:]
            return pending[:size]
        # The buffer is used up entirely; top it up from the wrapped stream.
        if size is None:
            fresh = self._read_limited()
        else:
            fresh = self._read_limited(size - len(pending))
        self.buffer = ''
        return pending + fresh
    def readline(self, size=None):
        """
        Returns the next line, including its trailing newline when present.

        ``size`` caps the number of characters returned. None or a negative
        value means no cap, and 0 returns an empty string without touching
        the wrapped stream. Data is pulled through _read_limited() only
        while the buffer holds no newline and is still shorter than
        ``size``, so the difference passed on is always positive and the
        wrapped stream is never read past the configured limit.
        """
        if size is not None and size < 0:
            # File-like convention: a negative size means "no limit".
            size = None
        # Both conditions must hold to keep reading: no complete line is
        # buffered yet AND the size cap (if any) has not been reached.
        while '\n' not in self.buffer and \
              (size is None or len(self.buffer) < size):
            if size is None:
                chunk = self._read_limited()
            else:
                # Guaranteed positive by the loop condition above.
                chunk = self._read_limited(size - len(self.buffer))
            if not chunk:
                # Limit reached or wrapped stream exhausted.
                break
            self.buffer += chunk
        sio = StringIO(self.buffer)
        if size is None:
            line = sio.readline()
        else:
            line = sio.readline(size)
        # Whatever follows the returned line stays buffered for later calls.
        self.buffer = sio.read()
        return line
 class WSGIRequest(http.HttpRequest):
    def __init__(self, environ):
@ -98,6 +134,24 @@ class WSGIRequest(http.HttpRequest):
        self.META['SCRIPT_NAME'] = script_name
        self.method = environ['REQUEST_METHOD'].upper()
        self._post_parse_error = False
        if isinstance(self.environ['wsgi.input'], socket._fileobject):
            # Under development server 'wsgi.input' is an instance of
            # socket._fileobject which hangs indefinitely on reading bytes past
            # available count. To prevent this it's wrapped in LimitedStream
            # that doesn't read past Content-Length bytes.
            #
            # This is not done for other kinds of inputs (like flup's FastCGI
            # streams) because they don't suffer from this problem and we can
            # avoid using another wrapper with its own .read and .readline
            # implementation.
            try:
                content_length = int(self.environ.get('CONTENT_LENGTH', 0))
            except (ValueError, TypeError):
                content_length = 0
            self._stream = LimitedStream(self.environ['wsgi.input'], content_length)
        else:
            self._stream = self.environ['wsgi.input']
        self._read_started = False
    def __repr__(self):
        # Since this is called as part of error handling, we need to be very
@ -133,30 +187,6 @@ class WSGIRequest(http.HttpRequest):
        return 'wsgi.url_scheme' in self.environ \
            and self.environ['wsgi.url_scheme'] == 'https'
    def _load_post_and_files(self):
        """
        Populates self._post and self._files.

        Non-POST requests get empty containers. Multipart POST bodies are
        parsed from wsgi.input by parse_file_upload(); any other POST body
        is parsed as a query string taken from raw_post_data.
        """
        if self.method != 'POST':
            self._post, self._files = http.QueryDict('', encoding=self._encoding), datastructures.MultiValueDict()
            return

        content_type = self.environ.get('CONTENT_TYPE', '')
        if not content_type.startswith('multipart'):
            fields = http.QueryDict(self.raw_post_data, encoding=self._encoding)
            self._post, self._files = fields, datastructures.MultiValueDict()
            return

        self._raw_post_data = ''
        try:
            self._post, self._files = self.parse_file_upload(self.META, self.environ['wsgi.input'])
        except:
            # Parsing the POST data failed. The request handler may touch
            # self.POST while formatting the error, so fill in self._post
            # and self._files to stop the data from being parsed again.
            self._post = http.QueryDict('')
            self._files = datastructures.MultiValueDict()
            # Record the failure so that self.__repr__ can say so explicitly
            # instead of showing what looks like an empty POST.
            self._post_parse_error = True
            raise
    def _get_request(self):
        if not hasattr(self, '_request'):
            self._request = datastructures.MergeDict(self.POST, self.GET)
@ -192,32 +222,11 @@ class WSGIRequest(http.HttpRequest):
            self._load_post_and_files()
        return self._files
    def _get_raw_post_data(self):
        """
        Returns the request body, copying at most CONTENT_LENGTH bytes from
        wsgi.input on first access and caching the result.
        """
        if hasattr(self, '_raw_post_data'):
            return self._raw_post_data
        collected = StringIO()
        try:
            # lighttpd may omit CONTENT_LENGTH when a POST has no content.
            content_length = int(self.environ.get('CONTENT_LENGTH', 0))
        except (ValueError, TypeError):
            # An empty or non-numeric CONTENT_LENGTH must not be an error.
            # None has been seen here too (against all specs, see ticket
            # #8259), which is why TypeError is handled as well.
            content_length = 0
        if content_length > 0:
            safe_copyfileobj(self.environ['wsgi.input'], collected,
                    size=content_length)
        body = collected.getvalue()
        self._raw_post_data = body
        collected.close()
        return body
    GET = property(_get_get, _set_get)
    POST = property(_get_post, _set_post)
    COOKIES = property(_get_cookies, _set_cookies)
    FILES = property(_get_files)
    REQUEST = property(_get_request)
    raw_post_data = property(_get_raw_post_data)
 class WSGIHandler(base.BaseHandler):
    initLock = Lock()
--- a/django/http/init.py
+++ b/django/http/init.py
@ -6,6 +6,10 @@ from Cookie import BaseCookie, SimpleCookie, CookieError
 from pprint import pformat
 from urllib import urlencode
 from urlparse import urljoin
 try:
    from cStringIO import StringIO
 except ImportError:
    from StringIO import StringIO
 try:
    # The mod_python version is more efficient, so try importing it first.
    from mod_python.util import parse_qsl
@ -132,6 +136,73 @@ class HttpRequest(object):
        parser = MultiPartParser(META, post_data, self.upload_handlers, self.encoding)
        return parser.parse()
    def _get_raw_post_data(self):
        """
        Returns the whole request body, reading it through self.read() on
        first access. The body is cached and self._stream is replaced by a
        StringIO over the cached data. Raises Exception when the stream has
        already been read from before any body was cached.
        """
        try:
            return self._raw_post_data
        except AttributeError:
            pass
        if self._read_started:
            raise Exception("You cannot access raw_post_data after reading from request's data stream")
        body = self.read()
        self._raw_post_data = body
        self._stream = StringIO(body)
        return body
    raw_post_data = property(_get_raw_post_data)
    def _mark_post_parse_error(self):
        """
        Resets self._post and self._files to empty containers and sets the
        self._post_parse_error flag.
        """
        self._post, self._files = QueryDict(''), MultiValueDict()
        self._post_parse_error = True
    def _load_post_and_files(self):
        """
        Populates self._post and self._files.

        Non-POST requests get empty containers. If the data stream has
        already been read from directly and no copy of the body was kept,
        the POST data cannot be recovered and the request is marked with a
        parse error. A body cached in self._raw_post_data (which is what
        accessing raw_post_data leaves behind) is still parsed normally,
        for multipart and for plain form data alike.

        Any error raised by parse_file_upload() is re-raised after the
        request has been marked with a parse error.
        """
        if self.method != 'POST':
            self._post, self._files = QueryDict('', encoding=self._encoding), MultiValueDict()
            return
        body_cached = hasattr(self, '_raw_post_data')
        if self._read_started and not body_cached:
            # The stream was consumed through read()/readline() and nothing
            # was kept, so there is nothing left to parse.
            self._mark_post_parse_error()
            return
        if self.META.get('CONTENT_TYPE', '').startswith('multipart'):
            if body_cached:
                # Parse the data that was already read instead of the
                # (now exhausted) stream, and keep the cached body intact.
                data = StringIO(self._raw_post_data)
            else:
                self._raw_post_data = ''
                data = self
            try:
                self._post, self._files = self.parse_file_upload(self.META, data)
            except:
                # An error occurred while parsing POST data. Since the
                # request handler might access self.POST when formatting the
                # error, set self._post and self._files to prevent attempts
                # to parse POST data again.
                # Mark that an error occurred. This allows self.__repr__ to
                # be explicit about it instead of simply representing an
                # empty POST.
                self._mark_post_parse_error()
                raise
        else:
            self._post, self._files = QueryDict(self.raw_post_data, encoding=self._encoding), MultiValueDict()
    ## File-like and iterator interface.
    ##
    ## Expects self._stream to be set to an appropriate source of bytes by
    ## a corresponding request subclass (WSGIRequest or ModPythonRequest).
    ## Also when request data has already been read by request.POST or
    ## request.raw_post_data, self._stream points to a StringIO instance
    ## containing that data.
    def read(self, *args, **kwargs):
        """
        Reads from self._stream, passing all arguments through, after
        recording in self._read_started that the stream has been touched.
        """
        self._read_started = True
        source = self._stream
        return source.read(*args, **kwargs)
    def readline(self, *args, **kwargs):
        """
        Reads one line from self._stream, passing all arguments through,
        after recording in self._read_started that the stream has been
        touched.
        """
        self._read_started = True
        source = self._stream
        return source.readline(*args, **kwargs)
    def xreadlines(self):
        """
        Generator yielding the result of self.readline() repeatedly until
        it returns an empty value.
        """
        line = self.readline()
        while line:
            yield line
            line = self.readline()
    __iter__ = xreadlines
    def readlines(self):
        """Returns a list of everything that iterating over self yields."""
        return [line for line in self]
 class QueryDict(MultiValueDict):
    """
    A specialized MultiValueDict that takes a query string when initialized.
@ -198,7 +269,7 @@ class QueryDict(MultiValueDict):
        for key, value in dict.items(self):
            dict.__setitem__(result, copy.deepcopy(key, memo), copy.deepcopy(value, memo))
        return result
-    
+
    def setlist(self, key, list_):
        self._assert_mutable()
        key = str_to_unicode(key, self.encoding)
@ -385,7 +456,7 @@ class HttpResponse(object):
        """
        Sets a cookie.
-        ``expires`` can be a string in the correct format or a 
+        ``expires`` can be a string in the correct format or a
        ``datetime.datetime`` object in UTC. If ``expires`` is a datetime
        object then ``max_age`` will be calculated.
        """
@ -407,7 +478,7 @@ class HttpResponse(object):
            # IE requires expires, so set it if hasn't been already.
            if not expires:
                self.cookies[key]['expires'] = cookie_date(time.time() +
-                                                           max_age) 
+                                                           max_age)
        if path is not None:
            self.cookies[key]['path'] = path
        if domain is not None:
--- a/docs/ref/request-response.txt
+++ b/docs/ref/request-response.txt
@ -189,8 +189,14 @@ All attributes except ``session`` should be considered read-only.
 .. attribute:: HttpRequest.raw_post_data
-    The raw HTTP POST data. This is only useful for advanced processing. Use
+    The raw HTTP POST data as a byte string. This is useful for processing
-    ``POST`` instead.
+    data in formats other than conventional HTML forms: binary images,
    XML payload etc. For processing form data use ``HttpRequest.POST``.
    .. versionadded:: 1.3
    You can also read from an HttpRequest using a file-like interface. See
    :meth:`HttpRequest.read()`.
 .. attribute:: HttpRequest.urlconf
@ -249,6 +255,27 @@ Methods
   If you write your own XMLHttpRequest call (on the browser side), you'll
   have to set this header manually if you want ``is_ajax()`` to work.
 .. method:: HttpRequest.read(size=None)
 .. method:: HttpRequest.readline()
 .. method:: HttpRequest.readlines()
 .. method:: HttpRequest.xreadlines()
 .. method:: HttpRequest.__iter__()
    .. versionadded:: 1.3
    Methods implementing a file-like interface for reading from an
    HttpRequest instance. This makes it possible to consume an incoming
    request in a streaming fashion. A common use-case would be to process a
    big XML payload with an iterative parser without constructing a whole
    XML tree in memory.
    Given this standard interface, an HttpRequest instance can be
    passed directly to an XML parser such as ElementTree::
        import xml.etree.ElementTree as ET
        for event, element in ET.iterparse(request):
            process(element)
 QueryDict objects
 -----------------
--- a/tests/regressiontests/requests/tests.py
+++ b/tests/regressiontests/requests/tests.py
@ -1,9 +1,10 @@
 from datetime import datetime, timedelta
 import time
 from StringIO import StringIO
 import unittest
 from django.http import HttpRequest, HttpResponse, parse_cookie
-from django.core.handlers.wsgi import WSGIRequest
+from django.core.handlers.wsgi import WSGIRequest, LimitedStream
 from django.core.handlers.modpython import ModPythonRequest
 from django.utils.http import cookie_date
@ -17,11 +18,11 @@ class RequestsTests(unittest.TestCase):
        self.assertEqual(request.META.keys(), [])
    def test_wsgirequest(self):
-        request = WSGIRequest({'PATH_INFO': 'bogus', 'REQUEST_METHOD': 'bogus'})
+        request = WSGIRequest({'PATH_INFO': 'bogus', 'REQUEST_METHOD': 'bogus', 'wsgi.input': StringIO('')})
        self.assertEqual(request.GET.keys(), [])
        self.assertEqual(request.POST.keys(), [])
        self.assertEqual(request.COOKIES.keys(), [])
-        self.assertEqual(set(request.META.keys()), set(['PATH_INFO', 'REQUEST_METHOD', 'SCRIPT_NAME']))
+        self.assertEqual(set(request.META.keys()), set(['PATH_INFO', 'REQUEST_METHOD', 'SCRIPT_NAME', 'wsgi.input']))
        self.assertEqual(request.META['PATH_INFO'], 'bogus')
        self.assertEqual(request.META['REQUEST_METHOD'], 'bogus')
        self.assertEqual(request.META['SCRIPT_NAME'], '')
@ -88,3 +89,62 @@ class RequestsTests(unittest.TestCase):
        max_age_cookie = response.cookies['max_age']
        self.assertEqual(max_age_cookie['max-age'], 10)
        self.assertEqual(max_age_cookie['expires'], cookie_date(time.time()+10))
    def test_limited_stream(self):
        """LimitedStream never returns more data than its limit allows."""
        # A plain read() stops at the limit.
        capped = LimitedStream(StringIO('test'), 2)
        self.assertEqual(capped.read(), 'te')
        # Asking for more than the stream has to offer also stops there.
        capped = LimitedStream(StringIO('test'), 2)
        self.assertEqual(capped.read(5), 'te')
        # Consecutive sized reads continue where the previous one stopped.
        digits = LimitedStream(StringIO('12345678'), 8)
        for expected in ('12345', '678'):
            self.assertEqual(digits.read(5), expected)
        # readline() with and without a size argument, in sequence.
        lines = LimitedStream(StringIO('1234\n5678\nabcd\nefgh\nijkl'), 24)
        readline_cases = (
            ((), '1234\n'),     # a full line, unconditionally
            ((2,), '56'),       # fewer characters than the line holds
            ((), '78\n'),       # the rest of the partial line
            ((6,), 'abcd\n'),   # character limit greater than the line length
            ((4,), 'efgh'),     # deliberately terminated at the line end
            ((), '\n'),         # just the line end
            ((), 'ijkl'),       # everything else
        )
        for args, expected in readline_cases:
            self.assertEqual(lines.readline(*args), expected)
    def test_stream(self):
        """The whole body can be read through the request object itself."""
        environ = {'REQUEST_METHOD': 'POST', 'wsgi.input': StringIO('name=value')}
        request = WSGIRequest(environ)
        self.assertEqual(request.read(), 'name=value')
    def test_read_after_value(self):
        """
        Once the request contents have been accessed as POST or
        raw_post_data, reading from the request is still allowed.
        """
        payload = StringIO('name=value')
        request = WSGIRequest({'REQUEST_METHOD': 'POST', 'wsgi.input': payload})
        expected_post = {u'name': [u'value']}
        self.assertEqual(request.POST, expected_post)
        self.assertEqual(request.raw_post_data, 'name=value')
        self.assertEqual(request.read(), 'name=value')
    def test_value_after_read(self):
        """
        Once the request has been read from, POST and raw_post_data can no
        longer be constructed.
        """
        payload = StringIO('name=value')
        request = WSGIRequest({'REQUEST_METHOD': 'POST', 'wsgi.input': payload})
        self.assertEqual(request.read(2), 'na')
        # Attribute access is deferred so assertRaises can observe the error.
        self.assertRaises(Exception, getattr, request, 'raw_post_data')
        self.assertEqual(request.POST, {})
    def test_read_by_lines(self):
        """Iterating over a request yields its body line by line."""
        environ = {'REQUEST_METHOD': 'POST', 'wsgi.input': StringIO('name=value')}
        request = WSGIRequest(environ)
        self.assertEqual(list(request), ['name=value'])