Fixed #27639 -- Added chunk_size parameter to QuerySet.iterator().

2017-06-01 16:56:51 -04:00 · 2017-06-01 16:56:51 -04:00 · edee5a8de6
parent bf50ae8210
commit edee5a8de6
5 changed files with 85 additions and 11 deletions
--- a/django/db/models/query.py
+++ b/django/db/models/query.py
@ -20,7 +20,7 @@ from django.db.models.expressions import F
 from django.db.models.fields import AutoField
 from django.db.models.functions import Trunc
 from django.db.models.query_utils import InvalidQuery, Q
-from django.db.models.sql.constants import CURSOR
+from django.db.models.sql.constants import CURSOR, GET_ITERATOR_CHUNK_SIZE
 from django.utils import timezone
 from django.utils.functional import cached_property, partition
 from django.utils.version import get_version
@ -33,9 +33,10 @@ EmptyResultSet = sql.EmptyResultSet
 class BaseIterable:
-    def __init__(self, queryset, chunked_fetch=False):
+    def __init__(self, queryset, chunked_fetch=False, chunk_size=GET_ITERATOR_CHUNK_SIZE):
        self.queryset = queryset
        self.chunked_fetch = chunked_fetch
        self.chunk_size = chunk_size
 class ModelIterable(BaseIterable):
@ -47,7 +48,7 @@ class ModelIterable(BaseIterable):
        compiler = queryset.query.get_compiler(using=db)
        # Execute the query. This will also fill compiler.select, klass_info,
        # and annotations.
-        results = compiler.execute_sql(chunked_fetch=self.chunked_fetch)
+        results = compiler.execute_sql(chunked_fetch=self.chunked_fetch, chunk_size=self.chunk_size)
        select, klass_info, annotation_col_map = (compiler.select, compiler.klass_info,
                                                  compiler.annotation_col_map)
        model_cls = klass_info['model']
@ -301,13 +302,15 @@ class QuerySet:
    # METHODS THAT DO DATABASE QUERIES #
    ####################################
-    def iterator(self):
+    def iterator(self, chunk_size=2000):
        """
        An iterator over the results from applying this QuerySet to the
        database.
        """
        if chunk_size <= 0:
            raise ValueError('Chunk size must be strictly positive.')
        use_chunked_fetch = not connections[self.db].settings_dict.get('DISABLE_SERVER_SIDE_CURSORS')
-        return iter(self._iterable_class(self, chunked_fetch=use_chunked_fetch))
+        return iter(self._iterable_class(self, chunked_fetch=use_chunked_fetch, chunk_size=chunk_size))
    def aggregate(self, *args, **kwargs):
        """
--- a/django/db/models/sql/compiler.py
+++ b/django/db/models/sql/compiler.py
@ -883,7 +883,7 @@ class SQLCompiler:
        self.query.set_extra_mask(['a'])
        return bool(self.execute_sql(SINGLE))
-    def execute_sql(self, result_type=MULTI, chunked_fetch=False):
+    def execute_sql(self, result_type=MULTI, chunked_fetch=False, chunk_size=GET_ITERATOR_CHUNK_SIZE):
        """
        Run the query against the database and return the result(s). The
        return value is a single data item if result_type is SINGLE, or an
@ -937,7 +937,8 @@ class SQLCompiler:
        result = cursor_iter(
            cursor, self.connection.features.empty_fetchmany_value,
-            self.col_count
+            self.col_count,
            chunk_size,
        )
        if not chunked_fetch and not self.connection.features.can_use_chunked_reads:
            try:
@ -1298,14 +1299,13 @@ class SQLAggregateCompiler(SQLCompiler):
        return sql, params
-def cursor_iter(cursor, sentinel, col_count):
+def cursor_iter(cursor, sentinel, col_count, itersize):
    """
    Yield blocks of rows from a cursor and ensure the cursor is closed when
    done.
    """
    try:
-        for rows in iter((lambda: cursor.fetchmany(GET_ITERATOR_CHUNK_SIZE)),
+        for rows in iter((lambda: cursor.fetchmany(itersize)), sentinel):
                         sentinel):
            yield [r[0:col_count] for r in rows]
    finally:
        cursor.close()
--- a/docs/ref/models/querysets.txt
+++ b/docs/ref/models/querysets.txt
@ -2004,7 +2004,7 @@ If you pass ``in_bulk()`` an empty list, you'll get an empty dictionary.
 ``iterator()``
 ~~~~~~~~~~~~~~
-.. method:: iterator()
+.. method:: iterator(chunk_size=2000)
 Evaluates the ``QuerySet`` (by performing the query) and returns an iterator
 (see :pep:`234`) over the results. A ``QuerySet`` typically caches its results
@ -2033,6 +2033,11 @@ set into memory.
 The Oracle database driver always uses server-side cursors.
 With server-side cursors, the ``chunk_size`` parameter specifies the number of
 results to cache at the database driver level. Fetching bigger chunks
 diminishes the number of round trips between the database driver and the
 database, at the expense of memory.
 On PostgreSQL, server-side cursors will only be used when the
 :setting:`DISABLE_SERVER_SIDE_CURSORS <DATABASE-DISABLE_SERVER_SIDE_CURSORS>`
 setting is ``False``. Read :ref:`transaction-pooling-server-side-cursors` if
@ -2048,10 +2053,25 @@ drivers load the entire result set into memory. The result set is then
 transformed into Python row objects by the database adapter using the
 ``fetchmany()`` method defined in :pep:`249`.
 The ``chunk_size`` parameter controls the size of batches Django retrieves from
 the database driver. Larger batches decrease the overhead of communicating with
 the database driver at the expense of a slight increase in memory consumption.
 The default value of ``chunk_size``, 2000, comes from `a calculation on the
 psycopg mailing list <https://www.postgresql.org/message-id/4D2F2C71.8080805%40dndg.it>`_:
    Assuming rows of 10-20 columns with a mix of textual and numeric data, 2000
    is going to fetch less than 100KB of data, which seems a good compromise
    between the number of rows transferred and the data discarded if the loop
    is exited early.
 .. versionchanged:: 1.11
    PostgreSQL support for server-side cursors was added.
 .. versionchanged:: 2.0
    The ``chunk_size`` parameter was added.
 ``latest()``
 ~~~~~~~~~~~~
--- a/docs/releases/2.0.txt
+++ b/docs/releases/2.0.txt
@ -214,6 +214,11 @@ Models
  .. _`identity columns`: https://docs.oracle.com/database/121/DRDAA/migr_tools_feat.htm#DRDAA109
 * The new ``chunk_size`` parameter of :meth:`.QuerySet.iterator` controls the
  number of rows fetched by the Python database client when streaming results
  from the database. For databases that don't support server-side cursors, it
  controls the number of results Django fetches from the database adapter.
 Requests and Responses
 ~~~~~~~~~~~~~~~~~~~~~~
@ -280,6 +285,13 @@ Database backend API
  attribute with the name of the database that your backend works with. Django
  may use it in various messages, such as in system checks.
 * To improve performance when streaming large result sets from the database,
  :meth:`.QuerySet.iterator` now fetches 2000 rows at a time instead of 100.
  The old behavior can be restored using the ``chunk_size`` parameter. For
  example::
      Book.objects.iterator(chunk_size=100)
 Dropped support for Oracle 11.2
 -------------------------------
--- a/tests/queries/test_iterator.py
+++ b/tests/queries/test_iterator.py
@ -0,0 +1,39 @@
 import datetime
 from unittest import mock
 from django.db.models.sql.compiler import cursor_iter
 from django.test import TestCase
 from .models import Article
 class QuerySetIteratorTests(TestCase):
    itersize_index_in_mock_args = 3
    @classmethod
    def setUpTestData(cls):
        Article.objects.create(name='Article 1', created=datetime.datetime.now())
        Article.objects.create(name='Article 2', created=datetime.datetime.now())
    def test_iterator_invalid_chunk_size(self):
        for size in (0, -1):
            with self.subTest(size=size):
                with self.assertRaisesMessage(ValueError, 'Chunk size must be strictly positive.'):
                    Article.objects.iterator(chunk_size=size)
    def test_default_iterator_chunk_size(self):
        qs = Article.objects.iterator()
        with mock.patch('django.db.models.sql.compiler.cursor_iter', side_effect=cursor_iter) as cursor_iter_mock:
            next(qs)
        self.assertEqual(cursor_iter_mock.call_count, 1)
        mock_args, _mock_kwargs = cursor_iter_mock.call_args
        self.assertEqual(mock_args[self.itersize_index_in_mock_args], 2000)
    def test_iterator_chunk_size(self):
        batch_size = 3
        qs = Article.objects.iterator(chunk_size=batch_size)
        with mock.patch('django.db.models.sql.compiler.cursor_iter', side_effect=cursor_iter) as cursor_iter_mock:
            next(qs)
        self.assertEqual(cursor_iter_mock.call_count, 1)
        mock_args, _mock_kwargs = cursor_iter_mock.call_args
        self.assertEqual(mock_args[self.itersize_index_in_mock_args], batch_size)