Fixed #27639 -- Added chunk_size parameter to QuerySet.iterator().

François Freitag 2017-06-01 16:56:51 -04:00 committed by Tim Graham
parent bf50ae8210
commit edee5a8de6
5 changed files with 85 additions and 11 deletions

django/db/models/query.py

@@ -20,7 +20,7 @@ from django.db.models.expressions import F
 from django.db.models.fields import AutoField
 from django.db.models.functions import Trunc
 from django.db.models.query_utils import InvalidQuery, Q
-from django.db.models.sql.constants import CURSOR
+from django.db.models.sql.constants import CURSOR, GET_ITERATOR_CHUNK_SIZE
 from django.utils import timezone
 from django.utils.functional import cached_property, partition
 from django.utils.version import get_version
@@ -33,9 +33,10 @@ EmptyResultSet = sql.EmptyResultSet
 class BaseIterable:
-    def __init__(self, queryset, chunked_fetch=False):
+    def __init__(self, queryset, chunked_fetch=False, chunk_size=GET_ITERATOR_CHUNK_SIZE):
         self.queryset = queryset
         self.chunked_fetch = chunked_fetch
+        self.chunk_size = chunk_size


 class ModelIterable(BaseIterable):
@@ -47,7 +48,7 @@ class ModelIterable(BaseIterable):
         compiler = queryset.query.get_compiler(using=db)
         # Execute the query. This will also fill compiler.select, klass_info,
         # and annotations.
-        results = compiler.execute_sql(chunked_fetch=self.chunked_fetch)
+        results = compiler.execute_sql(chunked_fetch=self.chunked_fetch, chunk_size=self.chunk_size)
         select, klass_info, annotation_col_map = (compiler.select, compiler.klass_info,
                                                   compiler.annotation_col_map)
         model_cls = klass_info['model']
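Editor's note: these hunks thread the new parameter through the iterable layer, and the next hunk adds it to ``QuerySet.iterator()`` itself. A simplified sketch of the resulting call chain (illustrative, not literal code)::

    QuerySet.iterator(chunk_size)                             # validates chunk_size > 0
        -> ModelIterable(queryset, chunked_fetch, chunk_size)
        -> SQLCompiler.execute_sql(..., chunk_size=chunk_size)
        -> cursor_iter(cursor, sentinel, col_count, chunk_size)  # fetchmany(chunk_size)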
@@ -301,13 +302,15 @@ class QuerySet:
     # METHODS THAT DO DATABASE QUERIES #
     ####################################

-    def iterator(self):
+    def iterator(self, chunk_size=2000):
         """
         An iterator over the results from applying this QuerySet to the
         database.
         """
+        if chunk_size <= 0:
+            raise ValueError('Chunk size must be strictly positive.')
         use_chunked_fetch = not connections[self.db].settings_dict.get('DISABLE_SERVER_SIDE_CURSORS')
-        return iter(self._iterable_class(self, chunked_fetch=use_chunked_fetch))
+        return iter(self._iterable_class(self, chunked_fetch=use_chunked_fetch, chunk_size=chunk_size))

     def aggregate(self, *args, **kwargs):
         """

django/db/models/sql/compiler.py

@@ -883,7 +883,7 @@ class SQLCompiler:
         self.query.set_extra_mask(['a'])
         return bool(self.execute_sql(SINGLE))

-    def execute_sql(self, result_type=MULTI, chunked_fetch=False):
+    def execute_sql(self, result_type=MULTI, chunked_fetch=False, chunk_size=GET_ITERATOR_CHUNK_SIZE):
         """
         Run the query against the database and return the result(s). The
         return value is a single data item if result_type is SINGLE, or an
@@ -937,7 +937,8 @@ class SQLCompiler:
             result = cursor_iter(
                 cursor, self.connection.features.empty_fetchmany_value,
-                self.col_count
+                self.col_count,
+                chunk_size,
             )
             if not chunked_fetch and not self.connection.features.can_use_chunked_reads:
                 try:
@@ -1298,14 +1299,13 @@ class SQLAggregateCompiler(SQLCompiler):
     return sql, params


-def cursor_iter(cursor, sentinel, col_count):
+def cursor_iter(cursor, sentinel, col_count, itersize):
     """
     Yield blocks of rows from a cursor and ensure the cursor is closed when
     done.
     """
     try:
-        for rows in iter((lambda: cursor.fetchmany(GET_ITERATOR_CHUNK_SIZE)),
-                         sentinel):
+        for rows in iter((lambda: cursor.fetchmany(itersize)), sentinel):
             yield [r[0:col_count] for r in rows]
     finally:
         cursor.close()
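Editor's note: the loop relies on the two-argument form ``iter(callable, sentinel)``, which keeps calling ``cursor.fetchmany(itersize)`` until the sentinel is returned. A self-contained sketch of the same pattern, independent of Django (the sentinel value varies by driver; Django reads it from ``connection.features.empty_fetchmany_value``)::

    def fetch_in_chunks(cursor, itersize=2000, sentinel=None):
        # Most DB-API 2.0 drivers signal exhaustion with an empty list.
        sentinel = [] if sentinel is None else sentinel
        # iter(callable, sentinel) calls the lambda repeatedly and stops
        # as soon as its return value equals the sentinel.
        for rows in iter(lambda: cursor.fetchmany(itersize), sentinel):
            yield from rows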

docs/ref/models/querysets.txt

@@ -2004,7 +2004,7 @@ If you pass ``in_bulk()`` an empty list, you'll get an empty dictionary.
 ``iterator()``
 ~~~~~~~~~~~~~~

-.. method:: iterator()
+.. method:: iterator(chunk_size=2000)

 Evaluates the ``QuerySet`` (by performing the query) and returns an iterator
 (see :pep:`234`) over the results. A ``QuerySet`` typically caches its results
@@ -2033,6 +2033,11 @@ set into memory.
 The Oracle database driver always uses server-side cursors.

+With server-side cursors, the ``chunk_size`` parameter specifies the number of
+results to cache at the database driver level. Fetching bigger chunks
+diminishes the number of round trips between the database driver and the
+database, at the expense of memory.
+
 On PostgreSQL, server-side cursors will only be used when the
 :setting:`DISABLE_SERVER_SIDE_CURSORS <DATABASE-DISABLE_SERVER_SIDE_CURSORS>`
 setting is ``False``. Read :ref:`transaction-pooling-server-side-cursors` if
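Editor's note: that setting is a per-database key in ``DATABASES``; a minimal sketch with illustrative values::

    DATABASES = {
        'default': {
            'ENGINE': 'django.db.backends.postgresql',
            'NAME': 'mydb',
            # Force client-side cursors, e.g. when connections pass
            # through a transaction pooler such as pgBouncer.
            'DISABLE_SERVER_SIDE_CURSORS': True,
        },
    }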
@@ -2048,10 +2053,25 @@ drivers load the entire result set into memory. The result set is then
 transformed into Python row objects by the database adapter using the
 ``fetchmany()`` method defined in :pep:`249`.

+The ``chunk_size`` parameter controls the size of batches Django retrieves from
+the database driver. Larger batches decrease the overhead of communicating with
+the database driver at the expense of a slight increase in memory consumption.
+
+The default value of ``chunk_size``, 2000, comes from `a calculation on the
+psycopg mailing list <https://www.postgresql.org/message-id/4D2F2C71.8080805%40dndg.it>`_:
+
+    Assuming rows of 10-20 columns with a mix of textual and numeric data, 2000
+    is going to fetch less than 100KB of data, which seems a good compromise
+    between the number of rows transferred and the data discarded if the loop
+    is exited early.
 .. versionchanged:: 1.11

     PostgreSQL support for server-side cursors was added.

+.. versionchanged:: 2.0
+
+    The ``chunk_size`` parameter was added.
+
 ``latest()``
 ~~~~~~~~~~~~
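Editor's note: spelling out the mailing-list estimate quoted above (our arithmetic, not part of the quoted thread): at an assumed average of 50 bytes per row, one chunk comes to roughly 100KB::

    >>> bytes_per_row = 50        # assumed average for 10-20 mixed columns
    >>> 2000 * bytes_per_row      # bytes fetched per chunk
    100000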

docs/releases/2.0.txt

@@ -214,6 +214,11 @@ Models
 .. _`identity columns`: https://docs.oracle.com/database/121/DRDAA/migr_tools_feat.htm#DRDAA109

+* The new ``chunk_size`` parameter of :meth:`.QuerySet.iterator` controls the
+  number of rows fetched by the Python database client when streaming results
+  from the database. For databases that don't support server-side cursors, it
+  controls the number of results Django fetches from the database adapter.
+
 Requests and Responses
 ~~~~~~~~~~~~~~~~~~~~~~
@@ -280,6 +285,13 @@ Database backend API
   attribute with the name of the database that your backend works with. Django
   may use it in various messages, such as in system checks.

+* To improve performance when streaming large result sets from the database,
+  :meth:`.QuerySet.iterator` now fetches 2000 rows at a time instead of 100.
+  The old behavior can be restored using the ``chunk_size`` parameter. For
+  example::
+
+      Book.objects.iterator(chunk_size=100)
+
 Dropped support for Oracle 11.2
 -------------------------------

tests/queries/test_iterator.py (new file)

@@ -0,0 +1,39 @@
+import datetime
+from unittest import mock
+
+from django.db.models.sql.compiler import cursor_iter
+from django.test import TestCase
+
+from .models import Article
+
+
+class QuerySetIteratorTests(TestCase):
+    itersize_index_in_mock_args = 3
+
+    @classmethod
+    def setUpTestData(cls):
+        Article.objects.create(name='Article 1', created=datetime.datetime.now())
+        Article.objects.create(name='Article 2', created=datetime.datetime.now())
+
+    def test_iterator_invalid_chunk_size(self):
+        for size in (0, -1):
+            with self.subTest(size=size):
+                with self.assertRaisesMessage(ValueError, 'Chunk size must be strictly positive.'):
+                    Article.objects.iterator(chunk_size=size)
+
+    def test_default_iterator_chunk_size(self):
+        qs = Article.objects.iterator()
+        with mock.patch('django.db.models.sql.compiler.cursor_iter', side_effect=cursor_iter) as cursor_iter_mock:
+            next(qs)
+        self.assertEqual(cursor_iter_mock.call_count, 1)
+        mock_args, _mock_kwargs = cursor_iter_mock.call_args
+        self.assertEqual(mock_args[self.itersize_index_in_mock_args], 2000)
+
+    def test_iterator_chunk_size(self):
+        batch_size = 3
+        qs = Article.objects.iterator(chunk_size=batch_size)
+        with mock.patch('django.db.models.sql.compiler.cursor_iter', side_effect=cursor_iter) as cursor_iter_mock:
+            next(qs)
+        self.assertEqual(cursor_iter_mock.call_count, 1)
+        mock_args, _mock_kwargs = cursor_iter_mock.call_args
+        self.assertEqual(mock_args[self.itersize_index_in_mock_args], batch_size)
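Editor's note: the tests import ``Article`` from the app's ``models`` module and patch ``cursor_iter`` with ``side_effect=cursor_iter``, i.e. a spy that records the call while delegating to the real function. A model shaped roughly like this would satisfy the tests (a sketch inferred from the fields used above, not the verbatim source)::

    from django.db import models

    class Article(models.Model):
        name = models.CharField(max_length=50)
        created = models.DateTimeField()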