diff --git a/django/db/models/query.py b/django/db/models/query.py index fdf720cb8a..bc9ee1c5f8 100644 --- a/django/db/models/query.py +++ b/django/db/models/query.py @@ -20,7 +20,7 @@ from django.db.models.expressions import F from django.db.models.fields import AutoField from django.db.models.functions import Trunc from django.db.models.query_utils import InvalidQuery, Q -from django.db.models.sql.constants import CURSOR +from django.db.models.sql.constants import CURSOR, GET_ITERATOR_CHUNK_SIZE from django.utils import timezone from django.utils.functional import cached_property, partition from django.utils.version import get_version @@ -33,9 +33,10 @@ EmptyResultSet = sql.EmptyResultSet class BaseIterable: - def __init__(self, queryset, chunked_fetch=False): + def __init__(self, queryset, chunked_fetch=False, chunk_size=GET_ITERATOR_CHUNK_SIZE): self.queryset = queryset self.chunked_fetch = chunked_fetch + self.chunk_size = chunk_size class ModelIterable(BaseIterable): @@ -47,7 +48,7 @@ class ModelIterable(BaseIterable): compiler = queryset.query.get_compiler(using=db) # Execute the query. This will also fill compiler.select, klass_info, # and annotations. - results = compiler.execute_sql(chunked_fetch=self.chunked_fetch) + results = compiler.execute_sql(chunked_fetch=self.chunked_fetch, chunk_size=self.chunk_size) select, klass_info, annotation_col_map = (compiler.select, compiler.klass_info, compiler.annotation_col_map) model_cls = klass_info['model'] @@ -301,13 +302,15 @@ class QuerySet: # METHODS THAT DO DATABASE QUERIES # #################################### - def iterator(self): + def iterator(self, chunk_size=2000): """ An iterator over the results from applying this QuerySet to the database. 
""" + if chunk_size <= 0: + raise ValueError('Chunk size must be strictly positive.') use_chunked_fetch = not connections[self.db].settings_dict.get('DISABLE_SERVER_SIDE_CURSORS') - return iter(self._iterable_class(self, chunked_fetch=use_chunked_fetch)) + return iter(self._iterable_class(self, chunked_fetch=use_chunked_fetch, chunk_size=chunk_size)) def aggregate(self, *args, **kwargs): """ diff --git a/django/db/models/sql/compiler.py b/django/db/models/sql/compiler.py index 5c7b5b7f51..f81861ddba 100644 --- a/django/db/models/sql/compiler.py +++ b/django/db/models/sql/compiler.py @@ -883,7 +883,7 @@ class SQLCompiler: self.query.set_extra_mask(['a']) return bool(self.execute_sql(SINGLE)) - def execute_sql(self, result_type=MULTI, chunked_fetch=False): + def execute_sql(self, result_type=MULTI, chunked_fetch=False, chunk_size=GET_ITERATOR_CHUNK_SIZE): """ Run the query against the database and return the result(s). The return value is a single data item if result_type is SINGLE, or an @@ -937,7 +937,8 @@ class SQLCompiler: result = cursor_iter( cursor, self.connection.features.empty_fetchmany_value, - self.col_count + self.col_count, + chunk_size, ) if not chunked_fetch and not self.connection.features.can_use_chunked_reads: try: @@ -1298,14 +1299,13 @@ class SQLAggregateCompiler(SQLCompiler): return sql, params -def cursor_iter(cursor, sentinel, col_count): +def cursor_iter(cursor, sentinel, col_count, itersize): """ Yield blocks of rows from a cursor and ensure the cursor is closed when done. 
""" try: - for rows in iter((lambda: cursor.fetchmany(GET_ITERATOR_CHUNK_SIZE)), - sentinel): + for rows in iter((lambda: cursor.fetchmany(itersize)), sentinel): yield [r[0:col_count] for r in rows] finally: cursor.close() diff --git a/docs/ref/models/querysets.txt b/docs/ref/models/querysets.txt index cc80069894..48c547f514 100644 --- a/docs/ref/models/querysets.txt +++ b/docs/ref/models/querysets.txt @@ -2004,7 +2004,7 @@ If you pass ``in_bulk()`` an empty list, you'll get an empty dictionary. ``iterator()`` ~~~~~~~~~~~~~~ -.. method:: iterator() +.. method:: iterator(chunk_size=2000) Evaluates the ``QuerySet`` (by performing the query) and returns an iterator (see :pep:`234`) over the results. A ``QuerySet`` typically caches its results @@ -2033,6 +2033,11 @@ set into memory. The Oracle database driver always uses server-side cursors. +With server-side cursors, the ``chunk_size`` parameter specifies the number of +results to cache at the database driver level. Fetching bigger chunks +diminishes the number of round trips between the database driver and the +database, at the expense of memory. + On PostgreSQL, server-side cursors will only be used when the :setting:`DISABLE_SERVER_SIDE_CURSORS ` setting is ``False``. Read :ref:`transaction-pooling-server-side-cursors` if @@ -2048,10 +2053,25 @@ drivers load the entire result set into memory. The result set is then transformed into Python row objects by the database adapter using the ``fetchmany()`` method defined in :pep:`249`. +The ``chunk_size`` parameter controls the size of batches Django retrieves from +the database driver. Larger batches decrease the overhead of communicating with +the database driver at the expense of a slight increase in memory consumption. 
+The default value of ``chunk_size``, 2000, comes from `a calculation on the +psycopg mailing list <https://www.postgresql.org/message-id/4D2F2C71.8080805%40dndg.it>`_:
For + example:: + + Book.objects.iterator(chunk_size=100) + Dropped support for Oracle 11.2 ------------------------------- diff --git a/tests/queries/test_iterator.py b/tests/queries/test_iterator.py new file mode 100644 index 0000000000..56f42c2191 --- /dev/null +++ b/tests/queries/test_iterator.py @@ -0,0 +1,39 @@ +import datetime +from unittest import mock + +from django.db.models.sql.compiler import cursor_iter +from django.test import TestCase + +from .models import Article + + +class QuerySetIteratorTests(TestCase): + itersize_index_in_mock_args = 3 + + @classmethod + def setUpTestData(cls): + Article.objects.create(name='Article 1', created=datetime.datetime.now()) + Article.objects.create(name='Article 2', created=datetime.datetime.now()) + + def test_iterator_invalid_chunk_size(self): + for size in (0, -1): + with self.subTest(size=size): + with self.assertRaisesMessage(ValueError, 'Chunk size must be strictly positive.'): + Article.objects.iterator(chunk_size=size) + + def test_default_iterator_chunk_size(self): + qs = Article.objects.iterator() + with mock.patch('django.db.models.sql.compiler.cursor_iter', side_effect=cursor_iter) as cursor_iter_mock: + next(qs) + self.assertEqual(cursor_iter_mock.call_count, 1) + mock_args, _mock_kwargs = cursor_iter_mock.call_args + self.assertEqual(mock_args[self.itersize_index_in_mock_args], 2000) + + def test_iterator_chunk_size(self): + batch_size = 3 + qs = Article.objects.iterator(chunk_size=batch_size) + with mock.patch('django.db.models.sql.compiler.cursor_iter', side_effect=cursor_iter) as cursor_iter_mock: + next(qs) + self.assertEqual(cursor_iter_mock.call_count, 1) + mock_args, _mock_kwargs = cursor_iter_mock.call_args + self.assertEqual(mock_args[self.itersize_index_in_mock_args], batch_size)