mirror of https://github.com/django/django.git
Fixed #27639 -- Added chunk_size parameter to QuerySet.iterator().
This commit is contained in:
parent
bf50ae8210
commit
edee5a8de6
|
@ -20,7 +20,7 @@ from django.db.models.expressions import F
|
||||||
from django.db.models.fields import AutoField
|
from django.db.models.fields import AutoField
|
||||||
from django.db.models.functions import Trunc
|
from django.db.models.functions import Trunc
|
||||||
from django.db.models.query_utils import InvalidQuery, Q
|
from django.db.models.query_utils import InvalidQuery, Q
|
||||||
from django.db.models.sql.constants import CURSOR
|
from django.db.models.sql.constants import CURSOR, GET_ITERATOR_CHUNK_SIZE
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
from django.utils.functional import cached_property, partition
|
from django.utils.functional import cached_property, partition
|
||||||
from django.utils.version import get_version
|
from django.utils.version import get_version
|
||||||
|
@ -33,9 +33,10 @@ EmptyResultSet = sql.EmptyResultSet
|
||||||
|
|
||||||
|
|
||||||
class BaseIterable:
|
class BaseIterable:
|
||||||
def __init__(self, queryset, chunked_fetch=False):
|
def __init__(self, queryset, chunked_fetch=False, chunk_size=GET_ITERATOR_CHUNK_SIZE):
|
||||||
self.queryset = queryset
|
self.queryset = queryset
|
||||||
self.chunked_fetch = chunked_fetch
|
self.chunked_fetch = chunked_fetch
|
||||||
|
self.chunk_size = chunk_size
|
||||||
|
|
||||||
|
|
||||||
class ModelIterable(BaseIterable):
|
class ModelIterable(BaseIterable):
|
||||||
|
@ -47,7 +48,7 @@ class ModelIterable(BaseIterable):
|
||||||
compiler = queryset.query.get_compiler(using=db)
|
compiler = queryset.query.get_compiler(using=db)
|
||||||
# Execute the query. This will also fill compiler.select, klass_info,
|
# Execute the query. This will also fill compiler.select, klass_info,
|
||||||
# and annotations.
|
# and annotations.
|
||||||
results = compiler.execute_sql(chunked_fetch=self.chunked_fetch)
|
results = compiler.execute_sql(chunked_fetch=self.chunked_fetch, chunk_size=self.chunk_size)
|
||||||
select, klass_info, annotation_col_map = (compiler.select, compiler.klass_info,
|
select, klass_info, annotation_col_map = (compiler.select, compiler.klass_info,
|
||||||
compiler.annotation_col_map)
|
compiler.annotation_col_map)
|
||||||
model_cls = klass_info['model']
|
model_cls = klass_info['model']
|
||||||
|
@ -301,13 +302,15 @@ class QuerySet:
|
||||||
# METHODS THAT DO DATABASE QUERIES #
|
# METHODS THAT DO DATABASE QUERIES #
|
||||||
####################################
|
####################################
|
||||||
|
|
||||||
def iterator(self):
|
def iterator(self, chunk_size=2000):
|
||||||
"""
|
"""
|
||||||
An iterator over the results from applying this QuerySet to the
|
An iterator over the results from applying this QuerySet to the
|
||||||
database.
|
database.
|
||||||
"""
|
"""
|
||||||
|
if chunk_size <= 0:
|
||||||
|
raise ValueError('Chunk size must be strictly positive.')
|
||||||
use_chunked_fetch = not connections[self.db].settings_dict.get('DISABLE_SERVER_SIDE_CURSORS')
|
use_chunked_fetch = not connections[self.db].settings_dict.get('DISABLE_SERVER_SIDE_CURSORS')
|
||||||
return iter(self._iterable_class(self, chunked_fetch=use_chunked_fetch))
|
return iter(self._iterable_class(self, chunked_fetch=use_chunked_fetch, chunk_size=chunk_size))
|
||||||
|
|
||||||
def aggregate(self, *args, **kwargs):
|
def aggregate(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -883,7 +883,7 @@ class SQLCompiler:
|
||||||
self.query.set_extra_mask(['a'])
|
self.query.set_extra_mask(['a'])
|
||||||
return bool(self.execute_sql(SINGLE))
|
return bool(self.execute_sql(SINGLE))
|
||||||
|
|
||||||
def execute_sql(self, result_type=MULTI, chunked_fetch=False):
|
def execute_sql(self, result_type=MULTI, chunked_fetch=False, chunk_size=GET_ITERATOR_CHUNK_SIZE):
|
||||||
"""
|
"""
|
||||||
Run the query against the database and return the result(s). The
|
Run the query against the database and return the result(s). The
|
||||||
return value is a single data item if result_type is SINGLE, or an
|
return value is a single data item if result_type is SINGLE, or an
|
||||||
|
@ -937,7 +937,8 @@ class SQLCompiler:
|
||||||
|
|
||||||
result = cursor_iter(
|
result = cursor_iter(
|
||||||
cursor, self.connection.features.empty_fetchmany_value,
|
cursor, self.connection.features.empty_fetchmany_value,
|
||||||
self.col_count
|
self.col_count,
|
||||||
|
chunk_size,
|
||||||
)
|
)
|
||||||
if not chunked_fetch and not self.connection.features.can_use_chunked_reads:
|
if not chunked_fetch and not self.connection.features.can_use_chunked_reads:
|
||||||
try:
|
try:
|
||||||
|
@ -1298,14 +1299,13 @@ class SQLAggregateCompiler(SQLCompiler):
|
||||||
return sql, params
|
return sql, params
|
||||||
|
|
||||||
|
|
||||||
def cursor_iter(cursor, sentinel, col_count):
|
def cursor_iter(cursor, sentinel, col_count, itersize):
|
||||||
"""
|
"""
|
||||||
Yield blocks of rows from a cursor and ensure the cursor is closed when
|
Yield blocks of rows from a cursor and ensure the cursor is closed when
|
||||||
done.
|
done.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
for rows in iter((lambda: cursor.fetchmany(GET_ITERATOR_CHUNK_SIZE)),
|
for rows in iter((lambda: cursor.fetchmany(itersize)), sentinel):
|
||||||
sentinel):
|
|
||||||
yield [r[0:col_count] for r in rows]
|
yield [r[0:col_count] for r in rows]
|
||||||
finally:
|
finally:
|
||||||
cursor.close()
|
cursor.close()
|
||||||
|
|
|
@ -2004,7 +2004,7 @@ If you pass ``in_bulk()`` an empty list, you'll get an empty dictionary.
|
||||||
``iterator()``
|
``iterator()``
|
||||||
~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. method:: iterator()
|
.. method:: iterator(chunk_size=2000)
|
||||||
|
|
||||||
Evaluates the ``QuerySet`` (by performing the query) and returns an iterator
|
Evaluates the ``QuerySet`` (by performing the query) and returns an iterator
|
||||||
(see :pep:`234`) over the results. A ``QuerySet`` typically caches its results
|
(see :pep:`234`) over the results. A ``QuerySet`` typically caches its results
|
||||||
|
@ -2033,6 +2033,11 @@ set into memory.
|
||||||
|
|
||||||
The Oracle database driver always uses server-side cursors.
|
The Oracle database driver always uses server-side cursors.
|
||||||
|
|
||||||
|
With server-side cursors, the ``chunk_size`` parameter specifies the number of
|
||||||
|
results to cache at the database driver level. Fetching bigger chunks
|
||||||
|
diminishes the number of round trips between the database driver and the
|
||||||
|
database, at the expense of memory.
|
||||||
|
|
||||||
On PostgreSQL, server-side cursors will only be used when the
|
On PostgreSQL, server-side cursors will only be used when the
|
||||||
:setting:`DISABLE_SERVER_SIDE_CURSORS <DATABASE-DISABLE_SERVER_SIDE_CURSORS>`
|
:setting:`DISABLE_SERVER_SIDE_CURSORS <DATABASE-DISABLE_SERVER_SIDE_CURSORS>`
|
||||||
setting is ``False``. Read :ref:`transaction-pooling-server-side-cursors` if
|
setting is ``False``. Read :ref:`transaction-pooling-server-side-cursors` if
|
||||||
|
@ -2048,10 +2053,25 @@ drivers load the entire result set into memory. The result set is then
|
||||||
transformed into Python row objects by the database adapter using the
|
transformed into Python row objects by the database adapter using the
|
||||||
``fetchmany()`` method defined in :pep:`249`.
|
``fetchmany()`` method defined in :pep:`249`.
|
||||||
|
|
||||||
|
The ``chunk_size`` parameter controls the size of batches Django retrieves from
|
||||||
|
the database driver. Larger batches decrease the overhead of communicating with
|
||||||
|
the database driver at the expense of a slight increase in memory consumption.
|
||||||
|
|
||||||
|
The default value of ``chunk_size``, 2000, comes from `a calculation on the
|
||||||
|
psycopg mailing list <https://www.postgresql.org/message-id/4D2F2C71.8080805%40dndg.it>`_:
|
||||||
|
|
||||||
|
Assuming rows of 10-20 columns with a mix of textual and numeric data, 2000
|
||||||
|
is going to fetch less than 100KB of data, which seems a good compromise
|
||||||
|
between the number of rows transferred and the data discarded if the loop
|
||||||
|
is exited early.
|
||||||
|
|
||||||
.. versionchanged:: 1.11
|
.. versionchanged:: 1.11
|
||||||
|
|
||||||
PostgreSQL support for server-side cursors was added.
|
PostgreSQL support for server-side cursors was added.
|
||||||
|
|
||||||
|
.. versionchanged:: 2.0
|
||||||
|
|
||||||
|
The ``chunk_size`` parameter was added.
|
||||||
|
|
||||||
``latest()``
|
``latest()``
|
||||||
~~~~~~~~~~~~
|
~~~~~~~~~~~~
|
||||||
|
|
|
@ -214,6 +214,11 @@ Models
|
||||||
|
|
||||||
.. _`identity columns`: https://docs.oracle.com/database/121/DRDAA/migr_tools_feat.htm#DRDAA109
|
.. _`identity columns`: https://docs.oracle.com/database/121/DRDAA/migr_tools_feat.htm#DRDAA109
|
||||||
|
|
||||||
|
* The new ``chunk_size`` parameter of :meth:`.QuerySet.iterator` controls the
|
||||||
|
number of rows fetched by the Python database client when streaming results
|
||||||
|
from the database. For databases that don't support server-side cursors, it
|
||||||
|
controls the number of results Django fetches from the database adapter.
|
||||||
|
|
||||||
Requests and Responses
|
Requests and Responses
|
||||||
~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
@ -280,6 +285,13 @@ Database backend API
|
||||||
attribute with the name of the database that your backend works with. Django
|
attribute with the name of the database that your backend works with. Django
|
||||||
may use it in various messages, such as in system checks.
|
may use it in various messages, such as in system checks.
|
||||||
|
|
||||||
|
* To improve performance when streaming large result sets from the database,
|
||||||
|
:meth:`.QuerySet.iterator` now fetches 2000 rows at a time instead of 100.
|
||||||
|
The old behavior can be restored using the ``chunk_size`` parameter. For
|
||||||
|
example::
|
||||||
|
|
||||||
|
Book.objects.iterator(chunk_size=100)
|
||||||
|
|
||||||
Dropped support for Oracle 11.2
|
Dropped support for Oracle 11.2
|
||||||
-------------------------------
|
-------------------------------
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,39 @@
|
||||||
|
import datetime
|
||||||
|
from unittest import mock
|
||||||
|
|
||||||
|
from django.db.models.sql.compiler import cursor_iter
|
||||||
|
from django.test import TestCase
|
||||||
|
|
||||||
|
from .models import Article
|
||||||
|
|
||||||
|
|
||||||
|
class QuerySetIteratorTests(TestCase):
|
||||||
|
itersize_index_in_mock_args = 3
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def setUpTestData(cls):
|
||||||
|
Article.objects.create(name='Article 1', created=datetime.datetime.now())
|
||||||
|
Article.objects.create(name='Article 2', created=datetime.datetime.now())
|
||||||
|
|
||||||
|
def test_iterator_invalid_chunk_size(self):
|
||||||
|
for size in (0, -1):
|
||||||
|
with self.subTest(size=size):
|
||||||
|
with self.assertRaisesMessage(ValueError, 'Chunk size must be strictly positive.'):
|
||||||
|
Article.objects.iterator(chunk_size=size)
|
||||||
|
|
||||||
|
def test_default_iterator_chunk_size(self):
|
||||||
|
qs = Article.objects.iterator()
|
||||||
|
with mock.patch('django.db.models.sql.compiler.cursor_iter', side_effect=cursor_iter) as cursor_iter_mock:
|
||||||
|
next(qs)
|
||||||
|
self.assertEqual(cursor_iter_mock.call_count, 1)
|
||||||
|
mock_args, _mock_kwargs = cursor_iter_mock.call_args
|
||||||
|
self.assertEqual(mock_args[self.itersize_index_in_mock_args], 2000)
|
||||||
|
|
||||||
|
def test_iterator_chunk_size(self):
|
||||||
|
batch_size = 3
|
||||||
|
qs = Article.objects.iterator(chunk_size=batch_size)
|
||||||
|
with mock.patch('django.db.models.sql.compiler.cursor_iter', side_effect=cursor_iter) as cursor_iter_mock:
|
||||||
|
next(qs)
|
||||||
|
self.assertEqual(cursor_iter_mock.call_count, 1)
|
||||||
|
mock_args, _mock_kwargs = cursor_iter_mock.call_args
|
||||||
|
self.assertEqual(mock_args[self.itersize_index_in_mock_args], batch_size)
|
Loading…
Reference in New Issue