Fixed #18702 -- Removed chunked reads from QuerySet iteration

Anssi Kääriäinen 2012-08-02 00:09:26 +03:00
parent ea9a0857d4
commit 70679243d1
3 changed files with 56 additions and 156 deletions
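
For context, an illustration (not part of the commit itself): the user-visible effect is that any partial read of a queryset now converts every fetched row to a model instance up front, where Django 1.5 converted rows in chunks of 100. A minimal sketch, assuming a hypothetical `Number` model like the one in Django's own query tests:

    # Illustrative only; `myapp.models.Number` is a hypothetical stand-in
    # for any model. The counts reflect this commit and the 1.6 release
    # notes further down.
    from myapp.models import Number

    qs = Number.objects.all()      # suppose the table holds 1000 rows
    for i, obj in enumerate(qs):   # the first next() now runs _fetch_all()
        if i > 10:
            break
    # Django 1.5 and earlier: about ITER_CHUNK_SIZE (100) rows had been
    # turned into Number instances by the time of the break.
    # After this commit: all 1000 fetched rows are already Number instances.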

View File

@@ -20,11 +20,6 @@ from django.utils.functional import partition
 from django.utils import six
 from django.utils import timezone
 
-# Used to control how many objects are worked with at once in some cases (e.g.
-# when deleting objects).
-CHUNK_SIZE = 100
-ITER_CHUNK_SIZE = CHUNK_SIZE
-
 # The maximum number of items to display in a QuerySet.__repr__
 REPR_OUTPUT_SIZE = 20
@@ -41,7 +36,6 @@ class QuerySet(object):
         self._db = using
         self.query = query or sql.Query(self.model)
         self._result_cache = None
-        self._iter = None
         self._sticky_filter = False
         self._for_write = False
         self._prefetch_related_lookups = []
@@ -57,8 +51,8 @@ class QuerySet(object):
         Deep copy of a QuerySet doesn't populate the cache
         """
         obj = self.__class__()
-        for k,v in self.__dict__.items():
-            if k in ('_iter','_result_cache'):
+        for k, v in self.__dict__.items():
+            if k == '_result_cache':
                 obj.__dict__[k] = None
             else:
                 obj.__dict__[k] = copy.deepcopy(v, memo)
@@ -69,10 +63,8 @@ class QuerySet(object):
         Allows the QuerySet to be pickled.
         """
         # Force the cache to be fully populated.
-        len(self)
+        self._fetch_all()
         obj_dict = self.__dict__.copy()
-        obj_dict['_iter'] = None
         return obj_dict
 
     def __repr__(self):
@@ -82,95 +74,31 @@ class QuerySet(object):
         return repr(data)
 
     def __len__(self):
-        # Since __len__ is called quite frequently (for example, as part of
-        # list(qs)), we make some effort here to be as efficient as possible
-        # whilst not messing up any existing iterators against the QuerySet.
-        if self._result_cache is None:
-            if self._iter:
-                self._result_cache = list(self._iter)
-            else:
-                self._result_cache = list(self.iterator())
-        elif self._iter:
-            self._result_cache.extend(self._iter)
-        if self._prefetch_related_lookups and not self._prefetch_done:
-            self._prefetch_related_objects()
+        self._fetch_all()
         return len(self._result_cache)
 
     def __iter__(self):
-        if self._prefetch_related_lookups and not self._prefetch_done:
-            # We need all the results in order to be able to do the prefetch
-            # in one go. To minimize code duplication, we use the __len__
-            # code path which also forces this, and also does the prefetch
-            len(self)
-
-        if self._result_cache is None:
-            self._iter = self.iterator()
-            self._result_cache = []
-        if self._iter:
-            return self._result_iter()
-        # Python's list iterator is better than our version when we're just
-        # iterating over the cache.
+        """
+        The queryset iterator protocol uses three nested iterators in the
+        default case:
+            1. sql/compiler.py: execute_sql()
+               - Returns 100 rows at a time (constants.GET_ITERATOR_CHUNK_SIZE)
+                 using cursor.fetchmany(). This part is responsible for
+                 doing some column masking, and returning the rows in chunks.
+            2. sql/compiler.py: results_iter()
+               - Returns one row at a time. At this point the rows are still
+                 just tuples. In some cases the return values are converted to
+                 Python values at this location (see resolve_columns(),
+                 resolve_aggregate()).
+            3. self.iterator()
+               - Responsible for turning the rows into model objects.
+        """
+        self._fetch_all()
         return iter(self._result_cache)
 
-    def _result_iter(self):
-        pos = 0
-        while 1:
-            upper = len(self._result_cache)
-            while pos < upper:
-                yield self._result_cache[pos]
-                pos = pos + 1
-            if not self._iter:
-                raise StopIteration
-            if len(self._result_cache) <= pos:
-                self._fill_cache()
-
-    def __bool__(self):
-        if self._prefetch_related_lookups and not self._prefetch_done:
-            # We need all the results in order to be able to do the prefetch
-            # in one go. To minimize code duplication, we use the __len__
-            # code path which also forces this, and also does the prefetch
-            len(self)
-        if self._result_cache is not None:
-            return bool(self._result_cache)
-        try:
-            next(iter(self))
-        except StopIteration:
-            return False
-        return True
-
-    def __nonzero__(self):      # Python 2 compatibility
-        return type(self).__bool__(self)
-
-    def __contains__(self, val):
-        # The 'in' operator works without this method, due to __iter__. This
-        # implementation exists only to shortcut the creation of Model
-        # instances, by bailing out early if we find a matching element.
-        pos = 0
-        if self._result_cache is not None:
-            if val in self._result_cache:
-                return True
-            elif self._iter is None:
-                # iterator is exhausted, so we have our answer
-                return False
-            # remember not to check these again:
-            pos = len(self._result_cache)
-        else:
-            # We need to start filling the result cache out. The following
-            # ensures that self._iter is not None and self._result_cache is
-            # not None
-            it = iter(self)
-        # Carry on, one result at a time.
-        while True:
-            if len(self._result_cache) <= pos:
-                self._fill_cache(num=1)
-                if self._iter is None:
-                    # we ran out of items
-                    return False
-            if self._result_cache[pos] == val:
-                return True
-            pos += 1
+    def __nonzero__(self):
+        self._fetch_all()
+        return bool(self._result_cache)
 
     def __getitem__(self, k):
         """
@@ -184,19 +112,6 @@ class QuerySet(object):
                 "Negative indexing is not supported."
 
         if self._result_cache is not None:
-            if self._iter is not None:
-                # The result cache has only been partially populated, so we may
-                # need to fill it out a bit more.
-                if isinstance(k, slice):
-                    if k.stop is not None:
-                        # Some people insist on passing in strings here.
-                        bound = int(k.stop)
-                    else:
-                        bound = None
-                else:
-                    bound = k + 1
-                if len(self._result_cache) < bound:
-                    self._fill_cache(bound - len(self._result_cache))
             return self._result_cache[k]
 
         if isinstance(k, slice):
@@ -370,7 +285,7 @@ class QuerySet(object):
         If the QuerySet is already fully cached this simply returns the length
         of the cached results set to avoid multiple SELECT COUNT(*) calls.
         """
-        if self._result_cache is not None and not self._iter:
+        if self._result_cache is not None:
             return len(self._result_cache)
 
         return self.query.get_count(using=self.db)
@@ -933,17 +848,11 @@ class QuerySet(object):
             c._setup_query()
         return c
 
-    def _fill_cache(self, num=None):
-        """
-        Fills the result cache with 'num' more entries (or until the results
-        iterator is exhausted).
-        """
-        if self._iter:
-            try:
-                for i in range(num or ITER_CHUNK_SIZE):
-                    self._result_cache.append(next(self._iter))
-            except StopIteration:
-                self._iter = None
+    def _fetch_all(self):
+        if self._result_cache is None:
+            self._result_cache = list(self.iterator())
+        if self._prefetch_related_lookups and not self._prefetch_done:
+            self._prefetch_related_objects()
 
     def _next_is_sticky(self):
         """

View File

@@ -524,6 +524,25 @@ non-standard behavior has been preserved but moved to the model form field layer
 and occurs only when the associated widget is
 :class:`~django.forms.SelectMultiple` or a subclass.
 
+QuerySet iteration
+~~~~~~~~~~~~~~~~~~
+
+``QuerySet`` iteration was changed to immediately convert all fetched rows
+to ``Model`` objects. In Django 1.5 and earlier, the fetched rows were
+converted to ``Model`` objects in chunks of 100.
+
+Existing code will keep working, but the number of rows converted to
+objects may change in certain use cases, such as partially looping over a
+queryset or any usage that ends up calling ``__bool__`` or
+``__contains__``.
+
+Notably, most database backends already fetched all the rows in one go in
+Django 1.5.
+
+It is still possible to convert the fetched rows to ``Model`` objects
+lazily by using the :meth:`~django.db.models.query.QuerySet.iterator()`
+method.
+
 Miscellaneous
 ~~~~~~~~~~~~~
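
A usage sketch of the distinction those notes draw (illustrative; `Entry` is a hypothetical model):

    # Default iteration in 1.6: the first touch populates the whole cache.
    entries = Entry.objects.all()
    if entries:              # truth test triggers _fetch_all()
        first = entries[0]   # served from the cache; no extra query

    # iterator() bypasses the cache: rows become Entry instances one at a
    # time, keeping memory flat for large result sets.
    for entry in Entry.objects.all().iterator():
        pass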

View File

@@ -9,7 +9,6 @@ from django.conf import settings
 from django.core.exceptions import FieldError
 from django.db import DatabaseError, connection, connections, DEFAULT_DB_ALIAS
 from django.db.models import Count, F, Q
-from django.db.models.query import ITER_CHUNK_SIZE
 from django.db.models.sql.where import WhereNode, EverythingNode, NothingNode
 from django.db.models.sql.datastructures import EmptyResultSet
 from django.test import TestCase, skipUnlessDBFeature
@@ -1211,16 +1210,6 @@ class Queries2Tests(TestCase):
             ordered=False
         )
 
-    def test_ticket7411(self):
-        # Saving to db must work even with partially read result set in another
-        # cursor.
-        for num in range(2 * ITER_CHUNK_SIZE + 1):
-            _ = Number.objects.create(num=num)
-        for i, obj in enumerate(Number.objects.all()):
-            obj.save()
-            if i > 10: break
-
     def test_ticket7759(self):
         # Count should work with a partially read result set.
         count = Number.objects.count()
@@ -1700,31 +1689,6 @@ class Queries6Tests(TestCase):
         ann1.notes.add(n1)
         ann2 = Annotation.objects.create(name='a2', tag=t4)
 
-    # This next test used to cause really weird PostgreSQL behavior, but it was
-    # only apparent much later when the full test suite ran.
-    # - Yeah, it leaves global ITER_CHUNK_SIZE to 2 instead of 100...
-    #@unittest.expectedFailure
-    def test_slicing_and_cache_interaction(self):
-        # We can do slicing beyond what is currently in the result cache,
-        # too.
-        # We need to mess with the implementation internals a bit here to decrease the
-        # cache fill size so that we don't read all the results at once.
-        from django.db.models import query
-        query.ITER_CHUNK_SIZE = 2
-        qs = Tag.objects.all()
-
-        # Fill the cache with the first chunk.
-        self.assertTrue(bool(qs))
-        self.assertEqual(len(qs._result_cache), 2)
-
-        # Query beyond the end of the cache and check that it is filled out as required.
-        self.assertEqual(repr(qs[4]), '<Tag: t5>')
-        self.assertEqual(len(qs._result_cache), 5)
-
-        # But querying beyond the end of the result set will fail.
-        self.assertRaises(IndexError, lambda: qs[100])
-
     def test_parallel_iterators(self):
         # Test that parallel iterators work.
         qs = Tag.objects.all()
@@ -2533,6 +2497,14 @@ class WhereNodeTest(TestCase):
         w = WhereNode(children=[empty_w, NothingNode()], connector='OR')
         self.assertRaises(EmptyResultSet, w.as_sql, qn, connection)
 
+
+class IteratorExceptionsTest(TestCase):
+    def test_iter_exceptions(self):
+        qs = ExtraInfo.objects.only('author')
+        with self.assertRaises(AttributeError):
+            list(qs)
+
+
 class NullJoinPromotionOrTest(TestCase):
     def setUp(self):
         self.d1 = ModelD.objects.create(name='foo')
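
One consequence worth spelling out (an illustration, not part of the commit): with the bespoke __contains__ removed, the `in` operator falls back to __iter__, so a membership test now populates the full result cache. A sketch using the Tag fixtures from these tests:

    t1 = Tag.objects.get(name='t1')
    qs = Tag.objects.all()
    assert t1 in qs                      # falls back to __iter__ -> _fetch_all()
    assert qs._result_cache is not None  # the whole result set is now cached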