From 70679243d1786e03557c28929f9762a119e3ac14 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Anssi=20K=C3=A4=C3=A4ri=C3=A4inen?=
Date: Thu, 2 Aug 2012 00:09:26 +0300
Subject: [PATCH] Fixed #18702 -- Removed chunked reads from QuerySet iteration

---
 django/db/models/query.py | 149 ++++++++------------------------------
 docs/releases/1.6.txt     |  19 +++++
 tests/queries/tests.py    |  44 ++---------
 3 files changed, 56 insertions(+), 156 deletions(-)

diff --git a/django/db/models/query.py b/django/db/models/query.py
index f2015f57a8..cc07ecbfd0 100644
--- a/django/db/models/query.py
+++ b/django/db/models/query.py
@@ -20,11 +20,6 @@ from django.utils.functional import partition
 from django.utils import six
 from django.utils import timezone
 
-# Used to control how many objects are worked with at once in some cases (e.g.
-# when deleting objects).
-CHUNK_SIZE = 100
-ITER_CHUNK_SIZE = CHUNK_SIZE
-
 # The maximum number of items to display in a QuerySet.__repr__
 REPR_OUTPUT_SIZE = 20
 
@@ -41,7 +36,6 @@ class QuerySet(object):
         self._db = using
         self.query = query or sql.Query(self.model)
         self._result_cache = None
-        self._iter = None
         self._sticky_filter = False
         self._for_write = False
         self._prefetch_related_lookups = []
@@ -57,8 +51,8 @@ class QuerySet(object):
         Deep copy of a QuerySet doesn't populate the cache
         """
         obj = self.__class__()
-        for k,v in self.__dict__.items():
-            if k in ('_iter','_result_cache'):
+        for k, v in self.__dict__.items():
+            if k == '_result_cache':
                 obj.__dict__[k] = None
             else:
                 obj.__dict__[k] = copy.deepcopy(v, memo)
@@ -69,10 +63,8 @@ class QuerySet(object):
         Allows the QuerySet to be pickled.
         """
         # Force the cache to be fully populated.
-        len(self)
-
+        self._fetch_all()
         obj_dict = self.__dict__.copy()
-        obj_dict['_iter'] = None
         return obj_dict
 
     def __repr__(self):
@@ -82,95 +74,31 @@ class QuerySet(object):
         return repr(data)
 
     def __len__(self):
-        # Since __len__ is called quite frequently (for example, as part of
-        # list(qs), we make some effort here to be as efficient as possible
-        # whilst not messing up any existing iterators against the QuerySet.
-        if self._result_cache is None:
-            if self._iter:
-                self._result_cache = list(self._iter)
-            else:
-                self._result_cache = list(self.iterator())
-        elif self._iter:
-            self._result_cache.extend(self._iter)
-        if self._prefetch_related_lookups and not self._prefetch_done:
-            self._prefetch_related_objects()
+        self._fetch_all()
         return len(self._result_cache)
 
     def __iter__(self):
-        if self._prefetch_related_lookups and not self._prefetch_done:
-            # We need all the results in order to be able to do the prefetch
-            # in one go. To minimize code duplication, we use the __len__
-            # code path which also forces this, and also does the prefetch
-            len(self)
-
-        if self._result_cache is None:
-            self._iter = self.iterator()
-            self._result_cache = []
-        if self._iter:
-            return self._result_iter()
-        # Python's list iterator is better than our version when we're just
-        # iterating over the cache.
+        """
+        The queryset iterator protocol uses three nested iterators in the
+        default case:
+            1. sql.compiler.execute_sql()
+               - Returns 100 rows at a time (constants.GET_ITERATOR_CHUNK_SIZE)
+                 using cursor.fetchmany(). This part is responsible for
+                 doing some column masking, and returning the rows in chunks.
+            2. sql.compiler.results_iter()
+               - Returns one row at a time. At this point the rows are still
+                 just tuples. In some cases the return values are converted to
+                 Python values at this location (see resolve_columns(),
+                 resolve_aggregate()).
+            3. self.iterator()
+               - Responsible for turning the rows into model objects.
+        """
+        self._fetch_all()
         return iter(self._result_cache)
 
-    def _result_iter(self):
-        pos = 0
-        while 1:
-            upper = len(self._result_cache)
-            while pos < upper:
-                yield self._result_cache[pos]
-                pos = pos + 1
-            if not self._iter:
-                raise StopIteration
-            if len(self._result_cache) <= pos:
-                self._fill_cache()
-
-    def __bool__(self):
-        if self._prefetch_related_lookups and not self._prefetch_done:
-            # We need all the results in order to be able to do the prefetch
-            # in one go. To minimize code duplication, we use the __len__
-            # code path which also forces this, and also does the prefetch
-            len(self)
-
-        if self._result_cache is not None:
-            return bool(self._result_cache)
-        try:
-            next(iter(self))
-        except StopIteration:
-            return False
-        return True
-
-    def __nonzero__(self):      # Python 2 compatibility
-        return type(self).__bool__(self)
-
-    def __contains__(self, val):
-        # The 'in' operator works without this method, due to __iter__. This
-        # implementation exists only to shortcut the creation of Model
-        # instances, by bailing out early if we find a matching element.
-        pos = 0
-        if self._result_cache is not None:
-            if val in self._result_cache:
-                return True
-            elif self._iter is None:
-                # iterator is exhausted, so we have our answer
-                return False
-            # remember not to check these again:
-            pos = len(self._result_cache)
-        else:
-            # We need to start filling the result cache out. The following
-            # ensures that self._iter is not None and self._result_cache is not
-            # None
-            it = iter(self)
-
-        # Carry on, one result at a time.
-        while True:
-            if len(self._result_cache) <= pos:
-                self._fill_cache(num=1)
-            if self._iter is None:
-                # we ran out of items
-                return False
-            if self._result_cache[pos] == val:
-                return True
-            pos += 1
+    def __bool__(self):
+        self._fetch_all()
+        return bool(self._result_cache)
 
     def __getitem__(self, k):
         """
@@ -184,19 +112,6 @@ class QuerySet(object):
             "Negative indexing is not supported."
 
         if self._result_cache is not None:
-            if self._iter is not None:
-                # The result cache has only been partially populated, so we may
-                # need to fill it out a bit more.
-                if isinstance(k, slice):
-                    if k.stop is not None:
-                        # Some people insist on passing in strings here.
-                        bound = int(k.stop)
-                    else:
-                        bound = None
-                else:
-                    bound = k + 1
-                if len(self._result_cache) < bound:
-                    self._fill_cache(bound - len(self._result_cache))
             return self._result_cache[k]
 
         if isinstance(k, slice):
@@ -370,7 +285,7 @@ class QuerySet(object):
         If the QuerySet is already fully cached this simply returns the length
         of the cached results set to avoid multiple SELECT COUNT(*) calls.
         """
-        if self._result_cache is not None and not self._iter:
+        if self._result_cache is not None:
             return len(self._result_cache)
         return self.query.get_count(using=self.db)
 
@@ -933,17 +848,11 @@ class QuerySet(object):
         c._setup_query()
         return c
 
-    def _fill_cache(self, num=None):
-        """
-        Fills the result cache with 'num' more entries (or until the results
-        iterator is exhausted).
-        """
-        if self._iter:
-            try:
-                for i in range(num or ITER_CHUNK_SIZE):
-                    self._result_cache.append(next(self._iter))
-            except StopIteration:
-                self._iter = None
+    def _fetch_all(self):
+        if self._result_cache is None:
+            self._result_cache = list(self.iterator())
+        if self._prefetch_related_lookups and not self._prefetch_done:
+            self._prefetch_related_objects()
 
     def _next_is_sticky(self):
         """
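The three layers named in the new ``__iter__()`` docstring compose roughly as
in the following sketch. The free functions and ``FakeCursor`` are illustrative
stand-ins chosen to mirror the docstring, not the actual compiler API::

    GET_ITERATOR_CHUNK_SIZE = 100

    class FakeCursor(object):
        # Stands in for a DB-API cursor over pre-baked rows.
        def __init__(self, rows):
            self._rows = list(rows)

        def fetchmany(self, size):
            batch, self._rows = self._rows[:size], self._rows[size:]
            return batch

    def execute_sql(cursor):
        # Layer 1: pull raw rows from the cursor, 100 at a time.
        while True:
            rows = cursor.fetchmany(GET_ITERATOR_CHUNK_SIZE)
            if not rows:
                break
            yield rows

    def results_iter(cursor):
        # Layer 2: flatten the chunks into one row (tuple) at a time.
        for chunk in execute_sql(cursor):
            for row in chunk:
                yield row

    def iterator(cursor, model_cls):
        # Layer 3: turn each row tuple into a model instance.
        for row in results_iter(cursor):
            yield model_cls(*row)

    class Row(object):
        def __init__(self, pk):
            self.pk = pk

    objs = list(iterator(FakeCursor([(i,) for i in range(250)]), Row))
    assert len(objs) == 250

With chunked reads gone, ``_fetch_all()`` simply drains layer 3 into
``_result_cache`` in one pass the first time any consumer needs results.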
diff --git a/docs/releases/1.6.txt b/docs/releases/1.6.txt
index 29b4569d4e..a1a29471a3 100644
--- a/docs/releases/1.6.txt
+++ b/docs/releases/1.6.txt
@@ -524,6 +524,25 @@ non-standard behavior has been preserved but moved to the model form field
 layer and occurs only when the associated widget is
 :class:`~django.forms.SelectMultiple` or a subclass.
 
+QuerySet iteration
+~~~~~~~~~~~~~~~~~~
+
+``QuerySet`` iteration was changed to immediately convert all fetched rows
+to ``Model`` objects. In Django 1.5 and earlier, the fetched rows were
+converted to ``Model`` objects in chunks of 100.
+
+Existing code will keep working, but the number of rows converted to
+objects may change in certain use cases, such as partially looping over a
+queryset, or any use that ends up calling ``__bool__()`` or
+``__contains__()``.
+
+Note that most database backends already fetched all the rows in one go
+in Django 1.5; only the conversion to objects was chunked.
+
+It is still possible to convert the fetched rows to ``Model`` objects
+lazily by using the :meth:`~django.db.models.query.QuerySet.iterator`
+method.
+
 Miscellaneous
 ~~~~~~~~~~~~~
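A sketch of the user-visible difference the release note describes, using the
``Number`` model from the queries test app (assumed here purely for
illustration; any model behaves the same way)::

    qs = Number.objects.all()
    for obj in qs:
        break
    # Django 1.6: the first iteration step runs qs._fetch_all(), so the
    # result cache now holds every row as a Number instance, not just
    # the first chunk of 100 as in 1.5.
    assert len(qs._result_cache) == Number.objects.count()

    # Memory-conscious code can opt back into lazy conversion;
    # iterator() bypasses the result cache entirely.
    for obj in Number.objects.all().iterator():
        pass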
diff --git a/tests/queries/tests.py b/tests/queries/tests.py
index ad5dde34b5..ffaf6435ec 100644
--- a/tests/queries/tests.py
+++ b/tests/queries/tests.py
@@ -9,7 +9,6 @@ from django.conf import settings
 from django.core.exceptions import FieldError
 from django.db import DatabaseError, connection, connections, DEFAULT_DB_ALIAS
 from django.db.models import Count, F, Q
-from django.db.models.query import ITER_CHUNK_SIZE
 from django.db.models.sql.where import WhereNode, EverythingNode, NothingNode
 from django.db.models.sql.datastructures import EmptyResultSet
 from django.test import TestCase, skipUnlessDBFeature
@@ -1211,16 +1210,6 @@ class Queries2Tests(TestCase):
             ordered=False
         )
 
-    def test_ticket7411(self):
-        # Saving to db must work even with partially read result set in another
-        # cursor.
-        for num in range(2 * ITER_CHUNK_SIZE + 1):
-            _ = Number.objects.create(num=num)
-
-        for i, obj in enumerate(Number.objects.all()):
-            obj.save()
-            if i > 10: break
-
     def test_ticket7759(self):
         # Count should work with a partially read result set.
         count = Number.objects.count()
@@ -1700,31 +1689,6 @@ class Queries6Tests(TestCase):
         ann1.notes.add(n1)
         ann2 = Annotation.objects.create(name='a2', tag=t4)
 
-    # This next test used to cause really weird PostgreSQL behavior, but it was
-    # only apparent much later when the full test suite ran.
-    # - Yeah, it leaves global ITER_CHUNK_SIZE to 2 instead of 100...
-    #@unittest.expectedFailure
-    def test_slicing_and_cache_interaction(self):
-        # We can do slicing beyond what is currently in the result cache,
-        # too.
-
-        # We need to mess with the implementation internals a bit here to decrease the
-        # cache fill size so that we don't read all the results at once.
-        from django.db.models import query
-        query.ITER_CHUNK_SIZE = 2
-        qs = Tag.objects.all()
-
-        # Fill the cache with the first chunk.
-        self.assertTrue(bool(qs))
-        self.assertEqual(len(qs._result_cache), 2)
-
-        # Query beyond the end of the cache and check that it is filled out as required.
-        self.assertEqual(repr(qs[4]), '<Tag: t5>')
-        self.assertEqual(len(qs._result_cache), 5)
-
-        # But querying beyond the end of the result set will fail.
-        self.assertRaises(IndexError, lambda: qs[100])
-
     def test_parallel_iterators(self):
         # Test that parallel iterators work.
         qs = Tag.objects.all()
@@ -2533,6 +2497,14 @@ class WhereNodeTest(TestCase):
         w = WhereNode(children=[empty_w, NothingNode()], connector='OR')
         self.assertRaises(EmptyResultSet, w.as_sql, qn, connection)
 
+
+class IteratorExceptionsTest(TestCase):
+    def test_iter_exceptions(self):
+        qs = ExtraInfo.objects.only('author')
+        with self.assertRaises(AttributeError):
+            list(qs)
+
+
 class NullJoinPromotionOrTest(TestCase):
     def setUp(self):
         self.d1 = ModelD.objects.create(name='foo')
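The caching protocol this patch converges on is easy to state: every consumer
funnels through one idempotent cache fill. A minimal standalone sketch of that
pattern, outside of Django (the names mirror the patch; ``row_source`` stands
in for ``self.iterator()``)::

    class CachedResults(object):
        def __init__(self, row_source):
            self._row_source = row_source  # stands in for self.iterator()
            self._result_cache = None

        def _fetch_all(self):
            # Idempotent: the underlying query runs at most once, in full.
            if self._result_cache is None:
                self._result_cache = list(self._row_source())

        def __len__(self):
            self._fetch_all()
            return len(self._result_cache)

        def __iter__(self):
            self._fetch_all()
            return iter(self._result_cache)

        def __bool__(self):
            self._fetch_all()
            return bool(self._result_cache)

        __nonzero__ = __bool__  # Python 2 spelling

    results = CachedResults(lambda: iter(range(3)))
    assert list(results) == [0, 1, 2] and len(results) == 3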