From 236ebe94bfe24d394d5b49f4405da445550e8aa6 Mon Sep 17 00:00:00 2001 From: Matthew Schinckel Date: Wed, 20 Apr 2016 16:26:51 +0930 Subject: [PATCH] Fixed #27149 -- Added Subquery and Exists database expressions. Thanks Josh Smeaton for Oracle fixes. --- django/db/backends/oracle/compiler.py | 15 ++- django/db/models/__init__.py | 6 +- django/db/models/expressions.py | 149 ++++++++++++++++++++++ docs/ref/models/expressions.txt | 172 ++++++++++++++++++++++++++ docs/releases/1.11.txt | 8 ++ tests/expressions/tests.py | 140 ++++++++++++++++++++- 6 files changed, 479 insertions(+), 11 deletions(-) diff --git a/django/db/backends/oracle/compiler.py b/django/db/backends/oracle/compiler.py index 3ae567669f..9aa4acc0fe 100644 --- a/django/db/backends/oracle/compiler.py +++ b/django/db/backends/oracle/compiler.py @@ -31,10 +31,17 @@ class SQLCompiler(compiler.SQLCompiler): high_where = '' if self.query.high_mark is not None: high_where = 'WHERE ROWNUM <= %d' % (self.query.high_mark,) - sql = ( - 'SELECT * FROM (SELECT "_SUB".*, ROWNUM AS "_RN" FROM (%s) ' - '"_SUB" %s) WHERE "_RN" > %d' % (sql, high_where, self.query.low_mark) - ) + + if self.query.low_mark: + sql = ( + 'SELECT * FROM (SELECT "_SUB".*, ROWNUM AS "_RN" FROM (%s) ' + '"_SUB" %s) WHERE "_RN" > %d' % (sql, high_where, self.query.low_mark) + ) + else: + # Simplify the query to support subqueries if there's no offset. + sql = ( + 'SELECT * FROM (SELECT "_SUB".* FROM (%s) "_SUB" %s)' % (sql, high_where) + ) return sql, params diff --git a/django/db/models/__init__.py b/django/db/models/__init__.py index 225436231c..8ab11b098a 100644 --- a/django/db/models/__init__.py +++ b/django/db/models/__init__.py @@ -6,7 +6,8 @@ from django.db.models.deletion import ( CASCADE, DO_NOTHING, PROTECT, SET, SET_DEFAULT, SET_NULL, ProtectedError, ) from django.db.models.expressions import ( - Case, Expression, ExpressionWrapper, F, Func, Value, When, + Case, Exists, Expression, ExpressionWrapper, F, Func, OuterRef, Subquery, + Value, When, ) from django.db.models.fields import * # NOQA from django.db.models.fields import __all__ as fields_all @@ -62,7 +63,8 @@ __all__ += [ 'ObjectDoesNotExist', 'signals', 'CASCADE', 'DO_NOTHING', 'PROTECT', 'SET', 'SET_DEFAULT', 'SET_NULL', 'ProtectedError', - 'Case', 'Expression', 'ExpressionWrapper', 'F', 'Func', 'Value', 'When', + 'Case', 'Exists', 'Expression', 'ExpressionWrapper', 'F', 'Func', + 'OuterRef', 'Subquery', 'Value', 'When', 'FileField', 'ImageField', 'OrderWrt', 'Lookup', 'Transform', 'Manager', 'Prefetch', 'Q', 'QuerySet', 'prefetch_related_objects', 'DEFERRED', 'Model', 'ForeignKey', 'ForeignObject', 'OneToOneField', 'ManyToManyField', diff --git a/django/db/models/expressions.py b/django/db/models/expressions.py index 1ff4cd7735..36c2b969db 100644 --- a/django/db/models/expressions.py +++ b/django/db/models/expressions.py @@ -477,6 +477,33 @@ class F(Combinable): return OrderBy(self, descending=True, **kwargs) +class ResolvedOuterRef(F): + """ + An object that contains a reference to an outer query. + + In this case, the reference to the outer query has been resolved because + the inner query has been used as a subquery. + """ + def as_sql(self, *args, **kwargs): + raise ValueError( + 'This queryset contains a reference to an outer query and may ' + 'only be used in a subquery.' + ) + + def _prepare(self, output_field=None): + return self + + +class OuterRef(F): + def resolve_expression(self, query=None, allow_joins=True, reuse=None, summarize=False, for_save=False): + if isinstance(self.name, self.__class__): + return self.name + return ResolvedOuterRef(self.name) + + def _prepare(self, output_field=None): + return self + + class Func(Expression): """ An SQL function call. @@ -873,6 +900,128 @@ class Case(Expression): return sql, sql_params +class Subquery(Expression): + """ + An explicit subquery. It may contain OuterRef() references to the outer + query which will be resolved when it is applied to that query. + """ + template = '(%(subquery)s)' + + def __init__(self, queryset, output_field=None, **extra): + self.queryset = queryset + self.extra = extra + if output_field is None and len(self.queryset.query.select) == 1: + output_field = self.queryset.query.select[0].field + super(Subquery, self).__init__(output_field) + + def copy(self): + clone = super(Subquery, self).copy() + clone.queryset = clone.queryset.all() + return clone + + def resolve_expression(self, query=None, allow_joins=True, reuse=None, summarize=False, for_save=False): + clone = self.copy() + clone.is_summary = summarize + clone.queryset.query.bump_prefix(query) + + # Need to recursively resolve these. + def resolve_all(child): + if hasattr(child, 'children'): + [resolve_all(_child) for _child in child.children] + if hasattr(child, 'rhs'): + child.rhs = resolve(child.rhs) + + def resolve(child): + if hasattr(child, 'resolve_expression'): + return child.resolve_expression( + query=query, allow_joins=allow_joins, reuse=reuse, + summarize=summarize, for_save=for_save, + ) + return child + + resolve_all(clone.queryset.query.where) + + for key, value in clone.queryset.query.annotations.items(): + if isinstance(value, Subquery): + clone.queryset.query.annotations[key] = resolve(value) + + return clone + + def get_source_expressions(self): + return [ + x for x in [ + getattr(expr, 'lhs', None) + for expr in self.queryset.query.where.children + ] if x + ] + + def relabeled_clone(self, change_map): + clone = self.copy() + clone.queryset.query = clone.queryset.query.relabeled_clone(change_map) + clone.queryset.query.external_aliases.update( + alias for alias in change_map.values() + if alias not in clone.queryset.query.tables + ) + return clone + + def as_sql(self, compiler, connection, template=None, **extra_context): + connection.ops.check_expression_support(self) + template_params = self.extra.copy() + template_params.update(extra_context) + template_params['subquery'], sql_params = self.queryset.query.get_compiler(connection=connection).as_sql() + + template = template or template_params.get('template', self.template) + sql = template % template_params + sql = connection.ops.unification_cast_sql(self.output_field) % sql + return sql, sql_params + + def _prepare(self, output_field): + # This method will only be called if this instance is the "rhs" in an + # expression: the wrapping () must be removed (as the expression that + # contains this will provide them). SQLite evaluates ((subquery)) + # differently than the other databases. + if self.template == '(%(subquery)s)': + clone = self.copy() + clone.template = '%(subquery)s' + return clone + return self + + +class Exists(Subquery): + template = 'EXISTS(%(subquery)s)' + + def __init__(self, *args, **kwargs): + self.negated = kwargs.pop('negated', False) + super(Exists, self).__init__(*args, **kwargs) + + def __invert__(self): + return type(self)(self.queryset, self.output_field, negated=(not self.negated), **self.extra) + + @property + def output_field(self): + return fields.BooleanField() + + def resolve_expression(self, query=None, **kwargs): + # As a performance optimization, remove ordering since EXISTS doesn't + # care about it, just whether or not a row matches. + self.queryset = self.queryset.order_by() + return super(Exists, self).resolve_expression(query, **kwargs) + + def as_sql(self, compiler, connection, template=None, **extra_context): + sql, params = super(Exists, self).as_sql(compiler, connection, template, **extra_context) + if self.negated: + sql = 'NOT {}'.format(sql) + return sql, params + + def as_oracle(self, compiler, connection, template=None, **extra_context): + # Oracle doesn't allow EXISTS() in the SELECT list, so wrap it with a + # CASE WHEN expression. Change the template since the When expression + # requires a left hand side (column) to compare against. + sql, params = self.as_sql(compiler, connection, template, **extra_context) + sql = 'CASE WHEN {} THEN 1 ELSE 0 END'.format(sql) + return sql, params + + class OrderBy(BaseExpression): template = '%(expression)s %(ordering)s' diff --git a/docs/ref/models/expressions.txt b/docs/ref/models/expressions.txt index e46df22f98..01db103758 100644 --- a/docs/ref/models/expressions.txt +++ b/docs/ref/models/expressions.txt @@ -450,6 +450,178 @@ Conditional expressions allow you to use :keyword:`if` ... :keyword:`elif` ... :keyword:`else` logic in queries. Django natively supports SQL ``CASE`` expressions. For more details see :doc:`conditional-expressions`. +``Subquery()`` expressions +-------------------------- + +.. class:: Subquery(queryset, output_field=None) + +.. versionadded:: 1.11 + +You can add an explicit subquery to a ``QuerySet`` using the ``Subquery`` +expression. + +For example, to annotate each post with the email address of the author of the +newest comment on that post:: + + >>> from django.db.models import OuterRef, Subquery + >>> newest = Comment.objects.filter(post=OuterRef('pk')).order_by('-created_at') + >>> Post.objects.annotate(newest_commenter_email=Subquery(newest.values('email')[:1])) + +On PostgreSQL, the SQL looks like: + +.. code-block:: sql + + SELECT "post"."id", ( + SELECT U0."email" + FROM "comment" U0 + WHERE U0."post_id" = ("post"."id") + ORDER BY U0."created_at" DESC LIMIT 1 + ) AS "newest_commenter_email" FROM "post" + +.. note:: + + The examples in this section are designed to show how to force + Django to execute a subquery. In some cases it may be possible to + write an equivalent queryset that performs the same task more + clearly or efficiently. + +Referencing columns from the outer queryset +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. class:: OuterRef(field) + +.. versionadded:: 1.11 + +Use ``OuterRef`` when a queryset in a ``Subquery`` needs to refer to a field +from the outer query. It acts like an :class:`F` expression except that the +check to see if it refers to a valid field isn't made until the outer queryset +is resolved. + +Instances of ``OuterRef`` may be used in conjunction with nested instances +of ``Subquery`` to refer to a containing queryset that isn't the immediate +parent. For example, this queryset would need to be within a nested pair of +``Subquery`` instances to resolve correctly:: + + >>> Book.objects.filter(author=OuterRef(OuterRef('pk'))) + +Limiting a subquery to a single column +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There are times when a single column must be returned from a ``Subquery``, for +instance, to use a ``Subquery`` as the target of an ``__in`` lookup. To return +all comments for posts published within the last day:: + + >>> from datetime import timedelta + >>> from django.utils import timezone + >>> one_day_ago = timezone.now() - timedelta(days=1) + >>> posts = Post.objects.filter(published_at__gte=one_day_ago) + >>> Comment.objects.filter(post__in=Subquery(posts.values('pk'))) + +In this case, the subquery must use :meth:`~.QuerySet.values` +to return only a single column: the primary key of the post. + +Limiting the subquery to a single row +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To prevent a subquery from returning multiple rows, a slice (``[:1]``) of the +queryset is used:: + + >>> subquery = Subquery(newest.values('email')[:1]) + >>> Post.objects.annotate(newest_commenter_email=subquery) + +In this case, the subquery must only return a single column *and* a single +row: the email address of the most recently created comment. + +(Using :meth:`~.QuerySet.get` instead of a slice would fail because the +``OuterRef`` cannot be resolved until the queryset is used within a +``Subquery``.) + +``Exists()`` subqueries +~~~~~~~~~~~~~~~~~~~~~~~ + +.. class:: Exists(queryset) + +.. versionadded:: 1.11 + +``Exists`` is a ``Subquery`` subclass that uses an SQL ``EXISTS`` statement. In +many cases it will perform better than a subquery since the database is able to +stop evaluation of the subquery when a first matching row is found. + +For example, to annotate each post with whether or not it has a comment from +within the last day:: + + >>> from django.db.models import Exists, OuterRef + >>> from datetime import timedelta + >>> from django.utils import timezone + >>> one_day_ago = timezone.now() - timedelta(days=1) + >>> recent_comments = Comment.objects.filter( + ... post=OuterRef('pk'), + ... created_at__gte=one_day_ago, + ... ) + >>> Post.objects.annotate(recent_comment=Exists(recent_comments) + +On PostgreSQL, the SQL looks like: + +.. code-block:: sql + + SELECT "post"."id", "post"."published_at", EXISTS( + SELECT U0."id", U0."post_id", U0."email", U0."created_at" + FROM "comment" U0 + WHERE ( + U0."created_at" >= YYYY-MM-DD HH:MM:SS AND + U0."post_id" = ("post"."id") + ) + ) AS "recent_comment" FROM "post" + +It's unnecessary to force ``Exists`` to refer to a single column, since the +columns are discarded and a boolean result is returned. Similarly, since +ordering is unimportant within an SQL ``EXISTS`` subquery and would only +degrade performance, it's automatically removed. + +You can query using ``NOT EXISTS`` with ``~Exists()``. + +Filtering on a ``Subquery`` expression +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +It's not possible to filter directly using ``Subquery`` and ``Exists``, e.g.:: + + >>> Post.objects.filter(Exists(recent_comments)) + ... + TypeError: 'Exists' object is not iterable + + +You must filter on a subquery expression by first annotating the queryset +and then filtering based on that annotation:: + + >>> Post.objects.annotate( + ... recent_comment=Exists(recent_comments), + ... ).filter(recent_comment=True) + +Using aggregates within a ``Subquery`` expression +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Aggregates may be used within a ``Subquery``, but they require a specific +combination of :meth:`~.QuerySet.filter`, :meth:`~.QuerySet.values`, and +:meth:`~.QuerySet.annotate` to get the subquery grouping correct. + +Assuming both models have a ``length`` field, to find posts where the post +length is greater than the total length of all combined comments:: + + >>> from django.db.models import OuterRef, Subquery, Sum + >>> comments = Comment.objects.filter(post=OuterRef('pk')).values('post') + >>> total_comments = comments.annotate(total=Sum('length')).values('total') + >>> Post.objects.filter(length__gt=Subquery(total_comments)) + +The initial ``filter(...)`` limits the subquery to the relevant parameters. +``values('post')`` aggregates comments by ``Post``. Finally, ``annotate(...)`` +performs the aggregation. The order in which these queryset methods are applied +is important. In this case, since the subquery must be limited to a single +column, ``values('total')`` is required. + +This is the only way to perform an aggregation within a ``Subquery``, as +using :meth:`~.QuerySet.aggregate` attempts to evaluate the queryset (and if +there is an ``OuterRef``, this will not be possible to resolve). + Raw SQL expressions ------------------- diff --git a/docs/releases/1.11.txt b/docs/releases/1.11.txt index edbd2b0be3..e5b0efffef 100644 --- a/docs/releases/1.11.txt +++ b/docs/releases/1.11.txt @@ -70,6 +70,14 @@ template system rather than in Python. See :doc:`/ref/forms/renderers`. You may need to adjust any custom widgets that you've written for a few :ref:`backwards incompatible changes `. +``Subquery`` expressions +------------------------ + +The new :class:`~django.db.models.Subquery` and +:class:`~django.db.models.Exists` database expressions allow creating +explicit subqueries. Subqueries may refer to fields from the outer queryset +using the :class:`~django.db.models.OuterRef` class. + Minor features -------------- diff --git a/tests/expressions/tests.py b/tests/expressions/tests.py index 18d003d57d..8399e3c0a9 100644 --- a/tests/expressions/tests.py +++ b/tests/expressions/tests.py @@ -12,8 +12,8 @@ from django.db.models.aggregates import ( Avg, Count, Max, Min, StdDev, Sum, Variance, ) from django.db.models.expressions import ( - Case, Col, ExpressionWrapper, F, Func, OrderBy, Random, RawSQL, Ref, Value, - When, + Case, Col, Exists, ExpressionWrapper, F, Func, OrderBy, OuterRef, Random, + RawSQL, Ref, Subquery, Value, When, ) from django.db.models.functions import ( Coalesce, Concat, Length, Lower, Substr, Upper, @@ -32,15 +32,15 @@ from .models import ( class BasicExpressionsTests(TestCase): @classmethod def setUpTestData(cls): - Company.objects.create( + cls.example_inc = Company.objects.create( name="Example Inc.", num_employees=2300, num_chairs=5, ceo=Employee.objects.create(firstname="Joe", lastname="Smith", salary=10) ) - Company.objects.create( + cls.foobar_ltd = Company.objects.create( name="Foobar Ltd.", num_employees=3, num_chairs=4, ceo=Employee.objects.create(firstname="Frank", lastname="Meyer", salary=20) ) - Company.objects.create( + cls.gmbh = Company.objects.create( name="Test GmbH", num_employees=32, num_chairs=1, ceo=Employee.objects.create(firstname="Max", lastname="Mustermann", salary=30) ) @@ -387,6 +387,136 @@ class BasicExpressionsTests(TestCase): ) self.assertEqual(str(qs.query).count('JOIN'), 2) + def test_outerref(self): + inner = Company.objects.filter(point_of_contact=OuterRef('pk')) + msg = ( + 'This queryset contains a reference to an outer query and may only ' + 'be used in a subquery.' + ) + with self.assertRaisesMessage(ValueError, msg): + inner.exists() + + outer = Employee.objects.annotate(is_point_of_contact=Exists(inner)) + self.assertIs(outer.exists(), True) + + def test_subquery(self): + Company.objects.filter(name='Example Inc.').update( + point_of_contact=Employee.objects.get(firstname='Joe', lastname='Smith'), + ceo=Employee.objects.get(firstname='Max', lastname='Mustermann'), + ) + Employee.objects.create(firstname='Bob', lastname='Brown', salary=40) + qs = Employee.objects.annotate( + is_point_of_contact=Exists(Company.objects.filter(point_of_contact=OuterRef('pk'))), + is_not_point_of_contact=~Exists(Company.objects.filter(point_of_contact=OuterRef('pk'))), + is_ceo_of_small_company=Exists(Company.objects.filter(num_employees__lt=200, ceo=OuterRef('pk'))), + is_ceo_small_2=~~Exists(Company.objects.filter(num_employees__lt=200, ceo=OuterRef('pk'))), + largest_company=Subquery(Company.objects.order_by('-num_employees').filter( + models.Q(ceo=OuterRef('pk')) | models.Q(point_of_contact=OuterRef('pk')) + ).values('name')[:1], output_field=models.CharField()) + ).values( + 'firstname', + 'is_point_of_contact', + 'is_not_point_of_contact', + 'is_ceo_of_small_company', + 'is_ceo_small_2', + 'largest_company', + ).order_by('firstname') + + results = list(qs) + # Could use Coalesce(subq, Value('')) instead except for the bug in + # cx_Oracle mentioned in #23843. + bob = results[0] + if bob['largest_company'] == '' and connection.features.interprets_empty_strings_as_nulls: + bob['largest_company'] = None + + self.assertEqual(results, [ + { + 'firstname': 'Bob', + 'is_point_of_contact': False, + 'is_not_point_of_contact': True, + 'is_ceo_of_small_company': False, + 'is_ceo_small_2': False, + 'largest_company': None, + }, + { + 'firstname': 'Frank', + 'is_point_of_contact': False, + 'is_not_point_of_contact': True, + 'is_ceo_of_small_company': True, + 'is_ceo_small_2': True, + 'largest_company': 'Foobar Ltd.', + }, + { + 'firstname': 'Joe', + 'is_point_of_contact': True, + 'is_not_point_of_contact': False, + 'is_ceo_of_small_company': False, + 'is_ceo_small_2': False, + 'largest_company': 'Example Inc.', + }, + { + 'firstname': 'Max', + 'is_point_of_contact': False, + 'is_not_point_of_contact': True, + 'is_ceo_of_small_company': True, + 'is_ceo_small_2': True, + 'largest_company': 'Example Inc.' + } + ]) + # A less elegant way to write the same query: this uses a LEFT OUTER + # JOIN and an IS NULL, inside a WHERE NOT IN which is probably less + # efficient than EXISTS. + self.assertCountEqual( + qs.filter(is_point_of_contact=True).values('pk'), + Employee.objects.exclude(company_point_of_contact_set=None).values('pk') + ) + + def test_in_subquery(self): + # This is a contrived test (and you really wouldn't write this query), + # but it is a succinct way to test the __in=Subquery() construct. + small_companies = Company.objects.filter(num_employees__lt=200).values('pk') + subquery_test = Company.objects.filter(pk__in=Subquery(small_companies)) + self.assertCountEqual(subquery_test, [self.foobar_ltd, self.gmbh]) + subquery_test2 = Company.objects.filter(pk=Subquery(small_companies.filter(num_employees=3))) + self.assertCountEqual(subquery_test2, [self.foobar_ltd]) + + def test_nested_subquery(self): + inner = Company.objects.filter(point_of_contact=OuterRef('pk')) + outer = Employee.objects.annotate(is_point_of_contact=Exists(inner)) + contrived = Employee.objects.annotate( + is_point_of_contact=Subquery( + outer.filter(pk=OuterRef('pk')).values('is_point_of_contact'), + output_field=models.BooleanField(), + ), + ) + self.assertCountEqual(contrived.values_list(), outer.values_list()) + + def test_nested_subquery_outer_ref_2(self): + first = Time.objects.create(time='09:00') + second = Time.objects.create(time='17:00') + third = Time.objects.create(time='21:00') + SimulationRun.objects.bulk_create([ + SimulationRun(start=first, end=second, midpoint='12:00'), + SimulationRun(start=first, end=third, midpoint='15:00'), + SimulationRun(start=second, end=first, midpoint='00:00'), + ]) + inner = Time.objects.filter(time=OuterRef(OuterRef('time')), pk=OuterRef('start')).values('time') + middle = SimulationRun.objects.annotate(other=Subquery(inner)).values('other')[:1] + outer = Time.objects.annotate(other=Subquery(middle, output_field=models.TimeField())) + # This is a contrived example. It exercises the double OuterRef form. + self.assertCountEqual(outer, [first, second, third]) + + def test_annotations_within_subquery(self): + Company.objects.filter(num_employees__lt=50).update(ceo=Employee.objects.get(firstname='Frank')) + inner = Company.objects.filter( + ceo=OuterRef('pk') + ).values('ceo').annotate(total_employees=models.Sum('num_employees')).values('total_employees') + outer = Employee.objects.annotate(total_employees=Subquery(inner)).filter(salary__lte=Subquery(inner)) + self.assertSequenceEqual( + outer.order_by('-total_employees').values('salary', 'total_employees'), + [{'salary': 10, 'total_employees': 2300}, {'salary': 20, 'total_employees': 35}], + ) + class IterableLookupInnerExpressionsTests(TestCase): @classmethod