Fixed #27149 -- Added Subquery and Exists database expressions.

Thanks Josh Smeaton for Oracle fixes.
2016-04-20 16:26:51 +09:30 · 2016-04-20 16:26:51 +09:30 · 236ebe94bf
parent 84c1826ded
commit 236ebe94bf
6 changed files with 479 additions and 11 deletions
--- a/django/db/backends/oracle/compiler.py
+++ b/django/db/backends/oracle/compiler.py
@ -31,10 +31,17 @@ class SQLCompiler(compiler.SQLCompiler):
            high_where = ''
            if self.query.high_mark is not None:
                high_where = 'WHERE ROWNUM <= %d' % (self.query.high_mark,)
-            sql = (
-                'SELECT * FROM (SELECT "_SUB".*, ROWNUM AS "_RN" FROM (%s) '
-                '"_SUB" %s) WHERE "_RN" > %d' % (sql, high_where, self.query.low_mark)
-            )
+
+            if self.query.low_mark:
+                sql = (
+                    'SELECT * FROM (SELECT "_SUB".*, ROWNUM AS "_RN" FROM (%s) '
+                    '"_SUB" %s) WHERE "_RN" > %d' % (sql, high_where, self.query.low_mark)
+                )
+            else:
+                # Simplify the query to support subqueries if there's no offset.
+                sql = (
+                    'SELECT * FROM (SELECT "_SUB".* FROM (%s) "_SUB" %s)' % (sql, high_where)
+                )

        return sql, params

--- a/django/db/models/init.py
+++ b/django/db/models/init.py
@ -6,7 +6,8 @@ from django.db.models.deletion import (
    CASCADE, DO_NOTHING, PROTECT, SET, SET_DEFAULT, SET_NULL, ProtectedError,
 )
 from django.db.models.expressions import (
-    Case, Expression, ExpressionWrapper, F, Func, Value, When,
+    Case, Exists, Expression, ExpressionWrapper, F, Func, OuterRef, Subquery,
+    Value, When,
 )
 from django.db.models.fields import *  # NOQA
 from django.db.models.fields import __all__ as fields_all
@ -62,7 +63,8 @@ __all__ += [
    'ObjectDoesNotExist', 'signals',
    'CASCADE', 'DO_NOTHING', 'PROTECT', 'SET', 'SET_DEFAULT', 'SET_NULL',
    'ProtectedError',
-    'Case', 'Expression', 'ExpressionWrapper', 'F', 'Func', 'Value', 'When',
+    'Case', 'Exists', 'Expression', 'ExpressionWrapper', 'F', 'Func',
+    'OuterRef', 'Subquery', 'Value', 'When',
    'FileField', 'ImageField', 'OrderWrt', 'Lookup', 'Transform', 'Manager',
    'Prefetch', 'Q', 'QuerySet', 'prefetch_related_objects', 'DEFERRED', 'Model',
    'ForeignKey', 'ForeignObject', 'OneToOneField', 'ManyToManyField',
--- a/django/db/models/expressions.py
+++ b/django/db/models/expressions.py
@ -477,6 +477,33 @@ class F(Combinable):
        return OrderBy(self, descending=True, **kwargs)


+class ResolvedOuterRef(F):
+    """
+    An object that contains a reference to an outer query.
+
+    In this case, the reference to the outer query has been resolved because
+    the inner query has been used as a subquery.
+    """
+    def as_sql(self, *args, **kwargs):
+        raise ValueError(
+            'This queryset contains a reference to an outer query and may '
+            'only be used in a subquery.'
+        )
+
+    def _prepare(self, output_field=None):
+        return self
+
+
+class OuterRef(F):
+    def resolve_expression(self, query=None, allow_joins=True, reuse=None, summarize=False, for_save=False):
+        if isinstance(self.name, self.__class__):
+            return self.name
+        return ResolvedOuterRef(self.name)
+
+    def _prepare(self, output_field=None):
+        return self
+
+
 class Func(Expression):
    """
    An SQL function call.
@ -873,6 +900,128 @@ class Case(Expression):
        return sql, sql_params


+class Subquery(Expression):
+    """
+    An explicit subquery. It may contain OuterRef() references to the outer
+    query which will be resolved when it is applied to that query.
+    """
+    template = '(%(subquery)s)'
+
+    def __init__(self, queryset, output_field=None, **extra):
+        self.queryset = queryset
+        self.extra = extra
+        if output_field is None and len(self.queryset.query.select) == 1:
+            output_field = self.queryset.query.select[0].field
+        super(Subquery, self).__init__(output_field)
+
+    def copy(self):
+        clone = super(Subquery, self).copy()
+        clone.queryset = clone.queryset.all()
+        return clone
+
+    def resolve_expression(self, query=None, allow_joins=True, reuse=None, summarize=False, for_save=False):
+        clone = self.copy()
+        clone.is_summary = summarize
+        clone.queryset.query.bump_prefix(query)
+
+        # Need to recursively resolve these.
+        def resolve_all(child):
+            if hasattr(child, 'children'):
+                [resolve_all(_child) for _child in child.children]
+            if hasattr(child, 'rhs'):
+                child.rhs = resolve(child.rhs)
+
+        def resolve(child):
+            if hasattr(child, 'resolve_expression'):
+                return child.resolve_expression(
+                    query=query, allow_joins=allow_joins, reuse=reuse,
+                    summarize=summarize, for_save=for_save,
+                )
+            return child
+
+        resolve_all(clone.queryset.query.where)
+
+        for key, value in clone.queryset.query.annotations.items():
+            if isinstance(value, Subquery):
+                clone.queryset.query.annotations[key] = resolve(value)
+
+        return clone
+
+    def get_source_expressions(self):
+        return [
+            x for x in [
+                getattr(expr, 'lhs', None)
+                for expr in self.queryset.query.where.children
+            ] if x
+        ]
+
+    def relabeled_clone(self, change_map):
+        clone = self.copy()
+        clone.queryset.query = clone.queryset.query.relabeled_clone(change_map)
+        clone.queryset.query.external_aliases.update(
+            alias for alias in change_map.values()
+            if alias not in clone.queryset.query.tables
+        )
+        return clone
+
+    def as_sql(self, compiler, connection, template=None, **extra_context):
+        connection.ops.check_expression_support(self)
+        template_params = self.extra.copy()
+        template_params.update(extra_context)
+        template_params['subquery'], sql_params = self.queryset.query.get_compiler(connection=connection).as_sql()
+
+        template = template or template_params.get('template', self.template)
+        sql = template % template_params
+        sql = connection.ops.unification_cast_sql(self.output_field) % sql
+        return sql, sql_params
+
+    def _prepare(self, output_field):
+        # This method will only be called if this instance is the "rhs" in an
+        # expression: the wrapping () must be removed (as the expression that
+        # contains this will provide them). SQLite evaluates ((subquery))
+        # differently than the other databases.
+        if self.template == '(%(subquery)s)':
+            clone = self.copy()
+            clone.template = '%(subquery)s'
+            return clone
+        return self
+
+
+class Exists(Subquery):
+    template = 'EXISTS(%(subquery)s)'
+
+    def __init__(self, *args, **kwargs):
+        self.negated = kwargs.pop('negated', False)
+        super(Exists, self).__init__(*args, **kwargs)
+
+    def __invert__(self):
+        return type(self)(self.queryset, self.output_field, negated=(not self.negated), **self.extra)
+
+    @property
+    def output_field(self):
+        return fields.BooleanField()
+
+    def resolve_expression(self, query=None, **kwargs):
+        # As a performance optimization, remove ordering since EXISTS doesn't
+        # care about it, just whether or not a row matches.
+        self.queryset = self.queryset.order_by()
+        return super(Exists, self).resolve_expression(query, **kwargs)
+
+    def as_sql(self, compiler, connection, template=None, **extra_context):
+        sql, params = super(Exists, self).as_sql(compiler, connection, template, **extra_context)
+        if self.negated:
+            sql = 'NOT {}'.format(sql)
+        return sql, params
+
+    def as_oracle(self, compiler, connection, template=None, **extra_context):
+        # Oracle doesn't allow EXISTS() in the SELECT list, so wrap it with a
+        # CASE WHEN expression. Change the template since the When expression
+        # requires a left hand side (column) to compare against.
+        sql, params = self.as_sql(compiler, connection, template, **extra_context)
+        sql = 'CASE WHEN {} THEN 1 ELSE 0 END'.format(sql)
+        return sql, params
+
+
 class OrderBy(BaseExpression):
    template = '%(expression)s %(ordering)s'

--- a/docs/ref/models/expressions.txt
+++ b/docs/ref/models/expressions.txt
@ -450,6 +450,178 @@ Conditional expressions allow you to use :keyword:`if` ... :keyword:`elif` ...
 :keyword:`else` logic in queries. Django natively supports SQL ``CASE``
 expressions. For more details see :doc:`conditional-expressions`.

+``Subquery()`` expressions
+--------------------------
+
+.. class:: Subquery(queryset, output_field=None)
+
+.. versionadded:: 1.11
+
+You can add an explicit subquery to a ``QuerySet`` using the ``Subquery``
+expression.
+
+For example, to annotate each post with the email address of the author of the
+newest comment on that post::
+
+    >>> from django.db.models import OuterRef, Subquery
+    >>> newest = Comment.objects.filter(post=OuterRef('pk')).order_by('-created_at')
+    >>> Post.objects.annotate(newest_commenter_email=Subquery(newest.values('email')[:1]))
+
+On PostgreSQL, the SQL looks like:
+
+.. code-block:: sql
+
+    SELECT "post"."id", (
+        SELECT U0."email"
+        FROM "comment" U0
+        WHERE U0."post_id" = ("post"."id")
+        ORDER BY U0."created_at" DESC LIMIT 1
+    ) AS "newest_commenter_email" FROM "post"
+
+.. note::
+
+    The examples in this section are designed to show how to force
+    Django to execute a subquery. In some cases it may be possible to
+    write an equivalent queryset that performs the same task more
+    clearly or efficiently.
+
+Referencing columns from the outer queryset
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. class:: OuterRef(field)
+
+.. versionadded:: 1.11
+
+Use ``OuterRef`` when a queryset in a ``Subquery`` needs to refer to a field
+from the outer query. It acts like an :class:`F` expression except that the
+check to see if it refers to a valid field isn't made until the outer queryset
+is resolved.
+
+Instances of ``OuterRef`` may be used in conjunction with nested instances
+of ``Subquery`` to refer to a containing queryset that isn't the immediate
+parent. For example, this queryset would need to be within a nested pair of
+``Subquery`` instances to resolve correctly::
+
+    >>> Book.objects.filter(author=OuterRef(OuterRef('pk')))
+
+Limiting a subquery to a single column
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+There are times when a single column must be returned from a ``Subquery``, for
+instance, to use a ``Subquery`` as the target of an ``__in`` lookup. To return
+all comments for posts published within the last day::
+
+    >>> from datetime import timedelta
+    >>> from django.utils import timezone
+    >>> one_day_ago = timezone.now() - timedelta(days=1)
+    >>> posts = Post.objects.filter(published_at__gte=one_day_ago)
+    >>> Comment.objects.filter(post__in=Subquery(posts.values('pk')))
+
+In this case, the subquery must use :meth:`~.QuerySet.values`
+to return only a single column: the primary key of the post.
+
+Limiting the subquery to a single row
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To prevent a subquery from returning multiple rows, a slice (``[:1]``) of the
+queryset is used::
+
+    >>> subquery = Subquery(newest.values('email')[:1])
+    >>> Post.objects.annotate(newest_commenter_email=subquery)
+
+In this case, the subquery must only return a single column *and* a single
+row: the email address of the most recently created comment.
+
+(Using :meth:`~.QuerySet.get` instead of a slice would fail because the
+``OuterRef`` cannot be resolved until the queryset is used within a
+``Subquery``.)
+
+``Exists()`` subqueries
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. class:: Exists(queryset)
+
+.. versionadded:: 1.11
+
+``Exists`` is a ``Subquery`` subclass that uses an SQL ``EXISTS`` statement. In
+many cases it will perform better than a subquery since the database is able to
+stop evaluation of the subquery when a first matching row is found.
+
+For example, to annotate each post with whether or not it has a comment from
+within the last day::
+
+    >>> from django.db.models import Exists, OuterRef
+    >>> from datetime import timedelta
+    >>> from django.utils import timezone
+    >>> one_day_ago = timezone.now() - timedelta(days=1)
+    >>> recent_comments = Comment.objects.filter(
+    ...     post=OuterRef('pk'),
+    ...     created_at__gte=one_day_ago,
+    ... )
+    >>> Post.objects.annotate(recent_comment=Exists(recent_comments)
+
+On PostgreSQL, the SQL looks like:
+
+.. code-block:: sql
+
+    SELECT "post"."id", "post"."published_at", EXISTS(
+        SELECT U0."id", U0."post_id", U0."email", U0."created_at"
+        FROM "comment" U0
+        WHERE (
+            U0."created_at" >= YYYY-MM-DD HH:MM:SS AND
+            U0."post_id" = ("post"."id")
+        )
+    ) AS "recent_comment" FROM "post"
+
+It's unnecessary to force ``Exists`` to refer to a single column, since the
+columns are discarded and a boolean result is returned. Similarly, since
+ordering is unimportant within an SQL ``EXISTS`` subquery and would only
+degrade performance, it's automatically removed.
+
+You can query using ``NOT EXISTS`` with ``~Exists()``.
+
+Filtering on a ``Subquery`` expression
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+It's not possible to filter directly using ``Subquery`` and ``Exists``, e.g.::
+
+    >>> Post.objects.filter(Exists(recent_comments))
+    ...
+    TypeError: 'Exists' object is not iterable
+
+
+You must filter on a subquery expression by first annotating the queryset
+and then filtering based on that annotation::
+
+    >>> Post.objects.annotate(
+    ...     recent_comment=Exists(recent_comments),
+    ... ).filter(recent_comment=True)
+
+Using aggregates within a ``Subquery`` expression
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Aggregates may be used within a ``Subquery``, but they require a specific
+combination of :meth:`~.QuerySet.filter`, :meth:`~.QuerySet.values`, and
+:meth:`~.QuerySet.annotate` to get the subquery grouping correct.
+
+Assuming both models have a ``length`` field, to find posts where the post
+length is greater than the total length of all combined comments::
+
+    >>> from django.db.models import OuterRef, Subquery, Sum
+    >>> comments = Comment.objects.filter(post=OuterRef('pk')).values('post')
+    >>> total_comments = comments.annotate(total=Sum('length')).values('total')
+    >>> Post.objects.filter(length__gt=Subquery(total_comments))
+
+The initial ``filter(...)`` limits the subquery to the relevant parameters.
+``values('post')`` aggregates comments by ``Post``. Finally, ``annotate(...)``
+performs the aggregation. The order in which these queryset methods are applied
+is important. In this case, since the subquery must be limited to a single
+column, ``values('total')`` is required.
+
+This is the only way to perform an aggregation within a ``Subquery``, as
+using :meth:`~.QuerySet.aggregate` attempts to evaluate the queryset (and if
+there is an ``OuterRef``, this will not be possible to resolve).
+
 Raw SQL expressions
 -------------------

--- a/docs/releases/1.11.txt
+++ b/docs/releases/1.11.txt
@ -70,6 +70,14 @@ template system rather than in Python. See :doc:`/ref/forms/renderers`.
 You may need to adjust any custom widgets that you've written for a few
 :ref:`backwards incompatible changes <template-widget-incompatibilities-1-11>`.

+``Subquery`` expressions
+------------------------
+
+The new :class:`~django.db.models.Subquery` and
+:class:`~django.db.models.Exists` database expressions allow creating
+explicit subqueries. Subqueries may refer to fields from the outer queryset
+using the :class:`~django.db.models.OuterRef` class.
+
 Minor features
 --------------

--- a/tests/expressions/tests.py
+++ b/tests/expressions/tests.py
@ -12,8 +12,8 @@ from django.db.models.aggregates import (
    Avg, Count, Max, Min, StdDev, Sum, Variance,
 )
 from django.db.models.expressions import (
-    Case, Col, ExpressionWrapper, F, Func, OrderBy, Random, RawSQL, Ref, Value,
-    When,
+    Case, Col, Exists, ExpressionWrapper, F, Func, OrderBy, OuterRef, Random,
+    RawSQL, Ref, Subquery, Value, When,
 )
 from django.db.models.functions import (
    Coalesce, Concat, Length, Lower, Substr, Upper,
@ -32,15 +32,15 @@ from .models import (
 class BasicExpressionsTests(TestCase):
    @classmethod
    def setUpTestData(cls):
-        Company.objects.create(
+        cls.example_inc = Company.objects.create(
            name="Example Inc.", num_employees=2300, num_chairs=5,
            ceo=Employee.objects.create(firstname="Joe", lastname="Smith", salary=10)
        )
-        Company.objects.create(
+        cls.foobar_ltd = Company.objects.create(
            name="Foobar Ltd.", num_employees=3, num_chairs=4,
            ceo=Employee.objects.create(firstname="Frank", lastname="Meyer", salary=20)
        )
-        Company.objects.create(
+        cls.gmbh = Company.objects.create(
            name="Test GmbH", num_employees=32, num_chairs=1,
            ceo=Employee.objects.create(firstname="Max", lastname="Mustermann", salary=30)
        )
@ -387,6 +387,136 @@ class BasicExpressionsTests(TestCase):
        )
        self.assertEqual(str(qs.query).count('JOIN'), 2)

+    def test_outerref(self):
+        inner = Company.objects.filter(point_of_contact=OuterRef('pk'))
+        msg = (
+            'This queryset contains a reference to an outer query and may only '
+            'be used in a subquery.'
+        )
+        with self.assertRaisesMessage(ValueError, msg):
+            inner.exists()
+
+        outer = Employee.objects.annotate(is_point_of_contact=Exists(inner))
+        self.assertIs(outer.exists(), True)
+
+    def test_subquery(self):
+        Company.objects.filter(name='Example Inc.').update(
+            point_of_contact=Employee.objects.get(firstname='Joe', lastname='Smith'),
+            ceo=Employee.objects.get(firstname='Max', lastname='Mustermann'),
+        )
+        Employee.objects.create(firstname='Bob', lastname='Brown', salary=40)
+        qs = Employee.objects.annotate(
+            is_point_of_contact=Exists(Company.objects.filter(point_of_contact=OuterRef('pk'))),
+            is_not_point_of_contact=~Exists(Company.objects.filter(point_of_contact=OuterRef('pk'))),
+            is_ceo_of_small_company=Exists(Company.objects.filter(num_employees__lt=200, ceo=OuterRef('pk'))),
+            is_ceo_small_2=~~Exists(Company.objects.filter(num_employees__lt=200, ceo=OuterRef('pk'))),
+            largest_company=Subquery(Company.objects.order_by('-num_employees').filter(
+                models.Q(ceo=OuterRef('pk')) | models.Q(point_of_contact=OuterRef('pk'))
+            ).values('name')[:1], output_field=models.CharField())
+        ).values(
+            'firstname',
+            'is_point_of_contact',
+            'is_not_point_of_contact',
+            'is_ceo_of_small_company',
+            'is_ceo_small_2',
+            'largest_company',
+        ).order_by('firstname')
+
+        results = list(qs)
+        # Could use Coalesce(subq, Value('')) instead except for the bug in
+        # cx_Oracle mentioned in #23843.
+        bob = results[0]
+        if bob['largest_company'] == '' and connection.features.interprets_empty_strings_as_nulls:
+            bob['largest_company'] = None
+
+        self.assertEqual(results, [
+            {
+                'firstname': 'Bob',
+                'is_point_of_contact': False,
+                'is_not_point_of_contact': True,
+                'is_ceo_of_small_company': False,
+                'is_ceo_small_2': False,
+                'largest_company': None,
+            },
+            {
+                'firstname': 'Frank',
+                'is_point_of_contact': False,
+                'is_not_point_of_contact': True,
+                'is_ceo_of_small_company': True,
+                'is_ceo_small_2': True,
+                'largest_company': 'Foobar Ltd.',
+            },
+            {
+                'firstname': 'Joe',
+                'is_point_of_contact': True,
+                'is_not_point_of_contact': False,
+                'is_ceo_of_small_company': False,
+                'is_ceo_small_2': False,
+                'largest_company': 'Example Inc.',
+            },
+            {
+                'firstname': 'Max',
+                'is_point_of_contact': False,
+                'is_not_point_of_contact': True,
+                'is_ceo_of_small_company': True,
+                'is_ceo_small_2': True,
+                'largest_company': 'Example Inc.'
+            }
+        ])
+        # A less elegant way to write the same query: this uses a LEFT OUTER
+        # JOIN and an IS NULL, inside a WHERE NOT IN which is probably less
+        # efficient than EXISTS.
+        self.assertCountEqual(
+            qs.filter(is_point_of_contact=True).values('pk'),
+            Employee.objects.exclude(company_point_of_contact_set=None).values('pk')
+        )
+
+    def test_in_subquery(self):
+        # This is a contrived test (and you really wouldn't write this query),
+        # but it is a succinct way to test the __in=Subquery() construct.
+        small_companies = Company.objects.filter(num_employees__lt=200).values('pk')
+        subquery_test = Company.objects.filter(pk__in=Subquery(small_companies))
+        self.assertCountEqual(subquery_test, [self.foobar_ltd, self.gmbh])
+        subquery_test2 = Company.objects.filter(pk=Subquery(small_companies.filter(num_employees=3)))
+        self.assertCountEqual(subquery_test2, [self.foobar_ltd])
+
+    def test_nested_subquery(self):
+        inner = Company.objects.filter(point_of_contact=OuterRef('pk'))
+        outer = Employee.objects.annotate(is_point_of_contact=Exists(inner))
+        contrived = Employee.objects.annotate(
+            is_point_of_contact=Subquery(
+                outer.filter(pk=OuterRef('pk')).values('is_point_of_contact'),
+                output_field=models.BooleanField(),
+            ),
+        )
+        self.assertCountEqual(contrived.values_list(), outer.values_list())
+
+    def test_nested_subquery_outer_ref_2(self):
+        first = Time.objects.create(time='09:00')
+        second = Time.objects.create(time='17:00')
+        third = Time.objects.create(time='21:00')
+        SimulationRun.objects.bulk_create([
+            SimulationRun(start=first, end=second, midpoint='12:00'),
+            SimulationRun(start=first, end=third, midpoint='15:00'),
+            SimulationRun(start=second, end=first, midpoint='00:00'),
+        ])
+        inner = Time.objects.filter(time=OuterRef(OuterRef('time')), pk=OuterRef('start')).values('time')
+        middle = SimulationRun.objects.annotate(other=Subquery(inner)).values('other')[:1]
+        outer = Time.objects.annotate(other=Subquery(middle, output_field=models.TimeField()))
+        # This is a contrived example. It exercises the double OuterRef form.
+        self.assertCountEqual(outer, [first, second, third])
+
+    def test_annotations_within_subquery(self):
+        Company.objects.filter(num_employees__lt=50).update(ceo=Employee.objects.get(firstname='Frank'))
+        inner = Company.objects.filter(
+            ceo=OuterRef('pk')
+        ).values('ceo').annotate(total_employees=models.Sum('num_employees')).values('total_employees')
+        outer = Employee.objects.annotate(total_employees=Subquery(inner)).filter(salary__lte=Subquery(inner))
+        self.assertSequenceEqual(
+            outer.order_by('-total_employees').values('salary', 'total_employees'),
+            [{'salary': 10, 'total_employees': 2300}, {'salary': 20, 'total_employees': 35}],
+        )
+

 class IterableLookupInnerExpressionsTests(TestCase):
    @classmethod