mirror of https://github.com/django/django.git
Refs #3254 -- Added full text search to contrib.postgres.
Adds a reasonably feature complete implementation of full text search using the built in PostgreSQL engine. It uses public APIs from Expression and Lookup. With thanks to Tim Graham, Simon Charettes, Josh Smeaton, Mikey Ariel and many others for their advice and review. Particular thanks also go to the supporters of the contrib.postgres kickstarter.
This commit is contained in:
parent
f4c2b8e04a
commit
2d877da855
|
@ -3,7 +3,7 @@ from django.db.backends.signals import connection_created
|
|||
from django.db.models import CharField, TextField
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from .lookups import Unaccent
|
||||
from .lookups import SearchLookup, Unaccent
|
||||
from .signals import register_hstore_handler
|
||||
|
||||
|
||||
|
@ -15,3 +15,5 @@ class PostgresConfig(AppConfig):
|
|||
connection_created.connect(register_hstore_handler)
|
||||
CharField.register_lookup(Unaccent)
|
||||
TextField.register_lookup(Unaccent)
|
||||
CharField.register_lookup(SearchLookup)
|
||||
TextField.register_lookup(SearchLookup)
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
from django.db.models import Lookup, Transform
|
||||
|
||||
from .search import SearchVector, SearchVectorExact, SearchVectorField
|
||||
|
||||
|
||||
class PostgresSimpleLookup(Lookup):
|
||||
def as_sql(self, qn, connection):
|
||||
|
@ -43,3 +45,13 @@ class Unaccent(Transform):
|
|||
bilateral = True
|
||||
lookup_name = 'unaccent'
|
||||
function = 'UNACCENT'
|
||||
|
||||
|
||||
class SearchLookup(SearchVectorExact):
    """``search`` lookup: a ``SearchVectorExact`` that coerces its left side.

    If the left-hand side's output field is not a ``SearchVectorField``, it is
    wrapped in ``SearchVector`` before the parent class compiles it.
    """
    lookup_name = 'search'

    def process_lhs(self, qn, connection):
        """Return ``(sql, params)`` for the left-hand side."""
        already_vector = isinstance(self.lhs.output_field, SearchVectorField)
        if not already_vector:
            # Plain expressions must become a SearchVector first.
            self.lhs = SearchVector(self.lhs)
        sql, sql_params = super(SearchLookup, self).process_lhs(qn, connection)
        return sql, sql_params
|
||||
|
|
|
@ -0,0 +1,187 @@
|
|||
from django.db.models import Field, FloatField
|
||||
from django.db.models.expressions import CombinedExpression, Func, Value
|
||||
from django.db.models.functions import Coalesce
|
||||
from django.db.models.lookups import Lookup
|
||||
|
||||
|
||||
class SearchVectorExact(Lookup):
    """``exact`` lookup rendered with the ``@@`` match operator."""
    lookup_name = 'exact'

    def process_rhs(self, qn, connection):
        """Return ``(sql, params)`` for the right-hand side.

        A right-hand side without ``resolve_expression`` is wrapped in
        ``SearchQuery``, using the left-hand side's ``config`` attribute when
        it has one (``None`` otherwise).
        """
        is_expression = hasattr(self.rhs, 'resolve_expression')
        if not is_expression:
            lhs_config = getattr(self.lhs, 'config', None)
            self.rhs = SearchQuery(self.rhs, config=lhs_config)
        sql, sql_params = super(SearchVectorExact, self).process_rhs(qn, connection)
        return sql, sql_params

    def as_sql(self, qn, connection):
        """Return ``(sql, params)`` of the form ``<lhs> @@ <rhs> = true``."""
        lhs_sql, lhs_params = self.process_lhs(qn, connection)
        rhs_sql, rhs_params = self.process_rhs(qn, connection)
        sql = '%s @@ %s = true' % (lhs_sql, rhs_sql)
        return sql, lhs_params + rhs_params
|
||||
|
||||
|
||||
class SearchVectorField(Field):
    """Field whose database column type is ``tsvector``."""

    def db_type(self, connection):
        """Return the column type name; ``connection`` is not consulted."""
        column_type = 'tsvector'
        return column_type
|
||||
|
||||
|
||||
class SearchQueryField(Field):
    """Field whose database column type is ``tsquery``."""

    def db_type(self, connection):
        """Return the column type name; ``connection`` is not consulted."""
        column_type = 'tsquery'
        return column_type
|
||||
|
||||
|
||||
class SearchVectorCombinable(object):
    """Mixin that combines search vectors into a ``CombinedSearchVector``."""
    ADD = '||'

    def _combine(self, other, connector, reversed, node=None):
        """Combine ``self`` and ``other`` with ``connector``.

        Raises ``TypeError`` unless ``other`` is a ``SearchVectorCombinable``
        whose ``config`` equals this one's. When ``reversed`` is true,
        ``other`` becomes the left operand. ``node`` is accepted but unused.
        """
        compatible = isinstance(other, SearchVectorCombinable) and self.config == other.config
        if not compatible:
            raise TypeError('SearchVector can only be combined with other SearchVectors')
        left, right = (other, self) if reversed else (self, other)
        return CombinedSearchVector(left, connector, right, self.config)
|
||||
|
||||
|
||||
class SearchVector(SearchVectorCombinable, Func):
    """``to_tsvector(...)`` expression over one or more source expressions.

    Optional ``config`` and ``weight`` keyword arguments are read from
    ``extra``. Multiple expressions are joined with ``|| ' ' ||``.
    """
    function = 'to_tsvector'
    arg_joiner = " || ' ' || "
    _output_field = SearchVectorField()
    config = None

    def __init__(self, *expressions, **extra):
        super(SearchVector, self).__init__(*expressions, **extra)
        # Each source expression falls back to '' via Coalesce -- presumably
        # so a NULL value does not null out the whole concatenation.
        coalesced = []
        for expression in self.source_expressions:
            coalesced.append(Coalesce(expression, Value('')))
        self.source_expressions = coalesced
        self.config = self.extra.get('config', self.config)
        weight = self.extra.get('weight')
        needs_wrapping = weight is not None and not hasattr(weight, 'resolve_expression')
        self.weight = Value(weight) if needs_wrapping else weight

    def resolve_expression(self, query=None, allow_joins=True, reuse=None, summarize=False, for_save=False):
        """Resolve the expression and, when set, its ``config``.

        A ``config`` without ``resolve_expression`` is wrapped in ``Value``
        before being resolved.
        """
        resolved = super(SearchVector, self).resolve_expression(query, allow_joins, reuse, summarize, for_save)
        if self.config:
            config = self.config
            if not hasattr(config, 'resolve_expression'):
                config = Value(config)
            resolved.config = config.resolve_expression(query, allow_joins, reuse, summarize, for_save)
        return resolved

    def as_sql(self, compiler, connection, function=None, template=None):
        """Return ``(sql, params)``, adding config and weight when present.

        Params are ordered config, then source expressions, then weight.
        """
        config_params = []
        if template is None:
            if self.config:
                config_sql, config_params = compiler.compile(self.config)
                # The template is %-interpolated later, so literal percent
                # signs in the compiled config SQL are doubled here.
                escaped_config = config_sql.replace('%', '%%')
                template = "%(function)s({}::regconfig, %(expressions)s)".format(escaped_config)
            else:
                template = self.template
        sql, params = super(SearchVector, self).as_sql(compiler, connection, function=function, template=template)
        weight_params = []
        if self.weight:
            weight_sql, weight_params = compiler.compile(self.weight)
            sql = 'setweight({}, {})'.format(sql, weight_sql)
        return sql, config_params + params + weight_params
|
||||
|
||||
|
||||
class CombinedSearchVector(SearchVectorCombinable, CombinedExpression):
    """Combined expression of two search vectors that remembers ``config``."""

    def __init__(self, lhs, connector, rhs, config, output_field=None):
        # config is stored before the parent initialiser runs.
        self.config = config
        parent = super(CombinedSearchVector, self)
        parent.__init__(lhs, connector, rhs, output_field)
|
||||
|
||||
|
||||
class SearchQuery(Value):
    """``plainto_tsquery(...)`` expression built from a search value.

    Accepts optional ``config`` and ``invert`` keyword arguments. Queries
    combine with ``&`` and ``|`` and are negated with ``~``.
    """
    invert = False
    _output_field = SearchQueryField()
    config = None

    BITAND = '&&'
    BITOR = '||'

    def __init__(self, value, output_field=None, **extra):
        # Only 'config' and 'invert' are read from extra; any other keyword
        # is not forwarded to the parent initialiser.
        self.config = extra.pop('config', self.config)
        self.invert = extra.pop('invert', self.invert)
        super(SearchQuery, self).__init__(value, output_field=output_field)

    def resolve_expression(self, query=None, allow_joins=True, reuse=None, summarize=False, for_save=False):
        """Resolve the expression and, when set, its ``config``.

        A ``config`` without ``resolve_expression`` is wrapped in ``Value``
        before being resolved.
        """
        resolved = super(SearchQuery, self).resolve_expression(query, allow_joins, reuse, summarize, for_save)
        if self.config:
            config = self.config
            if not hasattr(config, 'resolve_expression'):
                config = Value(config)
            resolved.config = config.resolve_expression(query, allow_joins, reuse, summarize, for_save)
        return resolved

    def as_sql(self, compiler, connection):
        """Return ``(sql, params)``; the SQL is wrapped in ``!!(...)`` when inverted."""
        if self.config:
            config_sql, config_params = compiler.compile(self.config)
            template = 'plainto_tsquery({}::regconfig, %s)'.format(config_sql)
            params = config_params + [self.value]
        else:
            template = 'plainto_tsquery(%s)'
            params = [self.value]
        if self.invert:
            template = '!!({})'.format(template)
        return template, params

    def _combine(self, other, connector, reversed, node=None):
        """Combine via the parent class and set ``SearchQueryField`` as output."""
        result = super(SearchQuery, self)._combine(other, connector, reversed, node)
        result.output_field = SearchQueryField()
        return result

    # On Combinable, these are not implemented to reduce confusion with Q. In
    # this case we are actually (ab)using them to do logical combination so
    # it's consistent with other usage in Django.
    def __or__(self, other):
        return self._combine(other, self.BITOR, False)

    def __ror__(self, other):
        return self._combine(other, self.BITOR, True)

    def __and__(self, other):
        return self._combine(other, self.BITAND, False)

    def __rand__(self, other):
        return self._combine(other, self.BITAND, True)

    def __invert__(self):
        """Return a new query of the same type with ``invert`` flipped."""
        return type(self)(self.value, invert=not self.invert, config=self.config)
|
||||
|
||||
|
||||
class SearchRank(Func):
    """``ts_rank(...)`` expression over a vector and a query.

    An optional ``weights`` keyword argument is rendered as an extra leading
    argument of the function call.
    """
    function = 'ts_rank'
    _output_field = FloatField()

    def __init__(self, vector, query, **extra):
        # Arguments without resolve_expression are wrapped: vector in
        # SearchVector, query in SearchQuery, weights in Value.
        if not hasattr(vector, 'resolve_expression'):
            vector = SearchVector(vector)
        if not hasattr(query, 'resolve_expression'):
            query = SearchQuery(query)
        weights = extra.get('weights')
        wrap_weights = weights is not None and not hasattr(weights, 'resolve_expression')
        self.weights = Value(weights) if wrap_weights else weights
        super(SearchRank, self).__init__(vector, query, **extra)

    def as_sql(self, compiler, connection, function=None, template=None):
        """Return ``(sql, params)``; weight params come before the others."""
        weight_params = []
        context = {}
        use_weights = template is None and self.extra.get('weights') and self.weights
        if use_weights:
            template = '%(function)s(%(weights)s, %(expressions)s)'
            weight_sql, weight_params = compiler.compile(self.weights)
            context['weights'] = weight_sql
        sql, params = super(SearchRank, self).as_sql(
            compiler, connection,
            function=function, template=template, **context
        )
        return sql, weight_params + params
|
||||
|
||||
|
||||
SearchVectorField.register_lookup(SearchVectorExact)
|
|
@ -254,3 +254,9 @@ class DatabaseOperations(BaseDatabaseOperations):
|
|||
rhs_sql, rhs_params = rhs
|
||||
return "age(%s, %s)" % (lhs_sql, rhs_sql), lhs_params + rhs_params
|
||||
return super(DatabaseOperations, self).subtract_temporals(internal_type, lhs, rhs)
|
||||
|
||||
def fulltext_search_sql(self, field_name):
    """Always raise ``NotImplementedError``.

    The message tells the user to add 'django.contrib.postgres' to
    INSTALLED_APPS. ``field_name`` is not used.
    """
    message = (
        "Add 'django.contrib.postgres' to settings.INSTALLED_APPS to use "
        "the search operator."
    )
    raise NotImplementedError(message)
|
||||
|
|
|
@ -125,8 +125,10 @@ class BaseExpression(object):
|
|||
|
||||
# aggregate specific fields
|
||||
is_summary = False
|
||||
_output_field = None
|
||||
|
||||
def __init__(self, output_field=None):
    # Only an explicit output_field is stored on the instance; when None is
    # passed, nothing is assigned.
    if output_field is None:
        return
    self._output_field = output_field
|
||||
|
||||
def get_db_converters(self, connection):
|
||||
|
|
|
@ -105,6 +105,7 @@ manipulating the data of your Web application. Learn more about it below:
|
|||
:doc:`Raw SQL <topics/db/sql>` |
|
||||
:doc:`Transactions <topics/db/transactions>` |
|
||||
:doc:`Aggregation <topics/db/aggregation>` |
|
||||
:doc:`Search <topics/db/search>` |
|
||||
:doc:`Custom fields <howto/custom-model-fields>` |
|
||||
:doc:`Multiple databases <topics/db/multi-db>` |
|
||||
:doc:`Custom lookups <howto/custom-lookups>` |
|
||||
|
|
|
@ -37,4 +37,5 @@ release. Some fields require higher versions.
|
|||
functions
|
||||
lookups
|
||||
operations
|
||||
search
|
||||
validators
|
||||
|
|
|
@ -0,0 +1,191 @@
|
|||
================
|
||||
Full text search
|
||||
================
|
||||
|
||||
.. versionadded:: 1.10
|
||||
|
||||
The database functions in the ``django.contrib.postgres.search`` module ease
|
||||
the use of PostgreSQL's `full text search engine
|
||||
<http://www.postgresql.org/docs/current/static/textsearch.html>`_.
|
||||
|
||||
For the examples in this document, we'll use the models defined in
|
||||
:doc:`/topics/db/queries`.
|
||||
|
||||
.. seealso::
|
||||
|
||||
For a high-level overview of searching, see the :doc:`topic documentation
|
||||
</topics/db/search>`.
|
||||
|
||||
.. currentmodule:: django.contrib.postgres.search
|
||||
|
||||
The ``search`` lookup
|
||||
=====================
|
||||
|
||||
.. fieldlookup:: search
|
||||
|
||||
The simplest way to use full text search is to search a single term against a
|
||||
single column in the database. For example::
|
||||
|
||||
>>> Entry.objects.filter(body_text__search='Cheese')
|
||||
[<Entry: Cheese on Toast recipes>, <Entry: Pizza Recipes>]
|
||||
|
||||
This creates a ``to_tsvector`` in the database from the ``body_text`` field
|
||||
and a ``plainto_tsquery`` from the search term ``'Cheese'``, both using the
|
||||
default database search configuration. The results are obtained by matching the
|
||||
query and the vector.
|
||||
|
||||
To use the ``search`` lookup, ``'django.contrib.postgres'`` must be in your
|
||||
:setting:`INSTALLED_APPS`.
|
||||
|
||||
``SearchVector``
|
||||
================
|
||||
|
||||
.. class:: SearchVector(\*expressions, config=None, weight=None)
|
||||
|
||||
Searching against a single field is great but rather limiting. The ``Entry``
|
||||
instances we're searching belong to a ``Blog``, which has a ``tagline`` field.
|
||||
To query against both fields, use a ``SearchVector``::
|
||||
|
||||
>>> from django.contrib.postgres.search import SearchVector
|
||||
>>> Entry.objects.annotate(
|
||||
... search=SearchVector('body_text', 'blog__tagline'),
|
||||
... ).filter(search='Cheese')
|
||||
[<Entry: Cheese on Toast recipes>, <Entry: Pizza Recipes>]
|
||||
|
||||
The arguments to ``SearchVector`` can be any
|
||||
:class:`~django.db.models.Expression` or the name of a field. Multiple
|
||||
arguments will be concatenated together using a space so that the search
|
||||
document includes them all.
|
||||
|
||||
``SearchVector`` objects can be combined together, allowing you to reuse them.
|
||||
For example::
|
||||
|
||||
>>> Entry.objects.annotate(
|
||||
... search=SearchVector('body_text') + SearchVector('blog__tagline'),
|
||||
... ).filter(search='Cheese')
|
||||
[<Entry: Cheese on Toast recipes>, <Entry: Pizza Recipes>]
|
||||
|
||||
See :ref:`postgresql-fts-search-configuration` and
|
||||
:ref:`postgresql-fts-weighting-queries` for an explanation of the ``config``
|
||||
and ``weight`` parameters.
|
||||
|
||||
``SearchQuery``
|
||||
===============
|
||||
|
||||
.. class:: SearchQuery(value, config=None)
|
||||
|
||||
``SearchQuery`` translates the terms the user provides into a search query
|
||||
object that the database compares to a search vector. By default, all the words
|
||||
the user provides are passed through the stemming algorithms, and then it
|
||||
looks for matches for all of the resulting terms.
|
||||
|
||||
``SearchQuery`` terms can be combined logically to provide more flexibility::
|
||||
|
||||
>>> from django.contrib.postgres.search import SearchQuery
|
||||
>>> SearchQuery('potato') & SearchQuery('ireland') # potato AND ireland
|
||||
>>> SearchQuery('potato') | SearchQuery('penguin') # potato OR penguin
|
||||
>>> ~SearchQuery('sausage') # NOT sausage
|
||||
|
||||
See :ref:`postgresql-fts-search-configuration` for an explanation of the
|
||||
``config`` parameter.
|
||||
|
||||
``SearchRank``
|
||||
==============
|
||||
|
||||
.. class:: SearchRank(vector, query, weights=None)
|
||||
|
||||
So far, we've just returned the results for which any match between the vector
|
||||
and the query is possible. It's likely you may wish to order the results by
|
||||
some sort of relevancy. PostgreSQL provides a ranking function which takes into
|
||||
account how often the query terms appear in the document, how close together
|
||||
the terms are in the document, and how important the part of the document is
|
||||
where they occur. The better the match, the higher the value of the rank. To
|
||||
order by relevancy::
|
||||
|
||||
>>> from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector
|
||||
>>> vector = SearchVector('body_text')
|
||||
>>> query = SearchQuery('cheese')
|
||||
>>> Entry.objects.annotate(rank=SearchRank(vector, query)).order_by('-rank')
|
||||
[<Entry: Cheese on Toast recipes>, <Entry: Pizza recipes>]
|
||||
|
||||
See :ref:`postgresql-fts-weighting-queries` for an explanation of the
|
||||
``weights`` parameter.
|
||||
|
||||
.. _postgresql-fts-search-configuration:
|
||||
|
||||
Changing the search configuration
|
||||
=================================
|
||||
|
||||
You can specify the ``config`` attribute to a :class:`SearchVector` and
|
||||
:class:`SearchQuery` to use a different search configuration. This allows using
|
||||
different language parsers and dictionaries as defined by the database::
|
||||
|
||||
>>> from django.contrib.postgres.search import SearchQuery, SearchVector
|
||||
>>> Entry.objects.annotate(
|
||||
... search=SearchVector('body_text', config='french'),
|
||||
... ).filter(search=SearchQuery('œuf', config='french'))
|
||||
[<Entry: Pain perdu>]
|
||||
|
||||
The value of ``config`` could also be stored in another column::
|
||||
|
||||
>>> from django.db.models import F
|
||||
>>> Entry.objects.annotate(
|
||||
... search=SearchVector('body_text', config=F('blog__language')),
|
||||
... ).filter(search=SearchQuery('œuf', config=F('blog__language')))
|
||||
[<Entry: Pain perdu>]
|
||||
|
||||
.. _postgresql-fts-weighting-queries:
|
||||
|
||||
Weighting queries
|
||||
=================
|
||||
|
||||
Every field may not have the same relevance in a query, so you can set weights
|
||||
of various vectors before you combine them::
|
||||
|
||||
>>> from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector
|
||||
>>> vector = SearchVector('body_text', weight='A') + SearchVector('blog__tagline', weight='B')
|
||||
>>> query = SearchQuery('cheese')
|
||||
>>> Entry.objects.annotate(rank=SearchRank(vector, query)).filter(rank__gte=0.3).order_by('rank')
|
||||
|
||||
The weight should be one of the following letters: D, C, B, A. By default,
|
||||
these weights refer to the numbers ``0.1``, ``0.2``, ``0.4``, and ``1.0``,
|
||||
respectively. If you wish to weight them differently, pass a list of four
|
||||
floats to :class:`SearchRank` as ``weights`` in the same order above::
|
||||
|
||||
>>> rank = SearchRank(vector, query, weights=[0.2, 0.4, 0.6, 0.8])
|
||||
>>> Entry.objects.annotate(rank=rank).filter(rank__gte=0.3).order_by('-rank')
|
||||
|
||||
Performance
|
||||
===========
|
||||
|
||||
Special database configuration isn't necessary to use any of these functions,
|
||||
however, if you're searching more than a few hundred records, you're likely to
|
||||
run into performance problems. Full text search is a more intensive process
|
||||
than comparing the size of an integer, for example.
|
||||
|
||||
In the event that all the fields you're querying on are contained within one
|
||||
particular model, you can create a functional index which matches the search
|
||||
vector you wish to use. For example:
|
||||
|
||||
.. code-block:: sql
|
||||
|
||||
CREATE INDEX body_text_search ON blog_entry (to_tsvector(body_text));
|
||||
|
||||
This index will then be used by subsequent queries. In many cases this will be
|
||||
sufficient.
|
||||
|
||||
``SearchVectorField``
|
||||
---------------------
|
||||
|
||||
.. class:: SearchVectorField
|
||||
|
||||
If this approach becomes too slow, you can add a ``SearchVectorField`` to your
|
||||
model. You'll need to keep it populated with triggers, for example, as
|
||||
described in the `PostgreSQL documentation`_. You can then query the field as
|
||||
if it were an annotated ``SearchVector``::
|
||||
|
||||
>>> Entry.objects.update(search_vector=SearchVector('body_text'))
|
||||
>>> Entry.objects.filter(search_vector='potato')
|
||||
[<Entry: Cheese on Toast recipes>, <Entry: Pizza recipes>]
|
||||
|
||||
.. _PostgreSQL documentation: http://www.postgresql.org/docs/current/static/textsearch-features.html#TEXTSEARCH-UPDATE-TRIGGERS
|
|
@ -24,7 +24,14 @@ recommend** and only officially support the latest release of each series.
|
|||
What's new in Django 1.10
|
||||
=========================
|
||||
|
||||
...
|
||||
Full text search for PostgreSQL
|
||||
-------------------------------
|
||||
|
||||
``django.contrib.postgres`` now includes a :doc:`collection of database
|
||||
functions </ref/contrib/postgres/search>` to allow the use of the full text
|
||||
search engine. You can search across multiple fields in your relational
|
||||
database, combine the searches with other lookups, use different language
|
||||
configurations and weightings, and rank the results by relevance.
|
||||
|
||||
Minor features
|
||||
--------------
|
||||
|
|
|
@ -14,6 +14,7 @@ model maps to a single database table.
|
|||
models
|
||||
queries
|
||||
aggregation
|
||||
search
|
||||
managers
|
||||
sql
|
||||
transactions
|
||||
|
|
|
@ -27,7 +27,7 @@ models, which comprise a Weblog application:
|
|||
return self.name
|
||||
|
||||
class Author(models.Model):
|
||||
name = models.CharField(max_length=50)
|
||||
name = models.CharField(max_length=200)
|
||||
email = models.EmailField()
|
||||
|
||||
def __str__(self): # __unicode__ on Python 2
|
||||
|
|
|
@ -0,0 +1,129 @@
|
|||
======
|
||||
Search
|
||||
======
|
||||
|
||||
A common task for web applications is to search some data in the database with
|
||||
user input. In a simple case, this could be filtering a list of objects by a
|
||||
category. A more complex use case might require searching with weighting,
|
||||
categorization, highlighting, multiple languages, and so on. This document
|
||||
explains some of the possible use cases and the tools you can use.
|
||||
|
||||
We'll refer to the same models used in :doc:`/topics/db/queries`.
|
||||
|
||||
Use Cases
|
||||
=========
|
||||
|
||||
Standard textual queries
|
||||
------------------------
|
||||
|
||||
Text-based fields have a selection of simple matching operations. For example,
|
||||
you may wish to allow looking up an author like so::
|
||||
|
||||
>>> Author.objects.filter(name__contains='Terry')
|
||||
[<Author: Terry Gilliam>, <Author: Terry Jones>]
|
||||
|
||||
This is a very fragile solution as it requires the user to know an exact
|
||||
substring of the author's name. A better approach could be a case-insensitive
|
||||
match (:lookup:`icontains`), but this is only marginally better.
|
||||
|
||||
A database's more advanced comparison functions
|
||||
-----------------------------------------------
|
||||
|
||||
If you're using PostgreSQL, Django provides :doc:`a selection of database
|
||||
specific tools </ref/contrib/postgres/search>` to allow you to leverage more
|
||||
complex querying options. Other databases have different selections of tools,
|
||||
possibly via plugins or user-defined functions. Django doesn't include any
|
||||
support for them at this time. We'll use some examples from PostgreSQL to
|
||||
demonstrate the kind of functionality databases may have.
|
||||
|
||||
.. admonition:: Searching in other databases
|
||||
|
||||
All of the searching tools provided by :mod:`django.contrib.postgres` are
|
||||
constructed entirely on public APIs such as :doc:`custom lookups
|
||||
</ref/models/lookups>` and :doc:`database functions
|
||||
</ref/models/database-functions>`. Depending on your database, you should
|
||||
be able to construct queries to allow similar APIs. If there are specific
|
||||
things which cannot be achieved this way, please open a ticket.
|
||||
|
||||
In the above example, we determined that a case insensitive lookup would be
|
||||
more useful. When dealing with non-English names, a further improvement is to
|
||||
use :lookup:`unaccented comparison <unaccent>`::
|
||||
|
||||
>>> Author.objects.filter(name__unaccent__icontains='Helen')
|
||||
[<Author: Helen Mirren>, <Author: Helena Bonham Carter>, <Author: Hélène Joy>]
|
||||
|
||||
This shows another issue, where we are matching against a different spelling of
|
||||
the name. In this case we have an asymmetry though - a search for ``Helen``
|
||||
will pick up ``Helena`` or ``Hélène``, but not the reverse. Another option
|
||||
would be to use a trigram comparison, which compares sequences of letters.
|
||||
|
||||
For example::
|
||||
|
||||
>>> Author.objects.filter(name__unaccent__lower__trigram='Hélène')
|
||||
[<Author: Helen Mirren>, <Author: Hélène Joy>]
|
||||
|
||||
Now we have a different problem - the longer name of "Helena Bonham Carter"
|
||||
doesn't show up as it is much longer. Trigram searches consider all
|
||||
combinations of three letters, and compare how many appear in both search and
|
||||
source strings. For the longer name, there are more combinations which appear
|
||||
in the source string so it is no longer considered a close match.
|
||||
|
||||
The correct choice of comparison functions here depends on your particular data
|
||||
set, for example the language(s) used and the type of text being searched. All
|
||||
of the examples we've seen are on short strings where the user is likely to
|
||||
enter something close (by varying definitions) to the source data.
|
||||
|
||||
Document-based search
|
||||
---------------------
|
||||
|
||||
Simple database operations are too simple an approach when you start
|
||||
considering large blocks of text. Whereas the examples above can be thought of
|
||||
as operations on a string of characters, full text search looks at the actual
|
||||
words. Depending on the system used, it's likely to use some of the following
|
||||
ideas:
|
||||
|
||||
- Ignoring "stop words" such as "a", "the", "and".
|
||||
- Stemming words, so that "pony" and "ponies" are considered similar.
|
||||
- Weighting words based on different criteria such as how frequently they
|
||||
appear in the text, or the importance of the fields, such as the title or
|
||||
keywords, that they appear in.
|
||||
|
||||
There are many alternatives for using searching software; some of the most
|
||||
prominent are Elastic_ and Solr_. These are full document-based search
|
||||
solutions. To use them with data from Django models, you'll need a layer which
|
||||
translates your data into a textual document, including back-references to the
|
||||
database ids. When a search using the engine returns a certain document, you
|
||||
can then look it up in the database. There are a variety of third-party
|
||||
libraries which are designed to help with this process.
|
||||
|
||||
.. _Elastic: https://www.elastic.co/
|
||||
.. _Solr: http://lucene.apache.org/solr/
|
||||
|
||||
PostgreSQL support
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
PostgreSQL has its own full text search implementation built-in. While not as
|
||||
powerful as some other search engines, it has the advantage of being inside
|
||||
your database and so can easily be combined with other relational queries such
|
||||
as categorization.
|
||||
|
||||
The :mod:`django.contrib.postgres` module provides some helpers to make these
|
||||
queries. For example, a simple query might be to select all the blog entries
|
||||
which mention "cheese"::
|
||||
|
||||
>>> Entry.objects.filter(body_text__search='cheese')
|
||||
[<Entry: Cheese on Toast recipes>, <Entry: Pizza recipes>]
|
||||
|
||||
You can also filter on a combination of fields and on related models::
|
||||
|
||||
>>> Entry.objects.annotate(
|
||||
... search=SearchVector('blog__tagline', 'body_text'),
|
||||
... ).filter(search='cheese')
|
||||
[
|
||||
<Entry: Cheese on Toast recipes>,
|
||||
<Entry: Pizza Recipes>,
|
||||
<Entry: Dairy farming in Argentina>,
|
||||
]
|
||||
|
||||
See the ``contrib.postgres`` :doc:`/ref/contrib/postgres/search` document for
|
||||
complete details.
|
|
@ -9,6 +9,7 @@ try:
|
|||
ArrayField, BigIntegerRangeField, DateRangeField, DateTimeRangeField,
|
||||
FloatRangeField, HStoreField, IntegerRangeField, JSONField,
|
||||
)
|
||||
from django.contrib.postgres.search import SearchVectorField
|
||||
except ImportError:
|
||||
class DummyArrayField(models.Field):
|
||||
def __init__(self, base_field, size=None, **kwargs):
|
||||
|
@ -30,3 +31,4 @@ except ImportError:
|
|||
HStoreField = models.Field
|
||||
IntegerRangeField = models.Field
|
||||
JSONField = models.Field
|
||||
SearchVectorField = models.Field
|
||||
|
|
|
@ -114,6 +114,40 @@ class Migration(migrations.Migration):
|
|||
options=None,
|
||||
bases=None,
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='Scene',
|
||||
fields=[
|
||||
('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),
|
||||
('scene', models.CharField(max_length=255)),
|
||||
('setting', models.CharField(max_length=255)),
|
||||
],
|
||||
options=None,
|
||||
bases=None,
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='Character',
|
||||
fields=[
|
||||
('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),
|
||||
('name', models.CharField(max_length=255)),
|
||||
],
|
||||
options=None,
|
||||
bases=None,
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='Line',
|
||||
fields=[
|
||||
('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),
|
||||
('scene', models.ForeignKey('postgres_tests.Scene', on_delete=models.SET_NULL)),
|
||||
('character', models.ForeignKey('postgres_tests.Character', on_delete=models.SET_NULL)),
|
||||
('dialogue', models.TextField(blank=True, null=True)),
|
||||
('dialogue_search_vector', SearchVectorField(blank=True, null=True)),
|
||||
('dialogue_config', models.CharField(max_length=100, blank=True, null=True)),
|
||||
],
|
||||
options={
|
||||
'required_db_vendor': 'postgresql',
|
||||
},
|
||||
bases=None,
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='AggregateTestModel',
|
||||
fields=[
|
||||
|
|
|
@ -3,6 +3,7 @@ from django.db import connection, models
|
|||
from .fields import (
|
||||
ArrayField, BigIntegerRangeField, DateRangeField, DateTimeRangeField,
|
||||
FloatRangeField, HStoreField, IntegerRangeField, JSONField,
|
||||
SearchVectorField,
|
||||
)
|
||||
|
||||
|
||||
|
@ -78,6 +79,37 @@ class CharFieldModel(models.Model):
|
|||
class TextFieldModel(models.Model):
|
||||
field = models.TextField()
|
||||
|
||||
def __str__(self):
|
||||
return self.field
|
||||
|
||||
|
||||
# Scene/Character/Line models are used to test full text search. They're
|
||||
# populated with content from Monty Python and the Holy Grail.
|
||||
class Scene(models.Model):
    """A scene, identified by its name and setting."""
    scene = models.CharField(max_length=255)
    setting = models.CharField(max_length=255)

    def __str__(self):
        label = self.scene
        return label
|
||||
|
||||
|
||||
class Character(models.Model):
    """A named character."""
    name = models.CharField(max_length=255)

    def __str__(self):
        label = self.name
        return label
|
||||
|
||||
|
||||
class Line(PostgreSQLModel):
    """A line of dialogue spoken by a character in a scene.

    The dialogue, its search vector and its config are all nullable.
    """
    scene = models.ForeignKey('Scene', models.CASCADE)
    character = models.ForeignKey('Character', models.CASCADE)
    dialogue = models.TextField(blank=True, null=True)
    dialogue_search_vector = SearchVectorField(blank=True, null=True)
    dialogue_config = models.CharField(max_length=100, blank=True, null=True)

    def __str__(self):
        # Falsy dialogue (None or '') is rendered as an empty string.
        if self.dialogue:
            return self.dialogue
        return ''
|
||||
|
||||
|
||||
class RangesModel(PostgreSQLModel):
|
||||
ints = IntegerRangeField(blank=True, null=True)
|
||||
|
|
|
@ -0,0 +1,269 @@
|
|||
"""
|
||||
Test PostgreSQL full text search.
|
||||
|
||||
These tests use dialogue from the 1975 film Monty Python and the Holy Grail.
|
||||
All text copyright Python (Monty) Pictures. Thanks to sacred-texts.com for the
|
||||
transcript.
|
||||
"""
|
||||
from unittest import skipIf
|
||||
|
||||
from django.contrib.postgres.search import (
|
||||
SearchQuery, SearchRank, SearchVector,
|
||||
)
|
||||
from django.db.models import F
|
||||
from django.test import ignore_warnings, modify_settings
|
||||
from django.utils import six
|
||||
from django.utils.deprecation import RemovedInDjango20Warning
|
||||
|
||||
from . import PostgreSQLTestCase
|
||||
from .models import Character, Line, Scene
|
||||
|
||||
|
||||
class GrailTestData(object):
    """
    Mixin providing Monty Python and the Holy Grail rows for the search tests.

    ``setUpTestData()`` stores the created objects on the class:

    * ``robin`` and ``witch_scene`` -- Scene rows.
    * ``minstrel`` -- the Character speaking every verse.
    * ``verses`` (also ``verse0``, ``verse1``, ``verse2``) -- the minstrel's
      Line rows, in creation order.
    * ``bedemir0``, ``bedemir1``, ``duck``, ``crowd``, ``witch`` -- Line rows
      of the witch scene; ``duck`` has no dialogue.
    * ``french`` -- a Line whose ``dialogue_config`` is ``'french'``.
    """

    # Dialogue of the three minstrel verses. Every fragment that is followed
    # by another one ends in a space, so implicit string concatenation never
    # fuses two words together.
    _VERSE_TEXTS = (
        (
            'Bravely bold Sir Robin, rode forth from Camelot. '
            'He was not afraid to die, o Brave Sir Robin. '
            'He was not at all afraid to be killed in nasty ways. '
            'Brave, brave, brave, brave Sir Robin!'
        ),
        (
            'He was not in the least bit scared to be mashed into a pulp, '
            'Or to have his eyes gouged out, and his elbows broken. '
            'To have his kneecaps split, and his body burned away, '
            'And his limbs all hacked and mangled, brave Sir Robin!'
        ),
        (
            'His head smashed in and his heart cut out, '
            'And his liver removed and his bowels unplugged, '
            'And his nostrils ripped and his bottom burned off, '
            'And his --'
        ),
    )

    @classmethod
    def setUpTestData(cls):
        """Create the scenes, characters and lines shared by the search tests."""
        # Scene 10: the minstrel's song about Sir Robin.
        cls.robin = Scene.objects.create(scene='Scene 10', setting='The dark forest of Ewing')
        cls.minstrel = Character.objects.create(name='Minstrel')
        cls.verses = [
            Line.objects.create(scene=cls.robin, character=cls.minstrel, dialogue=verse)
            for verse in cls._VERSE_TEXTS
        ]
        cls.verse0, cls.verse1, cls.verse2 = cls.verses

        # Scene 5: the witch trial.
        cls.witch_scene = Scene.objects.create(scene='Scene 5', setting="Sir Bedemir's Castle")
        bedemir = Character.objects.create(name='Bedemir')
        crowd = Character.objects.create(name='Crowd')
        witch = Character.objects.create(name='Witch')
        duck = Character.objects.create(name='Duck')

        cls.bedemir0 = Line.objects.create(
            scene=cls.witch_scene,
            character=bedemir,
            dialogue='We shall use my larger scales!',
            dialogue_config='english',
        )
        cls.bedemir1 = Line.objects.create(
            scene=cls.witch_scene,
            character=bedemir,
            dialogue='Right, remove the supports!',
            dialogue_config='english',
        )
        # The duck has no dialogue at all, which gives the tests a NULL value.
        cls.duck = Line.objects.create(scene=cls.witch_scene, character=duck, dialogue=None)
        cls.crowd = Line.objects.create(scene=cls.witch_scene, character=crowd, dialogue='A witch! A witch!')
        cls.witch = Line.objects.create(scene=cls.witch_scene, character=witch, dialogue="It's a fair cop.")

        # Scene 8: a single French line for the per-language configuration tests.
        trojan_rabbit = Scene.objects.create(
            scene='Scene 8',
            setting="The castle of Our Master Ruiz' de lu la Ramper",
        )
        guards = Character.objects.create(name='French Guards')
        cls.french = Line.objects.create(
            scene=trojan_rabbit,
            character=guards,
            dialogue='Oh. Un cadeau. Oui oui.',
            dialogue_config='french',
        )
|
||||
|
||||
|
||||
class ContribPostgresNotInstalledTests(PostgreSQLTestCase):
    """Checks the error raised by the ``search`` lookup in this configuration."""

    @skipIf(six.PY2, "This test fails occasionally and weirdly on python 2")
    @ignore_warnings(category=RemovedInDjango20Warning)
    def test_search_lookup_missing(self):
        # The message must tell the user which app to add to INSTALLED_APPS.
        expected_message = (
            "Add 'django.contrib.postgres' to settings.INSTALLED_APPS "
            "to use the search operator."
        )
        with self.assertRaisesMessage(NotImplementedError, expected_message):
            # Both building and evaluating the queryset happen inside the
            # context manager.
            queryset = Line.objects.filter(dialogue__search='elbows')
            list(queryset)
|
||||
|
||||
|
||||
@modify_settings(INSTALLED_APPS={'append': 'django.contrib.postgres'})
class SimpleSearchTest(GrailTestData, PostgreSQLTestCase):
    """Exercise the ``search`` lookup applied directly to the dialogue field."""

    def test_simple(self):
        # 'elbows' appears only in the second verse.
        term = 'elbows'
        matches = Line.objects.filter(dialogue__search=term)
        self.assertSequenceEqual(matches, [self.verse1])

    def test_non_exact_match(self):
        # The plural query term still finds the verse containing 'heart'.
        term = 'hearts'
        matches = Line.objects.filter(dialogue__search=term)
        self.assertSequenceEqual(matches, [self.verse2])

    def test_search_two_terms(self):
        # Both terms occur (as 'heart' and 'bowels') in the third verse.
        term = 'heart bowel'
        matches = Line.objects.filter(dialogue__search=term)
        self.assertSequenceEqual(matches, [self.verse2])

    def test_search_two_terms_with_partial_match(self):
        # 'Robin' is in several verses, 'killed' only in the first one; only
        # the verse containing both terms is expected.
        term = 'Robin killed'
        matches = Line.objects.filter(dialogue__search=term)
        self.assertSequenceEqual(matches, [self.verse0])
|
||||
|
||||
|
||||
@modify_settings(INSTALLED_APPS={'append': 'django.contrib.postgres'})
class SearchVectorFieldTest(GrailTestData, PostgreSQLTestCase):
    """Query against a ``SearchVectorField`` column filled in beforehand."""

    def test_existing_vector(self):
        # Populate the stored vector from the dialogue, then query it.
        stored_vector = SearchVector('dialogue')
        Line.objects.update(dialogue_search_vector=stored_vector)
        query = SearchQuery('Robin killed')
        matches = Line.objects.filter(dialogue_search_vector=query)
        self.assertSequenceEqual(matches, [self.verse0])

    def test_existing_vector_config_explicit(self):
        # The query names the 'french' configuration explicitly; the plural
        # 'cadeaux' is expected to find the line containing 'cadeau'.
        stored_vector = SearchVector('dialogue')
        Line.objects.update(dialogue_search_vector=stored_vector)
        query = SearchQuery('cadeaux', config='french')
        matches = Line.objects.filter(dialogue_search_vector=query)
        self.assertSequenceEqual(matches, [self.french])
|
||||
|
||||
|
||||
class MultipleFieldsTest(GrailTestData, PostgreSQLTestCase):
    """Search over a ``SearchVector`` annotation built from several fields."""

    def test_simple_on_dialogue(self):
        # The term is found in the dialogue part of the vector.
        searched = Line.objects.annotate(
            search=SearchVector('scene__setting', 'dialogue'),
        ).filter(search='elbows')
        self.assertSequenceEqual(searched, [self.verse1])

    def test_simple_on_scene(self):
        # The term is found in the related scene's setting, so every line of
        # that scene matches.
        searched = Line.objects.annotate(
            search=SearchVector('scene__setting', 'dialogue'),
        ).filter(search='Forest')
        self.assertSequenceEqual(searched, self.verses)

    def test_non_exact_match(self):
        searched = Line.objects.annotate(
            search=SearchVector('scene__setting', 'dialogue'),
        ).filter(search='heart')
        self.assertSequenceEqual(searched, [self.verse2])

    def test_search_two_terms(self):
        # One term comes from the dialogue, the other from the scene setting.
        searched = Line.objects.annotate(
            search=SearchVector('scene__setting', 'dialogue'),
        ).filter(search='heart forest')
        self.assertSequenceEqual(searched, [self.verse2])

    def test_terms_adjacent(self):
        # The character name on its own is searchable...
        searched = Line.objects.annotate(
            search=SearchVector('character__name', 'dialogue'),
        ).filter(search='minstrel')
        self.assertSequenceEqual(searched, self.verses)
        # ...but the end of one field must not fuse with the start of the
        # next: 'Minstrel' followed by 'Bravely bold ...' must not be indexed
        # as 'minstrelbravely'. The vector has to be built from the same two
        # fields as above for this assertion to be meaningful.
        searched = Line.objects.annotate(
            search=SearchVector('character__name', 'dialogue'),
        ).filter(search='minstrelbravely')
        self.assertSequenceEqual(searched, [])

    def test_search_with_null(self):
        # The duck's dialogue is NULL; its line must still match through the
        # scene setting ("Sir Bedemir's Castle").
        searched = Line.objects.annotate(
            search=SearchVector('scene__setting', 'dialogue'),
        ).filter(search='bedemir')
        self.assertEqual(set(searched), {self.bedemir0, self.bedemir1, self.crowd, self.witch, self.duck})

    def test_config_query_explicit(self):
        # Both the vector and the query name the 'french' configuration.
        searched = Line.objects.annotate(
            search=SearchVector('scene__setting', 'dialogue', config='french'),
        ).filter(search=SearchQuery('cadeaux', config='french'))
        self.assertSequenceEqual(searched, [self.french])

    def test_config_query_implicit(self):
        # Only the vector names the configuration; the query is a plain string.
        searched = Line.objects.annotate(
            search=SearchVector('scene__setting', 'dialogue', config='french'),
        ).filter(search='cadeaux')
        self.assertSequenceEqual(searched, [self.french])

    def test_config_from_field_explicit(self):
        # The configuration is read per row from the dialogue_config column,
        # for both the vector and the query.
        searched = Line.objects.annotate(
            search=SearchVector('scene__setting', 'dialogue', config=F('dialogue_config')),
        ).filter(search=SearchQuery('cadeaux', config=F('dialogue_config')))
        self.assertSequenceEqual(searched, [self.french])

    def test_config_from_field_implicit(self):
        # The per-row configuration is given for the vector only.
        searched = Line.objects.annotate(
            search=SearchVector('scene__setting', 'dialogue', config=F('dialogue_config')),
        ).filter(search='cadeaux')
        self.assertSequenceEqual(searched, [self.french])
|
||||
|
||||
|
||||
@modify_settings(INSTALLED_APPS={'append': 'django.contrib.postgres'})
class TestCombinations(GrailTestData, PostgreSQLTestCase):
    """Combine search vectors with ``+`` and search queries with ``&``, ``|``, ``~``."""

    def test_vector_add(self):
        # The term is found in the scene setting ("Sir Bedemir's Castle"),
        # so every line of the witch scene matches.
        searched = Line.objects.annotate(
            search=SearchVector('scene__setting') + SearchVector('character__name'),
        ).filter(search='bedemir')
        self.assertEqual(set(searched), {self.bedemir0, self.bedemir1, self.crowd, self.witch, self.duck})

    def test_vector_add_multi(self):
        # Same expectation with three vectors chained together.
        searched = Line.objects.annotate(
            search=(
                SearchVector('scene__setting') +
                SearchVector('character__name') +
                SearchVector('dialogue')
            ),
        ).filter(search='bedemir')
        self.assertEqual(set(searched), {self.bedemir0, self.bedemir1, self.crowd, self.witch, self.duck})

    def test_query_and(self):
        # Both queries must match: only one line mentions 'scales'.
        searched = Line.objects.annotate(
            search=SearchVector('scene__setting', 'dialogue'),
        ).filter(search=SearchQuery('bedemir') & SearchQuery('scales'))
        self.assertSequenceEqual(searched, [self.bedemir0])

    def test_query_or(self):
        # Either query may match. The results are compared as sets, so use
        # assertEqual: sets are not sequences, and assertSequenceEqual could
        # not index them to report a difference.
        searched = Line.objects.filter(dialogue__search=SearchQuery('kneecaps') | SearchQuery('nostrils'))
        self.assertEqual(set(searched), {self.verse1, self.verse2})

    def test_query_invert(self):
        # Among the minstrel's verses, keep those NOT mentioning 'kneecaps'.
        searched = Line.objects.filter(character=self.minstrel, dialogue__search=~SearchQuery('kneecaps'))
        self.assertEqual(set(searched), {self.verse0, self.verse2})
|
||||
|
||||
|
||||
@modify_settings(INSTALLED_APPS={'append': 'django.contrib.postgres'})
class TestRankingAndWeights(GrailTestData, PostgreSQLTestCase):
    """Order and filter lines by ``SearchRank``, with and without weights."""

    def test_ranking(self):
        # Ascending rank order is expected to list the verses in reverse.
        rank = SearchRank(SearchVector('dialogue'), SearchQuery('brave sir robin'))
        minstrel_lines = Line.objects.filter(character=self.minstrel)
        ranked = minstrel_lines.annotate(rank=rank).order_by('rank')
        self.assertSequenceEqual(ranked, [self.verse2, self.verse1, self.verse0])

    def test_rank_passing_untyped_args(self):
        # Same expectation as test_ranking, but SearchRank receives a plain
        # field name and a plain string.
        rank = SearchRank('dialogue', 'brave sir robin')
        minstrel_lines = Line.objects.filter(character=self.minstrel)
        ranked = minstrel_lines.annotate(rank=rank).order_by('rank')
        self.assertSequenceEqual(ranked, [self.verse2, self.verse1, self.verse0])

    def test_weights_in_vector(self):
        witch_query = SearchQuery('witch')
        witch_scene_lines = Line.objects.filter(scene=self.witch_scene)

        # Dialogue weighted 'A', character name weighted 'D': the crowd's
        # line is expected first.
        dialogue_first = SearchVector('dialogue', weight='A') + SearchVector('character__name', weight='D')
        top_two = witch_scene_lines.annotate(
            rank=SearchRank(dialogue_first, witch_query),
        ).order_by('-rank')[:2]
        self.assertSequenceEqual(top_two, [self.crowd, self.witch])

        # Swapping the two weights is expected to swap the two results.
        name_first = SearchVector('dialogue', weight='D') + SearchVector('character__name', weight='A')
        top_two = witch_scene_lines.annotate(
            rank=SearchRank(name_first, witch_query),
        ).order_by('-rank')[:2]
        self.assertSequenceEqual(top_two, [self.witch, self.crowd])

    def test_ranked_custom_weights(self):
        # Same vector as the second half of test_weights_in_vector, but with
        # custom rank weights. PostgreSQL documents the array order as
        # {D, C, B, A}, so 'D' counts fully and 'A' only half; the crowd's
        # line is expected first again.
        name_first = SearchVector('dialogue', weight='D') + SearchVector('character__name', weight='A')
        rank = SearchRank(name_first, SearchQuery('witch'), weights=[1, 0, 0, 0.5])
        witch_scene_lines = Line.objects.filter(scene=self.witch_scene)
        top_two = witch_scene_lines.annotate(rank=rank).order_by('-rank')[:2]
        self.assertSequenceEqual(top_two, [self.crowd, self.witch])

    def test_ranking_chaining(self):
        # The rank annotation can be used in a later filter(); only the first
        # verse is expected above the 0.3 threshold.
        rank = SearchRank(SearchVector('dialogue'), SearchQuery('brave sir robin'))
        minstrel_lines = Line.objects.filter(character=self.minstrel)
        well_ranked = minstrel_lines.annotate(rank=rank).filter(rank__gt=0.3)
        self.assertSequenceEqual(well_ranked, [self.verse0])
|
Loading…
Reference in New Issue