Fixed #24938 -- Added PostgreSQL trigram support.

This commit is contained in:
Matthew Somerville 2015-06-05 17:37:48 +01:00 committed by Tim Graham
parent d7334b405f
commit 1962a96a30
11 changed files with 184 additions and 4 deletions

View File

@ -3,7 +3,7 @@ from django.db.backends.signals import connection_created
from django.db.models import CharField, TextField
from django.utils.translation import ugettext_lazy as _
from .lookups import SearchLookup, Unaccent
from .lookups import SearchLookup, TrigramSimilar, Unaccent
from .signals import register_hstore_handler
@ -17,3 +17,5 @@ class PostgresConfig(AppConfig):
TextField.register_lookup(Unaccent)
CharField.register_lookup(SearchLookup)
TextField.register_lookup(SearchLookup)
CharField.register_lookup(TrigramSimilar)
TextField.register_lookup(TrigramSimilar)

View File

@ -60,3 +60,8 @@ class SearchLookup(SearchVectorExact):
self.lhs = SearchVector(self.lhs)
lhs, lhs_params = super(SearchLookup, self).process_lhs(qn, connection)
return lhs, lhs_params
class TrigramSimilar(PostgresSimpleLookup):
    """The ``trigram_similar`` lookup, built on the ``%`` operator."""
    # NOTE(review): the percent sign is doubled, presumably so a single "%"
    # survives parameter interpolation of the final SQL -- confirm against
    # PostgresSimpleLookup.
    operator = '%%'
    lookup_name = 'trigram_similar'

View File

@ -40,3 +40,9 @@ class UnaccentExtension(CreateExtension):
def __init__(self):
self.name = 'unaccent'
class TrigramExtension(CreateExtension):
    """CreateExtension subclass preconfigured for the ``pg_trgm`` extension."""

    def __init__(self):
        # The extension name is fixed, so the constructor takes no arguments.
        self.name = 'pg_trgm'

View File

@ -185,3 +185,19 @@ class SearchRank(Func):
SearchVectorField.register_lookup(SearchVectorExact)
class TrigramBase(Func):
    """
    Common constructor for trigram expressions taking an expression and a
    string (or a second expression); the result is typed as a float.
    """

    def __init__(self, expression, string, **extra):
        # Objects exposing resolve_expression are passed through untouched;
        # anything else is treated as a literal and wrapped in Value().
        is_expression = hasattr(string, 'resolve_expression')
        comparison = string if is_expression else Value(string)
        super(TrigramBase, self).__init__(
            expression,
            comparison,
            output_field=FloatField(),
            **extra
        )
class TrigramSimilarity(TrigramBase):
    """Expression for the trigram similarity between its two arguments."""
    # SQL function name used when the expression is rendered.
    function = 'SIMILARITY'
class TrigramDistance(TrigramBase):
    """Expression for the trigram distance between its two arguments."""
    # No function name: the two arguments are joined by the ``<->`` operator
    # instead of being passed to a named SQL function.
    arg_joiner = ' <-> '
    function = ''

View File

@ -2,6 +2,32 @@
PostgreSQL specific lookups
===========================
Trigram similarity
==================
.. fieldlookup:: trigram_similar
.. versionadded:: 1.10
The ``trigram_similar`` lookup allows you to perform trigram lookups,
measuring the number of trigrams (three consecutive characters) shared, using a
dedicated PostgreSQL extension. A trigram lookup is given an expression and
returns results that have a similarity measurement greater than the current
similarity threshold.
To use it, add ``'django.contrib.postgres'`` to your :setting:`INSTALLED_APPS`
and activate the `pg_trgm extension
<http://www.postgresql.org/docs/current/interactive/pgtrgm.html>`_ on
PostgreSQL. You can install the extension using the
:class:`~django.contrib.postgres.operations.TrigramExtension` migration
operation.
The ``trigram_similar`` lookup can be used on
:class:`~django.db.models.CharField` and :class:`~django.db.models.TextField`::
>>> City.objects.filter(name__trigram_similar="Middlesborough")
[<City: Middlesbrough>]
``Unaccent``
============

View File

@ -27,6 +27,16 @@ the ``django.contrib.postgres.operations`` module.
which will install the ``hstore`` extension and also immediately set up the
connection to interpret hstore data.
``TrigramExtension``
====================
.. class:: TrigramExtension()
.. versionadded:: 1.10
A subclass of :class:`~django.contrib.postgres.operations.CreateExtension`
that installs the ``pg_trgm`` extension.
``UnaccentExtension``
=====================

View File

@ -189,3 +189,58 @@ if it were an annotated ``SearchVector``::
[<Entry: Cheese on Toast recipes>, <Entry: Pizza recipes>]
.. _PostgreSQL documentation: http://www.postgresql.org/docs/current/static/textsearch-features.html#TEXTSEARCH-UPDATE-TRIGGERS
Trigram similarity
==================
Another approach to searching is trigram similarity. A trigram is a group of
three consecutive characters. In addition to the :lookup:`trigram_similar`
lookup, you can use a couple of other expressions.
To use them, you need to activate the `pg_trgm extension
<http://www.postgresql.org/docs/current/interactive/pgtrgm.html>`_ on
PostgreSQL. You can install it using the
:class:`~django.contrib.postgres.operations.TrigramExtension` migration
operation.
``TrigramSimilarity``
---------------------
.. class:: TrigramSimilarity(expression, string, **extra)
.. versionadded:: 1.10
Accepts a field name or expression, and a string or expression. Returns the
trigram similarity between the two arguments.
Usage example::
>>> from django.contrib.postgres.search import TrigramSimilarity
>>> Author.objects.create(name='Katy Stevens')
>>> Author.objects.create(name='Stephen Keats')
>>> test = 'Katie Stephens'
>>> Author.objects.annotate(
... similarity=TrigramSimilarity('name', test),
... ).filter(similarity__gt=0.3).order_by('-similarity')
[<Author: Katy Stevens>, <Author: Stephen Keats>]
``TrigramDistance``
-------------------
.. class:: TrigramDistance(expression, string, **extra)
.. versionadded:: 1.10
Accepts a field name or expression, and a string or expression. Returns the
trigram distance between the two arguments.
Usage example::
>>> from django.contrib.postgres.search import TrigramDistance
>>> Author.objects.create(name='Katy Stevens')
>>> Author.objects.create(name='Stephen Keats')
>>> test = 'Katie Stephens'
>>> Author.objects.annotate(
... distance=TrigramDistance('name', test),
... ).filter(distance__lte=0.7).order_by('distance')
[<Author: Katy Stevens>, <Author: Stephen Keats>]

View File

@ -33,6 +33,10 @@ search engine. You can search across multiple fields in your relational
database, combine the searches with other lookups, use different language
configurations and weightings, and rank the results by relevance.
It also now includes trigram support, using the :lookup:`trigram_similar`
lookup, and the :class:`~django.contrib.postgres.search.TrigramSimilarity` and
:class:`~django.contrib.postgres.search.TrigramDistance` expressions.
Minor features
--------------

View File

@ -55,11 +55,12 @@ use :lookup:`unaccented comparison <unaccent>`::
This shows another issue, where we are matching against a different spelling of
the name. In this case we have an asymmetry though - a search for ``Helen``
will pick up ``Helena`` or ``Hélène``, but not the reverse. Another option
would be to use a trigram comparison, which compares sequences of letters.
would be to use a :lookup:`trigram_similar` comparison, which compares
sequences of letters.
For example::
>>> Author.objects.filter(name__unaccent__lower__trigram='Hélène')
>>> Author.objects.filter(name__unaccent__lower__trigram_similar='Hélène')
[<Author: Helen Mirren>, <Author: Hélène Joy>]
Now we have a different problem - the longer name of "Helena Bonham Carter"

View File

@ -5,12 +5,13 @@ from django.db import migrations
try:
from django.contrib.postgres.operations import (
CreateExtension, HStoreExtension, UnaccentExtension,
CreateExtension, HStoreExtension, TrigramExtension, UnaccentExtension,
)
except ImportError:
from django.test import mock
CreateExtension = mock.Mock()
HStoreExtension = mock.Mock()
TrigramExtension = mock.Mock()
UnaccentExtension = mock.Mock()
@ -21,5 +22,6 @@ class Migration(migrations.Migration):
# dash in its name.
CreateExtension('uuid-ossp'),
HStoreExtension(),
TrigramExtension(),
UnaccentExtension(),
]

View File

@ -0,0 +1,53 @@
from django.contrib.postgres.search import TrigramDistance, TrigramSimilarity
from django.test import modify_settings
from . import PostgreSQLTestCase
from .models import CharFieldModel, TextFieldModel
@modify_settings(INSTALLED_APPS={'append': 'django.contrib.postgres'})
class TrigramTest(PostgreSQLTestCase):
    """
    Tests for the ``trigram_similar`` lookup and the TrigramSimilarity /
    TrigramDistance expressions, run against ``Model``.
    """
    Model = CharFieldModel

    @classmethod
    def setUpTestData(cls):
        """Create the three rows every test in this class queries."""
        cls.Model.objects.bulk_create([
            cls.Model(field='Matthew'),
            cls.Model(field='Cat sat on mat.'),
            cls.Model(field='Dog sat on rug.'),
        ])

    def test_trigram_search(self):
        """A misspelled search term still matches via ``trigram_similar``."""
        self.assertQuerysetEqual(
            self.Model.objects.filter(field__trigram_similar='Mathew'),
            ['Matthew'],
            transform=lambda instance: instance.field,
        )

    def test_trigram_similarity(self):
        """Rows are annotated with, and ordered by, their similarity score."""
        search = 'Bat sat on cat.'
        self.assertQuerysetEqual(
            self.Model.objects.filter(
                field__trigram_similar=search,
            ).annotate(similarity=TrigramSimilarity('field', search)).order_by('-similarity'),
            [('Cat sat on mat.', 0.625), ('Dog sat on rug.', 0.333333)],
            # Round before comparing: the score is a single-precision float
            # and newer PostgreSQL versions return it with more digits
            # (e.g. 0.33333334), which would break an exact comparison.
            transform=lambda instance: (instance.field, round(instance.similarity, 6)),
            ordered=True,
        )

    def test_trigram_similarity_alternate(self):
        """Rows are annotated with, and ordered by, their trigram distance."""
        self.assertQuerysetEqual(
            self.Model.objects.annotate(
                distance=TrigramDistance('field', 'Bat sat on cat.'),
            ).filter(distance__lte=0.7).order_by('distance'),
            [('Cat sat on mat.', 0.375), ('Dog sat on rug.', 0.666667)],
            # Rounded for the same reason as in test_trigram_similarity.
            transform=lambda instance: (instance.field, round(instance.distance, 6)),
            ordered=True,
        )
class TrigramTextFieldTest(TrigramTest):
    """
    Run the inherited trigram tests with TextFieldModel; TextField is expected
    to behave the same as CharField for trigram lookups.
    """
    Model = TextFieldModel