Fixed #33788 -- Added TrigramStrictWordSimilarity() and TrigramStrictWordDistance() on PostgreSQL.

This commit is contained in:
Matt Brewer 2022-06-17 08:44:03 +01:00 committed by Mariusz Felisiak
parent 3ef37a5245
commit 8d160f154f
8 changed files with 130 additions and 5 deletions

View File

@ -636,6 +636,7 @@ answer newbie questions, and generally made Django that much better:
Mathieu Agopian <mathieu.agopian@gmail.com> Mathieu Agopian <mathieu.agopian@gmail.com>
Matías Bordese Matías Bordese
Matt Boersma <matt@sprout.org> Matt Boersma <matt@sprout.org>
Matt Brewer <matt.brewer693@gmail.com>
Matt Croydon <http://www.postneo.com/> Matt Croydon <http://www.postneo.com/>
Matt Deacalion Stevens <matt@dirtymonkey.co.uk> Matt Deacalion Stevens <matt@dirtymonkey.co.uk>
Matt Dennenbaum Matt Dennenbaum

View File

@ -11,7 +11,13 @@ from django.db.models.indexes import IndexExpression
from django.utils.translation import gettext_lazy as _ from django.utils.translation import gettext_lazy as _
from .indexes import OpClass from .indexes import OpClass
from .lookups import SearchLookup, TrigramSimilar, TrigramWordSimilar, Unaccent from .lookups import (
SearchLookup,
TrigramSimilar,
TrigramStrictWordSimilar,
TrigramWordSimilar,
Unaccent,
)
from .serializers import RangeSerializer from .serializers import RangeSerializer
from .signals import register_type_handlers from .signals import register_type_handlers
@ -37,6 +43,8 @@ def uninstall_if_needed(setting, value, enter, **kwargs):
TextField._unregister_lookup(TrigramSimilar) TextField._unregister_lookup(TrigramSimilar)
CharField._unregister_lookup(TrigramWordSimilar) CharField._unregister_lookup(TrigramWordSimilar)
TextField._unregister_lookup(TrigramWordSimilar) TextField._unregister_lookup(TrigramWordSimilar)
CharField._unregister_lookup(TrigramStrictWordSimilar)
TextField._unregister_lookup(TrigramStrictWordSimilar)
# Disconnect this receiver until the next time this app is installed # Disconnect this receiver until the next time this app is installed
# and ready() connects it again to prevent unnecessary processing on # and ready() connects it again to prevent unnecessary processing on
# each setting change. # each setting change.
@ -73,5 +81,7 @@ class PostgresConfig(AppConfig):
TextField.register_lookup(TrigramSimilar) TextField.register_lookup(TrigramSimilar)
CharField.register_lookup(TrigramWordSimilar) CharField.register_lookup(TrigramWordSimilar)
TextField.register_lookup(TrigramWordSimilar) TextField.register_lookup(TrigramWordSimilar)
CharField.register_lookup(TrigramStrictWordSimilar)
TextField.register_lookup(TrigramStrictWordSimilar)
MigrationWriter.register_serializer(RANGE_TYPES, RangeSerializer) MigrationWriter.register_serializer(RANGE_TYPES, RangeSerializer)
IndexExpression.register_wrappers(OrderBy, OpClass, Collate) IndexExpression.register_wrappers(OrderBy, OpClass, Collate)

View File

@ -63,3 +63,8 @@ class TrigramSimilar(PostgresOperatorLookup):
class TrigramWordSimilar(PostgresOperatorLookup): class TrigramWordSimilar(PostgresOperatorLookup):
lookup_name = "trigram_word_similar" lookup_name = "trigram_word_similar"
postgres_operator = "%%>" postgres_operator = "%%>"
class TrigramStrictWordSimilar(PostgresOperatorLookup):
    """Lookup named ``trigram_strict_word_similar``.

    Declares ``%%>>`` as its PostgreSQL operator string.
    """

    # NOTE(review): the doubled ``%`` is presumably an escaped literal percent
    # so the emitted SQL contains pg_trgm's ``%>>`` operator -- confirm
    # against PostgresOperatorLookup.
    postgres_operator = "%%>>"
    lookup_name = "trigram_strict_word_similar"

View File

@ -366,5 +366,14 @@ class TrigramWordDistance(TrigramWordBase):
arg_joiner = " <<-> " arg_joiner = " <<-> "
class TrigramStrictWordDistance(TrigramWordBase):
    """Trigram strict word distance between two arguments.

    The arguments are joined with the ``<<<->`` operator rather than being
    passed to a named SQL function.
    """

    # NOTE(review): ``function`` is empty, so presumably only the
    # operator-joined arguments are emitted -- confirm against TrigramWordBase.
    arg_joiner = " <<<-> "
    function = ""
class TrigramWordSimilarity(TrigramWordBase): class TrigramWordSimilarity(TrigramWordBase):
function = "WORD_SIMILARITY" function = "WORD_SIMILARITY"
class TrigramStrictWordSimilarity(TrigramWordBase):
    """Trigram strict word similarity between two arguments.

    Uses the ``STRICT_WORD_SIMILARITY`` SQL function name.
    """

    function = "STRICT_WORD_SIMILARITY"

View File

@ -7,6 +7,9 @@ Trigram similarity
.. fieldlookup:: trigram_similar .. fieldlookup:: trigram_similar
``trigram_similar``
-------------------
The ``trigram_similar`` lookup allows you to perform trigram lookups, The ``trigram_similar`` lookup allows you to perform trigram lookups,
measuring the number of trigrams (three consecutive characters) shared, using a measuring the number of trigrams (three consecutive characters) shared, using a
dedicated PostgreSQL extension. A trigram lookup is given an expression and dedicated PostgreSQL extension. A trigram lookup is given an expression and
@ -27,6 +30,9 @@ The ``trigram_similar`` lookup can be used on
.. fieldlookup:: trigram_word_similar .. fieldlookup:: trigram_word_similar
``trigram_word_similar``
------------------------
The ``trigram_word_similar`` lookup allows you to perform trigram word The ``trigram_word_similar`` lookup allows you to perform trigram word
similarity lookups using a dedicated PostgreSQL extension. It can be similarity lookups using a dedicated PostgreSQL extension. It can be
approximately understood as measuring the greatest number of trigrams shared approximately understood as measuring the greatest number of trigrams shared
@ -46,6 +52,25 @@ The ``trigram_word_similar`` lookup can be used on
>>> Sentence.objects.filter(name__trigram_word_similar='Middlesborough') >>> Sentence.objects.filter(name__trigram_word_similar='Middlesborough')
['<Sentence: Gumby rides on the path of Middlesbrough>'] ['<Sentence: Gumby rides on the path of Middlesbrough>']
.. fieldlookup:: trigram_strict_word_similar
``trigram_strict_word_similar``
-------------------------------
.. versionadded:: 4.2
Similar to :lookup:`trigram_word_similar`, except that it forces extent
boundaries to match word boundaries.
To use it, add ``'django.contrib.postgres'`` to your :setting:`INSTALLED_APPS`
and activate the `pg_trgm extension`_ on PostgreSQL. You can install the
extension using the
:class:`~django.contrib.postgres.operations.TrigramExtension` migration
operation.
The ``trigram_strict_word_similar`` lookup can be used on
:class:`~django.db.models.CharField` and :class:`~django.db.models.TextField`.
.. _`pg_trgm extension`: https://www.postgresql.org/docs/current/pgtrgm.html .. _`pg_trgm extension`: https://www.postgresql.org/docs/current/pgtrgm.html
``Unaccent`` ``Unaccent``

View File

@ -286,9 +286,9 @@ Trigram similarity
================== ==================
Another approach to searching is trigram similarity. A trigram is a group of Another approach to searching is trigram similarity. A trigram is a group of
three consecutive characters. In addition to the :lookup:`trigram_similar` and three consecutive characters. In addition to the :lookup:`trigram_similar`,
:lookup:`trigram_word_similar` lookups, you can use a couple of other :lookup:`trigram_word_similar`, and :lookup:`trigram_strict_word_similar`
expressions. lookups, you can use a couple of other expressions.
To use them, you need to activate the `pg_trgm extension To use them, you need to activate the `pg_trgm extension
<https://www.postgresql.org/docs/current/pgtrgm.html>`_ on PostgreSQL. You can <https://www.postgresql.org/docs/current/pgtrgm.html>`_ on PostgreSQL. You can
@ -334,6 +334,18 @@ Usage example::
... ).filter(similarity__gt=0.3).order_by('-similarity') ... ).filter(similarity__gt=0.3).order_by('-similarity')
[<Author: Katy Stevens>] [<Author: Katy Stevens>]
``TrigramStrictWordSimilarity``
-------------------------------
.. class:: TrigramStrictWordSimilarity(string, expression, **extra)
.. versionadded:: 4.2
Accepts a string or expression, and a field name or expression. Returns the
trigram strict word similarity between the two arguments. Similar to
:class:`TrigramWordSimilarity() <TrigramWordSimilarity>`, except that it forces
extent boundaries to match word boundaries.
``TrigramDistance`` ``TrigramDistance``
------------------- -------------------
@ -371,3 +383,13 @@ Usage example::
... distance=TrigramWordDistance(test, 'name'), ... distance=TrigramWordDistance(test, 'name'),
... ).filter(distance__lte=0.7).order_by('distance') ... ).filter(distance__lte=0.7).order_by('distance')
[<Author: Katy Stevens>] [<Author: Katy Stevens>]
``TrigramStrictWordDistance``
-----------------------------
.. class:: TrigramStrictWordDistance(string, expression, **extra)
.. versionadded:: 4.2
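Accepts a string or expression, and a field name or expression. Returns the
trigram strict word distance between the two arguments. Similar to
:class:`TrigramWordDistance() <TrigramWordDistance>`, except that it forces
extent boundaries to match word boundaries.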

View File

@ -65,7 +65,12 @@ Minor features
:mod:`django.contrib.postgres` :mod:`django.contrib.postgres`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* ... * The new :lookup:`trigram_strict_word_similar` lookup, and the
:class:`TrigramStrictWordSimilarity()
<django.contrib.postgres.search.TrigramStrictWordSimilarity>` and
:class:`TrigramStrictWordDistance()
<django.contrib.postgres.search.TrigramStrictWordDistance>` expressions allow
using trigram strict word similarity.
:mod:`django.contrib.redirects` :mod:`django.contrib.redirects`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@ -7,6 +7,8 @@ try:
from django.contrib.postgres.search import ( from django.contrib.postgres.search import (
TrigramDistance, TrigramDistance,
TrigramSimilarity, TrigramSimilarity,
TrigramStrictWordDistance,
TrigramStrictWordSimilarity,
TrigramWordDistance, TrigramWordDistance,
TrigramWordSimilarity, TrigramWordSimilarity,
) )
@ -43,6 +45,25 @@ class TrigramTest(PostgreSQLTestCase):
self.Model.objects.filter(field__trigram_word_similar="Middlesborough"), self.Model.objects.filter(field__trigram_word_similar="Middlesborough"),
[obj], [obj],
) )
self.assertSequenceEqual(
self.Model.objects.filter(field__trigram_word_similar="Middle"),
[obj],
)
def test_trigram_strict_word_search_matched(self):
    """Check which search terms the strict word-similar lookup matches.

    A misspelling of a whole word in the stored sentence is expected to
    match; a term covering only the beginning of that word is expected to
    return no rows.
    """
    sentence = self.Model.objects.create(
        field="Gumby rides on the path of Middlesbrough",
    )
    # (search term, rows the lookup is expected to return), checked in order.
    expectations = (
        ("Middlesborough", [sentence]),
        ("Middle", []),
    )
    for term, expected_rows in expectations:
        self.assertSequenceEqual(
            self.Model.objects.filter(field__trigram_strict_word_similar=term),
            expected_rows,
        )
def test_trigram_similarity(self): def test_trigram_similarity(self):
search = "Bat sat on cat." search = "Bat sat on cat."
@ -75,6 +96,19 @@ class TrigramTest(PostgreSQLTestCase):
], ],
) )
def test_trigram_strict_word_similarity(self):
    """Annotate rows with TrigramStrictWordSimilarity, highest value first."""
    search = "matt"
    # Rows are selected with the non-strict ``trigram_word_similar`` lookup;
    # only the annotation uses the strict variant.
    annotated = (
        self.Model.objects.filter(field__trigram_word_similar=search)
        .annotate(word_similarity=TrigramStrictWordSimilarity(search, "field"))
        .values("field", "word_similarity")
        .order_by("-word_similarity")
    )
    expected = [
        {"field": "Cat sat on mat.", "word_similarity": 0.5},
        {"field": "Matthew", "word_similarity": 0.44444445},
    ]
    self.assertSequenceEqual(annotated, expected)
def test_trigram_similarity_alternate(self): def test_trigram_similarity_alternate(self):
# Round result of distance because PostgreSQL uses greater precision. # Round result of distance because PostgreSQL uses greater precision.
self.assertQuerysetEqual( self.assertQuerysetEqual(
@ -104,6 +138,20 @@ class TrigramTest(PostgreSQLTestCase):
], ],
) )
def test_trigram_strict_word_distance(self):
    """Annotate rows with TrigramStrictWordDistance, keep those <= 0.7, nearest first."""
    with_distance = self.Model.objects.annotate(
        word_distance=TrigramStrictWordDistance("matt", "field"),
    )
    nearest_first = (
        with_distance.filter(word_distance__lte=0.7)
        .values("field", "word_distance")
        .order_by("word_distance")
    )
    expected = [
        {"field": "Cat sat on mat.", "word_distance": 0.5},
        {"field": "Matthew", "word_distance": 0.5555556},
    ]
    self.assertSequenceEqual(nearest_first, expected)
class TrigramTextFieldTest(TrigramTest): class TrigramTextFieldTest(TrigramTest):
""" """