Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Recherche : prototype full text avec SearchVectorField #917

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lemarche/siaes/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ class SiaeAdmin(FieldsetsInlineMixin, gis_admin.OSMGeoAdmin):
"tender_email_link_click_count_annotated_with_link",
"tender_detail_display_count_annotated_with_link",
"tender_detail_contact_click_count_annotated_with_link",
"search_vector",
"logs_display",
"import_raw_object_display",
]
Expand Down Expand Up @@ -317,6 +318,7 @@ class SiaeAdmin(FieldsetsInlineMixin, gis_admin.OSMGeoAdmin):
)
},
),
("Recherche", {"classes": ["collapse"], "fields": ("search_vector",)}),
(
"Stats",
{
Expand Down
95 changes: 95 additions & 0 deletions lemarche/siaes/management/commands/set_search_vector_field.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
from django.contrib.postgres.search import SearchVector
from django.db import models
from django.db.models import Value

from lemarche.siaes.models import Siae
from lemarche.utils.commands import BaseCommand


# Number of Siae loaded per query, to keep memory usage bounded.
RANGE_STEP = 1000

# Siae attributes indexed without an explicit text-search configuration.
PLAIN_FIELDS = ("name", "brand", "siret", "city", "department", "region", "kind")

# Related managers whose object names are indexed with the "french" configuration.
RELATED_NAME_FIELDS = ("sectors", "offers", "labels")


class Command(BaseCommand):
    """
    Recompute and store the search_vector field of every Siae.

    Usage:
    - poetry run python manage.py set_search_vector_field
    """

    def handle(self, *args, **options):
        """Walk all Siae in batches of RANGE_STEP and save a fresh search_vector for each one."""
        self.stdout_info("-" * 80)
        self.stdout_info("Reseting search_vector field...")

        # Order by primary key: every slice below is a separate query, and
        # paginating over a non-unique ordering could skip or repeat rows.
        siaes = Siae.objects.prefetch_related("sectors", "offers", "labels").order_by("pk")

        # Counted here rather than at module level, so importing this module
        # never touches the database.
        siae_count = siaes.count()

        progress = 0
        for start in range(0, siae_count, RANGE_STEP):  # to avoid memory issues
            for siae in siaes[start : start + RANGE_STEP]:  # noqa
                siae.search_vector = self._build_search_vector(siae)
                siae.save(update_fields=["search_vector"])
                progress += 1
                if (progress % 500) == 0:
                    self.stdout_info(f"{progress}...")

    @staticmethod
    def _text_vector(text, config=None):
        """Wrap a Python string in a SearchVector, optionally with a text-search configuration."""
        return SearchVector(Value(text, output_field=models.CharField()), config=config)

    def _build_search_vector(self, siae):
        """Combine the SearchVectors of the Siae's own fields and of its related object names."""
        vectors = [self._text_vector(getattr(siae, field_name)) for field_name in PLAIN_FIELDS]
        vectors.append(self._text_vector(siae.description, config="french"))

        for relation in RELATED_NAME_FIELDS:
            # A related manager is always truthy, so test the joined names
            # instead of the manager itself.
            names = " ".join(str(related.name) for related in getattr(siae, relation).all())
            if names:
                vectors.append(self._text_vector(names, config="french"))

        combined = vectors[0]
        for vector in vectors[1:]:
            combined += vector
        return combined
18 changes: 18 additions & 0 deletions lemarche/siaes/migrations/0072_siae_search_vector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 4.2.2 on 2023-09-24 22:18

import django.contrib.postgres.search
from django.db import migrations


# Schema migration adding the "search_vector" field to the "siae" model.
class Migration(migrations.Migration):
    # Applied after migration 0071 of the "siaes" app.
    dependencies = [
        ("siaes", "0071_alter_siae_kind"),
    ]

    operations = [
        # New SearchVectorField column; null=True means the column accepts NULL.
        migrations.AddField(
            model_name="siae",
            name="search_vector",
            field=django.contrib.postgres.search.SearchVectorField(null=True, verbose_name="Search vector"),
        ),
    ]
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Generated by Django 4.2.2 on 2023-09-24 22:27

import django.contrib.postgres.indexes
from django.db import migrations


# Schema migration adding a GIN index on the "search_vector" field of the "siae" model.
class Migration(migrations.Migration):
    # Applied after the migration that creates the search_vector field.
    dependencies = [
        ("siaes", "0072_siae_search_vector"),
    ]

    operations = [
        # GIN index over the single search_vector column, with an explicit index name.
        migrations.AddIndex(
            model_name="siae",
            index=django.contrib.postgres.indexes.GinIndex(
                fields=["search_vector"], name="siaes_siae_search__200779_gin"
            ),
        ),
    ]
50 changes: 43 additions & 7 deletions lemarche/siaes/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
from django.contrib.gis.db import models as gis_models
from django.contrib.gis.db.models.functions import Distance
from django.contrib.gis.measure import D
from django.contrib.postgres.search import TrigramSimilarity # SearchVector
from django.contrib.postgres.indexes import GinIndex
from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector, SearchVectorField, TrigramSimilarity
from django.db import IntegrityError, models, transaction
from django.db.models import BooleanField, Case, CharField, Count, F, IntegerField, PositiveIntegerField, Q, Sum, When
from django.db.models.functions import Greatest, Round
Expand Down Expand Up @@ -187,17 +188,47 @@ def search_query_set(self):
def filter_siret_startswith(self, siret):
    """Return the rows whose ``siret`` starts with the given value."""
    lookup = {"siret__startswith": siret}
    return self.filter(**lookup)

def filter_full_text(self, full_text_string):
def filter_on_siret_or_name_or_brand(self, search_string):
# Simple method 1: SearchVectors
# return self.annotate(
# search=SearchVector("name", config="french") + SearchVector("brand", config="french")
# ).filter(Q(search=full_text_string) | Q(siret__startswith=full_text_string))
# ).filter(Q(search=search_string) | Q(siret__startswith=search_string))
# Simple method 2: TrigramSimilarity
return self.annotate(
similarity=Greatest(
TrigramSimilarity("name", full_text_string), TrigramSimilarity("brand", full_text_string)
)
).filter(Q(similarity__gt=0.2) | Q(siret__startswith=full_text_string))
similarity=Greatest(TrigramSimilarity("name", search_string), TrigramSimilarity("brand", search_string))
).filter(Q(similarity__gt=0.2) | Q(siret__startswith=search_string))

def filter_full_text(self, search_string):
    """
    Annotate each row with a full-text ``rank`` for ``search_string`` and keep
    the rows whose rank is at least 0.01.

    The search vector is computed at query time from the Siae's own columns
    and from the names of its related sectors, offers and labels.
    """
    # NOTE(review): the vector spans three many-to-many relations; this
    # presumably joins them and may return the same Siae several times.
    # Confirm whether callers need to deduplicate.
    default_config_fields = ("name", "brand", "siret", "city", "department", "region", "kind")
    french_config_fields = ("description", "sectors__name", "offers__name", "labels__name")

    vectors = [SearchVector(field) for field in default_config_fields]
    vectors += [SearchVector(field, config="french") for field in french_config_fields]

    # Chain the vectors left to right, in the order listed above.
    combined_vector = vectors[0]
    for vector in vectors[1:]:
        combined_vector = combined_vector + vector

    french_query = SearchQuery(search_string, config="french")
    ranked = self.annotate(rank=SearchRank(combined_vector, french_query))
    return ranked.filter(rank__gte=0.01)

def filter_full_text_on_search_vector_field(self, search_string):
    """
    Filter on the stored ``search_vector`` field and annotate each row with a
    full-text ``rank``, keeping the rows whose rank is at least 0.01.

    The query matches the whole ``search_string`` or, when it holds several
    words, any single word of it.
    """
    # SearchQuery uses 'AND' by default. Change to 'OR' (and add full search_string as default)
    filters = SearchQuery(search_string, config="french")

    # split() without a separator drops the empty strings that leading,
    # trailing or repeated spaces would otherwise add as empty search terms.
    search_terms = search_string.split()
    if len(search_terms) > 1:
        for search_term in search_terms:
            filters |= SearchQuery(search_term, config="french")

    return (
        self.filter(search_vector=filters)
        .annotate(rank=SearchRank(F("search_vector"), filters))
        .filter(rank__gte=0.01)
    )

def filter_sectors(self, sectors):
    """Return the rows linked to at least one of the given sectors."""
    lookup = {"sectors__in": sectors}
    return self.filter(**lookup)
Expand Down Expand Up @@ -814,6 +845,8 @@ class Siae(models.Model):
c1_last_sync_date = models.DateTimeField(blank=True, null=True)
c1_sync_skip = models.BooleanField(blank=False, null=False, default=False)

search_vector = SearchVectorField("Search vector", null=True)

# admin
notes = GenericRelation("notes.Note", related_query_name="siae")

Expand Down Expand Up @@ -863,6 +896,9 @@ class Siae(models.Model):
class Meta:
    # Human-readable singular and plural names of the model.
    verbose_name = "Structure"
    verbose_name_plural = "Structures"
    indexes = [
        # GIN index on the search_vector field.
        GinIndex(fields=["search_vector"]),
    ]
    # Default ordering: by name.
    ordering = ["name"]

def __str__(self):
Expand Down
Loading
Loading