Skip to content

Commit

Permalink
Recherche sémantique avec elasticsearch (#988)
Browse files Browse the repository at this point in the history
* create env and conf for vector elasticsearch

* management command to put to elasticsearch

* method for vector search

* view to test semantic search quickly

* use cpu only for cc deployment

* embedding with openai api instead of HuggingFace

* add package tiktoken needed by OpenAIEmbedding

* increase number limit

* integrate semantic search to other

* fix merge conflicts in poetry deps

* organize settings, elasticsearch api and siae index/meta infos

* fix deps order

* limit semantic search for admin
  • Loading branch information
SebastienReuiller authored Jan 23, 2024
1 parent 1e55b0c commit 8d13724
Show file tree
Hide file tree
Showing 18 changed files with 1,125 additions and 79 deletions.
10 changes: 10 additions & 0 deletions config/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -876,3 +876,13 @@
OPENAI_API_BASE = env.str("OPENAI_API_BASE", "")
OPENAI_API_KEY = env.str("OPENAI_API_KEY", "")
OPENAI_MODEL = env.str("OPENAI_MODEL", "")


# ELASTICSEARCH
# ------------------------------------------------------------------------------
# Connection settings for the Elasticsearch cluster. Every value is read from
# the environment and falls back to the default given as second argument.
# Scheme, host and port are the pieces the cluster URL is assembled from.
ELASTICSEARCH_SCHEME = env.str("ELASTICSEARCH_SCHEME", "https")
ELASTICSEARCH_HOST = env.str("ELASTICSEARCH_HOST", "")
# Read as a string (not an int): it is only interpolated into the URL.
ELASTICSEARCH_PORT = env.str("ELASTICSEARCH_PORT", "443")
# Credentials used to authenticate against the cluster.
ELASTICSEARCH_USERNAME = env.str("ELASTICSEARCH_USERNAME", "")
ELASTICSEARCH_PASSWORD = env.str("ELASTICSEARCH_PASSWORD", "")
# Name of the index holding the Siae documents.
ELASTICSEARCH_INDEX_SIAES = env.str("ELASTICSEARCH_INDEX_SIAES", "")
7 changes: 7 additions & 0 deletions env.default.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,10 @@ export OPENAI_ORG=""
export OPENAI_API_BASE=""
export OPENAI_API_KEY=""
export OPENAI_MODEL=""

# ELASTICSEARCH
# ########################
# ELASTICSEARCH_SCHEME and ELASTICSEARCH_PORT may also be exported; the Django
# settings default them to "https" and "443".
export ELASTICSEARCH_HOST=""
export ELASTICSEARCH_USERNAME=""
export ELASTICSEARCH_PASSWORD=""
export ELASTICSEARCH_INDEX_SIAES=""
7 changes: 7 additions & 0 deletions env.docker_default.local
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,10 @@ OPENAI_ORG=
OPENAI_API_BASE=
OPENAI_API_KEY=
OPENAI_MODEL=

# ELASTICSEARCH
# ########################
# ELASTICSEARCH_SCHEME and ELASTICSEARCH_PORT may also be set; the Django
# settings default them to https and 443.
ELASTICSEARCH_HOST=
ELASTICSEARCH_USERNAME=
ELASTICSEARCH_PASSWORD=
ELASTICSEARCH_INDEX_SIAES=
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import time

from django.conf import settings
from django.db.models import TextField
from django.db.models.functions import Length
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import ElasticVectorSearch

from lemarche.siaes.models import Siae
from lemarche.utils.apis.api_elasticsearch import URL_WITH_USER
from lemarche.utils.commands import BaseCommand


class Command(BaseCommand):
    """Push Siae descriptions (and their offers) to the Elasticsearch vector index.

    Every Siae whose description is at least 10 characters long is embedded
    with OpenAI and stored, together with its metadata, in the index named by
    ``settings.ELASTICSEARCH_INDEX_SIAES``.
    """

    help = "Embed Siae descriptions and offers and index them in Elasticsearch"

    def handle(self, *args, **options):
        """Embed and index each eligible Siae, one document per Siae."""
        self.stdout_success("put siae to elasticsearch index started..")

        # Elasticsearch as a vector db: the store (and its client) is built once
        # and reused for every document.
        db = ElasticVectorSearch(
            embedding=OpenAIEmbeddings(),
            elasticsearch_url=URL_WITH_USER,
            index_name=settings.ELASTICSEARCH_INDEX_SIAES,
        )

        # Siaes with completed description (at least 10 characters)
        TextField.register_lookup(Length)
        # Offers are read by Siae.elasticsearch_index_text: prefetch them to avoid
        # extra queries per Siae.
        siaes = Siae.objects.filter(description__length__gt=9).prefetch_related("offers")

        for siae in siaes:
            # add_texts() indexes through the existing store; from_texts() is an
            # alternate constructor that would create a brand new store each time.
            db.add_texts(
                [siae.elasticsearch_index_text],
                metadatas=[siae.elasticsearch_index_metadata],
            )
            # One document per second - presumably to stay below the embedding
            # API rate limit (TODO confirm).
            time.sleep(1)
            self.stdout_success(f"{siae.name} added !")
14 changes: 14 additions & 0 deletions lemarche/siaes/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1134,6 +1134,20 @@ def latest_activity_at(self):
latest_activity_at = self.updated_at
return latest_activity_at

@property
def elasticsearch_index_text(self):
    """Return the text that gets embedded for this structure.

    The text is the description, followed - when the structure has offers -
    by a "Prestations" section listing each offer's name and description.
    """
    # Fetch the offers once instead of issuing a COUNT and then a SELECT.
    offers = list(self.offers.all())
    if not offers:
        return self.description
    offers_text = "".join(f"- {offer.name}:\n{offer.description}\n\n" for offer in offers)
    return self.description + "\n\nPrestations:\n" + offers_text

@property
def elasticsearch_index_metadata(self):
    """Return the metadata stored next to the embedded text: id, name and website.

    A missing or empty website is stored as an empty string.
    """
    website = self.website or ""
    return {"id": self.id, "name": self.name, "website": website}

def sectors_list_string(self, display_max=3):
sectors_name_list = self.sectors.form_filter_queryset().values_list("name", flat=True)
if display_max and len(sectors_name_list) > display_max:
Expand Down
22 changes: 22 additions & 0 deletions lemarche/static/img/spinner-bars.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
9 changes: 9 additions & 0 deletions lemarche/static/itou_marche/base/_custom.scss
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,12 @@ a.disabled {
.form-description-ckeditor .django-ckeditor-widget {
width: 100%;
}

// Request indicator: invisible by default, faded in while a request is running.
.htmx-indicator {
    opacity: 0;
    transition: opacity 500ms ease-in;
}

// The "htmx-request" class is set either on an ancestor of the indicator or,
// when hx-indicator targets the indicator element, on the indicator itself;
// both cases must reveal it.
.htmx-request .htmx-indicator,
.htmx-request.htmx-indicator {
    opacity: 1;
}
35 changes: 35 additions & 0 deletions lemarche/templates/siaes/_si_ideas_search_result.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
<div class="siae-info-sticky c-box bg-light mb-3">
<div class="si-ideas">
<h3 class="h4">Idées reçues</h3>
<p>
<span>
<i class="ri-check-fill ri-xl font-weight-bold"></i>
</span>
<span class="ml-2">
Le prestataire est trop petit pour répondre à mon besoin…
<b>Mais il est sûrement ouvert à la co-traitance.</b>
</span>
</p>
<p>
<span>
<i class="ri-check-fill ri-xl font-weight-bold"></i>
</span>
<span class="ml-2">
Son chiffre d'affaires est trop bas et je ne veux pas être
son seul client… <b>Mais vous pouvez commencer par lui confier
un marché de plus faible périmètre, sans prendre de risque,
puis faire grandir ce partenariat si vous en êtes satisfait.</b>
</span>
</p>
<p>
<span>
<i class="ri-check-fill ri-xl font-weight-bold"></i>
</span>
<span class="ml-2">
L'offre ne correspond pas exactement à ce que je cherche…
<b>Heureusement les entreprises sociales inclusives sont très
innovantes et s'adaptent à vos besoins.</b>
</span>
</p>
</div>
</div>
83 changes: 47 additions & 36 deletions lemarche/templates/siaes/search_results.html
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,22 @@
aria-controls="search-text"
aria-selected="false">
Recherche par SIRET / nom
</a>
</li>
{% if is_admin %}
<li class="nav-item" role="presentation">
<a class="nav-link"
id="search-semantic-tab"
data-toggle="tab"
href="#search-semantic"
role="tab"
aria-controls="search-semantic"
aria-selected="false">
Recherche sémantique
<span class="badge badge-sm badge-pill badge-important ml-2">Nouveauté</span>
</a>
</li>
{% endif %}
<li class="nav-item-dropdown dropdown">
<a class="nav-link dropdown-toggle"
href="#"
Expand Down Expand Up @@ -192,13 +205,45 @@
</div>
</form>
</div>
{% if is_admin %}
    {# Semantic search tab, rendered for admins only. The form is sent by htmx: the response replaces #searchResults and #spinner is shown meanwhile. #}
    <div class="tab-pane fade"
         id="search-semantic"
         role="tabpanel"
         aria-labelledby="search-semantic-tab">
        <form method="GET"
              hx-get="{% url 'siae:semantic_search_results' %}"
              hx-target="#searchResults"
              hx-indicator="#spinner"
              action=""
              id="text-semantic-search-form">
            {% bootstrap_form_errors form_semantic type="all" %}
            <div class="row">
                <div class="col-12 col-lg-8">
                    <div class="row">
                        <div class="col-12">{% bootstrap_field form_semantic.search_query %}</div>
                    </div>
                </div>
                <div class="col-12 col-lg-4">
                    <span class="mb-2 d-none d-md-inline-block">&nbsp;</span>
                    {# The id follows the form id so it stays unique on the page. #}
                    <button id="text-semantic-search-submit"
                            class="btn btn-primary btn-block btn-ico"
                            type="submit">
                        <span>Rechercher</span>
                        <i class="ri-search-line ri-lg"></i>
                    </button>
                </div>
            </div>
        </form>
    </div>
{% endif %}
</div>
</div>
</div>
</div>
</section>
<section class="s-siae-02">
<div class="container">
{# Purely visual loading indicator: the empty alt keeps it out of the accessibility tree. #}
<img id="spinner" class="htmx-indicator" src="{% static 'img/spinner-bars.svg' %}" alt=""/>
<div id="dir_list">
<div id="searchResults" class="row dir_list-row">
<div class="col-12 col-lg-8">
Expand Down Expand Up @@ -247,46 +292,12 @@ <h1 class="h4 mb-0">
<div id="map-siae-list" class="map-canvas"></div>
</div>
{% cms_advert layout="card" %}
<div class="siae-info-sticky c-box bg-light mb-3">
<div class="si-ideas">
<h3 class="h4">Idées reçues</h3>
<p>
<span>
<i class="ri-check-fill ri-xl font-weight-bold"></i>
</span>
<span class="ml-2">
Le prestataire est trop petit pour répondre à mon besoin…
<b>Mais il est sûrement ouvert à la co-traitance.</b>
</span>
</p>
<p>
<span>
<i class="ri-check-fill ri-xl font-weight-bold"></i>
</span>
<span class="ml-2">
Son chiffre d'affaires est trop bas et je ne veux pas être
son seul client… <b>Mais Vous pouvez commencer par lui confier
un marché de plus faible périmètre, sans prendre de risque,
puis faire grandir ce partenariat si vous en êtes satisfait.</b>
</span>
</p>
<p>
<span>
<i class="ri-check-fill ri-xl font-weight-bold"></i>
</span>
<span class="ml-2">
L'offre ne correspond pas exactement à ce que je cherche…
<b>Heureusement les entreprises sociales inclusives sont très
innovantes et s'adaptent à vos besoins.</b>
</span>
</p>
</div>
{% include "siaes/_si_ideas_search_result.html" %}
</div>
</div>
</div>
</div>
</div>
</section>
</section>
{% endblock %}
{% block modals %}
{% include "auth/_login_or_signup_modal.html" %}
Expand Down
31 changes: 31 additions & 0 deletions lemarche/templates/siaes/semantic_search_results.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{% load static bootstrap4 wagtailcore_tags advert_cms %}
{% comment %}
Fragment listing the `siaes` of a semantic search, with the same sidebar as the
regular results page. It has no `extends`, so the `content` block below is
rendered in place.
NOTE(review): presumably returned by the `siae:semantic_search_results` view and
swapped into #searchResults by htmx - confirm against the view.
NOTE(review): the tender nudge relies on `position_promote_tenders` and
`page_obj`; verify that the view provides them, otherwise the nudge is never shown.
{% endcomment %}
{% block content %}
<div class="col-12 col-lg-8">
<div class="c-box mb-3">
{% if siaes %}
{% for siae in siaes %}
{% include "siaes/_card_search_result.html" with siae=siae %}
<!-- insert to nudge tender creation -->
{% if forloop.counter in position_promote_tenders and page_obj.number == 1 %}
{% include "siaes/_card_suggest_tender.html" with current_perimeters=current_perimeters current_sectors=current_sectors %}
{% endif %}
{% endfor %}
{% else %}
<!-- no results -->
<p>Il y a encore de l'espoir ❤️</p>
<p>Publiez votre besoin, et on s'occupe de vous trouver des prestataires inclusifs.</p>
<p>Obtenez des réponses en moins de 24 heures (en moyenne).</p>
<a href="{% url 'tenders:create' %}"
id="siae-search-empty-demande"
class="btn btn-primary d-block d-md-inline-block mb-2">
<i class="ri-mail-send-line ri-lg mr-2"></i>Publier un besoin d'achat
</a>
{% endif %}
</div>
</div>
<!-- sidebar -->
<div class="col-12 col-lg-4 siae-info mt-6 mt-sm-0">
{% cms_advert layout="card" %}
{% include "siaes/_si_ideas_search_result.html" %}
</div>
{% endblock %}
35 changes: 35 additions & 0 deletions lemarche/utils/apis/api_elasticsearch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from urllib.parse import quote

from django.conf import settings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.elasticsearch import ElasticsearchStore


# "host:port" part shared by both URLs below.
BASE_URL = f"{settings.ELASTICSEARCH_HOST}:{settings.ELASTICSEARCH_PORT}"
# Credential-free URL, for clients that receive user and password separately.
URL = f"{settings.ELASTICSEARCH_SCHEME}://{BASE_URL}"
# URL with the credentials embedded in the userinfo part. They are
# percent-encoded so that characters such as "@", ":" or "/" in the username or
# password cannot corrupt the URL structure. The settings hold the raw values
# (they are also passed as-is to ElasticsearchStore), so encoding here cannot
# double-encode them.
URL_WITH_USER = (
    f"{settings.ELASTICSEARCH_SCHEME}://"
    f"{quote(settings.ELASTICSEARCH_USERNAME, safe='')}:{quote(settings.ELASTICSEARCH_PASSWORD, safe='')}"
    f"@{BASE_URL}"
)


def siaes_similarity_search(search_text, k=10):
    """Performs semantic search with Elasticsearch as a vector db

    Args:
        search_text (str): User search query
        k (int): Maximum number of similar documents to retrieve (default: 10)

    Returns:
        list: ids of the siaes that match the search query, in the order the
        vector store returned them; empty when the query is blank
    """
    # A blank query has nothing to embed: skip the embedding and Elasticsearch calls.
    if not search_text or not search_text.strip():
        return []

    db = ElasticsearchStore(
        embedding=OpenAIEmbeddings(),
        es_user=settings.ELASTICSEARCH_USERNAME,
        es_password=settings.ELASTICSEARCH_PASSWORD,
        es_url=URL,
        index_name=settings.ELASTICSEARCH_INDEX_SIAES,
    )

    similar_docs = db.similarity_search(search_text, k=k)
    # The siae id is stored in each document's metadata at indexing time.
    return [similar_doc.metadata["id"] for similar_doc in similar_docs]
14 changes: 14 additions & 0 deletions lemarche/www/siaes/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,3 +493,17 @@ def filter_queryset(self, qs=None):
qs = qs.distinct()

return qs


class SiaeSemanticForm(forms.Form):
    """Form holding the single free-text query of the semantic search."""

    search_query = forms.CharField(
        label="Recherche sémantique",
        required=False,
        help_text=(
            "Soyez le plus précis possible (Exemple: nettoyage des locaux d'entreprise"
            " / entretien des espaces verts)"
        ),
        widget=forms.TextInput(attrs={"placeholder": "Je cherche…"}),
    )
2 changes: 2 additions & 0 deletions lemarche/www/siaes/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
SiaeFavoriteView,
SiaeSearchResultsDownloadView,
SiaeSearchResultsView,
SiaeSemanticSearchResultsView,
)


Expand All @@ -13,6 +14,7 @@

urlpatterns = [
path("", SiaeSearchResultsView.as_view(), name="search_results"),
path("semantic/search/", SiaeSemanticSearchResultsView.as_view(), name="semantic_search_results"),
path("download/", SiaeSearchResultsDownloadView.as_view(), name="search_results_download"),
path("<str:slug>/", SiaeDetailView.as_view(), name="detail"),
path("<str:slug>/favoris/", SiaeFavoriteView.as_view(), name="favorite_lists"),
Expand Down
Loading

0 comments on commit 8d13724

Please sign in to comment.