Skip to content

Commit

Permalink
fix(community): Search index for roles (#1979)
Browse files Browse the repository at this point in the history
fix(community): Search index for roles (#1979)

No-Issue
  • Loading branch information
rochacbruno authored Nov 22, 2023
1 parent f168319 commit 5d53bbb
Show file tree
Hide file tree
Showing 5 changed files with 220 additions and 124 deletions.
166 changes: 77 additions & 89 deletions galaxy_ng/app/api/ui/views/search.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from django.contrib.postgres.aggregates import JSONBAgg
from django.contrib.postgres.search import SearchQuery, SearchVector
from django.contrib.postgres.search import SearchQuery
from django.db.models import (
Exists,
F,
Expand Down Expand Up @@ -42,8 +42,7 @@
SORT_PARAM = "order_by"
SORTABLE_FIELDS = ["name", "namespace_name", "download_count", "last_updated", "relevance"]
SORTABLE_FIELDS += [f"-{item}" for item in SORTABLE_FIELDS]
DEFAULT_SORT = "-download_count"
DEFAULT_SEARCH_TYPE = "sql" # websearch,sql
DEFAULT_SEARCH_TYPE = "websearch" # websearch,sql
QUERYSET_VALUES = [
"namespace_avatar",
"content_list",
Expand All @@ -61,67 +60,10 @@
"relevance",
]
RANK_NORMALIZATION = 32
EMPTY_QUERY = SearchQuery(Value(None))


class SearchListView(api_base.GenericViewSet, mixins.ListModelMixin):
"""Lists Search results for Collections + Roles.
Aggregates search from Collections and Roles in the same results set.
## filtering
- **search_type:** ["sql", "websearch"]
- **keywords:** string
- queried against name,namespace,description,tags,platform
- when search_type is websearch allows operators e.g: "this OR that AND (A OR B) -notthis"
- when search_type is sql performs a SQL ilike on the same fields
- **type:** ["collection", "role"]
- **deprecated:** boolean
- **name:** string (iexact query)
- **namespace:** string (iexact query)
- **tags:** string[] (allows multiple &tags=..&tags=..)
- **platform:** string
## Sorting
Sorting is performed by passing `order_by` parameter, optionally prefixed with `-` for DESC,
the allowed fields are:
- name
- namespace_name
- download_count
- last_updated
- relevance (only when search_type is websearch)
## Pagination
Pagination is based on `limit` and `offset` parameters.
## Results
Results are embedded in the pagination serializer including
`meta:count` and `links:first,previous,next,last`.
The `data` key contains the results in the format::
```python
{
"name": "brunogphmzthghu",
"namespace": "brunovrhvjkdh",
"description": "Lorem ipsum dolor sit amet, consectetur adipisicing elit.",
"type": "role",
"latest_version": "1.4.9",
"avatar_url": "https://github.com/brunogphmzthghu.png",
"contents": [],
"download_count": 9999,
"last_updated": "2023-11-09T15:17:01.235457Z",
"deprecated": false,
"tags": ["development", "java", "python"],
"platforms": [{"name": "Ubuntu", "versions": ["jammy", "focal"]}]
}
```
"""
"""Search collections and roles"""

permission_classes = [AllowAny]
serializer_class = SearchResultsSerializer
Expand All @@ -146,7 +88,63 @@ class SearchListView(api_base.GenericViewSet, mixins.ListModelMixin):
]
)
def list(self, *args, **kwargs):
"""Override the default method just to provide extended schema"""
"""Lists Search results for Collections + Roles.
Aggregates search from Collections and Roles in the same results set.
## filtering
- **search_type:** ["sql", "websearch"]
- **keywords:** string
- queried against name,namespace,description,tags,platform
- when search_type is websearch allows operators e.g: "this OR that AND (A OR B) -C"
- when search_type is sql performs a SQL ilike on the same fields
- **type:** ["collection", "role"]
- **deprecated:** boolean
- **name:** string (iexact query)
- **namespace:** string (iexact query)
- **tags:** string[] (allows multiple &tags=..&tags=..)
- **platform:** string
## Sorting
Sorting is performed by passing `order_by` parameter, optionally prefixed with `-` for DESC,
the allowed fields are:
- name
- namespace_name
- download_count
- last_updated
- relevance (only when search_type is websearch)
## Pagination
Pagination is based on `limit` and `offset` parameters.
## Results
Results are embedded in the pagination serializer including
`meta:count` and `links:first,previous,next,last`.
The `data` key contains the results in the format::
```python
{
"name": "brunogphmzthghu",
"namespace": "brunovrhvjkdh",
"description": "Lorem ipsum dolor sit amet, consectetur adipisicing elit.",
"type": "role",
"latest_version": "1.4.9",
"avatar_url": "https://github.com/brunogphmzthghu.png",
"contents": [],
"download_count": 9999,
"last_updated": "2023-11-09T15:17:01.235457Z",
"deprecated": false,
"tags": ["development", "java", "python"],
"platforms": [{"name": "Ubuntu", "versions": ["jammy", "focal"]}]
}
```
"""
return super().list(*args, **kwargs)

def get_queryset(self):
Expand Down Expand Up @@ -185,11 +183,15 @@ def get_filter_params(self, request):

def get_sorting_param(self, request):
    """Validate the requested sort order and return it as a list of fields.

    The ``order_by`` query parameter accepts one or more comma-separated
    field names from ``SORTABLE_FIELDS``, each optionally prefixed with
    ``-`` for descending order. Whitespace around each field is ignored.

    When ``order_by`` is absent the default depends on ``search_type``:
    ``-download_count,-last_updated`` for ``sql`` and
    ``-download_count,-relevance`` otherwise.

    Returns:
        list[str]: the validated sort fields, ready to be unpacked into
        ``QuerySet.order_by(*sort)``.

    Raises:
        ValidationError: if any field is not in ``SORTABLE_FIELDS``, or if
        relevance sorting is requested while ``search_type`` is not
        ``websearch``.
    """
    search_type = request.query_params.get("search_type", DEFAULT_SEARCH_TYPE)

    # Relevance sorting is only accepted for websearch (see the check below),
    # so the sql default uses last_updated as the secondary key instead.
    default_sort = "-download_count,-relevance"
    if search_type == "sql":
        default_sort = "-download_count,-last_updated"

    raw_sort = request.query_params.get(SORT_PARAM, default_sort)
    # Strip each item so "name, -download_count" validates like "name,-download_count".
    sort = [item.strip() for item in raw_sort.split(",")]

    for item in sort:
        if item not in SORTABLE_FIELDS:
            raise ValidationError(f"{SORT_PARAM} requires one of {SORTABLE_FIELDS}")

    # `sort` is a list, so both the ascending and descending spellings
    # must be checked explicitly.
    if ("relevance" in sort or "-relevance" in sort) and search_type != "websearch":
        raise ValidationError("'order_by=relevance' works only with 'search_type=websearch'")
    return sort

Expand All @@ -203,12 +205,10 @@ def get_collection_queryset(self, query=None):
)
namespace_qs = Namespace.objects.filter(name=OuterRef("namespace"))

vector = Value("")
relevance = Value(0)
if query:
vector = F("search_vector")
relevance = Func(
F("search"),
F("search_vector"),
query,
RANK_NORMALIZATION,
function="ts_rank",
Expand All @@ -231,7 +231,7 @@ def get_collection_queryset(self, query=None):
latest_version=F("version"),
content_list=F("contents"),
namespace_avatar=Subquery(namespace_qs.values("_avatar_url")),
search=vector,
search=F("search_vector"),
relevance=relevance,
)
.values(*QUERYSET_VALUES)
Expand All @@ -241,18 +241,8 @@ def get_collection_queryset(self, query=None):

def get_role_queryset(self, query=None):
"""Build the LegacyRole queryset from annotations."""
vector = Value("")
relevance = Value(0)
if query:
# TODO: Build search_vector field in the LegacyRole model and update via trigger or
# hook during import.
vector = (
SearchVector("name", weight="A")
+ SearchVector("namespace_name", weight="B")
+ SearchVector("description_text", weight="C")
+ SearchVector("tag_names", weight="D")
+ SearchVector("platform_names")
)
relevance = Func(
F("search"),
query,
Expand All @@ -272,8 +262,8 @@ def get_role_queryset(self, query=None):
download_count=Coalesce(F("legacyroledownloadcount__count"), Value(0)),
latest_version=KT("full_metadata__versions__-1__version"),
content_list=Value([], JSONField()), # There is no contents for roles
namespace_avatar=F("namespace__avatar_url"),
search=vector,
namespace_avatar=F("namespace__namespace___avatar_url"), # v3 namespace._avatar_url
search=F("legacyrolesearchvector__search_vector"),
relevance=relevance,
).values(*QUERYSET_VALUES)
return qs
Expand Down Expand Up @@ -319,11 +309,11 @@ def filter_and_sort(self, collections, roles, filter_params, sort, type="", quer
roles = roles.filter(query)

if type.lower() == "role":
qs = roles.order_by(sort)
qs = roles.order_by(*sort)
elif type.lower() == "collection":
qs = collections.order_by(sort)
qs = collections.order_by(*sort)
else:
qs = collections.union(roles, all=True).order_by(sort)
qs = collections.union(roles, all=True).order_by(*sort)
return qs


Expand All @@ -334,9 +324,7 @@ def test():
print()
print(f"{' START ':#^40}")
s = SearchListView()
data = s.get_search_results(
{"type": "", "search_type": "websearch", "keywords": "java web"}, sort="-relevance"
)
data = s.get_search_results({"type": "", "keywords": "java web"}, sort="-relevance")
print(f"{' SQLQUERY ':#^40}")
print(data._query)
print(f"{' COUNT ':#^40}")
Expand Down
15 changes: 15 additions & 0 deletions galaxy_ng/app/api/v1/models.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from django.db import models
from django.contrib.postgres.search import SearchVectorField
from django.contrib.postgres.indexes import GinIndex

from galaxy_ng.app.models import Namespace
from galaxy_ng.app.models.auth import User
Expand Down Expand Up @@ -185,6 +187,19 @@ class LegacyRoleDownloadCount(models.Model):
count = models.IntegerField(default=0)


class LegacyRoleSearchVector(models.Model):
    """Full-text search vector stored alongside a ``LegacyRole``.

    Holds one row per role (the role is both the one-to-one link and the
    primary key). The ``update_role_ts_vector`` database trigger installed by
    migration ``0047_update_role_search_vector_trigger`` upserts rows into
    ``galaxy_legacyrolesearchvector`` (presumably this model's table) whenever
    a ``galaxy_legacyrole`` row is inserted or updated.
    """

    # The owning role; deleting the role cascades to its search vector row.
    role = models.OneToOneField(
        LegacyRole,
        on_delete=models.CASCADE,
        primary_key=True,
    )
    # tsvector used for search; defaults to an empty vector.
    search_vector = SearchVectorField(default="")
    # auto_now refreshes this on each ORM save; the DB trigger also sets it
    # to current_timestamp when it rewrites the vector.
    modified = models.DateTimeField(auto_now=True)

    class Meta:
        # GIN index on the search vector column.
        indexes = (GinIndex(fields=["search_vector"]),)


class LegacyRoleImport(models.Model):
role = models.ForeignKey(
'LegacyRole',
Expand Down
38 changes: 38 additions & 0 deletions galaxy_ng/app/migrations/0046_legacyrolesearchvector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Generated by Django 4.2.7 on 2023-11-15 15:52

import django.contrib.postgres.indexes
import django.contrib.postgres.search
from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    """Create the ``LegacyRoleSearchVector`` model and its GIN index."""

    dependencies = [
        ("galaxy", "0045_setting"),
    ]

    operations = [
        migrations.CreateModel(
            name="LegacyRoleSearchVector",
            fields=[
                (
                    # One-to-one link to the role, doubling as the primary key.
                    "role",
                    models.OneToOneField(
                        on_delete=django.db.models.deletion.CASCADE,
                        primary_key=True,
                        serialize=False,
                        to="galaxy.legacyrole",
                    ),
                ),
                ("search_vector", django.contrib.postgres.search.SearchVectorField(default="")),
                ("modified", models.DateTimeField(auto_now=True)),
            ],
            options={
                # GIN index over the tsvector column, with an explicit generated name.
                "indexes": [
                    django.contrib.postgres.indexes.GinIndex(
                        fields=["search_vector"], name="galaxy_lega_search__13e661_gin"
                    )
                ],
            },
        ),
    ]
64 changes: 64 additions & 0 deletions galaxy_ng/app/migrations/0047_update_role_search_vector_trigger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Generated by Django 4.2.7 on 2023-11-15 15:55

from django.db import migrations

# No-op UPDATE over every galaxy_legacyrole row. It changes no data, but it
# fires the AFTER INSERT OR UPDATE row trigger defined in
# CREATE_ROLE_TS_VECTOR_TRIGGER, so a search vector row is written for each
# existing role.
REBUILD_ROLES_TS_VECTOR = """
UPDATE galaxy_legacyrole SET name = name;
"""

# Defines the plpgsql function update_role_ts_vector() and (re)creates the
# update_ts_vector row trigger on galaxy_legacyrole (AFTER INSERT OR UPDATE).
#
# The function builds a weighted tsvector from:
#   - the legacy namespace name (looked up by NEW.namespace_id)  -> weight A
#   - the role name                                              -> weight A
#   - full_metadata->'tags' (JSON cast to text)                  -> weight B
#   - full_metadata->'platforms' (JSON cast to text)             -> weight C
#   - full_metadata->>'description'                              -> weight D
# and upserts it into galaxy_legacyrolesearchvector keyed by role_id, setting
# modified to current_timestamp. Missing values are coalesced to ''.
CREATE_ROLE_TS_VECTOR_TRIGGER = """
CREATE OR REPLACE FUNCTION update_role_ts_vector()
RETURNS TRIGGER
AS $$
DECLARE
_search_vector tsvector;
_namespace text;
BEGIN
_namespace := (select name from galaxy_legacynamespace WHERE id = NEW.namespace_id);
_search_vector := ((((
setweight(to_tsvector(COALESCE(_namespace, '')), 'A')
|| setweight(to_tsvector(COALESCE(NEW."name", '')), 'A'))
|| setweight(to_tsvector(COALESCE(((NEW."full_metadata"->'tags'))::text, '')), 'B'))
|| setweight(to_tsvector(COALESCE(((NEW."full_metadata"->'platforms'))::text, '')), 'C'))
|| setweight(to_tsvector(COALESCE((NEW."full_metadata"->>'description'), '')), 'D'));
INSERT INTO galaxy_legacyrolesearchvector(role_id,search_vector,modified)
VALUES(new.id,_search_vector,current_timestamp)
ON CONFLICT (role_id)
DO UPDATE SET
search_vector = _search_vector, modified = current_timestamp;
RETURN NEW;
END;
$$
LANGUAGE plpgsql;
DROP TRIGGER IF EXISTS update_ts_vector ON galaxy_legacyrole;
CREATE TRIGGER update_ts_vector
AFTER INSERT OR UPDATE
ON galaxy_legacyrole
FOR EACH ROW
EXECUTE PROCEDURE update_role_ts_vector();
"""

# Reverse of CREATE_ROLE_TS_VECTOR_TRIGGER: drops the trigger first, then the
# function it executes. Both statements use IF EXISTS.
DROP_ROLE_TS_VECTOR_TRIGGER = """
DROP TRIGGER IF EXISTS update_ts_vector ON galaxy_legacyrole;
DROP FUNCTION IF EXISTS update_role_ts_vector();
"""


class Migration(migrations.Migration):
    """Install the role search-vector trigger and backfill existing roles.

    Operations run in order: the trigger must exist before the rebuild
    statement touches every role row, since the rebuild relies on the trigger
    firing to populate the search vectors.
    """

    dependencies = [
        ("galaxy", "0046_legacyrolesearchvector"),
    ]

    operations = [
        # Create the function + trigger; on reverse, drop both.
        migrations.RunSQL(
            sql=CREATE_ROLE_TS_VECTOR_TRIGGER,
            reverse_sql=DROP_ROLE_TS_VECTOR_TRIGGER,
        ),
        # Backfill by touching every role row; reversing this step does nothing.
        migrations.RunSQL(
            sql=REBUILD_ROLES_TS_VECTOR,
            reverse_sql=migrations.RunSQL.noop,
        )
    ]
Loading

0 comments on commit 5d53bbb

Please sign in to comment.