fix(community): Search index for roles (#1979)

fix(community): Search index for roles (#1979) No-Issue
ansible · Nov 22, 2023 · 5d53bbb · 5d53bbb
1 parent f168319
commit 5d53bbb
Show file tree

Hide file tree

Showing 5 changed files with 220 additions and 124 deletions.
diff --git a/galaxy_ng/app/api/ui/views/search.py b/galaxy_ng/app/api/ui/views/search.py
@@ -1,5 +1,5 @@
 from django.contrib.postgres.aggregates import JSONBAgg
-from django.contrib.postgres.search import SearchQuery, SearchVector
+from django.contrib.postgres.search import SearchQuery
 from django.db.models import (
     Exists,
     F,
@@ -42,8 +42,7 @@
 SORT_PARAM = "order_by"
 SORTABLE_FIELDS = ["name", "namespace_name", "download_count", "last_updated", "relevance"]
 SORTABLE_FIELDS += [f"-{item}" for item in SORTABLE_FIELDS]
-DEFAULT_SORT = "-download_count"
-DEFAULT_SEARCH_TYPE = "sql"  # websearch,sql
+DEFAULT_SEARCH_TYPE = "websearch"  # websearch,sql
 QUERYSET_VALUES = [
     "namespace_avatar",
     "content_list",
@@ -61,67 +60,10 @@
     "relevance",
 ]
 RANK_NORMALIZATION = 32
-EMPTY_QUERY = SearchQuery(Value(None))
 
 
 class SearchListView(api_base.GenericViewSet, mixins.ListModelMixin):
-    """Lists Search results for Collections + Roles.
-    Aggregates search from Collections and Roles in the same results set.
-
-
-    ## filtering
-
-    - **search_type:** ["sql", "websearch"]
-    - **keywords:** string
-        - queried against name,namespace,description,tags,platform
-        - when search_type is websearch allows operators e.g: "this OR that AND (A OR B) -notthis"
-        - when search_type is sql performs a SQL ilike on the same fields
-    - **type:** ["collection", "role"]
-    - **deprecated:** boolean
-    - **name:** string (iexact query)
-    - **namespace:** string (iexact query)
-    - **tags:** string[] (allows multiple &tags=..&tags=..)
-    - **platform:** string
-
-    ## Sorting
-
-    Sorting is performed by passing `order_by` parameter, optionally prefixed with `-` for DESC,
-    the allowed fields are:
-
-    - name
-    - namespace_name
-    - download_count
-    - last_updated
-    - relevance (only when search_type is websearch)
-
-    ## Pagination
-
-    Pagination is based on `limit` and `offset` parameters.
-
-    ## Results
-
-    Results are embedded in the pagination serializer including
-    `meta:count` and `links:first,previous,next,last`.
-
-    The `data` key contains the results in the format::
-
-    ```python
-    {
-      "name": "brunogphmzthghu",
-      "namespace": "brunovrhvjkdh",
-      "description": "Lorem ipsum dolor sit amet, consectetur adipisicing elit.",
-      "type": "role",
-      "latest_version": "1.4.9",
-      "avatar_url": "https://github.com/brunogphmzthghu.png,
-      "contents": [],
-      "download_count": 9999,
-      "last_updated": "2023-11-09T15:17:01.235457Z",
-      "deprecated": false,
-      "tags": ["development", "java", "python"],
-      "platforms": [{"name": "Ubuntu", "versions": ["jammy", "focal"]}]
-    }
-    ```
-    """
+    """Search collections and roles"""
 
     permission_classes = [AllowAny]
     serializer_class = SearchResultsSerializer
@@ -146,7 +88,63 @@ class SearchListView(api_base.GenericViewSet, mixins.ListModelMixin):
         ]
     )
     def list(self, *args, **kwargs):
-        """Override the default method just to provide extended schema"""
+        """Lists Search results for Collections + Roles.
+        Aggregates search from Collections and Roles in the same results set.
+
+
+        ## filtering
+
+        - **search_type:** ["sql", "websearch"]
+        - **keywords:** string
+            - queried against name,namespace,description,tags,platform
+            - when search_type is websearch allows operators e.g: "this OR that AND (A OR B) -C"
+            - when search_type is sql performs a SQL ilike on the same fields
+        - **type:** ["collection", "role"]
+        - **deprecated:** boolean
+        - **name:** string (iexact query)
+        - **namespace:** string (iexact query)
+        - **tags:** string[] (allows multiple &tags=..&tags=..)
+        - **platform:** string
+
+        ## Sorting
+
+        Sorting is performed by passing the `order_by` parameter: one or more comma-separated
+        fields, each optionally prefixed with `-` for DESC. The allowed fields are:
+
+        - name
+        - namespace_name
+        - download_count
+        - last_updated
+        - relevance (only when search_type is websearch)
+
+        ## Pagination
+
+        Pagination is based on `limit` and `offset` parameters.
+
+        ## Results
+
+        Results are embedded in the pagination serializer including
+        `meta:count` and `links:first,previous,next,last`.
+
+        The `data` key contains the results in the format:
+
+        ```python
+        {
+          "name": "brunogphmzthghu",
+          "namespace": "brunovrhvjkdh",
+          "description": "Lorem ipsum dolor sit amet, consectetur adipisicing elit.",
+          "type": "role",
+          "latest_version": "1.4.9",
+          "avatar_url": "https://github.com/brunogphmzthghu.png",
+          "contents": [],
+          "download_count": 9999,
+          "last_updated": "2023-11-09T15:17:01.235457Z",
+          "deprecated": false,
+          "tags": ["development", "java", "python"],
+          "platforms": [{"name": "Ubuntu", "versions": ["jammy", "focal"]}]
+        }
+        ```
+        """
         return super().list(*args, **kwargs)
 
     def get_queryset(self):
@@ -185,11 +183,15 @@ def get_filter_params(self, request):
 
     def get_sorting_param(self, request):
         """Validates the sorting parameter is valid."""
-        sort = request.query_params.get(SORT_PARAM, DEFAULT_SORT)
-        if sort not in SORTABLE_FIELDS:
-            raise ValidationError(f"{SORT_PARAM} requires one of {SORTABLE_FIELDS}")
-        search_type = request.query_params.get("search_type", "sql")
-        if "relevance" in sort and search_type != "websearch":
+        search_type = request.query_params.get("search_type", DEFAULT_SEARCH_TYPE)
+        default_sort = "-download_count,-relevance"
+        if search_type == "sql":
+            default_sort = "-download_count,-last_updated"
+        sort = request.query_params.get(SORT_PARAM, default_sort).split(",")
+        for item in sort:
+            if item not in SORTABLE_FIELDS:
+                raise ValidationError(f"{SORT_PARAM} requires one of {SORTABLE_FIELDS}")
+        if ("relevance" in sort or "-relevance" in sort) and search_type != "websearch":
             raise ValidationError("'order_by=relevance' works only with 'search_type=websearch'")
         return sort
 
@@ -203,12 +205,10 @@ def get_collection_queryset(self, query=None):
         )
         namespace_qs = Namespace.objects.filter(name=OuterRef("namespace"))
 
-        vector = Value("")
         relevance = Value(0)
         if query:
-            vector = F("search_vector")
             relevance = Func(
-                F("search"),
+                F("search_vector"),
                 query,
                 RANK_NORMALIZATION,
                 function="ts_rank",
@@ -231,7 +231,7 @@ def get_collection_queryset(self, query=None):
                 latest_version=F("version"),
                 content_list=F("contents"),
                 namespace_avatar=Subquery(namespace_qs.values("_avatar_url")),
-                search=vector,
+                search=F("search_vector"),
                 relevance=relevance,
             )
             .values(*QUERYSET_VALUES)
@@ -241,18 +241,8 @@ def get_collection_queryset(self, query=None):
 
     def get_role_queryset(self, query=None):
         """Build the LegacyRole queryset from annotations."""
-        vector = Value("")
         relevance = Value(0)
         if query:
-            # TODO: Build search_vector field in the LegacyRole model and update via trigger or
-            # hook during import.
-            vector = (
-                SearchVector("name", weight="A")
-                + SearchVector("namespace_name", weight="B")
-                + SearchVector("description_text", weight="C")
-                + SearchVector("tag_names", weight="D")
-                + SearchVector("platform_names")
-            )
             relevance = Func(
                 F("search"),
                 query,
@@ -272,8 +262,8 @@ def get_role_queryset(self, query=None):
             download_count=Coalesce(F("legacyroledownloadcount__count"), Value(0)),
             latest_version=KT("full_metadata__versions__-1__version"),
             content_list=Value([], JSONField()),  # There is no contents for roles
-            namespace_avatar=F("namespace__avatar_url"),
-            search=vector,
+            namespace_avatar=F("namespace__namespace___avatar_url"),  # v3 namespace._avatar_url
+            search=F("legacyrolesearchvector__search_vector"),
             relevance=relevance,
         ).values(*QUERYSET_VALUES)
         return qs
@@ -319,11 +309,11 @@ def filter_and_sort(self, collections, roles, filter_params, sort, type="", quer
             roles = roles.filter(query)
 
         if type.lower() == "role":
-            qs = roles.order_by(sort)
+            qs = roles.order_by(*sort)
         elif type.lower() == "collection":
-            qs = collections.order_by(sort)
+            qs = collections.order_by(*sort)
         else:
-            qs = collections.union(roles, all=True).order_by(sort)
+            qs = collections.union(roles, all=True).order_by(*sort)
         return qs
 
 
@@ -334,9 +324,7 @@ def test():
     print()
     print(f"{' START ':#^40}")
     s = SearchListView()
-    data = s.get_search_results(
-        {"type": "", "search_type": "websearch", "keywords": "java web"}, sort="-relevance"
-    )
+    data = s.get_search_results({"type": "", "keywords": "java web"}, sort="-relevance")
     print(f"{' SQLQUERY ':#^40}")
     print(data._query)
     print(f"{' COUNT ':#^40}")

diff --git a/galaxy_ng/app/api/v1/models.py b/galaxy_ng/app/api/v1/models.py
@@ -1,4 +1,6 @@
 from django.db import models
+from django.contrib.postgres.search import SearchVectorField
+from django.contrib.postgres.indexes import GinIndex
 
 from galaxy_ng.app.models import Namespace
 from galaxy_ng.app.models.auth import User
@@ -185,6 +187,19 @@ class LegacyRoleDownloadCount(models.Model):
     count = models.IntegerField(default=0)
 
 
+class LegacyRoleSearchVector(models.Model):
+    role = models.OneToOneField(
+        LegacyRole,
+        on_delete=models.CASCADE,
+        primary_key=True,
+    )
+    search_vector = SearchVectorField(default="")
+    modified = models.DateTimeField(auto_now=True)
+
+    class Meta:
+        indexes = (GinIndex(fields=["search_vector"]),)
+
+
 class LegacyRoleImport(models.Model):
     role = models.ForeignKey(
         'LegacyRole',

diff --git a/galaxy_ng/app/migrations/0046_legacyrolesearchvector.py b/galaxy_ng/app/migrations/0046_legacyrolesearchvector.py
@@ -0,0 +1,38 @@
+# Generated by Django 4.2.7 on 2023-11-15 15:52
+
+import django.contrib.postgres.indexes
+import django.contrib.postgres.search
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("galaxy", "0045_setting"),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name="LegacyRoleSearchVector",
+            fields=[
+                (
+                    "role",
+                    models.OneToOneField(
+                        on_delete=django.db.models.deletion.CASCADE,
+                        primary_key=True,
+                        serialize=False,
+                        to="galaxy.legacyrole",
+                    ),
+                ),
+                ("search_vector", django.contrib.postgres.search.SearchVectorField(default="")),
+                ("modified", models.DateTimeField(auto_now=True)),
+            ],
+            options={
+                "indexes": [
+                    django.contrib.postgres.indexes.GinIndex(
+                        fields=["search_vector"], name="galaxy_lega_search__13e661_gin"
+                    )
+                ],
+            },
+        ),
+    ]
diff --git a/galaxy_ng/app/migrations/0047_update_role_search_vector_trigger.py b/galaxy_ng/app/migrations/0047_update_role_search_vector_trigger.py
@@ -0,0 +1,64 @@
+# Generated by Django 4.2.7 on 2023-11-15 15:55
+
+from django.db import migrations
+
+REBUILD_ROLES_TS_VECTOR = """
+UPDATE galaxy_legacyrole SET name = name;
+"""
+
+CREATE_ROLE_TS_VECTOR_TRIGGER = """
+CREATE OR REPLACE FUNCTION update_role_ts_vector()
+    RETURNS TRIGGER
+    AS $$
+DECLARE
+    _search_vector tsvector;
+    _namespace text;
+BEGIN
+    _namespace := (select name from galaxy_legacynamespace WHERE id = NEW.namespace_id);
+    _search_vector := ((((
+        setweight(to_tsvector(COALESCE(_namespace, '')), 'A')
+        || setweight(to_tsvector(COALESCE(NEW."name", '')), 'A'))
+        || setweight(to_tsvector(COALESCE(((NEW."full_metadata"->'tags'))::text, '')), 'B')) 
+        || setweight(to_tsvector(COALESCE(((NEW."full_metadata"->'platforms'))::text, '')), 'C')) 
+        || setweight(to_tsvector(COALESCE((NEW."full_metadata"->>'description'), '')), 'D'));
+
+    INSERT INTO galaxy_legacyrolesearchvector(role_id,search_vector,modified)
+        VALUES(new.id,_search_vector,current_timestamp)
+    ON CONFLICT (role_id)
+        DO UPDATE SET
+            search_vector = _search_vector, modified = current_timestamp;
+    RETURN NEW;
+END;
+$$
+LANGUAGE plpgsql;
+
+DROP TRIGGER IF EXISTS update_ts_vector ON galaxy_legacyrole;
+
+CREATE TRIGGER update_ts_vector
+    AFTER INSERT OR UPDATE
+    ON galaxy_legacyrole
+    FOR EACH ROW
+EXECUTE PROCEDURE update_role_ts_vector();
+"""
+
+DROP_ROLE_TS_VECTOR_TRIGGER = """
+DROP TRIGGER IF EXISTS update_ts_vector ON galaxy_legacyrole;
+DROP FUNCTION IF EXISTS update_role_ts_vector();
+"""
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("galaxy", "0046_legacyrolesearchvector"),
+    ]
+
+    operations = [
+        migrations.RunSQL(
+            sql=CREATE_ROLE_TS_VECTOR_TRIGGER,
+            reverse_sql=DROP_ROLE_TS_VECTOR_TRIGGER,
+        ),
+        migrations.RunSQL(
+            sql=REBUILD_ROLES_TS_VECTOR,
+            reverse_sql=migrations.RunSQL.noop,
+        )
+    ]