50 bug wrong response type for glom (#51)
* #50 bug wrong response type for glom

WIP: Reproducing bug

* #50 bug wrong response type for glom

WIP: Reproducing bug

* #50 bug wrong response type for glom

Refactoring glom usage

* #50 bug wrong response type for glom

Refactoring glom usage

* #50 bug wrong response type for glom

Refactoring naming

Added description

* #50 bug wrong response type for glom

Refactoring naming

Forcing glom to return empty strings instead of none for missing description

* #50 bug wrong response type for glom

Refactoring enum unions

* #50 bug wrong response type for glom

Refactoring CollectionAttributes

* #50 bug wrong response type for glom

Refactoring CollectionAttributes

* #50 bug wrong response type for glom

Refactoring and cleanup, removing outdated logic and code

Adding documentation

* #50 bug wrong response type for glom

Cleanup
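
The response-type issue the message describes can be sketched as follows. This is a hypothetical illustration (not part of the commit) of why the glom `Coalesce` defaults matter: with `default=None`, a missing description leaks `None` into a `str`-typed response field, while `default=""` keeps the declared type.

```python
from glom import Coalesce, glom

# Hypothetical Elasticsearch hit without a description property.
hit = {"properties": {"cclom:title": "Physik"}}

# Before: a None default leaks None into a str-typed response field.
buggy = {"description": Coalesce("properties.cclom:general_description", default=None)}
print(glom(hit, buggy))  # {'description': None}

# After: forcing an empty string keeps the response type intact.
fixed = {"description": Coalesce("properties.cclom:general_description", default="")}
print(glom(hit, fixed))  # {'description': ''}
```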
RobertMeissner authored Jun 22, 2022
1 parent 9b413a9 commit 2b33ccc
Showing 13 changed files with 323 additions and 100 deletions.
37 changes: 33 additions & 4 deletions src/app/api/api.py
@@ -16,7 +16,11 @@
CollectionTreeCount,
collection_counts,
)
from app.api.collections.models import CollectionNode
from app.api.collections.missing_attributes import (
collections_with_missing_attributes,
missing_attribute_filter,
)
from app.api.collections.models import CollectionNode, MissingMaterials
from app.api.collections.tree import collection_tree
from app.api.quality_matrix.collections import collection_quality
from app.api.quality_matrix.models import ColumnOutputModel, Forms, Timeline
@@ -57,6 +61,7 @@ def get_database(request: Request) -> Database:
"""

TAG_STATISTICS = "Statistics"
_TAG_COLLECTIONS = "Collections"


def node_ids_for_major_collections(
@@ -164,7 +169,7 @@ async def get_timestamps(
response_model=ScoreOutput,
status_code=HTTP_200_OK,
responses={HTTP_404_NOT_FOUND: {"description": "Collection not found"}},
tags=[TAG_STATISTICS],
tags=[_TAG_COLLECTIONS],
description=f"""Returns the average ratio of non-empty properties for the chosen collection.
For certain properties, e.g. `properties.cclom:title`, the ratio of
elements which miss this entry compared to the total number of entries is calculated.
@@ -222,7 +227,7 @@ async def ping_api():
response_model=list[CollectionNode],
status_code=HTTP_200_OK,
responses={HTTP_404_NOT_FOUND: {"description": "Collection not found"}},
tags=["Collections"],
tags=[_TAG_COLLECTIONS],
)
async def get_collection_tree(
*, node_id: UUID = Depends(node_ids_for_major_collections)
@@ -236,7 +241,7 @@ async def get_collection_tree(
response_model=list[CollectionTreeCount],
status_code=HTTP_200_OK,
responses={HTTP_404_NOT_FOUND: {"description": "Collection not found"}},
tags=["Collections"],
tags=[_TAG_COLLECTIONS],
)
async def get_collection_counts(
*,
@@ -248,3 +253,27 @@
):
counts = await collection_counts(node_id=node_id, facet=facet)
return counts


@router.get(
"/collections/{node_id}/pending-subcollections/{missing_attribute}",
response_model=list[MissingMaterials],
response_model_exclude_unset=True,
status_code=HTTP_200_OK,
responses={HTTP_404_NOT_FOUND: {"description": "Collection not found"}},
tags=[_TAG_COLLECTIONS],
description="""A list of missing entries for different types of materials by subcollection.
Searches for entries with one of the following properties being empty or missing: """
+ f"{', '.join([entry.value for entry in missing_attribute_filter])}.",
)
async def filter_collections_with_missing_attributes(
*,
noderef_id: UUID = Depends(node_ids_for_major_collections),
missing_attribute: str = Path(
...,
examples={
form.name: {"value": form.value} for form in missing_attribute_filter
},
),
):
return await collections_with_missing_attributes(noderef_id, missing_attribute)
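
For context, a hedged usage sketch of the endpoint added above, assuming a locally running instance and made-up identifiers (host, collection id, and attribute path are placeholders):

```python
import httpx

# Hypothetical values: adjust host, collection id, and attribute path to your deployment.
BASE_URL = "http://localhost:8000"
node_id = "4940d5da-9b21-4ec0-8824-d16e0409e629"
missing_attribute = "properties.cclom:general_description"  # must be one of missing_attribute_filter

response = httpx.get(
    f"{BASE_URL}/collections/{node_id}/pending-subcollections/{missing_attribute}",
    timeout=30,
)
response.raise_for_status()
for collection in response.json():
    print(collection["noderef_id"], collection["title"])
```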
87 changes: 87 additions & 0 deletions src/app/api/collections/missing_attributes.py
@@ -0,0 +1,87 @@
from __future__ import annotations

from typing import Optional
from uuid import UUID

from elasticsearch_dsl.query import Q
from glom import Coalesce, Iter

from app.api.collections.models import MissingMaterials
from app.api.collections.utils import map_elastic_response_to_model
from app.core.config import ELASTIC_TOTAL_SIZE
from app.elastic.dsl import qbool, qmatch
from app.elastic.elastic import ResourceType, type_filter
from app.elastic.search import Search
from app.models import CollectionAttribute, ElasticResourceAttribute

missing_attribute_filter = [
CollectionAttribute.TITLE,
ElasticResourceAttribute.NAME,
ElasticResourceAttribute.KEYWORDS,
CollectionAttribute.DESCRIPTION,
]


all_source_fields: list = [
ElasticResourceAttribute.NODEREF_ID,
ElasticResourceAttribute.TYPE,
ElasticResourceAttribute.NAME,
CollectionAttribute.TITLE,
ElasticResourceAttribute.KEYWORDS,
CollectionAttribute.DESCRIPTION,
CollectionAttribute.PATH,
CollectionAttribute.PARENT_ID,
]

missing_attributes_spec = {
"title": Coalesce(CollectionAttribute.TITLE.path, default=""),
"keywords": (
Coalesce(ElasticResourceAttribute.KEYWORDS.path, default=[]),
Iter().all(),
),
"description": Coalesce(CollectionAttribute.DESCRIPTION.path, default=""),
"path": (
Coalesce(CollectionAttribute.PATH.path, default=[]),
Iter().all(),
),
"parent_id": Coalesce(CollectionAttribute.PARENT_ID.path, default=""),
"noderef_id": Coalesce(CollectionAttribute.NODE_ID.path, default=""),
"name": Coalesce(ElasticResourceAttribute.NAME.path, default=""),
"type": Coalesce(ElasticResourceAttribute.TYPE.path, default=""),
"children": Coalesce("", default=[]), # workaround to map easier to pydantic model
}


def missing_attributes_search(
noderef_id: UUID, missing_attribute: str, max_hits: int
) -> Search:
query = {
"filter": [*type_filter[ResourceType.COLLECTION]],
"minimum_should_match": 1,
"should": [
qmatch(**{"path": noderef_id}),
qmatch(**{"nodeRef.id": noderef_id}),
],
"must_not": Q("wildcard", **{missing_attribute: {"value": "*"}}),
}

return (
Search()
.base_filters()
.query(qbool(**query))
.source(includes=[source.path for source in all_source_fields])[:max_hits]
)


async def collections_with_missing_attributes(
noderef_id: UUID,
missing_attribute: str,
max_hits: Optional[int] = ELASTIC_TOTAL_SIZE,
) -> list[MissingMaterials]:
search = missing_attributes_search(noderef_id, missing_attribute, max_hits)

response = search.execute()
if response.success():
return map_elastic_response_to_model(
response, missing_attributes_spec, MissingMaterials
)
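
To see what the spec above produces, here is a small stand-alone sketch with literal attribute paths (hypothetical stand-ins for the real `.path` values). Missing attributes coalesce to empty strings or lists instead of `None`, which is the point of the fix:

```python
from glom import Coalesce, Iter, glom

# Literal paths standing in for CollectionAttribute/ElasticResourceAttribute .path values.
spec = {
    "title": Coalesce("properties.cm:title", default=""),
    "keywords": (Coalesce("properties.cclom:general_keyword", default=[]), Iter().all()),
    "description": Coalesce("properties.cm:description", default=""),
    "children": Coalesce("", default=[]),  # same workaround as above: the path never matches, so always []
}

hit = {"properties": {"cm:title": "Mathematik", "cclom:general_keyword": ["algebra"]}}
print(glom(hit, spec))
# {'title': 'Mathematik', 'keywords': ['algebra'], 'description': '', 'children': []}
```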
18 changes: 18 additions & 0 deletions src/app/api/collections/models.py
@@ -13,3 +13,21 @@ class CollectionNode(BaseModel):
title: Optional[str] # might be none due to data model
children: list[CollectionNode]
parent_id: Optional[UUID]


class MissingMaterials(CollectionNode):
"""
A model containing information about entries which miss, e.g, a description.
By returning this model the editors know enough about the entry to find and correct it
:param
description: a free text description of the context
path: the complete id path, i.e., from parent node id up to the root id of elastic search
type: Indicates the type of content, must be ccm:map in the current implementation
"""

keywords: list[str]
description: str
path: list[str]
type: str
name: str
53 changes: 21 additions & 32 deletions src/app/api/collections/tree.py
@@ -2,14 +2,15 @@

from aiohttp import ClientSession
from elasticsearch_dsl.response import Response
from glom import Coalesce, Iter, glom
from glom import Coalesce, Iter

from app.api.collections.models import CollectionNode
from app.api.collections.utils import map_elastic_response_to_model
from app.api.collections.vocabs import tree_from_vocabs
from app.core.config import ELASTIC_TOTAL_SIZE
from app.elastic.dsl import qbool, qterm
from app.elastic.search import Search
from app.models import CollectionAttribute
from app.models import CollectionAttribute, ElasticResourceAttribute


def build_portal_tree(collections: list, root_noderef_id: UUID) -> list[CollectionNode]:
@@ -47,42 +48,30 @@ def tree_search(node_id: UUID) -> Search:
return s


def hits_to_collection(hits: Response) -> list[CollectionNode]:
collections = []
for hit in hits:
entry = hit.to_dict()
spec = {
"title": Coalesce(CollectionAttribute.TITLE.path, default=None),
"keywords": (
Coalesce(CollectionAttribute.KEYWORDS.path, default=[]),
Iter().all(),
),
"description": Coalesce(CollectionAttribute.DESCRIPTION.path, default=None),
"path": (
Coalesce(CollectionAttribute.PATH.path, default=[]),
Iter().all(),
),
"parent_id": Coalesce(CollectionAttribute.PARENT_ID.path, default=None),
"node_id": Coalesce(CollectionAttribute.NODE_ID.path, default=None),
}
parsed_entry = glom(entry, spec)
if parsed_entry["title"] is not None:
collections.append(
CollectionNode(
noderef_id=parsed_entry["node_id"],
title=parsed_entry["title"],
children=[],
parent_id=parsed_entry["parent_id"],
)
)
return collections
collection_spec = {
"title": Coalesce(CollectionAttribute.TITLE.path, default=None),
"keywords": (
Coalesce(ElasticResourceAttribute.KEYWORDS.path, default=[]),
Iter().all(),
),
"description": Coalesce(CollectionAttribute.DESCRIPTION.path, default=None),
"path": (
Coalesce(CollectionAttribute.PATH.path, default=[]),
Iter().all(),
),
"parent_id": Coalesce(CollectionAttribute.PARENT_ID.path, default=None),
"noderef_id": Coalesce(CollectionAttribute.NODE_ID.path, default=None),
"children": Coalesce("", default=[]),
}


def tree_from_elastic(node_id: UUID) -> list[CollectionNode]:
response: Response = tree_search(node_id).execute()

if response.success():
collection = hits_to_collection(response)
collection = map_elastic_response_to_model(
response, collection_spec, CollectionNode
)
return build_portal_tree(collection, node_id)


14 changes: 14 additions & 0 deletions src/app/api/collections/utils.py
@@ -0,0 +1,14 @@
from typing import Generic, TypeVar

from elasticsearch_dsl.response import Response
from glom import glom

from app.api.collections.models import CollectionNode, MissingMaterials

T = TypeVar("T", CollectionNode, MissingMaterials)


def map_elastic_response_to_model(
response: Response, specs: dict, model: Generic[T]
) -> list[T]:
return [model(**glom(hit.to_dict(), specs)) for hit in response]
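
A minimal illustration of the new helper, using a stand-in hit object and a hypothetical spec so the mapping is easy to follow (the real specs live in tree.py and missing_attributes.py):

```python
from uuid import uuid4

from glom import Coalesce

from app.api.collections.models import CollectionNode
from app.api.collections.utils import map_elastic_response_to_model


class FakeHit:
    """Stand-in exposing the minimal to_dict() interface the helper relies on."""

    def __init__(self, data: dict):
        self._data = data

    def to_dict(self) -> dict:
        return self._data


# Hypothetical spec with literal paths.
spec = {
    "noderef_id": Coalesce("nodeRef.id", default=None),
    "title": Coalesce("properties.cm:title", default=None),
    "parent_id": Coalesce("parentRef.id", default=None),
    "children": Coalesce("", default=[]),
}

hits = [FakeHit({"nodeRef": {"id": str(uuid4())}, "properties": {"cm:title": "Biologie"}})]
nodes = map_elastic_response_to_model(hits, spec, CollectionNode)
print(nodes[0].title)  # Biologie
```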
31 changes: 15 additions & 16 deletions src/app/api/score/models.py
@@ -2,35 +2,34 @@

from pydantic import BaseModel, Field

from app.elastic.fields import Field as ElasticField
from app.elastic.fields import FieldType
from app.elastic.fields import ElasticField, ElasticFieldType
from app.models import ElasticResourceAttribute


class _LearningMaterialAttribute(ElasticField):
TITLE = ("properties.cclom:title", FieldType.TEXT)
SUBJECTS = ("properties.ccm:taxonid", FieldType.TEXT)
SUBJECTS_DE = ("i18n.de_DE.ccm:taxonid", FieldType.TEXT)
WWW_URL = ("properties.ccm:wwwurl", FieldType.TEXT)
DESCRIPTION = ("properties.cclom:general_description", FieldType.TEXT)
LICENSES = ("properties.ccm:commonlicense_key", FieldType.TEXT)
COLLECTION_NODEREF_ID = ("collections.nodeRef.id", FieldType.TEXT)
COLLECTION_PATH = ("collections.path", FieldType.TEXT)
CONTENT_FULLTEXT = ("content.fulltext", FieldType.TEXT)
TITLE = ("properties.cclom:title", ElasticFieldType.TEXT)
SUBJECTS = ("properties.ccm:taxonid", ElasticFieldType.TEXT)
SUBJECTS_DE = ("i18n.de_DE.ccm:taxonid", ElasticFieldType.TEXT)
WWW_URL = ("properties.ccm:wwwurl", ElasticFieldType.TEXT)
DESCRIPTION = ("properties.cclom:general_description", ElasticFieldType.TEXT)
LICENSES = ("properties.ccm:commonlicense_key", ElasticFieldType.TEXT)
COLLECTION_NODEREF_ID = ("collections.nodeRef.id", ElasticFieldType.TEXT)
COLLECTION_PATH = ("collections.path", ElasticFieldType.TEXT)
CONTENT_FULLTEXT = ("content.fulltext", ElasticFieldType.TEXT)
LEARNINGRESOURCE_TYPE = (
"properties.ccm:oeh_lrt_aggregated",
FieldType.TEXT,
ElasticFieldType.TEXT,
)
LEARNINGRESOURCE_TYPE_DE = (
"i18n.de_DE.ccm:oeh_lrt_aggregated",
FieldType.TEXT,
ElasticFieldType.TEXT,
)
EDUENDUSERROLE_DE = (
"i18n.de_DE.ccm:educationalintendedenduserrole",
FieldType.TEXT,
ElasticFieldType.TEXT,
)
CONTAINS_ADS = ("properties.ccm:containsAdvertisement", FieldType.TEXT)
OBJECT_TYPE = ("properties.ccm:objecttype", FieldType.TEXT)
CONTAINS_ADS = ("properties.ccm:containsAdvertisement", ElasticFieldType.TEXT)
OBJECT_TYPE = ("properties.ccm:objecttype", ElasticFieldType.TEXT)


LearningMaterialAttribute = ElasticField(
6 changes: 3 additions & 3 deletions src/app/api/score/score.py
@@ -14,7 +14,7 @@
query_missing_material_license,
)
from app.elastic.search import Search
from app.models import CollectionAttribute
from app.models import CollectionAttribute, ElasticResourceAttribute

material_terms_relevant_for_score = [
"missing_title",
@@ -116,11 +116,11 @@ def field_names_used_for_score_calculation(properties: dict) -> list[str]:
aggs_collection_validation = {
"missing_title": amissing(qfield=CollectionAttribute.TITLE),
"short_title": afilter(Q("range", char_count_title={"gt": 0, "lt": 5})),
"missing_keywords": amissing(qfield=CollectionAttribute.KEYWORDS),
"missing_keywords": amissing(qfield=ElasticResourceAttribute.KEYWORDS),
"few_keywords": afilter(Q("range", token_count_keywords={"gt": 0, "lt": 3})),
"missing_description": amissing(qfield=CollectionAttribute.DESCRIPTION),
"short_description": afilter(
Q("range", char_count_description={"gt": 0, "lt": 30})
),
"missing_edu_context": amissing(qfield=CollectionAttribute.EDU_CONTEXT),
"missing_edu_context": amissing(qfield=ElasticResourceAttribute.EDU_CONTEXT),
}
12 changes: 6 additions & 6 deletions src/app/elastic/dsl.py
@@ -4,16 +4,16 @@
from elasticsearch_dsl.aggs import Agg
from elasticsearch_dsl.query import Query

from .fields import Field
from .fields import ElasticField
from .utils import handle_text_field


def qterm(qfield: Union[Field, str], value, **kwargs) -> Query:
def qterm(qfield: Union[ElasticField, str], value, **kwargs) -> Query:
kwargs[handle_text_field(qfield)] = value
return Q("term", **kwargs)


def qterms(qfield: Union[Field, str], values: list, **kwargs) -> Query:
def qterms(qfield: Union[ElasticField, str], values: list, **kwargs) -> Query:
kwargs[handle_text_field(qfield)] = values
return Q("terms", **kwargs)

@@ -26,8 +26,8 @@ def qbool(**kwargs) -> Query:
return Q("bool", **kwargs)


def qexists(qfield: Union[Field, str], **kwargs) -> Query:
if isinstance(qfield, Field):
def qexists(qfield: Union[ElasticField, str], **kwargs) -> Query:
if isinstance(qfield, ElasticField):
qfield = qfield.path
return Q("exists", field=qfield, **kwargs)

@@ -47,5 +47,5 @@ def afilter(query: Query) -> Agg:
return A("filter", query)


def amissing(qfield: Union[Field, str]) -> Agg:
def amissing(qfield: Union[ElasticField, str]) -> Agg:
return A("missing", field=handle_text_field(qfield))