50 bug wrong response type for glom (#51)
* #50 bug wrong response type for glom

WIP: Reproducing bug

* #50 bug wrong response type for glom

WIP: Reproducing bug

* #50 bug wrong response type for glom

Refactoring glom usage

* #50 bug wrong response type for glom

Refactoring glom usage

* #50 bug wrong response type for glom

Refactoring naming

Added description

* #50 bug wrong response type for glom

Refactoring naming

Forcing glom to return empty strings instead of none for missing description

* #50 bug wrong response type for glom

Refactoring enum unions

* #50 bug wrong response type for glom

Refactoring CollectionAttributes

* #50 bug wrong response type for glom

Refactoring CollectionAttributes

* #50 bug wrong response type for glom

Refactoring and cleanup, removing outdated logic and code

Adding documentation

* #50 bug wrong response type for glom

Cleanup
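
The response-type issue the message describes can be sketched as follows. This is a hypothetical illustration (not part of the commit) of why the glom `Coalesce` defaults matter: with `default=None`, a missing description leaks `None` into a `str`-typed response field, while `default=""` keeps the declared type.

```python
from glom import Coalesce, glom

# Hypothetical Elasticsearch hit without a description property.
hit = {"properties": {"cclom:title": "Physik"}}

# Before: a None default leaks None into a str-typed response field.
buggy = {"description": Coalesce("properties.cclom:general_description", default=None)}
print(glom(hit, buggy))  # {'description': None}

# After: forcing an empty string keeps the response type intact.
fixed = {"description": Coalesce("properties.cclom:general_description", default="")}
print(glom(hit, fixed))  # {'description': ''}
```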
RobertMeissner authored Jun 22, 2022
1 parent 9b413a9 commit 2b33ccc
Showing 13 changed files with 323 additions and 100 deletions.
37 changes: 33 additions & 4 deletions src/app/api/api.py
@@ -16,7 +16,11 @@
CollectionTreeCount,
collection_counts,
)
from app.api.collections.models import CollectionNode
from app.api.collections.missing_attributes import (
collections_with_missing_attributes,
missing_attribute_filter,
)
from app.api.collections.models import CollectionNode, MissingMaterials
from app.api.collections.tree import collection_tree
from app.api.quality_matrix.collections import collection_quality
from app.api.quality_matrix.models import ColumnOutputModel, Forms, Timeline
@@ -57,6 +61,7 @@ def get_database(request: Request) -> Database:
"""

TAG_STATISTICS = "Statistics"
_TAG_COLLECTIONS = "Collections"


def node_ids_for_major_collections(
@@ -164,7 +169,7 @@ async def get_timestamps(
response_model=ScoreOutput,
status_code=HTTP_200_OK,
responses={HTTP_404_NOT_FOUND: {"description": "Collection not found"}},
tags=[TAG_STATISTICS],
tags=[_TAG_COLLECTIONS],
description=f"""Returns the average ratio of non-empty properties for the chosen collection.
For certain properties, e.g. `properties.cclom:title`, the ratio of
elements which miss this entry compared to the total number of entries is calculated.
@@ -222,7 +227,7 @@ async def ping_api():
response_model=list[CollectionNode],
status_code=HTTP_200_OK,
responses={HTTP_404_NOT_FOUND: {"description": "Collection not found"}},
tags=["Collections"],
tags=[_TAG_COLLECTIONS],
)
async def get_collection_tree(
*, node_id: UUID = Depends(node_ids_for_major_collections)
@@ -236,7 +241,7 @@ async def get_collection_tree(
response_model=list[CollectionTreeCount],
status_code=HTTP_200_OK,
responses={HTTP_404_NOT_FOUND: {"description": "Collection not found"}},
tags=["Collections"],
tags=[_TAG_COLLECTIONS],
)
async def get_collection_counts(
*,
@@ -248,3 +253,27 @@
):
counts = await collection_counts(node_id=node_id, facet=facet)
return counts


@router.get(
"/collections/{node_id}/pending-subcollections/{missing_attribute}",
response_model=list[MissingMaterials],
response_model_exclude_unset=True,
status_code=HTTP_200_OK,
responses={HTTP_404_NOT_FOUND: {"description": "Collection not found"}},
tags=[_TAG_COLLECTIONS],
description="""A list of missing entries for different types of materials by subcollection.
Searches for entries with one of the following properties being empty or missing: """
+ f"{', '.join([entry.value for entry in missing_attribute_filter])}.",
)
async def filter_collections_with_missing_attributes(
*,
noderef_id: UUID = Depends(node_ids_for_major_collections),
missing_attribute: str = Path(
...,
examples={
form.name: {"value": form.value} for form in missing_attribute_filter
},
),
):
return await collections_with_missing_attributes(noderef_id, missing_attribute)
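
For context, a hedged usage sketch of the endpoint added above, assuming a locally running instance and made-up identifiers (host, collection id, and attribute path are placeholders):

```python
import httpx

# Hypothetical values: adjust host, collection id, and attribute path to your deployment.
BASE_URL = "http://localhost:8000"
node_id = "4940d5da-9b21-4ec0-8824-d16e0409e629"
missing_attribute = "properties.cclom:general_description"  # must be one of missing_attribute_filter

response = httpx.get(
    f"{BASE_URL}/collections/{node_id}/pending-subcollections/{missing_attribute}",
    timeout=30,
)
response.raise_for_status()
for collection in response.json():
    print(collection["noderef_id"], collection["title"])
```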
87 changes: 87 additions & 0 deletions src/app/api/collections/missing_attributes.py
@@ -0,0 +1,87 @@
from __future__ import annotations

from typing import Optional
from uuid import UUID

from elasticsearch_dsl.query import Q
from glom import Coalesce, Iter

from app.api.collections.models import MissingMaterials
from app.api.collections.utils import map_elastic_response_to_model
from app.core.config import ELASTIC_TOTAL_SIZE
from app.elastic.dsl import qbool, qmatch
from app.elastic.elastic import ResourceType, type_filter
from app.elastic.search import Search
from app.models import CollectionAttribute, ElasticResourceAttribute

missing_attribute_filter = [
CollectionAttribute.TITLE,
ElasticResourceAttribute.NAME,
ElasticResourceAttribute.KEYWORDS,
CollectionAttribute.DESCRIPTION,
]


all_source_fields: list = [
ElasticResourceAttribute.NODEREF_ID,
ElasticResourceAttribute.TYPE,
ElasticResourceAttribute.NAME,
CollectionAttribute.TITLE,
ElasticResourceAttribute.KEYWORDS,
CollectionAttribute.DESCRIPTION,
CollectionAttribute.PATH,
CollectionAttribute.PARENT_ID,
]

missing_attributes_spec = {
"title": Coalesce(CollectionAttribute.TITLE.path, default=""),
"keywords": (
Coalesce(ElasticResourceAttribute.KEYWORDS.path, default=[]),
Iter().all(),
),
"description": Coalesce(CollectionAttribute.DESCRIPTION.path, default=""),
"path": (
Coalesce(CollectionAttribute.PATH.path, default=[]),
Iter().all(),
),
"parent_id": Coalesce(CollectionAttribute.PARENT_ID.path, default=""),
"noderef_id": Coalesce(CollectionAttribute.NODE_ID.path, default=""),
"name": Coalesce(ElasticResourceAttribute.NAME.path, default=""),
"type": Coalesce(ElasticResourceAttribute.TYPE.path, default=""),
"children": Coalesce("", default=[]), # workaround to map easier to pydantic model
}


def missing_attributes_search(
noderef_id: UUID, missing_attribute: str, max_hits: int
) -> Search:
query = {
"filter": [*type_filter[ResourceType.COLLECTION]],
"minimum_should_match": 1,
"should": [
qmatch(**{"path": noderef_id}),
qmatch(**{"nodeRef.id": noderef_id}),
],
"must_not": Q("wildcard", **{missing_attribute: {"value": "*"}}),
}

return (
Search()
.base_filters()
.query(qbool(**query))
.source(includes=[source.path for source in all_source_fields])[:max_hits]
)


async def collections_with_missing_attributes(
noderef_id: UUID,
missing_attribute: str,
max_hits: Optional[int] = ELASTIC_TOTAL_SIZE,
) -> list[MissingMaterials]:
search = missing_attributes_search(noderef_id, missing_attribute, max_hits)

response = search.execute()
if response.success():
return map_elastic_response_to_model(
response, missing_attributes_spec, MissingMaterials
)
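
To see what the spec above produces, here is a small stand-alone sketch with literal attribute paths (hypothetical stand-ins for the real `.path` values). Missing attributes coalesce to empty strings or lists instead of `None`, which is the point of the fix:

```python
from glom import Coalesce, Iter, glom

# Literal paths standing in for CollectionAttribute/ElasticResourceAttribute .path values.
spec = {
    "title": Coalesce("properties.cm:title", default=""),
    "keywords": (Coalesce("properties.cclom:general_keyword", default=[]), Iter().all()),
    "description": Coalesce("properties.cm:description", default=""),
    "children": Coalesce("", default=[]),  # same workaround as above: the path never matches, so always []
}

hit = {"properties": {"cm:title": "Mathematik", "cclom:general_keyword": ["algebra"]}}
print(glom(hit, spec))
# {'title': 'Mathematik', 'keywords': ['algebra'], 'description': '', 'children': []}
```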
18 changes: 18 additions & 0 deletions src/app/api/collections/models.py
@@ -13,3 +13,21 @@ class CollectionNode(BaseModel):
title: Optional[str] # might be none due to data model
children: list[CollectionNode]
parent_id: Optional[UUID]


class MissingMaterials(CollectionNode):
"""
A model containing information about entries which miss, e.g, a description.
By returning this model the editors know enough about the entry to find and correct it
:param
description: a free text description of the context
path: the complete id path, i.e., from parent node id up to the root id of elastic search
type: Indicates the type of content, must be ccm:map in the current implementation
"""

keywords: list[str]
description: str
path: list[str]
type: str
name: str
53 changes: 21 additions & 32 deletions src/app/api/collections/tree.py
@@ -2,14 +2,15 @@

from aiohttp import ClientSession
from elasticsearch_dsl.response import Response
from glom import Coalesce, Iter, glom
from glom import Coalesce, Iter

from app.api.collections.models import CollectionNode
from app.api.collections.utils import map_elastic_response_to_model
from app.api.collections.vocabs import tree_from_vocabs
from app.core.config import ELASTIC_TOTAL_SIZE
from app.elastic.dsl import qbool, qterm
from app.elastic.search import Search
from app.models import CollectionAttribute
from app.models import CollectionAttribute, ElasticResourceAttribute


def build_portal_tree(collections: list, root_noderef_id: UUID) -> list[CollectionNode]:
@@ -47,42 +48,30 @@ def tree_search(node_id: UUID) -> Search:
return s


def hits_to_collection(hits: Response) -> list[CollectionNode]:
collections = []
for hit in hits:
entry = hit.to_dict()
spec = {
"title": Coalesce(CollectionAttribute.TITLE.path, default=None),
"keywords": (
Coalesce(CollectionAttribute.KEYWORDS.path, default=[]),
Iter().all(),
),
"description": Coalesce(CollectionAttribute.DESCRIPTION.path, default=None),
"path": (
Coalesce(CollectionAttribute.PATH.path, default=[]),
Iter().all(),
),
"parent_id": Coalesce(CollectionAttribute.PARENT_ID.path, default=None),
"node_id": Coalesce(CollectionAttribute.NODE_ID.path, default=None),
}
parsed_entry = glom(entry, spec)
if parsed_entry["title"] is not None:
collections.append(
CollectionNode(
noderef_id=parsed_entry["node_id"],
title=parsed_entry["title"],
children=[],
parent_id=parsed_entry["parent_id"],
)
)
return collections
collection_spec = {
"title": Coalesce(CollectionAttribute.TITLE.path, default=None),
"keywords": (
Coalesce(ElasticResourceAttribute.KEYWORDS.path, default=[]),
Iter().all(),
),
"description": Coalesce(CollectionAttribute.DESCRIPTION.path, default=None),
"path": (
Coalesce(CollectionAttribute.PATH.path, default=[]),
Iter().all(),
),
"parent_id": Coalesce(CollectionAttribute.PARENT_ID.path, default=None),
"noderef_id": Coalesce(CollectionAttribute.NODE_ID.path, default=None),
"children": Coalesce("", default=[]),
}


def tree_from_elastic(node_id: UUID) -> list[CollectionNode]:
response: Response = tree_search(node_id).execute()

if response.success():
collection = hits_to_collection(response)
collection = map_elastic_response_to_model(
response, collection_spec, CollectionNode
)
return build_portal_tree(collection, node_id)


14 changes: 14 additions & 0 deletions src/app/api/collections/utils.py
@@ -0,0 +1,14 @@
from typing import Generic, TypeVar

from elasticsearch_dsl.response import Response
from glom import glom

from app.api.collections.models import CollectionNode, MissingMaterials

T = TypeVar("T", CollectionNode, MissingMaterials)


def map_elastic_response_to_model(
response: Response, specs: dict, model: Generic[T]
) -> list[T]:
return [model(**glom(hit.to_dict(), specs)) for hit in response]
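
A minimal illustration of the new helper, using a stand-in hit object and a hypothetical spec so the mapping is easy to follow (the real specs live in tree.py and missing_attributes.py):

```python
from uuid import uuid4

from glom import Coalesce

from app.api.collections.models import CollectionNode
from app.api.collections.utils import map_elastic_response_to_model


class FakeHit:
    """Stand-in exposing the minimal to_dict() interface the helper relies on."""

    def __init__(self, data: dict):
        self._data = data

    def to_dict(self) -> dict:
        return self._data


# Hypothetical spec with literal paths.
spec = {
    "noderef_id": Coalesce("nodeRef.id", default=None),
    "title": Coalesce("properties.cm:title", default=None),
    "parent_id": Coalesce("parentRef.id", default=None),
    "children": Coalesce("", default=[]),
}

hits = [FakeHit({"nodeRef": {"id": str(uuid4())}, "properties": {"cm:title": "Biologie"}})]
nodes = map_elastic_response_to_model(hits, spec, CollectionNode)
print(nodes[0].title)  # Biologie
```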
31 changes: 15 additions & 16 deletions src/app/api/score/models.py
@@ -2,35 +2,34 @@

from pydantic import BaseModel, Field

from app.elastic.fields import Field as ElasticField
from app.elastic.fields import FieldType
from app.elastic.fields import ElasticField, ElasticFieldType
from app.models import ElasticResourceAttribute


class _LearningMaterialAttribute(ElasticField):
TITLE = ("properties.cclom:title", FieldType.TEXT)
SUBJECTS = ("properties.ccm:taxonid", FieldType.TEXT)
SUBJECTS_DE = ("i18n.de_DE.ccm:taxonid", FieldType.TEXT)
WWW_URL = ("properties.ccm:wwwurl", FieldType.TEXT)
DESCRIPTION = ("properties.cclom:general_description", FieldType.TEXT)
LICENSES = ("properties.ccm:commonlicense_key", FieldType.TEXT)
COLLECTION_NODEREF_ID = ("collections.nodeRef.id", FieldType.TEXT)
COLLECTION_PATH = ("collections.path", FieldType.TEXT)
CONTENT_FULLTEXT = ("content.fulltext", FieldType.TEXT)
TITLE = ("properties.cclom:title", ElasticFieldType.TEXT)
SUBJECTS = ("properties.ccm:taxonid", ElasticFieldType.TEXT)
SUBJECTS_DE = ("i18n.de_DE.ccm:taxonid", ElasticFieldType.TEXT)
WWW_URL = ("properties.ccm:wwwurl", ElasticFieldType.TEXT)
DESCRIPTION = ("properties.cclom:general_description", ElasticFieldType.TEXT)
LICENSES = ("properties.ccm:commonlicense_key", ElasticFieldType.TEXT)
COLLECTION_NODEREF_ID = ("collections.nodeRef.id", ElasticFieldType.TEXT)
COLLECTION_PATH = ("collections.path", ElasticFieldType.TEXT)
CONTENT_FULLTEXT = ("content.fulltext", ElasticFieldType.TEXT)
LEARNINGRESOURCE_TYPE = (
"properties.ccm:oeh_lrt_aggregated",
FieldType.TEXT,
ElasticFieldType.TEXT,
)
LEARNINGRESOURCE_TYPE_DE = (
"i18n.de_DE.ccm:oeh_lrt_aggregated",
FieldType.TEXT,
ElasticFieldType.TEXT,
)
EDUENDUSERROLE_DE = (
"i18n.de_DE.ccm:educationalintendedenduserrole",
FieldType.TEXT,
ElasticFieldType.TEXT,
)
CONTAINS_ADS = ("properties.ccm:containsAdvertisement", FieldType.TEXT)
OBJECT_TYPE = ("properties.ccm:objecttype", FieldType.TEXT)
CONTAINS_ADS = ("properties.ccm:containsAdvertisement", ElasticFieldType.TEXT)
OBJECT_TYPE = ("properties.ccm:objecttype", ElasticFieldType.TEXT)


LearningMaterialAttribute = ElasticField(
6 changes: 3 additions & 3 deletions src/app/api/score/score.py
@@ -14,7 +14,7 @@
query_missing_material_license,
)
from app.elastic.search import Search
from app.models import CollectionAttribute
from app.models import CollectionAttribute, ElasticResourceAttribute

material_terms_relevant_for_score = [
"missing_title",
@@ -116,11 +116,11 @@ def field_names_used_for_score_calculation(properties: dict) -> list[str]:
aggs_collection_validation = {
"missing_title": amissing(qfield=CollectionAttribute.TITLE),
"short_title": afilter(Q("range", char_count_title={"gt": 0, "lt": 5})),
"missing_keywords": amissing(qfield=CollectionAttribute.KEYWORDS),
"missing_keywords": amissing(qfield=ElasticResourceAttribute.KEYWORDS),
"few_keywords": afilter(Q("range", token_count_keywords={"gt": 0, "lt": 3})),
"missing_description": amissing(qfield=CollectionAttribute.DESCRIPTION),
"short_description": afilter(
Q("range", char_count_description={"gt": 0, "lt": 30})
),
"missing_edu_context": amissing(qfield=CollectionAttribute.EDU_CONTEXT),
"missing_edu_context": amissing(qfield=ElasticResourceAttribute.EDU_CONTEXT),
}
12 changes: 6 additions & 6 deletions src/app/elastic/dsl.py
@@ -4,16 +4,16 @@
from elasticsearch_dsl.aggs import Agg
from elasticsearch_dsl.query import Query

from .fields import Field
from .fields import ElasticField
from .utils import handle_text_field


def qterm(qfield: Union[Field, str], value, **kwargs) -> Query:
def qterm(qfield: Union[ElasticField, str], value, **kwargs) -> Query:
kwargs[handle_text_field(qfield)] = value
return Q("term", **kwargs)


def qterms(qfield: Union[Field, str], values: list, **kwargs) -> Query:
def qterms(qfield: Union[ElasticField, str], values: list, **kwargs) -> Query:
kwargs[handle_text_field(qfield)] = values
return Q("terms", **kwargs)

@@ -26,8 +26,8 @@ def qbool(**kwargs) -> Query:
return Q("bool", **kwargs)


def qexists(qfield: Union[Field, str], **kwargs) -> Query:
if isinstance(qfield, Field):
def qexists(qfield: Union[ElasticField, str], **kwargs) -> Query:
if isinstance(qfield, ElasticField):
qfield = qfield.path
return Q("exists", field=qfield, **kwargs)

@@ -47,5 +47,5 @@ def afilter(query: Query) -> Agg:
return A("filter", query)


def amissing(qfield: Union[Field, str]) -> Agg:
def amissing(qfield: Union[ElasticField, str]) -> Agg:
return A("missing", field=handle_text_field(qfield))