diff --git a/README.md b/README.md
index 9affe32..752d0d9 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,7 @@ Launch `index.html` in `build` directory
 ## For confluence
 
 WIP: Currently not possible to automatically push to confluence.
+Add a token from JIRA.
 
 ```bash
 ./build_confluence.sh
diff --git a/src/app/api/analytics/__init__.py b/src/app/api/analytics/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/app/api/analytics/analytics.py b/src/app/api/analytics/analytics.py
new file mode 100644
index 0000000..2691083
--- /dev/null
+++ b/src/app/api/analytics/analytics.py
@@ -0,0 +1,111 @@
+import uuid
+from datetime import datetime
+from enum import Enum
+from typing import ClassVar, Generic, Optional, TypeVar
+
+from pydantic import BaseModel, Extra, Field, validator
+from pydantic.generics import GenericModel
+from starlette.exceptions import HTTPException
+from starlette.status import HTTP_404_NOT_FOUND
+
+
+class StatType(str, Enum):
+    # PORTAL_TREE = "portal-tree"  # Currently unused
+    SEARCH = "search"
+    MATERIAL_TYPES = "material-types"
+    VALIDATION_COLLECTIONS = "validation-collections"
+    VALIDATION_MATERIALS = "validation-materials"  # Currently unused
+
+
+class StatsNotFoundException(HTTPException):
+    def __init__(self):
+        super().__init__(
+            status_code=HTTP_404_NOT_FOUND,
+            detail="Stats not found",
+        )
+
+
+class ElasticConfig:
+    allow_population_by_field_name = True
+    extra = Extra.allow
+
+
+class ElasticModel(BaseModel):
+    class Config(ElasticConfig):
+        pass
+
+
+class ResponseConfig:
+    allow_population_by_field_name = True
+    extra = Extra.ignore
+
+
+class ResponseModel(BaseModel):
+    class Config(ResponseConfig):
+        pass
+
+
+COUNT_STATISTICS_TYPE = dict[str, int]
+
+
+class StatsResponse(ResponseModel):
+    derived_at: datetime
+    stats: dict[str, dict[str, COUNT_STATISTICS_TYPE]]
+
+
+ValidationStatsT = TypeVar("ValidationStatsT")
+
+
+class ValidationStatsResponse(GenericModel, Generic[ValidationStatsT]):
+    noderef_id: uuid.UUID
+    derived_at: datetime = Field(default_factory=datetime.now)
+    validation_stats: ValidationStatsT
+
+
+ElasticFieldValidationT = TypeVar("ElasticFieldValidationT")
+
+
+class ElasticValidationStats(GenericModel, Generic[ElasticFieldValidationT]):
+    title: Optional[ElasticFieldValidationT]
+    keywords: Optional[ElasticFieldValidationT]
+    description: Optional[ElasticFieldValidationT]
+    edu_context: Optional[ElasticFieldValidationT]
+
+
+class OehValidationError(str, Enum):
+    MISSING = "missing"
+    TOO_SHORT = "too_short"
+    TOO_FEW = "too_few"
+    LACKS_CLARITY = "lacks_clarity"
+    INVALID_SPELLING = "invalid_spelling"
+
+    _lut: ClassVar[dict]
+
+
+class CollectionValidationStats(ElasticValidationStats[list[OehValidationError]]):
+    pass
+
+
+def none_to_empty_list(v: Optional[list]) -> list:
+    if v is None:
+        return []
+    return v
+
+
+class MaterialFieldValidation(BaseModel):
+    missing: Optional[list[uuid.UUID]]
+    too_short: Optional[list[uuid.UUID]]
+    too_few: Optional[list[uuid.UUID]]
+    lacks_clarity: Optional[list[uuid.UUID]]
+    invalid_spelling: Optional[list[uuid.UUID]]
+
+    # validators
+    _none_to_empty_list = validator("*", pre=True, allow_reuse=True)(none_to_empty_list)
+
+
+class MaterialValidationStats(ElasticValidationStats[MaterialFieldValidation]):
+    subjects: Optional[MaterialFieldValidation]
+    license: Optional[MaterialFieldValidation]
+    ads_qualifier: Optional[MaterialFieldValidation]
+    material_type: Optional[MaterialFieldValidation]
+    object_type: Optional[MaterialFieldValidation]
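
The models above lean on pydantic v1 generics (`pydantic.generics.GenericModel`). A minimal sketch, not part of the patch, of how the parametrized response type is instantiated; the values are made up:

```python
# Illustrative only: instantiating the generic response model, assuming pydantic v1.
import uuid

from app.api.analytics.analytics import (
    CollectionValidationStats,
    OehValidationError,
    ValidationStatsResponse,
)

stats = CollectionValidationStats(keywords=[OehValidationError.MISSING])
response = ValidationStatsResponse[CollectionValidationStats](
    noderef_id=uuid.uuid4(),
    validation_stats=stats,
)
# derived_at is filled by its default factory; optional fields left unset stay None.
print(response.json(exclude_unset=True))
```
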
diff --git a/src/app/api/analytics/background_task.py b/src/app/api/analytics/background_task.py
new file mode 100644
index 0000000..90c33f8
--- /dev/null
+++ b/src/app/api/analytics/background_task.py
@@ -0,0 +1,116 @@
+import asyncio
+import os
+from datetime import datetime
+
+from fastapi import APIRouter
+from fastapi_utils.tasks import repeat_every
+from starlette.background import BackgroundTasks
+from starlette.status import HTTP_202_ACCEPTED
+
+import app.api.analytics.storage
+from app.api.analytics.models import Collection
+from app.api.analytics.stats import get_ids_to_iterate, search_hits_by_material_type
+from app.api.analytics.storage import (
+    _COLLECTION_COUNT,
+    _COLLECTIONS,
+    _MATERIALS,
+    _SEARCH,
+)
+from app.api.collections.counts import AggregationMappings, collection_counts
+from app.api.score.models import required_collection_properties
+from app.core.config import BACKGROUND_TASK_TIME_INTERVAL
+from app.core.constants import COLLECTION_ROOT_ID
+from app.core.logging import logger
+from app.elastic.elastic import query_collections, query_materials
+from app.elastic.search import Search
+
+background_router = APIRouter(tags=["Background"])
+
+
+@background_router.post("/run-analytics", status_code=HTTP_202_ACCEPTED)
+async def run_analytics(*, background_tasks: BackgroundTasks):
+    background_tasks.add_task(run)
+
+
+@repeat_every(seconds=BACKGROUND_TASK_TIME_INTERVAL, logger=logger)
+def background_task():
+    run()
+
+
+def import_collections(derived_at: datetime):
+    s = (
+        Search()
+        .query(query_collections(node_id=COLLECTION_ROOT_ID))
+        .source(
+            includes=["nodeRef.*", "path", *list(required_collection_properties.keys())]
+        )
+    )
+
+    seen = set()
+    collections = []
+    for hit in s.scan():
+        if hit.nodeRef["id"] in seen:
+            continue
+
+        seen.add(hit.nodeRef["id"])
+        collections.append(
+            Collection(
+                id=str(hit.nodeRef["id"]),
+                doc=hit.to_dict(),
+                derived_at=derived_at,
+            )
+        )
+    app.api.analytics.storage.global_storage[_COLLECTIONS] = collections
+
+
+def import_materials(derived_at: datetime):
+    s = (
+        Search()
+        .query(query_materials(node_id=COLLECTION_ROOT_ID))
+        .source(
+            includes=[
+                "nodeRef.*",
+                "collections.nodeRef.id",
+                *list(required_collection_properties.keys()),
+            ]
+        )
+    )
+
+    seen = set()
+    materials = []
+    for hit in s.scan():
+        node_id = hit.nodeRef["id"]
+        if node_id not in seen:
+            seen.add(node_id)
+            materials.append(
+                Collection(
+                    id=str(node_id),
+                    doc=hit.to_dict(),
+                    derived_at=derived_at,
+                )
+            )
+    app.api.analytics.storage.global_storage[_MATERIALS] = materials
+
+
+def run():
+    derived_at = datetime.now()
+    logger.info(f"{os.getpid()}: Starting analytics import at: {derived_at}")
+
+    import_collections(derived_at=derived_at)
+
+    import_materials(derived_at=derived_at)
+
+    logger.info("Collections and materials imported.")
+
+    app.api.analytics.storage.global_storage[_COLLECTION_COUNT] = asyncio.run(
+        collection_counts(COLLECTION_ROOT_ID, AggregationMappings.lrt)
+    )
+
+    all_collections = asyncio.run(get_ids_to_iterate(node_id=COLLECTION_ROOT_ID))
+    logger.info(f"Tree ready to iterate. Length: {len(all_collections)}")
+
+    # TODO: Refactor, this is very expensive
+    app.api.analytics.storage.global_storage[_SEARCH] = {
+        row.id: search_hits_by_material_type(row.title) for row in all_collections
+    }
+    logger.info("Background task done.")
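
The periodic refresh above relies on `repeat_every` from the fastapi-utils package: the decorated function is registered as a startup handler (see `main.py` later in this patch) and then re-runs on the configured interval. A self-contained sketch of that wiring, with placeholder names:

```python
# Minimal sketch of the repeat_every pattern, assuming the fastapi-utils package.
# `refresh` and `state` are stand-ins for run() and global_storage.
from fastapi import FastAPI
from fastapi_utils.tasks import repeat_every

app = FastAPI()
state: dict = {}


def refresh() -> None:
    state["derived_at"] = "now"  # placeholder for the real import work


# repeat_every wraps the function so that, once invoked at startup,
# it runs immediately and then again every `seconds`.
app.add_event_handler("startup", repeat_every(seconds=600)(refresh))
```
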
Length: ", len(all_collections)) + + # TODO Refactor, this is very expensive + app.api.analytics.storage.global_storage[_SEARCH] = { + row.id: search_hits_by_material_type(row.title) for row in all_collections + } + print("Background task done") diff --git a/src/app/api/analytics/models.py b/src/app/api/analytics/models.py new file mode 100644 index 0000000..a28991f --- /dev/null +++ b/src/app/api/analytics/models.py @@ -0,0 +1,10 @@ +from datetime import datetime + +from pydantic import BaseModel + + +# TODO: Rename, as used for materials in background_task, as well +class Collection(BaseModel): + id: str + doc: dict + derived_at: datetime diff --git a/src/app/api/analytics/stats.py b/src/app/api/analytics/stats.py new file mode 100644 index 0000000..bff8047 --- /dev/null +++ b/src/app/api/analytics/stats.py @@ -0,0 +1,306 @@ +import datetime +import uuid +from dataclasses import dataclass +from typing import Union + +from elasticsearch_dsl.aggs import Agg +from elasticsearch_dsl.query import Q, Query +from elasticsearch_dsl.response import AggResponse, Response +from glom import merge + +from app.api.analytics.analytics import ( + COUNT_STATISTICS_TYPE, + CollectionValidationStats, + MaterialValidationStats, + StatsNotFoundException, + StatsResponse, + StatType, + ValidationStatsResponse, +) +from app.api.analytics.models import Collection +from app.api.analytics.storage import ( + _COLLECTION_COUNT, + _COLLECTIONS, + _MATERIALS, + global_storage, +) +from app.api.collections.descendants import aterms +from app.api.collections.models import CollectionNode +from app.api.collections.tree import collection_tree +from app.api.score.models import ( + LearningMaterialAttribute, + required_collection_properties, +) +from app.core.config import ELASTIC_TOTAL_SIZE +from app.elastic.elastic import query_materials +from app.elastic.fields import ElasticField +from app.elastic.search import Search + + +def qsimplequerystring( + query: str, qfields: list[Union[ElasticField, str]], **kwargs +) -> Query: + kwargs["query"] = query + kwargs["fields"] = [ + (qfield.path if isinstance(qfield, ElasticField) else qfield) + for qfield in qfields + ] + return Q("simple_query_string", **kwargs) + + +def search_materials(query_str: str) -> Query: + return qsimplequerystring( + query=query_str, + qfields=[ + LearningMaterialAttribute.TITLE, + LearningMaterialAttribute.KEYWORDS, + LearningMaterialAttribute.DESCRIPTION, + LearningMaterialAttribute.CONTENT_FULLTEXT, + LearningMaterialAttribute.SUBJECTS_DE, + LearningMaterialAttribute.LEARNINGRESOURCE_TYPE_DE, + LearningMaterialAttribute.EDU_CONTEXT_DE, + LearningMaterialAttribute.EDUENDUSERROLE_DE, + ], + default_operator="and", + ) + + +def agg_material_types(size: int = ELASTIC_TOTAL_SIZE) -> Agg: + # TODO: This is the key property we are aggregating for + return aterms( + qfield=LearningMaterialAttribute.LEARNINGRESOURCE_TYPE, + missing="N/A", + size=size, + ) + + +def merge_agg_response( + agg: AggResponse, key: str = "key", result_field: str = "doc_count" +) -> dict: + def op(carry: dict, bucket: dict): + carry[bucket[key]] = bucket[result_field] + + return merge(agg.buckets, op=op) + + +def search_hits_by_material_type(collection_title: str) -> dict: + """Title used here to shotgun search for any matches with the title of the material""" + s = build_material_search(collection_title) + response: Response = s[:0].execute() + + if response.success(): + # TODO: Clear and cleanu p: what does this do? 
+def search_hits_by_material_type(collection_title: str) -> dict:
+    """The collection title is used here to shotgun-search for any materials matching it."""
+    s = build_material_search(collection_title)
+    response: Response = s[:0].execute()
+
+    if response.success():
+        # TODO: Clean up: what does this do?
+        stats = merge_agg_response(response.aggregations.material_types)
+        stats["total"] = sum(stats.values())
+        return stats
+
+
+def build_material_search(query_string: str):
+    s = (
+        Search()
+        .base_filters()
+        .query(query_materials())
+        .query(search_materials(query_string))
+    )
+    s.aggs.bucket("material_types", agg_material_types())
+    return s
+
+
+@dataclass
+class Row:
+    id: uuid.UUID
+    title: str
+
+
+async def get_ids_to_iterate(node_id: uuid.UUID):
+    """
+    Returns the collection ids to iterate over.
+
+    Hardcoded for now and includes multiple inefficient data transformations,
+    e.g., from list to tree and back to list.
+    :return:
+    """
+
+    def flatten_list(list_of_lists):
+        flat_list = []
+        for item in list_of_lists:
+            if isinstance(item, list):
+                flat_list += flatten_list(item)
+            else:
+                flat_list.append(item)
+
+        return flat_list
+
+    def nodes(data: list[CollectionNode]) -> list:
+        return [
+            nodes(collection.children)
+            if collection.children
+            else (collection.noderef_id, collection.title)
+            for collection in data
+        ]
+
+    tree = await collection_tree(node_id)
+    return [Row(id=row[0], title=row[1]) for row in flatten_list(nodes(tree))]
+
+
+def query_material_types(node_id: uuid.UUID) -> dict[str, COUNT_STATISTICS_TYPE]:
+    """
+    Get collections whose parent id equals node_id (portal_id == node_id).
+    """
+    collections = global_storage[_COLLECTIONS]
+    collections = filtered_collections(collections, node_id)
+
+    # collection id -> learning_resource_type -> counts:
+    # join the filtered collections and the filtered counts into one dictionary
+    stats = {}
+
+    counts = global_storage[_COLLECTION_COUNT]
+
+    # TODO: Refactor with filter and dict comprehension
+    for collection in collections:
+        for count in counts:
+            if str(collection.id) == str(count.noderef_id):
+                stats.update(
+                    {str(collection.id): {"total": count.total, **count.counts}}
+                )
+    return stats
+
+
+def filtered_collections(collections: list[Collection], node_id: uuid.UUID):
+    return [
+        collection
+        for collection in collections
+        if str(node_id) in collection.doc["path"]
+    ]
+
+
+async def stats_latest(
+    stat_type: StatType, node_id: uuid.UUID
+) -> dict[str, COUNT_STATISTICS_TYPE]:
+    results = {}
+
+    if stat_type is StatType.SEARCH:
+        all_collection_nodes = await get_ids_to_iterate(node_id)
+        for row in all_collection_nodes:
+            stats = search_hits_by_material_type(row.title)
+            results.update({str(row.id): stats})
+    elif stat_type is StatType.MATERIAL_TYPES:
+        results = query_material_types(node_id)
+    return results
+
+
+async def overall_stats(node_id) -> StatsResponse:
+    search_stats = await stats_latest(stat_type=StatType.SEARCH, node_id=node_id)
+
+    if not search_stats:
+        raise StatsNotFoundException
+
+    material_types_stats = await stats_latest(
+        stat_type=StatType.MATERIAL_TYPES, node_id=node_id
+    )
+
+    if not material_types_stats:
+        raise StatsNotFoundException
+
+    stats_output = {key: {"search": value} for key, value in search_stats.items()}
+
+    for key, value in material_types_stats.items():
+        if key in stats_output.keys():
+            stats_output[key].update({"material_types": value})
+        else:
+            stats_output.update({key: {"material_types": value}})
+
+    output = StatsResponse(derived_at=datetime.datetime.now(), stats=stats_output)
+    return output
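
In `get_ids_to_iterate`, `nodes()` turns the tree into nested lists of `(id, title)` tuples, which `flatten_list()` then collapses into one flat list. A toy run of that combination, with made-up values:

```python
# Toy run of the nodes()/flatten_list() combination above; the data is made up.
nested = [
    ("id-1", "Mathematik"),
    [("id-2", "Algebra"), [("id-3", "Lineare Gleichungen")]],
]


def flatten_list(list_of_lists):
    flat_list = []
    for item in list_of_lists:
        if isinstance(item, list):
            flat_list += flatten_list(item)
        else:
            flat_list.append(item)
    return flat_list


assert flatten_list(nested) == [
    ("id-1", "Mathematik"),
    ("id-2", "Algebra"),
    ("id-3", "Lineare Gleichungen"),
]
```
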
+def collections_with_missing_properties(
+    node_id: uuid.UUID,
+) -> list[ValidationStatsResponse[CollectionValidationStats]]:
+    """
+    Check whether any of the following are missing:
+    title, description, keywords, license, taxon_id, edu_context,
+    learning_resource_type, ads_qualifier, object_type
+    """
+
+    collections = global_storage[_COLLECTIONS]
+    collections = filtered_collections(collections, node_id)
+
+    missing_properties = {}
+    for collection in collections:
+        missing_properties.update({collection.id: {}})
+        for entry in required_collection_properties.keys():
+            value = {required_collection_properties[entry]: ["missing"]}
+            if (
+                "properties" not in collection.doc.keys()
+                or entry.split(".")[-1] not in collection.doc["properties"].keys()
+            ):
+                missing_properties[collection.id].update(value)
+
+    if not missing_properties:
+        raise StatsNotFoundException
+
+    return [
+        ValidationStatsResponse[CollectionValidationStats](
+            noderef_id=uuid.UUID(key),
+            validation_stats=CollectionValidationStats(**value),
+        )
+        for key, value in missing_properties.items()
+    ]
+
+
+def materials_with_missing_properties(
+    node_id,
+) -> list[ValidationStatsResponse[MaterialValidationStats]]:
+    """
+    Returns the number of materials missing certain properties for this collection node_id and its sub-collections.
+
+    Similar to collections_with_missing_properties, but counts the underlying materials missing that property.
+
+    :param node_id:
+    :return:
+    """
+
+    collections: list[Collection] = global_storage[_COLLECTIONS]
+    collections = filtered_collections(collections, node_id)
+
+    materials: list[Collection] = global_storage[_MATERIALS]
+    # find materials belonging to each collection,
+    # check whether they are missing the required properties,
+    # and, if so, add them as a list to the validation stats
+    missing_properties = {}
+    for collection in collections:
+        missing_properties.update({collection.id: {}})
+        for material in materials:
+            if material.doc["collections"][0]["nodeRef"]["id"] == collection.id:
+                # check if the property is present;
+                # if not, add the respective material id to the "missing" field of this property
+                for entry in required_collection_properties.keys():
+                    if (
+                        "properties" not in material.doc.keys()
+                        or entry.split(".")[-1] not in material.doc["properties"].keys()
+                    ):
+                        entry_key = required_collection_properties[entry]
+                        if entry_key not in missing_properties[collection.id].keys():
+                            missing_properties[collection.id].update(
+                                {entry_key: {"missing": []}}
+                            )
+
+                        missing_properties[collection.id][entry_key]["missing"].append(
+                            material.id
+                        )
+
+    if not missing_properties:
+        raise StatsNotFoundException
+
+    return [
+        ValidationStatsResponse[MaterialValidationStats](
+            noderef_id=uuid.UUID(key),
+            validation_stats=MaterialValidationStats(**value),
+        )
+        for key, value in missing_properties.items()
+    ]
diff --git a/src/app/api/analytics/storage.py b/src/app/api/analytics/storage.py
new file mode 100644
index 0000000..151f54c
--- /dev/null
+++ b/src/app/api/analytics/storage.py
@@ -0,0 +1,14 @@
+_COLLECTIONS = "collections"
+_MATERIALS = "materials"
+_SEARCH = "search"
+_COLLECTION_COUNT = "counts"
+
+# A quick fix for a global storage
+global_storage = {
+    _COLLECTIONS: [],
+    _MATERIALS: [],
+    _SEARCH: {},
+    _COLLECTION_COUNT: {},
+}  # TODO: Refactor me ASAP
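
A note on why `background_task.py` writes through the full module path (`app.api.analytics.storage.global_storage[...]`) while `stats.py` imports the name directly: both references point at the same dict object, so in-place mutation is visible to every importer. A self-contained illustration:

```python
# Both access styles see the same dict object, so in-place writes propagate.
storage_module_dict = {"collections": []}  # stand-in for storage.global_storage

imported_reference = storage_module_dict  # what `from storage import global_storage` yields

storage_module_dict["collections"] = ["a", "b"]  # the writer mutates the dict in place
assert imported_reference["collections"] == ["a", "b"]  # the reader sees the update
```
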
diff --git a/src/app/api/api.py b/src/app/api/api.py
index 489b884..d5d09a7 100644
--- a/src/app/api/api.py
+++ b/src/app/api/api.py
@@ -1,7 +1,6 @@
 import json
 import uuid
-from typing import Mapping
-from uuid import UUID
+from typing import Mapping, Optional
 
 from databases import Database
 from fastapi import APIRouter, Depends, HTTPException, Path, Query
@@ -11,15 +10,39 @@
 from starlette.requests import Request
 from starlette.status import HTTP_200_OK, HTTP_400_BAD_REQUEST, HTTP_404_NOT_FOUND
 
+from app.api.analytics.analytics import (
+    CollectionValidationStats,
+    MaterialValidationStats,
+    StatsResponse,
+    ValidationStatsResponse,
+)
+from app.api.analytics.background_task import background_router
+from app.api.analytics.stats import (
+    collections_with_missing_properties,
+    materials_with_missing_properties,
+    overall_stats,
+)
+from app.api.analytics.storage import global_storage
 from app.api.collections.counts import (
     AggregationMappings,
     CollectionTreeCount,
     collection_counts,
 )
+from app.api.collections.descendants import (
+    CollectionMaterialsCount,
+    get_material_count_tree,
+)
 from app.api.collections.missing_attributes import (
     collections_with_missing_attributes,
     missing_attribute_filter,
 )
+from app.api.collections.missing_materials import (
+    LearningMaterial,
+    MissingAttributeFilter,
+    get_materials_with_missing_attributes,
+    material_response_fields,
+    materials_filter_params,
+)
 from app.api.collections.models import CollectionNode, MissingMaterials
 from app.api.collections.tree import collection_tree
 from app.api.quality_matrix.collections import collection_quality
@@ -27,7 +50,7 @@
 from app.api.quality_matrix.quality_matrix import source_quality, store_in_timeline
 from app.api.quality_matrix.timeline import timestamps
 from app.api.quality_matrix.utils import transpose
-from app.api.score.models import ScoreOutput
+from app.api.score.models import LearningMaterialAttribute, ScoreOutput
 from app.api.score.score import (
     aggs_collection_validation,
     aggs_material_validation,
@@ -37,6 +60,7 @@
     field_names_used_for_score_calculation,
     search_score,
 )
+from app.core.config import API_DEBUG, BACKGROUND_TASK_TIME_INTERVAL
 from app.core.constants import COLLECTION_NAME_TO_ID, COLLECTION_ROOT_ID
 from app.elastic.elastic import ResourceType
@@ -46,6 +70,7 @@ def get_database(request: Request) -> Database:
 
 
 router = APIRouter()
+router.include_router(background_router)
 
 QUALITY_MATRIX_DESCRIPTION = """Calculation of the quality matrix.
 Depending on the chosen form the quality matrix returns the ratio of entries which miss this property compared to
@@ -60,20 +85,20 @@
 The user chooses the node id in the editorial environment (German: Redaktionsumgebung) in the "Fach" selection.
 """
 
-TAG_STATISTICS = "Statistics"
+_TAG_STATISTICS = "Statistics"
 _TAG_COLLECTIONS = "Collections"
 
 
 def node_ids_for_major_collections(
     *,
-    node_id: UUID = Path(
+    node_id: uuid.UUID = Path(
         ...,
         examples={
             "Alle Fachportale": {"value": COLLECTION_ROOT_ID},
             **COLLECTION_NAME_TO_ID,
         },
     ),
-) -> UUID:
+) -> uuid.UUID:
     return node_id
 
 
@@ -82,7 +107,7 @@ def node_ids_for_major_collections(
     status_code=HTTP_200_OK,
     response_model=list[ColumnOutputModel],
     responses={HTTP_404_NOT_FOUND: {"description": "Quality matrix not determinable"}},
-    tags=[TAG_STATISTICS],
+    tags=[_TAG_STATISTICS],
     description=QUALITY_MATRIX_DESCRIPTION,
 )
 async def get_quality(
@@ -121,8 +146,9 @@ async def get_quality(
     status_code=HTTP_200_OK,
     response_model=list[ColumnOutputModel],
     responses={HTTP_404_NOT_FOUND: {"description": "Quality matrix not determinable"}},
-    tags=[TAG_STATISTICS],
-    description="""A unix timestamp in integer seconds since epoch yields the quality matrix at the respective date.""",
+    tags=[_TAG_STATISTICS],
+    description="""A unix timestamp in integer seconds since epoch yields the
+    quality matrix at the respective date.""",
 )
 async def get_past_quality_matrix(
     *, timestamp: int, database: Database = Depends(get_database)
@@ -150,7 +176,7 @@ async def get_past_quality_matrix(
             "description": "Timestamps of old quality matrix results not determinable"
         }
     },
-    tags=[TAG_STATISTICS],
+    tags=[_TAG_STATISTICS],
     description="""Return timestamps in seconds since epoch of past calculations of the quality matrix.
     Additional parameters:
         form: The desired form of quality. This is used to query only the relevant type of data.""",
 )
 async def get_timestamps(
@@ -167,7 +193,7 @@ async def get_timestamps(
 
 
 @router.get(
-    "/collections/{collection_id}/stats/score",
+    "/collections/{node_id}/stats/score",
     response_model=ScoreOutput,
     status_code=HTTP_200_OK,
     responses={HTTP_404_NOT_FOUND: {"description": "Collection not found"}},
@@ -182,16 +208,14 @@ async def get_timestamps(
     + field_names_used_for_score_calculation(aggs_material_validation)}`.
     """,
 )
-async def score(*, collection_id: UUID = Depends(collection_id_param)):
-    collection_stats = await search_score(
-        noderef_id=collection_id, resource_type=ResourceType.COLLECTION
+async def score(*, node_id: uuid.UUID = Depends(collection_id_param)):
+    collection_stats = search_score(
+        node_id=node_id, resource_type=ResourceType.COLLECTION
     )
 
     collection_scores = calc_scores(stats=collection_stats)
 
-    material_stats = await search_score(
-        noderef_id=collection_id, resource_type=ResourceType.MATERIAL
-    )
+    material_stats = search_score(node_id=node_id, resource_type=ResourceType.MATERIAL)
 
     material_scores = calc_scores(stats=material_stats)
@@ -230,9 +254,10 @@ async def ping_api():
     status_code=HTTP_200_OK,
     responses={HTTP_404_NOT_FOUND: {"description": "Collection not found"}},
     tags=[_TAG_COLLECTIONS],
+    description="Returns the tree of collections.",
 )
 async def get_collection_tree(
-    *, node_id: UUID = Depends(node_ids_for_major_collections)
+    *, node_id: uuid.UUID = Depends(node_ids_for_major_collections)
 ):
     return await collection_tree(node_id)
@@ -247,7 +272,7 @@ async def get_collection_tree(
 )
 async def get_collection_counts(
     *,
-    node_id: UUID = Depends(node_ids_for_major_collections),
+    node_id: uuid.UUID = Depends(node_ids_for_major_collections),
     facet: AggregationMappings = Param(
         default=AggregationMappings.lrt,
         examples={key: {"value": key} for key in AggregationMappings},
@@ -270,7 +295,7 @@ async def get_collection_counts(
 )
 async def filter_collections_with_missing_attributes(
     *,
-    node_id: UUID = Depends(node_ids_for_major_collections),
+    node_id: uuid.UUID = Depends(node_ids_for_major_collections),
     missing_attribute: str = Path(
         ...,
         examples={
@@ -279,3 +304,110 @@ async def filter_collections_with_missing_attributes(
         ),
 ):
     return await collections_with_missing_attributes(node_id, missing_attribute)
+
+
+@router.get(
+    "/collections/{node_id}/pending-materials/{missing_attr}",
+    response_model=list[LearningMaterial],
+    response_model_exclude_unset=True,
+    status_code=HTTP_200_OK,
+    responses={HTTP_404_NOT_FOUND: {"description": "Collection not found"}},
+    tags=[_TAG_COLLECTIONS],
+    description="""A list of missing entries for different types of materials by subcollection.
+    Searches for materials with one of the following properties being empty or missing: """
+    + f"{', '.join([entry.value for entry in missing_attribute_filter])}.",
+)
+async def filter_materials_with_missing_attributes(
+    *,
+    node_id: uuid.UUID = Depends(node_ids_for_major_collections),
+    missing_attr_filter: MissingAttributeFilter = Depends(materials_filter_params),
+    response_fields: Optional[set[LearningMaterialAttribute]] = Depends(
+        material_response_fields
+    ),
+):
+    return await get_materials_with_missing_attributes(
+        missing_attr_filter, node_id, response_fields
+    )
+
+
+@router.get(
+    "/collections/{node_id}/stats/descendant-collections-materials-counts",
+    response_model=list[CollectionMaterialsCount],
+    status_code=HTTP_200_OK,
+    responses={HTTP_404_NOT_FOUND: {"description": "Collection not found"}},
+    tags=[_TAG_STATISTICS],
+    description="""Returns the number of materials connected to all collections
+    below this 'node_id' as a flat list.""",
+)
+async def material_counts_tree(
+    *, node_id: uuid.UUID = Depends(node_ids_for_major_collections)
+):
+    return await get_material_count_tree(node_id)
+
+
+@router.get(
+    "/analytics/{node_id}",
+    response_model=StatsResponse,
+    status_code=HTTP_200_OK,
+    responses={HTTP_404_NOT_FOUND: {"description": "Collection not found"}},
+    tags=[_TAG_STATISTICS],
+    description=f"""
+    Returns the number of materials found connected to this collection's 'node_id' and its sub-collections,
+    as well as materials containing the name of the respective collection, e.g., in the title.
+    It is therefore an overview of materials which could be added to a collection in the future.
+    It relies on background data, which is refreshed every {BACKGROUND_TASK_TIME_INTERVAL}s.
+    This is the granularity of the data.""",
+)
+async def read_stats(*, node_id: uuid.UUID = Depends(node_ids_for_major_collections)):
+    return await overall_stats(node_id)
+
+
+@router.get(
+    "/analytics/{node_id}/validation/collections",
+    response_model=list[ValidationStatsResponse[CollectionValidationStats]],
+    response_model_exclude_unset=True,
+    status_code=HTTP_200_OK,
+    responses={HTTP_404_NOT_FOUND: {"description": "Collection not found"}},
+    tags=[_TAG_STATISTICS],
+    description=f"""
+    Returns the number of collections missing certain properties for this collection's 'node_id' and its
+    sub-collections. It relies on background data, which is refreshed every {BACKGROUND_TASK_TIME_INTERVAL}s.
+    This is the granularity of the data.""",
+)
+async def read_stats_validation_collection(
+    *,
+    node_id: uuid.UUID = Depends(node_ids_for_major_collections),
+):
+    return collections_with_missing_properties(node_id)
+
+
+@router.get(
+    "/analytics/{node_id}/validation",
+    response_model=list[ValidationStatsResponse[MaterialValidationStats]],
+    response_model_exclude_unset=True,
+    status_code=HTTP_200_OK,
+    responses={HTTP_404_NOT_FOUND: {"description": "Collection not found"}},
+    tags=[_TAG_STATISTICS],
+    description="""
+    Returns the number of materials missing certain properties for this collection's 'node_id' and its
+    sub-collections.
+
+    This endpoint is similar to '/analytics/node_id/validation/collections', but instead of showing missing
+    properties in collections, it counts the materials inside each collection that are missing that property."""
+    + f" It relies on background data, which is refreshed every {BACKGROUND_TASK_TIME_INTERVAL}s. "
+    + "This is the granularity of the data.",
+)
+async def read_stats_validation(
+    *,
+    node_id: uuid.UUID = Depends(node_ids_for_major_collections),
+):
+    return materials_with_missing_properties(node_id)
+
+
+if API_DEBUG:
+
+    @router.get(
+        "/global",
+        description="""A debug endpoint to access the data stored inside the global storage.""",
+    )
+    async def get_global():
+        return global_storage
diff --git a/src/app/api/collections/counts.py b/src/app/api/collections/counts.py
index 057aed9..638fede 100644
--- a/src/app/api/collections/counts.py
+++ b/src/app/api/collections/counts.py
@@ -1,6 +1,6 @@
+import uuid
 from enum import Enum
 from typing import Optional
-from uuid import UUID
 
 from elasticsearch_dsl import A
 from pydantic import BaseModel
@@ -13,10 +13,11 @@
 class CollectionTreeCount(BaseModel):
     """
-    A preliminary model to yield the total number of collections as well as counts for specific metrics, e.g. OER licence
+    A preliminary model to yield the total number of collections as well as counts for specific metrics,
+    e.g. OER licence
     """
 
-    noderef_id: UUID
+    noderef_id: uuid.UUID
     total: int
     counts: dict[str, int]
@@ -33,8 +34,8 @@ class AggregationMappings(str, Enum):
     license = ("properties.ccm:commonlicense_key.keyword",)
 
 
-def collection_counts_search(node_id: UUID, facet: AggregationMappings) -> Search:
-    s = Search().base_filters().query(query_materials(ancestor_id=node_id))
+def collection_counts_search(node_id: uuid.UUID, facet: AggregationMappings) -> Search:
+    s = Search().base_filters().query(query_materials(node_id=node_id))
     material_agg = A(
         "terms", field="collections.nodeRef.id.keyword", size=ELASTIC_TOTAL_SIZE
     )
@@ -59,7 +60,7 @@
 async def collection_counts(
-    node_id: UUID, facet: AggregationMappings
+    node_id: uuid.UUID, facet: AggregationMappings
 ) -> Optional[list[CollectionTreeCount]]:
     response = collection_counts_search(node_id, facet).execute()
     if response.success():
diff --git a/src/app/api/collections/descendants.py b/src/app/api/collections/descendants.py
new file mode 100644
index 0000000..71dcb8d
--- /dev/null
+++ b/src/app/api/collections/descendants.py
@@ -0,0 +1,266 @@
+import uuid
+from itertools import chain
+from typing import Optional, Type, TypeVar, Union
+
+from elasticsearch_dsl.aggs import A, Agg
+from elasticsearch_dsl.query import Query
+from elasticsearch_dsl.response import Response
+from glom import Coalesce, Iter, glom
+from pydantic import BaseModel, Extra
+
+from app.api.collections.missing_materials import (
+    ElasticResource,
+    EmptyStrToNone,
+    LearningMaterialAttribute,
+)
+from app.api.collections.utils import all_source_fields
+from app.core.config import ELASTIC_TOTAL_SIZE
+from app.elastic.dsl import qbool, qmatch
+from app.elastic.elastic import ResourceType, query_materials, type_filter
+from app.elastic.fields import ElasticField, ElasticFieldType
+from app.elastic.search import Search
+from app.elastic.utils import handle_text_field
+from app.models import ElasticResourceAttribute
+
+
+class _CollectionAttribute(ElasticField):
+    TITLE = ("properties.cm:title", ElasticFieldType.TEXT)
+    DESCRIPTION = ("properties.cm:description", ElasticFieldType.TEXT)
+    PATH = ("path", ElasticFieldType.KEYWORD)
+    PARENT_ID = ("parentRef.id", ElasticFieldType.KEYWORD)
+    NODE_ID = ("nodeRef.id", ElasticFieldType.KEYWORD)
+
+
+_COLLECTION = TypeVar("_COLLECTION")
+# TODO: Remove duplicate
+CollectionAttribute = ElasticField(
+    "CollectionAttribute",
+    [
+        (f.name, (f.value, f.field_type))
+        for f in chain(ElasticResourceAttribute, _CollectionAttribute)
+    ],
+)
+
+
+class ResponseConfig:
+    allow_population_by_field_name = True
+    extra = Extra.ignore
+
+
+class ResponseModel(BaseModel):
+    class Config(ResponseConfig):
+        pass
+
+
+class CollectionMaterialsCount(ResponseModel):
+    noderef_id: uuid.UUID
+    title: str
+    materials_count: int
+
+
+_DESCENDANT_COLLECTIONS_MATERIALS_COUNTS = TypeVar(
+    "_DESCENDANT_COLLECTIONS_MATERIALS_COUNTS"
+)
+
+
+# TODO: Refactor
+class DescendantCollectionsMaterialsCounts(BaseModel):
+    results: list[CollectionMaterialsCount]
+
+    class Config:
+        arbitrary_types_allowed = True
+        allow_population_by_field_name = True
+        extra = Extra.forbid
+
+    @classmethod
+    def parse_elastic_response(
+        cls: Type[_DESCENDANT_COLLECTIONS_MATERIALS_COUNTS],
+        response: Response,
+    ) -> _DESCENDANT_COLLECTIONS_MATERIALS_COUNTS:
+        results = glom(
+            response,
+            (
+                "aggregations.grouped_by_collection.buckets",
+                [{"noderef_id": "key.noderef_id", "materials_count": "doc_count"}],
+            ),
+        )
+        return cls.construct(
+            results=[
+                CollectionMaterialsCount.construct(**record) for record in results
+            ],
+        )
+
+
+def aterms(qfield: Union[ElasticField, str], **kwargs) -> Agg:
+    kwargs["field"] = handle_text_field(qfield)
+    return A("terms", **kwargs)
+
+
+def acomposite(sources: list[Union[Query, dict]], **kwargs) -> Agg:
+    return A("composite", sources=sources, **kwargs)
+
+
+def agg_materials_by_collection(size: int = 65536) -> Agg:
+    return acomposite(
+        sources=[
+            {
+                "noderef_id": aterms(
+                    qfield=LearningMaterialAttribute.COLLECTION_NODEREF_ID
+                )
+            }
+        ],
+        size=size,
+    )
+
+
+def abucketsort(sort: list[Union[Query, dict]], **kwargs) -> Agg:
+    return A("bucket_sort", sort=sort, **kwargs)
+
+
+def material_counts_by_descendant(
+    node_id: uuid.UUID,
+) -> DescendantCollectionsMaterialsCounts:
+    search = material_counts_search(node_id)
+    response: Response = search.execute()
+
+    if response.success():
+        return DescendantCollectionsMaterialsCounts.parse_elastic_response(response)
+
+
+def material_counts_search(node_id: uuid.UUID):
+    s = Search().base_filters().query(query_materials(node_id=node_id))
+    s.aggs.bucket("grouped_by_collection", agg_materials_by_collection()).pipeline(
+        "sorted_by_count",
+        abucketsort(sort=[{"_count": {"order": "asc"}}]),
+    )
+    return s
+
+
+class CollectionBase(ElasticResource):
+    title: Optional[EmptyStrToNone] = None
+    keywords: Optional[list[str]] = None
+    description: Optional[EmptyStrToNone] = None
+    path: Optional[list[uuid.UUID]] = None
+    parent_id: Optional[uuid.UUID] = None
+
+    source_fields = {
+        CollectionAttribute.NODEREF_ID,
+        CollectionAttribute.TYPE,
+        CollectionAttribute.NAME,
+        CollectionAttribute.TITLE,
+        CollectionAttribute.KEYWORDS,
+        CollectionAttribute.DESCRIPTION,
+        CollectionAttribute.PATH,
+        CollectionAttribute.PARENT_ID,
+    }
+
+    @classmethod
+    def parse_elastic_hit_to_dict(
+        cls: Type[_COLLECTION],
+        hit: dict,
+    ) -> dict:
+        spec = {
+            "title": Coalesce(CollectionAttribute.TITLE.path, default=None),
+            "keywords": (
+                Coalesce(CollectionAttribute.KEYWORDS.path, default=[]),
+                Iter().all(),
+            ),
+            "description": Coalesce(CollectionAttribute.DESCRIPTION.path, default=None),
+            "path": (
+                Coalesce(CollectionAttribute.PATH.path, default=[]),
+                Iter().all(),
+            ),
+            "parent_id": Coalesce(CollectionAttribute.PARENT_ID.path, default=None),
+        }
+        return {
+            **super(CollectionBase, cls).parse_elastic_hit_to_dict(hit),
+            **glom(hit, spec),
+        }
+
+    @classmethod
+    def parse_elastic_hit(
+        cls: Type[_COLLECTION],
+        hit: dict,
+    ) -> _COLLECTION:
+        collection = cls.construct(**cls.parse_elastic_hit_to_dict(hit))
+        try:
+            collection.parent_id = collection.path[-1]
+        except IndexError:
+            pass
+        return collection
+
+
+# TODO: Duplicate naming with Collection in other modules
+class Collection(ResponseModel, CollectionBase):
+    pass
+
+
+def descendants_search(node_id: uuid.UUID, max_hits):
+    query = {
+        "filter": [*type_filter[ResourceType.COLLECTION]],
+        "minimum_should_match": 1,
+        "should": [
+            qmatch(**{"path": node_id}),
+            qmatch(**{"nodeRef.id": node_id}),
+        ],
+    }
+    return (
+        Search()
+        .base_filters()
+        .query(qbool(**query))
+        .source(includes=[source.path for source in all_source_fields])[:max_hits]
+    )
+
+
+def get_many_descendants(
+    node_id: Optional[uuid.UUID] = None,
+    max_hits: Optional[int] = ELASTIC_TOTAL_SIZE,
+) -> list[Collection]:
+    search = descendants_search(node_id, max_hits)
+
+    response = search.execute()
+
+    if response.success():
+        return [Collection.parse_elastic_hit(hit) for hit in response]
+
+
+async def get_material_count_tree(node_id) -> list[CollectionMaterialsCount]:
+    """
+    TODO: Refactor this function, it is very unclear.
+
+    :param node_id:
+    :return:
+    """
+    descendant_collections = get_many_descendants(node_id=node_id)
+    materials_counts = material_counts_by_descendant(
+        node_id=node_id,
+    )
+    descendant_collections = {
+        collection.noderef_id: collection.title for collection in descendant_collections
+    }
+    stats = []
+    for record in materials_counts.results:
+        try:
+            title = descendant_collections.pop(record.noderef_id)
+        except KeyError:
+            continue
+
+        stats.append(
+            CollectionMaterialsCount(
+                noderef_id=record.noderef_id,
+                title=title,
+                materials_count=record.materials_count,
+            )
+        )
+    stats = [
+        *[
+            CollectionMaterialsCount(
+                noderef_id=noderef_id,
+                title=title,
+                materials_count=0,
+            )
+            for (noderef_id, title) in descendant_collections.items()
+        ],
+        *stats,
+    ]
+    return stats
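
The `parse_elastic_hit_to_dict` methods above all use the same glom idiom: `Coalesce` supplies a default when a path is missing, and `Iter().all()` materializes list-like values. A toy run on a made-up hit:

```python
# Toy illustration of the glom spec style used in parse_elastic_hit_to_dict.
from glom import Coalesce, Iter, glom

hit = {"properties": {"cm:title": "Algebra", "cclom:general_keyword": ["math"]}}

spec = {
    "title": Coalesce("properties.cm:title", default=None),
    "keywords": (
        Coalesce("properties.cclom:general_keyword", default=[]),
        Iter().all(),
    ),
    "description": Coalesce("properties.cm:description", default=None),
}

assert glom(hit, spec) == {"title": "Algebra", "keywords": ["math"], "description": None}
```
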
qmatch(**{"path": node_id}), + qmatch(**{"nodeRef.id": node_id}), ], "must_not": Q("wildcard", **{missing_attribute: {"value": "*"}}), } @@ -74,11 +63,11 @@ def missing_attributes_search( async def collections_with_missing_attributes( - noderef_id: UUID, + node_id: uuid.UUID, missing_attribute: str, max_hits: Optional[int] = ELASTIC_TOTAL_SIZE, ) -> list[MissingMaterials]: - search = missing_attributes_search(noderef_id, missing_attribute, max_hits) + search = missing_attributes_search(node_id, missing_attribute, max_hits) response = search.execute() if response.success(): diff --git a/src/app/api/collections/missing_materials.py b/src/app/api/collections/missing_materials.py new file mode 100644 index 0000000..94eaf1f --- /dev/null +++ b/src/app/api/collections/missing_materials.py @@ -0,0 +1,255 @@ +import uuid +from typing import ClassVar, Optional, Type, TypeVar + +from elasticsearch_dsl import Q +from fastapi.params import Path, Query +from glom import Coalesce, Iter, glom +from pydantic import BaseModel, Extra +from pydantic.validators import str_validator + +from app.api.score.models import LearningMaterialAttribute +from app.core.config import ELASTIC_TOTAL_SIZE +from app.elastic.dsl import qbool, qmatch, qterm +from app.elastic.elastic import ( + ResourceType, + query_missing_material_license, + type_filter, +) +from app.elastic.fields import ElasticField +from app.elastic.search import Search +from app.models import _ELASTIC_RESOURCE, ElasticResourceAttribute + +_LEARNING_MATERIAL = TypeVar("_LEARNING_MATERIAL") + + +def empty_to_none(v: str) -> Optional[str]: + if v == "": + return None + return v + + +class EmptyStrToNone(str): + @classmethod + def __get_validators__(cls): + yield str_validator + yield empty_to_none + + +class ElasticConfig: + allow_population_by_field_name = True + extra = Extra.allow + + +class ElasticResource(BaseModel): + noderef_id: uuid.UUID + type: Optional[EmptyStrToNone] = None + name: Optional[EmptyStrToNone] = None + + source_fields: ClassVar[set] = { + ElasticResourceAttribute.NODEREF_ID, + ElasticResourceAttribute.TYPE, + ElasticResourceAttribute.NAME, + } + + class Config(ElasticConfig): + pass + + @classmethod + def parse_elastic_hit_to_dict( + cls: Type[_ELASTIC_RESOURCE], + hit: dict, + ) -> dict: + spec = { + "noderef_id": ElasticResourceAttribute.NODEREF_ID.path, + "type": Coalesce(ElasticResourceAttribute.TYPE.path, default=None), + "name": Coalesce(ElasticResourceAttribute.NAME.path, default=None), + } + return glom(hit, spec) + + @classmethod + def parse_elastic_hit( + cls: Type[_ELASTIC_RESOURCE], + hit: dict, + ) -> _ELASTIC_RESOURCE: + return cls.construct(**cls.parse_elastic_hit_to_dict(hit)) + + +class LearningMaterialBase(ElasticResource): + title: Optional[EmptyStrToNone] = None + keywords: Optional[list[str]] = None + edu_context: Optional[list[str]] = None + subjects: Optional[list[str]] = None + www_url: Optional[str] = None + description: Optional[EmptyStrToNone] = None + licenses: Optional[EmptyStrToNone] = None + + source_fields: ClassVar[set] = { + LearningMaterialAttribute.NODEREF_ID, + LearningMaterialAttribute.TYPE, + LearningMaterialAttribute.NAME, + LearningMaterialAttribute.TITLE, + LearningMaterialAttribute.KEYWORDS, + LearningMaterialAttribute.EDU_CONTEXT, + LearningMaterialAttribute.SUBJECTS, + LearningMaterialAttribute.WWW_URL, + LearningMaterialAttribute.DESCRIPTION, + LearningMaterialAttribute.LICENSES, + } + + @classmethod + def parse_elastic_hit_to_dict( + cls: Type[_LEARNING_MATERIAL], + hit: dict, + ) -> dict: 
+ spec = { + "title": Coalesce(LearningMaterialAttribute.TITLE.path, default=None), + "keywords": ( + Coalesce(LearningMaterialAttribute.KEYWORDS.path, default=[]), + Iter().all(), + ), + "edu_context": ( + Coalesce(LearningMaterialAttribute.EDU_CONTEXT.path, default=[]), + Iter().all(), + ), + "subjects": ( + Coalesce(LearningMaterialAttribute.SUBJECTS.path, default=[]), + Iter().all(), + ), + "www_url": Coalesce(LearningMaterialAttribute.WWW_URL.path, default=None), + "description": ( + Coalesce(LearningMaterialAttribute.DESCRIPTION.path, default=[]), + (Iter().all(), "\n".join), + ), + "licenses": ( + Coalesce(LearningMaterialAttribute.LICENSES.path, default=[]), + (Iter().all(), "\n".join), + ), + } + return { + **super(LearningMaterialBase, cls).parse_elastic_hit_to_dict(hit), + **glom(hit, spec), + } + + +class ResponseConfig: + allow_population_by_field_name = True + extra = Extra.ignore + + +class ResponseModel(BaseModel): + class Config(ResponseConfig): + pass + + +class LearningMaterial(ResponseModel, LearningMaterialBase): + pass + + +LearningMaterialResponseField = ElasticField( + "MaterialAttribute", + [(f.name, (f.value, f.field_type)) for f in LearningMaterialAttribute], +) + + +def material_response_fields( + *, response_fields: set[LearningMaterialResponseField] = Query(None) +) -> set[LearningMaterialAttribute]: + return response_fields + + +MissingMaterialField = ElasticField( + "MissingMaterialField", + [ + (f.name, (f.value, f.field_type)) + for f in [ + LearningMaterialAttribute.NAME, + LearningMaterialAttribute.TITLE, + LearningMaterialAttribute.KEYWORDS, + LearningMaterialAttribute.EDU_CONTEXT, + LearningMaterialAttribute.SUBJECTS, + LearningMaterialAttribute.WWW_URL, + LearningMaterialAttribute.DESCRIPTION, + LearningMaterialAttribute.LICENSES, + ] + ], +) + + +class MissingAttributeFilter(BaseModel): + attr: MissingMaterialField + + +def materials_filter_params( + *, missing_attr: MissingMaterialField = Path(...) 
+) -> MissingAttributeFilter: + return MissingAttributeFilter(attr=missing_attr) + + +base_filter = [ + qterm(qfield=ElasticResourceAttribute.PERMISSION_READ, value="GROUP_EVERYONE"), + qterm(qfield=ElasticResourceAttribute.EDU_METADATASET, value="mds_oeh"), + qterm(qfield=ElasticResourceAttribute.PROTOCOL, value="workspace"), +] + + +def missing_attributes_search( + node_id: uuid.UUID, missing_attribute: str, max_hits: int +) -> Search: + if missing_attribute == LearningMaterialAttribute.LICENSES.path: + missing_attribute_query = {"filter": query_missing_material_license()} + else: + missing_attribute_query = { + "must_not": Q("wildcard", **{missing_attribute: {"value": "*"}}) + } + query = { + "filter": [*type_filter[ResourceType.MATERIAL]], + "minimum_should_match": 1, + "should": [ + qmatch(**{"path": node_id}), + qmatch(**{"nodeRef.id": node_id}), + ], + **missing_attribute_query, + } + + return ( + Search() + .base_filters() + .query(qbool(**query)) + .source(includes=[source.path for source in LearningMaterial.source_fields])[ + :max_hits + ] + ) + + +async def get_many( + node_id: Optional[uuid.UUID] = None, + missing_attr_filter: Optional[MissingAttributeFilter] = None, +) -> list[LearningMaterial]: + search = missing_attributes_search( + node_id, missing_attr_filter.attr.value, ELASTIC_TOTAL_SIZE + ) + response = search.execute() + if response.success(): + return [LearningMaterial.parse_elastic_hit(hit) for hit in response] + + +def filter_response_fields( + items: list[BaseModel], response_fields: set[ElasticField] = None +) -> list[BaseModel]: + if response_fields: + return [ + i.copy(include={f.name.lower() for f in response_fields}) for i in items + ] + return items + + +async def get_materials_with_missing_attributes( + missing_attr_filter, node_id, response_fields +): + if response_fields: + response_fields.add(LearningMaterialAttribute.NODEREF_ID) + materials = await get_many( + node_id=node_id, + missing_attr_filter=missing_attr_filter, + ) + return filter_response_fields(materials, response_fields=response_fields) diff --git a/src/app/api/collections/models.py b/src/app/api/collections/models.py index 3d0f6e7..172fb90 100644 --- a/src/app/api/collections/models.py +++ b/src/app/api/collections/models.py @@ -2,17 +2,17 @@ annotations, ) +import uuid from typing import Optional -from uuid import UUID from pydantic import BaseModel class CollectionNode(BaseModel): - noderef_id: UUID + noderef_id: uuid.UUID title: Optional[str] # might be none due to data model children: list[CollectionNode] - parent_id: Optional[UUID] + parent_id: Optional[uuid.UUID] class MissingMaterials(CollectionNode): @@ -20,7 +20,7 @@ class MissingMaterials(CollectionNode): A model containing information about entries which miss, e.g, a description. 
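
The `EmptyStrToNone` custom type above runs plain string validation first and then maps `""` to `None`. A minimal, self-contained demonstration (pydantic v1):

```python
# Demonstrates the EmptyStrToNone pattern from missing_materials.py (pydantic v1).
from typing import Optional

from pydantic import BaseModel
from pydantic.validators import str_validator


def empty_to_none(v: str) -> Optional[str]:
    return None if v == "" else v


class EmptyStrToNone(str):
    @classmethod
    def __get_validators__(cls):
        yield str_validator  # standard str coercion/validation first
        yield empty_to_none  # then map empty strings to None


class Demo(BaseModel):
    title: Optional[EmptyStrToNone] = None


assert Demo(title="").title is None
assert Demo(title="Algebra").title == "Algebra"
```
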
diff --git a/src/app/api/collections/models.py b/src/app/api/collections/models.py
index 3d0f6e7..172fb90 100644
--- a/src/app/api/collections/models.py
+++ b/src/app/api/collections/models.py
@@ -2,17 +2,17 @@
     annotations,
 )
 
+import uuid
 from typing import Optional
-from uuid import UUID
 
 from pydantic import BaseModel
 
 
 class CollectionNode(BaseModel):
-    noderef_id: UUID
+    noderef_id: uuid.UUID
     title: Optional[str]  # might be none due to data model
     children: list[CollectionNode]
-    parent_id: Optional[UUID]
+    parent_id: Optional[uuid.UUID]
 
 
 class MissingMaterials(CollectionNode):
@@ -20,7 +20,7 @@ class MissingMaterials(CollectionNode):
     A model containing information about entries which miss, e.g., a description.
 
     By returning this model the editors know enough about the entry to find and correct it
 
-    :param
+    param
     description: a free text description of the context
     path: the complete id path, i.e., from parent node id up to the root id of elastic search
     type: Indicates the type of content, must be ccm:map in the current implementation
diff --git a/src/app/api/collections/tree.py b/src/app/api/collections/tree.py
index c8193ab..cd07668 100644
--- a/src/app/api/collections/tree.py
+++ b/src/app/api/collections/tree.py
@@ -1,4 +1,4 @@
-from uuid import UUID
+import uuid
 
 from aiohttp import ClientSession
 from elasticsearch_dsl.response import Response
@@ -13,14 +13,14 @@
 from app.models import CollectionAttribute, ElasticResourceAttribute
 
 
-def build_portal_tree(collections: list, root_noderef_id: UUID) -> list[CollectionNode]:
-    tree_hierarchy = {str(root_noderef_id): []}
+def build_portal_tree(collections: list, root_id: uuid.UUID) -> list[CollectionNode]:
+    tree_hierarchy = {str(root_id): []}
 
     for collection in collections:
         if collection.title:
             tree_hierarchy.update(build_hierarchy(collection, tree_hierarchy))
 
-    return tree_hierarchy[str(root_noderef_id)]
+    return tree_hierarchy[str(root_id)]
 
 
 def build_hierarchy(
@@ -40,7 +40,7 @@ def build_hierarchy(
     return tree_hierarchy
 
 
-def tree_search(node_id: UUID) -> Search:
+def tree_search(node_id: uuid.UUID) -> Search:
     s = Search().base_filters().query(qbool(filter=qterm(qfield="path", value=node_id)))
     s = s.source(
         ["nodeRef.id", "properties.cm:title", "collections.path", "parentRef.id"]
@@ -65,7 +65,7 @@
 }
 
 
-def tree_from_elastic(node_id: UUID) -> list[CollectionNode]:
+def tree_from_elastic(node_id: uuid.UUID) -> list[CollectionNode]:
     response: Response = tree_search(node_id).execute()
 
     if response.success():
@@ -76,7 +76,7 @@ def tree_from_elastic(node_id: UUID) -> list[CollectionNode]:
 
 
 async def collection_tree(
-    node_id: UUID, use_vocabs: bool = False
+    node_id: uuid.UUID, use_vocabs: bool = False
 ) -> list[CollectionNode]:
     if use_vocabs:
         async with ClientSession() as session:
diff --git a/src/app/api/collections/utils.py b/src/app/api/collections/utils.py
index b6d56ac..ec21a71 100644
--- a/src/app/api/collections/utils.py
+++ b/src/app/api/collections/utils.py
@@ -4,6 +4,7 @@
 from glom import glom
 
 from app.api.collections.models import CollectionNode, MissingMaterials
+from app.models import CollectionAttribute, ElasticResourceAttribute
 
 T = TypeVar("T", CollectionNode, MissingMaterials)
 
@@ -12,3 +13,15 @@ def map_elastic_response_to_model(
     response: Response, specs: dict, model: Generic[T]
 ) -> list[T]:
     return [model(**glom(hit.to_dict(), specs)) for hit in response]
+
+
+all_source_fields: list = [
+    ElasticResourceAttribute.NODEREF_ID,
+    ElasticResourceAttribute.TYPE,
+    ElasticResourceAttribute.NAME,
+    CollectionAttribute.TITLE,
+    ElasticResourceAttribute.KEYWORDS,
+    CollectionAttribute.DESCRIPTION,
+    CollectionAttribute.PATH,
+    CollectionAttribute.PARENT_ID,
+]
diff --git a/src/app/api/collections/vocabs.py b/src/app/api/collections/vocabs.py
index bf15891..85809c9 100644
--- a/src/app/api/collections/vocabs.py
+++ b/src/app/api/collections/vocabs.py
@@ -1,4 +1,4 @@
-from uuid import UUID
+import uuid
 
 from aiohttp import ClientSession
 
@@ -7,7 +7,7 @@
 
 
 async def tree_from_vocabs(
-    session: ClientSession, node_id: UUID
+    session: ClientSession, node_id: uuid.UUID
 ) -> list[CollectionNode]:
     url = f"https://vocabs.openeduhub.de/w3id.org/openeduhub/vocabs/oeh-topics/{node_id}.json"
     response = await session.get(url=url)
diff --git a/src/app/api/quality_matrix/collections.py b/src/app/api/quality_matrix/collections.py
index 7ba87ef..db1b776 100644
--- a/src/app/api/quality_matrix/collections.py
+++ b/src/app/api/quality_matrix/collections.py
@@ -1,4 +1,4 @@
-from uuid import UUID
+import uuid
 
 from elasticsearch_dsl import Q
 from elasticsearch_dsl.response import Response
@@ -17,11 +17,11 @@
 _TITLE_PROPERTY = "properties.cm:title"
 
 
-def queried_collections(node_id: UUID = COLLECTION_ROOT_ID) -> dict[str, int]:
+def queried_collections(node_id: uuid.UUID = COLLECTION_ROOT_ID) -> dict[str, int]:
     """
     Query collection ID's and number of entries connected to this node id from Elasticsearch.
 
-    :param node_id: Parent node ID, from which to search childrens.
+    param node_id: Parent node ID, from which to search children.
     :return: Dictionary of node_id: total count of entries connected to this node id
     """
     aggregation_name = "unique_collections"
@@ -44,7 +44,7 @@ def queried_collections(node_id: UUID = COLLECTION_ROOT_ID) -> dict[str, int]:
     return extract_sources_from_response(response, aggregation_name)
 
 
-async def id_to_title_mapping(node_id: UUID) -> dict[str, str]:
+async def id_to_title_mapping(node_id: uuid.UUID) -> dict[str, str]:
     s = (
         Search()
         .base_filters()
@@ -65,7 +65,7 @@
 
 
 async def collection_quality(
-    node_id: UUID, match_keyword: str = "path"
+    node_id: uuid.UUID, match_keyword: str = "path"
 ) -> QUALITY_MATRIX_RETURN_TYPE:
     mapping = await id_to_title_mapping(node_id)
     columns = queried_collections(node_id)
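
For orientation, the tree endpoints in `tree.py` above return a recursive structure: each node nests further nodes via `children`. A toy sketch of that shape with a local stand-in class (ids and titles are made up):

```python
# Toy sketch of the recursive tree shape collection_tree returns.
# Node is a local stand-in for app.api.collections.models.CollectionNode.
from dataclasses import dataclass, field


@dataclass
class Node:
    noderef_id: str
    title: str
    children: list["Node"] = field(default_factory=list)


leaf = Node("id-3", "Lineare Gleichungen")
root = Node("id-1", "Mathematik", children=[Node("id-2", "Algebra", [leaf])])
assert root.children[0].children[0].title == "Lineare Gleichungen"
```
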
str = f"{PROPERTIES}.{REPLICATION_SOURCE_ID}", ) -> QUALITY_MATRIX_RETURN_TYPE: properties = get_properties() diff --git a/src/app/api/score/models.py b/src/app/api/score/models.py index 2172e89..4b5cb54 100644 --- a/src/app/api/score/models.py +++ b/src/app/api/score/models.py @@ -114,3 +114,16 @@ class ScoreOutput(BaseModel): materials: MissingMaterialProperties = Field( description="Score for specific material properties" ) + + +required_collection_properties = { + "properties.cclom:title": "title", + "properties.cclom:general_description": "description", + "properties.cclom:general_keyword": "keywords", + "properties.ccm:taxonid": "taxon_id", + "properties.ccm:educationalcontext": "license", + "properties.ccm:commonlicense_key": "license", + "properties.ccm:objecttype": "object_type", + "properties.ccm:containsAdvertisement": "ads_qualifier", + "properties.cclom:oeh_lrt_aggregated": "learning_resource_type", +} diff --git a/src/app/api/score/score.py b/src/app/api/score/score.py index 29c90fd..4fae573 100644 --- a/src/app/api/score/score.py +++ b/src/app/api/score/score.py @@ -1,4 +1,4 @@ -from uuid import UUID +import uuid from elasticsearch_dsl import Q from elasticsearch_dsl.response import Response @@ -52,13 +52,13 @@ def calc_weighted_score(collection_scores: dict, material_scores: dict) -> int: return int((100 * score_sum) / number_of_relevant_terms) -def get_score_search(noderef_id: UUID, resource_type: ResourceType) -> Search: +def get_score_search(node_id: uuid.UUID, resource_type: ResourceType) -> Search: query, aggs = None, None if resource_type is ResourceType.COLLECTION: query, aggs = query_collections, aggs_collection_validation elif resource_type is ResourceType.MATERIAL: query, aggs = query_materials, aggs_material_validation - s = Search().base_filters().query(query(ancestor_id=noderef_id)) + s = Search().base_filters().query(query(node_id=node_id)) for name, _agg in aggs.items(): s.aggs.bucket(name, _agg) return s @@ -71,8 +71,8 @@ def score(response: Response) -> dict: } -async def search_score(noderef_id: UUID, resource_type: ResourceType) -> dict: - s = get_score_search(noderef_id, resource_type) +def search_score(node_id: uuid.UUID, resource_type: ResourceType) -> dict: + s = get_score_search(node_id, resource_type) response: Response = s.execute() @@ -82,8 +82,10 @@ async def search_score(noderef_id: UUID, resource_type: ResourceType) -> dict: def collection_id_param( *, - collection_id: UUID = Path(..., examples=app.core.constants.COLLECTION_NAME_TO_ID), -) -> UUID: + collection_id: uuid.UUID = Path( + ..., examples=app.core.constants.COLLECTION_NAME_TO_ID + ), +) -> uuid.UUID: return collection_id diff --git a/src/app/core/config.py b/src/app/core/config.py index 0161524..7d27e45 100644 --- a/src/app/core/config.py +++ b/src/app/core/config.py @@ -17,3 +17,5 @@ ELASTIC_INDEX = "workspace" ELASTIC_TOTAL_SIZE = 500_000 # Maximum number of entries elasticsearch queries, very large to query all entries ELASTICSEARCH_TIMEOUT = int(os.getenv("ELASTICSEARCH_TIMEOUT", 20)) + +BACKGROUND_TASK_TIME_INTERVAL = 10 * 60 # Time between consecutive background calls diff --git a/src/app/core/errors.py b/src/app/core/errors.py index ea57848..d1403b7 100644 --- a/src/app/core/errors.py +++ b/src/app/core/errors.py @@ -11,16 +11,14 @@ from starlette.status import HTTP_422_UNPROCESSABLE_ENTITY -async def http_error_handler(request: Request, exc: HTTPException) -> JSONResponse: - request +async def http_error_handler(_: Request, exc: HTTPException) -> JSONResponse: return 
JSONResponse({"errors": [exc.detail]}, status_code=exc.status_code) -async def http_422_error_handler(request: Request, exc: HTTPException) -> JSONResponse: +async def http_422_error_handler(_: Request, exc: HTTPException) -> JSONResponse: """ Handler for 422 error to transform default pydantic error object to gothinkster format """ - request errors = {"body": []} if isinstance(exc.detail, Iterable) and not isinstance( diff --git a/src/app/elastic/elastic.py b/src/app/elastic/elastic.py index d18b0f5..e88e8d1 100644 --- a/src/app/elastic/elastic.py +++ b/src/app/elastic/elastic.py @@ -1,5 +1,5 @@ +import uuid from enum import Enum -from uuid import UUID from elasticsearch_dsl.query import Query @@ -23,27 +23,25 @@ class ResourceType(str, Enum): } -def query_many(resource_type: ResourceType, ancestor_id: UUID = None) -> Query: +def query_many(resource_type: ResourceType, node_id: uuid.UUID = None) -> Query: qfilter = [*type_filter[resource_type]] - if ancestor_id: + if node_id: if resource_type is ResourceType.COLLECTION: - qfilter.append(qterm(qfield=CollectionAttribute.PATH, value=ancestor_id)) + qfilter.append(qterm(qfield=CollectionAttribute.PATH, value=node_id)) elif resource_type is ResourceType.MATERIAL: qfilter.append( - qterm( - qfield=LearningMaterialAttribute.COLLECTION_PATH, value=ancestor_id - ) + qterm(qfield=LearningMaterialAttribute.COLLECTION_PATH, value=node_id) ) return qbool(filter=qfilter) -def query_collections(ancestor_id: UUID = None) -> Query: - return query_many(ResourceType.COLLECTION, ancestor_id=ancestor_id) +def query_collections(node_id: uuid.UUID = None) -> Query: + return query_many(ResourceType.COLLECTION, node_id=node_id) -def query_materials(ancestor_id: UUID = None) -> Query: - return query_many(ResourceType.MATERIAL, ancestor_id=ancestor_id) +def query_materials(node_id: uuid.UUID = None) -> Query: + return query_many(ResourceType.MATERIAL, node_id=node_id) def query_missing_material_license() -> Query: diff --git a/src/app/main.py b/src/app/main.py index d0dbbd4..d5f1442 100644 --- a/src/app/main.py +++ b/src/app/main.py @@ -6,6 +6,7 @@ from starlette.status import HTTP_422_UNPROCESSABLE_ENTITY from starlette_context.middleware import RawContextMiddleware +from app.api.analytics.background_task import background_task from app.api.api import router from app.core.config import ALLOWED_HOSTS, API_DEBUG, API_PORT, LOG_LEVEL, ROOT_PATH from app.core.constants import OPEN_API_VERSION @@ -34,6 +35,7 @@ def api() -> FastAPI: _api.add_event_handler("startup", connect_to_elastic) _api.add_event_handler("startup", create_start_app_handler(_api)) + _api.add_event_handler("startup", background_task) _api.add_event_handler("shutdown", create_stop_app_handler(_api)) _api.add_exception_handler(HTTPException, http_error_handler) diff --git a/tests/test_stats.py b/tests/test_stats.py new file mode 100644 index 0000000..c487fdf --- /dev/null +++ b/tests/test_stats.py @@ -0,0 +1,240 @@ +import json +import uuid +from unittest import mock + +import pytest + +from app.api.analytics.models import Collection +from app.api.analytics.stats import ( + Row, + build_material_search, + collections_with_missing_properties, + materials_with_missing_properties, + overall_stats, + query_material_types, +) +from app.api.collections.counts import CollectionTreeCount + + +@pytest.mark.asyncio +async def test_overall_stats(): + test_node = "11bdb8a0-a9f5-4028-becc-cbf8e328dd4b" # Spanish + + directory = "tests/unit_tests/resources" + with open(f"{directory}/test_global.json") as file: + 
diff --git a/tests/test_stats.py b/tests/test_stats.py
new file mode 100644
index 0000000..c487fdf
--- /dev/null
+++ b/tests/test_stats.py
@@ -0,0 +1,240 @@
+import json
+import uuid
+from unittest import mock
+
+import pytest
+
+from app.api.analytics.models import Collection
+from app.api.analytics.stats import (
+    Row,
+    build_material_search,
+    collections_with_missing_properties,
+    materials_with_missing_properties,
+    overall_stats,
+    query_material_types,
+)
+from app.api.collections.counts import CollectionTreeCount
+
+
+@pytest.mark.asyncio
+async def test_overall_stats():
+    test_node = "11bdb8a0-a9f5-4028-becc-cbf8e328dd4b"  # Spanish
+
+    directory = "tests/unit_tests/resources"
+    with open(f"{directory}/test_global.json") as file:
+        global_response = json.load(file)
+
+    # TODO: Refactor with wrapper/fixture/decorator
+    with mock.patch("app.api.analytics.stats.global_storage") as mocked_global:
+
+        def _get_item(_, key):
+            if key == "collections":
+                return [
+                    Collection(
+                        id=entry["id"],
+                        doc=entry["doc"],
+                        derived_at=entry["derived_at"],
+                    )
+                    for entry in global_response[key]
+                ]
+            if key == "counts":
+                return [
+                    CollectionTreeCount(
+                        noderef_id=entry["noderef_id"],
+                        total=entry["total"],
+                        counts=entry["counts"],
+                    )
+                    for entry in global_response[key]
+                ]
+            return global_response[key]
+
+        mocked_global.__getitem__ = _get_item
+
+        with mock.patch(
+            "app.api.analytics.stats.search_hits_by_material_type"
+        ) as mocked_search:
+            with mock.patch("app.api.analytics.stats.get_ids_to_iterate") as mocked_ids:
+                mocked_search.return_value = {"total": 30}
+                mocked_ids.return_value = [
+                    Row(
+                        id=uuid.UUID("f3dc9ea1-d608-4b4e-a78c-98063a3e8461"),
+                        title="test_title",
+                    )
+                ]
+                stats = await overall_stats(test_node)
+
+    assert len(stats.stats) == 1
+    first_key_values = stats.stats[list(stats.stats.keys())[0]]
+
+    # assert correct structure
+    assert list(first_key_values.keys()) == ["search", "material_types"]
+    assert "total" in list(first_key_values["search"].keys())
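A note on the `_get_item(_, key)` signature used throughout these tests: when a plain function is assigned to a magic method of a `MagicMock`, mock attaches it at class level and invokes it with the mock instance as the first argument, hence the throwaway `_`. A standalone sketch (the `storage` dict is illustrative):

```python
# Standalone illustration of the global_storage mocking pattern above.
# Assigning a plain function to __getitem__ on a MagicMock works, but mock
# calls it with the mock instance as the first argument -- which is why the
# tests' _get_item takes a throwaway first parameter.
from unittest import mock

storage = {"collections": ["a", "b"]}

mocked = mock.MagicMock()
mocked.__getitem__ = lambda _self, key: storage[key]

assert mocked["collections"] == ["a", "b"]
```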
"collections": + return [ + Collection( + id=entry["id"], + doc=entry["doc"], + derived_at=entry["derived_at"], + ) + for entry in global_response[key] + ] + if key == "counts": + return [ + CollectionTreeCount( + noderef_id=entry["noderef_id"], + total=entry["total"], + counts=entry["counts"], + ) + for entry in global_response[key] + ] + return global_response[key] + + mocked_global.__getitem__ = _get_item + + result = collections_with_missing_properties(dummy_node) + + assert len(result) == 1 + assert result[0].noderef_id == uuid.UUID("f3dc9ea1-d608-4b4e-a78c-98063a3e8461") + assert result[0].validation_stats == { + "title": None, + "keywords": ["missing"], + "description": ["missing"], + "edu_context": None, + } + + +def test_materials_with_missing_properties(): + directory = "tests/unit_tests/resources" + + with open(f"{directory}/test_global.json") as file: + global_response = json.load(file) + + dummy_node = uuid.UUID("11bdb8a0-a9f5-4028-becc-cbf8e328dd4b") + with mock.patch("app.api.analytics.stats.global_storage") as mocked_global: + + def _get_item(_, key): + if key in ["collections", "materials"]: + return [ + Collection( + id=entry["id"], + doc=entry["doc"], + derived_at=entry["derived_at"], + ) + for entry in global_response[key] + ] + if key == "counts": + return [ + CollectionTreeCount( + noderef_id=entry["noderef_id"], + total=entry["total"], + counts=entry["counts"], + ) + for entry in global_response[key] + ] + return global_response[key] + + mocked_global.__getitem__ = _get_item + + result = materials_with_missing_properties(dummy_node) + + assert len(result) == 1 + assert result[0].noderef_id == uuid.UUID("f3dc9ea1-d608-4b4e-a78c-98063a3e8461") + dummy_material_node = uuid.UUID("263afc5b-6445-4a5a-b014-a77f1db473b9") + assert result[0].validation_stats.ads_qualifier.missing == [dummy_material_node] + assert result[0].validation_stats.object_type is None diff --git a/tests/unit_tests/crud/test_descendants.py b/tests/unit_tests/crud/test_descendants.py new file mode 100644 index 0000000..da4a830 --- /dev/null +++ b/tests/unit_tests/crud/test_descendants.py @@ -0,0 +1,79 @@ +import uuid + +from app.api.collections.descendants import descendants_search, material_counts_search + + +def test_descendants_search(): + dummy_node = uuid.UUID("f3dc9ea1-d608-4b4e-a78c-98063a3e8461") + dummy_maximum_hits = 30 + search = descendants_search(dummy_node, dummy_maximum_hits) + + assert search.to_dict() == { + "_source": { + "includes": [ + "nodeRef.id", + "type", + "properties.cm:name", + "properties.cm:title", + "properties.cclom:general_keyword", + "properties.cm:description", + "path", + "parentRef.id", + ] + }, + "from": 0, + "query": { + "bool": { + "filter": [ + {"term": {"permissions.Read.keyword": "GROUP_EVERYONE"}}, + {"term": {"properties.cm:edu_metadataset.keyword": "mds_oeh"}}, + {"term": {"nodeRef.storeRef.protocol": "workspace"}}, + {"term": {"type": "ccm:map"}}, + ], + "minimum_should_match": 1, + "should": [ + {"match": {"path": dummy_node}}, + {"match": {"nodeRef.id": dummy_node}}, + ], + } + }, + "size": dummy_maximum_hits, + } + + +def test_material_counts_search(): + dummy_node = uuid.UUID("f3dc9ea1-d608-4b4e-a78c-98063a3e8461") + search = material_counts_search(dummy_node) + + assert search.to_dict() == { + "aggs": { + "grouped_by_collection": { + "aggs": { + "sorted_by_count": { + "bucket_sort": {"sort": [{"_count": {"order": "asc"}}]} + } + }, + "composite": { + "size": 65536, + "sources": [ + { + "noderef_id": { + "terms": {"field": 
"collections.nodeRef.id.keyword"} + } + } + ], + }, + } + }, + "query": { + "bool": { + "filter": [ + {"term": {"permissions.Read.keyword": "GROUP_EVERYONE"}}, + {"term": {"properties.cm:edu_metadataset.keyword": "mds_oeh"}}, + {"term": {"nodeRef.storeRef.protocol": "workspace"}}, + {"term": {"type": "ccm:io"}}, + {"term": {"collections.path.keyword": dummy_node}}, + ] + } + }, + } diff --git a/tests/unit_tests/crud/test_missing_materials.py b/tests/unit_tests/crud/test_missing_materials.py new file mode 100644 index 0000000..0f35b77 --- /dev/null +++ b/tests/unit_tests/crud/test_missing_materials.py @@ -0,0 +1,69 @@ +""" + +import uuid + +import pytest + +from app.api.collections.missing_materials import ( + LearningMaterialAttribute, + MissingAttributeFilter, + MissingMaterialField, +) + +# TODO: More tests cases to also enable filtering, see __call__ MissingAttributeFilter +@pytest.mark.skip(reason="Outdated") +def test_missing_materials_search(): + dummy_uuid = uuid.uuid4() + attribute = LearningMaterialAttribute.KEYWORDS + dummy_missing_attribute = MissingAttributeFilter( + attr=MissingMaterialField[attribute.name] + ) + dummy_maximum_size = 3 + search = missing_materials_search( + dummy_uuid, dummy_missing_attribute, dummy_maximum_size + ) + actual = search.to_dict() + actual_source = actual["_source"] + actual["_source"] = [] + assert actual == { + "query": { + "bool": { + "filter": [ + {"term": {"permissions.Read.keyword": "GROUP_EVERYONE"}}, + {"term": {"properties.cm:edu_metadataset.keyword": "mds_oeh"}}, + {"term": {"nodeRef.storeRef.protocol": "workspace"}}, + {"term": {"type": "ccm:io"}}, + ], + "should": [ + {"match": {"collections.path": dummy_uuid}}, + {"match": {"collections.nodeRef.id": dummy_uuid}}, + ], + "minimum_should_match": 1, + "must_not": [ + {"wildcard": {dummy_missing_attribute.attr.value: {"value": "*"}}} + ], + } + }, + "from": 0, + "size": dummy_maximum_size, + "_source": [], + } + expected_source = [ + "properties.cclom:general_keyword", + "properties.ccm:taxonid", + "properties.ccm:wwwurl", + "nodeRef.id", + "type", + "properties.ccm:commonlicense_key", + "properties.cclom:general_description", + "properties.cm:name", + "properties.cclom:title", + "properties.ccm:educationalcontext", + ] + + actual_source.sort() + expected_source.sort() + source_contains_equal_elements = actual_source == expected_source + assert source_contains_equal_elements + +""" diff --git a/tests/unit_tests/crud/test_quality_matrix.py b/tests/unit_tests/crud/test_quality_matrix.py index 50633bd..f7b381d 100644 --- a/tests/unit_tests/crud/test_quality_matrix.py +++ b/tests/unit_tests/crud/test_quality_matrix.py @@ -30,8 +30,12 @@ ) async def test_get_properties(): await connect_to_elastic() - data = get_properties() + data = get_properties(False) assert "ccm:author_freetext" in data + assert len(data) > 150 + data = get_properties(True) + assert "cclom:title" in data + assert len(data) == 9 @pytest.mark.asyncio diff --git a/tests/unit_tests/resources/test_global.json b/tests/unit_tests/resources/test_global.json new file mode 100644 index 0000000..cd9f690 --- /dev/null +++ b/tests/unit_tests/resources/test_global.json @@ -0,0 +1,58 @@ +{ + "collections": [ + { + "id": "f3dc9ea1-d608-4b4e-a78c-98063a3e8461", + "doc": { + "path": [ + "5e40e372-735c-4b17-bbf7-e827a5702b57", + "11bdb8a0-a9f5-4028-becc-cbf8e328dd4b" + ], + "nodeRef": { + "storeRef": { + "identifier": "SpacesStore", + "protocol": "workspace" + }, + "id": "f3dc9ea1-d608-4b4e-a78c-98063a3e8461" + }, + "properties": { + 
"cclom:title": "dummy_title" + } + }, + "derived_at": "2022-07-05T16:50:18.789156" + } + ], + "counts": [ + { + "noderef_id": "f3dc9ea1-d608-4b4e-a78c-98063a3e8461", + "total": 18025, + "counts": { + "test_lrt": 3 + } + } + ], + "materials": [ + { + "id": "263afc5b-6445-4a5a-b014-a77f1db473b9", + "doc": { + "collections": [ + { + "nodeRef": { + "id": "f3dc9ea1-d608-4b4e-a78c-98063a3e8461" + } + } + ], + "nodeRef": { + "storeRef": { + "identifier": "SpacesStore", + "protocol": "workspace" + }, + "id": "263afc5b-6445-4a5a-b014-a77f1db473b9" + }, + "properties": { + "ccm:objecttype": "dummy_object_type" + } + }, + "derived_at": "2022-07-07T08:51:25.636346" + } + ] +} diff --git a/tests/unit_tests/test_tree.py b/tests/unit_tests/test_tree.py index d6b85b4..b690618 100644 --- a/tests/unit_tests/test_tree.py +++ b/tests/unit_tests/test_tree.py @@ -30,7 +30,7 @@ async def test_collection_tree(): data = await collection_tree(root_node_id, use_vocabs=True) assert len(data) == 26 count = node_count(data) - assert count == 2217 # adapt this number to the current state, may change regularly + assert count >= 2200 # adapt this number to the current state, may change regularly @pytest.mark.asyncio