diff --git a/src/app/api/analytics/analytics.py b/src/app/api/analytics/analytics.py index f822273..ff6a55a 100644 --- a/src/app/api/analytics/analytics.py +++ b/src/app/api/analytics/analytics.py @@ -43,6 +43,9 @@ class Config(ElasticConfig): class StatsResponse(ResponseModel): derived_at: datetime stats: dict[str, dict[str, COUNT_STATISTICS_TYPE]] + oer_ratio: int = Field( + default=0, ge=0, le=100, description="Overall ratio of OER content" + ) ValidationStatsT = TypeVar("ValidationStatsT") diff --git a/src/app/api/analytics/background_task.py b/src/app/api/analytics/background_task.py index 90c33f8..6dc2fca 100644 --- a/src/app/api/analytics/background_task.py +++ b/src/app/api/analytics/background_task.py @@ -12,6 +12,7 @@ from app.api.analytics.stats import get_ids_to_iterate, search_hits_by_material_type from app.api.analytics.storage import ( _COLLECTION_COUNT, + _COLLECTION_COUNT_OER, _COLLECTIONS, _MATERIALS, _SEARCH, @@ -105,6 +106,9 @@ def run(): app.api.analytics.storage.global_storage[_COLLECTION_COUNT] = asyncio.run( collection_counts(COLLECTION_ROOT_ID, AggregationMappings.lrt) ) + app.api.analytics.storage.global_storage[_COLLECTION_COUNT_OER] = asyncio.run( + collection_counts(COLLECTION_ROOT_ID, AggregationMappings.lrt, oer_only=True) + ) all_collections = asyncio.run(get_ids_to_iterate(node_id=COLLECTION_ROOT_ID)) print("Tree ready to iterate. 
Length: ", len(all_collections)) diff --git a/src/app/api/analytics/stats.py b/src/app/api/analytics/stats.py index 4375962..9b8d99a 100644 --- a/src/app/api/analytics/stats.py +++ b/src/app/api/analytics/stats.py @@ -25,6 +25,7 @@ global_storage, ) from app.api.collections.models import CollectionNode +from app.api.collections.oer import oer_ratio from app.api.collections.tree import collection_tree from app.api.score.models import required_collection_properties from app.core.config import ELASTIC_TOTAL_SIZE @@ -139,7 +140,9 @@ def nodes(data: list[CollectionNode]) -> list: return [Row(id=row[0], title=row[1]) for row in flatten_list(nodes(tree))] -def query_material_types(node_id: uuid.UUID) -> dict[str, COUNT_STATISTICS_TYPE]: +def query_material_types( + node_id: uuid.UUID, oer_only: bool +) -> dict[str, COUNT_STATISTICS_TYPE]: """ get collections with parent id equal to node_id @@ -175,7 +178,7 @@ def filtered_collections(collections: list[Collection], node_id: uuid.UUID): async def stats_latest( - stat_type: StatType, node_id: uuid.UUID + stat_type: StatType, node_id: uuid.UUID, oer_only: bool ) -> dict[str, COUNT_STATISTICS_TYPE]: results = {} @@ -185,18 +188,20 @@ async def stats_latest( stats = search_hits_by_material_type(row.title) results.update({str(row.id): stats}) elif stat_type is StatType.MATERIAL_TYPES: - results = query_material_types(node_id) + results = query_material_types(node_id, oer_only) return results -async def overall_stats(node_id) -> StatsResponse: - search_stats = await stats_latest(stat_type=StatType.SEARCH, node_id=node_id) +async def overall_stats(node_id, oer_only: bool = False) -> StatsResponse: + search_stats = await stats_latest( + stat_type=StatType.SEARCH, node_id=node_id, oer_only=oer_only + ) if not search_stats: raise StatsNotFoundException material_types_stats = await stats_latest( - stat_type=StatType.MATERIAL_TYPES, node_id=node_id + stat_type=StatType.MATERIAL_TYPES, node_id=node_id, oer_only=oer_only ) if not 
material_types_stats: @@ -210,8 +215,10 @@ async def overall_stats(node_id) -> StatsResponse: else: stats_output.update({key: {"material_types": value}}) - output = StatsResponse(derived_at=datetime.datetime.now(), stats=stats_output) - return output + oer = oer_ratio(node_id) + return StatsResponse( + derived_at=datetime.datetime.now(), stats=stats_output, oer_ratio=oer + ) def collections_with_missing_properties( diff --git a/src/app/api/analytics/storage.py b/src/app/api/analytics/storage.py index 151f54c..c6050ae 100644 --- a/src/app/api/analytics/storage.py +++ b/src/app/api/analytics/storage.py @@ -2,6 +2,7 @@ _MATERIALS = "materials" _SEARCH = "search" _COLLECTION_COUNT = "counts" +_COLLECTION_COUNT_OER = "counts_oer" """ A quick fix for a global storage @@ -11,4 +12,5 @@ _MATERIALS: [], _SEARCH: {}, _COLLECTION_COUNT: {}, + _COLLECTION_COUNT_OER: {}, } # TODO: Refactor me ASAP diff --git a/src/app/api/api.py b/src/app/api/api.py index 95ec388..2053e69 100644 --- a/src/app/api/api.py +++ b/src/app/api/api.py @@ -337,8 +337,12 @@ async def material_counts_tree( It relies on background data and is read every {BACKGROUND_TASK_TIME_INTERVAL}s. 
This is the granularity of the data.""", ) -async def read_stats(*, node_id: uuid.UUID = Depends(node_ids_for_major_collections)): - return await overall_stats(node_id) +async def read_stats( + *, + node_id: uuid.UUID = Depends(node_ids_for_major_collections), + oer_only: bool = Query(default=False), +): + return await overall_stats(node_id, oer_only) @router.get( diff --git a/src/app/api/collections/counts.py b/src/app/api/collections/counts.py index b6a895b..bca5f9c 100644 --- a/src/app/api/collections/counts.py +++ b/src/app/api/collections/counts.py @@ -34,7 +34,9 @@ class AggregationMappings(str, Enum): license = ("properties.ccm:commonlicense_key.keyword",) -def collection_counts_search(node_id: uuid.UUID, facet: AggregationMappings) -> Search: +def collection_counts_search( + node_id: uuid.UUID, facet: AggregationMappings, oer_only: bool = False +) -> Search: search = Search().base_filters().query(query_materials(node_id=node_id)) material_agg = A( "terms", field="collections.nodeRef.id.keyword", size=ELASTIC_TOTAL_SIZE @@ -61,9 +63,9 @@ def collection_counts_search(node_id: uuid.UUID, facet: AggregationMappings) -> async def collection_counts( - node_id: uuid.UUID, facet: AggregationMappings + node_id: uuid.UUID, facet: AggregationMappings, oer_only: bool = False ) -> Optional[list[CollectionTreeCount]]: - response = collection_counts_search(node_id, facet).execute() + response = collection_counts_search(node_id, facet, oer_only).execute() if response.success(): return build_counts(response) diff --git a/src/app/api/collections/oer.py b/src/app/api/collections/oer.py new file mode 100644 index 0000000..1436e97 --- /dev/null +++ b/src/app/api/collections/oer.py @@ -0,0 +1,22 @@ +import uuid + +from app.api.collections.counts import ( + _AGGREGATION_NAME, + AggregationMappings, + collection_counts_search, +) + + +def oer_ratio(node_id: uuid.UUID) -> int: + oer_statistics = collection_counts_search(node_id, AggregationMappings.license) + response = 
oer_statistics.execute() + oer_elements = 0 + oer_total = 0 + oer_license = ["CC_0", "PDM", "CC_BY", "CC_BY_SA"] + for data in response.aggregations[_AGGREGATION_NAME].buckets: + for bucket in data["facet"]["buckets"]: + oer_total += bucket["doc_count"] + if bucket["key"] in oer_license: + oer_elements += bucket["doc_count"] + + return round((oer_elements / oer_total) * 100) if oer_total else 0 diff --git a/src/app/api/score/score.py index 8e25d1b..7405d83 100644 --- a/src/app/api/score/score.py +++ b/src/app/api/score/score.py @@ -4,11 +4,7 @@ from elasticsearch_dsl.response import Response from fastapi import Path -from app.api.collections.counts import ( - _AGGREGATION_NAME, - AggregationMappings, - collection_counts_search, -) +from app.api.collections.oer import oer_ratio from app.api.score.models import ( MissingCollectionProperties, MissingMaterialProperties, @@ -158,18 +154,3 @@ async def get_score(node_id: uuid.UUID) -> ScoreOutput: return ScoreOutput( score=score_, collections=collections, materials=materials, oer_ratio=oer ) - - -def oer_ratio(node_id: uuid.UUID) -> int: - oer_statistics = collection_counts_search(node_id, AggregationMappings.license) - response = oer_statistics.execute() - oer_elements = 0 - oer_total = 0 - oer_license = ["CC_0", "PDM", "CC_BY", "CC_BY_SA"] - for data in response.aggregations[_AGGREGATION_NAME].buckets: - for bucket in data["facet"]["buckets"]: - oer_total += bucket["doc_count"] - if bucket["key"] in oer_license: - oer_elements += bucket["doc_count"] - - return round((oer_elements / oer_total) * 100) diff --git a/src/app/core/config.py index 7d27e45..2418630 100644 --- a/src/app/core/config.py +++ b/src/app/core/config.py @@ -19,3 +19,5 @@ ELASTICSEARCH_TIMEOUT = int(os.getenv("ELASTICSEARCH_TIMEOUT", 20)) BACKGROUND_TASK_TIME_INTERVAL = 10 * 60 # Time between consecutive background calls + +ENABLE_DATABASE = True diff --git a/src/app/main.py index d5f1442..3c42454
100644 --- a/src/app/main.py +++ b/src/app/main.py @@ -8,7 +8,14 @@ from app.api.analytics.background_task import background_task from app.api.api import router -from app.core.config import ALLOWED_HOSTS, API_DEBUG, API_PORT, LOG_LEVEL, ROOT_PATH +from app.core.config import ( + ALLOWED_HOSTS, + API_DEBUG, + API_PORT, + ENABLE_DATABASE, + LOG_LEVEL, + ROOT_PATH, +) from app.core.constants import OPEN_API_VERSION from app.core.errors import http_422_error_handler, http_error_handler from app.core.logging import logger @@ -34,9 +41,11 @@ def api() -> FastAPI: _api.add_middleware(RawContextMiddleware) _api.add_event_handler("startup", connect_to_elastic) - _api.add_event_handler("startup", create_start_app_handler(_api)) _api.add_event_handler("startup", background_task) - _api.add_event_handler("shutdown", create_stop_app_handler(_api)) + + if ENABLE_DATABASE: + _api.add_event_handler("startup", create_start_app_handler(_api)) + _api.add_event_handler("shutdown", create_stop_app_handler(_api)) _api.add_exception_handler(HTTPException, http_error_handler) _api.add_exception_handler(HTTP_422_UNPROCESSABLE_ENTITY, http_422_error_handler)