From 7829b58c406afc92dd2bddea269834e7dbe32750 Mon Sep 17 00:00:00 2001 From: Fatimah Zulfiqar Date: Fri, 8 Nov 2024 14:32:59 +0100 Subject: [PATCH] moderation:added match_query_rule and percolator --- site/setup.cfg | 1 + .../moderation/test_moderation_queries.py | 39 ++++++++ site/zenodo_rdm/cli.py | 33 +++++++ site/zenodo_rdm/moderation/config.py | 23 ++++- site/zenodo_rdm/moderation/models.py | 52 +++++++++++ site/zenodo_rdm/moderation/percolator.py | 90 +++++++++++++++++++ site/zenodo_rdm/moderation/rules.py | 30 +++++++ 7 files changed, 267 insertions(+), 1 deletion(-) create mode 100644 site/tests/moderation/test_moderation_queries.py create mode 100644 site/zenodo_rdm/moderation/percolator.py diff --git a/site/setup.cfg b/site/setup.cfg index fafdbbb3..3e095e16 100644 --- a/site/setup.cfg +++ b/site/setup.cfg @@ -35,6 +35,7 @@ tests = [options.entry_points] flask.commands = zenodo-admin = zenodo_rdm.cli:zenodo_admin + moderation = zenodo_rdm.cli:moderation invenio_base.blueprints = zenodo_rdm_legacy = zenodo_rdm.legacy.views:blueprint zenodo_rdm_support = zenodo_rdm.views:create_blueprint diff --git a/site/tests/moderation/test_moderation_queries.py b/site/tests/moderation/test_moderation_queries.py new file mode 100644 index 00000000..f9a1824a --- /dev/null +++ b/site/tests/moderation/test_moderation_queries.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2024 CERN. +# +# ZenodoRDM is free software; you can redistribute it and/or modify +# it under the terms of the MIT License; see LICENSE file for more details. 
# ---------------------------------------------------------------------------
# site/tests/moderation/test_moderation_queries.py
# ---------------------------------------------------------------------------


def test_moderation_query_creation(app):
    """Create a ModerationQuery and check the attributes it was given.

    Uses the ``app`` fixture to obtain an application context.  The query is
    created through ``ModerationQuery.create`` and the session is committed
    by the test itself.
    """
    query_string = "metadata.title:SimpleTest"
    notes = "test query"
    score = 5
    active = True

    with app.app_context():
        query = ModerationQuery.create(
            query_string, ZenodoRDMRecord, notes=notes, score=score, active=active
        )
        db.session.commit()

        # One assert per attribute (instead of ``assert all([...])``) so that
        # a failure reports which attribute is wrong.
        assert query.query_string == query_string
        assert query.notes == notes
        assert query.score == score
        assert query.active == active


# TODO: Add test for matching query


# ---------------------------------------------------------------------------
# site/zenodo_rdm/cli.py
# ---------------------------------------------------------------------------


@click.group()
def moderation():
    """Moderation commands."""


@moderation.command("create-queries-index")
@click.option(
    "-r",
    "--record-cls",
    type=click.Choice(["records", "communities"], case_sensitive=False),
    default="records",
    help="Record class to base the index on (default: records).",
)
@with_appcontext
def create_index(record_cls):
    """Command to create a percolator index for moderation queries.

    :param record_cls: ``"records"`` or ``"communities"``; selects the record
        API class whose index the percolator index is based on.

    On failure the error is written to stderr and the command exits with
    status 1, so that calling scripts can detect the problem.
    """
    # Map the CLI choice onto the record API class.
    record_cls = ZenodoRDMRecord if record_cls == "records" else Community

    try:
        create_percolator_index(record_cls)
        index_name = get_percolator_index(record_cls)
    except Exception as e:
        click.secho(f"Error creating percolator index: {e}", fg="red", err=True)
        # Non-zero exit status instead of silently returning success.
        raise click.exceptions.Exit(1) from e

    click.secho(f"Percolator index '{index_name}' created successfully.", fg="green")


# ---------------------------------------------------------------------------
# site/zenodo_rdm/moderation/config.py
# ---------------------------------------------------------------------------

# NOTE(review): the same change also imports ``match_query_rule`` from
# ``.rules`` and appends it to both the record and the community scoring rule
# lists; those lists are only partially visible here, so they are not
# reproduced -- keep that registration when applying this block.

MODERATION_PERCOLATOR_INDEX_PREFIX = "moderation-queries"
"""Index Prefix for percolator index."""

MODERATION_PERCOLATOR_MAPPING = {
    "properties": {
        "query": {"type": "percolator"},
        "score": {"type": "integer"},
        "notes": {"type": "text"},
        "active": {"type": "boolean"},
    }
}
"""Properties for moderation percolator index."""
class ModerationQuery(db.Model):
    """Moderation queries model.

    Each row stores a search query string together with a score, free-text
    notes and an ``active`` flag.  Rows are created through :meth:`create`,
    which also sends the query to the percolator index of the given record
    class.
    """

    __tablename__ = "moderation_queries"

    id = db.Column(db.Integer, primary_key=True, autoincrement=True)
    """Primary key identifier for the moderation query."""

    score = db.Column(db.Integer, default=0)
    """Score associated with the query."""

    query_string = db.Column(db.Text, nullable=False)
    """Query string containing the filter criteria."""

    notes = db.Column(db.Text, nullable=True)
    """Additional notes or comments regarding the moderation query."""

    active = db.Column(db.Boolean, default=True)
    """Indicates whether the moderation query is currently active."""

    @classmethod
    def create(
        cls, query_string, record_cls=ZenodoRDMRecord, notes=None, score=0, active=True
    ):
        """Create a new moderation query with a configurable record class.

        The new row is added to the current session but NOT committed; the
        caller is responsible for committing.  The query is handed to the
        percolator indexer immediately, i.e. before that commit happens.

        :param query_string: Query string to store and index.
        :param record_cls: Record API class whose percolator index receives
            the query.
        :param notes: Optional free-text notes.
        :param score: Score attached to the query.
        :param active: Whether the query is active.
        :returns: The new (uncommitted) ``ModerationQuery`` instance.
        """
        query = cls(query_string=query_string, notes=notes, score=score, active=active)
        db.session.add(query)

        # Keyword arguments: the indexer's parameter order (active, score,
        # notes) differs from this method's, so positional passing is fragile.
        index_percolate_query(
            record_cls, query_string, active=active, score=score, notes=notes
        )

        return query

    @classmethod
    def get(cls, query_id=None):
        """Retrieve a moderation query by ID or return all queries if no ID is provided.

        :param query_id: Primary key of the query, or ``None`` for all rows.
        :returns: A single instance (or ``None`` if not found) when
            ``query_id`` is given, otherwise a list of all instances.
        """
        if query_id is not None:
            return cls.query.filter_by(id=query_id).one_or_none()
        return cls.query.all()

    def __repr__(self):
        """Get a string representation of the moderation query."""
        return (
            f"<ModerationQuery id={self.id}, query_string={self.query_string!r}, "
            f"score={self.score}, active={self.active}>"
        )
def _percolator_index_basename(record_cls):
    """Return ``<configured prefix>-<record index name>`` for ``record_cls``."""
    prefix = current_app.config.get("MODERATION_PERCOLATOR_INDEX_PREFIX")
    return f"{prefix}-{record_cls.index._name}"


def get_percolator_index(record_cls):
    """Build the percolator index alias name for a given record class.

    :param record_cls: Record API class exposing ``index._name``.
    :returns: The alias name produced by ``build_alias_name``.
    """
    return build_alias_name(_percolator_index_basename(record_cls), app=current_app)


def create_percolator_index(record_cls):
    """Create mappings with the percolator field for moderation queries.

    This function creates a new search index for percolator queries by copying
    the mappings (and the query/analysis settings) from an existing record
    index and adding the percolator properties from
    ``MODERATION_PERCOLATOR_MAPPING``.

    :param record_cls: Record API class whose index is used as the template.
    :returns: ``True`` if the index was created, ``False`` if it already
        existed and nothing was done.
    :raises ValueError: If the record alias does not resolve to exactly one
        index.
    :raises Exception: Errors raised by the search client while creating the
        index are logged and re-raised so the caller can report them.
    """
    percolator_index = build_index_name(
        _percolator_index_basename(record_cls), app=current_app
    )

    # Get the current mapping for the record index to copy its structure.
    record_index = build_alias_name(record_cls.index._name)
    record_mapping = current_search_client.indices.get_mapping(index=record_index)
    # Explicit check instead of ``assert``: asserts are stripped under ``-O``.
    if len(record_mapping) != 1:
        raise ValueError(
            f"Expected exactly one index behind '{record_index}', "
            f"found {len(record_mapping)}."
        )
    percolator_mappings = next(iter(record_mapping.values()))["mappings"]

    # Add specific properties for percolator fields from the app configuration.
    percolator_mappings["properties"].update(
        current_app.config.get("MODERATION_PERCOLATOR_MAPPING")["properties"]
    )

    # Copy only the query default fields and the analysis settings from the
    # record index.
    record_settings = next(
        iter(current_search_client.indices.get_settings(index=record_index).values())
    )["settings"]["index"]
    percolator_settings = {
        "index": {
            "query": {
                "default_field": record_settings.get("query", {}).get(
                    "default_field", []
                )
            }
        },
        "analysis": record_settings.get("analysis", {}),
    }

    if current_search_client.indices.exists(index=percolator_index):
        return False

    try:
        current_search_client.indices.create(
            index=percolator_index,
            body={
                "settings": percolator_settings,
                "mappings": percolator_mappings,
            },
        )
    except Exception as e:
        current_app.logger.exception(e)
        # Do not hide the failure: callers need to know the index is missing.
        raise
    return True


def index_percolate_query(record_cls, query_string, active=True, score=1, notes=None):
    """Index a percolate query.

    Indexing is best-effort: a failure is logged, not raised.

    :param record_cls: Record API class whose percolator index is targeted.
    :param query_string: Query string stored as a ``query_string`` query.
    :param active: Value of the ``active`` field of the indexed document.
    :param score: Value of the ``score`` field of the indexed document.
    :param notes: Value of the ``notes`` field of the indexed document.
    :returns: ``True`` if the document was sent without error, ``False`` if
        an error was logged.
    """
    document = {
        "query": {"query_string": {"query": query_string}},
        "active": active,
        "score": score,
        "notes": notes,
    }
    try:
        current_search_client.index(
            index=get_percolator_index(record_cls),
            body=document,
        )
    except Exception as e:
        current_app.logger.exception(e)
        return False
    return True
def match_query_rule(identity, draft=None, record=None):
    """Calculate a score based on matched percolate queries against the given document.

    The record is dumped and percolated against the active queries stored in
    its percolator index; the ``score`` values of the matching queries are
    summed.

    :param identity: Identity performing the action (not used by this rule).
    :param draft: Draft being moderated (not used by this rule).
    :param record: Record to match; must provide ``dumps()``.
    :returns: Sum of the scores of all returned matching queries, or ``0``
        when there is no record, no percolator index name, or no match.
    """
    if record is None:
        # Nothing to percolate; behave as a neutral rule instead of crashing.
        return 0

    percolator_index = get_percolator_index(record)
    if not percolator_index:
        return 0

    document = record.dumps()
    matched_queries = current_search_client.search(
        index=percolator_index,
        body={
            "query": {
                "bool": {
                    "must": [
                        {"term": {"active": True}},
                        {"percolate": {"field": "query", "document": document}},
                    ]
                }
            }
        },
    )

    # NOTE(review): no explicit ``size`` is sent, so only the hits the search
    # backend returns by default are summed -- confirm that is intended.
    # ``or 0`` guards against a stored ``null`` score.
    return sum(
        hit["_source"].get("score") or 0 for hit in matched_queries["hits"]["hits"]
    )