Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

moderation: added query match rule #1049

Merged
merged 2 commits into from
Nov 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions site/setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ tests =
[options.entry_points]
flask.commands =
zenodo-admin = zenodo_rdm.cli:zenodo_admin
moderation = zenodo_rdm.cli:moderation
invenio_base.blueprints =
zenodo_rdm_legacy = zenodo_rdm.legacy.views:blueprint
zenodo_rdm_support = zenodo_rdm.views:create_blueprint
Expand Down
42 changes: 42 additions & 0 deletions site/tests/moderation/test_moderation_queries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 CERN.
#
# ZenodoRDM is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.

"""Test ModerationQuery model class."""

from invenio_db import db

from zenodo_rdm.moderation.models import ModerationQuery


def test_moderation_query_creation(app):
    """Create a ModerationQuery and check its attributes after the commit."""
    with app.app_context():
        query_string = "metadata.title:SimpleTest"
        notes = "test query"
        score = 5
        active = True

        query = ModerationQuery.create(
            query_string,
            notes=notes,
            score=score,
            active=active,
        )
        db.session.commit()

        # One assert per attribute: a failure then names the offending field,
        # whereas ``assert all([...])`` can only ever report ``assert False``.
        assert query.query_string == query_string
        assert query.notes == notes
        assert query.score == score
        assert query.active == active


# TODO: Add test for matching query
124 changes: 124 additions & 0 deletions site/zenodo_rdm/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,13 @@
# under the terms of the MIT License; see LICENSE file for more details.
"""Zenodo RDM cli commands."""

import csv
import os

import click
from flask.cli import with_appcontext
from invenio_access.permissions import system_identity
from invenio_communities.communities.records.api import Community
from invenio_db import db
from invenio_pidstore.models import PersistentIdentifier
from invenio_rdm_records.proxies import current_rdm_records_service
Expand All @@ -25,6 +29,14 @@
from invenio_requests.records.api import Request
from invenio_requests.records.models import RequestMetadata

from zenodo_rdm.api import ZenodoRDMRecord
from zenodo_rdm.moderation.models import ModerationQuery
from zenodo_rdm.moderation.percolator import (
create_percolator_index,
get_percolator_index,
index_percolate_query,
)


def _get_parent(record_model):
parent_model = record_model.parent
Expand Down Expand Up @@ -246,3 +258,115 @@ def delete_record(recid):

for req in requests:
current_requests_service.indexer.delete(req)


# Root click group for the moderation sub-commands defined below. It is
# exposed as the ``moderation`` entry under ``flask.commands`` in
# ``site/setup.cfg``. The docstring doubles as the group's help text.
@click.group()
def moderation():
    """Moderation commands."""


@moderation.command("create-queries-index")
@click.option(
    "-r",
    "--record-cls",
    type=click.Choice(["records", "communities"], case_sensitive=False),
    default="records",
    help="Record class to base the index on (default: records).",
)
@with_appcontext
def create_index(record_cls):
    """Command to create a percolator index for moderation queries."""
    # NOTE: the docstring above is the text click shows for ``--help``, so
    # implementation notes are kept in comments.
    #
    # ``record_cls`` arrives as the option string ("records" or "communities")
    # and is mapped to the record API class the percolator helpers expect.
    record_cls = ZenodoRDMRecord if record_cls == "records" else Community

    try:
        create_percolator_index(record_cls)
        index_name = get_percolator_index(record_cls)
    except Exception as e:
        # Report on stderr and exit non-zero so that shells and CI jobs can
        # detect the failure instead of seeing a "successful" run.
        click.secho(f"Error creating percolator index: {e}", err=True)
        click.get_current_context().exit(1)

    click.secho(f"Percolator index '{index_name}' created successfully.")


@moderation.command("add-query")
@click.option(
    "-r",
    "--record-cls",
    type=click.Choice(["records", "communities"], case_sensitive=False),
    default="records",
    help="Record class to base the query on (default: records).",
)
@click.option(
    "-q",
    "--query-string",
    help="The query string for the moderation query (optional if loading from CSV).",
)
@click.option(
    "-n",
    "--notes",
    default="Example note",
    help="Additional notes for the moderation query (optional if loading from CSV).",
)
@click.option(
    "-s",
    "--score",
    default=10,
    type=int,
    help="The score for the moderation query (optional if loading from CSV).",
)
@click.option(
    "-a",
    "--active",
    default=True,
    type=bool,
    help="Whether the query is active (optional if loading from CSV).",
)
@click.option(
    "-f",
    "--file",
    type=click.Path(exists=True, readable=True),
    help="Path to CSV file containing queries.",
)
@with_appcontext
def add_query(record_cls, query_string, notes, score, active, file):
    """Command to add a moderation query from CSV or directly and index it."""
    # NOTE: the docstring above is the text click shows for ``--help``, so
    # implementation notes are kept in comments.
    #
    # Fail early with a usage error: without this guard a missing query string
    # would only surface later as a database error when the row is committed.
    if not file and not query_string:
        raise click.UsageError("Provide either --query-string or --file.")

    record_cls = ZenodoRDMRecord if record_cls == "records" else Community

    try:
        if file:
            # When a CSV file is given, the per-query options
            # (--query-string, --notes, --score, --active) are not used.
            add_queries_from_csv(file, record_cls)
        else:
            create_and_index_query(record_cls, query_string, notes, score, active)
    except Exception as e:
        # Report on stderr and exit non-zero so that callers can detect failure.
        click.secho(f"Error adding or indexing query: {e}", err=True)
        click.get_current_context().exit(1)

    click.secho("Queries added and indexed successfully.")


def add_queries_from_csv(file_path, record_cls=ZenodoRDMRecord):
    """Load queries from a CSV file, add them to the database, and index them.

    Each row has the form ``query_string[,notes[,score[,active]]]``:

    * ``query_string`` and ``notes`` are stripped of surrounding whitespace
      and single quotes;
    * ``score`` defaults to ``10`` when the column is missing or blank;
    * ``active`` defaults to ``True`` when the column is missing, otherwise
      it is ``True`` only if the cell equals ``"true"`` (case-insensitive).

    Empty rows and rows with an empty query string are skipped.

    :param file_path: Path of the CSV file to read (UTF-8).
    :param record_cls: Record class whose percolator index receives the queries.
    :raises ValueError: If a non-blank score cell is not an integer.
    """
    default_score = 10

    with open(file_path, mode="r", newline="", encoding="utf-8") as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                continue

            query_string = row[0].strip().strip("'")
            # Only add a query when there is an actual query string.
            if not query_string:
                continue

            notes = row[1].strip().strip("'") if len(row) > 1 else None

            # A blank score cell (e.g. ``q,note,,true``) falls back to the
            # default instead of crashing on ``int("")``.
            raw_score = row[2].strip() if len(row) > 2 else ""
            score = int(raw_score) if raw_score else default_score

            active = row[3].strip().lower() == "true" if len(row) > 3 else True

            create_and_index_query(record_cls, query_string, notes, score, active)


def create_and_index_query(record_cls, query_string, notes, score, active):
    """Store one moderation query in the database, then index it.

    :param record_cls: Record class whose percolator index receives the query.
    :param query_string: Query string to store and percolate.
    :param notes: Free-text notes saved with the query.
    :param score: Score saved with the query.
    :param active: Whether the query is active.
    """
    moderation_query = ModerationQuery.create(
        query_string=query_string,
        notes=notes,
        score=score,
        active=active,
    )
    # The commit happens before indexing; the id sent to the index is read
    # from the committed row.
    db.session.commit()

    index_percolate_query(
        record_cls,
        moderation_query.id,
        query_string,
        active=active,
        score=score,
        notes=notes,
    )
slint marked this conversation as resolved.
Show resolved Hide resolved
23 changes: 22 additions & 1 deletion site/zenodo_rdm/moderation/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,13 @@

"""Moderation config."""

from .rules import files_rule, links_rule, text_sanitization_rule, verified_user_rule
from .rules import (
files_rule,
links_rule,
match_query_rule,
text_sanitization_rule,
verified_user_rule,
)

MODERATION_SCORES = {
"spam_link": 8,
Expand Down Expand Up @@ -40,12 +46,27 @@
links_rule,
files_rule,
text_sanitization_rule,
match_query_rule,
]
"""Scoring rules for record moderation."""

# All rule callables listed here are imported from ``.rules`` at the top of
# this module.
MODERATION_COMMUNITY_SCORE_RULES = [
    links_rule,
    text_sanitization_rule,
    verified_user_rule,
    match_query_rule,
]
"""Scoring rules for community moderation."""

# Prefix used by ``moderation/percolator.py`` to build the percolator index
# name as "<prefix>-<record index name>".
MODERATION_PERCOLATOR_INDEX_PREFIX = "moderation-queries"
"""Index Prefix for percolator index."""

# These properties are merged into the mapping copied from the record index
# when the percolator index is created. The field names match the keys of the
# documents written by ``index_percolate_query``.
MODERATION_PERCOLATOR_MAPPING = {
    "properties": {
        "query": {"type": "percolator"},
        "score": {"type": "integer"},
        "notes": {"type": "text"},
        "active": {"type": "boolean"},
    }
}
"""Properties for moderation percolator index."""
39 changes: 39 additions & 0 deletions site/zenodo_rdm/moderation/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,42 @@ def lookup_domain(cls, url):
.limit(1)
.scalar()
)


class ModerationQuery(db.Model):
    """Moderation queries model."""

    __tablename__ = "moderation_queries"

    id = db.Column(db.Integer, primary_key=True)
    """Primary key identifier for the moderation query."""

    score = db.Column(db.Integer, default=0)
    """Score associated with the query."""

    query_string = db.Column(db.Text, nullable=False)
    """Query string containing the filter criteria."""

    notes = db.Column(db.Text, nullable=True)
    """Additional notes or comments regarding the moderation query."""

    active = db.Column(db.Boolean, default=True)
    """Indicates whether the moderation query is currently active."""

    @classmethod
    def create(cls, query_string, notes=None, score=0, active=True):
        """Create a moderation query and add it to the current session.

        The session is not committed here; that is left to the caller.

        :param query_string: Query string containing the filter criteria.
        :param notes: Optional notes about the query.
        :param score: Score associated with the query.
        :param active: Whether the query is active.
        :returns: The new ``ModerationQuery`` instance.
        """
        query = cls(query_string=query_string, notes=notes, score=score, active=active)
        db.session.add(query)

        return query

    @classmethod
    def get(cls, query_id=None):
        """Retrieve a moderation query by ID or return all queries if no ID is provided.

        :param query_id: Primary key of the query to look up, or ``None``.
        :returns: The matching query (``None`` if there is no such row) when
            ``query_id`` is given, otherwise a list of all moderation queries.
        """
        if query_id is not None:
            return cls.query.filter_by(id=query_id).one_or_none()

        # Previously this path fell through and returned ``None``, which
        # contradicted the documented "return all queries" behaviour.
        return cls.query.all()

    def __repr__(self):
        """Get a string representation of the moderation query."""
        return f"<ModerationQuery id={self.id}, query_string={self.query_string}, score={self.score}, active={self.active}>"
92 changes: 92 additions & 0 deletions site/zenodo_rdm/moderation/percolator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# -*- coding: utf-8 -*-
#
# This file is part of Invenio.
# Copyright (C) 2024 CERN.
#
# Invenio is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.

"""Percolator."""


from flask import current_app
from invenio_search import current_search_client
from invenio_search.utils import build_alias_name, build_index_name


def get_percolator_index(record_cls):
    """Return the alias name of the percolator index for ``record_cls``.

    The name is "<MODERATION_PERCOLATOR_INDEX_PREFIX>-<record index name>",
    passed through ``build_alias_name`` for the current application.
    """
    index_prefix = current_app.config.get("MODERATION_PERCOLATOR_INDEX_PREFIX")
    record_index_name = record_cls.index._name
    return build_alias_name(f"{index_prefix}-{record_index_name}", app=current_app)


def create_percolator_index(record_cls):
    """Create mappings with the percolator field for moderation queries.

    This function creates a new search index for percolator queries by copying
    the mappings (and the relevant settings) of the existing record index and
    adding the percolator properties from ``MODERATION_PERCOLATOR_MAPPING``.

    Nothing is created when the percolator index already exists. A failure
    while creating the index is logged and not re-raised.

    :param record_cls: Record class whose index is used as the template.
    :raises RuntimeError: If the record alias does not resolve to exactly one
        index, since it is then unclear which mapping to copy.
    """
    # Build the name for the new percolator index from the configured prefix
    # and the record's index name.
    prefix = current_app.config.get("MODERATION_PERCOLATOR_INDEX_PREFIX")
    combined_index_name = f"{prefix}-{record_cls.index._name}"
    percolator_index = build_index_name(combined_index_name, app=current_app)

    # Get the current mapping of the record index to copy its structure.
    record_index = build_alias_name(record_cls.index._name)
    record_mapping = current_search_client.indices.get_mapping(index=record_index)
    # Explicit check instead of ``assert`` so that it is not stripped when
    # Python runs with ``-O``.
    if len(record_mapping) != 1:
        raise RuntimeError(
            f"Expected exactly one index behind '{record_index}', "
            f"found {len(record_mapping)}."
        )
    percolator_mappings = next(iter(record_mapping.values()))["mappings"]

    # Add the percolator-specific properties from the app configuration.
    percolator_mappings["properties"].update(
        current_app.config.get("MODERATION_PERCOLATOR_MAPPING")["properties"]
    )

    # Copy the default query field and the analysis section of the record
    # index settings over to the percolator index.
    index_settings = current_search_client.indices.get_settings(index=record_index)
    record_settings = next(iter(index_settings.values()))["settings"]["index"]

    percolator_settings = {
        "index": {
            "query": {
                "default_field": record_settings.get("query", {}).get(
                    "default_field", []
                )
            }
        },
        "analysis": record_settings.get("analysis", {}),
    }

    if not current_search_client.indices.exists(percolator_index):
        try:
            current_search_client.indices.create(
                index=percolator_index,
                body={
                    "settings": percolator_settings,
                    "mappings": percolator_mappings,
                },
            )
        except Exception as e:
            current_app.logger.exception(e)


def index_percolate_query(
    record_cls, query_id, query_string, active=True, score=1, notes=None
):
    """Write one moderation query into the percolator index of ``record_cls``.

    The query string is stored as a ``query_string`` query, together with the
    query's id, active flag, score and notes. Any exception raised while
    indexing is logged through the application logger and not re-raised.
    """
    document = {
        "id": query_id,
        "query": {"query_string": {"query": query_string}},
        "active": active,
        "score": score,
        "notes": notes,
    }

    try:
        target_index = get_percolator_index(record_cls)
        current_search_client.index(index=target_index, body=document)
    except Exception as err:
        current_app.logger.exception(err)
Loading