From 5f387bf7e77e9ec829fcbb893b8a61f29b60bfaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9r=C3=A9nice=20Batut?= Date: Wed, 20 Mar 2024 16:57:14 +0100 Subject: [PATCH] Add script to extract tutorial and metadata from GTN - Add script with shared functions between tool and tutorial extraction scripts --- bin/extract_all_gtn_tutorials.py | 332 +++++++++++++++++++++++++++++++ bin/extract_galaxy_tools.py | 10 +- bin/shared_functions.py | 52 +++++ requirements.txt | 4 +- 4 files changed, 390 insertions(+), 8 deletions(-) create mode 100644 bin/extract_all_gtn_tutorials.py create mode 100644 bin/shared_functions.py diff --git a/bin/extract_all_gtn_tutorials.py b/bin/extract_all_gtn_tutorials.py new file mode 100644 index 00000000..fc82939b --- /dev/null +++ b/bin/extract_all_gtn_tutorials.py @@ -0,0 +1,332 @@ +#!/usr/bin/env python + +import argparse +from datetime import datetime +import io +from pathlib import Path +import requests +import shutil +from typing import ( + Any, + Dict, + List, + Optional, +) +import zipfile + +import pandas as pd +from owlready2 import get_ontology, Thing +import yt_dlp + +from shared_functions import * + + +def get_request_json(url: str) -> dict: + """ + Return JSON output using request + + :param url: galaxy tool id + """ + r = requests.get(url) + r.raise_for_status() + return r.json() + + +def format_date(date: str) -> str: + return datetime.fromisoformat(date).strftime("%Y-%m-%d") + + +def add_supported_servers(tuto: dict) -> None: + """ + Split supported_servers into 2 lists there + """ + if "supported_servers" in tuto: + if "exact" in tuto["supported_servers"]: + tuto["exact_supported_servers"] = [server["name"] for server in tuto["supported_servers"]["exact"]] + if "inexact" in tuto["supported_servers"]: + tuto["inexact_supported_servers"] = [server["name"] for server in tuto["supported_servers"]["inexact"]] + + +def get_short_tool_ids(tuto: dict) -> None: + """ + Get tool ids without toolshed URL + """ + if "tools" in tuto: + 
tuto["short_tools"] = []
        for tool in tuto["tools"]:
            if "toolshed" in tool:
                tuto["short_tools"].append(tool.split("/")[-3])
            else:
                tuto["short_tools"].append(tool)


def get_edam_topics(tuto: dict, edam_ontology) -> None:
    """
    Get EDAM topics instead of EDAM ids
    """
    tuto["edam_topic"] = []
    if "edam_ontology" in tuto:
        for term in tuto["edam_ontology"]:
            if "topic" in term and edam_ontology[term]:
                tuto["edam_topic"] += edam_ontology[term].label


def get_edam_operations(tuto: dict, tools: dict) -> None:
    """
    Get EDAM operations from the tools
    """
    tuto["edam_operation"] = []
    if "short_tools" in tuto:
        edam_operation = set()
        for t in tuto["short_tools"]:
            if t in tools:
                edam_operation.update(set(tools[t]["EDAM operation"]))
        tuto["edam_operation"] = list(edam_operation)


def get_feedback(tuto: dict, feedback: dict) -> None:
    """
    Get feedback for tutorial
    """
    tuto["feedback_number"] = 0
    tuto["feedback_mean_note"] = None
    title = tuto["title"]
    if title in feedback:
        tuto["feedback_number"] = feedback[title]["number"]
        tuto["feedback_mean_note"] = feedback[title]["mean note"]


def get_visits(tuto):
    """
    Extract tutorial visitors and pageviews from Plausible
    """
    tuto["visitors"] = 0
    tuto["pageviews"] = 0
    url = f"https://plausible.galaxyproject.eu/training.galaxyproject.org/export?page=%2Ftraining-material%2Ftopics%2F{tuto['topic_name']}%2Ftutorials%2F{tuto['tutorial_name']}%2Ftutorial.html&period=all"
    r = requests.get(url)
    z = zipfile.ZipFile(io.BytesIO(r.content))
    tmp_dp = Path("tmp")
    z.extractall(tmp_dp)
    visitor_fp = tmp_dp / Path("visitors.csv")
    if visitor_fp.exists():
        visitor_df = pd.read_csv(visitor_fp).sum()
        tuto["visitors"] = visitor_df["visitors"]
        tuto["pageviews"] = visitor_df["pageviews"]
    shutil.rmtree(tmp_dp)


def get_youtube_stats(tuto: dict) -> None:
    """
    Get YouTube stats
    """
    tuto["video_versions"] = 0
    tuto["video_view"] = 0
    ydl_opts = {
        
"ignoreerrors": True,
        "quiet": True
    }
    if "video_library" in tuto and tuto["video_library"]["tutorial"]:
        tuto["video_versions"] = len(tuto["video_library"]["tutorial"]["versions"])
        for v in tuto["video_library"]["tutorial"]["versions"]:
            url = f"https://www.youtube.com/watch?v={v['link']}"
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(url, download=False)
                info = ydl.sanitize_info(info)
                if info:
                    tuto["video_view"] += info["view_count"]


def format_tutorial(tuto: dict, edam_ontology, tools: dict, feedback: dict) -> dict:
    tuto["url"] = f'https://training.galaxyproject.org/{tuto["url"]}'
    tuto["mod_date"] = format_date(tuto["mod_date"])
    tuto["pub_date"] = format_date(tuto["pub_date"])
    add_supported_servers(tuto)
    get_short_tool_ids(tuto)
    get_edam_topics(tuto, edam_ontology)
    get_edam_operations(tuto, tools)
    #get_visits(tuto)
    get_feedback(tuto, feedback)
    get_youtube_stats(tuto)
    return tuto


def read_suite_per_tool_id(tool_fp: str) -> Dict:
    """
    Read the tool suite table and extract a dictionary per tool id
    """
    tool_suites = pd.read_csv(tool_fp, sep="\t", keep_default_na=False).to_dict("records")
    tools = {}
    for suite in tool_suites:
        for tool in suite["Galaxy tool ids"].split(", "):
            tools[tool] = {
                "Galaxy wrapper id": suite["Galaxy wrapper id"],
                "Galaxy wrapper owner": suite["Galaxy wrapper owner"],
                "EDAM operation": suite["EDAM operation"].split(", "),
            }
    return tools


def get_feedback_per_tutorials() -> Dict:
    """
    Get feedback from GTN API and group per tutorial
    """
    feedback = get_request_json("https://training.galaxyproject.org/training-material/api/feedback.json")
    feedback_per_tuto = {}
    for f in feedback:
        tuto = f["tutorial"]
        feedback_per_tuto.setdefault(tuto, {"number": 0, "mean note": 0})
        feedback_per_tuto[tuto]["number"] += 1
        feedback_per_tuto[tuto]["mean note"] += int(f["note"])
    for tuto in feedback_per_tuto:
        feedback_per_tuto[tuto]["mean note"] /= 
feedback_per_tuto[tuto]["number"] + return feedback_per_tuto + + +def get_trainings(tool_fp: str) -> List[Dict]: + """ + Extract training material from the GTN API, format them, extract EDAM operations from tools, feedback stats, view stats, etc + """ + tools = read_suite_per_tool_id(tool_fp) + feedback = get_feedback_per_tutorials() + edam_ontology = get_ontology('https://edamontology.org/EDAM_unstable.owl').load() + topics = get_request_json('https://training.galaxyproject.org/training-material/api/topics.json') + tutos = [] + for topic in topics: + topic_information = get_request_json(f"https://training.galaxyproject.org/training-material/api/topics/{topic}.json") + for tuto in topic_information["materials"]: + if tuto is None: + continue + format_tutorial(tuto, edam_ontology, tools, feedback) + tutos.append(tuto) + return tutos + + +def filter_training(trainings: List[Dict], tags: List) -> List[Dict]: + """ + Filter training based on a list of tags + """ + filtered_trainings = [] + for training in trainings: + to_keep = False + if "tags" in training and training["tags"]: + for t in training["tags"]: + if t in tags: + to_keep = True + if to_keep: + filtered_trainings.append(training) + return filtered_trainings + + +def export_training_to_tsv(trainings: List[Dict], output_fp: str) -> None: + """ + Export trainings to a TSV file + """ + df = (pd.DataFrame(trainings) + .assign( + Workflows=lambda df: df.workflows.notna(), + exact_supported_servers= lambda df: df.exact_supported_servers.fillna("").apply(list), + inexact_supported_servers= lambda df: df.inexact_supported_servers.fillna("").apply(list), + ) + ) + + for col in ["exact_supported_servers", "inexact_supported_servers", "short_tools", "edam_operation", "edam_topic"]: + df[col] = format_list_column(df[col]) + + df = (df + .rename(columns = { + "title": "Title", + "hands_on": "Tutorial", + "url": "Link", + "slides": "Slides", + "mod_date": "Last modification", + "pub_date": "Creation", + "version": 
"Version", + "short_tools": "Tools", + "exact_supported_servers": "Servers with precise tool versions", + "inexact_supported_servers": "Servers with tool but different versions", + "topic_name_human": "Topic", + "video": "Video", + "edam_topic": "EDAM topic", + "edam_operation": "EDAM operation", + "feedback_number": "Feedback number", + "feedback_mean_note": "Feedback mean note", + "visitors": "Visitors", + "pageviews": "Page views", + "video_versions": "Video versions", + "video_view": "Video views" + }) + .fillna("") + .reindex(columns = [ + "Topic", + "Title", + "Link", + "EDAM topic", + "EDAM operation", + "Creation", + "Last modification", + "Version", + "Tutorial", + "Slides", + "Video", + "Workflows", + "Tools", + "Servers with precise tool versions", + "Servers with tool but different versions", + "Feedback number", + "Feedback mean note", + "Visitors", + "Page views", + "Video views" + ]) + ) + + df.to_csv(output_fp, sep="\t", index=False) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Extract Galaxy Training Materials from GTN API together with statistics" + ) + subparser = parser.add_subparsers(dest="command") + # Extract training + extracttraining = subparser.add_parser("extracttraining", help="Extract all training materials") + extracttraining.add_argument("--all_trainings", "-o", required=True, help="Filepath to JSON with all extracted training materials") + extracttraining.add_argument( + "--tools", + "-t", + required=True, + help="Filepath to TSV with all extracted tools, generated by extractools command", + ) + + # Filter training + filtertraining = subparser.add_parser("filtertraining", help="Filter training materials based on their tags") + filtertraining.add_argument( + "--all_trainings", + "-t", + required=True, + help="Filepath to JSON with all extracted trainings, generated by extracttraining command", + ) + filtertraining.add_argument( + "--filtered_trainings", + "-f", + required=True, + help="Filepath to 
TSV with filtered trainings",
    )
    filtertraining.add_argument(
        "--tags",
        "-c",
        help="Path to a file with tags to keep in the extraction (one per line)",
    )
    args = parser.parse_args()

    if args.command == "extracttraining":
        trainings = get_trainings(args.tools)
        export_to_json(trainings, args.all_trainings)

    elif args.command == "filtertraining":
        trainings = load_json(args.all_trainings)
        # get categories and training to exclude
        tags = read_file(args.tags)
        # filter training lists
        filtered_training = filter_training(trainings, tags)
        export_training_to_tsv(filtered_training, args.filtered_trainings)
diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py
index 95a19eb9..543da5ff 100644
--- a/bin/extract_galaxy_tools.py
+++ b/bin/extract_galaxy_tools.py
@@ -21,6 +21,9 @@
 from github.ContentFile import ContentFile
 from github.Repository import Repository
 
+import shared_functions
+
+
 # Config variables
 BIOTOOLS_API_URL = "https://bio.tools"
 # BIOTOOLS_API_URL = "https://130.226.25.21"
@@ -542,13 +545,6 @@ def add_instances_to_table(
     return new_table
 
 
-def format_list_column(col: pd.Series) -> pd.Series:
-    """
-    Format a column that could be a list before exporting
-    """
-    return col.apply(lambda x: ", ".join(str(i) for i in x))
-
-
 def export_tools(
     tools: List[Dict], output_fp: str, format_list_col: bool = False, add_usage_stats: bool = False
 ) -> None:
diff --git a/bin/shared_functions.py b/bin/shared_functions.py
new file mode 100644
index 00000000..126fdb18
--- /dev/null
+++ b/bin/shared_functions.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+
+import json
+from pathlib import Path
+from typing import (
+    Any,
+    Dict,
+    List,
+    Optional,
+)
+
+import pandas as pd
+
+
+def format_list_column(col: pd.Series) -> pd.Series:
+    """
+    Format a column that could be a list before exporting
+    """
+    return col.apply(lambda x: ", ".join(str(i) for i in x))
+
+
+def read_file(filepath: Optional[str]) -> List[str]:
+    """
+    Read an 
optional file with 1 element per line + + :param filepath: path to a file + """ + if filepath is None: + return [] + fp = Path(filepath) + if fp.is_file(): + with fp.open("r") as f: + return [x.rstrip() for x in f.readlines()] + else: + return [] + + +def export_to_json(data: List[Dict], output_fp: str) -> None: + """ + Export to a JSON file + """ + with Path(output_fp).open("w") as f: + json.dump(data, f, indent=4, sort_keys=True) + + +def load_json(input_df: str): + """ + Read a JSON file + """ + with Path(input_df).open("r") as t: + content = json.load(t) + return content \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 34ad35e7..040ea6b9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,6 @@ pyyaml numpy Pillow matplotlib -wordcloud \ No newline at end of file +wordcloud +owlready2 +yt-dlp \ No newline at end of file