Add script to extract tutorial and metadata from GTN
- Add script with shared functions between tool and tutorial extraction scripts
bebatut committed Mar 21, 2024
1 parent bd960ac commit 5f387bf
Showing 4 changed files with 390 additions and 8 deletions.
332 changes: 332 additions & 0 deletions bin/extract_all_gtn_tutorials.py
@@ -0,0 +1,332 @@
#!/usr/bin/env python

import argparse
from datetime import datetime
import io
from pathlib import Path
import requests
import shutil
from typing import (
Any,
Dict,
List,
Optional,
)
import zipfile

import pandas as pd
from owlready2 import get_ontology, Thing
import yt_dlp

from shared_functions import *
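# shared_functions is expected to provide the helpers used below
# (format_list_column, export_training_to_json, load_json, read_file),
# shared with the tool extraction script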


def get_request_json(url: str) -> dict:
"""
Return the JSON output of a GET request

:param url: URL to request
"""
r = requests.get(url)
r.raise_for_status()
return r.json()


def format_date(date: str) -> str:
return datetime.fromisoformat(date).strftime("%Y-%m-%d")


def add_supported_servers(tuto: dict) -> None:
"""
Split supported_servers into two lists: exact and inexact supported servers
"""
if "supported_servers" in tuto:
if "exact" in tuto["supported_servers"]:
tuto["exact_supported_servers"] = [server["name"] for server in tuto["supported_servers"]["exact"]]
if "inexact" in tuto["supported_servers"]:
tuto["inexact_supported_servers"] = [server["name"] for server in tuto["supported_servers"]["inexact"]]


def get_short_tool_ids(tuto: dict) -> None:
"""
Get tool ids without toolshed URL
"""
if "tools" in tuto:
tuto["short_tools"] = []
for tool in tuto["tools"]:
if "toolshed" in tool:
tuto["short_tools"].append(tool.split("/")[-3])
else:
tuto["short_tools"].append(tool)


def get_edam_topics(tuto: dict, edam_ontology) -> None:
"""
Get EDAM topics instead of EDAM ids
"""
tuto["edam_topic"] = []
if "edam_ontology" in tuto:
for term in tuto["edam_ontology"]:
if "topic" in term and edam_ontology[term]:
tuto["edam_topic"] += edam_ontology[term].label


def get_edam_operations(tuto: dict, tools: dict) -> None:
"""
Get EDAM operations from the tools
"""
tuto["edam_operation"] = []
if "short_tools" in tuto:
edam_operation = set()
for t in tuto["short_tools"]:
if t in tools:
edam_operation.update(set(tools[t]["EDAM operation"]))
tuto["edam_operation"] = list(edam_operation)


def get_feedback(tuto: dict, feedback: dict) -> None:
"""
Get feedback for tutorial
"""
tuto["feedback_number"] = 0
tuto["feedback_mean_note"] = None
title = tuto["title"]
if title in feedback:
tuto["feedback_number"] = feedback[title]["number"]
tuto["feedback_mean_note"] = feedback[title]["mean note"]


def get_visits(tuto: dict) -> None:
"""
Extract tutorial visitors and pageviews from Plausible
"""
tuto["visitors"] = 0
tuto["pageviews"] = 0
url = f"https://plausible.galaxyproject.eu/training.galaxyproject.org/export?page=%2Ftraining-material%2Ftopics%2F{tuto["topic_name"]}%2Ftutorials%2F{tuto["tutorial_name"]}%2Ftutorial.html&period=all"
r = requests.get(url)
z = zipfile.ZipFile(io.BytesIO(r.content))
tmp_dp = Path("tmp")
z.extractall(tmp_dp)
visitor_fp = tmp_dp / Path("visitors.csv")
if visitor_fp.exists():
visitor_df = pd.read_csv(visitor_fp).sum()
tuto["visitors"] = visitor_df["visitors"]
tuto["pageviews"] = visitor_df["pageviews"]
shutil.rmtree(tmp_dp)


def get_youtube_stats(tuto: dict) -> None:
"""
Get YouTube stats
"""
tuto["video_versions"] = 0
tuto["video_view"] = 0
ydl_opts = {
"ignoreerrors": True,
"quiet": True
}
if "video_library" in tuto and tuto["video_library"]["tutorial"]:
tuto["video_versions"] = len(tuto["video_library"]["tutorial"]["versions"])
for v in tuto["video_library"]["tutorial"]["versions"]:
url = f"https://www.youtube.com/watch?v={v['link']}"
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
info = ydl.extract_info(url, download=False)
info = ydl.sanitize_info(info)
if info:
tuto["video_view"] += info["view_count"]


def format_tutorial(tuto: dict, edam_ontology, tools: dict, feedback: dict) -> dict:
tuto["url"] = f'https://training.galaxyproject.org/{tuto["url"]}'
tuto["mod_date"] = format_date(tuto["mod_date"])
tuto["pub_date"] = format_date(tuto["pub_date"])
add_supported_servers(tuto)
get_short_tool_ids(tuto)
get_edam_topics(tuto, edam_ontology)
get_edam_operations(tuto, tools)
#get_visits(tuto)
get_feedback(tuto, feedback)
get_youtube_stats(tuto)
return tuto


def read_suite_per_tool_id(tool_fp: str) -> Dict:
"""
Read the tool suite table and extract a dictionary per tool id
"""
tool_suites = pd.read_csv(tool_fp, sep="\t", keep_default_na=False).to_dict("records")
tools = {}
for suite in tool_suites:
for tool in suite["Galaxy tool ids"].split(", "):
tools[tool] = {
"Galaxy wrapper id": suite["Galaxy wrapper id"],
"Galaxy wrapper owner": suite["Galaxy wrapper id"],
"EDAM operation": suite["EDAM operation"].split(", "),
}
return tools


def get_feedback_per_tutorials() -> Dict:
"""
Get feedback from GTN API and group per tutorial
"""
feedback = get_request_json("https://training.galaxyproject.org/training-material/api/feedback.json")
feedback_per_tuto = {}
for f in feedback:
tuto = f["tutorial"]
feedback_per_tuto.setdefault(tuto, {"number": 0, "mean note": 0})
feedback_per_tuto[tuto]["number"] += 1
feedback_per_tuto[tuto]["mean note"] += int(f["note"])
for tuto in feedback_per_tuto:
feedback_per_tuto[tuto]["mean note"] /= feedback_per_tuto[tuto]["number"]
return feedback_per_tuto


def get_trainings(tool_fp: str) -> List[Dict]:
"""
Extract training materials from the GTN API, format them, and add EDAM operations from the tools, feedback stats, visit stats, etc.
"""
tools = read_suite_per_tool_id(tool_fp)
feedback = get_feedback_per_tutorials()
edam_ontology = get_ontology('https://edamontology.org/EDAM_unstable.owl').load()
topics = get_request_json('https://training.galaxyproject.org/training-material/api/topics.json')
tutos = []
for topic in topics:
topic_information = get_request_json(f"https://training.galaxyproject.org/training-material/api/topics/{topic}.json")
for tuto in topic_information["materials"]:
if tuto is None:
continue
format_tutorial(tuto, edam_ontology, tools, feedback)
tutos.append(tuto)
return tutos


def filter_training(trainings: List[Dict], tags: List) -> List[Dict]:
"""
Filter training based on a list of tags
"""
filtered_trainings = []
for training in trainings:
to_keep = False
if "tags" in training and training["tags"]:
for t in training["tags"]:
if t in tags:
to_keep = True
if to_keep:
filtered_trainings.append(training)
return filtered_trainings


def export_training_to_tsv(trainings: List[Dict], output_fp: str) -> None:
"""
Export trainings to a TSV file
"""
df = (pd.DataFrame(trainings)
.assign(
Workflows=lambda df: df.workflows.notna(),
exact_supported_servers=lambda df: df.exact_supported_servers.fillna("").apply(list),
inexact_supported_servers=lambda df: df.inexact_supported_servers.fillna("").apply(list),
)
)

for col in ["exact_supported_servers", "inexact_supported_servers", "short_tools", "edam_operation", "edam_topic"]:
df[col] = format_list_column(df[col])

df = (df
.rename(columns = {
"title": "Title",
"hands_on": "Tutorial",
"url": "Link",
"slides": "Slides",
"mod_date": "Last modification",
"pub_date": "Creation",
"version": "Version",
"short_tools": "Tools",
"exact_supported_servers": "Servers with precise tool versions",
"inexact_supported_servers": "Servers with tool but different versions",
"topic_name_human": "Topic",
"video": "Video",
"edam_topic": "EDAM topic",
"edam_operation": "EDAM operation",
"feedback_number": "Feedback number",
"feedback_mean_note": "Feedback mean note",
"visitors": "Visitors",
"pageviews": "Page views",
"video_versions": "Video versions",
"video_view": "Video views"
})
.fillna("")
.reindex(columns = [
"Topic",
"Title",
"Link",
"EDAM topic",
"EDAM operation",
"Creation",
"Last modification",
"Version",
"Tutorial",
"Slides",
"Video",
"Workflows",
"Tools",
"Servers with precise tool versions",
"Servers with tool but different versions",
"Feedback number",
"Feedback mean note",
"Visitors",
"Page views",
"Video views"
])
)

df.to_csv(output_fp, sep="\t", index=False)


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Extract Galaxy Training Materials from GTN API together with statistics"
)
subparser = parser.add_subparsers(dest="command")
# Extract training
extracttraining = subparser.add_parser("extracttraining", help="Extract all training materials")
extracttraining.add_argument("--all_trainings", "-o", required=True, help="Filepath to JSON with all extracted training materials")
extracttraining.add_argument(
"--tools",
"-t",
required=True,
help="Filepath to TSV with all extracted tools, generated by extractools command",
)

# Filter training
filtertraining = subparser.add_parser("filtertraining", help="Filter training materials based on their tags")
filtertraining.add_argument(
"--all_trainings",
"-t",
required=True,
help="Filepath to JSON with all extracted trainings, generated by extracttraining command",
)
filtertraining.add_argument(
"--filtered_trainings",
"-f",
required=True,
help="Filepath to TSV with filtered trainings",
)
filtertraining.add_argument(
"--tags",
"-c",
help="Path to a file with tags to keep in the extraction (one per line)",
)
args = parser.parse_args()

if args.command == "extracttraining":
trainings = get_trainings(args.tools)
export_training_to_json(trainings, args.all_trainings)

elif args.command == "filtertraining":
trainings = load_json(args.all_trainings)
# get tags of training material to keep
tags = read_file(args.tags)
# filter training lists
filtered_training = filter_training(trainings, tags)
export_training_to_tsv(filtered_training, args.filtered_trainings)
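# Example invocations (file paths are illustrative):
#   python bin/extract_all_gtn_tutorials.py extracttraining \
#       --tools results/all_tools.tsv --all_trainings results/all_trainings.json
#   python bin/extract_all_gtn_tutorials.py filtertraining \
#       --all_trainings results/all_trainings.json --tags tags.txt \
#       --filtered_trainings results/filtered_trainings.tsv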
10 changes: 3 additions & 7 deletions bin/extract_galaxy_tools.py
@@ -21,6 +21,9 @@
from github.ContentFile import ContentFile
from github.Repository import Repository

import shared_functions


# Config variables
BIOTOOLS_API_URL = "https://bio.tools"
# BIOTOOLS_API_URL = "https://130.226.25.21"
@@ -542,13 +545,6 @@ def add_instances_to_table(
return new_table


def format_list_column(col: pd.Series) -> pd.Series:
"""
Format a column that could be a list before exporting
"""
return col.apply(lambda x: ", ".join(str(i) for i in x))


def export_tools(
tools: List[Dict], output_fp: str, format_list_col: bool = False, add_usage_stats: bool = False
) -> None:
