Add script to extract tutorial and metadata from GTN
- Add script with shared functions between tool and tutorial extraction scripts
bebatut committed Mar 21, 2024
1 parent bd960ac commit 5f387bf
Showing 4 changed files with 390 additions and 8 deletions.
332 changes: 332 additions & 0 deletions bin/extract_all_gtn_tutorials.py
@@ -0,0 +1,332 @@
#!/usr/bin/env python

import argparse
from datetime import datetime
import io
from pathlib import Path
import requests
import shutil
from typing import (
Any,
Dict,
List,
Optional,
)
import zipfile

import pandas as pd
from owlready2 import get_ontology, Thing
import yt_dlp

from shared_functions import *
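# shared_functions is expected to provide the helpers used below
# (format_list_column, export_training_to_json, load_json, read_file),
# shared with the tool extraction script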


def get_request_json(url: str) -> dict:
"""
Return the JSON output of a GET request

:param url: URL to request
"""
r = requests.get(url)
r.raise_for_status()
return r.json()


def format_date(date: str) -> str:
return datetime.fromisoformat(date).strftime("%Y-%m-%d")


def add_supported_servers(tuto: dict) -> None:
"""
Split supported_servers into two lists: exact and inexact supported servers
"""
if "supported_servers" in tuto:
if "exact" in tuto["supported_servers"]:
tuto["exact_supported_servers"] = [server["name"] for server in tuto["supported_servers"]["exact"]]
if "inexact" in tuto["supported_servers"]:
tuto["inexact_supported_servers"] = [server["name"] for server in tuto["supported_servers"]["inexact"]]


def get_short_tool_ids(tuto: dict) -> None:
"""
Get tool ids without toolshed URL
"""
if "tools" in tuto:
tuto["short_tools"] = []
for tool in tuto["tools"]:
if "toolshed" in tool:
tuto["short_tools"].append(tool.split("/")[-3])
else:
tuto["short_tools"].append(tool)


def get_edam_topics(tuto: dict, edam_ontology) -> None:
"""
Get EDAM topics instead of EDAM ids
"""
tuto["edam_topic"] = []
if "edam_ontology" in tuto:
for term in tuto["edam_ontology"]:
if "topic" in term and edam_ontology[term]:
tuto["edam_topic"] += edam_ontology[term].label


def get_edam_operations(tuto: dict, tools: dict) -> None:
"""
Get EDAM operations from the tools
"""
tuto["edam_operation"] = []
if "short_tools" in tuto:
edam_operation = set()
for t in tuto["short_tools"]:
if t in tools:
edam_operation.update(set(tools[t]["EDAM operation"]))
tuto["edam_operation"] = list(edam_operation)


def get_feedback(tuto: dict, feedback: dict) -> None:
"""
Get feedback for tutorial
"""
tuto["feedback_number"] = 0
tuto["feedback_mean_note"] = None
title = tuto["title"]
if title in feedback:
tuto["feedback_number"] = feedback[title]["number"]
tuto["feedback_mean_note"] = feedback[title]["mean note"]


def get_visits(tuto: dict) -> None:
"""
Extract tutorial visitors and pageviews from Plausible
"""
tuto["visitors"] = 0
tuto["pageviews"] = 0
url = f"https://plausible.galaxyproject.eu/training.galaxyproject.org/export?page=%2Ftraining-material%2Ftopics%2F{tuto["topic_name"]}%2Ftutorials%2F{tuto["tutorial_name"]}%2Ftutorial.html&period=all"
r = requests.get(url)
z = zipfile.ZipFile(io.BytesIO(r.content))
tmp_dp = Path("tmp")
z.extractall(tmp_dp)
visitor_fp = tmp_dp / Path("visitors.csv")
if visitor_fp.exists():
visitor_df = pd.read_csv(visitor_fp).sum()
tuto["visitors"] = visitor_df["visitors"]
tuto["pageviews"] = visitor_df["pageviews"]
shutil.rmtree(tmp_dp)


def get_youtube_stats(tuto: dict) -> None:
"""
Get YouTube stats
"""
tuto["video_versions"] = 0
tuto["video_view"] = 0
ydl_opts = {
"ignoreerrors": True,
"quiet": True
}
if "video_library" in tuto and tuto["video_library"]["tutorial"]:
tuto["video_versions"] = len(tuto["video_library"]["tutorial"]["versions"])
for v in tuto["video_library"]["tutorial"]["versions"]:
url = f"https://www.youtube.com/watch?v={v['link']}"
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
info = ydl.extract_info(url, download=False)
info = ydl.sanitize_info(info)
if info:
tuto["video_view"] += info["view_count"]


def format_tutorial(tuto: dict, edam_ontology, tools: dict, feedback: dict) -> dict:
tuto["url"] = f'https://training.galaxyproject.org/{tuto["url"]}'
tuto["mod_date"] = format_date(tuto["mod_date"])
tuto["pub_date"] = format_date(tuto["pub_date"])
add_supported_servers(tuto)
get_short_tool_ids(tuto)
get_edam_topics(tuto, edam_ontology)
get_edam_operations(tuto, tools)
#get_visits(tuto)
get_feedback(tuto, feedback)
get_youtube_stats(tuto)
return tuto


def read_suite_per_tool_id(tool_fp: str) -> Dict:
"""
Read the tool suite table and extract a dictionary per tool id
"""
tool_suites = pd.read_csv(tool_fp, sep="\t", keep_default_na=False).to_dict("records")
tools = {}
for suite in tool_suites:
for tool in suite["Galaxy tool ids"].split(", "):
tools[tool] = {
"Galaxy wrapper id": suite["Galaxy wrapper id"],
"Galaxy wrapper owner": suite["Galaxy wrapper id"],
"EDAM operation": suite["EDAM operation"].split(", "),
}
return tools


def get_feedback_per_tutorials() -> Dict:
"""
Get feedback from GTN API and group per tutorial
"""
feedback = get_request_json("https://training.galaxyproject.org/training-material/api/feedback.json")
feedback_per_tuto = {}
for f in feedback:
tuto = f["tutorial"]
feedback_per_tuto.setdefault(tuto, {"number": 0, "mean note": 0})
feedback_per_tuto[tuto]["number"] += 1
feedback_per_tuto[tuto]["mean note"] += int(f["note"])
for tuto in feedback_per_tuto:
feedback_per_tuto[tuto]["mean note"] /= feedback_per_tuto[tuto]["number"]
return feedback_per_tuto


def get_trainings(tool_fp: str) -> List[Dict]:
"""
Extract training materials from the GTN API, format them, and add EDAM operations from the tools, feedback stats, visit stats, etc.
"""
tools = read_suite_per_tool_id(tool_fp)
feedback = get_feedback_per_tutorials()
edam_ontology = get_ontology('https://edamontology.org/EDAM_unstable.owl').load()
topics = get_request_json('https://training.galaxyproject.org/training-material/api/topics.json')
tutos = []
for topic in topics:
topic_information = get_request_json(f"https://training.galaxyproject.org/training-material/api/topics/{topic}.json")
for tuto in topic_information["materials"]:
if tuto is None:
continue
format_tutorial(tuto, edam_ontology, tools, feedback)
tutos.append(tuto)
return tutos


def filter_training(trainings: List[Dict], tags: List) -> List[Dict]:
"""
Filter training based on a list of tags
"""
filtered_trainings = []
for training in trainings:
to_keep = False
if "tags" in training and training["tags"]:
for t in training["tags"]:
if t in tags:
to_keep = True
if to_keep:
filtered_trainings.append(training)
return filtered_trainings


def export_training_to_tsv(trainings: List[Dict], output_fp: str) -> None:
"""
Export trainings to a TSV file
"""
df = (pd.DataFrame(trainings)
.assign(
Workflows=lambda df: df.workflows.notna(),
exact_supported_servers=lambda df: df.exact_supported_servers.fillna("").apply(list),
inexact_supported_servers=lambda df: df.inexact_supported_servers.fillna("").apply(list),
)
)

for col in ["exact_supported_servers", "inexact_supported_servers", "short_tools", "edam_operation", "edam_topic"]:
df[col] = format_list_column(df[col])

df = (df
.rename(columns = {
"title": "Title",
"hands_on": "Tutorial",
"url": "Link",
"slides": "Slides",
"mod_date": "Last modification",
"pub_date": "Creation",
"version": "Version",
"short_tools": "Tools",
"exact_supported_servers": "Servers with precise tool versions",
"inexact_supported_servers": "Servers with tool but different versions",
"topic_name_human": "Topic",
"video": "Video",
"edam_topic": "EDAM topic",
"edam_operation": "EDAM operation",
"feedback_number": "Feedback number",
"feedback_mean_note": "Feedback mean note",
"visitors": "Visitors",
"pageviews": "Page views",
"video_versions": "Video versions",
"video_view": "Video views"
})
.fillna("")
.reindex(columns = [
"Topic",
"Title",
"Link",
"EDAM topic",
"EDAM operation",
"Creation",
"Last modification",
"Version",
"Tutorial",
"Slides",
"Video",
"Workflows",
"Tools",
"Servers with precise tool versions",
"Servers with tool but different versions",
"Feedback number",
"Feedback mean note",
"Visitors",
"Page views",
"Video views"
])
)

df.to_csv(output_fp, sep="\t", index=False)


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Extract Galaxy Training Materials from GTN API together with statistics"
)
subparser = parser.add_subparsers(dest="command")
# Extract training
extracttraining = subparser.add_parser("extracttraining", help="Extract all training materials")
extracttraining.add_argument("--all_trainings", "-o", required=True, help="Filepath to JSON with all extracted training materials")
extracttraining.add_argument(
"--tools",
"-t",
required=True,
help="Filepath to TSV with all extracted tools, generated by extractools command",
)

# Filter training
filtertraining = subparser.add_parser("filtertraining", help="Filter training materials based on their tags")
filtertraining.add_argument(
"--all_trainings",
"-t",
required=True,
help="Filepath to JSON with all extracted trainings, generated by extracttraining command",
)
filtertraining.add_argument(
"--filtered_trainings",
"-f",
required=True,
help="Filepath to TSV with filtered trainings",
)
filtertraining.add_argument(
"--tags",
"-c",
help="Path to a file with tags to keep in the extraction (one per line)",
)
args = parser.parse_args()

if args.command == "extracttraining":
trainings = get_trainings(args.tools)
export_training_to_json(trainings, args.all_trainings)

elif args.command == "filtertraining":
trainings = load_json(args.all_trainings)
# get tags of training material to keep
tags = read_file(args.tags)
# filter training lists
filtered_training = filter_training(trainings, tags)
export_training_to_tsv(filtered_training, args.filtered_trainings)
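# Example invocations (file paths are illustrative):
#   python bin/extract_all_gtn_tutorials.py extracttraining \
#       --tools results/all_tools.tsv --all_trainings results/all_trainings.json
#   python bin/extract_all_gtn_tutorials.py filtertraining \
#       --all_trainings results/all_trainings.json --tags tags.txt \
#       --filtered_trainings results/filtered_trainings.tsv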
10 changes: 3 additions & 7 deletions bin/extract_galaxy_tools.py
@@ -21,6 +21,9 @@
from github.ContentFile import ContentFile
from github.Repository import Repository

import shared_functions


# Config variables
BIOTOOLS_API_URL = "https://bio.tools"
# BIOTOOLS_API_URL = "https://130.226.25.21"
@@ -542,13 +545,6 @@ def add_instances_to_table(
return new_table


def format_list_column(col: pd.Series) -> pd.Series:
"""
Format a column that could be a list before exporting
"""
return col.apply(lambda x: ", ".join(str(i) for i in x))


def export_tools(
tools: List[Dict], output_fp: str, format_list_col: bool = False, add_usage_stats: bool = False
) -> None:
