-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add script to extract tutorial and metadata from GTN
- Add script with shared functions between tool and tutorial extraction scripts
- Loading branch information
Showing
4 changed files
with
390 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,332 @@ | ||
#!/usr/bin/env python | ||
|
||
import argparse | ||
from datetime import datetime | ||
import io | ||
from pathlib import Path | ||
import requests | ||
import shutil | ||
from typing import ( | ||
Any, | ||
Dict, | ||
List, | ||
Optional, | ||
) | ||
import zipfile | ||
|
||
import pandas as pd | ||
from owlready2 import get_ontology, Thing | ||
import yt_dlp | ||
|
||
from shared_functions import * | ||
|
||
|
||
def get_request_json(url: str) -> dict:
    """
    Return the decoded JSON payload served at a URL

    :param url: URL queried with an HTTP GET request
    :return: JSON document decoded by requests
    :raises requests.HTTPError: if the server answers with an error status
    :raises requests.Timeout: if the server does not answer in time
    """
    # Without an explicit timeout requests waits indefinitely on a stalled server
    r = requests.get(url, timeout=60)
    r.raise_for_status()
    return r.json()
|
||
|
||
def format_date(date: str) -> str:
    """
    Reduce an ISO 8601 date or date-time string to its YYYY-MM-DD day

    :param date: string accepted by datetime.fromisoformat
    """
    parsed = datetime.fromisoformat(date)
    return f"{parsed:%Y-%m-%d}"
|
||
|
||
def add_supported_servers(tuto: dict) -> None:
    """
    Flatten tuto["supported_servers"] into per-kind lists of server names

    For each of the "exact" and "inexact" entries that is present, the names
    of the listed servers are stored under "exact_supported_servers" and
    "inexact_supported_servers". The tutorial is modified in place and left
    untouched when it has no "supported_servers" key.
    """
    if "supported_servers" not in tuto:
        return
    servers = tuto["supported_servers"]
    for kind in ("exact", "inexact"):
        if kind in servers:
            tuto[f"{kind}_supported_servers"] = [entry["name"] for entry in servers[kind]]
|
||
|
||
def get_short_tool_ids(tuto: dict) -> None:
    """
    Get tool ids without toolshed URL

    Fills tuto["short_tools"] (only when the tutorial has a "tools" key) with
    one entry per tool, in the same order. ToolShed identifiers are expected
    to look like ``<host>/repos/<owner>/<repository>/<tool id>/<version>``,
    so the tool id is the second-to-last segment; any other identifier is
    kept unchanged.
    """
    if "tools" in tuto:
        tuto["short_tools"] = []
        for tool in tuto["tools"]:
            segments = tool.split("/")
            # Fall back to the full string when it is too short to hold an id
            if "toolshed" in tool and len(segments) >= 2:
                tuto["short_tools"].append(segments[-2])
            else:
                tuto["short_tools"].append(tool)
|
||
|
||
def get_edam_topics(tuto: dict, edam_ontology) -> None:
    """
    Store the labels of the tutorial's EDAM topic terms in tuto["edam_topic"]

    Only the terms of tuto["edam_ontology"] whose id contains "topic" and
    that the ontology lookup resolves to a truthy entry contribute labels.

    :param tuto: tutorial dictionary, modified in place
    :param edam_ontology: object indexable by EDAM term id whose entries expose a ``label`` iterable
    """
    labels = []
    tuto["edam_topic"] = labels
    if "edam_ontology" not in tuto:
        return
    for term in tuto["edam_ontology"]:
        if "topic" not in term:
            continue
        if edam_ontology[term]:
            labels.extend(edam_ontology[term].label)
|
||
|
||
def get_edam_operations(tuto: dict, tools: dict) -> None:
    """
    Get EDAM operations from the tools

    Collects the "EDAM operation" labels of every tool of tuto["short_tools"]
    that is known in ``tools`` and stores them, deduplicated and sorted, in
    tuto["edam_operation"].

    :param tuto: tutorial dictionary, modified in place
    :param tools: dictionary keyed by tool id whose values hold an "EDAM operation" list
    """
    operations = set()
    for tool_id in tuto.get("short_tools", []):
        if tool_id in tools:
            operations.update(tools[tool_id]["EDAM operation"])
    # An empty label (e.g. from splitting an empty table cell) is not an operation
    operations.discard("")
    # Sorted so that the exported order does not vary between runs
    tuto["edam_operation"] = sorted(operations)
|
||
|
||
def get_feedback(tuto: dict, feedback: dict) -> None:
    """
    Attach feedback statistics to a tutorial

    Sets tuto["feedback_number"] and tuto["feedback_mean_note"] from the
    entry of ``feedback`` keyed by the tutorial title, or to 0 and None when
    there is no such entry.
    """
    tuto.update({"feedback_number": 0, "feedback_mean_note": None})
    title = tuto["title"]
    if title not in feedback:
        return
    stats = feedback[title]
    tuto["feedback_number"] = stats["number"]
    tuto["feedback_mean_note"] = stats["mean note"]
|
||
|
||
def get_visits(tuto: dict) -> None:
    """
    Extract tutorial visitors and pageviews from Plausible

    Downloads the Plausible export archive for the tutorial page and stores
    the summed "visitors" and "pageviews" columns of its visitors.csv in
    tuto["visitors"] and tuto["pageviews"]. Both stay at 0 when the archive
    holds no visitors.csv.

    :param tuto: tutorial dictionary with "topic_name" and "tutorial_name", modified in place
    :raises requests.HTTPError: if the export request fails
    """
    tuto["visitors"] = 0
    tuto["pageviews"] = 0
    # Single quotes inside the f-strings: reusing the outer quote character
    # is a syntax error before Python 3.12
    url = (
        "https://plausible.galaxyproject.eu/training.galaxyproject.org/export"
        f"?page=%2Ftraining-material%2Ftopics%2F{tuto['topic_name']}"
        f"%2Ftutorials%2F{tuto['tutorial_name']}%2Ftutorial.html&period=all"
    )
    r = requests.get(url)
    r.raise_for_status()
    # Read the CSV straight from the in-memory archive instead of extracting
    # into (and then deleting) a fixed "tmp" directory
    with zipfile.ZipFile(io.BytesIO(r.content)) as archive:
        if "visitors.csv" not in archive.namelist():
            return
        with archive.open("visitors.csv") as handle:
            visitor_df = pd.read_csv(handle)
    # Plain ints keep the values serializable by the standard json module
    tuto["visitors"] = int(visitor_df["visitors"].sum())
    tuto["pageviews"] = int(visitor_df["pageviews"].sum())
|
||
|
||
def get_youtube_stats(tuto: dict) -> None:
    """
    Get YouTube stats

    Sets tuto["video_versions"] to the number of recorded versions of the
    tutorial video and tuto["video_view"] to the sum of their YouTube view
    counts. Both are 0 when the tutorial has no video.

    :param tuto: tutorial dictionary, modified in place
    """
    tuto["video_versions"] = 0
    tuto["video_view"] = 0
    # Any level of video_library / tutorial / versions may be missing or empty
    video = (tuto.get("video_library") or {}).get("tutorial") or {}
    versions = video.get("versions") or []
    if not versions:
        return
    tuto["video_versions"] = len(versions)
    ydl_opts = {
        "ignoreerrors": True,
        "quiet": True,
    }
    # One extractor instance is enough for all versions of the video
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        for v in versions:
            url = f"https://www.youtube.com/watch?v={v['link']}"
            info = ydl.extract_info(url, download=False)
            info = ydl.sanitize_info(info)
            if info:
                # view_count may be missing or None for some videos
                tuto["video_view"] += info.get("view_count") or 0
|
||
|
||
def format_tutorial(tuto: dict, edam_ontology, tools: dict, feedback: dict) -> dict:
    """
    Format a tutorial entry and enrich it with derived metadata

    The tutorial is modified in place: its URL is prefixed with the GTN host,
    its dates are reduced to YYYY-MM-DD, and supported servers, short tool
    ids, EDAM topics and operations, feedback and YouTube statistics are added.

    :param tuto: tutorial dictionary with at least "url", "mod_date", "pub_date" and "title"
    :param edam_ontology: ontology passed on to get_edam_topics
    :param tools: dictionary per tool id passed on to get_edam_operations
    :param feedback: feedback per tutorial title passed on to get_feedback
    :return: the same, updated, tutorial dictionary
    """
    tuto["url"] = f'https://training.galaxyproject.org/{tuto["url"]}'
    tuto["mod_date"] = format_date(tuto["mod_date"])
    tuto["pub_date"] = format_date(tuto["pub_date"])
    add_supported_servers(tuto)
    # Must run before get_edam_operations, which reads tuto["short_tools"]
    get_short_tool_ids(tuto)
    get_edam_topics(tuto, edam_ontology)
    get_edam_operations(tuto, tools)
    # Plausible visit statistics are currently disabled
    #get_visits(tuto)
    get_feedback(tuto, feedback)
    get_youtube_stats(tuto)
    return tuto
|
||
|
||
def read_suite_per_tool_id(tool_fp: str) -> Dict:
    """
    Read the tool suite table and extract a dictionary per tool id

    :param tool_fp: path to a TSV with at least the "Galaxy wrapper id",
        "Galaxy tool ids" and "EDAM operation" columns (lists separated by ", ")
    :return: dictionary keyed by tool id with the wrapper id, wrapper owner
        and EDAM operations of the suite the tool belongs to
    """
    tool_suites = pd.read_csv(tool_fp, sep="\t", keep_default_na=False).to_dict("records")
    tools = {}
    for suite in tool_suites:
        # Splitting an empty cell yields [""]: drop such empty entries
        operations = [op for op in suite["EDAM operation"].split(", ") if op]
        for tool in suite["Galaxy tool ids"].split(", "):
            if not tool:
                continue
            tools[tool] = {
                "Galaxy wrapper id": suite["Galaxy wrapper id"],
                # The owner comes from its own column (empty if the table has none)
                "Galaxy wrapper owner": suite.get("Galaxy wrapper owner", ""),
                "EDAM operation": list(operations),
            }
    return tools
|
||
|
||
def get_feedback_per_tutorials() -> Dict:
    """
    Fetch all feedback entries from the GTN API and summarise them per tutorial

    :return: dictionary keyed by the "tutorial" value of the entries, each
        holding the "number" of entries and their "mean note"
    """
    entries = get_request_json("https://training.galaxyproject.org/training-material/api/feedback.json")
    notes_per_tuto = {}
    for entry in entries:
        notes_per_tuto.setdefault(entry["tutorial"], []).append(int(entry["note"]))
    return {
        name: {"number": len(notes), "mean note": sum(notes) / len(notes)}
        for name, notes in notes_per_tuto.items()
    }
|
||
|
||
def get_trainings(tool_fp: str) -> List[Dict]:
    """
    Collect every training material listed by the GTN API, formatted and enriched

    The tool table, the feedback and the EDAM ontology are loaded once, then
    each topic returned by the API is queried and each of its non-null
    materials goes through format_tutorial.

    :param tool_fp: path to the TSV with the extracted tools
    :return: list of formatted tutorial dictionaries
    """
    tools = read_suite_per_tool_id(tool_fp)
    feedback = get_feedback_per_tutorials()
    edam_ontology = get_ontology('https://edamontology.org/EDAM_unstable.owl').load()
    topics = get_request_json('https://training.galaxyproject.org/training-material/api/topics.json')
    collected = []
    for topic in topics:
        topic_url = f"https://training.galaxyproject.org/training-material/api/topics/{topic}.json"
        materials = get_request_json(topic_url)["materials"]
        for material in materials:
            if material is not None:
                format_tutorial(material, edam_ontology, tools, feedback)
                collected.append(material)
    return collected
|
||
|
||
def filter_training(trainings: List[Dict], tags: List) -> List[Dict]:
    """
    Keep the trainings that carry at least one of the given tags

    :param trainings: training dictionaries, optionally with a "tags" list
    :param tags: tags to keep
    :return: matching trainings, in their original order
    """
    return [
        training
        for training in trainings
        if "tags" in training
        and training["tags"]
        and any(tag in tags for tag in training["tags"])
    ]
|
||
|
||
def export_training_to_tsv(trainings: List[Dict], output_fp: str) -> None:
    """
    Export trainings to a TSV file

    Builds one row per training, adds a boolean "Workflows" column, formats
    the list-valued columns with format_list_column, renames the columns to
    their human-readable headers and writes the selected columns, in a fixed
    order, as tab-separated values.

    :param trainings: formatted training dictionaries
    :param output_fp: path of the TSV file to write
    """
    list_columns = [
        "exact_supported_servers",
        "inexact_supported_servers",
        "short_tools",
        "edam_operation",
        "edam_topic",
    ]
    df = pd.DataFrame(trainings)
    # Optional keys may be absent from every training (or the list may be
    # empty): create the columns so that the accesses below cannot fail
    for col in ["workflows"] + list_columns:
        if col not in df.columns:
            df[col] = None

    df = df.assign(
        Workflows=lambda df: df.workflows.notna(),
        # Missing values become empty lists
        exact_supported_servers=lambda df: df.exact_supported_servers.fillna("").apply(list),
        inexact_supported_servers=lambda df: df.inexact_supported_servers.fillna("").apply(list),
    )

    for col in list_columns:
        df[col] = format_list_column(df[col])

    df = (
        df.rename(
            columns={
                "title": "Title",
                "hands_on": "Tutorial",
                "url": "Link",
                "slides": "Slides",
                "mod_date": "Last modification",
                "pub_date": "Creation",
                "version": "Version",
                "short_tools": "Tools",
                "exact_supported_servers": "Servers with precise tool versions",
                "inexact_supported_servers": "Servers with tool but different versions",
                "topic_name_human": "Topic",
                "video": "Video",
                "edam_topic": "EDAM topic",
                "edam_operation": "EDAM operation",
                "feedback_number": "Feedback number",
                "feedback_mean_note": "Feedback mean note",
                "visitors": "Visitors",
                "pageviews": "Page views",
                "video_versions": "Video versions",
                "video_view": "Video views",
            }
        )
        .fillna("")
        # reindex keeps only these columns; any that is missing is added empty
        .reindex(
            columns=[
                "Topic",
                "Title",
                "Link",
                "EDAM topic",
                "EDAM operation",
                "Creation",
                "Last modification",
                "Version",
                "Tutorial",
                "Slides",
                "Video",
                "Workflows",
                "Tools",
                "Servers with precise tool versions",
                "Servers with tool but different versions",
                "Feedback number",
                "Feedback mean note",
                "Visitors",
                "Page views",
                # Renamed above, so it has to be listed here to be exported
                "Video versions",
                "Video views",
            ]
        )
    )

    df.to_csv(output_fp, sep="\t", index=False)
|
||
|
||
if __name__ == "__main__":
    # Command line interface with two sub-commands:
    # - extracttraining: fetch all training materials and dump them to JSON
    # - filtertraining: keep the trainings matching some tags and export them to TSV
    parser = argparse.ArgumentParser(
        description="Extract Galaxy Training Materials from GTN API together with statistics"
    )
    subparser = parser.add_subparsers(dest="command")
    # Extract training
    extracttraining = subparser.add_parser("extracttraining", help="Extract all training materials")
    extracttraining.add_argument(
        "--all_trainings",
        "-o",
        required=True,
        help="Filepath to JSON with all extracted training materials",
    )
    extracttraining.add_argument(
        "--tools",
        "-t",
        required=True,
        help="Filepath to TSV with all extracted tools, generated by extractools command",
    )

    # Filter training
    filtertraining = subparser.add_parser("filtertraining", help="Filter training materials based on their tags")
    filtertraining.add_argument(
        "--all_trainings",
        "-t",
        required=True,
        help="Filepath to JSON with all extracted trainings, generated by extracttraining command",
    )
    filtertraining.add_argument(
        "--filtered_trainings",
        "-f",
        required=True,
        help="Filepath to TSV with filtered trainings",
    )
    # Required: the tag file is always read by the filtertraining command
    filtertraining.add_argument(
        "--tags",
        "-c",
        required=True,
        help="Path to a file with tags to keep in the extraction (one per line)",
    )
    args = parser.parse_args()

    if args.command == "extracttraining":
        trainings = get_trainings(args.tools)
        export_training_to_json(trainings, args.all_trainings)

    elif args.command == "filtertraining":
        trainings = load_json(args.all_trainings)
        # get the tags of the trainings to keep
        tags = read_file(args.tags)
        # filter training lists
        filtered_training = filter_training(trainings, tags)
        export_training_to_tsv(filtered_training, args.filtered_trainings)

    else:
        # No sub-command given: show the usage instead of silently doing nothing
        parser.print_help()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.