Update stats #78

Closed
wants to merge 65 commits into from
Changes from 63 commits
65 commits
7b065a5
collect multiple bio.tools ids
paulzierep Mar 12, 2024
2c416c7
Merge pull request #2 from paulzierep/collect-multiple-entries-for-bi…
paulzierep Mar 12, 2024
39540e0
fetch all tools bot - step fetch
invalid-email-address Mar 12, 2024
27a26a6
fetch all tools bot - step fetch
invalid-email-address Mar 12, 2024
92204fe
fetch all tools bot - step fetch
invalid-email-address Mar 12, 2024
1d64d30
fetch all tools bot - step fetch
invalid-email-address Mar 12, 2024
7fc8e15
fetch all tools bot - step merge
invalid-email-address Mar 12, 2024
07e072f
fetch all tools bot - step filter
invalid-email-address Mar 12, 2024
65be0b3
unique IDs using a set
paulzierep Mar 12, 2024
770d62f
Merge pull request #3 from paulzierep/collect-multiple-entries-for-bi…
paulzierep Mar 12, 2024
8e4c152
fetch all tools bot - step fetch
invalid-email-address Mar 12, 2024
bc366c9
fetch all tools bot - step fetch
invalid-email-address Mar 12, 2024
e45242d
fetch all tools bot - step fetch
invalid-email-address Mar 12, 2024
922d19d
fetch all tools bot - step fetch
invalid-email-address Mar 12, 2024
05e6280
fetch all tools bot - step merge
invalid-email-address Mar 12, 2024
5c96e05
fetch all tools bot - step filter
invalid-email-address Mar 12, 2024
3d3edea
compare tool_ids
paulzierep Mar 13, 2024
ea0499b
forgot numpy
paulzierep Mar 13, 2024
8ca74d3
mypy type
paulzierep Mar 13, 2024
5c5dd64
Merge pull request #5 from paulzierep/improved-stats-generation
paulzierep Mar 13, 2024
7d95919
fetch all tools bot - step fetch
invalid-email-address Mar 13, 2024
97c0808
fetch all tools bot - step fetch
invalid-email-address Mar 13, 2024
910a95f
fetch all tools bot - step fetch
invalid-email-address Mar 13, 2024
136016f
fetch all tools bot - step fetch
invalid-email-address Mar 13, 2024
3583806
Update extract_galaxy_tools.py
paulzierep Mar 14, 2024
c78bb38
Update extract_galaxy_tools.py
paulzierep Mar 14, 2024
7749357
update stats generation, linting and func works
paulzierep Mar 14, 2024
725a5dc
fix bio.tools parsing
paulzierep Mar 14, 2024
a4f5666
fetch all tools bot - step fetch
invalid-email-address Mar 14, 2024
fdecb33
fetch all tools bot - step fetch
invalid-email-address Mar 14, 2024
6518e14
fetch all tools bot - step fetch
invalid-email-address Mar 14, 2024
859e9b4
update server list
paulzierep Mar 14, 2024
0de4d1e
add france
paulzierep Mar 14, 2024
f8839eb
reintroduce warning
paulzierep Mar 14, 2024
5ed1176
fetch all tools bot - step fetch
invalid-email-address Mar 14, 2024
b139c9d
fetch all tools bot - step merge
invalid-email-address Mar 14, 2024
bceb24e
fetch all tools bot - step filter
invalid-email-address Mar 14, 2024
aa160c0
typing for mypy
paulzierep Mar 14, 2024
8669668
typing for mypy
paulzierep Mar 14, 2024
98312ad
isort linitng
paulzierep Mar 14, 2024
bffbc23
get all available tools on server
paulzierep Mar 14, 2024
dfec6be
Merge branch 'main' into update-stats
paulzierep Mar 14, 2024
d30da25
linting
paulzierep Mar 14, 2024
9b13613
mypy linting
paulzierep Mar 18, 2024
c7f1d95
mypy linting
paulzierep Mar 18, 2024
660ab9c
mypy linting
paulzierep Mar 18, 2024
6be6058
mypy linting
paulzierep Mar 18, 2024
50bd451
mypy linting
paulzierep Mar 18, 2024
c7acc06
mypy linting
paulzierep Mar 18, 2024
a16d3f5
mypy linting
paulzierep Mar 18, 2024
93db1e7
mypy linting
paulzierep Mar 18, 2024
773f361
mypy linting
paulzierep Mar 18, 2024
6a8db1d
mypy linting
paulzierep Mar 18, 2024
24899a6
mypy linting
paulzierep Mar 18, 2024
c680879
mypy linting
paulzierep Mar 18, 2024
7b2ed06
compute galaxy instances outside of main script
paulzierep Mar 18, 2024
b5023c6
linting
paulzierep Mar 18, 2024
22eb17e
linting
paulzierep Mar 18, 2024
97c5299
linting
paulzierep Mar 18, 2024
da19f3b
linting
paulzierep Mar 18, 2024
ecd9ec6
fix
paulzierep Mar 18, 2024
33a39f4
change name of Galaxy star servers
paulzierep Mar 25, 2024
fd11c1b
add column order for final table and change server availablity naming
paulzierep Mar 25, 2024
3ea51b9
rename wrapper id to suite id
paulzierep Mar 25, 2024
af02011
rename tool id to suite id
paulzierep Mar 25, 2024
235 changes: 196 additions & 39 deletions bin/extract_galaxy_tools.py
@@ -5,32 +5,68 @@
import sys
import time
import xml.etree.ElementTree as et
from functools import lru_cache
from pathlib import Path
from typing import (
Any,
cast,
Dict,
Iterable,
List,
Optional,
)

import numpy as np
import pandas as pd
import requests
import yaml
from github import Github
from github.ContentFile import ContentFile
from github.Repository import Repository

COLUMN_ORDER = [
"Galaxy wrapper id",
Review comment (Member): Should we not call it "Galaxy suite id"?

"Galaxy tool ids",
"No. tools in the suite",
"Description",
"bio.tool id",
"bio.tool ids",
"bio.tool name",
"biii",
"bio.tool description",
"EDAM operation",
"EDAM topic",
"Conda id",
"Conda version",
"Galaxy wrapper version",
"Status",
"ToolShed categories",
"ToolShed id",
"Source",
"Galaxy wrapper owner",
"Galaxy wrapper source",
"Galaxy wrapper parsed folder",
"Galaxy Star Availability",
Review comment (Member): Why having a separate column here?

"All Server Availability",
Review comment (Member): Maybe "Public servers with at least one tool"

"Tools available on: UseGalaxy.org",
"Tools available on: UseGalaxy.org.au",
"Tools available on: UseGalaxy.eu",
"Tools available on: UseGalaxy.org.fr",
Review comment (Member) on lines +50 to +53:
Suggested change
"Tools available on: UseGalaxy.org",
"Tools available on: UseGalaxy.org.au",
"Tools available on: UseGalaxy.eu",
"Tools available on: UseGalaxy.org.fr",
"Tools available on UseGalaxy.org",
"Tools available on UseGalaxy.org.au",
"Tools available on UseGalaxy.eu",
"Tools available on UseGalaxy.fr",

"No. of tool users (2022-2023) (usegalaxy.eu)",
Review comment (Member):
Suggested change
"No. of tool users (2022-2023) (usegalaxy.eu)",
"Tool users in 2022-2023 on UseGalaxy.eu",

"Total tool usage (usegalaxy.eu)",
Review comment (Member):
Suggested change
"Total tool usage (usegalaxy.eu)",
"Total tool usage on UseGalaxy.eu",

]


# Config variables
BIOTOOLS_API_URL = "https://bio.tools"
# BIOTOOLS_API_URL = "https://130.226.25.21"

GALAXY_SERVER_URLS = [
"https://usegalaxy.org",
"https://usegalaxy.org.au",
"https://usegalaxy.eu",
"https://usegalaxy.fr",
Review comment (Member): Why did you remove it?

Review comment (Member): There is a list somewhere with public servers

]

USEGALAXY_STAR_SERVER_URLS = {
"UseGalaxy.org": "https://usegalaxy.org",
"UseGalaxy.org.au": "https://usegalaxy.org.au",
"UseGalaxy.eu": "https://usegalaxy.eu",
"UseGalaxy.org.fr": "https://usegalaxy.fr",
}

project_path = Path(__file__).resolve().parent.parent # galaxy_tool_extractor folder
usage_stats_path = project_path.joinpath("data", "usage_stats")
@@ -41,22 +77,24 @@
"Total tool usage (usegalaxy.eu)": usage_stats_path.joinpath("total_tool_usage_EU.csv"),
}

PUBLIC_GALAXY_SERVERS = usage_stats_path.joinpath("public_galaxy_servers.csv")

# load the configs globally
with open(conf_path) as f:
configs = yaml.safe_load(f)


def get_last_url_position(toot_id: str) -> str:
"""
Returns the second last url position of the toot_id, if the value is not a
Returns the last url position of the toot_id, if the value is not a
url it returns the toot_id. So works for local and toolshed
installed tools.

:param tool_id: galaxy tool id
"""

if "/" in toot_id:
toot_id = toot_id.split("/")[-2]
toot_id = toot_id.split("/")[-1]
return toot_id
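As a quick illustration of what this helper does after the change from `[-2]` to `[-1]` (a self-contained sketch; the toolshed-style id below is a hypothetical example, written without the trailing version segment as in the usage-stats file):

```python
def get_last_url_position(tool_id: str) -> str:
    # toolshed-installed tools have a URL-like path as their id;
    # take the last segment, otherwise return the id unchanged
    if "/" in tool_id:
        tool_id = tool_id.split("/")[-1]
    return tool_id

# hypothetical toolshed-style id (no version segment) and a local tool id
print(get_last_url_position("toolshed.g2.bx.psu.edu/repos/iuc/abricate/abricate"))  # abricate
print(get_last_url_position("Cut1"))  # Cut1
```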


@@ -70,7 +108,6 @@ def add_tool_stats_to_tools(tools_df: pd.DataFrame, tool_stats_path: Path, colum
:param tools_path: path to the table with
the tools (csv,
must include "Galaxy wrapper id")
:param output_path: path to store the new table
:param column_name: column to add for the tool stats,
different columns could be added for the main servers
"""
@@ -82,13 +119,31 @@ def add_tool_stats_to_tools(tools_df: pd.DataFrame, tool_stats_path: Path, colum
tool_stats_df["Galaxy wrapper id"] = tool_stats_df["tool_name"].apply(get_last_url_position)

# group local and toolshed tools into one entry
grouped_tool_stats_tools = tool_stats_df.groupby("Galaxy wrapper id", as_index=False)["count"].sum()
# also group tools with different versions
grouped_tool_stats_tools = tool_stats_df.groupby("Galaxy wrapper id")["count"].sum()

# new column to store the stats
tools_df[column_name] = np.NaN

# check for each tool_id if a count exists in the stats file
# and sum the stats for each suite
for row_index, row in tools_df.iterrows():
counts = []
if isinstance(row["Galaxy tool ids"], str):
for tool_id in row["Galaxy tool ids"].split(","):
tool_id = tool_id.strip()
if tool_id in grouped_tool_stats_tools:
count = grouped_tool_stats_tools[tool_id]
counts.append(count)

if len(counts) == 0:
summed_count = np.NaN
else:
summed_count = sum(counts)

# keep all rows of the tools table (how='right'), also for those where no stats are available
community_tool_stats = pd.merge(grouped_tool_stats_tools, tools_df, how="right", on="Galaxy wrapper id")
community_tool_stats.rename(columns={"count": column_name}, inplace=True)
tools_df.loc[pd.Index([row_index]), column_name] = summed_count

return community_tool_stats
return tools_df
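The stats aggregation in this hunk boils down to a groupby-sum over the raw usage rows followed by a per-suite lookup. A minimal network-free sketch with invented tool ids and counts:

```python
import numpy as np
import pandas as pd

# hypothetical usage-stats rows: local and toolshed entries for the same tool
tool_stats_df = pd.DataFrame(
    {"Galaxy wrapper id": ["abricate", "abricate", "fastqc"], "count": [10, 5, 7]}
)
# group duplicate entries (local/toolshed installs, different versions) into one count
grouped = tool_stats_df.groupby("Galaxy wrapper id")["count"].sum()

# sum the counts of every tool id that belongs to one suite
suite_tool_ids = "abricate, fastqc"
counts = [grouped[t.strip()] for t in suite_tool_ids.split(",") if t.strip() in grouped]
summed = sum(counts) if counts else np.nan
print(summed)  # 22
```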


def add_usage_stats_for_all_server(tools_df: pd.DataFrame) -> pd.DataFrame:
@@ -207,7 +262,7 @@ def get_xref(el: et.Element, attrib_type: str) -> Optional[str]:
for xref in xref_items:
if xref is not None and xref.attrib["type"] == attrib_type:
# should not contain any space or linebreak
xref_sanitized = str(xref.text).strip()
xref_sanitized = str(xref.text).replace("\n", "").replace(" ", "")
return xref_sanitized
return None

@@ -472,83 +527,165 @@ def parse_tools(repo: Repository) -> List[Dict[str, Any]]:
return tools


@lru_cache # need to run this for each suite, so just cache it
def get_all_installed_tool_ids(galaxy_url: str) -> List[str]:
"""
Get all tool ids from a Galaxy server

:param galaxy_url: URL of Galaxy instance
"""

print(galaxy_url)
galaxy_url = galaxy_url.rstrip("/")
base_url = f"{galaxy_url}/api"
r = requests.get(f"{base_url}/tools", params={"in_panel": False})
r.raise_for_status()
tool_dict_list = r.json()
try:
r = requests.get(f"{base_url}/tools", params={"in_panel": False}, timeout=5)
r.raise_for_status()
tool_dict_list = r.json()
except Exception as ex:
print(f"Exception:\n{ex} \nfor server {galaxy_url}!")
return []

return [tool_dict["id"] for tool_dict in tool_dict_list]
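The `@lru_cache` decorator is what keeps the per-suite processing from hitting each server's API repeatedly. A network-free sketch of the same pattern (the fetch body here is a stand-in counter, not the real `requests.get(f"{galaxy_url}/api/tools")` call):

```python
from functools import lru_cache

calls = {"n": 0}

@lru_cache  # one fetch per distinct galaxy_url, however often it is called
def get_all_installed_tool_ids(galaxy_url: str) -> tuple:
    # stand-in for the real API request
    calls["n"] += 1
    return ("abricate", "fastqc")

for _ in range(100):  # e.g. one lookup per suite in the real script
    get_all_installed_tool_ids("https://example.galaxy.server")

print(calls["n"])  # 1: the server was only queried once
```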


def check_tools_on_servers(tool_ids: List[str]) -> pd.DataFrame:
def get_tool_ids_on_server(galaxy_servers: dict) -> dict:
"""
Get True/False for each tool on each server
Get all tool ids from all Galaxy servers in galaxy_servers

:param tool_ids: galaxy tool ids
:param galaxy_servers: dict with name and urls to galaxy servers
"""
assert all("/" not in tool_id for tool_id in tool_ids), "This function only works on short tool ids"
data: List[Dict[str, bool]] = []
for galaxy_url in GALAXY_SERVER_URLS:

tools_on_server = {}
for name, galaxy_url in galaxy_servers.items():
installed_tool_ids = get_all_installed_tool_ids(galaxy_url)
installed_tool_short_ids = [
tool_id.split("/")[4] if "/" in tool_id else tool_id for tool_id in installed_tool_ids
]
tools_on_server[name] = installed_tool_short_ids

return tools_on_server


def check_tools_on_servers(
tool_ids: List[str],
installed_tool_ids: dict,
) -> pd.DataFrame:
"""
Get True/False for each tool on each server

:param tool_ids: galaxy tool ids
:param installed_tool_ids: a dict with tools for each server
"""
assert all("/" not in tool_id for tool_id in tool_ids), "This function only works on short tool ids"
data: List[Dict[str, bool]] = []
names: List = []

# check for each tool if installed on server
for name in installed_tool_ids.keys():
d: Dict[str, bool] = {}
for tool_id in tool_ids:
d[tool_id] = tool_id in installed_tool_short_ids
d[tool_id] = tool_id in installed_tool_ids[name]
data.append(d)
return pd.DataFrame(data, index=GALAXY_SERVER_URLS)
names.append(name)
return pd.DataFrame(data, index=names)
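The True/False matrix built by this function can be reproduced in a few lines; the server names and tool ids below are hypothetical:

```python
import pandas as pd

installed_tool_ids = {
    "UseGalaxy.eu": ["abricate", "fastqc"],
    "UseGalaxy.org": ["fastqc"],
}
tool_ids = ["abricate", "fastqc"]

# one boolean row per server: is each suite tool installed there?
rows = [{t: t in tools for t in tool_ids} for tools in installed_tool_ids.values()]
df = pd.DataFrame(rows, index=list(installed_tool_ids.keys()))

# row-wise sum gives the per-server tool count used downstream
print(df.sum(axis=1).to_dict())  # {'UseGalaxy.eu': 2, 'UseGalaxy.org': 1}
```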


def get_tool_count_per_server(tool_ids: Any) -> pd.Series:
def get_tool_count_per_server(tool_ids: Any, installed_tool_ids: Any) -> Any:
"""
Aggregate tool count for each suite for each
server into (Number of tools on server/Total number of tools)

:param installed_tool_ids: a dict with tools on each server
:param tool_ids: string of tools ids for one suite
"""
if not isinstance(tool_ids, str):
series = pd.Series({key: None for key in GALAXY_SERVER_URLS})
series = pd.Series({key: None for key in installed_tool_ids.keys()})
else:
tool_id_list = [x.strip(" ") for x in tool_ids.split(",")]
data = check_tools_on_servers(tool_id_list)
data = check_tools_on_servers(tool_id_list, installed_tool_ids)
result_df: pd.DataFrame = pd.DataFrame()
result_df["true_count"] = data.sum(axis=1).astype(str)
result_df["false_count"] = len(data.columns)
result_df["counts"] = result_df.apply(lambda x: "({}/{})".format(x["true_count"], x["false_count"]), axis=1)
result_df["counts"] = data.sum(axis=1).astype(str)
result_df.loc["No. tools in the suite", "counts"] = len(data.columns)

series = result_df["counts"].T

return series


def add_instances_to_table(
table: pd.DataFrame,
) -> pd.DataFrame:
def add_instances_to_table(table: pd.DataFrame, galaxy_servers: dict) -> pd.DataFrame:
"""
Add tool availability to table

:param galaxy_servers: a dict with server names and urls
:param table_path: path to tool table (must include
"Galaxy tool ids" column)
"""
new_table = table.join(table["Galaxy tool ids"].apply(get_tool_count_per_server))

# get all installed tools on all servers once
installed_tool_ids = get_tool_ids_on_server(galaxy_servers)

tool_count_per_server = table["Galaxy tool ids"].apply(get_tool_count_per_server, args=[installed_tool_ids]) # type: ignore

new_table = table.join(tool_count_per_server)
return new_table


def get_server_list(row: pd.Series) -> str:
"""
Returns a comma-separated string of servers where at least one tool of the suite is installed

:param row: a pandas row with tool availability stats for each server
"""

available_servers = []
for name, val in row.items():
if int(val) > 0:
available_servers.append(name)
return ", ".join(cast(Iterable[str], available_servers))
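The per-row aggregation above turns per-server counts into a readable server list; a sketch with invented counts (note the counts arrive as strings, hence the `int(val)` cast):

```python
import pandas as pd

# hypothetical availability row: tool counts per server, stored as strings
row = pd.Series({"UseGalaxy.eu": "2", "UseGalaxy.org": "0", "UseGalaxy.org.au": "1"})

# keep only servers where at least one tool is installed
available = [name for name, val in row.items() if int(val) > 0]
print(", ".join(available))  # UseGalaxy.eu, UseGalaxy.org.au
```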


def aggregate_servers(df: pd.DataFrame, server_names: list, column_name: str) -> pd.DataFrame:
"""
Aggregates a list of servers where the tools are installed

:param df: the results dataframe that already contains for each server a column
:param server_names: names of servers to aggregate - must be a column name in df
"""

sub_df = df.loc[:, server_names]
df[column_name] = sub_df.apply(get_server_list, axis=1)
return df


def extract_public_galaxy_servers_tools() -> Dict:
"""
Extract the tools from the public Galaxy servers using their API -> this is actually done in
galaxy_tool_extractor/data/usage_stats/get_public_galaxy_servers.py
Here we only load the list -> much faster
TODO: run get_public_galaxy_servers.py as CI
"""

df = pd.read_csv(PUBLIC_GALAXY_SERVERS)
to_process = pd.Series(df["urls"].values, index=df["Name"]).to_dict()

return to_process
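The `pd.Series(values, index=names).to_dict()` idiom used here maps the CSV's `Name` column to its `urls` column in one step. A sketch using an in-memory CSV instead of the real `public_galaxy_servers.csv` (the file contents below are hypothetical):

```python
import io
import pandas as pd

# stand-in for data/usage_stats/public_galaxy_servers.csv
csv_text = "Name,urls\nUseGalaxy.eu,https://usegalaxy.eu\nUseGalaxy.org,https://usegalaxy.org\n"
df = pd.read_csv(io.StringIO(csv_text))

# build the name -> url dict consumed by add_instances_to_table
to_process = pd.Series(df["urls"].values, index=df["Name"]).to_dict()
print(to_process)  # {'UseGalaxy.eu': 'https://usegalaxy.eu', 'UseGalaxy.org': 'https://usegalaxy.org'}
```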


def format_list_column(col: pd.Series) -> pd.Series:
"""
Format a column that could be a list before exporting
"""
return col.apply(lambda x: ", ".join(str(i) for i in x))


def order_output_columns(df: pd.DataFrame) -> pd.DataFrame:
"""
Reorder the columns based on best fitted output
"""
df = df.reindex(columns=COLUMN_ORDER)
return df


def export_tools(
tools: List[Dict], output_fp: str, format_list_col: bool = False, add_usage_stats: bool = False
) -> None:
@@ -569,11 +706,31 @@

# the Galaxy tools need to be formatted for the add_instances_to_table to work
df["Galaxy tool ids"] = format_list_column(df["Galaxy tool ids"])
df = add_instances_to_table(df)

# add availability of star servers
df = add_instances_to_table(df, USEGALAXY_STAR_SERVER_URLS)
df = aggregate_servers(df, list(USEGALAXY_STAR_SERVER_URLS.keys()), column_name="Galaxy Star Availability")

# rename the columns for each server
server_reindex_columns = {f"Tools available on: {k}": v for k, v in USEGALAXY_STAR_SERVER_URLS.items()}
df = df.rename(columns=server_reindex_columns)

print(df)

# add availability of all public servers
# only add the aggregated column
server_list = extract_public_galaxy_servers_tools()

df_selection = df.loc[:, ["Galaxy wrapper id", "Galaxy tool ids"]].copy()
df_selection = add_instances_to_table(df_selection, server_list) # add all instance to the selection
df_selection = aggregate_servers(df_selection, list(server_list.keys()), column_name="All Server Availability")
df["All Server Availability"] = df_selection["All Server Availability"]

if add_usage_stats:
df = add_usage_stats_for_all_server(df)

df = order_output_columns(df)

df.to_csv(output_fp, sep="\t", index=False)

