Skip to content

Commit

Permalink
Merge pull request #112 from paulzierep/main
Browse files Browse the repository at this point in the history
Simplify server availability
  • Loading branch information
bebatut authored Jun 5, 2024
2 parents 057e09c + e7c9832 commit 639672e
Show file tree
Hide file tree
Showing 3 changed files with 262 additions and 58 deletions.
75 changes: 22 additions & 53 deletions bin/extract_galaxy_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,12 @@
BIOTOOLS_API_URL = "https://bio.tools"
# BIOTOOLS_API_URL = "https://130.226.25.21"

GALAXY_SERVER_URLS = [
"https://usegalaxy.org",
"https://usegalaxy.org.au",
"https://usegalaxy.eu",
"https://usegalaxy.fr",
]
USEGALAXY_SERVER_URLS = {
"UseGalaxy.org": "https://usegalaxy.org",
"UseGalaxy.org.au": "https://usegalaxy.org.au",
"UseGalaxy.eu": "https://usegalaxy.eu",
"UseGalaxy.org.fr": "https://usegalaxy.fr",
}

project_path = Path(__file__).resolve().parent.parent # galaxy_tool_extractor folder
usage_stats_path = project_path.joinpath("data", "usage_stats")
Expand Down Expand Up @@ -475,7 +475,7 @@ def parse_tools(repo: Repository) -> List[Dict[str, Any]]:


@lru_cache # need to run this for each suite, so just cache it
def get_all_installed_tool_ids(galaxy_url: str) -> List[str]:
def get_all_installed_tool_ids_on_server(galaxy_url: str) -> List[str]:
"""
Get all tool ids from a Galaxy server
Expand All @@ -489,59 +489,23 @@ def get_all_installed_tool_ids(galaxy_url: str) -> List[str]:
return [tool_dict["id"] for tool_dict in tool_dict_list]


def check_tools_on_servers(tool_ids: List[str]) -> pd.DataFrame:
def check_tools_on_servers(tool_ids: List[str], galaxy_server_url: str) -> int:
"""
Get True/False for each tool on each server
Return the number of tools in tool_ids that are installed on galaxy_server_url
:param tool_ids: galaxy tool ids
:param galaxy_server_url: URL of the Galaxy server to check
"""
assert all("/" not in tool_id for tool_id in tool_ids), "This function only works on short tool ids"
data: List[Dict[str, bool]] = []
for galaxy_url in GALAXY_SERVER_URLS:
installed_tool_ids = get_all_installed_tool_ids(galaxy_url)
installed_tool_short_ids = [
tool_id.split("/")[4] if "/" in tool_id else tool_id for tool_id in installed_tool_ids
]
d: Dict[str, bool] = {}
for tool_id in tool_ids:
d[tool_id] = tool_id in installed_tool_short_ids
data.append(d)
return pd.DataFrame(data, index=GALAXY_SERVER_URLS)

installed_tool_ids = get_all_installed_tool_ids_on_server(galaxy_server_url)
installed_tool_short_ids = [tool_id.split("/")[4] if "/" in tool_id else tool_id for tool_id in installed_tool_ids]

def get_tool_count_per_server(tool_ids: Any) -> pd.Series:
"""
Aggregate tool count for each suite for each
server into (Number of tools on server/Total number of tools)
counter = 0
for tool_id in tool_ids:
if tool_id in installed_tool_short_ids:
counter += 1

:param tool_ids: string of tools ids for one suite
"""
if not isinstance(tool_ids, str):
series = pd.Series({key: None for key in GALAXY_SERVER_URLS})
else:
tool_id_list = [x.strip(" ") for x in tool_ids.split(",")]
data = check_tools_on_servers(tool_id_list)
result_df: pd.DataFrame = pd.DataFrame()
result_df["true_count"] = data.sum(axis=1).astype(str)
result_df["false_count"] = len(data.columns)
result_df["counts"] = result_df.apply(lambda x: "({}/{})".format(x["true_count"], x["false_count"]), axis=1)

series = result_df["counts"].T

return series


def add_instances_to_table(
table: pd.DataFrame,
) -> pd.DataFrame:
"""
Add tool availability to table
:param table_path: path to tool table (must include
"Galaxy tool ids" column)
"""
new_table = table.join(table["Galaxy tool ids"].apply(get_tool_count_per_server))
return new_table
return counter


def format_list_column(col: pd.Series) -> pd.Series:
Expand Down Expand Up @@ -585,7 +549,6 @@ def export_tools_to_tsv(

# format the Galaxy tool ids list column before export (the add_instances_to_table step that originally required this has been removed)
df["Galaxy tool ids"] = format_list_column(df["Galaxy tool ids"])
df = add_instances_to_table(df)

if add_usage_stats:
df = add_usage_stats_for_all_server(df)
Expand Down Expand Up @@ -761,11 +724,17 @@ def reduce_ontology_terms(terms: List, ontology: Any) -> List:
edam_ontology = get_ontology("https://edamontology.org/EDAM_1.25.owl").load()

for tool in tools:

# add EDAM terms without superclass
tool["EDAM operation (no superclasses)"] = reduce_ontology_terms(
tool["EDAM operation"], ontology=edam_ontology
)
tool["EDAM topic (no superclasses)"] = reduce_ontology_terms(tool["EDAM topic"], ontology=edam_ontology)

# add availability for UseGalaxy servers
for name, url in USEGALAXY_SERVER_URLS.items():
tool[f"Available on {name}"] = check_tools_on_servers(tool["Galaxy tool ids"], url)

export_tools_to_json(tools, args.all_tools_json)
export_tools_to_tsv(tools, args.all_tools, format_list_col=True, add_usage_stats=True)

Expand Down
234 changes: 234 additions & 0 deletions results/test.list_tools.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
[
{
"Galaxy wrapper id": "2d_auto_threshold",
"Galaxy tool ids": [
"ip_threshold"
],
"Description": "Automatic thresholding",
"bio.tool id": "scikit-image",
"bio.tool ids": [
"scikit-image"
],
"biii": "scikit-image",
"bio.tool name": "scikit-image",
"bio.tool description": "Scikit-image contains image processing algorithms for SciPy, including IO, morphology, filtering, warping, color manipulation, object detection, etc.",
"EDAM operation": [
"Image analysis",
"Image annotation",
"Visualisation",
"Data handling"
],
"EDAM topic": [
"Imaging",
"Software engineering",
"Literature and language"
],
"Status": "To update",
"Source": "https://github.com/bmcv",
"ToolShed categories": [
"Imaging"
],
"ToolShed id": "2d_auto_threshold",
"Galaxy wrapper owner": "imgteam",
"Galaxy wrapper source": "https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/2d_auto_threshold/",
"Galaxy wrapper parsed folder": "https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/2d_auto_threshold",
"Galaxy wrapper version": "0.0.6-2",
"Conda id": "scikit-image",
"Conda version": null,
"EDAM operation (no superclasses)": [
"Image analysis",
"Image annotation",
"Visualisation",
"Data handling"
],
"EDAM topic (no superclasses)": [
"Imaging",
"Software engineering",
"Literature and language"
],
"Tools available on UseGalaxy.org": 0,
"Tools available on UseGalaxy.org.au": 1,
"Tools available on UseGalaxy.eu": 1,
"Tools available on UseGalaxy.org.fr": 1
},
{
"Galaxy wrapper id": "abritamr",
"Galaxy tool ids": [
"abritamr"
],
"Description": "A pipeline for running AMRfinderPlus and collating results into functional classes",
"bio.tool id": null,
"bio.tool ids": [],
"biii": null,
"bio.tool name": null,
"bio.tool description": null,
"EDAM operation": [],
"EDAM topic": [],
"Status": "To update",
"Source": "https://zenodo.org/record/7370628",
"ToolShed categories": [
"Sequence Analysis"
],
"ToolShed id": "abritamr",
"Galaxy wrapper owner": "iuc",
"Galaxy wrapper source": "https://github.com/galaxyproject/tools-iuc/tree/master/tools/abritamr",
"Galaxy wrapper parsed folder": "https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/abritamr",
"Galaxy wrapper version": "1.0.14",
"Conda id": "abritamr",
"Conda version": "1.0.17",
"EDAM operation (no superclasses)": [],
"EDAM topic (no superclasses)": [],
"Tools available on UseGalaxy.org": 0,
"Tools available on UseGalaxy.org.au": 0,
"Tools available on UseGalaxy.eu": 1,
"Tools available on UseGalaxy.org.fr": 0
},
{
"Galaxy wrapper id": "aldex2",
"Galaxy tool ids": [
"aldex2"
],
"Description": "Performs analysis Of differential abundance taking sample variation into account",
"bio.tool id": "aldex2",
"bio.tool ids": [
"aldex2"
],
"biii": null,
"bio.tool name": "ALDEx2",
"bio.tool description": "A differential abundance analysis for the comparison of two or more conditions. It uses a Dirichlet-multinomial model to infer abundance from counts, that has been optimized for three or more experimental replicates. Infers sampling variation and calculates the expected FDR given the biological and sampling variation using the Wilcox rank test and Welches t-test, or the glm and Kruskal Wallis tests. Reports both P and fdr values calculated by the Benjamini Hochberg correction.",
"EDAM operation": [
"Statistical inference"
],
"EDAM topic": [
"Gene expression",
"Statistics and probability"
],
"Status": "To update",
"Source": "https://github.com/ggloor/ALDEx_bioc",
"ToolShed categories": [
"Metagenomics"
],
"ToolShed id": "aldex2",
"Galaxy wrapper owner": "iuc",
"Galaxy wrapper source": "https://github.com/galaxyproject/tools-iuc/tree/master/tools/aldex2",
"Galaxy wrapper parsed folder": "https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/aldex2",
"Galaxy wrapper version": "1.26.0",
"Conda id": "bioconductor-aldex2",
"Conda version": "1.34.0",
"EDAM operation (no superclasses)": [
"Statistical inference"
],
"EDAM topic (no superclasses)": [
"Gene expression",
"Statistics and probability"
],
"Tools available on UseGalaxy.org": 0,
"Tools available on UseGalaxy.org.au": 0,
"Tools available on UseGalaxy.eu": 1,
"Tools available on UseGalaxy.org.fr": 0
},
{
"Galaxy wrapper id": "fastp",
"Galaxy tool ids": [
"fastp"
],
"Description": "Fast all-in-one preprocessing for FASTQ files",
"bio.tool id": "fastp",
"bio.tool ids": [
"fastp"
],
"biii": null,
"bio.tool name": "fastp",
"bio.tool description": "A tool designed to provide fast all-in-one preprocessing for FastQ files. This tool is developed in C++ with multithreading supported to afford high performance.",
"EDAM operation": [
"Sequencing quality control",
"Sequence contamination filtering"
],
"EDAM topic": [
"Sequence analysis",
"Probes and primers"
],
"Status": "To update",
"Source": "https://github.com/OpenGene/fastp",
"ToolShed categories": [
"Sequence Analysis"
],
"ToolShed id": "fastp",
"Galaxy wrapper owner": "iuc",
"Galaxy wrapper source": "https://github.com/galaxyproject/tools-iuc/tree/master/tools/fastp",
"Galaxy wrapper parsed folder": "https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/fastp",
"Galaxy wrapper version": null,
"Conda id": "fastp",
"Conda version": "0.23.4",
"EDAM operation (no superclasses)": [
"Sequence contamination filtering"
],
"EDAM topic (no superclasses)": [
"Probes and primers"
],
"Tools available on UseGalaxy.org": 1,
"Tools available on UseGalaxy.org.au": 1,
"Tools available on UseGalaxy.eu": 1,
"Tools available on UseGalaxy.org.fr": 1
},
{
"Galaxy wrapper id": "spades",
"Galaxy tool ids": [
"spades_biosyntheticspades",
"spades_coronaspades",
"spades_metaplasmidspades",
"metaspades",
"spades_metaviralspades",
"spades_plasmidspades",
"rnaspades",
"spades_rnaviralspades",
"spades"
],
"Description": "SPAdes is an assembly toolkit containing various assembly pipelines. It implements the following 4 stages: assembly graph construction, k-bimer adjustment, construction of paired assembly graph and contig construction.",
"bio.tool id": "spades",
"bio.tool ids": [
"rnaviralspades",
"metaviralspades",
"metaspades",
"biosyntheticspades",
"metaplasmidspades",
"spades",
"rnaspades",
"plasmidspades",
"coronaspades"
],
"biii": null,
"bio.tool name": "SPAdes",
"bio.tool description": "St. Petersburg genome assembler \u2013 is intended for both standard isolates and single-cell MDA bacteria assemblies. SPAdes 3.9 works with Illumina or IonTorrent reads and is capable of providing hybrid assemblies using PacBio, Oxford Nanopore and Sanger reads. Additional contigs can be provided and can be used as long reads.",
"EDAM operation": [
"Genome assembly"
],
"EDAM topic": [
"Sequence assembly"
],
"Status": "Up-to-date",
"Source": "https://github.com/ablab/spades",
"ToolShed categories": [
"Assembly",
"RNA",
"Metagenomics"
],
"ToolShed id": "spades",
"Galaxy wrapper owner": "iuc",
"Galaxy wrapper source": "https://github.com/galaxyproject/tools-iuc/tree/master/tools/spades",
"Galaxy wrapper parsed folder": "https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/spades",
"Galaxy wrapper version": "3.15.5",
"Conda id": "spades",
"Conda version": "3.15.5",
"EDAM operation (no superclasses)": [
"Genome assembly"
],
"EDAM topic (no superclasses)": [
"Sequence assembly"
],
"Tools available on UseGalaxy.org": 9,
"Tools available on UseGalaxy.org.au": 9,
"Tools available on UseGalaxy.eu": 9,
"Tools available on UseGalaxy.org.fr": 9
}
]
Loading

0 comments on commit 639672e

Please sign in to comment.