Skip to content

Commit

Permalink
compare tool_ids
Browse files Browse the repository at this point in the history
  • Loading branch information
paulzierep committed Mar 13, 2024
1 parent 7cd49a7 commit 3d3edea
Showing 1 changed file with 26 additions and 9 deletions.
35 changes: 26 additions & 9 deletions bin/extract_galaxy_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,15 @@

def get_last_url_position(toot_id: str) -> str:
    """
    Returns the last url position of the toot_id, if the value is not a
    url it returns the toot_id. So works for local and toolshed
    installed tools.

    :param toot_id: galaxy tool id, either a plain id (local tool) or a
        "/"-separated toolshed path
    :return: the text after the final "/" or the unchanged input when it
        contains no "/"
    """
    # rsplit with maxsplit=1 only cuts at the final "/"; when there is no "/"
    # the single-element result is the input itself, so no separate check for
    # non-url ids is needed.
    return toot_id.rsplit("/", 1)[-1]


Expand All @@ -69,7 +69,6 @@ def add_tool_stats_to_tools(tools_df: pd.DataFrame, tool_stats_path: Path, colum
:param tools_path: path to the table with
the tools (csv,
must include "Galaxy wrapper id")
:param output_path: path to store the new table
:param column_name: column to add for the tool stats,
different columns could be added for the main servers
"""
Expand All @@ -81,13 +80,31 @@ def add_tool_stats_to_tools(tools_df: pd.DataFrame, tool_stats_path: Path, colum
tool_stats_df["Galaxy wrapper id"] = tool_stats_df["tool_name"].apply(get_last_url_position)

# group local and toolshed tools into one entry
grouped_tool_stats_tools = tool_stats_df.groupby("Galaxy wrapper id", as_index=False)["count"].sum()
# also group tools with different versions
grouped_tool_stats_tools = tool_stats_df.groupby("Galaxy wrapper id")["count"].sum()

# new column to store the stats
tools_df[column_name] = np.NaN

# check for each tool_id if a count exists in the stats file
# and sum the stats for each suite
for row_index, row in tools_df.iterrows():
counts = []
if isinstance(row["Galaxy tool ids"], str):
for tool_id in row["Galaxy tool ids"].split(","):
tool_id = tool_id.strip()
if tool_id in grouped_tool_stats_tools:
count = grouped_tool_stats_tools[tool_id]
counts.append(count)

if len(counts) == 0:
summed_count = np.NaN
else:
summed_count = sum(counts)

# keep all rows of the tools table (how='right'), also for those where no stats are available
community_tool_stats = pd.merge(grouped_tool_stats_tools, tools_df, how="right", on="Galaxy wrapper id")
community_tool_stats.rename(columns={"count": column_name}, inplace=True)
tools_df.loc[row_index, column_name] = summed_count

return community_tool_stats
return tools_df


def add_usage_stats_for_all_server(tools_df: pd.DataFrame) -> pd.DataFrame:
Expand Down Expand Up @@ -206,7 +223,7 @@ def get_xref(el: et.Element, attrib_type: str) -> Optional[str]:
for xref in xref_items:
if xref is not None and xref.attrib["type"] == attrib_type:
# should not contain any space of linebreak
xref_sanitized = str(xref.text).strip()
xref_sanitized = str(xref.text).replace("\n", "").replace(" ", "")
return xref_sanitized
return None

Expand Down

0 comments on commit 3d3edea

Please sign in to comment.