Skip to content

Commit

Permalink
lint
Browse files: browse the repository at this point in the history
  • Loading branch information
paulzierep committed Nov 2, 2023
1 parent 43b0143 commit a88e49c
Showing 1 changed file with 23 additions and 69 deletions.
92 changes: 23 additions & 69 deletions bin/extract_galaxy_tools.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -89,9 +89,7 @@ def get_github_repo(url: str, g: Github) -> Repository:
return g.get_user(u_split[-2]).get_repo(u_split[-1])


def get_shed_attribute(
attrib: str, shed_content: Dict[str, Any], empty_value: Any
) -> Any:
def get_shed_attribute(attrib: str, shed_content: Dict[str, Any], empty_value: Any) -> Any:
"""
Get a shed attribute
Expand Down Expand Up @@ -157,9 +155,7 @@ def check_categories(ts_categories: str, ts_cat: List[str]) -> bool:
return bool(set(ts_cat) & set(ts_cats))


def get_tool_metadata(
tool: ContentFile, repo: Repository
) -> Optional[Dict[str, Any]]:
def get_tool_metadata(tool: ContentFile, repo: Repository) -> Optional[Dict[str, Any]]:
"""
Get tool metadata from the .shed.yaml, requirements in the macros or xml
file, bio.tools information if available in the macros or xml, EDAM
Expand Down Expand Up @@ -197,27 +193,17 @@ def get_tool_metadata(
else:
file_content = get_string_content(shed)
yaml_content = yaml.load(file_content, Loader=yaml.FullLoader)
metadata["Description"] = get_shed_attribute(
"description", yaml_content, None
)
metadata["Description"] = get_shed_attribute("description", yaml_content, None)
if metadata["Description"] is None:
metadata["Description"] = get_shed_attribute(
"long_description", yaml_content, None
)
metadata["Description"] = get_shed_attribute("long_description", yaml_content, None)
if metadata["Description"] is not None:
metadata["Description"] = metadata["Description"].replace("\n", "")
metadata["ToolShed id"] = get_shed_attribute("name", yaml_content, None)
metadata["Galaxy wrapper owner"] = get_shed_attribute(
"owner", yaml_content, None
)
metadata["Galaxy wrapper source"] = get_shed_attribute(
"remote_repository_url", yaml_content, None
)
metadata["Galaxy wrapper owner"] = get_shed_attribute("owner", yaml_content, None)
metadata["Galaxy wrapper source"] = get_shed_attribute("remote_repository_url", yaml_content, None)
if "homepage_url" in yaml_content:
metadata["Source"] = yaml_content["homepage_url"]
metadata["ToolShed categories"] = get_shed_attribute(
"categories", yaml_content, []
)
metadata["ToolShed categories"] = get_shed_attribute("categories", yaml_content, [])
if metadata["ToolShed categories"] is None:
metadata["ToolShed categories"] = []
# find and parse macro file
Expand All @@ -229,10 +215,7 @@ def get_tool_metadata(
root = et.fromstring(file_content)
for child in root:
if "name" in child.attrib:
if (
child.attrib["name"] == "@TOOL_VERSION@"
or child.attrib["name"] == "@VERSION@"
):
if child.attrib["name"] == "@TOOL_VERSION@" or child.attrib["name"] == "@VERSION@":
metadata["Galaxy wrapper version"] = child.text
elif child.attrib["name"] == "requirements":
metadata["Conda id"] = get_conda_package(child)
Expand All @@ -259,12 +242,9 @@ def get_tool_metadata(
if macros is not None:
for child in macros:
if "name" in child.attrib and (
child.attrib["name"] == "@TOOL_VERSION@"
or child.attrib["name"] == "@VERSION@"
child.attrib["name"] == "@TOOL_VERSION@" or child.attrib["name"] == "@VERSION@"
):
metadata[
"Galaxy wrapper version"
] = child.text
metadata["Galaxy wrapper version"] = child.text
# bio.tools
if metadata["bio.tool id"] is None:
biotools = get_biotools(root)
Expand All @@ -280,23 +260,16 @@ def get_tool_metadata(
metadata["Galaxy tool ids"].append(root.attrib["id"])
# get latest conda version and compare to the wrapper version
if metadata["Conda id"] is not None:
r = requests.get(
f'https://api.anaconda.org/package/bioconda/{metadata["Conda id"]}'
)
r = requests.get(f'https://api.anaconda.org/package/bioconda/{metadata["Conda id"]}')
if r.status_code == requests.codes.ok:
conda_info = r.json()
if "latest_version" in conda_info:
metadata["Conda version"] = conda_info["latest_version"]
if (
metadata["Conda version"]
== metadata["Galaxy wrapper version"]
):
if metadata["Conda version"] == metadata["Galaxy wrapper version"]:
metadata["Status"] = "Up-to-date"
# get bio.tool information
if metadata["bio.tool id"] is not None:
r = requests.get(
f'{BIOTOOLS_API_URL}/api/tool/{metadata["bio.tool id"]}/?format=json'
)
r = requests.get(f'{BIOTOOLS_API_URL}/api/tool/{metadata["bio.tool id"]}/?format=json')
if r.status_code == requests.codes.ok:
biotool_info = r.json()
if "function" in biotool_info:
Expand All @@ -310,9 +283,7 @@ def get_tool_metadata(
if "name" in biotool_info:
metadata["bio.tool name"] = biotool_info["name"]
if "description" in biotool_info:
metadata["bio.tool description"] = biotool_info[
"description"
].replace("\n", "")
metadata["bio.tool description"] = biotool_info["description"].replace("\n", "")
return metadata


Expand Down Expand Up @@ -347,9 +318,7 @@ def parse_tools(repo: Repository) -> List[Dict[str, Any]]:
for tool in folder:
# to avoid API request limit issue, wait for one hour
if g.get_rate_limit().core.remaining < 200:
print(
"WAITING for 1 hour to retrieve GitHub API request access !!!"
)
print("WAITING for 1 hour to retrieve GitHub API request access !!!")
print()
time.sleep(60 * 60)
# parse tool
Expand Down Expand Up @@ -392,15 +361,12 @@ def check_tools_on_servers(tool_ids: List[str]) -> pd.DataFrame:
:param tool_ids: galaxy tool ids
"""
assert all(
"/" not in tool_id for tool_id in tool_ids
), "This function only works on short tool ids"
assert all("/" not in tool_id for tool_id in tool_ids), "This function only works on short tool ids"
data: List[Dict[str, bool]] = []
for galaxy_url in GALAXY_SERVER_URLS:
installed_tool_ids = get_all_installed_tool_ids(galaxy_url)
installed_tool_short_ids = [
tool_id.split("/")[4] if "/" in tool_id else tool_id
for tool_id in installed_tool_ids
tool_id.split("/")[4] if "/" in tool_id else tool_id for tool_id in installed_tool_ids
]
d: Dict[str, bool] = {}
for tool_id in tool_ids:
Expand All @@ -425,9 +391,7 @@ def get_tool_count_per_server(tool_ids: str) -> pd.Series:
result_df: pd.DataFrame = pd.DataFrame()
result_df["true_count"] = data.sum(axis=1).astype(str)
result_df["false_count"] = len(data.columns)
result_df["counts"] = result_df.apply(
lambda x: "({}/{})".format(x["true_count"], x["false_count"]), axis=1
)
result_df["counts"] = result_df.apply(lambda x: "({}/{})".format(x["true_count"], x["false_count"]), axis=1)

count_row = result_df["counts"].T
return count_row
Expand All @@ -442,9 +406,7 @@ def add_instances_to_table(
:param table_path: path to tool table (must include
"Galaxy tool ids" column)
"""
new_table = table.join(
table["Galaxy tool ids"].apply(get_tool_count_per_server)
)
new_table = table.join(table["Galaxy tool ids"].apply(get_tool_count_per_server))
return new_table


Expand All @@ -455,9 +417,7 @@ def format_list_column(col: pd.Series) -> pd.Series:
return col.apply(lambda x: ", ".join(str(i) for i in x))


def export_tools(
tools: List[Dict], output_fp: str, format_list_col: bool = False
) -> None:
def export_tools(tools: List[Dict], output_fp: str, format_list_col: bool = False) -> None:
"""
Export tool metadata to tsv output file
Expand All @@ -467,9 +427,7 @@ def export_tools(
"""
df = pd.DataFrame(tools)
if format_list_col:
df["ToolShed categories"] = format_list_column(
df["ToolShed categories"]
)
df["ToolShed categories"] = format_list_column(df["ToolShed categories"])
df["EDAM operation"] = format_list_column(df["EDAM operation"])
df["EDAM topic"] = format_list_column(df["EDAM topic"])

Expand Down Expand Up @@ -516,9 +474,7 @@ def filter_tools(
subparser = parser.add_subparsers(dest="command")
# Extract tools
extractools = subparser.add_parser("extractools", help="Extract tools")
extractools.add_argument(
"--api", "-a", required=True, help="GitHub access token"
)
extractools.add_argument("--api", "-a", required=True, help="GitHub access token")
extractools.add_argument(
"--all_tools",
"-o",
Expand Down Expand Up @@ -580,9 +536,7 @@ def filter_tools(
export_tools(tools, args.all_tools, format_list_col=True)

elif args.command == "filtertools":
tools = pd.read_csv(
Path(args.tools), sep="\t", keep_default_na=False
).to_dict("records")
tools = pd.read_csv(Path(args.tools), sep="\t", keep_default_na=False).to_dict("records")
# get categories and tools to exclude
categories = read_file(args.categories)
excl_tools = read_file(args.exclude)
Expand Down

0 comments on commit a88e49c

Please sign in to comment.