Skip to content

Commit

Permalink
Merge list of tools to keep and to exclude in 1 file and extract ts f…
Browse files Browse the repository at this point in the history
…iltered tools
  • Loading branch information
bebatut committed May 24, 2024
1 parent e81694a commit 44d06f3
Show file tree
Hide file tree
Showing 11 changed files with 4,071 additions and 1,524 deletions.
11 changes: 5 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,17 +95,16 @@ The script will generate a TSV file with each tool found in the list of GitHub r
1. Run the extraction as explained before
2. (Optional) Create a text file with ToolShed categories for which tools need to be extracted: 1 ToolShed category per row ([example for microbial data analysis](data/microgalaxy/categories))
3. (Optional) Create a text file with list of tools to exclude: 1 tool id per row ([example for microbial data analysis](data/microgalaxy/tools_to_exclude))
4. (Optional) Create a text file with list of tools to really keep (already reviewed): 1 tool id per row ([example for microbial data analysis](data/microgalaxy/tools_to_keep))
3. (Optional) Create a TSV (tabular) file with 2 columns and 1 tool suite per row: the ToolShed id of the tool suite, and a Boolean (True to keep, False to exclude) ([example for microbial data analysis](data/microgalaxy/tools_to_keep_exclude.tsv))
4. Run the tool extractor script
```
$ python bin/extract_galaxy_tools.py \
$ python bin/extract_galaxy_tools.py filtertools \
--tools <Path to TSV file with all extracted tools> \
--filtered_tools <Path to output CSV file with filtered tools> \
--ts_filtered_tools <Path to output TSV with tools filtered based on ToolShed category> \
--filtered_tools <Path to output TSV with filtered tools based on ToolShed category and manual curation> \
[--categories <Path to ToolShed category file>] \
[--excluded <Path to excluded tool file category file>]\
[--keep <Path to to-keep tool file category file>]
[--keep_exclude <Path to a TSV file with 2 columns: ToolShed ids of tool suites (one per line), Boolean with True to keep and False to exclude>]
```
## Development
Expand Down
58 changes: 31 additions & 27 deletions bin/extract_galaxy_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,12 +559,14 @@ def export_tools(
:param output_fp: path to output file
:param format_list_col: boolean indicating if list columns should be formatted
"""
df = pd.DataFrame(tools)
df = (
pd.DataFrame(tools)
.sort_values("Galaxy wrapper id")
)
if format_list_col:
df["ToolShed categories"] = format_list_column(df["ToolShed categories"])
df["EDAM operation"] = format_list_column(df["EDAM operation"])
df["EDAM topic"] = format_list_column(df["EDAM topic"])

df["bio.tool ids"] = format_list_column(df["bio.tool ids"])

# the Galaxy tools need to be formatted for the add_instances_to_table to work
Expand All @@ -580,30 +582,31 @@ def export_tools(
def filter_tools(
tools: List[Dict],
ts_cat: List[str],
excluded_tools: List[str],
keep_tools: List[str],
) -> List[Dict]:
keep_excl_tools: List[Dict],
) -> tuple:
"""
Filter tools for specific ToolShed categories and add information if to keep or to exclude
:param tools: dictionary with tools and their metadata
:param ts_cat: list of ToolShed categories to keep in the extraction
:param excluded_tools: list of tools to skip
:param keep_tools: list of tools to keep
:param keep_excl_tools: dictionary with tools and their status (True to keep, False to exclude)
"""
ts_filtered_tools = []
filtered_tools = []
for tool in tools:
# filter ToolShed categories and leave function if not in expected categories
if check_categories(tool["ToolShed categories"], ts_cat):
name = tool["Galaxy wrapper id"]
tool["Reviewed"] = name in keep_tools or name in excluded_tools
tool["To keep"] = None
if name in keep_tools:
tool["To keep"] = True
elif name in excluded_tools:
tool["To keep"] = False
filtered_tools.append(tool)
return filtered_tools
tool["Reviewed"] = name in keep_excl_tools
keep_status = None
if name in keep_excl_tools:
keep_status = keep_excl_tools[name][1]
if keep_status:
filtered_tools.append(tool)
tool["To keep"] = keep_status
ts_filtered_tools.append(tool)
return ts_filtered_tools, filtered_tools


if __name__ == "__main__":
Expand Down Expand Up @@ -632,30 +635,31 @@ def filter_tools(
filtertools = subparser.add_parser("filtertools", help="Filter tools")
filtertools.add_argument(
"--tools",
"-t",
"-i",
required=True,
help="Filepath to TSV with all extracted tools, generated by extractools command",
)
filtertools.add_argument(
"--ts_filtered_tools",
"-t",
required=True,
help="Filepath to TSV with tools filtered based on ToolShed category",
)
filtertools.add_argument(
"--filtered_tools",
"-f",
required=True,
help="Filepath to TSV with filtered tools",
help="Filepath to TSV with tools filtered based on ToolShed category and manual curation",
)
filtertools.add_argument(
"--categories",
"-c",
help="Path to a file with ToolShed category to keep in the extraction (one per line)",
)
filtertools.add_argument(
"--exclude",
"-e",
help="Path to a file with ToolShed ids of tools to exclude (one per line)",
)
filtertools.add_argument(
"--keep",
"--keep_exclude",
"-k",
help="Path to a file with ToolShed ids of tools to keep (one per line)",
help="Path to a TSV file with 2 columns: ToolShed ids of tool suites (one per line), Boolean with True to keep and False to exclude ",
)
args = parser.parse_args()

Expand All @@ -681,11 +685,11 @@ def filter_tools(
export_tools(tools, args.all_tools, format_list_col=True, add_usage_stats=True)

elif args.command == "filtertools":
tools = pd.read_csv(Path(args.tools), sep="\t", keep_default_na=False).to_dict("records")
tools = pd.read_csv(args.tools, sep="\t", keep_default_na=False).to_dict("records")
# get categories and tools to exclude
categories = read_file(args.categories)
excl_tools = read_file(args.exclude)
keep_tools = read_file(args.keep)
keep_excl_tools = pd.read_csv(args.keep_exclude, sep="\t", index_col=0, header=None).to_dict("index")
# filter tool lists
filtered_tools = filter_tools(tools, categories, excl_tools, keep_tools)
ts_filtered_tools, filtered_tools = filter_tools(tools, categories, keep_excl_tools)
export_tools(ts_filtered_tools, args.ts_filtered_tools)
export_tools(filtered_tools, args.filtered_tools)
4 changes: 2 additions & 2 deletions bin/get_community_tools.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ for com_data_fp in data/communities/* ; do
python bin/extract_galaxy_tools.py \
filtertools \
--tools "results/all_tools.tsv" \
--ts_filtered_tools "results/$community/tools_filtered_by_ts_categories.tsv" \
--filtered_tools "results/$community/tools.tsv" \
--categories "data/communities/$community/categories" \
--exclude "data/communities/$community/tools_to_exclude" \
--keep "data/communities/$community/tools_to_keep"
--keep_exclude "data/communities/$community/tools_to_keep_exclude.tsv"

python bin/create_interactive_table.py \
--table "results/$community/tools.tsv" \
Expand Down
Empty file.
File renamed without changes.
Loading

0 comments on commit 44d06f3

Please sign in to comment.