
Commit

Merge branch 'main' of github.com:galaxyproject/galaxy_tool_metadata_extractor into json_export

bebatut committed Jun 3, 2024
2 parents e667e1e + a4c7c5c commit 30f1c13
Showing 24 changed files with 9,886 additions and 9,112 deletions.
8 changes: 8 additions & 0 deletions CODE_OF_CONDUCT.md
@@ -0,0 +1,8 @@
Code of Conduct
===============

As part of the Galaxy Community, this project is committed to providing a
welcoming and harassment-free experience for everyone. We therefore expect
participants to abide by our Code of Conduct, which can be found at:

https://galaxyproject.org/community/coc/
15 changes: 10 additions & 5 deletions README.md
@@ -95,17 +95,22 @@ The script will generate a TSV file with each tool found in the list of GitHub r
1. Run the extraction as explained before
2. (Optional) Create a text file with ToolShed categories for which tools need to be extracted: 1 ToolShed category per row ([example for microbial data analysis](data/microgalaxy/categories))
3. (Optional) Create a text file with list of tools to exclude: 1 tool id per row ([example for microbial data analysis](data/microgalaxy/tools_to_exclude))
4. (Optional) Create a text file with list of tools to really keep (already reviewed): 1 tool id per row ([example for microbial data analysis](data/microgalaxy/tools_to_keep))
3. (Optional) Create a TSV (tabular) file with tool status (1 tool suite per row) with 3 columns (see the sketch after the command below):
- ToolShed ids of tool suites (one per line)
- Boolean with True to keep and False to exclude
- Boolean with True if deprecated and False if not
[Example for microbial data analysis](data/microgalaxy/tools_to_keep_exclude.tsv)
4. Run the tool extractor script
```
$ python bin/extract_galaxy_tools.py \
filtertools \
--tools <Path to JSON file with all extracted tools> \
--filtered_tools <Path to output TSV file with filtered tools> \
--ts-filtered-tools <Path to output TSV with tools filtered based on ToolShed category> \
--filtered-tools <Path to output TSV with filtered tools based on ToolShed category and manual curation> \
[--categories <Path to ToolShed category file>] \
[--excluded <Path to excluded tool file category file>]\
[--keep <Path to to-keep tool file category file>]
[--status <Path to a TSV file with tool status - 3 columns: ToolShed ids of tool suites, Boolean with True to keep and False to exclude, Boolean with True if deprecated and False if not>]
```
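
For reference, a minimal sketch of the tool status file described in step 3 (tab-separated, no header, one tool suite per row; the tool suite ids below are made up for illustration):
```
suite_a	True	False
suite_b	False	False
suite_c	True	True
```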
## Development
2 changes: 1 addition & 1 deletion bin/extract_all_tools.sh
@@ -5,7 +5,7 @@ mkdir -p 'results/'
python bin/extract_galaxy_tools.py \
extractools \
--api $GITHUB_API_KEY \
--all_tools 'results/all_tools.tsv' \
--all-tools 'results/all_tools.tsv' \
--all-tools-json 'results/all_tools.json'

python bin/create_interactive_table.py \
17 changes: 14 additions & 3 deletions bin/extract_all_tools_stepwise.sh
@@ -5,10 +5,21 @@ mkdir -p 'results/'
tsv_output="results/${1}_tools.tsv"
json_output="results/${1}_tools.json"

python bin/extract_galaxy_tools.py \
if [[ $1 =~ "01" ]]; then
python bin/extract_galaxy_tools.py \
extractools \
--api $GITHUB_API_KEY \
--all_tools $tsv_output \
--all-tools $tsv_output \
--all-tools-json $json_output \
--planemorepository $1
--planemo-repository-list $1
else
python bin/extract_galaxy_tools.py \
extractools \
--api $GITHUB_API_KEY \
--all-tools $tsv_output \
--planemo-repository-list $1 \
--avoid-extra-repositories
fi



4 changes: 2 additions & 2 deletions bin/extract_all_tools_test.sh
@@ -8,8 +8,8 @@ json_output="results/${1}_tools.json"
python bin/extract_galaxy_tools.py \
extractools \
--api $GITHUB_API_KEY \
--all_tools $tsv_output \
--all-tools $tsv_output \
--all-tools-json $json_output \
--planemorepository $1 \
--planemo-repository-list $1 \
--test

95 changes: 56 additions & 39 deletions bin/extract_galaxy_tools.py
@@ -124,14 +124,14 @@ def get_string_content(cf: ContentFile) -> str:


def get_tool_github_repositories(
g: Github, RepoSelection: Optional[str], run_test: bool, add_extra_repositories: bool = True
g: Github, repository_list: Optional[str], run_test: bool, add_extra_repositories: bool = True
) -> List[str]:
"""
Get list of tool GitHub repositories to parse
:param g: GitHub instance
:param RepoSelection: The selection to use from the repository (needed to split the process for CI jobs)
:run_test: for testing only parse the repository
:param repository_list: The selection to use from the repository (needed to split the process for CI jobs)
:param run_test: for testing only parse the repository
"""

if run_test:
@@ -141,8 +141,8 @@ def get_tool_github_repositories(
repo_list: List[str] = []
for i in range(1, 5):
repo_selection = f"repositories0{i}.list"
if RepoSelection: # only get these repositories
if RepoSelection == repo_selection:
if repository_list: # only get these repositories
if repository_list == repo_selection:
repo_f = repo.get_contents(repo_selection)
repo_l = get_string_content(repo_f).rstrip()
repo_list.extend(repo_l.split("\n"))
@@ -571,12 +571,11 @@ def export_tools_to_tsv(
:param output_fp: path to output file
:param format_list_col: boolean indicating if list columns should be formatted
"""
df = pd.DataFrame(tools)
df = pd.DataFrame(tools).sort_values("Galaxy wrapper id")
if format_list_col:
df["ToolShed categories"] = format_list_column(df["ToolShed categories"])
df["EDAM operation"] = format_list_column(df["EDAM operation"])
df["EDAM topic"] = format_list_column(df["EDAM topic"])

df["bio.tool ids"] = format_list_column(df["bio.tool ids"])

# the Galaxy tools need to be formatted for the add_instances_to_table to work
@@ -592,30 +591,33 @@ def filter_tools(
def filter_tools(
tools: List[Dict],
ts_cat: List[str],
excluded_tools: List[str],
keep_tools: List[str],
) -> List[Dict]:
tool_status: Dict,
) -> tuple:
"""
Filter tools for specific ToolShed categories and add information if to keep or to exclude
:param tools: dictionary with tools and their metadata
:param ts_cat: list of ToolShed categories to keep in the extraction
:param excluded_tools: list of tools to skip
:param keep_tools: list of tools to keep
:param tool_status: dictionary with tools and their two status flags: Keep and Deprecated
"""
ts_filtered_tools = []
filtered_tools = []
for tool in tools:
# filter ToolShed categories and leave function if not in expected categories
if check_categories(tool["ToolShed categories"], ts_cat):
name = tool["Galaxy wrapper id"]
tool["Reviewed"] = name in keep_tools or name in excluded_tools
tool["To keep"] = None
if name in keep_tools:
tool["To keep"] = True
elif name in excluded_tools:
tool["To keep"] = False
filtered_tools.append(tool)
return filtered_tools
tool["Reviewed"] = name in tool_status
keep = None
deprecated = None
if name in tool_status:
keep = tool_status[name][1]
deprecated = tool_status[name][2]
tool["Deprecated"] = deprecated
if keep: # only add tools that are manually marked as to keep
filtered_tools.append(tool)
tool["To keep"] = keep
ts_filtered_tools.append(tool)
return ts_filtered_tools, filtered_tools


if __name__ == "__main__":
@@ -626,12 +628,21 @@
# Extract tools
extractools = subparser.add_parser("extractools", help="Extract tools")
extractools.add_argument("--api", "-a", required=True, help="GitHub access token")
extractools.add_argument("--all-tools-json", "-j", required=True, help="Filepath to JSON with all extracted tools")
extractools.add_argument("--all_tools", "-o", required=True, help="Filepath to TSV with all extracted tools")
extractools.add_argument("--all-tools", "-o", required=True, help="Filepath to TSV with all extracted tools")
extractools.add_argument(
"--planemorepository", "-pr", required=False, help="Repository list to use from the planemo-monitor repository"
"--planemo-repository-list",
"-pr",
required=False,
help="Repository list to use from the planemo-monitor repository",
)
extractools.add_argument(
"--avoid-extra-repositories",
"-e",
action="store_true",
default=False,
required=False,
help="Do not parse extra repositories in conf file",
)

extractools.add_argument(
"--test",
"-t",
@@ -645,38 +656,44 @@ def filter_tools(
filtertools = subparser.add_parser("filtertools", help="Filter tools")
filtertools.add_argument(
"--tools",
"-t",
"-i",
required=True,
help="Filepath to JSON with all extracted tools, generated by extractools command",
)
filtertools.add_argument(
"--filtered_tools",
"--ts-filtered-tools",
"-t",
required=True,
help="Filepath to TSV with tools filtered based on ToolShed category",
)
filtertools.add_argument(
"--filtered-tools",
"-f",
required=True,
help="Filepath to TSV with filtered tools",
help="Filepath to TSV with tools filtered based on ToolShed category and manual curation",
)
filtertools.add_argument(
"--categories",
"-c",
help="Path to a file with ToolShed category to keep in the extraction (one per line)",
)
filtertools.add_argument(
"--exclude",
"-e",
help="Path to a file with ToolShed ids of tools to exclude (one per line)",
)
filtertools.add_argument(
"--keep",
"-k",
help="Path to a file with ToolShed ids of tools to keep (one per line)",
"--status",
"-s",
help="Path to a TSV file with tool status - 3 columns: ToolShed ids of tool suites, Boolean with True to keep and False to exclude, Boolean with True if deprecated and False if not",
)
args = parser.parse_args()

if args.command == "extractools":
# connect to GitHub
g = Github(args.api)
# get list of GitHub repositories to parse
repo_list = get_tool_github_repositories(g, args.planemorepository, args.test)
repo_list = get_tool_github_repositories(
g=g,
repository_list=args.planemo_repository_list,
run_test=args.test,
add_extra_repositories=not args.avoid_extra_repositories,
)
# parse tools in GitHub repositories to extract metadata, filter by TS categories and export to output file
tools: List[Dict] = []
for r in repo_list:
@@ -699,8 +716,8 @@ def filter_tools(
tools = json.load(f)
# get categories and tools to exclude
categories = read_file(args.categories)
excl_tools = read_file(args.exclude)
keep_tools = read_file(args.keep)
status = pd.read_csv(args.status, sep="\t", index_col=0, header=None).to_dict("index")
# filter tool lists
filtered_tools = filter_tools(tools, categories, excl_tools, keep_tools)
ts_filtered_tools, filtered_tools = filter_tools(tools, categories, status)
export_tools_to_tsv(ts_filtered_tools, args.ts_filtered_tools, format_list_col=True)
export_tools_to_tsv(filtered_tools, args.filtered_tools, format_list_col=True)
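
For illustration only (not part of this commit), here is a minimal sketch of how the new `--status` TSV feeds `filter_tools`; the file names, the ToolShed category, and the import path are assumptions:

```python
import json

import pandas as pd

# assuming bin/ is on the import path; filter_tools lives in bin/extract_galaxy_tools.py
from extract_galaxy_tools import filter_tools

# tool status TSV: one tool suite per row, no header,
# columns = ToolShed id, keep flag, deprecated flag
status = pd.read_csv("tool_status.tsv", sep="\t", index_col=0, header=None).to_dict("index")
# e.g. {"suite_a": {1: True, 2: False}, "suite_b": {1: False, 2: True}}
# filter_tools reads the keep flag as status[name][1] and the deprecated flag as status[name][2]

with open("all_tools.json") as f:
    tools = json.load(f)

# keep only tools in the chosen ToolShed categories, then split them into the
# category-filtered list and the manually curated ("to keep") list
ts_filtered, curated = filter_tools(tools, ["Metagenomics"], status)
```

This mirrors the `filtertools` branch of the `__main__` block above, with hard-coded paths in place of the command-line arguments.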
6 changes: 3 additions & 3 deletions bin/get_community_tools.sh
@@ -12,10 +12,10 @@ for com_data_fp in data/communities/* ; do
python bin/extract_galaxy_tools.py \
filtertools \
--tools "results/all_tools.json" \
--filtered_tools "results/$community/tools.tsv" \
--ts-filtered-tools "results/$community/tools_filtered_by_ts_categories.tsv" \
--filtered-tools "results/$community/tools.tsv" \
--categories "data/communities/$community/categories" \
--exclude "data/communities/$community/tools_to_exclude" \
--keep "data/communities/$community/tools_to_keep"
--status "data/communities/$community/tool_status.tsv"

python bin/create_interactive_table.py \
--table "results/$community/tools.tsv" \
File renamed without changes.
Empty file.
