From 071f01b9d7b78fe52c70de87e15c027acc35c939 Mon Sep 17 00:00:00 2001
From: paulzierep
Date: Thu, 23 Nov 2023 16:43:03 +0100
Subject: [PATCH] fetch tools stepwise with CI

---
Reviewer notes (this section is ignored by `git am`):

* Workflow `run:` steps no longer wrap the whole command line in single
  quotes; quoted that way the shell looks for one command whose name is the
  entire string and the step fails with "command not found".
* The matrix job now calls bin/extract_all_tools_stepwise.sh, the script added
  by this patch that reads the repository subset from "$1".
* bin/extract_all_tools_merge.sh: the glob was single-quoted and therefore
  never expanded. It now expands and only matches the per-subset files
  (results/repositories*_tools.tsv), so results/all_tools.tsv can never be
  both input and output of the same `cat`.
* bin/extract_all_tools_stepwise.sh: variable expansions are quoted.
* TODO(review): fetch-all-tools-merge declares no `needs:` on
  fetch-all-tools-stepwise, and the matrix jobs all push to the same branch;
  confirm ordering/pushing strategy in a follow-up.
* TODO(review): if each per-subset TSV starts with a header row, plain
  concatenation repeats it inside all_tools.tsv; confirm against the writer.

 .github/workflows/fetch_all_tools.yaml | 35 +++++++++++++++++++++++---
 bin/extract_all_tools_downstream.sh    |  8 ++++++
 bin/extract_all_tools_merge.sh         |  4 +++
 bin/extract_all_tools_stepwise.sh      | 12 +++++++++
 bin/extract_galaxy_tools.py            | 23 +++++++++++++----
 5 files changed, 74 insertions(+), 8 deletions(-)
 create mode 100644 bin/extract_all_tools_downstream.sh
 create mode 100644 bin/extract_all_tools_merge.sh
 create mode 100755 bin/extract_all_tools_stepwise.sh

diff --git a/.github/workflows/fetch_all_tools.yaml b/.github/workflows/fetch_all_tools.yaml
index 1564de39..207cf340 100644
--- a/.github/workflows/fetch_all_tools.yaml
+++ b/.github/workflows/fetch_all_tools.yaml
@@ -16,8 +16,17 @@ permissions:
   contents: write
 
 jobs:
-  fetch-all-tools:
+  fetch-all-tools-stepwise:
     runs-on: ubuntu-latest
+    name: Fetch all tool stepwise
+    strategy:
+      matrix:
+        python-version: [3.8]
+        subset:
+          - repositories01.list
+          - repositories02.list
+          - repositories03.list
+          - repositories04.list
     steps:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v4
@@ -26,8 +35,7 @@ jobs:
       - name: Run script
         # run: bash bin/extract_all_tools.sh
         run: |
-          chmod +x bin/extract_all_tools.sh
-          bin/extract_all_tools.sh
+          bash ./bin/extract_all_tools_stepwise.sh "${{ matrix.subset }}"
         env:
           GITHUB_API_KEY: ${{ secrets.GITHUB_TOKEN }}
       - name: Commit all tools
@@ -37,3 +45,24 @@ jobs:
           git add results
           git commit -m "fetch all tools bot"
           git push
+  fetch-all-tools-merge:
+    runs-on: ubuntu-latest
+    name: Fetch all tools merge
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+      - name: Install requirement
+        run: python -m pip install -r requirements.txt
+      - name: Run script
+        run: |
+          bash ./bin/extract_all_tools_merge.sh
+          bash ./bin/extract_all_tools_downstream.sh
+        env:
+          GITHUB_API_KEY: ${{ secrets.GITHUB_TOKEN }}
+      - name: Commit all tools
+        run: |
+          git config user.name github-actions
+          git config user.email github-actions@github.com
+          git add results
+          git commit -m "fetch all tools bot"
+          git push
\ No newline at end of file
diff --git a/bin/extract_all_tools_downstream.sh b/bin/extract_all_tools_downstream.sh
new file mode 100644
index 00000000..aca00afa
--- /dev/null
+++ b/bin/extract_all_tools_downstream.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+mkdir -p 'results/'
+
+python bin/create_interactive_table.py \
+        --table "results/all_tools.tsv" \
+        --template "data/interactive_table_template.html" \
+        --output "results/index.html"
\ No newline at end of file
diff --git a/bin/extract_all_tools_merge.sh b/bin/extract_all_tools_merge.sh
new file mode 100644
index 00000000..99bac2ca
--- /dev/null
+++ b/bin/extract_all_tools_merge.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+
+cat results/repositories*_tools.tsv > 'results/all_tools.tsv'
+
diff --git a/bin/extract_all_tools_stepwise.sh b/bin/extract_all_tools_stepwise.sh
new file mode 100755
index 00000000..7b9cac75
--- /dev/null
+++ b/bin/extract_all_tools_stepwise.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+
+mkdir -p 'results/'
+
+output="results/${1}_tools.tsv"
+
+python bin/extract_galaxy_tools.py \
+        extractools \
+        --api "$GITHUB_API_KEY" \
+        --all_tools "$output" \
+        --planemorepository "$1"
+
diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py
index 54bfbcea..098b7d19 100644
--- a/bin/extract_galaxy_tools.py
+++ b/bin/extract_galaxy_tools.py
@@ -50,18 +50,27 @@ def get_string_content(cf: ContentFile) -> str:
     return base64.b64decode(cf.content).decode("utf-8")
 
 
-def get_tool_github_repositories(g: Github) -> List[str]:
+def get_tool_github_repositories(g: Github, RepoSelection: Optional[str]) -> List[str]:
     """
     Get list of tool GitHub repositories to parse
 
     :param g: GitHub instance
+    :param RepoSelection: The selection to use from the repository (needed to split the process for CI jobs)
     """
+
     repo = g.get_user("galaxyproject").get_repo("planemo-monitor")
     repo_list: List[str] = []
     for i in range(1, 5):
-        repo_f = repo.get_contents(f"repositories0{i}.list")
-        repo_l = get_string_content(repo_f).rstrip()
-        repo_list.extend(repo_l.split("\n"))
+        repo_selection = f"repositories0{i}.list"
+        if RepoSelection:  # only get these repositories
+            if RepoSelection == repo_selection:
+                repo_f = repo.get_contents(repo_selection)
+                repo_l = get_string_content(repo_f).rstrip()
+                repo_list.extend(repo_l.split("\n"))
+        else:
+            repo_f = repo.get_contents(repo_selection)
+            repo_l = get_string_content(repo_f).rstrip()
+            repo_list.extend(repo_l.split("\n"))
     return repo_list
 
 
@@ -390,6 +399,10 @@ def filter_tools(tools: List[Dict], ts_cat: List[str], excluded_tools: List[str]
     extractools = subparser.add_parser("extractools", help="Extract tools")
     extractools.add_argument("--api", "-a", required=True, help="GitHub access token")
     extractools.add_argument("--all_tools", "-o", required=True, help="Filepath to TSV with all extracted tools")
+    extractools.add_argument(
+        "--planemorepository", "-pr", required=False, help="Repository list to use from the planemo-monitor repository"
+    )
+
     # Filter tools
     filtertools = subparser.add_parser("filtertools", help="Filter tools")
     filtertools.add_argument(
@@ -412,7 +425,7 @@ def filter_tools(tools: List[Dict], ts_cat: List[str], excluded_tools: List[str]
         # connect to GitHub
         g = Github(args.api)
         # get list of GitHub repositories to parse
-        repo_list = get_tool_github_repositories(g)
+        repo_list = get_tool_github_repositories(g, args.planemorepository)
         # parse tools in GitHub repositories to extract metada, filter by TS categories and export to output file
         tools: List[Dict] = []
         for r in repo_list: