Skip to content

Commit

Permalink
Make wordcloud script more generic
Browse files Browse the repository at this point in the history
  • Loading branch information
bebatut committed Jul 8, 2024
1 parent b504192 commit d7ac69f
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 32 deletions.
76 changes: 46 additions & 30 deletions bin/create_wordcloud.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python

import argparse
from typing import Dict

import matplotlib.pyplot as plt
import numpy as np
Expand All @@ -9,33 +10,39 @@
from wordcloud import WordCloud


def get_wordcloud(community_tool_path: str, mask_figure: str, stats_column: str, wordcloud_output_path: str) -> None:
def prepare_data(table_path: str, name_col: str, stat_col: str) -> Dict:
    """
    Load a TSV table and map each name to its stat/count

    Empty cells in the stat column are counted as 0
    (e.g. tools that are not used at all).

    :param table_path: Path to the TSV file with name and stats
    :param name_col: Name of the column with name for wordcloud
    :param stat_col: Name of the column with usage/count
    :return: Dictionary with key being the name and value the stat/count
    :raises ValueError: If `stat_col` or `name_col` is not a column of the table
    """
    table = pd.read_csv(table_path, sep="\t")

    # Explicit checks rather than `assert`: assertions are removed when Python
    # runs with -O, which would let a wrong column name fail later and less clearly.
    if stat_col not in table.columns:
        raise ValueError(f"Stat column: {stat_col} not found in table!")
    if name_col not in table.columns:
        raise ValueError(f"Name column: {name_col} not found in table!")

    # some tools are not used at all
    stats = table[stat_col].fillna(value=0)

    # create dictionary with key being the name and value the stat/counts
    return pd.Series(stats.values, index=table[name_col]).to_dict()

# create the word cloud
frec = pd.Series(
community_tool_stats[stats_column].values, index=community_tool_stats["Galaxy wrapper id"]
).to_dict()

def generate_wordcloud(freq: dict, mask_figure: str) -> WordCloud:
"""
Generate a wordcloud based on counts
:param freq: Dictionary with key being the name and
value the stat/counts
:param mask_figure: a figure that is used to render the wordcloud
E.g. a nice shape to highlight your community
"""
mask = np.array(Image.open(mask_figure))
mask[mask == 0] = 255 # set 0 in array to 255 to work with wordcloud

Expand All @@ -44,43 +51,52 @@ def get_wordcloud(community_tool_path: str, mask_figure: str, stats_column: str,
background_color="rgba(255, 255, 255, 0)",
random_state=42,
)
wc.generate_from_frequencies(freq)
return wc

wc.generate_from_frequencies(frec)

def export_wordcloud(wc: WordCloud, wordcloud_output_path: str) -> None:
    """
    Export wordcloud to file

    :param wc: Wordcloud to draw
    :param wordcloud_output_path: Path to store the wordcloud
    """
    fig, ax = plt.subplots(figsize=(13, 5))
    try:
        ax.imshow(wc)

        # Work on the explicit figure/axes handles instead of pyplot's
        # implicit "current" figure, so other open figures cannot interfere.
        ax.axis("off")
        fig.tight_layout(pad=0)

        fig.savefig(wordcloud_output_path)
    finally:
        # pyplot keeps every figure alive until it is closed; release it so
        # repeated calls do not accumulate figures in memory.
        plt.close(fig)


if __name__ == "__main__":
    # Command-line entry point: read the table, build the wordcloud, save it.
    parser = argparse.ArgumentParser(description="Create wordcloud from TSV file")
    parser.add_argument(
        "--table",
        "-ta",
        required=True,
        help="Path to TSV file with name and stats",
    )
    parser.add_argument(
        "--name_col",
        "-nc",
        required=True,
        help="Name of the column with name to use in wordcloud",
    )
    parser.add_argument(
        "--stat_col",
        "-sc",
        required=True,
        help="Name of the column with statistic to build the wordcloud",
    )
    parser.add_argument(
        "--output",
        "-out",
        required=True,
        # The wordcloud is saved as an image by export_wordcloud, not as HTML.
        help="Path to the output image file (e.g. PNG)",
    )
    # The mask is opened unconditionally by generate_wordcloud, so omitting it
    # could only end in a crash; require it to get a clear usage error instead.
    parser.add_argument("--wordcloud_mask", "-wcm", required=True, help="Mask figure to generate the wordcloud")

    args = parser.parse_args()
    freq = prepare_data(args.table, args.name_col, args.stat_col)
    wc = generate_wordcloud(freq, args.wordcloud_mask)
    export_wordcloud(wc, args.output)
5 changes: 3 additions & 2 deletions bin/format_tools.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ python bin/create_interactive_table.py \

# Build the wordcloud image from the tool table; the last argument line must
# not end with a backslash, or the following line would be joined to this command.
python bin/create_wordcloud.py \
    --table "results/all_tools.tsv" \
    --name_col "Galaxy wrapper id" \
    --stat_col "No. of tool users (2022-2023) (usegalaxy.eu)" \
    --wordcloud_mask "data/usage_stats/wordcloud_mask.png" \
    --output "results/all_tools_wordcloud.png"

0 comments on commit d7ac69f

Please sign in to comment.