From d7ac69f6b9e7402da68e12160a432b638e78b9cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9r=C3=A9nice=20Batut?= Date: Mon, 8 Jul 2024 16:54:02 +0200 Subject: [PATCH] Make wordcloud script more generic --- bin/create_wordcloud.py | 76 +++++++++++++++++++++++++---------------- bin/format_tools.sh | 5 +-- 2 files changed, 49 insertions(+), 32 deletions(-) diff --git a/bin/create_wordcloud.py b/bin/create_wordcloud.py index 4859c022..505d948f 100644 --- a/bin/create_wordcloud.py +++ b/bin/create_wordcloud.py @@ -1,6 +1,7 @@ #!/usr/bin/env python import argparse +from typing import Dict import matplotlib.pyplot as plt import numpy as np @@ -9,33 +10,39 @@ from wordcloud import WordCloud -def get_wordcloud(community_tool_path: str, mask_figure: str, stats_column: str, wordcloud_output_path: str) -> None: +def prepare_data(table_path: str, name_col: str, stat_col: str) -> Dict: """ - Generate a wordcloud based on the counts for each Galaxy wrapper id + Prepare data to create dictionary with key being the name and + value the stat/counts - :param community_tool_path: Dataframe that must + :param table_path: Path TSV file with name and stats have the columns "Galaxy wrapper id" and `stats_column` - :param mask_figure: a figure that is used to render the wordcloud - E.g. a nice shape to highlight your community - :param stats_column: Column name of the - column with usage statistics in the table - :param wordcloud_output_path: Path to store the wordcloud + :param name_col: Name of the column with name for wordcloud + :param stat_col: Name of columns with usage/count """ + table = pd.read_csv(table_path, sep="\t") - community_tool_stats = pd.read_csv(community_tool_path, sep="\t") - - assert ( - stats_column in community_tool_stats - ), f"Stats column: {stats_column} not found in table!" # check if the stats column is there + assert stat_col in table, f"Stat column: {stat_col} not found in table!" + assert name_col in table, f"Name column: {name_col} not found in table!" # some tools are not used at all - community_tool_stats[stats_column] = community_tool_stats[stats_column].fillna(value=0) + table[stat_col] = table[stat_col].fillna(value=0) + + # create dictionary with key being the name and value the stat/counts + freq = pd.Series(table[stat_col].values, index=table[name_col]).to_dict() + + return freq - # create the word cloud - frec = pd.Series( - community_tool_stats[stats_column].values, index=community_tool_stats["Galaxy wrapper id"] - ).to_dict() +def generate_wordcloud(freq: dict, mask_figure: str) -> WordCloud: + """ + Generate a wordcloud based on counts + + :param freq: Dictionary with key being the name and + value the stat/counts + :param mask_figure: a figure that is used to render the wordcloud + E.g. a nice shape to highlight your community + """ mask = np.array(Image.open(mask_figure)) mask[mask == 0] = 255 # set 0 in array to 255 to work with wordcloud @@ -44,34 +51,42 @@ def get_wordcloud(community_tool_path: str, mask_figure: str, stats_column: str, background_color="rgba(255, 255, 255, 0)", random_state=42, ) + wc.generate_from_frequencies(freq) + return wc - wc.generate_from_frequencies(frec) +def export_wordcloud(wc: WordCloud, wordcloud_output_path: str) -> None: + """ + Export wordcloud to file + + :param wordcloud_output_path: Path to store the wordcloud + """ fig, ax = plt.subplots(figsize=(13, 5)) ax.imshow(wc) - plt.axis("off") plt.tight_layout(pad=0) - plt.savefig(wordcloud_output_path) if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Create wordcloud from \ - TSV file based on Galaxy usage statistics" - ) + parser = argparse.ArgumentParser(description="Create wordcloud from TSV file") parser.add_argument( "--table", "-ta", required=True, - help="Path to TSV file with tools and stats", + help="Path to TSV file with name and stats", ) parser.add_argument( - "--stats_column", + "--name_col", + "-nc", + required=True, + help="Name of the column with name to use in wordcloud", + ) + parser.add_argument( + "--stat_col", "-sc", required=True, - help="Name of the column with usage statistics", + help="Name of the column with statistic to build the wordcloud", ) parser.add_argument( "--output", @@ -79,8 +94,9 @@ def get_wordcloud(community_tool_path: str, mask_figure: str, stats_column: str, required=True, help="Path to HTML output", ) - parser.add_argument("--wordcloud_mask", "-wcm", required=False, help="Mask figure to generate the wordcloud") args = parser.parse_args() - get_wordcloud(args.table, args.wordcloud_mask, args.stats_column, args.output) + frec = prepare_data(args.table, args.name_col, args.stat_col) + wc = generate_wordcloud(frec, args.wordcloud_mask) + export_wordcloud(wc, args.output) diff --git a/bin/format_tools.sh b/bin/format_tools.sh index 75fe3055..16105f80 100755 --- a/bin/format_tools.sh +++ b/bin/format_tools.sh @@ -9,6 +9,7 @@ python bin/create_interactive_table.py \ python bin/create_wordcloud.py \ --table "results/all_tools.tsv" \ + --name_col "Galaxy wrapper id" \ + --stat_col "No. of tool users (2022-2023) (usegalaxy.eu)" \ --wordcloud_mask "data/usage_stats/wordcloud_mask.png" \ - --output "results/all_tools_wordcloud.png" \ - --stats_column "No. of tool users (2022-2023) (usegalaxy.eu)" \ No newline at end of file + --output "results/all_tools_wordcloud.png" \ \ No newline at end of file