From 2e05928a6282c24bf45eac0e6b4134fa3a73fd94 Mon Sep 17 00:00:00 2001 From: Emily Przykucki Date: Tue, 16 Jul 2024 13:37:59 -0400 Subject: [PATCH 1/8] Update utils.py --- flepimop/gempyor_pkg/src/gempyor/utils.py | 204 ++++++++++++++++++++-- 1 file changed, 186 insertions(+), 18 deletions(-) diff --git a/flepimop/gempyor_pkg/src/gempyor/utils.py b/flepimop/gempyor_pkg/src/gempyor/utils.py index 4d3209061..fdebcc106 100644 --- a/flepimop/gempyor_pkg/src/gempyor/utils.py +++ b/flepimop/gempyor_pkg/src/gempyor/utils.py @@ -23,7 +23,17 @@ def write_df(fname: str, df: pd.DataFrame, extension: str = ""): - """write without index, so assume the index has been put a column""" + """ + Convert a DataFrame to either a csv or parquet file. + + Args: + fname: The filename (str). + df: The pandas DataFrame to be converted. + extension: Optional argument. Must either be 'csv' or 'parquet'. Default value is an empty string. + + Raises: + NotImplementedError: If an invalid file extension is given. + """ # cast to str to use .split in case fname is a PosixPath fname = str(fname) if extension: # Empty strings are falsy in python @@ -39,6 +49,22 @@ def write_df(fname: str, df: pd.DataFrame, extension: str = ""): def command_safe_run(command, command_name="mycommand", fail_on_fail=True): + """ + Verifies that a command is valid by attempting to run it. Prints stream of code if command fails. + + Args: + command: The CLI command to be given (str). + command_name: The reference name for you command (str). Default value is "mycommand". + fail_on_fail: Boolean; default is True. If True, an exception will be thrown if the command fails. + + Returns: + returncode: The returncode message from running yourcommand. + stdout: Standard output + stderr: Standard error stream + + Raises: + Exception: If fail_on_fail=True and the command fails, an exception will be thrown. + """ import subprocess import shlex # using shlex to split the command because it's not obvious https://docs.python.org/3/library/subprocess.html#subprocess.Popen @@ -63,8 +89,21 @@ def command_safe_run(command, command_name="mycommand", fail_on_fail=True): def read_df(fname: str, extension: str = "") -> pd.DataFrame: - """Load a dataframe from a file, agnostic to whether it is a parquet or a csv. The extension - can be provided as an argument or it is infered""" + """ + Load a dataframe from a file, agnostic to whether it is a parquet or a csv. The extension + can be provided as an argument or it is inferred. + + Args: + fname: The filename (str). + extension: Optional argument. Must either be 'csv' or 'parquet'. Default value is an empty string. + + Returns: + A pandas DataFrame of the data that was in the parquet or csv file. + + Raises: + NotImplementedError: If an invalid file extension is given. + FileNotFoundError: If the file cannot be found; likely due to fname typo. + """ fname = str(fname) if extension: # Empty strings are falsy in python fname = f"{fname}.{extension}" @@ -80,7 +119,15 @@ def read_df(fname: str, extension: str = "") -> pd.DataFrame: def add_method(cls): - "Decorator to add a method to a class" + """ + Decorator to add a method to a class. + + Args: + cls: The class you want to add a method to. + + Returns: + decorator: The decorator. 
+ """ def decorator(func): @functools.wraps(func) @@ -97,6 +144,18 @@ def search_and_import_plugins_class(plugin_file_path: str, path_prefix: str, cla # Look for all possible plugins and import them # https://stackoverflow.com/questions/67631/how-can-i-import-a-module-dynamically-given-the-full-path # unfortunatelly very complicated, this is cpython only ?? + """ + Function serving to create a class that finds and imports the necessary modules. + + Args: + plugin_file_path: Pathway to the module (str). + path_prefix: Pathway prefix to the module (str). + class_name: Name of the class (str). + Keyword args: + + Returns: + + """ import sys, os full_path = os.path.join(path_prefix, plugin_file_path) @@ -119,6 +178,7 @@ def profile(output_file=None, sort_by="cumulative", lines_to_print=None, strip_d """A time profiler decorator. Inspired by and modified the profile decorator of Giampaolo Rodola: http://code.activestate.com/recipes/577817-profile-decorator/ + Args: output_file: str or None. Default is None Path of the output file. If only name of the file is given, it's @@ -136,6 +196,7 @@ def profile(output_file=None, sort_by="cumulative", lines_to_print=None, strip_d strip_dirs: bool Whether to remove the leading path info from file names. This is also useful in reducing the size of the printout + Returns: Profile of the decorated function """ @@ -157,6 +218,15 @@ def wrapper(*args, **kwargs): def as_list(thing): + """ + Returns argument passed as a list. + + Args: + thing: The object that you would like to be converted to a list. + + Returns: + thing: The object converted to a list. + """ if type(thing) == list: return thing return [thing] @@ -187,14 +257,30 @@ def convert(self, value, view): @add_method(confuse.ConfigView) def as_date(self): - "Evaluates an datetime.date or ISO8601 date string, raises ValueError on parsing errors." + """ + Evaluates an datetime.date or ISO8601 date string. + + Args: + self: Class instance to convert to date or date string. + + Raises: + ValueError: On parsing errors. + """ return self.get(ISO8601Date()) @add_method(confuse.ConfigView) def as_evaled_expression(self): - "Evaluates an expression string, returning a float. Raises ValueError on parsing errors." + """ + Evaluates an expression string, returning a float. + + Args: + self: Class instance expression to evaluate. + + Raises: + ValueError: On parsing errors. + """ value = self.get() if isinstance(value, numbers.Number): @@ -209,19 +295,52 @@ def as_evaled_expression(self): def get_truncated_normal(*, mean=0, sd=1, a=0, b=10): - "Returns the truncated normal distribution" + """ + Returns the truncated normal distribution. + + Args: Must be assigned with keyword. + mean: Mean. Default value is 0. + sd: Standard deviation. Default value is 1. + a: Starting value. Default value is 0. + b: Ending value. Default value is 10. + + Returns: + A frozen random variable object holding the fixed given parameters. + """ return scipy.stats.truncnorm((a - mean) / sd, (b - mean) / sd, loc=mean, scale=sd) def get_log_normal(meanlog, sdlog): - "Returns the log normal distribution" + """ + Returns the log normal distribution. + + Args: + meanlog: Mean. + sdlog: Standard deviation. + + Returns: + A frozen random variable object holding the fixed given parameters. 
+ """ return scipy.stats.lognorm(s=sdlog, scale=np.exp(meanlog), loc=0) @add_method(confuse.ConfigView) def as_random_distribution(self): - "Constructs a random distribution object from a distribution config key" + """ + Constructs a random distribution object from a distribution config key. + + Args: + self: Class instance (in this case, a config key) to construct the random distribution from. + + Returns: + A partial object containing the random distribution. + + Raises: + ValueError: When values are out of range. + NotImplementedError: If an unknown distribution is found. + + """ if isinstance(self.get(), dict): dist = self["distribution"].get() @@ -267,7 +386,7 @@ def as_random_distribution(self): def list_filenames(folder: str = ".", filters: list = []) -> list: """ - return the list of all filename and path in the provided folders. + Returns the list of all filenames and paths in the provided folders. If filters [list] is provided, then only the files that contains each of the substrings in filter will be returned. Example to get all hosp file: ``` @@ -277,6 +396,13 @@ def list_filenames(folder: str = ".", filters: list = []) -> list: ``` gempyor.utils.list_filenames(folder="model_output/", filters=["hosp" , ".parquet"]) ``` + + Args: + folder: A path to the folder containing files (str). Default value is ".". + filters: A list of substrings to filter the filenames. Default value is an empty list. + + Returns: + fn_list: A list of filenames. """ from pathlib import Path @@ -322,6 +448,10 @@ def rolling_mean_pad(data, window): def print_disk_diagnosis(): + """ + Reads and prints AWS diagnoses. + Includes total bytes, used bytes, free bytes. + """ import os from os import path from shutil import disk_usage @@ -349,6 +479,20 @@ def bash(command): def create_resume_out_filename( flepi_run_index: str, flepi_prefix: str, flepi_slot_index: str, flepi_block_index: str, filetype: str, liketype: str ) -> str: + """ + Compiles run output information. + + Args: + flepi_run_index: Index of the run (str). + flepi_prefix: File prefix (str). + flepi_slot_index: Index of the slot (str). + flepi_block_index: Index of the block (str). + filetype: File type (str). + liketype: (str). + + Returns: + The path to a corresponding output file. + """ prefix = f"{flepi_prefix}/{flepi_run_index}" inference_filepath_suffix = f"{liketype}/intermediate" inference_filename_prefix = "{:09d}.".format(int(flepi_slot_index)) @@ -370,6 +514,19 @@ def create_resume_out_filename( def create_resume_input_filename( resume_run_index: str, flepi_prefix: str, flepi_slot_index: str, filetype: str, liketype: str ) -> str: + """ + Compiles run input information. + + Args: + resume_run_index: Index of the run (str). + flepi_prefix: File prefix (str). + flepi_slot_index: Index of thes lot (str). + filetype: File type (str). + liketype: (str). + + Returns: + The path to the a corresponding input file. + """ prefix = f"{flepi_prefix}/{resume_run_index}" inference_filepath_suffix = f"{liketype}/final" index = flepi_slot_index @@ -392,9 +549,12 @@ def get_filetype_for_resume(resume_discard_seeding: str, flepi_block_index: str) specific environment variable settings. This function dynamically determines the list based on the current operational context given by the environment. - The function checks two environment variables: - - `resume_discard_seeding`: Determines whether seeding-related file types should be included. - - `flepi_block_index`: Determines a specific operational mode or block of the process. 
+ Args: + resume_discard_seeding: Determines whether seeding-related file types should be included (str). + flepi_block_index: Determines a specific operational mode or block of the process (str). + + Returns: + List of file types. """ if flepi_block_index == "1": if resume_discard_seeding == "true": @@ -419,6 +579,19 @@ def create_resume_file_names_map( parquet file types and environmental conditions. The function adjusts the file name mappings based on the operational block index and the location of the last job output. + Args: + resume_discard_seeding: + flepi_block_index: + resume_run_index: + flepi_prefix: + flepi_slot_index: + flepi_run_index: + last_job_output: + + Return: + Dict[str, str]: A dictionary where keys are input file paths and values are corresponding + output file paths. The paths may be modified by the 'LAST_JOB_OUTPUT' if it + is set and points to an S3 location. The mappings depend on: - Parquet file types appropriate for resuming a process, as determined by the environment. - Whether the files are for 'global' or 'chimeric' types, as these liketypes influence the @@ -428,11 +601,6 @@ def create_resume_file_names_map( - The presence and value of 'LAST_JOB_OUTPUT' environment variable, which if set to an S3 path, adjusts the keys in the mapping to be prefixed with this path. - Returns: - Dict[str, str]: A dictionary where keys are input file paths and values are corresponding - output file paths. The paths may be modified by the 'LAST_JOB_OUTPUT' if it - is set and points to an S3 location. - Raises: No explicit exceptions are raised within the function, but it relies heavily on external functions and environment variables which if improperly configured could lead to unexpected From ecebd83909154c23a91fd79001125b9635660f3d Mon Sep 17 00:00:00 2001 From: Emily Przykucki <100221052+emprzy@users.noreply.github.com> Date: Tue, 6 Aug 2024 10:40:05 -0400 Subject: [PATCH 2/8] Update utils.py --- flepimop/gempyor_pkg/src/gempyor/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flepimop/gempyor_pkg/src/gempyor/utils.py b/flepimop/gempyor_pkg/src/gempyor/utils.py index 06531cfbb..573fa2e0b 100644 --- a/flepimop/gempyor_pkg/src/gempyor/utils.py +++ b/flepimop/gempyor_pkg/src/gempyor/utils.py @@ -564,7 +564,7 @@ def create_resume_out_filename( flepi_slot_index: Index of the slot (str). flepi_block_index: Index of the block (str). filetype: File type (str). - liketype: (str). + liketype: Chimeric or global (str). Returns: The path to a corresponding output file. @@ -598,7 +598,7 @@ def create_resume_input_filename( flepi_prefix: File prefix (str). flepi_slot_index: Index of thes lot (str). filetype: File type (str). - liketype: (str). + liketype: Chimeric or global (str). Returns: The path to the a corresponding input file. 
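The `write_df` and `read_df` helpers documented in the commits above never receive a usage example later in this series; below is a minimal round-trip sketch, assuming `gempyor` and `pyarrow` are installed and using a hypothetical file name.

```python
# Illustrative sketch: round-tripping a DataFrame with write_df/read_df.
# Assumes gempyor and pyarrow are installed; "incidence" is a hypothetical file name.
import pandas as pd

from gempyor.utils import read_df, write_df

df = pd.DataFrame({"date": ["2024-01-01", "2024-01-02"], "incid": [3, 5]})
write_df("incidence", df, extension="parquet")    # writes incidence.parquet without the index
round_trip = read_df("incidence", extension="parquet")
print(round_trip.shape)                           # (2, 2)
```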
From 8ac7b8f0d67a489322c8dc26a88408455eadf49e Mon Sep 17 00:00:00 2001 From: Emily Przykucki Date: Fri, 9 Aug 2024 09:41:03 -0400 Subject: [PATCH 3/8] Update utils.py --- flepimop/gempyor_pkg/src/gempyor/utils.py | 55 ++++++++++++++--------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/flepimop/gempyor_pkg/src/gempyor/utils.py b/flepimop/gempyor_pkg/src/gempyor/utils.py index 573fa2e0b..1eb27755f 100644 --- a/flepimop/gempyor_pkg/src/gempyor/utils.py +++ b/flepimop/gempyor_pkg/src/gempyor/utils.py @@ -99,19 +99,19 @@ def read_df( ) -def command_safe_run(command, command_name="mycommand", fail_on_fail=True): +def command_safe_run(command: str, command_name: str="mycommand", fail_on_fail: bool=True): """ - Verifies that a command is valid by attempting to run it. Prints stream of code if command fails. + Runs a shell command and prints diagnostics if command fails. Args: - command: The CLI command to be given (str). + command: The CLI command to be given. command_name: The reference name for you command (str). Default value is "mycommand". - fail_on_fail: Boolean; default is True. If True, an exception will be thrown if the command fails. + fail_on_fail: If True, an exception will be thrown if the command fails (default is True) Returns: returncode: The returncode message from running yourcommand. - stdout: Standard output - stderr: Standard error stream + stdout: Standard output. + stderr: Standard error stream. Raises: Exception: If fail_on_fail=True and the command fails, an exception will be thrown. @@ -162,21 +162,21 @@ def wrapper(*args, **kwargs): def search_and_import_plugins_class(plugin_file_path: str, path_prefix: str, class_name: str, **kwargs): - # Look for all possible plugins and import them - # https://stackoverflow.com/questions/67631/how-can-i-import-a-module-dynamically-given-the-full-path - # unfortunatelly very complicated, this is cpython only ?? """ Function serving to create a class that finds and imports the necessary modules. Args: - plugin_file_path: Pathway to the module (str). - path_prefix: Pathway prefix to the module (str). - class_name: Name of the class (str). - Keyword args: + plugin_file_path: Pathway to the module. + path_prefix: Pathway prefix to the module. + class_name: Name of the class. + kwargs: Further arguments passed to initilization of the class. Returns: """ + # Look for all possible plugins and import them + # https://stackoverflow.com/questions/67631/how-can-i-import-a-module-dynamically-given-the-full-path + # unfortunatelly very complicated, this is cpython only ?? import sys, os full_path = os.path.join(path_prefix, plugin_file_path) @@ -195,7 +195,7 @@ def search_and_import_plugins_class(plugin_file_path: str, path_prefix: str, cla from functools import wraps -def profile(output_file=None, sort_by="cumulative", lines_to_print=None, strip_dirs=False): +def profile(output_file=None, sort_by="cumulative", lines_to_print=None, strip_dirs: bool=False): """A time profiler decorator. Inspired by and modified the profile decorator of Giampaolo Rodola: http://code.activestate.com/recipes/577817-profile-decorator/ @@ -216,10 +216,10 @@ def profile(output_file=None, sort_by="cumulative", lines_to_print=None, strip_d are printed toward the top of the file. strip_dirs: bool Whether to remove the leading path info from file names. - This is also useful in reducing the size of the printout + This is also useful in reducing the size of the printout. Returns: - Profile of the decorated function + Profile of the decorated function. 
""" def inner(func): @@ -238,7 +238,7 @@ def wrapper(*args, **kwargs): return inner -def as_list(thing): +def as_list(thing) -> list: """ Returns argument passed as a list. @@ -255,6 +255,12 @@ def as_list(thing): ### A little timer class class Timer(object): + """ + A timer class that starts, ends, and records time in between. + + Attributes: + name: Name of event. + """ def __init__(self, name): self.name = name @@ -267,6 +273,12 @@ def __exit__(self, type, value, traceback): class ISO8601Date(confuse.Template): + """ + Reads in config dates into datetimes.dates. + + Attributes: + value: Date value. + """ def convert(self, value, view): if isinstance(value, datetime.date): return value @@ -281,9 +293,6 @@ def as_date(self): """ Evaluates an datetime.date or ISO8601 date string. - Args: - self: Class instance to convert to date or date string. - Raises: ValueError: On parsing errors. """ @@ -666,8 +675,7 @@ def create_resume_file_names_map( Return: Dict[str, str]: A dictionary where keys are input file paths and values are corresponding - output file paths. The paths may be modified by the 'LAST_JOB_OUTPUT' if it - is set and points to an S3 location. + output file paths. The mappings depend on: - Parquet file types appropriate for resuming a process, as determined by the environment. - Whether the files are for 'global' or 'chimeric' types, as these liketypes influence the @@ -681,6 +689,9 @@ def create_resume_file_names_map( No explicit exceptions are raised within the function, but it relies heavily on external functions and environment variables which if improperly configured could lead to unexpected behavior. + + Notes: + - The paths may be modified by the 'LAST_JOB_OUTPUT' if it is set and points to an S3 location. """ file_types = get_filetype_for_resume( resume_discard_seeding=resume_discard_seeding, flepi_block_index=flepi_block_index From 91b34fc50805d13ae583690a0a2ca9b4810ea4b3 Mon Sep 17 00:00:00 2001 From: Emily Przykucki Date: Fri, 23 Aug 2024 15:44:46 -0400 Subject: [PATCH 4/8] Update utils.py Refining gempyor.utils documentation. Preparing to establish a PR --- flepimop/gempyor_pkg/src/gempyor/utils.py | 70 +++++++++++------------ 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/flepimop/gempyor_pkg/src/gempyor/utils.py b/flepimop/gempyor_pkg/src/gempyor/utils.py index 1eb27755f..ce7f5d0dd 100644 --- a/flepimop/gempyor_pkg/src/gempyor/utils.py +++ b/flepimop/gempyor_pkg/src/gempyor/utils.py @@ -172,7 +172,7 @@ def search_and_import_plugins_class(plugin_file_path: str, path_prefix: str, cla kwargs: Further arguments passed to initilization of the class. Returns: - + The classs_name attribute of module class. """ # Look for all possible plugins and import them # https://stackoverflow.com/questions/67631/how-can-i-import-a-module-dynamically-given-the-full-path @@ -260,6 +260,7 @@ class Timer(object): Attributes: name: Name of event. + tstart: Time start. """ def __init__(self, name): self.name = name @@ -289,10 +290,13 @@ def convert(self, value, view): @add_method(confuse.ConfigView) -def as_date(self): +def as_date(self) -> datetime.date: """ Evaluates an datetime.date or ISO8601 date string. + Returns: + A datetime.date data type of the date associated with the object. + Raises: ValueError: On parsing errors. """ @@ -305,8 +309,8 @@ def as_evaled_expression(self): """ Evaluates an expression string, returning a float. - Args: - self: Class instance expression to evaluate. + Returns: + A float data type of the value associated with the object. 
Raises: ValueError: On parsing errors. @@ -386,7 +390,6 @@ def as_random_distribution(self): Raises: ValueError: When values are out of range. NotImplementedError: If an unknown distribution is found. - """ if isinstance(self.get(), dict): @@ -441,6 +444,15 @@ def list_filenames( If filters are provided, only the files containing each of the substrings in the filters will be returned. + Args: + folder: The directory to search for files. Defaults to the current directory. + filters: A string or a list of strings to filter filenames. Only files + containing all the provided substrings will be returned. Defaults to an + empty list. + + Returns: + A list of strings representing the paths to the files that match the filters. + Example: To get all files containing "hosp": ``` @@ -457,15 +469,6 @@ def list_filenames( filters=["hosp", ".parquet"], ) ``` - - Args: - folder: The directory to search for files. Defaults to the current directory. - filters: A string or a list of strings to filter filenames. Only files - containing all the provided substrings will be returned. Defaults to an - empty list. - - Returns: - A list of strings representing the paths to the files that match the filters. """ filters = [filters] if not isinstance(filters, list) else filters filters = filters if len(filters) else [""] @@ -534,8 +537,7 @@ def rolling_mean_pad( def print_disk_diagnosis(): """ - Reads and prints AWS diagnoses. - Includes total bytes, used bytes, free bytes. + Reads and prints AWS diagnoses. Includes total bytes, used bytes, free bytes. """ import os from os import path @@ -603,11 +605,11 @@ def create_resume_input_filename( Compiles run input information. Args: - resume_run_index: Index of the run (str). - flepi_prefix: File prefix (str). - flepi_slot_index: Index of thes lot (str). - filetype: File type (str). - liketype: Chimeric or global (str). + resume_run_index: Index of the run. + flepi_prefix: File prefix. + flepi_slot_index: Index of the slot. + filetype: File type. + liketype: Chimeric or global. Returns: The path to the a corresponding input file. @@ -628,7 +630,7 @@ def create_resume_input_filename( ) -def get_filetype_for_resume(resume_discard_seeding: str, flepi_block_index: str) -> List[str]: +def get_filetype_for_resume(resume_discard_seeding: str, flepi_block_index: str) -> list[str]: """ Retrieves a list of parquet file types that are relevant for resuming a process based on specific environment variable settings. This function dynamically determines the list @@ -665,13 +667,13 @@ def create_resume_file_names_map( based on the operational block index and the location of the last job output. Args: - resume_discard_seeding: - flepi_block_index: - resume_run_index: - flepi_prefix: - flepi_slot_index: - flepi_run_index: - last_job_output: + resume_discard_seeding: Determines whether seeding-related file types should be included. + flepi_block_index: Determines a specific operational mode or block of the process. + resume_run_index: Resume run index. + flepi_prefix: File prefix. + flepi_slot_index: Index of the slot. + flepi_run_index: flepiMoP run index. + last_job_output: Adjusts the keys in the mapping to be prefixed with this path. 
Return: Dict[str, str]: A dictionary where keys are input file paths and values are corresponding @@ -734,7 +736,7 @@ def download_file_from_s3(name_map: Dict[str, str]) -> None: then iterates over each S3 URI in the provided mapping, downloads the file to the corresponding local path, and handles errors if the S3 URI format is incorrect or if the download fails. - Parameters: + Args: name_map (Dict[str, str]): A dictionary where keys are S3 URIs (strings) and values are the local file paths (strings) where the files should be saved. @@ -793,18 +795,16 @@ def download_file_from_s3(name_map: Dict[str, str]) -> None: def move_file_at_local(name_map: Dict[str, str]) -> None: """ Moves files locally according to a given mapping. - This function takes a dictionary where the keys are source file paths and the values are destination file paths. It ensures that the destination directories exist and then copies the files from the source paths to the destination paths. - Parameters: - name_map (Dict[str, str]): A dictionary mapping source file paths to - destination file paths. + Args: + name_map (Dict[str, str]): A dictionary mapping source file paths to destination file paths. Returns: - None + None """ for src, dst in name_map.items(): os.path.makedirs(os.path.dirname(dst), exist_ok=True) From 2b779bb54bb320b5df2f2aea90dea03d9624af92 Mon Sep 17 00:00:00 2001 From: Emily Przykucki <100221052+emprzy@users.noreply.github.com> Date: Mon, 26 Aug 2024 10:26:22 -0400 Subject: [PATCH 5/8] Update utils.py --- flepimop/gempyor_pkg/src/gempyor/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flepimop/gempyor_pkg/src/gempyor/utils.py b/flepimop/gempyor_pkg/src/gempyor/utils.py index ce7f5d0dd..124d20349 100644 --- a/flepimop/gempyor_pkg/src/gempyor/utils.py +++ b/flepimop/gempyor_pkg/src/gempyor/utils.py @@ -172,7 +172,7 @@ def search_and_import_plugins_class(plugin_file_path: str, path_prefix: str, cla kwargs: Further arguments passed to initilization of the class. Returns: - The classs_name attribute of module class. + The class_name attribute of module class. """ # Look for all possible plugins and import them # https://stackoverflow.com/questions/67631/how-can-i-import-a-module-dynamically-given-the-full-path From ff06002153a4329b36638c04e708aee1e3e1c446 Mon Sep 17 00:00:00 2001 From: Emily Przykucki Date: Wed, 4 Sep 2024 14:10:38 -0400 Subject: [PATCH 6/8] Documenting utils.py Adding examples to function documentation (and further refining documentation) in the gempyor.utils module. --- flepimop/gempyor_pkg/src/gempyor/utils.py | 246 +++++++++++++++++----- 1 file changed, 190 insertions(+), 56 deletions(-) diff --git a/flepimop/gempyor_pkg/src/gempyor/utils.py b/flepimop/gempyor_pkg/src/gempyor/utils.py index ce7f5d0dd..069e5b83e 100644 --- a/flepimop/gempyor_pkg/src/gempyor/utils.py +++ b/flepimop/gempyor_pkg/src/gempyor/utils.py @@ -99,20 +99,17 @@ def read_df( ) -def command_safe_run(command: str, command_name: str="mycommand", fail_on_fail: bool=True): +def command_safe_run(command: str, command_name: str="mycommand", fail_on_fail: bool=True) -> tuple[int, str, str]: """ Runs a shell command and prints diagnostics if command fails. Args: command: The CLI command to be given. - command_name: The reference name for you command (str). Default value is "mycommand". + command_name: The reference name for you command. Default value is "mycommand". 
fail_on_fail: If True, an exception will be thrown if the command fails (default is True) Returns: - returncode: The returncode message from running yourcommand. - stdout: Standard output. - stderr: Standard error stream. - + As a tuple; the return code, the standard output, and standard error from running the command. Raises: Exception: If fail_on_fail=True and the command fails, an exception will be thrown. """ @@ -141,7 +138,7 @@ def command_safe_run(command: str, command_name: str="mycommand", fail_on_fail: def add_method(cls): """ - Decorator to add a method to a class. + A function which adds a function to a class. Args: cls: The class you want to add a method to. @@ -161,7 +158,7 @@ def wrapper(*args, **kwargs): return decorator -def search_and_import_plugins_class(plugin_file_path: str, path_prefix: str, class_name: str, **kwargs): +def search_and_import_plugins_class(plugin_file_path: str, path_prefix: str, class_name: str, **kwargs: dict[str, any]) -> any: """ Function serving to create a class that finds and imports the necessary modules. @@ -169,10 +166,23 @@ def search_and_import_plugins_class(plugin_file_path: str, path_prefix: str, cla plugin_file_path: Pathway to the module. path_prefix: Pathway prefix to the module. class_name: Name of the class. - kwargs: Further arguments passed to initilization of the class. + **kwargs: Further arguments passed to initilization of the class. Returns: - The classs_name attribute of module class. + The instance of the class that was instantiated with provided **kwargs. + + Example: + Suppose there is a module called `my_plugin.py with a class `MyClass` located at `/path/to/plugin/`. + + Dynamically import and instantiate the class: + ``` + instance = search_and_import_plugins_class('/path/to/plugin', path_prefix, 'MyClass', **params) + ``` + + View the instance: + ``` + print(instance.display()) + ``` """ # Look for all possible plugins and import them # https://stackoverflow.com/questions/67631/how-can-i-import-a-module-dynamically-given-the-full-path @@ -195,31 +205,40 @@ def search_and_import_plugins_class(plugin_file_path: str, path_prefix: str, cla from functools import wraps -def profile(output_file=None, sort_by="cumulative", lines_to_print=None, strip_dirs: bool=False): - """A time profiler decorator. +def profile(output_file: str = None, sort_by: str = "cumulative", lines_to_print: int = None, strip_dirs: bool = False): + """ + A time profiler decorator. Inspired by and modified the profile decorator of Giampaolo Rodola: http://code.activestate.com/recipes/577817-profile-decorator/ Args: - output_file: str or None. Default is None + output_file: Path of the output file. If only name of the file is given, it's saved in the current directory. If it's None, the name of the decorated function is used. - sort_by: str or SortKey enum or tuple/list of str/SortKey enum + sort_by: Sorting criteria for the Stats object. For a list of valid string and SortKey refer to: https://docs.python.org/3/library/profile.html#pstats.Stats.sort_stats - lines_to_print: int or None + lines_to_print: Number of lines to print. Default (None) is for all the lines. This is useful in reducing the size of the printout, especially that sorting by 'cumulative', the time consuming operations are printed toward the top of the file. - strip_dirs: bool - Whether to remove the leading path info from file names. - This is also useful in reducing the size of the printout. + strip_dirs: + Whether to remove the leading path info from file names. 
Returns: Profile of the decorated function. + + Example: + ``` + @profile(output_file="my_function.prof") + def my_function(): + # Function body content + pass + my_function() + ``` """ def inner(func): @@ -238,7 +257,7 @@ def wrapper(*args, **kwargs): return inner -def as_list(thing) -> list: +def as_list(thing: any) -> list[any]: """ Returns argument passed as a list. @@ -246,7 +265,7 @@ def as_list(thing) -> list: thing: The object that you would like to be converted to a list. Returns: - thing: The object converted to a list. + he object converted to a list. """ if type(thing) == list: return thing @@ -276,11 +295,18 @@ def __exit__(self, type, value, traceback): class ISO8601Date(confuse.Template): """ Reads in config dates into datetimes.dates. - - Attributes: - value: Date value. """ - def convert(self, value, view): + def convert(self, value: any, view: confuse.View): + """ + Converts the given value to a datetime.date object. + + Args: + value: The value to be converted. Can be datetime.date object or ISO8601 string. + view: A view object from confuse, to be used for error reporting. + + Raises: + confuse.TemplateError: If `value` is neither a datetime.date nor an ISO8601Date string. + """ if isinstance(value, datetime.date): return value elif isinstance(value, str): @@ -296,9 +322,6 @@ def as_date(self) -> datetime.date: Returns: A datetime.date data type of the date associated with the object. - - Raises: - ValueError: On parsing errors. """ return self.get(ISO8601Date()) @@ -348,8 +371,13 @@ def get_truncated_normal( b: The upper bound of the truncated normal distribution. Defaults to 10. Returns: - rv_frozen: A frozen instance of the truncated normal distribution with the - specified parameters. + rv_frozen: A frozen instance of the truncated normal distribution with the specified parameters. + + Example: + Create a truncated normal distribution with specified parameters (truncated between 1 and 10): + ``` + truncated_normal_dist = get_truncated_normal(mean=5, sd=2, a=1, b=10) + ``` """ lower = (a - mean) / sd upper = (b - mean) / sd @@ -360,7 +388,8 @@ def get_log_normal( meanlog: float | int, sdlog: float | int, ) -> scipy.stats._distn_infrastructure.rv_frozen: - """Returns a log normal distribution. + """ + Returns a log normal distribution. This function constructs a log normal distribution with the specified log mean and log standard deviation. @@ -372,6 +401,12 @@ def get_log_normal( Returns: rv_frozen: A frozen instance of the log normal distribution with the specified parameters. + + Example: + Create a log-normal distribution with specified parameters: + ``` + log_normal_dist = get_log_normal(meanlog=1, sdlog=0.5) + ``` """ return scipy.stats.lognorm(s=sdlog, scale=np.exp(meanlog), loc=0) @@ -390,6 +425,28 @@ def as_random_distribution(self): Raises: ValueError: When values are out of range. NotImplementedError: If an unknown distribution is found. + + Example: + Say that `config` is a `confuse.ConfigView` instance. + + To create a uniform distribution between 1 and 10: + ``` + dist_function = config.as_random_distribution() + sample = dist_function() + ``` + + To use a truncated normal distribution: + ``` + config_truncnorm = confuse.ConfigView({ + "distribution": "truncnorm", + "mean": 0 + "sd": 1, + "a": -1, + "b": 1 + }) + truncnorm_dist_function = config_truncnorm.as_random_distribution() + truncnorm_sample = truncnorm_dist_function() + ``` """ if isinstance(self.get(), dict): @@ -445,10 +502,12 @@ def list_filenames( in the filters will be returned. 
Args: - folder: The directory to search for files. Defaults to the current directory. - filters: A string or a list of strings to filter filenames. Only files - containing all the provided substrings will be returned. Defaults to an - empty list. + folder: + The directory to search for files. Defaults to the current directory. + filters: + A string or a list of strings to filter filenames. Only files + containing all the provided substrings will be returned. Defaults to an + empty list. Returns: A list of strings representing the paths to the files that match the filters. @@ -499,7 +558,7 @@ def rolling_mean_pad( Examples: Below is a brief set of examples showcasing how to smooth a metric, like hospitalizations, using this function. - + ``` >>> import numpy as np >>> from gempyor.utils import rolling_mean_pad >>> hospitalizations = np.arange(1., 29.).reshape((7, 4)) @@ -519,6 +578,7 @@ def rolling_mean_pad( [17. , 18. , 19. , 20. ], [20.2, 21.2, 22.2, 23.2], [22.6, 23.6, 24.6, 25.6]]) + ``` """ weights = (1. / window) * np.ones(window) output = scipy.ndimage.convolve1d(data, weights, axis=0, mode="nearest") @@ -537,13 +597,22 @@ def rolling_mean_pad( def print_disk_diagnosis(): """ - Reads and prints AWS diagnoses. Includes total bytes, used bytes, free bytes. + Reads and prints AWS disk diagnostic information. """ import os from os import path from shutil import disk_usage - def bash(command): + def bash(command: str) -> str: + """ + Executes a shell command and returns its output. + + Args: + command: The shell command to be executed. + + Returns: + The output of the shell command. + """ output = os.popen(command).read() return output @@ -570,15 +639,32 @@ def create_resume_out_filename( Compiles run output information. Args: - flepi_run_index: Index of the run (str). - flepi_prefix: File prefix (str). - flepi_slot_index: Index of the slot (str). - flepi_block_index: Index of the block (str). - filetype: File type (str). - liketype: Chimeric or global (str). + flepi_run_index: Index of the run. + flepi_prefix: File prefix. + flepi_slot_index: Index of the slot. + flepi_block_index: Index of the block. + filetype: File type. + liketype: Chimeric or global. Returns: The path to a corresponding output file. + + Example: + Generate an output file with specified parameters: + ``` + filename = create_resume_out_filename( + flepi_run_index="test_run", + flepi_prefix="model_output/run_id/", + flepi_slot_index="1", + flepi_block_index="2", + filetype="seed", + liketype="chimeric" + ) + ``` + Example output: + ``` + "experiment/001/normal/intermediate/000000123.000000000.1.parquet" + ``` """ prefix = f"{flepi_prefix}/{flepi_run_index}" inference_filepath_suffix = f"{liketype}/intermediate" @@ -613,6 +699,22 @@ def create_resume_input_filename( Returns: The path to the a corresponding input file. + + Example: + Generate an input file with specified parameters: + ``` + filename = create_resume_input_filename( + resume_run_index="2", + flepi_prefix="model_output/run_id/", + flepi_slot_index="1", + filetype="seed", + liketype="chimeric" + ) + ``` + Example output: + ``` + "experiment/002/normal/final/789.csv" + ``` """ prefix = f"{flepi_prefix}/{resume_run_index}" inference_filepath_suffix = f"{liketype}/final" @@ -633,7 +735,8 @@ def create_resume_input_filename( def get_filetype_for_resume(resume_discard_seeding: str, flepi_block_index: str) -> list[str]: """ Retrieves a list of parquet file types that are relevant for resuming a process based on - specific environment variable settings. 
This function dynamically determines the list + specific environment variable settings. + This function dynamically determines the list based on the current operational context given by the environment. Args: @@ -642,6 +745,25 @@ def get_filetype_for_resume(resume_discard_seeding: str, flepi_block_index: str) Returns: List of file types. + + Example: + Determine file types for block index 1 with seeding data NOT discarded: + ``` + filetypes = get_filetype_for_resume(resume_discard_seeding="false", flepi_block_index="1") + ``` + Output of `print(filetypes)`: + ``` + ["seed", "spar", "snpi", "hpar", "hnpi", "init"] + ``` + + Determine file types for block index 2 with seeding data discarded: + ``` + filtypes = get_filetype_for_resume(resume_discard_seeding="true", flepi_block_index="2") + ``` + Output of `print(filetypes)`: + ``` + ["seed", "spar", "snpi", "hpar", "hnpi", "host", "llik", "init"] + ``` """ if flepi_block_index == "1": if resume_discard_seeding == "true": @@ -660,7 +782,7 @@ def create_resume_file_names_map( flepi_slot_index, flepi_run_index, last_job_output, -) -> Dict[str, str]: +) -> dict: """ Generates a mapping of input file names to output file names for a resume process based on parquet file types and environmental conditions. The function adjusts the file name mappings @@ -676,8 +798,9 @@ def create_resume_file_names_map( last_job_output: Adjusts the keys in the mapping to be prefixed with this path. Return: - Dict[str, str]: A dictionary where keys are input file paths and values are corresponding - output file paths. + A dictionary where keys are input file paths and values are corresponding + output file paths. + The mappings depend on: - Parquet file types appropriate for resuming a process, as determined by the environment. - Whether the files are for 'global' or 'chimeric' types, as these liketypes influence the @@ -692,6 +815,19 @@ def create_resume_file_names_map( functions and environment variables which if improperly configured could lead to unexpected behavior. + Example: + Generate a mapping of file names for a given resume process: + ``` + file_names_map = create_resume_file_names_map( + resume_discard_seeding="false", + flepi_block_index="1", + resume_run_index="1", + flepi_prefix="model_output/run_id/", + flepi_slot_index="1", + flepi_run_index="test_run", + last_job_output="s3://bucket/path/") + ``` + Notes: - The paths may be modified by the 'LAST_JOB_OUTPUT' if it is set and points to an S3 location. """ @@ -729,7 +865,7 @@ def create_resume_file_names_map( return resume_file_name_mapping -def download_file_from_s3(name_map: Dict[str, str]) -> None: +def download_file_from_s3(name_map: dict) -> None: """ Downloads files from AWS S3 based on a mapping of S3 URIs to local file paths. The function checks if the directory for the first output file exists and creates it if necessary. It @@ -737,13 +873,14 @@ def download_file_from_s3(name_map: Dict[str, str]) -> None: local path, and handles errors if the S3 URI format is incorrect or if the download fails. Args: - name_map (Dict[str, str]): A dictionary where keys are S3 URIs (strings) and values - are the local file paths (strings) where the files should - be saved. + name_map: + A dictionary where keys are S3 URIs (strings) and values + are the local file paths (strings) where the files should + be saved. Returns: - None: This function does not return a value; its primary effect is the side effect of - downloading files and potentially creating directories. 
+ This function does not return a value; its primary effect is the side effect of + downloading files and potentially creating directories. Raises: ValueError: If an S3 URI does not start with 's3://', indicating an invalid format. @@ -802,9 +939,6 @@ def move_file_at_local(name_map: Dict[str, str]) -> None: Args: name_map (Dict[str, str]): A dictionary mapping source file paths to destination file paths. - - Returns: - None """ for src, dst in name_map.items(): os.path.makedirs(os.path.dirname(dst), exist_ok=True) From 5ef0cb03d3b9a7db3b34c45c4b941444614b49bc Mon Sep 17 00:00:00 2001 From: Emily Przykucki Date: Wed, 4 Sep 2024 17:16:33 -0400 Subject: [PATCH 7/8] Update utils.py --- flepimop/gempyor_pkg/src/gempyor/utils.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/flepimop/gempyor_pkg/src/gempyor/utils.py b/flepimop/gempyor_pkg/src/gempyor/utils.py index 9c4e9cc7f..24e718b80 100644 --- a/flepimop/gempyor_pkg/src/gempyor/utils.py +++ b/flepimop/gempyor_pkg/src/gempyor/utils.py @@ -169,7 +169,6 @@ def search_and_import_plugins_class(plugin_file_path: str, path_prefix: str, cla **kwargs: Further arguments passed to initilization of the class. Returns: -<<<<<<< HEAD The instance of the class that was instantiated with provided **kwargs. Example: @@ -184,9 +183,6 @@ def search_and_import_plugins_class(plugin_file_path: str, path_prefix: str, cla ``` print(instance.display()) ``` -======= - The class_name attribute of module class. ->>>>>>> 2b779bb54bb320b5df2f2aea90dea03d9624af92 """ # Look for all possible plugins and import them # https://stackoverflow.com/questions/67631/how-can-i-import-a-module-dynamically-given-the-full-path @@ -300,7 +296,7 @@ class ISO8601Date(confuse.Template): """ Reads in config dates into datetimes.dates. """ - def convert(self, value: any, view: confuse.View): + def convert(self, value: any, view: confuse.ConfigView): """ Converts the given value to a datetime.date object. From 41e7aebca9a3f5ccaf4dd75fde697550a350d210 Mon Sep 17 00:00:00 2001 From: Emily Przykucki Date: Mon, 9 Sep 2024 13:00:18 -0400 Subject: [PATCH 8/8] Adjusting formatting on function documentation examples Utilizing this >>> format rather than three backticks (```). Also including output examples when relevant. --- flepimop/gempyor_pkg/src/gempyor/utils.py | 144 ++++++++++------------ 1 file changed, 66 insertions(+), 78 deletions(-) diff --git a/flepimop/gempyor_pkg/src/gempyor/utils.py b/flepimop/gempyor_pkg/src/gempyor/utils.py index 24e718b80..8fc4012b2 100644 --- a/flepimop/gempyor_pkg/src/gempyor/utils.py +++ b/flepimop/gempyor_pkg/src/gempyor/utils.py @@ -7,6 +7,7 @@ import shutil import subprocess import time +import typing from typing import List, Dict, Literal import confuse @@ -110,6 +111,7 @@ def command_safe_run(command: str, command_name: str="mycommand", fail_on_fail: Returns: As a tuple; the return code, the standard output, and standard error from running the command. + Raises: Exception: If fail_on_fail=True and the command fails, an exception will be thrown. """ @@ -158,7 +160,7 @@ def wrapper(*args, **kwargs): return decorator -def search_and_import_plugins_class(plugin_file_path: str, path_prefix: str, class_name: str, **kwargs: dict[str, any]) -> any: +def search_and_import_plugins_class(plugin_file_path: str, path_prefix: str, class_name: str, **kwargs: dict[str, typing.Any]) -> typing.Any: """ Function serving to create a class that finds and imports the necessary modules. 
@@ -171,18 +173,18 @@ def search_and_import_plugins_class(plugin_file_path: str, path_prefix: str, cla Returns: The instance of the class that was instantiated with provided **kwargs. - Example: + Examples: Suppose there is a module called `my_plugin.py with a class `MyClass` located at `/path/to/plugin/`. Dynamically import and instantiate the class: - ``` - instance = search_and_import_plugins_class('/path/to/plugin', path_prefix, 'MyClass', **params) - ``` + >>> instance = search_and_import_plugins_class('/path/to/plugin', path_prefix, 'MyClass', **params) + View the instance: - ``` - print(instance.display()) - ``` + + >>> print(instance) + <__main__.MyClass object at 0x7f8b2c6b4d60> + """ # Look for all possible plugins and import them # https://stackoverflow.com/questions/67631/how-can-i-import-a-module-dynamically-given-the-full-path @@ -231,14 +233,14 @@ def profile(output_file: str = None, sort_by: str = "cumulative", lines_to_print Returns: Profile of the decorated function. - Example: - ``` - @profile(output_file="my_function.prof") - def my_function(): + Examples: + >>> @profile(output_file="my_function.prof") + >>> def my_function(): # Function body content pass - my_function() - ``` + >>> my_function() + After running ``my_function``, a file named ``my_function.prof`` will be created in the current WD. + This file contains the profiling data. """ def inner(func): @@ -265,7 +267,7 @@ def as_list(thing: any) -> list[any]: thing: The object that you would like to be converted to a list. Returns: - he object converted to a list. + The object converted to a list. """ if type(thing) == list: return thing @@ -373,11 +375,11 @@ def get_truncated_normal( Returns: rv_frozen: A frozen instance of the truncated normal distribution with the specified parameters. - Example: + Examples: Create a truncated normal distribution with specified parameters (truncated between 1 and 10): - ``` - truncated_normal_dist = get_truncated_normal(mean=5, sd=2, a=1, b=10) - ``` + >>> truncated_normal_dist = get_truncated_normal(mean=5, sd=2, a=1, b=10) + >>> print(truncated_normal_dist) + rv_frozen() """ lower = (a - mean) / sd upper = (b - mean) / sd @@ -402,11 +404,11 @@ def get_log_normal( rv_frozen: A frozen instance of the log normal distribution with the specified parameters. - Example: + Examples: Create a log-normal distribution with specified parameters: - ``` - log_normal_dist = get_log_normal(meanlog=1, sdlog=0.5) - ``` + >>> log_normal_dist = get_log_normal(meanlog=1, sdlog=0.5) + >>> print(log_normal_dist) + """ return scipy.stats.lognorm(s=sdlog, scale=np.exp(meanlog), loc=0) @@ -426,26 +428,25 @@ def as_random_distribution(self): ValueError: When values are out of range. NotImplementedError: If an unknown distribution is found. - Example: - Say that `config` is a `confuse.ConfigView` instance. + Examples: + Say that ``config`` is a ``confuse.ConfigView`` instance. 
To create a uniform distribution between 1 and 10: - ``` - dist_function = config.as_random_distribution() - sample = dist_function() - ``` + >>> dist_function = config.as_random_distribution() + >>> sample = dist_function() + 5.436789235794546 To use a truncated normal distribution: - ``` - config_truncnorm = confuse.ConfigView({ + >>> config_truncnorm = confuse.ConfigView({ "distribution": "truncnorm", "mean": 0 "sd": 1, "a": -1, "b": 1 }) - truncnorm_dist_function = config_truncnorm.as_random_distribution() - truncnorm_sample = truncnorm_dist_function() + >>> truncnorm_dist_function = config_truncnorm.as_random_distribution() + >>> truncnorm_sample = truncnorm_dist_function() + 0.312745 ``` """ @@ -495,7 +496,8 @@ def list_filenames( folder: str | bytes | os.PathLike = ".", filters: str | list[str] = [], ) -> list[str]: - """Return the list of all filenames and paths in the provided folder. + """ + Return the list of all filenames and paths in the provided folder. This function lists all files in the specified folder and its subdirectories. If filters are provided, only the files containing each of the substrings @@ -512,22 +514,18 @@ def list_filenames( Returns: A list of strings representing the paths to the files that match the filters. - Example: + Examples: To get all files containing "hosp": - ``` - gempyor.utils.list_filenames( + >>> gempyor.utils.list_filenames( folder="model_output/", filters=["hosp"], ) - ``` To get only "hosp" files with a ".parquet" extension: - ``` - gempyor.utils.list_filenames( + >>> gempyor.utils.list_filenames( folder="model_output/", filters=["hosp", ".parquet"], ) - ``` """ filters = [filters] if not isinstance(filters, list) else filters filters = filters if len(filters) else [""] @@ -649,10 +647,9 @@ def create_resume_out_filename( Returns: The path to a corresponding output file. - Example: + Examples: Generate an output file with specified parameters: - ``` - filename = create_resume_out_filename( + >>> filename = create_resume_out_filename( flepi_run_index="test_run", flepi_prefix="model_output/run_id/", flepi_slot_index="1", @@ -660,11 +657,8 @@ def create_resume_out_filename( filetype="seed", liketype="chimeric" ) - ``` - Example output: - ``` + >>> print(filename) "experiment/001/normal/intermediate/000000123.000000000.1.parquet" - ``` """ prefix = f"{flepi_prefix}/{flepi_run_index}" inference_filepath_suffix = f"{liketype}/intermediate" @@ -700,21 +694,17 @@ def create_resume_input_filename( Returns: The path to the a corresponding input file. - Example: + Examples: Generate an input file with specified parameters: - ``` - filename = create_resume_input_filename( + >>> filename = create_resume_input_filename( resume_run_index="2", flepi_prefix="model_output/run_id/", flepi_slot_index="1", filetype="seed", liketype="chimeric" ) - ``` - Example output: - ``` + >>> print(filename) "experiment/002/normal/final/789.csv" - ``` """ prefix = f"{flepi_prefix}/{resume_run_index}" inference_filepath_suffix = f"{liketype}/final" @@ -746,24 +736,16 @@ def get_filetype_for_resume(resume_discard_seeding: str, flepi_block_index: str) Returns: List of file types. 
- Example: + Examples: Determine file types for block index 1 with seeding data NOT discarded: - ``` - filetypes = get_filetype_for_resume(resume_discard_seeding="false", flepi_block_index="1") - ``` - Output of `print(filetypes)`: - ``` + >>> filetypes = get_filetype_for_resume(resume_discard_seeding="false", flepi_block_index="1") + >>> print(filetypes) ["seed", "spar", "snpi", "hpar", "hnpi", "init"] - ``` Determine file types for block index 2 with seeding data discarded: - ``` - filtypes = get_filetype_for_resume(resume_discard_seeding="true", flepi_block_index="2") - ``` - Output of `print(filetypes)`: - ``` + >>> filtypes = get_filetype_for_resume(resume_discard_seeding="true", flepi_block_index="2") + >>> print(filetypes) ["seed", "spar", "snpi", "hpar", "hnpi", "host", "llik", "init"] - ``` """ if flepi_block_index == "1": if resume_discard_seeding == "true": @@ -782,7 +764,7 @@ def create_resume_file_names_map( flepi_slot_index, flepi_run_index, last_job_output, -) -> dict: +) -> dict[str, str]: """ Generates a mapping of input file names to output file names for a resume process based on parquet file types and environmental conditions. The function adjusts the file name mappings @@ -797,7 +779,7 @@ def create_resume_file_names_map( flepi_run_index: flepiMoP run index. last_job_output: Adjusts the keys in the mapping to be prefixed with this path. - Return: + Returns: A dictionary where keys are input file paths and values are corresponding output file paths. @@ -815,10 +797,9 @@ def create_resume_file_names_map( functions and environment variables which if improperly configured could lead to unexpected behavior. - Example: + Examples: Generate a mapping of file names for a given resume process: - ``` - file_names_map = create_resume_file_names_map( + >>> file_names_map = create_resume_file_names_map( resume_discard_seeding="false", flepi_block_index="1", resume_run_index="1", @@ -826,10 +807,17 @@ def create_resume_file_names_map( flepi_slot_index="1", flepi_run_index="test_run", last_job_output="s3://bucket/path/") - ``` + >>> print(file_names_map) + { + 's3://bucket/path/model_output/run_id/1_type1_global_1.in': 'model_output/run_id/test_run_type1_global_1_1.out', + 's3://bucket/path/model_output/run_id/1_type1_chimeric_1.in': 'model_output/run_id/test_run_type1_chimeric_1_1.out', + 's3://bucket/path/model_output/run_id/1_type2_global_1.in': 'model_output/run_id/test_run_type2_global_1_1.out', + 's3://bucket/path/model_output/run_id/1_type2_chimeric_1.in': 'model_output/run_id/test_run_type2_chimeric_1_1.out' + } + # Note: this output is toy output implemented with toy file names. Notes: - - The paths may be modified by the 'LAST_JOB_OUTPUT' if it is set and points to an S3 location. + - The paths may be modified by the 'LAST_JOB_OUTPUT' if it is set and points to an S3 location. """ file_types = get_filetype_for_resume( resume_discard_seeding=resume_discard_seeding, flepi_block_index=flepi_block_index @@ -865,7 +853,7 @@ def create_resume_file_names_map( return resume_file_name_mapping -def download_file_from_s3(name_map: dict) -> None: +def download_file_from_s3(name_map: dict[str, str]) -> None: """ Downloads files from AWS S3 based on a mapping of S3 URIs to local file paths. The function checks if the directory for the first output file exists and creates it if necessary. 
It @@ -929,7 +917,7 @@ def download_file_from_s3(name_map: dict) -> None: print("Could not download file from s3") -def move_file_at_local(name_map: Dict[str, str]) -> None: +def move_file_at_local(name_map: dict[str, str]) -> None: """ Moves files locally according to a given mapping. This function takes a dictionary where the keys are source file paths and @@ -938,7 +926,7 @@ def move_file_at_local(name_map: Dict[str, str]) -> None: destination paths. Args: - name_map (Dict[str, str]): A dictionary mapping source file paths to destination file paths. + name_map: A dictionary mapping source file paths to destination file paths. """ for src, dst in name_map.items(): os.path.makedirs(os.path.dirname(dst), exist_ok=True)
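As a complement to the examples added in the later commits, here is a short sketch of how the `Timer` context manager and `command_safe_run`, both documented in this series, might be combined. It assumes `gempyor` is importable; the `echo` command is only a placeholder.

```python
# Illustrative sketch: timing a shell command with the Timer context manager and
# command_safe_run. Assumes gempyor is importable; "echo hello" is only a placeholder.
from gempyor.utils import Timer, command_safe_run

with Timer("run command"):
    returncode, stdout, stderr = command_safe_run(
        "echo hello", command_name="echo", fail_on_fail=False
    )
print(returncode)  # 0 on success; stdout and stderr hold the captured streams
```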