From fd8aff4d501a2e416f992ef07cb4d7a645079831 Mon Sep 17 00:00:00 2001 From: Cecilia Sensalari <57489957+Cecilia-Sensalari@users.noreply.github.com> Date: Thu, 1 Jun 2023 09:49:16 +0200 Subject: [PATCH] Bugfix for exponential-lognormal mixture model: remove weight rounding during EM iterations (#50) * Remove weight rounding in M-step of ELMM * Remove several roundings for printed parameters * Remove rounding for printed parameters in LMM * Test pipeline: Nextflow version compatible DSL1 * 600 default EM iterations; missing max_iter in LMM --- .github/workflows/test_pipeline.yml | 2 +- doc/source/configuration.rst | 4 ++-- doc/source/paralogs_analyses.rst | 4 ++-- example/config_expert.txt | 2 +- ksrates/cluster_anchor_ks.py | 2 +- ksrates/exp_log_mixture.py | 2 +- ksrates/fc_configfile.py | 20 ++++++++++---------- ksrates/fc_exp_log_mixture.py | 24 ++++++++++-------------- ksrates/fc_lognormal_mixture.py | 22 +++++++++++----------- ksrates/lognormal_mixture.py | 2 +- 10 files changed, 40 insertions(+), 44 deletions(-) diff --git a/.github/workflows/test_pipeline.yml b/.github/workflows/test_pipeline.yml index e55dd2f..904c263 100644 --- a/.github/workflows/test_pipeline.yml +++ b/.github/workflows/test_pipeline.yml @@ -37,7 +37,7 @@ jobs: run: | cd test mv nextflow.config custom_nextflow.config - nextflow run ../main.nf --config config_elaeis.txt -with-docker ksrates + NXF_VER=21.10.6 nextflow run ../main.nf --config config_elaeis.txt -with-docker ksrates - name: Visualize output files if: ${{ always() }} diff --git a/doc/source/configuration.rst b/doc/source/configuration.rst index 723f370..0065cea 100644 --- a/doc/source/configuration.rst +++ b/doc/source/configuration.rst @@ -221,7 +221,7 @@ The following can be used as a template:: kde_bandwidth_modifier = 0.4 plot_adjustment_arrows = no num_mixture_model_initializations = 10 - max_mixture_model_iterations = 300 + max_mixture_model_iterations = 600 max_mixture_model_components = 5 max_mixture_model_ks = 5 extra_paralogs_analyses_methods = no @@ -232,7 +232,7 @@ The following can be used as a template:: * **kde_bandwidth_modifier**: modifier to adjust the fitting of the KDE curve on the underlying whole-paranome or anchor *K*:sub:`S` distribution. The KDE Scott's factor internally computed by SciPy tends to produce an overly smooth KDE curve, especially with steep WGD peaks, and therefore it is reduced by multiplying it by a modifier. Decreasing the modifier leads to tighter fits, increasing it leads to smoother fits, and setting it to 1 gives the default KDE factor. Note that a too small factor is likely to take into account data noise. [Default: 0.4] * **plot_adjustment_arrows**: flag to toggle the plotting of rate-adjustment arrows below the adjusted mixed paralog--ortholog *K*:sub:`S` plot. These arrows start from the original unadjusted ortholog divergence *K*:sub:`S` estimate and end on the rate-adjusted estimate (options: "yes" and "no"). [Default: "no"] * **num_mixture_model_initializations**: number of times the EM algorithm is initialized (either for the random initialization in the exponential-lognormal mixture model or for k-means in the lognormal mixture model). [Default: 10] -* **max_mixture_model_iterations**: maximum number of EM iterations for mixture modeling. [Default: 300] +* **max_mixture_model_iterations**: maximum number of EM iterations for mixture modeling. [Default: 600] * **max_mixture_model_components**: maximum number of components considered during execution of the mixture models. [Default: 5] * **max_mixture_model_ks**: upper limit for the *K*:sub:`S` range in which the exponential-lognormal and lognormal-only mixture models are performed. [Default: 5] * **extra_paralogs_analyses_methods**: flag to toggle the optional analysis of the paralog *K*:sub:`S` distribution with non default mixture model methods (see section :ref:`paralogs_analyses` and Supplementary Materials) [Default: "no"] diff --git a/doc/source/paralogs_analyses.rst b/doc/source/paralogs_analyses.rst index 722fde1..e67b05b 100644 --- a/doc/source/paralogs_analyses.rst +++ b/doc/source/paralogs_analyses.rst @@ -71,7 +71,7 @@ Among its outputs (for a complete list see section :ref:`output_files`), the ELM * The model initialization approach is stored in column 1 ``Model`` according to a numerical code (1: data-driven, 2: hybrid data-driven plus a random lognormal component, 3: random initialization with exponential component and one lognormal component, 4: random initialization with exponential component and two lognormal components; higher numbers feature random initialization with exponential component and increasing number of lognormal components). * The initialization round is stored in column 2 ``Initialization``. By default each model type (except type 1) is initialized and fitted 10 times, so this column shows numbers from 1 to 10. * The BIC and loglikelihood scores for the fitted model are stored in columns 3 ``BIC`` and 4 ``Loglikelihood``. - * The number of algorithm iterations needed to reach convergence is stored in column 5 ``Convergence``. If greater than 300 iterations would be needed, convergence is not reached and the cell will show *NA*. + * The number of algorithm iterations needed to reach convergence is stored in column 5 ``Convergence``. If greater than (by default) 600 iterations would be needed, convergence is not reached and the cell will show *NA*. * The fitted exponential rate parameter and its component weight are stored in columns 6 ``Exponential_Rate`` and 7 ``Exponential_Weight``. * The mean, standard deviation and weight of the fitted Normal components used to define the correspondent lognormal components are stored in columns 8 to 10: ``Normal_Mean``, ``Normal_SD`` and ``Normal_Weight``. When there are multiple lognormal components, the data for each of them are stored in a separate row (the number of rows for each model and initialization is thus equal to the number of lognormal components). @@ -106,7 +106,7 @@ Among its outputs (for a complete list see section :ref:`output_files`), the LMM * The model type is stored in column 1 ``Model`` according to a numerical code (1: one lognormal component, 2: two lognormal components, 3: three lognormal components; and so on). * The BIC and loglikelihood scores for the fitted model are stored in columns 2 ``BIC`` and 3 ``Loglikelihood`` - * The number of algorithm iterations needed to reach convergence is stored in column 4 ``Convergence``. If greater than 300 iterations would be needed, convergence is not reached and the cell will show *NA*. + * The number of algorithm iterations needed to reach convergence is stored in column 4 ``Convergence``. If greater than (by default) 600 iterations would be needed, convergence is not reached and the cell will show *NA*. * The mean, standard deviation and weight of the fitted Normal components used to define the correspondent lognormal components are stored in columns 5 to 7: ``Normal_Mean``, ``Normal_SD`` and ``Normal_Weight``. When there are multiple lognormal components, the data for each of them are stored in a separate row (the number of rows per model is thus equal to the number of components). diff --git a/example/config_expert.txt b/example/config_expert.txt index 68b31ce..b4aa38e 100644 --- a/example/config_expert.txt +++ b/example/config_expert.txt @@ -3,7 +3,7 @@ logging_level = info kde_bandwidth_modifier = 0.4 plot_adjustment_arrows = no -max_mixture_model_iterations = 300 +max_mixture_model_iterations = 600 num_mixture_model_initializations = 10 extra_paralogs_analyses_methods = no max_mixture_model_components = 5 diff --git a/ksrates/cluster_anchor_ks.py b/ksrates/cluster_anchor_ks.py index 645a6d8..41bf013 100644 --- a/ksrates/cluster_anchor_ks.py +++ b/ksrates/cluster_anchor_ks.py @@ -37,7 +37,7 @@ def cluster_anchor_ks(config_file, expert_config_file, correction_table_file, pa color_list = config.get_color_list() plot_correction_arrows = config.plot_correction_arrows() - max_EM_iterations = config.get_max_EM_iterations() # default 300 + max_EM_iterations = config.get_max_EM_iterations() # default 600 num_EM_initializations = config.get_num_EM_initializations() # how many times the fitting with N given components is initialized logging.info(f" - maximum EM iterations: {max_EM_iterations}") logging.info(f" - number of EM initializations: {num_EM_initializations}") diff --git a/ksrates/exp_log_mixture.py b/ksrates/exp_log_mixture.py index 83c33ee..b92cba8 100644 --- a/ksrates/exp_log_mixture.py +++ b/ksrates/exp_log_mixture.py @@ -44,7 +44,7 @@ def exp_log_mixture(config_file, expert_config_file, paralog_tsv_file, correctio # Parameters used during the mixture modeling max_ks_EM = config.get_max_ks_for_mixture_model(max_ks_para) # upper Ks limit considered for the mixture model fitting - max_EM_iterations = config.get_max_EM_iterations() # default 300 + max_EM_iterations = config.get_max_EM_iterations() # default 600 num_EM_initializations = config.get_num_EM_initializations() # how many times the fitting with N given components is initialized # in the random method and in the "peak + random" method). default 10 max_num_comp = config.get_max_mixture_model_components() # max number of components used in the fitting with random components (exp + buffer lognormal + lognormal) diff --git a/ksrates/fc_configfile.py b/ksrates/fc_configfile.py index c4d6031..0eb7459 100644 --- a/ksrates/fc_configfile.py +++ b/ksrates/fc_configfile.py @@ -607,7 +607,7 @@ def get_kde_bandwidth_modifier(self): def get_max_EM_iterations(self): """ - Gets the maximum number of EM iterations to apply during mixture modeling [Default: 300]. + Gets the maximum number of EM iterations to apply during mixture modeling [Default: 600]. :return max_mixture_model_iterations: max number of iterations for the exponential-maximization algorithm @@ -620,20 +620,20 @@ def get_max_EM_iterations(self): except Exception: pass if not isinstance(max_iter, int): - logging.warning(f'Unrecognized field in expert configuration file [max_mixture_model_iterations = {max_iter}]. Please choose a positive integer. Default choice will be applied [300]') - max_iter = 300 + logging.warning(f'Unrecognized field in expert configuration file [max_mixture_model_iterations = {max_iter}]. Please choose a positive integer. Default choice will be applied [600]') + max_iter = 600 elif max_iter <= 0: - logging.warning(f'Unrecognized field in expert configuration file [max_mixture_model_iterations = {max_iter}]. Please choose a positive integer. Default choice will be applied [300]') - max_iter = 300 - elif max_iter < 100: + logging.warning(f'Unrecognized field in expert configuration file [max_mixture_model_iterations = {max_iter}]. Please choose a positive integer. Default choice will be applied [600]') + max_iter = 600 + elif max_iter <= 300: logging.warning(f"A small maximum number of mixture model iterations [max_mixture_model_iterations = {max_iter}] can produce poor fitting.") - elif max_iter > 400: + elif max_iter > 600: logging.warning(f"A large maximum number of mixture model iterations [max_mixture_model_iterations = {max_iter}] can increase the runtime.") except Exception: - logging.warning(f'Missing field in expert configuration file [max_mixture_model_iterations]. Please choose a positive integer. Default choice will be applied [300]') - max_iter = 300 + logging.warning(f'Missing field in expert configuration file [max_mixture_model_iterations]. Please choose a positive integer. Default choice will be applied [600]') + max_iter = 600 else: - max_iter = 300 + max_iter = 600 return max_iter diff --git a/ksrates/fc_exp_log_mixture.py b/ksrates/fc_exp_log_mixture.py index 255d4a0..5206e98 100644 --- a/ksrates/fc_exp_log_mixture.py +++ b/ksrates/fc_exp_log_mixture.py @@ -588,7 +588,7 @@ def m_step(num_comp, data, posteriors): new_weights = [] for k in range(0, num_comp): weight = points_per_k[k] / len(data) - new_weights.append(round(weight, 2)) + new_weights.append(weight) # Computes updated means and stdevs of the lognormal components new_means, new_stdevs = [], [] @@ -607,7 +607,7 @@ def em(num_comp, max_iter, ks_data_proxy, init_lambd, init_means, init_stdevs, i """ Performs the EM algorithm with a given number of components, their parameters on the proxy dataset for the weighted Ks paranome (the deconvoluted data). If convergence of the algorithm based on - threshold of 1e-6 is not reached in 300 iterations, it prints a warning. + threshold of 1e-6 is not reached in 600 iterations, it prints a warning. :param num_comp: number of components (1 exponential distribution + N lognormal components) :param max_iter: maximum number of iterations for the EM steps @@ -713,12 +713,12 @@ def print_details_model(model_id, model_iteration, max_model_iteration, max_num_ if convergence_flag: outfile.write(f"The EM algorithm has reached convergence after {convergence_iteration} iterations\n") else: - outfile.write(f"The EM algorithm didn't reach convergence after {max_em_iteration} iterations (diff is ~{round(loglik_diff, 2)})\n") + outfile.write(f"The EM algorithm didn't reach convergence after {max_em_iteration} iterations (diff is {loglik_diff})\n") outfile.write("\n") - print_parameters("fitted", round(fitted_means, 2), round(fitted_stdevs, 2), round(fitted_lambd, 2), round(fitted_weights, 2), outfile) - outfile.write(f"Log-likelihood: {round(final_loglik, 3)}\n") - outfile.write(f"BIC: {round(bic, 3)}\n") + print_parameters("fitted", fitted_means, fitted_stdevs, fitted_lambd, fitted_weights, outfile) + outfile.write(f"Log-likelihood: {final_loglik}\n") + outfile.write(f"BIC: {bic}\n") # if EM_data_random or EM_random: if (model_id == 2 or model_id == 5) and model_iteration == max_model_iteration: @@ -736,8 +736,8 @@ def print_details_model(model_id, model_iteration, max_model_iteration, max_num_ else: model_iteration = int(model_iteration) for mean, stdev, weight in zip(fitted_means, fitted_stdevs, fitted_weights[1:]): - parameter_table.append([model_id, model_iteration, round(bic, 3), round(final_loglik, 3), convergence_iteration, - round(fitted_lambd, 2), fitted_weights[0], round(mean, 2), round(stdev, 2), weight]) + parameter_table.append([model_id, model_iteration, bic, final_loglik, convergence_iteration, + fitted_lambd, fitted_weights[0], mean, stdev, weight]) def make_parameter_table_file(parameter_table, species): @@ -773,11 +773,7 @@ def print_parameters(starting_or_fitted, means, stdevs, lambd, weights, outfile) outfile.write(f" EXP : {lambd}\n") for n in range(len(means)): outfile.write(f" NORM {n+1}: {means[n]} +- {stdevs[n]}\n") - rounded_weights = [] - for w in weights: - w = round(w, 2) - rounded_weights.append(w) - outfile.write(f" WEIGHT: {rounded_weights}\n") + outfile.write(f" WEIGHT: {weights}\n") def plot_init_comp(ax_ks, ax_logks, means, stdevs, lambd, weights, plot_logtranformed=True): @@ -986,7 +982,7 @@ def eval_best_model(bic_dict, outfile): if delta_bic > 2: j = 1 if delta_bic > 6: j = 2 if delta_bic > 10: j = 3 - outfile.write(f" Model {m}: delta(BIC) = {round(delta_bic, 2):>8} ({l[j]})\n") + outfile.write(f" Model {m}: delta(BIC) = {delta_bic:>8} ({l[j]})\n") outfile.write("\n") return best_model_id diff --git a/ksrates/fc_lognormal_mixture.py b/ksrates/fc_lognormal_mixture.py index d3cea6d..7eb408b 100644 --- a/ksrates/fc_lognormal_mixture.py +++ b/ksrates/fc_lognormal_mixture.py @@ -91,7 +91,7 @@ def inspect_bic(bic, outfile): outfile.write("\n") -def log_components(X, model_id, m, outfile, parameter_table, max_iter=300): +def log_components(X, model_id, m, outfile, parameter_table, max_iter): """ Modified from wgd. @@ -112,20 +112,20 @@ def log_components(X, model_id, m, outfile, parameter_table, max_iter=300): outfile.write("FITTED PARAMETERS:\n") for j in range(len(m.means_)): - outfile.write(f" NORM {j+1}: {round(m.means_[j][0], 2)} +- {round(sqrt(m.covariances_[j][0][0]), 2)}\n") + outfile.write(f" NORM {j+1}: {m.means_[j][0]} +- {sqrt(m.covariances_[j][0][0])}\n") - parameter_table.append([model_id, round(m.bic(X), 3), round(m.lower_bound_, 3), convergence, - round(m.means_[j][0], 2), round(m.covariances_[j][0][0], 2), round(m.weights_[j], 2)]) + parameter_table.append([model_id, m.bic(X), m.lower_bound_, convergence, + m.means_[j][0], m.covariances_[j][0][0], m.weights_[j]]) - rounded_weights = [] + weight_list = [] for w in m.weights_: - rounded_weights.append(round(w, 2)) - outfile.write(f" WEIGHT: {rounded_weights}\n") - outfile.write(f"Log-likelihood: {round(m.lower_bound_, 3)}\n") - outfile.write(f"BIC: {round(m.bic(X), 3)}\n\n") + weight_list.append(w) + outfile.write(f" WEIGHT: {weight_list}\n") + outfile.write(f"Log-likelihood: {m.lower_bound_}\n") + outfile.write(f"BIC: {m.bic(X)}\n\n") -def fit_gmm(X, n1, n2, outfile, parameter_table, max_iter=300, n_init=1, **kwargs): +def fit_gmm(X, n1, n2, outfile, parameter_table, max_iter=600, n_init=1, **kwargs): """ Modified from wgd. Compute Gaussian mixtures for different numbers of components @@ -150,7 +150,7 @@ def fit_gmm(X, n1, n2, outfile, parameter_table, max_iter=300, n_init=1, **kwarg n_components=N[i], covariance_type='full', max_iter=max_iter, n_init=n_init, tol=1e-6, **kwargs ).fit(X) - log_components(X, i+1, models[i], outfile, parameter_table) + log_components(X, i+1, models[i], outfile, parameter_table, max_iter) else: logging.warning(f"Lognormal mixture model with {N[i]} or more components is skipped due to too few input Ks data points") break diff --git a/ksrates/lognormal_mixture.py b/ksrates/lognormal_mixture.py index 9123220..1eee2e6 100644 --- a/ksrates/lognormal_mixture.py +++ b/ksrates/lognormal_mixture.py @@ -41,7 +41,7 @@ def lognormal_mixture(config_file, expert_config_file, paralog_tsv_file, anchors # Parameters used during the mixture modeling max_ks_EM = config.get_max_ks_for_mixture_model(max_ks_para) # upper Ks limit considered for the mixture model fitting - max_EM_iterations = config.get_max_EM_iterations() # default 300 + max_EM_iterations = config.get_max_EM_iterations() # default 600 num_EM_initializations = config.get_num_EM_initializations() # how many time k-means is performed. Default 10. max_num_comp = config.get_max_mixture_model_components()