From 0cea1d1c83589e061286b914e7fb820909ae3807 Mon Sep 17 00:00:00 2001 From: jbout Date: Mon, 23 Sep 2024 22:40:21 +0300 Subject: [PATCH] internalize the mean computation --- src/qumin/calc_paradigm_entropy.py | 16 ++++++++++------ src/qumin/config/qumin.yaml | 3 ++- src/qumin/entropy/distribution.py | 5 +++-- src/qumin/representations/__init__.py | 2 +- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/qumin/calc_paradigm_entropy.py b/src/qumin/calc_paradigm_entropy.py index 0d40900..06df46f 100644 --- a/src/qumin/calc_paradigm_entropy.py +++ b/src/qumin/calc_paradigm_entropy.py @@ -11,6 +11,8 @@ from .entropy.distribution import PatternDistribution, SplitPatternDistribution from .representations import segments, patterns, create_paradigms, create_features +from .representations.frequencies import Frequencies +from itertools import permutations log = logging.getLogger() @@ -24,6 +26,8 @@ def H_command(cfg, md): cfg.patterns) == 2, "You must pass either a single dataset and patterns file, or a list of two of each (coindexed)." md.bipartite = True + Frequencies.initialize(md.datasets[0], real=True) + patterns_file_path = cfg.patterns if md.bipartite else [cfg.patterns] sounds_file_name = md.get_table_path("sounds") @@ -86,10 +90,10 @@ def H_command(cfg, md): features=features) distrib.mutual_information() - mean1 = distrib.distribs[0].get_results().loc[:, "value"].mean() - mean2 = distrib.distribs[1].get_results().loc[:, "value"].mean() - mean3 = distrib.get_results(measure="mutual_information").loc[:, "value"].mean() - mean4 = distrib.get_results(measure="normalized_mutual_information").loc[:, "value"].mean() + mean1 = distrib.distribs[0].get_mean() + mean2 = distrib.distribs[1].get_mean() + mean3 = distrib.get_mean(measure="mutual_information") + mean4 = distrib.get_mean(measure="normalized_mutual_information") log.debug("Mean remaining H(c1 -> c2) for %s = %s", names[0], mean1) log.debug("Mean remaining H(c1 -> c2) for %s = %s", names[1], mean2) log.debug("Mean I(%s,%s) = %s", *names, mean3) @@ -109,7 +113,7 @@ def H_command(cfg, md): if onePred: if not md.bipartite: # Already computed in bipartite systems :) distrib.one_pred_entropy() - mean = distrib.get_results().loc[:, "value"].mean() + mean = distrib.get_mean() log.info("Mean H(c1 -> c2) = %s ", mean) if verbose: distrib.one_pred_distrib_log() @@ -119,7 +123,7 @@ def H_command(cfg, md): for n in preds: distrib.n_preds_entropy_matrix(n) - mean = distrib.get_results(n=n).loc[:, "value"].mean() + mean = distrib.get_mean(n=n) log.info(f"Mean H(c1, ..., c{n} -> c) = {mean}") if verbose: diff --git a/src/qumin/config/qumin.yaml b/src/qumin/config/qumin.yaml index f11385c..f429bf7 100644 --- a/src/qumin/config/qumin.yaml +++ b/src/qumin/config/qumin.yaml @@ -69,7 +69,8 @@ entropy: # with any file, use to compute entropy heatmap # with n-1 predictors, allows for acceleration on nPreds entropy computation. merged: False # Whether identical columns are merged in the input. - stacked: False # whether to stack results in long form + stacked: False # whether to stack results in long form. + weighting: True # whether to use cell frequencies for weighting. eval: iter: 10 # How many 90/10 train/test folds to do. diff --git a/src/qumin/entropy/distribution.py b/src/qumin/entropy/distribution.py index 8bf6855..48ea7ac 100644 --- a/src/qumin/entropy/distribution.py +++ b/src/qumin/entropy/distribution.py @@ -98,10 +98,11 @@ def __init__(self, paradigms, patterns, classes, name, features=None): "dataset" ]) - def get_results(self, measure="cond_entropy", n=1): + def get_mean(self, measure="cond_entropy", n=1): is_cond_ent = self.data.loc[:, "measure"] == measure is_one_pred = self.data.loc[:, "n_preds"] == n - return self.data.loc[is_cond_ent & is_one_pred, :] + + return self.data.loc[is_cond_ent & is_one_pred, "value"].mean() def export_file(self, filename): """ Export the data DataFrame to file diff --git a/src/qumin/representations/__init__.py b/src/qumin/representations/__init__.py index 6eee056..b93c7c4 100644 --- a/src/qumin/representations/__init__.py +++ b/src/qumin/representations/__init__.py @@ -87,7 +87,7 @@ def get_unknown_segments(forms, unknowns, name): paradigms = paradigms[~paradigms.loc[:, lexemes].isin(defective_lexemes)] if most_freq: - inflected = paradigms.loc[:,lexemes].unique() + inflected = paradigms.loc[:, lexemes].unique() lexemes_file_name = Path(dataset.basepath) / dataset.get_resource("lexemes").path lexemes_df = pd.read_csv(lexemes_file_name, usecols=["lexeme_id", "frequency"]) # Restrict to lexemes we have kept, if we dropped defectives