only calculate scores once for each frequency signature; close #30
ausgerechnet committed Nov 8, 2023 · 1 parent b85bd5c · commit fab2a2c
Showing 2 changed files with 29 additions and 20 deletions.
16 changes: 12 additions & 4 deletions — `association_measures/measures.py`

```diff
@@ -4,7 +4,7 @@
 """
 
 import numpy as np
-from pandas import concat
+from pandas import concat, merge
 from scipy.stats import norm, beta
 from warnings import warn
 
@@ -95,14 +95,22 @@ def score(df, measures=None, f1=None, N=None, N1=None, N2=None,
     else:
         measures = [ams_all[k] for k in ams_all]
 
+    # reduce df to unique frequency signatures
     vocab = len(df) if vocab is None else vocab
+    df_reduced = df.drop_duplicates(subset=list(freq_columns)).copy()
 
     # calculate measures
     for measure in measures:
-        df[measure.__name__] = measure(
-            df, disc=disc, discounting=discounting, signed=signed, alpha=alpha,
+        df_reduced[measure.__name__] = measure(
+            df_reduced, disc=disc, discounting=discounting, signed=signed, alpha=alpha,
             correct=correct, boundary=boundary, vocab=vocab, one_sided=one_sided
         )
 
-    # frequency columns?
+    # join on frequency columns (NB: thanks to pandas API, we have to take care of index names ourselves)
+    index_names = ['index'] if df.index.names == [None] else df.index.names
+    df = merge(df.reset_index(), df_reduced, how='left', on=list(freq_columns)).set_index(index_names)
+
+    # keep frequency columns?
     if not freq:
         df = df.drop(freq_columns, axis=1)
     else:
```
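The change follows a general pandas pattern: evaluate an expensive row-wise function only once per unique key, then broadcast the results back onto the full dataframe with a merge that preserves the original index (which `pandas.merge` would otherwise discard). A minimal standalone sketch of this pattern — the column names and `toy_score` are illustrative, not the package's API:

```python
import numpy as np
import pandas as pd

def toy_score(df):
    # stand-in for an association measure: depends only on the frequency columns
    return np.log((df['f'] + 1) / (df['f2'] + 1))

df = pd.DataFrame({
    'f':  [1, 2, 1, 3],
    'f2': [5, 5, 5, 7],
}, index=['a', 'b', 'c', 'd'])

freq_columns = ['f', 'f2']

# 1. reduce to unique frequency signatures (rows 'a' and 'c' share (1, 5))
df_reduced = df.drop_duplicates(subset=freq_columns).copy()

# 2. score the reduced frame: one evaluation per signature
df_reduced['score'] = toy_score(df_reduced)

# 3. merge back onto all rows; reset_index()/set_index() keeps the original
#    index alive across the merge (handling the unnamed-index case explicitly)
index_names = ['index'] if df.index.names == [None] else list(df.index.names)
out = pd.merge(df.reset_index(), df_reduced, how='left',
               on=freq_columns).set_index(index_names)

print(out)
```

Rows `'a'` and `'c'` share a signature, so the measure is evaluated three times instead of four; with thousands of duplicated signatures (4,241 in brown.csv, per the commit) the saving becomes substantial.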
33 changes: 17 additions & 16 deletions — `performance.md`

````diff
@@ -1,27 +1,28 @@
 # Performance
 - performance is calculated on a Lenovo X1 Carbon (10th generation, i7)
 - input data are 24,167 observations from [brown.csv](tests/data/brown.csv)
-- we report for 1000 iterations
+- NB: dataframe contains 4241 duplicated frequency signatures (for which calculation will only be run once since v0.2.7)
+- for each measure, we report time needed for 1000 scorings of the whole dataframe
 
 ## v0.2.7
 - major performance improvement regarding conservative log-ratio with Poisson boundary (factor 50)
 ```
 settings: iterations=1000, df_size=24167
-- 0.0874 :: contingency_table
-- 1.5254 :: expected_frequencies
-- 0.1510 :: z_score
-- 0.2906 :: t_score
-- 1.7408 :: log_likelihood
-- 0.6146 :: simple_ll
-- 1.3270 :: min_sensitivity
-- 0.2604 :: liddell
-- 0.2502 :: dice
-- 0.4494 :: log_ratio
-- 4.6467 :: binomial_likelihood
-- 2.1923 :: conservative_log_ratio
-- 31.2882 :: conservative_log_ratio_poisson
-- 0.3840 :: mutual_information
-- 0.4441 :: local_mutual_information
+- 0.0871 :: contingency_table
+- 1.5258 :: expected_frequencies
+- 0.1507 :: z_score
+- 0.2899 :: t_score
+- 1.7406 :: log_likelihood
+- 0.6125 :: simple_ll
+- 1.2981 :: min_sensitivity
+- 0.2584 :: liddell
+- 0.2491 :: dice
+- 0.4460 :: log_ratio
+- 4.5788 :: binomial_likelihood
+- 2.1891 :: conservative_log_ratio
+- 29.8616 :: conservative_log_ratio_poisson
+- 0.3702 :: mutual_information
+- 0.4314 :: local_mutual_information
 ```
 
 ## v0.2.6
````
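Timings of this kind ("time needed for 1000 scorings of the whole dataframe") can be produced with a plain `timeit` harness; a sketch under assumed names — the `z_score` stand-in below is a toy, not the package's implementation:

```python
import timeit

import numpy as np
import pandas as pd

def z_score(df):
    # toy stand-in for an association measure on observed/expected counts
    return (df['O11'] - df['E11']) / np.sqrt(df['E11'])

df = pd.DataFrame({
    'O11': np.arange(1, 1001, dtype=float),
    'E11': np.full(1000, 5.0),
})

# total time for 1000 scorings of the whole dataframe,
# mirroring the "seconds :: measure" lines in performance.md
elapsed = timeit.timeit(lambda: z_score(df), number=1000)
print(f"- {elapsed:.4f} :: z_score")
```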
