v0.3.0 #35

Merged
merged 12 commits on Jun 27, 2024
2 changes: 1 addition & 1 deletion .github/workflows/python-build.yml
@@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: [ 3.7, 3.8, 3.9, "3.10" ]
+ python-version: [ 3.8, 3.9, "3.10", "3.11", "3.12" ]

steps:
- uses: actions/checkout@v2
4 changes: 2 additions & 2 deletions .github/workflows/python-publish.yml
@@ -10,10 +10,10 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- - name: Set up Python 3.8
+ - name: Set up Python 3.10
uses: actions/setup-python@v1
with:
- python-version: '3.8'
+ python-version: '3.10'
- name: Install build dependencies
run: |
python -m pip install --upgrade pip
4 changes: 2 additions & 2 deletions CITATION.cff
@@ -7,6 +7,6 @@ authors:
- family-names: "Opolka"
given-names: "Markus"
title: "Pandas Association Measures"
- version: 0.2.6
- date-released: 2022-10-15
+ version: 0.3.0
+ date-released: 2024-06-27
url: "https://github.com/fau-klue/pandas-association-measures"
19 changes: 10 additions & 9 deletions Pipfile
@@ -4,14 +4,15 @@ url = "https://pypi.org/simple"
verify_ssl = true

[dev-packages]
- pytest = "==7.0.1"
- pylint = "==2.13.9"
- pytest-cov = "==3.0.0"
- twine = "==3.7.1"
- setuptools = "==59.6.0"
- cython = "==0.29.30"
+ pytest = "==7.4.0"
+ pylint = "==2.17.5"
+ pytest-cov = "==4.1.0"
+ twine = "==4.0.2"
+ setuptools = "==68.0.0"
+ cython = "==3.0.0"

[packages]
- wheel = ">=0.37.1"
- pandas = ">=1.1.5"
- scipy = ">=1.5.4"
+ wheel = ">=0.43.0,<0.44"
+ pandas = ">=2.0,<3.0"
+ numpy = ">=1.24,<2.0"
+ scipy = ">=1.10.0"
23 changes: 23 additions & 0 deletions README.md
@@ -228,6 +228,29 @@ particularly 1.059386
arrived 3.879126
```

## Topographic Maps

**New since version 0.3**: You can use `association_measures.grids.topography` to create a dataframe for visualising association measures as topographic maps. It yields a logarithmically scaled grid of reasonable sampling points for all combinations of `f1` and `f2`, with the values of all association measures at each point (the corpus sizes `N1` and `N2` determine the upper ends of the two axes).
```python3
>>> from association_measures.grids import topography
>>> topography(N1=10e6, N2=10e6)
O11 O12 O21 O22 R1 R2 C1 C2 N E11 ... dice log_ratio conservative_log_ratio mutual_information local_mutual_information ipm ipm_reference ipm_expected clr_normal log_ratio_hardie
index ...
0 0 10000000.0 0 10000000.0 10000000.0 10000000.0 0 20000000.0 20000000.0 0.0 ... 0.000000 0.000000 0.000000 inf NaN 0.0 0.0 0.00 0.000000 0.000000
1 0 10000000.0 1 9999999.0 10000000.0 10000000.0 1 19999999.0 20000000.0 0.5 ... 0.000000 -9.967226 0.000000 -2.698970 0.000000 0.0 0.1 0.05 0.000000 -9.965784
2 0 10000000.0 2 9999998.0 10000000.0 10000000.0 2 19999998.0 20000000.0 1.0 ... 0.000000 -10.966505 0.000000 -3.000000 0.000000 0.0 0.2 0.10 0.000000 -10.965784
3 0 10000000.0 3 9999997.0 10000000.0 10000000.0 3 19999997.0 20000000.0 1.5 ... 0.000000 -11.551228 0.000000 -3.176091 -0.000000 0.0 0.3 0.15 0.000000 -11.550747
4 0 10000000.0 4 9999996.0 10000000.0 10000000.0 4 19999996.0 20000000.0 2.0 ... 0.000000 -11.966145 0.000000 -3.301030 -0.000000 0.0 0.4 0.20 0.000000 -11.965784
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
39995 10000000 0.0 7205937 2794063.0 10000000.0 10000000.0 17205937 2794063.0 20000000.0 8602968.5 ... 0.735134 0.472742 0.468813 0.065352 653516.672773 1000000.0 720593.7 860296.85 0.471159 0.472742
39996 10000000 0.0 7821100 2178900.0 10000000.0 10000000.0 17821100 2178900.0 20000000.0 8910550.0 ... 0.718879 0.354557 0.350718 0.050095 500954.884892 1000000.0 782110.0 891055.00 0.353215 0.354557
39997 10000000 0.0 8488779 1511221.0 10000000.0 10000000.0 18488779 1511221.0 20000000.0 9244389.5 ... 0.702031 0.236371 0.232619 0.034122 341217.643897 1000000.0 848877.9 924438.95 0.235298 0.236371
39998 10000000 0.0 9213457 786543.0 10000000.0 10000000.0 19213457 786543.0 20000000.0 9606728.5 ... 0.684616 0.118186 0.114514 0.017424 174244.829132 1000000.0 921345.7 960672.85 0.117443 0.118186
39999 10000000 0.0 10000000 0.0 10000000.0 10000000.0 20000000 0.0 20000000.0 10000000.0 ... 0.666667 0.000000 0.000000 0.000000 0.000000 1000000.0 1000000.0 1000000.00 0.000000 0.000000

[40000 rows x 29 columns]
```
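
Such a dataframe lends itself to an actual topographic visualisation, e.g. a heatmap of one measure over the two frequency axes. The following is a minimal sketch (not part of the package), assuming matplotlib is installed; it relies on the columns `O11` and `O21` holding the sampled values of `f1` and `f2`, as the output above suggests:
```python3
>>> import matplotlib.pyplot as plt
>>> from association_measures.grids import topography
>>> df = topography(N1=10e6, N2=10e6)
>>> heat = df.pivot(index='O21', columns='O11', values='log_ratio')  # one row per f2, one column per f1
>>> mesh = plt.pcolormesh(heat.columns, heat.index, heat.values, shading='nearest')
>>> plt.xscale('symlog')  # axes log-scaled, like the grid itself
>>> plt.yscale('symlog')
>>> plt.xlabel('f1')
>>> plt.ylabel('f2')
>>> plt.colorbar(mesh, label='log-ratio')
>>> plt.show()
```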

# Development

The package is tested using pylint and pytest.
65 changes: 65 additions & 0 deletions association_measures/grids.py
@@ -0,0 +1,65 @@
from itertools import product

from numpy import exp, linspace, log
from pandas import DataFrame

from .measures import score


def expand_grid(dictionary):
    """Create a grid of all value combinations of all keys of the dictionary

    """

    return DataFrame([row for row in product(*dictionary.values())],
                     columns=dictionary.keys())


def log_seq(to=10e6, length=200, exact=50):
    """Create a logarithmically scaled sequence

    """

    if length <= exact:
        raise ValueError("length must be greater than exact")

    # keep the first values (0, 1, .., exact) as exact integers,
    # then sample log-uniformly between exact and to
    length = length - exact

    return list(range(exact + 1)) + [int(exp(s)) for s in linspace(log(exact), log(to), length)]


def log_grid(N1=10e6, N2=10e6, length1=200, length2=200, exact1=50, exact2=50):
    """Create a logarithmically-scaled grid

    """
    return expand_grid({
        'f1': log_seq(N1, length1, exact=exact1),
        'f2': log_seq(N2, length2, exact=exact2)
    }).drop_duplicates().reset_index(drop=True)


def topography(N1=10e6, N2=10e6, length=200, length1=None, length2=None, exact=50, exact1=None, exact2=None):
    """Create logarithmically scaled grid and calculate scores

    """

    exact1 = exact if exact1 is None else exact1
    exact2 = exact if exact2 is None else exact2
    length1 = length if length1 is None else length1
    length2 = length if length2 is None else length2

    # support
    g = log_grid(N1=N1, N2=N2, length1=length1, length2=length2, exact1=exact1, exact2=exact2)

    # add scores
    scores = score(g, N1=N1, N2=N2)
    # .. add alternative for CLR
    scores['clr_normal'] = score(
        g, N1=N1, N2=N2, boundary='normal', measures=['conservative_log_ratio']
    )['conservative_log_ratio']
    # .. add alternative for log-ratio
    scores['log_ratio_hardie'] = score(
        g, N1=N1, N2=N2, discounting='Hardie2014', measures=['log_ratio']
    )['log_ratio']

    return scores
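
For orientation, here is a hypothetical interactive session with the helpers above; the numbers follow from the definitions (and the grid size matches the 40000 rows of the README example):
```python3
>>> from association_measures.grids import expand_grid, log_seq, log_grid
>>> expand_grid({'f1': [0, 1], 'f2': [10, 20]})  # Cartesian product as DataFrame
   f1  f2
0   0  10
1   0  20
2   1  10
3   1  20
>>> len(log_seq(to=10e6, length=200, exact=50))  # 0..50 exactly, then 150 log-spaced points
201
>>> log_grid().shape  # 200 distinct values per axis after dropping duplicates
(40000, 2)
```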
53 changes: 18 additions & 35 deletions association_measures/measures.py
@@ -3,15 +3,15 @@

"""

+ from warnings import warn
+
import numpy as np
from pandas import concat, merge
- from scipy.stats import norm, beta
- from warnings import warn
+ from scipy.stats import beta, norm

from .binomial import choose
from .frequencies import expected_frequencies, observed_frequencies


CHOOSE = np.vectorize(choose)


@@ -20,7 +20,6 @@ def list_measures():

:return: dictionary of measures
:rtype: dict
-
"""

return {
@@ -144,8 +143,7 @@ def calculate_measures(df, measures=None, freq=False, per_million=True, digits=N
###############################

def z_score(df, **kwargs):
"""
Calculate z-score
"""Calculate z-score

:param DataFrame df: DataFrame with columns O11 and E11
:return: z-score
@@ -158,8 +156,7 @@ def t_score(df, disc=.001, **kwargs):


def t_score(df, disc=.001, **kwargs):
"""
Calculate t-score
"""Calculate t-score

:param DataFrame df: pd.DataFrame with columns O11 and E11
:param float disc: discounting (or smoothing) parameter for O11 == 0
@@ -174,8 +171,7 @@ def log_likelihood(df, signed=True, **kwargs):


def log_likelihood(df, signed=True, **kwargs):
"""
Calculate log-likelihood
"""Calculate log-likelihood

:param DataFrame df: pd.DataFrame with columns O11..O22, E11..E22
:param bool signed: return negative values for rows with O11 < E11?
@@ -204,17 +200,15 @@ def log_likelihood(df, signed=True, **kwargs):


def simple_ll(df, signed=True, **kwargs):
"""
Calculate simple log-likelihood
"""Calculate simple log-likelihood

:param DataFrame df: pd.DataFrame with columns O11, E11
:param bool signed: return negative values for rows with O11 < E11?
:return: simple log-likelihood
:rtype: pd.Series
"""

- # NB: discounting will not have any effect:
- # term will be multiplied by original Oij = 0
+ # NB: discounting will not have any effect: term will be multiplied by original Oij = 0
O11_disc = df['O11'].where(df['O11'] != 0, 1)

log_term = df['O11'] * np.log(O11_disc / df['E11'])
@@ -260,8 +254,7 @@ def liddell(df, **kwargs):


def dice(df, **kwargs):
"""
Calculate Dice coefficient
"""Calculate Dice coefficient

:param DataFrame df: pd.DataFrame with columns O11, O12, O21
:return: dice
@@ -301,8 +294,7 @@ def log_ratio(df, disc=.5, discounting='Walter1975', **kwargs):
#######################

def hypergeometric_likelihood(df, **kwargs):
"""
Calculate hypergeometric-likelihood
"""Calculate hypergeometric-likelihood

:param DataFrame df: pd.DataFrame with columns O11, O12, O21, O22
:return: hypergeometric-likelihood
@@ -324,8 +316,7 @@ def hypergeometric_likelihood(df, **kwargs):


def binomial_likelihood(df, **kwargs):
"""
Calculate binomial-likelihood
"""Calculate binomial-likelihood

:param DataFrame df: pd.DataFrame with columns O11, O12, O21, O22, E11, N
:return: binomial-likelihood
@@ -350,13 +341,12 @@ def binomial_likelihood(df, **kwargs):
# CONSERVATIVE ESTIMATES #
##########################

- def conservative_log_ratio(df, disc=.5, alpha=.001, boundary='normal',
+ def conservative_log_ratio(df, disc=.5, alpha=.001, boundary='poisson',
correct='Bonferroni', vocab=None,
one_sided=False, **kwargs):
- """
- Calculate conservative log-ratio, i.e. the binary logarithm of the
+ """Calculate conservative log-ratio, i.e. the binary logarithm of the
lower bound of the confidence interval of relative risk at the
- (Bonferroni-corrected) confidence level.
+ (Bonferroni-corrected) significance level.

:param DataFrame df: pd.DataFrame with columns O11, O12, O21, O22
:param float disc: discounting (or smoothing) parameter for O11 == 0 and O21 == 0
@@ -368,7 +358,6 @@ def conservative_log_ratio(df, disc=.5, alpha=.001, boundary='normal',

:return: conservative log-ratio
:rtype: pd.Series
-
"""

# correction of alpha for two-sided tests
@@ -394,15 +383,12 @@ def conservative_log_ratio(df, disc=.5, alpha=.001, boundary='normal',

# Poisson approximation (Evert 2022)
if boundary == 'poisson':
-
# only calculate where_lower
lower = beta.ppf(alpha, df['O11'], df['O21'] + 1)
lower_boundary = np.log2((df['R2'] / df['R1']) * lower / (1 - lower)).clip(lower=0)
-
# only calculate where_upper
upper = beta.ppf(1 - alpha, df['O11'] + 1, df['O21'])
upper_boundary = np.log2((df['R2'] / df['R1']) * upper / (1 - upper)).clip(upper=0)
-
# combine, set to 0 where (df['O11'] == 0) & (df['O12'] == 0)
@@ -434,8 +420,7 @@ def conservative_log_ratio(df, disc=.5, alpha=.001, boundary='normal',
######################

def mutual_information(df, disc=.001, **kwargs):
"""
Calculate Mutual Information
"""Calculate Mutual Information

:param DataFrame df: pd.DataFrame with columns O11 and E11
:param float disc: discounting (or smoothing) parameter for O11 == 0
@@ -450,16 +435,14 @@ def mutual_information(df, disc=.001, **kwargs):


def local_mutual_information(df, **kwargs):
"""
Calculate Local Mutual Information
"""Calculate Local Mutual Information

:param DataFrame df: pd.DataFrame with columns O11 and E11
:return: mutual information
:return: local mutual information
:rtype: pd.Series
"""

- # NB: discounting will not have any effect:
- # term will be multiplied by original Oij = 0
+ # NB: discounting will not have any effect: term will be multiplied by original Oij = 0
O11_disc = df['O11'].where(df['O11'] != 0, 1)
am = df['O11'] * np.log10(O11_disc / df['E11'])

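
Note the changed default in `conservative_log_ratio`: the boundary of the underlying confidence interval is now obtained via the Poisson approximation (Evert 2022) rather than the normal approximation. A minimal sketch of what this means for callers, assuming a frequency DataFrame that already carries the marginals `R1` and `R2` (the counts below are made up for illustration):
```python3
>>> import pandas as pd
>>> import association_measures.measures as am
>>> df = pd.DataFrame({'O11': [42], 'O12': [99958], 'O21': [5], 'O22': [99995]})
>>> df['R1'] = df['O11'] + df['O12']  # row marginals used by the CLR formulas
>>> df['R2'] = df['O21'] + df['O22']
>>> am.conservative_log_ratio(df)                     # new default: boundary='poisson'
>>> am.conservative_log_ratio(df, boundary='normal')  # previous default
```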
2 changes: 1 addition & 1 deletion association_measures/version.py
@@ -2,5 +2,5 @@
Association measures are mathematical formulae that interpret cooccurrence frequency data.
"""

- VERSION = (0, 2, 7)
+ VERSION = (0, 3, 0)
__version__ = '.'.join(map(str, VERSION))
2 changes: 1 addition & 1 deletion performance.py
@@ -82,7 +82,7 @@
# conservative estimates
{
'name': 'conservative_log_ratio',
- 'code': 'am.conservative_log_ratio(df)'
+ 'code': 'am.conservative_log_ratio(df, boundary="normal")'
},
{
'name': 'conservative_log_ratio_poisson',
9 changes: 4 additions & 5 deletions setup.py
@@ -2,9 +2,7 @@

import os
import sys
- from setuptools import find_packages, Command
- from distutils.core import setup
- from distutils.extension import Extension
+ from setuptools import find_packages, setup, Command, Extension

# Package meta-data.
NAME = 'association-measures'
@@ -13,7 +11,7 @@
EMAIL = '[email protected]'
AUTHOR = 'Philipp Heinrich & Markus Opolka'

- REQUIRES_PYTHON = '>=3.6'
+ REQUIRES_PYTHON = '>=3.8'
REQUIRED = [
'wheel',
'pandas',
@@ -99,10 +97,11 @@ def run(self):
'License :: OSI Approved :: MIT License',
'Programming Language :: Python',
'Programming Language :: Python :: 3',
- 'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
+ 'Programming Language :: Python :: 3.11',
+ 'Programming Language :: Python :: 3.12',
'Programming Language :: Cython',
],
)