v0.3.0 #35

Merged
merged 12 commits on Jun 27, 2024
2 changes: 1 addition & 1 deletion .github/workflows/python-build.yml
@@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: [ 3.7, 3.8, 3.9, "3.10" ]
+ python-version: [ 3.8, 3.9, "3.10", "3.11", "3.12" ]

steps:
- uses: actions/checkout@v2
4 changes: 2 additions & 2 deletions .github/workflows/python-publish.yml
@@ -10,10 +10,10 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- - name: Set up Python 3.8
+ - name: Set up Python 3.10
uses: actions/setup-python@v1
with:
- python-version: '3.8'
+ python-version: '3.10'
- name: Install build dependencies
run: |
python -m pip install --upgrade pip
4 changes: 2 additions & 2 deletions CITATION.cff
@@ -7,6 +7,6 @@ authors:
- family-names: "Opolka"
given-names: "Markus"
title: "Pandas Association Measures"
- version: 0.2.6
- date-released: 2022-10-15
+ version: 0.3.0
+ date-released: 2024-06-27
url: "https://github.com/fau-klue/pandas-association-measures"
19 changes: 10 additions & 9 deletions Pipfile
@@ -4,14 +4,15 @@ url = "https://pypi.org/simple"
verify_ssl = true

[dev-packages]
- pytest = "==7.0.1"
- pylint = "==2.13.9"
- pytest-cov = "==3.0.0"
- twine = "==3.7.1"
- setuptools = "==59.6.0"
- cython = "==0.29.30"
+ pytest = "==7.4.0"
+ pylint = "==2.17.5"
+ pytest-cov = "==4.1.0"
+ twine = "==4.0.2"
+ setuptools = "==68.0.0"
+ cython = "==3.0.0"

[packages]
- wheel = ">=0.37.1"
- pandas = ">=1.1.5"
- scipy = ">=1.5.4"
+ wheel = ">=0.43.0,<0.44"
+ pandas = ">=2.0,<3.0"
+ numpy = ">=1.24,<2.0"
+ scipy = ">=1.10.0"
23 changes: 23 additions & 0 deletions README.md
@@ -228,6 +228,29 @@ particularly 1.059386
arrived 3.879126
```

## Topographic Maps

**New since version 0.3**: You can use `association_measures.grids.topography` to create a dataframe for visualising association measures as topographic maps. It yields a logarithmically scaled grid of reasonable sampling points for all combinations of `f1` and `f2`, with the values of all association measures at each point (the corpus sizes `N1` and `N2` determine the upper ends of the two axes).
```python3
>>> from association_measures.grids import topography
>>> topography(N1=10e6, N2=10e6)
O11 O12 O21 O22 R1 R2 C1 C2 N E11 ... dice log_ratio conservative_log_ratio mutual_information local_mutual_information ipm ipm_reference ipm_expected clr_normal log_ratio_hardie
index ...
0 0 10000000.0 0 10000000.0 10000000.0 10000000.0 0 20000000.0 20000000.0 0.0 ... 0.000000 0.000000 0.000000 inf NaN 0.0 0.0 0.00 0.000000 0.000000
1 0 10000000.0 1 9999999.0 10000000.0 10000000.0 1 19999999.0 20000000.0 0.5 ... 0.000000 -9.967226 0.000000 -2.698970 0.000000 0.0 0.1 0.05 0.000000 -9.965784
2 0 10000000.0 2 9999998.0 10000000.0 10000000.0 2 19999998.0 20000000.0 1.0 ... 0.000000 -10.966505 0.000000 -3.000000 0.000000 0.0 0.2 0.10 0.000000 -10.965784
3 0 10000000.0 3 9999997.0 10000000.0 10000000.0 3 19999997.0 20000000.0 1.5 ... 0.000000 -11.551228 0.000000 -3.176091 -0.000000 0.0 0.3 0.15 0.000000 -11.550747
4 0 10000000.0 4 9999996.0 10000000.0 10000000.0 4 19999996.0 20000000.0 2.0 ... 0.000000 -11.966145 0.000000 -3.301030 -0.000000 0.0 0.4 0.20 0.000000 -11.965784
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
39995 10000000 0.0 7205937 2794063.0 10000000.0 10000000.0 17205937 2794063.0 20000000.0 8602968.5 ... 0.735134 0.472742 0.468813 0.065352 653516.672773 1000000.0 720593.7 860296.85 0.471159 0.472742
39996 10000000 0.0 7821100 2178900.0 10000000.0 10000000.0 17821100 2178900.0 20000000.0 8910550.0 ... 0.718879 0.354557 0.350718 0.050095 500954.884892 1000000.0 782110.0 891055.00 0.353215 0.354557
39997 10000000 0.0 8488779 1511221.0 10000000.0 10000000.0 18488779 1511221.0 20000000.0 9244389.5 ... 0.702031 0.236371 0.232619 0.034122 341217.643897 1000000.0 848877.9 924438.95 0.235298 0.236371
39998 10000000 0.0 9213457 786543.0 10000000.0 10000000.0 19213457 786543.0 20000000.0 9606728.5 ... 0.684616 0.118186 0.114514 0.017424 174244.829132 1000000.0 921345.7 960672.85 0.117443 0.118186
39999 10000000 0.0 10000000 0.0 10000000.0 10000000.0 20000000 0.0 20000000.0 10000000.0 ... 0.666667 0.000000 0.000000 0.000000 0.000000 1000000.0 1000000.0 1000000.00 0.000000 0.000000

[40000 rows x 29 columns]
```
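
Such a dataframe lends itself to an actual topographic visualisation, e.g. a heatmap of one measure over the two frequency axes. The following is a minimal sketch (not part of the package), assuming matplotlib is installed; it relies on the columns `O11` and `O21` holding the sampled values of `f1` and `f2`, as the output above suggests:
```python3
>>> import matplotlib.pyplot as plt
>>> from association_measures.grids import topography
>>> df = topography(N1=10e6, N2=10e6)
>>> heat = df.pivot(index='O21', columns='O11', values='log_ratio')  # one row per f2, one column per f1
>>> mesh = plt.pcolormesh(heat.columns, heat.index, heat.values, shading='nearest')
>>> plt.xscale('symlog')  # axes log-scaled, like the grid itself
>>> plt.yscale('symlog')
>>> plt.xlabel('f1')
>>> plt.ylabel('f2')
>>> plt.colorbar(mesh, label='log-ratio')
>>> plt.show()
```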

# Development

The package is tested using pylint and pytest.
65 changes: 65 additions & 0 deletions association_measures/grids.py
@@ -0,0 +1,65 @@
from itertools import product

from numpy import exp, linspace, log
from pandas import DataFrame

from .measures import score


def expand_grid(dictionary):
    """Create a grid of all value combinations of all keys of the dictionary

    """

    return DataFrame([row for row in product(*dictionary.values())],
                     columns=dictionary.keys())


def log_seq(to=10e6, length=200, exact=50):
    """Create a logarithmically scaled sequence

    """

    if length <= exact:
        raise ValueError("length must be greater than exact")

    # keep the first values (0, 1, .., exact) as exact integers,
    # then sample log-uniformly between exact and to
    length = length - exact

    return list(range(exact + 1)) + [int(exp(s)) for s in linspace(log(exact), log(to), length)]


def log_grid(N1=10e6, N2=10e6, length1=200, length2=200, exact1=50, exact2=50):
    """Create a logarithmically-scaled grid

    """
    return expand_grid({
        'f1': log_seq(N1, length1, exact=exact1),
        'f2': log_seq(N2, length2, exact=exact2)
    }).drop_duplicates().reset_index(drop=True)


def topography(N1=10e6, N2=10e6, length=200, length1=None, length2=None, exact=50, exact1=None, exact2=None):
    """Create logarithmically scaled grid and calculate scores

    """

    exact1 = exact if exact1 is None else exact1
    exact2 = exact if exact2 is None else exact2
    length1 = length if length1 is None else length1
    length2 = length if length2 is None else length2

    # support
    g = log_grid(N1=N1, N2=N2, length1=length1, length2=length2, exact1=exact1, exact2=exact2)

    # add scores
    scores = score(g, N1=N1, N2=N2)
    # .. add alternative for CLR
    scores['clr_normal'] = score(
        g, N1=N1, N2=N2, boundary='normal', measures=['conservative_log_ratio']
    )['conservative_log_ratio']
    # .. add alternative for log-ratio
    scores['log_ratio_hardie'] = score(
        g, N1=N1, N2=N2, discounting='Hardie2014', measures=['log_ratio']
    )['log_ratio']

    return scores
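
For orientation, here is a hypothetical interactive session with the helpers above; the numbers follow from the definitions (and the grid size matches the 40000 rows of the README example):
```python3
>>> from association_measures.grids import expand_grid, log_seq, log_grid
>>> expand_grid({'f1': [0, 1], 'f2': [10, 20]})  # Cartesian product as DataFrame
   f1  f2
0   0  10
1   0  20
2   1  10
3   1  20
>>> len(log_seq(to=10e6, length=200, exact=50))  # 0..50 exactly, then 150 log-spaced points
201
>>> log_grid().shape  # 200 distinct values per axis after dropping duplicates
(40000, 2)
```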
53 changes: 18 additions & 35 deletions association_measures/measures.py
@@ -3,15 +3,15 @@

"""

+ from warnings import warn
+
import numpy as np
from pandas import concat, merge
- from scipy.stats import norm, beta
- from warnings import warn
+ from scipy.stats import beta, norm

from .binomial import choose
from .frequencies import expected_frequencies, observed_frequencies


CHOOSE = np.vectorize(choose)


@@ -20,7 +20,6 @@ def list_measures():

:return: dictionary of measures
:rtype: dict
-
"""

return {
@@ -144,8 +143,7 @@ def calculate_measures(df, measures=None, freq=False, per_million=True, digits=N
###############################

def z_score(df, **kwargs):
"""
Calculate z-score
"""Calculate z-score

:param DataFrame df: DataFrame with columns O11 and E11
:return: z-score
@@ -158,8 +156,7 @@ def t_score(df, disc=.001, **kwargs):


def t_score(df, disc=.001, **kwargs):
"""
Calculate t-score
"""Calculate t-score

:param DataFrame df: pd.DataFrame with columns O11 and E11
:param float disc: discounting (or smoothing) parameter for O11 == 0
@@ -174,8 +171,7 @@ def log_likelihood(df, signed=True, **kwargs):


def log_likelihood(df, signed=True, **kwargs):
"""
Calculate log-likelihood
"""Calculate log-likelihood

:param DataFrame df: pd.DataFrame with columns O11..O22, E11..E22
:param bool signed: return negative values for rows with O11 < E11?
@@ -204,17 +200,15 @@ def log_likelihood(df, signed=True, **kwargs):


def simple_ll(df, signed=True, **kwargs):
"""
Calculate simple log-likelihood
"""Calculate simple log-likelihood

:param DataFrame df: pd.DataFrame with columns O11, E11
:param bool signed: return negative values for rows with O11 < E11?
:return: simple log-likelihood
:rtype: pd.Series
"""

- # NB: discounting will not have any effect:
- # term will be multiplied by original Oij = 0
+ # NB: discounting will not have any effect: term will be multiplied by original Oij = 0
O11_disc = df['O11'].where(df['O11'] != 0, 1)

log_term = df['O11'] * np.log(O11_disc / df['E11'])
@@ -260,8 +254,7 @@ def liddell(df, **kwargs):


def dice(df, **kwargs):
"""
Calculate Dice coefficient
"""Calculate Dice coefficient

:param DataFrame df: pd.DataFrame with columns O11, O12, O21
:return: dice
@@ -301,8 +294,7 @@ def log_ratio(df, disc=.5, discounting='Walter1975', **kwargs):
#######################

def hypergeometric_likelihood(df, **kwargs):
"""
Calculate hypergeometric-likelihood
"""Calculate hypergeometric-likelihood

:param DataFrame df: pd.DataFrame with columns O11, O12, O21, O22
:return: hypergeometric-likelihood
@@ -324,8 +316,7 @@ def hypergeometric_likelihood(df, **kwargs):


def binomial_likelihood(df, **kwargs):
"""
Calculate binomial-likelihood
"""Calculate binomial-likelihood

:param DataFrame df: pd.DataFrame with columns O11, O12, O21, O22, E11, N
:return: binomial-likelihood
@@ -350,13 +341,12 @@ def binomial_likelihood(df, **kwargs):
# CONSERVATIVE ESTIMATES #
##########################

- def conservative_log_ratio(df, disc=.5, alpha=.001, boundary='normal',
+ def conservative_log_ratio(df, disc=.5, alpha=.001, boundary='poisson',
correct='Bonferroni', vocab=None,
one_sided=False, **kwargs):
- """
- Calculate conservative log-ratio, i.e. the binary logarithm of the
+ """Calculate conservative log-ratio, i.e. the binary logarithm of the
lower bound of the confidence interval of relative risk at the
- (Bonferroni-corrected) confidence level.
+ (Bonferroni-corrected) significance level.

:param DataFrame df: pd.DataFrame with columns O11, O12, O21, O22
:param float disc: discounting (or smoothing) parameter for O11 == 0 and O21 == 0
@@ -368,7 +358,6 @@ def conservative_log_ratio(df, disc=.5, alpha=.001, boundary='normal',

:return: conservative log-ratio
:rtype: pd.Series
-
"""

# correction of alpha for two-sided tests
@@ -394,15 +383,12 @@ def conservative_log_ratio(df, disc=.5, alpha=.001, boundary='normal',

# Poisson approximation (Evert 2022)
if boundary == 'poisson':
-
# only calculate where_lower
lower = beta.ppf(alpha, df['O11'], df['O21'] + 1)
lower_boundary = np.log2((df['R2'] / df['R1']) * lower / (1 - lower)).clip(lower=0)
-
# only calculate where_upper
upper = beta.ppf(1 - alpha, df['O11'] + 1, df['O21'])
upper_boundary = np.log2((df['R2'] / df['R1']) * upper / (1 - upper)).clip(upper=0)
-
# combine, set to 0 where (df['O11'] == 0) & (df['O12'] == 0)
@@ -434,8 +420,7 @@ def conservative_log_ratio(df, disc=.5, alpha=.001, boundary='normal',
######################

def mutual_information(df, disc=.001, **kwargs):
"""
Calculate Mutual Information
"""Calculate Mutual Information

:param DataFrame df: pd.DataFrame with columns O11 and E11
:param float disc: discounting (or smoothing) parameter for O11 == 0
@@ -450,16 +435,14 @@ def mutual_information(df, disc=.001, **kwargs):


def local_mutual_information(df, **kwargs):
"""
Calculate Local Mutual Information
"""Calculate Local Mutual Information

:param DataFrame df: pd.DataFrame with columns O11 and E11
:return: mutual information
:return: local mutual information
:rtype: pd.Series
"""

- # NB: discounting will not have any effect:
- # term will be multiplied by original Oij = 0
+ # NB: discounting will not have any effect: term will be multiplied by original Oij = 0
O11_disc = df['O11'].where(df['O11'] != 0, 1)
am = df['O11'] * np.log10(O11_disc / df['E11'])

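
Note the changed default in `conservative_log_ratio`: the boundary of the underlying confidence interval is now obtained via the Poisson approximation (Evert 2022) rather than the normal approximation. A minimal sketch of what this means for callers, assuming a frequency DataFrame that already carries the marginals `R1` and `R2` (the counts below are made up for illustration):
```python3
>>> import pandas as pd
>>> import association_measures.measures as am
>>> df = pd.DataFrame({'O11': [42], 'O12': [99958], 'O21': [5], 'O22': [99995]})
>>> df['R1'] = df['O11'] + df['O12']  # row marginals used by the CLR formulas
>>> df['R2'] = df['O21'] + df['O22']
>>> am.conservative_log_ratio(df)                     # new default: boundary='poisson'
>>> am.conservative_log_ratio(df, boundary='normal')  # previous default
```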
2 changes: 1 addition & 1 deletion association_measures/version.py
@@ -2,5 +2,5 @@
Association measures are mathematical formulae that interpret cooccurrence frequency data.
"""

- VERSION = (0, 2, 7)
+ VERSION = (0, 3, 0)
__version__ = '.'.join(map(str, VERSION))
2 changes: 1 addition & 1 deletion performance.py
@@ -82,7 +82,7 @@
# conservative estimates
{
'name': 'conservative_log_ratio',
- 'code': 'am.conservative_log_ratio(df)'
+ 'code': 'am.conservative_log_ratio(df, boundary="normal")'
},
{
'name': 'conservative_log_ratio_poisson',
9 changes: 4 additions & 5 deletions setup.py
@@ -2,9 +2,7 @@

import os
import sys
- from setuptools import find_packages, Command
- from distutils.core import setup
- from distutils.extension import Extension
+ from setuptools import find_packages, setup, Command, Extension

# Package meta-data.
NAME = 'association-measures'
@@ -13,7 +11,7 @@
EMAIL = '[email protected]'
AUTHOR = 'Philipp Heinrich & Markus Opolka'

- REQUIRES_PYTHON = '>=3.6'
+ REQUIRES_PYTHON = '>=3.8'
REQUIRED = [
'wheel',
'pandas',
@@ -99,10 +97,11 @@ def run(self):
'License :: OSI Approved :: MIT License',
'Programming Language :: Python',
'Programming Language :: Python :: 3',
- 'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
+ 'Programming Language :: Python :: 3.11',
+ 'Programming Language :: Python :: 3.12',
'Programming Language :: Cython',
],
)