ANOVA (Analysis of Variance) is a statistical method used to compare the means of three or more groups to determine if there are significant differences between them. It assumes that the data are normally distributed, the groups have similar variances (homogeneity of variances), and the samples are independent. ANOVA is appropriate when these assumptions are met and you have one or more categorical independent variables and a continuous dependent variable. It should not be used if the assumptions are violated, such as when the data are not normally distributed or the variances are unequal, as this can lead to inaccurate results.
import numpy as np
import pandas as pd
from scipy.stats import f_oneway
from statsmodels.stats.multitest import multipletests
def anova_analysis(metadata_df, dependent_variable, columns_to_compare):
"""
Perform ANOVA analysis on specified columns grouped by a dependent variable.
Parameters:
metadata_df (pd.DataFrame): DataFrame containing the metadata.
dependent_variable (str): The column name of the dependent variable.
columns_to_compare (list): List of column names to compare using ANOVA.
Returns:
pd.DataFrame: DataFrame containing ANOVA statistics and corrected p-values.
"""
category_list = list(metadata_df[dependent_variable].unique())
group_indices = {category: list(metadata_df[metadata_df[dependent_variable] == category].index) for category in category_list}
df_in = metadata_df.drop(columns=[dependent_variable])
namee = []
pva = []
stata = []
group_medians = {category: [] for category in category_list}
group_se = {category: [] for category in category_list}
for col in columns_to_compare:
groups = [list(df_in[col].reindex(group_indices[category])) for category in category_list]
[aov_stat, aov_pval] = f_oneway(*groups)
for category in category_list:
group_medians[category].append(np.median([float(i) for i in groups[category_list.index(category)]]))
group_se[category].append(np.std([float(i) for i in groups[category_list.index(category)]], ddof=1) / np.sqrt(len(groups[category_list.index(category)])))
namee.append(col)
pva.append(aov_pval)
stata.append(aov_stat)
df = pd.DataFrame(list(zip(stata, pva)), index=namee, columns=['statistic', 'pvalue'])
# Correct p-values
df['pvalue_corr'] = multipletests(df['pvalue'], alpha=0.05, method='fdr_bh', is_sorted=False, returnsorted=False)[1]
for category in category_list:
df[f'median: {category}'] = group_medians[category]
df[f'se_{category}'] = group_se[category]
return df
# Example usage
metadata_df = meta_dmm.copy()
dependent_variable = 'dmm_cluster'
# columns_to_compare = list(metadata_df.columns.difference([dependent_variable]))
# columns_to_compare = [ 'snap_or_wic', 'foodbank','alcohol_yn', 'tobacco_yn',
# 'BMI>30', 'mood', 't2d', 'hypten', 'dep', 'colon_results','Diet Score','weight_init', 'BMI (calc)']
columns_to_compare = [ 'gender', 'age','snap_or_wic', 'foodbank',
'BMI>30', 'Diet Score','weight_init', 'BMI (calc)','tobacco_yn','species_richness']
anova_df = anova_analysis(metadata_df, dependent_variable, columns_to_compare)
anova_df