Skip to content

Commit

Permalink
several regularization
Browse files Browse the repository at this point in the history
  • Loading branch information
jcblemai committed Apr 16, 2024
1 parent 84f18b8 commit 90bd280
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 149 deletions.
75 changes: 75 additions & 0 deletions examples/simple_usa_statelevel/inference_benchmark.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,81 @@
" ax.plot(df.drop(\"time\",axis=1).set_index(\"date\").groupby(\"date\").sum()[\"incidCase\"], lw=.5)\n",
"ax.grid()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import scipy"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[0;31mSignature:\u001b[0m \u001b[0mscipy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstats\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpoisson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpmf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mDocstring:\u001b[0m\n",
"Probability mass function at k of the given RV.\n",
"\n",
"Parameters\n",
"----------\n",
"k : array_like\n",
" Quantiles.\n",
"arg1, arg2, arg3,... : array_like\n",
" The shape parameter(s) for the distribution (see docstring of the\n",
" instance object for more information)\n",
"loc : array_like, optional\n",
" Location parameter (default=0).\n",
"\n",
"Returns\n",
"-------\n",
"pmf : array_like\n",
" Probability mass function evaluated at k\n",
"\u001b[0;31mFile:\u001b[0m ~/anaconda3/lib/python3.11/site-packages/scipy/stats/_distn_infrastructure.py\n",
"\u001b[0;31mType:\u001b[0m method"
]
}
],
"source": [
"scipy.stats.poisson.pmf?"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Generate some tests for logprob"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "module 'gempyor.statistics' has no attribute 'plot_poisson_pmf'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[2], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mgempyor\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m statistics\n\u001b[0;32m----> 3\u001b[0m statistics\u001b[38;5;241m.\u001b[39mplot_poisson_pmf(\u001b[38;5;241m10\u001b[39m, \u001b[38;5;241m0.5\u001b[39m, ax)\n",
"\u001b[0;31mAttributeError\u001b[0m: module 'gempyor.statistics' has no attribute 'plot_poisson_pmf'"
]
}
],
"source": [
"from gempyor import statistics\n",
"\n",
"statistics(co)"
]
}
],
"metadata": {
Expand Down
6 changes: 5 additions & 1 deletion examples/simple_usa_statelevel/simple_usa_statelevel.yml
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,11 @@ inference:
likelihood:
dist: pois
regularize:

- name: forecast
last_n: 10
mult: 2
- name: allsubpop
mult: 10
incidHosp:
name: incidHosp
sim_var: incidHosp
Expand Down
170 changes: 22 additions & 148 deletions flepimop/gempyor_pkg/src/gempyor/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
class Statistic:
"""
A statistic is a function that takes two time series and returns a scalar value.
It applies resampling, scaling, and regularization to the data before computing the statistic's log-loss.
It applies resample, scale, and regularization to the data before computing the statistic's log-loss.
Configuration:
- sim_var: the variable in the simulation data
- data_var: the variable in the ground truth data
Expand All @@ -22,35 +22,28 @@ def __init__(self, name, statistic_config: confuse.ConfigView):
self.data_var = statistic_config["data_var"].as_str()
self.name = name

self.regularizations = [] # A list to hold regularization functions and configs
if statistic_config["regularize"].exists():
for reg_config in statistic_config["regularize"]: # Iterate over the list
reg_name = reg_config["name"].get()
reg_func = getattr(self, f"_{reg_name}_regularize")
if reg_func is None:
raise ValueError(f"Unsupported regularization: {reg_name}")
self.regularizations.append((reg_func, reg_config))

self.resample = False
if statistic_config["resample"].exists():
self.resample = True
resample_config = statistic_config["resample"]
self.resample_freq = ""
if resample_config["freq"].exists():
self.resample_freq = resample_config["freq"].get()

self.resample_aggregator = ""
if resample_config["aggregator"].exists():
self.resample_aggregator = getattr(pd.Series, resample_config["aggregator"].get())

self.resample_skipna = False # TODO
if resample_config["aggregator"].exists() and resample_config["skipna"].exists():
self.resample_skipna = resample_config["skipna"].get()

self.regularize = False
if statistic_config["regularize"].exists():
raise ValueError("Regularization is not implemented")
regularization_config = statistic_config["regularization"].get()

self.regularization = None
if statistic_config["regularize"].exists():
# TODO: support several regularizations
self.regularization_config = statistic_config["regularize"]
self.regularization_name = self.regularization_config["name"].get()
self.regularization = getattr(self, f"_{self.regularization_name}_regularize")
if self.regularization is None:
raise ValueError(f"Unsupported regularization: {self.regularization_name}")

self.scale = False
if statistic_config["scale"].exists():
Expand All @@ -71,8 +64,6 @@ def _allsubpop_regularize(self, data):
""" add a regularization term that is the sum of all subpopulations
"""



def __str__(self) -> str:
return f"{self.name}: {self.dist} between {self.sim_var} (sim) and {self.data_var} (data)."

Expand All @@ -92,148 +83,31 @@ def apply_scale(self, data):
return data

def apply_transforms(self, data):
return self.apply_scale(self.apply_resampling(data))
data_scaled_resampled = self.apply_scale(self.apply_resample(data))
# Apply regularizations sequentially
for reg_func, reg_config in self.regularizations:
data_scaled_resampled = reg_func(data_scaled_resampled, **reg_config) # Pass config parameters
return data_scaled_resampled


def compute_logloss(self, model_data, gt_data):
model_data = self. apply_transforms(model_data[self.sim_var])
gt_data = self.apply_transforms(gt_data[self.data_var])

dist_map = {
dist_map = {
"pois": scipy.stats.poisson.pmf,
"norm": scipy.stats.norm.pdf,
"nbinom": scipy.stats.nbinom.pmf,
}
if self.dist not in dist_map:
raise ValueError(f"Invalid distribution specified: {self.dist}")
log_likelihood = dist_map[self.dist](round(gt_data), model_data)

# Apply regularization if defined
if self.regularization:
log_likelihood -= self.regularization(model_data)


dist_map = {
"pois": scipy.stats.poisson.pmf,
"norm": lambda x, loc, scale: scipy.stats.norm.pdf(x, loc=loc, scale=self.params.get("scale", scale)),
"norm": lambda x, loc, scale: scipy.stats.norm.pdf(x, loc=loc, scale=self.params.get("scale", scale)), # wrong:
"nbinom": lambda x, n, p: scipy.stats.nbinom.pmf(x, n=self.params.get("n"), p=model_data),
"rmse": lambda x, y: np.sqrt(np.mean((x-y)**2)),
"absolute_error": lambda x, y: np.mean(np.abs(x-y)),
}
if self.dist not in dist_map:
raise ValueError(f"Invalid distribution specified: {self.dist}")

# Use stored parameters in the distribution function call
log_likelihood = dist_map[self.dist](round(gt_data), model_data, **self.params)


likelihood = dist_map[self.dist](gt_data, model_data, **self.params)

if not model_data.shape == gt_data.shape:
raise ValueError(f"{self.name} Statistic error: data and groundtruth do not have the same shape")

if self.dist == "pois":
ll = np.log(scipy.stats.poisson.pmf(round(gt_data), model_data))
elif self.dist == "norm":
ll = np.log(scipy.stats.norm.pdf(gt_data, loc=model_data, scale=param[0]))
elif self.dist == "nbinom":
ll = np.log(scipy.stats.nbinom.pmf(gt_data, n=param[0], p=model_data))
else:
raise ValueError("Invalid distribution specified, got {self.dist}")
return ll




class Statistic:
def __init__(self, name, statistic_config: confuse.ConfigView):
self.sim_var = statistic_config["sim_var"].as_str()
self.data_var = statistic_config["data_var"].as_str()
self.name = name

self.resample = False
if statistic_config["resample"].exists():
self.resample = True
resample_config = statistic_config["resample"]
self.resample_freq = ""
if resample_config["freq"].exists():
self.resample_freq = resample_config["freq"].get()

self.resample_aggregator = ""
if resample_config["aggregator"].exists():
self.resample_aggregator = getattr(pd.Series, resample_config["aggregator"].get())

self.resample_skipna = False # TODO
if resample_config["aggregator"].exists() and resample_config["skipna"].exists():
self.resample_skipna = resample_config["skipna"].get()

self.regularize = False
if statistic_config["regularize"].exists():
raise ValueError("Regularization is not implemented")
regularization_config = statistic_config["regularization"].get()

self.scale = False
if statistic_config["scale"].exists():
self.scale_func = getattr(np, statistic_config["scale"].get())

self.dist = statistic_config["likelihood"]["dist"].get()
# TODO here get the parameter in a dictionary


def __str__(self) -> str:
return f"{self.name}: {self.dist} between {self.sim_var} (sim) and {self.data_var} (data)."

def __repr__(self) -> str:
return f"A Statistic(): {self.__str__()}"

def apply_resample(self, data):
if self.resample:
return data.resample(self.resample_freq).agg(self.resample_aggregator, skipna=self.resample_skipna)
else:
return data

def apply_scale(self, data):
if self.scale:
return self.scale_func(data)
else:
return data

def apply_transforms(self, data):
return self.apply_scale(self.apply_resampling(data))

def compute_logloss(self, model_data, gt_data, param, add_one = False):
model_data = self. apply_transforms(model_data[self.sim_var])
gt_data = self.apply_transforms(gt_data[self.data_var])

if not model_data.shape == gt_data.shape:
raise ValueError(f"{self.name} Statistic error: data and groundtruth do not have the same shape")

if add_one: # TO DO
# do not evaluate likelihood if both simulated and observed value are zero. Assign likelihood = 1
eval_ = np.logical_not(model_data+gt_data == 0)
# if simulated value is 0, but data is non zero, change sim to 1 and evaluate likelihood
model_data[np.logical_and(model_data == 0, eval_)] = 1
else:
eval_ = np.ones(len(gt_data), dtype=bool)

ll = np.zeros(len(gt_data))

if self.dist == "pois":
ll[eval_] = np.log(scipy.stats.poisson.pmf(np.round(gt_data[eval_]), model_data[eval_]))
elif self.dist == "norm":
ll[eval_] = np.log(scipy.stats.norm.pdf(gt_data[eval_], loc=model_data[eval_], scale=param[0]))
elif self.dist == "norm_cov":
ll[eval_] = np.log(scipy.stats.norm.pdf(gt_data[eval_], loc=model_data[eval_], scale=np.maximum(model_data[eval_],5)*param[0]))
elif self.dist == "nbinom": # param 0 is dispersion parameter k
ll[eval_] = np.log(scipy.stats.nbinom.pmf(gt_data[eval_], n=param[0], p=model_data[eval_]))
elif self.dist == "sqrtnorm":
ll[eval_] = np.log(scipy.stats.norm.pdf(np.sqrt(gt_data[eval_]), loc=np.sqrt(model_data[eval_]), scale=param[0]))
elif self.dist == "sqrtnorm_cov":
ll[eval_] = np.log(scipy.stats.norm.pdf(np.sqrt(gt_data[eval_]), loc=np.sqrt(model_data[eval_]), scale=np.sqrt(np.maximum(model_data[eval_],5))*param[0]))
elif self.dist == "sqrtnorm_scale_sim": # param 0 is cov, param 1 is multipler
ll[eval_] = np.log(scipy.stats.norm.pdf(np.sqrt(gt_data[eval_]), loc=np.sqrt([eval_]*param[1]), scale=np.sqrt(np.maximum(model_data[eval_],5)*param[1])*param[0]))
elif self.dist == "lognorm":
# lognormal where the mode (MLE) is the simulated value
gt_data[np.logical_and(gt_data == 0, eval_)] = 1 # if observed value is 0 but simulated is 1, change data to 1 and evaluate likelihood.
# can't have zeros for lognormal, would give loglikelihood of negative infinity
# rc[eval] <- dlnorm(obs[eval], meanlog = log(sim[eval]) + param[[1]]^2, sdlog = param[[1]], log = T) # mean is adjusted so that sim is the mode
ll[eval_] = np.log(scipy.stats.lognorm.pdf(gt_data[eval_], loc=model_data[eval_] + param[0]**2, scale=param[0]))
else:
raise ValueError("Invalid distribution specified, got {self.dist}")
return ll
return np.log(likelihood)

0 comments on commit 90bd280

Please sign in to comment.