Skip to content

Commit

Permalink
several regularization
Browse files Browse the repository at this point in the history
  • Loading branch information
jcblemai committed Apr 16, 2024
1 parent 84f18b8 commit 90bd280
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 149 deletions.
75 changes: 75 additions & 0 deletions examples/simple_usa_statelevel/inference_benchmark.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,81 @@
" ax.plot(df.drop(\"time\",axis=1).set_index(\"date\").groupby(\"date\").sum()[\"incidCase\"], lw=.5)\n",
"ax.grid()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import scipy"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[0;31mSignature:\u001b[0m \u001b[0mscipy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstats\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpoisson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpmf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mDocstring:\u001b[0m\n",
"Probability mass function at k of the given RV.\n",
"\n",
"Parameters\n",
"----------\n",
"k : array_like\n",
" Quantiles.\n",
"arg1, arg2, arg3,... : array_like\n",
" The shape parameter(s) for the distribution (see docstring of the\n",
" instance object for more information)\n",
"loc : array_like, optional\n",
" Location parameter (default=0).\n",
"\n",
"Returns\n",
"-------\n",
"pmf : array_like\n",
" Probability mass function evaluated at k\n",
"\u001b[0;31mFile:\u001b[0m ~/anaconda3/lib/python3.11/site-packages/scipy/stats/_distn_infrastructure.py\n",
"\u001b[0;31mType:\u001b[0m method"
]
}
],
"source": [
"scipy.stats.poisson.pmf?"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Generate some tests for logprob"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "module 'gempyor.statistics' has no attribute 'plot_poisson_pmf'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[2], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mgempyor\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m statistics\n\u001b[0;32m----> 3\u001b[0m statistics\u001b[38;5;241m.\u001b[39mplot_poisson_pmf(\u001b[38;5;241m10\u001b[39m, \u001b[38;5;241m0.5\u001b[39m, ax)\n",
"\u001b[0;31mAttributeError\u001b[0m: module 'gempyor.statistics' has no attribute 'plot_poisson_pmf'"
]
}
],
"source": [
"from gempyor import statistics\n",
"\n",
"statistics(co)"
]
}
],
"metadata": {
Expand Down
6 changes: 5 additions & 1 deletion examples/simple_usa_statelevel/simple_usa_statelevel.yml
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,11 @@ inference:
likelihood:
dist: pois
regularize:

- name: forecast
last_n: 10
mult: 2
- name: allsubpop
mult: 10
incidHosp:
name: incidHosp
sim_var: incidHosp
Expand Down
170 changes: 22 additions & 148 deletions flepimop/gempyor_pkg/src/gempyor/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
class Statistic:
"""
A statistic is a function that takes two time series and returns a scalar value.
It applies resampling, scaling, and regularization to the data before computing the statistic's log-loss.
It applies resample, scale, and regularization to the data before computing the statistic's log-loss.
Configuration:
- sim_var: the variable in the simulation data
- data_var: the variable in the ground truth data
Expand All @@ -22,35 +22,28 @@ def __init__(self, name, statistic_config: confuse.ConfigView):
self.data_var = statistic_config["data_var"].as_str()
self.name = name

self.regularizations = [] # A list to hold regularization functions and configs
if statistic_config["regularize"].exists():
for reg_config in statistic_config["regularize"]: # Iterate over the list
reg_name = reg_config["name"].get()
reg_func = getattr(self, f"_{reg_name}_regularize")
if reg_func is None:
raise ValueError(f"Unsupported regularization: {reg_name}")
self.regularizations.append((reg_func, reg_config))

self.resample = False
if statistic_config["resample"].exists():
self.resample = True
resample_config = statistic_config["resample"]
self.resample_freq = ""
if resample_config["freq"].exists():
self.resample_freq = resample_config["freq"].get()

self.resample_aggregator = ""
if resample_config["aggregator"].exists():
self.resample_aggregator = getattr(pd.Series, resample_config["aggregator"].get())

self.resample_skipna = False # TODO
if resample_config["aggregator"].exists() and resample_config["skipna"].exists():
self.resample_skipna = resample_config["skipna"].get()

self.regularize = False
if statistic_config["regularize"].exists():
raise ValueError("Regularization is not implemented")
regularization_config = statistic_config["regularization"].get()

self.regularization = None
if statistic_config["regularize"].exists():
# TODO: support several regularizations
self.regularization_config = statistic_config["regularize"]
self.regularization_name = self.regularization_config["name"].get()
self.regularization = getattr(self, f"_{self.regularization_name}_regularize")
if self.regularization is None:
raise ValueError(f"Unsupported regularization: {self.regularization_name}")

self.scale = False
if statistic_config["scale"].exists():
Expand All @@ -71,8 +64,6 @@ def _allsubpop_regularize(self, data):
""" add a regularization term that is the sum of all subpopulations
"""



def __str__(self) -> str:
return f"{self.name}: {self.dist} between {self.sim_var} (sim) and {self.data_var} (data)."

Expand All @@ -92,148 +83,31 @@ def apply_scale(self, data):
return data

def apply_transforms(self, data):
return self.apply_scale(self.apply_resampling(data))
data_scaled_resampled = self.apply_scale(self.apply_resample(data))
# Apply regularizations sequentially
for reg_func, reg_config in self.regularizations:
data_scaled_resampled = reg_func(data_scaled_resampled, **reg_config) # Pass config parameters
return data_scaled_resampled


def compute_logloss(self, model_data, gt_data):
model_data = self. apply_transforms(model_data[self.sim_var])
gt_data = self.apply_transforms(gt_data[self.data_var])

dist_map = {
dist_map = {
"pois": scipy.stats.poisson.pmf,
"norm": scipy.stats.norm.pdf,
"nbinom": scipy.stats.nbinom.pmf,
}
if self.dist not in dist_map:
raise ValueError(f"Invalid distribution specified: {self.dist}")
log_likelihood = dist_map[self.dist](round(gt_data), model_data)

# Apply regularization if defined
if self.regularization:
log_likelihood -= self.regularization(model_data)


dist_map = {
"pois": scipy.stats.poisson.pmf,
"norm": lambda x, loc, scale: scipy.stats.norm.pdf(x, loc=loc, scale=self.params.get("scale", scale)),
"norm": lambda x, loc, scale: scipy.stats.norm.pdf(x, loc=loc, scale=self.params.get("scale", scale)), # wrong:
"nbinom": lambda x, n, p: scipy.stats.nbinom.pmf(x, n=self.params.get("n"), p=model_data),
"rmse": lambda x, y: np.sqrt(np.mean((x-y)**2)),
"absolute_error": lambda x, y: np.mean(np.abs(x-y)),
}
if self.dist not in dist_map:
raise ValueError(f"Invalid distribution specified: {self.dist}")

# Use stored parameters in the distribution function call
log_likelihood = dist_map[self.dist](round(gt_data), model_data, **self.params)


likelihood = dist_map[self.dist](gt_data, model_data, **self.params)

if not model_data.shape == gt_data.shape:
raise ValueError(f"{self.name} Statistic error: data and groundtruth do not have the same shape")

if self.dist == "pois":
ll = np.log(scipy.stats.poisson.pmf(round(gt_data), model_data))
elif self.dist == "norm":
ll = np.log(scipy.stats.norm.pdf(gt_data, loc=model_data, scale=param[0]))
elif self.dist == "nbinom":
ll = np.log(scipy.stats.nbinom.pmf(gt_data, n=param[0], p=model_data))
else:
raise ValueError("Invalid distribution specified, got {self.dist}")
return ll




class Statistic:
def __init__(self, name, statistic_config: confuse.ConfigView):
self.sim_var = statistic_config["sim_var"].as_str()
self.data_var = statistic_config["data_var"].as_str()
self.name = name

self.resample = False
if statistic_config["resample"].exists():
self.resample = True
resample_config = statistic_config["resample"]
self.resample_freq = ""
if resample_config["freq"].exists():
self.resample_freq = resample_config["freq"].get()

self.resample_aggregator = ""
if resample_config["aggregator"].exists():
self.resample_aggregator = getattr(pd.Series, resample_config["aggregator"].get())

self.resample_skipna = False # TODO
if resample_config["aggregator"].exists() and resample_config["skipna"].exists():
self.resample_skipna = resample_config["skipna"].get()

self.regularize = False
if statistic_config["regularize"].exists():
raise ValueError("Regularization is not implemented")
regularization_config = statistic_config["regularization"].get()

self.scale = False
if statistic_config["scale"].exists():
self.scale_func = getattr(np, statistic_config["scale"].get())

self.dist = statistic_config["likelihood"]["dist"].get()
# TODO here get the parameter in a dictionary


def __str__(self) -> str:
return f"{self.name}: {self.dist} between {self.sim_var} (sim) and {self.data_var} (data)."

def __repr__(self) -> str:
return f"A Statistic(): {self.__str__()}"

def apply_resample(self, data):
if self.resample:
return data.resample(self.resample_freq).agg(self.resample_aggregator, skipna=self.resample_skipna)
else:
return data

def apply_scale(self, data):
if self.scale:
return self.scale_func(data)
else:
return data

def apply_transforms(self, data):
return self.apply_scale(self.apply_resampling(data))

def compute_logloss(self, model_data, gt_data, param, add_one = False):
model_data = self. apply_transforms(model_data[self.sim_var])
gt_data = self.apply_transforms(gt_data[self.data_var])

if not model_data.shape == gt_data.shape:
raise ValueError(f"{self.name} Statistic error: data and groundtruth do not have the same shape")

if add_one: # TO DO
# do not evaluate likelihood if both simulated and observed value are zero. Assign likelihood = 1
eval_ = np.logical_not(model_data+gt_data == 0)
# if simulated value is 0, but data is non zero, change sim to 1 and evaluate likelihood
model_data[np.logical_and(model_data == 0, eval_)] = 1
else:
eval_ = np.ones(len(gt_data), dtype=bool)

ll = np.zeros(len(gt_data))

if self.dist == "pois":
ll[eval_] = np.log(scipy.stats.poisson.pmf(np.round(gt_data[eval_]), model_data[eval_]))
elif self.dist == "norm":
ll[eval_] = np.log(scipy.stats.norm.pdf(gt_data[eval_], loc=model_data[eval_], scale=param[0]))
elif self.dist == "norm_cov":
ll[eval_] = np.log(scipy.stats.norm.pdf(gt_data[eval_], loc=model_data[eval_], scale=np.maximum(model_data[eval_],5)*param[0]))
elif self.dist == "nbinom": # param 0 is dispersion parameter k
ll[eval_] = np.log(scipy.stats.nbinom.pmf(gt_data[eval_], n=param[0], p=model_data[eval_]))
elif self.dist == "sqrtnorm":
ll[eval_] = np.log(scipy.stats.norm.pdf(np.sqrt(gt_data[eval_]), loc=np.sqrt(model_data[eval_]), scale=param[0]))
elif self.dist == "sqrtnorm_cov":
ll[eval_] = np.log(scipy.stats.norm.pdf(np.sqrt(gt_data[eval_]), loc=np.sqrt(model_data[eval_]), scale=np.sqrt(np.maximum(model_data[eval_],5))*param[0]))
elif self.dist == "sqrtnorm_scale_sim": # param 0 is cov, param 1 is multipler
ll[eval_] = np.log(scipy.stats.norm.pdf(np.sqrt(gt_data[eval_]), loc=np.sqrt([eval_]*param[1]), scale=np.sqrt(np.maximum(model_data[eval_],5)*param[1])*param[0]))
elif self.dist == "lognorm":
# lognormal where the mode (MLE) is the simulated value
gt_data[np.logical_and(gt_data == 0, eval_)] = 1 # if observed value is 0 but simulated is 1, change data to 1 and evaluate likelihood.
# can't have zeros for lognormal, would give loglikelihood of negative infinity
# rc[eval] <- dlnorm(obs[eval], meanlog = log(sim[eval]) + param[[1]]^2, sdlog = param[[1]], log = T) # mean is adjusted so that sim is the mode
ll[eval_] = np.log(scipy.stats.lognorm.pdf(gt_data[eval_], loc=model_data[eval_] + param[0]**2, scale=param[0]))
else:
raise ValueError("Invalid distribution specified, got {self.dist}")
return ll
return np.log(likelihood)

0 comments on commit 90bd280

Please sign in to comment.