Skip to content

Commit

Permalink
janky ML fitting and working implementation of density curve on top of density histogram, reasonably good performance with no strange spikes at least so far
Browse files Browse the repository at this point in the history
  • Loading branch information
mbi6245 committed Sep 11, 2024
1 parent 3b012e2 commit 8e1697d
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 7 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -129,4 +129,6 @@ dmypy.json
.pyre/

# Misc.
.DS_Store
.DS_Store
cleaned_individual_microdata_gbd2023_2024_03_18.csv
microdata_2024_05_03.csv
111 changes: 108 additions & 3 deletions plots.ipynb

Large diffs are not rendered by default.

20 changes: 17 additions & 3 deletions src/ensemble/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,8 +291,10 @@ def _objective_func(self, vec: np.ndarray) -> float:
case "KS":
return cp.norm(vec, "inf")

def _ensemble_func(
self, weights: List[float], ecdf: np.ndarray, cdfs: np.ndarray
def _ensemble_func_temp(
self,
weights: List[float],
pdfs, # , ecdf: np.ndarray, cdfs: np.ndarray
) -> float:
"""
Expand All @@ -310,7 +312,8 @@ def _ensemble_func(
float
_description_
"""
return self._objective_func(ecdf - cdfs @ weights)
# return self._objective_func(ecdf - cdfs @ weights)
return -1 * cp.sum(cp.log(pdfs @ weights))

def fit(self, data: npt.ArrayLike) -> EnsembleResult:
"""fits weighted sum of CDFs corresponding to distributions in
Expand All @@ -335,15 +338,18 @@ def fit(self, data: npt.ArrayLike) -> EnsembleResult:
sample_variance = np.var(data, ddof=1)
ecdf = stats.ecdf(data).cdf.probabilities
equantiles = stats.ecdf(data).cdf.quantiles
print(len(equantiles), ", num of empirical quantiles")

# fill matrix with cdf values over support of data
num_distributions = len(self.distributions)
cdfs = np.zeros((len(data), num_distributions))
pdfs = np.zeros((len(data), num_distributions))
for i in range(num_distributions):
curr_dist = distribution_dict[self.distributions[i]](
sample_mean, sample_variance
)
cdfs[:, i] = curr_dist.cdf(equantiles)
pdfs[:, i] = curr_dist.pdf(equantiles)

# CVXPY implementation
w = cp.Variable(num_distributions)
Expand All @@ -354,6 +360,14 @@ def fit(self, data: npt.ArrayLike) -> EnsembleResult:

fitted_weights = w.value

# ML implementation
# w = cp.Variable(num_distributions)
# objective = cp.Minimize(self._ensemble_func_temp(w, pdfs))
# constraints = [0 <= w, cp.sum(w) == 1]
# prob = cp.Problem(objective, constraints)
# prob.solve()
# fitted_weights = w.value

res = EnsembleResult(
weights=fitted_weights,
ensemble_distribution=EnsembleDistribution(
Expand Down

0 comments on commit 8e1697d

Please sign in to comment.