diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..68bc17f --- /dev/null +++ b/.gitignore @@ -0,0 +1,160 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..f4687fd --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.11.2 \ No newline at end of file diff --git a/MHP.py b/MHP.py index db9037e..8db72e5 100644 --- a/MHP.py +++ b/MHP.py @@ -14,7 +14,6 @@ import numpy as np -import time as T from sklearn.metrics.pairwise import pairwise_distances from sklearn.utils.extmath import cartesian @@ -22,7 +21,7 @@ import matplotlib.pyplot as plt class MHP: - def __init__(self, alpha=[[0.5]], mu=[0.1], omega=1.0): + def __init__(self, alpha=[[0.5]], mu=[0.1], omega=1.0, seed=None): '''params should be of form: alpha: numpy.array((u,u)), mu: numpy.array((,u)), omega: float''' @@ -30,6 +29,7 @@ def __init__(self, alpha=[[0.5]], mu=[0.1], omega=1.0): self.alpha, self.mu, self.omega = np.array(alpha), np.array(mu), omega self.dim = self.mu.shape[0] self.check_stability() + self.rng = np.random.default_rng(seed=seed) def check_stability(self): ''' check stability of process (max alpha eigenvalue < 1)''' @@ -46,16 +46,16 @@ def generate_seq(self, horizon): self.data = [] # clear history Istar = np.sum(self.mu) - s = np.random.exponential(scale=1./Istar) + s = self.rng.exponential(scale=1./Istar) # attribute (weighted random sample, since sum(mu)==Istar) - n0 = np.random.choice(np.arange(self.dim), - 1, + n0 = self.rng.choice(np.arange(self.dim), 1, p=(self.mu / Istar)) - self.data.append([s, n0]) + self.data.append([s, n0[0]]) # value of \lambda(t_k) where k is most recent event # starts with just the base rate + rates = 0 lastrates = self.mu.copy() decIstar = False @@ -72,7 +72,7 @@ def generate_seq(self, horizon): self.omega * np.sum(self.alpha[:,uj]) # generate new event - s += np.random.exponential(scale=1./Istar) + s += self.rng.exponential(scale=1./Istar) # calc rates at time s (use trick to take advantage of rates at last event) rates = self.mu + np.exp(-self.omega * (s - tj)) * \ @@ -82,7 +82,7 @@ def generate_seq(self, horizon): # handle attribution and thinning in one step as weighted random sample diff = Istar - np.sum(rates) try: - n0 = np.random.choice(np.arange(self.dim+1), 1, + n0 = self.rng.choice(np.arange(self.dim+1), 1, p=(np.append(rates, diff) / Istar)) except ValueError: # by construction this should not happen @@ -91,7 +91,7 @@ def generate_seq(self, horizon): return self.data if n0 < self.dim: - self.data.append([s, n0]) + self.data.append([s, n0[0]]) # update lastrates lastrates = rates.copy() else: @@ -157,7 +157,6 @@ def sum_pij(a,b): k = 0 old_LL = -10000 - START = T.time() while k < maxiter: Auu = Ahat[rowidx, colidx] ag = np.multiply(Auu, kern) @@ -233,7 +232,7 @@ def plot_rates(self, horizon=-1): f, axarr = plt.subplots(self.dim*2,1, sharex='col', gridspec_kw = {'height_ratios':sum([[3,1] for i in range(self.dim)],[])}, figsize=(8,self.dim*2)) - xs = np.linspace(0, horizon, (horizon/100.)*1000) + xs = np.linspace(0, horizon, int((horizon/100.)*1000)) for i in range(self.dim): row = i * 2 @@ -269,9 +268,7 @@ def plot_events(self, horizon=-1, showDays=True, labeled=True): plt.plot([j,j], [-self.dim, 1], 'k:', alpha=0.15) if labeled: - ax.set_yticklabels('') - ax.set_yticks(-np.arange(0, self.dim), minor=True) - ax.set_yticklabels([r'$e_{%d}$' % i for i in range(self.dim)], minor=True) + ax.set_yticks(-np.arange(0, self.dim), labels=[r'$e_{%d}$' % i for i in range(self.dim)]) else: ax.yaxis.set_visible(False) diff --git a/README.md b/README.md index 6be1cc8..e56e346 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ The `MHP.py` file contains a single class, `MHP`. The core methods are: 2. `EM`: uses a Bayesian EM algorithm to learn parameters, given a sequence and estimates of `alpha`, `mu`, and `omega`. It treats `omega` as a hyperparameter and does not optimize over this parameter. For more details see [the report](https://stmorse.github.io/docs/6-867-final-writeup.pdf), [my thesis](https://stmorse.github.io/docs/orc-thesis.pdf), or [this blog post](https://stmorse.github.io/blog). -There are also two visualization methods, `plot_events` (which plots only the time series of events), and `plot_rates` (which plots the time series with the corresponding conditional intensities, currently only implemented for `dim=3`). +There are also two visualization methods, `plot_events` (which plots only the time series of events), and `plot_rates` (which plots the time series with the corresponding conditional intensities). # Usage @@ -58,21 +58,21 @@ We can learn the parameters of an already generated sequence by simply calling ` mhat = np.random.uniform(0,1, size=3) ahat = np.random.uniform(0,1, size=(3,3)) w = 3. - +np.set_printoptions(precision=3,suppress=True) P.EM(ahat, mhat, w) ``` which will give output something like: ``` -After ITER 0 (old: -10000.000 new: -0.129) - terms 5.1766, 17.4638, 8.5362 -After ITER 10 (old: -0.129 new: -1.349) - terms -20.8908, 11.8172, 14.1828 -Reached stopping criterion. (Old: -1.349 New: -1.350) - -(array([[ 0.0223, 0.1475, 0. ], - [ 0.5954, 0. , 0. ], - [ 0. , 0.625 , 0. ]]), - array([ 0.2038, 0.0046, 0. ])) +After ITER 0 (old: -10000.000 new: -0.044) + terms 9.4483, 15.4094, 10.5906 +After ITER 10 (old: -0.044 new: -1.509) + terms -25.8799, 12.6529, 13.3471 +Reached stopping criterion. (Old: -1.509 New: -1.509) + +(array([[0.06 , 0. , 0.071], + [0.503, 0. , 0.076], + [0. , 0.735, 0. ]]), + array([0.211, 0. , 0.015])) ``` This is already a reasonable approximation to the actual ground truth values from above, without including regularization, cross-validation, a larger sample, etc. diff --git a/requirements_3_11.txt b/requirements_3_11.txt new file mode 100644 index 0000000..7646559 --- /dev/null +++ b/requirements_3_11.txt @@ -0,0 +1,7 @@ +numpy==1.26.2 +ipykernel==6.28.0 +matplotlib==3.8.2 +scipy==1.11.4 +scikit-learn==1.3.2 +pytest==7.4.4 +ruff==0.1.9 diff --git a/test_MHP.py b/test_MHP.py new file mode 100644 index 0000000..1a2eb11 --- /dev/null +++ b/test_MHP.py @@ -0,0 +1,54 @@ +import numpy as np + +from MHP import MHP + +expected_seq = np.array([[6.799319039689095, 0], [6.832330144004188, 0]]) +expected_multiple_seq = np.array( + [ + [3.39965952, 0.0], + [3.40566154, 0.0], + [12.48864946, 0.0], + [13.17625391, 0.0], + [13.46781712, 1.0], + [14.65373507, 2.0], + [14.77459075, 0.0], + ] +) + +m = np.array([0.2, 0.0, 0.0]) +a = np.array([[0.1, 0.0, 0.0], [0.9, 0.0, 0.0], [0.0, 0.9, 0.0]]) +w = 3.1 + + +def test_generate_seq(): + P = MHP(seed=0) + P.generate_seq(10) + np.testing.assert_array_equal(P.data, expected_seq) + + +def test_generate_seq_multiple(): + P = MHP(mu=m, alpha=a, omega=w, seed=0) + P.generate_seq(15) + np.testing.assert_allclose(P.data, expected_multiple_seq) + + +def test_EM(): + P = MHP(seed=0) + P.data = expected_seq + alpha = np.array([[0.5]]) + mu = np.array([0.1]) + omega = 1.0 + expected = (np.array([[0.28310623]]), np.array([0.20985337])) + for i, value in enumerate(P.EM(alpha, mu, omega)): + np.testing.assert_allclose(value, expected[i]) + + +def test_EM_multiple(): + P = MHP(seed=0) + P.data = expected_multiple_seq + expected = ( + np.array([[0.1676253, 0.0, 0.0], [0.2, 0.0, 0.0], [0.0, 1.0, 0.0]]), + np.array([0.28169129, 0.0, 0.0]), + ) + for i, value in enumerate(P.EM(a, m, w)): + np.testing.assert_allclose(value, expected[i])