Adding new class for wrapping typical ADVI workflow #6

Open · wants to merge 8 commits into master
2 changes: 1 addition & 1 deletion notebooks/contador.ipynb

Large diffs are not rendered by default.

5,956 changes: 4,954 additions & 1,002 deletions notebooks/wu2004.ipynb

Large diffs are not rendered by default.

75 changes: 74 additions & 1 deletion pyproject.toml
@@ -11,4 +11,77 @@ profile = "black"
multi_line_output = 3
line_length = 100
include_trailing_comma = true
reverse_relative = true

[tool.ruff]
line-length = 100
select = [
# mccabe
"C90",
# flake8-comprehensions
"C4",
# pycodestyle
"E", # Error
"W", # Warning
# PyFlakes
"F",
# flake8-bugbear
"B",
# flake8-simplify
"SIM",
# isort
"I",
# type checking
"TCH",
# flake8-print
"T2",
# pep8-naming
"N",
# pydocstyle
"D",
# flake8-bandit
"S",

]
show-fixes = true
exclude = [
".tox",
".git",
"__pycache__",
"build",
"dist",
"tests/fixtures/*",
"*.pyc",
"*.egg-info",
".cache",
".eggs",
"data",
]

[tool.ruff.lint]
# 1. Enable flake8-bugbear (`B`) rules, in addition to the defaults.
# select = ["E4", "E7", "E9", "F", "B"]

# 2. Avoid enforcing line-length violations (`E501`)
ignore = ["E501","D203", "D213"]

# 3. Avoid trying to fix flake8-bugbear (`B`) violations.
unfixable = ["B"]

# 4. Ignore `E402` (import violations) in all `__init__.py` files, and in select subdirectories.
[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["E402"]
"**/{tests,docs,tools}/*" = ["E402"]

[tool.ruff.format]
# Like Black, use double quotes for non-triple-quoted strings.
quote-style = "double"
# Like Black, indent with spaces, rather than tabs.
indent-style = "space"
# Like Black, respect magic trailing commas.
skip-magic-trailing-comma = false
# Like Black, automatically detect the appropriate line ending.
line-ending = "auto"

[tool.ruff.mccabe]
max-complexity = 20
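As a rough illustration, the rule groups selected above would flag code such as the following hypothetical function: flake8-bugbear (B) warns about the mutable default argument, flake8-bandit (S) flags the bare assert, and flake8-print (T2) catches the stray print call.

def add_sample(value, samples=[]):  # B006: mutable default argument
    assert value >= 0  # S101: use of assert detected
    print("adding", value)  # T201: print found
    samples.append(value)
    return samples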
3 changes: 1 addition & 2 deletions src/emll/__init__.py
@@ -1,4 +1,3 @@
from emll.linlog_model import *
from emll.util import create_elasticity_matrix, create_Ey_matrix

import emll.test_models
from . import pymc_utils
302 changes: 302 additions & 0 deletions src/emll/bmca.py
@@ -0,0 +1,302 @@
"""Code to actually implement the BMCA workflow in emll."""

import gzip
import os

import cloudpickle
import cobra
import numpy as np
import pandas as pd
import pymc as pm
import pytensor.tensor as pt

import emll
from . import util

os.environ["MKL_THREADING_LAYER"] = "GNU"


class BMCA:
"""Class to build out BMCA experiment."""

def __init__(
self,
model_path,
reference_state,
metabolite_concentrations_path=None,
enzyme_measurements_path=None,
boundary_fluxes_path=None,
v_star_path=None,
):
"""Initialize the BMCA (Biochemical Model Calibration and Analysis) instance. Includes preprocessing to prepare data for pymc inference.

Parameters
----------
model_path : str
The path to the YAML file containing the biochemical model.

reference_state : str
The reference state identifier for the experiment.

metabolite_concentrations_path : str, optional
The path to the CSV file containing metabolite concentration data.

enzyme_measurements_path : str, optional
The path to the CSV file containing enzyme measurement data.

boundary_fluxes_path : str, optional
The path to the CSV file containing boundary flux data.

v_star_path : str, optional
The path to the CSV file containing v_star data.

Attributes
----------
model : cobra.Model
The biochemical model loaded from the provided YAML file.

x : pandas.DataFrame
DataFrame containing metabolite concentrations data loaded from the CSV file.

e : pandas.DataFrame
DataFrame containing enzyme measurements data loaded from the CSV file.

v : pandas.DataFrame
DataFrame containing boundary fluxes data loaded from the CSV file.

v_star : pandas.DataFrame
DataFrame containing v_star data loaded from the CSV file.

ref_state : str
The reference state identifier used in the experiment.

"""
# TODO: Reduce reading in of data (below) into a single method using a config file
# # Gather input data
# self.gather_inputs(config_file)

# Get required user provided model and reference state
self.model = util.get_model_by_type(model_path)
self.ref_state = reference_state

# Get optional user provided data
self.x = util.get_dataframe(metabolite_concentrations_path)
self.e = util.get_dataframe(enzyme_measurements_path)
self.v = util.get_dataframe(boundary_fluxes_path)
self.v_star = util.get_dataframe(v_star_path, header=None, index_col=0)
if self.v_star is not None:
self.v_star = self.v_star[1]

# TODO: move need for flux calculation outside __init__
# # Determine if fluxes need to be calculated
# need_fluxes = False
# if not(self.v is None):
# flux_df_rows = self.v.index
# else:
# need_fluxes = True
#
#
#
# Are fluxes provided:
# Yes: which fluxes are provided?
# All fluxes: do not calculate fluxes
# Some fluxes: calculate fluxes for missing fluxes
# Use provided reference state (required)
# No: calculate fluxes (use default reference state)
#
# If need to calculate fluxes
# Are metabolomics data provided?
# Yes: calculate fluxes using metabolomics data as constraints
# Are transcriptomics data provided?
# Yes:



def preprocess_data(self):
"""Read in cobra model as components."""
# Establish compartments for reactions and metabolites
self.r_compartments = [
r.compartments if "e" not in r.compartments else "t" for r in self.model.reactions
]
self.r_compartments[self.model.reactions.index("SUCCt2r")] = "c"
self.r_compartments[self.model.reactions.index("ACt2r")] = "c"
for rxn in self.model.exchanges:
self.r_compartments[self.model.reactions.index(rxn)] = "t"
self.m_compartments = [m.compartment for m in self.model.metabolites]

# Reindex arrays to have the same column ordering
to_consider = self.v.columns
self.v = self.v.loc[:, to_consider]
self.x = self.x.loc[:, to_consider]
self.e = self.e.loc[:, to_consider]

self.n_exp = len(to_consider) - 1

# Drop reference state
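# Normalize measurements relative to the reference state: xn holds natural-log
# fold changes of (assumed log2) metabolite values, en holds enzyme fold changes
# recovered from log2 measurements, and vn holds fluxes scaled by the "flux" column.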
self.vn = (
self.v.drop(self.ref_state)
.divide(self.v.drop(self.ref_state)["flux"], axis=0)
.drop("flux", axis=1)
.T
)
self.xn = (self.x.subtract(self.x[self.ref_state], 0) * np.log(2)).T
self.en = (2 ** self.e.subtract(self.e[self.ref_state], 0)).T

# Get indexes for measured values
self.x_inds = np.array([self.model.metabolites.index(met) for met in self.xn.columns])
self.e_inds = np.array([self.model.reactions.index(rxn) for rxn in self.en.columns])
self.v_inds = np.array([self.model.reactions.index(rxn) for rxn in self.vn.columns])
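# Reactions without enzyme measurements get Laplace priors if they are purely
# intracellular; boundary/transport reactions are pinned to zero. e_indexer
# later restores the model's reaction ordering after concatenation.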

self.e_laplace_inds = []
self.e_zero_inds = []

for i, rxn in enumerate(self.model.reactions):
if rxn.id not in self.en.columns:
if ("e" not in rxn.compartments) and (len(rxn.compartments) == 1):
self.e_laplace_inds += [i]
else:
self.e_zero_inds += [i]

self.e_laplace_inds = np.array(self.e_laplace_inds)
self.e_zero_inds = np.array(self.e_zero_inds)
self.e_indexer = np.hstack([self.e_inds, self.e_laplace_inds, self.e_zero_inds]).argsort()

self.N = cobra.util.create_stoichiometric_matrix(self.model)
self.Ex = emll.util.create_elasticity_matrix(self.model)
self.Ey = emll.util.create_Ey_matrix(self.model)

# Randomly scale the initial elasticity guesses so entries start at distinct values.
self.Ex *= 0.1 + 0.8 * np.random.rand(*self.Ex.shape)

self.ll = emll.LinLogLeastNorm(self.N, self.Ex, self.Ey, self.v_star.values, driver="gelsy")

def build_pymc_model(self):
"""Build the PyMC probabilistic model."""
with pm.Model() as pymc_model:
# Priors on elasticity values
self.Ex_t = pm.Deterministic(
"Ex",
util.initialize_elasticity(  # assumes initialize_elasticity lives in emll.util
self.ll.N,
b=0.01,
sigma=1,
alpha=None,
m_compartments=self.m_compartments,
r_compartments=self.r_compartments,
),
)

self.Ey_t = pt.as_tensor_variable(self.Ey)

e_measured = pm.Normal(
"log_e_measured",
mu=np.log(self.en),
sigma=0.2,
shape=(self.n_exp, len(self.e_inds)),
)
e_unmeasured = pm.Laplace(
"log_e_unmeasured", mu=0, b=0.1, shape=(self.n_exp, len(self.e_laplace_inds))
)
log_en_t = pt.concatenate(
[e_measured, e_unmeasured, pt.zeros((self.n_exp, len(self.e_zero_inds)))], axis=1
)[:, self.e_indexer]

pm.Deterministic("log_en_t", log_en_t)

# Priors on external concentrations
yn_t = pm.Normal(
"yn_t",
mu=0,
sigma=10,
shape=(self.n_exp, self.ll.ny),
initval=0.1 * np.random.randn(self.n_exp, self.ll.ny),
)

chi_ss, vn_ss = self.ll.steady_state_pytensor(
self.Ex_t, self.Ey_t, pt.exp(log_en_t), yn_t
)
pm.Deterministic("chi_ss", chi_ss)
pm.Deterministic("vn_ss", vn_ss)

# Clip steady-state predictions to a bounded range so extreme solutions do not destabilize the likelihood.
log_vn_ss = pt.log(pt.clip(vn_ss[:, self.v_inds], 1e-8, 1e8))
log_vn_ss = pt.clip(log_vn_ss, -1.5, 1.5)

chi_clip = pt.clip(chi_ss[:, self.x_inds], -1.5, 1.5)

chi_obs = pm.Normal(
"chi_obs", mu=chi_clip, sigma=0.2, observed=self.xn.clip(lower=-1.5, upper=1.5)
)
log_vn_obs = pm.Normal(
"vn_obs",
mu=log_vn_ss,
sigma=0.1,
observed=np.log(self.vn).clip(lower=-1.5, upper=1.5),
)

self.pymc_model = pymc_model



def sample_prior_predictive(self, num_samples=500):
"""Sample from the prior predictive distribution."""
with self.pymc_model:
prior_predictive = pm.sample_prior_predictive(num_samples)

return prior_predictive


def run_emll(self):
"""Build linlog model and run inference."""
with self.pymc_model:
approx = pm.ADVI()
hist = approx.fit(
n=40000,
obj_optimizer=pm.adagrad_window(learning_rate=0.005),
total_grad_norm_constraint=100,
)

# Posterior draws can be obtained from the fitted approximation, e.g.:
# trace = hist.sample(500)
# ppc = pm.sample_posterior_predictive(trace)

return approx, hist

def save_results(self, approx, hist):
"""Save ADVI results in cloudpickle."""
with gzip.open("data/{self.model.name}.pgz", "wb") as f:
cloudpickle.dump(
{
"approx": approx,
"hist": hist,
},
f,
)
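# The saved results can be restored later with gzip + cloudpickle, e.g.:
#   with gzip.open("data/<model name>.pgz", "rb") as f:
#       results = cloudpickle.load(f)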


def main(
model_path,
v_star_path,
metabolite_concentrations_path,
boundary_fluxes_path,
enzyme_measurements_path,
reference_state,
):
"""Run BMCA for the provided model."""
emll_model = BMCA(
model_path=model_path,
reference_state=reference_state,
metabolite_concentrations_path=metabolite_concentrations_path,
enzyme_measurements_path=enzyme_measurements_path,
boundary_fluxes_path=boundary_fluxes_path,
v_star_path=v_star_path,
)
emll_model.preprocess_data()
emll_model.build_pymc_model()
approx, hist = emll_model.run_emll()
emll_model.save_results(approx, hist)


if __name__ == "__main__":
pass
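A minimal usage sketch of the new BMCA class, assuming hypothetical model and data paths (all file names and the reference-state label below are placeholders):

from emll.bmca import BMCA

# Hypothetical inputs; substitute real model and measurement files.
bmca = BMCA(
    model_path="models/e_coli_core.yaml",
    reference_state="reference",
    metabolite_concentrations_path="data/metabolites.csv",
    enzyme_measurements_path="data/enzymes.csv",
    boundary_fluxes_path="data/boundary_fluxes.csv",
    v_star_path="data/v_star.csv",
)
bmca.preprocess_data()
bmca.build_pymc_model()
approx, hist = bmca.run_emll()
bmca.save_results(approx, hist)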