Skip to content

Commit

Permalink
Merge pull request #75 from PNNL-CompBio/dev-memory
Browse files Browse the repository at this point in the history
Dev memory
  • Loading branch information
biodataganache authored Aug 28, 2022
2 parents da20987 + efc2b9b commit 877872c
Show file tree
Hide file tree
Showing 10 changed files with 308 additions and 506 deletions.
584 changes: 115 additions & 469 deletions resources/tutorial/snekmer_demo.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions snekmer/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def load_npz(
columns: Dict[str, str] = {
"ids": "sequence_id",
"seqs": "sequence",
"vecs": "sequence_vector",
"vecs": "sequence_vector"
},
) -> pd.DataFrame:
"""Compile .npz results into dataframe.
Expand Down Expand Up @@ -88,7 +88,7 @@ def load_npz(
df.update({f"{out_col}_length": [len(s) for s in data[in_col]]})
# df.update()

return pd.DataFrame(df)
return pd.DataFrame(df), data["kmerlist"]


def read_output_kmers(filename: str) -> List[str]:
Expand Down
20 changes: 19 additions & 1 deletion snekmer/rules/cluster.smk
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,26 @@ rule cluster:

# tabulate vectorized seq data
data = list()
kmerbasis = list()
kmerlists = list()
for f in input.data:
data.append(skm.io.load_npz(f))
dat,kmerlist = skm.io.load_npz(f)
# memfix: these need to be harmonized
data.append(dat)
kmerlists.append(kmerlist)
kmerbasis.extend(kmerlist)

# make a superset of kmers
kmerbasis = np.unique(kmerlist)
basis = skm.vectorize.KmerVec(config["alphabet"], config["k"])
basis.set_kmer_set(kmerbasis)

for i in range(len(data)):
this = data[i]
kmerlist = kmerlists[i]
vecs = skm.utils.to_feature_matrix(this["sequence_vector"].values)
that = basis.harmonize_data(vecs, kmerlist)
this["sequence_vector"] = that.tolist()

data = pd.concat(data, ignore_index=True)
data["background"] = [f in BGS for f in data["filename"]]
Expand Down
12 changes: 11 additions & 1 deletion snekmer/rules/kmerize.smk
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,18 @@ rule vectorize:
ids.append(f.id)
lengths.append(len(f.seq))

# memfix: we need to reformat the vec array to reflect
# all of the observed kmers in a matrix - rather than
# a list of lists of variable length. Then we'll also
# add the kmer order to what we save - that way we
# don't need to consider every possible kmer
vecs, kmerlist = skm.vectorize.make_feature_matrix(vecs)

kmer.set_kmer_set(kmer_set=kmerlist)

# save seqIO output and transformed vecs
np.savez_compressed(output.data, ids=ids, seqs=seqs, vecs=vecs, lengths=lengths)
np.savez_compressed(output.data, kmerlist=kmerlist, ids=ids,
seqs=seqs, vecs=vecs, lengths=lengths)

with open(output.kmerobj, "wb") as f:
pickle.dump(kmer, f)
58 changes: 43 additions & 15 deletions snekmer/rules/model.smk
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ rule score:
data=join("output", "scoring", "sequences", "{nb}.csv.gz"),
weights=join("output", "scoring", "weights", "{nb}.csv.gz"),
scorer=join("output", "scoring", "{nb}.scorer"),
matrix=join("output", "scoring", "{nb}.matrix"),
log:
join("output", "scoring", "log", "{nb}.log"),
run:
Expand All @@ -138,10 +139,24 @@ rule score:

# tabulate vectorized seq data
data = list()
kmer_sets = list()
for f in input.data:
data.append(skm.io.load_npz(f))
this, kmerlist = skm.io.load_npz(f)
data.append(this)
kmer_sets.append(kmerlist)

# now we're loading all the other models in too
# each has a different basis set - we need to
# harmonize those in data before continuing
for i in range(len(data)):
this = data[i]
kmerlist = kmer_sets[i]
vecs = skm.utils.to_feature_matrix(this["sequence_vector"].values)
that = kmer.harmonize_data(vecs, kmerlist)
this["sequence_vector"] = that.tolist()

data = pd.concat(data, ignore_index=True)

data["background"] = [f in BGS for f in data["filename"]]

# log conversion step runtime
Expand Down Expand Up @@ -203,6 +218,11 @@ rule score:
# log time to compute class probabilities
skm.utils.log_runtime(log[0], start_time, step="class_probabilities")

# this will be redundant with the output csv file - except that
# it will have the harmonized matrix included with it
with(open(output.matrix, "wb") as f):
pickle.dump(data, f)

# save all files to respective outputs
delete_cols = ["vec", "sequence_vector"]
for col in delete_cols:
Expand All @@ -211,6 +231,7 @@ rule score:
data.drop(columns="sequence_vector").to_csv(
output.data, index=False, compression="gzip"
)

class_probabilities.to_csv(output.weights, index=False, compression="gzip")
with open(output.scorer, "wb") as f:
pickle.dump(scorer, f)
Expand All @@ -225,28 +246,35 @@ rule model:
data=rules.score.output.data,
weights=rules.score.output.weights,
kmerobj=rules.score.input.kmerobj,
matrix=rules.score.output.matrix
output:
model=join("output", "model", "{nb}.model"),
results=join("output", "model", "results", "{nb}.csv"),
figs=directory(join("output", "model", "figures", "{nb}")),
run:
# create lookup table to match vector to sequence by file+ID
lookup = {}
for f in input.raw:
loaded = np.load(f)
lookup.update(
{
(splitext(basename(f))[0], seq_id): seq_vec
for seq_id, seq_vec in zip(loaded["ids"], loaded["vecs"])
}
)
#lookup = {}
#for f in input.raw:
# loaded = np.load(f)
# lookup.update(
# {
# (splitext(basename(f))[0], seq_id): seq_vec
# for seq_id, seq_vec in zip(loaded["ids"], loaded["vecs"])
# }
# )
#print(lookup)
with open(input.matrix, "rb") as f:
matrix = pickle.load(f)

data = matrix

# load all input data and encode rule-wide variables
data = pd.read_csv(input.data)
data["sequence_vector"] = [
lookup[(seq_f, seq_id)]
for seq_f, seq_id in zip(data["filename"], data["sequence_id"])
]
#data = pd.read_csv(input.data)
#data["sequence_vector"] = [
# lookup[(seq_f, seq_id)]
# for seq_f, seq_id in zip(data["filename"], data["sequence_id"])
#]

scores = pd.read_csv(input.weights)
family = skm.utils.get_family(
skm.utils.split_file_ext(input.weights)[0], regex=config["input_file_regex"]
Expand Down
2 changes: 1 addition & 1 deletion snekmer/rules/score.smk
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ rule score:
data["train"] = [idx in i_train for idx in data.index]

# generate family scores and object

scorer = skm.score.KmerScorer()
scorer.fit(
list(kmer.kmer_set.kmers),
Expand Down Expand Up @@ -110,4 +111,3 @@ rule score:

# record script endtime
skm.utils.log_runtime(log[0], start_time)

9 changes: 6 additions & 3 deletions snekmer/rules/search.smk
Original file line number Diff line number Diff line change
Expand Up @@ -137,15 +137,18 @@ rule search:
# load vectorized sequences, score, and predict scores
results = list()
for f in input.files:
df = skm.io.load_npz(f)
df, kmerlist = skm.io.load_npz(f)
filename = skm.utils.split_file_ext(basename(f))[0]

vecs = skm.utils.to_feature_matrix(df["sequence_vector"].values)
#vecs = df["sequence_vector"].values
#print(vecs)
base_kmers = list(kmer.kmer_set.kmers)
scores = scorer.predict(vecs, kmerlist)

scores = scorer.predict(vecs, list(kmer.kmer_set.kmers))
predictions = model.predict(scores.reshape(-1, 1))
predicted_probas = model.predict_proba(scores.reshape(-1, 1))

# display results (score, family assignment, and probability)
df[f"{family}_score"] = scores # scorer output
df[family] = [True if p == 1 else False for p in predictions]
Expand Down
33 changes: 30 additions & 3 deletions snekmer/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -584,10 +584,28 @@ def fit(
"background": list(data.index[data[bg_col]]),
}


# step 0: get feature matrix and all labels
labels = data[label_col].values
matrix = to_feature_matrix(data[vec_col])
# print(matrix.shape)

#print(data[vec_col].shape)
#print(data[vec_col])

x = len(data[vec_col])
y = len(data[vec_col][0])
#print(x,y)

matrix = np.zeros(x*y).reshape((x,y))
#print(matrix.shape)

for i in range(x):
for j in range(y):
value = data[vec_col][i]
value = value[j]
matrix[i,j] = value
#matrix = np.asarray(np.concatenate(data[vec_col])).reshape((len(data[vec_col]), len(data[vec_col][0])))
#print(matrix.shape)


# step 0: get indices for label (family) ids
# i_label = {
Expand All @@ -598,6 +616,7 @@ def fit(
# step 1: score sample sequences and fit score scaler
sample_matrix = matrix[i_background["sample"]]
s_labels = labels[i_background["sample"]]

probas = feature_class_probabilities(
sample_matrix.T, s_labels, kmers=self.kmers.basis
)
Expand Down Expand Up @@ -697,10 +716,18 @@ def predict(self, array, kmers):
"""
# fit new input data to the correct kmer basis
#self.kmers.set_basis(kmers)
array = self.kmers.transform(array, kmers)

#return (
# apply_feature_probabilities(
# array, self.probabilities["sample"], scaler=self.scaler
# )
# / self.score_norm
#)
return (
apply_feature_probabilities(
array, self.probabilities["sample"], scaler=self.scaler
array, self.probabilities["sample"], scaler=False
)
/ self.score_norm
)
1 change: 0 additions & 1 deletion snekmer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,4 +200,3 @@ def to_feature_matrix(
length_array = np.ones(len(array))
array = [np.array(a) / length for a, length in zip(array, length_array)]
return np.asarray(array)

Loading

0 comments on commit 877872c

Please sign in to comment.