Skip to content

Commit

Permalink
Merge pull request #4 from aertslab/add_hot_encoding_to_sequence
Browse files Browse the repository at this point in the history
Add support for decoding one hot encoded sequence back to a DNA seque…
  • Loading branch information
LukasMahieu authored Jun 27, 2024
2 parents 990c36a + c1ef411 commit 599c4da
Showing 1 changed file with 49 additions and 0 deletions.
49 changes: 49 additions & 0 deletions src/crested/tl/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,52 @@ def one_hot_encode_sequence(sequence: str) -> np.ndarray:
HOT_ENCODING_TABLE[np.frombuffer(sequence.encode("ascii"), dtype=np.uint8)],
axis=0,
)


def build_one_hot_decoding_table() -> np.ndarray:
"""Get hot decoding table to decode a one hot encoded sequence to a DNA sequence string."""
one_hot_decoding_table = np.full(np.iinfo(np.uint8).max, ord("N"), dtype=np.uint8)
one_hot_decoding_table[1] = ord("A")
one_hot_decoding_table[2] = ord("C")
one_hot_decoding_table[4] = ord("G")
one_hot_decoding_table[8] = ord("T")

return one_hot_decoding_table


HOT_DECODING_TABLE = build_one_hot_decoding_table()


def hot_encoding_to_sequence(one_hot_encoded_sequence: np.ndarray) -> str:
"""Decode a one hot encoded sequence to a DNA sequence string."""
# Convert hot encoded seqeuence from:
# (x, 4) with dtype=np.float32
# to:
# (x, 4) with dtype=np.uint8
# and finally combine ACGT dimensions to:
# (x, 1) with dtype=np.uint32
hes_u32 = one_hot_encoded_sequence.astype(np.uint8).view(np.uint32)

# Do some bitshifting magic to decode uint32 to DNA sequence string.
sequence = (
HOT_DECODING_TABLE[
(
(
hes_u32 << 31 >> 31
) # A: 2^0 : 1 -> 1 = A in HOT_DECODING_TABLE
| (
hes_u32 << 23 >> 30
) # C: 2^8 : 256 -> 2 = C in HOT_DECODING_TABLE
| (
hes_u32 << 15 >> 29
) # G: 2^16 : 65536 -> 4 = G in HOT_DECODING_TABLE
| (
hes_u32 << 7 >> 28
) # T: 2^24 : 16777216 -> 8 = T in HOT_DECODING_TABLE
).astype(np.uint8)
]
.tobytes()
.decode("ascii")
)

return sequence

0 comments on commit 599c4da

Please sign in to comment.