support cas12-screens #40

Merged (7 commits) on Apr 5, 2024

Changes from all commits
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,6 +1,9 @@
**__pycache__
**.idea
.vscode

**.DS_Store

docs/build
dist
build/
Binary file not shown.
Binary file not shown.
35 changes: 35 additions & 0 deletions screenpro/ngs/__init__.py
@@ -0,0 +1,35 @@
## Copyright (c) 2022-2024 ScreenPro2 Development Team.
## All rights reserved.
## Gilbert Lab, UCSF / Arc Institute.
## Multi-Omics Tech Center, Arc Institute.

## This part of the software is conceptualized and developed by Abolfazl Arab (@abearab)
## with support from Nick Youngblut (@nick-youngblut).

'''Scripts to work with NGS data

This module provides functions to process FASTQ files from screens with single or dual guide
libraries. In general, the algorithm is fairly simple:
1. Read the FASTQ file and extract the relevant sequences
2. Count the exact number of occurrences of each unique sequence
3. Map the counted sequences to the reference sequence library
4. Return the mapped (or unmapped) counted events as dataframe(s)

For single-guide screens, sequences are counted as single protospacers
from a single-end read file (R1). These sequences are then mapped to the reference
library of protospacer sequences.

For dual-guide screens, sequences are counted as pairs of protospacer A and B
from paired-end read files (R1 and R2). These pairs are then mapped to the reference
library of protospacer A and B pairs.

Because the algorithm counts first and maps second, it can in principle detect any observed
sequence, which makes it possible to detect recombination events. In a dual-guide design,
these are reads in which protospacer A and B are not paired as in the reference library:
- Protospacer A and B are both present in the reference library but paired differently
- Only one of protospacer A and B is present in the reference library
- Neither protospacer A nor B is present in the reference library
'''

from . import cas9
from . import cas12
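
A minimal sketch of the count-then-map workflow described in the docstring above, using the
single-guide counter exercised in screenpro/tests/test_fastq2count.py. The demo FASTQ path and
trimming parameters come from that test (further arguments shown in the test are elided here);
the toy `library` table, the mapping step, and the assumption that the counter returns
'sequence' and 'count' columns are illustrative, not part of this diff:

import polars as pl
from screenpro import ngs

# Steps 1-2: read the FASTQ file and count exact occurrences of each unique protospacer
df_count = ngs.cas9.fastq_to_count_single_guide(
    fastq_file_path='demo/step1_process_fastq_files/example_crispri_v2_sample.fastq.gz',
    trim5p_start=2,
    trim5p_length=19,
)

# Steps 3-4: map the counted sequences to a reference library of protospacer sequences
# (toy library for illustration; a real one comes from the screen design)
library = pl.DataFrame({'protospacer': ['GTTGGGAGCTGGTGGTGTA'], 'target': ['GENE_X']})
df_mapped = library.join(df_count, left_on='protospacer', right_on='sequence', how='left')
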
206 changes: 206 additions & 0 deletions screenpro/ngs/cas12.py
@@ -0,0 +1,206 @@
import biobear as bb
import polars as pl
from time import time


def fastq_to_count_merged_reads(
        fastq_file_path: str,
        verbose: bool = False) -> pl.DataFrame:
    if verbose: print('count unique sequences ...')
    t0 = time()

    session = bb.connect()

    # count exact occurrences of each unique read sequence directly from the FASTQ file
    sql_cmd = f"""
    SELECT f.sequence AS sequence, COUNT(*) as count
    FROM fastq_scan('{fastq_file_path}') f
    GROUP BY sequence
    """

    df_count = session.sql(sql_cmd).to_polars()

    if verbose: print("done in %0.3fs" % (time() - t0))

    return df_count


def get_spacers_cas12(df_count, DRref):

    # get the 1st spacer sequence (the 23 nt immediately 5' of DR-1)
    df_count = df_count.with_columns(
        DR1_loc = df_count['sequence'].str.find(DRref['DR-1']),
    ).with_columns(
        pl.col('sequence').str.slice(
            pl.col("DR1_loc") - 23, 23
        ).alias("SP1_sequence")
    )

    # get the 2nd and subsequent spacer sequences (3' of each DR)

    for DR_key, DR_seq in DRref.items():
        DR_n = int(DR_key[-1])  # get the number of the DR
        spacer_i = DR_n + 1  # get the spacer number, 3' of the DR sequence
        df_count = df_count.with_columns(
            df_count['sequence'].str.find(DR_seq).alias(f"DR{DR_n}_loc")
        ).with_columns(
            pl.col('sequence').str.slice(
                pl.col(f"DR{DR_n}_loc") + 19, 23
            ).alias(f"SP{spacer_i}_sequence")
        )

    # aggregate counts per unique combination of spacer sequences
    out = df_count.select([
        f'SP{i}_sequence' for i in range(1, len(DRref) + 2)
    ] + ['count']).group_by([
        f'SP{i}_sequence' for i in range(1, len(DRref) + 2)
    ]).sum()

    return df_count, out
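
# Illustrative worked example (not in the original diff): for a merged read laid out as
# [23-nt spacer 1][DR-1][23-nt spacer 2], str.find locates the start of DR-1, so
# slice(DR1_loc - 23, 23) recovers spacer 1 immediately 5' of the DR, and
# slice(DR1_loc + 19, 23) recovers the 23 nt starting 19 nt into the DR, i.e. spacer 2,
# under the assumption of a 19-nt direct repeat.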


def map_to_cas12_pairs_library(df_count, library, DR1_seq, get_recombinant=False, verbose=False):

    t0 = time()

    df_count, df_count_split = get_spacers_cas12(df_count, DRref={'DR-1': DR1_seq})

    if verbose:
        perc_DR1 = df_count.with_columns(
            pl.col('DR1_loc').fill_null(0).gt(0)
        ).filter(
            pl.col('DR1_loc')
        ).get_column('count').sum() / df_count['count'].sum() * 100

        print(f"% counts with DR1: {perc_DR1}")

    # map counted spacer pairs to the reference library (left join keeps all library members)
    df_res = pl.DataFrame(library[['SP1_sequence', 'SP2_sequence']].reset_index()).join(
        df_count_split,
        on=["SP1_sequence", "SP2_sequence"], how="left"
    )

    if verbose:
        perc_mapped = df_res['count'].drop_nulls().sum() / df_count['count'].sum() * 100

        print(f"% counts mapped to library: {perc_mapped}")

    if get_recombinant:
        # counted pairs that did not match any library pair (anti join)
        df_res_unmap = df_count_split.join(
            pl.DataFrame(
                library[['SP1_sequence', 'SP2_sequence']].reset_index()
            ), on=["SP1_sequence", "SP2_sequence"], how="anti"
        )

        # re-map each spacer individually to name recombination events
        df_res_unmap_remapped_sp1 = df_res_unmap.join(
            pl.DataFrame(library[['SP1_name', 'SP1_id', 'SP1_sequence']]), on=["SP1_sequence"], how="left"
        )

        df_res_unmap_remapped_sp1_sp2 = df_res_unmap_remapped_sp1.join(
            pl.DataFrame(library[['SP2_name', 'SP2_id', 'SP2_sequence']]),
            on=["SP2_sequence"], how="left"
        ).drop_nulls().unique().with_columns(
            recombinant_name=pl.concat_str(
                [
                    pl.col("SP1_name"),
                    pl.col("SP1_id"),
                    pl.col("SP2_name"),
                    pl.col("SP2_id"),
                ],
                separator="_"
            )
        ).select(['recombinant_name', 'SP1_sequence', 'SP2_sequence', 'count'])

        if verbose:
            perc_remapped = df_res_unmap_remapped_sp1_sp2['count'].drop_nulls().sum() / df_count['count'].sum() * 100

            print(f"% counts remapped to library: {perc_remapped} [fully remapped recombination events]")

        if verbose: print("done in %0.3fs" % (time() - t0))

        return df_res, df_res_unmap_remapped_sp1_sp2

    else:
        if verbose: print("done in %0.3fs" % (time() - t0))

        return df_res


def map_to_cas12_triplets_library(df_count, library, DR1_seq, DR2_seq, get_recombinant=False, verbose=False):

    t0 = time()

    df_count, df_count_split = get_spacers_cas12(df_count, DRref={'DR-1': DR1_seq, 'DR-2': DR2_seq})

    if verbose:
        perc_DR1 = df_count.with_columns(
            pl.col('DR1_loc').fill_null(0).gt(0)
        ).filter(
            pl.col('DR1_loc')
        ).get_column('count').sum() / df_count['count'].sum() * 100

        print(f"% counts with DR1: {perc_DR1}")

        perc_DR2 = df_count.with_columns(
            pl.col('DR2_loc').fill_null(0).gt(0)
        ).filter(
            pl.col('DR2_loc')
        ).get_column('count').sum() / df_count['count'].sum() * 100

        print(f"% counts with DR2: {perc_DR2}")

    # map counted spacer triplets to the reference library (left join keeps all library members)
    df_res = pl.DataFrame(library[['SP1_sequence', 'SP2_sequence', 'SP3_sequence']].reset_index()).join(
        df_count_split,
        on=["SP1_sequence", "SP2_sequence", "SP3_sequence"], how="left"
    )

    if verbose:
        perc_mapped = df_res['count'].drop_nulls().sum() / df_count['count'].sum() * 100

        print(f"% counts mapped to library: {perc_mapped}")

    if get_recombinant:
        # counted triplets that did not match any library triplet (anti join)
        df_res_unmap = df_count_split.join(
            pl.DataFrame(
                library[['SP1_sequence', 'SP2_sequence', 'SP3_sequence']].reset_index()
            ), on=["SP1_sequence", "SP2_sequence", "SP3_sequence"], how="anti"
        )

        # re-map each spacer individually to name recombination events
        df_res_unmap_remapped_sp1 = df_res_unmap.join(
            pl.DataFrame(library[['SP1_name', 'SP1_id', 'SP1_sequence']]), on=["SP1_sequence"], how="left"
        )

        df_res_unmap_remapped_sp1_sp2 = df_res_unmap_remapped_sp1.join(
            pl.DataFrame(library[['SP2_name', 'SP2_id', 'SP2_sequence']]),
            on=["SP2_sequence"], how="left"
        )

        df_res_unmap_remapped_sp1_sp2_sp3 = df_res_unmap_remapped_sp1_sp2.join(
            pl.DataFrame(library[['SP3_name', 'SP3_id', 'SP3_sequence']]),
            on=["SP3_sequence"], how="left"
        ).drop_nulls().unique().with_columns(
            recombinant_name=pl.concat_str(
                [
                    pl.col("SP1_name"),
                    pl.col("SP1_id"),
                    pl.col("SP2_name"),
                    pl.col("SP2_id"),
                    pl.col("SP3_name"),
                    pl.col("SP3_id"),
                ],
                separator="_"
            )
        ).select(['recombinant_name', 'SP1_sequence', 'SP2_sequence', 'SP3_sequence', 'count'])

        if verbose:
            perc_remapped = df_res_unmap_remapped_sp1_sp2_sp3['count'].drop_nulls().sum() / df_count['count'].sum() * 100

            print(f"% counts remapped to library: {perc_remapped} [fully remapped recombination events]")

        if verbose: print("done in %0.3fs" % (time() - t0))

        return df_res, df_res_unmap_remapped_sp1_sp2_sp3

    else:

        if verbose: print("done in %0.3fs" % (time() - t0))

        return df_res
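
A minimal usage sketch for the new cas12 module, assuming merged reads in a single FASTQ file
and a pandas reference library with one row per construct. The file path, direct-repeat
sequence, and library contents below are illustrative placeholders, not part of this diff; the
column names (SP1_name, SP1_id, SP1_sequence, SP2_...) follow what the mapper above selects
from `library`:

import pandas as pd
from screenpro.ngs import cas12

# reference library: one row per construct; placeholder 23-nt spacer sequences
library = pd.DataFrame({
    'SP1_name': ['GENE_A'], 'SP1_id': ['GENE_A_sgID'], 'SP1_sequence': ['ACGTACGTACGTACGTACGTACG'],
    'SP2_name': ['GENE_B'], 'SP2_id': ['GENE_B_sgID'], 'SP2_sequence': ['TGCATGCATGCATGCATGCATGC'],
}, index=['GENE_A|GENE_B'])

# count unique merged reads, then split spacers around DR-1 and map pairs to the library
df_count = cas12.fastq_to_count_merged_reads('merged_reads.fastq.gz', verbose=True)
df_res, df_recombinant = cas12.map_to_cas12_pairs_library(
    df_count, library,
    DR1_seq='AATTTCTACTCTTGTAGAT',  # placeholder 19-nt direct repeat (the +19 offset above assumes this length)
    get_recombinant=True, verbose=True,
)
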
33 changes: 0 additions & 33 deletions screenpro/ngs.py → screenpro/ngs/cas9.py
@@ -1,36 +1,3 @@
## Copyright (c) 2022-2024 ScreenPro2 Development Team.
## All rights reserved.
## Gilbart Lab, UCSF / Arc Institute.
## Multi-Omics Tech Center, Arc Insititue.

## This part of the software is conceptualized and developed by Abolfazl Arab (@abearab)
## with support from the Nick Youngblut (@nick-youngblut).

'''Scripts to work with NGS data

This module provides functions to process FASTQ files from screens with single or dual guide
libraries. In general, the algorithm is fairly simple:
1. Read the FASTQ file and extract the proper sequences
2. Count the exact number of occurrences for each unique sequence
3. Map the counted sequences to the reference sequence library
4. Return the counted mapped or unmapped events as a dataframe(s)

For single-guide screens, the sequences are counted as single protospacer
from a single-end read file (R1). Then, these sequences are mapped to the reference
library of protospacer sequences.

For dual-guide screens, the sequences are counted as pairs of protospacer A and B
from paired-end read files (R1 and R2). Then, sequences are mapped to the reference
library of protospacer A and B pairs.

Theoretically, the algorithm is able to detect any observed sequence since it is counting first
and then mapping. Therefore, the recombination events can be detected. In dual-guide design
protospacer A and B are not the same pairs as in the reference library. These events include:
- Protospacer A and B pairs are present in the reference library but paired differently
- Only one of the protospacer A and B is present in the reference library
- None of the protospacer A and B is present in the reference library
'''

from time import time
import pandas as pd
import polars as pl
8 changes: 4 additions & 4 deletions screenpro/tests/test_fastq2count.py
@@ -2,8 +2,8 @@
from screenpro import ngs


-def test_single_guide():
-    df_count = ngs.fastq_to_count_single_guide(
+def test_cas9_single_guide():
+    df_count = ngs.cas9.fastq_to_count_single_guide(
        fastq_file_path='demo/step1_process_fastq_files/example_crispri_v2_sample.fastq.gz',
        trim5p_start=2,
        trim5p_length=19,
@@ -13,8 +13,8 @@ def test_single_guide():
    assert isinstance(df_count, pl.DataFrame)


-def test_dual_guide():
-    df_count = ngs.fastq_to_count_dual_guide(
+def test_cas9_dual_guide():
+    df_count = ngs.cas9.fastq_to_count_dual_guide(
        R1_fastq_file_path='demo/step1_process_fastq_files/example_crispri_v3_sample_R1.fastq.gz',
        R2_fastq_file_path='demo/step1_process_fastq_files/example_crispri_v3_sample_R2.fastq.gz',
        trim5p_pos1_start=2,