Skip to content

Commit

Permalink
Merge pull request #40 from cas12-screens
Browse files Browse the repository at this point in the history
- split NGS modules into cas9 and cas12 sub-modules
- add functions to enable processing higher-order screens using cas12 platforms
  • Loading branch information
abearab authored Apr 5, 2024
2 parents 770ea2a + 13382b3 commit 2f6deab
Show file tree
Hide file tree
Showing 7 changed files with 248 additions and 37 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
**__pycache__
**.idea
.vscode

**.DS_Store

docs/build
dist
build/
Expand Down
Binary file not shown.
Binary file not shown.
35 changes: 35 additions & 0 deletions screenpro/ngs/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
## Copyright (c) 2022-2024 ScreenPro2 Development Team.
## All rights reserved.
## Gilbart Lab, UCSF / Arc Institute.
## Multi-Omics Tech Center, Arc Insititue.

## This part of the software is conceptualized and developed by Abolfazl Arab (@abearab)
## with support from the Nick Youngblut (@nick-youngblut).

'''Scripts to work with NGS data
This module provides functions to process FASTQ files from screens with single or dual guide
libraries. In general, the algorithm is fairly simple:
1. Read the FASTQ file and extract the proper sequences
2. Count the exact number of occurrences for each unique sequence
3. Map the counted sequences to the reference sequence library
4. Return the counted mapped or unmapped events as a dataframe(s)
For single-guide screens, the sequences are counted as single protospacer
from a single-end read file (R1). Then, these sequences are mapped to the reference
library of protospacer sequences.
For dual-guide screens, the sequences are counted as pairs of protospacer A and B
from paired-end read files (R1 and R2). Then, sequences are mapped to the reference
library of protospacer A and B pairs.
Theoretically, the algorithm is able to detect any observed sequence since it is counting first
and then mapping. Therefore, the recombination events can be detected. In dual-guide design
protospacer A and B are not the same pairs as in the reference library. These events include:
- Protospacer A and B pairs are present in the reference library but paired differently
- Only one of the protospacer A and B is present in the reference library
- None of the protospacer A and B is present in the reference library
'''

from . import cas9
from . import cas12
206 changes: 206 additions & 0 deletions screenpro/ngs/cas12.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
import biobear as bb
import polars as pl
from time import time


def fastq_to_count_merged_reads(
fastq_file_path:str,
verbose: bool=False) -> pl.DataFrame:
if verbose: ('count unique sequences ...')
t0 = time()

session = bb.connect()

sql_cmd = f"""
SELECT f.sequence AS sequence, COUNT(*) as count
FROM fastq_scan('{fastq_file_path}') f
GROUP BY sequence
"""

df_count = session.sql(sql_cmd).to_polars()

if verbose: print("done in %0.3fs" % (time() - t0))

return df_count


def get_spacers_cas12(df_count,DRref):

# get 1st spacer sequence
df_count = df_count.with_columns(
DR1_loc = df_count['sequence'].str.find(DRref['DR-1']),
).with_columns(
pl.col('sequence').str.slice(
pl.col("DR1_loc")-23, 23
).alias("SP1_sequence")
)

# get 2ed+ spacer sequence

for DR_key, DR_seq in DRref.items():
DR_n = int(DR_key[-1]) # get the number of the DR
spacer_i = DR_n+1 # get the spacer number, 3' of the DR sequence
df_count = df_count.with_columns(
df_count['sequence'].str.find(DR_seq).alias(f"DR{DR_n}_loc")
).with_columns(
pl.col('sequence').str.slice(
pl.col(f"DR{DR_n}_loc")+19, 23
).alias(f"SP{spacer_i}_sequence")
)

out = df_count.select([
f'SP{i}_sequence' for i in range(1,len(DRref)+2)
]+ ['count']).group_by([
f'SP{i}_sequence' for i in range(1,len(DRref)+2)
]).sum()

return df_count, out


def map_to_cas12_pairs_library(df_count,library,DR1_seq, get_recominant=False, verbose=False):

t0 = time()

df_count, df_count_split = get_spacers_cas12(df_count, DRref = {'DR-1': DR1_seq})

if verbose:
perc_DR1 = df_count.with_columns(
pl.col('DR1_loc').fill_null(0).gt(0)
).filter(
pl.col('DR1_loc')
).get_column('count').sum() / df_count['count'].sum() * 100

print(f"% counts with DR1: {perc_DR1}")

df_res = pl.DataFrame(library[['SP1_sequence','SP2_sequence']].reset_index()).join(
df_count_split,
on=["SP1_sequence","SP2_sequence"], how="left"
)

if verbose:
perc_mapped = df_res['count'].drop_nulls().sum() / df_count['count'].sum() * 100

print(f"% counts mapped to library: {perc_mapped}")


if get_recominant:
df_res_unmap = df_count_split.join(
pl.DataFrame(
library[['SP1_sequence','SP2_sequence']].reset_index()
), on=["SP1_sequence","SP2_sequence"], how="anti"
)

df_res_unmap_remapped_sp1 = df_res_unmap.join(
pl.DataFrame(library[['SP1_name','SP1_id','SP1_sequence']]), on=["SP1_sequence"], how="left"
)

df_res_unmap_remapped_sp1_sp2 = df_res_unmap_remapped_sp1.join(
pl.DataFrame(library[['SP2_name','SP2_id','SP2_sequence']]),
on=["SP2_sequence"], how="left"
).drop_nulls().unique().with_columns(
recombinant_name=pl.concat_str(
[
pl.col("SP1_name"),
pl.col("SP1_id"),
pl.col("SP2_name"),
pl.col("SP2_id"),
],
separator="_"
)
).select(['recombinant_name','SP1_sequence','SP2_sequence','count'])

if verbose:
perc_remapped = df_res_unmap_remapped_sp1_sp2['count'].drop_nulls().sum() / df_count['count'].sum() * 100

print(f"% counts remapped to library: {perc_remapped} [fully remapped recombination events]")

if verbose: print("done in %0.3fs" % (time() - t0))

return df_res, df_res_unmap_remapped_sp1_sp2

else:
if verbose: print("done in %0.3fs" % (time() - t0))

return df_res


def map_to_cas12_triplets_library(df_count,library,DR1_seq, DR2_seq, get_recominant=False, verbose=False):

t0 = time()

df_count, df_count_split = get_spacers_cas12(df_count, DRref = {'DR-1': DR1_seq, 'DR-2': DR2_seq})

if verbose:
perc_DR1 = df_count.with_columns(
pl.col('DR1_loc').fill_null(0).gt(0)
).filter(
pl.col('DR1_loc')
).get_column('count').sum() / df_count['count'].sum() * 100

print(f"% counts with DR1: {perc_DR1}")

perc_DR2 = df_count.with_columns(
pl.col('DR2_loc').fill_null(0).gt(0)
).filter(
pl.col('DR2_loc')
).get_column('count').sum() / df_count['count'].sum() * 100

print(f"% counts with DR2: {perc_DR2}")

df_res = pl.DataFrame(library[['SP1_sequence','SP2_sequence','SP3_sequence']].reset_index()).join(
df_count_split,
on=["SP1_sequence","SP2_sequence","SP3_sequence"], how="left"
)

if verbose:
perc_mapped = df_res['count'].drop_nulls().sum() / df_count['count'].sum() * 100

print(f"% counts mapped to library: {perc_mapped}")

if get_recominant:
df_res_unmap = df_count_split.join(
pl.DataFrame(
library[['SP1_sequence','SP2_sequence','SP3_sequence']].reset_index()
), on=["SP1_sequence","SP2_sequence","SP3_sequence"], how="anti"
)

df_res_unmap_remapped_sp1 = df_res_unmap.join(
pl.DataFrame(library[['SP1_name','SP1_id','SP1_sequence']]), on=["SP1_sequence"], how="left"
)

df_res_unmap_remapped_sp1_sp2 = df_res_unmap_remapped_sp1.join(
pl.DataFrame(library[['SP2_name','SP2_id','SP2_sequence']]),
on=["SP2_sequence"], how="left"
)

df_res_unmap_remapped_sp1_sp2_sp3 = df_res_unmap_remapped_sp1_sp2.join(
pl.DataFrame(library[['SP3_name','SP3_id','SP3_sequence']]),
on=["SP3_sequence"], how="left"
).drop_nulls().unique().with_columns(
recombinant_name=pl.concat_str(
[
pl.col("SP1_name"),
pl.col("SP1_id"),
pl.col("SP2_name"),
pl.col("SP2_id"),
pl.col("SP3_name"),
pl.col("SP3_id"),
],
separator="_"
)
).select(['recombinant_name','SP1_sequence','SP2_sequence','SP3_sequence','count'])

if verbose:
perc_remapped = df_res_unmap_remapped_sp1_sp2_sp3['count'].drop_nulls().sum() / df_count['count'].sum() * 100

print(f"% counts remapped to library: {perc_remapped} [fully remapped recombination events]")

if verbose: print("done in %0.3fs" % (time() - t0))

return df_res, df_res_unmap_remapped_sp1_sp2_sp3

else:

if verbose: print("done in %0.3fs" % (time() - t0))

return df_res
33 changes: 0 additions & 33 deletions screenpro/ngs.py → screenpro/ngs/cas9.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,3 @@
## Copyright (c) 2022-2024 ScreenPro2 Development Team.
## All rights reserved.
## Gilbart Lab, UCSF / Arc Institute.
## Multi-Omics Tech Center, Arc Insititue.

## This part of the software is conceptualized and developed by Abolfazl Arab (@abearab)
## with support from the Nick Youngblut (@nick-youngblut).

'''Scripts to work with NGS data
This module provides functions to process FASTQ files from screens with single or dual guide
libraries. In general, the algorithm is fairly simple:
1. Read the FASTQ file and extract the proper sequences
2. Count the exact number of occurrences for each unique sequence
3. Map the counted sequences to the reference sequence library
4. Return the counted mapped or unmapped events as a dataframe(s)
For single-guide screens, the sequences are counted as single protospacer
from a single-end read file (R1). Then, these sequences are mapped to the reference
library of protospacer sequences.
For dual-guide screens, the sequences are counted as pairs of protospacer A and B
from paired-end read files (R1 and R2). Then, sequences are mapped to the reference
library of protospacer A and B pairs.
Theoretically, the algorithm is able to detect any observed sequence since it is counting first
and then mapping. Therefore, the recombination events can be detected. In dual-guide design
protospacer A and B are not the same pairs as in the reference library. These events include:
- Protospacer A and B pairs are present in the reference library but paired differently
- Only one of the protospacer A and B is present in the reference library
- None of the protospacer A and B is present in the reference library
'''

from time import time
import pandas as pd
import polars as pl
Expand Down
8 changes: 4 additions & 4 deletions screenpro/tests/test_fastq2count.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
from screenpro import ngs


def test_single_guide():
df_count = ngs.fastq_to_count_single_guide(
def test_cas9_single_guide():
df_count = ngs.cas9.fastq_to_count_single_guide(
fastq_file_path='demo/step1_process_fastq_files/example_crispri_v2_sample.fastq.gz',
trim5p_start=2,
trim5p_length=19,
Expand All @@ -13,8 +13,8 @@ def test_single_guide():
assert isinstance(df_count, pl.DataFrame)


def test_dual_guide():
df_count = ngs.fastq_to_count_dual_guide(
def test_cas9_dual_guide():
df_count = ngs.cas9.fastq_to_count_dual_guide(
R1_fastq_file_path='demo/step1_process_fastq_files/example_crispri_v3_sample_R1.fastq.gz',
R2_fastq_file_path='demo/step1_process_fastq_files/example_crispri_v3_sample_R2.fastq.gz',
trim5p_pos1_start=2,
Expand Down

0 comments on commit 2f6deab

Please sign in to comment.