support cas12-screens #40

Merged (7 commits) on Apr 5, 2024

Changes from all commits
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,6 +1,9 @@
**__pycache__
**.idea
.vscode

**.DS_Store

docs/build
dist
build/
Binary file not shown.
Binary file not shown.
35 changes: 35 additions & 0 deletions screenpro/ngs/__init__.py
@@ -0,0 +1,35 @@
## Copyright (c) 2022-2024 ScreenPro2 Development Team.
## All rights reserved.
## Gilbert Lab, UCSF / Arc Institute.
## Multi-Omics Tech Center, Arc Institute.

## This part of the software is conceptualized and developed by Abolfazl Arab (@abearab)
## with support from Nick Youngblut (@nick-youngblut).

'''Scripts to work with NGS data

This module provides functions to process FASTQ files from screens with single or dual guide
libraries. In general, the algorithm is fairly simple:
1. Read the FASTQ file and extract the relevant sequences
2. Count the exact number of occurrences of each unique sequence
3. Map the counted sequences to the reference sequence library
4. Return the mapped (or unmapped) counted events as dataframe(s)

For single-guide screens, sequences are counted as single protospacers
from a single-end read file (R1). These sequences are then mapped to the reference
library of protospacer sequences.

For dual-guide screens, sequences are counted as pairs of protospacer A and B
from paired-end read files (R1 and R2). These pairs are then mapped to the reference
library of protospacer A and B pairs.

Because the algorithm counts first and maps second, it can in principle detect any observed
sequence, which makes it possible to detect recombination events. In a dual-guide design,
these are reads in which protospacer A and B are not paired as in the reference library:
- Protospacer A and B are both present in the reference library but paired differently
- Only one of protospacer A and B is present in the reference library
- Neither protospacer A nor B is present in the reference library
'''

from . import cas9
from . import cas12
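
A minimal sketch of the count-then-map workflow described in the docstring above, using the
single-guide counter exercised in screenpro/tests/test_fastq2count.py. The demo FASTQ path and
trimming parameters come from that test (further arguments shown in the test are elided here);
the toy `library` table, the mapping step, and the assumption that the counter returns
'sequence' and 'count' columns are illustrative, not part of this diff:

import polars as pl
from screenpro import ngs

# Steps 1-2: read the FASTQ file and count exact occurrences of each unique protospacer
df_count = ngs.cas9.fastq_to_count_single_guide(
    fastq_file_path='demo/step1_process_fastq_files/example_crispri_v2_sample.fastq.gz',
    trim5p_start=2,
    trim5p_length=19,
)

# Steps 3-4: map the counted sequences to a reference library of protospacer sequences
# (toy library for illustration; a real one comes from the screen design)
library = pl.DataFrame({'protospacer': ['GTTGGGAGCTGGTGGTGTA'], 'target': ['GENE_X']})
df_mapped = library.join(df_count, left_on='protospacer', right_on='sequence', how='left')
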
206 changes: 206 additions & 0 deletions screenpro/ngs/cas12.py
@@ -0,0 +1,206 @@
import biobear as bb
import polars as pl
from time import time


def fastq_to_count_merged_reads(
        fastq_file_path: str,
        verbose: bool = False) -> pl.DataFrame:
    if verbose: print('count unique sequences ...')
    t0 = time()

    session = bb.connect()

    # count exact occurrences of each unique read sequence directly from the FASTQ file
    sql_cmd = f"""
    SELECT f.sequence AS sequence, COUNT(*) as count
    FROM fastq_scan('{fastq_file_path}') f
    GROUP BY sequence
    """

    df_count = session.sql(sql_cmd).to_polars()

    if verbose: print("done in %0.3fs" % (time() - t0))

    return df_count


def get_spacers_cas12(df_count, DRref):

    # get the 1st spacer sequence (the 23 nt immediately 5' of DR-1)
    df_count = df_count.with_columns(
        DR1_loc = df_count['sequence'].str.find(DRref['DR-1']),
    ).with_columns(
        pl.col('sequence').str.slice(
            pl.col("DR1_loc") - 23, 23
        ).alias("SP1_sequence")
    )

    # get the 2nd and subsequent spacer sequences (3' of each DR)

    for DR_key, DR_seq in DRref.items():
        DR_n = int(DR_key[-1])  # get the number of the DR
        spacer_i = DR_n + 1  # get the spacer number, 3' of the DR sequence
        df_count = df_count.with_columns(
            df_count['sequence'].str.find(DR_seq).alias(f"DR{DR_n}_loc")
        ).with_columns(
            pl.col('sequence').str.slice(
                pl.col(f"DR{DR_n}_loc") + 19, 23
            ).alias(f"SP{spacer_i}_sequence")
        )

    # aggregate counts per unique combination of spacer sequences
    out = df_count.select([
        f'SP{i}_sequence' for i in range(1, len(DRref) + 2)
    ] + ['count']).group_by([
        f'SP{i}_sequence' for i in range(1, len(DRref) + 2)
    ]).sum()

    return df_count, out
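
# Illustrative worked example (not in the original diff): for a merged read laid out as
# [23-nt spacer 1][DR-1][23-nt spacer 2], str.find locates the start of DR-1, so
# slice(DR1_loc - 23, 23) recovers spacer 1 immediately 5' of the DR, and
# slice(DR1_loc + 19, 23) recovers the 23 nt starting 19 nt into the DR, i.e. spacer 2,
# under the assumption of a 19-nt direct repeat.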


def map_to_cas12_pairs_library(df_count, library, DR1_seq, get_recombinant=False, verbose=False):

    t0 = time()

    df_count, df_count_split = get_spacers_cas12(df_count, DRref={'DR-1': DR1_seq})

    if verbose:
        perc_DR1 = df_count.with_columns(
            pl.col('DR1_loc').fill_null(0).gt(0)
        ).filter(
            pl.col('DR1_loc')
        ).get_column('count').sum() / df_count['count'].sum() * 100

        print(f"% counts with DR1: {perc_DR1}")

    # map counted spacer pairs to the reference library (left join keeps all library members)
    df_res = pl.DataFrame(library[['SP1_sequence', 'SP2_sequence']].reset_index()).join(
        df_count_split,
        on=["SP1_sequence", "SP2_sequence"], how="left"
    )

    if verbose:
        perc_mapped = df_res['count'].drop_nulls().sum() / df_count['count'].sum() * 100

        print(f"% counts mapped to library: {perc_mapped}")

    if get_recombinant:
        # counted pairs that did not match any library pair (anti join)
        df_res_unmap = df_count_split.join(
            pl.DataFrame(
                library[['SP1_sequence', 'SP2_sequence']].reset_index()
            ), on=["SP1_sequence", "SP2_sequence"], how="anti"
        )

        # re-map each spacer individually to name recombination events
        df_res_unmap_remapped_sp1 = df_res_unmap.join(
            pl.DataFrame(library[['SP1_name', 'SP1_id', 'SP1_sequence']]), on=["SP1_sequence"], how="left"
        )

        df_res_unmap_remapped_sp1_sp2 = df_res_unmap_remapped_sp1.join(
            pl.DataFrame(library[['SP2_name', 'SP2_id', 'SP2_sequence']]),
            on=["SP2_sequence"], how="left"
        ).drop_nulls().unique().with_columns(
            recombinant_name=pl.concat_str(
                [
                    pl.col("SP1_name"),
                    pl.col("SP1_id"),
                    pl.col("SP2_name"),
                    pl.col("SP2_id"),
                ],
                separator="_"
            )
        ).select(['recombinant_name', 'SP1_sequence', 'SP2_sequence', 'count'])

        if verbose:
            perc_remapped = df_res_unmap_remapped_sp1_sp2['count'].drop_nulls().sum() / df_count['count'].sum() * 100

            print(f"% counts remapped to library: {perc_remapped} [fully remapped recombination events]")

        if verbose: print("done in %0.3fs" % (time() - t0))

        return df_res, df_res_unmap_remapped_sp1_sp2

    else:
        if verbose: print("done in %0.3fs" % (time() - t0))

        return df_res


def map_to_cas12_triplets_library(df_count, library, DR1_seq, DR2_seq, get_recombinant=False, verbose=False):

    t0 = time()

    df_count, df_count_split = get_spacers_cas12(df_count, DRref={'DR-1': DR1_seq, 'DR-2': DR2_seq})

    if verbose:
        perc_DR1 = df_count.with_columns(
            pl.col('DR1_loc').fill_null(0).gt(0)
        ).filter(
            pl.col('DR1_loc')
        ).get_column('count').sum() / df_count['count'].sum() * 100

        print(f"% counts with DR1: {perc_DR1}")

        perc_DR2 = df_count.with_columns(
            pl.col('DR2_loc').fill_null(0).gt(0)
        ).filter(
            pl.col('DR2_loc')
        ).get_column('count').sum() / df_count['count'].sum() * 100

        print(f"% counts with DR2: {perc_DR2}")

    # map counted spacer triplets to the reference library (left join keeps all library members)
    df_res = pl.DataFrame(library[['SP1_sequence', 'SP2_sequence', 'SP3_sequence']].reset_index()).join(
        df_count_split,
        on=["SP1_sequence", "SP2_sequence", "SP3_sequence"], how="left"
    )

    if verbose:
        perc_mapped = df_res['count'].drop_nulls().sum() / df_count['count'].sum() * 100

        print(f"% counts mapped to library: {perc_mapped}")

    if get_recombinant:
        # counted triplets that did not match any library triplet (anti join)
        df_res_unmap = df_count_split.join(
            pl.DataFrame(
                library[['SP1_sequence', 'SP2_sequence', 'SP3_sequence']].reset_index()
            ), on=["SP1_sequence", "SP2_sequence", "SP3_sequence"], how="anti"
        )

        # re-map each spacer individually to name recombination events
        df_res_unmap_remapped_sp1 = df_res_unmap.join(
            pl.DataFrame(library[['SP1_name', 'SP1_id', 'SP1_sequence']]), on=["SP1_sequence"], how="left"
        )

        df_res_unmap_remapped_sp1_sp2 = df_res_unmap_remapped_sp1.join(
            pl.DataFrame(library[['SP2_name', 'SP2_id', 'SP2_sequence']]),
            on=["SP2_sequence"], how="left"
        )

        df_res_unmap_remapped_sp1_sp2_sp3 = df_res_unmap_remapped_sp1_sp2.join(
            pl.DataFrame(library[['SP3_name', 'SP3_id', 'SP3_sequence']]),
            on=["SP3_sequence"], how="left"
        ).drop_nulls().unique().with_columns(
            recombinant_name=pl.concat_str(
                [
                    pl.col("SP1_name"),
                    pl.col("SP1_id"),
                    pl.col("SP2_name"),
                    pl.col("SP2_id"),
                    pl.col("SP3_name"),
                    pl.col("SP3_id"),
                ],
                separator="_"
            )
        ).select(['recombinant_name', 'SP1_sequence', 'SP2_sequence', 'SP3_sequence', 'count'])

        if verbose:
            perc_remapped = df_res_unmap_remapped_sp1_sp2_sp3['count'].drop_nulls().sum() / df_count['count'].sum() * 100

            print(f"% counts remapped to library: {perc_remapped} [fully remapped recombination events]")

        if verbose: print("done in %0.3fs" % (time() - t0))

        return df_res, df_res_unmap_remapped_sp1_sp2_sp3

    else:

        if verbose: print("done in %0.3fs" % (time() - t0))

        return df_res
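
A minimal usage sketch for the new cas12 module, assuming merged reads in a single FASTQ file
and a pandas reference library with one row per construct. The file path, direct-repeat
sequence, and library contents below are illustrative placeholders, not part of this diff; the
column names (SP1_name, SP1_id, SP1_sequence, SP2_...) follow what the mapper above selects
from `library`:

import pandas as pd
from screenpro.ngs import cas12

# reference library: one row per construct; placeholder 23-nt spacer sequences
library = pd.DataFrame({
    'SP1_name': ['GENE_A'], 'SP1_id': ['GENE_A_sgID'], 'SP1_sequence': ['ACGTACGTACGTACGTACGTACG'],
    'SP2_name': ['GENE_B'], 'SP2_id': ['GENE_B_sgID'], 'SP2_sequence': ['TGCATGCATGCATGCATGCATGC'],
}, index=['GENE_A|GENE_B'])

# count unique merged reads, then split spacers around DR-1 and map pairs to the library
df_count = cas12.fastq_to_count_merged_reads('merged_reads.fastq.gz', verbose=True)
df_res, df_recombinant = cas12.map_to_cas12_pairs_library(
    df_count, library,
    DR1_seq='AATTTCTACTCTTGTAGAT',  # placeholder 19-nt direct repeat (the +19 offset above assumes this length)
    get_recombinant=True, verbose=True,
)
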
33 changes: 0 additions & 33 deletions screenpro/ngs.py → screenpro/ngs/cas9.py
@@ -1,36 +1,3 @@
## Copyright (c) 2022-2024 ScreenPro2 Development Team.
## All rights reserved.
## Gilbart Lab, UCSF / Arc Institute.
## Multi-Omics Tech Center, Arc Insititue.

## This part of the software is conceptualized and developed by Abolfazl Arab (@abearab)
## with support from the Nick Youngblut (@nick-youngblut).

'''Scripts to work with NGS data

This module provides functions to process FASTQ files from screens with single or dual guide
libraries. In general, the algorithm is fairly simple:
1. Read the FASTQ file and extract the proper sequences
2. Count the exact number of occurrences for each unique sequence
3. Map the counted sequences to the reference sequence library
4. Return the counted mapped or unmapped events as a dataframe(s)

For single-guide screens, the sequences are counted as single protospacer
from a single-end read file (R1). Then, these sequences are mapped to the reference
library of protospacer sequences.

For dual-guide screens, the sequences are counted as pairs of protospacer A and B
from paired-end read files (R1 and R2). Then, sequences are mapped to the reference
library of protospacer A and B pairs.

Theoretically, the algorithm is able to detect any observed sequence since it is counting first
and then mapping. Therefore, the recombination events can be detected. In dual-guide design
protospacer A and B are not the same pairs as in the reference library. These events include:
- Protospacer A and B pairs are present in the reference library but paired differently
- Only one of the protospacer A and B is present in the reference library
- None of the protospacer A and B is present in the reference library
'''

from time import time
import pandas as pd
import polars as pl
8 changes: 4 additions & 4 deletions screenpro/tests/test_fastq2count.py
@@ -2,8 +2,8 @@
from screenpro import ngs


-def test_single_guide():
-    df_count = ngs.fastq_to_count_single_guide(
+def test_cas9_single_guide():
+    df_count = ngs.cas9.fastq_to_count_single_guide(
        fastq_file_path='demo/step1_process_fastq_files/example_crispri_v2_sample.fastq.gz',
        trim5p_start=2,
        trim5p_length=19,
@@ -13,8 +13,8 @@ def test_single_guide():
    assert isinstance(df_count, pl.DataFrame)


-def test_dual_guide():
-    df_count = ngs.fastq_to_count_dual_guide(
+def test_cas9_dual_guide():
+    df_count = ngs.cas9.fastq_to_count_dual_guide(
        R1_fastq_file_path='demo/step1_process_fastq_files/example_crispri_v3_sample_R1.fastq.gz',
        R2_fastq_file_path='demo/step1_process_fastq_files/example_crispri_v3_sample_R2.fastq.gz',
        trim5p_pos1_start=2,