feat: Cell Ranger 8.0.0

10XGenomics · Apr 23, 2024 · 5b0acf8 · 5b0acf8
1 parent 7c9ccea
commit 5b0acf8
Show file tree

Hide file tree

Showing 423 changed files with 21,747 additions and 17,482 deletions.
diff --git a/bin/tenkit/sitecheck b/bin/tenkit/sitecheck
@@ -104,6 +104,9 @@ ifchk "SGE JOB_NAME"        "echo \$JOB_NAME"
 check "LSF Submit"          "which bsub"
 ifchk "LSF LSB_JOBNAME"     "echo \$LSB_JOBNAME"
 
+check "HTCondor Submit"     "which condor_submit"
+check "Batch system"        "echo \$BATCH_SYSTEM"
+
 check "BCL2FASTQ 1"         "which configureBclToFastq.pl"
 ifchk "BCL2FASTQ 1 Version" "ls \$(dirname \$(which configureBclToFastq.pl))/../etc"
 ifchk "Perl"                "which perl"

diff --git a/conda_spec.bzl b/conda_spec.bzl
diff --git a/lib/bin/parameters.toml b/lib/bin/parameters.toml
@@ -1,5 +1,5 @@
 detect_chemistry_sample_reads = 100_000
-detect_chemistry_total_reads = 1_000_000
+detect_chemistry_total_reads = 2_000_000
 min_fraction_whitelist_match = 0.1
 min_barcode_similarity = 0.1
 star_parameters = ""

diff --git a/lib/python/cellranger/altair_utils.py b/lib/python/cellranger/altair_utils.py
@@ -6,45 +6,77 @@
 
 import altair as alt
 import pandas as pd
+from pandas.api.types import is_bool_dtype, is_datetime64_any_dtype, is_numeric_dtype
 
 LAYER = "layer"
 HCONCAT = "hconcat"
 VCONCAT = "vconcat"
 DATA = "data"
 DATA_NAME = "name"
 DATA_FORMAT = "format"
+DATASETS_KEY = "datasets"
+CSV_PARSE_KEY = "parse"
 DATA_FORMAT_CSV_DICT = {"type": "csv"}
 
 
-def sanitise_data_names(chart_dict):
-    """Sanitise data names in chart dict."""
+def get_vega_dtype(dtype: pd.DataFrame.dtypes):
+    """Convert pandas dtype to vega-lite dtype.
+
+    Vega-lite CSV parse
+    dtypes are - https://vega.github.io/vega-lite-api/api/csv.html#parse .
+    """
+    if is_bool_dtype(dtype):
+        return "boolean"
+    elif is_datetime64_any_dtype(dtype):
+        return "date"
+    elif is_numeric_dtype(dtype):  # A little tricky as bools are also considered numeric.
+        return "number"
+    else:
+        raise ValueError("Expected code to be unreachable")
+
+
+def sanitise_data_names(chart_dict, name_to_format_dict):
+    """Sanitise data names in chart dict.
+
+    Uses name_to_format_dict to get column types to parse.
+    """
     if LAYER in chart_dict:
         for layer_dict in chart_dict[LAYER]:
-            sanitise_data_names(layer_dict)
+            sanitise_data_names(layer_dict, name_to_format_dict)
     if HCONCAT in chart_dict:
         for hconcat_dict in chart_dict[HCONCAT]:
-            sanitise_data_names(hconcat_dict)
+            sanitise_data_names(hconcat_dict, name_to_format_dict)
     if VCONCAT in chart_dict:
         for vconcat_dict in chart_dict[VCONCAT]:
-            sanitise_data_names(vconcat_dict)
+            sanitise_data_names(vconcat_dict, name_to_format_dict)
     if DATA in chart_dict:
         if DATA_NAME not in chart_dict[DATA]:
             raise ValueError("Found nameless data in Altair JSON. Not expected VEGA out.")
         else:
+            old_data_name = chart_dict[DATA][DATA_NAME]
+            chart_dict[DATA][DATA_FORMAT] = name_to_format_dict[old_data_name]
             chart_dict[DATA][DATA_NAME] += ".csv"
-            chart_dict[DATA][DATA_FORMAT] = DATA_FORMAT_CSV_DICT
 
 
 def sanitise_chart_dict(chart_dict):
     """Clean up the chart dict."""
     # Convert all datasets into CSVs from JSONs
-    raw_dataset_names = list(chart_dict["datasets"].keys())
+    raw_dataset_names = list(chart_dict[DATASETS_KEY].keys())
+    name_to_format_dict = {}
     for raw_dataset in raw_dataset_names:
-        csv_string = pd.DataFrame.from_dict(chart_dict["datasets"][raw_dataset]).to_csv(index=False)
+        name_to_format_dict[raw_dataset] = DATA_FORMAT_CSV_DICT.copy()
+        df = pd.DataFrame.from_dict(chart_dict[DATASETS_KEY][raw_dataset])
+        csv_string = df.to_csv(index=False)
+        # Getting columns with special parse
+        name_to_format_dict[raw_dataset][CSV_PARSE_KEY] = {
+            x: get_vega_dtype(y)
+            for x, y in df.dtypes.items()
+            if (is_numeric_dtype(y) or is_datetime64_any_dtype(y) or is_bool_dtype(y))
+        }
         new_dataset_name = f"{raw_dataset}.csv"
-        chart_dict["datasets"][new_dataset_name] = csv_string
-        chart_dict["datasets"].pop(raw_dataset)
-    sanitise_data_names(chart_dict)
+        chart_dict[DATASETS_KEY][new_dataset_name] = csv_string
+        chart_dict[DATASETS_KEY].pop(raw_dataset)
+    sanitise_data_names(chart_dict, name_to_format_dict)
 
 
 def chart_to_json(

diff --git a/lib/python/cellranger/analysis/combinatorics.py b/lib/python/cellranger/analysis/combinatorics.py
@@ -57,7 +57,7 @@ def generate_all_multiplets(n_tags: int, max_multiplets: int, add_unit_vector_at
         x_1 + x_2 + x_3 + ... = k for k = 1..max_multiplets
     """
     solutions: list[list[int]] = []
-    for k in range(0, max_multiplets + 1):
+    for k in range(max_multiplets + 1):
         cur_solutions = list(generate_solutions(n_tags, k))
         assert int(calc_all_possible_nonnegative_solutions(n_tags, k)) == len(cur_solutions)
         solutions += cur_solutions

diff --git a/lib/python/cellranger/analysis/irlb.py b/lib/python/cellranger/analysis/irlb.py
@@ -219,7 +219,7 @@ def irlb(
         V[:, k] = F
         B = np.zeros((m_b, m_b))
         # Improve this! There must be better way to assign diagonal...
-        for l in range(0, k):
+        for l in range(k):
             B[l, l] = S[1][l]
         B[0:k, k] = R[0:k]
         # Update the left approximate singular vectors

diff --git a/lib/python/cellranger/analysis/multigenome.py b/lib/python/cellranger/analysis/multigenome.py
@@ -154,6 +154,15 @@ def classify_gems(
             counts1[counts1 > counts0], analysis_constants.MULTIPLET_PROB_THRESHOLD * 100.0
         )
 
+    # If input is a pure species instead of a mixture then modeling counts independently
+    # can result in an absurdly low threshold for the missing species causing FP labels that
+    # show up as an inflated number of Multiplets.
+    thresholds = sorted([thresh0, thresh1])
+    fold_change = thresholds[1] / thresholds[0]
+    if (thresholds[0] < 50) and (fold_change > 25):
+        thresh0 = thresh1 = np.percentile(
+            counts0 + counts1, analysis_constants.MULTIPLET_PROB_THRESHOLD * 100.0
+        )
     doublet = np.logical_and(counts0 >= thresh0, counts1 >= thresh1)
     dtype = np.dtype(("S", max(len(cls) for cls in analysis_constants.GEM_CLASSES)))
     result = np.where(
@@ -226,7 +235,7 @@ def _infer_multiplets(
         np.random.seed(0)
 
         n_multiplet_boot: np.ndarray[int, np.dtype[np.float_]] = np.zeros(bootstraps)
-        for i in range(0, bootstraps):
+        for i in range(bootstraps):
             boot_idx = np.random.choice(len(counts0), len(counts0))
             counts0_boot = counts0[boot_idx]
             counts1_boot = counts1[boot_idx]

diff --git a/lib/python/cellranger/cell_calling.py b/lib/python/cellranger/cell_calling.py
@@ -15,23 +15,28 @@
 import cellranger.sgt as cr_sgt
 import cellranger.stats as cr_stats
 from cellranger.analysis.diffexp import adjust_pvalue_bh
-from cellranger.chemistry import CHEMISTRY_DESCRIPTION_FIELD, CHEMISTRY_SC3P_LT
+from cellranger.chemistry import (
+    CHEMISTRY_DESCRIPTION_FIELD,
+    CHEMISTRY_SC3P_LT,
+    SC3P_V4_CHEMISTRIES,
+    SC5P_V3_CHEMISTRIES,
+)
 
 # Drop this top fraction of the barcodes when estimating ambient.
 MAX_OCCUPIED_PARTITIONS_FRAC = 0.5
 
+# Minimum number of UMIs for a barcode to be called as a cell
+MIN_GLOBAL_UMIS = 0
+
+# Maximum percentage of mitochondrial reads allowed for a barcode to be called a cell
+MAX_MITO_PCT = 100.0
+
 # Minimum number of UMIS per barcode to consider after the initial cell calling
 MIN_UMIS = 500
 
 # Default number of background simulations to make
 NUM_SIMS = 10000
 
-# Minimum ratio of UMIs to the median (initial cell call UMI) to consider after the initial cell calling
-MIN_UMI_FRAC_OF_MEDIAN = 0.01
-
-# Maximum adjusted p-value to call a barcode as non-ambient
-MAX_ADJ_PVALUE = 0.01
-
 # Minimum number of UMIS per barcode to consider after the initial cell calling for targeted GEX
 TARGETED_CC_MIN_UMIS_ADDITIONAL_CELLS = 10
 
@@ -103,6 +108,15 @@ class NonAmbientBarcodeResult(NamedTuple):
     pvalues: np.ndarray  # pvalues (n)
     pvalues_adj: np.ndarray  # B-H adjusted pvalues (n)
     is_nonambient: np.ndarray  # Boolean nonambient calls (n)
+    emptydrops_minimum_umis: int  # Min UMI threshold for empty drops (1)
+
+
+def get_empty_drops_fdr(chemistry_description: str) -> float:
+    """Gets the maximum adjusted p-value to call a barcode as non-ambient."""
+    # The chips used with V4 have roughly double the GEMs as the older V3 chips
+    v4_chemistries = SC3P_V4_CHEMISTRIES + SC5P_V3_CHEMISTRIES
+    v4_chem_names = [chem[CHEMISTRY_DESCRIPTION_FIELD] for chem in v4_chemistries]
+    return 0.001 if chemistry_description in v4_chem_names else 0.01
 
 
 def get_empty_drops_range(chemistry_description: str, num_probe_bcs: int | None) -> tuple[int, int]:
@@ -115,8 +129,13 @@ def get_empty_drops_range(chemistry_description: str, num_probe_bcs: int | None)
         low_index:
         high_index:
     """
+    # The chips used with V4 have roughly double the GEMs as the older V3 chips
+    v4_chemistries = SC3P_V4_CHEMISTRIES + SC5P_V3_CHEMISTRIES
+    v4_chem_names = [chem[CHEMISTRY_DESCRIPTION_FIELD] for chem in v4_chemistries]
     if chemistry_description == CHEMISTRY_SC3P_LT[CHEMISTRY_DESCRIPTION_FIELD]:
         N_PARTITIONS = 9000
+    elif chemistry_description in v4_chem_names:
+        N_PARTITIONS = 80000 * num_probe_bcs if num_probe_bcs and num_probe_bcs > 1 else 160000
     else:
         N_PARTITIONS = 45000 * num_probe_bcs if num_probe_bcs and num_probe_bcs > 1 else 90000
     return (N_PARTITIONS // 2, N_PARTITIONS)
@@ -128,9 +147,7 @@ def find_nonambient_barcodes(
     chemistry_description,
     num_probe_bcs,
     *,
-    min_umi_frac_of_median=MIN_UMI_FRAC_OF_MEDIAN,
     emptydrops_minimum_umis=MIN_UMIS,
-    max_adj_pvalue=MAX_ADJ_PVALUE,
     num_sims=NUM_SIMS,
 ):
     """Call barcodes as being sufficiently distinct from the ambient profile.
@@ -148,6 +165,7 @@ def find_nonambient_barcodes(
     bc_order = np.argsort(umis_per_bc)
 
     low, high = get_empty_drops_range(chemistry_description, num_probe_bcs)
+    max_adj_pvalue = get_empty_drops_fdr(chemistry_description)
 
     # Take what we expect to be the barcodes associated w/ empty partitions.
     print(f"Range empty barcodes: {low} - {high}")
@@ -186,14 +204,11 @@ def find_nonambient_barcodes(
     eval_bcs = np.ma.array(np.arange(matrix.bcs_dim))
     eval_bcs[orig_cells] = ma.masked
 
-    median_initial_umis = np.median(umis_per_bc[orig_cells])
-    min_umis = int(
-        max(emptydrops_minimum_umis, np.ceil(median_initial_umis * min_umi_frac_of_median))
-    )
-    print(f"Median UMIs of initial cell calls: {median_initial_umis}")
-    print(f"Min UMIs: {min_umis}")
+    max_background_umis = np.max(umis_per_bc[empty_bcs], initial=0)
+    emptydrops_minimum_umis = max(emptydrops_minimum_umis, 1 + max_background_umis)
+    print(f"Max background UMIs: {max_background_umis}")
 
-    eval_bcs[umis_per_bc < min_umis] = ma.masked
+    eval_bcs[umis_per_bc < emptydrops_minimum_umis] = ma.masked
     n_unmasked_bcs = len(eval_bcs) - eval_bcs.mask.sum()
 
     eval_bcs = np.argsort(ma.masked_array(umis_per_bc, mask=eval_bcs.mask))[0:n_unmasked_bcs]
@@ -208,6 +223,7 @@ def find_nonambient_barcodes(
         return None
 
     assert not np.any(np.isin(eval_bcs, orig_cells))
+    assert not np.any(np.isin(eval_bcs, empty_bcs))
     print(f"Number of candidate bcs: {len(eval_bcs)}")
     print(f"Range candidate bc umis: {umis_per_bc[eval_bcs].min()}, {umis_per_bc[eval_bcs].max()}")
 
@@ -234,6 +250,7 @@ def find_nonambient_barcodes(
 
     pvalues_adj = adjust_pvalue_bh(pvalues)
 
+    print(f"Max adjusted P-value: {max_adj_pvalue}")
     is_nonambient = pvalues_adj <= max_adj_pvalue
 
     return NonAmbientBarcodeResult(
@@ -242,4 +259,5 @@ def find_nonambient_barcodes(
         pvalues=pvalues,
         pvalues_adj=pvalues_adj,
         is_nonambient=is_nonambient,
+        emptydrops_minimum_umis=emptydrops_minimum_umis,
     )