Skip to content

Commit

Permalink
feat: Cell Ranger 8.0.0
Browse files Browse the repository at this point in the history
  • Loading branch information
macklin-10x committed Apr 23, 2024
1 parent 7c9ccea commit 5b0acf8
Show file tree
Hide file tree
Showing 423 changed files with 21,747 additions and 17,482 deletions.
3 changes: 3 additions & 0 deletions bin/tenkit/sitecheck
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,9 @@ ifchk "SGE JOB_NAME" "echo \$JOB_NAME"
check "LSF Submit" "which bsub"
ifchk "LSF LSB_JOBNAME" "echo \$LSB_JOBNAME"

check "HTCondor Submit" "which condor_submit"
check "Batch system" "echo \$BATCH_SYSTEM"

check "BCL2FASTQ 1" "which configureBclToFastq.pl"
ifchk "BCL2FASTQ 1 Version" "ls \$(dirname \$(which configureBclToFastq.pl))/../etc"
ifchk "Perl" "which perl"
Expand Down
1,002 changes: 713 additions & 289 deletions conda_spec.bzl

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion lib/bin/parameters.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
detect_chemistry_sample_reads = 100_000
detect_chemistry_total_reads = 1_000_000
detect_chemistry_total_reads = 2_000_000
min_fraction_whitelist_match = 0.1
min_barcode_similarity = 0.1
star_parameters = ""
Expand Down
54 changes: 43 additions & 11 deletions lib/python/cellranger/altair_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,45 +6,77 @@

import altair as alt
import pandas as pd
from pandas.api.types import is_bool_dtype, is_datetime64_any_dtype, is_numeric_dtype

LAYER = "layer"
HCONCAT = "hconcat"
VCONCAT = "vconcat"
DATA = "data"
DATA_NAME = "name"
DATA_FORMAT = "format"
DATASETS_KEY = "datasets"
CSV_PARSE_KEY = "parse"
DATA_FORMAT_CSV_DICT = {"type": "csv"}


def sanitise_data_names(chart_dict):
"""Sanitise data names in chart dict."""
def get_vega_dtype(dtype: pd.DataFrame.dtypes):
"""Convert pandas dtype to vega-lite dtype.
Vega-lite CSV parse
dtypes are - https://vega.github.io/vega-lite-api/api/csv.html#parse .
"""
if is_bool_dtype(dtype):
return "boolean"
elif is_datetime64_any_dtype(dtype):
return "date"
elif is_numeric_dtype(dtype): # A little tricky as bools are also considered numeric.
return "number"
else:
raise ValueError("Expected code to be unreachable")


def sanitise_data_names(chart_dict, name_to_format_dict):
"""Sanitise data names in chart dict.
Uses name_to_format_dict to get column types to parse.
"""
if LAYER in chart_dict:
for layer_dict in chart_dict[LAYER]:
sanitise_data_names(layer_dict)
sanitise_data_names(layer_dict, name_to_format_dict)
if HCONCAT in chart_dict:
for hconcat_dict in chart_dict[HCONCAT]:
sanitise_data_names(hconcat_dict)
sanitise_data_names(hconcat_dict, name_to_format_dict)
if VCONCAT in chart_dict:
for vconcat_dict in chart_dict[VCONCAT]:
sanitise_data_names(vconcat_dict)
sanitise_data_names(vconcat_dict, name_to_format_dict)
if DATA in chart_dict:
if DATA_NAME not in chart_dict[DATA]:
raise ValueError("Found nameless data in Altair JSON. Not expected VEGA out.")
else:
old_data_name = chart_dict[DATA][DATA_NAME]
chart_dict[DATA][DATA_FORMAT] = name_to_format_dict[old_data_name]
chart_dict[DATA][DATA_NAME] += ".csv"
chart_dict[DATA][DATA_FORMAT] = DATA_FORMAT_CSV_DICT


def sanitise_chart_dict(chart_dict):
"""Clean up the chart dict."""
# Convert all datasets into CSVs from JSONs
raw_dataset_names = list(chart_dict["datasets"].keys())
raw_dataset_names = list(chart_dict[DATASETS_KEY].keys())
name_to_format_dict = {}
for raw_dataset in raw_dataset_names:
csv_string = pd.DataFrame.from_dict(chart_dict["datasets"][raw_dataset]).to_csv(index=False)
name_to_format_dict[raw_dataset] = DATA_FORMAT_CSV_DICT.copy()
df = pd.DataFrame.from_dict(chart_dict[DATASETS_KEY][raw_dataset])
csv_string = df.to_csv(index=False)
# Getting columns with special parse
name_to_format_dict[raw_dataset][CSV_PARSE_KEY] = {
x: get_vega_dtype(y)
for x, y in df.dtypes.items()
if (is_numeric_dtype(y) or is_datetime64_any_dtype(y) or is_bool_dtype(y))
}
new_dataset_name = f"{raw_dataset}.csv"
chart_dict["datasets"][new_dataset_name] = csv_string
chart_dict["datasets"].pop(raw_dataset)
sanitise_data_names(chart_dict)
chart_dict[DATASETS_KEY][new_dataset_name] = csv_string
chart_dict[DATASETS_KEY].pop(raw_dataset)
sanitise_data_names(chart_dict, name_to_format_dict)


def chart_to_json(
Expand Down
2 changes: 1 addition & 1 deletion lib/python/cellranger/analysis/combinatorics.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def generate_all_multiplets(n_tags: int, max_multiplets: int, add_unit_vector_at
x_1 + x_2 + x_3 + ... = k for k = 1..max_multiplets
"""
solutions: list[list[int]] = []
for k in range(0, max_multiplets + 1):
for k in range(max_multiplets + 1):
cur_solutions = list(generate_solutions(n_tags, k))
assert int(calc_all_possible_nonnegative_solutions(n_tags, k)) == len(cur_solutions)
solutions += cur_solutions
Expand Down
2 changes: 1 addition & 1 deletion lib/python/cellranger/analysis/irlb.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ def irlb(
V[:, k] = F
B = np.zeros((m_b, m_b))
# Improve this! There must be better way to assign diagonal...
for l in range(0, k):
for l in range(k):
B[l, l] = S[1][l]
B[0:k, k] = R[0:k]
# Update the left approximate singular vectors
Expand Down
11 changes: 10 additions & 1 deletion lib/python/cellranger/analysis/multigenome.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,15 @@ def classify_gems(
counts1[counts1 > counts0], analysis_constants.MULTIPLET_PROB_THRESHOLD * 100.0
)

# If input is a pure species instead of a mixture then modeling counts independently
# can result in an absurdly low threshold for the missing species causing FP labels that
# show up as an inflated number of Multiplets.
thresholds = sorted([thresh0, thresh1])
fold_change = thresholds[1] / thresholds[0]
if (thresholds[0] < 50) and (fold_change > 25):
thresh0 = thresh1 = np.percentile(
counts0 + counts1, analysis_constants.MULTIPLET_PROB_THRESHOLD * 100.0
)
doublet = np.logical_and(counts0 >= thresh0, counts1 >= thresh1)
dtype = np.dtype(("S", max(len(cls) for cls in analysis_constants.GEM_CLASSES)))
result = np.where(
Expand Down Expand Up @@ -226,7 +235,7 @@ def _infer_multiplets(
np.random.seed(0)

n_multiplet_boot: np.ndarray[int, np.dtype[np.float_]] = np.zeros(bootstraps)
for i in range(0, bootstraps):
for i in range(bootstraps):
boot_idx = np.random.choice(len(counts0), len(counts0))
counts0_boot = counts0[boot_idx]
counts1_boot = counts1[boot_idx]
Expand Down
50 changes: 34 additions & 16 deletions lib/python/cellranger/cell_calling.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,28 @@
import cellranger.sgt as cr_sgt
import cellranger.stats as cr_stats
from cellranger.analysis.diffexp import adjust_pvalue_bh
from cellranger.chemistry import CHEMISTRY_DESCRIPTION_FIELD, CHEMISTRY_SC3P_LT
from cellranger.chemistry import (
CHEMISTRY_DESCRIPTION_FIELD,
CHEMISTRY_SC3P_LT,
SC3P_V4_CHEMISTRIES,
SC5P_V3_CHEMISTRIES,
)

# Drop this top fraction of the barcodes when estimating ambient.
MAX_OCCUPIED_PARTITIONS_FRAC = 0.5

# Minimum number of UMIs for a barcode to be called as a cell
MIN_GLOBAL_UMIS = 0

# Maximum percentage of mitochondrial reads allowed for a barcode to be called a cell
MAX_MITO_PCT = 100.0

# Minimum number of UMIS per barcode to consider after the initial cell calling
MIN_UMIS = 500

# Default number of background simulations to make
NUM_SIMS = 10000

# Minimum ratio of UMIs to the median (initial cell call UMI) to consider after the initial cell calling
MIN_UMI_FRAC_OF_MEDIAN = 0.01

# Maximum adjusted p-value to call a barcode as non-ambient
MAX_ADJ_PVALUE = 0.01

# Minimum number of UMIS per barcode to consider after the initial cell calling for targeted GEX
TARGETED_CC_MIN_UMIS_ADDITIONAL_CELLS = 10

Expand Down Expand Up @@ -103,6 +108,15 @@ class NonAmbientBarcodeResult(NamedTuple):
pvalues: np.ndarray # pvalues (n)
pvalues_adj: np.ndarray # B-H adjusted pvalues (n)
is_nonambient: np.ndarray # Boolean nonambient calls (n)
emptydrops_minimum_umis: int # Min UMI threshold for empty drops (1)


def get_empty_drops_fdr(chemistry_description: str) -> float:
"""Gets the maximum adjusted p-value to call a barcode as non-ambient."""
# The chips used with V4 have roughly double the GEMs as the older V3 chips
v4_chemistries = SC3P_V4_CHEMISTRIES + SC5P_V3_CHEMISTRIES
v4_chem_names = [chem[CHEMISTRY_DESCRIPTION_FIELD] for chem in v4_chemistries]
return 0.001 if chemistry_description in v4_chem_names else 0.01


def get_empty_drops_range(chemistry_description: str, num_probe_bcs: int | None) -> tuple[int, int]:
Expand All @@ -115,8 +129,13 @@ def get_empty_drops_range(chemistry_description: str, num_probe_bcs: int | None)
low_index:
high_index:
"""
# The chips used with V4 have roughly double the GEMs as the older V3 chips
v4_chemistries = SC3P_V4_CHEMISTRIES + SC5P_V3_CHEMISTRIES
v4_chem_names = [chem[CHEMISTRY_DESCRIPTION_FIELD] for chem in v4_chemistries]
if chemistry_description == CHEMISTRY_SC3P_LT[CHEMISTRY_DESCRIPTION_FIELD]:
N_PARTITIONS = 9000
elif chemistry_description in v4_chem_names:
N_PARTITIONS = 80000 * num_probe_bcs if num_probe_bcs and num_probe_bcs > 1 else 160000
else:
N_PARTITIONS = 45000 * num_probe_bcs if num_probe_bcs and num_probe_bcs > 1 else 90000
return (N_PARTITIONS // 2, N_PARTITIONS)
Expand All @@ -128,9 +147,7 @@ def find_nonambient_barcodes(
chemistry_description,
num_probe_bcs,
*,
min_umi_frac_of_median=MIN_UMI_FRAC_OF_MEDIAN,
emptydrops_minimum_umis=MIN_UMIS,
max_adj_pvalue=MAX_ADJ_PVALUE,
num_sims=NUM_SIMS,
):
"""Call barcodes as being sufficiently distinct from the ambient profile.
Expand All @@ -148,6 +165,7 @@ def find_nonambient_barcodes(
bc_order = np.argsort(umis_per_bc)

low, high = get_empty_drops_range(chemistry_description, num_probe_bcs)
max_adj_pvalue = get_empty_drops_fdr(chemistry_description)

# Take what we expect to be the barcodes associated w/ empty partitions.
print(f"Range empty barcodes: {low} - {high}")
Expand Down Expand Up @@ -186,14 +204,11 @@ def find_nonambient_barcodes(
eval_bcs = np.ma.array(np.arange(matrix.bcs_dim))
eval_bcs[orig_cells] = ma.masked

median_initial_umis = np.median(umis_per_bc[orig_cells])
min_umis = int(
max(emptydrops_minimum_umis, np.ceil(median_initial_umis * min_umi_frac_of_median))
)
print(f"Median UMIs of initial cell calls: {median_initial_umis}")
print(f"Min UMIs: {min_umis}")
max_background_umis = np.max(umis_per_bc[empty_bcs], initial=0)
emptydrops_minimum_umis = max(emptydrops_minimum_umis, 1 + max_background_umis)
print(f"Max background UMIs: {max_background_umis}")

eval_bcs[umis_per_bc < min_umis] = ma.masked
eval_bcs[umis_per_bc < emptydrops_minimum_umis] = ma.masked
n_unmasked_bcs = len(eval_bcs) - eval_bcs.mask.sum()

eval_bcs = np.argsort(ma.masked_array(umis_per_bc, mask=eval_bcs.mask))[0:n_unmasked_bcs]
Expand All @@ -208,6 +223,7 @@ def find_nonambient_barcodes(
return None

assert not np.any(np.isin(eval_bcs, orig_cells))
assert not np.any(np.isin(eval_bcs, empty_bcs))
print(f"Number of candidate bcs: {len(eval_bcs)}")
print(f"Range candidate bc umis: {umis_per_bc[eval_bcs].min()}, {umis_per_bc[eval_bcs].max()}")

Expand All @@ -234,6 +250,7 @@ def find_nonambient_barcodes(

pvalues_adj = adjust_pvalue_bh(pvalues)

print(f"Max adjusted P-value: {max_adj_pvalue}")
is_nonambient = pvalues_adj <= max_adj_pvalue

return NonAmbientBarcodeResult(
Expand All @@ -242,4 +259,5 @@ def find_nonambient_barcodes(
pvalues=pvalues,
pvalues_adj=pvalues_adj,
is_nonambient=is_nonambient,
emptydrops_minimum_umis=emptydrops_minimum_umis,
)
Loading

0 comments on commit 5b0acf8

Please sign in to comment.