diff --git a/.github/workflows/ci.config b/.github/workflows/ci.config
new file mode 100644
index 00000000..27cc59ae
--- /dev/null
+++ b/.github/workflows/ci.config
@@ -0,0 +1,15 @@
+
+process {
+    executor='slurm'
+    time = '7 d'
+    memory = '4 GB'
+}
+
+singularity {
+    enabled = true
+}
+
+conda {
+    createTimeout = "30 min"
+    useMamba = true
+}
diff --git a/.github/workflows/nextflow-linter.yaml b/.github/workflows/nextflow-linter.yaml
new file mode 100644
index 00000000..7b68f59b
--- /dev/null
+++ b/.github/workflows/nextflow-linter.yaml
@@ -0,0 +1,55 @@
+name: nf-core linting
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-java@v2
+        with:
+          distribution: 'adopt'
+          java-version: '11'
+
+      - name: install Nextflow
+        run: |
+          wget -qO- https://get.nextflow.io | bash
+          chmod +x nextflow
+          mkdir -p $HOME/.local/bin
+          mv nextflow $HOME/.local/bin/
+          echo "$HOME/.local/bin" >> $GITHUB_PATH
+
+      - name: set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.x'
+
+      - name: install nf-core tools
+        run: |
+          python -m pip install --upgrade pip
+          pip install nf-core
+
+      - name: check Nextflow version
+        run: nextflow -version
+
+      # https://nf-co.re/tools/docs/latest/pipeline_lint_tests/
+      - name: create .nf-core.yml
+        run: |
+          cat << EOF > .nf-core.yml
+          repository_type: pipeline
+          lint:
+            actions_awsfulltest: False
+            actions_awstest: False
+            multiqc_config: False
+            schema_lint: False
+            schema_params: False
+          EOF
+
+      - name: run nf-core lint
+        run: nf-core pipelines lint --dir .
diff --git a/README.md b/README.md
index 06bcde9e..0fc91ff7 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,26 @@
 # scxa-tertiary-workflow
 Tertiary component for SCXA workflows
+
+# How to run the workflow for tertiary analysis
+## Prepare data
+```
+bash scripts/data_prep.sh <exp_id> [output path]
+```
+## Run for plate
+```
+nextflow run main.nf --slurm -resume --dir_path <exp dir path> [--output_path <output path>] [--scanpy_scripts_container <container>]
+```
+## Run for droplet
+```
+nextflow run main.nf --slurm -resume --dir_path <exp dir path> --technology droplet [--output_path <output path>] [--scanpy_scripts_container <container>]
+```
+
+If `--output_path` is not specified, results are written to the `<dir_path>/results` directory.
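+
+## Example
+
+A complete run for a hypothetical droplet experiment (the `E-MTAB-0000` accession and paths below are placeholders, not real locations):
+```
+bash scripts/data_prep.sh E-MTAB-0000 /scratch/tertiary
+nextflow run main.nf --slurm -resume --dir_path /scratch/tertiary/E-MTAB-0000 --technology droplet
+```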
diff --git a/main.nf b/main.nf
index 8b137891..afe544ef 100644
--- a/main.nf
+++ b/main.nf
@@ -1 +1,845 @@
+#!/usr/bin/env nextflow
+nextflow.enable.dsl=2
+
+params.scanpy_scripts_container = "quay.io/biocontainers/scanpy-scripts:1.9.301--pyhdfd78af_0"
+params.technology = "plate"
+params.batch_variable = ""
+params.representation = "X_pca"
+params.dir_path = "."
+params.result_dir_path = params.output_path ?: params.dir_path + "/results"
+params.celltype_field = 'NO_CELLTYPE_FIELD'
+params.neighbor_values = ['3', '5', '10', '15', '20', '25', '30', '50', '100']
+params.perplexity_values = ['1', '5', '10', '15', '20', '25', '30', '35', '40', '45', '50']
+params.resolution_values = ['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0']
+params.slotname = "louvain_resolution"
+params.clustering_slotname = params.resolution_values.collect { params.slotname + "_" + it }
+params.merged_group_slotname = params.clustering_slotname.collect { it + params.celltype_field }
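+// With the defaults above, clustering_slotname contains entries such as
+// "louvain_resolution_0.1" ... "louvain_resolution_5.0", and
+// merged_group_slotname appends the celltype field to each of them.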
+
+log.info """
+===============================
+WORKFLOW PARAMETER VALUES
+===============================
+EXP dir path: ${params.dir_path}
+Selected technology: ${params.technology}
+Results dir path: ${params.result_dir_path}
+celltype_field: ${params.celltype_field}
+neighbor_values: ${params.neighbor_values}
+perplexity_values: ${params.perplexity_values}
+resolution_values: ${params.resolution_values}
+slotname: ${params.slotname}
+clustering_slotname: ${params.clustering_slotname}
+merged_group_slotname: ${params.merged_group_slotname}
+batch_variable: ${params.batch_variable}
+representation: ${params.representation}
+===============================
+"""
+
+/*
+ * Column_rearrange_1: Keeps only the specified column and removes the header
+ */
+process Column_rearrange_1 {
+    input:
+    path genemeta
+    val col
+
+    output:
+    path 'filtered_genemeta.txt'
+
+    script:
+    """
+    # Find the column number of the specified column name
+    col_num=\$(head -n1 "$genemeta" | tr '\\t' '\\n' | grep -n "^$col\$" | cut -d: -f1)
+
+    # If the column is not found, raise an error
+    if [[ -z "\$col_num" ]]; then
+        echo "Error: Column '$col' not found in $genemeta" >&2
+        exit 1
+    fi
+
+    # Extract the column (without the header)
+    tail -n +2 "$genemeta" | cut -f\$col_num > filtered_genemeta.txt
+    """
+}
+
+/*
+ * Column_rearrange_2: Keeps only the two specified columns and removes the header
+ */
+process Column_rearrange_2 {
+    input:
+    path genemeta
+    val col1
+    val col2
+
+    output:
+    path 'filtered_genemeta_2.txt'
+
+    script:
+    """
+    # Find the column numbers of the two specified column names
+    col_num_1=\$(head -n1 "$genemeta" | tr '\\t' '\\n' | grep -n "^$col1\$" | cut -d: -f1)
+    col_num_2=\$(head -n1 "$genemeta" | tr '\\t' '\\n' | grep -n "^$col2\$" | cut -d: -f1)
+
+    # If either column is not found, raise an error
+    if [[ -z "\$col_num_1" || -z "\$col_num_2" ]]; then
+        echo "Error: Column '$col1' or '$col2' not found in $genemeta" >&2
+        exit 1
+    fi
+
+    # Extract both columns (without the header)
+    tail -n +2 "$genemeta" | cut -f\$col_num_1,\$col_num_2 > filtered_genemeta_2.txt
+    """
+}
+
+/*
+ * mergeGeneFiles: Merges the gene file with the filtered gene metadata on column 1, keeping gene ID and gene name
+ */
+process mergeGeneFiles {
+    input:
+    path gene
+    path filtered_genemeta
+
+    output:
+    path 'merged_genemeta.tsv'
+
+    script:
+    """
+    # Sort both files by the first column for join compatibility
+    sort -k1,1 "$gene" > sorted_gene.txt
+    sort -k1,1 "$filtered_genemeta" > sorted_genemeta.txt
+
+    # Perform a left join to keep every gene from the gene file
+    join -a 1 -t \$'\t' -o 0,1.2,2.2 sorted_gene.txt sorted_genemeta.txt | cut -f1,3 > merged_genemeta.tsv
+    """
+}
+
+process scanpy_read_10x {
+    container params.scanpy_scripts_container
+
+    input:
+    path matrix
+    path genes
+    path barcodes
+    path cellmeta
+    path genemeta
+
+    output:
+    path 'anndata.h5ad'
+
+    script:
+    """
+    # matrix.mtx and barcodes.tsv are already staged under their expected names;
+    # only the merged gene table needs to be linked as genes.tsv
+    ln -s $genes genes.tsv
+
+    export PYTHONIOENCODING='utf-8'
+
+    scanpy-read-10x --input-10x-mtx ./ \
+        --var-names 'gene_ids' \
+        --extra-obs $cellmeta \
+        --extra-var $genemeta \
+        --show-obj stdout \
+        --output-format anndata \
+        'anndata.h5ad'
+    """
+}
+
+process scanpy_multiplet_scrublet {
+    container params.scanpy_scripts_container
+
+    input:
+    path anndata
+    val batch_variable
+
+    output:
+    path 'scrublet.h5ad'
+
+    script:
+    """
+    export PYTHONIOENCODING='utf-8'
+    if [ -z "$batch_variable" ]; then
+        scanpy-cli multiplet scrublet \
+            --input-format 'anndata' \
+            --output-format 'anndata' \
+            $anndata \
+            scrublet.h5ad
+    else
+        scanpy-cli multiplet scrublet \
+            --input-format 'anndata' \
+            --output-format 'anndata' \
+            --batch-key "$batch_variable" \
+            $anndata \
+            scrublet.h5ad
+    fi
+    """
+}
+
+process scanpy_plot_scrublet {
+    publishDir params.result_dir_path, mode: 'copy', pattern: 'scrublet.png'
+    container params.scanpy_scripts_container
+
+    input:
+    path anndata
+
+    output:
+    path 'scrublet.png'
+
+    script:
+    """
+    export PYTHONIOENCODING='utf-8'
+    scanpy-cli plot scrublet \
+        --input-format "anndata" \
+        --scale-hist-obs "linear" \
+        --scale-hist-sim "linear" \
+        $anndata \
+        scrublet.png
+    """
+}
+
+process scanpy_filter_cells {
+    container params.scanpy_scripts_container
+
+    input:
+    path anndata
+    val category
+
+    output:
+    path 'filtered_cell_anndata.h5ad'
+
+    script:
+    """
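+    # Droplet runs pass a predicted_doublet category filter and use a lower
+    # n_counts threshold (750); plate runs keep the 1500 threshold with no
+    # category filter.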
+    n_counts=1500
+    if [[ -n "$category" ]]; then
+        n_counts=750
+    fi
+
+    export PYTHONIOENCODING='utf-8'
+    scanpy-filter-cells --gene-name 'gene_symbols' \
+        --param 'c:n_counts' \$n_counts 1000000000.0 \
+        --param 'c:pct_counts_mito' 0.0 0.35 \
+        --input-format 'anndata' $anndata \
+        --show-obj stdout \
+        --output-format anndata 'filtered_cell_anndata.h5ad' \
+        --export-mtx ./ \
+        $category
+    """
+}
+
+process scanpy_filter_genes {
+    publishDir "${params.result_dir_path}/matrices/raw_filtered", mode: 'copy', pattern: 'matrix.mtx'
+    publishDir "${params.result_dir_path}/matrices/raw_filtered", mode: 'copy', pattern: 'barcodes.tsv'
+    publishDir "${params.result_dir_path}/matrices/raw_filtered", mode: 'copy', pattern: 'genes.tsv'
+    container params.scanpy_scripts_container
+
+    input:
+    path anndata
+    path genes
+
+    output:
+    path 'filtered_gene_anndata.h5ad'
+    path 'matrix.mtx'
+    path 'barcodes.tsv'
+    path 'genes.tsv'
+
+    script:
+    """
+    export PYTHONIOENCODING='utf-8'
+    scanpy-filter-genes \
+        --param 'g:n_cells' 3.0 1000000000.0 \
+        --subset 'g:index' \
+        $genes \
+        --input-format 'anndata' $anndata \
+        --show-obj stdout \
+        --output-format anndata \
+        'filtered_gene_anndata.h5ad' \
+        --export-mtx ./
+    """
+}
+
+process normalise_data {
+    publishDir "${params.result_dir_path}/matrices/filtered_normalised", mode: 'copy', pattern: 'matrix.mtx'
+    publishDir "${params.result_dir_path}/matrices/filtered_normalised", mode: 'copy', pattern: 'barcodes.tsv'
+    publishDir "${params.result_dir_path}/matrices/filtered_normalised", mode: 'copy', pattern: 'genes.tsv'
+    container params.scanpy_scripts_container
+
+    input:
+    path anndata
+
+    output:
+    path 'normalised_anndata.h5ad'
+    path 'matrix.mtx'
+    path 'barcodes.tsv'
+    path 'genes.tsv'
+
+    script:
+    """
+    export PYTHONIOENCODING='utf-8'
+    scanpy-normalise-data \
+        --no-log-transform \
+        --normalize-to '1000000.0' \
+        --input-format 'anndata' $anndata \
+        --show-obj stdout \
+        --output-format anndata \
+        'normalised_anndata.h5ad' \
+        --export-mtx ./
+    """
+}
+
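+/*
+ * normalise_internal_data: Same normalisation as normalise_data, but keeps
+ * the default log transform and is used only by downstream internal steps
+ * (no matrix export).
+ */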
+process normalise_internal_data {
+    container params.scanpy_scripts_container
+
+    input:
+    path anndata
+
+    output:
+    path 'normalised_internal_anndata.h5ad'
+
+    script:
+    """
+    export PYTHONIOENCODING='utf-8'
+    scanpy-normalise-data \
+        --normalize-to '1000000.0' \
+        --input-format 'anndata' $anndata \
+        --show-obj stdout \
+        --output-format anndata \
+        'normalised_internal_anndata.h5ad'
+    """
+}
+
+process find_variable_genes {
+    container params.scanpy_scripts_container
+
+    input:
+    path anndata
+    val batch_variable
+
+    output:
+    path 'variable_genes.h5ad'
+
+    script:
+    """
+    batch_variable_tag=""
+    if [[ -n "$batch_variable" ]]; then
+        batch_variable_tag="--batch-key $batch_variable"
+    fi
+
+    export PYTHONIOENCODING='utf-8'
+    scanpy-find-variable-genes \
+        --flavor 'seurat' \
+        --mean-limits 0.0125 1000000000.0 \
+        --disp-limits 0.5 50.0 \
+        --span 0.3 \
+        --n-bins '20' \
+        \$batch_variable_tag \
+        --input-format 'anndata' \
+        $anndata \
+        --show-obj stdout \
+        --output-format anndata 'variable_genes.h5ad'
+    """
+}
+
+process scale_data {
+    container params.scanpy_scripts_container
+
+    input:
+    path anndata
+
+    output:
+    path 'scaled_anndata.h5ad'
+
+    script:
+    """
+    export PYTHONIOENCODING='utf-8'
+    scanpy-scale-data \
+        --input-format "anndata" \
+        --output-format "anndata" \
+        $anndata \
+        'scaled_anndata.h5ad'
+
+    """
+}
+
+process run_pca {
+    container params.scanpy_scripts_container
+
+    input:
+    path anndata
+
+    output:
+    path 'PCA.h5ad'
+
+    script:
+    """
+    export PYTHONIOENCODING='utf-8'
+    scanpy-run-pca \
+        --no-zero-center \
+        --svd-solver 'arpack' \
+        --random-state '1234' \
+        --input-format 'anndata' \
+        $anndata \
+        --show-obj stdout \
+        --output-format anndata \
+        'PCA.h5ad'
+    """
+}
+
+process harmony_batch {
+    container params.scanpy_scripts_container
+
+    input:
+    path anndata
+    val batch_variable
+    output:
+    path 'harmony.h5ad'
+
+    script:
+    """
+    export PYTHONIOENCODING='utf-8'
+    if [[ -n "$batch_variable" ]]; then
+        scanpy-integrate harmony \
+            --batch-key $batch_variable \
+            --basis 'X_pca' \
+            --adjusted-basis 'X_pca_harmony' \
+            --input-format 'anndata' \
+            $anndata \
+            --show-obj stdout \
+            --output-format anndata \
+            'harmony.h5ad'
+    else
+        echo "No batch variable passed; passing the original input through unchanged."
+
+        cp $anndata 'harmony.h5ad'
+    fi
+
+    """
+}
+
+process neighbors {
+    container params.scanpy_scripts_container
+
+    input:
+    path anndata
+    val representation
+    output:
+    path 'neighbors.h5ad'
+
+    script:
+    """
+    export PYTHONIOENCODING='utf-8'
+    scanpy-neighbors \
+        --n-neighbors 15 \
+        --method 'umap' \
+        --metric 'euclidean' \
+        --random-state '0' \
+        --use-rep $representation \
+        --n-pcs '50' \
+        --input-format 'anndata' \
+        $anndata \
+        --show-obj stdout \
+        --output-format anndata \
+        'neighbors.h5ad'
+
+    """
+}
+
+process neighbors_for_umap {
+    container params.scanpy_scripts_container
+
+
+    input:
+    tuple path(anndata), val(n_neighbors)
+    val representation
+    output:
+    path "neighbors_${n_neighbors}.h5ad"
+    script:
+    """
+    export PYTHONIOENCODING='utf-8'
+    scanpy-neighbors \
+        --n-neighbors $n_neighbors \
+        --key-added 'neighbors_n_neighbors_${n_neighbors}' \
+        --method 'umap' \
+        --metric 'euclidean' \
+        --random-state '0' \
+        --use-rep $representation \
+        --n-pcs '50' \
+        --input-format 'anndata' \
+        $anndata \
+        --show-obj stdout \
+        --output-format anndata \
+        'neighbors_${n_neighbors}.h5ad'
+
+    """
+}
+
+process find_clusters {
+    publishDir "${params.result_dir_path}/clusters", mode: 'copy', pattern: 'clusters_resolution_*.tsv'
+    container params.scanpy_scripts_container
+
+    input:
+    tuple path(anndata), val(resolution)
+    output:
+    path "clusters_${resolution}.h5ad"
+    path "clusters_resolution_${resolution}.tsv"
+
+    script:
+    """
+    export PYTHONIOENCODING='utf-8'
+    scanpy-find-cluster louvain \
+        --neighbors-key 'neighbors' \
+        --key-added 'louvain_resolution_${resolution}' \
+        --resolution ${resolution} \
+        --random-state '1234' \
+        --directed \
+        --export-cluster output.tsv \
+        --input-format 'anndata' \
+        $anndata \
+        --show-obj stdout \
+        --output-format anndata \
+        'clusters_${resolution}.h5ad'
+
+    mv 'output.tsv' 'clusters_resolution_${resolution}.tsv'
+    """
+}
+
+process restore_unscaled {
+    container params.scanpy_scripts_container
+
+    input:
+    tuple path(anndata), path(normalise_internal_data)
+
+    output:
+    path "restore_unscaled_output_${anndata}.h5"
+
+    script:
+    """
+    export PYTHONIOENCODING='utf-8'
+    ln -s $anndata input.h5
+    ln -s $normalise_internal_data r_source.h5
+    python ${projectDir}/scripts/restore_unscaled.py
+    mv output.h5 'restore_unscaled_output_${anndata}.h5'
+    """
+}
+
+process find_markers {
+    publishDir "${params.result_dir_path}/markers", mode: 'copy', pattern: 'markers_*.tsv'
+    errorStrategy 'ignore'
+    container params.scanpy_scripts_container
+
+    input:
+    tuple path(anndata), val(merged_group_slotname)
+
+    output:
+    path "markers_${merged_group_slotname}.h5ad"
+    path "markers_*.tsv"
+
+    script:
+    """
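+    # Strip the slotname prefix to recover the suffix (a resolution value or
+    # the celltype field) used to name the exported marker table below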
+    VAR="$merged_group_slotname"
+    PREFIX="${params.slotname}_"
+    echo \$VAR
+    echo \$PREFIX
+    n_number="\${VAR#\$PREFIX}"
+    echo \$n_number
+
+    export PYTHONIOENCODING='utf-8'
+
+    scanpy-find-markers \
+        --save 'markers_${merged_group_slotname}.tsv' \
+        --n-genes '100' \
+        --groupby '${merged_group_slotname}' \
+        --key-added 'markers_${merged_group_slotname}' \
+        --method 'wilcoxon' \
+        --use-raw \
+        --reference 'rest' \
+        --filter-params 'min_in_group_fraction:0.0,max_out_group_fraction:1.0,min_fold_change:1.0' \
+        --input-format 'anndata' \
+        $anndata \
+        --show-obj stdout \
+        --output-format anndata \
+        "markers_${merged_group_slotname}.h5ad" \
+        && mv "markers_${merged_group_slotname}.tsv" "markers_resolution_\${n_number}.tsv"
+    """
+}
+
+process run_umap {
+    publishDir "${params.result_dir_path}/umap", mode: 'copy', pattern: 'umap_n_neighbors_*.tsv'
+
+    errorStrategy 'ignore'
+
+    container params.scanpy_scripts_container
+
+    input:
+    path anndata
+
+    output:
+    path "umap_*.h5ad"
+    path "umap_n_neighbors_*.tsv"
+
+    script:
+    """
+    export PYTHONIOENCODING='utf-8'
+    echo \$PYTHONIOENCODING
+    VAR="$anndata"
+    n_number="\${VAR%.h5ad}"
+    echo \$n_number
+    scanpy-run-umap \
+        --neighbors-key "neighbors_n_\${n_number}" \
+        --key-added "neighbors_n_\${n_number}" \
+        --export-embedding embeddings.tsv \
+        --n-components 2 \
+        --min-dist 0.5 \
+        --spread 1.0 \
+        --alpha 1.0 \
+        --gamma 1.0 \
+        --negative-sample-rate 5 \
+        --random-state 0 \
+        --init-pos 'spectral' \
+        --input-format 'anndata' \
+        $anndata \
+        --show-obj stdout \
+        --output-format anndata \
+        "umap_\${n_number}.h5ad" \
+        && mv "embeddings_neighbors_n_\${n_number}.tsv" umap_n_\${n_number}.tsv
+
+    """
+}
+
+process run_tsne {
+    publishDir "${params.result_dir_path}/tsne", mode: 'copy', pattern: 'tsne_perplexity_*.tsv'
+
+    errorStrategy 'ignore'
+
+    container params.scanpy_scripts_container
+
+    input:
+    tuple path(anndata), val(perplexity_values)
+    val representation
+
+    output:
+    path "tsne_${perplexity_values}.h5ad"
+    path "tsne_perplexity_${perplexity_values}.tsv"
+
+    script:
+    """
+    export PYTHONIOENCODING='utf-8'
+    scanpy-run-tsne \
+        --use-rep $representation \
+        --export-embedding embeddings.tsv \
+        --perplexity $perplexity_values \
+        --key-added 'perplexity_$perplexity_values' \
+        --early-exaggeration '12.0' \
+        --learning-rate '400.0' \
+        --no-fast-tsne \
+        --random-state 1234 \
+        --input-format 'anndata' \
+        $anndata \
+        --show-obj stdout \
+        --output-format anndata \
+        'tsne_${perplexity_values}.h5ad' \
+        && mv 'embeddings_perplexity_${perplexity_values}.tsv' 'tsne_perplexity_${perplexity_values}.tsv'
+    """
+}
+
+process make_project_file {
+    publishDir params.result_dir_path, mode: 'copy'
+
+    container params.scanpy_scripts_container
+
+    input:
+    path neighbors
+    path scanpy_read_10x
+    path filter_genes
+    path normalise_data
+    path find_markers
+    path TSNEs_mix_UMAPs
+    output:
+    path "project.h5ad"
+    script:
+    """
+    export PYTHONIOENCODING='utf-8'
+    ln -s $neighbors input.h5
+    ln -s $scanpy_read_10x r_source.h5
+    ln -s '$filter_genes' x_source_0.h5
+    ln -s '$normalise_data' x_source_1.h5
+    count=0
+    for i in $find_markers
+    do
+        ln -sf "\${i}" obs_source_\${count}.h5
+        ln -sf "\${i}" uns_source_\${count}.h5
+        count=\$((count + 1))
+        echo "\${count}"
+    done
+    count=0
+    for i in $TSNEs_mix_UMAPs
+    do
+        ln -sf "\${i}" embedding_source_\${count}.h5
+        count=\$((count + 1))
+        echo "\${count}"
+    done
+    python ${projectDir}/scripts/final_project.py
+    mv output.h5 project.h5ad
+    """
+}
+
+workflow {
+
+    // Input channels: the per-experiment files staged under params.dir_path
+    genemeta = Channel.fromPath("${params.dir_path}/genes_metadata.tsv")
+    genes = Channel.fromPath("${params.dir_path}/genes.tsv")
+    barcodes = Channel.fromPath("${params.dir_path}/barcodes.tsv")
+    matrix = Channel.fromPath("${params.dir_path}/matrix.mtx")
+    cellmeta = Channel.fromPath("${params.dir_path}/cell_metadata.tsv")
+    neighbors_ch = channel.fromList(params.neighbor_values)
+    perplexity_ch = channel.fromList(params.perplexity_values)
+    resolution_ch = channel.fromList(params.resolution_values)
+    merged_group_slotname_ch = Channel.fromList(params.merged_group_slotname)
+
+    Column_rearrange_1(
+        genemeta,
+        "gene_id"
+    )
+    Column_rearrange_2(
+        genemeta,
+        "gene_id",
+        "gene_name"
+    )
+    mergeGeneFiles(
+        genes,
+        Column_rearrange_2.out
+    )
+    scanpy_read_10x(
+        matrix,
+        mergeGeneFiles.out,
+        barcodes,
+        cellmeta,
+        genemeta
+    )
+
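+    // Droplet data may contain doublets, so it first goes through scrublet
+    // detection, a scrublet histogram plot, and a doublet-aware cell filter;
+    // plate data goes straight to cell filtering.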
+    if ( params.technology == "droplet" ) {
+        SCRUBLET_ch = scanpy_multiplet_scrublet(
+            scanpy_read_10x.out,
+            params.batch_variable
+        )
+        scanpy_plot_scrublet(
+            SCRUBLET_ch
+        )
+        scanpy_filter_cells(
+            SCRUBLET_ch,
+            "--category predicted_doublet False"
+        )
+    }
+    else {
+        scanpy_filter_cells(
+            scanpy_read_10x.out,
+            ""
+        )
+    }
+
+    scanpy_filter_genes(
+        scanpy_filter_cells.out,
+        Column_rearrange_1.out[0]
+    )
+    normalise_data(
+        scanpy_filter_genes.out[0]
+    )
+    normalise_internal_data(
+        scanpy_filter_genes.out[0]
+    )
+    find_variable_genes(
+        normalise_internal_data.out,
+        params.batch_variable
+    )
+
+    if ( params.technology == "droplet" ) {
+        scale_data(
+            find_variable_genes.out
+        )
+        run_pca(
+            scale_data.out
+        )
+    }
+    else {
+        run_pca(
+            find_variable_genes.out
+        )
+    }
+
+    harmony_batch(
+        run_pca.out,
+        params.batch_variable
+    )
+    neighbors(
+        harmony_batch.out,
+        params.representation
+    )
+    neighbors_for_umap(
+        harmony_batch.out.combine(neighbors_ch),
+        params.representation
+    )
+    TSNEs_ch = run_tsne(
+        harmony_batch.out.combine(perplexity_ch),
+        params.representation
+    )[0]
+    //TSNEs_ch
+    //    .filter { it.exitStatus == 0 }
+
+    UMAPs_ch = run_umap(
+        neighbors_for_umap.out.flatten()
+    )[0]
+    //UMAPs_ch
+    //    .filter { it.exitStatus == 0 }
+    find_clusters(
+        neighbors.out.combine(resolution_ch)
+    )
+
+    // Combine the outputs of the find_clusters and neighbors processes
+    combined_outputs = find_clusters.out[0].mix(neighbors.out)
+
+    if ( params.technology == "droplet" ) {
+        restore_unscaled(
+            combined_outputs.combine(normalise_internal_data.out)
+        )
+        restore_unscaled_files = restore_unscaled.out.map { file ->
+            // Derive the slot name (resolution or celltype field) from the file name
+            def sampleNumber = file.baseName.replaceFirst('restore_unscaled_output_', '').replaceFirst('clusters', params.slotname).replaceFirst('neighbors', params.celltype_field).replaceFirst('.h5ad', '')
+            [file, sampleNumber] // Create a tuple of file and slot name
+        }
+        find_markers(
+            restore_unscaled_files
+        )
+    }
+    else {
+        processed_files = combined_outputs.map { file ->
+            // Derive the slot name (resolution or celltype field) from the file name
+            def sampleNumber = file.baseName.replaceFirst('clusters', params.slotname).replaceFirst('neighbors', params.celltype_field)
+            [file, sampleNumber] // Create a tuple of file and slot name
+        }
+        find_markers(
+            processed_files
+        )
+    }
+    make_project_file(
+        neighbors.out,
+        scanpy_read_10x.out,
+        scanpy_filter_genes.out[0],
+        normalise_data.out[0],
+        find_markers.out[0].collect(),
+        TSNEs_ch.mix(UMAPs_ch).collect()
+    )
+}
diff --git a/nextflow.config b/nextflow.config
index 8b137891..9f892942 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1 +1,54 @@
+process {
+    executor='slurm'
+    queue="$SCXA_HPC_QUEUE"
+    // clusterOptions="$SCXA_HPC_OPTIONS"
+    time = '7 d'
+    memory = '4 GB'
+    // error strategy
+    // errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
+    // memory = { 4.GB * (2 ** task.attempt) }
+    // maxRetries = 4
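+    // The commented settings above sketch a retry strategy: tasks killed with
+    // exit status 137-140 are resubmitted with memory doubled on each attempt.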
+}
+
+executor {
+    queueSize = 500
+    exitReadTimeout = '100000 sec'
+    pollInterval = '5sec'
+}
+
+singularity {
+    enabled = true
+    cacheDir = "$SCXA_SINGULARITY_CACHE"
+}
+
+conda {
+    // cacheDir = "$SCXA_WORKFLOW_ROOT/envs"
+    createTimeout = "30 min"
+    useMamba = true
+}
+
+params {
+    dir_path = "."
+    output_path = null
+    result_dir_path = "${params.output_path ?: params.dir_path + '/results'}"
+}
+
+trace {
+    enabled = true
+    file = "${params.result_dir_path}/trace.txt"
+    overwrite = true
+}
+
+timeline {
+    enabled = true
+    file = "${params.result_dir_path}/timeline.html"
+    overwrite = true
+}
+
+report {
+    enabled = true
+    file = "${params.result_dir_path}/report.html"
+    overwrite = true
+}
diff --git a/scripts/data_prep.sh b/scripts/data_prep.sh
new file mode 100755
index 00000000..eb9c2433
--- /dev/null
+++ b/scripts/data_prep.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+
+# This is an EMBL-EBI-specific script that fetches data from the workflow root and stages it for the downstream workflow
+
+if [ -z "$SCXA_WORKFLOW_ROOT" ]; then
+  echo "Variable SCXA_WORKFLOW_ROOT is not defined or empty. Please load SC env."
+  echo "Exiting..."
+  exit 1;
+fi
+
+if [ -z "$1" ]; then
+  echo "Experiment ID is not provided. Please provide an experiment ID."
+  echo "Usage: bash data_prep.sh <EXP_ID> [output path]"
+  echo "Exiting..."
+  exit 1;
+fi
+
+EXP_ID=$1
+
+outdir="$(pwd)"
+
+if [ -n "$2" ]; then
+  outdir="$2"
+fi
+
+
+echo "Creating ${outdir}/${EXP_ID} directory"
+mkdir -p "${outdir}/${EXP_ID}"
+cd "${outdir}/${EXP_ID}" || exit 1
+
+echo "Copying data to ${outdir}/${EXP_ID}"
+
+cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/${EXP_ID}.cell_metadata.tsv cell_metadata.tsv
+cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/genes.tsv.gz . && gunzip -f genes.tsv.gz
+cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/matrix.mtx.gz . && gunzip -f matrix.mtx.gz
+cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/barcodes.tsv.gz . && gunzip -f barcodes.tsv.gz
+cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/reference/gene_annotation.txt genes_metadata.tsv
+
+echo "Copying data for ${EXP_ID} finished"
diff --git a/scripts/final_project.py b/scripts/final_project.py
new file mode 100755
index 00000000..9616ded0
--- /dev/null
+++ b/scripts/final_project.py
@@ -0,0 +1,119 @@
+
+import scanpy as sc
+import anndata
+from numpy import all
+import logging
+import os
+import sys
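+# Assemble the final project object: attach the raw object as .raw, add the
+# filtered and normalised matrices as layers, then merge cluster assignments
+# (obs_source_*), embeddings (embedding_source_*) and marker tables
+# (uns_source_*) linked in by the make_project_file process.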
+
+adata = sc.read('input.h5')
+
+gene_name = 'index'
+qc_vars = list()
+
+gene_names = getattr(adata.var, gene_name)
+# Define the directory containing your source files
+source_dir = '.'  # Adjust to the appropriate path
+
+ad_s = sc.read('r_source.h5')
+if not all(adata.obs.index.isin(ad_s.obs.index)):
+    logging.error("Specified object for .raw must contain all .obs from main object.")
+    sys.exit(1)
+else:
+    adata.raw = ad_s[adata.obs.index]
+del ad_s
+
+ad_s = sc.read('x_source_0.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+    adata.layers["filtered"] = ad_s.X
+else:
+    logging.error("X source 0 AnnData file is not compatible with the main AnnData file: different cell names.")
+    sys.exit(1)
+del ad_s
+ad_s = sc.read('x_source_1.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+    adata.layers["normalised"] = ad_s.X
+else:
+    logging.error("X source 1 AnnData file is not compatible with the main AnnData file: different cell names.")
+    sys.exit(1)
+del ad_s
+
+obs_sources = [file for file in os.listdir(source_dir) if file.startswith('obs_source_') and file.endswith('.h5')]
+
+for idx, obs_source_file in enumerate(sorted(obs_sources)):
+    ad_s = sc.read(os.path.join(source_dir, obs_source_file))
+    if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+        keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k)
+        for k_to_copy in keys_to_copy:
+            suffix = ''
+            if k_to_copy in adata.obs:
+                suffix = f"_{idx}"
+
+            adata.obs[[k_to_copy + suffix]] = ad_s.obs[[k_to_copy]]
+            if k_to_copy in ad_s.uns.keys():
+                adata.uns[k_to_copy + suffix] = ad_s.uns[k_to_copy]
+    else:
+        logging.error(f"Observation source {idx} AnnData file is not compatible with the main AnnData file: different cell names.")
+        sys.exit(1)
+    del ad_s
+
+
+embedding_sources = [file for file in os.listdir(source_dir) if file.startswith('embedding_source_') and file.endswith('.h5')]
+
+for idx, embedding_file in enumerate(sorted(embedding_sources)):
+    ad_s = sc.read(os.path.join(source_dir, embedding_file))
+    if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+        # Copy tsne embeddings
+        keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
+        for k_to_copy in keys_to_copy:
+            suffix = ''
+            if k_to_copy in adata.obsm:
+                suffix = f"_{idx}"
+            adata.obsm[k_to_copy + suffix] = ad_s.obsm[k_to_copy]
+        # Copy umap embeddings
+        keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
+        for k_to_copy in keys_to_copy:
+            suffix = ''
+            if k_to_copy in adata.obsm:
+                suffix = f"_{idx}"
+            adata.obsm[k_to_copy + suffix] = ad_s.obsm[k_to_copy]
+    else:
+        logging.error(f"Embedding source {idx} AnnData file is not compatible with the main AnnData file: different cell names.")
+        sys.exit(1)
+    del ad_s
+
+
+uns_sources = [file for file in os.listdir(source_dir) if file.startswith('uns_source_') and file.endswith('.h5')]
+for idx, uns_file in enumerate(sorted(uns_sources)):
+    ad_s = sc.read(os.path.join(source_dir, uns_file))
+    if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+        keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k)
+        for k_to_copy in keys_to_copy:
+            suffix = ''
+            if k_to_copy in adata.uns:
+                suffix = f"_{idx}"
+            adata.uns[k_to_copy + suffix] = ad_s.uns[k_to_copy]
+    else:
+        logging.error(f"Uns source {idx} AnnData file is not compatible with the main AnnData file: different cell names.")
+        sys.exit(1)
+    del ad_s
+
+
+if len(qc_vars) > 0:
+    pct_top = [50]
+    sc.pp.calculate_qc_metrics(adata, qc_vars=qc_vars, percent_top=pct_top, inplace=True)
+
+if 'n_genes' not in adata.obs.columns:
+    sc.pp.filter_cells(adata, min_genes=0)
+if 'n_counts' not in adata.obs.columns:
+    sc.pp.filter_cells(adata, min_counts=0)
+if 'n_cells' not in adata.var.columns:
+    sc.pp.filter_genes(adata, min_cells=0)
+if 'n_counts' not in adata.var.columns:
+    sc.pp.filter_genes(adata, min_counts=0)
+
+adata.write('output.h5', compression='gzip')
diff --git a/scripts/restore_unscaled.py b/scripts/restore_unscaled.py
new file mode 100755
index 00000000..9e5eef92
--- /dev/null
+++ b/scripts/restore_unscaled.py
@@ -0,0 +1,36 @@
+import scanpy as sc
+import anndata
+from numpy import all
+import logging
+import sys
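+# Re-attach the unscaled (log-normalised) matrix as .raw on a clustered object
+# so that downstream marker detection works on unscaled values.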
+
+adata = sc.read('input.h5')
+
+gene_name = 'index'
+qc_vars = list()
+gene_names = getattr(adata.var, gene_name)
+
+ad_s = sc.read('r_source.h5')
+if not all(adata.obs.index.isin(ad_s.obs.index)):
+    logging.error("Specified object for .raw must contain all .obs from main object.")
+    sys.exit(1)
+else:
+    adata.raw = ad_s[adata.obs.index]
+del ad_s
+
+if len(qc_vars) > 0:
+    pct_top = [50]
+    sc.pp.calculate_qc_metrics(adata, qc_vars=qc_vars, percent_top=pct_top, inplace=True)
+
+if 'n_genes' not in adata.obs.columns:
+    sc.pp.filter_cells(adata, min_genes=0)
+if 'n_counts' not in adata.obs.columns:
+    sc.pp.filter_cells(adata, min_counts=0)
+if 'n_cells' not in adata.var.columns:
+    sc.pp.filter_genes(adata, min_cells=0)
+if 'n_counts' not in adata.var.columns:
+    sc.pp.filter_genes(adata, min_counts=0)
+
+adata.write('output.h5', compression='gzip')