Merge pull request #142 from sgosline/sara-new-sim

Sara new simulated data
PNNL-CompBio · Apr 21, 2021 · 18c842e · 18c842e
2 parents c156c22 + 7a8879b
commit 18c842e
Show file tree

Hide file tree

Showing 26 changed files with 162 additions and 9,811 deletions.
diff --git a/perfEval/README.md b/perfEval/README.md
@@ -4,7 +4,19 @@ This document describes the current test matrix of comparisons we want to carry
 out so that we can keep track of what analysis should be done. We generally have created
 four different ways to evaluate protein deconvolution algorithms.
 
-## Tests of various algorithms
+## To run a deconvolution algorithm
+The CWL workflow [run-deconv.cwl](./run-deconv.cwl) takes the following arguments
+to run an algorithm included in our suite:
+| Argument | Required? | Description|
+| --- | --- | --- |
+| signature | yes | Signature matrix file used to run the deconvolution, such as those located in [the signature matrix directory](../signature_matrices)|
+| alg | yes | Name of algorithm. Currently implemented are: `cibersort`, `xcell`, `epic`, `mcpcounter`, and `bayesdebulk`|
+| cancerType | no | Optional argument to describe cancer type|
+| dataType | no | Optional argument to describe data type (e.g. protein or mRNA)|
+| sampleType | no | Optional argument to describe sample type (e.g. tumor or normal)|
+
+
+## Implemented algorithm metrics
 Below are the three different tests we perform. For each test, there are numerous
 metrics we use to evaluate the performance as well as different parameters.
 
@@ -15,27 +27,28 @@ script in this directory.
 One test we will perform is to evaluate the similarities in tumor deconvolution
 algorithms between mRNA and proteins from the same tumors with the same signature matrices.
 These tests can be evaluated across signature matrices, tissue types (tumor, normal, all) and cancer
-types.
-
-*add documentation to run in `mrna-prot` directory*
+types. These tests are located in the [`mrna-prot` directory](./mrna-prot).
 
 
 ### Imputation analysis
-Here we measure how sensitive an algorithm is to imputed vs. unimputed proteomics data
-
-*add documentation to run in the `imputation` directory*
+Here we measure how sensitive an algorithm is to imputed vs. unimputed proteomics data.
+The documentation to evaluate this is in the [`imputation` directory](./imputation).
 
 ### Simulated data analysis
-Here we tests how well each algorithm performs on simulated data.
+Here we test how well each algorithm performs on simulated data.
+The documentation to test this is in the [`data-sim` directory](./data-sim).
 
-*add documetnation to run in the `data-sim` directory.
+### Immune subtype analysis
+We also evaluate how well the various cell types agree with what is expected based on the mRNA-defined immune subtypes.
 
+## How to determine agreement
+How we compare the deconvolution algorithms to the 'gold standard' of any particular approach is just as important as what data we are using. As such, we have carefully thought through the various approaches. Here are the current comparisons we employ.
 
-## Vector and matrix comparisons
-The following are options:
-- Vector comparisons: these measure correlation across patients OR subtypes using spearman or pearson Correlation
-- Matrix comparisons: these measure distances between matrices.
+### Per sample correlations
+For each sample from the original matrix, we evaluate how well the cell type predictions agree between the deconvoluted protein matrix and the 'test' scenario. This test can be run using the [`deconv-corr-cwl-tool.cwl`](./correlations/deconv-corr-cwl-tool.cwl) script.
 
+### Cell type correlations
+For each cell type in the original matrix, we evaluate how well the predictions for that cell type agree across samples between the deconvoluted matrix and the 'test' scenario. This test can be run using the [`deconv-corrXcelltypes-cwl-tool.cwl`](./correlations/deconv-corrXcelltypes-cwl-tool.cwl) script.
 
-## Parameters to compare
-In addition to the various algorithms, there are also
+### Matrix distance metrics
+To compare two matrices, we employ a number of pairwise distance metrics to determine if the two matrices are similar or not. (song to add more details here)
diff --git a/perfEval/data-sim/call-deconv-on-sim.cwl b/perfEval/data-sim/call-deconv-on-sim.cwl
@@ -15,6 +15,7 @@ inputs:
      type: File
    permutation:
      type: string
+     default: '1'
    prot-alg:
      type: string
    sampleType:
@@ -23,6 +24,9 @@ inputs:
    dataType:
      type: string
      default: 'prot'
+   simType:
+     type: string
+     default: 'prot'
 
 outputs:
   pat-cor-file:
@@ -46,6 +50,7 @@ steps:
      run: ../../simulatedData/sim-data-tool.cwl
      in:
        repNumber: permutation
+       simType: simType
      out:
        [matrix,cellType]
   deconv-prot:
@@ -58,6 +63,13 @@ steps:
        dataType: dataType
        matrix: get-sim-data/matrix
      out: [deconvoluted]
+  match-prot-to-sig:
+     run: ../../simulatedData/map-sig-tool.cwl
+     in:
+       deconv-matrix: deconv-prot/deconvoluted
+       sig-matrix: signature
+       cell-matrix: get-sim-data/cellType
+     out: [updated-deconv]
   patient-cor:
      run: ../correlations/deconv-corr-cwl-tool.cwl
      in:
@@ -68,7 +80,7 @@ steps:
        signature: signature
        sampleType: sampleType
        proteomics:
-         source: deconv-prot/deconvoluted
+         source: match-prot-to-sig/updated-deconv
        transcriptomics:
          source: get-sim-data/cellType
      out: [corr]
@@ -82,14 +94,14 @@ steps:
        signature: signature
        sampleType: sampleType
        proteomics:
-         source: deconv-prot/deconvoluted
+         source: match-prot-to-sig/updated-deconv
        transcriptomics:
          source: get-sim-data/cellType
      out: [corr]
   matrix-distance:
      run: ../comparison/deconv-comparison-tool.cwl
      in:
-       matrixA: deconv-prot/deconvoluted
+       matrixA: match-prot-to-sig/updated-deconv
        matrixB: get-sim-data/cellType
        cancerType: permutation
        aAlg: prot-alg

diff --git a/perfEval/data-sim/sim-test.yml b/perfEval/data-sim/sim-test.yml
@@ -3,6 +3,7 @@ prot-algorithms:
   - epic
   - xcell
   - cibersort
+  - bayesdebulk
 reps:
   - "1"
   - "2"

diff --git a/perfEval/mrna-prot/alg-test.yml b/perfEval/mrna-prot/alg-test.yml
@@ -1,22 +1,18 @@
 cancerTypes:
-  - endometrial
+  - luad
   - ccrcc
 prot-algorithms:
   - mcpcounter
   - epic
-  - xcell
   - cibersort
   - repbulk
 mrna-algorithms:
   - mcpcounter
   - epic
-  - xcell
   - cibersort
   - repbulk
 tissueTypes:
    - tumor
 signatures:
   - class: File
     path: ../../signature_matrices/LM7c.txt
-  - class: File
-    path: ../../signature_matrices/LM22.txt
diff --git a/perfEval/mrna-prot/call-deconv-and-cor.cwl b/perfEval/mrna-prot/call-deconv-and-cor.cwl
@@ -35,7 +35,7 @@ outputs:
 
 steps:
   deconv-mrna:
-     run: ../mrna-deconv.cwl
+     run: mrna-deconv.cwl
      in:
        cancerType: cancerType
        mrnaAlg: mrna-alg

diff --git a/perfEval/mrna-deconv.cwl → perfEval/mrna-prot/mrna-deconv.cwl b/perfEval/mrna-deconv.cwl → perfEval/mrna-prot/mrna-deconv.cwl
@@ -20,14 +20,14 @@ inputs:
 
 steps:
   download-mrna:
-    run: ../mRNAData/mrna-data-cwl-tool.cwl
+    run: ../../mRNAData/mrna-data-cwl-tool.cwl
     in:
       cancerType: cancerType
       sampleType: sampleType
     out:
       [matrix]
   run-deconv:
-    run: run-deconv.cwl
+    run: ../run-deconv.cwl
     in:
       matrix: download-mrna/matrix
       signature: signature

diff --git a/perfEval/run-deconv.cwl b/perfEval/run-deconv.cwl
@@ -16,6 +16,16 @@ inputs:
      type: string
    matrix:
      type: File
+   cancerType:
+     type: string
+     default: "cancer"
+   sampleType:
+     type: string
+     default: "tumor"
+   dataType:
+     type: string
+     default: "prot"
+
 steps:
   run-cibersort:
     run: ../tumorDeconvAlgs/cibersort/run-cibersort-tool.cwl

diff --git a/simulatedData/Dockerfile b/simulatedData/Dockerfile
@@ -1,9 +1,11 @@
 FROM rocker/tidyverse
-RUN Rscript -e 'install.packages("argparser")'
+
 
 COPY *rda /bin/
 COPY getSimDataMatrices.R /bin/
+COPY mapSimDataMatrices.R /bin/
 
 RUN chmod 777 /bin/getSimDataMatrices.R
+RUN chmod 777 /bin/mapSimDataMatrices.R
 
 VOLUME ['/tmp']
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,6 +3,7 @@ prot-algorithms: @@
       - epic
       - xcell
       - cibersort
+      - bayesdebulk
     reps:
       - "1"
       - "2"
@@ Expand Down @@