Browse code

Update SCTK_QC, importAnnData and sce2adata.R

rz2333 authored on 15/10/2020 16:02:58
Showing 9 changed files

... ...
@@ -19,18 +19,28 @@
19 19
                               rowData = sce_rowdata,
20 20
                               colData = sce_coldata)
21 21
   colnames(sce) <- paste0(sampleName,"_",colnames(sce))
22
+
23
+  multi_Assay <- reticulate::py_to_r(anndata$layers$as_dict())
24
+  for(assay_name in names(multi_Assay)){
25
+    tryCatch({
26
+      SummarizedExperiment::assay(sce, assay_name, withDimnames = FALSE) <- t(reticulate::py_to_r(multi_Assay[[assay_name]]))
27
+      base::dimnames(SummarizedExperiment::assay(sce, assay_name)) <- base::dimnames(SummarizedExperiment::assay(sce, "counts"))
28
+    }, error = function(x){
29
+      error_message <- paste0("Warning: unable to add '",assay_name,"' from .layers AnnData slot to SCE assay. Skipping. ")
30
+      message(error_message)
31
+    })
32
+  }
22 33
   
23 34
   multidim_observations <- reticulate::py_to_r(anndata$obsm_keys())
24 35
   for(obsm_name in multidim_observations){
25 36
     tryCatch({
26
-      reducedDims(sce)[[obsm_name]] <- reticulate::py_to_r(anndata$obsm[obsm_name])
37
+      SingleCellExperiment::reducedDims(sce)[[obsm_name]] <- reticulate::py_to_r(anndata$obsm[obsm_name])
27 38
     }, error = function(x){
28
-      error_message <- paste0("Warning: unable to add '",uns_name,"' from .obsm AnnData slot to SCE metadata. Skipping. ")
39
+      error_message <- paste0("Warning: unable to add '",obsm_name,"' from .obsm AnnData slot to SCE metadata. Skipping. ")
29 40
       message(error_message)
30 41
     })
31
-    
32 42
   }
33
-  
43
+
34 44
   unstructured_data <- reticulate::py_to_r(anndata$uns_keys())
35 45
   for(uns_name in unstructured_data){
36 46
     tryCatch({
... ...
@@ -4,7 +4,7 @@
4 4
 #' @return A list of \link[SingleCellExperiment]{SingleCellExperiment} object containing
5 5
 #' the droplet or cell data or both,depending on the dataType that users provided.
6 6
 #' @export
7
-importMultipleSources <- function(allImportEntries) {
7
+importMultipleSources <- function(allImportEntries, delayedArray = FALSE) {
8 8
   sceObjs <- list()
9 9
   for (entry in allImportEntries$samples) {
10 10
     if (entry$type == "cellRanger2") {
... ...
@@ -12,12 +12,14 @@ importMultipleSources <- function(allImportEntries) {
12 12
         newSce <- importCellRangerV2Sample(
13 13
           dataDir = entry$params$dataDir,
14 14
           sampleName = entry$params$sampleName,
15
+          delayedArray = delayedArray
15 16
         )
16 17
       } else {
17 18
         newSce <- importCellRangerV2(
18 19
           cellRangerDirs = entry$params$cellRangerDirs,
19 20
           sampleDirs = entry$params$sampleDirs,
20 21
           sampleNames = entry$params$sampleNames,
22
+          delayedArray = delayedArray
21 23
         )
22 24
       }
23 25
       
... ...
@@ -26,42 +28,50 @@ importMultipleSources <- function(allImportEntries) {
26 28
         newSce <- importCellRangerV3Sample(
27 29
           dataDir = entry$params$dataDir,
28 30
           sampleName = entry$params$sampleName,
31
+          delayedArray = delayedArray
29 32
         )
30 33
       } else {
31 34
         newSce <- importCellRangerV3(
32 35
           cellRangerDirs = entry$params$cellRangerDirs,
33 36
           sampleDirs = entry$params$sampleDirs,
34 37
           sampleNames = entry$params$sampleNames,
38
+          delayedArray = delayedArray
35 39
         )
36 40
       }
37 41
     } else if (entry$type == "starSolo") {
38 42
       newSce <- importSTARsolo(
39 43
         STARsoloDirs = entry$params$STARsoloDirs,
40
-        samples = entry$params$amples
44
+        samples = entry$params$amples,
45
+        delayedArray = delayedArray
41 46
       )
42 47
     } else if (entry$type == "busTools") {
43 48
       newSce <- importBUStools(
44 49
         BUStoolsDirs = entry$params$BUStoolsDirs,
45 50
         samples = entry$params$samples,
51
+        delayedArray = delayedArray
46 52
       )
47 53
     } else if (entry$type == "seqc") {
48 54
       newSce <- importSEQC(
49 55
         seqcDirs = entry$params$seqcDirs,
50 56
         samples = entry$params$samples,
51 57
         prefix = entry$params$prefix,
58
+        delayedArray = delayedArray
52 59
       )
53 60
     } else if (entry$type == "optimus") {
54 61
       newSce <- importOptimus(
55 62
         OptimusDirs = entry$params$OptimusDirs,
56
-        samples = entry$params$samples
63
+        samples = entry$params$samples,
64
+        delayedArray = delayedArray
57 65
       )
58 66
     } else if (entry$type == "files") {
59 67
       newSce <- importFromFiles(assayFile = entry$params$assayFile,
60 68
                                 annotFile = entry$params$annotFile,
61 69
                                 featureFile = entry$params$featureFile,
62
-                                assayName = entry$params$assayName)
70
+                                assayName = entry$params$assayName,
71
+                                delayedArray = delayedArray)
63 72
     } else if (entry$type == "example") {
64
-      newSce <- importExampleData(dataset = entry$params$dataset)
73
+      newSce <- importExampleData(dataset = entry$params$dataset,
74
+                                  delayedArray = delayedArray)
65 75
     } else if (entry$type == "rds") {
66 76
       importedrds <- readRDS(entry$params$rdsFile)
67 77
       if (base::inherits(importedrds, "SummarizedExperiment")) {
... ...
@@ -71,13 +81,24 @@ importMultipleSources <- function(allImportEntries) {
71 81
       } else {
72 82
         stop("The '.rds' file should contain a 'SingleCellExperiment' or 'Seurat' object.")
73 83
       }
84
+
85
+      for(assay in SummarizedExperiment::assayNames(newSce)) {
86
+        if(!base::inherits(SummarizedExperiment::assay(sce, assay), "dgCMatrix") && !isTRUE(delayedArray)) {
87
+          SummarizedExperiment::assay(sce, assay) <- .convertToMatrix(SummarizedExperiment::assay(sce, assay))
88
+        }
89
+
90
+        if(!base::inherits(SummarizedExperiment::assay(sce, assay), "DelayedArray") && isTRUE(delayedArray)) {
91
+          SummarizedExperiment::assay(sce, assay) <- DelayedArray::DelayedArray(SummarizedExperiment::assay(sce, assay))
92
+        }
93
+      }
94
+      
74 95
     }
75 96
     sceObjs = c(sceObjs, list(newSce))
76 97
   }
77 98
   
78 99
   return(combineSCE(sceList = sceObjs,
79
-                    by.r = NULL,
80
-                    by.c = Reduce(intersect, lapply(sceObjs, function(x) { colnames(colData(x))})),
100
+                    by.r = Reduce(base::intersect, lapply(sceObjs, function(x) { colnames(rowData(x))})),
101
+                    by.c = Reduce(base::intersect, lapply(sceObjs, function(x) { colnames(colData(x))})),
81 102
                     combined = TRUE)
82 103
   )
83 104
 }
... ...
@@ -43,12 +43,12 @@
43 43
         }
44 44
     }
45 45
 
46
-    # Furthermore, the other assays will for now also be saved to .obsm
46
+    # Furthermore, the other assays will for now also be saved to .layers
47 47
     allAssayNames <- SummarizedExperiment::assayNames(SCE)
48 48
     for (i in 1:length(allAssayNames)) {
49 49
         oneName <- allAssayNames[i]
50 50
         if (!oneName == useAssay) {
51
-            AnnData$obsm$'__setitem__'(oneName, t(SummarizedExperiment::assay(SCE, oneName)))
51
+            AnnData$layers$'__setitem__'(oneName, t(SummarizedExperiment::assay(SCE, oneName)))
52 52
         }
53 53
     }
54 54
     return(AnnData)
... ...
@@ -693,4 +693,4 @@ if (("FlatFile" %in% formats)) {
693 693
     }
694 694
 }
695 695
 
696
-sessionInfo()
696
+sessionInfo()
697 697
\ No newline at end of file
... ...
@@ -97,6 +97,10 @@ The optional arguments are as follows. Their usage depend on type of data and us
97 97
 -n, --numCores. Number of cores used to run the pipeline. By default is 1. Parallel computing is enabled if -n is greater than 1. <br> <br>
98 98
 
99 99
 -T, --parallelType. Type of parallel computing used for parallel computing. Parallel computing used in this pipeline depends on "BiocParallel" package. Default is 'MulticoreParam'. It can be 'MulticoreParam' or 'SnowParam'. This argument will be evaluated only when numCores > 1. <br> <br>
100
+
101
+-i, --studyDesign. The txt file containing the desrciption of the study design. Default is NULL. This would be shown at the begining the html report of cell and droplet QC. <br> <br>
102
+
103
+-L, --subTitle. The subtitle used in the cell and droplet QC HTML report. Default is None. The subtitle can contain information of the sample, like sample name, etc. If -S is set as TRUE, the length of subsitle should be the same as the length of samples. If -S is set as FALSE, the length of subtitle should be one or NULL. <br> <br>
100 104
 </p>
101 105
 </details>
102 106
 
... ...
@@ -327,18 +331,43 @@ The Singulatiry image can easily be built using Docker Hub as a source:
327 331
 singularity pull docker://campbio/sctk_qc:1.7.5
328 332
 ```
329 333
 
330
-The usage of singleCellTK Singularity image is very similar to that of Docker. In Singularity 3.0+, the mount volume is [automatically overlaid](https://singularity.lbl.gov/docs-mount). However, you can use argument --bind/-B to specify your own mount volume. The example is shown as below:
334
+The usage of singleCellTK Singularity image is very similar to that of Docker. In Singularity 3.0+, the mount volume is [automatically overlaid](https://singularity.lbl.gov/docs-mount). 
335
+ 
336
+It's recommended to re-set the home directory when you run singularity. Singularity will mount \$HOME path on your machine by default, which might contain your personal R/Python library folder. If we don't re-set the home to mount, singularity will try to use R/Python libraries which are not built within the singularity image and cause some conflicts. You can point to some "sanitized home", which is different from \$HOME path on your machine, using argument [-H/--home](https://singularity.lbl.gov/faq#solution-1-specify-the-home-to-mount). Besides, you can use argument --bind/-B to specify your own mount volume, which is the path that contains the dataset and will be used to store the output of QC pipeline. The example is shown as below:
331 337
 
332 338
 ```
333
-singularity run sctk_qc_1.7.5.sif \
334
--b ./cellranger \
339
+singularity run --home=/PathToSanitizedHome \
340
+--bind /PathToData:/data sctk_qc_1.7.5.sif \
335 341
 -P CellRangerV3 \
336
--s pbmc_100x100 \
337
--o ./result/tenx_v3_pbmc \
338
--g ./mitochondrial_human_symbol.gmt
342
+-s gencodev34_pbmc_1k_v3 \
343
+-b /data/gencodev34_pbmc_1k_v3
344
+-o /data/result/gencodev34_pbmc_1k_v3 \
345
+-S TRUE \
346
+-F R,Python,FlatFile,HTAN \
347
+-n 15 \
348
+-T MulticoreParam
339 349
 ```
340 350
 
341
-The code above assumed that the dataset is in your current directory, which is automatically mounted by Singularity. If you run Singularity image on BU SCC,it's recommended to re-set the home directory to mount. Otherwise, the container will load libraries in the SCC shared libraries, which might cause some conflicts. You can point to some "sanitized home" using argument [-H/--home](https://singularity.lbl.gov/faq#solution-1-specify-the-home-to-mount). Also, you might want to specify cpu architecture when run the docker on BU SCC using #$ -l cpu_arch=broadwell|haswell|skylake|cascadelake. Because the python packages are compiled by SIMD instructions that are only available on these two cpu architectures. 
351
+Also, you might want to specify cpu architecture when run the Singularity image on BU SCC using '#$ -l cpu_arch=broadwell|haswell|skylake|cascadelake' command. Because the python packages are compiled by SIMD instructions that are only compatible on these cpu architectures. If you are runnning singularity on other cluster, please contact IT helps about how to specify cpu architecture when you run the singularity image. One of the example is shown below: 
352
+
353
+```
354
+#!/bin/bash
355
+#$ -cwd
356
+#$ -j y
357
+#$ -P camplab
358
+#$ -pe omp 16
359
+#$ -l cpu_arch=broadwell|haswell|skylake|cascadelake
360
+singularity run --home=/PathToSanitizedHome \
361
+--bind /PathToData:/data sctk_qc_1.7.5.sif \
362
+-P CellRangerV3 \
363
+-s gencodev34_pbmc_1k_v3 \
364
+-b /data/gencodev34_pbmc_1k_v3
365
+-o /data/result/gencodev34_pbmc_1k_v3 \
366
+-S TRUE \
367
+-F R,Python,FlatFile,HTAN \
368
+-n 15 \
369
+-T MulticoreParam
370
+```
342 371
 
343 372
 ## Documentation of tools that are currently available within the pipeline:
344 373
 #### Empty droplet detection:
345 374
new file mode 100644
... ...
@@ -0,0 +1 @@
1
+mito	feature_ID	ENSG00000198888	ENSG00000198763	ENSG00000198804	ENSG00000198712	ENSG00000228253	ENSG00000198899	ENSG00000198938	ENSG00000198840	ENSG00000212907	ENSG00000198886	ENSG00000198786	ENSG00000198695	ENSG00000198727
0 2
new file mode 100644
... ...
@@ -0,0 +1 @@
1
+mito	feature_ID	ENSG00000198888.2	ENSG00000198763.3	ENSG00000198804.2	ENSG00000198712.1	ENSG00000228253.1	ENSG00000198899.2	ENSG00000198938.2	ENSG00000198840.2	ENSG00000212907.2	ENSG00000198886.2	ENSG00000198786.2	ENSG00000198695.2	ENSG00000198727.2
0 2
new file mode 100644
... ...
@@ -0,0 +1 @@
1
+mito	feature_name	MT-ND1	MT-ND2	MT-CO1	MT-CO2	MT-ATP8	MT-ATP6	MT-CO3	MT-ND3	MT-ND4L	MT-ND4	MT-ND5	MT-ND6	MT-CYB
... ...
@@ -4,7 +4,7 @@
4 4
 \alias{importMultipleSources}
5 5
 \title{Imports samples from different sources and compiles them into a list of SCE objects}
6 6
 \usage{
7
-importMultipleSources(allImportEntries)
7
+importMultipleSources(allImportEntries, delayedArray = FALSE)
8 8
 }
9 9
 \arguments{
10 10
 \item{allImportEntries}{object containing the sources and parameters of all the samples being imported (from the UI)}