Browse code

Merge branch 'release/v1.1.3'

mbcole authored on 19/10/2017 23:01:11
Showing 13 changed files

... ...
@@ -1,5 +1,5 @@
1 1
 Package: scone
2
-Version: 1.1.2
2
+Version: 1.1.3
3 3
 Title: Single Cell Overview of Normalized Expression data
4 4
 Description: SCONE is an R package for comparing and ranking the performance of
5 5
 	different normalization schemes for single-cell RNA-seq and other 
... ...
@@ -60,6 +60,7 @@ importFrom(grDevices,pdf)
60 60
 importFrom(graphics,abline)
61 61
 importFrom(graphics,arrows)
62 62
 importFrom(graphics,barplot)
63
+importFrom(graphics,boxplot)
63 64
 importFrom(graphics,hist)
64 65
 importFrom(graphics,legend)
65 66
 importFrom(graphics,lines)
... ...
@@ -91,6 +92,7 @@ importFrom(stats,dist)
91 92
 importFrom(stats,dnbinom)
92 93
 importFrom(stats,fitted.values)
93 94
 importFrom(stats,glm)
95
+importFrom(stats,lm)
94 96
 importFrom(stats,mad)
95 97
 importFrom(stats,median)
96 98
 importFrom(stats,model.matrix)
... ...
@@ -101,7 +103,10 @@ importFrom(stats,prcomp)
101 103
 importFrom(stats,quantile)
102 104
 importFrom(stats,quasibinomial)
103 105
 importFrom(stats,sd)
106
+importFrom(stats,setNames)
107
+importFrom(stats,var)
104 108
 importFrom(utils,capture.output)
105 109
 importFrom(utils,head)
106 110
 importFrom(utils,sessionInfo)
111
+importFrom(utils,write.csv)
107 112
 importFrom(utils,write.table)
... ...
@@ -33,7 +33,8 @@ TMM_FN = function(ei) {
33 33
   return(eo)
34 34
 }
35 35
 
36
-#' Relative log-expression (RLE; DESeq) scaling normalization wrapper function
36
+#' Relative log-expression (RLE; DESeq) scaling normalization wrapper
37
+#' function
37 38
 #' @importFrom edgeR calcNormFactors
38 39
 #' @details SCONE scaling wrapper for \code{\link[edgeR]{calcNormFactors}}).
39 40
 #' @export
... ...
@@ -24,6 +24,9 @@
24 24
 #'
25 25
 #' @importFrom RColorBrewer brewer.pal
26 26
 #' @importFrom rARPACK svds
27
+#' @importFrom graphics boxplot
28
+#' @importFrom utils write.csv
29
+#' @importFrom stats setNames var
27 30
 #' @export
28 31
 #'
29 32
 #' @return An object that represents the SCONE report app.
... ...
@@ -66,15 +69,15 @@ sconeReport = function(x, methods,
66 69
     stop("reshape2 package needed for sconeReport()")
67 70
   }
68 71
 
69
-  if (!require("plotly", quietly = TRUE)) {
72
+  if (!requireNamespace("plotly", quietly = TRUE)) {
70 73
     stop("plotly package needed for sconeReport()")
71 74
   }
72 75
 
73
-  if (!require("visNetwork", quietly = TRUE)) {
76
+  if (!requireNamespace("visNetwork", quietly = TRUE)) {
74 77
     stop("visNetwork package needed for sconeReport()")
75 78
   }
76 79
 
77
-  if (!require("ggplot2", quietly = TRUE)) {
80
+  if (!requireNamespace("ggplot2", quietly = TRUE)) {
78 81
     stop("ggplot2 package needed for sconeReport()")
79 82
   }
80 83
 
... ...
@@ -223,18 +226,18 @@ sconeReport = function(x, methods,
223 226
                             ),
224 227
                             shiny::tabPanel("PCA",
225 228
                                      shiny::br(),
226
-                                     shiny::p(paste0("This panel shows principal ",
229
+                                shiny::p(paste0("This panel shows principal ",
227 230
                                               "component analysis ",
228 231
                                               "(PCA) results",
229 232
                                               " on different subsets ",
230 233
                                               "of genes.")),
231 234
                                      shiny::br(),
232
-                                     shiny::h6("Variance Explained (All Genes)"),
235
+                                  shiny::h6("Variance Explained (All Genes)"),
233 236
                                      shiny::p(paste0("Use this plot",
234 237
                                               " to decide on the ",
235 238
                                               "dimensionality of the reduced",
236 239
                                               " spaced used for evaluation.")),
237
-                                     shiny::plotOutput("plot_scree",width = "650px",
240
+                                shiny::plotOutput("plot_scree",width = "650px",
238 241
                                                 height = "400px"),
239 242
                                      shiny::br(),
240 243
                                      shiny::h6("2D (All Genes)"),
... ...
@@ -264,7 +267,7 @@ sconeReport = function(x, methods,
264 267
                             ),
265 268
                             shiny::tabPanel("QC",
266 269
                                      shiny::br(),
267
-                                     shiny::p(paste0("This panel shows the absolute",
270
+                              shiny::p(paste0("This panel shows the absolute",
268 271
                                               " correlations between",
269 272
                                               " Principal",
270 273
                                               " Components (PCs) of",
... ...
@@ -326,7 +329,7 @@ sconeReport = function(x, methods,
326 329
                             ),
327 330
                             shiny::tabPanel("Control Genes",
328 331
                                      shiny::br(),
329
-                                     shiny::p(paste0("Heatmap of control genes, ",
332
+                                  shiny::p(paste0("Heatmap of control genes, ",
330 333
                                               "colored by all",
331 334
                                               " three categories.")),
332 335
                                      shiny::br(),
... ...
@@ -625,7 +628,7 @@ sconeReport = function(x, methods,
625 628
              fill = cc[sort(unique(factor(strat_col())))])
626 629
     })
627 630
 
628
-    output$plot3d_base <- shiny::renderPlotly({
631
+    output$plot3d_base <- plotly::renderPlotly({
629 632
       PC1 <- PC2 <- PC3 <- NULL
630 633
       df <- setNames(data.frame(pc_obj_base()$x[,1:3]),
631 634
                      c("PC1", "PC2", "PC3"))
... ...
@@ -665,7 +668,7 @@ sconeReport = function(x, methods,
665 668
       abs(cor(qc,pc_obj_base()$x))
666 669
     })
667 670
 
668
-    output$qccorPlot <- shiny::renderPlotly({
671
+    output$qccorPlot <- plotly::renderPlotly({
669 672
       metric <- value <- PC <- NULL
670 673
 
671 674
       df = reshape2::melt(cor_qc()[,1:input$dim])
... ...
@@ -696,7 +699,7 @@ sconeReport = function(x, methods,
696 699
              fill = cc[sort(unique(factor(strat_col())))])
697 700
     })
698 701
 
699
-    output$plot3d_qc <- shiny::renderPlotly({
702
+    output$plot3d_qc <- plotly::renderPlotly({
700 703
       if(ncol(pc_obj_qc()$x) >= 3){
701 704
         PC1 <- PC2 <- PC3 <- NULL
702 705
         df <- setNames(data.frame(pc_obj_qc()$x[,1:3]),
... ...
@@ -1,120 +1,146 @@
1 1
 #' SCONE Evaluation: Evaluate an Expression Matrix
2
-#' 
3
-#' This function evaluates a (normalized) expression matrix using SCONE 
4
-#' criteria, producing 8 metrics based on i) Clustering, ii) Correlations and 
2
+#'
3
+#' This function evaluates a (normalized) expression matrix using SCONE
4
+#' criteria, producing 8 metrics based on i) Clustering, ii) Correlations and
5 5
 #' iii) Relative Expression.
6
-#' 
6
+#'
7 7
 #' @details Users may specify their own eval_proj function that will be used to
8 8
 #'   compute Clustering and Correlation metrics. This eval_proj() function must
9
-#'   have 2 input arguments: \itemize{ \item{e}{ matrix. log-transformed (+ 
10
-#'   pseudocount) expression data (genes in rows, cells in columns).} 
9
+#'   have 2 input arguments: \itemize{ \item{e}{ matrix. log-transformed (+
10
+#'   pseudocount) expression data (genes in rows, cells in columns).}
11 11
 #'   \item{eval_proj_args}{ list. additional function arguments, e.g. prior
12 12
 #'   data weights.}} and it must output a matrix representation of the original
13 13
 #'   data (cells in rows, factors in columns). The value of eval_proj_args is
14 14
 #'   passed to the user-defined function from the eval_proj_args argument of
15 15
 #'   the main score_matrix() function call.
16
-#'   
17
-#' @param expr matrix. The expression data matrix (genes in rows, cells in 
16
+#'
17
+#' @param expr matrix. The expression data matrix (genes in rows, cells in
18 18
 #'   columns).
19 19
 #' @param eval_pcs numeric. The number of principal components to use for
20 20
 #'   evaluation (Default 3). Ignored if !is.null(eval_proj).
21 21
 #' @param eval_proj function. Projection function for evaluation (see Details).
22 22
 #'   If NULL, PCA is used for projection
23
-#' @param eval_proj_args list. List of arguments passed to projection function 
23
+#' @param eval_proj_args list. List of arguments passed to projection function
24 24
 #'   as eval_proj_args (see Details).
25
-#' @param eval_kclust numeric. The number of clusters (> 1) to be used for pam 
26
-#'   tightness (PAM_SIL) evaluation. If an array of integers, largest average 
25
+#' @param eval_kclust numeric. The number of clusters (> 1) to be used for pam
26
+#'   tightness (PAM_SIL) evaluation. If an array of integers, largest average
27 27
 #'   silhouette width (tightness) will be reported in PAM_SIL. If NULL, PAM_SIL
28 28
 #'   will be returned NA.
29 29
 #' @param bio factor. A known biological condition (variation to be preserved),
30 30
 #'   NA is allowed. If NULL, condition ASW, BIO_SIL, will be returned NA.
31 31
 #' @param batch factor. A known batch variable (variation to be removed), NA is
32 32
 #'   allowed. If NULL, batch ASW, BATCH_SIL, will be returned NA.
33
-#' @param qc_factors Factors of unwanted variation derived from quality 
33
+#' @param qc_factors Factors of unwanted variation derived from quality
34 34
 #'   metrics. If NULL, qc correlations, EXP_QC_COR, will be returned NA.
35
-#' @param uv_factors Factors of unwanted variation derived from negative 
35
+#' @param uv_factors Factors of unwanted variation derived from negative
36 36
 #'   control genes (evaluation set). If NULL, uv correlations, EXP_UV_COR,
37 37
 #'   will be returned NA.
38 38
 #' @param wv_factors Factors of wanted variation derived from positive
39 39
 #'   control genes (evaluation set). If NULL, wv correlations, EXP_WV_COR,
40 40
 #'   will be returned NA.
41
-#' @param is_log logical. If TRUE the expr matrix is already logged and log 
41
+#' @param is_log logical. If TRUE the expr matrix is already logged and log
42 42
 #'   transformation will not be carried out prior to projection. Default FALSE.
43
-#' @param stratified_pam logical. If TRUE then maximum ASW is separately 
44
-#'   computed for each biological-cross-batch stratum (accepts NAs), and a 
43
+#' @param stratified_pam logical. If TRUE then maximum ASW is separately
44
+#'   computed for each biological-cross-batch stratum (accepts NAs), and a
45 45
 #'   weighted average silhouette width is returned as PAM_SIL. Default FALSE.
46
-#'   
46
+#' @param stratified_cor logical. If TRUE then cor metrics are separately
47
+#'   computed for each biological-cross-batch stratum (accepts NAs), and
48
+#'   weighted averages are returned for EXP_QC_COR, EXP_UV_COR, & EXP_WV_COR.
49
+#'   Default FALSE.
50
+#' @param stratified_rle logical. If TRUE then rle metrics are separately
51
+#'   computed for each biological-cross-batch stratum (accepts NAs), and
52
+#'   weighted averages are returned for RLE_MED & RLE_IQR. Default FALSE.
53
+#'
47 54
 #' @importFrom class knn
48 55
 #' @importFrom fpc pamk
49 56
 #' @importFrom cluster silhouette
50 57
 #' @importFrom rARPACK svds
51 58
 #' @importFrom matrixStats rowMedians colMedians colIQRs
52
-#'   
59
+#' @importFrom stats lm
60
+#'
53 61
 #' @export
54
-#' 
62
+#'
55 63
 #' @return A list with the following metrics: \itemize{ \item{BIO_SIL}{ Average
56
-#'   silhouette width by biological condition.} \item{BATCH_SIL}{ Average 
57
-#'   silhouette width by batch condition.} \item{PAM_SIL}{ Maximum average 
58
-#'   silhouette width from PAM clustering (see stratified_pam argument).} 
59
-#'   \item{EXP_QC_COR}{ Maximum squared spearman correlation between expression
60
-#'   pcs and quality factors.} \item{EXP_UV_COR}{ Maximum squared spearman 
61
-#'   correlation between expression pcs and negative control gene factors.} 
62
-#'   \item{EXP_WV_COR}{ Maximum squared spearman correlation between expression
63
-#'   pcs and positive control gene factors.} \item{RLE_MED}{ The mean squared 
64
-#'   median Relative Log Expression (RLE).} \item{RLE_IQR}{ The variance of the
65
-#'   inter-quartile range (IQR) of the RLE.} }
66
-#'   
67
-#' @examples 
68
-#' 
64
+#'   silhouette width by biological condition.} \item{BATCH_SIL}{ Average
65
+#'   silhouette width by batch condition.} \item{PAM_SIL}{ Maximum average
66
+#'   silhouette width from PAM clustering (see stratified_pam argument).}
67
+#'   \item{EXP_QC_COR}{ Coefficient of determination between expression
68
+#'   pcs and quality factors (see stratified_cor argument).} \item{EXP_UV_COR}{
69
+#'   Coefficient of determination between expression pcs and negative
70
+#'   control gene factors (see stratified_cor argument).} \item{EXP_WV_COR}{
71
+#'   Coefficient of determination between expression pcs and positive
72
+#'   control gene factors (see stratified_cor argument).} \item{RLE_MED}{ The
73
+#'   mean squared median Relative Log Expression (RLE) (see stratified_rle
74
+#'   argument).} \item{RLE_IQR}{ The variance of the inter-quartile range (IQR)
75
+#'   of the RLE (see stratified_rle argument).} }
76
+#'
77
+#' @examples
78
+#'
69 79
 #' set.seed(141)
70 80
 #' bio = as.factor(rep(c(1,2),each = 2))
71 81
 #' batch = as.factor(rep(c(1,2),2))
72 82
 #' log_expr = matrix(rnorm(20),ncol = 4)
73
-#' 
74
-#' scone_metrics = score_matrix(log_expr, 
83
+#'
84
+#' scone_metrics = score_matrix(log_expr,
75 85
 #'    bio = bio, batch = batch,
76 86
 #'    eval_kclust = 2, is_log = TRUE)
77
-#' 
87
+#'
78 88
 
79
-score_matrix <- function(expr, eval_pcs = 3,
80
-                         eval_proj = NULL, eval_proj_args = NULL,
89
+score_matrix <- function(expr,
90
+                         eval_pcs = 3,
91
+                         eval_proj = NULL,
92
+                         eval_proj_args = NULL,
81 93
                          eval_kclust = NULL,
82
-                         bio = NULL, batch = NULL,
94
+                         bio = NULL,
95
+                         batch = NULL,
83 96
                          qc_factors = NULL,
84 97
                          uv_factors = NULL,
85 98
                          wv_factors = NULL,
86
-                         is_log=FALSE, stratified_pam = FALSE){
87
-  
88
-  if(any(is.na(expr) | is.infinite(expr) | is.nan(expr))){
99
+                         is_log = FALSE,
100
+                         stratified_pam = FALSE,
101
+                         stratified_cor = FALSE,
102
+                         stratified_rle = FALSE) {
103
+  if (any(is.na(expr) | is.infinite(expr) | is.nan(expr))) {
89 104
     stop("NA/Inf/NaN Expression Values.")
90 105
   }
91 106
   
92
-  if(!is_log) {
107
+  if (!is_log) {
93 108
     expr <- log1p(expr)
94 109
   }
95 110
   
96
-  # The svd we do below on expr throws an exception if 
97
-  # expr created by one of the normalizations has a 
111
+  # The svd we do below on expr throws an exception if
112
+  # expr created by one of the normalizations has a
98 113
   # constant feature (=gene, i.e. row)
99
-  constantFeatures = apply(expr, 1, function(x) max(x)-min(x)) < 1e-3
100
-  if(any(constantFeatures)) {
101
-    warning(sprintf(paste0("scone_eval: expression matrix ",
102
-                           "contained %d constant features (rows) ",
103
-                           "---> excluding them"), sum(constantFeatures)))
104
-    expr = expr[!constantFeatures, ]
114
+  constantFeatures = apply(expr, 1, function(x)
115
+    max(x) - min(x)) < 1e-3
116
+  if (any(constantFeatures)) {
117
+    warning(sprintf(
118
+      paste0(
119
+        "scone_eval: expression matrix ",
120
+        "contained %d constant features (rows) ",
121
+        "---> excluding them"
122
+      ),
123
+      sum(constantFeatures)
124
+    ))
125
+    expr = expr[!constantFeatures,]
105 126
   }
106 127
   
107
-  if(is.null(eval_proj)){
128
+  if (is.null(eval_proj)) {
108 129
     proj = tryCatch({
109
-      svds(scale(t(expr), center = TRUE, scale = TRUE),
110
-           k=eval_pcs, nu=eval_pcs, nv=0)$u},
111
-      error = function(e) {
112
-        stop("scone_eval: svd failed")
113
-      })
130
+      svds(
131
+        scale(t(expr), center = TRUE, scale = TRUE),
132
+        k = eval_pcs,
133
+        nu = eval_pcs,
134
+        nv = 0
135
+      )$u
136
+    },
137
+    error = function(e) {
138
+      stop("scone_eval: svd failed")
139
+    })
114 140
     
115 141
     
116 142
   } else {
117
-    proj = eval_proj(expr,eval_proj_args = eval_proj_args)
143
+    proj = eval_proj(expr, eval_proj_args = eval_proj_args)
118 144
     eval_pcs = ncol(proj)
119 145
   }
120 146
   
... ...
@@ -123,17 +149,21 @@ score_matrix <- function(expr, eval_pcs = 3,
123 149
   
124 150
   # Biological Condition
125 151
   
126
-  if( !is.null(bio) ) {
127
-    if(!all(is.na(bio))) {
128
-      if(length(unique(bio)) > 1) {
152
+  if (!is.null(bio)) {
153
+    if (!all(is.na(bio))) {
154
+      if (length(unique(bio)) > 1) {
129 155
         BIO_SIL = summary(cluster::silhouette(as.numeric(na.omit(bio)),
130 156
                                               dd[!is.na(bio),
131 157
                                                  !is.na(bio)]))$avg.width
132 158
       } else {
133 159
         BIO_SIL = NA
134
-        warning(paste0("after exclusion of samples, ",
135
-                       "only one bio remains, BATCH_BIO",
136
-                       " is undefined"))
160
+        warning(
161
+          paste0(
162
+            "after exclusion of samples, ",
163
+            "only one bio remains, BATCH_BIO",
164
+            " is undefined"
165
+          )
166
+        )
137 167
       }
138 168
     } else {
139 169
       BIO_SIL = NA
... ...
@@ -145,17 +175,21 @@ score_matrix <- function(expr, eval_pcs = 3,
145 175
   
146 176
   # Batch Condition
147 177
   
148
-  if(!is.null(batch)) {
149
-    if(!all(is.na(batch))) {
150
-      if(length(unique(batch)) > 1) {
178
+  if (!is.null(batch)) {
179
+    if (!all(is.na(batch))) {
180
+      if (length(unique(batch)) > 1) {
151 181
         BATCH_SIL <- summary(cluster::silhouette(as.numeric(na.omit(batch)),
152 182
                                                  dd[!is.na(batch),
153 183
                                                     !is.na(batch)]))$avg.width
154 184
       } else {
155 185
         BATCH_SIL <- NA
156
-        warning(paste0("after exclusion of samples,",
157
-                       " only one batch remains, ",
158
-                       "BATCH_SIL is undefined"))
186
+        warning(
187
+          paste0(
188
+            "after exclusion of samples,",
189
+            " only one batch remains, ",
190
+            "BATCH_SIL is undefined"
191
+          )
192
+        )
159 193
       }
160 194
     } else{
161 195
       BATCH_SIL <- NA
... ...
@@ -167,94 +201,197 @@ score_matrix <- function(expr, eval_pcs = 3,
167 201
   
168 202
   ## ------ PAM Tightness -----
169 203
   
170
-  if ( !is.null(eval_kclust) ){
171
-    
204
+  if (!is.null(eval_kclust)) {
172 205
     # "Stratified" PAM
173 206
     
174
-    if(stratified_pam){
175
-      
176
-      biobatch = as.factor(paste(bio,batch,sep = "_"))
207
+    if (stratified_pam) {
208
+      biobatch = as.factor(paste(bio, batch, sep = "_"))
177 209
       PAM_SIL = 0
178 210
       
179 211
       # Max Average Sil Width per bio-cross-batch Condition
180
-      for (cond in levels(biobatch)){
212
+      for (cond in levels(biobatch)) {
181 213
         is_cond = which(biobatch == cond)
182 214
         cond_w = length(is_cond)
183
-        if(cond_w > max(eval_kclust)){
184
-          pamk_object = pamk(proj[is_cond,],krange = eval_kclust)
215
+        if (cond_w > max(eval_kclust)) {
216
+          pamk_object = pamk(proj[is_cond, ], krange = eval_kclust)
185 217
           
186
-          # Despite krange excluding nc = 1, 
218
+          # Despite krange excluding nc = 1,
187 219
           # if asw is negative, nc = 1 will be selected
188
-          if(is.null(pamk_object$pamobject$silinfo$avg.width) ){
220
+          if (is.null(pamk_object$pamobject$silinfo$avg.width)) {
189 221
             if (!1 %in% eval_kclust) {
190
-              pamk_object$pamobject$silinfo$avg.width = max(
191
-                pamk_object$crit[1:max(eval_kclust) %in% eval_kclust]
192
-              )
193
-            }else{
194
-              stop(paste0("nc = 1 was selected by Duda-Hart,",
195
-                          " exclude 1 from eval_kclust."))
222
+              pamk_object$pamobject$silinfo$avg.width = 
223
+                max(pamk_object$crit[1:max(eval_kclust) %in% eval_kclust])
224
+            } else{
225
+              stop(paste0(
226
+                "nc = 1 was selected by Duda-Hart,",
227
+                " exclude 1 from eval_kclust."
228
+              ))
196 229
             }
197 230
           }
198 231
           
199
-          PAM_SIL = PAM_SIL + cond_w*pamk_object$pamobject$silinfo$avg.width
200
-        }else{
201
-          stop(paste(paste0("Number of clusters 'k' must be ",
202
-                            "smaller than bio-cross-batch stratum size:"),
203
-                     paste(levels(biobatch),
204
-                           table(biobatch),
205
-                           sep = " = ",collapse = ", ")))
232
+          PAM_SIL = PAM_SIL + cond_w * pamk_object$pamobject$silinfo$avg.width
233
+        } else{
234
+          stop(paste(
235
+            paste0(
236
+              "Number of clusters 'k' must be ",
237
+              "smaller than bio-cross-batch stratum size:"
238
+            ),
239
+            paste(
240
+              levels(biobatch),
241
+              table(biobatch),
242
+              sep = " = ",
243
+              collapse = ", "
244
+            )
245
+          ))
206 246
         }
207 247
       }
208
-      PAM_SIL = PAM_SIL/length(biobatch)
248
+      PAM_SIL = PAM_SIL / length(biobatch)
209 249
       
210 250
       # Traditional PAM
211 251
       
212
-    }else{
213
-      pamk_object = pamk(proj,krange = eval_kclust)
252
+    } else{
253
+      pamk_object = pamk(proj, krange = eval_kclust)
214 254
       PAM_SIL = pamk_object$pamobject$silinfo$avg.width
215 255
     }
216 256
     
217
-  }else{
257
+  } else{
218 258
     PAM_SIL = NA
219 259
   }
220 260
   
221 261
   ## ------ Correlation with Factors -----
222
-  
223
-  # Max cor with quality factors.
224
-  if(!is.null(qc_factors)){
225
-    EXP_QC_COR = max(cor(proj,qc_factors,method = "spearman")^2)
226
-  }else{
227
-    EXP_QC_COR = NA
228
-  }
229
-  
230
-  # Max cor with UV factors.
231
-  if(!is.null(uv_factors)){
232
-    EXP_UV_COR = max(cor(proj,uv_factors,method = "spearman")^2)
233
-  }else{
234
-    EXP_UV_COR = NA
235
-  }
236
-  
237
-  # Max cor with WV factors.
238
-  if(!is.null(wv_factors)){
239
-    EXP_WV_COR = max(cor(proj,wv_factors,method = "spearman")^2)
240
-  }else{
241
-    EXP_WV_COR = NA
262
+  if (stratified_cor) {
263
+    biobatch = as.factor(paste(bio, batch, sep = "_"))
264
+    EXP_QC_COR <- EXP_UV_COR <- EXP_WV_COR <- 0
265
+    
266
+    for (cond in levels(biobatch)) {
267
+      is_cond = which(biobatch == cond)
268
+      cond_w = length(is_cond)
269
+      
270
+      # Max cor with quality factors.
271
+      if (!is.null(qc_factors)) {
272
+        EXP_QC_COR <-
273
+          EXP_QC_COR + cond_w * (1 - sum(unlist(apply(proj[is_cond,], 2,
274
+                                                      function(y) {
275
+            lm(y ~ qc_factors[is_cond,])$residual
276
+          })) ^ 2) / sum(scale(proj[is_cond,], scale = FALSE) ^ 2))
277
+      }
278
+      
279
+      # Max cor with UV factors.
280
+      if (!is.null(uv_factors)) {
281
+        EXP_UV_COR  <-
282
+          EXP_UV_COR + cond_w * (1 - sum(unlist(apply(proj[is_cond,], 2,
283
+                                                      function(y) {
284
+            lm(y ~ uv_factors[is_cond,])$residual
285
+          })) ^ 2) / sum(scale(proj[is_cond,], scale = FALSE) ^ 2))
286
+      }
287
+      
288
+      # Max cor with WV factors.
289
+      if (!is.null(wv_factors)) {
290
+        EXP_WV_COR <-
291
+          EXP_WV_COR + cond_w * (1 - sum(unlist(apply(proj[is_cond,], 2,
292
+                                                      function(y) {
293
+            lm(y ~ wv_factors[is_cond,])$residual
294
+          })) ^ 2) / sum(scale(proj[is_cond,], scale = FALSE) ^ 2))
295
+      }
296
+      
297
+    }
298
+    
299
+    if (!is.null(qc_factors)) {
300
+      EXP_QC_COR <- EXP_QC_COR / length(biobatch)
301
+    } else{
302
+      EXP_QC_COR <- NA
303
+    }
304
+    if (!is.null(uv_factors)) {
305
+      EXP_UV_COR <- EXP_UV_COR / length(biobatch)
306
+    } else{
307
+      EXP_UV_COR <- NA
308
+    }
309
+    if (!is.null(wv_factors)) {
310
+      EXP_WV_COR <- EXP_WV_COR / length(biobatch)
311
+    } else{
312
+      EXP_WV_COR <- NA
313
+    }
314
+    
315
+  } else{
316
+    # Max cor with quality factors.
317
+    if (!is.null(qc_factors)) {
318
+      EXP_QC_COR <- 1 - sum(unlist(apply(proj, 2, function(y) {
319
+        lm(y ~ qc_factors)$residual
320
+        })) ^ 2) / sum(scale(proj, scale = FALSE) ^ 2)
321
+    } else{
322
+      EXP_QC_COR = NA
323
+    }
324
+    
325
+    # Max cor with UV factors.
326
+    if (!is.null(uv_factors)) {
327
+      EXP_UV_COR <- 1 - sum(unlist(apply(proj, 2, function(y) {
328
+        lm(y ~ uv_factors)$residual
329
+      })) ^ 2) / sum(scale(proj, scale = FALSE) ^ 2)
330
+    } else{
331
+      EXP_UV_COR = NA
332
+    }
333
+    
334
+    # Max cor with WV factors.
335
+    if (!is.null(wv_factors)) {
336
+      EXP_WV_COR <- 1 - sum(unlist(apply(proj, 2, function(y) {
337
+        lm(y ~ wv_factors)$residual
338
+      })) ^ 2) / sum(scale(proj, scale = FALSE) ^ 2)
339
+    } else{
340
+      EXP_WV_COR = NA
341
+    }
342
+    
242 343
   }
243 344
   
244 345
   ## ----- RLE Measures
245
-  rle <- expr - rowMedians(expr)
246
-  
247
-  # Non-Zero Median RLE
248
-  RLE_MED <- mean(colMedians(rle)^2) # Variance of the median
249
-  
250
-  # Variable IQR RLE
251
-  RLE_IQR <- var(colIQRs(rle)) # Variance of the IQR
346
+  if (stratified_rle) {
347
+    biobatch = as.factor(paste(bio, batch, sep = "_"))
348
+    RLE_MED <- RLE_IQR <- 0
349
+    
350
+    for (cond in levels(biobatch)) {
351
+      is_cond = which(biobatch == cond)
352
+      cond_w = length(is_cond)
353
+      rle <- expr[, is_cond] - rowMedians(expr[, is_cond])
354
+      
355
+      # Non-Zero Median RLE
356
+      RLE_MED <-
357
+        RLE_MED + cond_w * mean(colMedians(rle) ^ 2) # Var of the med
358
+      
359
+      # Variable IQR RLE
360
+      RLE_IQR <-
361
+        RLE_IQR + cond_w * var(colIQRs(rle)) # Variance of the IQR
362
+      
363
+    }
364
+    
365
+    RLE_MED <- RLE_MED / length(biobatch)
366
+    RLE_IQR <- RLE_IQR / length(biobatch)
367
+    
368
+  } else{
369
+    rle <- expr - rowMedians(expr)
370
+    
371
+    # Non-Zero Median RLE
372
+    RLE_MED <- mean(colMedians(rle) ^ 2) # Variance of the median
373
+    
374
+    # Variable IQR RLE
375
+    RLE_IQR <- var(colIQRs(rle)) # Variance of the IQR
376
+  }
252 377
   
253
-  scores = c(BIO_SIL, BATCH_SIL, PAM_SIL,
254
-             EXP_QC_COR, EXP_UV_COR, EXP_WV_COR,
255
-             RLE_MED, RLE_IQR)
256
-  names(scores) = c("BIO_SIL", "BATCH_SIL", "PAM_SIL",
257
-                    "EXP_QC_COR", "EXP_UV_COR", "EXP_WV_COR",
258
-                    "RLE_MED", "RLE_IQR")
378
+  scores = c(BIO_SIL,
379
+             BATCH_SIL,
380
+             PAM_SIL,
381
+             EXP_QC_COR,
382
+             EXP_UV_COR,
383
+             EXP_WV_COR,
384
+             RLE_MED,
385
+             RLE_IQR)
386
+  names(scores) = c(
387
+    "BIO_SIL",
388
+    "BATCH_SIL",
389
+    "PAM_SIL",
390
+    "EXP_QC_COR",
391
+    "EXP_UV_COR",
392
+    "EXP_WV_COR",
393
+    "RLE_MED",
394
+    "RLE_IQR"
395
+  )
259 396
   return(scores)
260 397
 }
... ...
@@ -50,7 +50,13 @@
50 50
 #' @param stratified_pam logical. If TRUE then maximum ASW for PAM_SIL is
51 51
 #'   separately computed for each biological-cross-batch stratum (accepting
52 52
 #'   NAs), and a weighted average is returned as PAM_SIL.
53
-#'   
53
+#' @param stratified_cor logical. If TRUE then cor metrics are separately
54
+#'   computed for each biological-cross-batch stratum (accepts NAs), and
55
+#'   weighted averages are returned for EXP_QC_COR, EXP_UV_COR, & EXP_WV_COR.
56
+#'   Default FALSE.
57
+#' @param stratified_rle logical. If TRUE then rle metrics are separately 
58
+#'   computed for each biological-cross-batch stratum (accepts NAs), and
59
+#'   weighted averages are returned for RLE_MED & RLE_IQR. Default FALSE.
54 60
 #' @param verbose logical. If TRUE some messagges are printed.
55 61
 #' @param run logical. If FALSE the normalization and evaluation are not run,
56 62
 #'   but normalization parameters are returned in the output object for 
... ...
@@ -169,14 +175,16 @@ setMethod(
169 175
                         eval_proj = NULL,
170 176
                         eval_proj_args = NULL, eval_kclust=2:10,
171 177
                         verbose=FALSE, stratified_pam = FALSE,
178
+                        stratified_cor = FALSE,
179
+                        stratified_rle = FALSE,
172 180
                         return_norm = c("no", "in_memory", "hdf5"),
173 181
                         hdf5file,
174 182
                         bpparam=BiocParallel::bpparam()) {
175
-    
176
-    
177
-    rezero = (zero %in% c("preadjust","strong"))
178
-    fixzero = (zero %in% c("postadjust","strong"))
179
-    
183
+
184
+    zero <- match.arg(zero)
185
+    rezero <- (zero %in% c("preadjust","strong"))
186
+    fixzero <- (zero %in% c("postadjust","strong"))
187
+
180 188
     if(x@is_log) {
181 189
       stop("At the moment, scone is implemented only for non-log counts.")
182 190
     }
... ...
@@ -329,6 +337,18 @@ setMethod(
329 337
         stop("'eval_kclust' must be less than the number of samples.")
330 338
       }
331 339
       
340
+      if(stratified_rle) {
341
+        if(is.null(bio) & is.null(batch)){
342
+          stop("For stratified_rle, bio and/or batch must be specified")
343
+        }
344
+      }
345
+      
346
+      if(stratified_cor) {
347
+        if(is.null(bio) & is.null(batch)){
348
+          stop("For stratified_cor, bio and/or batch must be specified")
349
+        }
350
+      }
351
+      
332 352
       if(!is.null(eval_kclust) & stratified_pam) {
333 353
         if(is.null(bio) & is.null(batch)){
334 354
           stop("For stratified_pam, bio and/or batch must be specified")
... ...
@@ -539,7 +559,10 @@ setMethod(
539 559
                               bio = bio, batch = batch,
540 560
                               qc_factors = qc_pcs, 
541 561
                               uv_factors = uv_factors, wv_factors = wv_factors,
542
-                              is_log = TRUE, stratified_pam = stratified_pam)
562
+                              stratified_pam = stratified_pam,
563
+                              stratified_cor = stratified_cor,
564
+                              stratified_rle = stratified_rle,
565
+                              is_log = TRUE)
543 566
       } else {
544 567
         score <- NULL
545 568
       }
... ...
@@ -296,8 +296,8 @@ impute_null <- function(expression,impute_args) {
296 296
 #'   \item{p_nodrop}{ 1 - the probability P(drop|Y), useful as weights in 
297 297
 #'   weighted PCA} \item{expected_state}{ the expected
298 298
 #'   value E[Z] (1 = "on")} \item{loglik}{ the log-likelihood} 
299
-#'   \item{convergence}{for all genes, 0 if the algorithm converged and 1 if maxiter was 
300
-#'   reached} }
299
+#'   \item{convergence}{for all genes, 0 if the algorithm converged and
300
+#'   1 if maxiter was reached} }
301 301
 #'   
302 302
 #' @examples
303 303
 #' mat <- matrix(rpois(1000, lambda = 3), ncol=10)
... ...
@@ -1,3 +1,10 @@
1
+Changes in version 1.1.3 (2018-10-19)
2
+========================
3
+
4
+* Modified COR scores to reflect R^2 for regression on UV/WV/QC rather than max correlation.
5
+* Bug fix for shiny report.
6
+* Updated vignette.
7
+
1 8
 Changes in version 1.1.2 (2017-7-3)
2 9
 ========================
3 10
 
... ...
@@ -2,7 +2,8 @@
2 2
 % Please edit documentation in R/SCONE_DEFAULTS.R
3 3
 \name{DESEQ_FN}
4 4
 \alias{DESEQ_FN}
5
-\title{Relative log-expression (RLE; DESeq) scaling normalization wrapper function}
5
+\title{Relative log-expression (RLE; DESeq) scaling normalization wrapper
6
+function}
6 7
 \usage{
7 8
 DESEQ_FN(ei)
8 9
 }
... ...
@@ -13,7 +14,8 @@ DESEQ_FN(ei)
13 14
 RLE normalized matrix.
14 15
 }
15 16
 \description{
16
-Relative log-expression (RLE; DESeq) scaling normalization wrapper function
17
+Relative log-expression (RLE; DESeq) scaling normalization wrapper
18
+function
17 19
 }
18 20
 \details{
19 21
 SCONE scaling wrapper for \code{\link[edgeR]{calcNormFactors}}).
... ...
@@ -40,8 +40,8 @@ a list with the following elements: \itemize{ \item{W}{ coefficients
40 40
   \item{p_nodrop}{ 1 - the probability P(drop|Y), useful as weights in 
41 41
   weighted PCA} \item{expected_state}{ the expected
42 42
   value E[Z] (1 = "on")} \item{loglik}{ the log-likelihood} 
43
-  \item{convergence}{for all genes, 0 if the algorithm converged and 1 if maxiter was 
44
-  reached} }
43
+  \item{convergence}{for all genes, 0 if the algorithm converged and
44
+  1 if maxiter was reached} }
45 45
 }
46 46
 \description{
47 47
 This function implements Newton's method for solving zero of 
... ...
@@ -15,8 +15,8 @@ scone(x, ...)
15 15
   adjust_batch = c("no", "yes", "force"), run = TRUE, evaluate = TRUE,
16 16
   eval_pcs = 3, eval_proj = NULL, eval_proj_args = NULL,
17 17
   eval_kclust = 2:10, verbose = FALSE, stratified_pam = FALSE,
18
-  return_norm = c("no", "in_memory", "hdf5"), hdf5file,
19
-  bpparam = BiocParallel::bpparam())
18
+  stratified_cor = FALSE, stratified_rle = FALSE, return_norm = c("no",
19
+  "in_memory", "hdf5"), hdf5file, bpparam = BiocParallel::bpparam())
20 20
 }
21 21
 \arguments{
22 22
 \item{x}{a \code{\link{SconeExperiment}} object.}
... ...
@@ -77,6 +77,15 @@ If NULL, tightness will be returned NA.}
77 77
 separately computed for each biological-cross-batch stratum (accepting
78 78
 NAs), and a weighted average is returned as PAM_SIL.}
79 79
 
80
+\item{stratified_cor}{logical. If TRUE then cor metrics are separately
81
+computed for each biological-cross-batch stratum (accepts NAs), and
82
+weighted averages are returned for EXP_QC_COR, EXP_UV_COR, & EXP_WV_COR.
83
+Default FALSE.}
84
+
85
+\item{stratified_rle}{logical. If TRUE then rle metrics are separately 
86
+computed for each biological-cross-batch stratum (accepts NAs), and
87
+weighted averages are returned for RLE_MED & RLE_IQR. Default FALSE.}
88
+
80 89
 \item{return_norm}{character. If "no" the normalized values will not be
81 90
 returned with the output object. This will create a much smaller object
82 91
 and may be useful for large datasets and/or when many combinations are
... ...
@@ -7,10 +7,10 @@
7 7
 score_matrix(expr, eval_pcs = 3, eval_proj = NULL, eval_proj_args = NULL,
8 8
   eval_kclust = NULL, bio = NULL, batch = NULL, qc_factors = NULL,
9 9
   uv_factors = NULL, wv_factors = NULL, is_log = FALSE,
10
-  stratified_pam = FALSE)
10
+  stratified_pam = FALSE, stratified_cor = FALSE, stratified_rle = FALSE)
11 11
 }
12 12
 \arguments{
13
-\item{expr}{matrix. The expression data matrix (genes in rows, cells in 
13
+\item{expr}{matrix. The expression data matrix (genes in rows, cells in
14 14
 columns).}
15 15
 
16 16
 \item{eval_pcs}{numeric. The number of principal components to use for
... ...
@@ -19,11 +19,11 @@ evaluation (Default 3). Ignored if !is.null(eval_proj).}
19 19
 \item{eval_proj}{function. Projection function for evaluation (see Details).
20 20
 If NULL, PCA is used for projection}
21 21
 
22
-\item{eval_proj_args}{list. List of arguments passed to projection function 
22
+\item{eval_proj_args}{list. List of arguments passed to projection function
23 23
 as eval_proj_args (see Details).}
24 24
 
25
-\item{eval_kclust}{numeric. The number of clusters (> 1) to be used for pam 
26
-tightness (PAM_SIL) evaluation. If an array of integers, largest average 
25
+\item{eval_kclust}{numeric. The number of clusters (> 1) to be used for pam
26
+tightness (PAM_SIL) evaluation. If an array of integers, largest average
27 27
 silhouette width (tightness) will be reported in PAM_SIL. If NULL, PAM_SIL
28 28
 will be returned NA.}
29 29
 
... ...
@@ -33,10 +33,10 @@ NA is allowed. If NULL, condition ASW, BIO_SIL, will be returned NA.}
33 33
 \item{batch}{factor. A known batch variable (variation to be removed), NA is
34 34
 allowed. If NULL, batch ASW, BATCH_SIL, will be returned NA.}
35 35
 
36
-\item{qc_factors}{Factors of unwanted variation derived from quality 
36
+\item{qc_factors}{Factors of unwanted variation derived from quality
37 37
 metrics. If NULL, qc correlations, EXP_QC_COR, will be returned NA.}
38 38
 
39
-\item{uv_factors}{Factors of unwanted variation derived from negative 
39
+\item{uv_factors}{Factors of unwanted variation derived from negative
40 40
 control genes (evaluation set). If NULL, uv correlations, EXP_UV_COR,
41 41
 will be returned NA.}
42 42
 
... ...
@@ -44,36 +44,47 @@ will be returned NA.}
44 44
 control genes (evaluation set). If NULL, wv correlations, EXP_WV_COR,
45 45
 will be returned NA.}
46 46
 
47
-\item{is_log}{logical. If TRUE the expr matrix is already logged and log 
47
+\item{is_log}{logical. If TRUE the expr matrix is already logged and log
48 48
 transformation will not be carried out prior to projection. Default FALSE.}
49 49
 
50
-\item{stratified_pam}{logical. If TRUE then maximum ASW is separately 
51
-computed for each biological-cross-batch stratum (accepts NAs), and a 
50
+\item{stratified_pam}{logical. If TRUE then maximum ASW is separately
51
+computed for each biological-cross-batch stratum (accepts NAs), and a
52 52
 weighted average silhouette width is returned as PAM_SIL. Default FALSE.}
53
+
54
+\item{stratified_cor}{logical. If TRUE then cor metrics are separately
55
+computed for each biological-cross-batch stratum (accepts NAs), and
56
+weighted averages are returned for EXP_QC_COR, EXP_UV_COR, & EXP_WV_COR.
57
+Default FALSE.}
58
+
59
+\item{stratified_rle}{logical. If TRUE then rle metrics are separately
60
+computed for each biological-cross-batch stratum (accepts NAs), and
61
+weighted averages are returned for RLE_MED & RLE_IQR. Default FALSE.}
53 62
 }
54 63
 \value{
55 64
 A list with the following metrics: \itemize{ \item{BIO_SIL}{ Average
56
-  silhouette width by biological condition.} \item{BATCH_SIL}{ Average 
57
-  silhouette width by batch condition.} \item{PAM_SIL}{ Maximum average 
58
-  silhouette width from PAM clustering (see stratified_pam argument).} 
59
-  \item{EXP_QC_COR}{ Maximum squared spearman correlation between expression
60
-  pcs and quality factors.} \item{EXP_UV_COR}{ Maximum squared spearman 
61
-  correlation between expression pcs and negative control gene factors.} 
62
-  \item{EXP_WV_COR}{ Maximum squared spearman correlation between expression
63
-  pcs and positive control gene factors.} \item{RLE_MED}{ The mean squared 
64
-  median Relative Log Expression (RLE).} \item{RLE_IQR}{ The variance of the
65
-  inter-quartile range (IQR) of the RLE.} }
65
+  silhouette width by biological condition.} \item{BATCH_SIL}{ Average
66
+  silhouette width by batch condition.} \item{PAM_SIL}{ Maximum average
67
+  silhouette width from PAM clustering (see stratified_pam argument).}
68
+  \item{EXP_QC_COR}{ Coefficient of determination between expression
69
+  pcs and quality factors (see stratified_cor argument).} \item{EXP_UV_COR}{
70
+  Coefficient of determination between expression pcs and negative
71
+  control gene factors (see stratified_cor argument).} \item{EXP_WV_COR}{
72
+  Coefficient of determination between expression pcs and positive
73
+  control gene factors (see stratified_cor argument).} \item{RLE_MED}{ The
74
+  mean squared median Relative Log Expression (RLE) (see stratified_rle
75
+  argument).} \item{RLE_IQR}{ The variance of the inter-quartile range (IQR)
76
+  of the RLE (see stratified_rle argument).} }
66 77
 }
67 78
 \description{
68
-This function evaluates a (normalized) expression matrix using SCONE 
69
-criteria, producing 8 metrics based on i) Clustering, ii) Correlations and 
79
+This function evaluates a (normalized) expression matrix using SCONE
80
+criteria, producing 8 metrics based on i) Clustering, ii) Correlations and
70 81
 iii) Relative Expression.
71 82
 }
72 83
 \details{
73 84
 Users may specify their own eval_proj function that will be used to
74 85
   compute Clustering and Correlation metrics. This eval_proj() function must
75
-  have 2 input arguments: \itemize{ \item{e}{ matrix. log-transformed (+ 
76
-  pseudocount) expression data (genes in rows, cells in columns).} 
86
+  have 2 input arguments: \itemize{ \item{e}{ matrix. log-transformed (+
87
+  pseudocount) expression data (genes in rows, cells in columns).}
77 88
   \item{eval_proj_args}{ list. additional function arguments, e.g. prior
78 89
   data weights.}} and it must output a matrix representation of the original
79 90
   data (cells in rows, factors in columns). The value of eval_proj_args is
... ...
@@ -87,7 +98,7 @@ bio = as.factor(rep(c(1,2),each = 2))
87 98
 batch = as.factor(rep(c(1,2),2))
88 99
 log_expr = matrix(rnorm(20),ncol = 4)
89 100
 
90
-scone_metrics = score_matrix(log_expr, 
101
+scone_metrics = score_matrix(log_expr,
91 102
    bio = bio, batch = batch,
92 103
    eval_kclust = 2, is_log = TRUE)
93 104
 
... ...
@@ -617,16 +617,16 @@ negative (lower is better) signature.
617 617
   expression PCs. 
618 618
   Positive signature.
619 619
 * EXP_QC_COR: Removal of Alignment Artifacts. 
620
-  Maximum squared Spearman correlation between first 3 expression PCs and 
620
+  R^2 measure for regression of first 3 expression PCs on 
621 621
   first `k_qc` QPCs. 
622 622
   Negative signature.
623 623
 * EXP_UV_COR: Removal of Expression Artifacts. 
624
-  Maximum squared Spearman correlation between first 3 expression PCs and 
624
+  R^2 measure for regression of first 3 expression PCs on 
625 625
   first 3 PCs of the negative control (specified by `eval_negcon` or 
626 626
   `ruv_negcon` by default) sub-matrix of the original (raw) data. 
627 627
   Negative signature.
628 628
 * EXP_WV_COR: Preservation of Biological Variance.
629
-  Maximum squared Spearman correlation between first 3 expression PCs and
629
+  R^2 measure for regression of first 3 expression PCs on 
630 630
   first 3 PCs of the positive control (specified by `eval_poscon`) 
631 631
   sub-matrix of the original (raw) data. 
632 632
   Positive signature.
... ...
@@ -648,7 +648,7 @@ my_scone <- scone(my_scone,
648 648
                   run=TRUE,
649 649
                   eval_kclust = 2:6,stratified_pam = TRUE,
650 650
                   return_norm = "in_memory",
651
-                  zero = "preadjust")
651
+                  zero = "postadjust")
652 652
 
653 653
 ```
654 654
 
... ...
@@ -665,9 +665,9 @@ In the call above, we have set the following parameter arguments:
665 665
   Store all normalized matrices in addition to evaluation data. 
666 666
   Otherwise normalized data is not returned in the resulting 
667 667
   object.
668
-* zero = "preadjust". 
669
-  Restore data entries that are originally zeroes back to zero
670
-  after the scaling step.
668
+* zero = "postadjust". 
669
+  Restore data entries that are originally zeroes / negative after 
670
+  normalization to zero after the adjustment step.
671 671
 
672 672
 The output will contain various updated elements:
673 673
 
... ...
@@ -695,7 +695,7 @@ be recomputed.
695 695
 # Step 3: Selecting a normalization for downstream analysis
696 696
 
697 697
 Based on our sorting criteria, it would appear that
698
-`none,fq,no_uv,no_bio,no_batch` performs well compared to other normalization
698
+`none,uq,ruv_k=1,no_bio,no_batch` performs well compared to other normalization
699 699
 workflows. A useful way to visualize this method with respect to others is the
700 700
 `biplot_color` function
701 701