Browse code

PAM and other methods

neobernad authored on 09/03/2021 16:03:30
Showing1 changed files
... ...
@@ -144,17 +144,21 @@ plotMetricsBoxplot <- function(data) {
144 144
 #' object as a boxplot.
145 145
 #'
146 146
 #' @inheritParams stability
147
+#' @param scale Boolean. If true input data is scaled. Default: FALSE.
147 148
 #'
148
-#' @return Nothing.
149
+#' @return An hclust object.
149 150
 #'
150 151
 #' @examples
151 152
 #' # Using example data from our package
152 153
 #' data("ontMetrics")
153
-#' plotMetricsCluster(ontMetrics)
154
+#' plotMetricsCluster(ontMetrics, scale=TRUE)
154 155
 #'
155
-plotMetricsCluster <- function(data) {
156
+plotMetricsCluster <- function(data, scale=FALSE) {
156 157
   data <- as.data.frame(assay(data))
157 158
   data.metrics = data[,-1] # Removing Description column
159
+  if (isTRUE(scale)) {
160
+    data.metrics = base::scale(data.metrics)
161
+  }
158 162
   d <- dist(t(data.metrics), method = "euclidean") # distance matrix
159 163
   fit <- hclust(d, method="ward.D2")
160 164
   theme_set(theme_bw())
... ...
@@ -164,6 +168,7 @@ plotMetricsCluster <- function(data) {
164 168
     ) +
165 169
     labs(title="Metrics dendrogram")
166 170
   print(p)
171
+  return(fit)
167 172
 }
168 173
 
169 174
 #' @title Metric values as violin plot.
... ...
@@ -306,7 +311,7 @@ getOptimalKValue <- function(stabData, qualData, k.range=NULL) {
306 311
   STABLE_CLASS = 0.75
307 312
 
308 313
   outputTable = as.data.frame(metrics)
309
-  rownames(outputTable) = metrics
314
+  #rownames(outputTable) = metrics
310 315
   outputTable = outputTable[, -1]
311 316
   optimalKs = list()
312 317
   stabMaxKs = list() # List of maximum K for the stability of metric X
... ...
@@ -317,7 +322,7 @@ getOptimalKValue <- function(stabData, qualData, k.range=NULL) {
317 322
   qualMaxKsQuality = list() # Quality of the current K in qualMaxKs
318 323
 
319 324
   for (metric in metrics) {
320
-    cat("Processing metric: ", metric, "\n")
325
+    message("Processing metric: ", metric, "\n")
321 326
     stabMaxK = colnames(stabDf[metric, ])[apply(stabDf[metric, ],1,which.max)] # ks
322 327
     stabMaxKFormatted = getFormattedK(stabMaxK)
323 328
     stabMaxVal = stabDf[metric, stabMaxK]
... ...
@@ -336,50 +341,50 @@ getOptimalKValue <- function(stabData, qualData, k.range=NULL) {
336 341
     # CASE 1: ks == kg
337 342
     if (identical(stabMaxK, qualMaxK)) {
338 343
       k = stabMaxKFormatted
339
-      cat("\tMaximum stability and quality values matches the same K value: '", k ,"'\n")
344
+      message("\tMaximum stability and quality values matches the same K value: '", k ,"'\n")
340 345
       optimalKs = append(optimalKs, k)
341 346
     } else {
342 347
       # CASE 2: ks != kg
343 348
       if (stabMaxVal > STABLE_CLASS && stabDf[metric, qualMaxK] > STABLE_CLASS) {
344 349
         # Both stables
345
-        cat("\tBoth Ks have a stable classification: '",
350
+        message("\tBoth Ks have a stable classification: '",
346 351
             stabMaxKFormatted, "', '", qualMaxKFormatted ,"'\n")
347 352
         k = qualMaxKFormatted
348 353
         optimalKs = append(optimalKs, k)
349
-        cat("\tUsing '", k, "' since it provides higher silhouette width\n")
354
+        message("\tUsing '", k, "' since it provides higher silhouette width\n")
350 355
       } else {
351 356
         if (stabMaxVal <= STABLE_CLASS && stabDf[metric, qualMaxK] <= STABLE_CLASS) {
352 357
           # Both not stables: S_ks <= 0.75 && S_kg <= 0.75
353
-          cat("\tBoth Ks do not have a stable classification: '",
358
+          message("\tBoth Ks do not have a stable classification: '",
354 359
               stabMaxKFormatted, "', '", qualMaxKFormatted ,"'\n")
355 360
           k = qualMaxKFormatted
356 361
           optimalKs = append(optimalKs, k)
357
-          cat("\tUsing '", k, "' since it provides higher silhouette width\n")
362
+          message("\tUsing '", k, "' since it provides higher silhouette width\n")
358 363
         } else {
359 364
           # S_ks > 0.75 && Sil_ks > 0.5 && S_kg <= 0.75
360 365
           if ((stabMaxVal > STABLE_CLASS) && (qualDf[metric, stabMaxK] > 0.5)
361 366
               && (stabDf[metric, qualMaxK] <= STABLE_CLASS)) {
362
-            cat("\tStability k '", stabMaxKFormatted, "' is stable but quality k '",
367
+            message("\tStability k '", stabMaxKFormatted, "' is stable but quality k '",
363 368
                 qualMaxKFormatted,"' is not\n")
364 369
             k = stabMaxKFormatted
365 370
             optimalKs = append(optimalKs, k)
366
-            cat("\tUsing '", k, "' since it provides higher stability\n")
371
+            message("\tUsing '", k, "' since it provides higher stability\n")
367 372
           } else {
368 373
             # CASE 3
369 374
             if (stabMaxVal > STABLE_CLASS && qualDf[metric, stabMaxK] <= 0.5
370 375
                 && stabDf[metric, qualMaxK] <= STABLE_CLASS)  {
371
-              cat("\tStability k '", stabMaxKFormatted, "' is stable but its silhouette value is not reasonable\n")
376
+              message("\tStability k '", stabMaxKFormatted, "' is stable but its silhouette value is not reasonable\n")
372 377
               if (qualMaxVal > 0.5) { # S_kg > 0.5
373 378
                 k = qualMaxKFormatted
374 379
                 optimalKs = append(optimalKs, k)
375
-                cat("\tUsing quality '", k, "' since its at least reasonable\n")
380
+                message("\tUsing quality '", k, "' since its at least reasonable\n")
376 381
               } else {# S_kg <= 0.5
377 382
                 k = stabMaxKFormatted
378 383
                 optimalKs = append(optimalKs, k)
379
-                cat("\tUsing stability '", k, "' since quality k is not reasonable\n")
384
+                message("\tUsing stability '", k, "' since quality k is not reasonable\n")
380 385
               }
381 386
             } else { # This should not happen but it might come in handy to check errors
382
-              cat("\tUnknown case\n")
387
+              message("\tUnknown case\n")
383 388
               optimalKs = append(optimalKs, -1)
384 389
             }
385 390
           }
... ...
@@ -388,6 +393,7 @@ getOptimalKValue <- function(stabData, qualData, k.range=NULL) {
388 393
     }
389 394
   }
390 395
 
396
+  outputTable["Metric"] = metrics
391 397
   outputTable["Stability_max_k"] = unlist(stabMaxKs)
392 398
   outputTable["Stability_max_k_stab"] = unlist(stabMaxKsStability)
393 399
   outputTable["Stability_max_k_qual"] = unlist(stabMaxKsQuality)
Browse code

v1.3.41 -Bug fix for R 4.0 devel

neobernad authored on 07/01/2020 12:25:05
Showing1 changed files
... ...
@@ -302,7 +302,7 @@ getOptimalKValue <- function(stabData, qualData, k.range=NULL) {
302 302
   stabDf = standardizeStabilityData(stabData, k.range)
303 303
   qualDf = standardizeQualityData(qualData, k.range)
304 304
 
305
-  metrics = rownames(stabDf)
305
+  metrics = as.character(as.data.frame(assay(stabData))$Metric)
306 306
   STABLE_CLASS = 0.75
307 307
 
308 308
   outputTable = as.data.frame(metrics)
... ...
@@ -437,6 +437,15 @@ plotMetricsClusterComparison <- function(data, k.vector1, k.vector2, seed=NULL)
437 437
   data <- as.data.frame(SummarizedExperiment::assay(data))
438 438
 
439 439
   numMetrics = length(colnames(data))-1
440
+
441
+  if (length(k.vector1) == 1) {
442
+    k.vector1=rep(k.vector1, numMetrics)
443
+  }
444
+
445
+  if (length(k.vector2) == 1) {
446
+    k.vector2=rep(k.vector2, numMetrics)
447
+  }
448
+
440 449
   if (numMetrics != length(k.vector1) || numMetrics != length(k.vector2)
441 450
       || length(k.vector1) != length(k.vector2)) {
442 451
     stop("Input parameters have different lengths")
... ...
@@ -534,9 +543,9 @@ plotMetricsClusterComparison <- function(data, k.vector1, k.vector2, seed=NULL)
534 543
 
535 544
 checkStabilityQualityData <- function(stabData, qualData) {
536 545
   stabDf = assay(stabData) # Getting first assay, which is 'stabData$stability_mean'
537
-  lengthStabDf = length(colnames(stabDf[,-1]))
538
-  stabRangeStart = gsub("^.*_.*_.*_","", colnames(stabDf[,-1])[1]) # Mean_stability_k_2 -> 2
539
-  stabRangeEnd = gsub("^.*_.*_.*_","", colnames(stabDf[,-1])[lengthStabDf])
546
+  lengthStabDf = length(colnames(stabDf)[-1])
547
+  stabRangeStart = gsub("^.*_.*_.*_","", colnames(stabDf)[-1][1]) # Mean_stability_k_2 -> 2
548
+  stabRangeEnd = gsub("^.*_.*_.*_","", colnames(stabDf)[-1][lengthStabDf])
540 549
   lengthQual = length(qualData)
541 550
   namesQual = names(qualData)
542 551
   qualRangeStart = getFormattedK(namesQual[1]) # k_2 -> 2
... ...
@@ -572,7 +581,7 @@ standardizeQualityData <- function(qualData, k.range=NULL) {
572 581
   for (i in seq(qualRangeStart, qualRangeEnd, 1)) {
573 582
     curQual = as.data.frame(assay(getDataQualityRange(qualData, i)))
574 583
     if (i == qualRangeStart) {
575
-      Metric = as.character(levels(curQual$Metric))
584
+      Metric = as.character(curQual$Metric)
576 585
     }
577 586
     kValues[[i]] = as.numeric(as.character(curQual$Avg_Silhouette_Width))
578 587
   }
Browse code

v1.3.4: Added plotMetricsClusterComparison

neobernad authored on 19/12/2019 10:23:57
Showing1 changed files
... ...
@@ -402,6 +402,136 @@ getOptimalKValue <- function(stabData, qualData, k.range=NULL) {
402 402
 
403 403
 }
404 404
 
405
+
406
+#' @title Comparison between two clusterings as plot.
407
+#' plotMetricsClusterComparison
408
+#' @aliases plotMetricsClusterComparison
409
+#' @description
410
+#' It plots a clustering comparison between two different
411
+#' k-cluster vectors for a set of metrics.
412
+#'
413
+#' @inheritParams stability
414
+#' @param k.vector1 Vector of positive integers representing \code{k} clusters.
415
+#' The \code{k} values must be contained in [2,15] range.
416
+#' @param k.vector2 Vector of positive integers representing \code{k} clusters.
417
+#' The \code{k} values must be contained in [2,15] range.
418
+#'
419
+#' @return Nothing.
420
+#'
421
+#' @examples
422
+#' # Using example data from our package
423
+#' data("rnaMetrics")
424
+#' stabilityData <- stabilityRange(data=rnaMetrics, k.range=c(2,4), bs=20, getImages = FALSE)
425
+#' qualityData <- qualityRange(data=rnaMetrics, k.range=c(2,4), getImages = FALSE)
426
+#' kOptTable = getOptimalKValue(stabilityData, qualityData)
427
+#'
428
+#'
429
+plotMetricsClusterComparison <- function(data, k.vector1, k.vector2, seed=NULL) {
430
+  if (is.null(seed)) {
431
+    seed = pkg.env$seed
432
+  }
433
+  if (identical(k.vector1, k.vector2)) {
434
+    stop("k.vector1 and k.vector2 are identical")
435
+  }
436
+
437
+  data <- as.data.frame(SummarizedExperiment::assay(data))
438
+
439
+  numMetrics = length(colnames(data))-1
440
+  if (numMetrics != length(k.vector1) || numMetrics != length(k.vector2)
441
+      || length(k.vector1) != length(k.vector2)) {
442
+    stop("Input parameters have different lengths")
443
+  }
444
+  for (i in 1:length(k.vector1)) {
445
+    checkKValue(k.vector1[i])
446
+    checkKValue(k.vector2[i])
447
+  }
448
+
449
+  data.metrics=NULL; names.metr=NULL; names.index=NULL;
450
+  k.cl=NULL; k.min=NULL; k.max=NULL;
451
+  data.metrics=NULL; datos.csv=NULL; datos.raw=NULL;
452
+  ranges=NULL; mins=NULL; data.l=NULL; data.ms=NULL; k.sig=NULL; k.op.sig=NULL;
453
+
454
+  datos.csv = data
455
+  data.metrics <- datos.csv[,-1]
456
+  names.metr <- colnames(datos.csv[,-1])  #nombres de metricas
457
+  names.ont <- datos.csv[,1]
458
+
459
+  ranges <- apply(data.metrics, 2, sample.range)
460
+  mins <- apply(data.metrics, 2, sample.min)
461
+  data.l <- sweep(data.metrics, 2, mins, FUN="-")
462
+  data.ms <- sweep(data.l, 2, ranges, FUN="/")
463
+
464
+  kcolors=c("black","red","blue","green","magenta","pink","yellow","orange","brown","cyan","gray","darkgreen")
465
+
466
+  par(mar=c(4,6,3,3))
467
+  plot(0,0, xlim=range(data.ms), ylim=c(0,length(names.metr)+1),
468
+       lwd=NULL, xlab="", ylab="", xaxt="n", yaxt="n", type="n")
469
+  axis(side=2, at = seq(1,length(names.metr)), labels=names.metr, las=2, cex.axis=.7)
470
+  title(xlab=paste("Scaled raw scores", sep=""), line=1)
471
+  title(ylab="Metrics", line=5)
472
+
473
+  for (i.metr in 1:length(names.metr)) { # i.metr= n de metrica #ejemplo
474
+    #  i.metr=1
475
+
476
+    i=NULL; clusterk5=NULL; clusterkopt=NULL;
477
+    k.cl=NULL; k.op=NULL; data.plot=NULL;
478
+
479
+    i=i.metr
480
+
481
+    #kmeans with k.cl classes
482
+    k.cl=k.vector2[i]
483
+    set.seed(seed)
484
+    clusterk5=kmeans(data.ms[,i], centers=k.cl, iter.max = 100)
485
+    ##
486
+    clusterk5$means=by(data.ms[,i],clusterk5$cluster,mean) #calcula las k medias (centroides)
487
+    for (i.5 in 1:length(clusterk5$means)) {
488
+      clusterk5$partition[which(clusterk5$cluster==i.5)]=clusterk5$centers[i.5]
489
+      #asigna valor centroide a todo miembro del cluster
490
+    }
491
+    #Ordenacion de la particion segun el sentido de la metrica (directa/inversa)
492
+    clusterk5$ordered=ordered(clusterk5$partition,labels=seq(1,length(clusterk5$centers)))
493
+    clusterk5$ordered.inv=ordered(clusterk5$partition,labels=seq(length(clusterk5$centers),1))
494
+    clusterk5$partition=clusterk5$ordered
495
+    clusterk5$means=sort(clusterk5$means,decreasing=FALSE)
496
+
497
+    #kmeans with k.op classes
498
+    k.op=k.vector1[i]
499
+    set.seed(seed)
500
+    clusterkopt=kmeans(data.ms[,i], centers=k.op, iter.max = 100)
501
+
502
+    clusterkopt$means=by(data.ms[,i],clusterkopt$cluster,mean) #calcula las k medias (centroides)
503
+    for (i.opt in 1:length(clusterkopt$means)) {
504
+      clusterkopt$partition[which(clusterkopt$cluster==i.opt)]=clusterkopt$centers[i.opt]
505
+      #asigna valor centroide a todo el cluster
506
+    }
507
+
508
+    clusterkopt$ordered=ordered(clusterkopt$partition,labels=seq(1,length(clusterkopt$centers)))
509
+    clusterkopt$ordered.inv=ordered(clusterkopt$partition,labels=seq(length(clusterkopt$centers),1))
510
+    clusterkopt$partition=clusterkopt$ordered
511
+    clusterkopt$means=sort(clusterkopt$means,decreasing=FALSE)
512
+
513
+    data.plot=data.frame(data.ms[,i],clusterk5$partition,clusterkopt$partition)
514
+    colnames(data.plot)=c(names.metr[i],"k=5","k_op")
515
+    rownames(data.plot)=names.ont
516
+
517
+    xi=data.plot[[1]]
518
+    yi=rep(i.metr,length(xi))
519
+    ci=data.plot[[2]]
520
+    ci=levels(ci)[ci]
521
+    points(xi,yi,type="p", col=kcolors[as.numeric(ci)],lty=1, lwd=1)
522
+
523
+    cj=data.plot[[3]]
524
+    for (ellip.j in unique(cj)) {
525
+      xj=mean(range(xi[which(cj==ellip.j)])) #clusterk5$means[ellip.j]
526
+      yj=rep(i.metr,length(xj))
527
+      aj=diff(range(xi[which(cj==ellip.j)]))/2
528
+      draw.ellipse(x=xj, y=yj, a=aj, b=0.3, nv=100,
529
+                   border=kcolors[as.numeric(ellip.j)], lty=1, lwd=2)
530
+    }
531
+
532
+  } #end for i.metr
533
+}
534
+
405 535
 checkStabilityQualityData <- function(stabData, qualData) {
406 536
   stabDf = assay(stabData) # Getting first assay, which is 'stabData$stability_mean'
407 537
   lengthStabDf = length(colnames(stabDf[,-1]))
... ...
@@ -514,3 +644,6 @@ standardizeStabilityData <- function(stabData, k.range=NULL) {
514 644
   stabDf <- stabDf[ order(row.names(stabDf)), ]
515 645
   return(stabDf)
516 646
 }
647
+
648
+
649
+
Browse code

Version 1.3.3 new k opt. algorithm and stabilitySet, qualitySet

neobernad authored on 17/12/2019 12:25:37
Showing1 changed files
... ...
@@ -279,6 +279,7 @@ getFormattedK <- function(k) {
279 279
 #' qualityData <- qualityRange(data=rnaMetrics, k.range=c(2,4), getImages = FALSE)
280 280
 #' kOptTable = getOptimalKValue(stabilityData, qualityData)
281 281
 #'
282
+#'
282 283
 getOptimalKValue <- function(stabData, qualData, k.range=NULL) {
283 284
   checkStabilityQualityData(stabData, qualData)
284 285
 
... ...
@@ -317,12 +318,12 @@ getOptimalKValue <- function(stabData, qualData, k.range=NULL) {
317 318
 
318 319
   for (metric in metrics) {
319 320
     cat("Processing metric: ", metric, "\n")
320
-    stabMaxK = colnames(stabDf[metric, ])[apply(stabDf[metric, ],1,which.max)] # k1
321
+    stabMaxK = colnames(stabDf[metric, ])[apply(stabDf[metric, ],1,which.max)] # ks
321 322
     stabMaxKFormatted = getFormattedK(stabMaxK)
322 323
     stabMaxVal = stabDf[metric, stabMaxK]
323
-    qualMaxK = colnames(qualDf[metric, ])[apply(qualDf[metric, ],1,which.max)] # k2
324
-    qualMaxVal = qualDf[metric, qualMaxK]
324
+    qualMaxK = colnames(qualDf[metric, ])[apply(qualDf[metric, ],1,which.max)] # kg
325 325
     qualMaxKFormatted = getFormattedK(qualMaxK)
326
+    qualMaxVal = qualDf[metric, qualMaxK]
326 327
     ## Info for output table
327 328
     stabMaxKs = append(stabMaxKs, stabMaxKFormatted)
328 329
     stabMaxKsStability = append(stabMaxKsStability, stabDf[metric, stabMaxK]);
... ...
@@ -331,53 +332,59 @@ getOptimalKValue <- function(stabData, qualData, k.range=NULL) {
331 332
     qualMaxKs = append(qualMaxKs, qualMaxKFormatted)
332 333
     qualMaxKsStability = append(qualMaxKsStability, stabDf[metric, qualMaxK]);
333 334
     qualMaxKsQuality = append(qualMaxKsQuality, qualDf[metric, qualMaxK]);
334
-    ##
335
-    # CASE 1
336
-    if (length(stabMaxK) >= 1 && length(qualMaxK) >= 1 && identical(stabMaxK, qualMaxK)) {
335
+
336
+    # CASE 1: ks == kg
337
+    if (identical(stabMaxK, qualMaxK)) {
337 338
       k = stabMaxKFormatted
338 339
       cat("\tMaximum stability and quality values matches the same K value: '", k ,"'\n")
339 340
       optimalKs = append(optimalKs, k)
340
-      # CASE 2
341
-    } else if ((stabMaxVal >= STABLE_CLASS && stabDf[metric, qualMaxK] >= STABLE_CLASS) ||
342
-               (stabMaxVal < STABLE_CLASS && stabDf[metric, qualMaxK] < STABLE_CLASS)) {
343
-      if (stabMaxVal >= STABLE_CLASS && stabDf[metric, qualMaxK] >= STABLE_CLASS) {
341
+    } else {
342
+      # CASE 2: ks != kg
343
+      if (stabMaxVal > STABLE_CLASS && stabDf[metric, qualMaxK] > STABLE_CLASS) {
344
+        # Both stables
344 345
         cat("\tBoth Ks have a stable classification: '",
345 346
             stabMaxKFormatted, "', '", qualMaxKFormatted ,"'\n")
346
-      } else {
347
-        cat("\tBoth Ks does not have a stable classification: '",
348
-            stabMaxKFormatted, "', '", qualMaxKFormatted ,"'\n")
349
-      }
350
-      k = getLargestSilWidth(qualDf, metric, stabMaxK, qualMaxK)
351
-      cat("\tUsing '", k, "' since it provides higher silhouette width\n")
352
-      optimalKs = append(optimalKs, k)
353
-      # CASE 3
354
-    } else if (stabMaxVal >= STABLE_CLASS && stabDf[metric, qualMaxK] < STABLE_CLASS) {
355
-      cat("\tStability k '", stabMaxKFormatted, "' is stable but quality k '",
356
-          qualMaxKFormatted,"' is not\n")
357
-      kSil = qualDf[metric, stabMaxK]
358
-      if (isReasonable(kSil)) {
359
-        cat("\tUsing ", stabMaxKFormatted, " since its silhouette width is at least reasonable\n")
360
-        optimalKs = append(optimalKs, stabMaxKFormatted)
361
-      } else {
362
-        k = getLargestSilWidth(qualDf, metric, stabMaxK, qualMaxK)
363
-        cat("\tUsing '", k, "' since it provides higher silhouette width\n")
347
+        k = qualMaxKFormatted
364 348
         optimalKs = append(optimalKs, k)
365
-      }
366
-      # CASE 4
367
-    } else if (stabMaxVal < STABLE_CLASS && stabDf[metric, qualMaxK] >= STABLE_CLASS) {
368
-      cat("\t Quality k '", qualMaxKFormatted, "' is stable but stability k '",
369
-          stabMaxKFormatted,"' is not\n")
370
-      kSil = qualDf[metric, qualMaxK]
371
-      if (isReasonable(kSil)) {
372
-        cat("\tUsing ", qualMaxKFormatted, " since its silhouette width is at least reasonable\n")
373
-        optimalKs = append(optimalKs, stabMaxKFormatted)
374
-      } else {
375
-        k = getLargestSilWidth(qualDf, metric, stabMaxK, qualMaxK)
376 349
         cat("\tUsing '", k, "' since it provides higher silhouette width\n")
377
-        optimalKs = append(optimalKs, k)
350
+      } else {
351
+        if (stabMaxVal <= STABLE_CLASS && stabDf[metric, qualMaxK] <= STABLE_CLASS) {
352
+          # Both not stables: S_ks <= 0.75 && S_kg <= 0.75
353
+          cat("\tBoth Ks do not have a stable classification: '",
354
+              stabMaxKFormatted, "', '", qualMaxKFormatted ,"'\n")
355
+          k = qualMaxKFormatted
356
+          optimalKs = append(optimalKs, k)
357
+          cat("\tUsing '", k, "' since it provides higher silhouette width\n")
358
+        } else {
359
+          # S_ks > 0.75 && Sil_ks > 0.5 && S_kg <= 0.75
360
+          if ((stabMaxVal > STABLE_CLASS) && (qualDf[metric, stabMaxK] > 0.5)
361
+              && (stabDf[metric, qualMaxK] <= STABLE_CLASS)) {
362
+            cat("\tStability k '", stabMaxKFormatted, "' is stable but quality k '",
363
+                qualMaxKFormatted,"' is not\n")
364
+            k = stabMaxKFormatted
365
+            optimalKs = append(optimalKs, k)
366
+            cat("\tUsing '", k, "' since it provides higher stability\n")
367
+          } else {
368
+            # CASE 3
369
+            if (stabMaxVal > STABLE_CLASS && qualDf[metric, stabMaxK] <= 0.5
370
+                && stabDf[metric, qualMaxK] <= STABLE_CLASS)  {
371
+              cat("\tStability k '", stabMaxKFormatted, "' is stable but its silhouette value is not reasonable\n")
372
+              if (qualMaxVal > 0.5) { # S_kg > 0.5
373
+                k = qualMaxKFormatted
374
+                optimalKs = append(optimalKs, k)
375
+                cat("\tUsing quality '", k, "' since its at least reasonable\n")
376
+              } else {# S_kg <= 0.5
377
+                k = stabMaxKFormatted
378
+                optimalKs = append(optimalKs, k)
379
+                cat("\tUsing stability '", k, "' since quality k is not reasonable\n")
380
+              }
381
+            } else { # This should not happen but it might come in handy to check errors
382
+              cat("\tUnknown case\n")
383
+              optimalKs = append(optimalKs, -1)
384
+            }
385
+          }
386
+        }
378 387
       }
379
-    } else { # This should not happen but it might come in handy to check errors
380
-      optimalKs = append(optimalKs, -1)
381 388
     }
382 389
   }
383 390
 
... ...
@@ -392,10 +399,11 @@ getOptimalKValue <- function(stabData, qualData, k.range=NULL) {
392 399
   outputTable["Global_optimal_k"] = unlist(optimalKs)
393 400
 
394 401
   return(outputTable)
402
+
395 403
 }
396 404
 
397 405
 checkStabilityQualityData <- function(stabData, qualData) {
398
-  stabDf = assay(stabData, "stability_mean")
406
+  stabDf = assay(stabData) # Getting first assay, which is 'stabData$stability_mean'
399 407
   lengthStabDf = length(colnames(stabDf[,-1]))
400 408
   stabRangeStart = gsub("^.*_.*_.*_","", colnames(stabDf[,-1])[1]) # Mean_stability_k_2 -> 2
401 409
   stabRangeEnd = gsub("^.*_.*_.*_","", colnames(stabDf[,-1])[lengthStabDf])
... ...
@@ -471,7 +479,7 @@ standardizeQualityData <- function(qualData, k.range=NULL) {
471 479
 # standardized dataframe to process.
472 480
 #
473 481
 standardizeStabilityData <- function(stabData, k.range=NULL) {
474
-  stabDf = as.data.frame(assay(stabData), "mean")
482
+  stabDf = as.data.frame(assay(stabData)) # Getting first assay, which is 'stabData$stability_mean'
475 483
   lengthColnames = length(colnames(stabDf))
476 484
   toRemove = list()
477 485
   for (i in seq(1, lengthColnames, 1)) {
Browse code

v1.3.2: FPC dependency, stability outputs cluster information

neobernad authored on 10/12/2019 12:17:35
Showing1 changed files
... ...
@@ -1,508 +1,508 @@
1
-#' @title Minimum and maximum metric values plot.
2
-#' @name plotMetricsMinMax
3
-#' @aliases plotMetricsMinMax
4
-#' @description
5
-#' It plots the minimum, maximum and standard deviation
6
-#' values of the metrics in a \code{\link{SummarizedExperiment}} object.
7
-#'
8
-#' @inheritParams stability
9
-#'
10
-#' @return Nothing.
11
-#'
12
-#' @examples
13
-#' # Using example data from our package
14
-#' data("ontMetrics")
15
-#' plotMetricsMinMax(ontMetrics)
16
-#'
17
-plotMetricsMinMax <- function(data) {
18
-  data <- as.data.frame(assay(data))
19
-  # Prepare data for plotting
20
-  # Data matrix without descritive column
21
-  matrix = data.matrix(data[,-1])
22
-  maxs = matrixStats::colMaxs(matrix)
23
-  mins = matrixStats::colMins(matrix)
24
-  means = colMeans(matrix)
25
-  sd = matrixStats::colSds(matrix)
26
-
27
-  dataStats = matrix(NA, nrow=5, ncol = length(data[,-1]), byrow=TRUE,
28
-                     dimnames = list(c("Metric", "Min","Max","Mean","Sd"),
29
-                                     c(colnames(data[,-1]))))
30
-  dataStats["Metric",] = colnames(data[,-1])
31
-  dataStats["Min",] = mins
32
-  dataStats["Max",] = maxs
33
-  dataStats["Mean",] = means
34
-  dataStats["Sd",] = sd
35
-  dataStats.df = as.data.frame(dataStats)
36
-  dataStats.df.t = as.data.frame(t(dataStats.df))
37
-  # Factor to numeric conversion
38
-  dataStats.df.t$Min = as.numeric(as.character(dataStats.df.t$Min))
39
-  dataStats.df.t$Max = as.numeric(as.character(dataStats.df.t$Max))
40
-  dataStats.df.t$Mean = as.numeric(as.character(dataStats.df.t$Mean))
41
-  dataStats.df.t$Sd = as.numeric(as.character(dataStats.df.t$Sd))
42
-
43
-  ## Plotting
44
-  p <- ggplot(dataStats.df.t, aes(x=dataStats.df.t$Metric)) +
45
-    geom_linerange(aes(ymin=dataStats.df.t$Min,ymax=dataStats.df.t$Max),
46
-                   linetype=2,color="#4E84C4") +
47
-    geom_point(aes(y=dataStats.df.t$Min),size=3,color="#00AFBB") +
48
-    geom_point(aes(y=dataStats.df.t$Max),size=3,color="#FC4E07") +
49
-    theme_bw() +
50
-    theme(axis.text.x = element_text(angle = 90),
51
-          #axis.text.y = element_blank(),
52
-          text = element_text(size=15),
53
-          axis.line = element_line(colour = "black",
54
-                                   size = 1, linetype = "solid")
55
-    ) +
56
-    geom_errorbar(aes(ymin=(dataStats.df.t$Max-dataStats.df.t$Sd),
57
-                      ymax=(dataStats.df.t$Max+dataStats.df.t$Sd)), width=.2,
58
-                  position=position_dodge(.9)) +
59
-    geom_errorbar(aes(ymin=(dataStats.df.t$Min-dataStats.df.t$Sd),
60
-                      ymax=(dataStats.df.t$Min+dataStats.df.t$Sd)), width=.2,
61
-                  position=position_dodge(.9)) +
62
-    scale_y_continuous(breaks=seq(round(min(dataStats.df.t$Min-dataStats.df.t$Sd)), # 10 ticks across min - max range
63
-                                  round(max(dataStats.df.t$Max+dataStats.df.t$Sd)),
64
-                                  round((max(dataStats.df.t$Max)-min(dataStats.df.t$Min)))/10),
65
-                       labels=function(x) sprintf("%.2f", x)) + # Two decimals
66
-    labs(x = "Metrics", y = "Metric value", title = "Min/max/sd values across metrics") +
67
-    guides(fill=TRUE)
68
-
69
-  print(p)
70
-}
71
-
72
-
73
-#' @title Metric values as a boxplot.
74
-#' @name plotMetricsBoxplot
75
-#' @aliases plotMetricsBoxplot
76
-#' @description
77
-#' It plots the value of the metrics in a \code{\link{SummarizedExperiment}}
78
-#' object as a boxplot.
79
-#'
80
-#' @inheritParams stability
81
-#'
82
-#' @return Nothing.
83
-#'
84
-#' @examples
85
-#' # Using example data from our package
86
-#' data("ontMetrics")
87
-#' plotMetricsBoxplot(ontMetrics)
88
-#'
89
-plotMetricsBoxplot <- function(data) {
90
-  data <- as.data.frame(assay(data))
91
-  num_metrics_plot=20
92
-  data.metrics = data[,-1] # Removing Description column
93
-
94
-  metrics_length = length(colnames(data.metrics))
95
-  num_iterations = round(metrics_length/num_metrics_plot)
96
-  if (num_iterations > 0) {
97
-    num_iterations = num_iterations - 1
98
-  }
99
-  for (iteration in 0:num_iterations) {
100
-    i = 1
101
-    rangeStart = (iteration*num_metrics_plot)+1
102
-    rangeEnd = rangeStart+num_metrics_plot-1
103
-    if (rangeEnd > metrics_length) {
104
-      rangeEnd = metrics_length
105
-    }
106
-    suppressMessages({
107
-      data.melt = melt(data.metrics[,rangeStart:rangeEnd])
108
-    })
109
-    # Melting 1 variable (e.g: data.metrics[,11:11])
110
-    # won't create $variable column in data.melt.
111
-    if (rangeStart == rangeEnd) {
112
-      metricName = data[rangeStart, "Description"]
113
-      data.melt$variable = rep(metricName, length(data.melt$value))
114
-    }
115
-    p <- ggplot(data.melt, aes(x=data.melt$variable, y=data.melt$value)) +
116
-      geom_boxplot(
117
-        #aes(fill=data.melt$variable), # Colors
118
-        outlier.colour = "black",
119
-        outlier.alpha = 0.7,
120
-        outlier.shape = 21,
121
-        show.legend = FALSE
122
-      ) +
123
-      #scale_y_continuous(limits = quantile(data.melt$value, c(0.1, 0.9))) +
124
-      scale_color_grey() +
125
-      theme_bw() +
126
-      theme(
127
-        text = element_text(size=20),
128
-        axis.text.x = element_text(angle = 90)
129
-      ) +
130
-      labs(x = "Metrics", y="Metric value", fill="Metrics")
131
-    # compute lower and upper whiskers
132
-    #ylim1 = boxplot.stats(data.melt$value)$stats[c(1, 5)]
133
-    # scale y limits based on ylim1
134
-    #p1 = p + coord_cartesian(ylim = ylim1*1.05)
135
-    print(p)
136
-  }
137
-}
138
-
139
-#' @title Metric values clustering.
140
-#' @name plotMetricsCluster
141
-#' @aliases plotMetricsCluster
142
-#' @description
143
-#' It clusters the value of the metrics in a \code{\link{SummarizedExperiment}}
144
-#' object as a boxplot.
145
-#'
146
-#' @inheritParams stability
147
-#'
148
-#' @return Nothing.
149
-#'
150
-#' @examples
151
-#' # Using example data from our package
152
-#' data("ontMetrics")
153
-#' plotMetricsCluster(ontMetrics)
154
-#'
155
-plotMetricsCluster <- function(data) {
156
-  data <- as.data.frame(assay(data))
157
-  data.metrics = data[,-1] # Removing Description column
158
-  d <- dist(t(data.metrics), method = "euclidean") # distance matrix
159
-  fit <- hclust(d, method="ward.D2")
160
-  theme_set(theme_bw())
161
-  p <- ggdendrogram(fit, rotate = FALSE, size = 2) + # display dendogram
162
-    theme(
163
-      text = element_text(size=15)
164
-    ) +
165
-    labs(title="Metrics dendrogram")
166
-  print(p)
167
-}
168
-
169
-#' @title Metric values as violin plot.
170
-#' @name plotMetricsViolin
171
-#' @aliases plotMetricsViolin
172
-#' @description
173
-#' It plots the value of the metrics in a \code{\link{SummarizedExperiment}}
174
-#' object as a violin plot.
175
-#'
176
-#' @inheritParams stability
177
-#'
178
-#' @return Nothing.
179
-#'
180
-#' @examples
181
-#' # Using example data from our package
182
-#' data("ontMetrics")
183
-#' plotMetricsViolin(ontMetrics)
184
-#'
185
-plotMetricsViolin <- function(data) {
186
-  data <- as.data.frame(assay(data))
187
-  data.metrics = data[,-1] # Removing Description column
188
-  num_metrics_plot=20
189
-
190
-  metrics_length = length(colnames(data.metrics))
191
-  num_iterations = round(metrics_length/num_metrics_plot)
192
-  if (num_iterations > 0) {
193
-    num_iterations = num_iterations - 1
194
-  }
195
-  for (iteration in 0:num_iterations) {
196
-      i = 1
197
-      rangeStart = (iteration*num_metrics_plot)+1
198
-      rangeEnd = rangeStart+num_metrics_plot-1
199
-      if (rangeEnd > metrics_length) {
200
-        rangeEnd = metrics_length
201
-      }
202
-    suppressMessages({
203
-      data.melt = melt(data.metrics[,rangeStart:rangeEnd])
204
-    })
205
-    # Melting 1 variable (11:11), won't create $variable column in data.melt.
206
-    if (rangeStart == rangeEnd) {
207
-      metricName = data[rangeStart, "Description"]
208
-      data.melt$variable = rep(metricName, length(data.melt$value))
209
-    }
210
-    p <- ggplot(data.melt, aes(x=data.melt$variable, y=data.melt$value)) +
211
-      geom_violin(trim=FALSE) +
212
-      geom_boxplot(width=0.1) +
213
-      #scale_y_continuous(limits = quantile(data.melt$value, c(0.1, 0.9))) +
214
-      scale_color_grey() +
215
-      theme_bw() +
216
-      theme(
217
-        text = element_text(size=20),
218
-        axis.text.x = element_text(angle = 90)
219
-      ) +
220
-      labs(x = "Metrics", y="Metric value", fill="Metrics")
221
-    # compute lower and upper whiskers
222
-    #ylim1 = boxplot.stats(data.melt$value)$stats[c(1, 5)]
223
-    # scale y limits based on ylim1
224
-    #p1 = p + coord_cartesian(ylim = ylim1*1.05)
225
-    print(p)
226
-  }
227
-}
228
-
229
-
230
-#
231
-# It returns true if value is in range (0.5, 0.7]
232
-#
233
-isReasonable <- function(value) {
234
-  return(value > 0.5 && value <= 0.7)
235
-}
236
-
237
-getLargestSilWidth <- function(qualityDf, metric, k1, k2) {
238
-  k1KSil = qualityDf[metric, k1]
239
-  k2KSil = qualityDf[metric, k2]
240
-  k = NULL
241
-  if (k1KSil >= k2KSil) {
242
-    k = k1
243
-  } else {
244
-    k = k2
245
-  }
246
-  return(getFormattedK(k))
247
-}
248
-
249
-#
250
-# It transform a string 'k_X' into 'X'.
251
-# For instace, input is 'k_4', output is '4'
252
-#
253
-getFormattedK <- function(k) {
254
-  return(gsub("^.*_","", k))
255
-}
256
-
257
-#' @title Calculating the optimal value of k.
258
-#' getOptimalKValue
259
-#' @aliases getOptimalKValue
260
-#' @description
261
-#' This method finds the optimal value of K per each metric.
262
-#'
263
-#' @param stabData An output \code{\link{SummarizedExperiment}} from
264
-#' a \code{\link{stabilityRange}} execution.
265
-#'
266
-#' @param qualData An output \code{\link{SummarizedExperiment}} from
267
-#' a \code{\link{qualityRange}} execution.
268
-#'
269
-#' @param k.range A range of K values to limit the scope of the
270
-#' analysis.
271
-#'
272
-#' @return It returns a dataframe following the schema:
273
-#' \code{metric}, \code{optimal_k}.
274
-#'
275
-#' @examples
276
-#' # Using example data from our package
277
-#' data("rnaMetrics")
278
-#' stabilityData <- stabilityRange(data=rnaMetrics, k.range=c(2,4), bs=20, getImages = FALSE)
279
-#' qualityData <- qualityRange(data=rnaMetrics, k.range=c(2,4), getImages = FALSE)
280
-#' kOptTable = getOptimalKValue(stabilityData, qualityData)
281
-#'
282
-getOptimalKValue <- function(stabData, qualData, k.range=NULL) {
283
-  checkStabilityQualityData(stabData, qualData)
284
-
285
-  if (!is.null(k.range)) {
286
-    k.range.length = length(k.range)
287
-    if (k.range.length != 2) {
288
-      stop("k.range length must be 2")
289
-    }
290
-    k.min = k.range[1]
291
-    k.max = k.range[2]
292
-    checkKValue(k.min)
293
-    checkKValue(k.max)
294
-    if (k.max < k.min) {
295
-      stop("The first value of k.range cannot be greater than its second value")
296
-    } else if (k.min == k.max) {
297
-      stop("Range start point and end point are equals")
298
-    }
299
-  }
300
-
301
-  stabDf = standardizeStabilityData(stabData, k.range)
302
-  qualDf = standardizeQualityData(qualData, k.range)
303
-
304
-  metrics = rownames(stabDf)
305
-  STABLE_CLASS = 0.75
306
-
307
-  outputTable = as.data.frame(metrics)
308
-  rownames(outputTable) = metrics
309
-  outputTable = outputTable[, -1]
310
-  optimalKs = list()
311
-  stabMaxKs = list() # List of maximum K for the stability of metric X
312
-  stabMaxKsStability = list() # Stability of the current K in stabMaxKs
313
-  stabMaxKsQuality = list() # Quality of the current K in stabMaxKs
314
-  qualMaxKs = list() # List of maximum K for the quality of metric X
315
-  qualMaxKsStability = list() # Stability of the current K in qualMaxKs
316
-  qualMaxKsQuality = list() # Quality of the current K in qualMaxKs
317
-
318
-  for (metric in metrics) {
319
-    cat("Processing metric: ", metric, "\n")
320
-    stabMaxK = colnames(stabDf[metric, ])[apply(stabDf[metric, ],1,which.max)] # k1
321
-    stabMaxKFormatted = getFormattedK(stabMaxK)
322
-    stabMaxVal = stabDf[metric, stabMaxK]
323
-    qualMaxK = colnames(qualDf[metric, ])[apply(qualDf[metric, ],1,which.max)] # k2
324
-    qualMaxVal = qualDf[metric, qualMaxK]
325
-    qualMaxKFormatted = getFormattedK(qualMaxK)
326
-    ## Info for output table
327
-    stabMaxKs = append(stabMaxKs, stabMaxKFormatted)
328
-    stabMaxKsStability = append(stabMaxKsStability, stabDf[metric, stabMaxK]);
329
-    stabMaxKsQuality = append(stabMaxKsQuality, qualDf[metric, stabMaxK]);
330
-
331
-    qualMaxKs = append(qualMaxKs, qualMaxKFormatted)
332
-    qualMaxKsStability = append(qualMaxKsStability, stabDf[metric, qualMaxK]);
333
-    qualMaxKsQuality = append(qualMaxKsQuality, qualDf[metric, qualMaxK]);
334
-    ##
335
-    # CASE 1
336
-    if (length(stabMaxK) >= 1 && length(qualMaxK) >= 1 && identical(stabMaxK, qualMaxK)) {
337
-      k = stabMaxKFormatted
338
-      cat("\tMaximum stability and quality values matches the same K value: '", k ,"'\n")
339
-      optimalKs = append(optimalKs, k)
340
-      # CASE 2
341
-    } else if ((stabMaxVal >= STABLE_CLASS && stabDf[metric, qualMaxK] >= STABLE_CLASS) ||
342
-               (stabMaxVal < STABLE_CLASS && stabDf[metric, qualMaxK] < STABLE_CLASS)) {
343
-      if (stabMaxVal >= STABLE_CLASS && stabDf[metric, qualMaxK] >= STABLE_CLASS) {
344
-        cat("\tBoth Ks have a stable classification: '",
345
-            stabMaxKFormatted, "', '", qualMaxKFormatted ,"'\n")
346
-      } else {
347
-        cat("\tBoth Ks does not have a stable classification: '",
348
-            stabMaxKFormatted, "', '", qualMaxKFormatted ,"'\n")
349
-      }
350
-      k = getLargestSilWidth(qualDf, metric, stabMaxK, qualMaxK)
351
-      cat("\tUsing '", k, "' since it provides higher silhouette width\n")
352
-      optimalKs = append(optimalKs, k)
353
-      # CASE 3
354
-    } else if (stabMaxVal >= STABLE_CLASS && stabDf[metric, qualMaxK] < STABLE_CLASS) {
355
-      cat("\tStability k '", stabMaxKFormatted, "' is stable but quality k '",
356
-          qualMaxKFormatted,"' is not\n")
357
-      kSil = qualDf[metric, stabMaxK]
358
-      if (isReasonable(kSil)) {
359
-        cat("\tUsing ", stabMaxKFormatted, " since its silhouette width is at least reasonable\n")
360
-        optimalKs = append(optimalKs, stabMaxKFormatted)
361
-      } else {
362
-        k = getLargestSilWidth(qualDf, metric, stabMaxK, qualMaxK)
363
-        cat("\tUsing '", k, "' since it provides higher silhouette width\n")
364
-        optimalKs = append(optimalKs, k)
365
-      }
366
-      # CASE 4
367
-    } else if (stabMaxVal < STABLE_CLASS && stabDf[metric, qualMaxK] >= STABLE_CLASS) {
368
-      cat("\t Quality k '", qualMaxKFormatted, "' is stable but stability k '",
369
-          stabMaxKFormatted,"' is not\n")
370
-      kSil = qualDf[metric, qualMaxK]
371
-      if (isReasonable(kSil)) {
372
-        cat("\tUsing ", qualMaxKFormatted, " since its silhouette width is at least reasonable\n")
373
-        optimalKs = append(optimalKs, stabMaxKFormatted)
374
-      } else {
375
-        k = getLargestSilWidth(qualDf, metric, stabMaxK, qualMaxK)
376
-        cat("\tUsing '", k, "' since it provides higher silhouette width\n")
377
-        optimalKs = append(optimalKs, k)
378
-      }
379
-    } else { # This should not happen but it might come in handy to check errors
380
-      optimalKs = append(optimalKs, -1)
381
-    }
382
-  }
383
-
384
-  outputTable["Stability_max_k"] = unlist(stabMaxKs)
385
-  outputTable["Stability_max_k_stab"] = unlist(stabMaxKsStability)
386
-  outputTable["Stability_max_k_qual"] = unlist(stabMaxKsQuality)
387
-
388
-  outputTable["Quality_max_k"] = unlist(qualMaxKs)
389
-  outputTable["Quality_max_k_stab"] = unlist(qualMaxKsStability)
390
-  outputTable["Quality_max_k_qual"] = unlist(qualMaxKsQuality)
391
-
392
-  outputTable["Global_optimal_k"] = unlist(optimalKs)
393
-
394
-  return(outputTable)
395
-}
396
-
397
-checkStabilityQualityData <- function(stabData, qualData) {
398
-  stabDf = assay(stabData)
399
-  lengthStabDf = length(colnames(stabDf[,-1]))
400
-  stabRangeStart = gsub("^.*_.*_.*_","", colnames(stabDf[,-1])[1]) # Mean_stability_k_2 -> 2
401
-  stabRangeEnd = gsub("^.*_.*_.*_","", colnames(stabDf[,-1])[lengthStabDf])
402
-  lengthQual = length(qualData)
403
-  namesQual = names(qualData)
404
-  qualRangeStart = getFormattedK(namesQual[1]) # k_2 -> 2
405
-  qualRangeEnd = getFormattedK(namesQual[lengthQual])
406
-  if (stabRangeStart != qualRangeStart || stabRangeEnd != qualRangeEnd) {
407
-    stop("Stability data and quality data have different k ranges")
408
-  }
409
-  stabMetricsList = as.character(stabDf[,"Metric"])
410
-  qualMetricsList = as.character(
411
-    assay(getDataQualityRange(qualData, as.numeric(qualRangeStart)))[,"Metric"]
412
-  )
413
-  if (!identical(stabMetricsList, qualMetricsList)) {
414
-    stop("Stability data and quality data have different metrics")
415
-  }
416
-}
417
-
418
-#
419
-# It transforms the output of qualityRange method
420
-# into a dataframe like this:
421
-# (rownames)       k_2       k_3       k_4
422
-# DegFact          0.6171262 0.6278294 0.4882649
423
-# ...
424
-# So that the input of getOptimalKValue has always a
425
-# standardized dataframe to process.
426
-#
427
-
428
-standardizeQualityData <- function(qualData, k.range=NULL) {
429
-  lengthQuality = length(qualData)
430
-  qualRangeStart = getFormattedK(names(qualData)[1])
431
-  qualRangeEnd = getFormattedK(names(qualData)[lengthQuality])
432
-  Metric = NULL
433
-  kValues = list()
434
-  for (i in seq(qualRangeStart, qualRangeEnd, 1)) {
435
-    curQual = as.data.frame(assay(getDataQualityRange(qualData, i)))
436
-    if (i == qualRangeStart) {
437
-      Metric = as.character(levels(curQual$Metric))
438
-    }
439
-    kValues[[i]] = as.numeric(as.character(curQual$Avg_Silhouette_Width))
440
-  }
441
-  qualDf = as.data.frame(Metric)
442
-  for (i in seq(qualRangeStart, qualRangeEnd, 1)) {
443
-    values = kValues[[i]]
444
-    newColname = paste0("k_", i)
445
-    k = as.numeric(getFormattedK(newColname))
446
-    if (!is.null(k.range) && (k < k.range[1] || k > k.range[2])) {
447
-      next
448
-    }
449
-    qualDf[[newColname]] = values
450
-  }
451
-
452
-  if (!is.null(k.range) && (k.range[1] < qualRangeStart || k.range[2] > qualRangeEnd)) {
453
-    # Input k.range is not a subset of the stabData k ranges
454
-    stop("Input k.range [", k.range[1], ", ", k.range[2], "] is not a subset of range [",
455
-         qualRangeStart, ", ", qualRangeEnd, "]")
456
-  }
457
-
458
-  rownames(qualDf) = qualDf$Metric
459
-  qualDf = qualDf[, -1] # Remove "Metric" column, metrics are rownames now
460
-  qualDf <- qualDf[ order(row.names(qualDf)), ]
461
-  return(qualDf)
462
-}
463
-
464
-#
465
-# It transforms the output of stabilityRange method
466
-# into a dataframe like this:
467
-# (rownames)       k_2       k_3       k_4
468
-# RIN              0.6171262 0.6278294 0.4882649
469
-# ...
470
-# So that the input of getOptimalKValue has always a
471
-# standardized dataframe to process.
472
-#
473
-standardizeStabilityData <- function(stabData, k.range=NULL) {
474
-  stabDf = as.data.frame(assay(stabData))
475
-  lengthColnames = length(colnames(stabDf))
476
-  toRemove = list()
477
-  for (i in seq(1, lengthColnames, 1)) {
478
-    colname = colnames(stabDf)[i]
479
-    newColname = gsub("^.*_.*_.*_","k_", colname)
480
-    colnames(stabDf)[i] = newColname
481
-    if (i != 1) { # Skip Metric column
482
-      k = as.numeric(getFormattedK(newColname))
483
-      if (!is.null(k.range) && (k < k.range[1] || k > k.range[2])) {
484
-        toRemove = append(toRemove, newColname)
485
-        next
486
-      }
487
-      stabDf[newColname] = as.numeric(as.character(stabDf[[newColname]]))
488
-    }
489
-  }
490
-
491
-  for (columnName in toRemove) {
492
-    stabDf[, columnName] = list(NULL)
493
-    lengthColnames = lengthColnames-1
494
-  }
495
-
496
-  inputStartRange = as.numeric(getFormattedK(colnames(stabDf)[2]))
497
-  inputEndRange = as.numeric(getFormattedK(colnames(stabDf)[lengthColnames]))
498
-  if (!is.null(k.range) && (k.range[1] < inputStartRange || k.range[2] > inputEndRange)) {
499
-    # Input k.range is not a subset of the stabData k ranges
500
-    stop("Input k.range [", k.range[1], ", ", k.range[2], "] is not a subset of data range [",
501
-         inputStartRange, ", ", inputEndRange, "]")
502
-  }
503
-
504
-  rownames(stabDf) = stabDf$Metric
505
-  stabDf = stabDf[, -1] # Remove "Metric" column, metrics are rownames now
506
-  stabDf <- stabDf[ order(row.names(stabDf)), ]
507
-  return(stabDf)
508
-}
1
+#' @title Minimum and maximum metric values plot.
2
+#' @name plotMetricsMinMax
3
+#' @aliases plotMetricsMinMax
4
+#' @description
5
+#' It plots the minimum, maximum and standard deviation
6
+#' values of the metrics in a \code{\link{SummarizedExperiment}} object.
7
+#'
8
+#' @inheritParams stability
9
+#'
10
+#' @return Nothing.
11
+#'
12
+#' @examples
13
+#' # Using example data from our package
14
+#' data("ontMetrics")
15
+#' plotMetricsMinMax(ontMetrics)
16
+#'
17
+plotMetricsMinMax <- function(data) {
18
+  data <- as.data.frame(assay(data))
19
+  # Prepare data for plotting
20
+  # Data matrix without descritive column
21
+  matrix = data.matrix(data[,-1])
22
+  maxs = matrixStats::colMaxs(matrix)
23
+  mins = matrixStats::colMins(matrix)
24
+  means = colMeans(matrix)
25
+  sd = matrixStats::colSds(matrix)
26
+
27
+  dataStats = matrix(NA, nrow=5, ncol = length(data[,-1]), byrow=TRUE,
28
+                     dimnames = list(c("Metric", "Min","Max","Mean","Sd"),
29
+                                     c(colnames(data[,-1]))))
30
+  dataStats["Metric",] = colnames(data[,-1])
31
+  dataStats["Min",] = mins
32
+  dataStats["Max",] = maxs
33
+  dataStats["Mean",] = means
34
+  dataStats["Sd",] = sd
35
+  dataStats.df = as.data.frame(dataStats)
36
+  dataStats.df.t = as.data.frame(t(dataStats.df))
37
+  # Factor to numeric conversion
38
+  dataStats.df.t$Min = as.numeric(as.character(dataStats.df.t$Min))
39
+  dataStats.df.t$Max = as.numeric(as.character(dataStats.df.t$Max))
40
+  dataStats.df.t$Mean = as.numeric(as.character(dataStats.df.t$Mean))
41
+  dataStats.df.t$Sd = as.numeric(as.character(dataStats.df.t$Sd))
42
+
43
+  ## Plotting
44
+  p <- ggplot(dataStats.df.t, aes(x=dataStats.df.t$Metric)) +
45
+    geom_linerange(aes(ymin=dataStats.df.t$Min,ymax=dataStats.df.t$Max),
46
+                   linetype=2,color="#4E84C4") +
47
+    geom_point(aes(y=dataStats.df.t$Min),size=3,color="#00AFBB") +
48
+    geom_point(aes(y=dataStats.df.t$Max),size=3,color="#FC4E07") +
49
+    theme_bw() +
50
+    theme(axis.text.x = element_text(angle = 90),
51
+          #axis.text.y = element_blank(),
52
+          text = element_text(size=15),
53
+          axis.line = element_line(colour = "black",
54
+                                   size = 1, linetype = "solid")
55
+    ) +
56
+    geom_errorbar(aes(ymin=(dataStats.df.t$Max-dataStats.df.t$Sd),
57
+                      ymax=(dataStats.df.t$Max+dataStats.df.t$Sd)), width=.2,
58
+                  position=position_dodge(.9)) +
59
+    geom_errorbar(aes(ymin=(dataStats.df.t$Min-dataStats.df.t$Sd),
60
+                      ymax=(dataStats.df.t$Min+dataStats.df.t$Sd)), width=.2,
61
+                  position=position_dodge(.9)) +
62
+    scale_y_continuous(breaks=seq(round(min(dataStats.df.t$Min-dataStats.df.t$Sd)), # 10 ticks across min - max range
63
+                                  round(max(dataStats.df.t$Max+dataStats.df.t$Sd)),
64
+                                  round((max(dataStats.df.t$Max)-min(dataStats.df.t$Min)))/10),
65
+                       labels=function(x) sprintf("%.2f", x)) + # Two decimals
66
+    labs(x = "Metrics", y = "Metric value", title = "Min/max/sd values across metrics") +
67
+    guides(fill=TRUE)
68
+
69
+  print(p)
70
+}
71
+
72
+
73
+#' @title Metric values as a boxplot.
74
+#' @name plotMetricsBoxplot
75
+#' @aliases plotMetricsBoxplot
76
+#' @description
77
+#' It plots the value of the metrics in a \code{\link{SummarizedExperiment}}
78
+#' object as a boxplot.
79
+#'
80
+#' @inheritParams stability
81
+#'
82
+#' @return Nothing.
83
+#'
84
+#' @examples
85
+#' # Using example data from our package
86
+#' data("ontMetrics")
87
+#' plotMetricsBoxplot(ontMetrics)
88
+#'
89
+plotMetricsBoxplot <- function(data) {
90
+  data <- as.data.frame(assay(data))
91
+  num_metrics_plot=20
92
+  data.metrics = data[,-1] # Removing Description column
93
+
94
+  metrics_length = length(colnames(data.metrics))
95
+  num_iterations = round(metrics_length/num_metrics_plot)
96
+  if (num_iterations > 0) {
97
+    num_iterations = num_iterations - 1
98
+  }
99
+  for (iteration in 0:num_iterations) {
100
+    i = 1
101
+    rangeStart = (iteration*num_metrics_plot)+1
102
+    rangeEnd = rangeStart+num_metrics_plot-1
103
+    if (rangeEnd > metrics_length) {
104
+      rangeEnd = metrics_length
105
+    }
106
+    suppressMessages({
107
+      data.melt = melt(data.metrics[,rangeStart:rangeEnd])
108
+    })
109
+    # Melting 1 variable (e.g: data.metrics[,11:11])
110
+    # won't create $variable column in data.melt.
111
+    if (rangeStart == rangeEnd) {
112
+      metricName = data[rangeStart, "Description"]
113
+      data.melt$variable = rep(metricName, length(data.melt$value))
114
+    }
115
+    p <- ggplot(data.melt, aes(x=data.melt$variable, y=data.melt$value)) +
116
+      geom_boxplot(
117
+        #aes(fill=data.melt$variable), # Colors
118
+        outlier.colour = "black",
119
+        outlier.alpha = 0.7,
120
+        outlier.shape = 21,
121
+        show.legend = FALSE
122
+      ) +
123
+      #scale_y_continuous(limits = quantile(data.melt$value, c(0.1, 0.9))) +
124
+      scale_color_grey() +
125
+      theme_bw() +
126
+      theme(
127
+        text = element_text(size=20),
128
+        axis.text.x = element_text(angle = 90)
129
+      ) +
130
+      labs(x = "Metrics", y="Metric value", fill="Metrics")
131
+    # compute lower and upper whiskers
132
+    #ylim1 = boxplot.stats(data.melt$value)$stats[c(1, 5)]
133
+    # scale y limits based on ylim1
134
+    #p1 = p + coord_cartesian(ylim = ylim1*1.05)
135
+    print(p)
136
+  }
137
+}
138
+
139
+#' @title Metric values clustering.
140
+#' @name plotMetricsCluster
141
+#' @aliases plotMetricsCluster
142
+#' @description
143
+#' It clusters the value of the metrics in a \code{\link{SummarizedExperiment}}
144
+#' object as a boxplot.
145
+#'
146
+#' @inheritParams stability
147
+#'
148
+#' @return Nothing.
149
+#'
150
+#' @examples
151
+#' # Using example data from our package
152
+#' data("ontMetrics")
153
+#' plotMetricsCluster(ontMetrics)
154
+#'
155
+plotMetricsCluster <- function(data) {
156
+  data <- as.data.frame(assay(data))
157
+  data.metrics = data[,-1] # Removing Description column
158
+  d <- dist(t(data.metrics), method = "euclidean") # distance matrix
159
+  fit <- hclust(d, method="ward.D2")
160
+  theme_set(theme_bw())
161
+  p <- ggdendrogram(fit, rotate = FALSE, size = 2) + # display dendogram
162
+    theme(
163
+      text = element_text(size=15)
164
+    ) +
165
+    labs(title="Metrics dendrogram")
166
+  print(p)
167
+}
168
+
169
+#' @title Metric values as violin plot.
170
+#' @name plotMetricsViolin
171
+#' @aliases plotMetricsViolin
172
+#' @description
173
+#' It plots the value of the metrics in a \code{\link{SummarizedExperiment}}
174
+#' object as a violin plot.
175
+#'
176
+#' @inheritParams stability
177
+#'
178
+#' @return Nothing.
179
+#'
180
+#' @examples
181
+#' # Using example data from our package
182
+#' data("ontMetrics")
183
+#' plotMetricsViolin(ontMetrics)
184
+#'
185
+plotMetricsViolin <- function(data) {
186
+  data <- as.data.frame(assay(data))
187
+  data.metrics = data[,-1] # Removing Description column
188
+  num_metrics_plot=20
189
+
190
+  metrics_length = length(colnames(data.metrics))
191
+  num_iterations = round(metrics_length/num_metrics_plot)
192
+  if (num_iterations > 0) {
193
+    num_iterations = num_iterations - 1
194
+  }
195
+  for (iteration in 0:num_iterations) {
196
+      i = 1
197
+      rangeStart = (iteration*num_metrics_plot)+1
198
+      rangeEnd = rangeStart+num_metrics_plot-1
199
+      if (rangeEnd > metrics_length) {
200
+        rangeEnd = metrics_length
201
+      }
202
+    suppressMessages({
203
+      data.melt = melt(data.metrics[,rangeStart:rangeEnd])
204
+    })
205
+    # Melting 1 variable (11:11), won't create $variable column in data.melt.
206
+    if (rangeStart == rangeEnd) {
207
+      metricName = data[rangeStart, "Description"]
208
+      data.melt$variable = rep(metricName, length(data.melt$value))
209
+    }
210
+    p <- ggplot(data.melt, aes(x=data.melt$variable, y=data.melt$value)) +
211
+      geom_violin(trim=FALSE) +
212
+      geom_boxplot(width=0.1) +
213
+      #scale_y_continuous(limits = quantile(data.melt$value, c(0.1, 0.9))) +
214
+      scale_color_grey() +
215
+      theme_bw() +
216
+      theme(
217
+        text = element_text(size=20),
218
+        axis.text.x = element_text(angle = 90)
219
+      ) +
220
+      labs(x = "Metrics", y="Metric value", fill="Metrics")
221
+    # compute lower and upper whiskers
222
+    #ylim1 = boxplot.stats(data.melt$value)$stats[c(1, 5)]
223
+    # scale y limits based on ylim1
224
+    #p1 = p + coord_cartesian(ylim = ylim1*1.05)
225
+    print(p)
226
+  }
227
+}
228
+
229
+
230
+#
231
+# It returns true if value is in range (0.5, 0.7]
232
+#
233
+isReasonable <- function(value) {
234
+  return(value > 0.5 && value <= 0.7)
235
+}
236
+
237
+getLargestSilWidth <- function(qualityDf, metric, k1, k2) {
238
+  k1KSil = qualityDf[metric, k1]
239
+  k2KSil = qualityDf[metric, k2]
240
+  k = NULL
241
+  if (k1KSil >= k2KSil) {
242
+    k = k1
243
+  } else {
244
+    k = k2
245
+  }
246
+  return(getFormattedK(k))
247
+}
248
+
249
+#
250
+# It transform a string 'k_X' into 'X'.
251
+# For instace, input is 'k_4', output is '4'
252
+#
253
+getFormattedK <- function(k) {
254
+  return(gsub("^.*_","", k))
255
+}
256
+
257
+#' @title Calculating the optimal value of k.
258
+#' getOptimalKValue
259
+#' @aliases getOptimalKValue
260
+#' @description
261
+#' This method finds the optimal value of K per each metric.
262
+#'
263
+#' @param stabData An output \code{\link{ExperimentList}} from
264
+#' a \code{\link{stabilityRange}} execution.
265
+#'
266
+#' @param qualData An output \code{\link{SummarizedExperiment}} from
267
+#' a \code{\link{qualityRange}} execution.
268
+#'
269
+#' @param k.range A range of K values to limit the scope of the
270
+#' analysis.
271
+#'
272
+#' @return It returns a dataframe following the schema:
273
+#' \code{metric}, \code{optimal_k}.
274
+#'
275
+#' @examples
276
+#' # Using example data from our package
277
+#' data("rnaMetrics")
278
+#' stabilityData <- stabilityRange(data=rnaMetrics, k.range=c(2,4), bs=20, getImages = FALSE)
279
+#' qualityData <- qualityRange(data=rnaMetrics, k.range=c(2,4), getImages = FALSE)
280
+#' kOptTable = getOptimalKValue(stabilityData, qualityData)
281
+#'
282
+getOptimalKValue <- function(stabData, qualData, k.range=NULL) {
283
+  checkStabilityQualityData(stabData, qualData)
284
+
285
+  if (!is.null(k.range)) {
286
+    k.range.length = length(k.range)
287
+    if (k.range.length != 2) {
288
+      stop("k.range length must be 2")
289
+    }
290
+    k.min = k.range[1]
291
+    k.max = k.range[2]
292
+    checkKValue(k.min)
293
+    checkKValue(k.max)
294
+    if (k.max < k.min) {
295
+      stop("The first value of k.range cannot be greater than its second value")
296
+    } else if (k.min == k.max) {
297
+      stop("Range start point and end point are equals")
298
+    }
299
+  }
300
+
301
+  stabDf = standardizeStabilityData(stabData, k.range)
302
+  qualDf = standardizeQualityData(qualData, k.range)
303
+
304
+  metrics = rownames(stabDf)
305
+  STABLE_CLASS = 0.75
306
+
307
+  outputTable = as.data.frame(metrics)
308
+  rownames(outputTable) = metrics
309
+  outputTable = outputTable[, -1]
310
+  optimalKs = list()
311
+  stabMaxKs = list() # List of maximum K for the stability of metric X
312
+  stabMaxKsStability = list() # Stability of the current K in stabMaxKs
313
+  stabMaxKsQuality = list() # Quality of the current K in stabMaxKs
314
+  qualMaxKs = list() # List of maximum K for the quality of metric X
315
+  qualMaxKsStability = list() # Stability of the current K in qualMaxKs
316
+  qualMaxKsQuality = list() # Quality of the current K in qualMaxKs
317
+
318
+  for (metric in metrics) {
319
+    cat("Processing metric: ", metric, "\n")
320
+    stabMaxK = colnames(stabDf[metric, ])[apply(stabDf[metric, ],1,which.max)] # k1
321
+    stabMaxKFormatted = getFormattedK(stabMaxK)
322
+    stabMaxVal = stabDf[metric, stabMaxK]
323
+    qualMaxK = colnames(qualDf[metric, ])[apply(qualDf[metric, ],1,which.max)] # k2
324
+    qualMaxVal = qualDf[metric, qualMaxK]
325
+    qualMaxKFormatted = getFormattedK(qualMaxK)
326
+    ## Info for output table
327
+    stabMaxKs = append(stabMaxKs, stabMaxKFormatted)
328
+    stabMaxKsStability = append(stabMaxKsStability, stabDf[metric, stabMaxK]);
329
+    stabMaxKsQuality = append(stabMaxKsQuality, qualDf[metric, stabMaxK]);
330
+
331
+    qualMaxKs = append(qualMaxKs, qualMaxKFormatted)
332
+    qualMaxKsStability = append(qualMaxKsStability, stabDf[metric, qualMaxK]);
333
+    qualMaxKsQuality = append(qualMaxKsQuality, qualDf[metric, qualMaxK]);
334
+    ##
335
+    # CASE 1
336
+    if (length(stabMaxK) >= 1 && length(qualMaxK) >= 1 && identical(stabMaxK, qualMaxK)) {
337
+      k = stabMaxKFormatted
338
+      cat("\tMaximum stability and quality values matches the same K value: '", k ,"'\n")
339
+      optimalKs = append(optimalKs, k)
340
+      # CASE 2
341
+    } else if ((stabMaxVal >= STABLE_CLASS && stabDf[metric, qualMaxK] >= STABLE_CLASS) ||
342
+               (stabMaxVal < STABLE_CLASS && stabDf[metric, qualMaxK] < STABLE_CLASS)) {
343
+      if (stabMaxVal >= STABLE_CLASS && stabDf[metric, qualMaxK] >= STABLE_CLASS) {
344
+        cat("\tBoth Ks have a stable classification: '",
345
+            stabMaxKFormatted, "', '", qualMaxKFormatted ,"'\n")
346
+      } else {
347
+        cat("\tBoth Ks does not have a stable classification: '",
348
+            stabMaxKFormatted, "', '", qualMaxKFormatted ,"'\n")
349
+      }
350
+      k = getLargestSilWidth(qualDf, metric, stabMaxK, qualMaxK)
351
+      cat("\tUsing '", k, "' since it provides higher silhouette width\n")
352
+      optimalKs = append(optimalKs, k)
353
+      # CASE 3
354
+    } else if (stabMaxVal >= STABLE_CLASS && stabDf[metric, qualMaxK] < STABLE_CLASS) {
355
+      cat("\tStability k '", stabMaxKFormatted, "' is stable but quality k '",
356
+          qualMaxKFormatted,"' is not\n")
357
+      kSil = qualDf[metric, stabMaxK]
358
+      if (isReasonable(kSil)) {
359
+        cat("\tUsing ", stabMaxKFormatted, " since its silhouette width is at least reasonable\n")
360
+        optimalKs = append(optimalKs, stabMaxKFormatted)
361
+      } else {
362
+        k = getLargestSilWidth(qualDf, metric, stabMaxK, qualMaxK)
363
+        cat("\tUsing '", k, "' since it provides higher silhouette width\n")
364
+        optimalKs = append(optimalKs, k)
365
+      }
366
+      # CASE 4
367
+    } else if (stabMaxVal < STABLE_CLASS && stabDf[metric, qualMaxK] >= STABLE_CLASS) {
368
+      cat("\t Quality k '", qualMaxKFormatted, "' is stable but stability k '",
369
+          stabMaxKFormatted,"' is not\n")
370
+      kSil = qualDf[metric, qualMaxK]
371
+      if (isReasonable(kSil)) {
372
+        cat("\tUsing ", qualMaxKFormatted, " since its silhouette width is at least reasonable\n")
373
+        optimalKs = append(optimalKs, stabMaxKFormatted)
374
+      } else {
375
+        k = getLargestSilWidth(qualDf, metric, stabMaxK, qualMaxK)
376
+        cat("\tUsing '", k, "' since it provides higher silhouette width\n")
377
+        optimalKs = append(optimalKs, k)
378
+      }
379
+    } else { # This should not happen but it might come in handy to check errors
380
+      optimalKs = append(optimalKs, -1)
381
+    }
382
+  }
383
+
384
+  outputTable["Stability_max_k"] = unlist(stabMaxKs)
385
+  outputTable["Stability_max_k_stab"] = unlist(stabMaxKsStability)
386
+  outputTable["Stability_max_k_qual"] = unlist(stabMaxKsQuality)
387
+
388
+  outputTable["Quality_max_k"] = unlist(qualMaxKs)
389
+  outputTable["Quality_max_k_stab"] = unlist(qualMaxKsStability)
390
+  outputTable["Quality_max_k_qual"] = unlist(qualMaxKsQuality)
391
+
392
+  outputTable["Global_optimal_k"] = unlist(optimalKs)
393
+
394
+  return(outputTable)
395
+}
396
+
397
+checkStabilityQualityData <- function(stabData, qualData) {
398
+  stabDf = assay(stabData, "stability_mean")
399
+  lengthStabDf = length(colnames(stabDf[,-1]))
400
+  stabRangeStart = gsub("^.*_.*_.*_","", colnames(stabDf[,-1])[1]) # Mean_stability_k_2 -> 2
401
+  stabRangeEnd = gsub("^.*_.*_.*_","", colnames(stabDf[,-1])[lengthStabDf])
402
+  lengthQual = length(qualData)
403
+  namesQual = names(qualData)
404
+  qualRangeStart = getFormattedK(namesQual[1]) # k_2 -> 2
405
+  qualRangeEnd = getFormattedK(namesQual[lengthQual])
406
+  if (stabRangeStart != qualRangeStart || stabRangeEnd != qualRangeEnd) {
407
+    stop("Stability data and quality data have different k ranges")
408
+  }
409
+  stabMetricsList = as.character(stabDf[,"Metric"])
410
+  qualMetricsList = as.character(
411
+    assay(getDataQualityRange(qualData, as.numeric(qualRangeStart)))[,"Metric"]
412
+  )
413
+  if (!identical(stabMetricsList, qualMetricsList)) {
414
+    stop("Stability data and quality data have different metrics")
415
+  }
416
+}
417
+
418
+#
419
+# It transforms the output of qualityRange method
420
+# into a dataframe like this:
421
+# (rownames)       k_2       k_3       k_4
422
+# DegFact          0.6171262 0.6278294 0.4882649