############################################################################### # functions to read the read counts ############################################################################### BedToGenomicRanges <- function(panelBedFilepath, ampliconColumn, split = "_", doReduce = TRUE, rangeExtend = 0, dropChromossomes = NA, skip = 1) { #load the bed file segments <- read.table(panelBedFilepath, sep = "\t", as.is = TRUE, skip = skip) gr <- GRanges(segments[, 1], IRanges(segments[, 2], segments[, 3])) #make sure that all regions are more than one read apart GenomicRanges::start(gr) <- GenomicRanges::start(gr) - rangeExtend GenomicRanges::end(gr) <- GenomicRanges::end(gr) + rangeExtend if(doReduce) { reducedGr <- GenomicRanges::reduce(gr, ignore.strand = TRUE) #get the genes for each reduced range hitsAnnotation <- GenomicRanges::findOverlaps(reducedGr, gr, select = "first", ignore.strand=TRUE) #get the amplicon names from the segments amplicons = segments[hitsAnnotation, ampliconColumn] #replace the original gr with the reduced gr gr <- reducedGr } else{ amplicons = segments[ , ampliconColumn] } #get the genes from segments splitted = strsplit(amplicons, split = split) # Required because of package checking complaints i <- NULL genes = foreach(i = seq_along(splitted)) %do% { splitted[[i]][1] } genes = unlist(genes) # missing the dollar symbol before text because of groovy script elementMetadata(gr)["geneNames"] = genes elementMetadata(gr)["ampliconNames"] = paste(seqnames(gr), start(gr), end(gr), sep = "_", amplicons) if (!is.na(dropChromossomes)) { gr <- dropSeqlevels(gr, dropChromossomes) } return(gr) } #load the files you want to analyze ReadCountsFromBam <- function(bamFilenames, sampleNames, gr, ampliconNames, minimumMappingQuality = 20, removeDup = FALSE) { if (missing(sampleNames)) { sampleNames = bamFilenames } if (missing(ampliconNames)) { ampliconNames = elementMetadata(gr)$ampliconNames } bamIndexFilepaths <- BamIndexFilepaths(bamFilenames) # Because of package check complaints i <- NULL curbam = foreach(i = seq_along(bamFilenames), .combine = cbind) %do% { if (!file.exists(bamIndexFilepaths[i])) { message(paste("Bai file not found. Creating bai file for", bamFilenames[i], "...")) IndexMultipleBams(bamFilenames) } message(paste("Reading counts for bam file: ", bamFilenames[i])) countBamInGRanges(bamFilenames[i], gr, remove.dup = removeDup, min.mapq = minimumMappingQuality, get.width = TRUE) } listOfBamsHasOnlyOneElement <- length(bamFilenames) == 1 if (listOfBamsHasOnlyOneElement) { curbam <- matrix(curbam) } colnames(curbam) = sampleNames rownames(curbam) = ampliconNames return(curbam) } #get the combined counts CombinedNormalizedCounts <- function (sampleCounts, referenceCounts, method = "tmm", ampliconNames = NULL) { sampleCounts <- as.matrix(sampleCounts) referenceCounts <- as.matrix(referenceCounts) allCounts <- cbind(sampleCounts, referenceCounts) classes <- rep(c("samples", "reference"), c(ncol(sampleCounts), ncol(referenceCounts))) bamDataRangesNorm <- NormalizeCounts(allCounts, method = method) # bamDataRangesNorm <- NormalizeCounts(allCounts) if (!is.null(ampliconNames)) { rownames(bamDataRangesNorm) = ampliconNames } normalizedSamples <- bamDataRangesNorm[, which(classes == "samples"), drop = FALSE] normalizedReference <- bamDataRangesNorm[, which(classes == "reference"), drop = FALSE] return(list(samples = normalizedSamples, reference = normalizedReference)) } ############################################################################### # helper functions ############################################################################### VerifiyIfOutputDirectoryExistsOrIsNotEmpty <- function(outputDir) { outputDirectoryExists <- file.exists(outputDir) outputDirectoryIsNotEmpty <- length(dir(outputDir, all.files=TRUE)) != 0 if (outputDirectoryExists || outputDirectoryIsNotEmpty) { stop("Output directory already exists or is not empty. Please delete or change output directory") } } tss <- function(allCounts) { return(apply(allCounts, 2, function(x) {x / sum(x)})) } NormalizeCounts <- function(allCounts, method = "tmm") { # TODO check if normalization method is supported allowedNormalizationMethods <- c("tmm", # trimmed mean of M-values "tss") # total sum scaling tinyValueToAvoidZeroReadCounts <- 1e-05 bamDataRangesNorm <- NULL if (method == "tmm") { bamDataRangesNorm <- tmm(allCounts, refColumn = NULL, k = tinyValueToAvoidZeroReadCounts) } else if (method == "tss") { # bamDataRangesNorm <- apply(allCounts, 2, function(x) {x / sum(x)}) bamDataRangesNorm <- tss(allCounts) } else { message(paste("method", method, "not included in the allowed Normalization methods", allowedNormalizationMethods)) } return(bamDataRangesNorm) } #get the positions for each gene as a list IndexGenesPositions = function(genes) { positions = seq_along(genes) genesPos = split(positions, genes) return(genesPos) } # #how many numbers in a vector are above a given cutoff # PercentAboveValue <- function(vector, value) { # sum(vector > value)/length(vector) # } # # #how many numbers in a vector are below a given cutoff # PercentBelowValue <- function(vector, value) { # sum(vector < value)/length(vector) # } BamIndexFilepaths <- function(bamFilepaths, indexType = ".bam.bai", ignoreCase = FALSE) { return (gsub(".bam$", bamFilepaths, replacement = indexType, ignore.case = ignoreCase)) } # check this params with Thomas #index the bam files if there is no index yet IndexMultipleBams <- function(bams, index_type = ".bam.bai") { #check if the index already exists and need to be indexed # potentialBaiFilenames <- gsub(".bam$", # bams, # replacement = index_type, # ignore.case = TRUE) potentialBaiFilenames <- BamIndexFilepaths(bams, indexType = index_type) print(paste("potentialBaiFilenames", potentialBaiFilenames)) bamsToBeIndexed <- bams[!sapply(potentialBaiFilenames, file.exists)] if(length(bamsToBeIndexed) > 0) { #index the bams #if(multicore) { # check with Thomas if we can replace by this.. # bplapply(bamsToBeIndexed, indexBam) #mclapply(bamsToBeIndexed, indexBam, mc.cores = ncores) #} else { lapply(bamsToBeIndexed, indexBam) #} } } WriteListToXLSX <- function(listOfDataFrames, multipleFiles = FALSE, outputFolder = file.path(getwd(), "xlsx"), filepath = "list.xlsx") { if (multipleFiles) { dir.create(outputFolder, recursive = TRUE, showWarnings = FALSE) for(name in names(listOfDataFrames)){ message(paste0("Saving file to '", file.path(outputFolder, paste0(name, ".xlsx")))) write.xlsx(listOfDataFrames[[name]], file = file.path(outputFolder, paste0(name, ".xlsx")), rowNames = TRUE) } } else { message(paste0("Saving file to '", filepath, "'")) write.xlsx(listOfDataFrames, file = filepath, rowNames = TRUE) } } ReadXLSXToList <- function(filepath, rowNames = TRUE, colNames = TRUE) { listOfDataFrames <- list() for(name in getSheetNames(filepath)) { listOfDataFrames[[name]] <- read.xlsx(filepath, rowNames = rowNames, colNames = colNames, sheet = name) } return(listOfDataFrames) } ############################################################################### # functions for bootstrapping ############################################################################### adjustedLength <- function(a) { round(sqrt(length(a))) } ratiosMean <- function(ratios) { return(exp(mean(log(ratios)))) } #a function that randomly samples positions from a vector using subsampling SubsamplingPositions <- function(pos, mtry = adjustedLength(pos)) { return(sample(pos, mtry, replace = FALSE)) } #calculates the mean from a vector given specific positions PositionMean <- function(position, vector) { return(ratiosMean(vector[position])) } #calculate the mean for each gene GenePositionMean <- function(genesPos, vector) { means = sapply(genesPos, PositionMean, vector = vector) names(means) = names(genesPos) return(means) } #calculates the mean from a vector given specific positions for a matrix GeneMeanRatioMatrix <- function(genesPos, ratioMatrix) { # Package check complaints i <- NULL ratioList = foreach(i = 1:ncol(ratioMatrix), .combine = cbind) %do% { GenePositionMean(genesPos,ratioMatrix[, i]) } ratioList <- as.matrix(ratioList) colnames(ratioList) <- colnames(ratioMatrix) return(ratioList) } # ReferenceWeights <- function(refmat, varianceFunction = sd) { # variance = apply(refmat, 2, varianceFunction) # weights = 1/variance # return(weights) # } # split into a function generating the bootstrap distribution # and a function averaging over amplicons. # define the function to generate bootstrap distributions and # return a list with the genewise bootstrapping BootList <- function(geneNames, sampleMatrix, refmat, replicates) { enforceDeterministicResult() # get the genes positions in the matrix as a list from a gene name vector genesPos <- IndexGenesPositions(geneNames) # if (class(sampleMatrix) != "matrix") { if (!("matrix" %in% class(sampleMatrix))) { stop(paste("Parameter 'sampleMatrix' has to be of class matrix and is of class", class(sampleMatrix))) } if (is.null(colnames(sampleMatrix)) | length(colnames(sampleMatrix))!=ncol(sampleMatrix)) { stop("All columns of 'sampleMatrix' have to be named") } # a vector and matrix are not the same and for a vector iterating over # the column makes no sense so we have # to check if a matrix or a vector was passed. ncol only works for matrix # not for vector # if (class(sampleMatrix) == "matrix") { if ("matrix" %in% class(sampleMatrix)) { iterator <- 1:ncol(sampleMatrix) # } else { # iterator <- 1 } i <- NULL j <- NULL bootListSamples <- foreach(i = iterator) %:% foreach(j = rep(1, replicates), .combine = rbind) %do% { # a vector and matrix are not the same and for a vector # iterating over the column makes no sense so we have to check # if a mtrix or a vector was passed. # if (class(sampleMatrix) == "matrix") { if ("matrix" %in% class(sampleMatrix)) { testSample <- sampleMatrix[, i] # } else { # testSample <- sampleMatrix } # for each gene subsample the amplicon positions independently # sample the samples using bootstrapping sampleBootPos <- sample(1:ncol(refmat), ncol(refmat), replace = TRUE) geneBootPos <- c(lapply(genesPos, SubsamplingPositions), recursive = TRUE) # given the obtained sampling using the bootstraps calculated refMatPos <- refmat[geneBootPos, sampleBootPos, drop=FALSE] bootRatio <- testSample[geneBootPos]/rowMeans(refMatPos) # after the bootstrapping the gene positions in the vector # changes so recalculate them splitClass <- rep(names(genesPos), sapply(genesPos, adjustedLength)) newGenesPos <- split(seq_along(splitClass), splitClass) sapply(newGenesPos, PositionMean, vector = bootRatio) } names(bootListSamples) <- basename(colnames(sampleMatrix)) # In case a single step of bootstrap is done (for testing purposes), # the elements of each sample will be converted to a matrix for (i in seq_along(bootListSamples)) { bootListSamples[[i]] <- matrix(bootListSamples[[i]], ncol = length(names(genesPos))) colnames(bootListSamples[[i]]) <- names(genesPos) } return(bootListSamples) } ConsensusCheck <- function(genomicRangesFromBed, sampleMatrix, referenceMatrix) { # sampleMatrix <- samplesNormalizedReadCounts # referenceMatrix <- referenceNormalizedReadCounts # View(sampleMatrix[, c(1,2)]) # View(apply(referenceMatrix, 1, median)) # ConsensusCheck(genomicRangesFromBed, results_1PGM_sequencingLibraries1@sampleReadCounts, results_1PGM_sequencingLibraries1@referenceReadColunts), # sampleMatrix <- results_1PGM_sequencingLibraries1@sampleReadCounts # referenceMatrix <- results_1PGM_sequencingLibraries1@referenceReadColunts # This should not reach here.. but just in case.. colnames(sampleMatrix) <- basename(colnames(sampleMatrix)) colnames(referenceMatrix) <- basename(colnames(referenceMatrix)) geneNames <- genomicRangesFromBed$geneNames allSampleRatios <- data.frame() for (smpl in colnames(sampleMatrix)) { # smpl <- colnames(sampleMatrix)[which(grepl("28208", colnames(sampleMatrix)))] ampliconsRatio <- sampleMatrix[, smpl]/apply(referenceMatrix, 1, median) allSampleRatios <- rbind(allSampleRatios, ampliconsRatio) } allSampleRatios <- t(allSampleRatios) rownames(allSampleRatios) <- rownames(sampleMatrix) colnames(allSampleRatios) <- colnames(sampleMatrix) genesPositions <- IndexGenesPositions(geneNames) geneConsensus <- data.frame(matrix(nrow=length(genesPositions), ncol=ncol(sampleMatrix))) geneConsensus <- data.frame() for (smpl in colnames(allSampleRatios)) { for (gene in names(genesPositions)) { geneAmpliconsRatios <- allSampleRatios[,smpl][genesPositions[[gene]]] ampliconConsensus <- all(geneAmpliconsRatios>1) | all(geneAmpliconsRatios<1) geneConsensus[gene, smpl] <- ampliconConsensus } } # TODO case the reference has amplicons with 0 read counts, it returns NA instead of boolean geneConsensus[is.na(geneConsensus)] <- FALSE return(geneConsensus) } HaveMininumNumberOfAmplicons <- function(genomicRangesFromBed, numberOfAmplicons) { geneNames <- genomicRangesFromBed$geneNames genesPositions <- IndexGenesPositions(geneNames) sapply(genesPositions, function(x) {length(x)>=numberOfAmplicons}) } #calculate significance CheckSignificance <- function(bootList, significanceLevel = 0.05) { # Bonferroni correction significanceLevel <- significanceLevel/ncol(bootList[[1]]) margin <- significanceLevel/2 # package check complains i <- NULL j <- NULL sigTables = foreach(i = seq_along(bootList)) %:% foreach(j = 1:ncol(bootList[[i]]), .combine = rbind) %do% { bootRatioDistribution <- bootList[[i]][,j] # meanNoise <- exp(mean(log(bootRatioDistribution))) meanNoise <- ratiosMean(bootRatioDistribution) lowerBound <- quantile(bootRatioDistribution, margin, type = 1) upperBound <- quantile(bootRatioDistribution, 1 - margin, type = 1) bounds <- c(lowerBound, meanNoise, upperBound) roundedBounds <- round(bounds, digits = 2) maybeAmplification <- roundedBounds[1] > 1 maybeDeletion <- roundedBounds[3] < 1 roundedSignificant <- maybeAmplification | maybeDeletion copyNumberPutativeStatus <- "Normal" if(maybeAmplification) { copyNumberPutativeStatus <- "Amplification" } else { if (maybeDeletion) { copyNumberPutativeStatus <- "Deletion" } } sigTable <- c(roundedBounds, roundedSignificant, copyNumberPutativeStatus) names(sigTable) <- c("lowerBound", "mean", "upperBound", "isSig", "putativeStatus") sigTable } names(sigTables) <- names(bootList) # name the genes in the list. All samples should share the same gene names geneNames <- colnames(bootList[[1]]) for (i in seq_along(sigTables)) { rownames(sigTables[[i]]) <- geneNames } return(sigTables) } #SignificantGenes <- function(sigList, genesPositionsIndex) { # # package check complains # i <- NULL # sigGenes = foreach(i = seq_along(sigList)) %do% { # names(genesPositionsIndex)[sigList[[i]][, "isSig"] == TRUE] # } # return(sigGenes) #} # ############################################################################# # # functions for background noise estimation # ############################################################################# NonSignificantGeneIndex <- function(sigList, genesPositionsIndex) { # package check complains i <- NULL genePosNonSig = foreach(i = seq_along(sigList)) %do% { sigGenes = names(genesPositionsIndex)[sigList[[i]][, "isSig"] == TRUE] selectedIndex <- RemSigGenes(genesPositionsIndex, sigGenes) if (length(selectedIndex) == 0) { genesPositionsIndex } else { selectedIndex } } names(genePosNonSig) <- names(sigList) return(genePosNonSig) } AmplProbMultipeSamples <- function(genePosNonSig) { # package check complains i <- NULL amplWeights = foreach(i = seq_along(genePosNonSig)) %do% { AmplProb(genePosNonSig[[i]]) } names(amplWeights) <- names(genePosNonSig) return(amplWeights) } enforceDeterministicResult <- function() { set.seed(1) } Background <- function(geneNames, samplesNormalizedReadCounts, referenceNormalizedReadCounts, bootList, replicates = 1000, significanceLevel = 0.05, robust = FALSE) { enforceDeterministicResult() #which genes showed significant changes sigList <- CheckSignificance(bootList) # gene index genesPositionsIndex <- IndexGenesPositions(geneNames) # calculate the reference mean #refMean <- apply(referenceNormalizedReadCounts, 1, mean) refMean <- rowMeans(referenceNormalizedReadCounts) # calculate the ratio matrix for each sample ratioMatrix <- RatioMatrix(samplesNormalizedReadCounts, refMean) # remove the significant genes from the noise estimation genesPosNonSig <- NonSignificantGeneIndex(sigList, genesPositionsIndex) # calculate the weight for each amplicon of non significant changed genes amplWeights <- AmplProbMultipeSamples(genesPosNonSig) uniqueAmpliconNumbers <- NumberOfUniqueAmplicons(genesPositionsIndex) sampleIndex <- NULL backgroundObject <- foreach(sampleIndex = seq_along(genesPosNonSig)) %do% { message(paste0("Calculating Background for ", colnames(samplesNormalizedReadCounts)[sampleIndex])) nonSigAmpliconRatios <- ratioMatrix[unlist(genesPosNonSig[[sampleIndex]]), sampleIndex] IterateAmplNum(uniqueAmpliconNumbers, nonSigAmpliconRatios, replicates = replicates, probs = amplWeights[[sampleIndex]], significanceLevel = significanceLevel, robust = robust) } return(backgroundObject) } roundDf <- function(x, digits) { # round all numeric variables # x: data frame # digits: number of digits to round # numeric_columns <- sapply(x, mode) == 'numeric' numeric_columns <- sapply(x, is.numeric) x[numeric_columns] <- round(x[numeric_columns], digits) x } # Helper function to add column to reportTables ReliableStatus <- function(putative, numberOfThresholdsPassed) { status <- as.vector(putative) # print(status) if (numberOfThresholdsPassed < 2) { status <- "Normal" } return(status) } ReportTables <- function(geneNames, samplesNormalizedReadCounts, referenceNormalizedReadCounts, bootList, backgroundNoise) { # get the background noise in a format that can be used # for a report table backgroundReport <- BackgroundReport(backgroundNoise, geneNames) # gene index genesPositionsIndex <- IndexGenesPositions(geneNames) # calculate the reference mean #refMean <- apply(referenceNormalizedReadCounts, 1, mean) refMean <- rowMeans(referenceNormalizedReadCounts) # calculate the ratio matrix for each sample ratioMatrix <- RatioMatrix(samplesNormalizedReadCounts, refMean) # calculate the genewise ratio matrix from the ratio_mat ratioMatGene <- GeneMeanRatioMatrix(genesPositionsIndex, ratioMatrix) sigList <- CheckSignificance(bootList) # because package generaton complains.. i <- NULL reportTables <- foreach(i = seq_along(backgroundReport)) %do% { #isSig <- (sigList[[i]][, "upperBound"] < 1) | (sigList[[i]][, "lowerBound"] > 1) putativeStatus <- sigList[[i]][, "putativeStatus"] isSig <- putativeStatus != "Normal" backgroundUp <- backgroundReport[[i]][, "UpperNoise"] backgroundDown <- backgroundReport[[i]][, "LowerNoise"] rMatGene <- ratioMatGene[, i] # TODO is this correct?! why compare rMatGene > 1 ? aboveNoise <- (rMatGene > 1 & sigList[[i]][, "lowerBound"] > backgroundUp) | (rMatGene < 1 & sigList[[i]][, "upperBound"] < backgroundDown) dfTemp <- data.frame(meanRatio = ratioMatGene[, i], lowerBoundBootstrapRatio = sigList[[i]][, "lowerBound"], meanBootstrapRatio = sigList[[i]][, "mean"], upperBoundBootstrapRatio = sigList[[i]][, "upperBound"], # lowerNoise = backgroundReport[[i]][, 1], # meanNoise = backgroundReport[[i]][, 2], # upperNoise = backgroundReport[[i]][, 3], lowerNoise = backgroundReport[[i]][, "LowerNoise"], meanNoise = backgroundReport[[i]][, "MeanNoise"], upperNoise = backgroundReport[[i]][, "UpperNoise"], significant = isSig, aboveNoise = aboveNoise, amplNum = as.vector(table(geneNames)), putativeStatus = putativeStatus, passed = isSig + aboveNoise) significativeNumbers <- 2 dfTemp <- roundDf(dfTemp, significativeNumbers) names(dfTemp) <- c("MeanRatio", "LowerBoundBoot", "MeanBoot", "UpperBoundBoot", "LowerNoise", "MeanNoise", "UpperNoise", "Signif.", "AboveNoise", "Amplicons", "PutativeStatus", "Passed") dfTemp } # TODO Check if this column added is correct for(i in 1:length(reportTables)) { tmpReportTable <- reportTables[[i]] # tmpReportTable <- reportTables[[1]] status <- mapply(ReliableStatus, tmpReportTable$PutativeStatus, tmpReportTable$Passed) # print(i) reportTables[[i]] <- cbind(reportTables[[i]], "ReliableStatus" = status) } # names(reportTables) <- colnames(samplesNormalizedReadCounts) names(reportTables) <- names(bootList) return(reportTables) } BackgroundReport <- function(background, geneNames) { # because package generation complains.. i <- NULL j <- NULL backgroundReport = foreach(i = seq_along(background)) %:% foreach(j = table(geneNames), .combine = rbind) %do% { background[[i]][[as.character(j)]] } } #remove genes that were considered significant by the algorithm RemSigGenes <- function(genesPos, sigGenes) { #if some of the genes were reported with a significantly different read count #should not be used for #background testing #which genes are in the genes pos object geneNames= names(genesPos) #define a function for not in `%ni%` = Negate(`%in%`) #which of these genes did not show a significant read count change nonSigGenes = which(geneNames %ni% sigGenes) #keep only the genes that did not show a significant read coount change nonSigGenesPos = genesPos[nonSigGenes] return(nonSigGenesPos) } AmplProb <- function(genesPos) { #how many amplicons where used for each ofhte genes geneCounts = countAmplicons(genesPos) #adjust the probablity for depending on the number of amplicons for each gene genePerc = 1/geneCounts #this information has to be available for each stable position ampliconProb = rep(genePerc, geneCounts) return(ampliconProb) } RatioMatrix <- function(sampleMat, refMean) { apply(sampleMat, 2, `/`, refMean) } SampleRatio <- function(ratios, numAmpl, amplWeights = NULL) { replace <- FALSE if(numAmpl >= length(ratios)) { replace <- TRUE } randomPos = sample(seq_along(ratios), numAmpl, prob = amplWeights, replace = replace) randomlySelectedRatios = ratios[randomPos] # randomMean = exp(mean(log(randomlySelectedRatios))) randomMean <- ratiosMean(randomlySelectedRatios) return(randomMean) } DescriptiveStatistics <- function(distribution, robust = FALSE) { # print(paste("Robust is ", robust)) if (robust) { centralTendency <- median(distribution) variability <- mad(distribution, constant = 1) # median absolute deviation } else { centralTendency <- mean(distribution) variability <- sd(distribution) } return(list(centralTendency = centralTendency, variability = variability)) } SampleNoiseGenes <- function(numAmpl = 2, ratios, replicates = 100, probs = NULL, significanceLevel = 0.05, robust = FALSE) { margin <- significanceLevel / 2 # now repeat the sampling for the selected number of # amplicons replicates sampleNoiseDistribution = replicate(replicates, SampleRatio(ratios, numAmpl, amplWeights = probs)) #get the upper, mean and lower values of the noise distribution logSampleNoiseDistribution <- log(sampleNoiseDistribution) ds <- DescriptiveStatistics(logSampleNoiseDistribution, robust) logMeanNoise <- ds$centralTendency logSdNoise <- ds$variability # if (robust) { # logMeanNoise <- median(logSampleNoiseDistribution) # logSdNoise <- mad(logSampleNoiseDistribution, constant = 1) # } else { # logMeanNoise <- mean(logSampleNoiseDistribution) # logSdNoise <- sd(logSampleNoiseDistribution) # } lowerBound <- exp(logMeanNoise + qnorm(margin) * logSdNoise) meanNoise <- exp(logMeanNoise) upperBound <- exp(logMeanNoise + qnorm(1 - margin) * logSdNoise) sampledNoise = c(lowerBound, meanNoise, upperBound) names(sampledNoise) = paste0(c("Lower", "Mean", "Upper"), "Noise") return(sampledNoise) } countAmplicons <- function(x) { return(sapply(x, NROW)) } # Returns a list with the unique number of amplicons for all genes NumberOfUniqueAmplicons <- function(genesPos) { # calcUniqueAmpliconNumbers <-function(genesPos) { ampliconNumbers = countAmplicons(genesPos) uniqueAmpliconNumbers = sort(unique(ampliconNumbers)) return(uniqueAmpliconNumbers) } #now calculate the background for each of the unique amplicon numbers # ratios: the IterateAmplNum <- function(uniqueAmpliconNumbers, ratios, replicates = 100, probs = NULL, significanceLevel = 0.05, robust = FALSE) { # Needed because of package check complaints.. i <- NULL noiseResults = foreach(i = seq_along(uniqueAmpliconNumbers)) %do% { sampledNoise = SampleNoiseGenes(uniqueAmpliconNumbers[i], ratios = ratios, replicates = replicates, probs = probs, significanceLevel = significanceLevel, robust = robust) sampledNoise } names(noiseResults) = as.character(uniqueAmpliconNumbers) return(noiseResults) } SelectReferenceSetByPercentil <- function(allSamplesReadCounts, normalizationMethod = "tmm", lowerBoundPercentage = 1, upperBoundPercentage = 99) { allSamplesReadCountsNormalized <-NormalizeCounts(allSamplesReadCounts, method = normalizationMethod) selectedSamplesFilepath <- NULL allSamples <- round(allSamplesReadCountsNormalized) samplesColnames <- colnames(allSamples) selectedRange <- list() for (i in 1:nrow(allSamples)) { # selectedRange[[i]] <- quantile(allSamples[i,]) selectedRange[[i]] <- quantile(allSamples[i,], c(lowerBoundPercentage, upperBoundPercentage) / 100) } selectedSamplesIndex <- NULL excludedSamplesIndex <- NULL for (j in 1:ncol(allSamples)) { addSample <- TRUE for(i in 1:nrow(allSamples)) { if ((allSamples[i, j] < selectedRange[[i]][1]) | (allSamples[i, j] > selectedRange[[i]][2])) { addSample <- FALSE # print(paste("Add sample?", addSample, i, selectedRange[[i]][1], allSamples[i, j], selectedRange[[i]][2])) break } } if (addSample) { selectedSamplesIndex <- cbind(selectedSamplesIndex, j) } else { excludedSamplesIndex <- cbind(excludedSamplesIndex, j) } } selectedSamples <- allSamples[, selectedSamplesIndex] excludedSamples <- allSamples[, -excludedSamplesIndex] selectedSamplesFilepath <- colnames(selectedSamples) return(selectedSamplesFilepath) } SelectReferenceSetByInterquartileRange <- function(allSamplesReadCounts, normalizationMethod = "tmm", iqrFactor = 1) { allSamplesReadCounts <- NormalizeCounts(allSamplesReadCounts, method = normalizationMethod) fivenumReads <- t(apply(allSamplesReadCounts, 1, fivenum)) colnames(fivenumReads) <- c("SampleMinimum", "LowerQuartile", "Median", "UpperQuartile", "SampleMaximum") iqr <- fivenumReads[, "UpperQuartile"] - fivenumReads[, "LowerQuartile"] dispersion <- iqrFactor * iqr sampleIndexes <- NULL for (i in 1:ncol(allSamplesReadCounts)) { if (all((allSamplesReadCounts[, i] >= (fivenumReads[,"LowerQuartile"] - dispersion )) & ((dispersion + fivenumReads[,"UpperQuartile"]) >= allSamplesReadCounts[, i]))) { sampleIndexes <- c(sampleIndexes, i) } } return(colnames(allSamplesReadCounts)[sampleIndexes]) } SelectReferenceSetByKmeans <- function(allSamplesReadCounts, normalizationMethod = "tmm", referenceNumberOfElements) { # referenceSamples <- function(referenceNumberOfElements, allSamples) { allSamples <- NormalizeCounts(allSamplesReadCounts, method = normalizationMethod) # allSamples <- allSamplesReadCounts numberOfSamples <- ncol(allSamples) numberOfOutliers <- numberOfSamples - referenceNumberOfElements if (numberOfOutliers < 0) { message(paste("Number of samples", numberOfSamples, "is smaller than the size of the reference set requested", referenceNumberOfElements, ", so all samples will be selected")) return(colnames(allSamples)) } else if (numberOfOutliers == 0) { message(paste("Number of samples", numberOfSamples, "is equal to the size of the reference set requested", referenceNumberOfElements)) return(colnames(allSamples)) } else { allSamples <- t(allSamples) kmeans.result <- kmeans(allSamples, centers=1) # "centers" is a data frame with one center centers <- kmeans.result$centers[kmeans.result$cluster, ] distances <- sqrt(rowSums((allSamples - centers)^2)) outliers <- order(distances, decreasing=TRUE)[1:numberOfOutliers] # these rows are top outliers message(cat("outliers:", colnames(allSamplesReadCounts)[outliers])) allSamples <- t(allSamples) # return(colnames(allSamples[, -outliers, drop = FALSE])) return(colnames(allSamples)[-outliers]) } } SelectReferenceSet <- function(samplesFilepaths, genomicRanges, removePcrDuplicates = FALSE, normalizationMethod = "tmm", referenceMaximumNumberOfElements = 10, referenceSelectionMethod = "percentil", lowerBoundPercentage = 1, upperBoundPercentage = 99) { # samplesFilepaths <- allSamplesFilteredFilepaths[1:3] # genomicRanges <- genomicRangesFromBed allSamplesReadCounts <- ReadCountsFromBam(bamFilenames = samplesFilepaths, sampleNames = samplesFilepaths, gr = genomicRanges, # ampliconNames = ampliconNames, removeDup = removePcrDuplicates) selectedSamplesFilepath <- SelectReferenceSetFromReadCounts(allSamplesReadCounts, # genomicRanges, # removePcrDuplicates = removePcrDuplicates, normalizationMethod = normalizationMethod, referenceMaximumNumberOfElements = referenceMaximumNumberOfElements, referenceSelectionMethod = referenceSelectionMethod, lowerBoundPercentage = lowerBoundPercentage, upperBoundPercentage = lowerBoundPercentage) return(selectedSamplesFilepath) } SelectReferenceSetFromReadCounts <- function(allSamplesReadCounts, # genomicRanges, # removePcrDuplicates = FALSE, normalizationMethod = "tmm", referenceMaximumNumberOfElements = 30, # referenceSelectionMethod = "percentil", referenceSelectionMethod = "kmeans", lowerBoundPercentage = 1, upperBoundPercentage = 99) { selectedSamplesFilepath <- NULL if (referenceSelectionMethod == "percentil") { message("Selecting reference by percentil") selectedSamplesFilepath <- SelectReferenceSetByPercentil(allSamplesReadCounts, normalizationMethod = normalizationMethod, # referenceSelectionMethod = referenceSelectionMethod, lowerBoundPercentage = lowerBoundPercentage, upperBoundPercentage = upperBoundPercentage) } else if (referenceSelectionMethod == "kmeans") { message("Selecting reference by kmeans") selectedSamplesFilepath <- SelectReferenceSetByKmeans(allSamplesReadCounts, normalizationMethod = normalizationMethod, referenceMaximumNumberOfElements) } else { message(paste("method", referenceSelectionMethod, "not supported")) } return(selectedSamplesFilepath) } CollectColumnFromAllReportTables <- function(reportTables, columnName) { compilation <- NULL mySampleNames <- names(reportTables) for(name in mySampleNames) { tmpReportTable <- reportTables[[name]] compilation <- cbind(compilation, as.vector(tmpReportTable[, columnName])) } colnames(compilation) <- mySampleNames rownames(compilation) <- rownames(reportTables[[1]]) return(compilation) } revalueDF <- function(myDataFrame, mappings) { myDataFrameColnames <- colnames(myDataFrame) for(columnName in myDataFrameColnames) { myDataFrame[, columnName] <- revalue(myDataFrame[, columnName], mappings) } mode(myDataFrame) <- "numeric" return(myDataFrame) } ## TODO check this function # ReliableAberrationStatusHeatMap <- function(meanBootResults, # realiableResults, # filepath = NULL) { # meanBootResults <- melanomaMeanBootResults # reliableResults <- melanomaCNVPanelizerResults # test_that("testing correct dimensions", { # expect_equal(colnames(meanBootResults), colnames(reliableResults)) # expect_equal(rownames(meanBootResults), rownames(reliableResults)) # }) # # kNormalDefaultValue <- 1.0 # kAmplificationMaximalValue <- 4.0 # kAmplificationMinimumValue <- 2.0 # # for(i in rownames(reliableResults)) { # for(j in colnames(reliableResults)) { # if (reliableResults[i,j] == "Normal") { # # print(paste("Normal", i, reliableResults[i,j], "setting ", meanBootResults[i,j],"as", kNormalDefaultValue)) # meanBootResults[i,j] <- kNormalDefaultValue # } # # if ((reliableResults[i,j] == "Amplification") & (meanBootResults[i,j] > kAmplificationMaximalValue)) { # print(paste("Too high ", i, reliableResults[i,j], "setting ", meanBootResults[i,j],"as ", kAmplificationMaximalValue)) # meanBootResults[i,j] <- kAmplificationMaximalValue # } # # if ((reliableResults[i,j] == "Amplification") & (meanBootResults[i,j] < kAmplificationMinimumValue)) { # print(paste("Amplification too low", i, reliableResults[i,j], "setting ", meanBootResults[i,j],"as", kNormalDefaultValue)) # meanBootResults[i,j] <- kNormalDefaultValue # } # } # } # # df.team_data <- melt(meanBootResults) # colnames(df.team_data) <- c("Gene", "Sample", "Bootratio") # g <- ggplot(data = df.team_data, # aes(x = Gene, y = Sample)) + # # geom_tile(aes(fill = scale(Bootratio))) + # geom_tile(aes(fill = log(Bootratio))) + # # + opts(theme(axis.text.x = element_text(angle = 180, hjust = 1))) # # scale_fill_gradient2(low="darkblue", high="darkgreen", guide="colorbar") + # # scale_fill_gradient2(low="green4", high="red4", guide="colorbar") + # scale_fill_gradient2(low = "darkgreen", mid = "white", high = "darkred") + # theme(axis.text.x = element_text(angle = 90, hjust = 1)) + # theme(axis.text.y = element_text(size=5)) # if (!missing(filepath)) { # ggsave(filepath) # } # } # why these colors by default: StatusHeatmap <- function(dfData, statusColors = c("Deletion" = "blue", "Normal" = "green", # " " = "green", "Amplification" = "red"), header = "Status Heatmap", filepath = "CNVPanelizerHeatMap.png") { mappings <- seq(length(statusColors)) # mappings <- c(-1, 0, 1) names(mappings) <- names(statusColors) matrixData <- revalueDF(dfData, mappings) valuesUsed <- table(matrixData) indexOfColorsUsed <- as.numeric(names(valuesUsed)) colorsRequired <- unname(statusColors[indexOfColorsUsed]) # TODO find out why do I need this horrible hack when only one status is available if (length(colorsRequired) <= 1) { colorsRequired <- c("green", "yellow") } # colorsRequired <- as.vector(statusColors[as.numeric(names(table(matrixData)))]) png(filename = file.path(filepath), # create PNG for the heat map width = 5*300, # 5 x 300 pixels height = 5*300, res = 300, # 300 pixels per inch pointsize = 8) # smaller font size lmat <- rbind( c(5,3,4), c(2,1,4) ) lhei <- c(1.5, 4) lhei <- c(1, 10) lwid <- c(1.5, 4, 0.75) myCexRow <- 0.2 heatmap.2(matrixData, main = header, # plot layout lmat = lmat, lhei = lhei, lwid = lwid, # scale = "row", # ColSideColors=heatColors, key = FALSE, # whether a color-key should be shown # key = TRUE, # whether a color-key should be shown # key.par=list(match=c(0), # amplification_deletion=c(1, 2, 3, 4), # deletion_amplification=c(5, 6)), dendrogram = "none", tracecol=NA, # density.info=c("histogram","density","none"), # col = unname(statusColors), # col = c("green"), col = colorsRequired, # if the values are not all revalued the colors have to be passed selectively # col = heatColors, # col = heatColors[matchPoints], colsep=1:ncol(matrixData), rowsep=1:nrow(matrixData), margins = c(15, 5), Rowv=FALSE, Colv=FALSE, # cexRow=myCexRow ) # legend("topleft", inset=.02, title= "UOUOU", # "this is legend", # # fill=topo.colors(3), # # fill=unname(statusColors), # horiz=TRUE, cex=0.8) #bottomleft # legend("bottom", inset=.01, title="CNV Status", # # # legend("left", inset=.01, title="CNV Status", # # legend("left", inset=0, title="CNV Status", # names(statusColors), # # fill=topo.colors(3), # fill=unname(statusColors), # cex=0.8) # # horiz=TRUE, cex=0.8) # # legend("bottomleft", legend=c("Line 1", "Line 2"), # col=c("red", "blue"), lty=1:2, cex=0.8) # legend(1, 95, legend=c("Line 1", "Line 2"), # col=c("red", "blue"), lty=1:2, cex=0.8) # in case of mistake this instruction should be called until return null instead of png.. dev.off() } ## TODO check # StatusStability <- function(geneNames, sampleNormalizedReadCounts, tmpReferenceNormalizedReadCounts) { # centralTendency <- matrix(rowMeans(tmpReferenceNormalizedReadCounts), ncol = 1) # rownames(centralTendency) <- rownames(tmpReferenceNormalizedReadCounts) # referenceAndSampleDifferences <- centralTendency - sampleNormalizedReadCounts # rownames(referenceAndSampleDifferences) <- rownames(tmpReferenceNormalizedReadCounts) # colnames(referenceAndSampleDifferences) <- "difference" # uniqueGeneNames <- unique(geneNames) # geneStabilityStatus <- c() # for (i in seq(uniqueGeneNames)) { # geneAmpliconIndexes <- grep(uniqueGeneNames[i], rownames(myReference)) # geneAmpliconStatus <- referenceAndSampleDifferences[geneAmpliconIndexes, ] # allGeneAmpliconsHaveSameStatus <- length(unique(geneAmpliconStatus >= 0))==1 # geneStabilityStatus <- c(geneStabilityStatus, allGeneAmpliconsHaveSameStatus) # } # names(geneStabilityStatus) <- uniqueGeneNames # return(geneStabilityStatus) # } ## TODO CHECK # StatusStabilityTable <- function(geneNames, tmpSamplesNormalizedReadCounts, tmpReferenceNormalizedReadCounts) { # statusStabilityTable <- NULL # for (i in seq(ncol(tmpSamplesNormalizedReadCounts))) { # statusStabilityTable <- cbind(statusStabilityTable, StatusStability(geneNames, matrix(tmpSamplesNormalizedReadCounts[, i], ncol = 1), tmpReferenceNormalizedReadCounts)) # } # colnames(statusStabilityTable) <- colnames(tmpSamplesNormalizedReadCounts) # # statusStabilityTable <- t(statusStabilityTable) # # colnames(statusStabilityTable) <- colnames(tmpSamplesNormalizedReadCounts) # return(statusStabilityTable) # } PlotBootstrapDistributions <- function(bootList, reportTables, outputFolder = getwd(), sampleNames = NULL, save = FALSE, # scale = 7) { scale = 10) { selSample <- NULL plotList <- foreach(selSample = seq_along(bootList)) %do% { test <- as.factor(reportTables[[selSample]][, "Passed"]) levelLabels <- c("NoChange", "NonReliableChange", "ReliableChange") names(levelLabels) <- c(0, 1, 2) test <- suppressMessages(revalue(test, levelLabels)) namedColors <- c("#56B4E9", "#CC79A7" ,"#D55E00") names(namedColors) <- levelLabels ratios <- NULL testsPassed <- NULL df <- data.frame(class = as.factor(colnames(bootList[[selSample]])), ratios = (as.vector(t(bootList[[selSample]]))), testsPassed = test) #if a genomic ranges object has been #if(!is.null(gr)) { # newGeneOrder <- bedToGeneOrder(gr) # df$class = with(df,factor(class,levels(class)[newGeneOrder])) #} ylim1 <- boxplot.stats((df$ratios))$stats[c(1, 5)] ylim1 <- c(max(ylim1),max(ylim1)) ylim1 <- log2(ylim1) ylim1 <- ylim1 * c(-scale,scale) if(is.null(sampleNames)) { filename <- names(bootList[selSample]) filepath <- paste0(outputFolder, "/", filename, "_plot.pdf") } else { filename <- sampleNames[selSample] filepath <- paste0(outputFolder, "/", filename, ".pdf") } bootPlot <- ggplot(df, aes(x = class, y = log2(ratios), fill = testsPassed)) + geom_violin() + ggtitle(filename) + theme(plot.title = element_text(lineheight = 0.8, face = "bold"), text = element_text(size = 15), axis.text.x = element_text(angle = 90)) + scale_fill_manual(name = "CNV Reliability", values = namedColors, labels = c("0" = "Foo", "1" = "Bar")) + coord_cartesian(ylim = ylim1) + theme(axis.text=element_text(size=10), axis.title=element_text(size=20,face="bold")) + scale_x_discrete("Gene Names") + # geom_hline(yintercept=log2(1.5), color="#009E73") + # geom_hline(yintercept=log2(0.5), color="#009E73") + geom_hline(yintercept=0, color="#009E73") if(save == TRUE) { message(paste0("Saving plot to '", filepath, "'")) # dir.create(outputFolder, recursive = TRUE, showWarnings = TRUE) dir.create(outputFolder, recursive = TRUE, showWarnings = FALSE) ggsave(filename = filepath, plot = bootPlot, height = 7.42 * 1, width = 8.11 * 2, limitsize = FALSE) } # dev.off() return(bootPlot) } names(plotList) <- names(reportTables) return(plotList) } CNVPanelizerFromReadCountsHELPER <- function(sampleReadCounts, referenceReadCounts, genomicRangesFromBed, numberOfBootstrapReplicates = 10000, normalizationMethod = "tmm", robust = TRUE, backgroundSignificanceLevel = 0.05, outputDir = file.path(getwd(), "CNVPanelizer"), splitSize = 5) { myCNVPanelizerResults <- list() splits <- split(1:ncol(sampleReadCounts), ceiling(seq_along(1:ncol(sampleReadCounts))/splitSize)) # for (i in 1:ncol(sampleReadCounts)) { for (i in 1:length(splits)) { myCNVPanelizerResults[[i]] <- CNVPanelizerFromReadCounts(sampleReadCounts[, splits[[i]]], referenceReadCounts, genomicRangesFromBed, numberOfBootstrapReplicates = numberOfBootstrapReplicates, normalizationMethod = normalizationMethod, robust = robust, backgroundSignificanceLevel = backgroundSignificanceLevel, outputDir = outputDir) } n <- length(myCNVPanelizerResults) res <- NULL for (i in seq(n)) { res <- cbind(res, myCNVPanelizerResults[[i]]) } myCNVPanelizerResultsCompiled <- myCNVPanelizerResults[[1]] if(length(myCNVPanelizerResults) > 1) { for (i in 2:length(myCNVPanelizerResults)) { myCNVPanelizerResultsCompiled$sampleReadCounts <- cbind(myCNVPanelizerResultsCompiled$sampleReadCounts, myCNVPanelizerResults[[i]]$sampleReadCounts) myCNVPanelizerResultsCompiled$bootList <- append(myCNVPanelizerResultsCompiled$bootList, myCNVPanelizerResults[[i]]$bootList) myCNVPanelizerResultsCompiled$backgroundNoise <- append(myCNVPanelizerResultsCompiled$backgroundNoise, myCNVPanelizerResults[[i]]$backgroundNoise) myCNVPanelizerResultsCompiled$reportTables <- append(myCNVPanelizerResultsCompiled$reportTables, myCNVPanelizerResults[[i]]$reportTables) } } return(myCNVPanelizerResultsCompiled) } ###################################################################################### # still on testing.. ###################################################################################### # CNVPanelizerFromReadCountsHelperParallel <- function(sampleReadCounts, # referenceReadCounts, # genomicRangesFromBed, # numberOfBootstrapReplicates = 10000, # normalizationMethod = "tmm", # robust = TRUE, # backgroundSignificanceLevel = 0.05, # outputDir = file.path(getwd(), "CNVPanelizer"), # splitSize = 5) { # # myCNVPanelizerResults <- list() # # splits <- split(1:ncol(sampleReadCounts), ceiling(seq_along(1:ncol(sampleReadCounts))/splitSize)) # # # Use the detectCores() function to find the number of cores in system # no_cores <- detectCores() # # Setup cluster # clust <- makeCluster(no_cores - 1) # clusterExport(clust, # varlist = c( # "sampleReadCounts", # "referenceReadCounts", # "genomicRangesFromBed", # "numberOfBootstrapReplicates", # "normalizationMethod", # "robust", # "backgroundSignificanceLevel", # "outputDir", # "CNVPanelizerFromReadCounts"), # envir = environment()) # # myCNVPanelizerResults <- parLapply(clust, 1:length(splits), function(i) { # CNVPanelizerFromReadCounts(sampleReadCounts[, splits[[i]]], # referenceReadCounts, # genomicRangesFromBed, # numberOfBootstrapReplicates = numberOfBootstrapReplicates, # normalizationMethod = normalizationMethod, # robust = robust, # backgroundSignificanceLevel = backgroundSignificanceLevel, # outputDir = outputDir) # }) # stopCluster(clust) # # n <- length(myCNVPanelizerResults) # res <- NULL # for (i in seq(n)) { # res <- cbind(res, myCNVPanelizerResults[[i]]) # } # # myCNVPanelizerResultsCompiled <- myCNVPanelizerResults[[1]] # # if(length(myCNVPanelizerResults) > 1) { # for (i in 2:length(myCNVPanelizerResults)) { # myCNVPanelizerResultsCompiled$sampleReadCounts <- cbind(myCNVPanelizerResultsCompiled$sampleReadCounts, myCNVPanelizerResults[[i]]$sampleReadCounts) # myCNVPanelizerResultsCompiled$bootList <- append(myCNVPanelizerResultsCompiled$bootList, myCNVPanelizerResults[[i]]$bootList) # myCNVPanelizerResultsCompiled$backgroundNoise <- append(myCNVPanelizerResultsCompiled$backgroundNoise, myCNVPanelizerResults[[i]]$backgroundNoise) # myCNVPanelizerResultsCompiled$reportTables <- append(myCNVPanelizerResultsCompiled$reportTables, myCNVPanelizerResults[[i]]$reportTables) # } # } # return(myCNVPanelizerResultsCompiled) # } CNVPanelizerFromReadCounts <- function(sampleReadCounts, referenceReadCounts, genomicRangesFromBed, numberOfBootstrapReplicates = 10000, normalizationMethod = "tmm", robust = TRUE, backgroundSignificanceLevel = 0.05, outputDir = file.path(getwd(), "CNVPanelizer")) { # VerifiyIfOutputDirectoryExistsOrIsNotEmpty(outputDir) metadataFromGenomicRanges <- elementMetadata(genomicRangesFromBed) geneNames = metadataFromGenomicRanges["geneNames"][, 1] # # # metadataFromGenomicRanges <- elementMetadata(genomicRangesFromBed) # # geneNames = metadataFromGenomicRanges["geneNames"][, 1] # # ampliconNames = metadataFromGenomicRanges["ampliconNames"][, 1] # # sampleReadCounts <- ReadCountsFromBam(bamFilenames = sampleBamFilepaths, # sampleNames = sampleBamFilepaths, # gr = genomicRangesFromBed, # removeDup = FALSE) # # sampleReadCounts <- ReadCountsFromBam(bamFilenames = sampleBamFilepaths, # sampleNames = sampleBamFilepaths, # gr = genomicRangesFromBed, # removeDup = FALSE) normalizedReadCounts <- CombinedNormalizedCounts(sampleReadCounts, referenceReadCounts, method = normalizationMethod # , ampliconNames = ampliconNames ) # After normalization data sets need to be splitted again to perform bootstrap samplesNormalizedReadCounts = normalizedReadCounts["samples"][[1]] referenceNormalizedReadCounts = normalizedReadCounts["reference"][[1]] bootList <- BootList(geneNames, samplesNormalizedReadCounts, referenceNormalizedReadCounts, replicates = numberOfBootstrapReplicates) # Estimate the background noise left after normalization backgroundNoise <- Background(geneNames, samplesNormalizedReadCounts, referenceNormalizedReadCounts, bootList, replicates = numberOfBootstrapReplicates, significanceLevel = backgroundSignificanceLevel, robust = robust) # Build report tables reportTables <- ReportTables(geneNames, samplesNormalizedReadCounts, referenceNormalizedReadCounts, bootList, backgroundNoise) plots = PlotBootstrapDistributions(bootList, reportTables, sampleNames = names(reportTables), outputFolder = file.path(outputDir, "plots"), save = TRUE) WriteListToXLSX(reportTables, multipleFiles = TRUE, outputFolder = file.path(outputDir, "xlsx")) # WriteListToXLSX(reportTables, outputFolder = "", filepath = "list.xlsx") { # write.xlsx(listOfDataFrames, , rowNames = TRUE) # } # # # # # WriteListToXLSX(tmp1@reportTables, filepath = file.path(outputDir, "reportTables.xlsx")) # WriteListToXLSX(tmp1@reportTables, filepath = file.path(getwd(), "CNVPanelizer", "reportTables.xlsx")) # CNVPanelizerResults <- setClass("CNVPanelizerResults", # # setClass("CNVPanelizerResults", # # # use this function to extract the slot names of the slotNames(results_CNVPanelizer[[1]]) # slots = c(sampleReadCounts="matrix", # referenceReadColunts="matrix", # genomicRangesFromBed="GRanges", # bootList="list", # backgroundNoise="list", # plots = "list", # reportTables="list")) # # myCNVPanelizerResults <- CNVPanelizerResults(sampleReadCounts = sampleReadCounts, # referenceReadColunts = referenceReadCounts, # genomicRangesFromBed = genomicRangesFromBed, # bootList = bootList, # backgroundNoise = backgroundNoise, # plots = plots, # reportTables = reportTables) myCNVPanelizerResults <- list(sampleReadCounts = sampleReadCounts, referenceReadCounts = referenceReadCounts, genomicRangesFromBed = genomicRangesFromBed, bootList = bootList, backgroundNoise = backgroundNoise, plots = plots, reportTables = reportTables) return(myCNVPanelizerResults) } CNVPanelizer <- function(sampleBamFilepaths, referenceBamFilepaths, bedFilepath, amplColumnNumber = 6, minimumMappingQuality = 20, numberOfBootstrapReplicates = 10000, removePcrDuplicates = TRUE, # analysisMode = "gene", # analysisMode can be "gene" or "amplicon" robust = TRUE, backgroundSignificanceLevel = 0.05, outputDir = file.path(getwd(), "CNVPanelizer")) { # VerifiyIfOutputDirectoryExistsOrIsNotEmpty(outputDir) genomicRangesFromBed <- BedToGenomicRanges(bedFilepath, ampliconColumn = amplColumnNumber, split = "_") # metadataFromGenomicRanges <- elementMetadata(genomicRangesFromBed) # geneNames = metadataFromGenomicRanges["geneNames"][, 1] # ampliconNames = metadataFromGenomicRanges["ampliconNames"][, 1] sampleReadCounts <- ReadCountsFromBam(bamFilenames = sampleBamFilepaths, sampleNames = sampleBamFilepaths, gr = genomicRangesFromBed, minimumMappingQuality = minimumMappingQuality, removeDup = removePcrDuplicates) referenceReadCounts <- ReadCountsFromBam(bamFilenames = referenceBamFilepaths, sampleNames = referenceBamFilepaths, gr = genomicRangesFromBed, minimumMappingQuality = minimumMappingQuality, removeDup = removePcrDuplicates) results <- CNVPanelizerFromReadCounts(sampleReadCounts = sampleReadCounts, referenceReadCounts = referenceReadCounts, genomicRangesFromBed = genomicRangesFromBed, # bedFilepath = bedFilepath, # amplColumnNumber = amplColumnNumber, # minimumMappingQuality = minimumMappingQuality, numberOfBootstrapReplicates = numberOfBootstrapReplicates, # removePcrDuplicates = removePcrDuplicates, # analysisMode = analysisMode, robust = robust, backgroundSignificanceLevel = backgroundSignificanceLevel, outputDir = outputDir) return(results) } Strict <- function(cnvPanelizerResults, amplificationMinimumThreshold = 2, deletionMaximumThreshold = 0.6, minimumNumberOfAmplicons = 2 # , fullConsensusAmongAmplicons = TRUE ) { # myCNVPanelizerTableResults <- CollectColumnFromAllReportTables(cnvPanelizerResults@reportTables, "ReliableStatus") myCNVPanelizerTableResults <- CollectColumnFromAllReportTables(cnvPanelizerResults$reportTables, "ReliableStatus") myCNVPanelizerTableResults <- t(myCNVPanelizerTableResults) myCNVPanelizerTableResults <- revalueDF(myCNVPanelizerTableResults, c("Normal"="")) interestingColumn <- "MeanBoot" # meanBootResults <- CollectColumnFromAllReportTables(cnvPanelizerResults@reportTables, interestingColumn) meanBootResults <- CollectColumnFromAllReportTables(cnvPanelizerResults$reportTables, interestingColumn) class(meanBootResults) <- "numeric" meanBootResults <- t(meanBootResults) # consensusResults <- ConsensusCheck(cnvPanelizerResults@genomicRangesFromBed, cnvPanelizerResults@sampleReadCounts, cnvPanelizerResults@referenceReadCounts) consensusResults <- ConsensusCheck(cnvPanelizerResults$genomicRangesFromBed, cnvPanelizerResults$sampleReadCounts, cnvPanelizerResults$referenceReadCounts) consensusResults <- t(consensusResults) test_that("colnames from CNVPanelizer results and Consensus match", { expect_equal(colnames(consensusResults), colnames(myCNVPanelizerTableResults)) }) test_that("rownames from CNVPanelizer results and Consensus match", { expect_equal(rownames(consensusResults), rownames(myCNVPanelizerTableResults)) }) cNVPanelizerResultsConsensus <- myCNVPanelizerTableResults for (rowname in rownames(myCNVPanelizerTableResults)) { for (colname in colnames(myCNVPanelizerTableResults)) { # print(paste(rowname, colname)) if (!consensusResults[rowname, colname]) { cNVPanelizerResultsConsensus[rowname, colname] <- "" print(paste("Setting ", myCNVPanelizerTableResults[rowname, colname], "to", cNVPanelizerResultsConsensus[rowname, colname])) } } } # table(unlist(myCNVPanelizerTableResults)) # table(unlist(cNVPanelizerResultsConsensus)) test_that("colnames from CNVPanelizer results and meanBootResults match", { expect_equal(colnames(meanBootResults), colnames(cNVPanelizerResultsConsensus)) }) test_that("rownames from CNVPanelizer results and meanBootResults match", { expect_equal(rownames(meanBootResults), rownames(cNVPanelizerResultsConsensus)) }) cNVPanelizerResultsConsensusAndThreshold <- cNVPanelizerResultsConsensus for (rowname in rownames(cNVPanelizerResultsConsensusAndThreshold)) { for (colname in colnames(cNVPanelizerResultsConsensusAndThreshold)) { # print(cNVPanelizerResultsConsensus[rowname, colname])}} if ((cNVPanelizerResultsConsensusAndThreshold[rowname, colname] == "Amplification") & ((meanBootResults[rowname, colname] <= amplificationMinimumThreshold)) # & ((meanBootResults[rowname, colname] <= 3)) ) { cNVPanelizerResultsConsensusAndThreshold[rowname, colname] <- "" print(paste("Setting ", cNVPanelizerResultsConsensus[rowname, colname], "to", cNVPanelizerResultsConsensusAndThreshold[rowname, colname], "because", meanBootResults[rowname, colname])) } if ((cNVPanelizerResultsConsensusAndThreshold[rowname, colname] == "Deletion") & ((meanBootResults[rowname, colname] >= deletionMaximumThreshold)) #& ((meanBootResults[rowname, colname] >= 0.7)) ) { cNVPanelizerResultsConsensusAndThreshold[rowname, colname] <- "" print(paste("Setting ", cNVPanelizerResultsConsensus[rowname, colname], "to", cNVPanelizerResultsConsensusAndThreshold[rowname, colname])) } } } # table(unlist(cNVPanelizerResultsConsensus)) # table(unlist(cNVPanelizerResultsConsensusAndThreshold)) cNVPanelizerResultsConsensusAndThresholdAndMinimumNumberOfAmplicons <- cNVPanelizerResultsConsensusAndThreshold # genesWithMinimumNumberOfAmplicons <- HaveMininumNumberOfAmplicons(cnvPanelizerResults@genomicRangesFromBed, minimumNumberOfAmplicons) genesWithMinimumNumberOfAmplicons <- HaveMininumNumberOfAmplicons(cnvPanelizerResults$genomicRangesFromBed, minimumNumberOfAmplicons) for (rowname in rownames(cNVPanelizerResultsConsensusAndThresholdAndMinimumNumberOfAmplicons)) { for (colname in names(genesWithMinimumNumberOfAmplicons)) { if ((cNVPanelizerResultsConsensusAndThreshold[rowname, colname] == "Deletion") & !genesWithMinimumNumberOfAmplicons[colname]) { cNVPanelizerResultsConsensusAndThresholdAndMinimumNumberOfAmplicons[rowname, colname] <- "" print(paste("Setting ", cNVPanelizerResultsConsensusAndThreshold[rowname, colname], "to", cNVPanelizerResultsConsensusAndThresholdAndMinimumNumberOfAmplicons[rowname, colname])) } } } # table(unlist(cNVPanelizerResultsConsensusAndThreshold)) # table(unlist(cNVPanelizerResultsConsensusAndThresholdAndMinimumNumberOfAmplicons)) # write.xlsx(cNVPanelizerResults, file=file.path(outputDir, paste0("cNVPanelizerResults", ".xlsx")), row.names = TRUE) # write.xlsx(cNVPanelizerResultsConsensus, file=file.path(outputDir, paste0("cNVPanelizerResultsConsensus", ".xlsx")), row.names = TRUE) # write.xlsx(cNVPanelizerResultsConsensusAndThreshold, file=file.path(outputDir, paste0("cNVPanelizerResultsConsensusAndThreshold", ".xlsx")), row.names = TRUE) # write.xlsx(cNVPanelizerResultsConsensusAndThresholdAndMinimumNumberOfAmplicons, file=file.path(outputDir, paste0("cNVPanelizerResultsConsensusAndThresholdAndMinimumNumberOfAmplicons", ".xlsx")), row.names = TRUE) return(cNVPanelizerResultsConsensusAndThresholdAndMinimumNumberOfAmplicons) } # TODO Could be provided as a measurement of quality for the run # Overview <- function( # # sampleIdentifier, # sampleReadCounts1, # referenceReadCounts1, # allSamplesReadCounts, # genomicRangesFromBed, # # normalizationMethod = "tss", # normalizationMethod = "tmm", # outputDir = file.path(getwd(), "CNVPanelizer", "overviewKNNReference30")) { # # # sampleIdentifier <- "I:/Run Data/Run S5-56/M1_S5-56_IonXpress_074_rawlib.bam" # # # Overview(allSamplesReadCounts[, filepaths], # # allSamplesReadCounts[, referenceReadCounts], # # # allSamplesReadCounts, # # genomicRangesFromBed, # # # normalizationMethod = "tss", # # # normalizationMethod = "tmm", # # outputDir = file.path(getwd(), "CNVPanelizer", "overview")) # # sampleReadCounts1 <- allSamplesReadCounts[, filepaths] # # referenceReadCounts1 <- allSamplesReadCounts[, referenceReadCounts] # referenceReadCounts1 <- allSamplesReadCounts[, referenceFilepathsKmeans] # # allSamplesReadCountsNormalized <- NormalizeCounts(allSamplesReadCounts, method = normalizationMethod) # samplesNormalizedReadCounts = allSamplesReadCountsNormalized[, colnames(sampleReadCounts1)] # referenceNormalizedReadCounts = allSamplesReadCountsNormalized[, colnames(referenceReadCounts1)] # # runSamples <- samplesNormalizedReadCounts # refSamples <- referenceNormalizedReadCounts # entireSetOfSamples <- allSamplesReadCountsNormalized # # geneNames <- unique(genomicRangesFromBed$geneNames) # ampliconNames <- genomicRangesFromBed$ampliconNames # # TODO validate if ampliconNames are the rownames of samples matrices # # ampliconNames <- rownames(allSamplesNormalized) # # sampleIdentifiers <- colnames(runSamples) # #sampleIdentifiers <- sampleIdentifiers[1] # for (sampleIdentifier in sampleIdentifiers) { # myDir <- file.path(outputDir, basename(sampleIdentifier)) # dir.create(myDir, recursive = TRUE) # for (geneName in geneNames) { # # geneName <- "BRAF" # myAmpliconsIndexes <- grep(geneName, ampliconNames) # png(file=file.path(myDir, paste0(geneName,'_amplicons.png')), # width=1000, # height=(ceiling(length(myAmpliconsIndexes)/2) * 500)) # # # par(mfrow=c(ceiling(length(myAmpliconsIndexes)/2), 2), mar=c(3,3,5,1)) # par(mfrow=c(ceiling(length(myAmpliconsIndexes)/2), 2), mar=c(3, 3, 5, 1), oma=c(0,0,7,0)) # # for (ampliconName in ampliconNames[myAmpliconsIndexes]) { # # ampliconName <- "chr7_140453102_140453221_BRAF_Ex15-NM_004333" # tmp <- data.frame(y = c(as.vector(runSamples[ampliconName, ]) # ,as.vector(refSamples[ampliconName, ]) # ,as.vector(entireSetOfSamples[ampliconName, ]) # ), # # x = factor( # c(rep("runSamples", length(as.vector(runSamples[ampliconName, ]))) # ,rep("refSamples",length(as.vector(refSamples[ampliconName, ]))) # ,rep("allSamples",length(as.vector(entireSetOfSamples[ampliconName, ]))) # ), levels = c("runSamples", "refSamples", "allSamples")) # # # x = c(rep("runSamples", length(as.vector(runSamples[ampliconName, ]))) # # ,rep("refSamples",length(as.vector(refSamples[ampliconName, ]))) # # ) # ) # # myValues <- runSamples[ampliconName, sampleIdentifier] # # boxplot(y~x, # data=tmp, # ylab="Read Counts", # xlab="Sample set", # names = c("Run Samples", "Reference", "All Samples"), # # main = ampliconName) # main = paste(myValues, ampliconName)) # stripchart(y ~ x, # vertical = TRUE, # data = tmp, # method = "jitter", # add = TRUE, # # pch = 20, # pch = 20, # col = 'blue') # # # # stripchart(list(0.5, 1), vertical=TRUE, add=TRUE, method="stack", col='red', pch="*") # stripchart(myValues, vertical=TRUE, add=TRUE, method="stack", col='red', pch=16, cex=2) # } # title(main = paste(geneName, "-", sampleIdentifier), font.main = 4, outer=TRUE) # dev.off() # } # } # } ## TODO CHECK THIS # # # #rowIndexesToMerge <- IndexGenesPositions(lungGenomicRanges$geneNames) # #GeneMultilinePlot(normalSamples, rowIndexesToMerge, normalizationMethod = "tss") # GeneMultilinePlot <- function(dataSamples, rowIndexesToMerge, normalizationMethod = "tmm") { # dataSamples <- round(NormalizeCounts(dataSamples)) # result <- NULL # for (i in 1:ncol(dataSamples)) { # result <- cbind(result, sapply( rowIndexesToMerge, function(x) {mean(dataSamples[x, i])} )) # } # colnames(result) <- colnames(dataSamples) # dataSamples <- result # # dataSamples <- cbind(geneReadCounts = seq_along(rownames(dataSamples)), dataSamples) # # melted <- melt(dataSamples) # # ggplot(data=melted, aes(x=Var1, y=value, group=Var2, colour=Var2, linetype = Var2)) + # #geom_line() + # # scale_colour_manual(values = c('pink','orange','white', 'red')) + # theme(legend.position="left") + # geom_point(aes(color=Var2)) + # geom_line(size=1) + # theme(axis.text.x = element_text(angle = 90, hjust = 1)) # } # # #Example for testing MultiLinePlot # dataSamples <- read.table(text = "CN CNVPanelizer panelcn.mops ioncopy # CN1 0.1 0.4 0.4 # CN2 0.2 0.1 0.3 # CN3 0.1 0.2 0.1 # CN4 0.1 0.4 0.4 # CN5 0.2 0.1 0.3 # CN6 0.1 0.3 0.5 # CN7 0.6 0.6 0.6 # CN8 0.6 0.3 0.2 # CN9 0.6 0.8 0.8 # CN10 1 0.9 0.8", header=TRUE) # TODO reuse this code at GeneMultilinePlot ... # MultiLinePlot <- function(dataSamples, idColumnIdentifier = NULL, title = "", legendPosition = "left", xLabel = "", yLabel = "", smooth = FALSE) { # rs <- dataSamples # if (is.null(idColumnIdentifier)) { # melted = melt(rs) # } else { # melted = melt(rs, id.vars=idColumnIdentifier) # } # melted$CN <- as.character(melted$CN) # melted$CN <-factor(melted$CN, levels = unique(melted$CN)) # myPlot <- ggplot(data=melted, aes(x=CN, y=value, group=variable, colour=variable, linetype = variable)) + # scale_colour_discrete(name = "Method") + # scale_colour_manual(values = c('pink','orange','white', 'red')) + # ggtitle(title) + # geom_point(aes(color=variable)) + # plots the dots.. # theme(text = element_text(size=10), # axis.text.x = element_text(angle=90, hjust=1)) + # theme(panel.border = element_blank(), # panel.grid.major = element_blank(), # panel.grid.minor = element_blank(), # axis.line = element_line(colour = "black")) + # theme_bw() + # sets the background colour # theme(plot.title = element_text(hjust = 0.5)) + # theme(legend.position=legendPosition) + xlab(xLabel) + ylab(yLabel) # if(smooth) { # # myPlot = myPlot + geom_smooth(method = "loess", se = FALSE, span = 1.5, show.legend = FALSE) # myPlot = myPlot + geom_smooth(method = "auto", se = FALSE, span = 1.5, show.legend = FALSE) # } else { # myPlot = myPlot + geom_line(size = 1, show.legend = FALSE) # links the dots with lines.. # } # return(myPlot) # } SampleReadCountsPools <- function(ampliconsReadCounts, numberOfPools) { listOfPools <- list() for (i in 1:numberOfPools) { listOfPools[[i]] <- ampliconsReadCounts[seq(from = i, to = length(ampliconsReadCounts), by = numberOfPools)] } return(listOfPools) } SampleReadCountsByPool <-function(ampliconReadCounts, pools) { # existingPools <- as.numeric(names(table(pools))) # if (any(is.na(existingPools)) { # stop("Pools are not defined as 1, 2, ...") # } existingPools <- names(table(pools)) if (length(pools) != length(ampliconReadCounts)) { stop("Length of pools is different than ampliconReadCounts") } listOfPools <- list() for (i in existingPools) { listOfPools[[i]] <- (ampliconReadCounts[which(pools==i)]) } return(listOfPools) } PoolsPlots <- function(sampleReadCountsByPool) { plotValues <- as.integer(unlist(lapply(sampleReadCountsByPool, mean))) barplot(plotValues, ylab = "Read Counts mean", xlab="Pools", names.arg = names(sampleReadCountsByPool), col = "dodgerblue4", # ylim=c(0, trunc(max(plotValues))) ) } ################################################################################ # KaryotypeAberrationPlot ################################################################################ # KaryotypeAberrationPlot(bedFilepath = bedFilepath, # ampliconColumn = 4, # filepath = "TESTANDO_APAGAR_KARYOTYPE.png") KaryotypeAberrationPlot <- function(bedFilepath, ampliconColumn, status = sample(c("red", "green", "blue"), 30, replace = TRUE), filepath = "KaryotypeAberrationPlot.png", width = 1200, height = 800) { #if (!requireNamespace("BiocManager", quietly=TRUE)) #install.packages("BiocManager") #BiocManager::install("chromPlot") #library(chromPlot) #library(stringr) hg_gap <- NULL # not sure if this is good solution to avoid 'NOTE' data(hg_gap) getGeneRegionsStatusSummary <- GetGeneRegionsStatusSummary(bedFilepath = bedFilepath, ampliconColumn = ampliconColumn, status = sample(c("red", "green", "blue"), 30, replace=TRUE)) geneRegionsBands <- GetGeneRegionsBand(getGeneRegionsStatusSummary) png(filepath, width = width, height = height) # chromPlot(gaps=hg_gap) # chromPlot(gaps=hg_gap, stat=geneRegions, statCol="Value") # chromPlot(gaps=hg_gap, stat=geneRegions, statCol="Value", chr=unique(geneRegions$Chrom)) # chromPlot(gaps=hg_gap, bands=hg_cytoBandIdeo, stat=geneRegions, statCol="Value", chr=unique(geneRegions$Chrom), statName="Value", noHist=TRUE, figCols=5, cex=0.7, statTyp="n", chrSide=c(1,1,1,1,1,1,-1,1)) # TODO TEMPORARARIO PORQUE O chromPlot nao pode ser instalado em versao 3.2.0 ... # karyotypeAberrationPlot <- chromPlot(gaps = hg_gap, # bands = geneRegionsBands, # chr = unique(getGeneRegionsStatusSummary$Chrom), # stat = getGeneRegionsStatusSummary, # statCol = "Value", # statName = "Value", # noHist = TRUE, # figCols = 7, # cex = 0.7, # statTyp = "n", # chrSide = c(1,1,1,1,1,1,-1,1)) karyotypeAberrationPlot <- NA dev.off() return(karyotypeAberrationPlot) } # It Merges all regions related to the same gene and assigns it a color that relates to the aberratiosn status # Status should be a vector of colors representing the amplifications (red for amplification, green for deletion and blue for normal) GetGeneRegionsStatusSummary <- function(bedFilepath, ampliconColumn = 4, split = "_", status = c("blue")) { a <- read.csv(bedFilepath, sep="\t", skip=1, header = FALSE) a[, 1] <- str_replace(a$V1, "chr","") a[, ampliconColumn] <- vapply(strsplit(as.vector(a[, ampliconColumn]), "_", fixed = TRUE), "[", "", 1) a <- a[,1:4] colnames(a) <- c("Chrom", "Start", "End", "ID" # , "Colors" ) geneNames <- unique(a$ID) geneRegions <- NULL for (geneName in geneNames) { genomicRegions <- a[a$ID==geneName, ] chromossome <- unique(genomicRegions$Chrom) start <- min(genomicRegions$Start) end <- max(genomicRegions$End) gene <- unique(genomicRegions$ID) geneRegions <- rbind(geneRegions, c(chromossome, start, end, gene)) } colnames(geneRegions) <- colnames(a) geneRegions <- as.data.frame(geneRegions, # as.is = TRUE) stringsAsFactors = FALSE) geneRegions <- cbind(geneRegions, Colors = status) # geneRegions$Chrom <- as.numeric(geneRegions$Chrom) geneRegions$Start <- as.numeric(geneRegions$Start) geneRegions$End <- as.numeric(geneRegions$End) geneRegions$ID <- as.factor(geneRegions$ID) return(geneRegions) } # Returns objected needed for plotting.. it changes some column names and makes the regions larger to be able to be visible in the plot.. #GetGeneRegionsBand <- function(geneRegionsStatusSummary) { GetGeneRegionsBand <- function(geneRegions) { geneRegionsBands <- geneRegions colnames(geneRegionsBands) <- c("Chrom", "Start", "End", "Name", "Colors") geneRegionsBands$Chrom <- paste0("chr", geneRegionsBands$Chrom) geneRegionsBands$Name <- as.character(geneRegionsBands$Name) geneRegionsBands$Colors <- as.character(geneRegionsBands$Colors) # since the regions are too small to be visualized.. around 1 milion times smaller than the size of chromossome.. geneRegionsBands$End <- geneRegionsBands$End + 1E6 return(geneRegionsBands) } # TODO # Could take in consideration the size of regions and # the whole coverage of the gene (number of amplicons * seqLength) # > 50 bp # panelStatistics(genomicRangesFromBed) { # # hist(width(ranges(genomicRangesFromBed))) # # genomicRangesFromBed$geneNames # # } # boxplotGeneLevel <- function(allSamplesNormalized, # cohortFilepaths, # referenceFilepathsKNN, # referenceFilepathsPercentil, # geneName, # outputDir = file.path(getwd(),"batchAnalysisNoMininumMappingQuality2")) { # #boxplotPerAmplicon <- function(ampliconIndex) { # #geneName <- "BRAF" # #geneName <- "APC" # # geneName <- "ERBB4" # # # setwd(outputDir) # # # cohortFilepaths <- filepaths # martina <- allSamplesNormalized[, (colnames(allSamplesNormalized) %in% cohortFilepaths)] # # nonMartina <- allSamplesNormalized[, !(colnames(allSamplesNormalized) %in% cohortFilepaths)] # # # for (geneName in geneNames) { # # # # ampliconNames <- rownames(allSamplesNormalized) # myAmpliconsIndexes <- grep(geneName, ampliconNames) # # # pdf(file=paste0(geneName,'_plot.pdf'), width=10, height= ceiling(length(myAmpliconsIndexes)/2) * 5) # png(file=paste0(geneName,'_plot.png'), width=10, height= ceiling(length(myAmpliconsIndexes)/2) * 5) # # par(mfrow=c(ceiling(length(myAmpliconsIndexes)/2), 2), mar=c(3,3,5,1)) # # for (ampliconName in ampliconNames[myAmpliconsIndexes]) { # # ampliconName <- "chr7_140453102_140453221_BRAF_Ex15-NM_004333" # tmp <- data.frame(y = c(as.vector(nonMartina[ampliconName, ]), # as.vector(nonMartina[ampliconName, referenceFilepathsKNN]), # as.vector(nonMartina[ampliconName, referenceFilepathsPercentil]), # as.vector(martina[ampliconName, ])), # x = c(rep("nonMartina",length(as.vector(nonMartina[ampliconName, ]))), # rep("referenceKNN", length(as.vector(nonMartina[ampliconName, referenceFilepathsKNN]))), # rep("referencePercentil", length(as.vector(nonMartina[ampliconName, referenceFilepathsPercentil]))), # rep("martina", length(as.vector(martina[ampliconName, ]))))) # # boxplot(y~x, data=tmp, ylab="Read Counts", xlab="Sample set", main = ampliconName) # stripchart(y ~ x, # vertical = TRUE, # data = tmp, # method = "jitter", # add = TRUE, # pch = 20, # col = 'blue') # } # dev.off() # } # } # i <- 0 # for (x in 1:100) { # i <- i+1 # if (i %% 10 == 0) { # Sys.sleep(1) # cat(".") # } # } #