git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/SICtools@115433 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -1,7 +1,7 @@ |
1 | 1 |
Package: SICtools |
2 | 2 |
Type: Package |
3 | 3 |
Title: Find SNV/Indel differences between two bam files with near relationship |
4 |
-Version: 1.1.0 |
|
4 |
+Version: 1.1.1 |
|
5 | 5 |
Date: 2014-12-11 |
6 | 6 |
Author: Xiaobin Xing, Wu Wei |
7 | 7 |
Maintainer: Xiaobin Xing <xiaobinxing0316@gmail.com> |
... | ... |
@@ -15,7 +15,7 @@ Description: This package is to find SNV/Indel differences between two |
15 | 15 |
License: GPL (>=2) |
16 | 16 |
LazyLoad: Yes |
17 | 17 |
Depends: R (>= 3.0.0), methods, Rsamtools (>= 1.18.1), doParallel (>= 1.0.8), Biostrings (>= 2.32.1), stringr (>= 0.6.2), matrixStats |
18 |
- (>= 0.10.0) |
|
18 |
+ (>= 0.10.0), plyr (>= 1.8.3) |
|
19 | 19 |
Imports: foreach, IRanges, GenomicRanges |
20 | 20 |
Suggests: knitr, RUnit, BiocGenerics |
21 | 21 |
biocViews: Alignment, Sequencing, Coverage, SequenceMatching, QualityControl, DataImport, Software, SNP, VariantDetection |
... | ... |
@@ -73,7 +73,7 @@ function(ppIndel,rowIndex,refFsa,pValueCutOff,gtDistCutOff,verbose){ |
73 | 73 |
## select two most alts |
74 | 74 |
if(ncol(gtMtx) >= 3){ |
75 | 75 |
|
76 |
- gtCountMtx <- gtMtx[,c('*',names(sort(colSums(gtMtx[,which(colnames(gtMtx) != '*'),drop=FALSE]), |
|
76 |
+ gtCountMtx <- gtMtx[,c('*',names(sort(colSums(gtMtx[,which(colnames(gtMtx) != '*')]), |
|
77 | 77 |
decreasing=TRUE)[1:2]))] |
78 | 78 |
|
79 | 79 |
}else if(ncol(gtMtx) == 2){ |
... | ... |
@@ -89,7 +89,7 @@ function(ppIndel,rowIndex,refFsa,pValueCutOff,gtDistCutOff,verbose){ |
89 | 89 |
gtPvalue <- ifelse(is.na(gtPvalue),1,gtPvalue) |
90 | 90 |
gtDist <- ifelse(is.na(gtDist),0,gtDist) |
91 | 91 |
|
92 |
- if(gtPvalue < pValueCutOff & gtDist > gtDistCutOff){ |
|
92 |
+ if(gtPvalue <= pValueCutOff & gtDist >= gtDistCutOff){ |
|
93 | 93 |
|
94 | 94 |
## format genotype to output |
95 | 95 |
delSeqLen <- as.numeric(str_extract(colnames(gtCountMtx),'-\\d+')) |
... | ... |
@@ -113,21 +113,23 @@ function(ppIndel,rowIndex,refFsa,pValueCutOff,gtDistCutOff,verbose){ |
113 | 113 |
if(ncol(gtCountMtx) == 3){ |
114 | 114 |
|
115 | 115 |
## output table |
116 |
- tmpOut <- as.character(c(rowDat[,2:3],colnames(gtCountMtx), |
|
116 |
+ tmpOut <- c(rowDat[,2:3],colnames(gtCountMtx), |
|
117 | 117 |
gtCountMtx[1,1], gtCountMtx[1,2],gtCountMtx[1,3], |
118 | 118 |
gtCountMtx[2,1], gtCountMtx[2,2],gtCountMtx[2,3], |
119 |
- pValue = format(gtPvalue,digits=3,scientific=TRUE), |
|
120 |
- gtDist = format(gtDist,digits=3,scientific=TRUE))) |
|
121 |
- |
|
119 |
+ pValue = gtPvalue, |
|
120 |
+ gtDist = gtDist) |
|
121 |
+ |
|
122 | 122 |
}else if(ncol(gtCountMtx) < 3){ |
123 | 123 |
|
124 | 124 |
## output table |
125 |
- tmpOut <- as.character(c(rowDat[,2:3],colnames(gtCountMtx),NA, |
|
125 |
+ tmpOut <- c(rowDat[,2:3],colnames(gtCountMtx),NA, |
|
126 | 126 |
gtCountMtx[1,1], gtCountMtx[1,2],NA, |
127 | 127 |
gtCountMtx[2,1], gtCountMtx[2,2],NA, |
128 |
- format(gtPvalue,digits=3,scientific=TRUE), |
|
129 |
- format(gtDist,digits=3,scientific=TRUE))) |
|
128 |
+ pValue = gtPvalue, |
|
129 |
+ gtDist = gtDist) |
|
130 | 130 |
} |
131 |
+ |
|
132 |
+ return(tmpOut) |
|
131 | 133 |
} |
132 | 134 |
} |
133 | 135 |
} |
... | ... |
@@ -2,6 +2,29 @@ indelDiff <- |
2 | 2 |
function(bam1,bam2,refFsa,regChr,regStart,regEnd,minBaseQuality = 13, |
3 | 3 |
minMapQuality = 0,nCores = 1,pValueCutOff = 0.05,gtDistCutOff = 0.1,verbose=TRUE){ |
4 | 4 |
|
5 |
+ ## for test |
|
6 |
+# library(doParallel) |
|
7 |
+# library(Rsamtools) |
|
8 |
+# library(matrixStats) |
|
9 |
+# library(plyr) |
|
10 |
+# |
|
11 |
+# bam1 <- '/g/steinmetz/xing/SICtoolsDebug/20160319/debug/ER.sorted.bam' |
|
12 |
+# bam2 <- '/g/steinmetz/xing/SICtoolsDebug/20160319/debug/d1c.0.01.bam' |
|
13 |
+# refFsa <- '/g/steinmetz/xing/SICtoolsDebug/20160319/debug/S288c.fa' |
|
14 |
+# regChr <- 'chr2' |
|
15 |
+# regStart <- 1 |
|
16 |
+# regEnd <- 230218 |
|
17 |
+# minBaseQuality <- 13 |
|
18 |
+# minMapQuality <- 0 |
|
19 |
+# nCores <- 10 |
|
20 |
+# pValueCutOff <- 0.05 |
|
21 |
+# gtDistCutOff <- 0.1 |
|
22 |
+# verbose <- TRUE |
|
23 |
+# |
|
24 |
+ |
|
25 |
+ ## disable dimention dropping for matrix |
|
26 |
+ `[` <- function(...) base::`[`(...,drop=FALSE) |
|
27 |
+ |
|
5 | 28 |
## set NULL |
6 | 29 |
rowIndex <- NULL |
7 | 30 |
|
... | ... |
@@ -11,12 +34,12 @@ function(bam1,bam2,refFsa,regChr,regStart,regEnd,minBaseQuality = 13, |
11 | 34 |
if(!file.exists(refFsa)){stop("ERROR: input refFsa file doesn't exist!")} |
12 | 35 |
|
13 | 36 |
if(regChr < 0){stop("ERROR: please input valid regChr!")} |
14 |
- if(as.numeric(regStart) < 0){stop("ERROR: regStart should be more than 0!")} |
|
15 |
- if(as.numeric(regEnd) < 0){stop("ERROR: regEnd should be more than 0!")} |
|
16 |
- if(as.numeric(regStart) > as.numeric(regEnd)){stop("ERROR: regStart should be smaller than regEnd!")} |
|
37 |
+ regStart <- as.numeric(regStart);if(regStart < 0 ){stop("ERROR: regStart should be more than 0!")} |
|
38 |
+ regEnd <- as.numeric(regEnd);if(regEnd < 0 ){stop("ERROR: regEnd should be more than 0!")} |
|
39 |
+ if(regStart > regEnd){stop("ERROR: regStart should be smaller than regEnd!")} |
|
17 | 40 |
|
18 |
- regStart <- format(as.numeric(regStart),scientific=FALSE) |
|
19 |
- regEnd <- format(as.numeric(regEnd),scientific=FALSE) |
|
41 |
+ regStart <- format(regStart,scientific=FALSE) |
|
42 |
+ regEnd <- format(regEnd,scientific=FALSE) |
|
20 | 43 |
minBaseQuality <- as.numeric(minBaseQuality) |
21 | 44 |
minMapQuality <- as.numeric(minMapQuality) |
22 | 45 |
pValueCutOff <- as.numeric(pValueCutOff) |
... | ... |
@@ -31,46 +54,35 @@ function(bam1,bam2,refFsa,regChr,regStart,regEnd,minBaseQuality = 13, |
31 | 54 |
ppIndel <- read.delim(pipe(paste(pathSICtools,' mpileup -Q ',minBaseQuality,' -q ',minMapQuality, ' -Ogf ',refFsa,' ',bam1,' ',bam2,' -r ',regChr,':',regStart,'-',regEnd,sep='')), |
32 | 55 |
header=FALSE,colClasses = 'character',col.names=paste('V',1:12,sep='')) |
33 | 56 |
|
34 |
- ## remove items with too low indel hits |
|
35 |
- ppIndel <- ppIndel[which(!(unlist(lapply(ppIndel$V6, |
|
36 |
- function(x){sum(gregexpr('[+-]',x)[[1]] > 0)}))/as.numeric(ppIndel$V5) < 0.05 |
|
37 |
- & unlist(lapply(ppIndel$V10, |
|
38 |
- function(x){sum(gregexpr('[+-]',x)[[1]] > 0)}))/as.numeric(ppIndel$V9) < 0.05) |
|
39 |
- & as.numeric(ppIndel$V5) > 0 |
|
40 |
- & as.numeric(ppIndel$V9) > 0),,drop=FALSE] |
|
57 |
+ ## remove items with too low indel hits |
|
58 |
+ ppIndel <- ppIndel[which(!(unlist(lapply(gregexpr('[+-]',ppIndel$V6),function(x){sum(x > 0)}),use.names=FALSE)/as.numeric(ppIndel$V5) < 0.05 & |
|
59 |
+ |
|
60 |
+ unlist(lapply(gregexpr('[+-]',ppIndel$V10),function(x){sum(x > 0)}),use.names=FALSE)/as.numeric(ppIndel$V9) < 0.05) & |
|
61 |
+ as.numeric(ppIndel$V5) > 0 & as.numeric(ppIndel$V9) > 0),] |
|
41 | 62 |
|
42 |
- if(nrow(ppIndel) > 0){ |
|
63 |
+ if(nrow(ppIndel) > 0){ |
|
43 | 64 |
|
44 | 65 |
## test out |
45 |
- testOutIndel <- foreach(rowIndex=1:nrow(ppIndel)) %dopar% |
|
46 |
- .indelDiffFunc(ppIndel,rowIndex,refFsa,pValueCutOff,gtDistCutOff,verbose) |
|
47 |
- |
|
48 |
- ## remove NULL |
|
49 |
- testOutIndel <- testOutIndel[!sapply(testOutIndel,is.null)] |
|
66 |
+ testOutIndel <- ldply(1:nrow(ppIndel),function(rowIndex){.indelDiffFunc(ppIndel,rowIndex,refFsa,pValueCutOff,gtDistCutOff,verbose)},.parallel=TRUE) |
|
50 | 67 |
|
51 |
- ## combine the result |
|
52 |
- testOutIndel <- do.call(rbind,testOutIndel) |
|
53 |
- |
|
54 |
- if(!is.null(testOutIndel)){ |
|
68 |
+ if(nrow(testOutIndel) > 0){ |
|
55 | 69 |
|
56 |
- testIndelMtx <- testOutIndel[,-c(1,3:5),drop=FALSE] |
|
57 |
- class(testIndelMtx) <- 'numeric' |
|
58 |
- testIndelDf <- data.frame(chr=testOutIndel[,1],ref=testOutIndel[,3],alt1=testOutIndel[,4],alt2 = testOutIndel[,5],testIndelMtx,stringsAsFactors=FALSE) |
|
70 |
+ colnames(testOutIndel) <- c('chr','pos','ref','altGt1','altGt2','refBam1Count','altGt1Bam1Count', |
|
71 |
+ 'altGt2Bam1Count','refBam2Count','altGt1Bam2Count','altGt2Bam2Count','p.value','d.value') |
|
72 |
+ class(testOutIndel$pos) <- 'numeric' |
|
73 |
+ class(testOutIndel$refBam1Count) <- 'numeric' |
|
74 |
+ class(testOutIndel$altGt1Bam1Count) <- 'numeric' |
|
75 |
+ class(testOutIndel$altGt2Bam1Count) <- 'numeric' |
|
76 |
+ class(testOutIndel$refBam2Count) <- 'numeric' |
|
77 |
+ class(testOutIndel$altGt1Bam2Count) <- 'numeric' |
|
78 |
+ class(testOutIndel$altGt2Bam2Count) <- 'numeric' |
|
79 |
+ class(testOutIndel$p.value) <- 'numeric' |
|
80 |
+ class(testOutIndel$d.value) <- 'numeric' |
|
59 | 81 |
|
60 |
- ## at least one result |
|
61 |
- if(nrow(testIndelDf) > 0){ |
|
62 |
- |
|
63 |
- rownames(testIndelDf) <- 1:nrow(testIndelDf) |
|
64 |
- testIndelDf <- testIndelDf[,c(1,5,2:4,6:13)] |
|
65 |
- colnames(testIndelDf) <- c('chr','pos','ref','altGt1','altGt2','refBam1Count','altGt1Bam1Count', |
|
66 |
- 'altGt1Bam2Count','refBam2Count','altGt2Bam1Count','altGt2Bam2Count','p.value','d.value') |
|
67 |
- |
|
68 |
- testIndelDf <- testIndelDf[,c('chr','pos','ref','altGt1','altGt2','refBam1Count','altGt1Bam1Count','altGt2Bam1Count', |
|
69 |
- 'refBam2Count','altGt1Bam2Count','altGt2Bam2Count','p.value','d.value')] |
|
70 |
- |
|
71 |
- return(testIndelDf[,,drop=FALSE]) |
|
72 |
- |
|
73 |
- } |
|
82 |
+ return(testOutIndel) |
|
74 | 83 |
} |
84 |
+ |
|
85 |
+ }else{ |
|
86 |
+ NULL |
|
75 | 87 |
} |
76 | 88 |
} |
... | ... |
@@ -1,6 +1,29 @@ |
1 | 1 |
snpDiff <- |
2 |
-function(bam1,bam2,refFsa,regChr,regStart,regEnd,minBaseQuality = 13, |
|
3 |
- minMapQuality = 0,nCores = 1,pValueCutOff= 0.05,baseDistCutOff = 0.1,verbose=TRUE){ |
|
2 |
+ function(bam1,bam2,refFsa,regChr,regStart,regEnd,minBaseQuality = 13, |
|
3 |
+ minMapQuality = 0,nCores = 1,pValueCutOff= 0.05,baseDistCutOff = 0.1,verbose=TRUE){ |
|
4 |
+ |
|
5 |
+ |
|
6 |
+ ## for test |
|
7 |
+# library(doParallel) |
|
8 |
+# library(Rsamtools) |
|
9 |
+# library(matrixStats) |
|
10 |
+# library(plyr) |
|
11 |
+# |
|
12 |
+# bam1 <- '/g/steinmetz/xing/SICtoolsDebug/20160319/debug/ER.sorted.bam' |
|
13 |
+# bam2 <- '/g/steinmetz/xing/SICtoolsDebug/20160319/debug/d1c.0.01.bam' |
|
14 |
+# refFsa <- '/g/steinmetz/xing/SICtoolsDebug/20160319/debug/S288c.fa' |
|
15 |
+# regChr <- 'chr1' |
|
16 |
+# regStart <- 1 |
|
17 |
+# regEnd <- 230218 |
|
18 |
+# minBaseQuality <- 13 |
|
19 |
+# minMapQuality <- 0 |
|
20 |
+# nCores <- 10 |
|
21 |
+# pValueCutOff <- 0.05 |
|
22 |
+# baseDistCutOff <- 0.1 |
|
23 |
+# verbose <- TRUE |
|
24 |
+ |
|
25 |
+ ## disable dimention dropping for matrix |
|
26 |
+ `[` <- function(...) base::`[`(...,drop=FALSE) |
|
4 | 27 |
|
5 | 28 |
## validate the parameters |
6 | 29 |
if(!file.exists(bam1)){stop("ERROR: input bam1 file doesn't exist!")} |
... | ... |
@@ -8,9 +31,9 @@ function(bam1,bam2,refFsa,regChr,regStart,regEnd,minBaseQuality = 13, |
8 | 31 |
if(!file.exists(refFsa)){stop("ERROR: input refFsa file doesn't exist!")} |
9 | 32 |
|
10 | 33 |
if(regChr < 0 ){stop("ERROR: please input valid regChr!")} |
11 |
- if(as.numeric(regStart) < 0 ){stop("ERROR: regStart should be more than 0!")} |
|
12 |
- if(as.numeric(regEnd) < 0 ){stop("ERROR: regEnd should be more than 0!")} |
|
13 |
- if(as.numeric(regStart) > as.numeric(regEnd)){stop("ERROR: regStart should be smaller than regEnd!")} |
|
34 |
+ regStart <- as.numeric(regStart);if(regStart < 0 ){stop("ERROR: regStart should be more than 0!")} |
|
35 |
+ regEnd <- as.numeric(regEnd);if(regEnd < 0 ){stop("ERROR: regEnd should be more than 0!")} |
|
36 |
+ if(regStart > regEnd){stop("ERROR: regStart should be smaller than regEnd!")} |
|
14 | 37 |
|
15 | 38 |
pValueCutOff <- as.numeric(pValueCutOff) |
16 | 39 |
baseDistCutOff <- as.numeric(baseDistCutOff) |
... | ... |
@@ -27,30 +50,24 @@ function(bam1,bam2,refFsa,regChr,regStart,regEnd,minBaseQuality = 13, |
27 | 50 |
countMtx <- cbind(countMtx,x[['pos']]) |
28 | 51 |
|
29 | 52 |
## duplicated index |
30 |
- dupIndex <- duplicated(countMtx[,-c(5,10,11),drop=FALSE]) |
|
53 |
+ dupIndex <- duplicated(countMtx[,-c(5,10,11)]) |
|
31 | 54 |
|
32 | 55 |
## max bam1 == max bam2 index |
33 |
- maxBaseIndex <- rowMaxs(countMtx[,1:4,drop=FALSE])/rowSums(countMtx[,1:4,drop=FALSE]) > 0.95 & |
|
34 |
- rowMaxs(countMtx[,6:9,drop=FALSE])/rowSums(countMtx[,6:9,drop=FALSE]) > 0.95 & |
|
35 |
- max.col(countMtx[,1:4,drop=FALSE]) == max.col(countMtx[,6:9,drop=FALSE]) |
|
36 |
- |
|
37 |
-# maxBaseIndex <- rowMaxs(countMtx[,,drop=FALSE],cols=1:4)/rowSums(countMtx[,,drop=FALSE],cols=1:4) > 0.95 & |
|
38 |
-# rowMaxs(countMtx[,,drop=FALSE],cols=6:9)/rowSums(countMtx[,,drop=FALSE],cols=6:9) > 0.95 & |
|
39 |
-# colMaxs(countMtx[,,drop=FALSE],cols=1:4) == colMaxs(countMtx[,,drop=FALSE],cols=6:9) |
|
56 |
+ maxBaseIndex <- rowMaxs(countMtx[,1:4])/rowSums(countMtx[,1:4]) > 0.95 & |
|
57 |
+ rowMaxs(countMtx[,6:9])/rowSums(countMtx[,6:9]) > 0.95 & |
|
58 |
+ max.col(countMtx[,1:4]) == max.col(countMtx[,6:9]) |
|
40 | 59 |
|
41 | 60 |
## all 0 test |
42 |
- allZeroIndex <- rowSums(countMtx[,1:4,drop=FALSE]) == 0 | rowSums(countMtx[,6:9,drop=FALSE]) == 0 |
|
43 |
-# allZeroIndex <- rowSums(countMtx[,,drop=FALSE],cols=1:4) == 0 | rowSums(countMtx[,,drop=FALSE],cols=6:9) == 0 |
|
44 |
- |
|
61 |
+ allZeroIndex <- rowSums(countMtx[,1:4]) == 0 | rowSums(countMtx[,6:9]) == 0 |
|
45 | 62 |
|
46 | 63 |
## test index |
47 | 64 |
testIndex <- !dupIndex & !maxBaseIndex & !allZeroIndex |
48 | 65 |
|
49 | 66 |
if(sum(testIndex) > 0){ ## at least one candidate |
50 | 67 |
|
51 |
- countMtxTest <- countMtx[testIndex,,drop=FALSE] |
|
68 |
+ countMtxTest <- countMtx[testIndex,] |
|
52 | 69 |
|
53 |
- testTmp <- lapply(1:nrow(countMtxTest),function(rowIndex){ |
|
70 |
+ testResultMtx <- do.call(rbind,llply(1:nrow(countMtxTest),function(rowIndex){ |
|
54 | 71 |
|
55 | 72 |
rowInfo <- countMtxTest[rowIndex,] |
56 | 73 |
|
... | ... |
@@ -58,7 +75,7 @@ function(bam1,bam2,refFsa,regChr,regStart,regEnd,minBaseQuality = 13, |
58 | 75 |
testMtx <- matrix(rowInfo[c(1:4,6:9)],nrow=2,byrow=TRUE) |
59 | 76 |
|
60 | 77 |
## remove 0 columns |
61 |
- testMtx <- testMtx[,colSums(testMtx) > 0,drop=FALSE] |
|
78 |
+ testMtx <- testMtx[,colSums(testMtx) > 0] |
|
62 | 79 |
|
63 | 80 |
## get pValue and dist |
64 | 81 |
pValue <- fisher.test(testMtx)$p.value |
... | ... |
@@ -69,39 +86,24 @@ function(bam1,bam2,refFsa,regChr,regStart,regEnd,minBaseQuality = 13, |
69 | 86 |
|
70 | 87 |
if(pValue <= as.numeric(pValueCutOff) & baseDist >= as.numeric(baseDistCutOff)){ |
71 | 88 |
|
72 |
- as.character(c(names(x[['seqnames']]),rowInfo[11], |
|
73 |
- rowInfo[1],rowInfo[2],rowInfo[3],rowInfo[4],rowInfo[5], |
|
74 |
- rowInfo[6],rowInfo[7],rowInfo[8],rowInfo[9],rowInfo[10], |
|
75 |
- format(pValue,digits=3,scientific=TRUE), |
|
76 |
- format(baseDist,digits=3,scientific=TRUE))) |
|
89 |
+ c(rowInfo[1],rowInfo[2],rowInfo[3],rowInfo[4],rowInfo[5], |
|
90 |
+ rowInfo[6],rowInfo[7],rowInfo[8],rowInfo[9],rowInfo[10],rowInfo[11], |
|
91 |
+ pValue,baseDist) |
|
77 | 92 |
} |
78 |
- }) |
|
79 |
- |
|
80 |
- testTmp2 <- do.call(rbind,testTmp[,drop=FALSE]) |
|
81 |
- testMtx <- testTmp2[,-1,drop=FALSE] |
|
82 |
- class(testMtx) <- 'numeric' |
|
83 |
- testDf <- data.frame(chr = testTmp2[,1],testMtx,stringsAsFactors=FALSE) |
|
84 |
- |
|
93 |
+ })) |
|
94 |
+ |
|
85 | 95 |
## for genotype duplicated positions, trace back |
86 |
- countMtxDup <- countMtx[!maxBaseIndex & !allZeroIndex,,drop=FALSE] |
|
87 |
- countMtxDupDf <- as.data.frame(countMtxDup[duplicated(countMtxDup[,-11,drop=FALSE]),,drop=FALSE],stringsAsFactors=FALSE) |
|
96 |
+ countMtxDup <- countMtx[!maxBaseIndex & !allZeroIndex,] |
|
97 |
+ countMtxDupIndex <- duplicated(countMtxDup[,-11]) |
|
88 | 98 |
|
89 |
- if(nrow(countMtxDupDf) > 0 & nrow(testDf) > 0){ |
|
90 |
- |
|
91 |
- countMtxDupDfFull <- cbind(countMtxDupDf,testDf[match(apply(countMtxDupDf[,-11,drop=FALSE],1,paste,collapse=' '),apply(testDf[,3:12],1,paste,collapse=' ')),c('X12','X13')]) |
|
92 |
- countMtxDupDfFull <- countMtxDupDfFull[!is.na(countMtxDupDfFull$X12),,drop=FALSE] |
|
93 |
- |
|
94 |
- if(nrow(countMtxDupDfFull) > 0){ |
|
95 |
- countMtxDupDfFull[,'X14'] <- unique(testDf$chr) |
|
96 |
- countMtxDupDfFull <- countMtxDupDfFull[,c(14,11,1:10,12,13)] |
|
97 |
- colnames(countMtxDupDfFull) <- colnames(testDf) |
|
98 |
- testDf <- rbind(testDf,countMtxDupDfFull) |
|
99 |
- testDf <- testDf[order(testDf[,'X1']),] |
|
100 |
- } |
|
99 |
+ if(sum(countMtxDupIndex) > 0 & !is.null(testResultMtx)){ |
|
101 | 100 |
|
101 |
+ countMtxDupFull <- cbind(countMtxDup,testResultMtx[match(apply(countMtxDup[,1:10],1,paste,collapse='_'),apply(testResultMtx[,1:10],1,paste,collapse='_')),c(12,13)]) |
|
102 |
+ countMtxDupFull <- countMtxDupFull[!is.na(countMtxDupFull[,12]),] |
|
103 |
+ testResultMtx <- unique(rbind(testResultMtx,countMtxDupFull)) |
|
102 | 104 |
} |
103 | 105 |
|
104 |
- return(testDf) |
|
106 |
+ return(testResultMtx) |
|
105 | 107 |
} |
106 | 108 |
} |
107 | 109 |
|
... | ... |
@@ -111,12 +113,8 @@ function(bam1,bam2,refFsa,regChr,regStart,regEnd,minBaseQuality = 13, |
111 | 113 |
## build bam files for pileup |
112 | 114 |
ppFiles <- PileupFiles(c(bam1,bam2)) |
113 | 115 |
|
114 |
- ## get region info |
|
115 |
- regStart <- as.numeric(regStart) |
|
116 |
- regEnd <- as.numeric(regEnd) + 1 |
|
117 |
- |
|
118 | 116 |
## split into 10M block |
119 |
- regSplit <- shift(as(breakInChunks(regEnd - regStart,1e7L),'IRanges'),regStart-1) |
|
117 |
+ regSplit <- shift(as(breakInChunks(regEnd - regStart + 1,1e7L),'IRanges'),regStart-1) |
|
120 | 118 |
|
121 | 119 |
## for each region |
122 | 120 |
regSplitDiffFunc <- function(splitIndex){ |
... | ... |
@@ -127,7 +125,7 @@ function(bam1,bam2,refFsa,regChr,regStart,regEnd,minBaseQuality = 13, |
127 | 125 |
} |
128 | 126 |
|
129 | 127 |
regSplitParam <- ApplyPileupsParam(flag = scanBamFlag(isDuplicate=FALSE), |
130 |
- which=GRanges(regChr, IRanges(start=start(regSplit)[splitIndex], end=end(regSplit)[splitIndex])), |
|
128 |
+ which=GRanges(regChr, IRanges(start=start(regSplit[splitIndex]), end=end(regSplit[splitIndex]))), |
|
131 | 129 |
what='seq',minDepth = 0L,minBaseQuality, minMapQuality,maxDepth = 2500L, yieldBy = 'range') |
132 | 130 |
|
133 | 131 |
applyPileups(ppFiles,calcInfoRange,param=regSplitParam)[[1]] |
... | ... |
@@ -138,23 +136,21 @@ function(bam1,bam2,refFsa,regChr,regStart,regEnd,minBaseQuality = 13, |
138 | 136 |
splitIndex <- NULL |
139 | 137 |
|
140 | 138 |
## range out |
141 |
- rangeOut <- foreach(splitIndex=1:length(regSplit)) %dopar% regSplitDiffFunc(splitIndex) |
|
142 |
- |
|
143 |
- ## remove NULL |
|
144 |
- rangeOut <- rangeOut[!sapply(rangeOut,is.null)] |
|
145 |
- |
|
146 |
- ## combine the results |
|
147 |
- rangeOut <- do.call(rbind,rangeOut) |
|
148 |
- |
|
149 |
- if(!is.null(rangeOut)){ |
|
139 |
+ rangeOut <- ldply(1:length(regSplit),regSplitDiffFunc,.parallel=TRUE) |
|
140 |
+ |
|
150 | 141 |
if(nrow(rangeOut) > 0){ |
151 | 142 |
|
143 |
+ colnames(rangeOut) <- c('A1','C1','G1','T1','N1','A2','C2','G2','T2','N2','pos','p.value','d.value') |
|
144 |
+ rangeOut$chr <- regChr |
|
145 |
+ |
|
152 | 146 |
## get ref by Biostrings |
153 |
- rangeOut$ref <- as.character(getSeq(FaFile(refFsa),GRanges(seqnames=rangeOut[,1],IRanges(start=rangeOut[,2],width=1)))) |
|
147 |
+ rangeOut$ref <- as.character(getSeq(FaFile(refFsa),GRanges(seqnames=rangeOut$chr,IRanges(start=rangeOut$pos,width=1)))) |
|
148 |
+ rangeOut <- rangeOut[order(rangeOut$pos),] |
|
154 | 149 |
|
155 | 150 |
## format output |
156 |
- rangeOut <- rangeOut[,c(1,2,15,3:14),drop=FALSE] |
|
157 |
- colnames(rangeOut) <- c('chr','pos','ref','A1','C1','G1','T1','N1','A2','C2','G2','T2','N2','p.value','d.value') |
|
158 |
- return(rangeOut[,,drop=FALSE]) |
|
159 |
- }} |
|
151 |
+ rangeOut <- rangeOut[,c('chr','pos','ref','A1','C1','G1','T1','N1','A2','C2','G2','T2','N2','p.value','d.value')] |
|
152 |
+ return(rangeOut) |
|
153 |
+ }else{ |
|
154 |
+ NULL |
|
155 |
+ } |
|
160 | 156 |
} |