Browse code

Bug fixes to illumina genotype calling when using cdfName="nopackage" option

mritchie authored on 19/04/2018 07:17:14
Showing1 changed files
... ...
@@ -144,7 +144,11 @@ priormeans=prepriormeans
144 144
     XIndex = seq(1:nrow(cnSet))[chromosome(cnSet)==23 & isSnp(cnSet)]
145 145
 ### impute gender if gender information not provided
146 146
     if (is.null(gender)) {
147
-       gender = krlmmImputeGender(cnSet, annotation, YIndex, verbose, offset=offset)
147
+       if(sum(is.na(YIndex))==length(YIndex)){
148
+		gender=NULL
149
+       }else{
150
+	gender = krlmmImputeGender(cnSet, annotation, YIndex, verbose, offset=offset)
151
+       }
148 152
     }
149 153
 
150 154
 ### double-check ChrY ChrX SNP cluster, impute gender if gender information not provided
Browse code

added code to allow genotype calling for Illumina chips without the need for region specific package

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@126739 bc3139a8-67e5-0310-9ffc-ced21a209358

unknown authored on 14/02/2017 03:47:52
Showing1 changed files
... ...
@@ -25,8 +25,7 @@ krlmm <- function(cnSet, cdfName, gender=NULL, trueCalls=NULL, verbose=TRUE) {
25 25
     colnames(scores) = colnames(M)  
26 26
     
27 27
     priormeans = calculatePriorValues(M, numSNP, verbose)
28
-    VGLMparameters = calculateParameters(M, priormeans, numSNP, verbose)
29
-  
28
+      
30 29
 ### retrieve or calculate coefficients
31 30
     krlmmCoefficients = getKrlmmVGLMCoefficients(pkgname, trueCalls, VGLMparameters, verbose, numSample, colnames(M))
32 31
     
... ...
@@ -83,7 +82,7 @@ krlmmnopkg <- function(cnSet, offset, gender=NULL, normalize.method=NULL, annota
83 82
     }
84 83
 
85 84
     message("Leaving out non-variant SNPs")
86
-
85
+    
87 86
     # For SNPs with less than 3 distinct data point, exclude them from downstream analysis
88 87
     uniqueCount = apply(M[,], 1, function(x){length(unique(x))})
89 88
     SNPtoProcessIndex = uniqueCount >= 3
... ...
@@ -113,8 +112,18 @@ krlmmnopkg <- function(cnSet, offset, gender=NULL, normalize.method=NULL, annota
113 112
     }
114 113
     colnames(trueCalls)=colnames(M)
115 114
     rownames(trueCalls)=rownames(M)
116
-
117
-    priormeans = calculatePriorValues(M, numSNP, verbose)
115
+if(numSample<=30){
116
+  priormeans=prepriormeans
117
+}else{
118
+  priormeans = calculatePriorValues(M, numSNP, verbose)
119
+    if((abs(priormeans[1]-prepriormeans[1])+abs(priormeans[2]-prepriormeans[2])+abs(priormeans[3]-prepriormeans[3]))>=3){
120
+priormeans=prepriormeans
121
+}else{
122
+      priormeans=priormeans
123
+  }
124
+ }
125
+ 
126
+    
118 127
     VGLMparameters = calculateParameters(M, priormeans, numSNP, verbose)
119 128
 
120 129
 ### retrieve or calculate coefficients
... ...
@@ -131,23 +140,24 @@ krlmmnopkg <- function(cnSet, offset, gender=NULL, normalize.method=NULL, annota
131 140
 ### assign confidence scores
132 141
     computeCallPr(scores, M, calls, numSNP, numSample, verbose)
133 142
 
134
-    YIndex = seq(1:nrow(cnSet))[chromosome(cnSet)==24 & isSnp(cnSet)]
135
-
143
+   YIndex = seq(1:nrow(cnSet))[chromosome(cnSet)==24 & isSnp(cnSet)]
144
+    XIndex = seq(1:nrow(cnSet))[chromosome(cnSet)==23 & isSnp(cnSet)]
136 145
 ### impute gender if gender information not provided
137 146
     if (is.null(gender)) {
138
-        gender = krlmmImputeGender(cnSet, annotation, YIndex, verbose, offset=offset)
147
+       gender = krlmmImputeGender(cnSet, annotation, YIndex, verbose, offset=offset)
139 148
     }
140 149
 
141
-### double-check ChrY SNP cluster, impute gender if gender information not provided
150
+### double-check ChrY ChrX SNP cluster, impute gender if gender information not provided
142 151
     if (!(is.null(gender))) {
143 152
         verifyChrYSNPs(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose)
153
+        verifyChrXSNPs(cnSet, M, calls, gender, annotation, XIndex, priormeans, verbose)
144 154
     }
145 155
 
146 156
 #    if (length(YIndex)>0  && !(is.null(gender))) {
147 157
 #      verifyChrYSNPs(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose)
148 158
 #    }
149 159
 ### add back SNPs excluded before
150
-    AddBackNoVarianceSNPs(cnSet, calls, scores, numSNP, numSample, SNPtoProcessIndex, noVariantSNPIndex)
160
+   AddBackNoVarianceSNPs(cnSet, calls, scores, numSNP, numSample, SNPtoProcessIndex, noVariantSNPIndex)
151 161
 
152 162
     close(calls)
153 163
     close(scores)
... ...
@@ -170,7 +180,7 @@ loessnormalization<- function(cnSet, verbose, offset=16, blockSize = 300000){
170 180
 
171 181
     numSNP <- nrow(A)
172 182
     numSample <- ncol(A)
173
-    library(limma)
183
+   
174 184
 
175 185
     M <- oligoClasses::initializeBigMatrix(name="M", numSNP, numSample, vmode = "double")
176 186
     S <- oligoClasses::initializeBigMatrix(name="S", numSNP, numSample, vmode = "double")
... ...
@@ -927,7 +937,7 @@ krlmmImputeGender <- function(cnSet, annotation, YIndex, verbose, offset=0){
927 937
     S = computeAverageLogIntensity(cnSet, verbose, offset=offset) 
928 938
 
929 939
     # S is calculated and saved in original SNP order. 
930
-    matchy = match(annotation[YIndex, "Name"], rownames(S))
940
+    matchy = match(rownames(annotation)[YIndex], rownames(S))
931 941
     matchy = matchy[!is.na(matchy)]
932 942
     if (length(matchy) <= 10){
933 943
         predictedGender = rep(NA, ncol(S))
... ...
@@ -955,8 +965,8 @@ krlmmImputeGender <- function(cnSet, annotation, YIndex, verbose, offset=0){
955 965
     priorS = apply(allS, 2, FUN="median", na.rm=TRUE)
956 966
 
957 967
     if (abs(priorS[1] - priorS[2]) <= 1.6) {
958
-        message("Separation between clusters too small (samples probabaly all the same gender): skipping gender prediction step");
959
-        predictedGender = rep(NA, ncol(Sy))
968
+        message("Separation between clusters too small (samples probabaly all the same gender): ");
969
+      
960 970
     }
961 971
     
962 972
     meanmatrix = apply(Sy, 2, median)
... ...
@@ -986,7 +996,7 @@ verifyChrYSNPs <- function(cnSet, M, calls, gender, annotation, YIndex, priormea
986 996
     open(callsGt)
987 997
     open(callsPr)
988 998
        
989
-    matchy = match(annotation[YIndex, "Name"], rownames(M))
999
+    matchy = match(rownames(annotation)[YIndex], rownames(M))
990 1000
     matchy = matchy[!is.na(matchy)]
991 1001
    
992 1002
     MChrY = M[matchy,]
... ...
@@ -1031,3 +1041,58 @@ verifyChrYSNPs <- function(cnSet, M, calls, gender, annotation, YIndex, priormea
1031 1041
     
1032 1042
     if (verbose) message("Done verifying SNPs on Chromosome Y")
1033 1043
 }
1044
+
1045
+
1046
+
1047
+
1048
+#######################################
1049
+verifyChrXSNPs <- function(cnSet, M, calls, gender, annotation, XIndex, priormeans, verbose){
1050
+    if (verbose) message("Start verifying SNPs on Chromosome X")
1051
+    callsGt = calls(cnSet)
1052
+    callsPr = snpCallProbability(cnSet)
1053
+    open(callsGt)
1054
+    open(callsPr)
1055
+      
1056
+    matchx = match(rownames(annotation)[XIndex], rownames(M))
1057
+    matchx = matchx[!is.na(matchx)]
1058
+   
1059
+    MChrX= M[matchx,]
1060
+    callsChrX = callsGt[matchx,]
1061
+  
1062
+    male = gender == 1
1063
+    female = gender == 2    
1064
+    
1065
+    checkK = apply(callsChrX[, male], 1, function(x){ length(unique(x[!is.na(x)])) } )
1066
+    
1067
+    for(i in 1:nrow(MChrX)){
1068
+      
1069
+        # Chromosome X SNPs for male, k = 1 or 2 permitted for male samples
1070
+thisChrXSNPorigPosition = match(rownames(callsChrX)[i], rownames(callsGt))
1071
+        # early exit for k = 1 or 2 cases. Only re-assign calls to male samples if we previouly assigned
1072
+        # male samples to 3 clusters by mistake. 
1073
+        if (checkK[i] < 3) next;
1074
+          
1075
+        if (class(try(kmeans(MChrX[i, male], c(priormeans[1], priormeans[3]), nstart=1), TRUE)) != "try-error"){
1076
+           
1077
+            maleSampleCalls = kmeans(MChrX[i, male],c(priormeans[1], priormeans[3]), nstart=45)$cluster
1078
+            callsGt[thisChrXSNPorigPosition, male][maleSampleCalls == 1] = 1
1079
+            callsGt[thisChrXSNPorigPosition, male][maleSampleCalls == 2] = 3
1080
+        } else {
1081
+                    
1082
+            distanceToPrior1 = mean(abs(MChrX[i, male] - priormeans[1]))
1083
+            distanceToPrior3 = mean(abs(MChrX[i, male] - priormeans[3]))                
1084
+            callsGt[thisChrXSNPorigPosition, male] = ifelse(distanceToPrior1 < distanceToPrior3, 1, 3)
1085
+        }
1086
+
1087
+        MMaleSamples = MChrX[i, male]
1088
+        callsMaleSample = callsGt[thisChrXSNPorigPosition, male]
1089
+        scoresMaleSample = .Call("krlmmConfidenceScore", t(as.matrix(MMaleSamples)), t(as.matrix(callsMaleSample)), PACKAGE="crlmm");
1090
+
1091
+       callsPr[thisChrXSNPorigPosition, male] = scoresMaleSample       
1092
+    }
1093
+
1094
+    close(callsGt)
1095
+    close(callsPr)
1096
+    
1097
+    if (verbose) message("Done verifying SNPs on Chromosome X")
1098
+}
Browse code

Added nopackage option for krlmm

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@117087 bc3139a8-67e5-0310-9ffc-ced21a209358

unknown authored on 03/05/2016 23:39:54
Showing1 changed files
... ...
@@ -1,10 +1,9 @@
1 1
 krlmm <- function(cnSet, cdfName, gender=NULL, trueCalls=NULL, verbose=TRUE) {
2
-  
3
-    pkgname <- getCrlmmAnnotationName(cdfName) # crlmm:::
2
+    pkgname <- getCrlmmAnnotationName(cdfName)
4 3
     
5 4
 ### pre-processing, output M    
6 5
     M = computeLogRatio(cnSet, verbose)
7
-    message("leaving out novariant SNPs")
6
+    message("Leaving out non-variant SNPs")
8 7
     
9 8
     # For SNPs with less than 3 distinct data point, exclude them from downstream analysis
10 9
     uniqueCount = apply(M[,], 1, function(x){length(unique(x))})
... ...
@@ -32,11 +31,11 @@ krlmm <- function(cnSet, cdfName, gender=NULL, trueCalls=NULL, verbose=TRUE) {
32 31
     krlmmCoefficients = getKrlmmVGLMCoefficients(pkgname, trueCalls, VGLMparameters, verbose, numSample, colnames(M))
33 32
     
34 33
 ### do VGLM fit, to predict k for each SNP
35
-    kPrediction <- predictKwithVGLM(VGLMparameters, krlmmCoefficients, verbose);
34
+    kPrediction <- predictKwithVGLM(VGLMparameters, krlmmCoefficients, verbose)
36 35
     rm(VGLMparameters)
37 36
     
38 37
 ### assign calls
39
-    assignCalls(calls, M, kPrediction, priormeans, numSNP, numSample, verbose);
38
+    assignCalls(calls, M, kPrediction, priormeans, numSNP, numSample, verbose)
40 39
     rm(kPrediction)
41 40
 
42 41
 ### assign confidence scores
... ...
@@ -70,6 +69,208 @@ krlmm <- function(cnSet, cdfName, gender=NULL, trueCalls=NULL, verbose=TRUE) {
70 69
     TRUE
71 70
 }
72 71
 
72
+krlmmnopkg <- function(cnSet, offset, gender=NULL, normalize.method=NULL, annotation=NULL, verbose=TRUE) {
73
+    if(is.null(normalize.method)){
74
+        message("No normalization method specified. Quantile normalization applied")
75
+        M=quantilenormalization(cnSet,verbose,offset=offset)
76
+    }else{
77
+        if(normalize.method=="quantile"){
78
+            M=quantilenormalization(cnSet,verbose,offset=offset)
79
+        }
80
+        if(normalize.method=="loess"){
81
+            M=loessnormalization(cnSet,verbose,offset=offset)
82
+	}
83
+    }
84
+
85
+    message("Leaving out non-variant SNPs")
86
+
87
+    # For SNPs with less than 3 distinct data point, exclude them from downstream analysis
88
+    uniqueCount = apply(M[,], 1, function(x){length(unique(x))})
89
+    SNPtoProcessIndex = uniqueCount >= 3
90
+    noVariantSNPIndex = uniqueCount < 3
91
+    M = M[SNPtoProcessIndex, ]
92
+
93
+    numSNP = nrow(M)
94
+    numSample = ncol(M)
95
+
96
+    calls = oligoClasses::initializeBigMatrix(name="calls", numSNP, numSample, vmode = "integer")
97
+    scores = oligoClasses::initializeBigMatrix(name="scores", numSNP, numSample, vmode = "double")
98
+    open(calls)
99
+    open(scores)
100
+
101
+    rownames(calls) = rownames(M)
102
+    rownames(scores) = rownames(M)
103
+    colnames(calls) = colnames(M)
104
+    colnames(scores) = colnames(M)
105
+
106
+    prepriormeans=calculatePriorcentersC(M,numSample)
107
+
108
+    trueCalls=matrix(NA,nrow(M),ncol(M))
109
+
110
+    for(i in 1:ncol(M)){
111
+      tmp=kmeans(M[,i],centers=prepriormeans,nstart=45)
112
+      trueCalls[,i]=tmp$cluster
113
+    }
114
+    colnames(trueCalls)=colnames(M)
115
+    rownames(trueCalls)=rownames(M)
116
+
117
+    priormeans = calculatePriorValues(M, numSNP, verbose)
118
+    VGLMparameters = calculateParameters(M, priormeans, numSNP, verbose)
119
+
120
+### retrieve or calculate coefficients
121
+
122
+    krlmmCoefficients = getKrlmmVGLMCoefficients(trueCalls=trueCalls,params=VGLMparameters,numSample=ncol(M),samplenames=colnames(M),verbose=T)
123
+### do VGLM fit, to predict k for each SNP
124
+    kPrediction <- predictKwithVGLM(VGLMparameters, krlmmCoefficients, verbose)
125
+    rm(VGLMparameters)
126
+
127
+### assign calls
128
+    assignCalls(calls, M, kPrediction, priormeans, numSNP, numSample, verbose=T)
129
+    rm(kPrediction)
130
+
131
+### assign confidence scores
132
+    computeCallPr(scores, M, calls, numSNP, numSample, verbose)
133
+
134
+    YIndex = seq(1:nrow(cnSet))[chromosome(cnSet)==24 & isSnp(cnSet)]
135
+
136
+### impute gender if gender information not provided
137
+    if (is.null(gender)) {
138
+        gender = krlmmImputeGender(cnSet, annotation, YIndex, verbose, offset=offset)
139
+    }
140
+
141
+### double-check ChrY SNP cluster, impute gender if gender information not provided
142
+    if (!(is.null(gender))) {
143
+        verifyChrYSNPs(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose)
144
+    }
145
+
146
+#    if (length(YIndex)>0  && !(is.null(gender))) {
147
+#      verifyChrYSNPs(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose)
148
+#    }
149
+### add back SNPs excluded before
150
+    AddBackNoVarianceSNPs(cnSet, calls, scores, numSNP, numSample, SNPtoProcessIndex, noVariantSNPIndex)
151
+
152
+    close(calls)
153
+    close(scores)
154
+    rm(calls)
155
+    rm(scores)
156
+    rm(M)
157
+    TRUE
158
+}
159
+
160
+
161
+#######################################################################################################
162
+
163
+loessnormalization<- function(cnSet, verbose, offset=16, blockSize = 300000){
164
+    # compute log-ratio in blocksize of 300,000 by default
165
+    message("Start computing log-ratios")
166
+    A <- A(cnSet)
167
+    B <- B(cnSet)
168
+    open(A)
169
+    open(B)
170
+
171
+    numSNP <- nrow(A)
172
+    numSample <- ncol(A)
173
+    library(limma)
174
+
175
+    M <- oligoClasses::initializeBigMatrix(name="M", numSNP, numSample, vmode = "double")
176
+    S <- oligoClasses::initializeBigMatrix(name="S", numSNP, numSample, vmode = "double")
177
+
178
+    rownames(M) = rownames(A)
179
+    colnames(M) = colnames(A)
180
+
181
+    numBlock = ceiling(numSNP / blockSize)
182
+    for (i in 1:numBlock){
183
+        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
184
+        subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset
185
+        subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset
186
+        subsetM = .Call("krlmmComputeM", subsetA, subsetB, PACKAGE="crlmm")
187
+        M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetM[1:nrow(subsetM), ]
188
+        subsetS = .Call("krlmmComputeS", subsetA, subsetB, PACKAGE="crlmm")
189
+        S[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetS[1:nrow(subsetS), ]
190
+
191
+        rm(subsetA, subsetB, subsetM,subsetS)
192
+
193
+    }
194
+    close(A)
195
+    close(B)
196
+    gc()
197
+
198
+    isna=is.na(M[,1])
199
+    prepriormeansL=calculatePriorcentersC(M[,][!isna,],numSample)
200
+    F01=abs(prepriormeansL[1])-prepriormeansL[2]   
201
+    F02=abs(prepriormeansL[3])+prepriormeansL[2]
202
+
203
+    for(i in 1:ncol(M)) {
204
+        isna=is.na(M[,i])
205
+        Msub = M[!isna,i]
206
+        Mna=M[isna,i]
207
+        Ssub = S[!isna,i]
208
+        Sna=S[isna,i]
209
+
210
+        km = kmeans(Msub, prepriormeansL)
211
+        ind1v2 = km$cluster==1
212
+        ind2v2 = km$cluster==2
213
+        ind3v2 = km$cluster==3
214
+
215
+        l1v2 = loessFit(Msub[ind1v2], Ssub[ind1v2])
216
+        l2v2 = loessFit(Msub[ind2v2], Ssub[ind2v2])
217
+        l3v2 = loessFit(Msub[ind3v2], Ssub[ind3v2])
218
+        Msub[ind1v2] = l1v2$residuals+F01
219
+        Msub[ind2v2] = l2v2$residuals
220
+        Msub[ind3v2] = l3v2$residuals-F02
221
+        Mnew=rbind(as.matrix(Msub),as.matrix(Mna))
222
+        rownames(Mnew)=c(rownames(as.matrix(Msub)),rownames(as.matrix(Mna)))
223
+        m=match(rownames(M),rownames(Mnew))
224
+        Mnew=Mnew[m]
225
+        M[,i] = Mnew
226
+        rm(Msub, Ssub, ind1v2, ind2v2, ind3v2, l1v2, l2v2, l3v2)
227
+    }
228
+    return(M)
229
+}
230
+
231
+
232
+#######################################################################################################
233
+
234
+quantilenormalization <- function(cnSet, verbose, offset=16, blockSize = 300000){
235
+    A <- A(cnSet)
236
+    B <- B(cnSet)
237
+    open(A)
238
+    open(B)
239
+
240
+    numSNP <- nrow(A)
241
+    numSample <- ncol(A)
242
+
243
+    M <- oligoClasses::initializeBigMatrix(name="M", numSNP, numSample, vmode = "double")
244
+    X <- oligoClasses::initializeBigMatrix(name="X", numSNP, numSample, vmode = "integer")
245
+    Y <- oligoClasses::initializeBigMatrix(name="Y", numSNP, numSample, vmode = "integer")
246
+    rownames(M) = rownames(X) = rownames(Y) = rownames(A)
247
+    colnames(M) = colnames(X) = colnames(Y) = colnames(A)
248
+
249
+    # This step may chew up a lot of memory...
250
+    X =  normalize.quantiles(as.matrix(A[,]))+offset
251
+    Y =  normalize.quantiles(as.matrix(B[,]))+offset
252
+
253
+    numBlock = ceiling(numSNP / blockSize)
254
+    for (i in 1:numBlock){
255
+        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
256
+        subsetXqws = as.matrix(X[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
257
+        subsetYqws = as.matrix(Y[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
258
+        subsetM = .Call("krlmmComputeM", subsetXqws, subsetYqws, PACKAGE="crlmm")
259
+        M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetM[1:nrow(subsetM), ]
260
+        rm(subsetYqws, subsetXqws, subsetM)
261
+    }
262
+
263
+   close(A)
264
+   close(B)
265
+   return(M)
266
+   delete(X)
267
+   delete(Y)
268
+   rm(X)
269
+   rm(Y)
270
+   gc()
271
+}
272
+
273
+
73 274
 #######################################################################################################
74 275
 
75 276
 getNumberOfCores <- function(){
... ...
@@ -80,11 +281,27 @@ getNumberOfCores <- function(){
80 281
 
81 282
 #######################################################################################################
82 283
 
83
-computeLogRatio <- function(cnSet, verbose, blockSize = 500000){
284
+computeLogRatio <- function(cnSet, verbose=FALSE, offset=0, blockSize = 500000, col=NULL, row=NULL){
84 285
     # compute log-ratio in blocksize of 500,000 by default
85
-    message("Start computing log ratio")
86
-    A <- A(cnSet)
87
-    B <- B(cnSet)
286
+    if(verbose)
287
+        message("Start computing log-ratios")
288
+    if(!is.null(col) | !is.null(row)) {
289
+      if(is.null(row)) {
290
+       A <- A(cnSet)[,col]
291
+       B <- B(cnSet)[,col]
292
+      }
293
+      if(is.null(col)) {
294
+       A <- A(cnSet)[row,]
295
+       B <- B(cnSet)[row,]
296
+      }
297
+      if(!is.null(col) & !is.null(row)) {
298
+       A <- A(cnSet)[row,col]
299
+       B <- B(cnSet)[row,col]
300
+      }
301
+    } else {
302
+       A <- A(cnSet)
303
+       B <- B(cnSet)
304
+    }
88 305
     open(A)
89 306
     open(B)
90 307
     
... ...
@@ -97,9 +314,9 @@ computeLogRatio <- function(cnSet, verbose, blockSize = 500000){
97 314
   
98 315
     numBlock = ceiling(numSNP / blockSize)
99 316
     for (i in 1:numBlock){
100
-        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
101
-        subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
102
-        subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])                     
317
+        if(verbose) message(" -- Processing segment ", i, " out of ", numBlock)
318
+        subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset
319
+        subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset              
103 320
         subsetM = .Call("krlmmComputeM", subsetA, subsetB, PACKAGE="crlmm")
104 321
         M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetM[1:nrow(subsetM), ]
105 322
         rm(subsetA, subsetB, subsetM)
... ...
@@ -107,15 +324,32 @@ computeLogRatio <- function(cnSet, verbose, blockSize = 500000){
107 324
     
108 325
     close(A)
109 326
     close(B)
110
-    if (verbose) message("Done computing log ratio")
327
+    if(verbose)
328
+        message("Done computing log-ratios")
111 329
     return(M)    
112 330
 }
113 331
 
114
-computeAverageLogIntensity <- function(cnSet, verbose, blockSize = 500000){
332
+computeAverageLogIntensity <- function(cnSet, verbose=FALSE, offset=0, blockSize = 500000, col=NULL, row=NULL){
115 333
     # compute average log intensity in blocksize of 500,000 by default
116
-    message("Start computing average log intensity")
117
-    A <- A(cnSet)
118
-    B <- B(cnSet)
334
+    if(verbose)
335
+        message("Start computing average log-intensities")
336
+    if(!is.null(col) | !is.null(row)) {
337
+      if(is.null(row)) {
338
+       A <- A(cnSet)[,col]
339
+       B <- B(cnSet)[,col]
340
+      }
341
+      if(is.null(col)) {
342
+       A <- A(cnSet)[row,]
343
+       B <- B(cnSet)[row,]
344
+      }
345
+      if(!is.null(col) & !is.null(row)) {
346
+       A <- A(cnSet)[row,col]
347
+       B <- B(cnSet)[row,col]
348
+      }
349
+    } else {
350
+       A <- A(cnSet)
351
+       B <- B(cnSet)
352
+    }
119 353
     open(A)
120 354
     open(B)
121 355
     
... ...
@@ -128,9 +362,9 @@ computeAverageLogIntensity <- function(cnSet, verbose, blockSize = 500000){
128 362
       
129 363
     numBlock = ceiling(numSNP / blockSize)
130 364
     for (i in 1:numBlock){
131
-        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
132
-        subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
133
-        subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])                     
365
+        if(verbose) message(" -- Processing segment ", i, " out of ", numBlock)
366
+        subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset
367
+        subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset                     
134 368
         subsetS = .Call("krlmmComputeS", subsetA, subsetB, PACKAGE="crlmm")
135 369
         S[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetS[1:nrow(subsetS), ]
136 370
         rm(subsetA, subsetB, subsetS)
... ...
@@ -138,9 +372,9 @@ computeAverageLogIntensity <- function(cnSet, verbose, blockSize = 500000){
138 372
     
139 373
     close(A)
140 374
     close(B)
141
-    if (verbose) message("Done computing average log intensity")
142
-    return(S)
143
-  
375
+    if(verbose)
376
+        message("Done computing average log-intensities")
377
+    return(S)  
144 378
 }
145 379
 
146 380
 
... ...
@@ -153,7 +387,7 @@ calculatePriorValues <- function(M, numSNP, verbose) {
153 387
         return(sort(tmp$centers, decreasing = T))
154 388
     }
155 389
 
156
-    if (verbose) message("Start calculating Prior Means")
390
+    if (verbose) message("Start calculating prior means")
157 391
     cl <- makeCluster(getNumberOfCores())
158 392
     centers <- parRapply(cl, M, calculateOneKmeans)
159 393
     stopCluster(cl) 
... ...
@@ -163,10 +397,27 @@ calculatePriorValues <- function(M, numSNP, verbose) {
163 397
       checksymmetric= apply(centers,1,function(x){abs(sum(x))})<1
164 398
       priormeans=apply(centers[checksymmetric,],2, FUN="median", na.rm=TRUE)
165 399
     }
166
-    if (verbose) message("Done calculating Prior Means")
400
+    if (verbose) message("Done calculating prior means")
167 401
     return(priormeans)
168 402
 }
169 403
 
404
+calculatePriorcentersC <- function(M, numSample) {
405
+
406
+    calculateOneKmeans <- function(x) {
407
+        tmp = kmeans(x, 3, nstart=45)
408
+        return(sort(tmp$centers, decreasing = T))
409
+    }
410
+
411
+    cl <- makeCluster(getNumberOfCores())
412
+    centers <- parCapply(cl, M, calculateOneKmeans)
413
+    stopCluster(cl) 
414
+    centers <- matrix(centers, numSample, 3, byrow = TRUE)
415
+    prepriormeans = apply(centers, 2, FUN="median", na.rm=TRUE)
416
+    
417
+    return(prepriormeans)
418
+}
419
+
420
+
170 421
 #######################################################################################################
171 422
 
172 423
 calculateKrlmmCoefficients <- function(trueCalls, params, numSample, samplenames){
... ...
@@ -378,7 +629,7 @@ assignCallsOneSNP <- function(x, priormeans, numSample){
378 629
 
379 630
 assignCalls <- function(callsGt, M, a, priormeans, numSNP, numSample, verbose, blockSize=500000){
380 631
     # process by block size of 500,000 by default
381
-    message("Start assign calls")
632
+    message("Start assigning calls")
382 633
        
383 634
     numBlock = ceiling(numSNP / blockSize)
384 635
 
... ...
@@ -402,14 +653,14 @@ assignCalls <- function(callsGt, M, a, priormeans, numSNP, numSample, verbose, b
402 653
         callsGt[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetcalls[1:thisnumSNP, ]
403 654
         rm(subsetM, subseta, subsetcalls)
404 655
     }
405
-    message("Done assign calls")
656
+    message("Done assigning calls")
406 657
 }
407 658
 
408 659
 #######################################################################################################
409 660
 
410 661
 
411 662
 calculateParameters <- function(M, priormeans, numSNP, verbose) {
412
-    if (verbose) message("Start calculating 3-clusters parameters")
663
+    if (verbose) message("Start calculating 3-cluster parameters")
413 664
     params3cluster <- calculateParameters3Cluster(M, priormeans, numSNP, verbose);
414 665
     if (verbose) message("Done calculating 3-cluster parameters")
415 666
 
... ...
@@ -636,7 +887,7 @@ calculateMahalDist1Cluster <- function(centers, sigma, priors, numSNP){
636 887
 
637 888
 computeCallPr <- function(callsPr, M, calls, numSNP, numSample, verbose, blockSize = 500000){
638 889
     # compute log-ratio in blocksize of 500,000 by default
639
-    if (verbose) message("Start computing confidence score")
890
+    if (verbose) message("Start computing confidence scores")
640 891
     
641 892
     numBlock = ceiling(numSNP / blockSize)
642 893
     for (i in 1:numBlock){
... ...
@@ -648,7 +899,7 @@ computeCallPr <- function(callsPr, M, calls, numSNP, numSample, verbose, blockSi
648 899
         rm(subsetM, subsetCalls, subsetCallProb)
649 900
     }
650 901
 
651
-    if (verbose) message("Done computing confidence score")
902
+    if (verbose) message("Done computing confidence scores")
652 903
 }
653 904
 
654 905
 #############################################
... ...
@@ -671,15 +922,15 @@ AddBackNoVarianceSNPs <- function(cnSet, calls, scores, numSNP, numSample, varia
671 922
 
672 923
 #############################################
673 924
 
674
-krlmmImputeGender <- function(cnSet, annotation, YIndex, verbose){
925
+krlmmImputeGender <- function(cnSet, annotation, YIndex, verbose, offset=0){
675 926
     if (verbose) message("Start imputing gender")    
676
-    S = computeAverageLogIntensity(cnSet, verbose) 
927
+    S = computeAverageLogIntensity(cnSet, verbose, offset=offset) 
677 928
 
678 929
     # S is calculated and saved in original SNP order. 
679
-    matchy = match(annotation[YIndex, 2], rownames(S))
930
+    matchy = match(annotation[YIndex, "Name"], rownames(S))
680 931
     matchy = matchy[!is.na(matchy)]
681 932
     if (length(matchy) <= 10){
682
-        predictedGender = rep(NA, ncol(A))
933
+        predictedGender = rep(NA, ncol(S))
683 934
     }
684 935
     Sy = S[matchy,]
685 936
 
... ...
@@ -704,8 +955,8 @@ krlmmImputeGender <- function(cnSet, annotation, YIndex, verbose){
704 955
     priorS = apply(allS, 2, FUN="median", na.rm=TRUE)
705 956
 
706 957
     if (abs(priorS[1] - priorS[2]) <= 1.6) {
707
-        message("Skipping gender prediction step");
708
-        predictedGender = rep(NA, ncol(Sy))        
958
+        message("Separation between clusters too small (samples probabaly all the same gender): skipping gender prediction step");
959
+        predictedGender = rep(NA, ncol(Sy))
709 960
     }
710 961
     
711 962
     meanmatrix = apply(Sy, 2, median)
... ...
@@ -735,7 +986,7 @@ verifyChrYSNPs <- function(cnSet, M, calls, gender, annotation, YIndex, priormea
735 986
     open(callsGt)
736 987
     open(callsPr)
737 988
        
738
-    matchy = match(annotation[YIndex, 2], rownames(M))
989
+    matchy = match(annotation[YIndex, "Name"], rownames(M))
739 990
     matchy = matchy[!is.na(matchy)]
740 991
    
741 992
     MChrY = M[matchy,]
Browse code

bug fix to krlmm

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@101808 bc3139a8-67e5-0310-9ffc-ced21a209358

unknown authored on 03/04/2015 00:01:45
Showing1 changed files
... ...
@@ -159,10 +159,10 @@ calculatePriorValues <- function(M, numSNP, verbose) {
159 159
     stopCluster(cl) 
160 160
     centers <- matrix(centers, numSNP, 3, byrow = TRUE)
161 161
     priormeans = apply(centers, 2, FUN="median", na.rm=TRUE)
162
-       if(abs(sum(priormeans))>1)
163
-        checksymmetric= apply(centers,1,function(x){abs(sum(x))})<1
162
+    if(abs(sum(priormeans))>1) {
163
+      checksymmetric= apply(centers,1,function(x){abs(sum(x))})<1
164 164
       priormeans=apply(centers[checksymmetric,],2, FUN="median", na.rm=TRUE)
165
- 
165
+    }
166 166
     if (verbose) message("Done calculating Prior Means")
167 167
     return(priormeans)
168 168
 }
Browse code

Added support for humanexome12v1.2a chip

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@94521 bc3139a8-67e5-0310-9ffc-ced21a209358

unknown authored on 25/09/2014 11:06:52
Showing1 changed files
... ...
@@ -159,6 +159,10 @@ calculatePriorValues <- function(M, numSNP, verbose) {
159 159
     stopCluster(cl) 
160 160
     centers <- matrix(centers, numSNP, 3, byrow = TRUE)
161 161
     priormeans = apply(centers, 2, FUN="median", na.rm=TRUE)
162
+       if(abs(sum(priormeans))>1)
163
+        checksymmetric= apply(centers,1,function(x){abs(sum(x))})<1
164
+      priormeans=apply(centers[checksymmetric,],2, FUN="median", na.rm=TRUE)
165
+ 
162 166
     if (verbose) message("Done calculating Prior Means")
163 167
     return(priormeans)
164 168
 }
Browse code

Changes to krlmm to avoid error that occurs in k-means clustering when all intensities are identical

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@88225 bc3139a8-67e5-0310-9ffc-ced21a209358

unknown authored on 01/04/2014 02:51:49
Showing1 changed files
... ...
@@ -1,657 +1,778 @@
1
-krlmm <- function(cnSet, cdfName, gender=NULL, trueCalls=NULL, verbose=TRUE) {
2
-  
3
-    pkgname <- getCrlmmAnnotationName(cdfName)
4
-    
5
-### pre-processing, output M    
6
-    M = computeLogRatio(cnSet, verbose) 
7
-    numSNP = nrow(M)
8
-    numSample = ncol(M)
9
-    
10
-    callsGt = calls(cnSet)
11
-    callsPr = snpCallProbability(cnSet)
12
-    open(callsGt)
13
-    open(callsPr)
14
-
15
-    priormeans = calculatePriorValues(M, numSNP, verbose)
16
-    VGLMparameters = calculateParameters(M, priormeans, numSNP, verbose)
17
-
18
-### retrieve or calculate coefficients
19
-    krlmmCoefficients = getKrlmmVGLMCoefficients(pkgname, trueCalls, VGLMparameters, verbose, numSample, colnames(M))
20
-    
21
-### do VGLM fit, to predict k for each SNP
22
-    kPrediction <- predictKwithVGLM(VGLMparameters, krlmmCoefficients, verbose);
23
-    rm(VGLMparameters)
24
-    
25
-### assign calls
26
-    assignCalls(callsGt, M, kPrediction, priormeans, numSNP, numSample, verbose);
27
-    rm(kPrediction)
28
-
29
-### assign confidence scores
30
-    computeCallPr(callsPr, M, callsGt, numSNP, numSample, verbose)
31
-    close(callsGt)
32
-    close(callsPr)
33
-
34
-### impute gender if gender information not provided
35
-    if (is.null(gender)) {
36
-      gender = krlmmImputeGender(cnSet, pkgname, verbose)
37
-    }
38
-    open(cnSet$gender)
39
-    cnSet$gender[,] = gender
40
-    close(cnSet$gender)
41
-
42
-    TRUE
43
-}
44
-
45
-#######################################################################################################
46
-
47
-getNumberOfCores <- function(){
48
-    defaultCores = min(detectCores(), 8)
49
-    return(getOption("krlmm.cores", defaultCores))    
50
-}
51
-
52
-
53
-#######################################################################################################
54
-
55
-computeLogRatio <- function(cnSet, verbose, blockSize = 500000){
56
-    # compute log-ratio in blocksize of 500,000 by default
57
-    message("Start computing log ratio")
58
-    A <- A(cnSet)
59
-    B <- B(cnSet)
60
-    open(A)
61
-    open(B)
62
-    
63
-    numSNP <- nrow(A)
64
-    numSample <- ncol(A)
65
-    
66
-    M <- oligoClasses::initializeBigMatrix(name="M", numSNP, numSample, vmode = "double")
67
-    rownames(M) = rownames(A)
68
-    colnames(M) = colnames(A)
69
-  
70
-    numBlock = ceiling(numSNP / blockSize)
71
-    for (i in 1:numBlock){
72
-        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
73
-        subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
74
-        subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])                     
75
-        subsetM = .Call("krlmmComputeM", subsetA, subsetB, PACKAGE="crlmm")
76
-        M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetM[1:nrow(subsetM), ]
77
-        rm(subsetA, subsetB, subsetM)
78
-    }
79
-    
80
-    close(A)
81
-    close(B)
82
-    if (verbose) message("Done computing log ratio")
83
-    return(M)
84
-}
85
-
86
-computeAverageLogIntensity <- function(cnSet, verbose, blockSize = 500000){
87
-    # compute average log intensity in blocksize of 500,000 by default
88
-    message("Start computing average log intensity")
89
-    A <- A(cnSet)
90
-    B <- B(cnSet)
91
-    open(A)
92
-    open(B)
93
-    
94
-    numSNP <- nrow(A)
95
-    numSample <- ncol(A)
96
-    
97
-    S <- oligoClasses::initializeBigMatrix(name="S", numSNP, numSample, vmode = "double")
98
-    
99
-    numBlock = ceiling(numSNP / blockSize)
100
-    for (i in 1:numBlock){
101
-        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
102
-        subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
103
-        subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])                     
104
-        subsetS = .Call("krlmmComputeS", subsetA, subsetB, PACKAGE="crlmm")
105
-        S[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetS[1:nrow(subsetS), ]
106
-        rm(subsetA, subsetB, subsetS)
107
-    }
108
-    
109
-    close(A)
110
-    close(B)
111
-    if (verbose) message("Done computing average log intensity")
112
-    return(S)
113
-}
114
-
115
-
116
-#######################################################################################################
117
-
118
-calculatePriorValues <- function(M, numSNP, verbose) {
119
-
120
-    calculateOneKmeans <- function(x) {
121
-        tmp = kmeans(x, 3, nstart=45)
122
-        return(sort(tmp$centers, decreasing = T))
123
-    }
124
-
125
-    if (verbose) message("Start calculating Prior Means")
126
-    cl <- makeCluster(getNumberOfCores())
127
-    centers <- parRapply(cl, M, calculateOneKmeans)
128
-    stopCluster(cl) 
129
-    centers <- matrix(centers, numSNP, 3, byrow = TRUE)
130
-    priormeans = apply(centers, 2, FUN="median", na.rm=TRUE)
131
-    if (verbose) message("Done calculating Prior Means")
132
-    return(priormeans)
133
-}
134
-
135
-#######################################################################################################
136
-
137
-calculateKrlmmCoefficients <- function(trueCalls, params, numSample, samplenames){
138
-    if (!require(VGAM)) {
139
-        message("VGAM package not found, fall back to use defined platform-specific coefficients")
140
-        return(NULL)
141
-    }
142
-    amatch = match(rownames(params), rownames(trueCalls))
143
-    amatch = amatch[!is.na(amatch)]
144
-    trueCalls = trueCalls[amatch,]
145
-
146
-    amatch = match(rownames(trueCalls), rownames(params))
147
-    params = params[amatch, ]
148
-    
149
-    amatch = match(samplenames, colnames(trueCalls))
150
-    amatch = amatch[!is.na(amatch)]    
151
-    trueCalls = trueCalls[, amatch]
152
-        
153
-    if ((numSample <= 40) && (ncol(trueCalls) < round(numSample/2))){
154
-        message("Sample size is ", numSample, ", KRLMM requires at least trueCalls of ", round(numSample/2), " samples to calculate coefficients")
155
-        return(NULL)
156
-    }
157
-    if ((numSample > 40) && (ncol(trueCalls) < 20)){
158
-        message("At least trueCalls of 20 samples are required to calculate coefficients")
159
-        return(NULL)
160
-    }
161
-    if (nrow(trueCalls) < 1000){
162
-        message("At lease trueCalls of 1000 SNPs are required to calculate coefficients")
163
-        return(NULL)
164
-    }
165
-
166
-    getna = apply(trueCalls, 1, function(x){sum(is.na(x))>=10})
167
-    truek = apply(trueCalls, 1, function(x){length(unique(x[!is.na(x)]))})
168
-   
169
-    params1 = params[!getna,]
170
-    truek1 = truek[!getna]
171
-
172
-    xx = data.frame(params1)
173
-    t = as.factor(as.numeric(truek1)) 
174
-    xx = data.frame(xx, t)
175
-    fit = suppressWarnings(VGAM::vglm(t~., multinomial(refLevel=1), xx))
176
-    coe = VGAM::coefficients(fit)
177
-    return(coe)    
178
-}
179
-
180
-getKrlmmVGLMCoefficients <- function(pkgname, trueCalls, params, verbose, numSample, samplenames){
181
-    if (!is.null(trueCalls)) {
182
-        coe = calculateKrlmmCoefficients(trueCalls, params, numSample, samplenames)
183
-        if (!is.null(coe)) {
184
-            if (verbose) message ("Done calculating platform-specific coefficients to predict number of clusters")
185
-            return(coe)
186
-        }
187
-    }
188
-    if (!is.null(trueCalls)) message("Fall back to use defined platform-specific coefficients")
189
-    if (verbose) message ("Retrieving defined platform-specific coefficients to predict number of clusters")
190
-    loader("krlmmVGLMCoefficients.rda", .crlmmPkgEnv, pkgname)
191
-    return(getVarInEnv("krlmmCoefficients"))      
192
-}
193
-
194
-
195
-predictKwithVGLM <- function(data, coe, verbose){
196
-    if (verbose) message("Start predicting number of clusters")
197
-    logit1 <- rep(0, nrow(data))
198
-    logit2 <- coe[1]+coe[3]*data[,1]+coe[5]*data[,2]+coe[7]*data[,3]+coe[9]*data[,4]+coe[11]*data[,5]+coe[13]*data[,6]+coe[15]*data[,7]+coe[17]*data[,8]
199
-    logit23 <- coe[2]+coe[4]*data[,1]+coe[6]*data[,2]+coe[8]*data[,3]+coe[10]*data[,4]+coe[12]*data[,5]+coe[14]*data[,6]+coe[16]*data[,7]+coe[18]*data[,8]
200
-   
201
-    logits <- cbind(logit1, logit2, logit23)
202
-    rm(logit1)
203
-    rm(logit2)
204
-    rm(logit23)
205
-    p.unscaled <- exp(logits)
206
-    rm(logits)   
207
-    p <- p.unscaled / rowSums(p.unscaled)
208
-    clusterPrediction = apply(p, 1, function(x){which.max(x)})
209
-    rm(p.unscaled)
210
-    rm(p)
211
-    if (verbose) message("Done predicting number of clusters")
212
-    return(clusterPrediction)
213
-}
214
-
215
-#######################################################################################################
216
-
217
-assignCalls3Cluster <- function(intensities, priormeans){
218
-    prior31 = c(priormeans[1]/2,priormeans[2],priormeans[3])
219
-    prior32 = c(priormeans[1],priormeans[2],priormeans[3]/2)	
220
-
221
-    emp <- rep(NA, length(intensities))
222
-	ansCalls <- emp
223
-    tmp <- try(kmeans(intensities, priormeans, nstart=1),TRUE)
224
-    if(class(tmp) == "try-error") {
225
-        tmp1 <- try(kmeans(intensities, prior31, nstart=1),TRUE)
226
-        if(class(tmp1) == "try-error"){
227
-            tmp2 <- try(kmeans(intensities, prior32, nstart=1),TRUE)
228
-            if(class(tmp2) == "try-error"){
229
-                ansCalls = emp
230
-            }else{
231
-                ansCalls = tmp2$cluster
232
-            }
233
-        }else{
234
-            ansCalls = tmp1$cluster
235
-        }
236
-    }else{
237
-        ansCalls = tmp$cluster
238
-    }
239
-    rm(prior31, prior32)
240
-    return(ansCalls)
241
- 
242
-}
243
-
244
-#######################################################################################################
245
-
246
-assignCalls2Cluster <- function(intensities, priormeans){
247
-    closest <- rep(NA, 3)	
248
-    for(j in 1:3){
249
-        distance <- as.vector(abs(priormeans[j]-intensities))
250
-        closest[j] <- intensities[which.min(distance)]
251
-    }
252
-    prior2 <- priormeans[priormeans!=priormeans[which.max(abs(closest-priormeans))]]
253
-	
254
-    emp <- rep(NA, length(intensities))
255
-    ansCalls <- emp    
256
-    tmp <- try(kmeans(intensities, prior2, nstart=1), TRUE)
257
-    if(class(tmp) == "try-error") {
258
-        ansCalls <- emp
259
-    }else{
260
-        if(prior2[1]==priormeans[2] && prior2[2]==priormeans[3]){
261
-            mp <- abs(tmp$centers[1]-tmp$centers[2])
262
-            pp <- abs(priormeans[2]-priormeans[3])
263
-            if((mp/pp)<=0.25){
264
-                ansCalls <- emp
265
-            }else{
266
-                ansCalls <- tmp$cluster+1
267
-            }
268
-        }
269
-        if(prior2[1]==priormeans[1] && prior2[2]==priormeans[2]){
270
-            mp=abs(tmp$centers[1]-tmp$centers[2])
271
-            pp=abs(priormeans[1]-priormeans[2])
272
-            if((mp/pp)<=0.25){
273
-                ansCalls = emp
274
-            }else{
275
-                ansCalls <- tmp$cluster
276
-            }
277
-        }
278
-        if(prior2[1]==priormeans[1] && prior2[2]==priormeans[3]){
279
-            mp <- abs(tmp$centers[1]-tmp$centers[2])
280
-            pp <- abs(priormeans[1]-priormeans[3])
281
-            if ((mp/pp) <=0.25){
282
-                ansCalls <- emp
283
-            }else{
284
-                ansCalls[tmp$cluster==1] <- 1
285
-                ansCalls[tmp$cluster==2] <- 3
286
-            }
287
-        }
288
-    }
289
-    rm(tmp)
290
-    return(ansCalls)
291
-}
292
-
293
-#######################################################################################################
294
-
295
-assignCalls1Cluster <- function(intensities, priormeans){
296
-    closest <- rep(NA, 3)
297
-    for(j in 1:3){
298
-        distance <- as.vector(abs(priormeans[j]-intensities))
299
-        closest[j]=intensities[which.min(distance)]
300
-    }
301
-    clusterindex <- which.min(abs(closest-priormeans))
302
-    return(rep(clusterindex, length(intensities)))
303
-}
304
-  
305
-#######################################################################################################
306
-
307
-assignCallsOneSNP <- function(x, priormeans, numSample){
308
-    tolerance = 1e-3
309
-       
310
-    k = x[numSample + 1]
311
-    values = x[1:numSample]  
312
-    
313
-    if (abs(k - 2) <= tolerance) {
314
-        tmp <- try(kmeans(values, priormeans, nstart=1),TRUE)
315
-        if (!(class(tmp)=="try-error")) {
316
-            k <- 3;
317
-        }
318
-    }
319
-
320
-    successful <- FALSE;
321
-    if (abs(k - 3) <= tolerance){
322
-        SNPCalls <- assignCalls3Cluster(values, priormeans)
323
-        successful <- !is.na(SNPCalls[1])
324
-        if (!successful) { 
325
-            k <- 2
326
-        }
327
-    }
328
-		
329
-    if ( (abs(k - 2) <= tolerance) && (!successful)){
330
-        SNPCalls <- assignCalls2Cluster(values, priormeans)
331
-        successful <- !is.na(SNPCalls[1])
332
-        if (!successful) { 
333
-            k <- 1
334
-        }			
335
-    }
336
-
337
-    if ( (abs(k - 1) <= tolerance) && (!successful)){
338
-        SNPCalls <- assignCalls1Cluster(values, priormeans)
339
-    }
340
-    return(SNPCalls)
341
-}
342
-
343
-
344
-assignCalls <- function(callsGt, M, a, priormeans, numSNP, numSample, verbose, blockSize=500000){
345
-    # process by block size of 500,000 by default
346
-    message("Start assign calls")
347
-       
348
-    numBlock = ceiling(numSNP / blockSize)
349
-
350
-    for (i in 1:numBlock){
351
-        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
352
-        subsetM = as.matrix(M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
353
-        thisnumSNP = nrow(subsetM)
354
-        subseta = a[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP)]
355
-        subseta = matrix(subseta, thisnumSNP, 1)
356
-
357
-        testValues = cbind(subsetM, subseta)
358
-
359
-        cl <- makeCluster(getNumberOfCores())
360
-        callAns <- parRapply(cl, testValues, assignCallsOneSNP, priormeans, numSample)
361
-        stopCluster(cl)
362
-
363
-        callAnsAllValues = unlist(callAns)
364
-
365
-        subsetcalls <- matrix(callAnsAllValues, thisnumSNP, numSample, byrow = TRUE)
366
-        
367
-        callsGt[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetcalls[1:thisnumSNP, ]
368
-        rm(subsetM, subseta, subsetcalls)
369
-    }
370
-    message("Done assign calls")
371
-}
372
-
373
-#######################################################################################################
374
-
375
-hardyweinberg <- function(x, minN=10){
376
-  if (length(x) < minN){
377
-    return(NA) 
378
-  } else {
379
-    result = .Call("krlmmHardyweinberg", x)
380
-    return(result)
381
-  }
382
-}
383
-
384
-calculateParameters <- function(M, priormeans, numSNP, verbose) {
385
-    if (verbose) message("Start calculating 3-clusters parameters")
386
-    params3cluster <- calculateParameters3Cluster(M, priormeans, numSNP, verbose);
387
-    if (verbose) message("Done calculating 3-cluster parameters")
388
-
389
-    if (verbose) message("Start calculating 2-cluster parameters")
390
-    params2cluster <- calculateParameters2Cluster(M, priormeans, numSNP, verbose);
391
-    if (verbose) message("Done calculating 2-cluster parameters")
392
-
393
-    if (verbose) message("Start calculating 1-cluster parameters")
394
-    params1cluster <- calculateParameters1Cluster(M, priormeans, numSNP, verbose);    
395
-    if (verbose) message("Done calculating 1-cluster parameters")
396
-
397
-    parameters <- cbind(as.matrix(params1cluster$elemh), as.matrix(params1cluster$eless), as.matrix(params2cluster$elemh), as.matrix(params2cluster$eless),
398
-                        as.matrix(params3cluster$elemh), as.matrix(params3cluster$eless), as.matrix(params2cluster$elehw), as.matrix(params3cluster$elehw));
399
-    
400
-    rownames(parameters) = rownames(M)
401
-    rm(params3cluster)
402
-    rm(params2cluster)
403
-    rm(params1cluster)
404
-    return(parameters)
405
-}
406
-
407
-
408
-calculateOneParams3Cluster <- function(x, priors, priorDown, priorUp){
409
-    tmp = try(kmeans(x, priors, nstart=1), TRUE)
410
-    if(class(tmp) == "kmeans") {
411
-        flag = 1
412
-     } else {
413
-        tmp = try(kmeans(x, priorDown, nstart=1), TRUE)
414
-        if(class(tmp) == "kmeans") {
415
-            flag = 2
416
-        } else {
417
-            tmp = try(kmeans(x, priorUp, nstart=1), TRUE)
418
-            if (class(tmp) == "kmeans") {
419
-                flag = 3
420
-            } else {
421
-                tmp = kmeans(x, 3, nstart=35)
422
-                flag = 4
423
-            }
424
-        }  
425
-    }
426
-    ss = sum(tmp$withinss)
427
-    hw = hardyweinberg(tmp$cluster) 
428
-    centers = sort(tmp$centers, decreasing = TRUE)
429
-
430
-    return(c(centers, ss, hw, flag))
431
-}
432
-
433
-calculateParameters3Cluster <- function(M, priormeans, numSNP, verbose) {
434
-    Apriors = priormeans
435
-    ApriorDown = c(Apriors[1]/2, Apriors[2], Apriors[3]) # shift-down
436
-    ApriorUp = c(Apriors[1], Apriors[2], Apriors[3]/2) # shift-up
437
-
438
-    cl <- makeCluster(getNumberOfCores())
439
-    parAns <- parRapply(cl, M, calculateOneParams3Cluster, Apriors, ApriorDown, ApriorUp)
440
-
441
-    stopCluster(cl)
442
-    parAnsAllValues = unlist(parAns)
443
-    parameters <- matrix(parAnsAllValues, numSNP, 6, byrow = TRUE)
444
-   
445
-    centers <- parameters[, 1:3]
446
-    ss <- parameters[, 4]
447
-    hw <- parameters[, 5]
448
-    flag <- parameters[, 6]
449
-
450
-    rm(parAns)
451
-    rm(parameters)
452
-   
453
-    sigma=solve(cov(centers, use="complete.obs"))
454
-    mh = calculateMahalDist3Cluster(centers, sigma, flag, Apriors, ApriorDown, ApriorUp, numSNP)
455
-
456
-    rm(sigma)
457
-    rm(centers)
458
-
459
-    gc()
460
-    return(list(eless = ss, elemh = mh, elehw = hw))
461
-}
462
-
463
-
464
-calculateMahalDist3Cluster <- function(centers, sigma, flag, priors, priorDown, priorUp, numSNP){
465
-    mahaldist = rep(NA, numSNP)
466
-
467
-    tolerance = 1e-3
468
-    for (i in 1:numSNP) {
469
-        if ((abs(flag[i] - 1) <= tolerance) || (abs(flag[i]- 4) <= tolerance)) difference = centers[i, ] - priors
470
-        if (abs(flag[i] - 2) <= tolerance) difference = centers[i, ] - priorDown
471
-        if (abs(flag[i] - 3) <= tolerance) difference = centers[i, ] - priorUp         
472
-        mahaldist[i] = as.vector(difference)%*%sigma%*%as.vector(difference)
473
-    }
474
-    return(mahaldist)
475
-}
476
-
477
-
478
-calculateOneParams2Cluster <- function(x, priors){
479
-  
480
-    aa = rep(NA, 3)   
481
-    for(j in 1:3){
482
-        dist = as.vector(abs(priors[j]-x))
483
-        aa[j]=x[which.min(dist)]
484
-    } 
485
-    prior2 = priors[priors!=priors[which.max(abs(aa-priors))]]
486
-     
487
-
488
-    tmp = try(kmeans(x, prior2, nstart=1), TRUE)
489
-    if (class(tmp)=="kmeans") {
490
-        centers = tmp$centers
491
-        hw = hardyweinberg(tmp$cluster)
492
-    }
493
-    rm(tmp)
494
-    tmp = kmeans(x, 2, nstart = 35)
495
-    if (tmp$centers[1] < tmp$centers[2]){
496
-        centers = c(tmp$centers[2], tmp$centers[1])
497
-        hw = hardyweinberg(3 - tmp$cluster)   
498
-    } else {
499
-        centers = tmp$centers
500
-        hw = hardyweinberg(tmp$cluster)
501
-    }
502
-    ss = sum(tmp$withinss)     
503
-    return(c(centers, prior2, ss, hw))
504
-}
505
-
506
-
507
-calculateParameters2Cluster <- function(M, priormeans, numSNP, verbose) {
508
-    Apriors = priormeans
509
-   
510
-    cl <- makeCluster(getNumberOfCores())
511
-    parAns <- parRapply(cl, M, calculateOneParams2Cluster, Apriors)
512
-    stopCluster(cl)
513
-
514
-    parAnsAllValues = unlist(parAns)
515
-    parameters <- matrix(parAnsAllValues, numSNP, 6, byrow = TRUE)
516
-
517
-    centers <- parameters[, 1:2]
518
-    priors2cluster <- parameters[, 3:4]
519
-    ss <- parameters[, 5]
520
-    hw <- parameters[, 6]
521
-    
522
-    rm(parAns)
523
-    rm(parameters)
524
-    sigma=solve(cov(centers, use="complete.obs"))
525
-
526
-    mh = calculateMahalDist2Cluster(centers, sigma, priors2cluster, numSNP)
527
-
528
-    rm(sigma)
529
-    rm(centers)
530
-
531
-    gc()
532
-    return(list(eless = ss, elemh = mh, elehw = hw))
533
-}
534
-
535
-
536
-calculateMahalDist2Cluster <- function(centers, sigma, priors, numSNP){
537
-    mahaldist = rep(NA, numSNP)
538
-    
539
-    for (i in 1:numSNP) {
540
-        difference <- centers[i,] - priors[i,]
541
-        mahaldist[i] = as.vector(difference)%*%sigma%*%as.vector(difference)        
542
-    }
543
-    return(mahaldist)
544
-}
545
-
546
-calculateOneParams1Cluster <- function(x, priors){
547
-    center = mean(x)
548
-    diff <- x - center
549
-    diffsquare <- diff^2
550
-    ss = sum(diffsquare)
551
-
552
-    closestPrior = priors[which.min(abs(priors - center))]
553
-
554
-    return(c(center, closestPrior, ss))
555
-}
556
-
557
-
558
-calculateParameters1Cluster <- function(M, priormeans, numSNP, verbose) {
559
-    Apriors = priormeans
560
-   
561
-    cl <- makeCluster(getNumberOfCores())
562
-    parAns <- parRapply(cl, M, calculateOneParams1Cluster, Apriors)
563
-    stopCluster(cl)
564
-
565
-    parAnsAllValues = unlist(parAns)
566
-    parameters <- matrix(parAnsAllValues, numSNP, 3, byrow = TRUE)
567
-
568
-    centers <- matrix(parameters[, 1], numSNP, 1)
569
-    prior1cluster <- matrix(parameters[, 2], numSNP, 1)
570
-    ss <- parameters[, 3]
571
-    
572
-    rm(parAns)
573
-    rm(parameters)
574
-    sigma=solve(cov(centers, use="complete.obs"))
575
-
576
-    mh = calculateMahalDist1Cluster(centers, sigma, prior1cluster, numSNP)
577
-
578
-    rm(sigma)
579
-    rm(centers)
580
-
581
-    gc()
582
-    return(list(eless = ss, elemh = mh))
583
-}
584
-
585
-
586
-calculateMahalDist1Cluster <- function(centers, sigma, priors, numSNP){
587
-    mahaldist = rep(NA, numSNP)
588
-    
589
-    for(i in 1:numSNP) {
590
-        difference <- as.vector(centers[i, 1] - priors[i, 1])
591
-	mahaldist[i]=difference%*%sigma%*%difference
592
-    }
593
-    return(mahaldist)
594
-}
595
-   
596
-#############################################
597
-
598
-
599
-computeCallPr <- function(callsPr, M, calls, numSNP, numSample, verbose, blockSize = 500000){
600
-    # compute log-ratio in blocksize of 500,000 by default
601
-    if (verbose) message("Start computing confidence score")
602
-    
603
-    numBlock = ceiling(numSNP / blockSize)
604
-    for (i in 1:numBlock){
605
-        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
606
-        subsetM = as.matrix(M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
607
-        subsetCalls = as.matrix(calls[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])                     
608
-        subsetCallProb = .Call("krlmmConfidenceScore", subsetM, subsetCalls, PACKAGE="crlmm");
609
-        callsPr[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetCallProb[1:nrow(subsetM), ]
610
-        rm(subsetM, subsetCalls, subsetCallProb)
611
-    }
612
-
613
-    if (verbose) message("Done computing confidence score")
614
-}
615
-
616
-#############################################
617
-
618
-krlmmImputeGender <- function(cnSet, pkgname, verbose){
619
-    if (verbose) message("Start imputing gender")    
620
-    S = computeAverageLogIntensity(cnSet, verbose) 
621
-    loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
622
-    YIndex <- getVarInEnv("YIndex")
623
-    loader("annotation.rda", .crlmmPkgEnv, pkgname)
624
-    annotation <- getVarInEnv("annot")
625
-
626
-    A = A(cnSet)
627
-    open(A)
628
-    SNPNames = rownames(A)
629
-    close(A)
630
-
631
-    matchy = match(annotation[YIndex, 2], SNPNames)
632
-    Sy = S[matchy,]
633
-    rm(S)
634
-    numYChrSNP = nrow(Sy)
635
-
636
-    allS = matrix(NA, numYChrSNP, 2)
637
-
638
-    for (i in 1:numYChrSNP) {
639
-        tmp = kmeans(Sy[i,] ,2, nstart=45)
640
-        allS[i,] = sort(tmp$centers, decreasing=F)
641
-    }
642
-    priorS = apply(allS, 2, FUN="median", na.rm=TRUE)
643
-
644
-    group = rep(NA, ncol(Sy))
645
-
646
-    meanmatrix = apply(Sy, 2, median)
647
-
648
-    Sy1 = abs(meanmatrix-priorS[1])
649
-    Sy2 = abs(meanmatrix-priorS[2])
650
-
651
-    # output male - 1, female - 2, female S-values are smaller than male S-values. 
652
-    test = cbind(Sy2, Sy1)
653
-    group = apply(test, 1, which.min)
654
-
655
-    if (verbose) message("Done imputing gender") 
656
-    return(group)    
657
-}
1
+krlmm <- function(cnSet, cdfName, gender=NULL, trueCalls=NULL, verbose=TRUE) {
2
+  
3
+    pkgname <- getCrlmmAnnotationName(cdfName) # crlmm:::
4
+    
5
+### pre-processing, output M    
6
+    M = computeLogRatio(cnSet, verbose)
7
+    message("leaving out novariant SNPs")
8
+    
9
+    # For SNPs with less than 3 distinct data point, exclude them from downstream analysis
10
+    uniqueCount = apply(M[,], 1, function(x){length(unique(x))})
11
+    SNPtoProcessIndex = uniqueCount >= 3
12
+    noVariantSNPIndex = uniqueCount < 3
13
+    M = M[SNPtoProcessIndex, ]
14
+
15
+    numSNP = nrow(M)
16
+    numSample = ncol(M)
17
+    
18
+    calls = oligoClasses::initializeBigMatrix(name="calls", numSNP, numSample, vmode = "integer")
19
+    scores = oligoClasses::initializeBigMatrix(name="scores", numSNP, numSample, vmode = "double")
20
+    open(calls)
21
+    open(scores)
22
+
23
+    rownames(calls) = rownames(M)
24
+    rownames(scores) = rownames(M)
25
+    colnames(calls) = colnames(M)
26
+    colnames(scores) = colnames(M)  
27
+    
28
+    priormeans = calculatePriorValues(M, numSNP, verbose)
29
+    VGLMparameters = calculateParameters(M, priormeans, numSNP, verbose)
30
+  
31
+### retrieve or calculate coefficients
32
+    krlmmCoefficients = getKrlmmVGLMCoefficients(pkgname, trueCalls, VGLMparameters, verbose, numSample, colnames(M))
33
+    
34
+### do VGLM fit, to predict k for each SNP
35
+    kPrediction <- predictKwithVGLM(VGLMparameters, krlmmCoefficients, verbose);
36
+    rm(VGLMparameters)
37
+    
38
+### assign calls
39
+    assignCalls(calls, M, kPrediction, priormeans, numSNP, numSample, verbose);
40
+    rm(kPrediction)
41
+
42
+### assign confidence scores
43
+    computeCallPr(scores, M, calls, numSNP, numSample, verbose)
44
+    
45
+### add back SNPs excluded before
46
+    AddBackNoVarianceSNPs(cnSet, calls, scores, numSNP, numSample, SNPtoProcessIndex, noVariantSNPIndex)
47
+
48
+    loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
49
+    YIndex <- getVarInEnv("YIndex")
50
+    loader("annotation.rda", .crlmmPkgEnv, pkgname)
51
+    annotation <- getVarInEnv("annot")
52
+    
53
+### impute gender if gender information not provided
54
+    if (is.null(gender)) {
55
+        gender = krlmmImputeGender(cnSet, annotation, YIndex, verbose)
56
+    }
57
+
58
+### double-check ChrY SNP cluster, impute gender if gender information not provided
59
+    if (!(is.null(gender))) {
60
+        verifyChrYSNPs(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose)
61
+    }
62
+
63
+    close(calls)
64
+    close(scores)
65
+    rm(calls)
66
+    rm(scores)
67
+    rm(M)
68
+    rm(annotation)
69
+    rm(YIndex)
70
+    TRUE