Browse code

fix conflict in description

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@80947 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 30/09/2013 14:07:17
Showing12 changed files

... ...
@@ -2,6 +2,7 @@ Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays.
4 4
 Version: 1.19.7
5
+Date: Mon Sep 30 10:06:41 EDT 2013
5 6
 Author: Benilton S Carvalho, Robert Scharpf, Matt Ritchie, Ingo Ruczinski, Rafael A Irizarry
6 7
 Maintainer: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU>
7 8
 Description: Faster implementation of CRLMM specific to SNP 5.0 and 6.0 arrays, as well as a copy number tool specific to 5.0, 6.0, and Illumina platforms
... ...
@@ -23,13 +24,15 @@ Imports: methods,
23 24
 	 ff,
24 25
 	 foreach,
25 26
          RcppEigen (>= 0.3.1.2.1),
26
-	 matrixStats
27
+	 matrixStats,
28
+	 VGAM
27 29
 Suggests: hapmapsnp6,
28 30
           genomewidesnp6Crlmm (>= 1.0.7),
29 31
           GGdata,
30 32
           snpStats,
31 33
 	  ellipse,
32
-	  RUnit
34
+	  RUnit,
35
+	  VGAM
33 36
 Collate: AllGenerics.R
34 37
 	 AllClasses.R
35 38
 	 methods-AssayData.R
... ...
@@ -49,4 +52,7 @@ Collate: AllGenerics.R
49 52
          zzz.R
50 53
 	 test_crlmm_package.R
51 54
 LazyLoad: yes
52
-biocViews: Microarray, Preprocessing, SNP, Bioinformatics, CopyNumberVariants
55
+biocViews: Microarray, Preprocessing, SNP, Bioinformatics,CopyNumberVariants
56
+## Local Variables:
57
+## time-stamp-pattern: "8/Date: %3a %3b %2d %02H:%02M:%02S %Z %:y\n"
58
+## End:
... ...
@@ -75,6 +75,8 @@ importFrom(utils, packageDescription, setTxtProgressBar,
75 75
 ## foreach
76 76
 import(foreach)
77 77
 
78
+importFrom(VGAM, vglm, multinomial, coefficients)
79
+
78 80
 ##----------------------------------------------------------------------------
79 81
 ## export
80 82
 ##----------------------------------------------------------------------------
... ...
@@ -150,86 +150,160 @@ readIdatFiles = function(sampleSheet=NULL,
150 150
 }
151 151
 
152 152
 
153
-readGenCallOutput = function(file, path=".", cdfName,
154
-    colnames=list("SampleID"="Sample ID", "SNPID"="SNP Name", "XRaw"="X Raw", "YRaw"="Y Raw"),
155
-    type=list("SampleID"="character", "SNPID"="character", "XRaw"="integer", "YRaw"="integer"), verbose=FALSE) {
156 153
 
157
-    if(!identical(names(type), names(colnames)))
158
-       stop("The arguments 'colnames' and 'type' must have consistent names")
159
-    if(verbose)
160
-  	cat("Reading", file, "\n")
161
-    tmp=readLines(file.path(path,file),n=15)
162
-    s=c("\t",",")
163
-    a=unlist(strsplit(tmp[10][1],s[1]))
164
-    if(length(a)!=1){
165
-	sepp=s[1]
166
-      	a1=unlist(strsplit(tmp[10][1],s[1]))
154
+getNumberOfSNPs = function(afile, path){
155
+    fullfilename = file.path(path, afile)
156
+    headerSection = readLines(fullfilename, n=15)
157
+
158
+    headerLine = headerSection[10][1]
159
+    delimiterList = c(",", "\t")
160
+
161
+    headers = unlist(strsplit(headerLine, delimiterList[1]))
162
+    if (length(headers)!=1) {
163
+        delimiterIndex = 1
164
+    }
165
+    if (length(headers) == 1) {
166
+        headers = unlist(strsplit(headerLine, delimiterList[2]))
167
+        if (length(headers) != 1) {
168
+            delimiterIndex = 2
169
+        }
170
+        if (length(headers) == 1) {
171
+            stop("Input file ", fullfilename, " is not delimited by either comm(,) or tab(\\t)")
172
+        }
167 173
     }
168
-    if(length(a)==1){
169
-	sepp=s[2]
170
-	a1=unlist(strsplit(tmp[10][1],s[2]))
174
+
175
+    SNPLine = headerSection[5][1]
176
+    elements = unlist(strsplit(SNPLine, delimiterList[delimiterIndex]))
177
+    numSNP = as.integer(elements[2])
178
+    return(numSNP)
179
+}
180
+
181
+checkNumberOfSNPs = function(filenames, path){
182
+    numSNP = getNumberOfSNPs(filenames[1], path)
183
+    if (length(filenames) > 1) {
184
+		for (i in 2:length(filenames)){
185
+			if (getNumberOfSNPs(filenames[i], path) != numSNP){
186
+				return(FALSE)
187
+			}
188
+		}
171 189
     }
190
+    return(TRUE)
191
+}
172 192
 
173
-    if(sum(is.na(match(colnames,a1)))!=0)
174
-	stop("Cannot find required columns: ", colnames[is.na(match(colnames,a1))], " in ", file, "\nPlease check whether this data was exported.")
175 193
 
176
-    m1=m=match(a1,colnames)
177
-    m[is.na(m1)==FALSE] =type[m1[!is.na(m1)]]
178
-    m[is.na(m)==TRUE] = list(NULL)
179
-    names(m) = names(colnames)[m1]
194
+getNumberOfSamples = function(filenames, path, numSNP){
195
+    sampleCount = rep(0, length(filenames))
196
+    for (i in 1:length(filenames)){
197
+    	# number of sample in input file line 7 is not reliable, calculate from number of lines and number of SNPs
198
+    	fullfilename = file.path(path, filenames[i])
199
+	LineCount = .Call("countFileLines", fullfilename)
200
+  	if (((LineCount - 10) %% numSNP) != 0){
201
+           stop("Please check input file: ", fullfilename, " Line count is not a multiple of number of SNPs")
202
+    	}
203
+	sampleCount[i] = LineCount %/% numSNP
204
+    }	
205
+    return(sampleCount)
206
+}
180 207
 
181
-    fc = file(file.path(path, file), open="r")
208
+processOneGenCallFile = function(afile, numSNP, numSample,
209
+    colnames=list("SampleID"="Sample ID", "SNPID"="SNP Name", "XRaw"="X Raw", "YRaw"="Y Raw"),
210
+    verbose=FALSE) {
211
+  
212
+    headerSection = readLines(afile, n=15)
182 213
 
183
-    dat = scan(fc, what=m, skip=10,sep=sepp)
184
-    close(fc)
214
+    headerLine = headerSection[10][1]
215
+    delimiterList = c(",", "\t")
185 216
 
186
-    samples = unique(dat$"SampleID")
187
-    nsamples = length(samples)
188
-    snps = unique(dat$"SNPID")
189
-    nsnps = length(snps)
190
-    if(verbose)
191
-        cat("Check ordering for samples","\n")
192
-
193
-    X = Y = zeroes = matrix(0, nsnps, nsamples)
194
-
195
-    for(i in 1:length(samples)) {
196
-        ind = dat$"SampleID"==samples[i]
197
-	    if(sum(dat$"SNPID"[ind]==snps)==nsnps) {
198
-#           if(verbose)
199
-#	            cat(paste("Correct ordering for sample", samples[i], "\n"))
200
-			X[,i] = dat$"XRaw"[ind]
201
-	        Y[,i] = dat$"YRaw"[ind]
202
-	    }
203
-        if(sum(dat$"SNPID"[ind]==snps)!=nsnps) {
204
-	        if(verbose)
205
-                cat("Reordering sample ", samples[i],"\n")
206
-            m=match(snps,dat$"SNPID"[ind])
207
-            X[,i]= dat$"XRaw"[ind][m]
208
-            Y[,i]= dat$"YRaw"[ind][m]
217
+    headers = unlist(strsplit(headerLine, delimiterList[1]))
218
+    if (length(headers)!=1) {
219
+        delimiterIndex = 1
220
+    }
221
+    if (length(headers) == 1) {
222
+        headers = unlist(strsplit(headerLine, delimiterList[2]))
223
+        if (length(headers) != 1) {
224
+            delimiterIndex = 2
209 225
         }
210
-        gc(verbose=FALSE)
226
+        if (length(headers) == 1) {
227
+            stop("Input file is not delimited by either comm(,) or tab(\\t)")
228
+        }
229
+    }
230
+     
231
+    if(sum(is.na(match(colnames, headers))) != 0)
232
+	stop("Cannot find required columns: ", colnames[is.na(match(colnames, headers))], " in ", file, "\nPlease check whether this data was exported.")
233
+    
234
+    SNPIDPos = which(headers == colnames$SNPID)
235
+    sampleIDPos = which(headers == colnames$SampleID)
236
+    XValuePos = which(headers == colnames$XRaw)
237
+    YValuePos = which(headers == colnames$YRaw)   
238
+        
239
+    if(verbose) {
240
+        message("Number of SNPs in file: ", afile, " is ", numSNP, " and number of samples is ", numSample)
241
+        if (delimiterIndex == 1) message("File is comma-seperated. ")
242
+        if (delimiterIndex == 2) message("File is tab-seperated. ")              
211 243
     }
212 244
 
213
-    zeroes=(X=="0"|Y=="0")
214
-    colnames(X) = colnames(Y) =  colnames(zeroes) = samples
215
-    rownames(X) = rownames(Y) = snps
245
+    values = .Call("readGenCallOutputCFunc", afile, numSNP, numSample, SNPIDPos, sampleIDPos, XValuePos, YValuePos, delimiterIndex)
246
+             
247
+    return(values)
248
+}
216 249
 
217
-    if(verbose)
218
-        cat("Creating NChannelSet object\n")
250
+readGenCallOutput = function(filenames, path=".", cdfName,
251
+    colnames=list("SampleID"="Sample ID", "SNPID"="SNP Name", "XRaw"="X Raw", "YRaw"="Y Raw"),
252
+    type=list("SampleID"="character", "SNPID"="character", "XRaw"="integer", "YRaw"="integer"), verbose=FALSE) {
219 253
 
220
-    XY = new("NChannelSet", X = initializeBigMatrix(name = "X", nr = nrow(X), nc = ncol(X), vmode = "integer", initdata=X),
221
-			 Y = initializeBigMatrix(name = "Y", nr = nrow(X), nc = ncol(X), vmode = "integer", initdata=Y),
222
-			 zero = initializeBigMatrix(name = "zero", nr = nrow(X), nc = ncol(X), vmode = "integer", initdata=zeroes),
223
-			 annotation = cdfName, storage.mode = "environment")
224
-    sampleNames(XY)=colnames(X)
254
+    if(!identical(names(type), names(colnames)))
255
+       stop("The arguments 'colnames' and 'type' must have consistent names")
256
+    if(missing(cdfName)) stop("must specify cdfName")
225 257
 
258
+    if (!checkNumberOfSNPs(filenames, path)){
259
+       stop("Number of SNPs in each file must be identical to form one output NChannelSet object.")
260
+    }
261
+
262
+    if (verbose) message("Checking number of samples and features in each file. ")
263
+    numSNP = getNumberOfSNPs(filenames[1], path)
264
+    sampleCounts = getNumberOfSamples(filenames, path, numSNP)
265
+    numSample = sum(sampleCounts)
266
+
267
+    X = initializeBigMatrix(name = "X", nr = numSNP, nc = numSample, vmode = "integer")
268
+    Y = initializeBigMatrix(name = "Y", nr = numSNP, nc = numSample, vmode = "integer")
269
+    zero = initializeBigMatrix(name = "zero", nr = numSNP, nc = numSample, vmode = "integer")
270
+    
271
+    totSampleNames = rep(NA, numSample)
272
+   
273
+    baseIndex = 1
274
+    if (verbose) message("Start processing ", length(filenames), " input file(s)")
275
+    for (i in 1:length(filenames)){
276
+    	fullfilename = file.path(path, filenames[i])
277
+    	valuesThisFile = processOneGenCallFile(fullfilename, numSNP, sampleCounts[i], colnames, verbose)
278
+	
279
+	if (i == 1){
280
+	    totSNPNames = rownames(valuesThisFile$Xvalues)
281
+	} else {
282
+	    # matching on SNP names? now assume they come in order
283
+	}
284
+	maxIndex = baseIndex + sampleCounts[i] - 1
285
+	X[, baseIndex:maxIndex] = valuesThisFile$Xvalues
286
+	Y[, baseIndex:maxIndex] = valuesThisFile$Yvalues
287
+        zero[, baseIndex:maxIndex] = (X[, baseIndex:maxIndex] == 0) || (Y[, baseIndex:maxIndex] == 0)
288
+	totSampleNames[baseIndex:(baseIndex + sampleCounts[i] - 1)] = colnames(valuesThisFile$Xvalues)
289
+	rm(valuesThisFile)
290
+	baseIndex = baseIndex + sampleCounts[i]
291
+    }
292
+
293
+    if(verbose) message("Creating NChannelSet object\n")
294
+
295
+    XY = new("NChannelSet", X=X, Y=Y, zero=zero, annotation=cdfName, storage.mode = "environment")
296
+    sampleNames(XY) = totSampleNames
297
+    featureNames(XY) = totSNPNames
298
+    
226 299
     if(verbose)
227
-      cat("Done\n")
300
+    cat("Done\n")
228 301
 
229 302
     XY
230 303
 }
231 304
 
232 305
 
306
+
233 307
 RGtoXY = function(RG, chipType, verbose=TRUE) {
234 308
   needToLoad <- !all(sapply(c('addressA', 'addressB', 'base'), isLoaded))
235 309
   if(needToLoad){
... ...
@@ -514,6 +588,7 @@ preprocessInfinium2 = function(XY, mixtureSampleSize=10^5,
514 588
 		## new lines below - useful to keep track of zeroed out probes
515 589
 		zero <- matrix(as.integer(assayData(XY)[["zero"]][npIndex, ]), nprobes, narrays)
516 590
 		##zero = matrix(as.integer(exprs(channel(XY, "zero"))[npIndex,]), nprobes, narrays)
591
+		
517 592
 		colnames(A) = colnames(B) = colnames(zero) = sns
518 593
 		rownames(A) = rownames(B) = rownames(zero) = names(npIndex)
519 594
 		cnAB = list(A=A, B=B, zero=zero, sns=sns, gns=names(npIndex), cdfName=cdfName)
... ...
@@ -546,7 +621,6 @@ preprocessInfinium2 = function(XY, mixtureSampleSize=10^5,
546 621
   A = matrix(NA, nprobes, narrays)
547 622
   B = matrix(NA, nprobes, narrays)
548 623
   zero = matrix(NA, nprobes, narrays)
549
-
550 624
   if(verbose && fitMixture){
551 625
      message("Calibrating ", narrays, " arrays.")
552 626
      if (getRversion() > '2.7.0') pb = txtProgressBar(min=0, max=narrays, style=3)
... ...
@@ -862,7 +936,6 @@ constructInf <- function(sampleSheet=NULL,
862 936
 			 verbose=FALSE,
863 937
 			 batch=NULL, #fns,
864 938
 			 saveDate=TRUE) { #, outdir="."){
865
-	verbose <- FALSE
866 939
         if(is.null(XY)) {
867 940
   	  if(!is.null(arrayNames)) {
868 941
 		pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
... ...
@@ -967,7 +1040,7 @@ constructInf <- function(sampleSheet=NULL,
967 1040
 	  cnSet$SNR = initializeBigVector("crlmmSNR-", ncol(cnSet), "double")
968 1041
 	  cnSet$SKW = initializeBigVector("crlmmSKW-", ncol(cnSet), "double")
969 1042
 	##sampleNames(cnSet) <- basename(sampleNames(cnSet))
970
-        } else{ # if XY specified, easier set-up of cnSet
1043
+        } else { # if XY specified, easier set-up of cnSet
971 1044
           narrays = ncol(XY)
972 1045
           if(verbose) message("Initializing container for genotyping and copy number estimation")           
973 1046
           if(!is.null(batch)) {
... ...
@@ -1084,7 +1157,8 @@ genotypeInf <- function(cnSet, mixtureParams, probs=rep(1/3,3),
1084 1157
 			gender=NULL,
1085 1158
 			DF=6,
1086 1159
 			cdfName,
1087
-                        call.method="crlmm"){
1160
+                        call.method="crlmm",
1161
+                        trueCalls=NULL){
1088 1162
 	is.snp = isSnp(cnSet)
1089 1163
 	snp.index = which(is.snp)
1090 1164
 ##	narrays = ncol(cnSet)
... ...
@@ -1125,7 +1199,7 @@ genotypeInf <- function(cnSet, mixtureParams, probs=rep(1/3,3),
1125 1199
            close(cnSet$gender)
1126 1200
         }
1127 1201
         if(call.method=="krlmm")
1128
-          tmp <- krlmm(cnSet, cdfName, gender=gender) # new function required...  cnSet, cdfName and gender are probably the only arguments you need...
1202
+          tmp <- krlmm(cnSet, cdfName, gender=gender, trueCalls=trueCalls, verbose=verbose) # new function required...  cnSet, cdfName and gender are probably the only arguments you need...
1129 1203
 	## snp.names=featureNames(cnSet)[snp.index])
1130 1204
 	if(verbose) message("Genotyping finished.") # Updating container with genotype calls and confidence scores.")
1131 1205
 	TRUE
... ...
@@ -1141,7 +1215,8 @@ genotype.Illumina <- function(sampleSheet=NULL,
1141 1215
 			      sep="_",
1142 1216
 			      fileExt=list(green="Grn.idat", red="Red.idat"),
1143 1217
                               XY=NULL,
1144
-                              call.method="crlmm",                              
1218
+                              call.method="crlmm",
1219
+                              trueCalls=NULL,
1145 1220
 			      cdfName,
1146 1221
 			      copynumber=TRUE,
1147 1222
 			      batch=NULL,
... ...
@@ -1192,7 +1267,8 @@ genotype.Illumina <- function(sampleSheet=NULL,
1192 1267
           # quantile.method="between"
1193 1268
         }
1194 1269
 	is.lds = ifelse(isPackageLoaded("ff"), TRUE, FALSE)
1195
-	stopifnot(is.lds)
1270
+        if (!(is.lds))
1271
+            stop("Package ff not loaded")
1196 1272
         if(!is.null(XY) && missing(cdfName))
1197 1273
           cdfName = getCrlmmAnnotationName(annotation(cnSet))
1198 1274
 	if(missing(cdfName)) stop("must specify cdfName")
... ...
@@ -1247,7 +1323,8 @@ genotype.Illumina <- function(sampleSheet=NULL,
1247 1323
 		    gender=gender,
1248 1324
 		    DF=DF,
1249 1325
 		    cdfName=cdfName,
1250
-                    call.method=call.method)
1326
+                    call.method=call.method,
1327
+                    trueCalls=trueCalls)
1251 1328
 	return(cnSet)
1252 1329
 }
1253 1330
 
... ...
@@ -1292,8 +1369,9 @@ processIDAT <- function(stratum, sampleBatches, sampleSheet=NULL,
1292 1369
                                quantile.method=quantile.method) #, outdir=outdir)
1293 1370
           rm(XY)
1294 1371
         }else{ #XY already available
1372
+          if (missing(sns) || length(sns)!=ncol(XY)) sns = sampleNames(XY)  
1295 1373
           res = preprocessInfinium2(XY[,sel], mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
1296
-                                           seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget,
1374
+                                           seed=seed, eps=eps, cdfName=cdfName, sns=sns[sel], stripNorm=stripNorm, useTarget=useTarget,
1297 1375
                                            quantile.method=quantile.method) 
1298 1376
         }
1299 1377
         gc(verbose=FALSE)
1300 1378
old mode 100644
1301 1379
new mode 100755
... ...
@@ -1,10 +1,658 @@
1
-krlmm = function(cnSet, cdfName, gender) { # Doesn't do anything as yet - Jenny/Cynthia to add functionality based on their code
2
-  if(is.null(gender)) {
3
-    gender=rep(1, ncol(cnSet)) # Need to impute gender if not specified - Cynthia is working on this using Y chr SNPs
1
+krlmm <- function(cnSet, cdfName, gender=NULL, trueCalls=NULL, verbose=TRUE) {
2
+  
3
+    pkgname <- getCrlmmAnnotationName(cdfName)
4
+    
5
+### pre-processing, output M    
6
+    M = computeLogRatio(cnSet, verbose) 
7
+    numSNP = nrow(M)
8
+    numSample = ncol(M)
9
+    
10
+    callsGt = calls(cnSet)
11
+    callsPr = snpCallProbability(cnSet)
12
+    open(callsGt)
13
+    open(callsPr)
14
+
15
+    priormeans = calculatePriorValues(M, numSNP, verbose)
16
+    VGLMparameters = calculateParameters(M, priormeans, numSNP, verbose)
17
+
18
+### retrieve or calculate coefficients
19
+    krlmmCoefficients = getKrlmmVGLMCoefficients(pkgname, trueCalls, VGLMparameters, verbose, numSample, colnames(M))
20
+    
21
+### do VGLM fit, to predict k for each SNP
22
+    kPrediction <- predictKwithVGLM(VGLMparameters, krlmmCoefficients, verbose);
23
+    rm(VGLMparameters)
24
+    
25
+### assign calls
26
+    assignCalls(callsGt, M, kPrediction, priormeans, numSNP, numSample, verbose);
27
+    rm(kPrediction)
28
+
29
+### assign confidence scores
30
+    computeCallPr(callsPr, M, callsGt, numSNP, numSample, verbose)
31
+    close(callsGt)
32
+    close(callsPr)
33
+
34
+### impute gender if gender information not provided
35
+    if (is.null(gender)) {
36
+      gender = krlmmImputeGender(cnSet, pkgname, verbose)
37
+    }
38
+    open(cnSet$gender)
39
+    cnSet$gender[,] = gender
40
+    close(cnSet$gender)
41
+
42
+    TRUE
43
+}
44
+
45
+#######################################################################################################
46
+
47
+getNumberOfCores <- function(){
48
+    defaultCores = min(detectCores(), 8)
49
+    return(getOption("krlmm.cores", defaultCores))    
50
+}
51
+
52
+
53
+#######################################################################################################
54
+
55
+computeLogRatio <- function(cnSet, verbose, blockSize = 500000){
56
+    # compute log-ratio in blocksize of 500,000 by default
57
+    message("Start computing log ratio")
58
+    A <- A(cnSet)
59
+    B <- B(cnSet)
60
+    open(A)
61
+    open(B)
62
+    
63
+    numSNP <- nrow(A)
64
+    numSample <- ncol(A)
65
+    
66
+    M <- oligoClasses::initializeBigMatrix(name="M", numSNP, numSample, vmode = "double")
67
+    rownames(M) = rownames(A)
68
+    colnames(M) = colnames(A)
69
+  
70
+    numBlock = ceiling(numSNP / blockSize)
71
+    for (i in 1:numBlock){
72
+        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
73
+        subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
74
+        subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])                     
75
+        subsetM = .Call("krlmmComputeM", subsetA, subsetB, PACKAGE="crlmm")
76
+        M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetM[1:nrow(subsetM), ]
77
+        rm(subsetA, subsetB, subsetM)
78
+    }
79
+    
80
+    close(A)
81
+    close(B)
82
+    if (verbose) message("Done computing log ratio")
83
+    return(M)
84
+}
85
+
86
+computeAverageLogIntensity <- function(cnSet, verbose, blockSize = 500000){
87
+    # compute average log intensity in blocksize of 500,000 by default
88
+    message("Start computing average log intensity")
89
+    A <- A(cnSet)
90
+    B <- B(cnSet)
91
+    open(A)
92
+    open(B)
93
+    
94
+    numSNP <- nrow(A)
95
+    numSample <- ncol(A)
96
+    
97
+    S <- oligoClasses::initializeBigMatrix(name="S", numSNP, numSample, vmode = "double")
98
+    
99
+    numBlock = ceiling(numSNP / blockSize)
100
+    for (i in 1:numBlock){
101
+        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
102
+        subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
103
+        subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])                     
104
+        subsetS = .Call("krlmmComputeS", subsetA, subsetB, PACKAGE="crlmm")
105
+        S[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetS[1:nrow(subsetS), ]
106
+        rm(subsetA, subsetB, subsetS)
107
+    }
108
+    
109
+    close(A)
110
+    close(B)
111
+    if (verbose) message("Done computing average log intensity")
112
+    return(S)
113
+}
114
+
115
+
116
+#######################################################################################################
117
+
118
+calculatePriorValues <- function(M, numSNP, verbose) {
119
+
120
+    calculateOneKmeans <- function(x) {
121
+        tmp = kmeans(x, 3, nstart=45)
122
+        return(sort(tmp$centers, decreasing = T))
123
+    }
124
+
125
+    if (verbose) message("Start calculating Prior Means")
126
+    cl <- makeCluster(getNumberOfCores())
127
+    centers <- parRapply(cl, M, calculateOneKmeans)
128
+    stopCluster(cl) 
129
+    centers <- matrix(centers, numSNP, 3, byrow = TRUE)
130
+    priormeans = apply(centers, 2, FUN="median", na.rm=TRUE)
131
+    if (verbose) message("Done calculating Prior Means")
132
+    return(priormeans)
133
+}
134
+
135
+#######################################################################################################
136
+
137
+calculateKrlmmCoefficients <- function(trueCalls, params, numSample, samplenames){
138
+    if (!require(VGAM)) {
139
+        message("VGAM package not found, fall back to use defined platform-specific coefficients")
140
+        return(NULL)
141
+    }
142
+    amatch = match(rownames(params), rownames(trueCalls))
143
+    amatch = amatch[!is.na(amatch)]
144
+    trueCalls = trueCalls[amatch,]
145
+
146
+    amatch = match(rownames(trueCalls), rownames(params))
147
+    params = params[amatch, ]
148
+    
149
+    amatch = match(samplenames, colnames(trueCalls))
150
+    amatch = amatch[!is.na(amatch)]    
151
+    trueCalls = trueCalls[, amatch]
152
+        
153
+    if ((numSample <= 40) && (ncol(trueCalls) < round(numSample/2))){
154
+        message("Sample size is ", numSample, ", KRLMM requires at least trueCalls of ", round(numSample/2), " samples to calculate coefficients")
155
+        return(NULL)
156
+    }
157
+    if ((numSample > 40) && (ncol(trueCalls) < 20)){
158
+        message("At least trueCalls of 20 samples are required to calculate coefficients")
159
+        return(NULL)
160
+    }
161
+    if (nrow(trueCalls) < 1000){
162
+        message("At lease trueCalls of 1000 SNPs are required to calculate coefficients")
163
+        return(NULL)
164
+    }
165
+
166
+    getna = apply(trueCalls, 1, function(x){sum(is.na(x))>=10})
167
+    truek = apply(trueCalls, 1, function(x){length(unique(x[!is.na(x)]))})
168
+   
169
+    params1 = params[!getna,]
170
+    truek1 = truek[!getna]
171
+
172
+    xx = data.frame(params1)
173
+    t = as.factor(as.numeric(truek1)) 
174
+    xx = data.frame(xx, t)
175
+    fit = VGAM::vglm(t~., multinomial(refLevel=1), xx)
176
+    coe = VGAM::coefficients(fit)
177
+    return(coe)    
178
+}
179
+
180
+getKrlmmVGLMCoefficients <- function(pkgname, trueCalls, params, verbose, numSample, samplenames){
181
+    if (!is.null(trueCalls)) {
182
+        coe = calculateKrlmmCoefficients(trueCalls, params, numSample, samplenames)
183
+    }
184
+    if (!is.null(coe)) {
185
+        if (verbose) message ("Done calculating platform-specific coefficients to predict number of clusters")
186
+        return(coe)
187
+    } else {
188
+        if (!is.null(trueCalls)) message("Fall back to use defined platform-specific coefficients")
189
+        if (verbose) message ("Retrieving defined platform-specific coefficients to predict number of clusters")
190
+        loader("krlmmVGLMCoefficients.rda", .crlmmPkgEnv, pkgname)
191
+        return(getVarInEnv("krlmmCoefficients"))      
192
+    }
193
+}
194
+
195
+
196
+predictKwithVGLM <- function(data, coe, verbose){
197
+    if (verbose) message("Start predicting number of clusters")
198
+    logit1 <- rep(0, nrow(data))
199
+    logit2 <- coe[1]+coe[3]*data[,1]+coe[5]*data[,2]+coe[7]*data[,3]+coe[9]*data[,4]+coe[11]*data[,5]+coe[13]*data[,6]+coe[15]*data[,7]+coe[17]*data[,8]
200
+    logit23 <- coe[2]+coe[4]*data[,1]+coe[6]*data[,2]+coe[8]*data[,3]+coe[10]*data[,4]+coe[12]*data[,5]+coe[14]*data[,6]+coe[16]*data[,7]+coe[18]*data[,8]
201
+   
202
+    logits <- cbind(logit1, logit2, logit23)
203
+    rm(logit1)
204
+    rm(logit2)
205
+    rm(logit23)
206
+    p.unscaled <- exp(logits)
207
+    rm(logits)   
208
+    p <- p.unscaled / rowSums(p.unscaled)
209
+    clusterPrediction = apply(p, 1, function(x){which.max(x)})
210
+    rm(p.unscaled)
211
+    rm(p)
212
+    if (verbose) message("Done predicting number of clusters")
213
+    return(clusterPrediction)
214
+}
215
+
216
+#######################################################################################################
217
+
218
+assignCalls3Cluster <- function(intensities, priormeans){
219
+    prior31 = c(priormeans[1]/2,priormeans[2],priormeans[3])
220
+    prior32 = c(priormeans[1],priormeans[2],priormeans[3]/2)	
221
+
222
+    emp <- rep(NA, length(intensities))
223
+	ansCalls <- emp
224
+    tmp <- try(kmeans(intensities, priormeans, nstart=1),TRUE)
225
+    if(class(tmp) == "try-error") {
226
+        tmp1 <- try(kmeans(intensities, prior31, nstart=1),TRUE)
227
+        if(class(tmp1) == "try-error"){
228
+            tmp2 <- try(kmeans(intensities, prior32, nstart=1),TRUE)
229
+            if(class(tmp2) == "try-error"){
230
+                ansCalls = emp
231
+            }else{
232
+                ansCalls = tmp2$cluster
233
+            }
234
+        }else{
235
+            ansCalls = tmp1$cluster
236
+        }
237
+    }else{
238
+        ansCalls = tmp$cluster
239
+    }
240
+    rm(prior31, prior32)
241
+    return(ansCalls)
242
+ 
243
+}
244
+
245
+#######################################################################################################
246
+
247
+assignCalls2Cluster <- function(intensities, priormeans){
248
+    closest <- rep(NA, 3)	
249
+    for(j in 1:3){
250
+        distance <- as.vector(abs(priormeans[j]-intensities))
251
+        closest[j] <- intensities[which.min(distance)]
252
+    }
253
+    prior2 <- priormeans[priormeans!=priormeans[which.max(abs(closest-priormeans))]]
254
+	
255
+    emp <- rep(NA, length(intensities))
256
+    ansCalls <- emp    
257
+    tmp <- try(kmeans(intensities, prior2, nstart=1), TRUE)
258
+    if(class(tmp) == "try-error") {
259
+        ansCalls <- emp
260
+    }else{
261
+        if(prior2[1]==priormeans[2] && prior2[2]==priormeans[3]){
262
+            mp <- abs(tmp$centers[1]-tmp$centers[2])
263
+            pp <- abs(priormeans[2]-priormeans[3])
264
+            if((mp/pp)<=0.25){
265
+                ansCalls <- emp
266
+            }else{
267
+                ansCalls <- tmp$cluster+1
268
+            }
269
+        }
270
+        if(prior2[1]==priormeans[1] && prior2[2]==priormeans[2]){
271
+            mp=abs(tmp$centers[1]-tmp$centers[2])
272
+            pp=abs(priormeans[1]-priormeans[2])
273
+            if((mp/pp)<=0.25){
274
+                ansCalls = emp
275
+            }else{
276
+                ansCalls <- tmp$cluster
277
+            }
278
+        }
279
+        if(prior2[1]==priormeans[1] && prior2[2]==priormeans[3]){
280
+            mp <- abs(tmp$centers[1]-tmp$centers[2])
281
+            pp <- abs(priormeans[1]-priormeans[3])
282
+            if ((mp/pp) <=0.25){
283
+                ansCalls <- emp
284
+            }else{
285
+                ansCalls[tmp$cluster==1] <- 1
286
+                ansCalls[tmp$cluster==2] <- 3
287
+            }
288
+        }
289
+    }
290
+    rm(tmp)
291
+    return(ansCalls)
292
+}
293
+
294
+#######################################################################################################
295
+
296
+assignCalls1Cluster <- function(intensities, priormeans){
297
+    closest <- rep(NA, 3)
298
+    for(j in 1:3){
299
+        distance <- as.vector(abs(priormeans[j]-intensities))
300
+        closest[j]=intensities[which.min(distance)]
301
+    }
302
+    clusterindex <- which.min(abs(closest-priormeans))
303
+    return(rep(clusterindex, length(intensities)))
304
+}
305
+  
306
+#######################################################################################################
307
+
308
+assignCallsOneSNP <- function(x, priormeans, numSample){
309
+    tolerance = 1e-3
310
+       
311
+    k = x[numSample + 1]
312
+    values = x[1:numSample]  
313
+    
314
+    if (abs(k - 2) <= tolerance) {
315
+        tmp <- try(kmeans(values, priormeans, nstart=1),TRUE)
316
+        if (!(class(tmp)=="try-error")) {
317
+            k <- 3;
318
+        }
319
+    }
320
+
321
+    successful <- FALSE;
322
+    if (abs(k - 3) <= tolerance){
323
+        SNPCalls <- assignCalls3Cluster(values, priormeans)
324
+        successful <- !is.na(SNPCalls[1])
325
+        if (!successful) { 
326
+            k <- 2
327
+        }
328
+    }
329
+		
330
+    if ( (abs(k - 2) <= tolerance) && (!successful)){
331
+        SNPCalls <- assignCalls2Cluster(values, priormeans)
332
+        successful <- !is.na(SNPCalls[1])
333
+        if (!successful) { 
334
+            k <- 1
335
+        }			
336
+    }
337
+
338
+    if ( (abs(k - 1) <= tolerance) && (!successful)){
339
+        SNPCalls <- assignCalls1Cluster(values, priormeans)
340
+    }
341
+    return(SNPCalls)
342
+}
343
+
344
+
345
+assignCalls <- function(callsGt, M, a, priormeans, numSNP, numSample, verbose, blockSize=500000){
346
+    # process by block size of 500,000 by default
347
+    message("Start assign calls")
348
+       
349
+    numBlock = ceiling(numSNP / blockSize)
350
+
351
+    for (i in 1:numBlock){
352
+        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
353
+        subsetM = as.matrix(M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
354
+        thisnumSNP = nrow(subsetM)
355
+        subseta = a[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP)]
356
+        subseta = matrix(subseta, thisnumSNP, 1)
357
+
358
+        testValues = cbind(subsetM, subseta)
359
+
360
+        cl <- makeCluster(getNumberOfCores())
361
+        callAns <- parRapply(cl, testValues, assignCallsOneSNP, priormeans, numSample)
362
+        stopCluster(cl)
363
+
364
+        callAnsAllValues = unlist(callAns)
365
+
366
+        subsetcalls <- matrix(callAnsAllValues, thisnumSNP, numSample, byrow = TRUE)
367
+        
368
+        callsGt[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetcalls[1:thisnumSNP, ]
369
+        rm(subsetM, subseta, subsetcalls)
370
+    }
371
+    message("Done assign calls")
372
+}
373
+
374
+#######################################################################################################
375
+
376
+hardyweinberg <- function(x, minN=10){
377
+  if (length(x) < minN){
378
+    return(NA) 
379
+  } else {
380
+    result = .Call("krlmmHardyweinberg", x)
381
+    return(result)
4 382
   }
5
-  open(cnSet$gender)
6
-  cnSet$gender[,] = gender
7
-  close(cnSet$gender)
383
+}
384
+
385
+calculateParameters <- function(M, priormeans, numSNP, verbose) {
386
+    if (verbose) message("Start calculating 3-clusters parameters")
387
+    params3cluster <- calculateParameters3Cluster(M, priormeans, numSNP, verbose);
388
+    if (verbose) message("Done calculating 3-cluster parameters")
389
+
390
+    if (verbose) message("Start calculating 2-cluster parameters")
391
+    params2cluster <- calculateParameters2Cluster(M, priormeans, numSNP, verbose);
392
+    if (verbose) message("Done calculating 2-cluster parameters")
393
+
394
+    if (verbose) message("Start calculating 1-cluster parameters")
395
+    params1cluster <- calculateParameters1Cluster(M, priormeans, numSNP, verbose);    
396
+    if (verbose) message("Done calculating 1-cluster parameters")
397
+
398
+    parameters <- cbind(as.matrix(params1cluster$elemh), as.matrix(params1cluster$eless), as.matrix(params2cluster$elemh), as.matrix(params2cluster$eless),
399
+                        as.matrix(params3cluster$elemh), as.matrix(params3cluster$eless), as.matrix(params2cluster$elehw), as.matrix(params3cluster$elehw));
400
+    
401
+    rownames(parameters) = rownames(M)
402
+    rm(params3cluster)
403
+    rm(params2cluster)
404
+    rm(params1cluster)
405
+    return(parameters)
406
+}
407
+
408
+
409
+calculateOneParams3Cluster <- function(x, priors, priorDown, priorUp){
410
+    tmp = try(kmeans(x, priors, nstart=1), TRUE)
411
+    if(class(tmp) == "kmeans") {
412
+        flag = 1
413
+     } else {
414
+        tmp = try(kmeans(x, priorDown, nstart=1), TRUE)
415
+        if(class(tmp) == "kmeans") {
416
+            flag = 2
417
+        } else {
418
+            tmp = try(kmeans(x, priorUp, nstart=1), TRUE)
419
+            if (class(tmp) == "kmeans") {
420
+                flag = 3
421
+            } else {
422
+                tmp = kmeans(x, 3, nstart=35)
423
+                flag = 4
424
+            }
425
+        }  
426
+    }
427
+    ss = sum(tmp$withinss)
428
+    hw = hardyweinberg(tmp$cluster) 
429
+    centers = sort(tmp$centers, decreasing = TRUE)
430
+
431
+    return(c(centers, ss, hw, flag))
432
+}
433
+
434
+calculateParameters3Cluster <- function(M, priormeans, numSNP, verbose) {
435
+    Apriors = priormeans
436
+    ApriorDown = c(Apriors[1]/2, Apriors[2], Apriors[3]) # shift-down
437
+    ApriorUp = c(Apriors[1], Apriors[2], Apriors[3]/2) # shift-up
438
+
439
+    cl <- makeCluster(getNumberOfCores())
440
+    parAns <- parRapply(cl, M, calculateOneParams3Cluster, Apriors, ApriorDown, ApriorUp)
441
+
442
+    stopCluster(cl)
443
+    parAnsAllValues = unlist(parAns)
444
+    parameters <- matrix(parAnsAllValues, numSNP, 6, byrow = TRUE)
445
+   
446
+    centers <- parameters[, 1:3]
447
+    ss <- parameters[, 4]
448
+    hw <- parameters[, 5]
449
+    flag <- parameters[, 6]
450
+
451
+    rm(parAns)
452
+    rm(parameters)
453
+   
454
+    sigma=solve(cov(centers, use="complete.obs"))
455
+    mh = calculateMahalDist3Cluster(centers, sigma, flag, Apriors, ApriorDown, ApriorUp, numSNP)
456
+
457
+    rm(sigma)
458
+    rm(centers)
459
+
460
+    gc()
461
+    return(list(eless = ss, elemh = mh, elehw = hw))
462
+}
463
+
464
+
465
+calculateMahalDist3Cluster <- function(centers, sigma, flag, priors, priorDown, priorUp, numSNP){
466
+    mahaldist = rep(NA, numSNP)
467
+
468
+    tolerance = 1e-3
469
+    for (i in 1:numSNP) {
470
+        if ((abs(flag[i] - 1) <= tolerance) || (abs(flag[i]- 4) <= tolerance)) difference = centers[i, ] - priors
471
+        if (abs(flag[i] - 2) <= tolerance) difference = centers[i, ] - priorDown
472
+        if (abs(flag[i] - 3) <= tolerance) difference = centers[i, ] - priorUp         
473
+        mahaldist[i] = as.vector(difference)%*%sigma%*%as.vector(difference)
474
+    }
475
+    return(mahaldist)
476
+}
477
+
478
+
479
+calculateOneParams2Cluster <- function(x, priors){
480
+  
481
+    aa = rep(NA, 3)   
482
+    for(j in 1:3){
483
+        dist = as.vector(abs(priors[j]-x))
484
+        aa[j]=x[which.min(dist)]
485
+    } 
486
+    prior2 = priors[priors!=priors[which.max(abs(aa-priors))]]
487
+     
488
+
489
+    tmp = try(kmeans(x, prior2, nstart=1), TRUE)
490
+    if (class(tmp)=="kmeans") {
491
+        centers = tmp$centers
492
+        hw = hardyweinberg(tmp$cluster)
493
+    }
494
+    rm(tmp)
495
+    tmp = kmeans(x, 2, nstart = 35)
496
+    if (tmp$centers[1] < tmp$centers[2]){
497
+        centers = c(tmp$centers[2], tmp$centers[1])
498
+        hw = hardyweinberg(3 - tmp$cluster)   
499
+    } else {
500
+        centers = tmp$centers
501
+        hw = hardyweinberg(tmp$cluster)
502
+    }
503
+    ss = sum(tmp$withinss)     
504
+    return(c(centers, prior2, ss, hw))
505
+}
506
+
507
+
508
+calculateParameters2Cluster <- function(M, priormeans, numSNP, verbose) {
509
+    Apriors = priormeans
510
+   
511
+    cl <- makeCluster(getNumberOfCores())
512
+    parAns <- parRapply(cl, M, calculateOneParams2Cluster, Apriors)
513
+    stopCluster(cl)
514
+
515
+    parAnsAllValues = unlist(parAns)
516
+    parameters <- matrix(parAnsAllValues, numSNP, 6, byrow = TRUE)
517
+
518
+    centers <- parameters[, 1:2]
519
+    priors2cluster <- parameters[, 3:4]
520
+    ss <- parameters[, 5]
521
+    hw <- parameters[, 6]
522
+    
523
+    rm(parAns)
524
+    rm(parameters)
525
+    sigma=solve(cov(centers, use="complete.obs"))
526
+
527
+    mh = calculateMahalDist2Cluster(centers, sigma, priors2cluster, numSNP)
528
+
529
+    rm(sigma)
530
+    rm(centers)
531
+
532
+    gc()
533
+    return(list(eless = ss, elemh = mh, elehw = hw))
534
+}
535
+
536
+
537
+calculateMahalDist2Cluster <- function(centers, sigma, priors, numSNP){
538
+    mahaldist = rep(NA, numSNP)
539
+    
540
+    for (i in 1:numSNP) {
541
+        difference <- centers[i,] - priors[i,]
542
+        mahaldist[i] = as.vector(difference)%*%sigma%*%as.vector(difference)        
543
+    }
544
+    return(mahaldist)
545
+}
546
+
547
+calculateOneParams1Cluster <- function(x, priors){
548
+    center = mean(x)
549
+    diff <- x - center
550
+    diffsquare <- diff^2
551
+    ss = sum(diffsquare)
552
+
553
+    closestPrior = priors[which.min(abs(priors - center))]
554
+
555
+    return(c(center, closestPrior, ss))
556
+}
557
+
558
+
559
+calculateParameters1Cluster <- function(M, priormeans, numSNP, verbose) {
560
+    Apriors = priormeans
561
+   
562
+    cl <- makeCluster(getNumberOfCores())
563
+    parAns <- parRapply(cl, M, calculateOneParams1Cluster, Apriors)
564
+    stopCluster(cl)
565
+
566
+    parAnsAllValues = unlist(parAns)
567
+    parameters <- matrix(parAnsAllValues, numSNP, 3, byrow = TRUE)
568
+
569
+    centers <- matrix(parameters[, 1], numSNP, 1)
570
+    prior1cluster <- matrix(parameters[, 2], numSNP, 1)
571
+    ss <- parameters[, 3]
572
+    
573
+    rm(parAns)
574
+    rm(parameters)
575
+    sigma=solve(cov(centers, use="complete.obs"))
576
+
577
+    mh = calculateMahalDist1Cluster(centers, sigma, prior1cluster, numSNP)
578
+
579
+    rm(sigma)
580
+    rm(centers)
581
+
582
+    gc()
583
+    return(list(eless = ss, elemh = mh))
584
+}
585
+
586
+
587
+calculateMahalDist1Cluster <- function(centers, sigma, priors, numSNP){
588
+    mahaldist = rep(NA, numSNP)
589
+    
590
+    for(i in 1:numSNP) {
591
+        difference <- as.vector(centers[i, 1] - priors[i, 1])
592
+	mahaldist[i]=difference%*%sigma%*%difference
593
+    }
594
+    return(mahaldist)
595
+}
596
+   
597
+#############################################
598
+
599
+
600
+computeCallPr <- function(callsPr, M, calls, numSNP, numSample, verbose, blockSize = 500000){
601
+    # compute log-ratio in blocksize of 500,000 by default
602
+    if (verbose) message("Start computing confidence score")
603
+    
604
+    numBlock = ceiling(numSNP / blockSize)
605
+    for (i in 1:numBlock){
606
+        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
607
+        subsetM = as.matrix(M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
608
+        subsetCalls = as.matrix(calls[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])                     
609
+        subsetCallProb = .Call("krlmmConfidenceScore", subsetM, subsetCalls, PACKAGE="crlmm");
610
+        callsPr[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetCallProb[1:nrow(subsetM), ]
611
+        rm(subsetM, subsetCalls, subsetCallProb)
612
+    }
613
+
614
+    if (verbose) message("Done computing confidence score")
615
+}
616
+
617
+#############################################
618
+
619
+krlmmImputeGender <- function(cnSet, pkgname, verbose){
620
+    if (verbose) message("Start imputing gender")    
621
+    S = computeAverageLogIntensity(cnSet, verbose) 
622
+    loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
623
+    YIndex <- getVarInEnv("YIndex")
624
+    loader("annotation.rda", .crlmmPkgEnv, pkgname)
625
+    annotation <- getVarInEnv("annot")
626
+
627
+    A = A(cnSet)
628
+    open(A)
629
+    SNPNames = rownames(A)
630
+    close(A)
631
+
632
+    matchy = match(annotation[YIndex, 2], SNPNames)
633
+    Sy = S[matchy,]
634
+    rm(S)
635
+    numYChrSNP = nrow(Sy)
636
+
637
+    allS = matrix(NA, numYChrSNP, 2)
638
+
639
+    for (i in 1:numYChrSNP) {
640
+        tmp = kmeans(Sy[i,] ,2, nstart=45)
641
+        allS[i,] = sort(tmp$centers, decreasing=F)
642
+    }
643
+    priorS = apply(allS, 2, FUN="median", na.rm=TRUE)
644
+
645
+    group = rep(NA, ncol(Sy))
646
+
647
+    meanmatrix = apply(Sy, 2, median)
648
+
649
+    Sy1 = abs(meanmatrix-priorS[1])
650
+    Sy2 = abs(meanmatrix-priorS[2])
651
+
652
+    # output male - 1, female - 2, female S-values are smaller than male S-values. 
653
+    test = cbind(Sy2, Sy1)
654
+    group = apply(test, 1, which.min)
8 655
 
9
-  TRUE
656
+    if (verbose) message("Done imputing gender") 
657
+    return(group)    
10 658
 }
11 659
old mode 100644
12 660
new mode 100755
... ...
@@ -11,7 +11,7 @@
11 11
 genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".",
12 12
       arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
13 13
       highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat"), XY=NULL,
14
-      call.method="crlmm", cdfName, copynumber=TRUE, batch=NULL, saveDate=TRUE, stripNorm=TRUE, 
14
+      call.method="crlmm", trueCalls=NULL, cdfName, copynumber=TRUE, batch=NULL, saveDate=TRUE, stripNorm=TRUE, 
15 15
       useTarget=TRUE, quantile.method="between", mixtureSampleSize=10^5, fitMixture=TRUE,                               
16 16
       eps =0.1, verbose = TRUE, seed = 1, sns, probs = rep(1/3, 3), DF = 6, SNRMin = 5, 
17 17
       recallMin = 10, recallRegMin = 1000, gender = NULL, returnParams = TRUE, badSNP = 0.7)
... ...
@@ -43,6 +43,7 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".",
43 43
     specify the .idat file extension for the Cy3 and Cy5 channels.}
44 44
   \item{XY}{\code{NChannelSet} containing X and Y intensities.}
45 45
   \item{call.method}{character string specifying the genotype calling algorithm to use ('crlmm' or 'krlmm').}
46
+  \item{trueCalls}{matrix specifying known Genotype calls(can contain some NAs) for a subset of samples and features (1 - AA, 2 - AB, 3 - BB).}
46 47
   \item{cdfName}{ annotation package  (see also \code{validCdfNames})}
47 48
   \item{copynumber}{ 'logical.' Whether to store copy number intensities with SNP output.}
48 49
   \item{batch}{ character vector indicating the batch variable. Must be
... ...
@@ -65,7 +66,7 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".",
65 66
   out samples.}
66 67
   \item{recallMin}{Minimum number of samples for recalibration. }
67 68
   \item{recallRegMin}{Minimum number of SNP's for regression.}
68
-  \item{gender}{  integer vector (  male = 1, female =2 ) or missing,
69
+  \item{gender}{  integer vector (  male = 1, female = 2 ) or missing,
69 70
   with same length as filenames.  If missing, the gender is predicted.}
70 71
   \item{returnParams}{'logical'. Return recalibrated parameters from crlmm.}
71 72
   \item{badSNP}{'numeric'. Threshold to flag as bad SNP (affects batchQC)}
... ...
@@ -91,7 +92,9 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".",
91 92
 	Rather, \code{batch} is required in order to initialize a
92 93
 	\code{CNSet} container with the appropriate dimensions.
93 94
 
94
-        The new 'krlmm' option is available for certain chip types.
95
+        The new 'krlmm' option is available for certain chip types. Optional 
96
+	\code{trueCalls} matrix contains known Genotype calls (1 - AA, 2 - AB, 3 - BB)
97
+	for a subset of samples and features. 
95 98
       }
96 99
 
97 100
 \value{	A \code{SnpSuperSet} instance.}
... ...
@@ -17,7 +17,8 @@
17 17
 \usage{
18 18
 genotypeInf(cnSet, mixtureParams, probs = rep(1/3, 3), SNRMin = 5,
19 19
 recallMin = 10, recallRegMin = 1000, verbose = TRUE, returnParams =
20
-TRUE, badSNP = 0.7, gender = NULL, DF = 6, cdfName, call.method="crlmm")
20
+TRUE, badSNP = 0.7, gender = NULL, DF = 6, cdfName, call.method="crlmm", 
21
+trueCalls = NULL)
21 22
 }
22 23
 %- maybe also 'usage' for other objects documented here.
23 24
 \arguments{
... ...
@@ -44,6 +45,7 @@ TRUE, badSNP = 0.7, gender = NULL, DF = 6, cdfName, call.method="crlmm")
44 45
   \item{cdfName}{\code{character} string indicating which annotation
45 46
     package to load.}
46 47
   \item{call.method}{character string specifying the genotype calling algorithm to use ('crlmm' or 'krlmm').}
48
+  \item{trueCalls}{ matrix specifying known Genotype calls for a subset of samples and features(1 - AA, 2 - AB, 3 - BB).}
47 49
 }
48 50
 
49 51
 \details{
... ...
@@ -60,6 +62,16 @@ TRUE, badSNP = 0.7, gender = NULL, DF = 6, cdfName, call.method="crlmm")
60 62
 	the transformation 'round(-1000*log2(1-p))', where p is the
61 63
 	probability.  The function \code{i2P} can be used to convert the
62 64
 	integers back to the scale of probabilities.
65
+
66
+	An optional \code{trueCalls} argument can be provided to KRLMM method
67
+	which contains known genotype calls(can contain some NAs) for some 
68
+	samples and SNPs. This will used to compute KRLMM parameters by calling
69
+	\code{vglm} function from \code{VGAM} package.
70
+
71
+	The KRLMM method makes use of functions provided in \code{parallel} 
72
+	package to speed up the process. It by default initialises up to 
73
+	8 clusters. This is configurable by setting up an option named 
74
+	"krlmm.cores", e.g. options("krlmm.cores" = 16). 
63 75
       }
64 76
 
65 77
 \value{
... ...
@@ -8,12 +8,12 @@
8 8
 }
9 9
 
10 10
 \usage{
11
-readGenCallOutput(file, path=".", cdfName, colnames=list("SampleID"="Sample ID", "SNPID"="SNP Name", "XRaw"="X Raw", "YRaw"="Y Raw"),
11
+readGenCallOutput(filenames, path=".", cdfName, colnames=list("SampleID"="Sample ID", "SNPID"="SNP Name", "XRaw"="X Raw", "YRaw"="Y Raw"),
12 12
                   type=list("SampleID"="character", "SNPID"="character", "XRaw"="integer", "YRaw"="integer"), verbose=FALSE)
13 13
 }
14 14
 
15 15
 \arguments{
16
-  \item{file}{'character' string specifying the name of the file to read in}
16
+  \item{filenames}{'character' string, or a vector of character string specifying the name of the file(s) to read in}
17 17
   \item{path}{'character' string specifying the location of file to be read by the function}
18 18
   \item{cdfName}{'character' defining the chip annotation (manifest) to use
19 19
     ('human370v1c', human550v3b', 'human650v3a', 'human1mv1c',
20 20
old mode 100644
21 21
new mode 100755
... ...
@@ -12,3 +12,15 @@ SEXP gtypeCallerPart2(SEXP *, SEXP *, SEXP *, SEXP *,
12 12
                       SEXP *, SEXP *, SEXP *);
13 13
 
14 14
 SEXP normalizeBAF(SEXP *, SEXP *);
15
+
16
+SEXP krlmmComputeM(SEXP *, SEXP *);
17
+
18
+SEXP krlmmComputeS(SEXP *, SEXP *);
19
+
20
+SEXP readGenCallOutputCFunc(SEXP *, SEXP *, SEXP *, SEXP *, SEXP *, SEXP *, SEXP *, SEXP *);
21
+
22
+SEXP krlmmConfidenceScore(SEXP *, SEXP *);
23
+
24
+SEXP krlmmHardyweinberg(SEXP *);
25
+
26
+SEXP countFileLines(SEXP *);
15 27
old mode 100644
16 28
new mode 100755
... ...
@@ -88,7 +88,7 @@ SEXP gtypeCallerPart1(SEXP A, SEXP B, SEXP fIndex, SEXP mIndex,
88 88
   double buffer;
89 89
 
90 90
   // All pointers appear here
91
-  int *ptr2nIndexes, *ptr2A, *ptr2B, *ptr2Ns, *ptr2df, *ptr2mIndex, *ptr2fIndex, *ptr2ncIndexes;
91
+  int *ptr2nIndexes, *ptr2A, *ptr2B, *ptr2Ns, *ptr2df, *ptr2mIndex, *ptr2fIndex; //*ptr2ncIndexes;
92 92
   double *ptr2Smedian, *ptr2knots, *ptr2params, *ptr2Centers, *ptr2Scales, *ptr2probs, *ptr2trim;
93 93
   ptr2nIndexes = INTEGER_POINTER(AS_INTEGER(nIndexes));
94 94
   ptr2A = INTEGER_POINTER(AS_INTEGER(A));
... ...
@@ -97,7 +97,7 @@ SEXP gtypeCallerPart1(SEXP A, SEXP B, SEXP fIndex, SEXP mIndex,
97 97
   ptr2df = INTEGER_POINTER(AS_INTEGER(df));
98 98
   ptr2mIndex = INTEGER_POINTER(AS_INTEGER(mIndex));
99 99
   ptr2fIndex = INTEGER_POINTER(AS_INTEGER(fIndex));
100
-  ptr2ncIndexes = INTEGER_POINTER(AS_INTEGER(ncIndexes));
100
+ // ptr2ncIndexes = INTEGER_POINTER(AS_INTEGER(ncIndexes));
101 101
   ptr2Smedian = NUMERIC_POINTER(AS_NUMERIC(SMEDIAN));
102 102
   ptr2knots = NUMERIC_POINTER(AS_NUMERIC(knots));
103 103
   ptr2params = NUMERIC_POINTER(AS_NUMERIC(mixtureParams));
... ...
@@ -311,8 +311,9 @@ SEXP gtypeCallerPart2(SEXP A, SEXP B, SEXP fIndex, SEXP mIndex,
311 311
   ib3 = GET_LENGTH(noInfo);
312 312
 
313 313
   // All pointers appear here
314
-  int *ptr2nIndexes, *ptr2A, *ptr2B, *ptr2Ns, *ptr2df, *ptr2mIndex, *ptr2fIndex, *ptr2ncIndexes, *ptr2noTraining, *ptr2noInfo;
315
-  double *ptr2Smedian, *ptr2knots, *ptr2params, *ptr2Centers, *ptr2Scales, *ptr2probs, *ptr2trim;
314
+  int *ptr2nIndexes, *ptr2A, *ptr2B, *ptr2Ns, *ptr2df, *ptr2mIndex, *ptr2fIndex; // *ptr2ncIndexes, 
315
+  int *ptr2noTraining, *ptr2noInfo;
316
+  double *ptr2Smedian, *ptr2knots, *ptr2params, *ptr2Centers, *ptr2Scales, *ptr2probs; // *ptr2trim;
316 317
   ptr2nIndexes = INTEGER_POINTER(AS_INTEGER(nIndexes));
317 318
   ptr2A = INTEGER_POINTER(AS_INTEGER(A));
318 319
   ptr2B = INTEGER_POINTER(AS_INTEGER(B));
... ...
@@ -320,7 +321,7 @@ SEXP gtypeCallerPart2(SEXP A, SEXP B, SEXP fIndex, SEXP mIndex,
320 321
   ptr2df = INTEGER_POINTER(AS_INTEGER(df));
321 322
   ptr2mIndex = INTEGER_POINTER(AS_INTEGER(mIndex));
322 323
   ptr2fIndex = INTEGER_POINTER(AS_INTEGER(fIndex));
323
-  ptr2ncIndexes = INTEGER_POINTER(AS_INTEGER(ncIndexes));
324
+ // ptr2ncIndexes = INTEGER_POINTER(AS_INTEGER(ncIndexes));
324 325
   ptr2noTraining = INTEGER_POINTER(AS_INTEGER(noTraining));
325 326
   ptr2noInfo = INTEGER_POINTER(AS_INTEGER(noInfo));
326 327
 
... ...
@@ -330,7 +331,7 @@ SEXP gtypeCallerPart2(SEXP A, SEXP B, SEXP fIndex, SEXP mIndex,
330 331
   ptr2Centers = NUMERIC_POINTER(AS_NUMERIC(theCenters));
331 332
   ptr2Scales = NUMERIC_POINTER(AS_NUMERIC(theScales));
332 333
   ptr2probs = NUMERIC_POINTER(AS_NUMERIC(probs));
333
-  ptr2trim = NUMERIC_POINTER(AS_NUMERIC(trim));
334
+  // ptr2trim = NUMERIC_POINTER(AS_NUMERIC(trim));
334 335
 
335 336
   // End pointers
336 337
 
... ...
@@ -408,3 +409,249 @@ SEXP gtypeCallerPart2(SEXP A, SEXP B, SEXP fIndex, SEXP mIndex,
408 409
   }
409 410
   return(R_NilValue);
410 411
 }
412
+
413
+
414
+SEXP krlmmComputeM(SEXP A, SEXP B){
415
+
416
+  /*
417
+    ARGUMENTS
418
+    ---------
419
+    A: intensity matrix for allele A
420
+    B: intensity matrix for allele B
421
+    M: log-ratio for a particular SNP (outgoing)
422
+
423
+    INTERNAL VARIABLES
424
+    -------- ---------
425
+    rowsAB, colsAB: dimensions of A and B, which is number of SNP and number of sample
426
+  */
427
+
428
+  int rowsAB, colsAB;
429
+  rowsAB = INTEGER(getAttrib(A, R_DimSymbol))[0];
430
+  colsAB = INTEGER(getAttrib(A, R_DimSymbol))[1];
431
+
432
+  int i, j;
433
+
434
+  int *ptr2A, *ptr2B;
435
+  ptr2A = INTEGER_POINTER(AS_INTEGER(A));
436
+  ptr2B = INTEGER_POINTER(AS_INTEGER(B));
437
+
438
+  SEXP Rval;
439
+  PROTECT(Rval = allocMatrix(REALSXP, rowsAB, colsAB));
440
+
441
+  double *ptr2M;
442
+  long ele;
443
+  ptr2M = NUMERIC_POINTER(Rval);
444
+
445
+  for (i = 1; i <= rowsAB; i++){
446
+    for (j = 1; j <= colsAB; j++){
447
+      // elem is an index for A, B and M
448
+      ele = Cmatrix(i, j, rowsAB);
449
+      ptr2M[ele] = (log2(ptr2A[ele])-log2(ptr2B[ele]));
450
+    }
451
+  }
452
+  
453
+  UNPROTECT(1);
454
+  return Rval;
455
+}
456
+
457
+
458
+SEXP krlmmComputeS(SEXP A, SEXP B){
459
+
460
+  /*
461
+    ARGUMENTS
462
+    ---------
463
+    A: intensity matrix for allele A
464
+    B: intensity matrix for allele B
465
+    S: average log-intensity for a particular SNP (outgoing)
466
+
467
+    INTERNAL VARIABLES
468
+    -------- ---------
469
+    rowsAB, colsAB: dimensions of A and B, which is number of SNP and number of sample
470
+  */
471
+
472
+  int rowsAB, colsAB;
473
+  rowsAB = INTEGER(getAttrib(A, R_DimSymbol))[0];
474
+  colsAB = INTEGER(getAttrib(A, R_DimSymbol))[1];
475
+
476
+  int i, j;
477
+
478
+  int *ptr2A, *ptr2B;
479
+  ptr2A = INTEGER_POINTER(AS_INTEGER(A));
480
+  ptr2B = INTEGER_POINTER(AS_INTEGER(B));
481
+
482
+  SEXP Rval;
483
+  PROTECT(Rval = allocMatrix(REALSXP, rowsAB, colsAB));
484
+
485
+  double *ptr2S;
486
+  long ele;
487
+  ptr2S = NUMERIC_POINTER(Rval);
488
+
489
+  for (i = 1; i <= rowsAB; i++){
490
+    for (j = 1; j <= colsAB; j++){
491
+      // elem is an index for A, B and S
492
+      ele = Cmatrix(i, j, rowsAB);
493
+      ptr2S[ele] = (log2(ptr2A[ele])+log2(ptr2B[ele]))/2;
494
+    }
495
+  }
496
+  
497
+  UNPROTECT(1);
498
+  return Rval;
499
+}
500
+
501
+void calculate_multiple_cluster_scores(int row, double *intensity, double mean_intensity, int *clustering, int num_SNP, int num_sample, int *ptr, double **dist, int *clustercount){
502
+    int p, q, o;
503
+    long vectorPos;
504
+    double a_dist;
505
+    for(p=1; p <= num_sample; p++){
506
+        dist[p][p] = 0;
507
+    }
508
+    for(p=1; p<num_sample; p++){
509
+        for(q=p+1; q<=num_sample; q++){
510
+   	    a_dist = fabs(intensity[Cmatrix(row, p, num_SNP)] - intensity[Cmatrix(row, q, num_SNP)]);
511
+            dist[p][q] = a_dist;
512
+            dist[q][p] = a_dist;
513
+        }
514
+    }
515
+
516
+    double sum[4];
517
+    double within_cluster;
518
+    double between_cluster;
519
+    double temp_between_cluster;
520
+    double max;
521
+    for (p=1; p <= num_sample; p++){
522
+        vectorPos = Cmatrix(row, p, num_SNP);
523
+        sum[1] = 0;
524
+        sum[2] = 0;
525
+        sum[3] = 0;
526
+        for (q=1; q<=num_sample; q++){
527
+	  sum[clustering[Cmatrix(row, q, num_SNP)]] += dist[p][q];
528
+        }
529
+       
530
+        within_cluster = sum[clustering[vectorPos]] / (clustercount[clustering[vectorPos]]- 1);
531
+        between_cluster = -1.0;
532
+        for (o=1; o<=3; o++){
533
+            if ((o != clustering[vectorPos]) && (clustercount[o] > 0)){
534
+                temp_between_cluster = sum[o] / clustercount[o];
535
+                if ((between_cluster < 0) || (temp_between_cluster < between_cluster)) {
536
+                    between_cluster = temp_between_cluster;
537
+                }                                           
538
+            }                                               
539
+        }
540
+       
541
+        if (clustercount[clustering[vectorPos]] > 1) {
542
+            if (between_cluster > within_cluster) {
543
+                max = between_cluster;
544
+            } else {
545
+                max = within_cluster;
546
+            }
547
+            ptr[vectorPos] = genotypeConfidence2((between_cluster - within_cluster) / max);                                                   
548
+        }
549
+    }   
550
+}
551
+
552
+void calculate_unique_cluster_scores(int row, double *intensity, double mean_intensity, double intensity_range, int num_SNP, int num_sample, int *ptr)				   
553
+{
554
+    int p;
555
+    long vectorPos;
556
+
557
+    for(p = 1; p <= num_sample; p++){
558
+        vectorPos = Cmatrix(row, p, num_SNP); 
559
+        ptr[vectorPos] = genotypeConfidence2(1 -  fabs(fabs(intensity[vectorPos] - mean_intensity) / intensity_range));
560
+    }
561
+}
562
+
563
+
564
+double calculate_SNP_mean(int row, double *intensity, int num_SNP, int num_sample)
565
+{
566
+  double sum_intensity;
567
+  sum_intensity = 0;
568
+  int p;
569
+  for(p = 1; p <= num_sample; p++){
570
+    sum_intensity = sum_intensity + intensity[Cmatrix(row, p, num_SNP)];
571
+  }
572
+  return(sum_intensity / num_sample);
573
+}
574
+
575
+double calculate_SNP_range(int row, double *intensity, int num_SNP, int num_sample)
576
+{
577
+  double min_intensity;
578
+  double max_intensity;
579
+  max_intensity = intensity[Cmatrix(row, 1, num_SNP)];
580
+  min_intensity = intensity[Cmatrix(row, 1, num_SNP)];
581
+  int p;  
582
+  for(p = 2; p <= num_sample; p++){
583
+    if (intensity[Cmatrix(row, p, num_SNP)] < min_intensity){
584
+      min_intensity = intensity[Cmatrix(row, p, num_SNP)];  
585
+    }	
586
+    if (intensity[Cmatrix(row, p, num_SNP)] > max_intensity){
587
+      max_intensity = intensity[Cmatrix(row, p, num_SNP)];
588
+    }
589
+  }
590
+  return(max_intensity - min_intensity);
591
+}
592
+
593
+
594
+SEXP krlmmConfidenceScore(SEXP M, SEXP clustering)
595
+{
596
+    int num_SNP, num_sample;
597
+    num_SNP = INTEGER(getAttrib(M, R_DimSymbol))[0];
598
+    num_sample = INTEGER(getAttrib(M, R_DimSymbol))[1];
599
+
600
+    double *ptr2M;
601
+    int *ptr2cluster;
602
+    ptr2M = NUMERIC_POINTER(AS_NUMERIC(M));
603
+    ptr2cluster = INTEGER_POINTER(AS_INTEGER(clustering));
604
+
605
+    int i, j;
606
+    int cluster;
607
+    double mean_intensity;
608
+    double intensity_range;
609
+    int k;
610
+    
611
+    SEXP Rval;
612
+    PROTECT(Rval = allocMatrix(INTSXP, num_SNP, num_sample));
613
+
614
+    int *ptr2score;
615
+    ptr2score = INTEGER_POINTER(Rval);
616
+    
617
+    double **dist;
618
+    // allocate memory to dist
619
+    dist = (double **)malloc((num_sample + 1)*sizeof(double *));
620
+    for (i = 1; i <= num_sample; i++){
621
+      dist[i] = (double *)malloc((num_sample + 1) * sizeof(double)); 
622
+    }
623
+
624
+    int cluster_count[4]; // extra element to cope with zero-base notation
625
+   // int clid[4];
626
+
627
+    for (i = 1; i <= num_SNP; i++) {
628
+        cluster_count[1] = 0;
629
+        cluster_count[2] = 0;
630
+        cluster_count[3] = 0;
631
+        for (j=1; j<= num_sample; j++){
632
+	    cluster = ptr2cluster[Cmatrix(i, j, num_SNP)];
633
+            cluster_count[cluster]++;
634
+        }
635
+	mean_intensity = calculate_SNP_mean(i, ptr2M, num_SNP, num_sample);
636
+	intensity_range = calculate_SNP_range(i, ptr2M, num_SNP, num_sample);
637
+
638
+        k = 0;
639
+        for (j=1; j<=3; j++){
640
+            if (cluster_count[j] > 0){
641
+                k++;
642
+            //    clid[k] = j;
643
+            }
644
+        }
645
+
646
+	if (k==1) {
647
+	  calculate_unique_cluster_scores(i, ptr2M, mean_intensity, intensity_range, num_SNP, num_sample, ptr2score);
648
+	} else {
649
+	  calculate_multiple_cluster_scores(i, ptr2M, mean_intensity, ptr2cluster, num_SNP, num_sample, ptr2score, dist, cluster_count);
650
+	}       
651
+      
652
+    } 
653
+
654
+    UNPROTECT(1);
655
+    return Rval;
656
+}
657
+
... ...
@@ -2,12 +2,17 @@
2 2
 #include <R_ext/Rdynload.h>
3 3
 #include "crlmm.h"
4 4
 #include <Rinternals.h>
5
-#include <R_ext/Rdynload.h>
6 5
 
7 6
 static const R_CallMethodDef CallEntries[] = {
8 7
     {"gtypeCallerPart1", (DL_FUNC)&gtypeCallerPart1, 17},
9 8
     {"gtypeCallerPart2", (DL_FUNC)&gtypeCallerPart2, 19},
10 9
     {"normalizeBAF", (DL_FUNC)&normalizeBAF, 2},
10
+    {"krlmmComputeM", (DL_FUNC)&krlmmComputeM, 2},
11
+    {"krlmmComputeS", (DL_FUNC)&krlmmComputeS, 2},
12
+    {"readGenCallOutputCFunc", (DL_FUNC)&readGenCallOutputCFunc, 8},
13
+    {"krlmmConfidenceScore", (DL_FUNC)&krlmmConfidenceScore, 2},
14
+    {"krlmmHardyweinberg", (DL_FUNC)&krlmmHardyweinberg, 1},
15
+    {"countFileLines", (DL_FUNC)&countFileLines, 1},
11 16
     {NULL, NULL, 0}
12 17
 };
13 18
 
... ...
@@ -15,7 +20,6 @@ void R_init_crlmm(DllInfo *dll){
15 20
     R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
16 21
 }
17 22
 
18
-
19 23
 SEXP subColSummarizeMedianPP(SEXP RMatrix, SEXP R_rowIndexList){
20 24
   static SEXP(*fun)(SEXP, SEXP) = NULL;
21 25
   if (fun == NULL)
... ...
@@ -3,6 +3,8 @@
3 3
 #include <Rdefines.h>
4 4
 #include <Rmath.h>
5 5
 #include <Rinternals.h>
6
+#include <assert.h>
7
+#include <string.h>
6 8
 
7 9
 int genotypeConfidence(const double *prob){
8 10
   int K=1000;
... ...
@@ -10,6 +12,14 @@ int genotypeConfidence(const double *prob){
10 12
   else return( (int) round(-K*log2(1- *prob)));
11 13
 }
12 14
 
15
+
16
+int genotypeConfidence2(double probability){
17
+  int K = 1000;
18
+  if (probability == 1.0) return(INT_MAX);
19
+  else return( (int) round(-K * log2(1 - probability)));
20
+}
21
+
22
+
13 23
 int intInSet(const int *x, const int *set, const int *n){
14 24
   int i;
15 25
   for (i=0; i < *n; i++)
... ...
@@ -121,7 +131,8 @@ double  median(double *x, int length){
121 131
 
122 132
 void mad_median(double *datavec, int *classvec, int class, double trim, int cols, int rows, double *m1, double *m2, double *m3, int i_ext){
123 133
   /* trim is ignored for the moment - for compatibility */
124
-  int i, j=0, n_ignore, n=0;
134
+  int i, j=0; 
135
+  int n=0;
125 136
 
126 137
   for (i = 0; i < cols; i++)
127 138
     if (classvec[i] == class)
... ...
@@ -175,9 +186,9 @@ SEXP normalizeBAF(SEXP theta, SEXP cTheta){
175 186
 	p2baf[idx] = NA_REAL;
176 187
       }else if (p2theta[idx] < p2ctheta[i]){
177 188
 	p2baf[idx] = 0;
178
-      }else if (p2theta[idx] >= p2ctheta[i] & p2theta[idx] < p2ctheta[i + rowsT]){
189
+      }else if ((p2theta[idx] >= p2ctheta[i]) & (p2theta[idx] < p2ctheta[i + rowsT])){
179 190
 	p2baf[idx] = .5*(p2theta[idx]-p2ctheta[i])/(p2ctheta[i+rowsT]-p2ctheta[i]);
180
-      }else if(p2theta[idx] >= p2ctheta[i+rowsT] & p2theta[idx] < p2ctheta[i + 2*rowsT]){
191
+      }else if((p2theta[idx] >= p2ctheta[i+rowsT]) & (p2theta[idx] < p2ctheta[i + 2*rowsT])){
181 192
 	p2baf[idx] = .5+.5*(p2theta[idx]-p2ctheta[i+rowsT])/(p2ctheta[i+2*rowsT]-p2ctheta[i+rowsT]);
182 193
       }else{
183 194
 	p2baf[idx] = 1;
... ...
@@ -263,3 +274,216 @@ SEXP test_mad_median(SEXP X, SEXP Y, SEXP trim){
263 274
 
264 275
   return output;
265 276
 }
277
+
278
+/* Converts row-col notation into base-zero vector notation, based on a column-wise conversion*/
279
+long Cmatrix(int row, int col, int totrow){
280
+  return( (row)-1 + ((col)-1)*(totrow)); // num_SNP is number of rows
281
+}
282
+
283
+SEXP countFileLines(SEXP filename) {
284
+  FILE *fInput = fopen(CHAR(STRING_ELT(filename, 0)), "r");
285
+  char * line = NULL;
286
+  char *readline;
287
+  size_t len = 1000;
288
+  line = (char *)malloc(sizeof(char) * (len + 1));
289
+  if(fInput == NULL){
290
+    fclose(fInput);
291
+    return 0;
292
+  }
293
+  long line_count = 0;
294
+  while ((readline = fgets(line, len, fInput)) != NULL){
295
+    line_count++;
296
+  }
297
+  fclose(fInput);
298
+  free(line);
299
+  
300
+  SEXP Rcount;
301
+  PROTECT(Rcount = allocVector(INTSXP, 1));
302
+  int *ptr;
303
+  ptr = INTEGER_POINTER(Rcount);
304
+  ptr[0] = line_count;
305
+  UNPROTECT(1);
306
+  return Rcount;
307
+}
308
+
309
+void
310
+DoReadGenCallOutput(SEXP filename, int numSNP, int numsample, int SNPIDIndex, int sampleIDIndex, int XValueIndex, int YValueIndex, int *XValuesPt, int *YValuesPt, int delimiter, SEXP SNPNames, SEXP sampleNames){
311
+  FILE *fGenCall;
312
+  fGenCall = fopen(CHAR(STRING_ELT(filename, 0)), "r");
313
+
314
+  // ignore the first 10 lines, which contains heading information
315
+  char* token;
316
+  char * line = NULL;
317
+  size_t len = 1000;
318
+  char *readline;    
319
+  line = (char *)malloc(len+1);
320
+   
321
+  int i, j;
322
+  for (i = 0; i < 10; i++){
323
+    readline = fgets(line, len, fGenCall);
324
+  }
325
+
326
+  long index;
327
+  int pos;
328
+  int aXValue, aYValue;
329
+  aXValue = 0;
330
+  aYValue = 0;
331
+
332
+  for (j = 1; j <= numsample; j++){
333
+    for (i = 1; i <= numSNP; i++){
334
+      readline = fgets(line, len, fGenCall);
335
+	  if (readline == NULL){
336
+		Rprintf("Error reading from file");
337
+	  }
338
+      if (delimiter == 1){
339
+	token = strtok(line, ",");
340
+      } else {
341
+	token = strtok(line, "\t");
342
+      }
343
+      index = Cmatrix(i, j, numSNP);
344
+      pos = 0;     
345
+      while (token != NULL) {
346
+	if (pos == SNPIDIndex){
347
+	  if (j == 1){
348
+	    SET_STRING_ELT(SNPNames, (i - 1), mkChar(token));   
349
+	  }	  
350
+	}
351
+	if (pos == sampleIDIndex){
352
+	  if (i == 1){
353
+	    SET_STRING_ELT(sampleNames, (j - 1), mkChar(token));   
354
+	  }	 
355
+	}
356
+	if (pos == XValueIndex){
357
+	  aXValue = atoi(token);
358
+	}
359
+	if (pos == YValueIndex){
360
+	  aYValue = atoi(token);
361
+	}	
362
+	if (delimiter == 1){
363
+	  token = strtok(NULL, ",");
364
+	} else {
365
+	  token = strtok(NULL, "\t");
366
+	}
367
+	pos++;
368
+      }
369
+      XValuesPt[index] = aXValue; 
370
+      YValuesPt[index] = aYValue;
371
+    }  
372
+  }
373
+ 
374
+  free(line);
375
+  fclose(fGenCall);
376
+}
377
+
378
+SEXP readGenCallOutputCFunc(SEXP genCallOutputfile, SEXP num_SNP, SEXP num_Sample, SEXP SNPID_Index, SEXP sampleID_Index, SEXP XValue_Index, SEXP YValue_Index, SEXP delimiter_Index){
379
+  int numSNP;
380
+  int numsample;
381
+  numSNP = INTEGER_VALUE(num_SNP);
382
+  numsample = INTEGER_VALUE(num_Sample);
383
+
384
+  int SNPIDIndex, sampleIDIndex, XIndex, YIndex, delimiterIndex;
385
+  SNPIDIndex = INTEGER_VALUE(SNPID_Index);
386
+  sampleIDIndex = INTEGER_VALUE(sampleID_Index);
387
+  XIndex = INTEGER_VALUE(XValue_Index);
388
+  YIndex = INTEGER_VALUE(YValue_Index);
389
+  delimiterIndex = INTEGER_VALUE(delimiter_Index);
390
+ 
391
+  SEXP Xvalues, Yvalues, ret, ret_names, rownames, colnames, dimnames;
392
+
393
+  char *names[2] = {"Xvalues", "Yvalues"};
394
+
395
+  /* protect R objects in C. */
396
+  PROTECT(Xvalues = allocMatrix(INTSXP, numSNP, numsample));
397
+  PROTECT(Yvalues = allocMatrix(INTSXP, numSNP, numsample));
398
+ 
399
+  PROTECT(ret = allocVector(VECSXP, 2));
400
+  PROTECT(ret_names = allocVector(STRSXP, 2));
401
+  
402
+    /* set a list object for R. */
403
+  SET_VECTOR_ELT(ret, 0, Xvalues);
404
+  SET_VECTOR_ELT(ret, 1, Yvalues);
405
+
406
+  PROTECT(rownames = allocVector(STRSXP, numSNP));
407
+  PROTECT(colnames = allocVector(STRSXP, numsample));
408
+
409
+  PROTECT(dimnames = allocVector(VECSXP, 2));
410
+
411
+  SET_VECTOR_ELT(dimnames, 0, rownames);
412
+  SET_VECTOR_ELT(dimnames, 1, colnames);
413
+
414
+  setAttrib(Xvalues, R_DimNamesSymbol, dimnames);
415
+  setAttrib(Yvalues, R_DimNamesSymbol, dimnames);  
416
+
417
+    /* set list's names for R. */
418
+  SET_STRING_ELT(ret_names, 0, mkChar(names[0])); 
419
+  SET_STRING_ELT(ret_names, 1, mkChar(names[1])); 
420
+  setAttrib(ret, R_NamesSymbol, ret_names);
421
+  
422
+  int *Xptr, *Yptr;
423
+  /* assign points to R objects. */
424
+  Xptr = INTEGER_POINTER(Xvalues);
425
+  Yptr = INTEGER_POINTER(Yvalues);
426
+
427
+  // C is 0-based and XValueIndex and YVlaueIndex are 1-based
428
+  DoReadGenCallOutput(genCallOutputfile, numSNP, numsample, (SNPIDIndex-1), (sampleIDIndex-1), (XIndex-1), (YIndex-1), Xptr, Yptr, delimiterIndex, rownames, colnames);
429
+  
430
+  UNPROTECT(7);
431
+  return(ret);
432
+}
433
+
434
+
435
+SEXP krlmmHardyweinberg(SEXP clustering){
436
+  int counts[3];
437
+  int N;
438
+  int num;
439
+  num = length(clustering);
440
+
441
+  int *clusterPtr;
442
+  clusterPtr = INTEGER_POINTER(AS_INTEGER(clustering));
443
+
444
+  int aSample, temp;
445
+  counts[0] = 0;
446
+  counts[1] = 0;
447
+  counts[2] = 0;
448
+  for (aSample = 0; aSample < num; aSample++) {
449
+    counts[(clusterPtr[aSample] - 1)]++;
450
+  }
451
+  N = counts[0] + counts[1] + counts[2];
452
+  if (N != num) {
453
+    error("the count from all three doesn't equal to num_sample");
454
+  }
455
+
456
+  SEXP Rans;
457
+  PROTECT(Rans = NEW_NUMERIC(1));
458
+
459
+  double *ansPtr;
460
+  ansPtr = NUMERIC_POINTER(Rans);
461
+
462
+  double p;
463
+  double exp[3];
464
+  double ans;
465
+
466
+  if (counts[0] < counts[2]) {
467
+    temp = counts[0];
468
+    counts[0] = counts[2];
469
+    counts[2] = temp;
470
+  }
471
+  p = 1.0 * (2 * counts[0] + counts[1]) / ( 2 * N);
472
+  if (p == 1) 
473
+    ans = R_NaReal;
474
+  else {
475
+    exp[0] = N * pow(p, 2);
476
+    exp[1] = N * 2 * p * (1-p);
477
+    exp[2] = N * pow((1 - p), 2);
478
+    ans = 0;
479
+    ans += pow((counts[0] - exp[0]), 2) / exp[0];
480
+    ans += pow((counts[1] - exp[1]), 2) / exp[1];
481
+    ans += pow((counts[2] - exp[2]), 2) / exp[2];
482
+  }             
483
+  ansPtr[0] = ans;
484
+
485
+  UNPROTECT(1);
486
+  return(Rans);
487
+}
488
+
489
+
... ...
@@ -1,7 +1,9 @@
1 1
 int genotypeConfidence(const double *prob);
2
+int genotypeConfidence2(double probability);
2 3
 int intInSet(const int *x, const int *set, const int *n);
3 4
 int genotypeCall(const double *pAA, const double *pAB, const double *pBB);
4 5
 double sdCorrection(const int *n);
5 6
 int sort_double(const double *a1, const double *a2);
6 7
 void trimmed_mean(double *datavec, int *classvec, int class, double trim, int cols, int rows, double *m1, double *m2, double *m3, int i_ext);
7 8
 void trimmed_stats(double *data, double *m1, double *m2, double *m3, int *class, int rows, int cols, double *trim);
9
+long Cmatrix(int row, int col, int totrow);