Browse code

constructIlluminaAssayData allows for either ff or matrices. Fixed bug in accessors for medians, mads, ...

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@49131 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 30/08/2010 19:40:33
Showing7 changed files

... ...
@@ -71,13 +71,20 @@ export(crlmm,
71 71
        snprma,
72 72
        snprma2,
73 73
        crlmm2,
74
-       genotype2,
75
-       batch,
76
-       crlmmCopynumber2,
77
-       constructCNSetForIllumina,
78
-       validCdfNames)
79
-##export(initializeParamObject, biasAdjNP2)
80
-##export(construct)
81
-
82
-export(plotCNSetLM)
74
+       genotype2, genotypeLD,
75
+       crlmmCopynumber2, crlmmCopynumberLD, crlmmCopynumber)
76
+export(constructIlluminaCNSet)
77
+export(totalCopynumber)
78
+export(cnrma, cnrma2)
79
+exportMethods(A, B, nuA, nuB, phiA, phiB, corr, tau2, Ns, medians, mads)
80
+export(genotypeSummary,
81
+       shrinkSummary,
82
+       estimateCnParameters,
83
+       shrinkGenotypeSummaries, indexComplete)
84
+## export(fit.lm1, fit.lm2, fit.lm3, fit.lm4)
85
+
86
+## For debugging
87
+## exportPattern("^[^\\.]")
88
+
89
+
83 90
 
... ...
@@ -522,7 +522,7 @@ fit.lm2 <- function(strata,
522 522
 		B <- batches[[k]]
523 523
 		this.batch <- unique(as.character(batch(object)[B]))
524 524
 		X <- cbind(1, log2(c(medianA.AA[, k], medianB.BB[, k])))
525
-		Y <- log2(c(phiA.snp, phiB.snp))
525
+		Y <- log2(c(phiA.snp[, k], phiB.snp[, k]))
526 526
 		betahat <- solve(crossprod(X), crossprod(X, Y))
527 527
 		crosshyb <- max(median(medianA.AA[, k]) - median(medianA.AA.np[, k]), 0)
528 528
 		X <- cbind(1, log2(medianA.AA.np[, k] + crosshyb))
... ...
@@ -833,7 +833,6 @@ fit.lm4 <- function(strata,
833 833
 	batches <- split(seq_along(batch(object)), as.character(batch(object)))
834 834
 	batches <- batches[sapply(batches, length) >= MIN.SAMPLES]
835 835
 	nc <- length(batchNames(object))
836
-
837 836
 	if(enough.males){
838 837
 		res <- summarizeMaleXNps(marker.index=marker.index,
839 838
 					 batches=batches,
... ...
@@ -851,7 +850,6 @@ fit.lm4 <- function(strata,
851 850
 	nuB <- as.matrix(nuB(object)[marker.index, ])
852 851
 	phiA <- as.matrix(phiA(object)[marker.index, ])
853 852
 	phiB <- as.matrix(phiB(object)[marker.index, ])
854
-
855 853
 	ii <- isSnp(object) & chromosome(object) < 23 & !is.na(chromosome(object))
856 854
 	fns <- featureNames(object)[ii]
857 855
 	flags <- as.matrix(flags(object)[ii, ])
... ...
@@ -1382,18 +1380,47 @@ constructIlluminaAssayData <- function(np, snp, object, storage.mode="environmen
1382 1380
 	}
1383 1381
 	np <- lapply(np, stripnames)
1384 1382
 	snp <- lapply(snp, stripnames)
1385
-	A <- rbind(snp[[1]], np[[1]], deparse.level=0)[order.index, ]
1386
-	B <- rbind(snp[[2]], np[[2]], deparse.level=0)[order.index, ]
1387
-	gt <- stripnames(calls(object))
1388
-	emptyMatrix <- matrix(integer(), nrow(np[[1]]), ncol(A))
1389
-	gt <- rbind(gt, emptyMatrix, deparse.level=0)[order.index,]
1390
-	pr <- stripnames(snpCallProbability(object))
1391
-	pr <- rbind(pr, emptyMatrix, deparse.level=0)[order.index, ]
1392
-	aD <- assayDataNew(storage.mode,
1393
-			   alleleA=A,
1394
-			   alleleB=B,
1395
-			   call=gt,
1396
-			   callProbability=pr)
1383
+	if(is(snp[[1]], "ff")){
1384
+		lapply(snp, open)
1385
+		open(calls(object))
1386
+		open(snpCallProbability(object))
1387
+##		lapply(np, open)
1388
+	}
1389
+	##tmp <- rbind(as.matrix(snp[[1]]), as.matrix(np[[1]]), deparse.level=0)
1390
+	A.snp <- snp[[1]]
1391
+	B.snp <- snp[[2]]
1392
+	##Why is np not a ff object?
1393
+	A.np <- np[[1]]
1394
+	B.np <- np[[2]]
1395
+	nr <- length(order.index)
1396
+	nc <- ncol(object)
1397
+	if(is(A.snp, "ff")){
1398
+		NA.vec <- rep(NA, nrow(A.np))
1399
+		AA <- initializeBigMatrix("A", nr, nc, vmode="integer")
1400
+		BB <- initializeBigMatrix("B", nr, nc, vmode="integer")
1401
+		GG <- initializeBigMatrix("calls", nr, nc, vmode="integer")
1402
+		PP <- initializeBigMatrix("confs", nr, nc, vmode="integer")
1403
+		for(j in 1:ncol(object)){
1404
+			AA[, j] <- c(snp[[1]][, j], np[[1]][, j])[order.index]
1405
+			BB[, j] <- c(snp[[2]][, j], np[[2]][, j])[order.index]
1406
+			GG[, j] <- c(calls(object)[, j], NA.vec)[order.index]
1407
+			PP[, j] <- c(snpCallProbability(object)[, j], NA.vec)[order.index]
1408
+
1409
+		}
1410
+	} else {
1411
+		AA <- rbind(snp[[1]], np[[1]], deparse.level=0)[order.index, ]
1412
+		BB <- rbind(snp[[2]], np[[2]], deparse.level=0)[order.index, ]
1413
+		gt <- stripnames(calls(object))
1414
+		emptyMatrix <- matrix(integer(), nrow(np[[1]]), ncol(A))
1415
+		GG <- rbind(gt, emptyMatrix, deparse.level=0)[order.index,]
1416
+		pr <- stripnames(snpCallProbability(object))
1417
+		PP <- rbind(pr, emptyMatrix, deparse.level=0)[order.index, ]
1418
+	}
1419
+	assayDataNew(storage.mode,
1420
+		     alleleA=AA,
1421
+		     alleleB=BB,
1422
+		     call=GG,
1423
+		     callProbability=PP)
1397 1424
 }
1398 1425
 constructIlluminaCNSet <- function(crlmmResult,
1399 1426
 				   path,
... ...
@@ -1408,20 +1435,16 @@ constructIlluminaCNSet <- function(crlmmResult,
1408 1435
 	fD <- fD[new.order, ]
1409 1436
 	aD <- constructIlluminaAssayData(cnAB, res, crlmmResult, order.index=new.order)
1410 1437
 	##protocolData(crlmmResult)$batch <- vector("integer", ncol(crlmmResult))
1411
-	batch <- vector("integer", ncol(crlmmResult))
1412
-	container <- new("CNSet",
1413
-			 call=aD[["call"]],
1414
-			 callProbability=aD[["callProbability"]],
1415
-			 alleleA=aD[["alleleA"]],
1416
-			 alleleB=aD[["alleleB"]],
1417
-			 phenoData=phenoData(crlmmResult),
1418
-			 protocolData=protocolData(crlmmResult),
1419
-			 featureData=fD,
1420
-			 batch=batch,
1421
-			 annotation="human370v1c")
1422
-##	lM(container) <- initializeParamObject(list(featureNames(container), unique(protocolData(container)$batch)))
1423
-##	lM(container) <- initializeLmFrom(container)
1424
-	container
1438
+	new("CNSet",
1439
+	    call=aD[["call"]],
1440
+	    callProbability=aD[["callProbability"]],
1441
+	    alleleA=aD[["alleleA"]],
1442
+	    alleleB=aD[["alleleB"]],
1443
+	    phenoData=phenoData(crlmmResult),
1444
+	    protocolData=protocolData(crlmmResult),
1445
+	    featureData=fD,
1446
+	    batch=batch,
1447
+	    annotation="human370v1c")
1425 1448
 }
1426 1449
 
1427 1450
 
... ...
@@ -1456,8 +1479,8 @@ shrinkSummary <- function(object,
1456 1479
 			  verbose=TRUE,
1457 1480
 			  marker.index,
1458 1481
 			  is.lds){
1459
-	stopifnot(type %in% c("SNP", "X.SNP"))
1460
-	if(type == "X.SNP"){
1482
+	stopifnot(type[[1]] %in% c("SNP", "X.SNP"))
1483
+	if(type[[1]] == "X.SNP"){
1461 1484
 		gender <- object$gender
1462 1485
 		if(sum(gender == 2) < 3) {
1463 1486
 			return("too few females to estimate within genotype summary statistics on CHR X")
... ...
@@ -1474,17 +1497,10 @@ shrinkSummary <- function(object,
1474 1497
 		if(is.lds) stopifnot(isPackageLoaded("ff"))
1475 1498
 		marker.index <- whichMarkers(type[[1]], is.snp, is.autosome, is.annotated, is.X)
1476 1499
 	}
1477
-	summaryFxn <- function(type){
1478
-		switch(type,
1479
-		       SNP="shrinkGenotypeSummaries",
1480
-		       X.SNP="shrinkGenotypeSummaries", ## this shrinks for the females only
1481
-		       stop())
1482
-	}
1483
-	FUN <- summaryFxn(type[[1]])
1484 1500
 	if(is.lds){
1485 1501
 		index.list <- splitIndicesByLength(marker.index, ocProbesets())
1486 1502
 		ocLapply(seq(along=index.list),
1487
-			 FUN,
1503
+			 shrinkGenotypeSummaries,
1488 1504
 			 index.list=index.list,
1489 1505
 			 object=object,
1490 1506
 			 verbose=verbose,
... ...
@@ -1494,8 +1510,7 @@ shrinkSummary <- function(object,
1494 1510
 			 is.lds=is.lds,
1495 1511
 			 neededPkgs="crlmm")
1496 1512
 	} else {
1497
-		FUN <- match.fun(FUN)
1498
-		object <- FUN(strata=1,
1513
+		object <- shrinkGenotypeSummaries(strata=1,
1499 1514
 			      index.list=list(marker.index),
1500 1515
 			      object=object,
1501 1516
 			      verbose=verbose,
... ...
@@ -1750,7 +1765,7 @@ crlmmCopynumber <- function(object,
1750 1765
 		       X.SNP="chromosome X SNPs",
1751 1766
 		       X.NP="chromosome X nonpolymorphic markers")
1752 1767
 	}
1753
-	if(verbose) message("Computing summary statistics for genotype clusters for each batch")
1768
+	if(verbose) message("Computing summary statistics of the genotype clusters for each batch")
1754 1769
 	for(i in seq_along(type)){
1755 1770
 		## do all types
1756 1771
 		marker.type <- type[i]
... ...
@@ -1765,7 +1780,7 @@ crlmmCopynumber <- function(object,
1765 1780
 					  marker.index=marker.index,
1766 1781
 					  is.lds=is.lds)
1767 1782
 	}
1768
-	if(verbose) message("Imputing unobserved cluster medians and shrinking the variances at polymorphic loci")##SNPs only
1783
+	if(verbose) message("Imputing unobserved genotype medians and shrinking the variances (within-batch, across loci) ")##SNPs only
1769 1784
 	for(i in seq_along(type)){
1770 1785
 		marker.type <- type[i]
1771 1786
 		if(!marker.type %in% c("SNP", "X.SNP")) next()
... ...
@@ -12,7 +12,187 @@ readIdatFiles <- function(sampleSheet=NULL,
12 12
 			  sep="_",
13 13
 			  fileExt=list(green="Grn.idat", red="Red.idat"),
14 14
 			  saveDate=FALSE) {
15
+#       if(!is.null(arrayNames)) {
16
+#	       arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path))
17
+#	       if(!is.null(sampleSheet)) {
18
+#		       sampleSheet=NULL
19
+#		       cat("Could not find required info in \'sampleSheet\' - ignoring.  Check \'sampleSheet\' and/or \'arrayInfoColNames\'\n")
20
+#	       }
21
+#	       pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
22
+#       }
23
+       if(!is.null(arrayNames)) {
24
+               pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
25
+       }
26
+       if(!is.null(sampleSheet)) { # get array info from Illumina's sample sheet
27
+	       if(is.null(arrayNames)){
28
+		       ##arrayNames=NULL
29
+		       if(!is.null(arrayInfoColNames$barcode) && (arrayInfoColNames$barcode %in% colnames(sampleSheet))) {
30
+			       barcode = sampleSheet[,arrayInfoColNames$barcode]
31
+			       arrayNames=barcode
32
+		       }
33
+		       if(!is.null(arrayInfoColNames$position) && (arrayInfoColNames$position %in% colnames(sampleSheet))) {
34
+			       position = sampleSheet[,arrayInfoColNames$position]
35
+			       if(is.null(arrayNames))
36
+				       arrayNames=position
37
+			       else
38
+				       arrayNames = paste(arrayNames, position, sep=sep)
39
+			       if(highDensity) {
40
+				       hdExt = list(A="R01C01", B="R01C02", C="R02C01", D="R02C02")
41
+				       for(i in names(hdExt))
42
+					       arrayNames = sub(paste(sep, i, sep=""), paste(sep, hdExt[[i]], sep=""), arrayNames)
43
+			       }
44
+		       }
45
+	       }
46
+	       pd = new("AnnotatedDataFrame", data = sampleSheet)
47
+	       sampleNames(pd) <- basename(arrayNames)
48
+       }
49
+       if(is.null(arrayNames)) {
50
+               arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path))
51
+               if(!is.null(sampleSheet)) {
52
+                      sampleSheet=NULL
53
+                      cat("Could not find required info in \'sampleSheet\' - ignoring.  Check \'sampleSheet\' and/or \'arrayInfoColNames\'\n")
54
+               }
55
+               pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
56
+       }
57
+       narrays = length(arrayNames)
58
+       grnfiles = paste(arrayNames, fileExt$green, sep=sep)
59
+       redfiles = paste(arrayNames, fileExt$red, sep=sep)
60
+       if(length(grnfiles)==0 || length(redfiles)==0)
61
+	       stop("Cannot find .idat files")
62
+       if(length(grnfiles)!=length(redfiles))
63
+	       stop("Cannot find matching .idat files")
64
+       if(path[1] != "."){
65
+	       grnidats = file.path(path, grnfiles)
66
+	       redidats = file.path(path, redfiles)
67
+       }  else {
68
+	       message("path arg not set.  Assuming files are in local directory")
69
+	       grnidats <- grnfiles
70
+	       redidats <- redfiles
71
+       }
72
+       if(!all(file.exists(grnidats))) stop("Missing some of the *Grn.idat files")
73
+       if(!all(file.exists(redidats))) stop("Missing some of the *Red.idat files")
74
+##       if(!all(c(redfiles,grnfiles) %in% dir(path=path))){
75
+##	       stop("Missing .idat files: red\n", paste(redfiles[!(redfiles %in% dir(path=path))], sep=" "), "\n green\n",
76
+##		    paste(grnfiles[!(grnfiles %in% dir(path=path))], sep=" "))
77
+##       }
78
+       headerInfo = list(nProbes = rep(NA, narrays),
79
+                         Barcode = rep(NA, narrays),
80
+                         ChipType = rep(NA, narrays),
81
+                         Manifest = rep(NA, narrays), # not sure about this one - sometimes blank
82
+                         Position = rep(NA, narrays)) # this may also vary a bit
83
+       dates = list(decode=rep(NA, narrays),
84
+                    scan=rep(NA, narrays))
85
+       ## read in the data
86
+       for(i in seq(along=arrayNames)) {
87
+	       cat("reading", arrayNames[i], "\t")
88
+	       idsG = idsR = G = R = NULL
89
+	       cat(paste(sep, fileExt$green, sep=""), "\t")
90
+	       G = readIDAT(grnidats[i])
91
+	       idsG = rownames(G$Quants)
92
+	       headerInfo$nProbes[i] = G$nSNPsRead
93
+	       headerInfo$Barcode[i] = G$Barcode
94
+	       headerInfo$ChipType[i] = G$ChipType
95
+	       headerInfo$Manifest[i] = G$Unknown$MostlyNull
96
+	       headerInfo$Position[i] = G$Unknowns$MostlyA
97
+               if(headerInfo$nProbes[i]>(headerInfo$nProbes[1]+10000) || headerInfo$nProbes[i]<(headerInfo$nProbes[1]-10000)) {
98
+                       ## headerInfo$ChipType[i]!=headerInfo$ChipType[1] || headerInfo$Manifest[i]!=headerInfo$Manifest[1]) {
99
+		       ## || headerInfo$nProbes[i]!=headerInfo$nProbes[1] ## removed this condition as some arrays used the same manifest
100
+		       ## but differed by a few SNPs for some reason - most of the chip was the same though
101
+		       ##           stop("Chips are not of all of the same type - please check your data")
102
+		       warning("Chips are not of the same type.  Skipping ", basename(grnidats[i]), " and ", basename(redidats[i]))
103
+		       next()
104
+	       }
105
+	       dates$decode[i] = G$RunInfo[1, 1]
106
+	       dates$scan[i] = G$RunInfo[2, 1]
107
+	       if(i==1) {
108
+		       if(is.null(ids) && !is.null(G)){
109
+			       ids = idsG
110
+		       }else stop("Could not find probe IDs")
111
+		       nprobes = length(ids)
112
+		       narrays = length(arrayNames)
113
+		       tmpmat = matrix(NA, nprobes, narrays)
114
+		       rownames(tmpmat) = ids
115
+		       if(!is.null(sampleSheet)){
116
+			       colnames(tmpmat) = sampleSheet$Sample_ID
117
+		       }else  colnames(tmpmat) = arrayNames
118
+		       RG <- new("NChannelSet",
119
+				 R=tmpmat,
120
+				 G=tmpmat,
121
+				 zero=tmpmat,
122
+#				 Rnb=tmpmat,
123
+#				 Gnb=tmpmat,
124
+#				 Rse=tmpmat,
125
+#				 Gse=tmpmat,
126
+				 annotation=headerInfo$Manifest[1],
127
+				 phenoData=pd,
128
+				 storage.mode="environment")
129
+		       rm(tmpmat)
130
+		       gc()
131
+	       }
132
+	       if(length(ids)==length(idsG)) {
133
+		       if(sum(ids==idsG)==nprobes) {
134
+			       RG@assayData$G[,i] = G$Quants[, "Mean"]
135
+				   zeroG = G$Quants[, "NBeads"]==0
136
+#			       RG@assayData$Gnb[,i] = G$Quants[, "NBeads"]
137
+#			       RG@assayData$Gse[,i] = G$Quants[, "SD"]
138
+		       }
139
+	       } else {
140
+		       indG = match(ids, idsG)
141
+		       RG@assayData$G[,i] = G$Quants[indG, "Mean"]
142
+			   zeroG = G$Quants[indG, "NBeads"]==0
143
+#		       RG@assayData$Gnb[,i] = G$Quants[indG, "NBeads"]
144
+#		       RG@assayData$Gse[,i] = G$Quants[indG, "SD"]
145
+	       }
146
+	       rm(G)
147
+	       gc()
148
+
149
+	       cat(paste(sep, fileExt$red, sep=""), "\n")
150
+	       R = readIDAT(redidats[i])
151
+	       idsR = rownames(R$Quants)
15 152
 
153
+	       if(length(ids)==length(idsG)) {
154
+		       if(sum(ids==idsR)==nprobes) {
155
+			       RG@assayData$R[,i] = R$Quants[ ,"Mean"]
156
+				   zeroR = R$Quants[ ,"NBeads"]==0
157
+#			       RG@assayData$Rnb[,i] = R$Quants[ ,"NBeads"]
158
+#			       RG@assayData$Rse[,i] = R$Quants[ ,"SD"]
159
+		       }
160
+	       } else {
161
+		       indR = match(ids, idsR)
162
+		       RG@assayData$R[,i] = R$Quants[indR, "Mean"]
163
+			   zeroR = R$Quants[indR, "NBeads"]==0
164
+#		       RG@assayData$Rnb[,i] = R$Quants[indR, "NBeads"]
165
+#		       RG@assayData$Rse[,i] = R$Quants[indR, "SD"]
166
+	       }
167
+		   RG@assayData$zero[,i] = zeroG | zeroR
168
+	       rm(R, zeroG, zeroR)
169
+	       gc()
170
+       }
171
+       if(saveDate) {
172
+	       protocolData(RG)[["ScanDate"]] = dates$scan
173
+       }
174
+       storageMode(RG) = "lockedEnvironment"
175
+       RG
176
+}
177
+
178
+
179
+readIdatFiles2 <- function(sampleSheet=NULL,
180
+			  arrayNames=NULL,
181
+			  ids=NULL,
182
+			  path=".",
183
+			  arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
184
+			  highDensity=FALSE,
185
+			  sep="_",
186
+			  fileExt=list(green="Grn.idat", red="Red.idat"),
187
+			  saveDate=FALSE) {
188
+#       if(!is.null(arrayNames)) {
189
+#	       arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path))
190
+#	       if(!is.null(sampleSheet)) {
191
+#		       sampleSheet=NULL
192
+#		       cat("Could not find required info in \'sampleSheet\' - ignoring.  Check \'sampleSheet\' and/or \'arrayInfoColNames\'\n")
193
+#	       }
194
+#	       pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
195
+#       }
16 196
        if(!is.null(arrayNames)) {
17 197
                pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
18 198
        }
... ...
@@ -64,6 +244,10 @@ readIdatFiles <- function(sampleSheet=NULL,
64 244
        }
65 245
        if(!all(file.exists(grnidats))) stop("Missing some of the *Grn.idat files")
66 246
        if(!all(file.exists(redidats))) stop("Missing some of the *Red.idat files")
247
+##       if(!all(c(redfiles,grnfiles) %in% dir(path=path))){
248
+##	       stop("Missing .idat files: red\n", paste(redfiles[!(redfiles %in% dir(path=path))], sep=" "), "\n green\n",
249
+##		    paste(grnfiles[!(grnfiles %in% dir(path=path))], sep=" "))
250
+##       }
67 251
        headerInfo = list(nProbes = rep(NA, narrays),
68 252
                          Barcode = rep(NA, narrays),
69 253
                          ChipType = rep(NA, narrays),
... ...
@@ -410,6 +594,113 @@ readBPM <- function(bpmFile){
410 594
 }
411 595
 
412 596
 
597
+RGtoXY = function(RG, chipType, verbose=TRUE) {
598
+  chipList = c("human1mv1c",             # 1M
599
+               "human370v1c",            # 370CNV
600
+               "human650v3a",            # 650Y
601
+               "human610quadv1b",        # 610 quad
602
+               "human660quadv1a",        # 660 quad
603
+               "human370quadv3c",        # 370CNV quad
604
+               "human550v3b",            # 550K
605
+               "human1mduov3b",          # 1M Duo
606
+               "humanomni1quadv1b",      # Omni1 quad
607
+               "humanomniexpress12v1b") # Omni express 12
608
+  if(missing(chipType)){
609
+	  chipType = match.arg(annotation(RG), chipList)
610
+  } else chipType = match.arg(chipType, chipList)
611
+
612
+  pkgname <- getCrlmmAnnotationName(chipType)
613
+  if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
614
+     suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
615
+     msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
616
+     message(strwrap(msg))
617
+     stop("Package ", pkgname, " could not be found.")
618
+     rm(suggCall, msg)
619
+  }
620
+  if(verbose) message("Loading chip annotation information.")
621
+    loader("address.rda", .crlmmPkgEnv, pkgname)
622
+#    data(annotation, package=pkgname, envir=.crlmmPkgEnv)
623
+  aids <- getVarInEnv("addressA") # comes from AddressA_ID or Address column in manifest
624
+  bids <- getVarInEnv("addressB") # comes from AddressB_ID or Address2 column in manifest
625
+  ids <- names(aids)
626
+  snpbase <- getVarInEnv("base")
627
+
628
+  nsnps = length(aids)
629
+  narrays = ncol(RG)
630
+
631
+#  aidcol = match("AddressA_ID", colnames(annot))
632
+#  if(is.na(aidcol))
633
+#    aidcol = match("Address", colnames(annot))
634
+#  bidcol = match("AddressB_ID", colnames(annot))
635
+#  if(is.na(bidcol))
636
+#    bidcol = match("Address2", colnames(annot))
637
+#  aids = annot[, aidcol]
638
+#  bids = annot[, bidcol]
639
+#  snpids = annot[,"Name"]
640
+#  snpbase = annot[,"SNP"]
641
+  infI = !is.na(bids) & bids!=0
642
+  aord = match(aids, featureNames(RG)) # NAs are possible here
643
+  bord = match(bids, featureNames(RG)) # and here
644
+#  argrg = aids[rrgg]
645
+#  brgrg = bids[rrgg]
646
+
647
+  tmpmat = matrix(as.integer(0), nsnps, narrays)
648
+  rownames(tmpmat) = ids
649
+  colnames(tmpmat) = sampleNames(RG)
650
+  XY = new("NChannelSet", X=tmpmat, Y=tmpmat, zero=tmpmat, # Xnb=tmpmat, Ynb=tmpmat, Xse=tmpmat, Yse=tmpmat, zero=tmpmat,
651
+                 annotation=chipType, phenoData=RG@phenoData, protocolData=RG@protocolData, storage.mode="environment")
652
+  rm(tmpmat)
653
+  gc()
654
+
655
+  # First sort out Infinium II SNPs, X -> R (allele A)  and Y -> G (allele B) from the same probe
656
+  XY@assayData$X[!is.na(aord),] = exprs(channel(RG, "R"))[aord[!is.na(aord)],] # mostly red
657
+  XY@assayData$Y[!is.na(aord),] = exprs(channel(RG, "G"))[aord[!is.na(aord)],] # mostly green
658
+  XY@assayData$zero[!is.na(aord),] = exprs(channel(RG, "zero"))[aord[!is.na(aord)],] # mostly green
659
+#  XY@assayData$Xnb[!is.na(aord),] = exprs(channel(RG, "Rnb"))[aord[!is.na(aord)],]
660
+#  XY@assayData$Ynb[!is.na(aord),] = exprs(channel(RG, "Gnb"))[aord[!is.na(aord)],]
661
+#  XY@assayData$Xse[!is.na(aord),] = exprs(channel(RG, "Rse"))[aord[!is.na(aord)],]
662
+#  XY@assayData$Yse[!is.na(aord),] = exprs(channel(RG, "Gse"))[aord[!is.na(aord)],]
663
+  gc()
664
+
665
+  ## Warning - not 100% sure that the code below is correct - could be more complicated than this
666
+
667
+  # Next Infinium I where X -> R from allele A probe and Y -> R from allele B probe
668
+#  infIRR = infI & (snpbase=="[A/T]" | snpbase=="[T/A]" | snpbase=="[a/t]" | snpbase=="[t/a]")
669
+
670
+#  X[infIRR,] = exprs(channel(RG, "R"))[aord[infIRR],] # mostly red
671
+#  Y[infIRR,] = exprs(channel(RG, "R"))[bord[infIRR],] # mostly green
672
+#  Xnb[infIRR,] = exprs(channel(RG, "Rnb"))[aord[infIRR],]
673
+#  Ynb[infIRR,] = exprs(channel(RG, "Rnb"))[bord[infIRR],]
674
+#  Xse[infIRR,] = exprs(channel(RG, "Rse"))[aord[infIRR],]
675
+#  Yse[infIRR,] = exprs(channel(RG, "Rse"))[bord[infIRR],]
676
+
677
+  # Finally Infinium I where X -> G from allele A probe and Y -> G from allele B probe
678
+#  infIGG = infI & (snpbase=="[C/G]" | snpbase=="[G/C]" | snpbase=="[g/c]" | snpbase=="[c/g]")
679
+
680
+#  X[infIGG,] = exprs(channel(RG, "G"))[aord[infIGG],] # mostly red
681
+#  Y[infIGG,] = exprs(channel(RG, "G"))[bord[infIGG],] # mostly green
682
+#  Xnb[infIGG,] = exprs(channel(RG, "Gnb"))[aord[infIGG],]
683
+#  Ynb[infIGG,] = exprs(channel(RG, "Gnb"))[bord[infIGG],]
684
+#  Xse[infIGG,] = exprs(channel(RG, "Gse"))[aord[infIGG],]
685
+#  Yse[infIGG,] = exprs(channel(RG, "Gse"))[bord[infIGG],]
686
+
687
+  #  For now zero out Infinium I probes
688
+  XY@assayData$X[infI,] = 0
689
+  XY@assayData$Y[infI,] = 0
690
+  XY@assayData$zero[infI,] = 0
691
+#  XY@assayData$Xnb[infI,] = 0
692
+#  XY@assayData$Ynb[infI,] = 0
693
+#  XY@assayData$Xse[infI,] = 0
694
+#  XY@assayData$Yse[infI,] = 0
695
+
696
+#  XY@assayData$zero[XY@assayData$X==0 | XY@assayData$Y==0] = 1
697
+  gc()
698
+
699
+#  storageMode(XY) = "lockedEnvironment"
700
+  XY
701
+}
702
+
703
+
413 704
 RGtoXY = function(RG, chipType, verbose=TRUE) {
414 705
   chipList = c("human1mv1c",             # 1M
415 706
                "human370v1c",            # 370CNV
... ...
@@ -472,9 +763,6 @@ RGtoXY = function(RG, chipType, verbose=TRUE) {
472 763
   XY@assayData$Y[1:nsnps,] = 0
473 764
   XY@assayData$zero[1:nsnps,] = 0
474 765
 
475
-  open(RG@assayData$R)
476
-  open(RG@assayData$G)
477
-  open(RG@assayData$zero)
478 766
 
479 767
   # First sort out Infinium II SNPs, X -> R (allele A)  and Y -> G (allele B) from the same probe
480 768
   XY@assayData$X[!is.na(aord),] = exprs(channel(RG, "R"))[aord[!is.na(aord)],] # mostly red
... ...
@@ -482,10 +770,6 @@ RGtoXY = function(RG, chipType, verbose=TRUE) {
482 770
   XY@assayData$zero[!is.na(aord),] = exprs(channel(RG, "zero"))[aord[!is.na(aord)],] # mostly green
483 771
   gc()
484 772
 
485
-  close(RG@assayData$R)
486
-  close(RG@assayData$G)
487
-  close(RG@assayData$zero)
488
-
489 773
   ## Warning - not 100% sure that the code below is correct - could be more complicated than this
490 774
 
491 775
   # Next Infinium I where X -> R from allele A probe and Y -> R from allele B probe
... ...
@@ -493,17 +777,31 @@ RGtoXY = function(RG, chipType, verbose=TRUE) {
493 777
 
494 778
 #  X[infIRR,] = exprs(channel(RG, "R"))[aord[infIRR],] # mostly red
495 779
 #  Y[infIRR,] = exprs(channel(RG, "R"))[bord[infIRR],] # mostly green
780
+#  Xnb[infIRR,] = exprs(channel(RG, "Rnb"))[aord[infIRR],]
781
+#  Ynb[infIRR,] = exprs(channel(RG, "Rnb"))[bord[infIRR],]
782
+#  Xse[infIRR,] = exprs(channel(RG, "Rse"))[aord[infIRR],]
783
+#  Yse[infIRR,] = exprs(channel(RG, "Rse"))[bord[infIRR],]
496 784
 
497 785
   # Finally Infinium I where X -> G from allele A probe and Y -> G from allele B probe
498 786
 #  infIGG = infI & (snpbase=="[C/G]" | snpbase=="[G/C]" | snpbase=="[g/c]" | snpbase=="[c/g]")
499 787
 
500 788
 #  X[infIGG,] = exprs(channel(RG, "G"))[aord[infIGG],] # mostly red
501 789
 #  Y[infIGG,] = exprs(channel(RG, "G"))[bord[infIGG],] # mostly green
790
+#  Xnb[infIGG,] = exprs(channel(RG, "Gnb"))[aord[infIGG],]
791
+#  Ynb[infIGG,] = exprs(channel(RG, "Gnb"))[bord[infIGG],]
792
+#  Xse[infIGG,] = exprs(channel(RG, "Gse"))[aord[infIGG],]
793
+#  Yse[infIGG,] = exprs(channel(RG, "Gse"))[bord[infIGG],]
502 794
 
503 795
   #  For now zero out Infinium I probes
504 796
   XY@assayData$X[infI,] = 0
505 797
   XY@assayData$Y[infI,] = 0
506 798
   XY@assayData$zero[infI,] = 0
799
+#  XY@assayData$Xnb[infI,] = 0
800
+#  XY@assayData$Ynb[infI,] = 0
801
+#  XY@assayData$Xse[infI,] = 0
802
+#  XY@assayData$Yse[infI,] = 0
803
+
804
+#  XY@assayData$zero[XY@assayData$X==0 | XY@assayData$Y==0] = 1
507 805
   gc()
508 806
 
509 807
 #  if(class(XY@assayData$X)[1]=="ff_matrix") {
... ...
@@ -534,11 +832,14 @@ stripNormalize = function(XY, useTarget=TRUE, verbose=TRUE) {
534 832
   if(useTarget)
535 833
     targetdist = getVarInEnv("reference")
536 834
 
835
+#  Xqws = Yqws = matrix(0, nrow(XY), ncol(XY))
836
+#  colnames(Xqws) = colnames(Yqws) = sampleNames(XY) #$X
837
+#  rownames(Xqws) = rownames(Yqws) = featureNames(XY)
838
+
537 839
   if(verbose){
538 840
     message("Quantile normalizing ", ncol(XY), " arrays by ", max(stripnum), " strips.")
539 841
     if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=max(stripnum), style=3)
540 842
   }
541
-
542 843
 #  if(class(XY@assayData$X)[1]=="ff_matrix") {
543 844
     open(XY@assayData$X)
544 845
     open(XY@assayData$Y)
... ...
@@ -574,6 +875,146 @@ stripNormalize = function(XY, useTarget=TRUE, verbose=TRUE) {
574 875
 
575 876
 
576 877
 preprocessInfinium2 <- function(XY, mixtureSampleSize=10^5,
878
+				fitMixture=TRUE,
879
+				eps=0.1,
880
+				verbose=TRUE,
881
+				seed=1,
882
+				cdfName,
883
+				sns,
884
+				stripNorm=TRUE,
885
+				useTarget=TRUE,
886
+				save.it=FALSE,
887
+				snpFile,
888
+				cnFile) {
889
+  if(stripNorm)
890
+    XY = stripNormalize(XY, useTarget=useTarget, verbose=verbose)
891
+
892
+## MR: the code below is mostly straight from snprma.R
893
+  if (missing(sns)) sns <- sampleNames(XY) #$X
894
+  if(missing(cdfName))
895
+    cdfName <- annotation(XY)
896
+##  stuffDir <- changeToCrlmmAnnotationName(cdfName)
897
+  pkgname <- getCrlmmAnnotationName(cdfName)
898
+  if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
899
+    suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
900
+    msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
901
+    message(strwrap(msg))
902
+    stop("Package ", pkgname, " could not be found.")
903
+    rm(suggCall, msg)
904
+  }
905
+  if(verbose) message("Loading snp annotation and mixture model parameters.")
906
+  loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
907
+  loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
908
+  loader("snpProbesFid.rda", .crlmmPkgEnv, pkgname)
909
+  loader("npProbesFid.rda", .crlmmPkgEnv, pkgname)
910
+#  data(genotypeStuff, mixtureStuff, package=pkgname, envir=.crlmmPkgEnv)
911
+  autosomeIndex <- getVarInEnv("autosomeIndex")
912
+  #  pnsa <- getVarInEnv("pnsa")
913
+  #  pnsb <- getVarInEnv("pnsb")
914
+  #  fid <- getVarInEnv("fid")
915
+  #  reference <- getVarInEnv("reference")
916
+  #  aIndex <- getVarInEnv("aIndex")
917
+  #  bIndex <- getVarInEnv("bIndex")
918
+  SMEDIAN <- getVarInEnv("SMEDIAN")
919
+  theKnots <- getVarInEnv("theKnots")
920
+  gns <- getVarInEnv("gns")
921
+
922
+  # separate out copy number probes
923
+  npIndex = getVarInEnv("npProbesFid")
924
+  nprobes = length(npIndex)
925
+  if(length(nprobes)>0) {
926
+    narrays = ncol(XY)
927
+    A <- matrix(as.integer(exprs(channel(XY, "X"))[npIndex,]), nprobes, narrays)
928
+    B <- matrix(as.integer(exprs(channel(XY, "Y"))[npIndex,]), nprobes, narrays)
929
+
930
+    # new lines below - useful to keep track of zeroed out probes
931
+    zero <- matrix(as.integer(exprs(channel(XY, "zero"))[npIndex,]), nprobes, narrays)
932
+
933
+    colnames(A) <- colnames(B) <- colnames(zero) <- sns
934
+    rownames(A) <- rownames(B) <- rownames(zero) <- names(npIndex)
935
+
936
+    cnAB = list(A=A, B=B, zero=zero, sns=sns, gns=names(npIndex), cdfName=cdfName)
937
+    if(save.it & !missing(cnFile)) {
938
+      t0 <- proc.time()
939
+      save(cnAB, file=cnFile)
940
+      t0 <- proc.time()-t0
941
+      if(verbose) message("Used ", round(t0[3],1), " seconds to save ", cnFile, ".")
942
+    }
943
+    rm(cnAB, B, zero)
944
+  }
945
+
946
+  # next process snp probes
947
+  snpIndex = getVarInEnv("snpProbesFid")
948
+  nprobes <- length(snpIndex)
949
+
950
+  ##We will read each cel file, summarize, and run EM one by one
951
+  ##We will save parameters of EM to use later
952
+  mixtureParams <- matrix(0, 4, narrays)
953
+  SNR <- vector("numeric", narrays)
954
+  SKW <- vector("numeric", narrays)
955
+
956
+  ## This is the sample for the fitting of splines
957
+  ## BC: I like better the idea of the user passing the seed,
958
+  ##     because this might intefere with other analyses
959
+  ##     (like what happened to GCRMA)
960
+  set.seed(seed)
961
+  idx <- sort(sample(autosomeIndex, mixtureSampleSize))
962
+  idx2 <- sample(nprobes, 10^5)
963
+
964
+  ##S will hold (A+B)/2 and M will hold A-B
965
+  ##NOTE: We actually dont need to save S. Only for pics etc...
966
+  ##f is the correction. we save to avoid recomputing
967
+  A <- matrix(as.integer(exprs(channel(XY, "X"))[snpIndex,]), nprobes, narrays) # matrix(as.integer(0), length(pnsa), length(filenames))
968
+  B <- matrix(as.integer(exprs(channel(XY, "Y"))[snpIndex,]), nprobes, narrays) # matrix(as.integer(0), length(pnsb), length(filenames))
969
+
970
+  # new lines below - useful to keep track of zeroed out SNPs
971
+  zero <- matrix(as.integer(exprs(channel(XY, "zero"))[snpIndex,]), nprobes, narrays)
972
+
973
+  colnames(A) <- colnames(B) <- colnames(zero) <- sns
974
+  rownames(A) <- rownames(B) <- rownames(zero) <- names(snpIndex) # gns # featureNames(XY)
975
+
976
+  if(verbose){
977
+     message("Calibrating ", narrays, " arrays.")
978
+     if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=narrays, style=3)
979
+  }
980
+
981
+
982
+  for(i in 1:narrays){
983
+    SKW[i] = mean((A[idx2,i]-mean(A[idx2,i]))^3)/(sd(A[idx2,i])^3)
984
+    if(fitMixture){
985
+      S <- (log2(A[idx, i])+log2(B[idx, i]))/2 - SMEDIAN
986
+      M <- log2(A[idx, i])-log2(B[idx, i])
987
+
988
+      ##we need to test the choice of eps.. it is not the max diff between funcs
989
+      tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps)
990
+
991
+      mixtureParams[, i] <- tmp[["coef"]]
992
+      SNR[i] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2)
993
+    }
994
+    if(verbose) {
995
+       if (getRversion() > '2.7.0') setTxtProgressBar(pb, i)
996
+       else cat(".")
997
+    }
998
+  }
999
+  if (verbose) {
1000
+    if (getRversion() > '2.7.0') close(pb)
1001
+    else cat("\n")
1002
+  }
1003
+  if (!fitMixture) SNR <- mixtureParams <- NA
1004
+  ## gns comes from preprocStuff.rda
1005
+  res = list(A=A, B=B, zero=zero, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName)
1006
+
1007
+  if(save.it & !missing(snpFile)) {
1008
+    t0 <- proc.time()
1009
+    save(res, file=snpFile)
1010
+    t0 <- proc.time()-t0
1011
+    if(verbose) message("Used ", round(t0[3],1), " seconds to save ", snpFile, ".")
1012
+  }
1013
+  return(res)
1014
+}
1015
+
1016
+
1017
+preprocessInfinium2v2 <- function(XY, mixtureSampleSize=10^5,
577 1018
 				fitMixture=TRUE,
578 1019
 				eps=0.1,
579 1020
 				verbose=TRUE,
... ...
@@ -625,10 +1066,6 @@ preprocessInfinium2 <- function(XY, mixtureSampleSize=10^5,
625 1066
       # new lines below - useful to keep track of zeroed out probes
626 1067
       zero <- matrix(as.integer(exprs(channel(XY, "zero"))[npIndex,]), nprobes, narrays)
627 1068
 
628
-      close(XY@assayData$X)
629
-      close(XY@assayData$Y)
630
-      close(XY@assayData$zero)
631
-
632 1069
       colnames(A) <- colnames(B) <- colnames(zero) <- sns
633 1070
       rownames(A) <- rownames(B) <- rownames(zero) <- names(npIndex)
634 1071
 
... ...
@@ -651,6 +1088,9 @@ preprocessInfinium2 <- function(XY, mixtureSampleSize=10^5,
651 1088
   mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, narrays, "double")
652 1089
   SNR <- initializeBigVector("crlmmSNR-", narrays, "double")
653 1090
   SKW <- initializeBigVector("crlmmSKW-", narrays, "double")
1091
+#  mixtureParams <- matrix(0, 4, narrays)
1092
+#  SNR <- vector("numeric", narrays)
1093
+#  SKW <- vector("numeric", narrays)
654 1094
 
655 1095
   ## This is the sample for the fitting of splines
656 1096
   ## BC: I like better the idea of the user passing the seed,
... ...
@@ -663,6 +1103,25 @@ preprocessInfinium2 <- function(XY, mixtureSampleSize=10^5,
663 1103
   ##S will hold (A+B)/2 and M will hold A-B
664 1104
   ##NOTE: We actually dont need to save S. Only for pics etc...
665 1105
   ##f is the correction. we save to avoid recomputing
1106
+#  A <- exprs(channel(XY, "X"))[,] # ), nrow(XY), narrays) # [snpIndex,]), nprobes, narrays) # matrix(as.integer(0), length(pnsa), length(filenames))
1107
+#  B <- exprs(channel(XY, "Y"))[,] # ), nrow(XY), narrays) # [snpIndex,]), nprobes, narrays) # matrix(as.integer(0), length(pnsb), length(filenames))
1108
+
1109
+  # new lines below - useful to keep track of zeroed out SNPs
1110
+#  zero <- exprs(channel(XY, "zero"))[,] # )) #[snpIndex,]), nprobes, narrays)
1111
+
1112
+#  if(!is.matrix(A)) {
1113
+#     A = A[,]
1114
+#     B = B[,]
1115
+#     zero = zero[,]
1116
+#  }
1117
+
1118
+#  if(!is.integer(A)) {
1119
+#     A = matrix(as.integer(A), nrow(A), ncol(A))
1120
+#     B = matrix(as.integer(B), nrow(B), ncol(B))
1121
+#  }
1122
+
1123
+#  colnames(A) <- colnames(B) <- colnames(zero) <- sns
1124
+#  rownames(A) <- rownames(B) <- rownames(zero) <- names(snpIndex) # gns # featureNames(XY)
666 1125
 
667 1126
   A <- initializeBigMatrix("crlmmA-", nprobes, narrays, "integer")
668 1127
   B <- initializeBigMatrix("crlmmB-", nprobes, narrays, "integer")
... ...
@@ -705,10 +1164,12 @@ preprocessInfinium2 <- function(XY, mixtureSampleSize=10^5,
705 1164
   if (!fitMixture) SNR <- mixtureParams <- NA
706 1165
   ## gns comes from preprocStuff.rda
707 1166
 
708
-  close(XY@assayData$X)
709
-  close(XY@assayData$Y)
710
-  close(XY@assayData$zero)
711
-
1167
+  if(save.it & !missing(snpFile)) {
1168
+    t0 <- proc.time()
1169
+    save(res, file=snpFile)
1170
+    t0 <- proc.time()-t0
1171
+    if(verbose) message("Used ", round(t0[3],1), " seconds to save ", snpFile, ".")
1172
+  }
712 1173
 #  if(class(A)[1]=="ff_matrix") {
713 1174
     close(A)
714 1175
     close(B)
... ...
@@ -774,6 +1235,61 @@ crlmmIllumina <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
774 1235
     res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
775 1236
                         seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget,
776 1237
                         save.it=save.it, snpFile=snpFile, cnFile=cnFile)
1238
+  }else{
1239
+      if(verbose) message("Loading ", snpFile, ".")
1240
+        obj <- load(snpFile)
1241
+        if(verbose) message("Done.")
1242
+        if(!any(obj == "res"))
1243
+          stop("Object in ", snpFile, " seems to be invalid.")
1244
+  }
1245
+  if(row.names) row.names=res[["gns"]] else row.names=NULL
1246
+  if(col.names) col.names=res[["sns"]] else col.names=NULL
1247
+
1248
+  res2 <- crlmmGT(res[["A"]], res[["B"]], res[["SNR"]],
1249
+                  res[["mixtureParams"]], res[["cdfName"]],
1250
+                  gender=gender, row.names=row.names,
1251
+                  col.names=col.names, recallMin=recallMin,
1252
+                  recallRegMin=1000, SNRMin=SNRMin,
1253
+                  returnParams=returnParams, badSNP=badSNP,
1254
+                  verbose=verbose)
1255
+
1256
+  res2[["SNR"]] <- res[["SNR"]]
1257
+  res2[["SKW"]] <- res[["SKW"]]
1258
+  rm(res)
1259
+  gc()
1260
+  return(list2SnpSet(res2, returnParams=returnParams)) # return(res2)
1261
+}
1262
+
1263
+
1264
+crlmmIllumina2 <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
1265
+                  row.names=TRUE, col.names=TRUE,
1266
+                  probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
1267
+                  seed=1, save.it=FALSE, load.it=FALSE, snpFile, cnFile,
1268
+                  mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
1269
+                  cdfName, sns, recallMin=10, recallRegMin=1000,
1270
+                  returnParams=FALSE, badSNP=.7) {
1271
+  if (save.it & (missing(snpFile) | missing(cnFile)))
1272
+    stop("'snpFile' and/or 'cnFile' is missing and you chose to save.it")
1273
+  if (load.it & missing(snpFile))
1274
+    stop("'snpFile' is missing and you chose to load.it")
1275
+  if (!missing(snpFile))
1276
+    if (load.it & !file.exists(snpFile)){
1277
+      load.it <- FALSE
1278
+      message("File ", snpFile, " does not exist.")
1279
+      stop("Cannot load SNP data.")
1280
+  }
1281
+  if (!load.it){
1282
+    if(!missing(RG)) {
1283
+      if(missing(XY))
1284
+        XY = RGtoXY2(RG, chipType=cdfName)
1285
+      else
1286
+        stop("Both RG and XY specified - please use one or the other")
1287
+    }
1288
+    if (missing(sns)) sns <- sampleNames(XY) #$X
1289
+
1290
+    res = preprocessInfinium2v2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
1291
+                        seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget) #,
1292
+                      #  save.it=save.it, snpFile=snpFile, cnFile=cnFile)
777 1293
 
778 1294
 #    fD = featureData(XY)
779 1295
 #    phenD = XY@phenoData
... ...
@@ -801,12 +1317,7 @@ crlmmIllumina <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
801 1317
           stop("Object in ", snpFile, " seems to be invalid.")
802 1318
   }
803 1319
 
804
-    open(res[["A"]])
805
-    open(res[["B"]])
806
-    open(res[["SNR"]])
807
-    open(res[["mixtureParams"]])
808
-
809
-#    rm(phenD, protD , fD)
1320
+ #   rm(phenD, protD , fD)
810 1321
 
811 1322
 #    snp.index <- res$snpIndex #match(res$gns, featureNames(callSet))
812 1323
 #    suppressWarnings(A(callSet) <- res[["A"]])
... ...
@@ -867,7 +1378,7 @@ crlmmIlluminaV2 = function(sampleSheet=NULL,
867 1378
 #                          rgFile,
868 1379
 			  stripNorm=TRUE,
869 1380
 			  useTarget=TRUE,
870
-			  row.names=TRUE,
1381
+          	          row.names=TRUE,
871 1382
 			  col.names=TRUE,
872 1383
 			  probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
873 1384
                           seed=1, save.it=FALSE, snpFile, cnFile,
... ...
@@ -1,11 +1,15 @@
1 1
 setMethod("Ns", signature(object="AssayData"),
2
-	  function(object, i, j, ...){
3
-		  if(!missing(j)){
4
-			  batchnames <- unique(as.character(batch(object)[j]))
5
-		  } else batchnames <- batchNames(object)
6
-		  nc <- length(batchnames)
7
-		  if(!missing(i)) nr <- length(i) else nr <- nrow(object)
8
-		  res <- array(NA, dim=c(nr, 3, nc))
2
+	  function(object, ...){
3
+		  dotArgs <- names(list(...))
4
+		  missing.i <- !("i" %in% dotArgs)
5
+		  missing.j <- !("j" %in% dotArgs)
6
+		  batchnames <- batchNames(object)
7
+		  if(!missing.j) batchnames <- batchnames[j]
8
+		  if(missing.i & missing.j) stop("Must specify either the rows i or batches j")
9
+		  N.AA <- as.matrix(assayDataElement(object, "N.AA")[...])
10
+		  N.AB <- as.matrix(assayDataElement(object, "N.AB")[...])
11
+		  N.BB <- as.matrix(assayDataElement(object, "N.BB")[...])
12
+		  res <- array(NA, dim=c(nrow(N.AA), 3, ncol(N.AA)))
9 13
 		  dimnames(res)[[2]] <- c("AA", "AB", "BB")
10 14
 		  dimnames(res)[[3]] <- batchnames
11 15
 		  if(missing(i) & missing(j)){
... ...
@@ -39,12 +43,16 @@ setMethod("Ns", signature(object="AssayData"),
39 43
 	  })
40 44
 setMethod("corr", signature(object="AssayData"),
41 45
 	  function(object, i, j, ...){
42
-		  if(!missing(j)){
43
-			  batchnames <- unique(as.character(batch(object)[j]))
44
-		  } else batchnames <- batchNames(object)
45
-		  nc <- length(batchnames)
46
-		  if(!missing(i)) nr <- length(i) else nr <- nrow(object)
47
-		  res <- array(NA, dim=c(nr, 3, nc))
46
+		  dotArgs <- names(list(...))
47
+		  missing.i <- !("i" %in% dotArgs)
48
+		  missing.j <- !("j" %in% dotArgs)
49
+		  batchnames <- batchNames(object)
50
+		  if(!missing.j) batchnames <- batchnames[j]
51
+		  if(missing.i & missing.j) stop("Must specify either the rows i or batches j")
52
+		  corrAA <- as.matrix(assayDataElement(object, "corrAA")[...])
53
+		  corrAB <- as.matrix(assayDataElement(object, "corrAB")[...])
54
+		  corrBB <- as.matrix(assayDataElement(object, "corrBB")[...])
55
+		  res <- array(NA, dim=c(nrow(corrAA), 3, ncol(corrAA)))
48 56
 		  dimnames(res)[[2]] <- c("AA", "AB", "BB")
49 57
 		  dimnames(res)[[3]] <- batchnames
50 58
 		  if(missing(i) & missing(j)){
... ...
@@ -78,13 +86,20 @@ setMethod("corr", signature(object="AssayData"),
78 86
 	  })
79 87
 
80 88
 setMethod("medians", signature(object="AssayData"),
81
-	  function(object, i, j, ...){
82
-		  if(!missing(j)){
83
-			  batchnames <- unique(as.character(batch(object)[j]))
84
-		  } else batchnames <- batchNames(object)
85
-		  nc <- length(batchnames)
86
-		  if(!missing(i)) nr <- length(i) else nr <- nrow(object)
87
-		  res <- array(NA, dim=c(nr, 2, 3, nc))
89
+	  function(object, ...){
90
+		  dotArgs <- names(list(...))
91
+		  missing.i <- !("i" %in% dotArgs)
92
+		  missing.j <- !("j" %in% dotArgs)
93
+		  batchnames <- batchNames(object)
94
+		  if(!missing.j) batchnames <- batchnames[j]
95
+		  if(missing.i & missing.j) stop("Must specify either the rows i or batches j")
96
+		  medianA.AA <- as.matrix(assayDataElement(object, "medianA.AA")[...])
97
+		  medianA.AB <- as.matrix(assayDataElement(object, "medianA.AB")[...])
98
+		  medianA.BB <- as.matrix(assayDataElement(object, "medianA.BB")[...])
99
+		  medianB.AA <- as.matrix(assayDataElement(object, "medianB.AA")[...])
100
+		  medianB.AB <- as.matrix(assayDataElement(object, "medianB.AB")[...])
101
+		  medianB.BB <- as.matrix(assayDataElement(object, "medianB.BB")[...])
102
+		  res <- array(NA, dim=c(nrow(medianA.AA), 2, 3, ncol(medianA.AA)))
88 103
 		  dimnames(res)[[2]] <- c("A", "B")
89 104
 		  dimnames(res)[[3]] <- c("AA", "AB", "BB")
90 105
 		  dimnames(res)[[4]] <- batchnames
... ...
@@ -134,14 +149,21 @@ setMethod("medians", signature(object="AssayData"),
134 149
 		  return(res)
135 150
 })
136 151
 
137
-setMethod("medians", signature(object="AssayData"),
138
-	  function(object, i, j, ...){
139
-		  if(!missing(j)){
140
-			  batchnames <- unique(as.character(batch(object)[j]))
141
-		  } else batchnames <- batchNames(object)
142
-		  nc <- length(batchnames)
143
-		  if(!missing(i)) nr <- length(i) else nr <- nrow(object)
144
-		  res <- array(NA, dim=c(nr, 2, 3, nc))
152
+setMethod("mads", signature(object="AssayData"),
153
+	  function(object, ...){
154
+		  dotArgs <- names(list(...))
155
+		  missing.i <- !("i" %in% dotArgs)
156
+		  missing.j <- !("j" %in% dotArgs)
157
+		  batchnames <- batchNames(object)
158
+		  if(!missing.j) batchnames <- batchnames[j]
159
+		  if(missing.i & missing.j) stop("Must specify either the rows i or batches j")
160
+		  madA.AA <- as.matrix(assayDataElement(object, "madA.AA")[...])
161
+		  madA.AB <- as.matrix(assayDataElement(object, "madA.AB")[...])
162
+		  madA.BB <- as.matrix(assayDataElement(object, "madA.BB")[...])
163
+		  madB.AA <- as.matrix(assayDataElement(object, "madB.AA")[...])
164
+		  madB.AB <- as.matrix(assayDataElement(object, "madB.AB")[...])
165
+		  madB.BB <- as.matrix(assayDataElement(object, "madB.BB")[...])
166
+		  res <- array(NA, dim=c(nrow(madA.AA), 2, 3, ncol(madA.AA)))
145 167
 		  dimnames(res)[[2]] <- c("A", "B")
146 168
 		  dimnames(res)[[3]] <- c("AA", "AB", "BB")
147 169
 		  dimnames(res)[[4]] <- batchnames
... ...
@@ -184,7 +184,7 @@ if(!file.exists(file.path(outdir, "cnSet.rda"))){
184 184
 }
185 185
 @
186 186
 
187
-\subsection{Copy number algorithm.}
187
+\subsection{Copy number estimation.}
188 188
 
189 189
 The \Rfunction{crlmmCopynumber} computes summary statistics for each
190 190
 genotype, imputes unobserved genotype centers, shrinks the
... ...
@@ -193,10 +193,14 @@ allele-specific copy number. With \texttt{verbose=TRUE}, the above
193 193
 steps for CN estimation are displayed.
194 194
 
195 195
 <<LDS_copynumber>>=
196
-cnSet <- checkExists("cnSet",
197
-		     .path=outdir,
198
-		     .FUN=crlmmCopynumber,
199
-		     object=gtSet)
196
+##trace(fit.lm4, browser)
197
+##estimateCnParameters(gtSet,type="X.NP")
198
+##estimateCnParameters(gtSet,type="X.SNP")
199
+##trace(fit.lm2, browser)
200
+##estimateCnParameters(gtSet,type="NP")
201
+##estimateCnParameters(gtSet,type="SNP")
202
+cnSet <- checkExists("cnSet", .path=outdir, .FUN=crlmmCopynumber, object=gtSet)
203
+##cnSet <- crlmmCopynumber(gtSet)
200 204
 @
201 205
 
202 206
 The \Rpackage{crlmmCopynumber} function no longer stores the
... ...
@@ -221,7 +225,7 @@ scaling by the slope coefficient. Specifically,
221 225
 \noindent See \cite{Scharpf2010} for details.
222 226
 
223 227
 The functions \Rfunction{CA}, \Rfunction{CB}, and
224
-\Rfunction{totalCopyNumber} can be used to extract CN estimates from
228
+\Rfunction{totalCopynumber} can be used to extract CN estimates from
225 229
 the \Robject{CNSet} container.
226 230
 
227 231
 <<ca>>=
... ...
@@ -235,7 +239,7 @@ ct <- ca+cb
235 239
 
236 240
 Alternatively, total copy number can be obtained by
237 241
 <<totalCopynumber.snps>>=
238
-ct2 <- totalCopyNumber(cnSet, i=snp.index, j=1:5)
242
+ct2 <- totalCopynumber(cnSet, i=snp.index, j=1:5)
239 243
 all.equal(ct, ct2)
240 244
 @
241 245
 
... ...
@@ -243,13 +247,13 @@ At nonpolymorphic loci, either the 'CA' or totalCopynumber method can
243 247
 be used to obtain total copy number.
244 248
 
245 249
 <<ca>>=
246
-cn.nonpolymorphic <- CA(cnSet, i=which(!isSnp(obj)))
250
+cn.nonpolymorphic <- CA(cnSet, i=which(!isSnp(cnSet)))
247 251
 @
248 252
 
249 253
 Total copy number at both polymorphic and nonpolymorphic loci:
250 254
 <<totalCopynumber>>=
251 255
 ##cn <- copyNumber(x)
252
-cn <- totalCopyNumber(cnSet, sample(1:nrow(cnSet), 1e4), 1:5)
256
+cn <- totalCopynumber(cnSet, sample(1:nrow(cnSet), 1e4), 1:5)
253 257
 apply(cn, 2, median, na.rm=TRUE)
254 258
 @
255 259
 
... ...
@@ -57,6 +57,15 @@ outdir <- paste("/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/", rpath, "/
57 57
 datadir <- "/thumper/ctsa/snpmicroarray/illumina/IDATS/370k"
58 58
 @
59 59
 
60
+Options for controlling RAM with the \Rpackage{ff} package.
61
+
62
+<<ldOptions>>=
63
+ldPath(outdir)
64
+ocProbesets(150e3)
65
+ocSamples(200)
66
+@
67
+
68
+
60 69
 \paragraph{About this vignette.} This vignette was created using
61 70
 Illumina IDAT files that are located in a specific directory on my
62 71
 computer (\Robject{pathToCels}).  Long computations are saved in the
... ...
@@ -105,41 +114,19 @@ grnfiles = all(file.exists(paste(arrayNames, "_Grn.idat", sep="")))
105 114
 redfiles = all(file.exists(paste(arrayNames, "_Red.idat", sep="")))
106 115
 @
107 116
 
108
-<<samplesToProcess2, echo=FALSE>>=
109
-## To speed up repeated calls to Sweave
110
-RG <- checkExists("RG", .path=outdir,
111
-		  .FUN=readIdatFiles2,
112
-		  sampleSheet=samplesheet,
113
-		  path=dirname(arrayNames[1]),
114
-		  arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"),
115
-		  saveDate=TRUE)
116
-annotation(RG) <- "human370v1c"
117
+<<samplesToProcess2>>=
117 118
 crlmmResult <- checkExists("crlmmResult",
118 119
 			   .path=outdir,
119
-			   .FUN=crlmmIllumina2,
120
-			   RG=RG,
121
-			   sns=pData(RG)$ID,
120
+			   .FUN=crlmm:::crlmmIlluminaV2,
121
+			   sampleSheet=samplesheet,
122
+			   path=dirname(arrayNames[1]),
123
+			   arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"),
122 124
 			   returnParams=TRUE,
123 125
 			   cnFile=file.path(outdir, "cnFile.rda"),
124 126
 			   snpFile=file.path(outdir, "snpFile.rda"),
125
-			   save.it=TRUE)
126
-protocolData(crlmmResult)$ScanDate <- protocolData(RG)$ScanDate
127
-@
128
-
129
-<<samplesToProcess3, eval=FALSE>>=
130
-RG <- readIdatFiles2(samplesheet[1:10, ],
131
-		    path=dirname(arrayNames[1]),
132
-		    arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"),
133
-		    saveDate=TRUE)
134
-annotation(RG) <- "human370v1c"
135
-crlmmResult <- crlmmIllumina2(RG=RG,
136
-			     cdfName="human370v1c",
137
-			     sns=pData(RG)$ID,
138
-			     returnParams=TRUE,
139
-			     cnFile=file.path(outdir, "cnFile.rda"),
140
-			     snpFile=file.path(outdir, "snpFile.rda"),
141
-			     save.it=TRUE)
142
-protocolData(crlmmResult)$ScanDate <- protocolData(RG)$ScanDate
127
+			   save.ab=TRUE,
128
+			   cdfName="human370v1c")
129
+##protocolData(crlmmResult)$ScanDate <- protocolData(RG)$ScanDate
143 130
 @
144 131
 
145 132
 %\noindent Finally, we load a few of the intermediate files that were
... ...
@@ -163,21 +150,11 @@ allele-specific copy number estimates.  A few helper functions for
163 150
 facilitating the construction of this container have been added to the
164 151
 inst/scripts directory of this package and can be sourced as follows.
165 152
 
166
-
167
-<<constructContainer1, echo=FALSE>>=
168
-##only load if cnSet object has not been created
169
-if(!file.exists(file.path(outdir, "cnSet.rda"))){
170
-	container <- checkExists("container",
171
-				 .path=outdir,
172
-				 .FUN=constructIlluminaCNSet,
173
-				 crlmmResult=crlmmResult,
174
-				 snpFile=snpFile,
175
-				 cnFile=cnFile)
176
-}
177
-@
178
-
179
-<<constructContainer2, eval=FALSE>>=
180
-container <- constructIlluminaCNSet(crlmmResult=crlmmResult, snpFile=snpFile, cnFile=cnFile)
153
+<<constructContainer2>>=
154
+batch <- as.factor(rep("1", ncol(crlmmResult)))
155
+##trace(constructIlluminaCNSet
156
+##trace(constructIlluminaAssayData, browser)
157
+container <- constructIlluminaCNSet(crlmmResult=crlmmResult, snpFile=file.path(outdir, "snpFile.rda"), cnFile=file.path(outdir, "cnFile.rda"), batch=batch)
181 158
 @
182 159
 
183 160
 As described in the \texttt{copynumber} vignette, two \R{} functions
... ...
@@ -192,10 +169,11 @@ decided at the beginning of the analysis and then propogated to both
192 169
 the genotyping and copy number estimation steps.  Here, we use the
193 170
 \Rfunction{crlmmCopynumber} to estimate copy number.
194 171
 
195
-<<estimateCopynumber1, echo=FALSE>>=
196
-cnSet <- checkExists("cnSet", .path=outdir,
197
-		     .FUN=crlmmCopynumber,
198
-		     object=container)
172
+<<crlmmCopynumber>>=
173
+stop()
174
+trace(shrinkGenotypeSummaries, browser)
175
+options(error=recover)
176
+cnSet <- checkExists("cnSet", .path=outdir, .FUN=crlmmCopynumber, object=container)
199 177
 @
200 178
 
201 179
 <<estimateCopynumber, eval=FALSE>>=
... ...
@@ -123,6 +123,10 @@ crlmmCopynumber(object, MIN.SAMPLES=10, SNRMin = 5, MIN.OBS = 1,
123 123
 
124 124
 \details{
125 125
 
126
+    We suggest a minimum of 10 samples per batch for using
127
+    \code{crlmmCopynumber}.  50 or more samples per batch is preferred
128
+    and will improve the estimates.
129
+
126 130
 	The function crlmmCopynumber uses matrices instead of ff objects
127 131
 	if the ff library is not loaded. When the ff package is loaded,
128 132
 	large data support is enabled.  Normalized intensities