Browse code

updated genotype function. snprma now untouched

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@45171 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 11/03/2010 16:22:32
Showing3 changed files

... ...
@@ -1,7 +1,7 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays.
4
-Version: 1.5.33
4
+Version: 1.5.34
5 5
 Date: 2010-02-05
6 6
 Author: Rafael A Irizarry, Benilton S Carvalho <bcarvalh@jhsph.edu>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au>
7 7
 Maintainer: Benilton S Carvalho <bcarvalh@jhsph.edu>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU>
... ...
@@ -47,6 +47,8 @@ getFeatureData.Affy <- function(cdfName, copynumber=FALSE){
47 47
 		M[index, "chromosome"] <- cnProbes[, grep("chr", colnames(cnProbes))]
48 48
 		M[index, "isSnp"] <- 0L
49 49
 	}
50
+	##A few of the snpProbes do not match -- I think it is chromosome Y.
51
+	M[is.na(M[, "isSnp"]), "isSnp"] <- 1L
50 52
 	return(new("AnnotatedDataFrame", data=data.frame(M)))
51 53
 	##list(snpIndex, npIndex, fns)
52 54
 	##crlmmOpts$snpRange <- range(snpIndex)
... ...
@@ -68,63 +70,23 @@ construct <- function(filenames, cdfName, copynumber=FALSE, sns){
68 70
 	sampleNames(callSet) <- sns
69 71
 	return(callSet)
70 72
 }
73
+setMethod("close", "AlleleSet", function(con, ...){
74
+	object <- con
75
+	names <- ls(assayData(object))
76
+	L <- length(names)
77
+	for(i in 1:L) close(eval(substitute(assayData(object)[[NAME]], list(NAME=names[i]))))
78
+	return()
79
+})
80
+setMethod("open", "AlleleSet", function(con, ...){
81
+	object <- con
82
+	names <- ls(assayData(object))
83
+	L <- length(names)
84
+	for(i in 1:L) open(eval(substitute(assayData(object)[[NAME]], list(NAME=names[i]))))
85
+	return()
86
+})
71 87
 setReplaceMethod("calls", "SnpSuperSet", function(object, value) assayDataElementReplace(object, "call", value))
72 88
 setReplaceMethod("confs", "SnpSuperSet", function(object, value) assayDataElementReplace(object, "callProbability", value))
73 89
 setMethod("confs", "SnpSuperSet", function(object) assayDataElement(object, "callProbability"))
74
-crlmm.batch <- function(object,
75
-			batchSize,
76
-			mixtureParams,
77
-			probs=rep(1/3,3),
78
-			DF=6,
79
-			SNRMin=5,
80
-			recallMin=10,
81
-			recallRegMin=1000,
82
-			gender=NULL,
83
-			desctrucitve=FALSE,
84
-			verbose=TRUE,
85
-			returnParams=FALSE,
86
-			badSNP=0.7){
87
-	##Call in batches to reduce ram
88
-	BS <- batchSize
89
-	nc <- ncol(object)
90
-	if(nc > BS){
91
-		N <- ceiling(nc/BS)
92
-		S <- ceiling(nc/N)
93
-		colindex <- split(1:nc, rep(1:nc, each=S, length.out=nc))
94
-	} else {
95
-		colindex <- list(1:nc)
96
-	}
97
-	if(length(colindex) > 1)
98
-		message("Calling genotypes in batches of size ", length(colindex[[1]]), " to reduce required RAM")
99
-	row.index <- which(isSnp(object)==1 | is.na(isSnp(object)))
100
-	for(i in seq(along=colindex)){
101
-		col.index <- colindex[[i]]
102
-		tmp <- crlmmGT(A=A(object)[row.index, col.index],
103
-			       B=B(object)[row.index, col.index],
104
-			       SNR=object$SNR[col.index],
105
-			       mixtureParams=mixtureParams[, col.index],
106
-			       cdfName=annotation(object),
107
-			       row.names=featureNames(object)[row.index],
108
-			       col.names=sampleNames(object)[col.index],
109
-			       probs=probs,
110
-			       DF=DF,
111
-			       SNRMin=SNRMin,
112
-			       recallMin=recallMin,
113
-			       recallRegMin=recallRegMin,
114
-			       gender=gender,
115
-			       desctrucitve=desctrucitve,
116
-			       verbose=verbose,
117
-			       returnParams=returnParams,
118
-			       badSNP=badSNP)
119
-		##ensure matrix is passed
120
-		calls(object)[row.index, col.index] <- tmp[["calls"]]
121
-		confs(object)[row.index, col.index] <- tmp[["confs"]]
122
-		object$gender[col.index] <- tmp$gender
123
-		rm(tmp); gc()
124
-	}
125
-	return(object)
126
-}
127
-
128 90
 genotype <- function(filenames, cdfName, mixtureSampleSize=10^5,
129 91
 		     fitMixture=TRUE,
130 92
 		     eps=0.1, verbose=TRUE,
... ...
@@ -143,60 +105,66 @@ genotype <- function(filenames, cdfName, mixtureSampleSize=10^5,
143 105
 	if(missing(sns)) sns <- basename(filenames)
144 106
 	## callSet contains potentially very big matrices
145 107
 	## More big matrices are created within snprma, that will then be removed.
146
-	snprmaRes <- snprma(filenames=filenames,
147
-			    mixtureSampleSize=mixtureSampleSize,
148
-			    fitMixture=fitMixture,
149
-			    eps=eps,
150
-			    verbose=verbose,
151
-			    seed=seed,
152
-			    cdfName=cdfName,
153
-			    sns=sns)
154
-	message("Initializing container for assay data elements alleleA, alleleB, call, callProbability")
155 108
 	callSet <- construct(filenames=filenames,
156 109
 			     cdfName=cdfName,
157 110
 			     copynumber=copynumber,
158 111
 			     sns=sns)
159
-	sampleStats <- data.frame(SKW=snprmaRes$SKW,
160
-				  SNR=snprmaRes$SNR)
161
-	pD <- new("AnnotatedDataFrame",
162
-		  data=sampleStats,
163
-		  varMetadata=data.frame(labelDescription=colnames(sampleStats)))
164
-	sampleNames(pD) <- sampleNames(callSet)
165
-	phenoData(callSet) <- pD
166
-	if(!copynumber){
167
-		## A and B are the right size
168
-		A(callSet) <- snprmaRes[["A"]]  ## should work regardless of ff, matrix
169
-		B(callSet) <- snprmaRes[["B"]]
170
-	} else {
171
-		## A and B are not big enough to hold the nonpolymorphic markers
172
-		index <- which(fData(callSet)[, "isSnp"] == 1)
173
-		## Inefficient.  
174
-		## write one column at a time (????).  (we don't want to bring in the whole matrix if its huge)
175
-		if(isPackageLoaded("ff")){
176
-			for(j in 1:ncol(callSet)){
177
-				A(callSet)[index, j] <- snprmaRes[["A"]][, j]  
178
-				B(callSet)[index, j] <- snprmaRes[["B"]][, j]
179
-			}
180
-			delete(snprmaRes[["A"]])##removes the file on disk
181
-			delete(snprmaRes[["B"]])##removes the file on disk
182
-		} else {
183
-			A(callSet)[index, ] <- snprmaRes[["A"]]
184
-			B(callSet)[index, ] <- snprmaRes[["B"]]
185
-		}
112
+	mixtureParams <- matrix(NA, 4, length(filenames))
113
+	snp.index <- which(isSnp(callSet)==1)
114
+	batches <- splitIndicesByLength(1:ncol(callSet), ocSamples())
115
+	for(j in batches){
116
+		snprmaRes <- snprma(filenames=filenames[j],
117
+				    mixtureSampleSize=mixtureSampleSize,
118
+				    fitMixture=fitMixture,
119
+				    eps=eps,
120
+				    verbose=verbose,
121
+				    seed=seed,
122
+				    cdfName=cdfName,
123
+				    sns=sns)
124
+		stopifnot(identical(featureNames(callSet)[snp.index], snprmaRes$gns))
125
+		message("Initializing container for assay data elements alleleA, alleleB, call, callProbability")
126
+		pData(callSet)$SKW[j] <- snprmaRes$SKW
127
+		pData(callSet)$SNR[j] <- snprmaRes$SNR
128
+		A(callSet)[snp.index, j] <- snprmaRes[["A"]]
129
+		B(callSet)[snp.index, j] <- snprmaRes[["B"]]
130
+		mixtureParams[, j] <- snprmaRes$mixtureParams
131
+		rm(snprmaRes); gc()
132
+	}
133
+	## remove snprmaRes and garbage collect before quantile-normalizing NP probes 
134
+	if(copynumber){
135
+		np.index <- which(isSnp(callSet) == 0)
136
+		cnrmaRes <- cnrma(filenames=filenames[j],
137
+				  cdfName=cdfName,
138
+				  row.names=featureNames(callSet)[np.index],				  
139
+				  sns=sns,
140
+				  seed=seed,
141
+				  verbose=verbose)
142
+		stopifnot(identical(featureNames(callSet)[np.index], rownames(cnrmaRes)))
143
+		A(callSet)[np.index, j] <- cnrmaRes
144
+		rm(cnrmaRes); gc()
186 145
 	}
187
-	gc()
188
-	callSet <- crlmm.batch(object=callSet,
189
-				batchSize=batchSize,
190
-				mixtureParams=snprmaRes$mixtureParams,
191
-				probs=probs,
192
-				DF=DF,
193
-				SNRMin=SNRMin,
194
-				recallMin=recallMin,
195
-				recallRegMin=recallRegMin,
196
-				gender=gender,
197
-				verbose=verbose,
198
-				returnParams=returnParams,
199
-				badSNP=badSNP)
146
+	for(j in batches){
147
+		tmp <- crlmmGT(A=A(callSet)[snp.index, j],
148
+			       B=B(callSet)[snp.index, j],
149
+			       SNR=callSet$SNR[j],
150
+			       mixtureParams=mixtureParams[, j],
151
+			       cdfName=annotation(callSet),
152
+			       row.names=featureNames(callSet)[snp.index],
153
+			       col.names=sampleNames(callSet)[j],
154
+			       probs=probs,
155
+			       DF=DF,
156
+			       SNRMin=SNRMin,
157
+			       recallMin=recallMin,
158
+			       recallRegMin=recallRegMin,
159
+			       gender=gender,
160
+			       verbose=verbose,
161
+			       returnParams=returnParams,
162
+			       badSNP=badSNP)
163
+		calls(callSet)[snp.index, j] <- tmp[["calls"]]
164
+		confs(callSet)[snp.index, j] <- tmp[["confs"]]
165
+		callSet$gender[j] <- tmp$gender
166
+		rm(tmp); gc()
167
+	}	
200 168
 	return(callSet)
201 169
 }
202 170
 
... ...
@@ -671,26 +639,18 @@ whichPlatform <- function(cdfName){
671 639
 
672 640
 
673 641
 # steps: quantile normalize hapmap: create 1m_reference_cn.rda object
674
-cnrma <- function(filenames, cdfName, sns, seed=1, verbose=FALSE, outdir){
642
+cnrma <- function(filenames, cdfName, row.names, sns, seed=1, verbose=FALSE){
675 643
 	if(missing(cdfName)) stop("must specify cdfName")
676 644
 	pkgname <- getCrlmmAnnotationName(cdfName)
677 645
 	require(pkgname, character.only=TRUE) || stop("Package ", pkgname, " not available")
678 646
 	if (missing(sns)) sns <- basename(filenames)
679 647
         loader("npProbesFid.rda", .crlmmPkgEnv, pkgname)
680 648
 	fid <- getVarInEnv("npProbesFid")
649
+	fid <- fid[match(row.names, names(fid))]
681 650
 	set.seed(seed)
682 651
 	idx2 <- sample(length(fid), 10^5) ##for skewness. no need to do everything
683 652
 	SKW <- vector("numeric", length(filenames))
684
-##	if(bigmemory){
685
-##		NP <- filebacked.big.matrix(length(pnsa), length(filenames),
686
-##					    type="integer",
687
-##					    init=as.integer(0),
688
-##					    backingpath=outdir,
689
-##					    backingfile="NP.bin",
690
-##					    descriptorfile="NP.desc")
691
-##	} else{
692
-		NP <- matrix(NA, length(fid), length(filenames))
693
-##	}
653
+	NP <- matrix(NA, length(fid), length(filenames))
694 654
 	verbose <- TRUE
695 655
 	if(verbose){
696 656
 		message("Processing ", length(filenames), " files.")
... ...
@@ -716,9 +676,9 @@ cnrma <- function(filenames, cdfName, sns, seed=1, verbose=FALSE, outdir){
716 676
 	}
717 677
 	dimnames(NP) <- list(names(fid), sns)
718 678
 	##dimnames(NP) <- list(map[, "man_fsetid"], sns)
719
-	res3 <- list(NP=NP, SKW=SKW)
679
+	##res3 <- list(NP=NP, SKW=SKW)
720 680
 	cat("\n")
721
-	return(res3)
681
+	return(NP)
722 682
 }
723 683
 
724 684
 getFlags <- function(object, PHI.THR){
... ...
@@ -42,11 +42,8 @@ snprma <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE, eps=0.1,
42 42
   ##S will hold (A+B)/2 and M will hold A-B
43 43
   ##NOTE: We actually dont need to save S. Only for pics etc...
44 44
   ##f is the correction. we save to avoid recomputing
45
-  ##**RS**
46
-  ##A <- matrix(as.integer(0), length(pnsa), length(filenames))
47
-  ##B <- matrix(as.integer(0), length(pnsb), length(filenames))
48
-  A <- initializeBigMatrix(name="tmpA", length(pnsa), length(filenames))
49
-  B <- initializeBigMatrix(name="tmpB", length(pnsa), length(filenames))
45
+  A <- matrix(as.integer(0), length(pnsa), length(filenames))
46
+  B <- matrix(as.integer(0), length(pnsb), length(filenames))
50 47
   
51 48
   if(verbose){
52 49
     message("Processing ", length(filenames), " files.")