Browse code

added genotype function to cnrma-functions. Added AllClasses.R file to set ff_matrix and ffdf classes

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@45165 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 11/03/2010 11:02:23
Showing6 changed files

... ...
@@ -10,7 +10,7 @@ License: Artistic-2.0
10 10
 Depends: R (>= 2.11.0),
11 11
          methods,
12 12
          Biobase (>= 2.7.2),
13
-         oligoClasses (>= 1.9.28)
13
+         oligoClasses (>= 1.9.35)
14 14
 Imports: affyio (>= 1.15.2),
15 15
          preprocessCore,
16 16
          utils,
... ...
@@ -26,7 +26,7 @@ importClassesFrom(oligoClasses, SnpSuperSet, AlleleSet)
26 26
 
27 27
 importMethodsFrom(oligoClasses, allele, calls, "calls<-", confs,
28 28
 		  "confs<-", cnConfidence, "cnConfidence<-", 
29
-		  isSnp, chromosome, position, CA, "CA<-", CB, "CB<-", A, B)
29
+		  isSnp, chromosome, position, CA, "CA<-", CB, "CB<-", A, B, "A<-", "B<-")
30 30
 
31 31
 
32 32
 importFrom(oligoClasses, chromosome2integer, celfileDate, list.celfiles)
... ...
@@ -52,14 +52,6 @@ importFrom(mvtnorm, dmvnorm)
52 52
 
53 53
 importFrom(ellipse, ellipse)
54 54
 
55
-exportMethods(A, B, copyNumber)
55
+exportMethods(copyNumber)
56 56
 export(cnOptions, crlmm, crlmmIllumina, crlmmCopynumber, ellipse, readIdatFiles, snprma, getParam) 
57
-
58
-##export(##viterbi.CNSet, ##move to VanillaICE
59
-##	combineIntensities,
60
-##	whichPlatform,
61
-##	isValidCdfName,
62
-##	crlmmWrapper,
63
-##        withinGenotypeMoments,  
64
-##	locationAndScale, getParam.SnpSuperSet)
65
-export(thresholdModelParams, computeCopynumber.CNSet, nuphiAllele, coefs, biasAdjNP)
57
+export(genotype)
66 58
new file mode 100644
... ...
@@ -0,0 +1,4 @@
1
+setOldClass("ffdf")
2
+setOldClass("ff_matrix")
3
+##setClassUnion("matrix_or_ff", c("matrix", "ff_matrix"))
4
+##setClassUnion("list_or_ffdf", c("list", "ffdf"))
... ...
@@ -52,21 +52,152 @@ getFeatureData.Affy <- function(cdfName, copynumber=FALSE){
52 52
 	##crlmmOpts$snpRange <- range(snpIndex)
53 53
 	##crlmmOpts$npRange <- range(npIndex)
54 54
 }
55
-construct <- function(filenames, cdfName){
55
+construct <- function(filenames, cdfName, copynumber=FALSE, sns){
56
+	if(missing(sns)) sns <- basename(filenames)
56 57
 	protocolData <- getProtocolData.Affy(filenames)
57
-	M <- getFeatureData.Affy(cdfName)
58
-	dns <- list(rownames(M), basename(filenames))
59
-	nr <- nrow(M)
60
-	alleleSet <- new("AffymetrixAlleleSet", 
61
-			 alleleA=initializeBigMatrix(dns),
62
-			 alleleB=initializeBigMatrix(dns),
63
-			 genomeAnnotation=M,
64
-			 options=crlmmOptions(object),
65
-			 annotation=annotation(object))
66
-	protocolData(alleleSet) <- protocolData
67
-	sampleNames(alleleSet) <- basename(filenames)
68
-	featureNames(alleleSet) <- dns[[1]]
69
-	return(alleleSet)
58
+	featureData <- getFeatureData.Affy(cdfName, copynumber=copynumber)
59
+	nr <- nrow(featureData); nc <- length(filenames)
60
+	callSet <- new("SnpSuperSet", 
61
+			 alleleA=initializeBigMatrix(name="A", nr, nc),
62
+			 alleleB=initializeBigMatrix(name="B", nr, nc),
63
+			 call=initializeBigMatrix(name="call", nr, nc),
64
+			 callProbability=initializeBigMatrix(name="callPr", nr,nc),
65
+			 featureData=featureData,
66
+			 annotation=cdfName)
67
+	protocolData(callSet) <- protocolData
68
+	sampleNames(callSet) <- sns
69
+	return(callSet)
70
+}
71
+setReplaceMethod("calls", "SnpSuperSet", function(object, value) assayDataElementReplace(object, "call", value))
72
+setReplaceMethod("confs", "SnpSuperSet", function(object, value) assayDataElementReplace(object, "callProbability", value))
73
+setMethod("confs", "SnpSuperSet", function(object) assayDataElement(object, "callProbability"))
74
+crlmm.batch <- function(object,
75
+			batchSize,
76
+			mixtureParams,
77
+			probs=rep(1/3,3),
78
+			DF=6,
79
+			SNRMin=5,
80
+			recallMin=10,
81
+			recallRegMin=1000,
82
+			gender=NULL,
83
+			desctrucitve=FALSE,
84
+			verbose=TRUE,
85
+			returnParams=FALSE,
86
+			badSNP=0.7){
87
+	##Call in batches to reduce ram
88
+	BS <- batchSize
89
+	nc <- ncol(object)
90
+	if(nc > BS){
91
+		N <- ceiling(nc/BS)
92
+		S <- ceiling(nc/N)
93
+		colindex <- split(1:nc, rep(1:nc, each=S, length.out=nc))
94
+	} else {
95
+		colindex <- list(1:nc)
96
+	}
97
+	if(length(colindex) > 1)
98
+		message("Calling genotypes in batches of size ", length(colindex[[1]]), " to reduce required RAM")
99
+	row.index <- which(isSnp(object)==1 | is.na(isSnp(object)))
100
+	for(i in seq(along=colindex)){
101
+		col.index <- colindex[[i]]
102
+		tmp <- crlmmGT(A=A(object)[row.index, col.index],
103
+			       B=B(object)[row.index, col.index],
104
+			       SNR=object$SNR[col.index],
105
+			       mixtureParams=mixtureParams[, col.index],
106
+			       cdfName=annotation(object),
107
+			       row.names=featureNames(object)[row.index],
108
+			       col.names=sampleNames(object)[col.index],
109
+			       probs=probs,
110
+			       DF=DF,
111
+			       SNRMin=SNRMin,
112
+			       recallMin=recallMin,
113
+			       recallRegMin=recallRegMin,
114
+			       gender=gender,
115
+			       desctrucitve=desctrucitve,
116
+			       verbose=verbose,
117
+			       returnParams=returnParams,
118
+			       badSNP=badSNP)
119
+		##ensure matrix is passed
120
+		calls(object)[row.index, col.index] <- tmp[["calls"]]
121
+		confs(object)[row.index, col.index] <- tmp[["confs"]]
122
+		object$gender[col.index] <- tmp$gender
123
+		rm(tmp); gc()
124
+	}
125
+	return(object)
126
+}
127
+
128
+genotype <- function(filenames, cdfName, mixtureSampleSize=10^5,
129
+		     fitMixture=TRUE,
130
+		     eps=0.1, verbose=TRUE,
131
+		     seed=1, sns, copynumber=FALSE,
132
+		     batchSize=1000,
133
+		     probs=rep(1/3, 3),
134
+		     DF=6,
135
+		     SNRMin=5,
136
+		     recallMin=10,
137
+		     recallRegMin=1000,
138
+		     gender=NULL,
139
+		     returnParams=TRUE,
140
+		     badSNP=0.7){
141
+	if(missing(cdfName)) stop("must specify cdfName")
142
+	if(!isValidCdfName(cdfName)) stop("cdfName not valid.  see validCdfNames")
143
+	if(missing(sns)) sns <- basename(filenames)
144
+	## callSet contains potentially very big matrices
145
+	## More big matrices are created within snprma, that will then be removed.
146
+	snprmaRes <- snprma(filenames=filenames,
147
+			    mixtureSampleSize=mixtureSampleSize,
148
+			    fitMixture=fitMixture,
149
+			    eps=eps,
150
+			    verbose=verbose,
151
+			    seed=seed,
152
+			    cdfName=cdfName,
153
+			    sns=sns)
154
+	message("Initializing container for assay data elements alleleA, alleleB, call, callProbability")
155
+	callSet <- construct(filenames=filenames,
156
+			     cdfName=cdfName,
157
+			     copynumber=copynumber,
158
+			     sns=sns)
159
+	sampleStats <- data.frame(SKW=snprmaRes$SKW,
160
+				  SNR=snprmaRes$SNR)
161
+	pD <- new("AnnotatedDataFrame",
162
+		  data=sampleStats,
163
+		  varMetadata=data.frame(labelDescription=colnames(sampleStats)))
164
+	sampleNames(pD) <- sampleNames(callSet)
165
+	phenoData(callSet) <- pD
166
+	if(!copynumber){
167
+		## A and B are the right size
168
+		A(callSet) <- snprmaRes[["A"]]  ## should work regardless of ff, matrix
169
+		B(callSet) <- snprmaRes[["B"]]
170
+	} else {
171
+		## A and B are not big enough to hold the nonpolymorphic markers
172
+		index <- which(fData(callSet)[, "isSnp"] == 1)
173
+		## Inefficient.  
174
+		## write one column at a time (????).  (we don't want to bring in the whole matrix if its huge)
175
+		if(isPackageLoaded("ff")){
176
+			for(j in 1:ncol(callSet)){
177
+				A(callSet)[index, j] <- snprmaRes[["A"]][, j]  
178
+				B(callSet)[index, j] <- snprmaRes[["B"]][, j]
179
+			}
180
+			delete(snprmaRes[["A"]])##removes the file on disk
181
+			delete(snprmaRes[["B"]])##removes the file on disk
182
+		} else {
183
+			A(callSet)[index, ] <- snprmaRes[["A"]]
184
+			B(callSet)[index, ] <- snprmaRes[["B"]]
185
+		}
186
+	}
187
+	gc()
188
+	callSet <- crlmm.batch(object=callSet,
189
+				batchSize=batchSize,
190
+				mixtureParams=snprmaRes$mixtureParams,
191
+				probs=probs,
192
+				DF=DF,
193
+				SNRMin=SNRMin,
194
+				recallMin=recallMin,
195
+				recallRegMin=recallRegMin,
196
+				gender=gender,
197
+				verbose=verbose,
198
+				returnParams=returnParams,
199
+				badSNP=badSNP)
200
+	return(callSet)
70 201
 }
71 202
 
72 203
 
... ...
@@ -42,8 +42,11 @@ snprma <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE, eps=0.1,
42 42
   ##S will hold (A+B)/2 and M will hold A-B
43 43
   ##NOTE: We actually dont need to save S. Only for pics etc...
44 44
   ##f is the correction. we save to avoid recomputing
45
-  A <- matrix(as.integer(0), length(pnsa), length(filenames))
46
-  B <- matrix(as.integer(0), length(pnsb), length(filenames))
45
+  ##**RS**
46
+  ##A <- matrix(as.integer(0), length(pnsa), length(filenames))
47
+  ##B <- matrix(as.integer(0), length(pnsb), length(filenames))
48
+  A <- initializeBigMatrix(name="tmpA", length(pnsa), length(filenames))
49
+  B <- initializeBigMatrix(name="tmpB", length(pnsa), length(filenames))
47 50
   
48 51
   if(verbose){
49 52
     message("Processing ", length(filenames), " files.")
... ...
@@ -155,7 +155,69 @@ celDates <- function(celfiles){
155 155
 	return(celdts)
156 156
 }
157 157
 
158
+validCdfNames <- function(){
159
+	c("genomewidesnp6",
160
+	  "genomewidesnp5",
161
+	  "human370v1c",
162
+	  "human370quadv3c",
163
+	  "human550v3b",
164
+	  "human650v3a",
165
+	  "human610quadv1b",
166
+	  "human660quadv1a",
167
+	  "human1mduov3b",
168
+	  "humanomni1quadv1b")
169
+}
170
+isValidCdfName <- function(cdfName){
171
+	chipList <- validCdfNames()
172
+	result <- cdfName %in% chipList	
173
+	if(!(result)){
174
+		warning("cdfName must be one of the following: ",
175
+			chipList)
176
+	}
177
+	return(result)
178
+}
158 179
 
180
+isPackageLoaded <- function(pkg){
181
+	stopifnot(is.character(pkg))
182
+	pkg <- paste("package:", pkg, sep="")
183
+	pkg %in% search()
184
+}
159 185
 
186
+initializeBigMatrix <- function(name, nr, nc, vmode="integer"){
187
+	if(isPackageLoaded("ff")){
188
+		if(prod(nr, nc) > 2^31){
189
+			##Need multiple matrices
190
+			## -- use ffdf
191
+			
192
+			## How many samples per ff object
193
+			S <- floor(2^31/nr - 1)
194
+
195
+			## How many ff objects
196
+			L <- ceiling(nc/S)
197
+			name <- paste(name, 1:L, sep="_")
198
+
199
+			results <- vector("list", L)
200
+			##resultsB <- vector("list", L)			
201
+			for(i in 1:(L-1)){  ## the Lth object may have fewer than nc columns
202
+				results[[i]] <- createFF(name=name[i],
203
+							 dim=c(nr, S),
204
+							 vmode=vmode)
205
+			}
206
+			##the Lth element
207
+			leftOver <- nc - ((L-1)*S)
208
+			results[[L]] <- createFF(name=name[L],
209
+						 dim=c(nr, leftOver),
210
+						 vmode=vmode)
211
+			resultsff <- do.call(ffdf, results)
212
+			##dimnames(resultsff) <- dns
213
+		} else {
214
+			resultsff <- createFF(name=name,
215
+					      dim=c(nr, nc),
216
+					      vmode=vmode)
217
+		}
218
+		resultsff[,] <- NA
219
+	}  else resultsff <- matrix(NA, nr, nc)
220
+	return(resultsff)
221
+}
160 222
 setMethod("annotatedDataFrameFrom", "ff_matrix", Biobase:::annotatedDataFrameFromMatrix)
161 223
 setMethod("annotatedDataFrameFrom", "ffdf", Biobase:::annotatedDataFrameFromMatrix)