Browse code

Check batch early in genotype function. Add genotypeRS function to genotype Illumina data. Fix stop message in getVarInEnv -- do not try to print envir.

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@53853 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 18/03/2011 01:57:30
Showing3 changed files

... ...
@@ -120,9 +120,11 @@ genotype <- function(filenames,
120 120
 	if(missing(cdfName)) stop("must specify cdfName")
121 121
 	if(!isValidCdfName(cdfName)) stop("cdfName not valid.  see validCdfNames")
122 122
 	if(missing(sns)) sns <- basename(filenames)
123
+	stopifnot(!missing(batch))
123 124
 	##---------------------------------------------------------------------------
124 125
 	##
125
-	## from snprma2.  Goal is to initialize A and B with appropriate dimension for snps+nps
126
+	## from snprma2.  Goal is to initialize A and B with
127
+	## appropriate dimension for snps+nps
126 128
 	##
127 129
 	##---------------------------------------------------------------------------
128 130
 	if (missing(sns)) sns <- basename(filenames)
... ...
@@ -186,6 +188,7 @@ genotype <- function(filenames,
186 188
 	phenoData(cnSet) <- new("AnnotatedDataFrame", data=pd)
187 189
 	stopifnot(validObject(cnSet))
188 190
 	snp.I <- isSnp(cnSet)
191
+	snp.index <- which(snp.I)
189 192
 	pData(cnSet)$SKW <- SKW
190 193
 	pData(cnSet)$SNR <- SNR
191 194
 	np.index <- which(!snp.I)
... ...
@@ -212,7 +215,8 @@ genotype <- function(filenames,
212 215
 			  gender=gender,
213 216
 			  verbose=verbose,
214 217
 			  returnParams=returnParams,
215
-			  badSNP=badSNP)
218
+			  badSNP=badSNP,
219
+			  snp.names=featureNames(cnSet)[snp.index])
216 220
 	if(verbose) message("Genotyping finished.  Updating container with genotype calls and confidence scores.")
217 221
 	cnSet$gender <- tmp[["gender"]]
218 222
 	return(cnSet)
... ...
@@ -2363,267 +2367,314 @@ rscrlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
2363 2367
                      col.names=NULL, probs=c(1/3, 1/3, 1/3), DF=6,
2364 2368
                      SNRMin=5, recallMin=10, recallRegMin=1000,
2365 2369
                      gender=NULL, desctrucitve=FALSE, verbose=TRUE,
2366
-                     returnParams=FALSE, badSNP=.7){
2367
-  pkgname <- getCrlmmAnnotationName(cdfName)
2368
-  stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose))
2369
-  open(SNR)
2370
-  open(A)
2371
-  open(B)
2372
-  open(mixtureParams)
2373
-  ## expect objects to be ff
2374
-
2375
-  keepIndex <- which( SNR[] > SNRMin)
2376
-  if(length(keepIndex)==0) stop("No arrays above quality threshold!")
2377
-  if(is.null(row.names) & is.null(rownames(A))){
2378
-	  ##verify that A has only snps.  otherwise, calling function must pass rownames
2379
-	  message("rownames not available.  Assuming only SNPs in A")
2380
-	  snpBatches <- splitIndicesByLength(1:nrow(A), ocProbesets())
2381
-  } else{
2382
-	  loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
2383
-	  gns <- getVarInEnv("gns")
2384
-	  index <- match(gns, rownames(A))
2385
-	  snpBatches <- splitIndicesByLength(index, ocProbesets())
2386
-  }
2387
-  NR <- length(unlist(snpBatches))
2388
-  if(verbose) cat("Calling", NR, "SNPs for recalibration... ")
2389
-  NC <- ncol(A)
2390
-
2391
-  if(verbose) message("Loading annotations.")
2392
-  obj1 <- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
2393
-  obj2 <- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
2394
-  ## this is toget rid of the 'no visible binding' notes
2395
-  ## variable definitions
2396
-  XIndex <- getVarInEnv("XIndex")
2397
-  autosomeIndex <- getVarInEnv("autosomeIndex")
2398
-  YIndex <- getVarInEnv("YIndex")
2399
-  SMEDIAN <- getVarInEnv("SMEDIAN")
2400
-  theKnots <- getVarInEnv("theKnots")
2401
-  regionInfo <- getVarInEnv("regionInfo")
2402
-  params <- getVarInEnv("params")
2403
-  rm(list=c(obj1, obj2), envir=.crlmmPkgEnv)
2404
-  rm(obj1, obj2)
2405
-
2406
-  ## IF gender not provide, we predict
2407
-  ## FIXME: XIndex may be greater than ocProbesets()
2408
-  if(is.null(gender)){
2409
-    if(verbose) message("Determining gender.")
2410
-##    XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2
2411
-    XMedian <- ocLapply(splitIndicesByNode(1:NC), predictGender, theA=A, theB=B, XIndex=XIndex, neededPkgs="crlmm")
2412
-    XMedian <- unlist(XMedian)
2413
-    if(sum(SNR[] > SNRMin)==1){
2414
-      gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5)))
2415
-    }else{
2416
-      gender <- kmeans(XMedian, c(min(XMedian[SNR[]>SNRMin]), max(XMedian[SNR[]>SNRMin])))[["cluster"]]
2417
-    }
2418
-  }
2419
-
2420
-  Indexes <- list(autosomeIndex, XIndex, YIndex)
2421
-  cIndexes <- list(keepIndex,
2422
-                   keepIndex[which(gender[keepIndex]==2)],
2423
-                   keepIndex[which(gender[keepIndex]==1)])
2424
-
2425
-  if(verbose) cat("Calling", NR, "SNPs for recalibration... ")
2426
-
2427
-  ## call C
2428
-  fIndex <- which(gender==2)
2429
-  mIndex <- which(gender==1)
2430
-
2431
-  ## different here
2432
-  ## use gtypeCallerR in batches
2433
-  ##snpBatches <- splitIndicesByLength(1:nrow(A), ocProbesets())
2434
-  newparamsBatch <- vector("list", length(snpBatches))
2435
-
2436
-  process1 <- function(idxBatch, snpBatches, autosomeIndex, XIndex,
2437
-                       YIndex, A, B, mixtureParams, fIndex, mIndex,
2438
-                       params, cIndexes, SMEDIAN, theKnots, DF, probs, batchSize){
2439
-    open(A)
2440
-    open(B)
2441
-    open(mixtureParams)
2442
-    snps <- snpBatches[[idxBatch]]
2443
-    rSnps <- range(snps)
2444
-    last <- (idxBatch-1)*batchSize
2445
-    IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
2446
-                         XIndex[XIndex %in% snps]-last,
2447
-                         YIndex[YIndex %in% snps]-last)
2448
-    IndexesBatch <- lapply(IndexesBatch, as.integer)
2449
-    tmpA <- as.matrix(A[snps,])
2450
-    tmpB <- as.matrix(B[snps,])
2451
-    ## newparamsBatch[[idxBatch]]
2452
-
2453
-    tmp <- gtypeCallerR(tmpA, tmpB, fIndex, mIndex,
2454
-                        params[["centers"]][snps,],
2455
-                        params[["scales"]][snps,],
2456
-                        params[["N"]][snps,],
2457
-                        IndexesBatch, cIndexes,
2458
-                        sapply(IndexesBatch, length),
2459
-                        sapply(cIndexes, length), SMEDIAN,
2460
-                        theKnots, mixtureParams[], DF, probs, 0.025)
2461
-    rm(snps, rSnps, IndexesBatch, tmpA, tmpB, last)
2462
-    gc(verbose=FALSE)
2463
-    close(A)
2464
-    close(B)
2465
-    close(mixtureParams)
2466
-    tmp
2467
-  }
2468
-
2469
-  newparamsBatch <- ocLapply(seq(along=snpBatches), process1,
2470
-                             snpBatches=snpBatches,
2471
-                             autosomeIndex=autosomeIndex, XIndex=XIndex,
2472
-                             YIndex=YIndex, A=A, B=B,
2473
-                             mixtureParams=mixtureParams, fIndex=fIndex,
2474
-                             mIndex=mIndex, params=params,
2475
-                             cIndexes=cIndexes, SMEDIAN=SMEDIAN,
2476
-                             theKnots=theKnots, DF=DF, probs=probs,
2477
-                             batchSize=ocProbesets())
2478
-  newparams <- vector("list", 3)
2479
-  names(newparams) <- c("centers", "scales", "N")
2480
-  newparams[["centers"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 1))
2481
-  newparams[["scales"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 2))
2482
-  newparams[["N"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 3))
2483
-  rm(newparamsBatch); gc(verbose=FALSE)
2484
-  if(verbose) message("Done.")
2485
-  if(verbose) message("Estimating recalibration parameters.")
2486
-  d <- newparams[["centers"]] - params$centers
2487
-
2488
-  ##regression
2489
-  Index <- intersect(which(pmin(newparams[["N"]][, 1],
2490
-                                newparams[["N"]][, 2],
2491
-                                newparams[["N"]][, 3]) > recallMin &
2492
-                                !apply(regionInfo, 1, any)),
2493
-                                autosomeIndex)
2494
-  if(length(Index) < recallRegMin){
2495
-    warning("Recalibration not possible. Possible cause: small sample size.")
2496
-    newparams <- params
2497
-    dev <- vector("numeric", nrow(newparams[["centers"]]))
2498
-    SS <- matrix(Inf, 3, 3)
2499
-    DD <- 0
2500
-  }else{
2501
-    data4reg <- as.data.frame(newparams[["centers"]][Index,])
2502
-    names(data4reg) <- c("AA", "AB", "BB")
2503
-    regParams <- cbind(  coef(lm(AA~AB*BB, data=data4reg)),
2504
-                       c(coef(lm(AB~AA+BB, data=data4reg)), 0),
2505
-                         coef(lm(BB~AA*AB, data=data4reg)))
2506
-    rownames(regParams) <- c("intercept", "X", "Y", "XY")
2507
-    rm(data4reg)
2508
-
2509
-    minN <- 3
2510
-    newparams[["centers"]][newparams[["N"]] < minN] <- NA
2511
-    Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex)
2512
-    if(verbose) message("Filling out empty centers", appendLF=FALSE)
2513
-    for(i in Index){
2514
-      if(verbose) if(i%%10000==0) message(".", appendLF=FALSE)
2515
-      mu <- newparams[["centers"]][i, ]
2516
-      j <- which(is.na(mu))
2517
-      newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j]
2518
-      rm(mu, j)
2519
-    }
2370
+                     returnParams=FALSE, badSNP=.7, snp.names){
2371
+	pkgname <- getCrlmmAnnotationName(cdfName)
2372
+	stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose))
2373
+	open(SNR)
2374
+	open(A)
2375
+	open(B)
2376
+	open(mixtureParams)
2377
+	## expect objects to be ff
2378
+	keepIndex <- which( SNR[] > SNRMin)
2379
+	if(length(keepIndex)==0) stop("No arrays above quality threshold!")
2380
+	if(is.null(rownames(A))){
2381
+		loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
2382
+		gns <- getVarInEnv("gns", .crlmmPkgEnv)
2383
+		stopifnot(nrow(A) == length(gns))
2384
+		index <- seq(length=nrow(A))
2385
+	}
2386
+	if(!missing(snp.names)){
2387
+		stopifnot(!is.null(rownames(A)))
2388
+		##verify that A has only snps.  otherwise, calling function must pass rownames
2389
+		index <- match(snp.names, rownames(A))
2390
+	}
2391
+	snpBatches <- splitIndicesByLength(index, ocProbesets())
2392
+	NR <- length(unlist(snpBatches))
2393
+	if(verbose) cat("Calling", NR, "SNPs for recalibration... ")
2394
+	NC <- ncol(A)
2395
+	##
2396
+	if(verbose) message("Loading annotations.")
2397
+	obj1 <- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
2398
+	obj2 <- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
2399
+	## this is toget rid of the 'no visible binding' notes
2400
+	## variable definitions
2401
+	XIndex <- getVarInEnv("XIndex")
2402
+	autosomeIndex <- getVarInEnv("autosomeIndex")
2403
+	YIndex <- getVarInEnv("YIndex")
2404
+	SMEDIAN <- getVarInEnv("SMEDIAN")
2405
+	theKnots <- getVarInEnv("theKnots")
2406
+	regionInfo <- getVarInEnv("regionInfo")
2407
+	params <- getVarInEnv("params")
2408
+	rm(list=c(obj1, obj2), envir=.crlmmPkgEnv)
2409
+	rm(obj1, obj2)
2410
+	##
2411
+	## IF gender not provide, we predict
2412
+	## FIXME: XIndex may be greater than ocProbesets()
2413
+	if(is.null(gender)){
2414
+		if(verbose) message("Determining gender.")
2415
+		##    XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2
2416
+		XMedian <- ocLapply(splitIndicesByNode(1:NC), predictGender, theA=A, theB=B, XIndex=XIndex, neededPkgs="crlmm")
2417
+		XMedian <- unlist(XMedian)
2418
+		if(sum(SNR[] > SNRMin)==1){
2419
+			gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5)))
2420
+		}else{
2421
+			gender <- kmeans(XMedian, c(min(XMedian[SNR[]>SNRMin]), max(XMedian[SNR[]>SNRMin])))[["cluster"]]
2422
+		}
2423
+	}
2424
+	##
2425
+	Indexes <- list(autosomeIndex, XIndex, YIndex)
2426
+	cIndexes <- list(keepIndex,
2427
+			 keepIndex[which(gender[keepIndex]==2)],
2428
+			 keepIndex[which(gender[keepIndex]==1)])
2429
+	if(verbose) cat("Calling", NR, "SNPs for recalibration... ")
2430
+	## call C
2431
+	fIndex <- which(gender==2)
2432
+	mIndex <- which(gender==1)
2433
+	## different here
2434
+	## use gtypeCallerR in batches
2435
+	##snpBatches <- splitIndicesByLength(1:nrow(A), ocProbesets())
2436
+	newparamsBatch <- vector("list", length(snpBatches))
2437
+	process1 <- function(idxBatch, snpBatches, autosomeIndex, XIndex,
2438
+			     YIndex, A, B, mixtureParams, fIndex, mIndex,
2439
+			     params, cIndexes, SMEDIAN, theKnots, DF, probs, batchSize){
2440
+		open(A)
2441
+		open(B)
2442
+		open(mixtureParams)
2443
+		snps <- snpBatches[[idxBatch]]
2444
+		rSnps <- range(snps)
2445
+		last <- (idxBatch-1)*batchSize
2446
+		IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
2447
+				     XIndex[XIndex %in% snps]-last,
2448
+				     YIndex[YIndex %in% snps]-last)
2449
+		IndexesBatch <- lapply(IndexesBatch, as.integer)
2450
+		tmpA <- as.matrix(A[snps,])
2451
+		tmpB <- as.matrix(B[snps,])
2452
+		## newparamsBatch[[idxBatch]]
2453
+		tmp <- gtypeCallerR(tmpA, tmpB, fIndex, mIndex,
2454
+				    params[["centers"]][snps,],
2455
+				    params[["scales"]][snps,],
2456
+				    params[["N"]][snps,],
2457
+				    IndexesBatch, cIndexes,
2458
+				    sapply(IndexesBatch, length),
2459
+				    sapply(cIndexes, length), SMEDIAN,
2460
+				    theKnots, mixtureParams[], DF, probs, 0.025)
2461
+		rm(snps, rSnps, IndexesBatch, tmpA, tmpB, last)
2462
+		gc(verbose=FALSE)
2463
+		close(A)
2464
+		close(B)
2465
+		close(mixtureParams)
2466
+		tmp
2467
+	}
2468
+	##
2469
+	newparamsBatch <- ocLapply(seq(along=snpBatches), process1,
2470
+				   snpBatches=snpBatches,
2471
+				   autosomeIndex=autosomeIndex, XIndex=XIndex,
2472
+				   YIndex=YIndex, A=A, B=B,
2473
+				   mixtureParams=mixtureParams, fIndex=fIndex,
2474
+				   mIndex=mIndex, params=params,
2475
+				   cIndexes=cIndexes, SMEDIAN=SMEDIAN,
2476
+				   theKnots=theKnots, DF=DF, probs=probs,
2477
+				   batchSize=ocProbesets())
2478
+	newparams <- vector("list", 3)
2479
+	names(newparams) <- c("centers", "scales", "N")
2480
+	newparams[["centers"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 1))
2481
+	newparams[["scales"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 2))
2482
+	newparams[["N"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 3))
2483
+	rm(newparamsBatch); gc(verbose=FALSE)
2484
+	if(verbose) message("Done.")
2485
+	if(verbose) message("Estimating recalibration parameters.")
2486
+	d <- newparams[["centers"]] - params$centers
2487
+	##
2488
+	##regression
2489
+	Index <- intersect(which(pmin(newparams[["N"]][, 1],
2490
+				      newparams[["N"]][, 2],
2491
+				      newparams[["N"]][, 3]) > recallMin &
2492
+				 !apply(regionInfo, 1, any)),
2493
+			   autosomeIndex)
2494
+	if(length(Index) < recallRegMin){
2495
+		warning("Recalibration not possible. Possible cause: small sample size.")
2496
+		newparams <- params
2497
+		dev <- vector("numeric", nrow(newparams[["centers"]]))
2498
+		SS <- matrix(Inf, 3, 3)
2499
+		DD <- 0
2500
+	}else{
2501
+		data4reg <- as.data.frame(newparams[["centers"]][Index,])
2502
+		names(data4reg) <- c("AA", "AB", "BB")
2503
+		regParams <- cbind(  coef(lm(AA~AB*BB, data=data4reg)),
2504
+				   c(coef(lm(AB~AA+BB, data=data4reg)), 0),
2505
+				   coef(lm(BB~AA*AB, data=data4reg)))
2506
+		rownames(regParams) <- c("intercept", "X", "Y", "XY")
2507
+		rm(data4reg)
2508
+		##
2509
+		minN <- 3
2510
+		newparams[["centers"]][newparams[["N"]] < minN] <- NA
2511
+		Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex)
2512
+		if(verbose) message("Filling out empty centers", appendLF=FALSE)
2513
+		for(i in Index){
2514
+			if(verbose) if(i%%10000==0) message(".", appendLF=FALSE)
2515
+			mu <- newparams[["centers"]][i, ]
2516
+			j <- which(is.na(mu))
2517
+			newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j]
2518
+			rm(mu, j)
2519
+		}
2520
+		##
2521
+		##remaing NAs are made like originals
2522
+		if(length(YIndex)>0){
2523
+			noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex),
2524
+					     YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)])
2525
+		}
2526
+		snps2ignore <- which(rowSums(is.na(newparams[["centers"]])) > 0)
2527
+		snps2keep <- setdiff(autosomeIndex, snps2ignore)
2528
+		rm(snps2ignore)
2529
+		newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])]
2530
+		if(verbose) cat("\n")
2531
+		##
2532
+		if(verbose) message("Calculating and standardizing size of shift... ", appendLF=FALSE)
2533
+		GG <- DD <- newparams[["centers"]] - params[["centers"]]
2534
+		DD <- sweep(DD, 2, colMeans(DD[autosomeIndex, ]))
2535
+		SS <- cov(DD[autosomeIndex, ])
2536
+		SSI <- solve(SS)
2537
+		dev <- vector("numeric", nrow(DD))
2538
+		if(length(YIndex)){
2539
+			dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x)
2540
+			dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex])
2541
+			##Now Y (only two params)
2542
+			SSY <- SS[c(1, 3), c(1, 3)]
2543
+			SSI <- solve(SSY)
2544
+			dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x)
2545
+			dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex])
2546
+		} else {
2547
+			dev=apply(DD,1,function(x) x%*%SSI%*%x)
2548
+			dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev)
2549
+		}
2550
+	}
2551
+	if (verbose) message("OK")
2552
+	##
2553
+	## BC: must keep SD
2554
+	params[-2] <- newparams[-2]
2555
+	rm(newparams)
2556
+	gc(verbose=FALSE)
2557
+	##
2558
+	if(verbose) message("Calling ", NR, " SNPs... ", appendLF=FALSE)
2559
+	##
2560
+	## ###################
2561
+	## ## MOVE TO C#######
2562
+	##
2563
+	## running in batches
2564
+	process2 <- function(idxBatch, snpBatches, autosomeIndex, XIndex,
2565
+			     YIndex, A, B, mixtureParams, fIndex, mIndex,
2566
+			     params, cIndexes, SMEDIAN, theKnots, DF, probs,
2567
+			     regionInfo, batchSize){
2568
+		open(A)
2569
+		open(B)
2570
+		open(mixtureParams)
2571
+		snps <- snpBatches[[idxBatch]]
2572
+		tmpA <- as.matrix(A[snps,])
2573
+		tmpB <- as.matrix(B[snps,])
2574
+		rSnps <- range(snps)
2575
+		last <- (idxBatch-1)*batchSize
2576
+		IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
2577
+				     XIndex[XIndex %in% snps]-last,
2578
+				     YIndex[YIndex %in% snps]-last)
2579
+		IndexesBatch <- lapply(IndexesBatch, as.integer)
2580
+		ImNull <- gtypeCallerR2(tmpA, tmpB, fIndex, mIndex,
2581
+					params[["centers"]][snps,],
2582
+					params[["scales"]][snps,],
2583
+					params[["N"]][snps,],
2584
+					IndexesBatch, cIndexes,
2585
+					sapply(IndexesBatch, length),
2586
+					sapply(cIndexes, length),
2587
+					SMEDIAN, theKnots, mixtureParams[],
2588
+					DF, probs, 0.025,
2589
+					which(regionInfo[snps, 2]),
2590
+					which(regionInfo[snps, 1]))
2591
+		A[snps,] <- tmpA
2592
+		B[snps,] <- tmpB
2593
+		rm(tmpA, tmpB, snps, rSnps, IndexesBatch, ImNull, last)
2594
+		gc(verbose=FALSE)
2595
+		close(A)
2596
+		close(B)
2597
+		close(mixtureParams)
2598
+	}
2599
+	##
2600
+	ocLapply(seq(along=snpBatches), process2, snpBatches=snpBatches,
2601
+		 autosomeIndex=autosomeIndex, XIndex=XIndex, YIndex=YIndex,
2602
+		 A=A, B=B, mixtureParams=mixtureParams, fIndex=fIndex,
2603
+		 mIndex=mIndex, params=params, cIndexes=cIndexes,
2604
+		 SMEDIAN=SMEDIAN, theKnots=theKnots, DF=DF, probs=probs,
2605
+		 regionInfo=regionInfo, batchSize=ocProbesets())
2606
+	##  END MOVE TO C#######
2607
+	## ##################
2608
+	##
2609
+	dev <- dev/(dev+1/383)
2610
+	if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names}
2611
+	if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names}
2612
+	##
2613
+	if(length(Index) >= recallRegMin){
2614
+		tmp4batchQC <- DD[autosomeIndex,]/(params[["N"]][autosomeIndex,]+1)
2615
+		tmpSnpQc <- dev[autosomeIndex]
2616
+		SS <- cov(tmp4batchQC[tmpSnpQc < badSNP,])
2617
+		batchQC <- mean(diag(SS))
2618
+	}else{
2619
+		SS <- matrix(0, 3, 3)
2620
+		batchQC <- Inf
2621
+	}
2622
+	##
2623
+	if(verbose) message("Done.")
2624
+	if (returnParams){
2625
+		return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, params=params, DD=DD, covDD=SS, gender=gender, pkgname=pkgname))
2626
+	}else{
2627
+		return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, DD=DD, covDD=SS, gender=gender, pkgname=pkgname))
2628
+	}
2629
+}
2520 2630
 
2521
-    ##remaing NAs are made like originals
2522
-    if(length(YIndex)>0){
2523
-      noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex),
2524
-                           YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)])
2631
+crlmm2.2 <- function(filenames, row.names=TRUE, col.names=TRUE,
2632
+                   probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
2633
+                   save.it=FALSE, load.it=FALSE, intensityFile,
2634
+                   mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
2635
+                   cdfName, sns, recallMin=10, recallRegMin=1000,
2636
+                   returnParams=FALSE, badSNP=.7){
2637
+  if ((load.it || save.it) && missing(intensityFile))
2638
+    stop("'intensityFile' is missing, and you chose either load.it or save.it")
2639
+  if (missing(sns)) sns <- basename(filenames)
2640
+  if (!missing(intensityFile))
2641
+    if (load.it & !file.exists(intensityFile)){
2642
+      load.it <- FALSE
2643
+      message("File ", intensityFile, " does not exist.")
2644
+      message("Not loading it, but running SNPRMA from scratch.")
2525 2645
     }
2526
-    snps2ignore <- which(rowSums(is.na(newparams[["centers"]])) > 0)
2527
-    snps2keep <- setdiff(autosomeIndex, snps2ignore)
2528
-    rm(snps2ignore)
2529
-    newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])]
2530
-    if(verbose) cat("\n")
2531
-
2532
-    if(verbose) message("Calculating and standardizing size of shift... ", appendLF=FALSE)
2533
-    GG <- DD <- newparams[["centers"]] - params[["centers"]]
2534
-    DD <- sweep(DD, 2, colMeans(DD[autosomeIndex, ]))
2535
-    SS <- cov(DD[autosomeIndex, ])
2536
-    SSI <- solve(SS)
2537
-    dev <- vector("numeric", nrow(DD))
2538
-    if(length(YIndex)){
2539
-      dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x)
2540
-      dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex])
2541
-      ##Now Y (only two params)
2542
-      SSY <- SS[c(1, 3), c(1, 3)]
2543
-      SSI <- solve(SSY)
2544
-      dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x)
2545
-      dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex])
2546
-    } else {
2547
-      dev=apply(DD,1,function(x) x%*%SSI%*%x)
2548
-      dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev)
2646
+  if (!load.it){
2647
+    res <- snprma2(filenames, fitMixture=TRUE,
2648
+                   mixtureSampleSize=mixtureSampleSize, verbose=verbose,
2649
+                   eps=eps, cdfName=cdfName, sns=sns)
2650
+    open(res[["A"]])
2651
+    open(res[["B"]])
2652
+    open(res[["SNR"]])
2653
+    open(res[["mixtureParams"]])
2654
+    if(save.it){
2655
+      t0 <- proc.time()
2656
+      save(res, file=intensityFile)
2657
+      t0 <- proc.time()-t0
2658
+      if (verbose) message("Used ", t0[3], " seconds to save ", intensityFile, ".")
2549 2659
     }
2550
-  }
2551
-  if (verbose) message("OK")
2552
-
2553
-  ## BC: must keep SD
2554
-  params[-2] <- newparams[-2]
2555
-  rm(newparams)
2556
-  gc(verbose=FALSE)
2557
-
2558
-  if(verbose) message("Calling ", NR, " SNPs... ", appendLF=FALSE)
2559
-
2560
-  ## ###################
2561
-  ## ## MOVE TO C#######
2562
-
2563
-  ## running in batches
2564
-  process2 <- function(idxBatch, snpBatches, autosomeIndex, XIndex,
2565
-                       YIndex, A, B, mixtureParams, fIndex, mIndex,
2566
-                       params, cIndexes, SMEDIAN, theKnots, DF, probs,
2567
-                       regionInfo, batchSize){
2568
-    open(A)
2569
-    open(B)
2570
-    open(mixtureParams)
2571
-    snps <- snpBatches[[idxBatch]]
2572
-    tmpA <- as.matrix(A[snps,])
2573
-    tmpB <- as.matrix(B[snps,])
2574
-    rSnps <- range(snps)
2575
-    last <- (idxBatch-1)*batchSize
2576
-    IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
2577
-                         XIndex[XIndex %in% snps]-last,
2578
-                         YIndex[YIndex %in% snps]-last)
2579
-    IndexesBatch <- lapply(IndexesBatch, as.integer)
2580
-    ImNull <- gtypeCallerR2(tmpA, tmpB, fIndex, mIndex,
2581
-                            params[["centers"]][snps,],
2582
-                            params[["scales"]][snps,],
2583
-                            params[["N"]][snps,],
2584
-                            IndexesBatch, cIndexes,
2585
-                            sapply(IndexesBatch, length),
2586
-                            sapply(cIndexes, length),
2587
-                            SMEDIAN, theKnots, mixtureParams[],
2588
-                            DF, probs, 0.025,
2589
-                            which(regionInfo[snps, 2]),
2590
-                            which(regionInfo[snps, 1]))
2591
-    A[snps,] <- tmpA
2592
-    B[snps,] <- tmpB
2593
-    rm(tmpA, tmpB, snps, rSnps, IndexesBatch, ImNull, last)
2594
-    gc(verbose=FALSE)
2595
-    close(A)
2596
-    close(B)
2597
-    close(mixtureParams)
2598
-  }
2599
-
2600
-  ocLapply(seq(along=snpBatches), process2, snpBatches=snpBatches,
2601
-           autosomeIndex=autosomeIndex, XIndex=XIndex, YIndex=YIndex,
2602
-           A=A, B=B, mixtureParams=mixtureParams, fIndex=fIndex,
2603
-           mIndex=mIndex, params=params, cIndexes=cIndexes,
2604
-           SMEDIAN=SMEDIAN, theKnots=theKnots, DF=DF, probs=probs,
2605
-           regionInfo=regionInfo, batchSize=ocProbesets())
2606
-  ##  END MOVE TO C#######
2607
-  ## ##################
2608
-
2609
-  dev <- dev/(dev+1/383)
2610
-  if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names}
2611
-  if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names}
2612
-
2613
-  if(length(Index) >= recallRegMin){
2614
-   tmp4batchQC <- DD[autosomeIndex,]/(params[["N"]][autosomeIndex,]+1)
2615
-   tmpSnpQc <- dev[autosomeIndex]
2616
-   SS <- cov(tmp4batchQC[tmpSnpQc < badSNP,])
2617
-   batchQC <- mean(diag(SS))
2618
-  }else{
2619
-    SS <- matrix(0, 3, 3)
2620
-    batchQC <- Inf
2621
-  }
2622
-
2623
-  if(verbose) message("Done.")
2624
-  if (returnParams){
2625
-    return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, params=params, DD=DD, covDD=SS, gender=gender, pkgname=pkgname))
2626 2660
   }else{
2627
-    return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, DD=DD, covDD=SS, gender=gender, pkgname=pkgname))
2661
+    if (verbose) message("Loading ", intensityFile, ".")
2662
+    obj <- load(intensityFile)
2663
+    if (verbose) message("Done.")
2664
+    if (obj != "res")
2665
+      stop("Object in ", intensityFile, " seems to be invalid.")
2628 2666
   }
2667
+  if(row.names) row.names=res$gns else row.names=NULL
2668
+  if(col.names) col.names=res$sns else col.names=NULL
2669
+  res2 <- rscrlmmGT2(res[["A"]], res[["B"]], res[["SNR"]],
2670
+                   res[["mixtureParams"]], res[["cdfName"]],
2671
+                   gender=gender, row.names=row.names,
2672
+                   col.names=col.names, recallMin=recallMin,
2673
+                   recallRegMin=1000, SNRMin=SNRMin,
2674
+                   returnParams=returnParams, badSNP=badSNP,
2675
+                   verbose=verbose)
2676
+
2677
+  res2[["SNR"]] <- res[["SNR"]]
2678
+  res2[["SKW"]] <- res[["SKW"]]
2679
+  return(list2SnpSet(res2, returnParams=returnParams))
2629 2680
 }
... ...
@@ -1110,12 +1110,119 @@ construct.Illumina <- function(sampleSheet=NULL,
1110 1110
 	protocolData(cnSet) = protocolData
1111 1111
 	featureData(cnSet) = featureData
1112 1112
 	featureNames(cnSet) = featureNames(featureData)
1113
+	cnSet$gender <- initializeBigVector("gender", ncol(cnSet), vmode="integer")
1114
+	cnSet$SNR = initializeBigVector("crlmmSNR-", ncol(cnSet), "double")
1115
+	cnSet$SKW = initializeBigVector("crlmmSKW-", ncol(cnSet), "double")
1116
+	sampleNames(cnSet) <- basename(sampleNames(cnSet))
1113 1117
 	##pd = data.frame(matrix(NA, nc, 3), row.names=sampleNames(cnSet))
1114 1118
 	##colnames(pd)=c("SKW", "SNR", "gender")
1115 1119
 	##phenoData(cnSet) = new("AnnotatedDataFrame", data=pd)
1116 1120
 	return(cnSet)
1117 1121
 }
1118 1122
 
1123
+preprocess <- function(cnSet,
1124
+		       arrayNames=NULL,
1125
+		       ids=NULL,
1126
+		       path=".",
1127
+		       arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
1128
+		       highDensity=TRUE,
1129
+		       sep="_",
1130
+		       fileExt=list(green="Grn.idat", red="Red.idat"),
1131
+		       saveDate=TRUE,
1132
+		       stripNorm=TRUE,
1133
+		       useTarget=TRUE,
1134
+		       mixtureSampleSize=10^5,
1135
+		       fitMixture=TRUE,
1136
+		       eps=0.1,
1137
+		       verbose=TRUE,
1138
+		       seed=1){
1139
+	stopifnot(all(c("gender", "SNR", "SKW") %in% varLabels(cnSet)))
1140
+	open(A(cnSet))
1141
+	open(B(cnSet))
1142
+	sns <- sampleNames(cnSet)
1143
+	pkgname = getCrlmmAnnotationName(annotation(cnSet))
1144
+	is.snp = isSnp(cnSet)
1145
+	snp.index = which(is.snp)
1146
+	narrays = ncol(cnSet)
1147
+	sampleBatches <- splitIndicesByLength(seq(length=ncol(cnSet)), ocSamples())
1148
+	mixtureParams = initializeBigMatrix("crlmmMixt-", 4, narrays, "double")
1149
+	ocLapply(seq_along(sampleBatches),
1150
+		 processIDAT,
1151
+		 sampleBatches=sampleBatches,
1152
+		 sampleSheet=sampleSheet,
1153
+		 arrayNames=arrayNames,
1154
+		 ids=ids,
1155
+		 path=path,
1156
+		 arrayInfoColNames=arrayInfoColNames,
1157
+		 highDensity=highDensity,
1158
+		 sep=sep,
1159
+		 fileExt=fileExt,
1160
+		 saveDate=saveDate,
1161
+		 verbose=verbose,
1162
+		 mixtureSampleSize=mixtureSampleSize,
1163
+		 fitMixture=fitMixture,
1164
+		 eps=eps,
1165
+		 seed=seed,
1166
+		 cdfName=cdfName,
1167
+		 sns=sns,
1168
+		 stripNorm=stripNorm,
1169
+		 useTarget=useTarget,
1170
+		 A=A(cnSet),
1171
+		 B=B(cnSet),
1172
+		 SKW=cnSet$SKW,
1173
+		 SNR=cnSet$SNR,
1174
+		 mixtureParams=mixtureParams,
1175
+		 is.snp=is.snp,
1176
+		 neededPkgs=c("crlmm", pkgname)) # outdir=outdir,
1177
+	return(mixtureParams)
1178
+}
1179
+
1180
+genotypeRS <- function(cnSet, mixtureParams, probs=rep(1/3,3),
1181
+		       SNRMin=5,
1182
+		       recallMin=10,
1183
+		       recallRegMin=1000,
1184
+		       verbose=TRUE,
1185
+		       returnParams=TRUE,
1186
+		       badSNP=0.7,
1187
+		       gender=NULL,
1188
+		       DF=6){
1189
+	is.snp = isSnp(cnSet)
1190
+	snp.index = which(is.snp)
1191
+	narrays = ncol(cnSet)
1192
+	open(A(cnSet))
1193
+	open(B(cnSet))
1194
+	open(snpCall(cnSet))
1195
+	open(snpCallProbability(cnSet))
1196
+	## crlmmGT2 overwrites the normalized intensities with calls and confidenceScores
1197
+	message("Writing to call and callProbability slots")
1198
+	for(j in 1:ncol(cnSet)){
1199
+		snpCall(cnSet)[snp.index, j] <- as.integer(A(cnSet)[snp.index, j])
1200
+		snpCallProbability(cnSet)[snp.index, j] <- as.integer(B(cnSet)[snp.index, j])
1201
+	}
1202
+	message("Writing complete.  Begin genotyping...")
1203
+	close(A(cnSet))
1204
+	close(B(cnSet))
1205
+	tmp <- rscrlmmGT2(A=snpCall(cnSet),
1206
+			  B=snpCallProbability(cnSet),
1207
+			  SNR=cnSet$SNR,
1208
+			  mixtureParams=mixtureParams,
1209
+			  cdfName=cdfName,
1210
+			  col.names=sampleNames(cnSet),
1211
+			  probs=probs,
1212
+			  DF=DF,
1213
+			  SNRMin=SNRMin,
1214
+			  recallMin=recallMin,
1215
+			  recallRegMin=recallRegMin,
1216
+			  gender=gender,
1217
+			  verbose=verbose,
1218
+			  returnParams=returnParams,
1219
+			  badSNP=badSNP,
1220
+			  snp.names=featureNames(cnSet)[snp.index])
1221
+	if(verbose) message("Genotyping finished.  Updating container with genotype calls and confidence scores.")
1222
+	cnSet$gender[,] = tmp$gender
1223
+	TRUE
1224
+}
1225
+
1119 1226
 
1120 1227
 genotype.Illumina <- function(sampleSheet=NULL,
1121 1228
 			      arrayNames=NULL,
... ...
@@ -1144,18 +1251,28 @@ genotype.Illumina <- function(sampleSheet=NULL,
1144 1251
 			      recallRegMin=1000,
1145 1252
 			      gender=NULL,
1146 1253
 			      returnParams=TRUE,
1147
-			      badSNP=0.7) {
1254
+			      badSNP=0.7,
1255
+			      callSet) {
1148 1256
 	is.lds = ifelse(isPackageLoaded("ff"), TRUE, FALSE)
1149 1257
 	stopifnot(is.lds)
1150 1258
 	if(missing(cdfName)) stop("must specify cdfName")
1151 1259
 	if(!isValidCdfName(cdfName)) stop("cdfName not valid.  see validCdfNames")
1152 1260
         pkgname = getCrlmmAnnotationName(cdfName)
1153
-	callSet <- construct.Illumina(sampleSheet=sampleSheet, arrayNames=arrayNames,
1261
+	if(missing(callSet) | !validObject(callSet)){
1262
+		callSet <- construct.Illumina(sampleSheet=sampleSheet, arrayNames=arrayNames,
1154 1263
 				      ids=ids, path=path, arrayInfoColNames=arrayInfoColNames,
1155 1264
 				      highDensity=highDensity, sep=sep, fileExt=fileExt,
1156 1265
 				      cdfName=cdfName, copynumber=copynumber, verbose=verbose, batch=batch, # fns=fns,
1157 1266
 				      saveDate=saveDate) #, outdir=outdir)
1158
-	sampleNames(callSet) <- basename(sampleNames(callSet))
1267
+	}
1268
+	## Basic checks
1269
+	##check that mixtureParams provided
1270
+	if(missing(mixtureParams))
1271
+		stop("preprocess is FALSE but mixtureParams were not provided")
1272
+	if(ncol(callSet) != length(arrayNames))
1273
+		stop("callSet provided but number of samples is not the same as the length of arrayNames")
1274
+	snr <- callSet$SNR[]
1275
+	if(any(is.na(snr))) stop("missing values in callSet$SNR")
1159 1276
 	if(missing(sns)) sns = basename(sampleNames(callSet))
1160 1277
 	open(A(callSet))
1161 1278
 	open(B(callSet))
... ...
@@ -1180,12 +1297,6 @@ genotype.Illumina <- function(sampleSheet=NULL,
1180 1297
 	save(callSet, file=file.path(ldPath(), "callSet.rda"))
1181 1298
 	close(SNR)
1182 1299
 	close(SKW)
1183
-	FUN = ifelse(is.lds, "crlmmGT2", "crlmmGT")
1184
-	crlmmGTfxn = function(FUN,...){
1185
-		switch(FUN,
1186
-		       crlmmGT2=crlmmGT2(...),
1187
-		       crlmmGT=crlmmGT(...))
1188
-	}
1189 1300
 	open(A(callSet))
1190 1301
 	open(B(callSet))
1191 1302
 	tmpA = initializeBigMatrix(name="tmpA", length(snp.index), narrays)
... ...
@@ -1204,27 +1315,25 @@ genotype.Illumina <- function(sampleSheet=NULL,
1204 1315
 	save(SKW, file=file.path(ldPath(), "SKW.rda"))
1205 1316
 	save(mixtureParams, file=file.path(ldPath(), "mixtureParams.rda"))
1206 1317
 	message("Begin genotyping")
1207
-	tmp <- crlmmGTfxn(FUN,
1208
-			  A=tmpA,
1209
-			  B=tmpB,
1210
-			  SNR=SNR,
1211
-			  mixtureParams=mixtureParams,
1212
-			  cdfName=cdfName,
1213
-			  row.names=NULL,
1214
-			  col.names=sampleNames(callSet),
1215
-			  probs=probs,
1216
-			  DF=DF,
1217
-			  SNRMin=SNRMin,
1218
-			  recallMin=recallMin,
1219
-			  recallRegMin=recallRegMin,
1220
-			  gender=gender,
1221
-			  verbose=verbose,
1222
-			  returnParams=returnParams,
1223
-			  badSNP=badSNP)
1318
+	tmp <- crlmmGT2(A=tmpA,
1319
+			B=tmpB,
1320
+			SNR=SNR,
1321
+			mixtureParams=mixtureParams,
1322
+			cdfName=cdfName,
1323
+			row.names=NULL,
1324
+			col.names=sampleNames(callSet),
1325
+			probs=probs,
1326
+			DF=DF,
1327
+			SNRMin=SNRMin,
1328
+			recallMin=recallMin,
1329
+			recallRegMin=recallRegMin,
1330
+			gender=gender,
1331
+			verbose=verbose,
1332
+			returnParams=returnParams,
1333
+			badSNP=badSNP)
1224 1334
 	save(tmp, file=file.path(ldPath(), "tmp.rda"))
1225 1335
 	if(verbose) message("Genotyping finished.  Updating container with genotype calls and confidence scores.")
1226
-	open(tmpA)
1227
-	open(tmpB)
1336
+	open(tmpA); open(tmpB)
1228 1337
 	for(j in 1:ncol(callSet)){
1229 1338
 		snpCall(callSet)[snp.index, j] <- as.integer(tmpA[, j])
1230 1339
 		snpCallProbability(callSet)[snp.index, j] <- as.integer(tmpB[, j])
... ...
@@ -1262,7 +1371,7 @@ processIDAT <- function(stratum, sampleBatches, sampleSheet=NULL,
1262 1371
 	message("Processing sample stratum ", stratum, " of ", length(sampleBatches))
1263 1372
 	sel <- sampleBatches[[stratum]]
1264 1373
         if(length(path)>= length(sel)) path = path[sel]
1265
-	message("RS:... processIDAT:  calling readIdatFiles")
1374
+	##message("RS:... processIDAT:  calling readIdatFiles")
1266 1375
         RG = readIdatFiles(sampleSheet=sampleSheet[sel,], arrayNames=arrayNames[sel],
1267 1376
                        ids=ids, path=path, arrayInfoColNames=arrayInfoColNames,
1268 1377
                        highDensity=highDensity, sep=sep, fileExt=fileExt, saveDate=saveDate, verbose=verbose)
... ...
@@ -1273,7 +1382,7 @@ processIDAT <- function(stratum, sampleBatches, sampleSheet=NULL,
1273 1382
         gc()
1274 1383
         if (missing(sns) || length(sns)!=ncol(XY)) sns = sampleNames(XY)
1275 1384
 
1276
-	message("RS:... processIDAT: calling preprocessInfinium2")
1385
+	##message("RS:... processIDAT: calling preprocessInfinium2")
1277 1386
         res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
1278 1387
                                seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget) #, outdir=outdir)
1279 1388
 #							    save.it=save.it, snpFile=snpFile, cnFile=cnFile)
... ...
@@ -33,7 +33,7 @@ isLoaded <- function(dataset, environ=.crlmmPkgEnv)
33 33
 
34 34
 getVarInEnv <- function(dataset, environ=.crlmmPkgEnv){
35 35
 	if (!isLoaded(dataset, environ=environ))
36
-		stop("Variable ", dataset, " not found in ", environ)
36
+		stop("Variable ", dataset, " not found in supplied environment")
37 37
 	environ[[dataset]]
38 38
 }
39 39