Browse code

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@49966 bc3139a8-67e5-0310-9ffc-ced21a209358

unknown authored on 06/10/2010 07:28:28
Showing3 changed files

... ...
@@ -559,3 +559,8 @@ function (which expects ff objects and supports parallel processing)
559 559
 
560 560
 2010-09-30 M. Ritchie 1.7.16
561 561
 ** copy number A and B intensities now stored in callSet from genotype.Illumina()
562
+
563
+
564
+2010-10-06 M. Ritchie 1.7.18
565
+** new internal function processIDAT which uses ocLapply() to parallelize pre-processing of Illumina data
566
+** changes to genotype.Illumina()
... ...
@@ -1,8 +1,8 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays.
4
-Version: 1.7.18
5
-Date: 2010-09-30
4
+Version: 1.7.19
5
+Date: 2010-10-06
6 6
 Author: Benilton S Carvalho <carvalho@bclab.org>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au>, Ingo Ruczinski <iruczins@jhsph.edu>, Rafael A Irizarry
7 7
 Maintainer: Benilton S Carvalho <carvalho@bclab.org>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU>
8 8
 Description: Faster implementation of CRLMM specific to SNP 5.0 and 6.0 arrays, as well as a copy number tool specific to 5.0, 6.0, and Illumina platforms
... ...
@@ -474,6 +474,13 @@ RGtoXY = function(RG, chipType, verbose=TRUE) {
474 474
   XY@assayData$Y[1:nsnps,] = 0
475 475
   XY@assayData$zero[1:nsnps,] = 0
476 476
 
477
+  is.lds = ifelse(isPackageLoaded("ff"), TRUE, FALSE)
478
+  
479
+  if(is.lds) {
480
+    open(RG@assayData$G)
481
+    open(RG@assayData$R)
482
+    open(RG@assayData$zero)
483
+  }
477 484
 
478 485
   # First sort out Infinium II SNPs, X -> R (allele A)  and Y -> G (allele B) from the same probe
479 486
   XY@assayData$X[!is.na(aord),] = exprs(channel(RG, "R"))[aord[!is.na(aord)],] # mostly red
... ...
@@ -1201,7 +1208,7 @@ construct.Illumina <- function(sampleSheet=NULL,
1201 1208
        if(!all(file.exists(grnidats))) stop("Missing some of the *Grn.idat files")
1202 1209
        if(!all(file.exists(redidats))) stop("Missing some of the *Red.idat files")
1203 1210
 
1204
-	if(verbose) message("Initializing container for copy number estimation")
1211
+	if(verbose) message("Initializing container for genotyping and copy number estimation")
1205 1212
 	featureData <- getFeatureData.Affy(cdfName, copynumber=copynumber)
1206 1213
 	if(!missing(fns)){
1207 1214
 		index <- match(fns, featureNames(featureData))
... ...
@@ -1382,3 +1389,253 @@ genotype.Illumina <- function(sampleSheet=NULL,
1382 1389
 	return(callSet)
1383 1390
 }
1384 1391
 
1392
+genotype.Illumina2 <- function(sampleSheet=NULL,
1393
+			  arrayNames=NULL,
1394
+			  ids=NULL,
1395
+			  path=".",
1396
+			  arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
1397
+			  highDensity=FALSE,
1398
+			  sep="_",
1399
+			  fileExt=list(green="Grn.idat",
1400
+			  red="Red.idat"),
1401
+		      	  cdfName,
1402
+		      	  copynumber=TRUE,
1403
+                          batch,
1404
+                          fns,
1405
+                          saveDate=TRUE,
1406
+       			  stripNorm=TRUE,
1407
+			  useTarget=TRUE,
1408
+		          mixtureSampleSize=10^5,
1409
+                          fitMixture=TRUE,
1410
+		          eps=0.1,
1411
+		          verbose=TRUE,
1412
+		          seed=1,
1413
+		          sns,
1414
+		          probs=rep(1/3, 3),
1415
+		          DF=6,
1416
+		          SNRMin=5,
1417
+		          recallMin=10,
1418
+		          recallRegMin=1000,
1419
+		          gender=NULL,
1420
+		          returnParams=TRUE,
1421
+		          badSNP=0.7) {
1422
+	is.lds <- ifelse(isPackageLoaded("ff"), TRUE, FALSE)
1423
+	if(missing(cdfName)) stop("must specify cdfName")
1424
+	if(!isValidCdfName(cdfName)) stop("cdfName not valid.  see validCdfNames")
1425
+	callSet <- construct.Illumina(sampleSheet=sampleSheet, arrayNames=arrayNames,
1426
+			     ids=ids, path=path, arrayInfoColNames=arrayInfoColNames,
1427
+                             highDensity=highDensity, sep=sep, fileExt=fileExt, 
1428
+			     cdfName=cdfName, copynumber=copynumber, verbose=verbose, batch=batch,
1429
+                             fns=fns, saveDate=saveDate)
1430
+        if(missing(sns)) sns = sampleNames(callSet)
1431
+	open(callSet)
1432
+	is.snp <- isSnp(callSet)
1433
+	snp.index <- which(is.snp)
1434
+        narrays = ncol(callSet)
1435
+
1436
+        if(is.lds) {
1437
+          sampleBatches <- splitIndicesByNode(seq(along=sampleNames(callSet)))
1438
+
1439
+          mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, narrays, "double")
1440
+          SNR <- initializeBigVector("crlmmSNR-", narrays, "double")
1441
+          SKW <- initializeBigVector("crlmmSKW-", narrays, "double")
1442
+          
1443
+          ocLapply(sampleBatches, processIDAT, sampleSheet=sampleSheet, arrayNames=arrayNames,
1444
+                 ids=ids, path=path, arrayInfoColNames=arrayInfoColNames, highDensity=highDensity,
1445
+                 sep=sep, fileExt=fileExt, saveDate=saveDate, verbose=verbose, mixtureSampleSize=mixtureSampleSize,
1446
+                 fitMixture=fitMixture, eps=eps, seed=seed, cdfName=cdfName, sns=sns, stripNorm=stripNorm,
1447
+                 useTarget=useTarget, A=A(callSet), B=B(callSet), SKW=SKW, SNR=SNR,
1448
+                 mixtureParams=mixtureParams, is.snp=is.snp, neededPkgs=c("crlmm", cdfName))
1449
+
1450
+          open(SKW)
1451
+          open(SNR)
1452
+          pData(callSet)$SKW <- SKW
1453
+          pData(callSet)$SNR <- SNR
1454
+          close(SNR)
1455
+          close(SKW)
1456
+        } else {
1457
+          mixtureParams <- matrix(NA, 4, nrow(callSet))
1458
+
1459
+          RG = readIdatFiles(sampleSheet=sampleSheet, arrayNames=arrayNames,
1460
+                       ids=ids, path=path, arrayInfoColNames=arrayInfoColNames,
1461
+                       highDensity=highDensity, sep=sep, fileExt=fileExt, saveDate=saveDate)
1462
+
1463
+          XY = RGtoXY(RG, chipType=cdfName)
1464
+          rm(RG); gc()
1465
+        
1466
+          res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
1467
+                               seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget)
1468
+          rm(XY); gc()
1469
+          if(verbose) message("Finished preprocessing.")
1470
+          np.index <- which(!is.snp)        
1471
+          A(callSet)[snp.index, ] <- res[["A"]]
1472
+          B(callSet)[snp.index, ] <- res[["B"]]
1473
+          if(length(np.index)>0) {
1474
+            for (j in 1:ncol(callSet)) {
1475
+        	A(callSet)[np.index, ] <- res[["cnAB"]]$A
1476
+        	B(callSet)[np.index, ] <- res[["cnAB"]]$B
1477
+        	}
1478
+          }
1479
+	  SKW = pData(callSet)$SKW <- res[["SKW"]]
1480
+	  SNR = pData(callSet)$SNR <- res[["SNR"]]
1481
+	  mixtureParams <- res[["mixtureParams"]]
1482
+          rm(res)
1483
+        }
1484
+
1485
+	FUN <- ifelse(is.lds, "crlmmGT2", "crlmmGT")
1486
+	## genotyping
1487
+	crlmmGTfxn <- function(FUN,...){
1488
+		switch(FUN,
1489
+		       crlmmGT2=crlmmGT2(...),
1490
+		       crlmmGT=crlmmGT(...))
1491
+	}
1492
+
1493
+        if(is.lds) {
1494
+          open(A(callSet))
1495
+          open(B(callSet))
1496
+          tmpA = initializeBigMatrix(name="A", length(snp.index), narrays)
1497
+          tmpB = initializeBigMatrix(name="B", length(snp.index), narrays)
1498
+          bb = ocProbesets()*length(sns)*8
1499
+	  ffrowapply(tmpA[i1:i2, ] <- A(callSet)[snp.index,][i1:i2, ], X=A(callSet)[snp.index,], BATCHBYTES=bb)
1500
+	  ffrowapply(tmpB[i1:i2, ] <- B(callSet)[snp.index,][i1:i2, ], X=B(callSet)[snp.index,], BATCHBYTES=bb)
1501
+          close(tmpA)
1502
+          close(tmpB)
1503
+        } else {
1504
+          tmpA = A(callSet)[snp.index,]
1505
+          tmpB = B(callSet)[snp.index,]
1506
+        }
1507
+        
1508
+	tmp <- crlmmGTfxn(FUN,
1509
+			  A=tmpA,
1510
+			  B=tmpB,
1511
+			  SNR=SNR,
1512
+			  mixtureParams=mixtureParams,
1513
+			  cdfName=cdfName,
1514
+			  row.names=NULL,
1515
+			  col.names=sampleNames(callSet),
1516
+			  probs=probs,
1517
+			  DF=DF,
1518
+			  SNRMin=SNRMin,
1519
+			  recallMin=recallMin,
1520
+			  recallRegMin=recallRegMin,
1521
+			  gender=gender,
1522
+			  verbose=verbose,
1523
+			  returnParams=returnParams,
1524
+			  badSNP=badSNP)
1525
+#        rm(res); gc()
1526
+	if(verbose) message("Genotyping finished.  Updating container with genotype calls and confidence scores.")
1527
+	if(is.lds){
1528
+		bb = ocProbesets()*ncol(callSet)*8
1529
+		open(tmp[["calls"]])
1530
+		open(tmp[["confs"]])
1531
+		ffrowapply(snpCall(callSet)[i1:i2, ] <- tmp[["calls"]][i1:i2, ], X=tmp[["calls"]], BATCHBYTES=bb)
1532
+		ffrowapply(snpCallProbability(callSet)[i1:i2, ] <- tmp[["confs"]][i1:i2, ], X=tmp[["confs"]], BATCHBYTES=bb)
1533
+#		close(tmp[["calls"]])
1534
+#		close(tmp[["confs"]])
1535
+#                open(tmpA); open(tmpB)
1536
+#                delete(tmpA); delete(tmpB);
1537
+                delete(tmp[["calls"]]); delete(tmp[["confs"]])
1538
+                rm(tmpA, tmpB)
1539
+	} else {
1540
+		calls(callSet)[snp.index, ] <- tmp[["calls"]]
1541
+		snpCallProbability(callSet)[snp.index, ] <- tmp[["confs"]]
1542
+                rm(tmpA, tmpB)
1543
+	}
1544
+	callSet$gender <- tmp$gender
1545
+        rm(tmp)
1546
+	close(callSet)
1547
+	return(callSet)
1548
+}
1549
+
1550
+
1551
+processIDAT =  function(sel, sampleSheet=NULL,
1552
+			  arrayNames=NULL,
1553
+			  ids=NULL,
1554
+			  path=".",
1555
+			  arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
1556
+			  highDensity=FALSE,
1557
+			  sep="_",
1558
+			  fileExt=list(green="Grn.idat", red="Red.idat"),
1559
+			  saveDate=FALSE, 
1560
+                          verbose=TRUE,
1561
+                          mixtureSampleSize=10^5,
1562
+			  fitMixture=TRUE,
1563
+			  eps=0.1,
1564
+			  seed=1,
1565
+			  cdfName,
1566
+			  sns,
1567
+			  stripNorm=TRUE,
1568
+			  useTarget=TRUE,
1569
+                          A, B, SKW, SNR, mixtureParams, is.snp) {
1570
+# 	is.lds <- ifelse(isPackageLoaded("ff"), TRUE, FALSE)
1571
+
1572
+        if(length(path)>= length(sel)) path = path[sel]
1573
+        RG = readIdatFiles(sampleSheet=sampleSheet[sel,], arrayNames=arrayNames[sel],
1574
+                       ids=ids, path=path, arrayInfoColNames=arrayInfoColNames,
1575
+                       highDensity=highDensity, sep=sep, fileExt=fileExt, saveDate=saveDate)
1576
+
1577
+        XY = RGtoXY(RG, chipType=cdfName)
1578
+ #       if(is.lds) {
1579
+          open(RG@assayData$R); open(RG@assayData$G); open(RG@assayData$zero)
1580
+          delete(RG@assayData$R); delete(RG@assayData$G); delete(RG@assayData$zero); rm(RG)
1581
+#        } else {
1582
+#          rm(RG)
1583
+#        }
1584
+        gc()
1585
+        if (missing(sns)) { sns = sampleNames(XY) #subsns = sampleNames(XY)
1586
+        } # else subsns = sns[j]
1587
+        
1588
+        res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
1589
+                               seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget)
1590
+        #                       save.it=save.it, snpFile=snpFile, cnFile=cnFile)
1591
+#        if(is.lds) {
1592
+          open(XY@assayData$X); open(XY@assayData$Y); open(XY@assayData$zero)
1593
+          delete(XY@assayData$X); delete(XY@assayData$Y); delete(XY@assayData$zero); rm(XY)
1594
+#        } else {
1595
+#          rm(XY)
1596
+#        }
1597
+        gc()
1598
+	if(verbose) message("Finished preprocessing.")
1599
+        snp.index = which(is.snp)
1600
+	np.index <- which(!is.snp)      
1601
+#	if(is.lds){
1602
+                open(res[["A"]])
1603
+                open(res[["B"]])
1604
+                open(res[["SKW"]])
1605
+                open(res[["SNR"]])
1606
+                open(res[["mixtureParams"]])
1607
+		bb = ocProbesets()*length(sns)*8
1608
+		ffrowapply(A[snp.index,][i1:i2, sel] <- res[["A"]][i1:i2, sel], X=res[["A"]], BATCHBYTES=bb)
1609
+		ffrowapply(B[snp.index,][i1:i2, sel] <- res[["B"]][i1:i2, sel], X=res[["B"]], BATCHBYTES=bb)
1610
+        	if(length(np.index)>0) {
1611
+        		for (j in sel) {
1612
+        			A[np.index, j] <- res[["cnAB"]]$A[,j]
1613
+        			B[np.index, j] <- res[["cnAB"]]$B[,j]
1614
+        		}
1615
+                }
1616
+                delete(res[["A"]]); delete(res[["B"]])
1617
+#	} else {
1618
+#		A[snp.index, ] <- res[["A"]]
1619
+#		B[snp.index, ] <- res[["B"]]
1620
+#        	if(length(np.index)>0) {
1621
+#        		A[np.index, ] <- res[["cnAB"]]$A
1622
+#        		B[np.index, ] <- res[["cnAB"]]$B
1623
+#                }
1624
+#	}
1625
+#        open(res[["SKW"]])
1626
+#        open(res[["SNR"]])
1627
+#        open(res[["mixtureParams"]])
1628
+	SKW[sel] <- res[["SKW"]][]
1629
+	SNR[sel] <- res[["SNR"]][]
1630
+	mixtureParams[,sel] <- res[["mixtureParams"]][]
1631
+        close(A)
1632
+        close(B)
1633
+        close(SNR)
1634
+        close(SKW)
1635
+        close(mixtureParams)
1636
+#       	if(is.lds){
1637
+          delete(res[["SKW"]]); delete(res[["SNR"]]); delete(res[["mixtureParams"]])
1638
+#        }
1639
+        rm(res)
1640
+        TRUE
1641
+      }