... | ... |
@@ -559,3 +559,8 @@ function (which expects ff objects and supports parallel processing) |
559 | 559 |
|
560 | 560 |
2010-09-30 M. Ritchie 1.7.16 |
561 | 561 |
** copy number A and B intensities now stored in callSet from genotype.Illumina() |
562 |
+ |
|
563 |
+ |
|
564 |
+2010-10-06 M. Ritchie 1.7.18 |
|
565 |
+** new internal function processIDAT which uses ocLapply() to parallelize pre-processing of Illumina data |
|
566 |
+** changes to genotype.Illumina() |
... | ... |
@@ -1,8 +1,8 @@ |
1 | 1 |
Package: crlmm |
2 | 2 |
Type: Package |
3 | 3 |
Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays. |
4 |
-Version: 1.7.18 |
|
5 |
-Date: 2010-09-30 |
|
4 |
+Version: 1.7.19 |
|
5 |
+Date: 2010-10-06 |
|
6 | 6 |
Author: Benilton S Carvalho <carvalho@bclab.org>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au>, Ingo Ruczinski <iruczins@jhsph.edu>, Rafael A Irizarry |
7 | 7 |
Maintainer: Benilton S Carvalho <carvalho@bclab.org>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU> |
8 | 8 |
Description: Faster implementation of CRLMM specific to SNP 5.0 and 6.0 arrays, as well as a copy number tool specific to 5.0, 6.0, and Illumina platforms |
... | ... |
@@ -474,6 +474,13 @@ RGtoXY = function(RG, chipType, verbose=TRUE) { |
474 | 474 |
XY@assayData$Y[1:nsnps,] = 0 |
475 | 475 |
XY@assayData$zero[1:nsnps,] = 0 |
476 | 476 |
|
477 |
+ is.lds = ifelse(isPackageLoaded("ff"), TRUE, FALSE) |
|
478 |
+ |
|
479 |
+ if(is.lds) { |
|
480 |
+ open(RG@assayData$G) |
|
481 |
+ open(RG@assayData$R) |
|
482 |
+ open(RG@assayData$zero) |
|
483 |
+ } |
|
477 | 484 |
|
478 | 485 |
# First sort out Infinium II SNPs, X -> R (allele A) and Y -> G (allele B) from the same probe |
479 | 486 |
XY@assayData$X[!is.na(aord),] = exprs(channel(RG, "R"))[aord[!is.na(aord)],] # mostly red |
... | ... |
@@ -1201,7 +1208,7 @@ construct.Illumina <- function(sampleSheet=NULL, |
1201 | 1208 |
if(!all(file.exists(grnidats))) stop("Missing some of the *Grn.idat files") |
1202 | 1209 |
if(!all(file.exists(redidats))) stop("Missing some of the *Red.idat files") |
1203 | 1210 |
|
1204 |
- if(verbose) message("Initializing container for copy number estimation") |
|
1211 |
+ if(verbose) message("Initializing container for genotyping and copy number estimation") |
|
1205 | 1212 |
featureData <- getFeatureData.Affy(cdfName, copynumber=copynumber) |
1206 | 1213 |
if(!missing(fns)){ |
1207 | 1214 |
index <- match(fns, featureNames(featureData)) |
... | ... |
@@ -1382,3 +1389,253 @@ genotype.Illumina <- function(sampleSheet=NULL, |
1382 | 1389 |
return(callSet) |
1383 | 1390 |
} |
1384 | 1391 |
|
1392 |
+genotype.Illumina2 <- function(sampleSheet=NULL, |
|
1393 |
+ arrayNames=NULL, |
|
1394 |
+ ids=NULL, |
|
1395 |
+ path=".", |
|
1396 |
+ arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"), |
|
1397 |
+ highDensity=FALSE, |
|
1398 |
+ sep="_", |
|
1399 |
+ fileExt=list(green="Grn.idat", |
|
1400 |
+ red="Red.idat"), |
|
1401 |
+ cdfName, |
|
1402 |
+ copynumber=TRUE, |
|
1403 |
+ batch, |
|
1404 |
+ fns, |
|
1405 |
+ saveDate=TRUE, |
|
1406 |
+ stripNorm=TRUE, |
|
1407 |
+ useTarget=TRUE, |
|
1408 |
+ mixtureSampleSize=10^5, |
|
1409 |
+ fitMixture=TRUE, |
|
1410 |
+ eps=0.1, |
|
1411 |
+ verbose=TRUE, |
|
1412 |
+ seed=1, |
|
1413 |
+ sns, |
|
1414 |
+ probs=rep(1/3, 3), |
|
1415 |
+ DF=6, |
|
1416 |
+ SNRMin=5, |
|
1417 |
+ recallMin=10, |
|
1418 |
+ recallRegMin=1000, |
|
1419 |
+ gender=NULL, |
|
1420 |
+ returnParams=TRUE, |
|
1421 |
+ badSNP=0.7) { |
|
1422 |
+ is.lds <- ifelse(isPackageLoaded("ff"), TRUE, FALSE) |
|
1423 |
+ if(missing(cdfName)) stop("must specify cdfName") |
|
1424 |
+ if(!isValidCdfName(cdfName)) stop("cdfName not valid. see validCdfNames") |
|
1425 |
+ callSet <- construct.Illumina(sampleSheet=sampleSheet, arrayNames=arrayNames, |
|
1426 |
+ ids=ids, path=path, arrayInfoColNames=arrayInfoColNames, |
|
1427 |
+ highDensity=highDensity, sep=sep, fileExt=fileExt, |
|
1428 |
+ cdfName=cdfName, copynumber=copynumber, verbose=verbose, batch=batch, |
|
1429 |
+ fns=fns, saveDate=saveDate) |
|
1430 |
+ if(missing(sns)) sns = sampleNames(callSet) |
|
1431 |
+ open(callSet) |
|
1432 |
+ is.snp <- isSnp(callSet) |
|
1433 |
+ snp.index <- which(is.snp) |
|
1434 |
+ narrays = ncol(callSet) |
|
1435 |
+ |
|
1436 |
+ if(is.lds) { |
|
1437 |
+ sampleBatches <- splitIndicesByNode(seq(along=sampleNames(callSet))) |
|
1438 |
+ |
|
1439 |
+ mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, narrays, "double") |
|
1440 |
+ SNR <- initializeBigVector("crlmmSNR-", narrays, "double") |
|
1441 |
+ SKW <- initializeBigVector("crlmmSKW-", narrays, "double") |
|
1442 |
+ |
|
1443 |
+ ocLapply(sampleBatches, processIDAT, sampleSheet=sampleSheet, arrayNames=arrayNames, |
|
1444 |
+ ids=ids, path=path, arrayInfoColNames=arrayInfoColNames, highDensity=highDensity, |
|
1445 |
+ sep=sep, fileExt=fileExt, saveDate=saveDate, verbose=verbose, mixtureSampleSize=mixtureSampleSize, |
|
1446 |
+ fitMixture=fitMixture, eps=eps, seed=seed, cdfName=cdfName, sns=sns, stripNorm=stripNorm, |
|
1447 |
+ useTarget=useTarget, A=A(callSet), B=B(callSet), SKW=SKW, SNR=SNR, |
|
1448 |
+ mixtureParams=mixtureParams, is.snp=is.snp, neededPkgs=c("crlmm", cdfName)) |
|
1449 |
+ |
|
1450 |
+ open(SKW) |
|
1451 |
+ open(SNR) |
|
1452 |
+ pData(callSet)$SKW <- SKW |
|
1453 |
+ pData(callSet)$SNR <- SNR |
|
1454 |
+ close(SNR) |
|
1455 |
+ close(SKW) |
|
1456 |
+ } else { |
|
1457 |
+ mixtureParams <- matrix(NA, 4, nrow(callSet)) |
|
1458 |
+ |
|
1459 |
+ RG = readIdatFiles(sampleSheet=sampleSheet, arrayNames=arrayNames, |
|
1460 |
+ ids=ids, path=path, arrayInfoColNames=arrayInfoColNames, |
|
1461 |
+ highDensity=highDensity, sep=sep, fileExt=fileExt, saveDate=saveDate) |
|
1462 |
+ |
|
1463 |
+ XY = RGtoXY(RG, chipType=cdfName) |
|
1464 |
+ rm(RG); gc() |
|
1465 |
+ |
|
1466 |
+ res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose, |
|
1467 |
+ seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget) |
|
1468 |
+ rm(XY); gc() |
|
1469 |
+ if(verbose) message("Finished preprocessing.") |
|
1470 |
+ np.index <- which(!is.snp) |
|
1471 |
+ A(callSet)[snp.index, ] <- res[["A"]] |
|
1472 |
+ B(callSet)[snp.index, ] <- res[["B"]] |
|
1473 |
+ if(length(np.index)>0) { |
|
1474 |
+ for (j in 1:ncol(callSet)) { |
|
1475 |
+ A(callSet)[np.index, ] <- res[["cnAB"]]$A |
|
1476 |
+ B(callSet)[np.index, ] <- res[["cnAB"]]$B |
|
1477 |
+ } |
|
1478 |
+ } |
|
1479 |
+ SKW = pData(callSet)$SKW <- res[["SKW"]] |
|
1480 |
+ SNR = pData(callSet)$SNR <- res[["SNR"]] |
|
1481 |
+ mixtureParams <- res[["mixtureParams"]] |
|
1482 |
+ rm(res) |
|
1483 |
+ } |
|
1484 |
+ |
|
1485 |
+ FUN <- ifelse(is.lds, "crlmmGT2", "crlmmGT") |
|
1486 |
+ ## genotyping |
|
1487 |
+ crlmmGTfxn <- function(FUN,...){ |
|
1488 |
+ switch(FUN, |
|
1489 |
+ crlmmGT2=crlmmGT2(...), |
|
1490 |
+ crlmmGT=crlmmGT(...)) |
|
1491 |
+ } |
|
1492 |
+ |
|
1493 |
+ if(is.lds) { |
|
1494 |
+ open(A(callSet)) |
|
1495 |
+ open(B(callSet)) |
|
1496 |
+ tmpA = initializeBigMatrix(name="A", length(snp.index), narrays) |
|
1497 |
+ tmpB = initializeBigMatrix(name="B", length(snp.index), narrays) |
|
1498 |
+ bb = ocProbesets()*length(sns)*8 |
|
1499 |
+ ffrowapply(tmpA[i1:i2, ] <- A(callSet)[snp.index,][i1:i2, ], X=A(callSet)[snp.index,], BATCHBYTES=bb) |
|
1500 |
+ ffrowapply(tmpB[i1:i2, ] <- B(callSet)[snp.index,][i1:i2, ], X=B(callSet)[snp.index,], BATCHBYTES=bb) |
|
1501 |
+ close(tmpA) |
|
1502 |
+ close(tmpB) |
|
1503 |
+ } else { |
|
1504 |
+ tmpA = A(callSet)[snp.index,] |
|
1505 |
+ tmpB = B(callSet)[snp.index,] |
|
1506 |
+ } |
|
1507 |
+ |
|
1508 |
+ tmp <- crlmmGTfxn(FUN, |
|
1509 |
+ A=tmpA, |
|
1510 |
+ B=tmpB, |
|
1511 |
+ SNR=SNR, |
|
1512 |
+ mixtureParams=mixtureParams, |
|
1513 |
+ cdfName=cdfName, |
|
1514 |
+ row.names=NULL, |
|
1515 |
+ col.names=sampleNames(callSet), |
|
1516 |
+ probs=probs, |
|
1517 |
+ DF=DF, |
|
1518 |
+ SNRMin=SNRMin, |
|
1519 |
+ recallMin=recallMin, |
|
1520 |
+ recallRegMin=recallRegMin, |
|
1521 |
+ gender=gender, |
|
1522 |
+ verbose=verbose, |
|
1523 |
+ returnParams=returnParams, |
|
1524 |
+ badSNP=badSNP) |
|
1525 |
+# rm(res); gc() |
|
1526 |
+ if(verbose) message("Genotyping finished. Updating container with genotype calls and confidence scores.") |
|
1527 |
+ if(is.lds){ |
|
1528 |
+ bb = ocProbesets()*ncol(callSet)*8 |
|
1529 |
+ open(tmp[["calls"]]) |
|
1530 |
+ open(tmp[["confs"]]) |
|
1531 |
+ ffrowapply(snpCall(callSet)[i1:i2, ] <- tmp[["calls"]][i1:i2, ], X=tmp[["calls"]], BATCHBYTES=bb) |
|
1532 |
+ ffrowapply(snpCallProbability(callSet)[i1:i2, ] <- tmp[["confs"]][i1:i2, ], X=tmp[["confs"]], BATCHBYTES=bb) |
|
1533 |
+# close(tmp[["calls"]]) |
|
1534 |
+# close(tmp[["confs"]]) |
|
1535 |
+# open(tmpA); open(tmpB) |
|
1536 |
+# delete(tmpA); delete(tmpB); |
|
1537 |
+ delete(tmp[["calls"]]); delete(tmp[["confs"]]) |
|
1538 |
+ rm(tmpA, tmpB) |
|
1539 |
+ } else { |
|
1540 |
+ calls(callSet)[snp.index, ] <- tmp[["calls"]] |
|
1541 |
+ snpCallProbability(callSet)[snp.index, ] <- tmp[["confs"]] |
|
1542 |
+ rm(tmpA, tmpB) |
|
1543 |
+ } |
|
1544 |
+ callSet$gender <- tmp$gender |
|
1545 |
+ rm(tmp) |
|
1546 |
+ close(callSet) |
|
1547 |
+ return(callSet) |
|
1548 |
+} |
|
1549 |
+ |
|
1550 |
+ |
|
1551 |
+processIDAT = function(sel, sampleSheet=NULL, |
|
1552 |
+ arrayNames=NULL, |
|
1553 |
+ ids=NULL, |
|
1554 |
+ path=".", |
|
1555 |
+ arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"), |
|
1556 |
+ highDensity=FALSE, |
|
1557 |
+ sep="_", |
|
1558 |
+ fileExt=list(green="Grn.idat", red="Red.idat"), |
|
1559 |
+ saveDate=FALSE, |
|
1560 |
+ verbose=TRUE, |
|
1561 |
+ mixtureSampleSize=10^5, |
|
1562 |
+ fitMixture=TRUE, |
|
1563 |
+ eps=0.1, |
|
1564 |
+ seed=1, |
|
1565 |
+ cdfName, |
|
1566 |
+ sns, |
|
1567 |
+ stripNorm=TRUE, |
|
1568 |
+ useTarget=TRUE, |
|
1569 |
+ A, B, SKW, SNR, mixtureParams, is.snp) { |
|
1570 |
+# is.lds <- ifelse(isPackageLoaded("ff"), TRUE, FALSE) |
|
1571 |
+ |
|
1572 |
+ if(length(path)>= length(sel)) path = path[sel] |
|
1573 |
+ RG = readIdatFiles(sampleSheet=sampleSheet[sel,], arrayNames=arrayNames[sel], |
|
1574 |
+ ids=ids, path=path, arrayInfoColNames=arrayInfoColNames, |
|
1575 |
+ highDensity=highDensity, sep=sep, fileExt=fileExt, saveDate=saveDate) |
|
1576 |
+ |
|
1577 |
+ XY = RGtoXY(RG, chipType=cdfName) |
|
1578 |
+ # if(is.lds) { |
|
1579 |
+ open(RG@assayData$R); open(RG@assayData$G); open(RG@assayData$zero) |
|
1580 |
+ delete(RG@assayData$R); delete(RG@assayData$G); delete(RG@assayData$zero); rm(RG) |
|
1581 |
+# } else { |
|
1582 |
+# rm(RG) |
|
1583 |
+# } |
|
1584 |
+ gc() |
|
1585 |
+ if (missing(sns)) { sns = sampleNames(XY) #subsns = sampleNames(XY) |
|
1586 |
+ } # else subsns = sns[j] |
|
1587 |
+ |
|
1588 |
+ res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose, |
|
1589 |
+ seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget) |
|
1590 |
+ # save.it=save.it, snpFile=snpFile, cnFile=cnFile) |
|
1591 |
+# if(is.lds) { |
|
1592 |
+ open(XY@assayData$X); open(XY@assayData$Y); open(XY@assayData$zero) |
|
1593 |
+ delete(XY@assayData$X); delete(XY@assayData$Y); delete(XY@assayData$zero); rm(XY) |
|
1594 |
+# } else { |
|
1595 |
+# rm(XY) |
|
1596 |
+# } |
|
1597 |
+ gc() |
|
1598 |
+ if(verbose) message("Finished preprocessing.") |
|
1599 |
+ snp.index = which(is.snp) |
|
1600 |
+ np.index <- which(!is.snp) |
|
1601 |
+# if(is.lds){ |
|
1602 |
+ open(res[["A"]]) |
|
1603 |
+ open(res[["B"]]) |
|
1604 |
+ open(res[["SKW"]]) |
|
1605 |
+ open(res[["SNR"]]) |
|
1606 |
+ open(res[["mixtureParams"]]) |
|
1607 |
+ bb = ocProbesets()*length(sns)*8 |
|
1608 |
+ ffrowapply(A[snp.index,][i1:i2, sel] <- res[["A"]][i1:i2, sel], X=res[["A"]], BATCHBYTES=bb) |
|
1609 |
+ ffrowapply(B[snp.index,][i1:i2, sel] <- res[["B"]][i1:i2, sel], X=res[["B"]], BATCHBYTES=bb) |
|
1610 |
+ if(length(np.index)>0) { |
|
1611 |
+ for (j in sel) { |
|
1612 |
+ A[np.index, j] <- res[["cnAB"]]$A[,j] |
|
1613 |
+ B[np.index, j] <- res[["cnAB"]]$B[,j] |
|
1614 |
+ } |
|
1615 |
+ } |
|
1616 |
+ delete(res[["A"]]); delete(res[["B"]]) |
|
1617 |
+# } else { |
|
1618 |
+# A[snp.index, ] <- res[["A"]] |
|
1619 |
+# B[snp.index, ] <- res[["B"]] |
|
1620 |
+# if(length(np.index)>0) { |
|
1621 |
+# A[np.index, ] <- res[["cnAB"]]$A |
|
1622 |
+# B[np.index, ] <- res[["cnAB"]]$B |
|
1623 |
+# } |
|
1624 |
+# } |
|
1625 |
+# open(res[["SKW"]]) |
|
1626 |
+# open(res[["SNR"]]) |
|
1627 |
+# open(res[["mixtureParams"]]) |
|
1628 |
+ SKW[sel] <- res[["SKW"]][] |
|
1629 |
+ SNR[sel] <- res[["SNR"]][] |
|
1630 |
+ mixtureParams[,sel] <- res[["mixtureParams"]][] |
|
1631 |
+ close(A) |
|
1632 |
+ close(B) |
|
1633 |
+ close(SNR) |
|
1634 |
+ close(SKW) |
|
1635 |
+ close(mixtureParams) |
|
1636 |
+# if(is.lds){ |
|
1637 |
+ delete(res[["SKW"]]); delete(res[["SNR"]]); delete(res[["mixtureParams"]]) |
|
1638 |
+# } |
|
1639 |
+ rm(res) |
|
1640 |
+ TRUE |
|
1641 |
+ } |