063b3d14 |
# function below works OK provided all .idat files are in the current working directory
|
cd8b85b6 |
# - could add an option to allow files in Illumina directory structure to be handled
# or to use the optional 'Path' column in sampleSheet
# - there is a lot of header information that is currently discarded - could try and store this somewhere in the resulting NChannelSet
|
450a6d8a |
readIdatFiles = function(sampleSheet=NULL,
|
5fcfed7d |
arrayNames=NULL,
ids=NULL,
path=".",
arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
highDensity=FALSE,
sep="_",
fileExt=list(green="Grn.idat", red="Red.idat"),
|
99202387 |
saveDate=FALSE, verbose=FALSE) {
|
453e688a |
verbose <- FALSE
|
e609f78b |
if(!is.null(arrayNames)) {
pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
}
if(!is.null(sampleSheet)) { # get array info from Illumina's sample sheet
if(is.null(arrayNames)){
if(!is.null(arrayInfoColNames$barcode) && (arrayInfoColNames$barcode %in% colnames(sampleSheet))) {
barcode = sampleSheet[,arrayInfoColNames$barcode]
arrayNames=barcode
}
|
f1e14c0e |
if(!is.null(arrayInfoColNames$position) && (arrayInfoColNames$position %in% colnames(sampleSheet))) {
|
e609f78b |
position = sampleSheet[,arrayInfoColNames$position]
if(is.null(arrayNames))
arrayNames=position
else
arrayNames = paste(arrayNames, position, sep=sep)
if(highDensity) {
hdExt = list(A="R01C01", B="R01C02", C="R02C01", D="R02C02")
for(i in names(hdExt))
arrayNames = sub(paste(sep, i, sep=""), paste(sep, hdExt[[i]], sep=""), arrayNames)
}
}
}
pd = new("AnnotatedDataFrame", data = sampleSheet)
|
063b3d14 |
sampleNames(pd) = make.unique(basename(arrayNames))
|
e609f78b |
}
if(is.null(arrayNames)) {
arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path))
if(!is.null(sampleSheet)) {
sampleSheet=NULL
cat("Could not find required info in \'sampleSheet\' - ignoring. Check \'sampleSheet\' and/or \'arrayInfoColNames\'\n")
}
pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
}
narrays = length(arrayNames)
grnfiles = paste(arrayNames, fileExt$green, sep=sep)
redfiles = paste(arrayNames, fileExt$red, sep=sep)
if(length(grnfiles)==0 || length(redfiles)==0)
stop("Cannot find .idat files")
if(length(grnfiles)!=length(redfiles))
stop("Cannot find matching .idat files")
|
d98bfc16 |
if(path[1] != "." & path[1] != ""){
|
e609f78b |
grnidats = file.path(path, grnfiles)
redidats = file.path(path, redfiles)
} else {
|
d98bfc16 |
message("path arg not set. Assuming files are in local directory, or that complete path is provided")
|
450a6d8a |
grnidats = grnfiles
redidats = redfiles
|
e609f78b |
}
if(!all(file.exists(grnidats))) stop("Missing some of the *Grn.idat files")
|
f1e14c0e |
if(!all(file.exists(redidats))) stop("Missing some of the *Red.idat files")
|
e609f78b |
headerInfo = list(nProbes = rep(NA, narrays),
Barcode = rep(NA, narrays),
ChipType = rep(NA, narrays),
Manifest = rep(NA, narrays), # not sure about this one - sometimes blank
Position = rep(NA, narrays)) # this may also vary a bit
dates = list(decode=rep(NA, narrays),
scan=rep(NA, narrays))
## read in the data
|
d98bfc16 |
for(i in seq_along(arrayNames)) {
|
99202387 |
if(verbose) {
|
453e688a |
## RS
##cat("reading", arrayNames[i], "\t")
cat("reading", basename(arrayNames[i]), "\t")
cat(paste(sep, fileExt$green, sep=""), "\t")
|
99202387 |
}
|
e609f78b |
idsG = idsR = G = R = NULL
|
adde216c |
G = readIDAT(grnidats[i])
|
e609f78b |
idsG = rownames(G$Quants)
headerInfo$nProbes[i] = G$nSNPsRead
headerInfo$Barcode[i] = G$Barcode
headerInfo$ChipType[i] = G$ChipType
headerInfo$Manifest[i] = G$Unknown$MostlyNull
headerInfo$Position[i] = G$Unknowns$MostlyA
|
f7308027 |
if(headerInfo$nProbes[i]>(headerInfo$nProbes[1]+10000) || headerInfo$nProbes[i]<(headerInfo$nProbes[1]-10000)) {
|
e609f78b |
warning("Chips are not of the same type. Skipping ", basename(grnidats[i]), " and ", basename(redidats[i]))
next()
}
dates$decode[i] = G$RunInfo[1, 1]
dates$scan[i] = G$RunInfo[2, 1]
if(i==1) {
if(is.null(ids) && !is.null(G)){
ids = idsG
|
063b3d14 |
} # else stop("Could not find probe IDs")
|
e609f78b |
nprobes = length(ids)
narrays = length(arrayNames)
|
450a6d8a |
RG = new("NChannelSet",
|
99202387 |
R=matrix(NA, nprobes, narrays),
G=matrix(NA, nprobes, narrays),
zero=matrix(NA, nprobes, narrays),
annotation=headerInfo$Manifest[1],
phenoData=pd, storage.mode="environment")
featureNames(RG) = ids
|
e609f78b |
if(!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)){
|
063b3d14 |
sampleNames(RG) = make.unique(sampleSheet$Sample_ID)
} else sampleNames(RG) = make.unique(basename(arrayNames))
|
a1394042 |
gc(verbose=FALSE)
|
e609f78b |
}
if(length(ids)==length(idsG)) {
if(sum(ids==idsG)==nprobes) {
RG@assayData$G[,i] = G$Quants[, "Mean"]
zeroG = G$Quants[, "NBeads"]==0
}
} else {
indG = match(ids, idsG)
RG@assayData$G[,i] = G$Quants[indG, "Mean"]
zeroG = G$Quants[indG, "NBeads"]==0
}
rm(G)
|
a1394042 |
gc(verbose=FALSE)
|
e6e6981f |
if(verbose) {
|
99202387 |
cat(paste(sep, fileExt$red, sep=""), "\n")
}
|
e609f78b |
R = readIDAT(redidats[i])
idsR = rownames(R$Quants)
|
f1e14c0e |
if(length(ids)==length(idsG)) {
|
e609f78b |
if(sum(ids==idsR)==nprobes) {
RG@assayData$R[,i] = R$Quants[ ,"Mean"]
|
99202387 |
zeroR = R$Quants[ ,"NBeads"]==0
|
e609f78b |
}
} else {
indR = match(ids, idsR)
RG@assayData$R[,i] = R$Quants[indR, "Mean"]
zeroR = R$Quants[indR, "NBeads"]==0
}
RG@assayData$zero[,i] = zeroG | zeroR
rm(R, zeroG, zeroR)
|
a1394042 |
gc(verbose=FALSE)
|
e609f78b |
}
if(saveDate) {
protocolData(RG)[["ScanDate"]] = dates$scan
}
storageMode(RG) = "lockedEnvironment"
RG
}
|
64463948 |
## the readIDAT() and readBPM() functions below were provided by Keith Baggerly, 27/8/2008
|
d773ad80 |
## edits provided by Kasper Daniel Hansen, 4/10/2011
|
cd8b85b6 |
readIDAT <- function(idatFile){
fileSize <- file.info(idatFile)$size
tempCon <- file(idatFile,"rb")
prefixCheck <- readChar(tempCon,4)
|
e3c2c3b9 |
# if(prefixCheck != "IDAT"){
# warning("May need to check format of ", idatFile)
# }
|
cd8b85b6 |
|
d773ad80 |
versionNumber <- readBin(tempCon, "integer", n=1, size=8, endian="little")
|
b5d1d7e5 |
if(versionNumber<3)
stop("Older style IDAT files not supported: consider updating your scanner settings")
|
f1e14c0e |
|
d773ad80 |
nFields <- readBin(tempCon, "integer", n=1, size=4, endian="little")
|
cd8b85b6 |
fields <- matrix(0,nFields,3);
colnames(fields) <- c("Field Code", "Byte Offset", "Bytes")
for(i1 in 1:nFields){
|
f1e14c0e |
fields[i1,"Field Code"] <-
|
cd8b85b6 |
readBin(tempCon, "integer", n=1, size=2, endian="little", signed=FALSE)
|
f1e14c0e |
fields[i1,"Byte Offset"] <-
|
82ef3a72 |
readBin(tempCon, "integer", n=1, size=8, endian="little")
|
cd8b85b6 |
}
|
d773ad80 |
knownCodes <-
c("nSNPsRead" = 1000,
"IlluminaID" = 102,
"SD" = 103,
"Mean" = 104,
"NBeads" = 107,
"MidBlock" = 200,
"RunInfo" = 300,
"RedGreen" = 400,
"MostlyNull" = 401,
"Barcode" = 402,
"ChipType" = 403,
"MostlyA" = 404,
"Unknown.1" = 405,
"Unknown.2" = 406,
"Unknown.3" = 407,
"Unknown.4" = 408,
"Unknown.5" = 409,
"Unknown.6" = 410,
"Unknown.7" = 510
|
cd8b85b6 |
)
nNewFields <- 1
rownames(fields) <- paste("Null", 1:nFields)
for(i1 in 1:nFields){
temp <- match(fields[i1,"Field Code"], knownCodes)
if(!is.na(temp)){
rownames(fields)[i1] <- names(knownCodes)[temp]
}else{
rownames(fields)[i1] <- paste("newField", nNewFields, sep=".")
nNewFields <- nNewFields + 1
}
}
|
d773ad80 |
fields <- fields[order(fields[, "Byte Offset"]),]
|
cd8b85b6 |
|
d773ad80 |
seek(tempCon, fields["nSNPsRead", "Byte Offset"])
nSNPsRead <- readBin(tempCon, "integer", n=1, size=4, endian="little")
readBlock <- function(nam) {
switch(nam,
"IlluminaID" = {
seek(tempCon, fields["IlluminaID", "Byte Offset"])
IlluminaID <- readBin(tempCon, "integer", n=nSNPsRead, size=4, endian="little")
IlluminaID
},
"SD" = {
seek(tempCon, fields["SD", "Byte Offset"])
SD <- readBin(tempCon, "integer", n=nSNPsRead, size=2, endian="little", signed=FALSE)
SD
},
"Mean" = {
seek(tempCon, fields["Mean", "Byte Offset"])
Mean <- readBin(tempCon, "integer", n=nSNPsRead, size=2, endian="little", signed=FALSE)
Mean
},
"NBeads" = {
seek(tempCon, fields["NBeads", "Byte Offset"])
NBeads <- readBin(tempCon, "integer", n=nSNPsRead, size=1, signed=FALSE)
NBeads
},
"MidBlock" = {
seek(tempCon, fields["MidBlock", "Byte Offset"])
nMidBlockEntries <- readBin(tempCon, "integer", n=1, size=4, endian="little")
MidBlock <- readBin(tempCon, "integer", n=nMidBlockEntries, size=4,
endian="little")
MidBlock
},
"RunInfo" = {
seek(tempCon, fields["RunInfo", "Byte Offset"])
nRunInfoBlocks <- readBin(tempCon, "integer", n=1, size=4, endian="little")
RunInfo <- matrix(NA, nRunInfoBlocks, 5)
colnames(RunInfo) <- c("RunTime", "BlockType", "BlockPars",
"BlockCode", "CodeVersion")
for(i1 in 1:2) { #nRunInfoBlocks){ ## MR edit
for(i2 in 1:5){
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
RunInfo[i1,i2] <- readChar(tempCon, nChars)
}
}
RunInfo
},
"RedGreen" = {
seek(tempCon, fields["RedGreen", "Byte Offset"])
RedGreen <- readBin(tempCon, "numeric", n=1, size=4,
endian="little")
RedGreen
},
"MostlyNull" = {
seek(tempCon, fields["MostlyNull", "Byte Offset"])
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
MostlyNull <- readChar(tempCon, nChars)
MostlyNull
},
|
af08b5d6 |
"Barcode" = {
|
d773ad80 |
seek(tempCon, fields["Barcode", "Byte Offset"])
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
Barcode <- readChar(tempCon, nChars)
Barcode
},
"ChipType" = {
seek(tempCon, fields["ChipType", "Byte Offset"])
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
ChipType <- readChar(tempCon, nChars)
ChipType
},
"MostlyA" = {
seek(tempCon, fields["MostlyA", "Byte Offset"])
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
MostlyA <- readChar(tempCon, nChars)
},
"Unknown.1" = {
seek(tempCon, fields["Unknown.1", "Byte Offset"])
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
Unknown.1 <- readChar(tempCon, nChars)
Unknown.1
},
"Unknown.2" = {
seek(tempCon, fields["Unknown.2", "Byte Offset"])
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
Unknown.2 <- readChar(tempCon, nChars)
Unknown.2
},
"Unknown.3" = {
seek(tempCon, fields["Unknown.3", "Byte Offset"])
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
Unknown.3 <- readChar(tempCon, nChars)
Unknown.3
},
"Unknown.4" = {
seek(tempCon, fields["Unknown.4", "Byte Offset"])
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
Unknown.4 <- readChar(tempCon, nChars)
Unknown.4
},
"Unknown.5" = {
seek(tempCon, fields["Unknown.5", "Byte Offset"])
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
Unknown.5 <- readChar(tempCon, nChars)
Unknown.5
},
"Unknown.6" = {
seek(tempCon, fields["Unknown.6", "Byte Offset"])
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
Unknown.6 <- readChar(tempCon, nChars)
Unknown.6
},
"Unknown.7" = {
seek(tempCon, fields["Unknown.7", "Byte Offset"])
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
Unknown.7 <- readChar(tempCon, nChars)
Unknown.7
})
|
cd8b85b6 |
}
|
d773ad80 |
readFields <- setdiff(rownames(fields), "nSNPsRead")
names(readFields) <- readFields
|
af08b5d6 |
|
d773ad80 |
allFields <- lapply(readFields, readBlock)
|
cd8b85b6 |
close(tempCon)
|
d773ad80 |
UnknownNames <- c("MostlyNull", "MostlyA", "Unknown.1",
"Unknown.2", "Unknown.3", "Unknown.4",
"Unknown.5", "Unknown.6", "Unknown.7")
Unknowns <- allFields[intersect(names(allFields), UnknownNames)]
|
cd8b85b6 |
|
d773ad80 |
Quants <- cbind(allFields$Mean, allFields$SD, allFields$NBeads)
|
cd8b85b6 |
colnames(Quants) <- c("Mean", "SD", "NBeads")
|
d773ad80 |
rownames(Quants) <- as.character(allFields$IlluminaID)
|
cd8b85b6 |
|
d773ad80 |
InfoNames <- c("MidBlock", "RunInfo", "RedGreen", "Barcode", "ChipType")
Info <- allFields[intersect(names(allFields), InfoNames)]
|
af08b5d6 |
|
f1e14c0e |
idatValues <-
list(fileSize=fileSize,
versionNumber=versionNumber,
nFields=nFields,
|
cd8b85b6 |
fields=fields,
|
f1e14c0e |
nSNPsRead=nSNPsRead,
|
d773ad80 |
Quants=Quants)
idatValues <- c(idatValues, Info, list(Unknowns = Unknowns))
|
cd8b85b6 |
idatValues
}
|
d773ad80 |
|
cd8b85b6 |
readBPM <- function(bpmFile){
## Reads and parses Illumina BPM files
|
f1e14c0e |
|
cd8b85b6 |
fileSize <- file.info(bpmFile)$size
tempCon <- file(bpmFile,"rb")
# The first few bytes of the egtFile are some type of
|
f1e14c0e |
# header, but there's no related byte offset information.
|
cd8b85b6 |
prefixCheck <- readChar(tempCon,3) ## should be "BPM"
null.1 <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
|
f1e14c0e |
## should be 1
versionNumber <-
|
82ef3a72 |
readBin(tempCon, "integer", n=1, size=4, endian="little")
|
cd8b85b6 |
## should be 4
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
chipType <- readChar(tempCon, nChars)
null.2 <- readBin(tempCon, "integer", n=2, size=1, signed=FALSE)
csvLines <- readLines(tempCon, 22)
entriesByteOffset <- seek(tempCon);
nEntries <- readBin(tempCon, "integer", n=1, size=4,
|
82ef3a72 |
endian="little")
|
f1e14c0e |
|
cd8b85b6 |
if(FALSE){
|
f1e14c0e |
|
cd8b85b6 |
snpIndexByteOffset <- seek(tempCon)
snpIndex <- readBin(tempCon, "integer", n=nEntries, size=4,
|
82ef3a72 |
endian="little")
|
cd8b85b6 |
## for the 1M array, these are simply in order from 1 to 1072820.
|
f1e14c0e |
|
cd8b85b6 |
snpNamesByteOffset <- seek(tempCon)
snpNames <- rep("A", nEntries)
for(i1 in 1:nEntries){
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
snpNames[i1] <- readChar(tempCon, nChars)
}
}
seek(tempCon, 15278138)
|
f1e14c0e |
|
cd8b85b6 |
normIDByteOffset <- seek(tempCon)
normID <- readBin(tempCon, "integer", n=nEntries, size=1, signed=FALSE) + 1
|
f1e14c0e |
|
cd8b85b6 |
newBlockByteOffset <- seek(tempCon)
newBlock <- readBin(tempCon, "integer", n=10000, size=1, signed=FALSE)
|
f1e14c0e |
|
cd8b85b6 |
close(tempCon)
byteOffsets <- list(entriesByteOffset=entriesByteOffset,
|
f1e14c0e |
#snpIndexByteOffset=snpIndexByteOffset,
|
cd8b85b6 |
#snpNamesByteOffset=snpNamesByteOffset,
normIDByteOffset=normIDByteOffset,
newBlockByteOffset=newBlockByteOffset)
|
f1e14c0e |
|
cd8b85b6 |
allStuff <- list(prefixCheck=prefixCheck,
null.1=null.1,
versionNumber=versionNumber,
chipType=chipType,
null.2=null.2,
csvLines=csvLines,
nEntries=nEntries,
#snpIndex=snpIndex,
#snpNames=snpNames,
normID=normID,
newBlock=newBlock,
byteOffsets=byteOffsets)
allStuff
|
f1e14c0e |
|
cd8b85b6 |
}
|
5fcfed7d |
|
d3668036 |
readGenCallOutput = function(file, path=".", cdfName,
|
7c0c9ac5 |
colnames=list("SampleID"="Sample ID", "SNPID"="SNP Name", "XRaw"="X Raw", "YRaw"="Y Raw"),
|
d3668036 |
type=list("SampleID"="character", "SNPID"="character", "XRaw"="integer", "YRaw"="integer"), verbose=FALSE) {
if(!identical(names(type), names(colnames)))
stop("The arguments 'colnames' and 'type' must have consistent names")
|
5cdc0c76 |
if(verbose)
|
d3668036 |
cat("Reading", file, "\n")
tmp=readLines(file.path(path,file),n=15)
s=c("\t",",")
|
5cdc0c76 |
a=unlist(strsplit(tmp[10][1],s[1]))
|
d3668036 |
if(length(a)!=1){
sepp=s[1]
|
5cdc0c76 |
a1=unlist(strsplit(tmp[10][1],s[1]))
}
|
d3668036 |
if(length(a)==1){
sepp=s[2]
a1=unlist(strsplit(tmp[10][1],s[2]))
|
5cdc0c76 |
}
|
7c0c9ac5 |
|
d3668036 |
if(sum(is.na(match(colnames,a1)))!=0)
|
7c0c9ac5 |
stop("Cannot find required columns: ", colnames[is.na(match(colnames,a1))], " in ", file, "\nPlease check whether this data was exported.")
|
d3668036 |
m1=m=match(a1,colnames)
m[is.na(m1)==FALSE] =type[m1[!is.na(m1)]]
m[is.na(m)==TRUE] = list(NULL)
names(m) = names(colnames)[m1]
|
5cdc0c76 |
fc = file(file.path(path, file), open="r")
|
7c0c9ac5 |
|
d3668036 |
dat = scan(fc, what=m, skip=10,sep=sepp)
close(fc)
|
7c0c9ac5 |
|
d3668036 |
samples = unique(dat$"SampleID")
nsamples = length(samples)
snps = unique(dat$"SNPID")
nsnps = length(snps)
if(verbose)
cat("Check ordering for samples","\n")
|
7c0c9ac5 |
|
d3668036 |
X = Y = zeroes = matrix(0, nsnps, nsamples)
|
7c0c9ac5 |
|
d3668036 |
for(i in 1:length(samples)) {
ind = dat$"SampleID"==samples[i]
if(sum(dat$"SNPID"[ind]==snps)==nsnps) {
# if(verbose)
# cat(paste("Correct ordering for sample", samples[i], "\n"))
X[,i] = dat$"XRaw"[ind]
Y[,i] = dat$"YRaw"[ind]
}
if(sum(dat$"SNPID"[ind]==snps)!=nsnps) {
if(verbose)
cat("Reordering sample ", samples[i],"\n")
m=match(snps,dat$"SNPID"[ind])
X[,i]= dat$"XRaw"[ind][m]
Y[,i]= dat$"YRaw"[ind][m]
}
|
a1394042 |
gc(verbose=FALSE)
|
d3668036 |
}
|
7c0c9ac5 |
|
5cdc0c76 |
zeroes=(X=="0"|Y=="0")
|
d3668036 |
colnames(X) = colnames(Y) = colnames(zeroes) = samples
rownames(X) = rownames(Y) = snps
|
7c0c9ac5 |
if(verbose)
|
d3668036 |
cat("Creating NChannelSet object\n")
|
7c0c9ac5 |
|
5cdc0c76 |
XY = new("NChannelSet", X = initializeBigMatrix(name = "X", nr = nrow(X), nc = ncol(X), vmode = "integer", initdata=X),
Y = initializeBigMatrix(name = "Y", nr = nrow(X), nc = ncol(X), vmode = "integer", initdata=Y),
zero = initializeBigMatrix(name = "zero", nr = nrow(X), nc = ncol(X), vmode = "integer", initdata=zeroes),
annotation = cdfName, storage.mode = "environment")
|
d3668036 |
sampleNames(XY)=colnames(X)
|
7c0c9ac5 |
|
5cdc0c76 |
if(verbose)
cat("Done\n")
|
7c0c9ac5 |
|
5cdc0c76 |
XY
}
|
9a4a5148 |
RGtoXY = function(RG, chipType, verbose=TRUE) {
|
453e688a |
needToLoad <- !all(sapply(c('addressA', 'addressB', 'base'), isLoaded))
if(needToLoad){
|
01aea5e1 |
chipList = c("human1mv1c",# 1M
|
453e688a |
"human370v1c", # 370CNV
"human650v3a", # 650Y
"human610quadv1b", # 610 quad
"human660quadv1a", # 660 quad
"human370quadv3c", # 370CNV quad
"human550v3b", # 550K
"human1mduov3b", # 1M Duo
"humanomni1quadv1b", # Omni1 quad
"humanomni25quadv1b", # Omni2.5 quad
|
063b3d14 |
"humanomni258v1a", # Omni2.5 8 sample
|
453e688a |
"humanomniexpress12v1b", # Omni express 12
|
01aea5e1 |
"humanimmuno12v1b", # Immuno chip 12
"humancytosnp12v2p1h") # CytoSNP 12
|
063b3d14 |
## RS: added cleancdfname()
|
453e688a |
if(missing(chipType)){
|
063b3d14 |
chipType = match.arg(cleancdfname(annotation(RG)), chipList)
} else chipType = match.arg(cleancdfname(chipType), chipList)
|
453e688a |
pkgname = getCrlmmAnnotationName(chipType)
if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
suggCall = paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
msg = paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
message(strwrap(msg))
stop("Package ", pkgname, " could not be found.")
rm(suggCall, msg)
}
if(verbose) message("Loading chip annotation information.")
loader("address.rda", .crlmmPkgEnv, pkgname)
|
e609f78b |
}
|
450a6d8a |
aids = getVarInEnv("addressA") # comes from AddressA_ID or Address column in manifest
bids = getVarInEnv("addressB") # comes from AddressB_ID or Address2 column in manifest
snpbase = getVarInEnv("base")
|
453e688a |
## loader(‘file.rda’)
## x = getVarInEnv(‘x’)
## y = getVarInEnv(‘y’)
##
## I’d consider using something like:
##
## needToLoad = !all(sapply(c(‘x’, ‘y’), isLoaded))
## if (needToLoad){
## loader(‘file.rda’)
## x = getVarInEnv(‘x’)
## y = getVarInEnv(‘y’)
## }
ids = names(aids)
|
e609f78b |
nsnps = length(aids)
narrays = ncol(RG)
# aidcol = match("AddressA_ID", colnames(annot))
# if(is.na(aidcol))
# aidcol = match("Address", colnames(annot))
# bidcol = match("AddressB_ID", colnames(annot))
|
f1e14c0e |
# if(is.na(bidcol))
|
e609f78b |
# bidcol = match("Address2", colnames(annot))
# aids = annot[, aidcol]
# bids = annot[, bidcol]
# snpids = annot[,"Name"]
|
f1e14c0e |
# snpbase = annot[,"SNP"]
infI = !is.na(bids) & bids!=0
|
e609f78b |
aord = match(aids, featureNames(RG)) # NAs are possible here
bord = match(bids, featureNames(RG)) # and here
# argrg = aids[rrgg]
# brgrg = bids[rrgg]
|
453e688a |
XY <- new("NChannelSet",
X=matrix(0, nsnps, narrays),
Y=matrix(0, nsnps, narrays),
zero=matrix(0, nsnps, narrays),
annotation=chipType, phenoData=RG@phenoData,
protocolData=RG@protocolData, storage.mode="environment")
|
3f404581 |
featureNames(XY) = ids
|
e972ec4a |
sampleNames(XY) = sampleNames(RG)
|
453e688a |
## RS
rm(list=c("bord", "infI", "aids", "bids", "ids", "snpbase"))
|
a1394042 |
# print(gc())
gc(verbose=FALSE)
|
a11efc9c |
# Need to initialize - matrices filled with NAs to begin with
|
137e5619 |
# XY@assayData$X[1:nsnps,] = 0
# XY@assayData$Y[1:nsnps,] = 0
# XY@assayData$zero[1:nsnps,] = 0
|
f1e14c0e |
|
e609f78b |
# First sort out Infinium II SNPs, X -> R (allele A) and Y -> G (allele B) from the same probe
|
453e688a |
## RS added
not.na <- !is.na(aord)
not.na.aord <- aord[not.na]
## RS substitution !is.na(aord) -> not.na
##r <- as.matrix(exprs(channel(RG, "R"))[not.na.aord,]) # mostly red
r <- as.matrix(assayData(RG)[["R"]][not.na.aord, ])
XY@assayData$X[not.na,] <- r
|
a1394042 |
rm(r);gc(verbose=FALSE)
|
453e688a |
g <- as.matrix(assayData(RG)[["G"]][not.na.aord,]) # mostly green
XY@assayData$Y[not.na,] <- g
|
a1394042 |
rm(g); gc(verbose=FALSE)
|
453e688a |
##z <- as.matrix(exprs(channel(RG, "zero"))[not.na.aord,]) # mostly green
z <- as.matrix(assayData(RG)[["zero"]][not.na.aord, ])
XY@assayData$zero[not.na,] <- z
|
a1394042 |
rm(z); gc(verbose=FALSE)
|
453e688a |
##RS added
rm(RG)
|
a1394042 |
# print(gc())
gc(verbose=FALSE)
|
e609f78b |
## Warning - not 100% sure that the code below is correct - could be more complicated than this
# infIRR = infI & (snpbase=="[A/T]" | snpbase=="[T/A]" | snpbase=="[a/t]" | snpbase=="[t/a]")
|
f1e14c0e |
|
e609f78b |
# X[infIRR,] = exprs(channel(RG, "R"))[aord[infIRR],] # mostly red
# Y[infIRR,] = exprs(channel(RG, "R"))[bord[infIRR],] # mostly green
|
f1e14c0e |
|
e609f78b |
# Finally Infinium I where X -> G from allele A probe and Y -> G from allele B probe
# infIGG = infI & (snpbase=="[C/G]" | snpbase=="[G/C]" | snpbase=="[g/c]" | snpbase=="[c/g]")
# X[infIGG,] = exprs(channel(RG, "G"))[aord[infIGG],] # mostly red
# Y[infIGG,] = exprs(channel(RG, "G"))[bord[infIGG],] # mostly green
|
f1e14c0e |
|
e609f78b |
# For now zero out Infinium I probes
|
99202387 |
# XY@assayData$X[infI,] = 0
# XY@assayData$Y[infI,] = 0
# XY@assayData$zero[infI,] = 0
|
a1394042 |
# gc(verbose=FALSE)
|
e609f78b |
XY
}
|
cd8b85b6 |
stripNormalize = function(XY, useTarget=TRUE, verbose=TRUE) {
|
453e688a |
if(useTarget){
objectsNeeded <- c("stripnum", "reference")
} else objectsNeeded <- "stripnum"
needToLoad <- !all(sapply(objectsNeeded, isLoaded))
if(needToLoad){
pkgname = getCrlmmAnnotationName(annotation(XY))
if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
suggCall = paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
msg = paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
message(strwrap(msg))
stop("Package ", pkgname, " could not be found.")
rm(suggCall, msg)
}
if(verbose) message("Loading strip and reference normalization information.")
loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
}
stripnum = getVarInEnv("stripnum")
if(useTarget)
targetdist = getVarInEnv("reference")
if(verbose){
message("Quantile normalizing ", ncol(XY), " arrays by ", max(stripnum), " strips.")
if (getRversion() > '2.7.0') pb = txtProgressBar(min=0, max=max(stripnum), style=3)
}
|
5fcfed7d |
for(s in 1:max(stripnum)) {
if(verbose) {
if (getRversion() > '2.7.0') setTxtProgressBar(pb, s)
else cat(".")
}
sel = stripnum==s
|
453e688a |
##RS: faster to access data directly
##subX = as.matrix(exprs(channel(XY, "X"))[sel,])
##subY = as.matrix(exprs(channel(XY, "Y"))[sel,])
subX <- as.matrix(assayData(XY)[["X"]][sel, ])
subY <- as.matrix(assayData(XY)[["Y"]][sel, ])
|
5fcfed7d |
if(useTarget)
|
453e688a |
tmp = normalize.quantiles.use.target(cbind(subX, subY), targetdist[[s]])
|
5fcfed7d |
else
|
453e688a |
tmp = normalize.quantiles(cbind(subX, subY))
|
a11efc9c |
XY@assayData$X[sel,] = matrix(as.integer(tmp[,1:(ncol(tmp)/2)]+16), nrow(tmp), ncol(tmp)/2)
XY@assayData$Y[sel,] = matrix(as.integer(tmp[,(ncol(tmp)/2+1):ncol(tmp)]+16), nrow(tmp), ncol(tmp)/2)
|
5fcfed7d |
rm(subX, subY, tmp, sel)
|
a1394042 |
gc(verbose=FALSE)
|
5fcfed7d |
}
if(verbose)
cat("\n")
XY
|
cd8b85b6 |
}
|
450a6d8a |
preprocessInfinium2 = function(XY, mixtureSampleSize=10^5,
|
e609f78b |
fitMixture=TRUE,
eps=0.1,
verbose=TRUE,
seed=1,
cdfName,
sns,
stripNorm=TRUE,
|
99202387 |
useTarget=TRUE) { #,
# outdir=".") {
|
e972ec4a |
# save.it=FALSE,
# snpFile,
# cnFile) {
|
453e688a |
if(stripNorm)
XY = stripNormalize(XY, useTarget=useTarget, verbose=verbose)
## MR: the code below is mostly straight from snprma.R
if (missing(sns)) sns = sampleNames(XY) #$X
if(missing(cdfName))
cdfName = annotation(XY)
needToLoad <- !all(sapply(c("autosomeIndex",
"SMEDIAN",
"theKnots",
"npProbesFid"), isLoaded))
if(needToLoad){
pkgname = getCrlmmAnnotationName(cdfName)
if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
suggCall = paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
msg = paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
message(strwrap(msg))
stop("Package ", pkgname, " could not be found.")
rm(suggCall, msg)
}
if(verbose) message("Loading snp annotation and mixture model parameters.")
loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
loader("snpProbesFid.rda", .crlmmPkgEnv, pkgname)
loader("npProbesFid.rda", .crlmmPkgEnv, pkgname)
}
autosomeIndex = getVarInEnv("autosomeIndex")
SMEDIAN = getVarInEnv("SMEDIAN")
theKnots = getVarInEnv("theKnots")
npIndex = getVarInEnv("npProbesFid")
snpIndex = getVarInEnv("snpProbesFid")
narrays = ncol(XY)
nprobes = length(npIndex)
if(length(nprobes)>0) {
## RS: channel creates an expression set. This is much slower than directly accessing the data
A <- matrix(as.integer(assayData(XY)[["X"]][npIndex, ]), nprobes, narrays)
##system.time(A <- matrix(as.integer(exprs(channel(XY, "X"))[npIndex,]), nprobes, narrays))
B <- matrix(as.integer(assayData(XY)[["Y"]][npIndex, ]), nprobes, narrays)
##B = matrix(as.integer(exprs(channel(XY, "Y"))[npIndex,]), nprobes, narrays)
## new lines below - useful to keep track of zeroed out probes
zero <- matrix(as.integer(assayData(XY)[["zero"]][npIndex, ]), nprobes, narrays)
##zero = matrix(as.integer(exprs(channel(XY, "zero"))[npIndex,]), nprobes, narrays)
colnames(A) = colnames(B) = colnames(zero) = sns
rownames(A) = rownames(B) = rownames(zero) = names(npIndex)
cnAB = list(A=A, B=B, zero=zero, sns=sns, gns=names(npIndex), cdfName=cdfName)
## t0 <- proc.time()
## save(cnAB, file=cnFile)
## t0 <- proc.time()-t0
## if(verbose) message("Used ", round(t0[3],1), " seconds to save ", cnFile, ".")
rm(A, B, zero)
|
a1394042 |
# print(gc())
gc(verbose=FALSE)
|
453e688a |
}
|
e609f78b |
# next process snp probes
|
450a6d8a |
nprobes = length(snpIndex)
|
e609f78b |
##We will read each cel file, summarize, and run EM one by one
##We will save parameters of EM to use later
|
99202387 |
mixtureParams = matrix(NA, 4, narrays)
SNR = rep(NA, narrays)
SKW = rep(NA, narrays)
|
e609f78b |
## This is the sample for the fitting of splines
## BC: I like better the idea of the user passing the seed,
## because this might intefere with other analyses
## (like what happened to GCRMA)
set.seed(seed)
|
450a6d8a |
idx = sort(sample(autosomeIndex, mixtureSampleSize))
idx2 = sample(nprobes, 10^5)
|
e609f78b |
##S will hold (A+B)/2 and M will hold A-B
##NOTE: We actually dont need to save S. Only for pics etc...
##f is the correction. we save to avoid recomputing
|
a11efc9c |
|
99202387 |
A = matrix(NA, nprobes, narrays)
B = matrix(NA, nprobes, narrays)
zero = matrix(NA, nprobes, narrays)
|
f1e14c0e |
|
e609f78b |
if(verbose){
message("Calibrating ", narrays, " arrays.")
|
450a6d8a |
if (getRversion() > '2.7.0') pb = txtProgressBar(min=0, max=narrays, style=3)
|
e609f78b |
}
for(i in 1:narrays){
|
453e688a |
## RS: faster to access data directly without using channel method
##A[,i] = as.integer(exprs(channel(XY, "X"))[snpIndex,i])
A[, i] <- as.integer(assayData(XY)[["X"]][snpIndex, i])
B[, i] <- as.integer(assayData(XY)[["Y"]][snpIndex, i])
zero[, i] <- as.integer(assayData(XY)[["zero"]][snpIndex, i])
##B[,i] = as.integer(exprs(channel(XY, "Y"))[snpIndex,i])
##zero[,i] = as.integer(exprs(channel(XY, "zero"))[snpIndex,i])
SKW[i] = mean((A[idx2,i]-mean(A[idx2,i]))^3)/(sd(A[idx2,i])^3)
if(fitMixture){
S = (log2(A[idx,i])+log2(B[idx,i]))/2 - SMEDIAN
M = log2(A[idx,i])-log2(B[idx,i])
##we need to test the choice of eps.. it is not the max diff between funcs
tmp = fitAffySnpMixture56(S, M, theKnots, eps=eps)
mixtureParams[, i] = tmp[["coef"]]
SNR[i] = tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2)
}
if(verbose) {
if (getRversion() > '2.7.0') setTxtProgressBar(pb, i)
else cat(".")
}
## run garbage collection every now and then
|
a1394042 |
if(i %% 100 == 0) gc(verbose=FALSE);
|
e609f78b |
}
if (verbose) {
|
453e688a |
if (getRversion() > '2.7.0') close(pb)
else cat("\n")
|
e609f78b |
}
|
450a6d8a |
if (!fitMixture) SNR = mixtureParams = NA
|
e609f78b |
## gns comes from preprocStuff.rda
|
8d6ca965 |
|
e972ec4a |
# if(save.it & !missing(snpFile)) {
# t0 <- proc.time()
# save(res, file=snpFile)
# t0 <- proc.time()-t0
# if(verbose) message("Used ", round(t0[3],1), " seconds to save ", snpFile, ".")
# }
|
f1e14c0e |
|
33753706 |
res = list(A=A, B=B,
zero=zero, sns=sns, gns=names(snpIndex), SNR=SNR, SKW=SKW,
|
e972ec4a |
mixtureParams=mixtureParams, cdfName=cdfName, cnAB=cnAB)
|
3f404581 |
|
e972ec4a |
# open(res[["A"]])
# open(res[["B"]])
# open(res[["zero"]])
# open(res[["SNR"]])
# open(res[["mixtureParams"]])
# if(save.it & !missing(snpFile)) {
# t0 <- proc.time()
# save(res, file=snpFile)
# t0 <- proc.time()-t0
# if(verbose) message("Used ", round(t0[3],1), " seconds to save ", snpFile, ".")
# }
|
8d6ca965 |
|
e972ec4a |
# close(res[["A"]])
# close(res[["B"]])
# close(res[["zero"]])
# close(res[["SNR"]])
# close(res[["mixtureParams"]])
|
a1a4312d |
|
e609f78b |
return(res)
}
|
3be95c2b |
crlmmIllumina <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
row.names=TRUE, col.names=TRUE,
probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
seed=1,
mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
cdfName, sns, recallMin=10, recallRegMin=1000,
returnParams=FALSE, badSNP=.7) {
if(missing(cdfName)) {
if(!missing(RG))
cdfName = annotation(RG)
if(!missing(XY))
cdfName = annotation(XY)
}
if(!isValidCdfName(cdfName))
stop("cdfName not valid. see validCdfNames")
if(!missing(RG)) {
if(missing(XY))
XY = RGtoXY(RG, chipType=cdfName)
else
stop("Both RG and XY specified - please use one or the other")
}
if (missing(sns)) sns = sampleNames(XY)
res <- preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize,
fitMixture=TRUE, verbose=verbose,
seed=seed, eps=eps, cdfName=cdfName, sns=sns,
stripNorm=stripNorm, useTarget=useTarget) #,
if(row.names) row.names=res$gns else row.names=NULL
if(col.names) col.names=res$sns else col.names=NULL
res2 <- crlmmGT(A=res[["A"]],
B=res[["B"]],
SNR=res[["SNR"]],
mixtureParams=res[["mixtureParams"]],
cdfName=cdfName,
row.names=row.names,
col.names=col.names,
probs=probs,
DF=DF,
SNRMin=SNRMin,
recallMin=recallMin,
recallRegMin=recallRegMin,
gender=gender,
verbose=verbose,
returnParams=returnParams,
badSNP=badSNP)
res2[["SNR"]] = res[["SNR"]]
res2[["SKW"]] = res[["SKW"]]
rm(res); gc(verbose=FALSE)
return(list2SnpSet(res2, returnParams=returnParams))
|
e609f78b |
}
|
f1e14c0e |
## MR: Below is a more memory efficient version of crlmmIllumina() which
## reads in the .idats and genotypes in the one function and removes objects
|
4e4601af |
## after they have been used
|
5fcfed7d |
crlmmIlluminaV2 = function(sampleSheet=NULL,
|
e609f78b |
arrayNames=NULL,
|
5fcfed7d |
ids=NULL,
path=".",
arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
highDensity=FALSE,
sep="_",
fileExt=list(green="Grn.idat", red="Red.idat"),
saveDate=FALSE,
stripNorm=TRUE,
useTarget=TRUE,
|
99202387 |
# outdir=".",
|
8e2d9355 |
row.names=TRUE,
|
5fcfed7d |
col.names=TRUE,
probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
|
e972ec4a |
seed=1, # save.it=FALSE, snpFile, cnFile,
|
e609f78b |
mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
cdfName, sns, recallMin=10, recallRegMin=1000,
returnParams=FALSE, badSNP=.7) {
|
8e2d9355 |
|
450a6d8a |
if(missing(cdfName)) stop("must specify cdfName")
if(!isValidCdfName(cdfName)) stop("cdfName not valid. see validCdfNames")
|
3f404581 |
|
99202387 |
# is.lds = ifelse(isPackageLoaded("ff"), TRUE, FALSE)
is.lds=FALSE
|
450a6d8a |
RG = readIdatFiles(sampleSheet=sampleSheet, arrayNames=arrayNames,
|
4e4601af |
ids=ids, path=path, arrayInfoColNames=arrayInfoColNames,
highDensity=highDensity, sep=sep, fileExt=fileExt, saveDate=saveDate)
|
9a4a5148 |
XY = RGtoXY(RG, chipType=cdfName)
|
99202387 |
# if(is.lds) {
# open(RG@assayData$R); open(RG@assayData$G); open(RG@assayData$zero)
# delete(RG@assayData$R); delete(RG@assayData$G); delete(RG@assayData$zero)
# }
|
a1394042 |
rm(RG); gc(verbose=FALSE)
|
3f404581 |
|
450a6d8a |
if (missing(sns)) { sns = sampleNames(XY)
|
3f404581 |
}
|
9a4a5148 |
res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
|
99202387 |
seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget) #, outdir=outdir) #,
|
e972ec4a |
# save.it=save.it, snpFile=snpFile, cnFile=cnFile)
|
99202387 |
# if(is.lds) {
# open(XY@assayData$X); open(XY@assayData$Y); open(XY@assayData$zero)
# delete(XY@assayData$X); delete(XY@assayData$Y); delete(XY@assayData$zero)
# }
|
a1394042 |
rm(XY); gc(verbose=FALSE)
|
e609f78b |
|
450a6d8a |
if(row.names) row.names=res$gns else row.names=NULL
if(col.names) col.names=res$sns else col.names=NULL
|
3be95c2b |
##
## if(is.lds){
## res2 <- crlmmGT2(A=res[["A"]],
## B=res[["B"]],
## SNR=res[["SNR"]],
## mixtureParams=res[["mixtureParams"]],
## cdfName=cdfName,
## row.names=row.names,
## col.names=col.names,
## probs=probs,
## DF=DF,
## SNRMin=SNRMin,
## recallMin=recallMin,
## recallRegMin=recallRegMin,
## gender=gender,
## verbose=verbose,
## returnParams=returnParams,
## badSNP=badSNP)
## } else {
res2 <- crlmmGT(A=res[["A"]],
B=res[["B"]],
SNR=res[["SNR"]],
mixtureParams=res[["mixtureParams"]],
cdfName=cdfName,
row.names=row.names,
col.names=col.names,
probs=probs,
DF=DF,
SNRMin=SNRMin,
recallMin=recallMin,
recallRegMin=recallRegMin,
gender=gender,
verbose=verbose,
returnParams=returnParams,
badSNP=badSNP)
## }
|
3f404581 |
|
3be95c2b |
## FUN = ifelse(is.lds, "crlmmGT2", "crlmmGT")
## ## genotyping
## crlmmGTfxn = function(FUN,...){
## switch(FUN,
## crlmmGT2=crlmmGT2(...),
## crlmmGT=crlmmGT(...))
## }
## res2 = crlmmGTfxn(FUN,
## A=res[["A"]],
## B=res[["B"]],
## SNR=res[["SNR"]],
## mixtureParams=res[["mixtureParams"]],
## cdfName=cdfName,
## row.names=row.names,
## col.names=col.names,
## probs=probs,
## DF=DF,
## SNRMin=SNRMin,
## recallMin=recallMin,
## recallRegMin=recallRegMin,
## gender=gender,
## verbose=verbose,
## returnParams=returnParams,
## badSNP=badSNP)
|
3f404581 |
|
99202387 |
# if(is.lds) {
# open(res[["SNR"]]); open(res[["SKW"]])
# }
|
450a6d8a |
res2[["SNR"]] = res[["SNR"]]
res2[["SKW"]] = res[["SKW"]]
# if(is.lds) {
# delete(res[["A"]]); delete(res[["B"]])
# delete(res[["SKW"]]); delete(res[["SNR"]]); delete(res[["mixtureParams"]])
# }
|
a1394042 |
rm(res); gc(verbose=FALSE)
|
450a6d8a |
return(list2SnpSet(res2, returnParams=returnParams))
|
4e4601af |
}
|
e972ec4a |
|
450a6d8a |
# Functions analogous to Rob's Affy functions to set up container
|
99202387 |
getProtocolData.Illumina = function(filenames, sep="_", fileExt="Grn.idat", verbose=FALSE) {
|
e972ec4a |
narrays = length(filenames)
headerInfo = list(nProbes = rep(NA, narrays),
Barcode = rep(NA, narrays),
ChipType = rep(NA, narrays),
|
450a6d8a |
Manifest = rep(NA, narrays),
Position = rep(NA, narrays))
|
e972ec4a |
scanDates = data.frame(ScanDate=rep(NA, narrays), DecodeDate=rep(NA, narrays))
|
063b3d14 |
rownames(scanDates) <- make.unique(gsub(paste(sep, fileExt, sep=""), "", filenames))
|
e972ec4a |
## read in the data
for(i in seq_along(filenames)) {
|
99202387 |
if(verbose)
cat("reading", filenames[i], "\n")
|
e972ec4a |
idsG = G = NULL
G = readIDAT(filenames[i])
idsG = rownames(G$Quants)
headerInfo$nProbes[i] = G$nSNPsRead
headerInfo$Barcode[i] = G$Barcode
headerInfo$ChipType[i] = G$ChipType
headerInfo$Manifest[i] = G$Unknown$MostlyNull
headerInfo$Position[i] = G$Unknowns$MostlyA
if(headerInfo$nProbes[i]>(headerInfo$nProbes[1]+10000) || headerInfo$nProbes[i]<(headerInfo$nProbes[1]-10000)) {
warning("Chips are not of the same type. Skipping ", basename(filenames[i]))
next()
}
scanDates$ScanDate[i] = G$RunInfo[1, 1]
scanDates$DecodeDate[i] = G$RunInfo[2, 1]
rm(G)
|
a1394042 |
gc(verbose=FALSE)
|
e972ec4a |
}
protocoldata = new("AnnotatedDataFrame",
data=scanDates,
varMetadata=data.frame(labelDescription=colnames(scanDates),
row.names=colnames(scanDates)))
|
063b3d14 |
return(protocoldata)
|
e972ec4a |
}
|
ea8dfe72 |
##getAvailableIlluminaGenomeBuild <- function(path){
## snp.file <- list.files(path, pattern="snpProbes_hg")
## if(length(snp.file) > 1){
## ## use hg19
## message("genome build not specified. Using build hg19 for annotation.")
## snp.file <- snp.file[1]
## }
## genome <- gsub(".rda", "", strsplit(snp.file, "snpProbes_")[[1]][[2]])
##}
|
3be95c2b |
getAvailableIlluminaGenomeBuild <- function(path){
snp.file <- list.files(path, pattern="snpProbes_hg")
if(length(snp.file) > 1){
## use hg19
message("genome build not specified. Using build hg19 for annotation.")
snp.file <- snp.file[1]
}
|
ea8dfe72 |
if(length(snp.file) == 1)
genome <- gsub(".rda", "", strsplit(snp.file, "snpProbes_")[[1]][[2]])
if(length(snp.file)==0) genome <- ""
genome
|
3be95c2b |
}
|
e972ec4a |
|
f0f8ba16 |
constructInf <- function(sampleSheet=NULL,
|
415a3b06 |
arrayNames=NULL,
path=".",
arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
highDensity=FALSE,
sep="_",
fileExt=list(green="Grn.idat", red="Red.idat"),
cdfName,
verbose=FALSE,
batch, #fns,
saveDate=TRUE) { #, outdir="."){
|
453e688a |
verbose <- FALSE
if(!is.null(arrayNames)) {
pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
}
if(!is.null(sampleSheet)) { # get array info from Illumina's sample sheet
if(is.null(arrayNames)) {
if(!is.null(arrayInfoColNames$barcode) && (arrayInfoColNames$barcode %in% colnames(sampleSheet))) {
barcode = sampleSheet[,arrayInfoColNames$barcode]
arrayNames=barcode
}
if(!is.null(arrayInfoColNames$position) && (arrayInfoColNames$position %in% colnames(sampleSheet))) {
position = sampleSheet[,arrayInfoColNames$position]
if(is.null(arrayNames))
arrayNames=position
else
arrayNames = paste(arrayNames, position, sep=sep)
if(highDensity) {
hdExt = list(A="R01C01", B="R01C02", C="R02C01", D="R02C02")
for(i in names(hdExt))
arrayNames = sub(paste(sep, i, sep=""), paste(sep, hdExt[[i]], sep=""), arrayNames)
}
}
}
pd = new("AnnotatedDataFrame", data = sampleSheet)
|
063b3d14 |
sampleNames(pd) = make.unique(basename(arrayNames))
|
453e688a |
}
if(is.null(arrayNames)) {
arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path))
if(!is.null(sampleSheet)) {
sampleSheet=NULL
cat("Could not find required info in \'sampleSheet\' - ignoring. Check \'sampleSheet\' and/or \'arrayInfoColNames\'\n")
}
pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
}
narrays = length(arrayNames)
|
3f404581 |
|
453e688a |
if(!missing(batch)) {
|
e972ec4a |
stopifnot(length(batch) == narrays)
|
453e688a |
}
if(missing(batch)) {
|
137e5619 |
stop("Must specify 'batch'") # batch = as.factor(rep(1, narrays))
|
453e688a |
}
|
e972ec4a |
|
453e688a |
grnfiles = paste(arrayNames, fileExt$green, sep=sep)
redfiles = paste(arrayNames, fileExt$red, sep=sep)
if(length(grnfiles)==0 || length(redfiles)==0)
stop("Cannot find .idat files")
if(length(grnfiles)!=length(redfiles))
stop("Cannot find matching .idat files")
if(path[1] != "." & path[1] != ""){
grnidats = file.path(path, grnfiles)
redidats = file.path(path, redfiles)
} else {
message("path arg not set. Assuming files are in local directory, or that complete path is provided")
grnidats = grnfiles
redidats = redfiles
}
if(!all(file.exists(grnidats))) stop("Missing some of the *Grn.idat files")
if(!all(file.exists(redidats))) stop("Missing some of the *Red.idat files")
|
e972ec4a |
|
d3320c21 |
if(verbose) message("Initializing container for genotyping and copy number estimation")
|
3be95c2b |
pkgname <- getCrlmmAnnotationName(cdfName)
path <- system.file("extdata", package=pkgname)
genome <- getAvailableIlluminaGenomeBuild(path)
featureData = getFeatureData(cdfName, copynumber=TRUE, genome=genome)
|
450a6d8a |
nr = nrow(featureData); nc = narrays
|
063b3d14 |
sns <- if (!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)) {
make.unique(sampleSheet$Sample_ID)
} else{
make.unique(basename(arrayNames))
}
biga <- initializeBigMatrix(name="A", nr, nc)
bigb <- initializeBigMatrix(name="B", nr, nc)
bigc <- initializeBigMatrix(name="call", nr, nc)
bigd <- initializeBigMatrix(name="callPr", nr,nc)
colnames(biga) <- colnames(bigb) <- colnames(bigc) <- colnames(bigd) <- sns
|
453e688a |
cnSet <- new("CNSet",
|
063b3d14 |
alleleA=biga,
alleleB=bigb,
call=bigc,
callProbability=bigd,
|
e972ec4a |
annotation=cdfName,
|
7c0c9ac5 |
featureData=featureData,
|
3be95c2b |
batch=batch,
genome=genome)
|
063b3d14 |
## if (!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)) {
## sampleNames(cnSet) = make.unique(sampleSheet$Sample_ID)
## } else{
## sampleNames(cnSet) <- make.unique(basename(arrayNames))
## }
|
e972ec4a |
if(saveDate){
|
99202387 |
protocolData = getProtocolData.Illumina(grnidats, sep=sep, fileExt=fileExt$green, verbose=verbose)
|
e972ec4a |
} else{
|
450a6d8a |
protocolData = annotatedDataFrameFrom(A(cnSet), byrow=FALSE)
|
e972ec4a |
}
|
450a6d8a |
rownames(pData(protocolData)) = sampleNames(cnSet)
protocolData(cnSet) = protocolData
|
7c0c9ac5 |
##featureData(cnSet) = featureData
|
450a6d8a |
featureNames(cnSet) = featureNames(featureData)
|
cfa9fbbc |
cnSet$gender <- initializeBigVector("gender", ncol(cnSet), vmode="integer")
cnSet$SNR = initializeBigVector("crlmmSNR-", ncol(cnSet), "double")
cnSet$SKW = initializeBigVector("crlmmSKW-", ncol(cnSet), "double")
|
063b3d14 |
##sampleNames(cnSet) <- basename(sampleNames(cnSet))
|
e972ec4a |
return(cnSet)
}
|
f0f8ba16 |
construct.Illumina <- constructInf
|
e972ec4a |
|
f0f8ba16 |
preprocessInf <- function(cnSet,
sampleSheet=NULL,
|
cfa9fbbc |
arrayNames=NULL,
ids=NULL,
path=".",
arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
highDensity=TRUE,
sep="_",
fileExt=list(green="Grn.idat", red="Red.idat"),
saveDate=TRUE,
stripNorm=TRUE,
useTarget=TRUE,
mixtureSampleSize=10^5,
|
7c0c9ac5 |
fitMixture=TRUE,
|
cfa9fbbc |
eps=0.1,
verbose=TRUE,
|
7c0c9ac5 |
seed=1, cdfName){
|
cfa9fbbc |
stopifnot(all(c("gender", "SNR", "SKW") %in% varLabels(cnSet)))
open(A(cnSet))
open(B(cnSet))
sns <- sampleNames(cnSet)
pkgname = getCrlmmAnnotationName(annotation(cnSet))
is.snp = isSnp(cnSet)
snp.index = which(is.snp)
narrays = ncol(cnSet)
|
063b3d14 |
sampleBatches <- splitIndicesByLength(seq_len(ncol(cnSet)), ocSamples()/getDoParWorkers())
|
cfa9fbbc |
mixtureParams = initializeBigMatrix("crlmmMixt-", 4, narrays, "double")
ocLapply(seq_along(sampleBatches),
|
3d159620 |
crlmm:::processIDAT,
|
cfa9fbbc |
sampleBatches=sampleBatches,
sampleSheet=sampleSheet,
arrayNames=arrayNames,
ids=ids,
path=path,
arrayInfoColNames=arrayInfoColNames,
highDensity=highDensity,
sep=sep,
fileExt=fileExt,
saveDate=saveDate,
verbose=verbose,
mixtureSampleSize=mixtureSampleSize,
fitMixture=fitMixture,
eps=eps,
seed=seed,
cdfName=cdfName,
sns=sns,
stripNorm=stripNorm,
useTarget=useTarget,
A=A(cnSet),
B=B(cnSet),
|
5c9f60e9 |
GT=calls(cnSet),
GTP=snpCallProbability(cnSet),
|
cfa9fbbc |
SKW=cnSet$SKW,
SNR=cnSet$SNR,
mixtureParams=mixtureParams,
is.snp=is.snp,
neededPkgs=c("crlmm", pkgname)) # outdir=outdir,
return(mixtureParams)
}
|
f0f8ba16 |
preprocess <- preprocessInf
|
cfa9fbbc |
|
f0f8ba16 |
genotypeInf <- function(cnSet, mixtureParams, probs=rep(1/3,3),
SNRMin=5,
recallMin=10,
recallRegMin=1000,
verbose=TRUE,
returnParams=TRUE,
badSNP=0.7,
gender=NULL,
|
7c0c9ac5 |
DF=6,
cdfName){
|
284565dd |
is.snp = isSnp(cnSet)
snp.index = which(is.snp)
|
5c9f60e9 |
## narrays = ncol(cnSet)
## open(A(cnSet))
## open(B(cnSet))
## open(snpCall(cnSet))
## open(snpCallProbability(cnSet))
## ## crlmmGT2 overwrites the normalized intensities with calls and confidenceScores
## message("Writing to call and callProbability slots")
## for(j in 1:ncol(cnSet)){
## snpCall(cnSet)[snp.index, j] <- as.integer(A(cnSet)[snp.index, j])
## snpCallProbability(cnSet)[snp.index, j] <- as.integer(B(cnSet)[snp.index, j])
## }
## message("Writing complete. Begin genotyping...")
## close(A(cnSet))
## close(B(cnSet))
|
3be95c2b |
tmp <- crlmmGT2(A=A(cnSet),
B=B(cnSet),
SNR=cnSet$SNR,
mixtureParams=mixtureParams,
cdfName=cdfName,
col.names=sampleNames(cnSet),
probs=probs,
DF=DF,
SNRMin=SNRMin,
recallMin=recallMin,
recallRegMin=recallRegMin,
gender=gender,
verbose=verbose,
returnParams=returnParams,
badSNP=badSNP,
callsGt=calls(cnSet),
callsPr=snpCallProbability(cnSet))#,
|
1e0ed16d |
##RS: I changed the API for crlmmGT2 to be consistent with crlmmGT
## snp.names=featureNames(cnSet)[snp.index])
|
cfa9fbbc |
if(verbose) message("Genotyping finished. Updating container with genotype calls and confidence scores.")
|
252da926 |
open(cnSet$gender)
|
cfa9fbbc |
cnSet$gender[,] = tmp$gender
|
252da926 |
close(cnSet$gender)
|
cfa9fbbc |
TRUE
}
|
e972ec4a |
|
453e688a |
genotype.Illumina <- function(sampleSheet=NULL,
arrayNames=NULL,
ids=NULL,
path=".",
arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
highDensity=FALSE,
sep="_",
fileExt=list(green="Grn.idat", red="Red.idat"),
cdfName,
copynumber=TRUE,
batch,
saveDate=TRUE,
stripNorm=TRUE,
useTarget=TRUE,
mixtureSampleSize=10^5,
fitMixture=TRUE,
eps=0.1,
verbose=TRUE,
seed=1,
sns,
probs=rep(1/3, 3),
DF=6,
SNRMin=5,
recallMin=10,
recallRegMin=1000,
gender=NULL,
returnParams=TRUE,
|
ac7e0c8f |
badSNP=0.7) {
|
450a6d8a |
is.lds = ifelse(isPackageLoaded("ff"), TRUE, FALSE)
|
d878c182 |
stopifnot(is.lds)
|
d3320c21 |
if(missing(cdfName)) stop("must specify cdfName")
if(!isValidCdfName(cdfName)) stop("cdfName not valid. see validCdfNames")
|
f0f8ba16 |
stopifnot(!missing(batch))
|
7c0c9ac5 |
if(is(batch, "factor")) batch <- as.character(batch)
|
f0f8ba16 |
pkgname <- getCrlmmAnnotationName(cdfName)
message("Instantiate CNSet container.")
cnSet <- constructInf(sampleSheet=sampleSheet,
arrayNames=arrayNames,
path=path,
arrayInfoColNames=arrayInfoColNames,
highDensity=highDensity,
sep=sep,
fileExt=fileExt,
cdfName=cdfName,
verbose=verbose,
batch=batch,
saveDate=saveDate)
mixtureParams <- preprocessInf(cnSet=cnSet,
sampleSheet=sampleSheet,
arrayNames=arrayNames,
ids=ids,
path=path,
arrayInfoColNames=arrayInfoColNames,
highDensity=highDensity,
sep=sep,
fileExt=fileExt,
saveDate=saveDate,
stripNorm=stripNorm,
useTarget=useTarget,
mixtureSampleSize=mixtureSampleSize,
fitMixture=fitMixture,
eps=eps,
verbose=verbose,
|
7c0c9ac5 |
seed=seed,
cdfName=cdfName)
|
f0f8ba16 |
message("Preprocessing complete. Begin genotyping...")
genotypeInf(cnSet=cnSet, mixtureParams=mixtureParams,
|
3be95c2b |
probs=probs,
|
f0f8ba16 |
SNRMin=SNRMin,
recallMin=recallMin,
recallRegMin=recallRegMin,
verbose=verbose,
returnParams=returnParams,
badSNP=badSNP,
gender=gender,
|
7c0c9ac5 |
DF=DF,
cdfName=cdfName)
|
f0f8ba16 |
return(cnSet)
|
d3320c21 |
}
|
453e688a |
|
edc69ca1 |
processIDAT <- function(stratum, sampleBatches, sampleSheet=NULL,
|
453e688a |
arrayNames=NULL,
|
802ca707 |
ids=NULL,
path=".",
arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
highDensity=FALSE,
sep="_",
fileExt=list(green="Grn.idat", red="Red.idat"),
saveDate=FALSE,
verbose=TRUE,
|
453e688a |
mixtureSampleSize=10^5,
|
802ca707 |
fitMixture=TRUE,
eps=0.1,
seed=1,
cdfName,
sns,
stripNorm=TRUE,
useTarget=TRUE,
|
5c9f60e9 |
A, B,
GT,
GTP,
SKW, SNR, mixtureParams, is.snp) { #, outdir=".") {
|
edc69ca1 |
message("Processing sample stratum ", stratum, " of ", length(sampleBatches))
sel <- sampleBatches[[stratum]]
|
d3320c21 |
if(length(path)>= length(sel)) path = path[sel]
RG = readIdatFiles(sampleSheet=sampleSheet[sel,], arrayNames=arrayNames[sel],
ids=ids, path=path, arrayInfoColNames=arrayInfoColNames,
|
99202387 |
highDensity=highDensity, sep=sep, fileExt=fileExt, saveDate=saveDate, verbose=verbose)
|
d3320c21 |
XY = RGtoXY(RG, chipType=cdfName)
|
99202387 |
rm(RG)
|
a1394042 |
gc(verbose=FALSE)
|
9774abc3 |
if (missing(sns) || length(sns)!=ncol(XY)) sns = sampleNames(XY)
|
d3320c21 |
res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
|
99202387 |
seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget) #, outdir=outdir)
rm(XY)
|
a1394042 |
gc(verbose=FALSE)
|
5c9f60e9 |
if(verbose) message("Finished preprocessing.")
|
d3320c21 |
snp.index = which(is.snp)
|
e6e6981f |
np.index = which(!is.snp)
|
453e688a |
open(A); open(B);
|
5c9f60e9 |
open(GT); open(GTP)
|
802ca707 |
Amatrix <- res[["A"]]
Bmatrix <- res[["B"]]
|
5c9f60e9 |
## Amatrix and Bmatrix are ordinary matrices--not ff objects.
## By writing columns of a ordinary matrix to GT and GTP, we
## save one read step later on. GT and GTP will be updated to
## calls and call probabilities by the crlmmGT2 function. The A
## and B slots will retain the normalized intensities for the
## A and B alleles
|
802ca707 |
for(j in seq_along(sel)){
jj <- sel[j]
A[snp.index, jj] <- Amatrix[, j]
|
5c9f60e9 |
GT[snp.index, jj] <- Amatrix[, j]
|
802ca707 |
B[snp.index, jj] <- Bmatrix[, j]
|
5c9f60e9 |
GTP[snp.index, jj] <- Bmatrix[, j]
|
802ca707 |
}
|
5c9f60e9 |
if(length(np.index)>0) {
cnAmatrix <- res[["cnAB"]]$A
cnBmatrix <- res[["cnAB"]]$B
for(j in seq_along(sel)){
jj <- sel[j]
A[np.index, jj] <- cnAmatrix[, j]
GT[np.index, jj] <- cnAmatrix[, j]
B[np.index, jj] <- cnBmatrix[, j]
GTP[np.index, jj] <- cnBmatrix[, j]
}
|
450a6d8a |
}
|
453e688a |
open(SKW); open(SNR); open(mixtureParams)
|
802ca707 |
SKW[sel] = res[["SKW"]][]
SNR[sel] = res[["SNR"]][]
mixtureParams[, sel] = res[["mixtureParams"]][]
|
5c9f60e9 |
close(A); close(B)
close(GT); close(GTP)
close(SNR); close(SKW)
|
d3320c21 |
close(mixtureParams)
rm(res)
|
a1394042 |
gc(verbose=FALSE)
|
d3320c21 |
TRUE
|
2564dd8b |
}
|