cd8b85b6 |
# function below works OK provided all .idat files are in the current working directory
# - could add an option to allow files in Illumina directory structure to be handled
# or to use the optional 'Path' column in sampleSheet
# - there is a lot of header information that is currently discarded - could try and store this somewhere in the resulting NChannelSet
readIdatFiles = function(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".",
arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
|
cfdeb14b |
highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat"), saveDate=FALSE) {
|
cd8b85b6 |
if(!is.null(sampleSheet)) { # get array info from Illumina's sample sheet
arrayNames=NULL
if(!is.null(arrayInfoColNames$barcode) && (arrayInfoColNames$barcode %in% colnames(sampleSheet))) {
barcode = sampleSheet[,arrayInfoColNames$barcode]
arrayNames=barcode
}
if(!is.null(arrayInfoColNames$position) && (arrayInfoColNames$position %in% colnames(sampleSheet))) {
position = sampleSheet[,arrayInfoColNames$position]
if(is.null(arrayNames))
arrayNames=position
else
arrayNames = paste(arrayNames, position, sep=sep)
if(highDensity) {
hdExt = list(A="R01C01", B="R01C02", C="R02C01", D="R02C02")
for(i in names(hdExt))
arrayNames = sub(paste(sep, i, sep=""), paste(sep, hdExt[[i]], sep=""), arrayNames)
}
}
|
852cc5cf |
pd = new("AnnotatedDataFrame", data = sampleSheet)
|
cd8b85b6 |
}
if(is.null(arrayNames)) {
arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path))
if(!is.null(sampleSheet)) {
sampleSheet=NULL
cat("Could not find required info in \'sampleSheet\' - ignoring. Check \'sampleSheet\' and/or \'arrayInfoColNames\'\n")
}
|
852cc5cf |
pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
|
cd8b85b6 |
}
|
cfdeb14b |
narrays = length(arrayNames)
|
cd8b85b6 |
grnfiles = paste(arrayNames, fileExt$green, sep=sep)
redfiles = paste(arrayNames, fileExt$red, sep=sep)
if(length(grnfiles)==0 || length(redfiles)==0)
stop("Cannot find .idat files")
if(length(grnfiles)!=length(redfiles))
stop("Cannot find matching .idat files")
if(!all(c(redfiles,grnfiles) %in% dir(path=path)))
stop("Missing .idat files: red\n", paste(redfiles[!(redfiles %in% dir(path=path))], sep=" "), "\n green\n",
paste(grnfiles[!(grnfiles %in% dir(path=path))], sep=" "))
grnidats = file.path(path, grnfiles)
redidats = file.path(path, redfiles)
|
cfdeb14b |
headerInfo = list(nProbes = rep(NA, narrays),
Barcode = rep(NA, narrays),
ChipType = rep(NA, narrays),
Manifest = rep(NA, narrays), # not sure about this one - sometimes blank
Position = rep(NA, narrays)) # this may also vary a bit
dates = list(decode=rep(NA, narrays),
|
b938db26 |
scan=rep(NA, narrays))
|
cfdeb14b |
|
cd8b85b6 |
# read in the data
for(i in seq(along=arrayNames)) {
cat("reading", arrayNames[i], "\t")
idsG = idsR = G = R = NULL
|
852cc5cf |
|
cd8b85b6 |
cat(paste(sep, fileExt$green, sep=""), "\t")
G = readIDAT(grnidats[i])
idsG = rownames(G$Quants)
headerInfo$nProbes[i] = G$nSNPsRead
headerInfo$Barcode[i] = G$Barcode
headerInfo$ChipType[i] = G$ChipType
headerInfo$Manifest[i] = G$Unknown$MostlyNull
headerInfo$Position[i] = G$Unknowns$MostlyA
|
852cc5cf |
if(headerInfo$ChipType[i]!=headerInfo$ChipType[1] || headerInfo$Manifest[i]!=headerInfo$Manifest[1]) {
|
cd8b85b6 |
# || headerInfo$nProbes[i]!=headerInfo$nProbes[1] ## removed this condition as some arrays used the same manifest
# but differed by a few SNPs for some reason - most of the chip was the same though
|
852cc5cf |
# stop("Chips are not of all of the same type - please check your data")
warning("Chips are not of the same type. Skipping ", basename(grnidats[i]), " and ", basename(redidats[i]))
next()
}
|
cfdeb14b |
dates$decode[i] = G$RunInfo[1, 1]
dates$scan[i] = G$RunInfo[2, 1]
|
cd8b85b6 |
if(i==1) {
if(is.null(ids) && !is.null(G))
ids = idsG
else
stop("Could not find probe IDs")
nprobes = length(ids)
narrays = length(arrayNames)
|
852cc5cf |
tmpmat = matrix(NA, nprobes, narrays)
rownames(tmpmat) = ids
|
cd8b85b6 |
if(!is.null(sampleSheet))
|
852cc5cf |
colnames(tmpmat) = sampleSheet$Sample_ID
|
cd8b85b6 |
else
|
852cc5cf |
colnames(tmpmat) = arrayNames
RG = new("NChannelSet",
R=tmpmat, G=tmpmat, Rnb=tmpmat, Gnb=tmpmat,
Rse=tmpmat, Gse=tmpmat, annotation=headerInfo$Manifest[1],
phenoData=pd, storage.mode="environment")
rm(tmpmat)
gc()
|
cd8b85b6 |
}
if(length(ids)==length(idsG)) {
if(sum(ids==idsG)==nprobes) {
|
852cc5cf |
RG@assayData$G[,i] = G$Quants[, "Mean"]
RG@assayData$Gnb[,i] = G$Quants[, "NBeads"]
RG@assayData$Gse[,i] = G$Quants[, "SD"]
|
cd8b85b6 |
}
}
else {
indG = match(ids, idsG)
|
852cc5cf |
RG@assayData$G[,i] = G$Quants[indG, "Mean"]
RG@assayData$Gnb[,i] = G$Quants[indG, "NBeads"]
RG@assayData$Gse[,i] = G$Quants[indG, "SD"]
|
cd8b85b6 |
}
|
852cc5cf |
rm(G)
gc()
cat(paste(sep, fileExt$red, sep=""), "\n")
R = readIDAT(redidats[i])
idsR = rownames(R$Quants)
|
cd8b85b6 |
if(length(ids)==length(idsG)) {
if(sum(ids==idsR)==nprobes) {
|
852cc5cf |
RG@assayData$R[,i] = R$Quants[ ,"Mean"]
RG@assayData$Rnb[,i] = R$Quants[ ,"NBeads"]
RG@assayData$Rse[,i] = R$Quants[ ,"SD"]
|
cd8b85b6 |
}
}
else {
indR = match(ids, idsR)
|
852cc5cf |
RG@assayData$R[,i] = R$Quants[indR, "Mean"]
RG@assayData$Rnb[,i] = R$Quants[indR, "NBeads"]
RG@assayData$Rse[,i] = R$Quants[indR, "SD"]
|
cd8b85b6 |
}
|
852cc5cf |
rm(R)
gc()
|
cd8b85b6 |
}
|
852cc5cf |
if(saveDate)
RG@scanDates = dates$scan
storageMode(RG) = "lockedEnvironment"
RG
|
cd8b85b6 |
}
# the readIDAT() and readBPM() functions below were provided by Keith Baggerly, 27/8/2008
readIDAT <- function(idatFile){
fileSize <- file.info(idatFile)$size
tempCon <- file(idatFile,"rb")
prefixCheck <- readChar(tempCon,4)
if(prefixCheck != "IDAT"){
}
versionNumber <- readBin(tempCon, "integer", n=1, size=8,
endian="little", signed=FALSE)
nFields <- readBin(tempCon, "integer", n=1, size=4,
endian="little", signed=FALSE)
fields <- matrix(0,nFields,3);
colnames(fields) <- c("Field Code", "Byte Offset", "Bytes")
for(i1 in 1:nFields){
fields[i1,"Field Code"] <-
readBin(tempCon, "integer", n=1, size=2, endian="little", signed=FALSE)
fields[i1,"Byte Offset"] <-
readBin(tempCon, "integer", n=1, size=8, endian="little", signed=FALSE)
}
knownCodes <- c(1000, 102, 103, 104, 107, 200, 300, 400,
401, 402, 403, 404, 405, 406, 407, 408, 409)
names(knownCodes) <-
c("nSNPsRead", # 1000
"IlluminaID", # 102
"SD", # 103
"Mean", # 104
"NBeads", # 107
"MidBlock", # 200
"RunInfo", # 300
"RedGreen", # 400
"MostlyNull", # 401
"Barcode", # 402
"ChipType", # 403
"MostlyA", # 404
"Unknown.1", # 405
"Unknown.2", # 406
"Unknown.3", # 407
"Unknown.4", # 408
"Unknown.5" # 409
)
nNewFields <- 1
rownames(fields) <- paste("Null", 1:nFields)
for(i1 in 1:nFields){
temp <- match(fields[i1,"Field Code"], knownCodes)
if(!is.na(temp)){
rownames(fields)[i1] <- names(knownCodes)[temp]
}else{
rownames(fields)[i1] <- paste("newField", nNewFields, sep=".")
nNewFields <- nNewFields + 1
}
}
seek(tempCon, fields["nSNPsRead", "Byte Offset"])
nSNPsRead <- readBin(tempCon, "integer", n=1, size=4,
endian="little", signed=FALSE)
seek(tempCon, fields["IlluminaID", "Byte Offset"])
IlluminaID <- readBin(tempCon, "integer", n=nSNPsRead, size=4,
endian="little", signed=FALSE)
seek(tempCon, fields["SD", "Byte Offset"])
SD <- readBin(tempCon, "integer", n=nSNPsRead, size=2,
endian="little", signed=FALSE)
seek(tempCon, fields["Mean", "Byte Offset"])
Mean <- readBin(tempCon, "integer", n=nSNPsRead, size=2,
endian="little", signed=FALSE)
seek(tempCon, fields["NBeads", "Byte Offset"])
NBeads <- readBin(tempCon, "integer", n=nSNPsRead, size=1, signed=FALSE)
seek(tempCon, fields["MidBlock", "Byte Offset"])
nMidBlockEntries <- readBin(tempCon, "integer", n=1, size=4,
endian="little", signed=FALSE)
MidBlock <- readBin(tempCon, "integer", n=nMidBlockEntries, size=4,
endian="little", signed=FALSE)
seek(tempCon, fields["RunInfo", "Byte Offset"])
nRunInfoBlocks <- readBin(tempCon, "integer", n=1, size=4,
endian="little", signed=FALSE)
RunInfo <- matrix(NA, nRunInfoBlocks, 5)
colnames(RunInfo) <- c("RunTime", "BlockType", "BlockPars",
"BlockCode", "CodeVersion")
|
b938db26 |
for(i1 in 1:2) { #nRunInfoBlocks){ ## MR edit
|
cd8b85b6 |
for(i2 in 1:5){
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
RunInfo[i1,i2] <- readChar(tempCon, nChars)
}
}
seek(tempCon, fields["RedGreen", "Byte Offset"])
RedGreen <- readBin(tempCon, "numeric", n=1, size=4,
endian="little", signed=FALSE)
#RedGreen <- readBin(tempCon, "integer", n=4, size=1,
# endian="little", signed=FALSE)
seek(tempCon, fields["MostlyNull", "Byte Offset"])
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
MostlyNull <- readChar(tempCon, nChars)
seek(tempCon, fields["Barcode", "Byte Offset"])
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
Barcode <- readChar(tempCon, nChars)
seek(tempCon, fields["ChipType", "Byte Offset"])
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
ChipType <- readChar(tempCon, nChars)
seek(tempCon, fields["MostlyA", "Byte Offset"])
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
MostlyA <- readChar(tempCon, nChars)
seek(tempCon, fields["Unknown.1", "Byte Offset"])
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
Unknown.1 <- readChar(tempCon, nChars)
seek(tempCon, fields["Unknown.2", "Byte Offset"])
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
Unknown.2 <- readChar(tempCon, nChars)
seek(tempCon, fields["Unknown.3", "Byte Offset"])
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
Unknown.3 <- readChar(tempCon, nChars)
seek(tempCon, fields["Unknown.4", "Byte Offset"])
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
Unknown.4 <- readChar(tempCon, nChars)
seek(tempCon, fields["Unknown.5", "Byte Offset"])
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
Unknown.5 <- readChar(tempCon, nChars)
close(tempCon)
Unknowns <-
list(MostlyNull=MostlyNull,
MostlyA=MostlyA,
Unknown.1=Unknown.1,
Unknown.2=Unknown.2,
Unknown.3=Unknown.3,
Unknown.4=Unknown.4,
Unknown.5=Unknown.5)
Quants <- cbind(Mean, SD, NBeads)
colnames(Quants) <- c("Mean", "SD", "NBeads")
rownames(Quants) <- as.character(IlluminaID)
idatValues <-
list(fileSize=fileSize,
versionNumber=versionNumber,
nFields=nFields,
fields=fields,
nSNPsRead=nSNPsRead,
#IlluminaID=IlluminaID,
#SD=SD,
#Mean=Mean,
#NBeads=NBeads,
Quants=Quants,
MidBlock=MidBlock,
RunInfo=RunInfo,
RedGreen=RedGreen,
Barcode=Barcode,
ChipType=ChipType,
Unknowns=Unknowns)
idatValues
}
readBPM <- function(bpmFile){
## Reads and parses Illumina BPM files
fileSize <- file.info(bpmFile)$size
tempCon <- file(bpmFile,"rb")
# The first few bytes of the egtFile are some type of
# header, but there's no related byte offset information.
prefixCheck <- readChar(tempCon,3) ## should be "BPM"
null.1 <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
## should be 1
versionNumber <-
readBin(tempCon, "integer", n=1, size=4, endian="little", signed=FALSE)
## should be 4
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
chipType <- readChar(tempCon, nChars)
null.2 <- readBin(tempCon, "integer", n=2, size=1, signed=FALSE)
csvLines <- readLines(tempCon, 22)
entriesByteOffset <- seek(tempCon);
nEntries <- readBin(tempCon, "integer", n=1, size=4,
endian="little", signed=FALSE)
if(FALSE){
snpIndexByteOffset <- seek(tempCon)
snpIndex <- readBin(tempCon, "integer", n=nEntries, size=4,
endian="little", signed=FALSE)
## for the 1M array, these are simply in order from 1 to 1072820.
snpNamesByteOffset <- seek(tempCon)
snpNames <- rep("A", nEntries)
for(i1 in 1:nEntries){
nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
snpNames[i1] <- readChar(tempCon, nChars)
}
}
seek(tempCon, 15278138)
normIDByteOffset <- seek(tempCon)
normID <- readBin(tempCon, "integer", n=nEntries, size=1, signed=FALSE) + 1
newBlockByteOffset <- seek(tempCon)
newBlock <- readBin(tempCon, "integer", n=10000, size=1, signed=FALSE)
close(tempCon)
byteOffsets <- list(entriesByteOffset=entriesByteOffset,
#snpIndexByteOffset=snpIndexByteOffset,
#snpNamesByteOffset=snpNamesByteOffset,
normIDByteOffset=normIDByteOffset,
newBlockByteOffset=newBlockByteOffset)
allStuff <- list(prefixCheck=prefixCheck,
null.1=null.1,
versionNumber=versionNumber,
chipType=chipType,
null.2=null.2,
csvLines=csvLines,
nEntries=nEntries,
#snpIndex=snpIndex,
#snpNames=snpNames,
normID=normID,
newBlock=newBlock,
byteOffsets=byteOffsets)
allStuff
}
|
cfdeb14b |
RGtoXY = function(RG, chipType, verbose=TRUE) {
|
cd8b85b6 |
chipList = c("human1mv1c", # 1M
"human370v1c", # 370CNV
"human650v3a", # 650Y
"human610quadv1b", # 610 quad
"human660quadv1a", # 660 quad
"human370quadv3c", # 370CNV quad
"human550v3b", # 550K
"human1mduov3b") # 1M Duo
if(missing(chipType))
chipType = match.arg(annotation(RG), chipList)
else
chipType = match.arg(chipType, chipList)
pkgname <- getCrlmmAnnotationName(chipType)
if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
message(strwrap(msg))
stop("Package ", pkgname, " could not be found.")
rm(suggCall, msg)
}
if(verbose) message("Loading chip annotation information.")
|
cfdeb14b |
loader("address.rda", .crlmmPkgEnv, pkgname)
|
cd8b85b6 |
# data(annotation, package=pkgname, envir=.crlmmPkgEnv)
|
cfdeb14b |
aids <- getVarInEnv("addressA") # comes from AddressA_ID or Address column in manifest
bids <- getVarInEnv("addressB") # comes from AddressB_ID or Address2 column in manifest
ids <- names(aids)
snpbase <- getVarInEnv("base")
|
cd8b85b6 |
|
cfdeb14b |
nsnps = length(aids)
|
cd8b85b6 |
narrays = ncol(RG)
|
cfdeb14b |
# aidcol = match("AddressA_ID", colnames(annot))
# if(is.na(aidcol))
# aidcol = match("Address", colnames(annot))
# bidcol = match("AddressB_ID", colnames(annot))
# if(is.na(bidcol))
# bidcol = match("Address2", colnames(annot))
# aids = annot[, aidcol]
# bids = annot[, bidcol]
# snpids = annot[,"Name"]
# snpbase = annot[,"SNP"]
infI = !is.na(bids) & bids!=0
|
cd8b85b6 |
aord = match(aids, featureNames(RG)) # NAs are possible here
|
cfdeb14b |
bord = match(bids, featureNames(RG)) # and here
|
cd8b85b6 |
# argrg = aids[rrgg]
# brgrg = bids[rrgg]
|
cfdeb14b |
|
852cc5cf |
tmpmat = matrix(0, nsnps, narrays)
rownames(tmpmat) = ids
colnames(tmpmat) = sampleNames(RG)
XY = new("NChannelSet", X=tmpmat, Y=tmpmat, Xnb=tmpmat, Ynb=tmpmat, Xse=tmpmat, Yse=tmpmat, zero=tmpmat,
annotation=chipType, phenoData=RG@phenoData, scanDates=RG@scanDates, storage.mode="environment")
rm(tmpmat)
gc()
|
cfdeb14b |
# First sort out Infinium II SNPs, X -> R (allele A) and Y -> G (allele B) from the same probe
|
852cc5cf |
XY@assayData$X[!is.na(aord),] = exprs(channel(RG, "R"))[aord[!is.na(aord)],] # mostly red
XY@assayData$Y[!is.na(aord),] = exprs(channel(RG, "G"))[aord[!is.na(aord)],] # mostly green
XY@assayData$Xnb[!is.na(aord),] = exprs(channel(RG, "Rnb"))[aord[!is.na(aord)],]
XY@assayData$Ynb[!is.na(aord),] = exprs(channel(RG, "Gnb"))[aord[!is.na(aord)],]
XY@assayData$Xse[!is.na(aord),] = exprs(channel(RG, "Rse"))[aord[!is.na(aord)],]
XY@assayData$Yse[!is.na(aord),] = exprs(channel(RG, "Gse"))[aord[!is.na(aord)],]
gc()
|
cd8b85b6 |
|
cfdeb14b |
## Warning - not 100% sure that the code below is correct - could be more complicated than this
# Next Infinium I where X -> R from allele A probe and Y -> R from allele B probe
# infIRR = infI & (snpbase=="[A/T]" | snpbase=="[T/A]" | snpbase=="[a/t]" | snpbase=="[t/a]")
# X[infIRR,] = exprs(channel(RG, "R"))[aord[infIRR],] # mostly red
# Y[infIRR,] = exprs(channel(RG, "R"))[bord[infIRR],] # mostly green
# Xnb[infIRR,] = exprs(channel(RG, "Rnb"))[aord[infIRR],]
# Ynb[infIRR,] = exprs(channel(RG, "Rnb"))[bord[infIRR],]
# Xse[infIRR,] = exprs(channel(RG, "Rse"))[aord[infIRR],]
# Yse[infIRR,] = exprs(channel(RG, "Rse"))[bord[infIRR],]
|
cd8b85b6 |
|
cfdeb14b |
# Finally Infinium I where X -> G from allele A probe and Y -> G from allele B probe
# infIGG = infI & (snpbase=="[C/G]" | snpbase=="[G/C]" | snpbase=="[g/c]" | snpbase=="[c/g]")
# X[infIGG,] = exprs(channel(RG, "G"))[aord[infIGG],] # mostly red
# Y[infIGG,] = exprs(channel(RG, "G"))[bord[infIGG],] # mostly green
# Xnb[infIGG,] = exprs(channel(RG, "Gnb"))[aord[infIGG],]
# Ynb[infIGG,] = exprs(channel(RG, "Gnb"))[bord[infIGG],]
# Xse[infIGG,] = exprs(channel(RG, "Gse"))[aord[infIGG],]
# Yse[infIGG,] = exprs(channel(RG, "Gse"))[bord[infIGG],]
# For now zero out Infinium I probes
|
852cc5cf |
XY@assayData$X[infI,] = 0
XY@assayData$Y[infI,] = 0
XY@assayData$Xnb[infI,] = 0
XY@assayData$Ynb[infI,] = 0
XY@assayData$Xse[infI,] = 0
XY@assayData$Yse[infI,] = 0
XY@assayData$zero[XY@assayData$X==0 | XY@assayData$Y==0] = 1
|
cd8b85b6 |
gc()
|
852cc5cf |
# storageMode(XY) = "lockedEnvironment"
XY
|
cd8b85b6 |
}
stripNormalize = function(XY, useTarget=TRUE, verbose=TRUE) {
pkgname <- getCrlmmAnnotationName(annotation(XY))
if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
message(strwrap(msg))
stop("Package ", pkgname, " could not be found.")
rm(suggCall, msg)
}
if(verbose) message("Loading strip and reference normalization information.")
loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
# data(preprocStuff, package=pkgname, envir=.crlmmPkgEnv)
stripnum <- getVarInEnv("stripnum")
if(useTarget)
targetdist = getVarInEnv("reference")
|
852cc5cf |
# Xqws = Yqws = matrix(0, nrow(XY), ncol(XY))
# colnames(Xqws) = colnames(Yqws) = sampleNames(XY) #$X
# rownames(Xqws) = rownames(Yqws) = featureNames(XY)
|
cd8b85b6 |
if(verbose){
message("Quantile normalizing ", ncol(XY), " arrays by ", max(stripnum), " strips.")
if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=max(stripnum), style=3)
}
for(s in 1:max(stripnum)) {
if(verbose) {
if (getRversion() > '2.7.0') setTxtProgressBar(pb, s)
else cat(".")
}
sel = stripnum==s
subX = exprs(channel(XY, "X"))[sel,]
subY = exprs(channel(XY, "Y"))[sel,]
if(useTarget)
tmp = normalize.quantiles.use.target(as.matrix(cbind(subX, subY)), targetdist[[s]])
else
tmp = normalize.quantiles(as.matrix(cbind(subX, subY)))
|
852cc5cf |
XY@assayData$X[sel,] = tmp[,1:(ncol(tmp)/2)]
XY@assayData$Y[sel,] = tmp[,(ncol(tmp)/2+1):ncol(tmp)]
# Xqws[sel,] = tmp[,1:(ncol(tmp)/2)]
# Yqws[sel,] = tmp[,(ncol(tmp)/2+1):ncol(tmp)]
|
cd8b85b6 |
rm(subX, subY, tmp, sel)
gc()
}
if(verbose)
cat("\n")
|
852cc5cf |
XY
# XYNorm = new("NChannelSet",
# X=Xqws+16,
# Y=Yqws+16,
# Xnb=exprs(channel(XY, "Xnb")),
# Ynb=exprs(channel(XY, "Ynb")),
# Xse=exprs(channel(XY, "Xse")),
# Yse=exprs(channel(XY, "Yse")),
# zero=exprs(channel(XY, "zero")),
# annotation=annotation(XY),
# phenoData=XY@phenoData)
|
cd8b85b6 |
}
preprocessInfinium2 = function(XY, mixtureSampleSize=10^5, fitMixture=TRUE, eps=0.1, verbose=TRUE, seed=1,
|
cfdeb14b |
cdfName, sns, stripNorm=TRUE, useTarget=TRUE, save.it=FALSE, intensityFile) {
|
cd8b85b6 |
if(stripNorm)
XY = stripNormalize(XY, useTarget=useTarget, verbose=verbose)
|
cfdeb14b |
|
cd8b85b6 |
## MR: the code below is mostly straight from snprma.R
|
9564b5c5 |
if (missing(sns)) sns <- sampleNames(XY) #$X
|
cd8b85b6 |
if(missing(cdfName))
cdfName <- annotation(XY)
## stuffDir <- changeToCrlmmAnnotationName(cdfName)
pkgname <- getCrlmmAnnotationName(cdfName)
if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
message(strwrap(msg))
stop("Package ", pkgname, " could not be found.")
rm(suggCall, msg)
}
if(verbose) message("Loading snp annotation and mixture model parameters.")
loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
|
cfdeb14b |
loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
loader("snpProbesFid.rda", .crlmmPkgEnv, pkgname)
loader("npProbesFid.rda", .crlmmPkgEnv, pkgname)
|
cd8b85b6 |
# data(genotypeStuff, mixtureStuff, package=pkgname, envir=.crlmmPkgEnv)
autosomeIndex <- getVarInEnv("autosomeIndex")
# pnsa <- getVarInEnv("pnsa")
# pnsb <- getVarInEnv("pnsb")
# fid <- getVarInEnv("fid")
# reference <- getVarInEnv("reference")
# aIndex <- getVarInEnv("aIndex")
# bIndex <- getVarInEnv("bIndex")
SMEDIAN <- getVarInEnv("SMEDIAN")
theKnots <- getVarInEnv("theKnots")
gns <- getVarInEnv("gns")
|
cfdeb14b |
# separate out copy number probes
npIndex = getVarInEnv("npProbesFid")
nprobes = length(npIndex)
narrays = ncol(XY)
A <- matrix(as.integer(exprs(channel(XY, "X"))[npIndex,]), nprobes, narrays)
B <- matrix(as.integer(exprs(channel(XY, "Y"))[npIndex,]), nprobes, narrays)
cnAB = list(A=A, B=B, sns=sns, gns=names(npIndex), cdfName=cdfName)
# next process snp probes
snpIndex = getVarInEnv("snpProbesFid")
nprobes <- length(snpIndex)
|
cd8b85b6 |
##We will read each cel file, summarize, and run EM one by one
##We will save parameters of EM to use later
mixtureParams <- matrix(0, 4, narrays)
SNR <- vector("numeric", narrays)
SKW <- vector("numeric", narrays)
## This is the sample for the fitting of splines
## BC: I like better the idea of the user passing the seed,
## because this might intefere with other analyses
## (like what happened to GCRMA)
set.seed(seed)
idx <- sort(sample(autosomeIndex, mixtureSampleSize))
##S will hold (A+B)/2 and M will hold A-B
##NOTE: We actually dont need to save S. Only for pics etc...
##f is the correction. we save to avoid recomputing
|
cfdeb14b |
A <- matrix(as.integer(exprs(channel(XY, "X"))[snpIndex,]), nprobes, narrays) # matrix(as.integer(0), length(pnsa), length(filenames))
B <- matrix(as.integer(exprs(channel(XY, "Y"))[snpIndex,]), nprobes, narrays) # matrix(as.integer(0), length(pnsb), length(filenames))
|
cd8b85b6 |
# new lines below - useful to keep track of zeroed out SNPs
|
cfdeb14b |
zero <- matrix(as.integer(exprs(channel(XY, "zero"))[snpIndex,]), nprobes, narrays)
|
cd8b85b6 |
|
cfdeb14b |
colnames(A) <- colnames(B) <- colnames(zero) <- sns
rownames(A) <- rownames(B) <- rownames(zero) <- names(snpIndex) # gns # featureNames(XY)
|
cd8b85b6 |
if(verbose){
message("Calibrating ", narrays, " arrays.")
if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=narrays, style=3)
}
idx2 <- sample(nprobes, 10^5)
for(i in 1:narrays){
SKW[i] = mean((A[idx2,i]-mean(A[idx2,i]))^3)/(sd(A[idx2,i])^3)
if(fitMixture){
S <- (log2(A[idx, i])+log2(B[idx, i]))/2 - SMEDIAN
M <- log2(A[idx, i])-log2(B[idx, i])
##we need to test the choice of eps.. it is not the max diff between funcs
tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps)
mixtureParams[, i] <- tmp[["coef"]]
SNR[i] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2)
}
if(verbose) {
|
cfdeb14b |
if (getRversion() > '2.7.0') setTxtProgressBar(pb, i)
else cat(".")
|
cd8b85b6 |
}
}
if (verbose) {
if (getRversion() > '2.7.0') close(pb)
else cat("\n")
}
if (!fitMixture) SNR <- mixtureParams <- NA
## gns comes from preprocStuff.rda
|
cfdeb14b |
res = list(A=A, B=B, zero=zero, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName)
if(save.it & !missing(intensityFile)) {
t0 <- proc.time()
save(cnAB, res, file=intensityFile)
t0 <- proc.time()-t0
if(verbose) message("Used ", round(t0[3],1), " seconds to save ", intensityFile, ".")
}
return(res)
|
cd8b85b6 |
}
|
cfdeb14b |
## MR: Could add arguments to allow this function to read in idat data as well,
## although this would add a further 7 arguments, which might over-complicate things
|
cd8b85b6 |
crlmmIllumina <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
row.names=TRUE, col.names=TRUE,
probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
|
d6533a61 |
seed=1, save.it=FALSE, load.it=FALSE, intensityFile,
|
cd8b85b6 |
mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
cdfName, sns, recallMin=10, recallRegMin=1000,
returnParams=FALSE, badSNP=.7) {
|
cfdeb14b |
|
d6533a61 |
if ((load.it | save.it) & missing(intensityFile))
stop("'intensityFile' is missing, and you chose either load.it or save.it")
if (!missing(intensityFile))
if (load.it & !file.exists(intensityFile)){
load.it <- FALSE
message("File ", intensityFile, " does not exist.")
message("Not loading it, but running SNPRMA from scratch.")
}
if (!load.it){
|
cfdeb14b |
if(!missing(RG)) {
if(missing(XY))
XY = RGtoXY(RG, chipType=cdfName)
else
stop("Both RG and XY specified - please use one or the other")
|
d6533a61 |
}
|
9564b5c5 |
if (missing(sns)) sns <- sampleNames(XY) #$X
|
cfdeb14b |
res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget,
save.it=save.it, intensityFile=intensityFile)
|
d6533a61 |
}else{
if(verbose) message("Loading ", intensityFile, ".")
obj <- load(intensityFile)
if(verbose) message("Done.")
|
cfdeb14b |
if(!any(obj == "res"))
stop("Object in ", intensityFile, " seems to be invalid.")
|
d6533a61 |
}
|
cd8b85b6 |
if(row.names) row.names=res$gns else row.names=NULL
if(col.names) col.names=res$sns else col.names=NULL
res2 <- crlmmGT(res[["A"]], res[["B"]], res[["SNR"]],
res[["mixtureParams"]], res[["cdfName"]],
gender=gender, row.names=row.names,
col.names=col.names, recallMin=recallMin,
recallRegMin=1000, SNRMin=SNRMin,
returnParams=returnParams, badSNP=badSNP,
verbose=verbose)
res2[["SNR"]] <- res[["SNR"]]
res2[["SKW"]] <- res[["SKW"]]
|
cfdeb14b |
return(list2SnpSet(res2, returnParams=returnParams)) # return(res2)
|
cd8b85b6 |
}
|