... | ... |
@@ -501,5 +501,11 @@ then readIDAT() should work. Thanks to Pierre Cherel who reported this error. |
501 | 501 |
** modified the storage of RG, XY and genotype call data to use the ff package and initializeBigMatrix() function from oligoClasses package. The genotype call data now includes non-polymorphic probes (which have NA calls). Functions which use ff storage are named readIdatFiles2(), RGtoXY2(), preprocessInfinium2v2(), crlmmIllumina2() and crlmmIlluminaV2(). |
502 | 502 |
|
503 | 503 |
2010-04-08 B Carvalho committed version 1.5.46 |
504 |
- |
|
505 | 504 |
** moved NEWS to inst/ |
505 |
+ |
|
506 |
+2010-04-09 M. Ritchie committed version 1.5.47 |
|
507 |
+** fixed bugs in code and removed copy number probes from output |
|
508 |
+(now saved separately as before - SnpSuperSet not used) |
|
509 |
+** crlmmIllumina2() and crlmmIlluminaV2() now use the crlmmGT2() |
|
510 |
+function (which expects ff objects and supports parallel processing) |
|
511 |
+** updated crlmmIllumina.pdf vignette |
... | ... |
@@ -1,8 +1,8 @@ |
1 | 1 |
Package: crlmm |
2 | 2 |
Type: Package |
3 | 3 |
Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays. |
4 |
-Version: 1.5.46 |
|
5 |
-Date: 2010-02-07 |
|
4 |
+Version: 1.5.47 |
|
5 |
+Date: 2010-04-09 |
|
6 | 6 |
Author: Rafael A Irizarry, Benilton S Carvalho <carvalho@bclab.org>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au> |
7 | 7 |
Maintainer: Benilton S Carvalho <carvalho@bclab.org>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU> |
8 | 8 |
Description: Faster implementation of CRLMM specific to SNP 5.0 and 6.0 arrays, as well as a copy number tool specific to 5.0, 6.0, and Illumina platforms |
... | ... |
@@ -663,7 +663,7 @@ RGtoXY = function(RG, chipType, verbose=TRUE) { |
663 | 663 |
# argrg = aids[rrgg] |
664 | 664 |
# brgrg = bids[rrgg] |
665 | 665 |
|
666 |
- tmpmat = matrix(0, nsnps, narrays) |
|
666 |
+ tmpmat = matrix(as.integer(0), nsnps, narrays) |
|
667 | 667 |
rownames(tmpmat) = ids |
668 | 668 |
colnames(tmpmat) = sampleNames(RG) |
669 | 669 |
XY = new("NChannelSet", X=tmpmat, Y=tmpmat, zero=tmpmat, # Xnb=tmpmat, Ynb=tmpmat, Xse=tmpmat, Yse=tmpmat, zero=tmpmat, |
... | ... |
@@ -778,6 +778,10 @@ RGtoXY2 = function(RG, chipType, verbose=TRUE) { |
778 | 778 |
protocolData=RG@protocolData, storage.mode="environment") |
779 | 779 |
featureNames(XY) = ids # featureNames(RG) |
780 | 780 |
gc() |
781 |
+ # Need to initialize - matrices filled with NAs to begin with |
|
782 |
+ XY@assayData$X[1:nsnps,] = 0 |
|
783 |
+ XY@assayData$Y[1:nsnps,] = 0 |
|
784 |
+ XY@assayData$zero[1:nsnps,] = 0 |
|
781 | 785 |
|
782 | 786 |
# First sort out Infinium II SNPs, X -> R (allele A) and Y -> G (allele B) from the same probe |
783 | 787 |
XY@assayData$X[!is.na(aord),] = exprs(channel(RG, "R"))[aord[!is.na(aord)],] # mostly red |
... | ... |
@@ -866,8 +870,8 @@ stripNormalize = function(XY, useTarget=TRUE, verbose=TRUE) { |
866 | 870 |
tmp = normalize.quantiles.use.target(as.matrix(cbind(subX, subY)), targetdist[[s]]) |
867 | 871 |
else |
868 | 872 |
tmp = normalize.quantiles(as.matrix(cbind(subX, subY))) |
869 |
- XY@assayData$X[sel,] = tmp[,1:(ncol(tmp)/2)]+16 |
|
870 |
- XY@assayData$Y[sel,] = tmp[,(ncol(tmp)/2+1):ncol(tmp)]+16 |
|
873 |
+ XY@assayData$X[sel,] = matrix(as.integer(tmp[,1:(ncol(tmp)/2)]+16), nrow(tmp), ncol(tmp)/2) |
|
874 |
+ XY@assayData$Y[sel,] = matrix(as.integer(tmp[,(ncol(tmp)/2+1):ncol(tmp)]+16), nrow(tmp), ncol(tmp)/2) |
|
871 | 875 |
# Xqws[sel,] = tmp[,1:(ncol(tmp)/2)] |
872 | 876 |
# Yqws[sel,] = tmp[,(ncol(tmp)/2+1):ncol(tmp)] |
873 | 877 |
rm(subX, subY, tmp, sel) |
... | ... |
@@ -972,6 +976,7 @@ preprocessInfinium2 <- function(XY, mixtureSampleSize=10^5, |
972 | 976 |
## (like what happened to GCRMA) |
973 | 977 |
set.seed(seed) |
974 | 978 |
idx <- sort(sample(autosomeIndex, mixtureSampleSize)) |
979 |
+ idx2 <- sample(nprobes, 10^5) |
|
975 | 980 |
|
976 | 981 |
##S will hold (A+B)/2 and M will hold A-B |
977 | 982 |
##NOTE: We actually dont need to save S. Only for pics etc... |
... | ... |
@@ -990,7 +995,7 @@ preprocessInfinium2 <- function(XY, mixtureSampleSize=10^5, |
990 | 995 |
if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=narrays, style=3) |
991 | 996 |
} |
992 | 997 |
|
993 |
- idx2 <- sample(nprobes, 10^5) |
|
998 |
+ |
|
994 | 999 |
for(i in 1:narrays){ |
995 | 1000 |
SKW[i] = mean((A[idx2,i]-mean(A[idx2,i]))^3)/(sd(A[idx2,i])^3) |
996 | 1001 |
if(fitMixture){ |
... | ... |
@@ -1034,10 +1039,10 @@ preprocessInfinium2v2 <- function(XY, mixtureSampleSize=10^5, |
1034 | 1039 |
cdfName, |
1035 | 1040 |
sns, |
1036 | 1041 |
stripNorm=TRUE, |
1037 |
- useTarget=TRUE) { #, |
|
1038 |
-# save.it=FALSE, |
|
1039 |
-# snpFile, |
|
1040 |
-# cnFile) { |
|
1042 |
+ useTarget=TRUE, # ) { #, |
|
1043 |
+ save.it=FALSE, |
|
1044 |
+ snpFile, |
|
1045 |
+ cnFile) { |
|
1041 | 1046 |
if(stripNorm) |
1042 | 1047 |
XY = stripNormalize(XY, useTarget=useTarget, verbose=verbose) |
1043 | 1048 |
|
... | ... |
@@ -1069,29 +1074,30 @@ preprocessInfinium2v2 <- function(XY, mixtureSampleSize=10^5, |
1069 | 1074 |
# bIndex <- getVarInEnv("bIndex") |
1070 | 1075 |
SMEDIAN <- getVarInEnv("SMEDIAN") |
1071 | 1076 |
theKnots <- getVarInEnv("theKnots") |
1072 |
- gns <- featureNames(XY) # getVarInEnv("gns") # needs to include np probes - gns is only for snps |
|
1073 |
- |
|
1074 |
- # separate out copy number probes |
|
1075 |
- npIndex = getVarInEnv("npProbesFid") |
|
1076 |
-# nprobes = length(npIndex) |
|
1077 |
+# gns <- featureNames(XY) # getVarInEnv("gns") # needs to include np probes - gns is only for snps |
|
1077 | 1078 |
narrays = ncol(XY) |
1078 |
-# A <- matrix(as.integer(exprs(channel(XY, "X"))[npIndex,]), nprobes, narrays) |
|
1079 |
-# B <- matrix(as.integer(exprs(channel(XY, "Y"))[npIndex,]), nprobes, narrays) |
|
1079 |
+ |
|
1080 |
+ if(save.it & !missing(cnFile)) { |
|
1081 |
+ # separate out copy number probes |
|
1082 |
+ npIndex = getVarInEnv("npProbesFid") |
|
1083 |
+ nprobes = length(npIndex) |
|
1084 |
+ A <- matrix(as.integer(exprs(channel(XY, "X"))[npIndex,]), nprobes, narrays) |
|
1085 |
+ B <- matrix(as.integer(exprs(channel(XY, "Y"))[npIndex,]), nprobes, narrays) |
|
1080 | 1086 |
|
1081 |
- # new lines below - useful to keep track of zeroed out probes |
|
1082 |
-# zero <- matrix(as.integer(exprs(channel(XY, "zero"))[npIndex,]), nprobes, narrays) |
|
1087 |
+ # new lines below - useful to keep track of zeroed out probes |
|
1088 |
+ zero <- matrix(as.integer(exprs(channel(XY, "zero"))[npIndex,]), nprobes, narrays) |
|
1083 | 1089 |
|
1084 |
-# colnames(A) <- colnames(B) <- colnames(zero) <- sns |
|
1085 |
-# rownames(A) <- rownames(B) <- rownames(zero) <- names(npIndex) |
|
1090 |
+ colnames(A) <- colnames(B) <- colnames(zero) <- sns |
|
1091 |
+ rownames(A) <- rownames(B) <- rownames(zero) <- names(npIndex) |
|
1086 | 1092 |
|
1087 |
-# cnAB = list(A=A, B=B, zero=zero, sns=sns, gns=names(npIndex), cdfName=cdfName) |
|
1088 |
-# if(save.it & !missing(cnFile)) { |
|
1089 |
-# t0 <- proc.time() |
|
1090 |
-# save(cnAB, file=cnFile) |
|
1091 |
-# t0 <- proc.time()-t0 |
|
1092 |
-# if(verbose) message("Used ", round(t0[3],1), " seconds to save ", cnFile, ".") |
|
1093 |
-# } |
|
1094 |
-# rm(cnAB, B, zero) |
|
1093 |
+ cnAB = list(A=A, B=B, zero=zero, sns=sns, gns=names(npIndex), cdfName=cdfName) |
|
1094 |
+ |
|
1095 |
+ t0 <- proc.time() |
|
1096 |
+ save(cnAB, file=cnFile) |
|
1097 |
+ t0 <- proc.time()-t0 |
|
1098 |
+ if(verbose) message("Used ", round(t0[3],1), " seconds to save ", cnFile, ".") |
|
1099 |
+ rm(cnAB, B, zero) |
|
1100 |
+ } |
|
1095 | 1101 |
|
1096 | 1102 |
# next process snp probes |
1097 | 1103 |
snpIndex = getVarInEnv("snpProbesFid") |
... | ... |
@@ -1099,12 +1105,12 @@ preprocessInfinium2v2 <- function(XY, mixtureSampleSize=10^5, |
1099 | 1105 |
|
1100 | 1106 |
##We will read each cel file, summarize, and run EM one by one |
1101 | 1107 |
##We will save parameters of EM to use later |
1102 |
-# mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(narrays), "double") |
|
1103 |
-# SNR <- initializeBigVector("crlmmSNR-", narrays, "double") |
|
1104 |
-# SKW <- initializeBigVector("crlmmSKW-", narrays, "double") |
|
1105 |
- mixtureParams <- matrix(0, 4, narrays) |
|
1106 |
- SNR <- vector("numeric", narrays) |
|
1107 |
- SKW <- vector("numeric", narrays) |
|
1108 |
+ mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, narrays, "double") |
|
1109 |
+ SNR <- initializeBigVector("crlmmSNR-", narrays, "double") |
|
1110 |
+ SKW <- initializeBigVector("crlmmSKW-", narrays, "double") |
|
1111 |
+# mixtureParams <- matrix(0, 4, narrays) |
|
1112 |
+# SNR <- vector("numeric", narrays) |
|
1113 |
+# SKW <- vector("numeric", narrays) |
|
1108 | 1114 |
|
1109 | 1115 |
## This is the sample for the fitting of splines |
1110 | 1116 |
## BC: I like better the idea of the user passing the seed, |
... | ... |
@@ -1112,15 +1118,16 @@ preprocessInfinium2v2 <- function(XY, mixtureSampleSize=10^5, |
1112 | 1118 |
## (like what happened to GCRMA) |
1113 | 1119 |
set.seed(seed) |
1114 | 1120 |
idx <- sort(sample(autosomeIndex, mixtureSampleSize)) |
1121 |
+ idx2 <- sample(nprobes, 10^5) |
|
1115 | 1122 |
|
1116 | 1123 |
##S will hold (A+B)/2 and M will hold A-B |
1117 | 1124 |
##NOTE: We actually dont need to save S. Only for pics etc... |
1118 | 1125 |
##f is the correction. we save to avoid recomputing |
1119 |
- A <- exprs(channel(XY, "X"))[,] # ), nrow(XY), narrays) # [snpIndex,]), nprobes, narrays) # matrix(as.integer(0), length(pnsa), length(filenames)) |
|
1120 |
- B <- exprs(channel(XY, "Y"))[,] # ), nrow(XY), narrays) # [snpIndex,]), nprobes, narrays) # matrix(as.integer(0), length(pnsb), length(filenames)) |
|
1126 |
+# A <- exprs(channel(XY, "X"))[,] # ), nrow(XY), narrays) # [snpIndex,]), nprobes, narrays) # matrix(as.integer(0), length(pnsa), length(filenames)) |
|
1127 |
+# B <- exprs(channel(XY, "Y"))[,] # ), nrow(XY), narrays) # [snpIndex,]), nprobes, narrays) # matrix(as.integer(0), length(pnsb), length(filenames)) |
|
1121 | 1128 |
|
1122 | 1129 |
# new lines below - useful to keep track of zeroed out SNPs |
1123 |
- zero <- exprs(channel(XY, "zero"))[,] # )) #[snpIndex,]), nprobes, narrays) |
|
1130 |
+# zero <- exprs(channel(XY, "zero"))[,] # )) #[snpIndex,]), nprobes, narrays) |
|
1124 | 1131 |
|
1125 | 1132 |
# if(!is.matrix(A)) { |
1126 | 1133 |
# A = A[,] |
... | ... |
@@ -1128,25 +1135,32 @@ preprocessInfinium2v2 <- function(XY, mixtureSampleSize=10^5, |
1128 | 1135 |
# zero = zero[,] |
1129 | 1136 |
# } |
1130 | 1137 |
|
1131 |
- if(!is.integer(A)) { |
|
1132 |
- A = matrix(as.integer(A), nrow(A), ncol(A)) |
|
1133 |
- B = matrix(as.integer(B), nrow(B), ncol(B)) |
|
1134 |
- } |
|
1138 |
+# if(!is.integer(A)) { |
|
1139 |
+# A = matrix(as.integer(A), nrow(A), ncol(A)) |
|
1140 |
+# B = matrix(as.integer(B), nrow(B), ncol(B)) |
|
1141 |
+# } |
|
1135 | 1142 |
|
1136 | 1143 |
# colnames(A) <- colnames(B) <- colnames(zero) <- sns |
1137 | 1144 |
# rownames(A) <- rownames(B) <- rownames(zero) <- names(snpIndex) # gns # featureNames(XY) |
1145 |
+ |
|
1146 |
+ A <- initializeBigMatrix("crlmmA-", nprobes, narrays, "integer") |
|
1147 |
+ B <- initializeBigMatrix("crlmmB-", nprobes, narrays, "integer") |
|
1148 |
+ zero <- initializeBigMatrix("crlmmZero-", nprobes, narrays, "integer") |
|
1138 | 1149 |
|
1139 | 1150 |
if(verbose){ |
1140 | 1151 |
message("Calibrating ", narrays, " arrays.") |
1141 | 1152 |
if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=narrays, style=3) |
1142 | 1153 |
} |
1143 | 1154 |
|
1144 |
- idx2 <- sample(nprobes, 10^5) |
|
1145 | 1155 |
for(i in 1:narrays){ |
1146 |
- SKW[i] = mean((A[snpIndex,i][idx2]-mean(A[snpIndex,i][idx2]))^3)/(sd(A[snpIndex,i][idx2])^3) |
|
1156 |
+ A[,i] = exprs(channel(XY, "X"))[snpIndex,i] |
|
1157 |
+ B[,i] = exprs(channel(XY, "Y"))[snpIndex,i] |
|
1158 |
+ zero[,i] = exprs(channel(XY, "zero"))[snpIndex,i] |
|
1159 |
+# SKW[i] = mean((A[snpIndex,i][idx2]-mean(A[snpIndex,i][idx2]))^3)/(sd(A[snpIndex,i][idx2])^3) |
|
1160 |
+ SKW[i] = mean((A[idx2,i]-mean(A[idx2,i]))^3)/(sd(A[idx2,i])^3) |
|
1147 | 1161 |
if(fitMixture){ |
1148 |
- S <- (log2(A[snpIndex,i][idx])+log2(B[snpIndex,i][idx]))/2 - SMEDIAN |
|
1149 |
- M <- log2(A[snpIndex,i][idx])-log2(B[snpIndex,i][idx]) |
|
1162 |
+ S <- (log2(A[idx,i])+log2(B[idx,i]))/2 - SMEDIAN |
|
1163 |
+ M <- log2(A[idx,i])-log2(B[idx,i]) |
|
1150 | 1164 |
|
1151 | 1165 |
##we need to test the choice of eps.. it is not the max diff between funcs |
1152 | 1166 |
tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps) |
... | ... |
@@ -1165,14 +1179,22 @@ preprocessInfinium2v2 <- function(XY, mixtureSampleSize=10^5, |
1165 | 1179 |
} |
1166 | 1180 |
if (!fitMixture) SNR <- mixtureParams <- NA |
1167 | 1181 |
## gns comes from preprocStuff.rda |
1168 |
- res = list(A=A, B=B, zero=zero, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName, snpIndex=snpIndex, npIndex=npIndex) |
|
1182 |
+ res = list(A=A, B=B, |
|
1183 |
+ zero=zero, sns=sns, gns=names(snpIndex), SNR=SNR, SKW=SKW, |
|
1184 |
+ mixtureParams=mixtureParams, cdfName=cdfName) # , snpIndex=snpIndex, npIndex=npIndex) |
|
1169 | 1185 |
|
1170 |
-# if(save.it & !missing(snpFile)) { |
|
1171 |
-# t0 <- proc.time() |
|
1172 |
-# save(res, file=snpFile) |
|
1173 |
-# t0 <- proc.time()-t0 |
|
1174 |
-# if(verbose) message("Used ", round(t0[3],1), " seconds to save ", snpFile, ".") |
|
1175 |
-# } |
|
1186 |
+ if(save.it & !missing(snpFile)) { |
|
1187 |
+ t0 <- proc.time() |
|
1188 |
+ save(res, file=snpFile) |
|
1189 |
+ t0 <- proc.time()-t0 |
|
1190 |
+ if(verbose) message("Used ", round(t0[3],1), " seconds to save ", snpFile, ".") |
|
1191 |
+ } |
|
1192 |
+ close(A) |
|
1193 |
+ close(B) |
|
1194 |
+ close(zero) |
|
1195 |
+ close(SKW) |
|
1196 |
+ close(mixtureParams) |
|
1197 |
+ close(SNR) |
|
1176 | 1198 |
return(res) |
1177 | 1199 |
} |
1178 | 1200 |
|
... | ... |
@@ -1213,8 +1235,8 @@ crlmmIllumina <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE, |
1213 | 1235 |
if(!any(obj == "res")) |
1214 | 1236 |
stop("Object in ", snpFile, " seems to be invalid.") |
1215 | 1237 |
} |
1216 |
- if(row.names) row.names=res$gns else row.names=NULL |
|
1217 |
- if(col.names) col.names=res$sns else col.names=NULL |
|
1238 |
+ if(row.names) row.names=res[["gns"]] else row.names=NULL |
|
1239 |
+ if(col.names) col.names=res[["sns"]] else col.names=NULL |
|
1218 | 1240 |
|
1219 | 1241 |
res2 <- crlmmGT(res[["A"]], res[["B"]], res[["SNR"]], |
1220 | 1242 |
res[["mixtureParams"]], res[["cdfName"]], |
... | ... |
@@ -1235,21 +1257,21 @@ crlmmIllumina <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE, |
1235 | 1257 |
crlmmIllumina2 <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE, |
1236 | 1258 |
row.names=TRUE, col.names=TRUE, |
1237 | 1259 |
probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL, |
1238 |
- seed=1, # save.it=FALSE, load.it=FALSE, snpFile, cnFile, |
|
1260 |
+ seed=1, save.it=FALSE, load.it=FALSE, snpFile, cnFile, |
|
1239 | 1261 |
mixtureSampleSize=10^5, eps=0.1, verbose=TRUE, |
1240 | 1262 |
cdfName, sns, recallMin=10, recallRegMin=1000, |
1241 | 1263 |
returnParams=FALSE, badSNP=.7) { |
1242 |
-# if (save.it & (missing(snpFile) | missing(cnFile))) |
|
1243 |
-# stop("'snpFile' and/or 'cnFile' is missing and you chose to save.it") |
|
1244 |
-# if (load.it & missing(snpFile)) |
|
1245 |
-# stop("'snpFile' is missing and you chose to load.it") |
|
1246 |
-# if (!missing(snpFile)) |
|
1247 |
-# if (load.it & !file.exists(snpFile)){ |
|
1248 |
-# load.it <- FALSE |
|
1249 |
-# message("File ", snpFile, " does not exist.") |
|
1250 |
-# stop("Cannot load SNP data.") |
|
1251 |
-# } |
|
1252 |
-# if (!load.it){ |
|
1264 |
+ if (save.it & (missing(snpFile) | missing(cnFile))) |
|
1265 |
+ stop("'snpFile' and/or 'cnFile' is missing and you chose to save.it") |
|
1266 |
+ if (load.it & missing(snpFile)) |
|
1267 |
+ stop("'snpFile' is missing and you chose to load.it") |
|
1268 |
+ if (!missing(snpFile)) |
|
1269 |
+ if (load.it & !file.exists(snpFile)){ |
|
1270 |
+ load.it <- FALSE |
|
1271 |
+ message("File ", snpFile, " does not exist.") |
|
1272 |
+ stop("Cannot load SNP data.") |
|
1273 |
+ } |
|
1274 |
+ if (!load.it){ |
|
1253 | 1275 |
if(!missing(RG)) { |
1254 | 1276 |
if(missing(XY)) |
1255 | 1277 |
XY = RGtoXY2(RG, chipType=cdfName) |
... | ... |
@@ -1262,48 +1284,51 @@ crlmmIllumina2 <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE, |
1262 | 1284 |
seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget) #, |
1263 | 1285 |
# save.it=save.it, snpFile=snpFile, cnFile=cnFile) |
1264 | 1286 |
|
1265 |
- fD = featureData(XY) |
|
1266 |
- phenD = XY@phenoData |
|
1267 |
- protD = XY@protocolData |
|
1268 |
- rm(XY) |
|
1269 |
- gc() |
|
1270 |
- if(verbose) message("Initializing container for alleleA, alleleB, call, callProbability") |
|
1271 |
- callSet <- new("SnpSuperSet", |
|
1272 |
- alleleA=initializeBigMatrix(name="A", nr=nrow(res[[1]]), nc=length(sns)), |
|
1273 |
- alleleB=initializeBigMatrix(name="B", nr=nrow(res[[1]]), nc=length(sns)), |
|
1274 |
- call=initializeBigMatrix(name="call", nr=nrow(res[[1]]), nc=length(sns)), |
|
1275 |
- callProbability=initializeBigMatrix(name="callPr", nr=nrow(res[[1]]), nc=length(sns)), |
|
1276 |
- annotation=cdfName, protocolData=protD, phenoData=phenD, featureData=fD) |
|
1277 |
- sampleNames(callSet) <- sns |
|
1278 |
- featureNames(callSet) <- res[["gns"]] |
|
1279 |
- pData(callSet)$SKW <- rep(NA, length(sns)) |
|
1280 |
- pData(callSet)$SNR <- rep(NA, length(sns)) |
|
1281 |
- pData(callSet)$gender <- rep(NA, length(sns)) |
|
1287 |
+# fD = featureData(XY) |
|
1288 |
+# phenD = XY@phenoData |
|
1289 |
+# protD = XY@protocolData |
|
1290 |
+# rm(XY) |
|
1291 |
+# gc() |
|
1292 |
+# if(verbose) message("Initializing container for alleleA, alleleB, call, callProbability") |
|
1293 |
+# callSet <- new("SnpSuperSet", |
|
1294 |
+# alleleA=initializeBigMatrix(name="A", nr=nrow(res[[1]]), nc=length(sns)), |
|
1295 |
+# alleleB=initializeBigMatrix(name="B", nr=nrow(res[[1]]), nc=length(sns)), |
|
1296 |
+# call=initializeBigMatrix(name="call", nr=nrow(res[[1]]), nc=length(sns)), |
|
1297 |
+# callProbability=initializeBigMatrix(name="callPr", nr=nrow(res[[1]]), nc=length(sns)), |
|
1298 |
+# annotation=cdfName, protocolData=protD, phenoData=phenD, featureData=fD) |
|
1299 |
+# sampleNames(callSet) <- sns |
|
1300 |
+# featureNames(callSet) <- res[["gns"]] |
|
1301 |
+# pData(callSet)$SKW <- rep(NA, length(sns)) |
|
1302 |
+# pData(callSet)$SNR <- rep(NA, length(sns)) |
|
1303 |
+# pData(callSet)$gender <- rep(NA, length(sns)) |
|
1282 | 1304 |
|
1283 |
-# }else{ |
|
1284 |
-# if(verbose) message("Loading ", snpFile, ".") |
|
1285 |
-# obj <- load(snpFile) |
|
1286 |
-# if(verbose) message("Done.") |
|
1287 |
-# if(!any(obj == "res")) |
|
1288 |
-# stop("Object in ", snpFile, " seems to be invalid.") |
|
1289 |
-# } |
|
1305 |
+ }else{ |
|
1306 |
+ if(verbose) message("Loading ", snpFile, ".") |
|
1307 |
+ obj <- load(snpFile) |
|
1308 |
+ if(verbose) message("Done.") |
|
1309 |
+ if(!any(obj == "res")) |
|
1310 |
+ stop("Object in ", snpFile, " seems to be invalid.") |
|
1311 |
+ } |
|
1290 | 1312 |
|
1291 |
- rm(phenD, protD , fD) |
|
1313 |
+ # rm(phenD, protD , fD) |
|
1292 | 1314 |
|
1293 |
- snp.index <- res$snpIndex #match(res$gns, featureNames(callSet)) |
|
1294 |
- suppressWarnings(A(callSet) <- res[["A"]]) |
|
1295 |
- suppressWarnings(B(callSet) <- res[["B"]]) |
|
1296 |
- pData(callSet)$SKW <- res$SKW |
|
1297 |
- pData(callSet)$SNR <- res$SNR |
|
1298 |
- mixtureParams <- res$mixtureParams |
|
1299 |
- rm(res); gc() |
|
1300 |
- tmp <- crlmmGT(A=as.matrix(A(callSet)[snp.index,]), # j]), |
|
1301 |
- B=as.matrix(B(callSet)[snp.index,]), # j]), |
|
1302 |
- SNR=callSet$SNR, # [j], |
|
1303 |
- mixtureParams=mixtureParams, |
|
1304 |
- cdfName=annotation(callSet), |
|
1305 |
-# row.names=featureNames(callSet)[snp.index], |
|
1306 |
-# col.names=sampleNames(callSet), #[j], |
|
1315 |
+# snp.index <- res$snpIndex #match(res$gns, featureNames(callSet)) |
|
1316 |
+# suppressWarnings(A(callSet) <- res[["A"]]) |
|
1317 |
+# suppressWarnings(B(callSet) <- res[["B"]]) |
|
1318 |
+# pData(callSet)$SKW <- res$SKW |
|
1319 |
+# pData(callSet)$SNR <- res$SNR |
|
1320 |
+# mixtureParams <- res$mixtureParams |
|
1321 |
+# rm(res); gc() |
|
1322 |
+ if(row.names) row.names=res$gns else row.names=NULL |
|
1323 |
+ if(col.names) col.names=res$sns else col.names=NULL |
|
1324 |
+ |
|
1325 |
+ res2 <- crlmmGT2(A=res[["A"]], #as.matrix(A(callSet)[snp.index,]), # j]), |
|
1326 |
+ B=res[["B"]], # as.matrix(B(callSet)[snp.index,]), # j]), |
|
1327 |
+ SNR=res[["SNR"]], # callSet$SNR, # [j], |
|
1328 |
+ mixtureParams=res[["mixtureParams"]], #, |
|
1329 |
+ cdfName=res[["cdfName"]], # annotation(callSet), |
|
1330 |
+ row.names=row.names, # featureNames(callSet)[snp.index], |
|
1331 |
+ col.names=col.names, # sampleNames(callSet), #[j], |
|
1307 | 1332 |
probs=probs, |
1308 | 1333 |
DF=DF, |
1309 | 1334 |
SNRMin=SNRMin, |
... | ... |
@@ -1313,14 +1338,20 @@ crlmmIllumina2 <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE, |
1313 | 1338 |
verbose=verbose, |
1314 | 1339 |
returnParams=returnParams, |
1315 | 1340 |
badSNP=badSNP) |
1341 |
+# rm(res); gc() |
|
1316 | 1342 |
# suppressWarnings(snpCall(callSet)[snp.index, j] <- tmp[["calls"]]) |
1317 | 1343 |
# suppressWarnings(snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]]) |
1318 | 1344 |
# callSet$gender[j] <- tmp$gender |
1319 |
- suppressWarnings(snpCall(callSet)[snp.index,] <- tmp[["calls"]]) |
|
1320 |
- suppressWarnings(snpCallProbability(callSet)[snp.index,] <- tmp[["confs"]]) |
|
1321 |
- callSet$gender <- tmp$gender |
|
1322 |
- rm(tmp); gc() |
|
1323 |
- return(callSet) |
|
1345 |
+# suppressWarnings(snpCall(callSet)[snp.index,] <- tmp[["calls"]]) |
|
1346 |
+# suppressWarnings(snpCallProbability(callSet)[snp.index,] <- tmp[["confs"]]) |
|
1347 |
+# callSet$gender <- tmp$gender |
|
1348 |
+# rm(tmp); gc() |
|
1349 |
+# return(callSet) |
|
1350 |
+ |
|
1351 |
+ res2[["SNR"]] <- res[["SNR"]] |
|
1352 |
+ res2[["SKW"]] <- res[["SKW"]] |
|
1353 |
+ rm(res); gc() |
|
1354 |
+ return(list2SnpSet(res2, returnParams=returnParams)) |
|
1324 | 1355 |
} |
1325 | 1356 |
|
1326 | 1357 |
|
... | ... |
@@ -1343,7 +1374,7 @@ crlmmIlluminaV2 = function(sampleSheet=NULL, |
1343 | 1374 |
row.names=TRUE, |
1344 | 1375 |
col.names=TRUE, |
1345 | 1376 |
probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL, |
1346 |
- seed=1, # save.ab=FALSE, snpFile, cnFile, |
|
1377 |
+ seed=1, save.ab=FALSE, snpFile, cnFile, |
|
1347 | 1378 |
mixtureSampleSize=10^5, eps=0.1, verbose=TRUE, |
1348 | 1379 |
cdfName, sns, recallMin=10, recallRegMin=1000, |
1349 | 1380 |
returnParams=FALSE, badSNP=.7) { |
... | ... |
@@ -1354,8 +1385,8 @@ crlmmIlluminaV2 = function(sampleSheet=NULL, |
1354 | 1385 |
|
1355 | 1386 |
# if (save.rg & missing(rgFile)) |
1356 | 1387 |
# stop("'rgFile' is missing, and you chose save.rg") |
1357 |
-# if (save.ab & (missing(snpFile) | missing(cnFile))) |
|
1358 |
-# stop("'snpFile' or 'cnFile' is missing and you chose save.ab") |
|
1388 |
+ if (save.ab & (missing(snpFile) | missing(cnFile))) |
|
1389 |
+ stop("'snpFile' or 'cnFile' is missing and you chose save.ab") |
|
1359 | 1390 |
# batches = NULL |
1360 | 1391 |
# if(!is.null(arrayNames)) |
1361 | 1392 |
# batches <- rep(1, length(arrayNames)) # problem here if arrayNames not specified! # splitIndicesByLength(seq(along=arrayNames), ocSamples()) |
... | ... |
@@ -1377,27 +1408,25 @@ crlmmIlluminaV2 = function(sampleSheet=NULL, |
1377 | 1408 |
# save(RG, file=rgFile) |
1378 | 1409 |
|
1379 | 1410 |
XY = RGtoXY2(RG, chipType=cdfName) |
1380 |
- rm(RG) |
|
1381 |
- gc() |
|
1411 |
+ rm(RG); gc() |
|
1382 | 1412 |
if (missing(sns)) { sns = sampleNames(XY) #subsns = sampleNames(XY) |
1383 |
- } else subsns = sns[j] |
|
1413 |
+ } # else subsns = sns[j] |
|
1384 | 1414 |
res = preprocessInfinium2v2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose, |
1385 |
- seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget) # sns=subsns |
|
1386 |
-# save.it=save.ab, snpFile=snpFile, cnFile=cnFile) |
|
1387 |
- fD = featureData(XY) |
|
1388 |
- phenD = XY@phenoData |
|
1389 |
- protD = XY@protocolData |
|
1390 |
- rm(XY) |
|
1391 |
- gc() |
|
1415 |
+ seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget, #) # sns=subsns |
|
1416 |
+ save.it=save.ab, snpFile=snpFile, cnFile=cnFile) |
|
1417 |
+# fD = featureData(XY) |
|
1418 |
+# phenD = XY@phenoData |
|
1419 |
+# protD = XY@protocolData |
|
1420 |
+ rm(XY); gc() |
|
1392 | 1421 |
# if(k == 1){ |
1393 |
- if(verbose) message("Initializing container for alleleA, alleleB, call, callProbability") |
|
1394 |
- callSet <- new("SnpSuperSet", |
|
1395 |
- alleleA=initializeBigMatrix(name="A", nr=nrow(res[[1]]), nc=length(sns)), |
|
1396 |
- alleleB=initializeBigMatrix(name="B", nr=nrow(res[[1]]), nc=length(sns)), |
|
1397 |
- call=initializeBigMatrix(name="call", nr=nrow(res[[1]]), nc=length(sns)), |
|
1398 |
- callProbability=initializeBigMatrix(name="callPr", nr=nrow(res[[1]]), nc=length(sns)), |
|
1399 |
- annotation=cdfName, protocolData=protD, phenoData=phenD, featureData=fD) |
|
1400 |
- sampleNames(callSet) <- sns |
|
1422 |
+# if(verbose) message("Initializing container for alleleA, alleleB, call, callProbability") |
|
1423 |
+# callSet <- new("SnpSuperSet", |
|
1424 |
+# alleleA=initializeBigMatrix(name="A", nr=nrow(res[[1]]), nc=length(sns)), |
|
1425 |
+# alleleB=initializeBigMatrix(name="B", nr=nrow(res[[1]]), nc=length(sns)), |
|
1426 |
+# call=initializeBigMatrix(name="call", nr=nrow(res[[1]]), nc=length(sns)), |
|
1427 |
+# callProbability=initializeBigMatrix(name="callPr", nr=nrow(res[[1]]), nc=length(sns)), |
|
1428 |
+# annotation=cdfName, protocolData=protD, phenoData=phenD, featureData=fD) |
|
1429 |
+# sampleNames(callSet) <- sns |
|
1401 | 1430 |
# phenoData(callSet) <- getPhenoData(sampleSheet=sampleSheet, |
1402 | 1431 |
# arrayNames=sns, |
1403 | 1432 |
# arrayInfoColNames=arrayInfoColNames) |
... | ... |
@@ -1405,17 +1434,17 @@ crlmmIlluminaV2 = function(sampleSheet=NULL, |
1405 | 1434 |
# colnames(pD) <- "ScanDate" |
1406 | 1435 |
# protocolData(callSet) <- pData(protD) # new("AnnotatedDataFrame", data=pD) |
1407 | 1436 |
# pData(protocolData(callSet))[j, ] <- pData(protocolData) |
1408 |
- featureNames(callSet) <- res[["gns"]] |
|
1409 |
- pData(callSet)$SKW <- rep(NA, length(sns)) |
|
1410 |
- pData(callSet)$SNR <- rep(NA, length(sns)) |
|
1411 |
- pData(callSet)$gender <- rep(NA, length(sns)) |
|
1437 |
+# featureNames(callSet) <- res[["gns"]] |
|
1438 |
+# pData(callSet)$SKW <- rep(NA, length(sns)) |
|
1439 |
+# pData(callSet)$SNR <- rep(NA, length(sns)) |
|
1440 |
+# pData(callSet)$gender <- rep(NA, length(sns)) |
|
1412 | 1441 |
# } |
1413 | 1442 |
# pData(callSet)[j,] <- phenD |
1414 | 1443 |
# pData(protocolData(callSet))[j,] <- protD |
1415 | 1444 |
# pData(callSet) <- phenD |
1416 | 1445 |
# pData(protocolData(callSet)) <- protD |
1417 | 1446 |
|
1418 |
- rm(phenD, protD, fD) |
|
1447 |
+# rm(phenD, protD, fD) |
|
1419 | 1448 |
|
1420 | 1449 |
# if(k > 1 & nrow(res[[1]]) != nrow(callSet)){ |
1421 | 1450 |
##RS: I don't understand why the IDATS for the |
... | ... |
@@ -1424,24 +1453,26 @@ crlmmIlluminaV2 = function(sampleSheet=NULL, |
1424 | 1453 |
# res[["B"]] <- res[["B"]][res$gns %in% featureNames(callSet), ] |
1425 | 1454 |
# } |
1426 | 1455 |
|
1427 |
- snp.index <- res$snpIndex #match(res$gns, featureNames(callSet)) |
|
1456 |
+# snp.index <- res$snpIndex #match(res$gns, featureNames(callSet)) |
|
1428 | 1457 |
# suppressWarnings(A(callSet)[, j] <- res[["A"]]) |
1429 | 1458 |
# suppressWarnings(B(callSet)[, j] <- res[["B"]]) |
1430 |
- suppressWarnings(A(callSet) <- res[["A"]]) |
|
1431 |
- suppressWarnings(B(callSet) <- res[["B"]]) |
|
1459 |
+# suppressWarnings(A(callSet) <- res[["A"]]) |
|
1460 |
+# suppressWarnings(B(callSet) <- res[["B"]]) |
|
1432 | 1461 |
# pData(callSet)$SKW[j] <- res$SKW |
1433 | 1462 |
# pData(callSet)$SNR[j] <- res$SNR |
1434 |
- pData(callSet)$SKW <- res$SKW |
|
1435 |
- pData(callSet)$SNR <- res$SNR |
|
1436 |
- mixtureParams <- res$mixtureParams |
|
1437 |
- rm(res); gc() |
|
1438 |
- tmp <- crlmmGT(A=as.matrix(A(callSet)[snp.index,]), # j]), |
|
1439 |
- B=as.matrix(B(callSet)[snp.index,]), # j]), |
|
1440 |
- SNR=callSet$SNR, # [j], |
|
1441 |
- mixtureParams=mixtureParams, |
|
1442 |
- cdfName=annotation(callSet), |
|
1443 |
- row.names=featureNames(callSet)[snp.index], |
|
1444 |
- col.names=sampleNames(callSet), #[j], |
|
1463 |
+# pData(callSet)$SKW <- res$SKW |
|
1464 |
+# pData(callSet)$SNR <- res$SNR |
|
1465 |
+# mixtureParams <- res$mixtureParams |
|
1466 |
+# rm(res); gc() |
|
1467 |
+ if(row.names) row.names=res$gns else row.names=NULL |
|
1468 |
+ if(col.names) col.names=res$sns else col.names=NULL |
|
1469 |
+ res2 <- crlmmGT2(A=res[["A"]], #as.matrix(A(callSet)[snp.index,]), # j]), |
|
1470 |
+ B=res[["B"]], # as.matrix(B(callSet)[snp.index,]), # j]), |
|
1471 |
+ SNR=res[["SNR"]], # callSet$SNR, # [j], |
|
1472 |
+ mixtureParams=res[["mixtureParams"]], #, |
|
1473 |
+ cdfName=res[["cdfName"]], # annotation(callSet), |
|
1474 |
+ row.names=row.names, # featureNames(callSet)[snp.index], |
|
1475 |
+ col.names=col.names, # sampleNames(callSet), #[j], |
|
1445 | 1476 |
probs=probs, |
1446 | 1477 |
DF=DF, |
1447 | 1478 |
SNRMin=SNRMin, |
... | ... |
@@ -1451,14 +1482,44 @@ crlmmIlluminaV2 = function(sampleSheet=NULL, |
1451 | 1482 |
verbose=verbose, |
1452 | 1483 |
returnParams=returnParams, |
1453 | 1484 |
badSNP=badSNP) |
1485 |
+# rm(res); gc() |
|
1486 |
+# suppressWarnings(snpCall(callSet)[snp.index, j] <- tmp[["calls"]]) |
|
1487 |
+# suppressWarnings(snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]]) |
|
1488 |
+# callSet$gender[j] <- tmp$gender |
|
1489 |
+# suppressWarnings(snpCall(callSet)[snp.index,] <- tmp[["calls"]]) |
|
1490 |
+# suppressWarnings(snpCallProbability(callSet)[snp.index,] <- tmp[["confs"]]) |
|
1491 |
+# callSet$gender <- tmp$gender |
|
1492 |
+# rm(tmp); gc() |
|
1493 |
+# return(callSet) |
|
1494 |
+ |
|
1495 |
+ res2[["SNR"]] <- res[["SNR"]] |
|
1496 |
+ res2[["SKW"]] <- res[["SKW"]] |
|
1497 |
+ rm(res); gc() |
|
1498 |
+ return(list2SnpSet(res2, returnParams=returnParams)) |
|
1499 |
+# tmp <- crlmmGT(A=as.matrix(A(callSet)[snp.index,]), # j]), |
|
1500 |
+# B=as.matrix(B(callSet)[snp.index,]), # j]), |
|
1501 |
+# SNR=callSet$SNR, # [j], |
|
1502 |
+# mixtureParams=mixtureParams, |
|
1503 |
+# cdfName=annotation(callSet), |
|
1504 |
+# row.names=featureNames(callSet)[snp.index], |
|
1505 |
+# col.names=sampleNames(callSet), #[j], |
|
1506 |
+# probs=probs, |
|
1507 |
+# DF=DF, |
|
1508 |
+# SNRMin=SNRMin, |
|
1509 |
+# recallMin=recallMin, |
|
1510 |
+# recallRegMin=recallRegMin, |
|
1511 |
+# gender=gender, |
|
1512 |
+# verbose=verbose, |
|
1513 |
+# returnParams=returnParams, |
|
1514 |
+# badSNP=badSNP) |
|
1454 | 1515 |
# suppressWarnings(snpCall(callSet)[snp.index, j] <- tmp[["calls"]]) |
1455 | 1516 |
# suppressWarnings(snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]]) |
1456 | 1517 |
# callSet$gender[j] <- tmp$gender |
1457 |
- suppressWarnings(snpCall(callSet)[snp.index,] <- tmp[["calls"]]) |
|
1458 |
- suppressWarnings(snpCallProbability(callSet)[snp.index,] <- tmp[["confs"]]) |
|
1459 |
- callSet$gender <- tmp$gender |
|
1460 |
- rm(tmp); gc() |
|
1518 |
+# suppressWarnings(snpCall(callSet)[snp.index,] <- tmp[["calls"]]) |
|
1519 |
+# suppressWarnings(snpCallProbability(callSet)[snp.index,] <- tmp[["confs"]]) |
|
1520 |
+# callSet$gender <- tmp$gender |
|
1521 |
+# rm(tmp); gc() |
|
1461 | 1522 |
# k <- k+1 |
1462 | 1523 |
# } |
1463 |
- return(callSet) |
|
1524 |
+# return(callSet) |
|
1464 | 1525 |
} |
... | ... |
@@ -44,12 +44,15 @@ In this user guide we read in and genotype data from 40 HapMap samples |
44 | 44 |
which have been analyzed using Illumina's 370k Duo BeadChips. |
45 | 45 |
This data is available in the \Rpackage{hapmap370k} package. |
46 | 46 |
Additional chip-specific model parameters and basic SNP annotation |
47 |
-information used by CRLMM is stored in the \Rpackage{human370v1c} package. |
|
48 |
-These can be downloaded from \href{http://rafalab.jhsph.edu/software.html}{http://rafalab.jhsph.edu/software.html} |
|
49 |
-and must be installed for the following code to work. |
|
47 |
+information used by CRLMM is stored in the \Rpackage{human370v1cCrlmm} package. |
|
48 |
+These packages can be installed in the usual way using the \Rfunction{biocLite} function. |
|
50 | 49 |
|
51 |
-\section{Reading in data} |
|
50 |
+<<<echo=TRUE, results=hide, eval=FALSE>>= |
|
51 |
+source("http://www.bioconductor.org/biocLite.R") |
|
52 |
+biocLite(c("hapmap370k", "human370v1cCrlmm")) |
|
53 |
+@ |
|
52 | 54 |
|
55 |
+\section{Reading in data} |
|
53 | 56 |
The function \Rfunction{readIdatFiles} extracts the Red and Green intensities |
54 | 57 |
from the binary {\tt idat} files output by Illumina's scanning device. |
55 | 58 |
The file {\tt samples370k.csv} contains information about each sample. |
... | ... |
@@ -58,7 +61,7 @@ The file {\tt samples370k.csv} contains information about each sample. |
58 | 61 |
options(width=50) |
59 | 62 |
@ |
60 | 63 |
|
61 |
-<<read>>= |
|
64 |
+<<read, results=hide, eval=TRUE>>= |
|
62 | 65 |
library(Biobase) |
63 | 66 |
library(crlmm) |
64 | 67 |
library(hapmap370k) |
... | ... |
@@ -112,8 +115,7 @@ mtext("Array", side=1, outer=TRUE) |
112 | 115 |
|
113 | 116 |
\section{Genotyping} |
114 | 117 |
|
115 |
-Next we use the function \Rfunction{crlmmIllumina} which performs preprocessing followed by genotyping |
|
116 |
-using the CRLMM algorithm. |
|
118 |
+Next we use the function \Rfunction{crlmmIllumina} which performs preprocessing followed by genotyping using the CRLMM algorithm. |
|
117 | 119 |
|
118 | 120 |
<<genotype, results=hide, cache=TRUE>>= |
119 | 121 |
crlmmResult = crlmmIllumina(RG=RG, cdfName="human370v1c", sns=pData(RG)$ID, returnParams=TRUE) |