Browse code

Updated crlmmIllumina.Rnw

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@59206 bc3139a8-67e5-0310-9ffc-ced21a209358

unknown authored on 14/10/2011 06:34:23
Showing4 changed files

... ...
@@ -443,83 +443,78 @@ readBPM <- function(bpmFile){
443 443
 }
444 444
 
445 445
 
446
-readGenCallOutput = function(file, path=".", cdfName, verbose=FALSE) {
446
+readGenCallOutput = function(file, path=".", cdfName,
447
+    colnames=list("SampleID"="Sample ID", "SNPID"="SNP Name", "XRaw"="X Raw", "YRaw"="Y Raw"), 
448
+    type=list("SampleID"="character", "SNPID"="character", "XRaw"="integer", "YRaw"="integer"), verbose=FALSE) {
449
+
450
+    if(!identical(names(type), names(colnames)))
451
+       stop("The arguments 'colnames' and 'type' must have consistent names")
447 452
     if(verbose)
448
-  	  cat("Reading", file, "\n")
449
-	tmp=readLines(file.path(path,file),n=15)
450
-	s=c("\t",",")
453
+  	cat("Reading", file, "\n")
454
+    tmp=readLines(file.path(path,file),n=15)
455
+    s=c("\t",",")
451 456
     a=unlist(strsplit(tmp[10][1],s[1]))
452
-	if(length(a)!=1){
453
-		sepp=s[1]
457
+    if(length(a)!=1){
458
+	sepp=s[1]
454 459
       	a1=unlist(strsplit(tmp[10][1],s[1]))
455 460
     }
456
-	if(length(a)==1){
457
-		sepp=s[2]
458
-		a1=unlist(strsplit(tmp[10][1],s[2]))
461
+    if(length(a)==1){
462
+	sepp=s[2]
463
+	a1=unlist(strsplit(tmp[10][1],s[2]))
459 464
     }
460
-	
461
-	b=c("Sample ID","SNP Name","Allele1 - Forward","Allele2 - Forward","GC Score","X Raw","Y Raw")
462
-	b2=c("GC Score","X Raw","Y Raw")
463
-	
464
-	m1=m=match(a1,b)
465
-	m2=match(a1,b2)
466
-	m[is.na(m)==FALSE]<-list(character(0))
467
-	m[is.na(m2)==FALSE]<-list(numeric(0))
468
-	m[is.na(m)==TRUE]<-list(NULL)
469
-	names(m)<-paste(b[m1],names(m),sep='')
470
-	
465
+    
466
+    if(sum(is.na(match(colnames,a1)))!=0)
467
+	stop("Cannot find required columns: ", colnames[is.na(match(colnames,a1))], " in ", file, "\nPlease check whether this data was exported.") 
468
+
469
+    m1=m=match(a1,colnames)
470
+    m[is.na(m1)==FALSE] =type[m1[!is.na(m1)]]
471
+    m[is.na(m)==TRUE] = list(NULL)
472
+    names(m) = names(colnames)[m1]
473
+
471 474
     fc = file(file.path(path, file), open="r")
472 475
 	
473
-	dat = scan(fc, what=m, skip=10,sep=sepp)
474
-	close(fc)
476
+    dat = scan(fc, what=m, skip=10,sep=sepp)
477
+    close(fc)
475 478
 	
476
-	samples = unique(dat$"Sample ID")
477
-	nsamples = length(samples)
478
-	snps = unique(dat$"SNP Name")
479
-	nsnps = length(snps)
480
-	if(verbose)
481
-   	  cat("Check ordering for samples","\n")
479
+    samples = unique(dat$"SampleID")
480
+    nsamples = length(samples)
481
+    snps = unique(dat$"SNPID")
482
+    nsnps = length(snps)
483
+    if(verbose)
484
+        cat("Check ordering for samples","\n")
482 485
     
483
-	X = Y = zeroes = matrix(0, nsnps, nsamples)
486
+    X = Y = zeroes = matrix(0, nsnps, nsamples)
484 487
 	
485
-	for(i in 1:length(samples)) {
486
-		ind = dat$"Sample ID"==samples[i]
487
-		if(sum(dat$"SNP Name"[ind]==snps)==nsnps) {
488
-		    if(verbose)
489
-	          cat(paste("Correct ordering for sample", samples[i], "\n"))
490
-			X[,i] = dat$"X Raw"[ind]
491
-			Y[,i] = dat$"Y Raw"[ind]
492
-			gc()
493
-		}
494
-		if(sum(dat$"SNP Name"[ind]==snps)!=nsnps) {
495
-		    if(verbose)
496
-			  cat("Reordering sample ", samples[i],"\n")
497
-			m=match(snps,dat$"SNP Name"[ind])
498
-			X[,i]= dat$"X Raw"[ind][m]
499
-			Y[,i]= dat$"Y Raw"[ind][m]
500
-		}
501
-	}
488
+    for(i in 1:length(samples)) {
489
+        ind = dat$"SampleID"==samples[i]
490
+	    if(sum(dat$"SNPID"[ind]==snps)==nsnps) {
491
+#           if(verbose)
492
+#	            cat(paste("Correct ordering for sample", samples[i], "\n"))
493
+			X[,i] = dat$"XRaw"[ind]
494
+	        Y[,i] = dat$"YRaw"[ind]
495
+	    }
496
+        if(sum(dat$"SNPID"[ind]==snps)!=nsnps) {
497
+	        if(verbose)
498
+                cat("Reordering sample ", samples[i],"\n")
499
+            m=match(snps,dat$"SNPID"[ind])
500
+            X[,i]= dat$"XRaw"[ind][m]
501
+            Y[,i]= dat$"YRaw"[ind][m]
502
+        }
503
+        gc()
504
+    }
502 505
 	
503
-#	samplenamescorrect = gsub("(_R.)|(_R$)", "", samples)
504 506
     zeroes=(X=="0"|Y=="0")
505
-	colnames(X) = colnames(Y) =  colnames(zeroes) = samples #namescorrect
506
-	rownames(X) = rownames(Y) = snps
507
+    colnames(X) = colnames(Y) =  colnames(zeroes) = samples
508
+    rownames(X) = rownames(Y) = snps
507 509
     
508
-	if(verbose)      
509
-      cat("Creating NChannelSet object\n")
510
+    if(verbose)      
511
+        cat("Creating NChannelSet object\n")
510 512
    
511
-#	if (getRversion() < "2.13.0") {
512
-#      rpath <- getRversion()
513
-# 	  outdir <- paste(path, "/illumina_vignette", sep = "")
514
-#	  ldPath(outdir)
515
-#	  dir.create(outdir, recursive = TRUE, showWarnings = FALSE)
516
-#	}
517
-        
518 513
     XY = new("NChannelSet", X = initializeBigMatrix(name = "X", nr = nrow(X), nc = ncol(X), vmode = "integer", initdata=X),
519 514
 			 Y = initializeBigMatrix(name = "Y", nr = nrow(X), nc = ncol(X), vmode = "integer", initdata=Y),
520 515
 			 zero = initializeBigMatrix(name = "zero", nr = nrow(X), nc = ncol(X), vmode = "integer", initdata=zeroes),
521 516
 			 annotation = cdfName, storage.mode = "environment")
522
-	sampleNames(XY)=colnames(X)
517
+    sampleNames(XY)=colnames(X)
523 518
 	
524 519
     if(verbose)
525 520
       cat("Done\n")
... ...
@@ -860,6 +855,16 @@ crlmmIllumina = function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
860 855
                   mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
861 856
                   cdfName, sns, recallMin=10, recallRegMin=1000,
862 857
                   returnParams=FALSE, badSNP=.7) {
858
+				  
859
+if(missing(cdfName)) {
860
+   if(!missing(RG))
861
+      cdfName = annotation(RG)
862
+   if(!missing(XY))
863
+      cdfName = annotation(XY)
864
+}
865
+if(!isValidCdfName(cdfName))
866
+   stop("cdfName not valid.  see validCdfNames")
867
+     
863 868
 #  if (save.it & (missing(snpFile) | missing(cnFile)))
864 869
 #    stop("'snpFile' and/or 'cnFile' is missing and you chose to save.it")
865 870
 #  if (load.it & missing(snpFile))
866 871
Binary files a/inst/doc/crlmmIllumina.pdf and b/inst/doc/crlmmIllumina.pdf differ
... ...
@@ -47,7 +47,7 @@ Additional chip-specific model parameters and basic SNP annotation
47 47
 information used by CRLMM is stored in the \Rpackage{human370v1cCrlmm} package.
48 48
 The required packages can be installed in the usual way using the \Rfunction{biocLite} function.
49 49
 
50
-<<<echo=TRUE, results=hide, eval=FALSE>>=
50
+<<echo=TRUE, results=hide, eval=FALSE>>=
51 51
 source("http://www.bioconductor.org/biocLite.R")
52 52
 biocLite(c("crlmm", "hapmap370k", "human370v1cCrlmm"))
53 53
 @ 
... ...
@@ -57,8 +57,8 @@ The function \Rfunction{readIdatFiles} extracts the Red and Green intensities
57 57
 from the binary {\tt idat} files output by Illumina's scanning device.
58 58
 The file {\tt samples370k.csv} contains information about each sample.
59 59
 
60
-<<<echo=FALSE, results=hide, eval=TRUE>>=
61
-options(width=50)
60
+<<echo=FALSE, results=hide, eval=TRUE>>=
61
+options(width=70)
62 62
 @ 
63 63
 
64 64
 <<read, results=hide, eval=TRUE>>=
... ...
@@ -75,7 +75,8 @@ samples[1:5,]
75 75
 
76 76
 <<read2, results=hide, cache=TRUE>>=
77 77
 # Read in .idats using sampleSheet information
78
-RG = readIdatFiles(samples, path=data.dir, arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"), saveDate=TRUE)
78
+RG = readIdatFiles(samples, path=data.dir, 
79
+arrayInfoColNames=list(barcode=NULL,position="SentrixPosition"),saveDate=TRUE)
79 80
 @
80 81
 
81 82
 Reading in this data takes approximately 100 seconds and peak memory usage 
... ...
@@ -104,13 +105,20 @@ levels(scanbatch) = 1:16
104 105
 scanbatch = as.numeric(scanbatch)
105 106
 @
106 107
 
107
-Plots of the summarised data can be easily generated to check for arrays 
108
-with poor signal.
108
+If GenCall output is available instead of idat files, the function \Rfunction{readGenCallOutput} can be 
109
+used to read in the data.  
110
+This function assumes the GenCall output is formatted to have samples listed one below the other,
111
+and that the columns 'X Raw' and 'Y Raw' are available in the file.  
112
+The resulting \Robject{NChannelSet} from this function can be used as input to \Rfunction{crlmmIllumina} via the \Robject{XY} argument (instead of the usual \Rfunction{RG} argument used when the data has been read in from idat files).
113
+
114
+Plots of the summarised data can be easily generated to check for arrays with poor signal.
109 115
 
110 116
 <<boxplots, fig=TRUE, width=8, height=8>>=
111 117
 par(mfrow=c(2,1), mai=c(0.4,0.4,0.4,0.1), oma=c(1,1,0,0))
112
-boxplot(log2(exprs(channel(RG, "R"))), xlab="Array", ylab="", names=1:40, main="Red channel", outline=FALSE, las=2)
113
-boxplot(log2(exprs(channel(RG, "G"))), xlab="Array", ylab="", names=1:40, main="Green channel", outline=FALSE, las=2)
118
+boxplot(log2(exprs(channel(RG, "R"))), xlab="Array", ylab="", names=1:40, 
119
+main="Red channel",outline=FALSE,las=2)
120
+boxplot(log2(exprs(channel(RG, "G"))), xlab="Array", ylab="", names=1:40, 
121
+main="Green channel",outline=FALSE,las=2)
114 122
 mtext(expression(log[2](intensity)), side=2, outer=TRUE)
115 123
 mtext("Array", side=1, outer=TRUE)
116 124
 @
... ...
@@ -120,25 +128,34 @@ mtext("Array", side=1, outer=TRUE)
120 128
 Next we use the function \Rfunction{crlmmIllumina} which performs preprocessing followed by genotyping using the CRLMM algorithm.
121 129
 
122 130
 <<genotype, results=hide, cache=TRUE>>=
123
-crlmmResult = crlmmIllumina(RG=RG, cdfName="human370v1c", sns=pData(RG)$ID, returnParams=TRUE)
131
+crlmmResult = crlmmIllumina(RG=RG, cdfName="human370v1c", returnParams=TRUE)
124 132
 @ 
125 133
 
126
-This analysis took 18 minutes to complete and peak memory usage was 2.5 GB on our system.
134
+This analysis took 3 minutes to complete and peak memory usage was 1.9 GB on our system.
127 135
 The output stored in \Robject{crlmmResult} is a \Rclass{SnpSet} object.                                                                                                                                                         
128 136
 <<explore2>>=
129
-  class(crlmmResult)
130
-  dim(crlmmResult)
131
-  slotNames(crlmmResult)
132
-  calls(crlmmResult)[1:10, 1:5]
137
+class(crlmmResult)
138
+dim(crlmmResult)
139
+slotNames(crlmmResult)
140
+calls(crlmmResult)[1:10, 1:5]
133 141
 @ 
134 142
 
135 143
 Plotting the {\it SNR} reveals no obvious batch effects in this data set (different symbols are used for 
136 144
 arrays scanned on different days).
137 145
 
138 146
 <<snr,  fig=TRUE, width=8, height=6>>=
139
-plot(crlmmResult[["SNR"]], pch=scanbatch, xlab="Array", ylab="SNR", main="Signal-to-noise ratio per array", las=2)
147
+plot(crlmmResult[["SNR"]], pch=scanbatch, xlab="Array", ylab="SNR", 
148
+main="Signal-to-noise ratio per array",las=2)
140 149
 @
141 150
 
151
+An all-in-one function named \Rfunction{crlmmIlluminaV2} that combines reading of idat files with genotyping is also available.
152
+
153
+<<readandgenotypeinone, results=hide, cache=TRUE>>=
154
+crlmmResult2 = crlmmIlluminaV2(samples, path=data.dir, 
155
+arrayInfoColNames=list(barcode=NULL,position="SentrixPosition"),
156
+saveDate=TRUE, cdfName="human370v1c", returnParams=TRUE)
157
+@ 
158
+
142 159
 \section{System information}
143 160
 
144 161
 This analysis was carried out on a linux machine with 32GB of RAM 
... ...
@@ -28,9 +28,8 @@ readGenCallOutput(file, path=".", cdfName, colnames=list("SampleID"="Sample ID",
28 28
 
29 29
 \details{
30 30
 This function returns an \code{NChannelSet} containing raw intensity data (X and Y) from GenCall final report file. 
31
-It assumes that the data is in the format where the GenCall results are listed sample by sample below the one and other,
32
-an that the columns 'X Raw' and 'Y Raw' are available in the file.
33
-The function crlmmillumina() can be run on the output of the \code{readGenCallOutyputis function. 
31
+It assumes the GenCall output is formatted to have samples listed one below the other, and that the columns 'X Raw' and 'Y Raw' are available in the file.
32
+The function crlmmillumina() can be run on the output of the \code{readGenCallOutput} function. 
34 33
 }
35 34
 
36 35
 \value{