Browse code

Edits to CA,CB, ACN accessors for copy number

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@48961 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 21/08/2010 02:50:05
Showing6 changed files

... ...
@@ -29,7 +29,7 @@ importClassesFrom(oligoClasses, SnpSuperSet, AlleleSet, CNSet)
29 29
 importMethodsFrom(oligoClasses, allele, calls, "calls<-", confs,
30 30
 		  "confs<-", cnConfidence, "cnConfidence<-", isSnp,
31 31
 		  chromosome, position, A, B,
32
-		  "A<-", "B<-", open, close, lM, flags,
32
+		  "A<-", "B<-", open, close, flags,
33 33
 		  batchStatistics, "batchStatistics<-", updateObject)
34 34
 
35 35
 
... ...
@@ -190,7 +190,6 @@ genotype <- function(filenames,
190 190
 		       verbose=TRUE,
191 191
 		       seed=1,
192 192
 		       sns,
193
-		       copynumber=TRUE,
194 193
 		       probs=rep(1/3, 3),
195 194
 		       DF=6,
196 195
 		       SNRMin=5,
... ...
@@ -200,24 +199,6 @@ genotype <- function(filenames,
200 199
 		       returnParams=TRUE,
201 200
 		       badSNP=0.7){
202 201
 	is.lds <- ifelse(isPackageLoaded("ff"), TRUE, FALSE)
203
-	if(!copynumber){
204
-		FUN <- ifelse(is.lds, "crlmm2", "crlmm")
205
-		callSet <- FUN(filenames=filenames,
206
-			       cdfName=cdfName,
207
-			       mixtureSampleSize=mixtureSampleSize,
208
-			       eps=eps,
209
-			       verbose=verbose,
210
-			       sns=sns,
211
-			       probs=probs,
212
-			       DF=DF,
213
-			       SNRMin=SNRMin,
214
-			       recallMin=recallMin,
215
-			       recallRegMin=recallRegMin,
216
-			       gender=gender,
217
-			       returnParams=returnParams,
218
-			       badSNP=badSNP)
219
-		return(callSet)
220
-	}
221 202
 	if(missing(cdfName)) stop("must specify cdfName")
222 203
 	if(!isValidCdfName(cdfName)) stop("cdfName not valid.  see validCdfNames")
223 204
 	if(missing(sns)) sns <- basename(filenames)
... ...
@@ -308,6 +289,8 @@ genotype <- function(filenames,
308 289
 		open(tmp[["confs"]])
309 290
 		ffrowapply(snpCall(callSet)[i1:i2, ] <- tmp[["calls"]][i1:i2, ], X=tmp[["calls"]], BATCHBYTES=bb)
310 291
 		ffrowapply(snpCallProbability(callSet)[i1:i2, ] <- tmp[["confs"]][i1:i2, ], X=tmp[["confs"]], BATCHBYTES=bb)
292
+		close(tmp[["calls"]])
293
+		close(tmp[["confs"]])
311 294
 	} else {
312 295
 		calls(callSet)[snp.index, ] <- tmp[["calls"]]
313 296
 		snpCallProbability(callSet)[snp.index, ] <- tmp[["confs"]]
... ...
@@ -859,12 +842,12 @@ shrinkGenotypeSummaries <- function(strata, index.list, object, MIN.OBS, MIN.SAM
859 842
 		shrink.corrAA[, k] <- shrink(cor.AA, NN[, 1], DF.PRIOR)
860 843
 		shrink.corrAB[, k] <- shrink(cor.AB, NN[, 2], DF.PRIOR)
861 844
 		shrink.corrBB[, k] <- shrink(cor.BB, NN[, 3], DF.PRIOR)
862
-
845
+		##
863 846
 		##---------------------------------------------------------------------------
864 847
 		## SNPs that we'll use for imputing location/scale of unobserved genotypes
865 848
 		##---------------------------------------------------------------------------
866 849
 		index.complete <- indexComplete(NN, medianA[[k]], medianB[[k]], MIN.OBS)
867
-
850
+		##
868 851
 		##---------------------------------------------------------------------------
869 852
 		## Impute sufficient statistics for unobserved genotypes (plate-specific)
870 853
 		##---------------------------------------------------------------------------
... ...
@@ -880,7 +863,7 @@ shrinkGenotypeSummaries <- function(strata, index.list, object, MIN.OBS, MIN.SAM
880 863
 		medianB[[k]] <- res[[2]]
881 864
 		rm(res)
882 865
 		##the NA's in 'medianA' and 'medianB' are monomorphic if MIN.OBS = 1
883
-
866
+		##
884 867
 		## RS: For Monomorphic SNPs a mixture model may be better
885 868
 		## RS: Further, we can improve estimation by borrowing strength across batch
886 869
 		unobserved.index[[1]] <- which(unobservedAA & unobservedAB)
... ...
@@ -902,14 +885,14 @@ shrinkGenotypeSummaries <- function(strata, index.list, object, MIN.OBS, MIN.SAM
902 885
 	medianB.AA(object)[marker.index, ] <- do.call("cbind", lapply(medianB, function(x) x[, 1]))
903 886
 	medianB.AB(object)[marker.index, ] <- do.call("cbind", lapply(medianB, function(x) x[, 2]))
904 887
 	medianB.BB(object)[marker.index, ] <- do.call("cbind", lapply(medianB, function(x) x[, 3]))
905
-
888
+	##
906 889
 	madA.AA(object)[marker.index, ] <- do.call("cbind", lapply(shrink.madA, function(x) x[, 1]))
907 890
 	madA.AB(object)[marker.index, ] <- do.call("cbind", lapply(shrink.madA, function(x) x[, 2]))
908 891
 	madA.BB(object)[marker.index, ] <- do.call("cbind", lapply(shrink.madA, function(x) x[, 3]))
909 892
 	madB.AA(object)[marker.index, ] <- do.call("cbind", lapply(shrink.madB, function(x) x[, 1]))
910 893
 	madB.AB(object)[marker.index, ] <- do.call("cbind", lapply(shrink.madB, function(x) x[, 2]))
911 894
 	madB.BB(object)[marker.index, ] <- do.call("cbind", lapply(shrink.madB, function(x) x[, 3]))
912
-
895
+	##
913 896
 	corrAA(object)[marker.index, ] <- shrink.corrAA
914 897
 	corrAB(object)[marker.index, ] <- shrink.corrAB
915 898
 	corrBB(object)[marker.index, ] <- shrink.corrBB
... ...
@@ -917,7 +900,7 @@ shrinkGenotypeSummaries <- function(strata, index.list, object, MIN.OBS, MIN.SAM
917 900
 	tau2A.BB(object)[marker.index,] <- shrink.tau2A.BB
918 901
 	tau2B.AA(object)[marker.index,] <- shrink.tau2B.AA
919 902
 	tau2B.BB(object)[marker.index,] <- shrink.tau2B.BB
920
-	if(is.lds) return(TRUE) else retrun(object)
903
+	if(is.lds) return(TRUE) else return(object)
921 904
 }
922 905
 
923 906
 
... ...
@@ -925,11 +908,7 @@ shrinkGenotypeSummaries <- function(strata, index.list, object, MIN.OBS, MIN.SAM
925 908
 fit.lm1 <- function(strata,
926 909
 		    index.list,
927 910
 		    object,
928
-		    SNRMin,
929 911
 		    MIN.SAMPLES,
930
-		    MIN.OBS,
931
-		    DF.PRIOR,
932
-		    GT.CONF.THR,
933 912
 		    THR.NU.PHI,
934 913
 		    MIN.NU,
935 914
 		    MIN.PHI,
... ...
@@ -970,6 +949,7 @@ fit.lm1 <- function(strata,
970 949
 	flags <- as.matrix(flags(object)[snps, ])
971 950
 	for(k in seq(along=batches)){
972 951
 		B <- batches[[k]]
952
+		if(length(B) < MIN.SAMPLES) next()
973 953
 		this.batch <- unique(as.character(batch(object)[B]))
974 954
 		medianA <- cbind(medianA.AA[, k], medianA.AB[, k], medianA.BB[, k])
975 955
 		medianB <- cbind(medianB.AA[, k], medianB.AB[, k], medianB.BB[, k])
... ...
@@ -981,9 +961,6 @@ fit.lm1 <- function(strata,
981 961
 		nuA[, k] <- res[1, ]
982 962
 		phiA[, k] <- res[2, ]
983 963
 		rm(res)
984
-		##res <- fit.wls(allele="A", Ystar=YA, W=wA, Ns=Ns)
985
-		##nuA[, J] <- res[[1]]
986
-		##phiA[, J] <- res[[2]]
987 964
 		res <- fit.wls(NN=NN, sigma=madB, allele="B", Y=medianB, autosome=!CHR.X)##allele="B", Ystar=YB, W=wB, Ns=Ns)
988 965
 		nuB[, k] <- res[1, ]
989 966
 		phiB[, k] <- res[2, ]
... ...
@@ -996,14 +973,6 @@ fit.lm1 <- function(strata,
996 973
 		phiA[phiA < MIN.PHI] <- MIN.PHI
997 974
 		phiB[phiB < MIN.PHI] <- MIN.PHI
998 975
 	}
999
-##	cA[cA < 0.05] <- 0.05
1000
-##	cB[cB < 0.05] <- 0.05
1001
-##	cA[cA > 5] <-  5
1002
-##	cB[cB > 5] <- 5
1003
-##	cA <- matrix(as.integer(cA*100), nrow(cA), ncol(cA))
1004
-##	cB <- matrix(as.integer(cB*100), nrow(cB), ncol(cB))
1005
-##	CA(object)[snps, ] <- cA
1006
-##	CB(object)[snps, ] <- cB
1007 976
 	nuA(object)[snps, ] <- nuA
1008 977
 	nuB(object)[snps, ] <- nuB
1009 978
 	phiA(object)[snps, ] <- phiA
... ...
@@ -1019,11 +988,7 @@ fit.lm1 <- function(strata,
1019 988
 fit.lm2 <- function(strata,
1020 989
 		    index.list,
1021 990
 		    object,
1022
-		    SNRMin,
1023 991
 		    MIN.SAMPLES,
1024
-		    MIN.OBS,
1025
-		    DF.PRIOR,
1026
-		    GT.CONF.THR,
1027 992
 		    THR.NU.PHI,
1028 993
 		    MIN.NU,
1029 994
 		    MIN.PHI,
... ...
@@ -1034,24 +999,16 @@ fit.lm2 <- function(strata,
1034 999
 	batches <- split(seq_along(batch(object)), as.character(batch(object)))
1035 1000
 	batches <- batches[sapply(batches, length) >= MIN.SAMPLES]
1036 1001
 
1037
-
1038 1002
 	ii <- isSnp(object) & chromosome(object) < 23 & !is.na(chromosome(object))
1039 1003
 	flags <- as.matrix(flags(object)[ii, ])
1040 1004
 	fns <- featureNames(object)[ii]
1041 1005
 	fns.noflags <- fns[rowSums(flags, na.rm=T) == 0]
1042 1006
 	snp.index <- sample(match(fns.noflags, featureNames(object)), 5000)
1043 1007
 
1044
-	##flags <- as.matrix(snpflags[,])
1045
-	##noflags <- rowSums(flags, na.rm=TRUE) == 0  ##NA's for unevaluated batches
1046
-
1047 1008
 	nuA.np <- as.matrix(nuA(object)[marker.index, ])
1048 1009
 	phiA.np <- as.matrix(phiA(object)[marker.index, ])
1049 1010
 	tau2A.AA <- as.matrix(tau2A.AA(object)[marker.index, ])
1050 1011
 
1051
-	##nuA.np <- phiA.np <- sig2A.np <- matrix(NA, length(marker.index), length(unique(batch(object))))
1052
-	## for imputation, we need the corresponding parameters of the snps
1053
-	##NN <- min(10e3, length(which(ii & noflags)))
1054
-	##snp.ind <- sample(which(ii & noflags), NN)
1055 1012
 	nuA.snp <- as.matrix(nuA(object)[snp.index, ])
1056 1013
 	nuB.snp <- as.matrix(nuB(object)[snp.index, ])
1057 1014
 	phiA.snp <- as.matrix(phiA(object)[snp.index, ])
... ...
@@ -1059,70 +1016,24 @@ fit.lm2 <- function(strata,
1059 1016
 	medianA.AA <- as.matrix(medianA.AA(object)[snp.index,])
1060 1017
 	medianB.BB <- as.matrix(medianB.BB(object)[snp.index,])
1061 1018
 
1062
-
1063
-
1064 1019
 	medianA.AA.np <- as.matrix(medianA.AA(object)[marker.index,])
1065
-
1066
-##	nnuA.snp <- as.matrix(physical(lM(object))$nuA[snp.ind,])
1067
-##	pphiA.snp <- as.matrix(physical(lM(object))$phiA[snp.ind,])
1068
-##	nnuB.snp <- as.matrix(physical(lM(object))$nuB[snp.ind,])
1069
-##	pphiB.snp <- as.matrix(physical(lM(object))$phiB[snp.ind,])
1070
-
1071
-##	AA.snp <- as.matrix(A(object)[snp.ind, ])
1072
-##	BB.snp <- as.matrix(B(object)[snp.ind, ])
1073
-##	NNORM.snp <- as.matrix(normal[snp.ind, ])
1074
-##	NORM.np <- as.matrix(normal[snps, ])
1075
-##	AA.np <- as.matrix(A(object)[marker.index, ])
1076
-##	GG <- as.matrix(calls(object)[snp.ind, ])
1077
-##	CP <- as.matrix(snpCallProbability(object)[snp.ind, ])
1078 1020
 	for(k in seq_along(batches)){
1079 1021
 		B <- batches[[k]]
1080 1022
 		this.batch <- unique(as.character(batch(object)[B]))
1081
-##		phiA.snp <- pphiA.snp[, J]
1082
-##		phiB.snp <- pphiB.snp[, J]
1083
-##		A.snp <- AA.snp[, k]
1084
-##		B.snp <- BB.snp[, k]
1085
-##		NORM.snp <- NNORM.snp[, k]
1086
-##		G <- GG[, k]
1087
-##		xx <- CP[, k]
1088
-##		highConf <- (1-exp(-xx/1000)) > GT.CONF.THR
1089
-##		G <- G*highConf*NORM.snp
1090
-##		G[G==0] <- NA
1091
-		##nonpolymorphic
1092
-##		A.np <- AA.np[, k]
1093
-##		Ns <- applyByGenotype(matrix(1, nrow(G), ncol(G)), rowSums, G)
1094
-##		muA <- applyByGenotype(A.snp, rowMedians, G)
1095
-##		muB <- applyByGenotype(B.snp, rowMedians, G)
1096
-##		muA <- muA[, 1]
1097
-##		muB <- muB[, 3]
1098
-##		X <- cbind(1, log2(c(muA, muB)))
1099 1023
 		X <- cbind(1, log2(c(medianA.AA[, k], medianB.BB[, k])))
1100 1024
 		Y <- log2(c(phiA.snp, phiB.snp))
1101 1025
 		betahat <- solve(crossprod(X), crossprod(X, Y))
1102
-		##
1103
-##		mus <- rowMedians(A.np * NORM.np[, k], na.rm=TRUE)
1104
-##		averaging across markers, is there a difference in the
1105
-##		typical AA intensity for SNPs and the AA intensity for
1106
-##		nonpolymorphic loci
1107
-##		crosshyb <- max(median(muA) - median(mus), 0)
1108 1026
 		crosshyb <- max(median(medianA.AA[, k]) - median(medianA.AA.np[, k]), 0)
1109
-##		X <- cbind(1, log2(mus+crosshyb))
1110 1027
 		X <- cbind(1, log2(medianA.AA.np[, k] + crosshyb))
1111 1028
 		logPhiT <- X %*% betahat
1112 1029
 		phiA.np[, k] <- 2^(logPhiT)
1113 1030
 		nuA.np[, k] <- medianA.AA.np[,k]-2*phiA.np[, k]
1114 1031
 ##		cA[, k] <- 1/phiA.np[, J] * (A.np - nuA.np[, J])
1115
-##		sig2A.np[, J] <- rowMAD(log2(A.np*NORM.np[, k]), na.rm=TRUE)
1116
-##		rm(NORM.snp, highConf, xx, G, Ns, A.np, X, Y, betahat, mus, logPhiT)
1117
-##		gc()
1118 1032
 	}
1119 1033
 	if(THR.NU.PHI){
1120 1034
 		nuA.np[nuA.np < MIN.NU] <- MIN.NU
1121 1035
 		phiA.np[phiA.np < MIN.PHI] <- MIN.PHI
1122 1036
 	}
1123
-##	cA[cA < 0.05] <- 0.05
1124
-##	cA[cA > 5] <-  5
1125
-##	cA <- matrix(as.integer(cA*100), nrow(cA), ncol(cA))
1126 1037
 	nuA(object)[marker.index, ] <- nuA.np
1127 1038
 	phiA(object)[marker.index, ] <- phiA.np
1128 1039
 	if(is.lds) { close(object); return(TRUE)}
... ...
@@ -1409,17 +1320,13 @@ fit.lm3 <- function(strata,
1409 1320
 	phiB(object)[marker.index, ] <- phiB
1410 1321
 	phiPrimeA(object)[marker.index, ] <- phiA2
1411 1322
 	phiPrimeB(object)[marker.index, ] <- phiB2
1412
-	TRUE
1323
+	if(is.lds) {close(object); return(TRUE)} else return(object)
1413 1324
 }
1414 1325
 
1415 1326
 fit.lm4 <- function(strata,
1416 1327
 		    index.list,
1417 1328
 		    object,
1418
-		    SNRMin,
1419 1329
 		    MIN.SAMPLES,
1420
-		    MIN.OBS,
1421
-		    DF.PRIOR,
1422
-		    GT.CONF.THR,
1423 1330
 		    THR.NU.PHI,
1424 1331
 		    MIN.NU,
1425 1332
 		    MIN.PHI,
... ...
@@ -1606,379 +1513,6 @@ processCEL2 <- function(i, filenames, row.names, A, seed, cdfName, pkgname){
1606 1513
 }
1607 1514
 
1608 1515
 
1609
-# steps: quantile normalize hapmap: create 1m_reference_cn.rda object
1610
-##cnrma <- function(filenames, cdfName, row.names, sns, seed=1, verbose=FALSE){
1611
-##	if(missing(cdfName)) stop("must specify cdfName")
1612
-##	pkgname <- getCrlmmAnnotationName(cdfName)
1613
-##	require(pkgname, character.only=TRUE) || stop("Package ", pkgname, " not available")
1614
-##	if (missing(sns)) sns <- basename(filenames)
1615
-##        loader("npProbesFid.rda", .crlmmPkgEnv, pkgname)
1616
-##	fid <- getVarInEnv("npProbesFid")
1617
-##	fid <- fid[match(row.names, names(fid))]
1618
-##	set.seed(seed)
1619
-##	idx2 <- sample(length(fid), 10^5) ##for skewness. no need to do everything
1620
-##	SKW <- vector("numeric", length(filenames))
1621
-##	NP <- matrix(NA, length(fid), length(filenames))
1622
-##	verbose <- TRUE
1623
-##	if(verbose){
1624
-##		message("Processing ", length(filenames), " files.")
1625
-##		if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=length(filenames), style=3)
1626
-##	}
1627
-##	if(cdfName=="genomewidesnp6"){
1628
-##		loader("1m_reference_cn.rda", .crlmmPkgEnv, pkgname)
1629
-##	}
1630
-##	if(cdfName=="genomewidesnp5"){
1631
-##		loader("5.0_reference_cn.rda", .crlmmPkgEnv, pkgname)
1632
-##	}
1633
-##	reference <- getVarInEnv("reference")
1634
-##	##if(!is.matrix(reference)) stop("target distribution for quantile normalization not available.")
1635
-##	for(i in seq(along=filenames)){
1636
-##		y <- as.matrix(read.celfile(filenames[i], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid])
1637
-##		x <- log2(y[idx2])
1638
-##		SKW[i] <- mean((x-mean(x))^3)/(sd(x)^3)
1639
-##		rm(x)
1640
-##		NP[, i] <- as.integer(normalize.quantiles.use.target(y, target=reference))
1641
-##		if (verbose)
1642
-##			if (getRversion() > '2.7.0') setTxtProgressBar(pb, i)
1643
-##			else cat(".")
1644
-##	}
1645
-##	dimnames(NP) <- list(names(fid), sns)
1646
-##	##dimnames(NP) <- list(map[, "man_fsetid"], sns)
1647
-##	##res3 <- list(NP=NP, SKW=SKW)
1648
-##	cat("\n")
1649
-##	return(NP)
1650
-##}
1651
-
1652
-##getFlags <- function(object, PHI.THR){
1653
-##	batch <- unique(object$batch)
1654
-##	nuA <- getParam(object, "nuA", batch)
1655
-##	nuB <- getParam(object, "nuB", batch)
1656
-##	phiA <- getParam(object, "phiA", batch)
1657
-##	phiB <- getParam(object, "phiB", batch)
1658
-##	negativeNus <- nuA < 1 | nuB < 1
1659
-##	negativePhis <- phiA < PHI.THR | phiB < PHI.THR
1660
-##	negativeCoef <- negativeNus | negativePhis
1661
-##	notfinitePhi <- !is.finite(phiA) | !is.finite(phiB)
1662
-##	flags <- negativeCoef | notfinitePhi
1663
-##	return(flags)
1664
-##}
1665
-
1666
-
1667
-##instantiateObjects <- function(object, cnOptions){
1668
-##	Ns <- matrix(NA, nrow(object), 5)
1669
-##	colnames(Ns) <- c("A", "B", "AA", "AB", "BB")
1670
-##	vA <- vB <- muB <- muA <- Ns
1671
-##	normal <- matrix(TRUE, nrow(object), ncol(object))
1672
-##	dimnames(normal) <- list(featureNames(object), sampleNames(object))
1673
-##	tmp.objects <- list(vA=vA,
1674
-##			    vB=vB,
1675
-##			    muB=muB,
1676
-##			    muA=muA,
1677
-##			    Ns=Ns,
1678
-##			    normal=normal)
1679
-##        return(tmp.objects)
1680
-##}
1681
-##
1682
-##thresholdCopynumber <- function(object){
1683
-##	ca <- CA(object)
1684
-##	cb <- CB(object)
1685
-##	ca[ca < 0.05] <- 0.05
1686
-##	ca[ca > 5] <- 5
1687
-##	cb[cb < 0.05] <- 0.05
1688
-##	cb[cb > 5] <- 5
1689
-##	CA(object) <- ca
1690
-##	CB(object) <- cb
1691
-##	return(object)
1692
-##}
1693
-##
1694
-####linear model parameters
1695
-##lm.parameters <- function(object, batch){##cnOptions){
1696
-##	fD <- fData(object)
1697
-##	##batch <- object$batch
1698
-##	uplate <- unique(batch)
1699
-##	parameterNames <- c(paste("tau2A", uplate, sep="_"),
1700
-##			    paste("tau2B", uplate, sep="_"),
1701
-##			    paste("sig2A", uplate, sep="_"),
1702
-##			    paste("sig2B", uplate, sep="_"),
1703
-##			    paste("nuA", uplate, sep="_"),
1704
-##			    paste("nuA.se", uplate, sep="_"),
1705
-##			    paste("nuB", uplate, sep="_"),
1706
-##			    paste("nuB.se", uplate, sep="_"),
1707
-##			    paste("phiA", uplate, sep="_"),
1708
-##			    paste("phiA.se", uplate, sep="_"),
1709
-##			    paste("phiB", uplate, sep="_"),
1710
-##			    paste("phiB.se", uplate, sep="_"),
1711
-##			    paste("phiAX", uplate, sep="_"),
1712
-##			    paste("phiBX", uplate, sep="_"),
1713
-##			    paste("corr", uplate, sep="_"),
1714
-##			    paste("corrA.BB", uplate, sep="_"),
1715
-##			    paste("corrB.AA", uplate, sep="_"))
1716
-##	pMatrix <- data.frame(matrix(numeric(0),
1717
-##				     nrow(object),
1718
-##				     length(parameterNames)),
1719
-##				     row.names=featureNames(object))
1720
-##	colnames(pMatrix) <- parameterNames
1721
-##	fD2 <- cbind(fD, pMatrix)
1722
-##	new("AnnotatedDataFrame", data=fD2,
1723
-##	    varMetadata=data.frame(labelDescription=colnames(fD2),
1724
-##	    row.names=colnames(fD2)))
1725
-##}
1726
-##
1727
-##nonpolymorphic <- function(object, cnOptions, tmp.objects){
1728
-##	batch <- unique(object$batch)
1729
-##	CHR <- unique(chromosome(object))
1730
-##	goodSnps <- function(object, PHI.THR, tmp.objects, nAA.THR, nBB.THR){
1731
-##		Ns <- tmp.objects[["Ns"]]
1732
-##		##Ns <- get("Ns", envir)
1733
-##		flags <- getFlags(object, PHI.THR)
1734
-##		fewAA <- Ns[, "AA"] < nAA.THR
1735
-##		fewBB <- Ns[, "BB"] < nBB.THR
1736
-##		flagsA <- flags | fewAA
1737
-##		flagsB <- flags | fewBB
1738
-##		flags <- list(A=flagsA, B=flagsB)
1739
-##		return(flags)
1740
-##	}
1741
-##	nAA.THR <- cnOptions$nHOM.THR
1742
-##	nBB.THR <- cnOptions$nHOM.THR
1743
-##	PHI.THR <- cnOptions$PHI.THR
1744
-##	snpflags <- goodSnps(object, PHI.THR, tmp.objects, nAA.THR, nBB.THR)
1745
-##	flagsA <- snpflags$A
1746
-##	flagsB <- snpflags$B
1747
-####	if(all(flagsA) | all(flagsB)) stop("all snps are flagged")
1748
-##	nuA <- getParam(object, "nuA", batch)
1749
-##	nuB <- getParam(object, "nuB", batch)
1750
-##	phiA <- getParam(object, "phiA", batch)
1751
-##	phiB <- getParam(object, "phiB", batch)
1752
-##	sns <- sampleNames(object)
1753
-##	muA <- tmp.objects[["muA"]]
1754
-##	muB <- tmp.objects[["muB"]]
1755
-##	A <- A(object)
1756
-##	B <- B(object)
1757
-####	CA <- CA(object)
1758
-####	CB <- CB(object)
1759
-##	if(CHR == 23){
1760
-##		phiAX <- getParam(object, "phiAX", batch)
1761
-##		phiBX <- getParam(object, "phiBX", batch)
1762
-##	}
1763
-##	##---------------------------------------------------------------------------
1764
-##	## Train on unflagged SNPs
1765
-##	##---------------------------------------------------------------------------
1766
-##	##Might be best to train using the X chromosome, since for the
1767
-##	##X phi and nu have been adjusted for cross-hybridization
1768
-##	##plateInd <- plate == uplate[p]
1769
-##	##muA <- muA[!flagsA, p, c("A", "AA")]
1770
-##	##muB <- muB[!flagsB, p, c("B", "BB")]
1771
-##	muA <- muA[!flagsA, "AA"]
1772
-##	muB <- muB[!flagsB, "BB"]
1773
-##	X <- cbind(1, log2(c(muA, muB)))
1774
-##	Y <- log2(c(phiA[!flagsA], phiB[!flagsB]))
1775
-##	if(nrow(X) > 5000){
1776
-##		ix <- sample(1:nrow(X), 5000)
1777
-##	} else {
1778
-##		ix <- 1:nrow(X)
1779
-##	}
1780
-##	betahat <- solve(crossprod(X[ix, ]), crossprod(X[ix, ], Y[ix]))
1781
-##	normal <- tmp.objects[["normal"]][!isSnp(object), ]
1782
-##	if(CHR == 23){
1783
-##		##normalNP <- envir[["normalNP"]]
1784
-##		##normalNP <- normalNP[, plate==uplate[p]]
1785
-##		##nuT <- envir[["nuT"]]
1786
-##		##phiT <- envir[["phiT"]]
1787
-##
1788
-##		##cnvs <- envir[["cnvs"]]
1789
-##                ##loader("cnProbes.rda", pkgname=pkgname, envir=.crlmmPkgEnv)
1790
-##                ##cnProbes <- get("cnProbes", envir=.crlmmPkgEnv)
1791
-##		##cnProbes <- cnProbes[match(cnvs, rownames(cnProbes)), ]
1792
-##
1793
-##		##For build Hg18
1794
-##		##http://genome.ucsc.edu/cgi-bin/hgGateway
1795
-##		##pseudo-autosomal regions on X
1796
-##		##chrX:1-2,709,520 and chrX:154584237-154913754, respectively
1797
-##		##par:pseudo-autosomal regions
1798
-##		pseudoAR <- position(object) < 2709520 | (position(object) > 154584237 & position(object) < 154913754)
1799
-##		##pseudoAR <- cnProbes[, "position"] < 2709520 | (cnProbes[, "position"] > 154584237 & cnProbes[, "position"] < 154913754)
1800
-##		##in case some of the cnProbes are not annotated
1801
-##		pseudoAR[is.na(pseudoAR)] <- FALSE
1802
-##		pseudoAR <- pseudoAR[!isSnp(object)]
1803
-##		##gender <- envir[["gender"]]
1804
-##		gender <- object$gender
1805
-##		obj1 <- object[!isSnp(object), ]
1806
-##		A.male <- A(obj1[, gender==1])
1807
-##		mu1 <- rowMedians(A.male, na.rm=TRUE)
1808
-##		##mu1 <- rowMedians(NP[, gender=="male"], na.rm=TRUE)
1809
-##		##mu2 <- rowMedians(NP[, gender=="female"], na.rm=TRUE)
1810
-##		A.female <- A(obj1[, gender==2])
1811
-##		mu2 <- rowMedians(A.female, na.rm=TRUE)
1812
-##		mus <- log2(cbind(mu1, mu2))
1813
-##		X.men <- cbind(1, mus[, 1])
1814
-##		X.fem <- cbind(1, mus[, 2])
1815
-##
1816
-##		Yhat1 <- as.numeric(X.men %*% betahat)
1817
-##		Yhat2 <- as.numeric(X.fem %*% betahat)
1818
-##		phi1 <- 2^(Yhat1)
1819
-##		phi2 <- 2^(Yhat2)
1820
-##		nu1 <- 2^(mus[, 1]) - phi1
1821
-##		nu2 <- 2^(mus[, 2]) - 2*phi2
1822
-##
1823
-##		if(any(pseudoAR)){
1824
-##			nu1[pseudoAR] <- 2^(mus[pseudoAR, 1]) - 2*phi1[pseudoAR]
1825
-##		}
1826
-####		CT1 <- 1/phi1*(A.male-nu1)
1827
-####		CT2 <- 1/phi2*(A.female-nu2)
1828
-####		CA <- CA(obj1)
1829
-####		CA[, gender==1] <- CT1
1830
-####		CA[, gender==2] <- CT2
1831
-####		CA(object)[!isSnp(object), ] <- CA
1832
-##		##only using females to compute the variance
1833
-##		##normalNP[, gender=="male"] <- NA
1834
-##		normal[, gender==1] <- NA
1835
-##		sig2A <- getParam(object, "sig2A", batch)
1836
-##		normal.f <- normal[, object$gender==2]
1837
-##		sig2A[!isSnp(object)] <- rowMAD(log2(A.female*normal.f), na.rm=TRUE)^2
1838
-##		sig2A[!isSnp(object) & is.na(sig2A)] <- median(sig2A[!isSnp(object)], na.rm=TRUE)
1839
-##		##sig2T[, p] <- rowMAD(log2(NP*normalNP), na.rm=TRUE)^2
1840
-##		object <- pr(object, "sig2A", batch, sig2A)
1841
-##
1842
-##		nuA[!isSnp(object)] <- nu2
1843
-##		phiA[!isSnp(object)] <- phi2
1844
-##
1845
-##		THR.NU.PHI <- cnOptions$THR.NU.PHI
1846
-##		if(THR.NU.PHI){
1847
-##			verbose <- cnOptions$verbose
1848
-##			##Assign values to object
1849
-##			object <- pr(object, "nuA", batch, nuA)
1850
-##			object <- pr(object, "phiA", batch, phiA)
1851
-##			##if(verbose) message("Thresholding nu and phi")
1852
-##			object <- thresholdModelParams(object, cnOptions)
1853
-##		} else {
1854
-##			object <- pr(object, "nuA", batch, nuA)
1855
-##			object <- pr(object, "phiA", batch, phiA)
1856
-##		}
1857
-##	} else {
1858
-##		A <- A(object)[!isSnp(object), ]
1859
-##		mus <- rowMedians(A * normal, na.rm=TRUE)
1860
-##		crosshyb <- max(median(muA) - median(mus), 0)
1861
-##		X <- cbind(1, log2(mus+crosshyb))
1862
-##		logPhiT <- X %*% betahat
1863
-##		phiA[!isSnp(object)] <- 2^(logPhiT)
1864
-##		nuA[!isSnp(object)] <- mus-2*phiA[!isSnp(object)]
1865
-##
1866
-##		THR.NU.PHI <- cnOptions$THR.NU.PHI
1867
-##		if(THR.NU.PHI){
1868
-##			verbose <- cnOptions$verbose
1869
-##			##Assign values to object
1870
-##			object <- pr(object, "nuA", batch, nuA)
1871
-##			object <- pr(object, "phiA", batch, phiA)
1872
-##			##if(verbose) message("Thresholding nu and phi")
1873
-##			object <- thresholdModelParams(object, cnOptions)
1874
-##			##reassign values (now thresholded at MIN.NU and MIN.PHI
1875
-##			nuA <- getParam(object, "nuA", batch)
1876
-##			phiA <- getParam(object, "phiA", batch)
1877
-##		}
1878
-##		##CA(object)[!isSnp(object), ] <- 1/phiA[!isSnp(object)]*(A - nuA[!isSnp(object)])
1879
-##		sig2A <- getParam(object, "sig2A", batch)
1880
-##		sig2A[!isSnp(object)] <- rowMAD(log2(A*normal), na.rm=TRUE)^2
1881
-##		object <- pr(object, "sig2A", batch, sig2A)
1882
-##		##added
1883
-##		object <- pr(object, "nuA", batch, nuA)
1884
-##		object <- pr(object, "phiA", batch, phiA)
1885
-##	}
1886
-##	return(object)
1887
-##}
1888
-##
1889
-##sufficient statistics on the intensity scale
1890
-##withinGenotypeMoments <- function(object, cnOptions, tmp.objects){
1891
-##	normal <- tmp.objects[["normal"]]
1892
-##	## muA, muB: robust estimates of the within-genotype center (intensity scale)
1893
-##	muA <- tmp.objects[["muA"]]
1894
-##	muB <- tmp.objects[["muB"]]
1895
-##	## vA, vB: robust estimates of the within-genotype variance (intensity scale)
1896
-##	vA <- tmp.objects[["vA"]]
1897
-##	vB <- tmp.objects[["vB"]]
1898
-##	Ns <- tmp.objects[["Ns"]]
1899
-##	G <- snpCall(object)
1900
-##	GT.CONF.THR <- cnOptions$GT.CONF.THR
1901
-##	CHR <- unique(chromosome(object))
1902
-##	A <- A(object)
1903
-##	B <- B(object)
1904
-####	highConf <- (1-exp(-confs(object)/1000)) > GT.CONF.THR
1905
-##	xx <- snpCallProbability(object)
1906
-##	highConf <- (1-exp(-xx/1000)) > GT.CONF.THR
1907
-##	##highConf <- confs(object) > GT.CONF.THR
1908
-##	##highConf <- highConf > GT.CONF.THR
1909
-##	if(CHR == 23){
1910
-##		gender <- object$gender
1911
-####		gender <- envir[["gender"]]
1912
-##		IX <- matrix(gender, nrow(G), ncol(G), byrow=TRUE)
1913
-####		IX <- IX == "female"
1914
-##		IX <- IX == 2  ##2=female, 1=male
1915
-##	} else IX <- matrix(TRUE, nrow(G), ncol(G))
1916
-##	index <- GT.B <- GT.A <- vector("list", 3)
1917
-##	names(index) <- names(GT.B) <- names(GT.A) <- c("AA", "AB", "BB")
1918
-##	##--------------------------------------------------
1919
-##	##within-genotype sufficient statistics
1920
-##	##--------------------------------------------------
1921
-##	##GT.B <- GT.A <- list()
1922
-##	snpIndicator <- matrix(isSnp(object), nrow(object), ncol(object)) ##RS: added
1923
-##	for(j in 1:3){
1924
-##		GT <- G==j & highConf & IX & snpIndicator
1925
-##		GT <- GT * normal
1926
-##		Ns[, j+2] <- rowSums(GT, na.rm=TRUE)
1927
-##		GT[GT == FALSE] <- NA
1928
-##		GT.A[[j]] <- GT*A
1929
-##		GT.B[[j]] <- GT*B
1930
-##		index[[j]] <- which(Ns[, j+2] > 0 & isSnp(object)) ##RS: added
1931
-##		muA[, j+2] <- rowMedians(GT.A[[j]], na.rm=TRUE)
1932
-##		muB[, j+2] <- rowMedians(GT.B[[j]], na.rm=TRUE)
1933
-##		vA[, j+2] <- rowMAD(GT.A[[j]], na.rm=TRUE)
1934
-##		vB[, j+2] <- rowMAD(GT.B[[j]], na.rm=TRUE)
1935
-##
1936
-##		##Shrink towards the typical variance
1937
-##		DF <- Ns[, j+2]-1
1938
-##		DF[DF < 1] <- 1
1939
-##		v0A <- median(vA[, j+2], na.rm=TRUE)
1940
-##		v0B <- median(vB[, j+2], na.rm=TRUE)
1941
-##		if(v0A == 0) v0A <- NA
1942
-##		if(v0B == 0) v0B <- NA
1943
-##		DF.PRIOR <- cnOptions$DF.PRIOR
1944
-##		vA[, j+2] <- (vA[, j+2]*DF + v0A*DF.PRIOR)/(DF.PRIOR+DF)
1945
-##		vA[is.na(vA[, j+2]), j+2] <- v0A
1946
-##		vB[, j+2] <- (vB[, j+2]*DF + v0B*DF.PRIOR)/(DF.PRIOR+DF)
1947
-##		vB[is.na(vB[, j+2]), j+2] <- v0B
1948
-##	}
1949
-##	if(CHR == 23){
1950
-##		k <- 1
1951
-##		for(j in c(1,3)){
1952
-##			GT <- G==j & highConf & !IX
1953
-##			Ns[, k] <- rowSums(GT)
1954
-##			GT[GT == FALSE] <- NA
1955
-##			muA[, k] <- rowMedians(GT*A, na.rm=TRUE)
1956
-##			muB[, k] <- rowMedians(GT*B, na.rm=TRUE)
1957
-##			vA[, k] <- rowMAD(GT*A, na.rm=TRUE)
1958
-##			vB[, k] <- rowMAD(GT*B, na.rm=TRUE)
1959
-##
1960
-##			DF <- Ns[, k]-1
1961
-##			DF[DF < 1] <- 1
1962
-##			v0A <- median(vA[, k], na.rm=TRUE)
1963
-##			v0B <- median(vB[, k], na.rm=TRUE)
1964
-##			vA[, k] <- (vA[, k]*DF + v0A*DF.PRIOR)/(DF.PRIOR+DF)
1965
-##			vA[is.na(vA[, k]), k] <- v0A
1966
-##			vB[, k] <- (vB[, k]*DF + v0B*DF.PRIOR)/(DF.PRIOR+DF)
1967
-##			vB[is.na(vB[, k]), k] <- v0B
1968
-##			k <- k+1
1969
-##		}
1970
-##	}
1971
-##	tmp.objects[["Ns"]] <- Ns
1972
-##	tmp.objects[["vA"]] <- vA
1973
-##	tmp.objects[["vB"]] <- vB
1974
-##	tmp.objects[["muA"]] <- muA
1975
-##	tmp.objects[["muB"]] <- muB
1976
-##	tmp.objects$index <- index
1977
-##	tmp.objects$GT.A <- GT.A
1978
-##	tmp.objects$GT.B <- GT.B
1979
-##	return(tmp.objects)
1980
-##}
1981
-
1982 1516
 imputeCenter <- function(muA, muB, index.complete, index.missing){
1983 1517
 	index <- index.missing
1984 1518
 	mnA <- muA
... ...
@@ -2011,7 +1545,7 @@ indexComplete <- function(NN, medianA, medianB, MIN.OBS){
2011 1545
 imputeCentersForMonomorphicSnps <- function(medianA, medianB, index.complete, unobserved.index){
2012 1546
 	cols <- c(3, 1, 2)
2013 1547
 	for(j in 1:3){
2014
-		if(sum(unobserved.index[[j]]) == 0) next()
1548
+		if(length(unobserved.index[[j]]) == 0) next()
2015 1549
 		kk <- cols[j]
2016 1550
 		X <- cbind(1, medianA[index.complete, kk], medianB[index.complete, kk])
2017 1551
 		Y <- cbind(medianA[index.complete,  -kk],
... ...
@@ -2872,42 +2406,35 @@ ellipseCenters <- function(object, index, allele, batch, log.it=TRUE){
2872 2406
 
2873 2407
 
2874 2408
 shrinkSummary <- function(object,
2875
-			  type=c("SNP", "NP", "X.SNP", "X.NP"), ##"X.snps", "X.nps"),
2409
+			  type=c("SNP", "X.SNP"), ##"X.snps", "X.nps"),
2876 2410
 			  MIN.OBS=1,
2877 2411
 			  MIN.SAMPLES=10,
2878 2412
 			  DF.PRIOR=50,
2879
-			  verbose=TRUE){
2880
-	if(type == "X.SNP" | type=="X.NP"){
2413
+			  verbose=TRUE,
2414
+			  marker.index,
2415
+			  is.lds){
2416
+	stopifnot(type %in% c("SNP", "X.SNP"))
2417
+	if(type == "X.SNP"){
2881 2418
 		gender <- object$gender
2882 2419
 		if(sum(gender == 2) < 3) {
2883 2420
 			return("too few females to estimate within genotype summary statistics on CHR X")
2884 2421
 		}
2885 2422
 		CHR.X <- TRUE
2886 2423
 	} else CHR.X <- FALSE
2887
-	batch <- batch(object)
2888
-	is.snp <- isSnp(object)
2889
-	is.autosome <- chromosome(object) < 23
2890
-	is.annotated <- !is.na(chromosome(object))
2891
-	is.X <- chromosome(object) == 23
2892
-	is.lds <- is(calls(object), "ffdf") | is(calls(object), "ff_matrix")
2893
-	if(is.lds) require(ff)
2894
-	whichMarkers <- function(type, is.snp, is.autosome, is.annotated, is.X){
2895
-		switch(type,
2896
-		       SNP=which(is.snp & is.autosome & is.annotated),
2897
-		       NP=which(!is.snp & is.autosome),
2898
-		       X.SNP=which(is.snp & is.X),
2899
-		       X.NP=which(!is.snp & is.X),
2900
-		       stop("'type' must be one of 'SNP', 'NP', 'X.SNP', or 'X.NP'")
2901
-	       )
2424
+	if(missing(marker.index)){
2425
+		batch <- batch(object)
2426
+		is.snp <- isSnp(object)
2427
+		is.autosome <- chromosome(object) < 23
2428
+		is.annotated <- !is.na(chromosome(object))
2429
+		is.X <- chromosome(object) == 23
2430
+		is.lds <- is(calls(object), "ffdf") | is(calls(object), "ff_matrix")
2431
+		if(is.lds) require(ff)
2432
+		marker.index <- whichMarkers(type[[1]], is.snp, is.autosome, is.annotated, is.X)
2902 2433
 	}
2903
-	marker.index <- whichMarkers(type[[1]], is.snp, is.autosome, is.annotated, is.X)
2904 2434
 	summaryFxn <- function(type){
2905 2435
 		switch(type,
2906 2436
 		       SNP="shrinkGenotypeSummaries",
2907 2437
 		       X.SNP="shrinkGenotypeSummaries", ## this shrinks for the females only
2908
-##		       NP="summarizeNps",
2909
-##		       X.SNP="summarizeSnps",
2910
-##		       X.NP="summarizeNps")
2911 2438
 		       stop())
2912 2439
 	}
2913 2440
 	FUN <- summaryFxn(type[[1]])
... ...
@@ -2925,13 +2452,13 @@ shrinkSummary <- function(object,
2925 2452
 			 neededPkgs="crlmm")
2926 2453
 	} else {
2927 2454
 		FUN <- match.fun(FUN)
2928
-		object <- FUN(strata.index=1,
2455
+		object <- FUN(strata=1,
2929 2456
 			      index.list=list(marker.index),
2930 2457
 			      object=object,
2458
+			      verbose=verbose,
2931 2459
 			      MIN.OBS=MIN.OBS,
2932 2460
 			      MIN.SAMPLES=MIN.SAMPLES,
2933 2461
 			      DF.PRIOR=DF.PRIOR,
2934
-			      verbose=verbose,
2935 2462
 			      is.lds=is.lds)
2936 2463
 	}
2937 2464
 	return(object)
... ...
@@ -2940,7 +2467,9 @@ shrinkSummary <- function(object,
2940 2467
 genotypeSummary <- function(object,
2941 2468
 			    GT.CONF.THR=0.95,
2942 2469
 			    type=c("SNP", "NP", "X.SNP", "X.NP"), ##"X.snps", "X.nps"),
2943
-			    verbose=TRUE){
2470
+			    verbose=TRUE,
2471
+			    marker.index,
2472
+			    is.lds){
2944 2473
 	if(type == "X.SNP" | type=="X.NP"){
2945 2474
 		gender <- object$gender
2946 2475
 		if(sum(gender == 2) < 3) {
... ...
@@ -2948,23 +2477,16 @@ genotypeSummary <- function(object,
2948 2477
 		}
2949 2478
 		CHR.X <- TRUE
2950 2479
 	} else CHR.X <- FALSE
2951
-	batch <- batch(object)
2952
-	is.snp <- isSnp(object)
2953
-	is.autosome <- chromosome(object) < 23
2954
-	is.annotated <- !is.na(chromosome(object))
2955
-	is.X <- chromosome(object) == 23
2956
-	is.lds <- is(calls(object), "ffdf") | is(calls(object), "ff_matrix")
2957
-	if(is.lds) require(ff)
2958
-	whichMarkers <- function(type, is.snp, is.autosome, is.annotated, is.X){
2959
-		switch(type,
2960
-		       SNP=which(is.snp & is.autosome & is.annotated),
2961
-		       NP=which(!is.snp & is.autosome),
2962
-		       X.SNP=which(is.snp & is.X),
2963
-		       X.NP=which(!is.snp & is.X),
2964
-		       stop("'type' must be one of 'SNP', 'NP', 'X.SNP', or 'X.NP'")
2965
-	       )
2480
+	if(missing(marker.index)){
2481
+		batch <- batch(object)
2482
+		is.snp <- isSnp(object)
2483
+		is.autosome <- chromosome(object) < 23
2484
+		is.annotated <- !is.na(chromosome(object))
2485
+		is.X <- chromosome(object) == 23
2486
+		is.lds <- is(calls(object), "ffdf") | is(calls(object), "ff_matrix")
2487
+		if(is.lds) stopifnot(isPackagedLoaded("ff"))
2488
+		marker.index <- whichMarkers(type[[1]], is.snp, is.autosome, is.annotated, is.X)
2966 2489
 	}
2967
-	marker.index <- whichMarkers(type[[1]], is.snp, is.autosome, is.annotated, is.X)
2968 2490
 	summaryFxn <- function(type){
2969 2491
 		switch(type,
2970 2492
 		       SNP="summarizeSnps",
... ...
@@ -2978,7 +2500,6 @@ genotypeSummary <- function(object,
2978 2500
 		ocLapply(seq(along=index.list),
2979 2501
 			 FUN,
2980 2502
 			 index.list=index.list,
2981
-##			 marker.index=marker.index,
2982 2503
 			 object=object,
2983 2504
 			 batchSize=ocProbesets(),
2984 2505
 			 GT.CONF.THR=GT.CONF.THR,
... ...
@@ -2988,19 +2509,28 @@ genotypeSummary <- function(object,
2988 2509
 			 neededPkgs="crlmm")
2989 2510
 	} else {
2990 2511
 		FUN <- match.fun(FUN)
2991
-		object <- FUN(strata.index=1,
2512
+		object <- FUN(strata=1,
2992 2513
 			      index.list=list(marker.index),
2993
-##			      marker.index=marker.index,
2994 2514
 			      object=object,
2995 2515
 			      batchSize=ocProbesets(),
2996 2516
 			      GT.CONF.THR=GT.CONF.THR,
2997 2517
 			      verbose=verbose,
2998
-			      CHR.X=CHR.X,
2999
-			      is.lds=is.lds)
2518
+			      is.lds=is.lds,
2519
+			      CHR.X=CHR.X)
3000 2520
 	}
3001 2521
 	return(object)
3002 2522
 }
3003 2523
 
2524
+whichMarkers <- function(type, is.snp, is.autosome, is.annotated, is.X){
2525
+	switch(type,
2526
+	       SNP=which(is.snp & is.autosome & is.annotated),
2527
+	       NP=which(!is.snp & is.autosome),
2528
+	       X.SNP=which(is.snp & is.X),
2529
+	       X.NP=which(!is.snp & is.X),
2530
+	       stop("'type' must be one of 'SNP', 'NP', 'X.SNP', or 'X.NP'")
2531
+	       )
2532
+}
2533
+
3004 2534
 summarizeNps <- function(strata, index.list, object, batchSize,
3005 2535
 			 GT.CONF.THR, verbose, is.lds, CHR.X, ...){
3006 2536
 	if(is.lds) {physical <- get("physical"); open(object)}
... ...
@@ -3151,44 +2681,115 @@ crlmmCopynumber <- function(object,
3151 2681
 			    prior.prob=rep(1/4,4),
3152 2682
 			    seed=1,
3153 2683
 			    verbose=TRUE,
3154
-			    GT.CONF.THR=0.99,
2684
+			    GT.CONF.THR=0.95,
3155 2685
 			    PHI.THR=2^6,
3156 2686
 			    nHOM.THR=5,
3157 2687
 			    MIN.NU=2^3,
3158 2688
 			    MIN.PHI=2^3,
3159 2689
 			    THR.NU.PHI=TRUE,
3160 2690
 			    type=c("SNP", "NP", "X.SNP", "X.NP")){
3161
-	if(type == "X.SNP" | type=="X.NP"){
3162
-		gender <- object$gender
3163
-		if(sum(gender == 2) < 3) {
3164
-			warning("too few females to estimate within genotype summary statistics on CHR X")
3165
-			return(object)
3166
-		}
3167
-		CHR.X <- TRUE
3168
-	} else CHR.X <- FALSE
2691
+	stopifnot(type %in% c("SNP", "NP", "X.SNP", "X.NP"))
3169 2692
 	batch <- batch(object)
3170 2693
 	is.snp <- isSnp(object)
3171 2694
 	is.autosome <- chromosome(object) < 23
3172 2695
 	is.annotated <- !is.na(chromosome(object))
3173 2696
 	is.X <- chromosome(object) == 23
3174 2697
 	is.lds <- is(calls(object), "ffdf") | is(calls(object), "ff_matrix")
3175
-	if(is.lds) require(ff)
2698
+	if(is.lds) stopifnot(isPackageLoaded("ff"))
3176 2699
 	samplesPerBatch <- table(as.character(batch(object)))
3177 2700
 	if(any(samplesPerBatch < MIN.SAMPLES)){
3178 2701
 		warning("The following batches have fewer than ", MIN.SAMPLES, ":")
3179 2702
 		message(paste(samplesPerBatch[samplesPerBatch < MIN.SAMPLES], collapse=", "))
3180 2703
 		message("Not estimating copy number for the above batches")
3181 2704
 	}
3182
-	whichMarkers <- function(type, is.snp, is.autosome, is.annotated, is.X){
3183
-		switch(type,
3184
-		       SNP=which(is.snp & is.autosome & is.annotated),
3185
-		       NP=which(!is.snp & is.autosome),
3186
-		       X.SNP=which(is.snp & is.X),
3187
-		       X.NP=which(!is.snp & is.X),
3188
-		       stop("'type' must be one of 'SNP', 'NP', 'X.SNP', or 'X.NP'")
3189
-	       )
2705
+	mylabel <- function(marker.type){
2706
+		switch(marker.type,
2707
+		       SNP="autosomal SNPs",
2708
+		       NP="autosomal nonpolymorphic markers",
2709
+		       X.SNP="chromosome X SNPs",
2710
+		       X.NP="chromosome X nonpolymorphic markers")
2711
+	}
2712
+	if(verbose) message("Computing summary statistics for genotype clusters for each batch")
2713
+	for(i in seq_along(type)){
2714
+		## do all types
2715
+		marker.type <- type[i]
2716
+		if(verbose) message(paste("...", mylabel(marker.type)))
2717
+		##if(verbose) message(paste("Computing summary statistics for ", mylabel(marker.type), " genotype clusters for each batch")
2718
+		marker.index <- whichMarkers(marker.type, is.snp,
2719
+					     is.autosome, is.annotated, is.X)
2720
+		object <- genotypeSummary(object=object,
2721
+					  GT.CONF.THR=GT.CONF.THR,
2722
+					  type=marker.type,
2723
+					  verbose=verbose,
2724
+					  marker.index=marker.index,
2725
+					  is.lds=is.lds)
2726
+	}
2727
+	if(verbose) message("Imputing unobserved cluster medians and shrinking the variances at polymorphic loci")##SNPs only
2728
+	for(i in seq_along(type)){
2729
+		marker.type <- type[i]
2730
+		if(!marker.type %in% c("SNP", "X.SNP")) next()
2731
+		message(paste("...", mylabel(marker.type)))
2732
+		marker.index <- whichMarkers(marker.type, is.snp,
2733
+					     is.autosome, is.annotated, is.X)
2734
+		object <- shrinkSummary(object=object,
2735
+					MIN.OBS=MIN.OBS,
2736
+					MIN.SAMPLES=MIN.SAMPLES,
2737
+					DF.PRIOR=DF.PRIOR,
2738
+					type=marker.type,
2739
+					verbose=verbose,
2740
+					marker.index=marker.index,
2741
+					is.lds=is.lds)
2742
+	}
2743
+	if(verbose) message("Estimating parameters for copy number")##SNPs only
2744
+	for(i in seq_along(type)){
2745
+		marker.type <- type[i]
2746
+		message(paste("...", mylabel(marker.type)))
2747
+		CHR.X <- ifelse(marker.type %in% c("X.SNP", "X.NP"), TRUE, FALSE)
2748
+		marker.index <- whichMarkers(marker.type, is.snp,
2749
+					     is.autosome, is.annotated, is.X)
2750
+		object <- estimateCnParameters(object=object,
2751
+					       type=marker.type,
2752
+					       SNRMin=SNRMin,
2753
+					       DF.PRIOR=DF.PRIOR,
2754
+					       GT.CONF.THR=GT.CONF.THR,
2755
+					       MIN.SAMPLES=MIN.SAMPLES,
2756
+					       MIN.OBS=MIN.OBS,
2757
+					       MIN.NU=MIN.NU,
2758
+					       MIN.PHI=MIN.PHI,
2759
+					       THR.NU.PHI=THR.NU.PHI,
2760
+					       verbose=verbose,
2761
+					       marker.index=marker.index,
2762
+					       is.lds=is.lds,
2763
+					       CHR.X=CHR.X)
2764
+	}
2765
+	return(object)
2766
+}
2767
+
2768
+estimateCnParameters <- function(object,
2769
+				 type=c("SNP", "NP", "X.SNP", "X.NP"),
2770
+				 verbose=TRUE,
2771
+				 SNRMin=5,
2772
+				 DF.PRIOR=50,
2773
+				 GT.CONF.THR=0.95,
2774
+				 MIN.SAMPLES=10,
2775
+				 MIN.OBS=1,
2776
+				 MIN.NU=8,
2777
+				 MIN.PHI=8,
2778
+				 THR.NU.PHI=TRUE,
2779
+				 marker.index,
2780
+				 CHR.X,
2781
+				 is.lds){
2782
+	if(missing(marker.index)){
2783
+		batch <- batch(object)
2784
+		is.snp <- isSnp(object)
2785
+		is.autosome <- chromosome(object) < 23
2786
+		is.annotated <- !is.na(chromosome(object))
2787
+		is.X <- chromosome(object) == 23
2788
+		is.lds <- is(calls(object), "ffdf") | is(calls(object), "ff_matrix")
2789
+		if(is.lds) require(ff)
2790
+		CHR.X <- ifelse(type[[1]] %in% c("X.SNP", "X.NP"), TRUE, FALSE)
2791
+		marker.index <- whichMarkers(type[[1]], is.snp, is.autosome, is.annotated, is.X)
3190 2792
 	}
3191
-	marker.index <- whichMarkers(type[[1]], is.snp, is.autosome, is.annotated, is.X)
3192 2793
 	lmFxn <- function(type){
3193 2794
 		switch(type,
3194 2795
 		       SNP="fit.lm1",
... ...
@@ -359,12 +359,14 @@ ACN <- function(object, allele, i , j){
359 359
 		ii <- which(chromosome(object) < 23)
360 360
 		if(length(ii) > 0){
361 361
 			## calculate ca only for batches indexed by j
362
-			batches <- unique(batch(object)[j])
362
+			batches <- unique(as.character(batch(object))[j])
363 363
 			for(k in seq_along(batches)){
364
-				l <- match(batches[k], bns)
364
+				this.batch <- batches[k]
365
+				jj <- j[batch(object)[j] %in% this.batch]
366
+				l <- match(this.batch, bns)
365 367
 				bg <- nu(object, allele)[ii, l]
366 368
 				slope <- phi(object, allele)[ii, l]
367
-				I <- allele(object, allele)[ii, j]
369
+				I <- allele(object, allele)[ii, jj]
368 370
 				acn[[k]] <- 1/slope*(I - bg)
369 371
 			}
370 372
 		}
... ...
@@ -375,24 +377,27 @@ ACN <- function(object, allele, i , j){
375 377
 	}
376 378
 	if(!missing(i) & missing(j)){
377 379
 		## calculate ca, cb for all batches
378
-		batches <- unique(batch(object))
380
+		batches <- batchNames(object)
379 381
 		for(k in seq_along(batches)){
382
+			this.batch <- batches[k]
383
+			l <- match(this.batch, bns)
380 384
 			##bb <- batches[k]
381
-			bg <- nu(object, allele)[i, k]
382
-			slope <- phi(object, allele)[i, k]
383
-			I <- allele(object, allele)[i, j]
385
+			bg <- nu(object, allele)[i, l]
386
+			slope <- phi(object, allele)[i, l]
387
+			I <- allele(object, allele)[i, batch(object) == this.batch]
384 388
 			acn[[k]] <- 1/slope*(I - bg)
385 389
 		}
386 390
 	}
387 391
 	if(!missing(i) & !missing(j)){
388
-		ubatch <- unique(batch(object))
389
-		batches <- unique(batch(object)[j])
392
+		batches <- unique(as.character(batch(object)[j]))
390 393
 		acn <- list()
391 394
 		for(k in seq_along(batches)){
392
-			l <- match(batches[k], ubatch)
395
+			this.batch <- batches[k]
396
+			jj <- j[batch(object)[j] %in% this.batch]
397
+			l <- match(this.batch, bns)
393 398
 			bg <- nu(object, allele)[i, l]
394 399
 			slope <- phi(object, allele)[i, l]
395
-			I <- allele(object, allele)[i, j]
400
+			I <- allele(object, allele)[i, jj]
396 401
 			acn[[k]] <- 1/slope*(I - bg)
397 402
 		}
398 403
 	}
... ...
@@ -404,15 +409,20 @@ ACN <- function(object, allele, i , j){
404 409
 setMethod("CA", signature=signature(object="CNSet"),
405 410
 	  function(object, ...){
406 411
 		  ca <- ACN(object, allele="A", ...)
412
+		  ca[ca < 0.05] <- 0.05
413
+		  ca[ca > 5] <- 5
407 414
 		  return(ca)
408 415
 	  })
409 416
 setMethod("CB", signature=signature(object="CNSet"),
410 417
 	  function(object, ...) {
411 418
 		  cb <- ACN(object, allele="B", ...)
419
+		  cb[cb < 0.05] <- 0.05
420
+		  cb[cb > 5] <- 5
412 421
 		  return(cb)
413 422
 	  })
414 423
 
415 424
 totalCopyNumber <- function(object, ...){
425
+	message("copy number at nonpolymorphic probes is not currently available")
416 426
 	ca <- CA(object, ...)
417 427
 	cb <- CB(object, ...)
418 428
 	is.snp <- isSnp(object)
... ...
@@ -432,7 +442,7 @@ setReplaceMethod("snpCall", c("CNSet", "ff_or_matrix"),
432 442
                  function(object, ..., value){
433 443
 			 assayDataElementReplace(object, "call", value)
434 444
 		 })
435
-##setReplaceMethod("snpCallProbability", c("CNSet", "ff_or_matrix"),
436
-##                 function(object, ..., value){
437
-##			 assayDataElementReplace(object, "callProbability", value)
438
-##		 })
445
+setReplaceMethod("snpCallProbability", c("CNSet", "ff_or_matrix"),
446
+                 function(object, ..., value){
447
+			 assayDataElementReplace(object, "callProbability", value)
448
+		 })
... ...
@@ -38,9 +38,9 @@ options(continue=" ", width=70)
38 38
 
39 39
 \end{abstract}
40 40
 
41
-\section{Simple usage}
42
-
41
+\section{Copy number estimation}
43 42
 
43
+\subsection{Set up}
44 44
 
45 45
 <<cdfname>>=
46 46
 library(crlmm)
... ...
@@ -91,7 +91,7 @@ calls to \verb@Sweave@ will load the saved computations from disk. For
91 91
 additional information, see the examples in the help file for the
92 92
 \Rfunction{checkExists} function.
93 93
 
94
-\paragraph{Preprocessing and genotyping.}
94
+\subsection{Preprocessing and genotyping.}
95 95
 
96 96
 In the following code chunk, we provide the complete path to the
97 97
 Affymetrix CEL files and define a 'batch' variable. The
... ...
@@ -132,169 +132,34 @@ The function \Rfunction{genotype} checks to see whether the
132 132
 genotype are stored as \Robject{ff} objects on disk.  Otherwise, the
133 133
 genotypes and normalized intensities are stored in matrices.  A word
134 134
 of caution: the \Rfunction{genotype} function without \Rpackage{ff}
135
-requires a potentially large amount of RAM.  Therefore, we illustrate
136
-the \Robject{genotype} function using only the first 20 CEL files. We
137
-wrap the function \Rfunction{genotype} in the \Rfunction{checkExists}
138
-so that a user can load and inspect the saved object.  We first check
139
-that the file \Robject{cnSet.rda} does not exist. Existence of this
140
-file implies that we have already run the copy number estimation and,
141
-therefore, we do not need to preprocess and genotype the files a
142
-second time.
135
+requires a potentially large amount of RAM.  We illustrate the
136
+\Robject{genotype} function using only the first 5 CEL files (a number
137
+to small to be used for copy number estimation). We wrap the function
138
+\Rfunction{genotype} in the \Rfunction{checkExists} to speed up
139
+Sweaving after the initial batch job.
143 140
 
144 141
 <<genotype>>=
145
-tmpdir <- "~/tmp3"
146
-##stop()
147
-##if(FALSE){
148
-##if(file.exists(tmpdir)) unlink(tmpdir, recursive=TRUE)
149
-dir.create(tmpdir)
150
-##trace(genotype, browser)
151
-system.time(gtSet2 <- genotype(filenames=celFiles[1:75],
152
-			       cdfName=cdfName,
153
-			       batch=batch[1:75]))
154
-save(gtSet2, file=file.path(tmpdir, "gtSet2.rda"))
155
-library(ff)
156
-ldPath(tmpdir)
157
-system.time(gtSet <- genotype(filenames=celFiles[1:75],
158
-			      cdfName=cdfName,
159
-			      batch=batch[1:75]))
160
-save(gtSet, file=file.path(tmpdir, "gtSet.rda"))
161
-q("no")
162
-if(FALSE){
163
-	library(ff)
164
-	tmpdir <- "~/tmp"
165
-	dir.create("~/tmp2")
166
-	load(file.path(tmpdir, "gtSet.rda"))
167
-	ldPath("~/tmp2")
168
-	gtSet <- updateObject(gtSet) ## neeed to remove the old ff objects
169
-	save(gtSet, file=file.path(ldPath(), "gtSet.rda"))
170
-}
171
-
172
-##trace(summarizeSnps, browser)
173
-ocProbesets(100e3)
174
-system.time(gtSet <- genotypeSummary(gtSet, GT.CONF.THR=0.95, type="SNP"))
175
-system.time(gtSet <- genotypeSummary(gtSet, GT.CONF.THR=0.95, type="NP"))
176
-system.time(gtSet <- genotypeSummary(gtSet, GT.CONF.THR=0.95, type="X.SNP"))
177
-system.time(gtSet <- genotypeSummary(gtSet, GT.CONF.THR=0.95, type="X.NP"))
178
-library(ff)
179
-load("~/tmp2/gtSet.rda")
180
-invisible(open(gtSet))
181
-ocProbesets(100e3)
182
-
183
-system.time(gtSet <- shrinkSummary(gtSet, type="SNP"))
184
-##system.time(gtSet <- shrinkSummary(gtSet, type="NP"))
185
-system.time(gtSet <- shrinkSummary(gtSet, type="X.SNP"))
186
-##system.time(gtSet <- shrinkSummary(gtSet, type="X.NP"))
187
-
188
-## test: try females only.  try males only
189
-
190
-## fit linear model
191
-##trace(crlmmCopynumber, browser)
192
-##trace(fit.wls, browser)
193
-trace(fit.lm1, browser)
194
-system.time(cnSet <- crlmmCopynumber(gtSet, type="SNP"))
195
-CA(cnSet)[1:5, 1:5]
196
-totalCopyNumber(cnSet, i=1:5, j=1:5)
197
-
198
-trace(fit.lm2, browser)
199
-system.time(cnSet <- crlmmCopynumber(gtSet, type="NP"))
200
-index <- which(!isSnp(cnSet))[1:10]
201
-totalCopyNumber(cnSet, i=index, j=1:5)
202
-
203
-##trace(fit.lm3, browser)
204
-##trace(summarizeMaleXGenotypes, browser)
205
-##trace(fit.wls, browser)
206
-system.time(cnSet <- crlmmCopynumber(gtSet, type="X.SNP"))
207
-
208
-trace(fit.lm4, browser)
209
-system.time(cnSet <- crlmmCopynumber(gtSet, type="X.NP"))
210
-
211
-
212
-Ns(gtSet, batchname="C")[1:5, ]
213
-medians(gtSet, "A")[1:5, ]
214
-mads(gtSet, "A")[1:5, ]
215
-##log scale
216
-tau2(gtSet, allele="A")[1:5, ]
217
-tau2(gtSet, allele="B")[1:5, ]
218
-corr(gtSet)[1:5, ]
219
-
220
-
221
-
222
-cnSet <- crlmmCopynumber(gtSet, type="autosome.snps")
223
-
224
-
225
-
226
-
227
-
228
-
229
-gtSet
230
-tmp=Ns(gtSet)
231
-
232
-
233
-
234
-NN <- N.AA(gtSet)[1:1000,]
235
-N.AA(gtSet)[1:1000,] <- NN##do.call(cbind, lapply(NN, function(x) x[, 1]))
236
-
237
-
238
-##tmp=Ns(gtSet)
239
-##tmp <- Ns(gtSet, genotype="AA")
240
-##tmp <- Ns(gtSet, genotype="AA", batchname="C")
241
-##tmp <- Ns(gtSet,  batchname="C")
242
-##Ns(gtSet, batchname="C") <- tmp
243
-
244
-load(file.path(tmpdir, "gtSet2.rda"))
245
-gtSet2.update <- updateObject(gtSet2)
246
-isCurrent(gtSet2)
247
-
248
-
249
-
250
-
251
-
252
-
253
-
254
-
255
-##using matrices
256
-if(!file.exists(file.path(outdir, "cnSet.assayData_matrix.rda"))){
257
-	gtSet <- checkExists("gtSet.assayData_matrix",
258
-			     .path=outdir,
259
-			     .FUN=genotype,
260
-			     filenames=celFiles[1:20],
261
-			     cdfName=cdfName,
262
-			     batch=batch[1:20])
263
-	class(calls(gtSet.assayData_matrix))
142
+gtSet <- checkExists("gtSet.assayData_matrix",
143
+		     .path=outdir,
144
+		     .FUN=genotype,
145
+		     filenames=celFiles[1:5],
146
+		     cdfName=cdfName,
147
+		     batch=batch[1:5])
264 148
 }
265
-##q("no")
266 149
 @
267 150
 
268
-Next, we estimate copy number for the 20 CEL files.  The copy number
269
-estimation step requirees a minimum number of CEL files in each batch
270
-(we suggest 10).  While estimation can be performed using only the 20
271
-samples as we do here, it would be far preferable to process all of
272
-the files that were on the same chemistry plate.  The code chunk below
273
-will load \Robject{cnSet.assayData_matrix} from disk if this
274
-
275
-computation had already been performed as part of the batch job.
276
-
277
-<<copynumber, eval=FALSE>>=
278
-cnSet.assayData_matrix <- checkExists("cnSet.assayData_matrix",
279
-				      .path=outdir,
280
-				      .FUN=crlmmCopynumber,
281
-				      object=gtSet.assayData_matrix,
282
-				      chromosome=22)
283
-@
284
-
285
-
286 151
 A more memory efficient approach to preprocessing and genotyping is
287
-implemented in the \R{} function \Rfunction{genotypeLD} and requires
288
-the \Rpackage{ff} package.  The functions \Rfunction{ocProbesets} and
289
-\Rfunction{ocSamples} can be used to manage how many probesets and
290
-samples are to processed at a time and can therefore be used to fine
291
-tune the required RAM.  The function \Rfunction{ldPath} indicates that
292
-\Robject{ff} objects will be stored in the directory \Robject{outdir}.
152
+requires the \Rpackage{ff} package.  In particular, the functions
153
+\Rfunction{ocProbesets} and \Rfunction{ocSamples} can be used to
154
+manage how many probesets and samples are to processed at a time and
155
+can therefore be used to fine tune the required RAM.  The function
156
+\Rfunction{ldPath} indicates that \Robject{ff} objects will be stored
157
+in the directory \Robject{outdir}.
293 158
 
294 159
 <<ff>>=
295 160
 library(ff)
296 161
 ldPath(outdir)
297
-ocProbesets(50000)
162
+ocProbesets(100000)
298 163
 ocSamples(200)
299 164
 @
300 165
 
... ...
@@ -308,68 +173,130 @@ so that subsequent calls to \verb@Sweave@ can be run interactively.
308 173
 
309 174
 
310 175
 <<LDS_genotype>>=
311
-if(!file.exists(file.path(outdir, "cnSet.assayData_ff.rda"))){
312
-	Rprof(filename="Rprof_genotypeff.out", interval=0.1)
313
-	gtSet.assayData_ff <- checkExists("gtSet.assayData_ff",
314
-					  .path=outdir,
315
-					  .FUN=genotypeLD,
316
-					  filenames=celFiles,
317
-					  cdfName=cdfName,
318
-					  batch=batch)
319
-	Rprof(NULL)
320
-	class(calls(gtSet.assayData_ff))
176
+if(!file.exists(file.path(outdir, "cnSet.rda"))){
177
+	gtSet <- checkExists("gtSet",
178
+			     .path=outdir,
179
+			     .FUN=genotypeLD,
180
+			     filenames=celFiles,
181
+			     cdfName=cdfName,
182
+			     batch=batch)
183
+	class(calls(gtSet))
321 184
 }
322 185
 @
323 186
 
324
-The analogous function for copy number estimation follows. In
325
-practice, once the object \Robject{cnSet.assayData_ff} is created the
326
-\Robject{gtSet.assayData_ff} is no longer needed and can be removed
327
-from the \Robject{outdir}.
187
+\subsection{Copy number algorithm.}
188
+
189
+The \Rfunction{crlmmCopynumber} computes summary statistics for each
190
+genotype, imputes unobserved genotype centers, shrinks the
191
+within-genotype variances, and estimates parameters for
192
+allele-specific copy number. With \texttt{verbose=TRUE}, the above
193
+steps for CN estimation are displayed.
328 194
 
329 195
 <<LDS_copynumber>>=
330
-Rprof(filename="Rprof_cnff.out", interval=0.1)
331
-##trace(fit.wls, browser)
332
-cnSet.assayData_ff <- checkExists("cnSet.assayData_ff",
333
-				  .path=outdir,
334
-				  .FUN=crlmmCopynumberLD,
335
-				  object=gtSet.assayData_ff)
336
-Rprof(NULL)
337
-##if(file.exists(file.path(outdir, "gtSet.assayData_ff.rda")))
338
-##	unlink(file.path(outdir, "gtSet.assayData_ff.rda"))
339
-gc()
340
-q("no")
341
-@
342
-
343
-The objects returned by the \Rfunction{genotypeLD} and
344
-\Rfunction{crlmmCopynumberLD} have assay data elements that are
345
-pointers to \Robject{ff} objects stored in the directory
346
-\Robject{outdir}.  The functions \Rfunction{open} and
347
-\Rfunction{close} open and close the connections to the
348
-\Robject{assayData} elements. Subsetting an \Robject{ff} object pulls
349
-the data from disk into memory and should therefore be used with
350
-caution. In particular, subsetting the \Robject{gt.assayData_ff} would
351
-subset each elements in the \Robject{assayData} slot, returning an
352
-object of the same class but with \Robject{assayData} elements that
353
-are matrices.  Such an operation can be exceedingly slow and require
196
+cnSet <- checkExists("cnSet",
197
+		     .path=outdir,
198
+		     .FUN=crlmmCopynumber,
199
+		     object=gtSet)
200
+@
201
+
202
+The \Rpackage{crlmmCopynumber} function no longer stores the
203
+allele-specific estimates of copy number.  Rather, several functions
204
+are available that will compute relatively quickly the allele-specific
205
+estimates from the stored normalized intensities and the linear model
206
+parameters.  For instance, for allele $k$, marker $i$, sample $j$, and
207
+batch $p$, the estimate of allele-specific copy number are updated by
208
+subtracting the estimated background from the observed intensity and
209
+scaling by the slope coefficient. Specifically,
210
+
211
+\newcommand{\A}{A}
212
+\newcommand{\B}{B}
213
+\begin{eqnarray}
214
+  \label{eq:cnK}
215
+{\hat c}_{k,ijp} &=& \mbox{max}\left\{\frac{1}{{\hat
216
+    \phi}_{k,ip}}\left(I_{k,ijp}-{\hat \nu}_{k,ip}\right), ~0\right\}
217
+\mbox{~for~} k \in \{\A, \B\}.
218
+%  \label{eq:cnB} {\hat c}_{\B,ijp} &=& \mbox{max}\left\{\frac{1}{{\hat
219
+%      \phi}_{\B,ip}}\left(I_{\B,ijp}-{\hat \nu}_{\B,ip}\right), ~0\right\}.
220
+\end{eqnarray}.
221
+\noindent See \cite{Scharpf2010} for details.
222
+
223
+The functions \Rfunction{CA}, \Rfunction{CB}, and
224
+\Rfunction{totalCopyNumber} can be used to extract CN estimates from
225
+the \Robject{CNSet} container.
226
+
227
+<<ca>>=
228
+##ca <- CA(x[snp.index, ])/100
229
+snp.index <- which(isSnp(cnSet))
230
+ca <- CA(cnSet, i=snp.index, j=1:5)
231
+cb <- CB(cnSet, i=snp.index, j=1:5)
232
+ct <- ca+cb
233
+##boxplot(data.frame(ct[, 1:5]), pch=".")
234
+@
235
+
236
+Alternatively, total copy number can be obtained by
237
+<<totalCopynumber.snps>>=
238
+ct2 <- totalCopyNumber(cnSet, i=snp.index, j=1:5)
239
+all.equal(ct, ct2)
240
+@
241
+
242
+At nonpolymorphic loci, either the 'CA' or totalCopynumber method can
243
+be used to obtain total copy number.
244
+
245
+<<ca>>=
246
+cn.nonpolymorphic <- CA(obj, i=which(!isSnp(obj)))
247
+@
248
+
249
+Total copy number at both polymorphic and nonpolymorphic loci:
250
+<<totalCopynumber>>=
251
+##cn <- copyNumber(x)
252
+cn <- totalCopyNumber(x, sample(1:nrow(x), 1e4), 1:5)
253
+apply(cn, 2, median, na.rm=TRUE)
254
+@
255
+
256
+The above estimates can be plotted versus physical position.
257
+
258
+
259
+\section{The CNSet container}
260
+
261
+The objects returned by the \Rfunction{genotype} and
262
+\Rfunction{crlmmCopynumber} have assay data elements that are pointers
263
+to \Robject{ff} objects stored in the directory \Robject{outdir}.  The
264
+functions \Rfunction{open} and \Rfunction{close} open and close the
265
+connections to the \Robject{assayData} elements. Subsetting an
266
+\Robject{ff} object pulls the data from disk into memory and should
267
+therefore be used with caution. In particular, subsetting the
268
+\Robject{gtSet} would subset each element in the \Robject{assayData}
269
+slot, returning an object of the same class but with
270
+\Robject{assayData} elements that are matrices.  Such an operation can
271
+be exceedingly slow when performed over a network and require
354 272
 subsantial RAM.  The preferred approach is to extract only the assay
355 273
 data element that is needed. In the example below, we extract the
356 274
 genotype calls for the the first 50 samples.
357 275
 
358 276
 <<check>>=
359
-dims(cnSet.assayData_ff)
360
-dims(cnSet.assayData_matrix)
361
-print(object.size(cnSet.assayData_matrix), units="Mb")
362
-print(object.size(cnSet.assayData_ff), units="Mb")
363
-if(FALSE){
364
-	replicate(5, system.time(gt1 <- calls(cnSet.assayData_matrix)[, 1:50])[[1]])
365
-	open(calls(cnSet2))
366
-	replicate(5, system.time(gt2 <- calls(cnSet.assayData_ff)[, 1:50])[[1]])
367
-}
368
-gt1 <- calls(cnSet.assayData_matrix)[, 1:50]
369
-gt2 <- calls(cnSet.assayData_ff)[, 1:50]
370
-all.equal(gt1, gt2)
277
+dims(cnSet)
278
+dims(cnSet)
279
+print(object.size(cnSet), units="Mb")
280
+gt <- calls(cnSet)[, 1:50]
281
+@
282
+
283
+The \Robject{CNSet} class also contains the slot
284
+\Robject{batchStatistics} that contains batch-specific summaries
285
+needed for copy number estimation. In particular, each element is a
286
+matrix (or an ff object) with R rows and C columns, correspoinding to
287
+R markers and C batches.  The summaries includes the within genotype
288
+cluster medians and median absolute deviations (mads), but also
289
+parameters estimated from the linear model.  (For unobserved
290
+genotypes, the medians are imputed and the variance is obtained the
291
+median variance (across markers) within a batch. ) The elements of the
292
+slot can be listed as follows.
293
+
294
+<<batchStatistics>>=
295
+assayDataElementNames(batchStatistics(cnSet))
371 296
 @
372 297
 
298
+
299
+
373 300
 Note that for the Affymetrix 6.0 platform the assay data elements each
374 301
 have a row dimension corresponding to the total number of polymorphic
375 302
 and nonpolymorphic markers interrogated by the Affymetrix 6.0
... ...
@@ -383,39 +310,24 @@ the CRLMM genotype calls are represented as integers. The accessor
383 310
 scores.  When stored as matrices, converting the integer
384 311
 representation back to the probability scale is straightforward as
385 312
 shown below.  However, for the \Robject{ff} objects we must first
386
-bring the data into memory. To bring all the scores into memory, one
387
-could use the function \Rfunction{[,]} but this could be slow and
388
-require a lot of RAM depending on the size of the dataset.  Instead,
389
-we suggest pulling only the needed rows and columns from memory. In
390
-the following example, we convert the integer scores to probabilities
391
-for the CEPH samples.  As genotype confidence scores are not
392
-applicable to the nonpolymorphic markers, we extract only the
393
-polymorphic markers using the \Rfunction{isSnp} function.
313
+convert the ff object to a matrix. One could use the function
314
+\Rfunction{[,]} but this could be slow and require a lot of RAM
315
+depending on the size of the dataset. We suggest pulling only the
316
+needed rows and columns from memory. In the following example, we
317
+convert the integer scores to probabilities for the CEPH samples.  As
318
+genotype confidence scores are not applicable to the nonpolymorphic
319
+markers, we extract only the polymorphic markers using the
320
+\Rfunction{isSnp} function.
394 321
 
395 322
 <<assayData>>=
396 323
 rows <- which(isSnp(cnSet))
397 324
 cols <- which(batch == "C")
398
-posterior.prob <- tryCatch(i2p(snpCallProbability(cnSet.assayData_ff)),
399
-			    error=function(e) print("This will not work for an ff object."))
400
-posterior.prob1 <- i2p(snpCallProbability(cnSet.assayData_matrix)[rows, cols])
401
-posterior.prob2 <- i2p(snpCallProbability(cnSet.assayData_ff)[rows, cols])
402
-all.equal(posterior.prob1, posterior.prob2)
403
-@
404
-
405
-Next, we obtain locus-level estimates of copy number by fitting a linear
406
-model to each SNP. A variable named 'batch' must be indicated in the
407
-\Robject{protocolData} of the \Robject{cnSet} object. As the inverse
408
-variance of the within-genotype normalized intensities are used as
409
-weights in the linear model (and hence the design matrix in the
410
-regression changes for each SNP), the time is linear with the number of
411
-markers on the array. Copy number estimation at nonpolymorphic markers
412
-and polymorphic markers with unobserved genotypes is more difficult. We
413
-refer the interested reader to the technical report \citep{Scharpf2009}.
414
-Again, we peform the copy number estimation using the matrix version and
415
-the ff version in parallel and encourage users with large datasets to
416
-explore the latter. As with the preprocessing and genotyping, the
417
-following code should be submitted as part of the batch job as it is too
418
-slow for interactive analysis.
325
+posterior.prob <- tryCatch(i2p(snpCallProbability(cnSet)),
326
+			   error=function(e) print("This will not work for an ff object."))
327
+posterior.prob2 <- p2i(snpCallProbability(cnSet)[rows, cols])
328
+@
329
+
330
+Next, we obtain locus-level estimates of copy number.
419 331
 
420 332
 <<cn.ff, echo=FALSE, eval=FALSE>>=
421 333
 rm(cnSet); gc()
... ...
@@ -502,28 +414,7 @@ genotypeConf <- integerScoreToProbability(snpCallProbability(x)[snp.index[1:10],
502 414
 
503 415
 \paragraph{\Robject{CopyNumberSet}: allele-specific copy number}
504 416
 
505
-Allele-specific copy number at polymorphic loci:
506
-<<ca>>=
507
-##ca <- CA(x[snp.index, ])/100
508
-snp.index <- which(isSnp(x))
509
-ca <- CA(x, i=snp.index)
510
-##or
511
-ca <- ACN(x, "A", i=snp.index)
512
-cb <- CB(x, i=snp.index)
513
-ct <- ca+cb
514
-@
515
-
516
-Total copy number at nonpolymorphic loci:
517
-<<ca>>=
518
-cn.nonpolymorphic <- CA(obj, i=which(!isSnp(obj)))
519
-@
520 417
 
521
-Total copy number at both polymorphic and nonpolymorphic loci:
522
-<<totalCopynumber>>=
523
-##cn <- copyNumber(x)
524
-cn <- totalCopyNumber(x, sample(1:nrow(x), 1e4), 1:5)
525
-apply(cn, 2, median, na.rm=TRUE)
526
-@
527 418
 
528 419
 \subsection{Other accessors}
529 420
 
... ...
@@ -1,11 +1,10 @@
1
-@UNPUBLISHED{Scharpf2009,
1
+@ARTICLE{Scharpf2010,
2 2
   author = {Robert B Scharpf and Ingo Ruczinski and Benilton Carvalho and Betty
3 3
 	Doan and Aravinda Chakravarti and Rafael Irizarry},
4 4
   title = {A multilevel model to address batch effects in copy number estimation
5 5
 	using SNP arrays},
6
-  month = {May},
7
-  year = {2009},
8
-  url={http://www.bepress.com/cgi/viewcontent.cgi?article=1193&context=jhubiostat}
6
+  journal={Biostatistics},
7
+  year = {2010},
9 8
 }
10 9
 
11 10
 @ARTICLE{Carvalho2007a,
... ...
@@ -1,8 +1,8 @@
1
-%added to prevent warnings
2 1
 \name{ffdf-class}
3 2
 \Rdversion{1.1}
4 3
 \docType{class}
5 4
 \alias{ffdf-class}
5
+\alias{annotatedDataFrameFrom,ff_matrix-method}
6 6
 \alias{list-class}
7 7
 \title{Class "ffdf"}
8 8
 \description{