Browse code

add functions to impute centers using cross-batch information

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@52857 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 16/02/2011 15:59:44
Showing1 changed files

... ...
@@ -390,6 +390,20 @@ shrinkGenotypeSummaries <- function(strata, index.list, object, MIN.OBS, MIN.SAM
390 390
 	shrink.corrAB <- corrAB <- as.matrix(corrAB(object)[marker.index, ])
391 391
 	shrink.corrBB <- corrBB <- as.matrix(corrBB(object)[marker.index, ])
392 392
 	flags <- as.matrix(flags(object)[marker.index, ])
393
+	if(length(batches) >= 3){
394
+		if(verbose) message("Imputing centers for unobserved genotypes from other batches")
395
+		res <- imputeAcrossBatch(N.AA, N.AB, N.BB,
396
+					 medianA.AA, medianA.AB, medianA.BB,
397
+					 medianB.AA, medianB.AB, medianB.BB)
398
+		medianA.AA <- res[["medianA.AA"]]
399
+		medianA.AB <- res[["medianA.AB"]]
400
+		medianA.BB <- res[["medianA.BB"]]
401
+		medianB.AA <- res[["medianB.AA"]]
402
+		medianB.AB <- res[["medianB.AB"]]
403
+		medianB.BB <- res[["medianB.BB"]]
404
+		updated <- res[["updated"]]
405
+	}
406
+
393 407
 	for(k in seq(along=batches)){
394 408
 		sample.index <- batches[[k]]
395 409
 		this.batch <- unique(as.character(batch(object)[sample.index]))
... ...
@@ -1816,3 +1830,206 @@ estimateCnParameters <- function(object,
1816 1830
 	message("finished")
1817 1831
 	return(object)
1818 1832
 }
1833
+
1834
+
1835
+imputeAA.AB <- function(index, N.AA, N.AB, N.BB,
1836
+			medianA.AA, medianA.AB, medianA.BB,
1837
+			medianB.AA, medianB.AB, medianB.BB){
1838
+	gt.to.impute <- 1:2
1839
+	imputed <- rep(FALSE, length(index))
1840
+	for(i in index){
1841
+		Ns <- cbind(N.AA[i, ], N.AB[i, ], N.BB[i, ])
1842
+		medianA <- cbind(medianA.AA[i, ], medianA.AB[i, ], medianA.BB[i, ])
1843
+		medianB <- cbind(medianB.AA[i, ], medianB.AB[i, ], medianB.BB[i, ])
1844
+		colnames(medianA) <- colnames(medianB) <- c("AA", "AB", "BB")
1845
+
1846
+		##Find batches with sufficient data
1847
+		K <- which(rowSums(Ns < 3) == 0)
1848
+		if(length(K) < 1) next() ## nothing we can do.  use information from other snps
1849
+		X <- cbind(1, medianA[K, -gt.to.impute, drop=FALSE], medianB[K, -gt.to.impute, drop=FALSE])
1850
+		Y <- cbind(medianA[K, gt.to.impute, drop=FALSE], medianB[K, gt.to.impute, drop=FALSE])
1851
+		tmp <- tryCatch(betahat <- solve(crossprod(X), crossprod(X,Y)), error=function(e) NULL)
1852
+		if(is.null(tmp)) {cat("."); next()}
1853
+
1854
+		## Get data from observed genotypes in batch with insufficient data for genotypes 'gt.to.impute'
1855
+		L <- which(rowSums(Ns < 3) == 2)
1856
+		Z <- cbind(1, medianA[L, -gt.to.impute, drop=FALSE], medianB[L, -gt.to.impute, drop=FALSE])
1857
+		imputedVals <- Z %*% betahat
1858
+		medianA.AA[i, L] <- imputedVals[, 1]
1859
+		medianA.AB[i, L] <- imputedVals[, 2]
1860
+		medianB.AA[i, L] <- imputedVals[, 3]
1861
+		medianB.AB[i, L] <- imputedVals[, 4]
1862
+		imputed[i] <- TRUE
1863
+	}
1864
+	return(list(medianA.AA=medianA.AA, medianA.AB=medianA.AB, medianA.BB=medianA.BB,
1865
+		    medianB.AA=medianB.AA, medianB.AB=medianB.AB, medianB.BB=medianB.BB,
1866
+		    imputed=imputed))
1867
+}
1868
+
1869
+imputeAB.BB <- function(index, N.AA, N.AB, N.BB,
1870
+			medianA.AA, medianA.AB, medianA.BB,
1871
+			medianB.AA, medianB.AB, medianB.BB){
1872
+	gt.to.impute <- 2:3
1873
+	imputed <- rep(FALSE, length(index))
1874
+	for(j in seq_along(index)){
1875
+		i <- index[j]
1876
+		Ns <- cbind(N.AA[i, ], N.AB[i, ], N.BB[i, ])
1877
+		medianA <- cbind(medianA.AA[i, ], medianA.AB[i, ], medianA.BB[i, ])
1878
+		medianB <- cbind(medianB.AA[i, ], medianB.AB[i, ], medianB.BB[i, ])
1879
+		colnames(medianA) <- colnames(medianB) <- c("AA", "AB", "BB")
1880
+
1881
+		##Find batches with sufficient data
1882
+		K <- which(rowSums(Ns < 3) == 0)
1883
+		if(length(K) < 1) next() ## nothing we can do.  use information from other snps
1884
+		X <- cbind(1, medianA[K, -gt.to.impute, drop=FALSE], medianB[K, -gt.to.impute, drop=FALSE])
1885
+		Y <- cbind(medianA[K, gt.to.impute, drop=FALSE], medianB[K, gt.to.impute, drop=FALSE])
1886
+		tmp <- tryCatch(betahat <- solve(crossprod(X), crossprod(X,Y)), error=function(e) NULL)
1887
+		if(is.null(tmp)) {cat("."); next()}
1888
+
1889
+		## Get data from observed genotypes in batch with insufficient data for genotypes 'gt.to.impute'
1890
+		L <- which(rowSums(Ns < 3) == 2)
1891
+		Z <- cbind(1, medianA[L, -gt.to.impute, drop=FALSE], medianB[L, -gt.to.impute, drop=FALSE])
1892
+		imputedVals <- Z %*% betahat
1893
+		medianA.AB[i, L] <- imputedVals[, 1]
1894
+		medianA.BB[i, L] <- imputedVals[, 2]
1895
+		medianB.AB[i, L] <- imputedVals[, 3]
1896
+		medianB.BB[i, L] <- imputedVals[, 4]
1897
+		imputed[j] <- TRUE
1898
+	}
1899
+	return(list(medianA.AA=medianA.AA, medianA.AB=medianA.AB, medianA.BB=medianA.BB,
1900
+		    medianB.AA=medianB.AA, medianB.AB=medianB.AB, medianB.BB=medianB.BB,
1901
+		    imputed=imputed))
1902
+}
1903
+
1904
+imputeAA <- function(index, N.AA, N.AB, N.BB,
1905
+		     medianA.AA, medianA.AB, medianA.BB,
1906
+		     medianB.AA, medianB.AB, medianB.BB){
1907
+	gt.to.impute <- 1
1908
+	imputed <- rep(FALSE, length(index))
1909
+	for(j in seq_along(index)){
1910
+		i <- index[j]
1911
+		Ns <- cbind(N.AA[i, ], N.AB[i, ], N.BB[i, ])
1912
+		medianA <- cbind(medianA.AA[i, ], medianA.AB[i, ], medianA.BB[i, ])
1913
+		medianB <- cbind(medianB.AA[i, ], medianB.AB[i, ], medianB.BB[i, ])
1914
+		colnames(medianA) <- colnames(medianB) <- c("AA", "AB", "BB")
1915
+
1916
+		##Find batches with sufficient data
1917
+		K <- which(rowSums(Ns < 3) == 0)
1918
+		if(length(K) < 1) next() ## nothing we can do.  use information from other snps
1919
+		X <- cbind(1, medianA[K, -gt.to.impute, drop=FALSE], medianB[K, -gt.to.impute, drop=FALSE])
1920
+		Y <- cbind(medianA[K, gt.to.impute, drop=FALSE], medianB[K, gt.to.impute, drop=FALSE])
1921
+		tmp <- tryCatch(betahat <- solve(crossprod(X), crossprod(X,Y)), error=function(e) NULL)
1922
+		if(is.null(tmp)) {cat("."); next()}
1923
+
1924
+		## Get data from observed genotypes in batch with insufficient data for genotypes 'gt.to.impute'
1925
+		L <- which(rowSums(Ns < 3) == 1)
1926
+		Z <- cbind(1, medianA[L, -gt.to.impute, drop=FALSE], medianB[L, -gt.to.impute, drop=FALSE])
1927
+		imputedVals <- Z %*% betahat
1928
+		medianA.AA[i, L] <- imputedVals[, 1]
1929
+		medianB.AA[i, L] <- imputedVals[, 2]
1930
+		imputed[j] <- TRUE
1931
+	}
1932
+	return(list(medianA.AA=medianA.AA, medianA.AB=medianA.AB, medianA.BB=medianA.BB,
1933
+		    medianB.AA=medianB.AA, medianB.AB=medianB.AB, medianB.BB=medianB.BB,
1934
+		    imputed=imputed))
1935
+}
1936
+
1937
+imputeBB <- function(index, N.AA, N.AB, N.BB,
1938
+		     medianA.AA, medianA.AB, medianA.BB,
1939
+		     medianB.AA, medianB.AB, medianB.BB){
1940
+	gt.to.impute <- 3
1941
+	imputed <- rep(FALSE, length(index))
1942
+	for(j in seq_along(index)){
1943
+		i <- index[j]
1944
+		Ns <- cbind(N.AA[i, ], N.AB[i, ], N.BB[i, ])
1945
+		medianA <- cbind(medianA.AA[i, ], medianA.AB[i, ], medianA.BB[i, ])
1946
+		medianB <- cbind(medianB.AA[i, ], medianB.AB[i, ], medianB.BB[i, ])
1947
+		colnames(medianA) <- colnames(medianB) <- c("AA", "AB", "BB")
1948
+
1949
+		##Find batches with sufficient data
1950
+		K <- which(rowSums(Ns < 3) == 0)
1951
+		if(length(K) < 1) next() ## nothing we can do.  use information from other snps
1952
+		X <- cbind(1, medianA[K, -gt.to.impute, drop=FALSE], medianB[K, -gt.to.impute, drop=FALSE])
1953
+		Y <- cbind(medianA[K, gt.to.impute, drop=FALSE], medianB[K, gt.to.impute, drop=FALSE])
1954
+		tmp <- tryCatch(betahat <- solve(crossprod(X), crossprod(X,Y)), error=function(e) NULL)
1955
+		if(is.null(tmp)) {cat("."); next()}
1956
+
1957
+		## Get data from observed genotypes in batch with insufficient data for genotypes 'gt.to.impute'
1958
+		L <- which(rowSums(Ns < 3) == 1)
1959
+		Z <- cbind(1, medianA[L, -gt.to.impute, drop=FALSE], medianB[L, -gt.to.impute, drop=FALSE])
1960
+		imputedVals <- Z %*% betahat
1961
+		medianA.BB[i, L] <- imputedVals[, 1]
1962
+		medianB.BB[i, L] <- imputedVals[, 2]
1963
+		imputed[j] <- TRUE
1964
+	}
1965
+	return(list(medianA.AA=medianA.AA, medianA.AB=medianA.AB, medianA.BB=medianA.BB,
1966
+		    medianB.AA=medianB.AA, medianB.AB=medianB.AB, medianB.BB=medianB.BB,
1967
+		    imputed=imputed))
1968
+}
1969
+
1970
+imputeAcrossBatch <- function(N.AA, N.AB, N.BB,
1971
+			      medianA.AA, medianA.AB, medianA.BB,
1972
+			      medianB.AA, medianB.AB, medianB.BB){
1973
+	N.missing <- (N.AA < 3) + (N.AB < 3) + (N.BB < 3)
1974
+	## find all indices in which one or more batches need to have 2 genotypes imputed
1975
+	missingAA.AB <- (N.AA < 3) & (N.AB < 3)
1976
+	missingAB.BB <- (N.AB < 3) & (N.BB < 3)
1977
+	missingAA <- (N.AA < 3) & (N.AB >= 3)
1978
+	missingBB <- (N.BB < 3) & (N.AB >= 3)
1979
+	index <- list(AA.AB=which(rowSums(missingAA.AB) > 0),
1980
+		      AB.BB=which(rowSums(missingAB.BB) > 0),
1981
+		      AA=which(rowSums(missingAA) > 0),
1982
+		      BB=which(rowSums(missingBB) > 0))
1983
+	imputeNone <- which(rowSums(N.missing == 0) > 0)
1984
+	## only works if there are batches with complete data
1985
+	index <- lapply(index, intersect, y=imputeNone)
1986
+	##indices.to.update <- rep(1:4, each=sapply(index, length))
1987
+	updated <- vector("list", 4)
1988
+	names(updated) <- c("AA.AB", "AB.BB", "AA", "BB")
1989
+	if(length(index[["AA.AB"]] > 0)){
1990
+		res <- imputeAA.AB(index[["AA.AB"]],
1991
+			   N.AA,
1992
+			   N.AB,
1993
+			   N.BB,
1994
+			   medianA.AA,
1995
+			   medianA.AB,
1996
+			   medianA.BB,
1997
+			   medianB.AA, medianB.AB, medianB.BB)
1998
+		updated$AA.AB <- res$imputed
1999
+	}
2000
+	if(length(index[["AB.BB"]] > 0)){
2001
+		res <- imputeAB.BB(index[["AB.BB"]],
2002
+				   N.AA,
2003
+				   N.AB,
2004
+				   N.BB,
2005
+				   res[["medianA.AA"]],
2006
+				   res[["medianA.AB"]],
2007
+				   res[["medianA.BB"]],
2008
+				   res[["medianB.AA"]], res[["medianB.AB"]], res[["medianB.BB"]])
2009
+		updated$AB.BB <- res$imputed
2010
+	}
2011
+	if(length(index[["AA"]] > 0)){
2012
+		res <- imputeAA(index[["AA"]],
2013
+				N.AA,
2014
+				N.AB,
2015
+				N.BB,
2016
+				res[["medianA.AA"]],
2017
+				res[["medianA.AB"]],
2018
+				res[["medianA.BB"]],
2019
+				res[["medianB.AA"]], res[["medianB.AB"]], res[["medianB.BB"]])
2020
+		updated$AA <- res$imputed
2021
+	}
2022
+	if(length(index[["BB"]] > 0)){
2023
+		res <- imputeBB(index[["BB"]],
2024
+				N.AA,
2025
+				N.AB,
2026
+				N.BB,
2027
+				res[["medianA.AA"]],
2028
+				res[["medianA.AB"]],
2029
+				res[["medianA.BB"]],
2030
+				res[["medianB.AA"]], res[["medianB.AB"]], res[["medianB.BB"]])
2031
+		updated$BB <- res$imputed
2032
+	}
2033
+	updated.indices <- unlist(updated)
2034
+	return(res, updated)
2035
+}