git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@52857 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -390,6 +390,20 @@ shrinkGenotypeSummaries <- function(strata, index.list, object, MIN.OBS, MIN.SAM |
390 | 390 |
shrink.corrAB <- corrAB <- as.matrix(corrAB(object)[marker.index, ]) |
391 | 391 |
shrink.corrBB <- corrBB <- as.matrix(corrBB(object)[marker.index, ]) |
392 | 392 |
flags <- as.matrix(flags(object)[marker.index, ]) |
393 |
+ if(length(batches) >= 3){ |
|
394 |
+ if(verbose) message("Imputing centers for unobserved genotypes from other batches") |
|
395 |
+ res <- imputeAcrossBatch(N.AA, N.AB, N.BB, |
|
396 |
+ medianA.AA, medianA.AB, medianA.BB, |
|
397 |
+ medianB.AA, medianB.AB, medianB.BB) |
|
398 |
+ medianA.AA <- res[["medianA.AA"]] |
|
399 |
+ medianA.AB <- res[["medianA.AB"]] |
|
400 |
+ medianA.BB <- res[["medianA.BB"]] |
|
401 |
+ medianB.AA <- res[["medianB.AA"]] |
|
402 |
+ medianB.AB <- res[["medianB.AB"]] |
|
403 |
+ medianB.BB <- res[["medianB.BB"]] |
|
404 |
+ updated <- res[["updated"]] |
|
405 |
+ } |
|
406 |
+ |
|
393 | 407 |
for(k in seq(along=batches)){ |
394 | 408 |
sample.index <- batches[[k]] |
395 | 409 |
this.batch <- unique(as.character(batch(object)[sample.index])) |
... | ... |
@@ -1816,3 +1830,206 @@ estimateCnParameters <- function(object, |
1816 | 1830 |
message("finished") |
1817 | 1831 |
return(object) |
1818 | 1832 |
} |
1833 |
+ |
|
1834 |
+ |
|
1835 |
+imputeAA.AB <- function(index, N.AA, N.AB, N.BB, |
|
1836 |
+ medianA.AA, medianA.AB, medianA.BB, |
|
1837 |
+ medianB.AA, medianB.AB, medianB.BB){ |
|
1838 |
+ gt.to.impute <- 1:2 |
|
1839 |
+ imputed <- rep(FALSE, length(index)) |
|
1840 |
+ for(i in index){ |
|
1841 |
+ Ns <- cbind(N.AA[i, ], N.AB[i, ], N.BB[i, ]) |
|
1842 |
+ medianA <- cbind(medianA.AA[i, ], medianA.AB[i, ], medianA.BB[i, ]) |
|
1843 |
+ medianB <- cbind(medianB.AA[i, ], medianB.AB[i, ], medianB.BB[i, ]) |
|
1844 |
+ colnames(medianA) <- colnames(medianB) <- c("AA", "AB", "BB") |
|
1845 |
+ |
|
1846 |
+ ##Find batches with sufficient data |
|
1847 |
+ K <- which(rowSums(Ns < 3) == 0) |
|
1848 |
+ if(length(K) < 1) next() ## nothing we can do. use information from other snps |
|
1849 |
+ X <- cbind(1, medianA[K, -gt.to.impute, drop=FALSE], medianB[K, -gt.to.impute, drop=FALSE]) |
|
1850 |
+ Y <- cbind(medianA[K, gt.to.impute, drop=FALSE], medianB[K, gt.to.impute, drop=FALSE]) |
|
1851 |
+ tmp <- tryCatch(betahat <- solve(crossprod(X), crossprod(X,Y)), error=function(e) NULL) |
|
1852 |
+ if(is.null(tmp)) {cat("."); next()} |
|
1853 |
+ |
|
1854 |
+ ## Get data from observed genotypes in batch with insufficient data for genotypes 'gt.to.impute' |
|
1855 |
+ L <- which(rowSums(Ns < 3) == 2) |
|
1856 |
+ Z <- cbind(1, medianA[L, -gt.to.impute, drop=FALSE], medianB[L, -gt.to.impute, drop=FALSE]) |
|
1857 |
+ imputedVals <- Z %*% betahat |
|
1858 |
+ medianA.AA[i, L] <- imputedVals[, 1] |
|
1859 |
+ medianA.AB[i, L] <- imputedVals[, 2] |
|
1860 |
+ medianB.AA[i, L] <- imputedVals[, 3] |
|
1861 |
+ medianB.AB[i, L] <- imputedVals[, 4] |
|
1862 |
+ imputed[i] <- TRUE |
|
1863 |
+ } |
|
1864 |
+ return(list(medianA.AA=medianA.AA, medianA.AB=medianA.AB, medianA.BB=medianA.BB, |
|
1865 |
+ medianB.AA=medianB.AA, medianB.AB=medianB.AB, medianB.BB=medianB.BB, |
|
1866 |
+ imputed=imputed)) |
|
1867 |
+} |
|
1868 |
+ |
|
1869 |
+imputeAB.BB <- function(index, N.AA, N.AB, N.BB, |
|
1870 |
+ medianA.AA, medianA.AB, medianA.BB, |
|
1871 |
+ medianB.AA, medianB.AB, medianB.BB){ |
|
1872 |
+ gt.to.impute <- 2:3 |
|
1873 |
+ imputed <- rep(FALSE, length(index)) |
|
1874 |
+ for(j in seq_along(index)){ |
|
1875 |
+ i <- index[j] |
|
1876 |
+ Ns <- cbind(N.AA[i, ], N.AB[i, ], N.BB[i, ]) |
|
1877 |
+ medianA <- cbind(medianA.AA[i, ], medianA.AB[i, ], medianA.BB[i, ]) |
|
1878 |
+ medianB <- cbind(medianB.AA[i, ], medianB.AB[i, ], medianB.BB[i, ]) |
|
1879 |
+ colnames(medianA) <- colnames(medianB) <- c("AA", "AB", "BB") |
|
1880 |
+ |
|
1881 |
+ ##Find batches with sufficient data |
|
1882 |
+ K <- which(rowSums(Ns < 3) == 0) |
|
1883 |
+ if(length(K) < 1) next() ## nothing we can do. use information from other snps |
|
1884 |
+ X <- cbind(1, medianA[K, -gt.to.impute, drop=FALSE], medianB[K, -gt.to.impute, drop=FALSE]) |
|
1885 |
+ Y <- cbind(medianA[K, gt.to.impute, drop=FALSE], medianB[K, gt.to.impute, drop=FALSE]) |
|
1886 |
+ tmp <- tryCatch(betahat <- solve(crossprod(X), crossprod(X,Y)), error=function(e) NULL) |
|
1887 |
+ if(is.null(tmp)) {cat("."); next()} |
|
1888 |
+ |
|
1889 |
+ ## Get data from observed genotypes in batch with insufficient data for genotypes 'gt.to.impute' |
|
1890 |
+ L <- which(rowSums(Ns < 3) == 2) |
|
1891 |
+ Z <- cbind(1, medianA[L, -gt.to.impute, drop=FALSE], medianB[L, -gt.to.impute, drop=FALSE]) |
|
1892 |
+ imputedVals <- Z %*% betahat |
|
1893 |
+ medianA.AB[i, L] <- imputedVals[, 1] |
|
1894 |
+ medianA.BB[i, L] <- imputedVals[, 2] |
|
1895 |
+ medianB.AB[i, L] <- imputedVals[, 3] |
|
1896 |
+ medianB.BB[i, L] <- imputedVals[, 4] |
|
1897 |
+ imputed[j] <- TRUE |
|
1898 |
+ } |
|
1899 |
+ return(list(medianA.AA=medianA.AA, medianA.AB=medianA.AB, medianA.BB=medianA.BB, |
|
1900 |
+ medianB.AA=medianB.AA, medianB.AB=medianB.AB, medianB.BB=medianB.BB, |
|
1901 |
+ imputed=imputed)) |
|
1902 |
+} |
|
1903 |
+ |
|
1904 |
+imputeAA <- function(index, N.AA, N.AB, N.BB, |
|
1905 |
+ medianA.AA, medianA.AB, medianA.BB, |
|
1906 |
+ medianB.AA, medianB.AB, medianB.BB){ |
|
1907 |
+ gt.to.impute <- 1 |
|
1908 |
+ imputed <- rep(FALSE, length(index)) |
|
1909 |
+ for(j in seq_along(index)){ |
|
1910 |
+ i <- index[j] |
|
1911 |
+ Ns <- cbind(N.AA[i, ], N.AB[i, ], N.BB[i, ]) |
|
1912 |
+ medianA <- cbind(medianA.AA[i, ], medianA.AB[i, ], medianA.BB[i, ]) |
|
1913 |
+ medianB <- cbind(medianB.AA[i, ], medianB.AB[i, ], medianB.BB[i, ]) |
|
1914 |
+ colnames(medianA) <- colnames(medianB) <- c("AA", "AB", "BB") |
|
1915 |
+ |
|
1916 |
+ ##Find batches with sufficient data |
|
1917 |
+ K <- which(rowSums(Ns < 3) == 0) |
|
1918 |
+ if(length(K) < 1) next() ## nothing we can do. use information from other snps |
|
1919 |
+ X <- cbind(1, medianA[K, -gt.to.impute, drop=FALSE], medianB[K, -gt.to.impute, drop=FALSE]) |
|
1920 |
+ Y <- cbind(medianA[K, gt.to.impute, drop=FALSE], medianB[K, gt.to.impute, drop=FALSE]) |
|
1921 |
+ tmp <- tryCatch(betahat <- solve(crossprod(X), crossprod(X,Y)), error=function(e) NULL) |
|
1922 |
+ if(is.null(tmp)) {cat("."); next()} |
|
1923 |
+ |
|
1924 |
+ ## Get data from observed genotypes in batch with insufficient data for genotypes 'gt.to.impute' |
|
1925 |
+ L <- which(rowSums(Ns < 3) == 1) |
|
1926 |
+ Z <- cbind(1, medianA[L, -gt.to.impute, drop=FALSE], medianB[L, -gt.to.impute, drop=FALSE]) |
|
1927 |
+ imputedVals <- Z %*% betahat |
|
1928 |
+ medianA.AA[i, L] <- imputedVals[, 1] |
|
1929 |
+ medianB.AA[i, L] <- imputedVals[, 2] |
|
1930 |
+ imputed[j] <- TRUE |
|
1931 |
+ } |
|
1932 |
+ return(list(medianA.AA=medianA.AA, medianA.AB=medianA.AB, medianA.BB=medianA.BB, |
|
1933 |
+ medianB.AA=medianB.AA, medianB.AB=medianB.AB, medianB.BB=medianB.BB, |
|
1934 |
+ imputed=imputed)) |
|
1935 |
+} |
|
1936 |
+ |
|
1937 |
+imputeBB <- function(index, N.AA, N.AB, N.BB, |
|
1938 |
+ medianA.AA, medianA.AB, medianA.BB, |
|
1939 |
+ medianB.AA, medianB.AB, medianB.BB){ |
|
1940 |
+ gt.to.impute <- 3 |
|
1941 |
+ imputed <- rep(FALSE, length(index)) |
|
1942 |
+ for(j in seq_along(index)){ |
|
1943 |
+ i <- index[j] |
|
1944 |
+ Ns <- cbind(N.AA[i, ], N.AB[i, ], N.BB[i, ]) |
|
1945 |
+ medianA <- cbind(medianA.AA[i, ], medianA.AB[i, ], medianA.BB[i, ]) |
|
1946 |
+ medianB <- cbind(medianB.AA[i, ], medianB.AB[i, ], medianB.BB[i, ]) |
|
1947 |
+ colnames(medianA) <- colnames(medianB) <- c("AA", "AB", "BB") |
|
1948 |
+ |
|
1949 |
+ ##Find batches with sufficient data |
|
1950 |
+ K <- which(rowSums(Ns < 3) == 0) |
|
1951 |
+ if(length(K) < 1) next() ## nothing we can do. use information from other snps |
|
1952 |
+ X <- cbind(1, medianA[K, -gt.to.impute, drop=FALSE], medianB[K, -gt.to.impute, drop=FALSE]) |
|
1953 |
+ Y <- cbind(medianA[K, gt.to.impute, drop=FALSE], medianB[K, gt.to.impute, drop=FALSE]) |
|
1954 |
+ tmp <- tryCatch(betahat <- solve(crossprod(X), crossprod(X,Y)), error=function(e) NULL) |
|
1955 |
+ if(is.null(tmp)) {cat("."); next()} |
|
1956 |
+ |
|
1957 |
+ ## Get data from observed genotypes in batch with insufficient data for genotypes 'gt.to.impute' |
|
1958 |
+ L <- which(rowSums(Ns < 3) == 1) |
|
1959 |
+ Z <- cbind(1, medianA[L, -gt.to.impute, drop=FALSE], medianB[L, -gt.to.impute, drop=FALSE]) |
|
1960 |
+ imputedVals <- Z %*% betahat |
|
1961 |
+ medianA.BB[i, L] <- imputedVals[, 1] |
|
1962 |
+ medianB.BB[i, L] <- imputedVals[, 2] |
|
1963 |
+ imputed[j] <- TRUE |
|
1964 |
+ } |
|
1965 |
+ return(list(medianA.AA=medianA.AA, medianA.AB=medianA.AB, medianA.BB=medianA.BB, |
|
1966 |
+ medianB.AA=medianB.AA, medianB.AB=medianB.AB, medianB.BB=medianB.BB, |
|
1967 |
+ imputed=imputed)) |
|
1968 |
+} |
|
1969 |
+ |
|
1970 |
+imputeAcrossBatch <- function(N.AA, N.AB, N.BB, |
|
1971 |
+ medianA.AA, medianA.AB, medianA.BB, |
|
1972 |
+ medianB.AA, medianB.AB, medianB.BB){ |
|
1973 |
+ N.missing <- (N.AA < 3) + (N.AB < 3) + (N.BB < 3) |
|
1974 |
+ ## find all indices in which one or more batches need to have 2 genotypes imputed |
|
1975 |
+ missingAA.AB <- (N.AA < 3) & (N.AB < 3) |
|
1976 |
+ missingAB.BB <- (N.AB < 3) & (N.BB < 3) |
|
1977 |
+ missingAA <- (N.AA < 3) & (N.AB >= 3) |
|
1978 |
+ missingBB <- (N.BB < 3) & (N.AB >= 3) |
|
1979 |
+ index <- list(AA.AB=which(rowSums(missingAA.AB) > 0), |
|
1980 |
+ AB.BB=which(rowSums(missingAB.BB) > 0), |
|
1981 |
+ AA=which(rowSums(missingAA) > 0), |
|
1982 |
+ BB=which(rowSums(missingBB) > 0)) |
|
1983 |
+ imputeNone <- which(rowSums(N.missing == 0) > 0) |
|
1984 |
+ ## only works if there are batches with complete data |
|
1985 |
+ index <- lapply(index, intersect, y=imputeNone) |
|
1986 |
+ ##indices.to.update <- rep(1:4, each=sapply(index, length)) |
|
1987 |
+ updated <- vector("list", 4) |
|
1988 |
+ names(updated) <- c("AA.AB", "AB.BB", "AA", "BB") |
|
1989 |
+ if(length(index[["AA.AB"]] > 0)){ |
|
1990 |
+ res <- imputeAA.AB(index[["AA.AB"]], |
|
1991 |
+ N.AA, |
|
1992 |
+ N.AB, |
|
1993 |
+ N.BB, |
|
1994 |
+ medianA.AA, |
|
1995 |
+ medianA.AB, |
|
1996 |
+ medianA.BB, |
|
1997 |
+ medianB.AA, medianB.AB, medianB.BB) |
|
1998 |
+ updated$AA.AB <- res$imputed |
|
1999 |
+ } |
|
2000 |
+ if(length(index[["AB.BB"]] > 0)){ |
|
2001 |
+ res <- imputeAB.BB(index[["AB.BB"]], |
|
2002 |
+ N.AA, |
|
2003 |
+ N.AB, |
|
2004 |
+ N.BB, |
|
2005 |
+ res[["medianA.AA"]], |
|
2006 |
+ res[["medianA.AB"]], |
|
2007 |
+ res[["medianA.BB"]], |
|
2008 |
+ res[["medianB.AA"]], res[["medianB.AB"]], res[["medianB.BB"]]) |
|
2009 |
+ updated$AB.BB <- res$imputed |
|
2010 |
+ } |
|
2011 |
+ if(length(index[["AA"]] > 0)){ |
|
2012 |
+ res <- imputeAA(index[["AA"]], |
|
2013 |
+ N.AA, |
|
2014 |
+ N.AB, |
|
2015 |
+ N.BB, |
|
2016 |
+ res[["medianA.AA"]], |
|
2017 |
+ res[["medianA.AB"]], |
|
2018 |
+ res[["medianA.BB"]], |
|
2019 |
+ res[["medianB.AA"]], res[["medianB.AB"]], res[["medianB.BB"]]) |
|
2020 |
+ updated$AA <- res$imputed |
|
2021 |
+ } |
|
2022 |
+ if(length(index[["BB"]] > 0)){ |
|
2023 |
+ res <- imputeBB(index[["BB"]], |
|
2024 |
+ N.AA, |
|
2025 |
+ N.AB, |
|
2026 |
+ N.BB, |
|
2027 |
+ res[["medianA.AA"]], |
|
2028 |
+ res[["medianA.AB"]], |
|
2029 |
+ res[["medianA.BB"]], |
|
2030 |
+ res[["medianB.AA"]], res[["medianB.AB"]], res[["medianB.BB"]]) |
|
2031 |
+ updated$BB <- res$imputed |
|
2032 |
+ } |
|
2033 |
+ updated.indices <- unlist(updated) |
|
2034 |
+ return(res, updated) |
|
2035 |
+} |