... | ... |
@@ -2,7 +2,7 @@ Package: GMRP |
2 | 2 |
Type: Package |
3 | 3 |
Title: GWAS-based Mendelian Randomization and Path Analyses |
4 | 4 |
Version: 1.8.0 |
5 |
-Date: 2018-04-15 |
|
5 |
+Date: 2018-05-19 |
|
6 | 6 |
Author: Yuan-De Tan |
7 | 7 |
Maintainer: Yuan-De Tan <tanyuande@gmail.com> |
8 | 8 |
Description: Perform Mendelian randomization analysis of multiple SNPs |
... | ... |
@@ -20,13 +20,13 @@ stop("No Symbol found in the data") |
20 | 20 |
upstream5<-subset(UCSCannot,function_unit=="5upstream") |
21 | 21 |
downstream3<-subset(UCSCannot,function_unit=="3downstream") |
22 | 22 |
|
23 |
- intron<-length(intron[,1])/N |
|
24 |
- coding<-length(code[,1])/N |
|
25 |
- UTR3<-length(UTR3[,1])/N |
|
26 |
- UTR5<-length(UTR5[,1])/N |
|
27 |
- intergene<-length(ncoding[,1])/N |
|
28 |
- upstream<-length(upstream5[,1])/N |
|
29 |
- downstream<-length(downstream3[,1])/N |
|
23 |
+ intron<-nrow(intron)/N |
|
24 |
+ coding<-nrow(code)/N |
|
25 |
+ UTR3<-nrow(UTR3)/N |
|
26 |
+ UTR5<-nrow(UTR5)/N |
|
27 |
+ intergene<-nrow(ncoding)/N |
|
28 |
+ upstream<-nrow(upstream5)/N |
|
29 |
+ downstream<-nrow(downstream3)/N |
|
30 | 30 |
|
31 | 31 |
res<-matrix(NA,1,8) |
32 | 32 |
res[1,]<-c(gene.n,coding,intron,UTR3,UTR5,intergene,upstream,downstream) |
... | ... |
@@ -8,8 +8,8 @@ BiocStyle::latex() |
8 | 8 |
#library("knitr") |
9 | 9 |
#opts_chunk$set(tidy=FALSE,dev="pdf",fig.show="hide", |
10 | 10 |
# fig.width=4,fig.height=4.5, |
11 |
- # dpi=300,# increase dpi to avoid pixelised pngs |
|
12 |
- # message=FALSE) |
|
11 |
+ # dpi=300,# increase dpi to avoid pixelised pngs |
|
12 |
+ # message=FALSE) |
|
13 | 13 |
|
14 | 14 |
################################################### |
15 | 15 |
### code chunk number 2: GMRP.Rnw:40-43 |
... | ... |
@@ -46,13 +46,16 @@ data12 <- fmerge(fl1=data1, fl2=data2, ID1="SNP", ID2="SNP", A=".dat1", B=".dat2 |
46 | 46 |
################################################### |
47 | 47 |
### code chunk number 5: disease data load:90-93 |
48 | 48 |
################################################### |
49 |
+ |
|
49 | 50 |
data(cad.data) |
50 | 51 |
#cad <- DataFrame(cad.data) |
51 | 52 |
cad<-cad.data |
52 | 53 |
head(cad) |
54 |
+ |
|
53 | 55 |
################################################### |
54 | 56 |
### code chunk number 6:lipid data load: 95-99 |
55 | 57 |
################################################### |
58 |
+ |
|
56 | 59 |
data(lpd.data) |
57 | 60 |
#lpd <- DataFrame(lpd.data) |
58 | 61 |
lpd<-lpd.data |
... | ... |
@@ -81,7 +84,7 @@ beta.LDL <- lpd$beta.LDL |
81 | 84 |
beta.HDL <- lpd$beta.HDL |
82 | 85 |
beta.TG <- lpd$beta.TG |
83 | 86 |
beta.TC <- lpd$beta.TC |
84 |
-beta <- cbind(beta.LDL, beta.HDL, beta.TG, beta.TC) |
|
87 |
+beta <- cbind(beta.LDL,beta.HDL,beta.TG,beta.TC) |
|
85 | 88 |
|
86 | 89 |
################################################### |
87 | 90 |
# Step3: construct a matrix for allele1: 148-154 |
... | ... |
@@ -91,7 +94,7 @@ a1.LDL <- lpd$A1.LDL |
91 | 94 |
a1.HDL <- lpd$A1.HDL |
92 | 95 |
a1.TG <- lpd$A1.TG |
93 | 96 |
a1.TC <- lpd$A1.TC |
94 |
-alle1 <- cbind(a1.LDL, a1.HDL, a1.TG, a1.TC) |
|
97 |
+alle1 <- cbind(a1.LDL,a1.HDL,a1.TG,a1.TC) |
|
95 | 98 |
|
96 | 99 |
################################################### |
97 | 100 |
# Step4: calculate pcj:157-165 |
... | ... |
@@ -101,7 +104,7 @@ N.LDL <- lpd$N.LDL |
101 | 104 |
N.HDL <- lpd$N.HDL |
102 | 105 |
N.TG <- lpd$N.TG |
103 | 106 |
N.TC <- lpd$N.TC |
104 |
-ss <- cbind(N.LDL, N.HDL, N.TG, N.TC) |
|
107 |
+ss <- cbind(N.LDL,N.HDL,N.TG,N.TC) |
|
105 | 108 |
sm <- apply(ss,1,sum) |
106 | 109 |
pcj <- round(sm/max(sm), 6) |
107 | 110 |
|
... | ... |
@@ -128,13 +131,15 @@ sd <- cbind(sd.LDL, sd.HDL, sd.TG, sd.TC) |
128 | 131 |
################################################### |
129 | 132 |
# Step7: SNPID and position: 186-189 |
130 | 133 |
################################################### |
131 |
-<<Step7, keep.source=TRUE, eval=FALSE>>= |
|
134 |
+ |
|
135 |
+#<<Step7, keep.source=TRUE, eval=FALSE>>= |
|
132 | 136 |
hg19 <- lpd$SNP_hg19.HDL |
133 | 137 |
rsid <- lpd$rsid.HDL |
134 | 138 |
|
135 | 139 |
################################################### |
136 | 140 |
# Step8: separate chromosome number and SNP position: 192-194 |
137 | 141 |
################################################### |
142 |
+ |
|
138 | 143 |
chr<-chrp(hg=hg19) |
139 | 144 |
|
140 | 145 |
################################################### |
... | ... |
@@ -146,9 +151,9 @@ newdata<-cbind(chr,rsid,alle1,as.data.frame(newdata)) |
146 | 151 |
dim(newdata) |
147 | 152 |
|
148 | 153 |
|
149 |
-################################################### |
|
154 |
+########################################################### |
|
150 | 155 |
# Step10: retrieve data from cad and calculate pdj:204-215 |
151 |
-################################################### |
|
156 |
+########################################################### |
|
152 | 157 |
|
153 | 158 |
hg18.d <- cad$chr_pos_b36 |
154 | 159 |
SNP.d <- cad$SNP #SNPID |
... | ... |
@@ -161,22 +166,23 @@ N.ctr <- cad$N_control |
161 | 166 |
N.d <- N.case+N.ctr |
162 | 167 |
freq.case <- N.case/N.d |
163 | 168 |
|
164 |
-################################################### |
|
169 |
+################################################################### |
|
165 | 170 |
# Step11: combine these cad variables into new data sheet:218-222 |
166 |
- ################################################### |
|
171 |
+ ################################################################## |
|
167 | 172 |
|
168 | 173 |
newcad <- cbind(freq.d, beta.d, N.case, N.ctr, freq.case) |
169 | 174 |
newcad <- cbind(hg18.d, SNP.d, a1.d, as.data.frame(newcad)) |
170 | 175 |
dim(newcad) |
171 | 176 |
|
172 |
-################################################### |
|
177 |
+########################################################## |
|
173 | 178 |
#Step12: give name vector of causal variables: 225-227 |
174 |
-################################################### |
|
179 |
+########################################################## |
|
175 | 180 |
varname <-c("CAD", "LDL", "HDL", "TG", "TC") |
176 | 181 |
|
177 | 182 |
################################################### |
178 | 183 |
### Step 13: choose SNPs and make beta table |
179 | 184 |
################################################### |
185 |
+ |
|
180 | 186 |
mybeta <- mktable(cdata=newdata, ddata=newcad, rt="beta", |
181 | 187 |
varname=varname, LG=1, Pv=0.00000005, Pc=0.979, Pd=0.979) |
182 | 188 |
dim(mybeta) |
... | ... |
@@ -188,6 +194,7 @@ head(beta) |
188 | 194 |
################################################### |
189 | 195 |
### load beta data: 256-264 |
190 | 196 |
################################################### |
197 |
+ |
|
191 | 198 |
data(beta.data) |
192 | 199 |
beta.data<-DataFrame(beta.data) |
193 | 200 |
CAD <- beta.data$cad |
... | ... |
@@ -199,6 +206,7 @@ TC <- beta.data$tc |
199 | 206 |
################################################### |
200 | 207 |
### two way scatter: 266-280 |
201 | 208 |
################################################### |
209 |
+ |
|
202 | 210 |
par(mfrow=c(2, 2), mar=c(5.1, 4.1, 4.1, 2.1), oma=c(0, 0, 0, 0)) |
203 | 211 |
plot(LDL,CAD, pch=19, col="blue", xlab="beta of SNPs on LDL", |
204 | 212 |
ylab="beta of SNP on CAD", cex.lab=1.5, cex.axis=1.5, cex.main=2) |
... | ... |
@@ -216,6 +224,7 @@ abline(lm(CAD~TC), col="red", lwd=2) |
216 | 224 |
################################################### |
217 | 225 |
### MR and Path Analysis: 296-300 |
218 | 226 |
################################################### |
227 |
+ |
|
219 | 228 |
data(beta.data) |
220 | 229 |
mybeta <- DataFrame(beta.data) |
221 | 230 |
mod <- CAD~LDL+HDL+TG+TC |
... | ... |
@@ -236,11 +245,13 @@ mypath |
236 | 245 |
################################################### |
237 | 246 |
### load package: diagram: 324-326 |
238 | 247 |
################################################### |
248 |
+ |
|
239 | 249 |
library(diagram) |
240 | 250 |
|
241 | 251 |
################################################### |
242 | 252 |
# make path diagram |
243 | 253 |
################################################### |
254 |
+ |
|
244 | 255 |
pathdiagram(pathdata=mypath, disease="CAD", R2=0.988243, range=c(1:3)) |
245 | 256 |
|
246 | 257 |
pathD<-matrix(NA,4,5) |
... | ... |
@@ -272,8 +283,8 @@ library(graphics) |
272 | 283 |
################################################### |
273 | 284 |
# chromosome SNP position histogram |
274 | 285 |
########################################## |
275 |
-snpPositAnnot(SNPdata=SNP358,SNP_hg19="chr",main="A") |
|
276 | 286 |
|
287 |
+snpPositAnnot(SNPdata=SNP358,SNP_hg19="chr",main="A") |
|
277 | 288 |
|
278 | 289 |
################################################### |
279 | 290 |
# SNP function annotation analysis |
... | ... |
@@ -294,6 +305,7 @@ SNP368[1:10, ] |
294 | 305 |
################################################### |
295 | 306 |
# make 3D Pie |
296 | 307 |
############################################# |
308 |
+ |
|
297 | 309 |
ucscannot(UCSCannot=SNP368,SNPn=368) |
298 | 310 |
|
299 | 311 |
ucscannot(UCSCannot=SNP368,SNPn=368,A=3,B=2,C=1.3,method=2) |
... | ... |
@@ -7,7 +7,7 @@ |
7 | 7 |
\documentclass[a4paper]{article} |
8 | 8 |
|
9 | 9 |
\title{GWAS-based Mendelian Randomization Path Analysis} |
10 |
-\author{Yuan-De Tan \ \ |
|
10 |
+\author{Yuan-De Tan \\ |
|
11 | 11 |
\texttt{tanyuande@gmail.com}} |
12 | 12 |
|
13 | 13 |
<<<style-Sweave, eval=TRUE, echo=FALSE, results=tex>>= |
... | ... |
@@ -19,7 +19,7 @@ BiocStyle::latex() |
19 | 19 |
\maketitle |
20 | 20 |
|
21 | 21 |
\begin{abstract} |
22 |
-\Rpackage{GMRP} can perform analyses of Mendelian randomization (\emph{MR}),correlation, path of causal variables onto disease of interest and \emph{SNP} annotation analysis. \emph{MR} includes \emph{SNP} selection with given criteria and regression analysis of causal variables on the disease to generate beta values of causal variables on the disease. Using the beta vectors, \Rpackage{GMRP} performs correlation and path analyses to construct path diagrams of causal variables to the disease. \Rpackage{GMRP} consists of 8 $R$ functions: \Rfunction{chrp},\Rfunction{fmerge},\Rfunction{mktable},\Rfunction{pathdiagram},\Rfunction{pathdiagram2},\Rfunction{path},\Rfunction{snpPositAnnot},\Rfunction{ucscannot} and 5 datasets: \Robject{beta.data,cad.data,lpd.data,SNP358.data,SNP368annot.data}. \Rfunction{chrp} is used to separate string vector \texttt{hg19} into two numeric vectors: chromosome number and \emph{SNP} chromosome position. Function \Rfunction{fmerge} is used to merge two \emph{GWAS} result datasets into one dataset. Function \Rfunction{mktable} performs \emph{SNP} selection and creates a standard beta table for function \Rfunction{path} to do \emph{MR} and path analyses. Function \Rfunction{pathdiagram} is used to create a path diagram of causal variables onto a given disease or onto outcome. Function \Rfunction{pathdiagram2} can merge two-level \emph{pathdiagrams} into one nested \emph{pathdiagram} where inner \emph{pathdiagram} is a \emph{pathdiagram} of causal variables contributing to outcome and the outside \emph{pathdiagram} is a path diagram of causal variables including outcome onto the disease. The five datasets provide examples for running these functions. \Robject{lpd.data} and \Robject{cad.data} provide an example to create a standard beta dataset for path function to do path analysis and \emph{SNP} data for \emph{SNP} annotation analysis by performing \Rfunction{mktable} and \Rfunction{fmerge}. \Robject{beta.data} are a standard beta dataset for path analysis. \Robject{SNP358.data} provide an example for \Rfunction{snpPositAnnot} to do \emph{SNP} position annotation analysis and \Robject{SNP368annot.data} are for \Rfunction{ucscannot} to perform \emph{SNP} function annotation analysis. |
|
22 |
+\Rpackage{GMRP} can perform analyses of Mendelian randomization (\emph{MR}),correlation, path of causal variables onto disease of study and \emph{SNP} annotation analysis. \emph{MR} includes \emph{SNP} selection with given criteria and regression analysis of causal variables on the disease to generate beta values of causal variables on the disease. Using the beta vectors, \Rpackage{GMRP} performs correlation and path analyses to construct path diagrams of causal variables to the disease. \Rpackage{GMRP} consists of 8 \emph{R} functions: \Rfunction{chrp},\Rfunction{fmerge},\Rfunction{mktable},\Rfunction{pathdiagram},\Rfunction{pathdiagram2},\Rfunction{path},\Rfunction{snpPositAnnot},\Rfunction{ucscannot} and 5 datasets: \Robject{beta.data,cad.data,lpd.data,SNP358.data,SNP368annot.data}. \Rfunction{chrp} is used to separate string vector \texttt{hg19} into two numeric vectors: chromosome number and \emph{SNP} chromosome position. Function \Rfunction{fmerge} is used to merge two \emph{GWAS} result datasets into one dataset. Function \Rfunction{mktable} performs \emph{SNP} selection and creates a standard beta table for function \Rfunction{path} to do \emph{MR} and path analyses. Function \Rfunction{pathdiagram} is used to create a path diagram of causal variables onto a given disease or onto outcome. Function \Rfunction{pathdiagram2} can merge two-level \emph{pathdiagrams} into one nested \emph{pathdiagram} where inner \emph{pathdiagram} is a \emph{pathdiagram} of causal variables contributing to outcome and the outside \emph{pathdiagram} is a path diagram of causal variables including outcome onto the disease. The five datasets provide examples for running these functions. \Robject{lpd.data} and \Robject{cad.data} provide an example to create a standard beta dataset for path function to do path analysis and \emph{SNP} data for \emph{SNP} annotation analysis by performing \Rfunction{mktable} and \Rfunction{fmerge}. \Robject{beta.data} are a standard beta dataset for path analysis. \Robject{SNP358.data} provide an example for \Rfunction{snpPositAnnot} to do \emph{SNP} position annotation analysis and \Robject{SNP368annot.data} are for \Rfunction{ucscannot} to perform \emph{SNP} function annotation analysis. |
|
23 | 23 |
\end{abstract} |
24 | 24 |
|
25 | 25 |
\tableofcontents |
... | ... |
@@ -27,9 +27,10 @@ BiocStyle::latex() |
27 | 27 |
\section{Introduction} |
28 | 28 |
As an example of human disease, coronary artery disease (\emph{CAD}) is one of the causes leading to death and infirmity worldwide~\cite{Murray1997}. Low-density lipoprotein cholesterol (\emph{LDL}) and triglycerides (\emph{TG}) are viewed as risk factors causing \emph{CAD}. In epidemiological studies, plasma concentrations of increasing \emph{TG} and \emph{LDL} and deceasing high-density lipoprotein cholesterol (\emph{HDL} ) have been observed to be associated with risk for \emph{CAD}~\cite{Angelantonio2009, Sarwar2007}. However, from observational studies, one could not directly infer that these cholesterol concentrations in plasma are risk factors causing \emph{CAD}~\cite{Do2013, Sarwar2007, Sheehan2007, Voight2012}. A big limitation of observational studies is to difficultly distinguish between causal and spurious associations due to confounding~\cite{Pichler2013}. An efficient approach to overcome this limitation is \textit{Mendelian Randomization} (\emph{MR}) analysis ~\cite{Sheehan2007, Smith2003} where genetic variants are used as instrumental variables. For this reason, many investigators tried to use genetic variants to assess causality and estimate the causal effects on the diseases. |
29 | 29 |
|
30 |
-\emph{MR} analysis can perfectly exclude confounding factors associated with disease. However, when we expand one causal variable to many, \emph{MR} analysis becomes challenged and complicated because the genetic variant would have additional effects on the other risk factors, which violate assumption of no pleiotropy. An unknown genetic variant in \emph{MR} analysis possibly provides a false instrument for causal effect assessment of risk factors on the disease. The reason is that if this genetic variant is in \emph{LD} with another gene that is not used but has effect on the disease of study~\cite{Sheehan2007, Sheehan2010}. It then violates the third assumption. These two problems can be addressed by using multiple instrumental variables. For this reason, Do \emph{et al} (2013)developed statistic approach to address this issue~\cite{Do2013}. However, method of Do \emph{et al} ~\cite{Do2013} cannot disentangle correlation effects among the multiple undefined risk factors on the disease of study. The beta values obtained from regression analyses are not direct causal effects because their effects are entangled with correlations among these undefined risk factors. |
|
30 |
+\emph{MR} analysis can perfectly exclude confounding factors associated with disease. However, when we expand one causal variable to many, \emph{MR} analysis becomes challenged and complicated because the genetic variant would have additional effects on the other risk factors, which violate assumption of no pleiotropy. An unknown genetic variant in \emph{MR} analysis possibly provides a false instrument for causal effect assessment of risk factors on the disease. The reason is that |
|
31 |
+if this genetic variant is in \emph{LD} with another gene that is not used but has effect on the disease of study~\cite{Sheehan2007, Sheehan2010}. It then violates the third assumption. These two problems can be addressed by using multiple instrumental variables. For this reason, Do \emph{et al}(2013)developed statistic approach to address this issue~\cite{Do2013}. However, the method of Do \emph{et al} ~\cite{Do2013} cannot disentangle correlation effects among the multiple undefined risk factors on the disease of study. The beta values obtained from regression analyses are not direct causal effects because their effects are entangled with correlations among these undefined risk factors. |
|
31 | 32 |
|
32 |
-The best way to address the entanglement of multiple causal effects is path analysis that was developed by Wright~\cite{Wright1921, Wright1934}. This is because path analysis can dissect beta values into direct and indirect effects of causal variables on the disease. However, path analysis has not broadly been applied to diseases because diseases are usually binary variable. The method of Do ~\cite{Do2013} makes it possible to apply path analysis to disentangle causal effects of undefined risk factors on diseases. For doing so, we here provide \textbf {R package} \Rpackage{GMRP} (\emph{GWAS}-based \emph{MR} and \emph{path analysis}) to solve the above issues. |
|
33 |
+The best way to address the entanglement of multiple causal effects is path analysis that was developed by Wright~\cite{Wright1921, Wright1934}. This is because path analysis can dissect beta values into direct and indirect effects of causal variables on the disease. However, path analysis has not broadly been applied to diseases because diseases are usually binary variable. The method of Do ~\cite{Do2013} makes it possible to apply path analysis to disentangle causal effects of undefined risk factors on diseases. For doing so, we here provide \textbf{R package} \Rpackage{GMRP} (\emph{GWAS}-based \emph{MR} and \emph{path analysis}) to solve the above issues. |
|
33 | 34 |
|
34 | 35 |
This vignette is intended to give a rapid introduction to the commands used in implementing \emph{MR} analysis, regression analysis, and path analysis, including \emph{SNP} annotation and chromosomal position analysis by means of the \Rpackage{GMRP} package. |
35 | 36 |
|
... | ... |
@@ -126,7 +127,7 @@ We use function \Rfunction{mktable} to choose \emph{SNP}s and make a standard be |
126 | 127 |
|
127 | 128 |
The standard beta table will be created via 15 steps: |
128 | 129 |
|
129 |
-Step1: calculate $pvj$ |
|
130 |
+Step1: calculate $pv_j$ |
|
130 | 131 |
|
131 | 132 |
<<Step1,keep.source=TRUE, eval=FALSE>>= |
132 | 133 |
pvalue.LDL <- lpd$P.value.LDL |
... | ... |
@@ -155,7 +156,7 @@ a1.TC <- lpd$A1.TC |
155 | 156 |
alle1 <- cbind(a1.LDL, a1.HDL, a1.TG, a1.TC) |
156 | 157 |
@ |
157 | 158 |
|
158 |
-Step4: give sample sizes of causal variables and calculate $pcj$ |
|
159 |
+Step4: give sample sizes of causal variables and calculate $pc_j$ |
|
159 | 160 |
<<Step4, keep.source=TRUE, eval=FALSE>>= |
160 | 161 |
N.LDL <- lpd$N.LDL |
161 | 162 |
N.HDL <- lpd$N.HDL |
... | ... |
@@ -175,7 +176,7 @@ freq.TC<-lpd$Freq.A1.1000G.EUR.TC |
175 | 176 |
freq<-cbind(freq.LDL,freq.HDL,freq.TG,freq.TC) |
176 | 177 |
@ |
177 | 178 |
|
178 |
-Step6: construct matrix for $sd$ of each causal variable (here $sd$ is not specific to \emph{SNP} $j$). The following $sd$ values for \emph{LDL, HDL,TG} and \emph{TC} were means of standard deviations of these lipoprotein concentrations in plasma over 63 studies from Willer \emph{et al}~\cite{Willer2013}. |
|
179 |
+Step6: construct matrix for $sd$ of each causal variable (here $sd$ is not specific to $SNP_j$). The following $sd$ values for \emph{LDL, HDL,TG} and \emph{TC} were means of standard deviations of these lipoprotein concentrations in plasma over 63 studies from Willer \emph{et al}~\cite{Willer2013}. |
|
179 | 180 |
<<Step6, keep.source=TRUE, eval=FALSE>>= |
180 | 181 |
sd.LDL <- rep(37.42, length(pvj)) |
181 | 182 |
sd.HDL <- rep(14.87, length(pvj)) |
... | ... |
@@ -202,7 +203,7 @@ newdata<-cbind(chr,rsid,alle1,as.data.frame(newdata)) |
202 | 203 |
dim(newdata) |
203 | 204 |
@ |
204 | 205 |
|
205 |
-Step10: retrieve data from \Robject{cad} and calculate $pdj$ and frequency of coronary artery disease: \textit{freq.case} in population: |
|
206 |
+Step10: retrieve data from \Robject{cad} and calculate $pd_j$ and frequency of coronary artery disease: \textit{freq.case} in population: |
|
206 | 207 |
<<Step10,keep.source=TRUE, eval=FALSE>>= |
207 | 208 |
hg18.d <- cad$chr_pos_b36 |
208 | 209 |
SNP.d <- cad$SNP #SNPID |