Browse code

Updating MBttest.Rnw,GMRP.R,ucscannot.R,DESCRIPTION files again

y.tan authored on 19/05/2018 21:30:13
Showing 4 changed files

... ...
@@ -2,7 +2,7 @@ Package: GMRP
2 2
 Type: Package
3 3
 Title: GWAS-based Mendelian Randomization and Path Analyses
4 4
 Version: 1.8.0
5
-Date: 2018-04-15
5
+Date: 2018-05-19
6 6
 Author: Yuan-De Tan
7 7
 Maintainer: Yuan-De Tan <tanyuande@gmail.com>
8 8
 Description: Perform Mendelian randomization analysis of multiple SNPs
... ...
@@ -20,13 +20,13 @@ stop("No Symbol found in the data")
20 20
  upstream5<-subset(UCSCannot,function_unit=="5upstream")
21 21
  downstream3<-subset(UCSCannot,function_unit=="3downstream")
22 22
 
23
- intron<-length(intron[,1])/N
24
- coding<-length(code[,1])/N
25
- UTR3<-length(UTR3[,1])/N
26
- UTR5<-length(UTR5[,1])/N
27
- intergene<-length(ncoding[,1])/N
28
- upstream<-length(upstream5[,1])/N
29
- downstream<-length(downstream3[,1])/N
23
+ intron<-nrow(intron)/N
24
+ coding<-nrow(code)/N
25
+ UTR3<-nrow(UTR3)/N
26
+ UTR5<-nrow(UTR5)/N
27
+ intergene<-nrow(ncoding)/N
28
+ upstream<-nrow(upstream5)/N
29
+ downstream<-nrow(downstream3)/N
30 30
 
31 31
  res<-matrix(NA,1,8)
32 32
  res[1,]<-c(gene.n,coding,intron,UTR3,UTR5,intergene,upstream,downstream)
... ...
@@ -8,8 +8,8 @@ BiocStyle::latex()
8 8
 #library("knitr")
9 9
 #opts_chunk$set(tidy=FALSE,dev="pdf",fig.show="hide",
10 10
  #              fig.width=4,fig.height=4.5,
11
-  #             dpi=300,# increase dpi to avoid pixelised pngs
12
-  #             message=FALSE)
11
+ #             dpi=300,# increase dpi to avoid pixelised pngs
12
+ #             message=FALSE)
13 13
                
14 14
 ###################################################
15 15
 ### code chunk number 2: GMRP.Rnw:40-43
... ...
@@ -46,13 +46,16 @@ data12 <- fmerge(fl1=data1, fl2=data2, ID1="SNP", ID2="SNP", A=".dat1", B=".dat2
46 46
 ###################################################
47 47
 ### code chunk number 5: disease data load:90-93
48 48
 ###################################################
49
+
49 50
 data(cad.data)
50 51
 #cad <- DataFrame(cad.data)
51 52
 cad<-cad.data
52 53
 head(cad)
54
+
53 55
 ###################################################
54 56
 ### code chunk number 6:lipid data load: 95-99
55 57
 ###################################################
58
+
56 59
 data(lpd.data)
57 60
 #lpd <- DataFrame(lpd.data)
58 61
 lpd<-lpd.data
... ...
@@ -81,7 +84,7 @@ beta.LDL <- lpd$beta.LDL
81 84
 beta.HDL <- lpd$beta.HDL
82 85
 beta.TG <- lpd$beta.TG
83 86
 beta.TC <- lpd$beta.TC
84
-beta <- cbind(beta.LDL, beta.HDL, beta.TG, beta.TC)
87
+beta <- cbind(beta.LDL,beta.HDL,beta.TG,beta.TC)
85 88
 
86 89
 ###################################################
87 90
 #     Step3: construct a matrix for allele1: 148-154
... ...
@@ -91,7 +94,7 @@ a1.LDL <- lpd$A1.LDL
91 94
 a1.HDL <- lpd$A1.HDL
92 95
 a1.TG <- lpd$A1.TG
93 96
 a1.TC <- lpd$A1.TC
94
-alle1 <- cbind(a1.LDL, a1.HDL, a1.TG, a1.TC)
97
+alle1 <- cbind(a1.LDL,a1.HDL,a1.TG,a1.TC)
95 98
 
96 99
 ###################################################
97 100
 # Step4:  calculate pcj:157-165
... ...
@@ -101,7 +104,7 @@ N.LDL <- lpd$N.LDL
101 104
 N.HDL <- lpd$N.HDL
102 105
 N.TG <- lpd$N.TG
103 106
 N.TC <- lpd$N.TC
104
-ss <- cbind(N.LDL, N.HDL, N.TG, N.TC)
107
+ss <- cbind(N.LDL,N.HDL,N.TG,N.TC)
105 108
 sm <- apply(ss,1,sum)
106 109
 pcj <- round(sm/max(sm), 6)
107 110
 
... ...
@@ -128,13 +131,15 @@ sd <- cbind(sd.LDL, sd.HDL, sd.TG, sd.TC)
128 131
 ###################################################
129 132
 # Step7: SNPID and position: 186-189
130 133
 ###################################################
131
-<<Step7, keep.source=TRUE, eval=FALSE>>=
134
+
135
+#<<Step7, keep.source=TRUE, eval=FALSE>>=
132 136
 hg19 <- lpd$SNP_hg19.HDL
133 137
 rsid <- lpd$rsid.HDL
134 138
 
135 139
 ###################################################
136 140
 # Step8: separate chromosome number and SNP position: 192-194
137 141
 ###################################################
142
+
138 143
 chr<-chrp(hg=hg19)
139 144
 
140 145
 ###################################################
... ...
@@ -146,9 +151,9 @@ newdata<-cbind(chr,rsid,alle1,as.data.frame(newdata))
146 151
 dim(newdata)
147 152
 
148 153
 
149
-###################################################
154
+###########################################################
150 155
 # Step10: retrieve data from cad and calculate pdj:204-215
151
-###################################################
156
+###########################################################
152 157
 
153 158
 hg18.d <- cad$chr_pos_b36
154 159
 SNP.d <- cad$SNP #SNPID
... ...
@@ -161,22 +166,23 @@ N.ctr <- cad$N_control
161 166
 N.d <- N.case+N.ctr
162 167
 freq.case <- N.case/N.d
163 168
 
164
-###################################################
169
+###################################################################
165 170
  # Step11: combine these cad variables into new data sheet:218-222
166
- ###################################################
171
+ ##################################################################
167 172
 
168 173
 newcad <- cbind(freq.d, beta.d, N.case, N.ctr, freq.case)
169 174
 newcad <- cbind(hg18.d, SNP.d, a1.d, as.data.frame(newcad))
170 175
 dim(newcad)
171 176
 
172
-################################################### 
177
+########################################################## 
173 178
 #Step12: give name vector of causal variables: 225-227
174
-###################################################
179
+##########################################################
175 180
 varname <-c("CAD", "LDL", "HDL", "TG", "TC")
176 181
 
177 182
 ###################################################
178 183
 ### Step 13: choose SNPs and make beta table
179 184
 ###################################################
185
+
180 186
 mybeta <- mktable(cdata=newdata, ddata=newcad, rt="beta", 
181 187
 varname=varname, LG=1, Pv=0.00000005, Pc=0.979, Pd=0.979)
182 188
 dim(mybeta)
... ...
@@ -188,6 +194,7 @@ head(beta)
188 194
 ###################################################
189 195
 ### load beta data: 256-264
190 196
 ###################################################
197
+
191 198
 data(beta.data)
192 199
 beta.data<-DataFrame(beta.data)
193 200
 CAD <- beta.data$cad
... ...
@@ -199,6 +206,7 @@ TC <- beta.data$tc
199 206
 ###################################################
200 207
 ### two way scatter: 266-280
201 208
 ###################################################
209
+
202 210
 par(mfrow=c(2, 2), mar=c(5.1, 4.1, 4.1, 2.1), oma=c(0, 0, 0, 0))
203 211
 plot(LDL,CAD, pch=19, col="blue", xlab="beta of SNPs on LDL", 
204 212
 ylab="beta of SNP on CAD", cex.lab=1.5, cex.axis=1.5, cex.main=2)
... ...
@@ -216,6 +224,7 @@ abline(lm(CAD~TC), col="red", lwd=2)
216 224
 ###################################################
217 225
 ### MR and Path Analysis: 296-300
218 226
 ###################################################
227
+
219 228
 data(beta.data)
220 229
 mybeta <- DataFrame(beta.data)
221 230
 mod <- CAD~LDL+HDL+TG+TC
... ...
@@ -236,11 +245,13 @@ mypath
236 245
 ###################################################
237 246
 ### load package: diagram: 324-326
238 247
 ###################################################
248
+
239 249
 library(diagram)
240 250
 
241 251
 ###################################################
242 252
 # make path diagram
243 253
 ###################################################
254
+
244 255
 pathdiagram(pathdata=mypath, disease="CAD", R2=0.988243, range=c(1:3))
245 256
 
246 257
 pathD<-matrix(NA,4,5)
... ...
@@ -272,8 +283,8 @@ library(graphics)
272 283
 ###################################################
273 284
 # chromosome SNP position histogram
274 285
 ##########################################
275
-snpPositAnnot(SNPdata=SNP358,SNP_hg19="chr",main="A")
276 286
 
287
+snpPositAnnot(SNPdata=SNP358,SNP_hg19="chr",main="A")
277 288
 
278 289
 ###################################################
279 290
 # SNP function annotation analysis
... ...
@@ -294,6 +305,7 @@ SNP368[1:10, ]
294 305
 ###################################################
295 306
 # make 3D Pie
296 307
 #############################################
308
+
297 309
 ucscannot(UCSCannot=SNP368,SNPn=368)
298 310
 
299 311
 ucscannot(UCSCannot=SNP368,SNPn=368,A=3,B=2,C=1.3,method=2)
... ...
@@ -7,7 +7,7 @@
7 7
 \documentclass[a4paper]{article}
8 8
 
9 9
 \title{GWAS-based Mendelian Randomization Path Analysis}
10
-\author{Yuan-De Tan \ \
10
+\author{Yuan-De Tan \\
11 11
 \texttt{tanyuande@gmail.com}}
12 12
 
13 13
 <<<style-Sweave, eval=TRUE, echo=FALSE, results=tex>>=
... ...
@@ -19,7 +19,7 @@ BiocStyle::latex()
19 19
 \maketitle
20 20
 
21 21
 \begin{abstract}
22
-\Rpackage{GMRP} can perform analyses of Mendelian randomization (\emph{MR}),correlation, path of causal variables onto disease of interest and \emph{SNP} annotation analysis. \emph{MR} includes \emph{SNP} selection with given criteria and regression analysis of causal variables on the disease to generate beta values of causal variables on the disease. Using the beta vectors, \Rpackage{GMRP} performs correlation and path analyses to construct  path diagrams of causal variables to the disease. \Rpackage{GMRP} consists of 8 $R$ functions: \Rfunction{chrp},\Rfunction{fmerge},\Rfunction{mktable},\Rfunction{pathdiagram},\Rfunction{pathdiagram2},\Rfunction{path},\Rfunction{snpPositAnnot},\Rfunction{ucscannot} and 5 datasets: \Robject{beta.data,cad.data,lpd.data,SNP358.data,SNP368annot.data}. \Rfunction{chrp} is used to separate string vector \texttt{hg19} into two numeric vectors: chromosome number and \emph{SNP} chromosome position. Function \Rfunction{fmerge} is used to merge two \emph{GWAS} result datasets into one dataset. Function \Rfunction{mktable} performs \emph{SNP} selection and creates a standard beta table for function \Rfunction{path} to do \emph{MR} and path analyses. Function \Rfunction{pathdiagram} is used to create a path diagram of causal variables onto a given disease or onto outcome. Function \Rfunction{pathdiagram2} can merge two-level \emph{pathdiagrams} into one nested \emph{pathdiagram} where inner \emph{pathdiagram} is a \emph{pathdiagram} of causal variables contributing to outcome and the outside \emph{pathdiagram} is a path diagram of causal variables including outcome onto the disease. The five datasets provide examples for running these functions. \Robject{lpd.data} and \Robject{cad.data} provide an example to create a standard beta dataset for path function to do path analysis and \emph{SNP} data for \emph{SNP} annotation analysis by performing \Rfunction{mktable} and \Rfunction{fmerge}. \Robject{beta.data} are a standard beta dataset for path analysis. \Robject{SNP358.data} provide an example for \Rfunction{snpPositAnnot} to do \emph{SNP} position annotation analysis and \Robject{SNP368annot.data} are for \Rfunction{ucscannot} to perform \emph{SNP} function annotation analysis.
22
+\Rpackage{GMRP} can perform analyses of Mendelian randomization (\emph{MR}),correlation, path of causal variables onto disease of study and \emph{SNP} annotation analysis. \emph{MR} includes \emph{SNP} selection with given criteria and regression analysis of causal variables on the disease to generate beta values of causal variables on the disease. Using the beta vectors, \Rpackage{GMRP} performs correlation and path analyses to construct  path diagrams of causal variables to the disease. \Rpackage{GMRP} consists of 8 \emph{R} functions: \Rfunction{chrp},\Rfunction{fmerge},\Rfunction{mktable},\Rfunction{pathdiagram},\Rfunction{pathdiagram2},\Rfunction{path},\Rfunction{snpPositAnnot},\Rfunction{ucscannot} and 5 datasets: \Robject{beta.data,cad.data,lpd.data,SNP358.data,SNP368annot.data}. \Rfunction{chrp} is used to separate string vector \texttt{hg19} into two numeric vectors: chromosome number and \emph{SNP} chromosome position. Function \Rfunction{fmerge} is used to merge two \emph{GWAS} result datasets into one dataset. Function \Rfunction{mktable} performs \emph{SNP} selection and creates a standard beta table for function \Rfunction{path} to do \emph{MR} and path analyses. Function \Rfunction{pathdiagram} is used to create a path diagram of causal variables onto a given disease or onto outcome. Function \Rfunction{pathdiagram2} can merge two-level \emph{pathdiagrams} into one nested \emph{pathdiagram} where inner \emph{pathdiagram} is a \emph{pathdiagram} of causal variables contributing to outcome and the outside \emph{pathdiagram} is a path diagram of causal variables including outcome onto the disease. The five datasets provide examples for running these functions. \Robject{lpd.data} and \Robject{cad.data} provide an example to create a standard beta dataset for path function to do path analysis and \emph{SNP} data for \emph{SNP} annotation analysis by performing \Rfunction{mktable} and \Rfunction{fmerge}. \Robject{beta.data} are a standard beta dataset for path analysis. \Robject{SNP358.data} provide an example for \Rfunction{snpPositAnnot} to do \emph{SNP} position annotation analysis and \Robject{SNP368annot.data} are for \Rfunction{ucscannot} to perform \emph{SNP} function annotation analysis.
23 23
 \end{abstract}
24 24
 
25 25
 \tableofcontents
... ...
@@ -27,9 +27,10 @@ BiocStyle::latex()
27 27
 \section{Introduction}
28 28
 As an example of human disease, coronary artery disease (\emph{CAD}) is one of the causes leading to death and infirmity worldwide~\cite{Murray1997}. Low-density lipoprotein cholesterol (\emph{LDL}) and triglycerides (\emph{TG}) are viewed as risk factors causing \emph{CAD}.  In epidemiological studies, plasma concentrations of increasing \emph{TG} and \emph{LDL} and deceasing high-density lipoprotein cholesterol (\emph{HDL} ) have been observed to be associated with risk for \emph{CAD}~\cite{Angelantonio2009, Sarwar2007}. However, from observational studies, one could not directly infer that these cholesterol concentrations  in plasma are risk factors causing \emph{CAD}~\cite{Do2013, Sarwar2007, Sheehan2007, Voight2012}. A big limitation of observational studies is to difficultly distinguish between causal and spurious associations due to confounding~\cite{Pichler2013}. An efficient approach to overcome this limitation is \textit{Mendelian Randomization} (\emph{MR}) analysis ~\cite{Sheehan2007, Smith2003} where genetic variants are used as instrumental variables. For this reason, many investigators tried to use genetic variants to assess causality and estimate the causal effects on the diseases.    
29 29
 
30
-\emph{MR} analysis can perfectly exclude confounding factors associated with disease.  However, when we expand one causal variable to many, \emph{MR} analysis becomes challenged and complicated because the genetic variant would have additional effects on the other risk factors, which violate assumption of no pleiotropy. An unknown genetic variant in \emph{MR} analysis possibly provides a false instrument for causal effect assessment of risk factors on the disease. The reason is that if this genetic variant is in \emph{LD} with another gene that is not used but has effect on the disease of study~\cite{Sheehan2007, Sheehan2010}. It then violates the third assumption.  These two problems can be addressed by using multiple instrumental variables.  For this reason, Do \emph{et al} (2013)developed statistic approach to address this issue~\cite{Do2013}. However, method of Do \emph{et al} ~\cite{Do2013} cannot disentangle correlation effects among the multiple undefined risk factors on the disease of study.  The beta values obtained from regression analyses are not direct causal effects because their effects are entangled with correlations among these undefined risk factors. 
30
+\emph{MR} analysis can perfectly exclude confounding factors associated with disease.  However, when we expand one causal variable to many, \emph{MR} analysis becomes challenged and complicated because the genetic variant would have additional effects on the other risk factors, which violate assumption of no pleiotropy. An unknown genetic variant in \emph{MR} analysis possibly provides a false instrument for causal effect assessment of risk factors on the disease. The reason is that
31
+if this genetic variant is in \emph{LD} with another gene that is not used but has effect on the disease of study~\cite{Sheehan2007, Sheehan2010}. It then violates the third assumption.  These two problems can be addressed by using multiple instrumental variables.  For this reason, Do \emph{et al}(2013)developed statistic approach to address this issue~\cite{Do2013}. However, the method of Do \emph{et al} ~\cite{Do2013} cannot disentangle correlation effects among the multiple undefined risk factors on the disease of study.  The beta values obtained from regression analyses are not direct causal effects because their effects are entangled with correlations among these undefined risk factors. 
31 32
 
32
-The best way to address the entanglement of multiple causal effects is path analysis that was developed by Wright~\cite{Wright1921, Wright1934}. This is because path analysis can dissect beta values into direct and indirect effects of causal variables on the disease. However, path analysis has not broadly been applied to diseases because diseases are usually binary variable. The method of Do ~\cite{Do2013} makes it possible to apply path analysis to disentangle causal effects of undefined risk factors on diseases. For doing so, we here provide \textbf {R package} \Rpackage{GMRP} (\emph{GWAS}-based \emph{MR} and \emph{path analysis}) to solve the above issues.
33
+The best way to address the entanglement of multiple causal effects is path analysis that was developed by Wright~\cite{Wright1921, Wright1934}. This is because path analysis can dissect beta values into direct and indirect effects of causal variables on the disease. However, path analysis has not broadly been applied to diseases because diseases are usually binary variable. The method of Do ~\cite{Do2013} makes it possible to apply path analysis to disentangle causal effects of undefined risk factors on diseases. For doing so, we here provide \textbf{R package} \Rpackage{GMRP} (\emph{GWAS}-based \emph{MR} and \emph{path analysis}) to solve the above issues.
33 34
 
34 35
 This vignette is intended to give a rapid introduction to the commands used in implementing \emph{MR} analysis, regression analysis, and path analysis, including \emph{SNP} annotation and chromosomal position analysis by means of the \Rpackage{GMRP} package. 
35 36
 
... ...
@@ -126,7 +127,7 @@ We use function \Rfunction{mktable} to choose \emph{SNP}s and make a standard be
126 127
  
127 128
 The standard beta table will be created via 15 steps:
128 129
 
129
-Step1: calculate $pvj$
130
+Step1: calculate $pv_j$
130 131
 
131 132
 <<Step1,keep.source=TRUE, eval=FALSE>>=
132 133
 pvalue.LDL <- lpd$P.value.LDL
... ...
@@ -155,7 +156,7 @@ a1.TC <- lpd$A1.TC
155 156
 alle1 <- cbind(a1.LDL, a1.HDL, a1.TG, a1.TC)
156 157
 @
157 158
 
158
-Step4: give sample sizes of causal variables and calculate $pcj$
159
+Step4: give sample sizes of causal variables and calculate $pc_j$
159 160
 <<Step4, keep.source=TRUE, eval=FALSE>>=
160 161
 N.LDL <- lpd$N.LDL
161 162
 N.HDL <- lpd$N.HDL
... ...
@@ -175,7 +176,7 @@ freq.TC<-lpd$Freq.A1.1000G.EUR.TC
175 176
 freq<-cbind(freq.LDL,freq.HDL,freq.TG,freq.TC)
176 177
 @
177 178
  
178
-Step6: construct matrix for $sd$ of each causal variable (here $sd$ is not specific to \emph{SNP} $j$). The following $sd$ values for \emph{LDL, HDL,TG} and \emph{TC} were means of standard deviations of these lipoprotein concentrations in plasma over 63 studies from Willer \emph{et al}~\cite{Willer2013}. 
179
+Step6: construct matrix for $sd$ of each causal variable (here $sd$ is not specific to $SNP_j$). The following $sd$ values for \emph{LDL, HDL,TG} and \emph{TC} were means of standard deviations of these lipoprotein concentrations in plasma over 63 studies from Willer \emph{et al}~\cite{Willer2013}. 
179 180
 <<Step6, keep.source=TRUE, eval=FALSE>>=
180 181
 sd.LDL <- rep(37.42, length(pvj))
181 182
 sd.HDL <- rep(14.87, length(pvj))
... ...
@@ -202,7 +203,7 @@ newdata<-cbind(chr,rsid,alle1,as.data.frame(newdata))
202 203
 dim(newdata)
203 204
 @
204 205
  
205
-Step10: retrieve data from \Robject{cad} and calculate $pdj$ and frequency of coronary artery disease: \textit{freq.case} in population:
206
+Step10: retrieve data from \Robject{cad} and calculate $pd_j$ and frequency of coronary artery disease: \textit{freq.case} in population:
206 207
 <<Step10,keep.source=TRUE, eval=FALSE>>=
207 208
 hg18.d <- cad$chr_pos_b36
208 209
 SNP.d <- cad$SNP #SNPID