Browse code

first commit

y.tan authored on 21/04/2018 19:57:54
Showing 19 changed files

... ...
@@ -1,9 +1,9 @@
1 1
 Package: GMRP
2 2
 Type: Package
3 3
 Title: GWAS-based Mendelian Randomization and Path Analyses
4
-Version: 1.7.1
5
-Date: 2015-11-04
6
-Author: Yuan-De Tan and Dajiang Liu
4
+Version: 1.7.2
5
+Date: 2018-04-15
6
+Author: Yuan-De Tan
7 7
 Maintainer: Yuan-De Tan <tanyuande@gmail.com>
8 8
 Description: Perform Mendelian randomization analysis of multiple SNPs
9 9
         to determine risk factors causing disease of study and to
... ...
@@ -1,11 +1,11 @@
1 1
 exportPattern("^[[:alpha:]]+")
2
-export(fmerg)
2
+export(fmerge)
3 3
 export(chrp)
4 4
 export(mktable)
5 5
 export(path)
6 6
 export(pathdiagram)
7 7
 export(pathdiagram2)
8
-export(snpposit)
8
+export(snpPositAnnot)
9 9
 export(ucscannot)
10 10
 import(graphics)
11 11
 import(utils)
12 12
similarity index 99%
13 13
rename from R/fmerg.R
14 14
rename to R/fmerge.R
... ...
@@ -1,4 +1,4 @@
1
-fmerg <-
1
+fmerge <-
2 2
 function(fl1,fl2,ID1,ID2,A="",B="",method="No"){
3 3
 
4 4
 try (if(is.null(dim(fl1))|is.null(dim(fl2))) 
... ...
@@ -29,7 +29,7 @@ pdj<-ND/max(ND)
29 29
 newcdat<-as.data.frame(newcdat)
30 30
 
31 31
 ddata<-cbind(ddata[idx,],pdj)
32
-newcd<-fmerg(fl1=newcdat,fl2=ddata,ID1="rsid",ID2="SNP.d",A="",B="",method="no")
32
+newcd<-fmerge(fl1=newcdat,fl2=ddata,ID1="rsid",ID2="SNP.d",A="",B="",method="no")
33 33
 sbnewdat<-subset(newcd,pdj>=Pd)
34 34
 sbnewdat<-as.data.frame(sbnewdat)
35 35
 
36 36
new file mode 100644
... ...
@@ -0,0 +1,105 @@
1
+snpPositAnnot<-function(SNPdata,SNP_hg19="chr",main){
2
+#This function provide mean length of interval between SNP positions on each chromosome
3
+#and number of SNPs on each chromosome and ouput numbers of SNPs with interval length>=LG.
4
+#The data contain hg19 or chr and position. If SNP_hg19 == chr, then data must contain chr column and posit column
5
+#if SNP_hg19 != chr, then SNP_hg19 must be hg19 or hg18 that contain a column of "chrx:xxxxxxx". 
6
+
7
+if(SNP_hg19!="chr"){
8
+hg19<-as.character(SNPdata$SNP_hg19)
9
+#print(hg19[1:10])
10
+chromp<-chrp(hg19)
11
+#print(chromp[1:6,])
12
+}else{
13
+chr<-SNPdata$chr
14
+if(length(chr)==0){
15
+stop("no data")	
16
+}
17
+SNPdata<-SNPdata[order(chr),]
18
+#print(SNPdata[1:3,])
19
+chrn<-as.numeric(SNPdata$chr)
20
+posit<-as.numeric(SNPdata$posit)
21
+chromp<-cbind(chrn,posit)
22
+
23
+#print(CHR)
24
+}
25
+chr<-chromp[,1]
26
+CHR<-unique(chr)
27
+nch<-length(CHR) # number of chromosomes
28
+
29
+#print(nch)
30
+
31
+meanl<-m<-md<-rep(NA,nch)
32
+
33
+mdist<-function(x){
34
+
35
+y<-as.numeric(x[,2])
36
+#print(y)
37
+nr<-length(y)
38
+y<-sort(y)
39
+if(nr==0){
40
+lgdist<-0	
41
+}else if(nr==1){
42
+lgdist<-log(y[1])		
43
+	}else{
44
+dist<-rep(NA,nr-1)
45
+for(i in 1:nr-1){	
46
+dist[i]<-(y[(i+1)]-y[i])
47
+}
48
+lgdist<-log(min(dist))
49
+#print(dist)
50
+}
51
+return(lgdist)		
52
+	
53
+}
54
+
55
+m<-md<-snpn<-rep(0,nch)
56
+for(i in 1:nch){
57
+chrmp<-subset(chromp,chrn==CHR[i])
58
+snpn[i]<-nrow(chrmp)
59
+if(snpn[i]!=0){
60
+#	m[i]<-0
61
+#	md[i]<-0}else{
62
+
63
+#m[i]<-nsnpchr(subset(chromp,chrn==i))
64
+
65
+md[i]<-mdist(subset(chromp,chrn==CHR[i]))
66
+
67
+}
68
+
69
+}
70
+
71
+
72
+#mlength<-meanl[1:20]
73
+colors<-c("red","lightsalmon","lightseagreen","mediumspringgreen","mediumorchid4","mediumpurple4","lightskyblue4","mediumseagreen","deeppink",
74
+"mediumvioletred","firebrick","magenta","navyblue","lightsteelblue2","navajowhite","blue","maroon4","orange2","purple","darkorchid1",
75
+"black","cyan")
76
+
77
+chrcol<-rep(NA,nch)
78
+for(i in 1:nch){
79
+ chrcol[i]<-colors[i]	
80
+}
81
+
82
+par(oma=c(2,1,2,1))
83
+bp<-barplot(md,width=2,ylab="log least length (kbp) of intervals between SNPs", names.arg =paste("chr",CHR,sep=""),col=chrcol,ylim=c(0,30),
84
+main=main,cex.lab=2.0,cex.names=2,cex.main=2.5,cex.axis=1.5)
85
+
86
+
87
+# space between lab and column
88
+YY<-rep(NA,nch)
89
+for(i in 1:nch){
90
+	if(!is.na(md[i])){
91
+if(md[i]>=10){	
92
+  YY[i]<-md[i]+1.5
93
+   }else if(md[i]<10&md[i]>0){
94
+    YY[i]<-md[i]+1.5
95
+    }else if(md[i]==0){
96
+   YY[i]<-2	
97
+   }
98
+  }
99
+}
100
+#text(x=bp,y=YY,labels=n,col="black")
101
+# for data SNP247
102
+text(x=bp,y=YY,labels=snpn,col="black",cex=2.5)
103
+# for data SNP368
104
+#return(m)
105
+}
0 106
deleted file mode 100644
... ...
@@ -1,108 +0,0 @@
1
-snpposit <-
2
-function(SNPdata,SNP_hg19="chr",X="no",LG= 10,main="A",maxd=2000){
3
-SNPdata<-as.data.frame(SNPdata)	
4
-if(SNP_hg19!="chr"){
5
-hg19<-as.character(SNPdata$SNP_hg19)
6
-#print(hg19[1:10])
7
-
8
-chromp<-chrp(hg19)
9
-#print(chromp[1:6,])
10
-}else{
11
-chr<-SNPdata$chr
12
-if(length(chr)==0){
13
-stop("no data")
14
-}
15
-
16
-SNPdata<-SNPdata[with(SNPdata,order(chr,posit)),]
17
-chrn<-as.numeric(SNPdata$chr)
18
-posit<-as.numeric(SNPdata$posit)
19
-chromp<-cbind(chrn,posit)
20
-
21
-#print(CHR)
22
-}
23
-chr<-chromp[,1]
24
-CHR<-unique(chr)
25
-nch<-length(CHR) # number of chromosomes
26
-meanl<-m<-md<-rep(NA,nch)
27
-
28
-#calculate SNP number on each chromosome
29
-nsnpchr<-function(x,LG){
30
-y<-as.numeric(x[,2])
31
-#print(y)
32
-nr<-length(y)
33
-y<-sort(y)
34
-m<-0
35
-ix<-seq(nr-1)	
36
-iy<-seq(from=2,to=nr)
37
-m<-length(which(y[iy]-y[ix]>=LG))
38
-return(m)
39
-}
40
-
41
-#averaged distance between SNPs on each chromosome
42
-mdist<-function(x,LG){
43
-x<-as.data.frame(x)
44
-y<-x[,2]
45
-#print(y)
46
-nr<-length(y)
47
- y<-sort(y)
48
- mdist<-(y[nr]-y[1])/((nr-1)*LG)
49
-return(mdist)
50
-
51
-}
52
-
53
-#calculate average distance between SNPs and SNP number on each chromosome
54
-m<-md<-snpn<-rep(0,nch)
55
-for(i in 1:nch){
56
-chrmp<-subset(chromp,chrn==i)
57
-snpn[i]<-nrow(chrmp)
58
-if(snpn[i]!=0){
59
-m[i]<-nsnpchr(subset(chromp,chrn==i),LG)
60
-md[i]<-mdist(subset(chromp,chrn==i),LG)
61
-  }
62
-}
63
-
64
-md[which(md>=maxd)]<-maxd
65
-md[which(is.na(md))]<-0
66
-#print(length(md))
67
-#print(length(CHR))
68
-colors<-c("red","lightsalmon","lightseagreen","mediumspringgreen","mediumorchid4","mediumpurple4","lightskyblue4","mediumseagreen","deeppink",
69
-"mediumvioletred","firebrick","magenta","navyblue","lightsteelblue2","navajowhite","blue","maroon4","orange2","purple","darkorchid1",
70
-"black","cyan")
71
-
72
-chrcol<-rep(NA,nch)
73
-for(i in 1:nch){
74
- chrcol[i]<-colors[i]
75
-}
76
-if(X=="yes"){
77
-CHR[nch]<-"X"	
78
-}
79
-#print(CHR)
80
-par(oma=c(2,1,2,1))
81
-bp<-barplot(md,width=2,ylab="mean length (kbp) of interval between SNPs", names.arg =paste("chr",CHR,sep=""),col=chrcol,ylim=c(0,(maxd+250)),
82
-main=main,cex.lab=1.5,cex.names=1,cex.main=2.5,cex.axis=1.5)
83
-
84
-# space between lab and column
85
-YY<-rep(NA,nch)
86
-for(i in 1:nch){
87
-	if(!is.na(md[i])){
88
-if(md[i]>=1000){
89
-YY[i]<-md[i]*1.05
90
-   }else if(md[i]<1000&md[i]>=500){
91
-   YY[i]<-md[i]*1.08
92
-   }else if(md[i]<500&md[i]>=200){
93
-   YY[i]<-md[i]*1.2
94
-   }else if(md[i]<200&md[i]>=100){
95
-    YY[i]<-(md[i]*1.8)
96
-    }else if(md[i]<100&md[i]>=50){
97
-    YY[i]<-(md[i]*2.5)
98
-    }else if(md[i]<50&md[i]>0){
99
-    YY[i]<-(md[i]*5)
100
-    }else if(md[i]==0){
101
-    YY[i]<-45
102
-  }
103
-  }
104
-}
105
-#text(x=bp,y=YY,labels=n,col="black")
106
-text(x=bp,y=YY,labels=snpn,col="black",cex=1.3)
107
-return(m)
108
-}
109 0
new file mode 100644
110 1
Binary files /dev/null and b/inst/doc/GMRP-manual.pdf differ
111 2
new file mode 100644
... ...
@@ -0,0 +1,304 @@
1
+### R code from vignette source 'GMRP.Rnw'
2
+
3
+###################################################
4
+### code chunk number 1: <style-Sweave
5
+###################################################
6
+BiocStyle::latex()
7
+
8
+#library("knitr")
9
+#opts_chunk$set(tidy=FALSE,dev="pdf",fig.show="hide",
10
+ #              fig.width=4,fig.height=4.5,
11
+  #             dpi=300,# increase dpi to avoid pixelised pngs
12
+  #             message=FALSE)
13
+               
14
+###################################################
15
+### code chunk number 2: GMRP.Rnw:40-43
16
+###################################################
17
+set.seed(102)
18
+options(width = 90)
19
+
20
+
21
+###################################################
22
+### code chunk number 3: GMRP.Rnw:45-47
23
+###################################################
24
+library(GMRP)
25
+
26
+###################################################
27
+### code chunk number 4: fmerge.Rnw:56-72
28
+###################################################
29
+data1 <- matrix(NA, 20, 4)
30
+data2 <- matrix(NA, 30, 7)
31
+SNPID1 <- paste("rs", seq(1:20), sep="")
32
+SNPID2 <- paste("rs", seq(1:30), sep="")
33
+data1[,1:4] <- c(round(runif(20), 4), round(runif(20), 4), round(runif(20), 4), round(runif(20), 4))
34
+data2[,1:4] <- c(round(runif(30), 4),round(runif(30), 4), round(runif(30), 4), round(runif(30), 4))
35
+data2[,5:7] <- c(round(seq(30)*runif(30), 4), round(seq(30)*runif(30), 4), seq(30))
36
+data1 <- cbind(SNPID1, as.data.frame(data1))
37
+data2 <- cbind(SNPID2, as.data.frame(data2))
38
+dim(data1)
39
+dim(data2)
40
+colnames(data1) <- c("SNP", "var1", "var2", "var3", "var4")
41
+colnames(data2) <- c("SNP", "var1", "var2", "var3", "var4", "V1", "V2", "V3")
42
+data1<-DataFrame(data1)
43
+data2<-DataFrame(data2)
44
+data12 <- fmerge(fl1=data1, fl2=data2, ID1="SNP", ID2="SNP", A=".dat1", B=".dat2", method="No")
45
+
46
+###################################################
47
+### code chunk number 5: disease data load:90-93
48
+###################################################
49
+data(cad.data)
50
+#cad <- DataFrame(cad.data)
51
+cad<-cad.data
52
+head(cad)
53
+###################################################
54
+### code chunk number 6:lipid data load: 95-99
55
+###################################################
56
+data(lpd.data)
57
+#lpd <- DataFrame(lpd.data)
58
+lpd<-lpd.data
59
+head(lpd)
60
+
61
+###################################################
62
+### code chunk number 7: choose SNPs: 127-152
63
+###################################################
64
+
65
+###################################################
66
+# Step1: calculate pvj: 129-136
67
+###################################################
68
+
69
+pvalue.LDL <- lpd$P.value.LDL
70
+pvalue.HDL <-lpd$P.value.HDL
71
+pvalue.TG <- lpd$P.value.TG
72
+pvalue.TC <- lpd$P.value.TC
73
+pv <- cbind(pvalue.LDL, pvalue.HDL, pvalue.TG, pvalue.TC)
74
+pvj <- apply(pv, 1, min)
75
+
76
+###################################################
77
+# Step2: retrieve causal variables from data: 139-145
78
+###################################################
79
+
80
+beta.LDL <- lpd$beta.LDL
81
+beta.HDL <- lpd$beta.HDL
82
+beta.TG <- lpd$beta.TG
83
+beta.TC <- lpd$beta.TC
84
+beta <- cbind(beta.LDL, beta.HDL, beta.TG, beta.TC)
85
+
86
+###################################################
87
+#     Step3: construct a matrix for allele1: 148-154
88
+###################################################
89
+
90
+a1.LDL <- lpd$A1.LDL
91
+a1.HDL <- lpd$A1.HDL
92
+a1.TG <- lpd$A1.TG
93
+a1.TC <- lpd$A1.TC
94
+alle1 <- cbind(a1.LDL, a1.HDL, a1.TG, a1.TC)
95
+
96
+###################################################
97
+# Step4:  calculate pcj:157-165
98
+###################################################
99
+
100
+N.LDL <- lpd$N.LDL
101
+N.HDL <- lpd$N.HDL
102
+N.TG <- lpd$N.TG
103
+N.TC <- lpd$N.TC
104
+ss <- cbind(N.LDL, N.HDL, N.TG, N.TC)
105
+sm <- apply(ss,1,sum)
106
+pcj <- round(sm/max(sm), 6)
107
+
108
+################################################### 
109
+# Step5: Construct matrix for frequency of allele1:168-174
110
+###################################################
111
+
112
+freq.LDL<-lpd$Freq.A1.1000G.EUR.LDL
113
+freq.HDL<-lpd$Freq.A1.1000G.EUR.HDL
114
+freq.TG<-lpd$Freq.A1.1000G.EUR.TG
115
+freq.TC<-lpd$Freq.A1.1000G.EUR.TC
116
+freq<-cbind(freq.LDL,freq.HDL,freq.TG,freq.TC)
117
+
118
+###################################################
119
+# Step6: construct matrix for sd of each causal variable: 177-183 
120
+###################################################
121
+
122
+sd.LDL <- rep(37.42, length(pvj))
123
+sd.HDL <- rep(14.87, length(pvj))
124
+sd.TG <-rep(92.73, length(pvj))
125
+sd.TC <- rep(42.74, length(pvj))
126
+sd <- cbind(sd.LDL, sd.HDL, sd.TG, sd.TC)
127
+
128
+###################################################
129
+# Step7: SNPID and position: 186-189
130
+###################################################
131
+<<Step7, keep.source=TRUE, eval=FALSE>>=
132
+hg19 <- lpd$SNP_hg19.HDL
133
+rsid <- lpd$rsid.HDL
134
+
135
+###################################################
136
+# Step8: separate chromosome number and SNP position: 192-194
137
+###################################################
138
+chr<-chrp(hg=hg19)
139
+
140
+###################################################
141
+# Step9: get new data: 197-201
142
+###################################################
143
+
144
+newdata<-cbind(freq,beta,sd,pvj,ss,pcj)
145
+newdata<-cbind(chr,rsid,alle1,as.data.frame(newdata))
146
+dim(newdata)
147
+
148
+
149
+###################################################
150
+# Step10: retrieve data from cad and calculate pdj:204-215
151
+###################################################
152
+
153
+hg18.d <- cad$chr_pos_b36
154
+SNP.d <- cad$SNP #SNPID
155
+a1.d<- tolower(cad$reference_allele)
156
+freq.d <- cad$ref_allele_frequency
157
+pvalue.d <- cad$pvalue
158
+beta.d <- cad$log_odds
159
+N.case <- cad$N_case
160
+N.ctr <- cad$N_control
161
+N.d <- N.case+N.ctr
162
+freq.case <- N.case/N.d
163
+
164
+###################################################
165
+ # Step11: combine these cad variables into new data sheet:218-222
166
+ ###################################################
167
+
168
+newcad <- cbind(freq.d, beta.d, N.case, N.ctr, freq.case)
169
+newcad <- cbind(hg18.d, SNP.d, a1.d, as.data.frame(newcad))
170
+dim(newcad)
171
+
172
+################################################### 
173
+#Step12: give name vector of causal variables: 225-227
174
+###################################################
175
+varname <-c("CAD", "LDL", "HDL", "TG", "TC")
176
+
177
+###################################################
178
+### Step 13: choose SNPs and make beta table
179
+###################################################
180
+mybeta <- mktable(cdata=newdata, ddata=newcad, rt="beta", 
181
+varname=varname, LG=1, Pv=0.00000005, Pc=0.979, Pd=0.979)
182
+dim(mybeta)
183
+beta <- mybeta[,4:8]   #  standard beta table for path analysis
184
+snp <- mybeta[,1:3]   #  snp data for annotation analysis
185
+beta<-DataFrame(beta)
186
+head(beta)
187
+
188
+###################################################
189
+### load beta data: 256-264
190
+###################################################
191
+data(beta.data)
192
+beta.data<-DataFrame(beta.data)
193
+CAD <- beta.data$cad
194
+LDL <- beta.data$ldl
195
+HDL <- beta.data$hdl
196
+TG <- beta.data$tg
197
+TC <- beta.data$tc
198
+
199
+###################################################
200
+### two way scatter: 266-280
201
+###################################################
202
+par(mfrow=c(2, 2), mar=c(5.1, 4.1, 4.1, 2.1), oma=c(0, 0, 0, 0))
203
+plot(LDL,CAD, pch=19, col="blue", xlab="beta of SNPs on LDL", 
204
+ylab="beta of SNP on CAD", cex.lab=1.5, cex.axis=1.5, cex.main=2)
205
+abline(lm(CAD~LDL), col="red", lwd=2)
206
+plot(HDL, CAD, pch=19,col="blue", xlab="beta of SNPs on HDL", 
207
+ylab="beta of SNP on CAD", cex.lab=1.5, cex.axis=1.5, cex.main=2)
208
+abline(lm(CAD~HDL), col="red", lwd=2)
209
+plot(TG, CAD, pch=19, col="blue", xlab="beta of SNPs on TG", 
210
+ylab="beta of SNP on CAD",cex.lab=1.5, cex.axis=1.5, cex.main=2)
211
+abline(lm(CAD~TG), col="red", lwd=2)
212
+plot(TC,CAD, pch=19, col="blue", xlab="beta of SNPs on TC", 
213
+ylab="beta of SNP on CAD", cex.lab=1.5, cex.axis=1.5, cex.main=2)
214
+abline(lm(CAD~TC), col="red", lwd=2)
215
+
216
+###################################################
217
+### MR and Path Analysis: 296-300
218
+###################################################
219
+data(beta.data)
220
+mybeta <- DataFrame(beta.data)
221
+mod <- CAD~LDL+HDL+TG+TC
222
+pathvalue <- path(betav=mybeta, model=mod, outcome="CAD")
223
+
224
+###################################################
225
+### Create Path Diagram: 305-312
226
+###################################################
227
+
228
+mypath <- matrix(NA,3,4)
229
+mypath[1,] <- c(1.000000, -0.066678, 0.420036, 0.764638)
230
+mypath[2,] <- c(-0.066678, 1.000000, -0.559718, 0.496831)
231
+mypath[3,] <- c(0.420036, -0.559718, 1.000000, 0.414346)
232
+colnames(mypath) <- c("LDL", "HDL", "TG", "path")
233
+mypath<-as.data.frame(mypath)
234
+mypath
235
+
236
+###################################################
237
+### load package: diagram: 324-326
238
+###################################################
239
+library(diagram)
240
+
241
+###################################################
242
+# make path diagram
243
+###################################################
244
+pathdiagram(pathdata=mypath, disease="CAD", R2=0.988243, range=c(1:3))
245
+
246
+pathD<-matrix(NA,4,5)
247
+pathD[1,] <- c(1,	-0.070161, 0.399038, 0.907127, 1.210474)
248
+pathD[2,] <- c(-0.070161,	1, -0.552106, 0.212201, 0.147933)
249
+pathD[3,] <- c(0.399038, -0.552106, 1, 0.44100, 0.64229)
250
+pathD[4,] <- c(0.907127, 0.212201, 0.441007, 1, -1.035677)
251
+colnames(pathD) <- c("LDL", "HDL", "TG", "TC", "path")
252
+pathD<-as.data.frame(pathD)
253
+pathD
254
+
255
+pathdiagram2(pathD=pathD,pathO=mypath,rangeD=c(1:4),rangeO=c(1:3),
256
+disease="CAD", R2D=0.536535,R2O=0.988243)
257
+
258
+###################################################
259
+# load SNP358 data
260
+###################################################
261
+
262
+data(SNP358.data)
263
+SNP358 <- DataFrame(SNP358.data)
264
+head(SNP358)
265
+
266
+###################################################
267
+#  load package library(graphics)
268
+###################################################
269
+
270
+library(graphics)
271
+
272
+###################################################
273
+# chromosome SNP position histogram
274
+##########################################
275
+snpPositAnnot(SNPdata=SNP358,SNP_hg19="chr",main="A")
276
+
277
+
278
+###################################################
279
+# SNP function annotation analysis
280
+###################################################
281
+
282
+###################################################
283
+# load SNP annotation data: 431-434
284
+###################################################
285
+
286
+data(SNP368annot.data)
287
+SNP368<-DataFrame(SNP368annot.data)
288
+SNP368[1:10, ]
289
+
290
+###################################################
291
+# load package plotrix
292
+###################################################
293
+
294
+###################################################
295
+# make 3D Pie
296
+#############################################
297
+ucscannot(UCSCannot=SNP368,SNPn=368)
298
+
299
+ucscannot(UCSCannot=SNP368,SNPn=368,A=3,B=2,C=1.3,method=2)
300
+
301
+
302
+sessionInfo()
303
+
304
+
... ...
@@ -9,25 +9,26 @@
9 9
 \bold{GMRP} is used to perform Mendelian randomization analysis of causal variables on disease of study using SNP beta data from \bold{GWAS} or \bold{GWAS} meta analysis and furthermore execute path analysis of causal variables onto the disease.
10 10
 }
11 11
 \details{
12
-\bold{GMRP} can perform analyses of Mendelian randomization (MR),correlation, path of causal variables onto disease of interest and SNP annotation summarization analysis. MR includes SNP selection with given criteria and regression analysis of causal variables on the disease to generate beta values of causal variables on the disease. Using the beta values, \bold{GMRP} performs correlation and path analyses to construct  path diagrams of causal variables to the disease. \bold{GMRP} consists of 8 functions: \emph{chrp, fmerg, mktable,  pathdiagram2, pathdiagram, path, snpposit, ucscannot} and 5 datasets: \code{beta.data, cad.data, lpd.data, SNP358.data and SNP368_annot.data}. Function \emph{chrp} is used to separate string vector hg19 into two numeric vectors: chromosome number and SNP position on chromosomes. Function \emph{fmerg} is used to merge two datasets into one dataset. Function \emph{mktable} performs SNP selection and creates a standard beta table for function path to do path analyses. Function \emph{pathdiagram} is used to create a path diagram of causal variables onto the disease or onto outcome. Function \emph{pathdiagram2} can merg two-level pathdiagrams into one nested pathdiagram where inner path diagram is a path diagram of causal variables contributing onto outcome and the outside path diagram is a diagram of path of causal variables including outcome onto the disease. The five datasets provide examples for running these functions. \code{lpd.data} and \code{cad.data} provide an example to create a standard beta dataset for \emph{path} function to do path analysis and SNP data for SNP annotation analysis by performing \emph{mktable} and \emph{fmerg}. beta.data are a standard beta dataset for path analysis. \code{SNP358.data} provide an example for function \emph{snpposit} to do SNP position annotation analysis and \code{SNP368_annot.data} are for function \emph{ucscannot} to do SNP function annotation analysis. It is specially emphasized that except for that making standard beta table using \emph{mktable} must be done in Unix/Linux system,GMRP can be performed in Windows or Mac OS. This is because GWAS datasets usually are very huge but standard beta table is small. If users'Unix/Linux system has X11 or the other graphics system, then user should perform GMRP in Unix/Linux system, otherwise, user should transfer a standard beta table to a local computer and run GMRP in it.      
12
+\bold{GMRP} can perform analyses of Mendelian randomization (MR),correlation, path of causal variables onto disease of interest and SNP annotation summarization analysis. MR includes SNP selection with given criteria and regression analysis of causal variables on the disease to generate beta values of causal variables on the disease. Using the beta values, \bold{GMRP} performs correlation and path analyses to construct  path diagrams of causal variables to the disease. \bold{GMRP} consists
13
+of 8 functions: \emph{chrp, fmerge, mktable,  pathdiagram2, pathdiagram, path, snpPositAnnot, ucscannot} and 5 datasets: \code{beta.data, cad.data, lpd.data, SNP358.data and SNP368_annot.data}. Function \emph{chrp} is used to separate string vector hg19 into two numeric vectors: chromosome number and SNP position on chromosomes. Function \emph{fmerge} is used to merge two datasets into one dataset. Function \emph{mktable} performs SNP selection and creates a standard beta table for
14
+function path to do path analyses. Function \emph{pathdiagram} is used to create a path diagram of causal variables onto the disease or onto outcome. Function \emph{pathdiagram2} can merge two-level pathdiagrams into one nested pathdiagram where inner path diagram is a path diagram of causal variables contributing onto outcome and the outside path diagram is a diagram of path of causal variables including outcome onto the disease. The five datasets provide examples for running these
15
+functions. \code{lpd.data} and \code{cad.data} provide an example to create a standard beta dataset for \emph{path} function to do path analysis and SNP data for SNP annotation analysis by performing \emph{mktable} and \emph{fmerge}. beta.data are a standard beta dataset for path analysis. \code{SNP358.data} provide an example for function \emph{snpPositAnnot} to do SNP position annotation analysis and \code{SNP368_annot.data} are for function \emph{ucscannot} to do SNP function annotation analysis. It is specially emphasized that except for that making standard beta table using \emph{mktable} must be done in Unix/Linux system,GMRP can be performed in Windows or Mac OS. This is because GWAS datasets usually are very huge but standard beta table is small. If users'Unix/Linux system has X11 or the other graphics system, then user should perform GMRP in Unix/Linux system, otherwise, user should transfer a standard beta table to a local computer and run GMRP in it.      
13 16
 
14 17
 }
15 18
 \author{
16 19
 Yuan-De Tan
17 20
 \email{tanyuande@gmail.com}
18
-
19
-\\Dajiang Liu
20
-
21
-\\Maintainer: Yuan-De Tan
21
+\cr
22
+Maintainer: Yuan-De Tan
22 23
 }
23 24
 \references{
24 25
 Do, R. et al. 2013. Common variants associated with plasma triglycerides and risk for coronary artery disease. Nat Genet 45: 1345-1352.
25
-
26
-\\Sheehan, N.A. et al. 2008. Mendelian randomisation and causal inference in observational epidemiology. PLoS Med 5: e177.
27
- 
28
-\\Sheehan, N.A.,et al. 2010. Mendelian randomisation: a tool for assessing causality in observational epidemiology. Methods Mol Biol 713: 153-166.\\Wright, S. 1921. Correlation and causation. J. Agricultural Research 20: 557-585.
29
-
30
-\\Wright, S. 1934. The method of path coefficients Annals of Mathematical Statistics 5 (3): 161-215 5: 161-215.
26
+\cr
27
+Sheehan, N.A. et al. 2008. Mendelian randomisation and causal inference in observational epidemiology. PLoS Med 5: e177.
28
+\cr 
29
+Sheehan, N.A.,et al. 2010. Mendelian randomisation: a tool for assessing causality in observational epidemiology. Methods Mol Biol 713: 153-166.\\Wright, S. 1921. Correlation and causation. J. Agricultural Research 20: 557-585.
30
+\cr
31
+Wright, S. 1934. The method of path coefficients Annals of Mathematical Statistics 5 (3): 161-215 5: 161-215.
31 32
 
32 33
 }
33 34
 \keyword{ package }
... ...
@@ -17,7 +17,7 @@ Data of 358 SNPs
17 17
   }
18 18
 }
19 19
 \details{
20
-These 358 SNPs were chosen by using mktable with Pv = code{5x10e-08},\code{Pc = Pd =0.979} from code{lpd.data and cad.data}. They provide a data example for how to perform annotation analysis of SNP positions.
20
+These 358 SNPs were chosen by using mktable with Pv = \code{5x10e-08},\code{Pc = Pd =0.979} from \code{lpd.data and cad.data}. They provide a data example for how to perform annotation analysis of SNP positions.
21 21
 }
22 22
 \value{
23 23
 A set of data with 358 rows(SNPs) and 3 columns(SNP ID, chromosome # and SNP position on chromosomes).
24 24
similarity index 81%
25 25
rename from man/fmerg.Rd
26 26
rename to man/fmerge.Rd
... ...
@@ -1,5 +1,5 @@
1
-\name{fmerg}
2
-\alias{fmerg}
1
+\name{fmerge}
2
+\alias{fmerge}
3 3
 \title{
4 4
 Merge Two \bold{GWAS} Result Data Sheets
5 5
 }
... ...
@@ -7,7 +7,7 @@ Merge Two \bold{GWAS} Result Data Sheets
7 7
 \emph{fmerg} can be used to merge two \bold{GWAS} result data sheets with the same key \verb{ID(SNP ID)} into one data sheet.
8 8
 }
9 9
 \usage{
10
-fmerg(fl1, fl2,ID1, ID2, A, B, method)
10
+fmerge(fl1, fl2,ID1, ID2, A, B, method)
11 11
 }
12 12
 \arguments{
13 13
   \item{fl1}{
... ...
@@ -33,7 +33,8 @@ method for merging. See details.
33 33
 }
34 34
 }
35 35
 \details{
36
-fl1 and fl2 are two \bold{GWAS} result data files from different studies or with different risk variables. They contain \verb{SNPID}, \verb{hg18}, \verb{hg19}(positions), beta values, allele, frequency, and so on. The method has four options: method="No","NO" or "no" means that all data with unmatch \verb{SNP}s are not saved in the merged file; method="All","ALL" or "all" lets fmerg save all the data with unmatched \verb{SNP}s from two files but they are not paired one-by-one. This is different from R \emph{merge} function. method="file1" will save the data with unmatched \verb{SNP}s only from file 1 in the merged file and method="file2" allows function \emph{fmerg} to save the data with unmatched \verb{SNP}s from file2 in the merged file.
36
+fl1 and fl2 are two \bold{GWAS} result data files from different studies or with different risk variables. They contain \verb{SNPID}, \verb{hg18}, \verb{hg19}(positions), beta values, allele, frequency, and so on. The method has four options: method="No","NO" or "no" means that all data with unmatch \verb{SNP}s are not saved in the merged file; method="All","ALL" or "all" lets fmerge save all the data with unmatched \verb{SNP}s from two files but they are not paired one-by-one. This is
37
+different from R \emph{merge} function. method="file1" will save the data with unmatched \verb{SNP}s only from file 1 in the merged file and method="file2" allows function \emph{fmerge} to save the data with unmatched \verb{SNP}s from file2 in the merged file.
37 38
 
38 39
 }
39 40
 \value{
... ...
@@ -43,8 +44,6 @@ Return a joined data sheet.
43 44
 \author{
44 45
 Yuan-De Tan
45 46
 \email{tanyuande@gmail.com}
46
-
47
-\\Dajiang Liu
48 47
 }
49 48
 \note{
50 49
 Function fmerg can also be applied to the other types of data.
... ...
@@ -67,7 +66,7 @@ dim(data1)
67 66
 dim(data2)
68 67
 colnames(data1)<-c("SNP","var1","var2","var3","var4")
69 68
 colnames(data2)<-c("SNP","var1","var2","var3","var4","V1","V2","V3")
70
-data12<-fmerg(fl1=data1,fl2=data2,ID1="SNP",ID2="SNP",A=".dat1",B=".dat2",method="No")
69
+data12<-fmerge(fl1=data1,fl2=data2,ID1="SNP",ID2="SNP",A=".dat1",B=".dat2",method="No")
71 70
 #data12[1:3,]
72 71
 #  SNP.dat1 var1.dat1 var2.dat1 var3.dat1 var4.dat1 SNP.dat2 var1.dat2 var2.dat2
73 72
 #1      rs1    0.9152    0.9853    0.9879    0.9677      rs1    0.5041    0.5734
... ...
@@ -59,18 +59,16 @@ Return a standard \verb{SNP} beta or \verb{SNP} path table containing \code{m} \
59 59
 }
60 60
 \references{
61 61
 Do, R. et al. 2013. Common variants associated with plasma triglycerides and risk for coronary artery disease. \emph{Nat Genet} \bold{45}: 1345-1352.
62
-
63
-\\Sheehan, N.A. et al. 2008. Mendelian randomisation and causal inference in observational epidemiology. \emph{PLoS Med} \bold{5}: e177.
64
-
65
-\\Sheehan, N.A.,et al. 2010. Mendelian randomisation: a tool for assessing causality in observational epidemiology. \emph{Methods Mol Biol} \bold{713}: 153-166.
66
-
67
-\\Willer, C.J. Schmidt, E.M. Sengupta, S. Peloso, G.M. Gustafsson, S. Kanoni, S. Ganna, A. Chen, J.,Buchkovich, M.L. Mora, S. et al (2013) Discovery and refinement of loci associated with lipid levels. \emph{Nat Genet} \bold{45}: 1274-1283.
62
+\cr
63
+Sheehan, N.A. et al. 2008. Mendelian randomisation and causal inference in observational epidemiology. \emph{PLoS Med} \bold{5}: e177.
64
+\cr
65
+Sheehan, N.A.,et al. 2010. Mendelian randomisation: a tool for assessing causality in observational epidemiology. \emph{Methods Mol Biol} \bold{713}: 153-166.
66
+\cr
67
+Willer, C.J. Schmidt, E.M. Sengupta, S. Peloso, G.M. Gustafsson, S. Kanoni, S. Ganna, A. Chen, J.,Buchkovich, M.L. Mora, S. et al (2013) Discovery and refinement of loci associated with lipid levels. \emph{Nat Genet} \bold{45}: 1274-1283.
68 68
 }
69 69
 \author{
70 70
 Yuan-De Tan
71 71
 \email{tanyuande@gmail.com}
72
-
73
-\\Dajiang Liu
74 72
 }
75 73
 \note{
76 74
 The order of column variables must be \verb{chrn} \verb{posit} \verb{rsid} \code{a1.x1} \dots \code{a1.xn} \code{freq.x1} \dots \code{freq.xn} \code{beta.x1} \dots \code{beta.x1} \dots \code{beta.xn} \code{sd.x1} \dots \code{sd.xn} \dots otherwise, mktable would have error. see example.
... ...
@@ -81,10 +79,11 @@ The order of column variables must be \verb{chrn} \verb{posit} \verb{rsid} \code
81 79
 }
82 80
 \examples{
83 81
 data(lpd.data)
84
-lpd<-DataFrame(lpd.data)
82
+#lpd<-DataFrame(lpd.data)
83
+lpd<-lpd.data
85 84
 data(cad.data)
86
-cad<-DataFrame(cad.data)
87
-
85
+#cad<-DataFrame(cad.data)
86
+cad<-cad.data
88 87
 # step 1: calculate pvj
89 88
 pvalue.LDL<-lpd$P.value.LDL
90 89
 pvalue.HDL<-lpd$P.value.HDL
... ...
@@ -29,20 +29,18 @@ Return three matrices: beta coefficients of regressions of risk variables on out
29 29
 }
30 30
 \references{
31 31
 Do, R. et al. 2013 Common variants associated with plasma triglycerides and risk for coronary artery disease. \emph{Nat Genet} \bold{45}: 1345-1352.
32
-
33
-\\Sheehan, N.A. et al. 2008 Mendelian randomisation and causal inference in observational epidemiology. PLoS Med 5: e177.
34
-
35
-\\Sheehan, N.A.,et al. 2010 Mendelian randomisation: a tool for assessing causality in observational epidemiology. \emph{Methods Mol Biol} \bold{713}: 153-166.
36
-
37
-\\Wright, S. 1921 Correlation and causation. \emph{J.Agricultural Research} \bold{20}: 557-585.
38
-
39
-\\Wright, S. 1934 The method of path coefficients. \emph{Annals of Mathematical Statistics} \bold{5} (3): 161-215.
32
+\cr
33
+Sheehan, N.A. et al. 2008 Mendelian randomisation and causal inference in observational epidemiology. PLoS Med 5: e177.
34
+\cr
35
+Sheehan, N.A.,et al. 2010 Mendelian randomisation: a tool for assessing causality in observational epidemiology. \emph{Methods Mol Biol} \bold{713}: 153-166.
36
+\cr
37
+Wright, S. 1921 Correlation and causation. \emph{J.Agricultural Research} \bold{20}: 557-585.
38
+\cr
39
+Wright, S. 1934 The method of path coefficients. \emph{Annals of Mathematical Statistics} \bold{5} (3): 161-215.
40 40
 }
41 41
 \author{
42 42
 Yuan-De Tan
43 43
 \email{tanyuande@gmail.com}
44
-
45
-\\Dajiang Liu
46 44
 }
47 45
 \note{
48 46
 betav may also be a matrix of SNP path coefficients onto risk variables and disease.
... ...
@@ -33,8 +33,6 @@ NULL. pathdiagram will create one-level path diagram labeled with colors.
33 33
 \author{
34 34
 Yuan-De Tan
35 35
 \email{tanyuande@gmail.com}
36
-
37
-\\Dajiang Liu
38 36
 }
39 37
 
40 38
 \seealso{
... ...
@@ -42,7 +42,6 @@ Null. Function \emph{pathdiagram2} creates a nested two-level path diagram label
42 42
 Yuan-De Tan
43 43
 \email{tanyuande@gmail.com}
44 44
 
45
-\\Dajiang Liu
46 45
 }
47 46
 
48 47
 \seealso{
49 48
similarity index 66%
50 49
rename from man/snpposit.Rd
51 50
rename to man/snpPositAnnot.Rd
... ...
@@ -1,5 +1,5 @@
1
-\name{snpposit}
2
-\alias{snpposit}
1
+\name{snpPositAnnot}
2
+\alias{snpPositAnnot}
3 3
 \title{
4 4
 \emph{SNP} Position Annotation
5 5
 }
... ...
@@ -7,7 +7,7 @@
7 7
 This function is used to perform position annotation analysis of \verb{SNP}s chosen from \verb{GWAS}.
8 8
 }
9 9
 \usage{
10
-snpposit(SNPdata, SNP_hg19="chr", X="no",LG=10, main="A", maxd=2000)
10
+snpPositAnnot(SNPdata, SNP_hg19="chr", main)
11 11
 }
12 12
 \arguments{
13 13
   \item{SNPdata}{
... ...
@@ -16,18 +16,9 @@ snpposit(SNPdata, SNP_hg19="chr", X="no",LG=10, main="A", maxd=2000)
16 16
   \item{SNP_hg19}{
17 17
 a string parameter. It may be "hg19" or "chr". If SNP_hg19="hg19",then \verb{SNPdata} contain a string vector of hg19 or if \verb{SNP_hg19}="chr", then \verb{SNPdata} consist of at lest two columns: \verb{chr} and \verb{posit}. \verb{chr} is chromosome number and posit is \verb{SNP} physical position on chromosomes. If data sheet has chromosome X, then character "X" should be changed to 23 in chr vector or chr23.######## in hg19 vector.
18 18
 }
19
-  \item{X}{
20
-character parameter. It has Two options: X="yes" or X="no". Default is X="no".	
21
-}
22
-  \item{LG}{
23
-a numeric parameter that gives maximum permissible distance between positions. 
24
-}
25
-  \item{main}{
26
-a string which is title of graph. If no title is given, then man="".
27
-}
28
-  \item{maxd}{
29
-maxd is a numeric parameter that is maximum distance for truncating chromosome columns. If there are not too big differences among 23 chromosomes, then \verb{maxd} can be set to be larger than 2000. 
30
-}
19
+   \item{main}{
20
+a string which is title of graph. If no title is given, then main="".
21
+  }
31 22
 }
32 23
 \value{
33 24
 Return a set of numbers of SNPs between which interval length > \bold{LG} on 23 chromosomes. This function also creates a histogram for averaged distances between SNPs and SNP numbers on chromosomes.
... ...
@@ -35,8 +26,6 @@ Return a set of numbers of SNPs between which interval length > \bold{LG} on 23
35 26
 \author{
36 27
 Yuan-De Tan
37 28
 \email{tanyuande@gmail.com}
38
-
39
-\\Dajiang Liu
40 29
 }
41 30
 \note{
42 31
 This function can also be applied to hg18 data with \code{SNP_hg19}="hg18". 
... ...
@@ -48,7 +37,7 @@ This function can also be applied to hg18 data with \code{SNP_hg19}="hg18".
48 37
 \examples{
49 38
 data(SNP358.data)
50 39
 SNP358<-DataFrame(SNP358.data)
51
-snpposit(SNPdata=SNP358,SNP_hg19="chr",X="Yes",LG=10,main="A",maxd=2000)
40
+snpPositAnnot(SNPdata=SNP358,SNP_hg19="chr",main="A")
52 41
 }
53 42
 \keyword{graphics}
54 43
 \keyword{SNP position}
... ...
@@ -33,7 +33,6 @@ Create a color \emph{pie3D} diagram and return a set of numeric values: proporti
33 33
 Yuan-De Tan
34 34
 \email{tanyuande@gmail.com}
35 35
 
36
-\\Dajiang Liu
37 36
 }
38 37
 \note{
39 38
 This function just need  data of "Predicted function" and "symbol", so the other column data in UCSCannot do not impact the results of analysis.
40 39
new file mode 100644
... ...
@@ -0,0 +1,304 @@
1
+### R code from vignette source 'GMRP.Rnw'
2
+
3
+###################################################
4
+### code chunk number 1: <style-Sweave
5
+###################################################
6
+BiocStyle::latex()
7
+
8
+#library("knitr")
9
+#opts_chunk$set(tidy=FALSE,dev="pdf",fig.show="hide",
10
+ #              fig.width=4,fig.height=4.5,
11
+  #             dpi=300,# increase dpi to avoid pixelised pngs
12
+  #             message=FALSE)
13
+               
14
+###################################################
15
+### code chunk number 2: GMRP.Rnw:40-43
16
+###################################################
17
+set.seed(102)
18
+options(width = 90)
19
+
20
+
21
+###################################################
22
+### code chunk number 3: GMRP.Rnw:45-47
23
+###################################################
24
+library(GMRP)
25
+
26
+###################################################
27
+### code chunk number 4: fmerge.Rnw:56-72
28
+###################################################
29
+data1 <- matrix(NA, 20, 4)
30
+data2 <- matrix(NA, 30, 7)
31
+SNPID1 <- paste("rs", seq(1:20), sep="")
32
+SNPID2 <- paste("rs", seq(1:30), sep="")
33
+data1[,1:4] <- c(round(runif(20), 4), round(runif(20), 4), round(runif(20), 4), round(runif(20), 4))
34
+data2[,1:4] <- c(round(runif(30), 4),round(runif(30), 4), round(runif(30), 4), round(runif(30), 4))
35
+data2[,5:7] <- c(round(seq(30)*runif(30), 4), round(seq(30)*runif(30), 4), seq(30))
36
+data1 <- cbind(SNPID1, as.data.frame(data1))
37
+data2 <- cbind(SNPID2, as.data.frame(data2))
38
+dim(data1)
39
+dim(data2)
40
+colnames(data1) <- c("SNP", "var1", "var2", "var3", "var4")
41
+colnames(data2) <- c("SNP", "var1", "var2", "var3", "var4", "V1", "V2", "V3")
42
+data1<-DataFrame(data1)
43
+data2<-DataFrame(data2)
44
+data12 <- fmerge(fl1=data1, fl2=data2, ID1="SNP", ID2="SNP", A=".dat1", B=".dat2", method="No")
45
+
46
+###################################################
47
+### code chunk number 5: disease data load:90-93
48
+###################################################
49
+data(cad.data)
50
+#cad <- DataFrame(cad.data)
51
+cad<-cad.data
52
+head(cad)
53
+###################################################
54
+### code chunk number 6:lipid data load: 95-99
55
+###################################################
56
+data(lpd.data)
57
+#lpd <- DataFrame(lpd.data)
58
+lpd<-lpd.data
59
+head(lpd)
60
+
61
+###################################################
62
+### code chunk number 7: choose SNPs: 127-152
63
+###################################################
64
+
65
+###################################################
66
+# Step1: calculate pvj: 129-136
67
+###################################################
68
+
69
+pvalue.LDL <- lpd$P.value.LDL
70
+pvalue.HDL <-lpd$P.value.HDL
71
+pvalue.TG <- lpd$P.value.TG
72
+pvalue.TC <- lpd$P.value.TC
73
+pv <- cbind(pvalue.LDL, pvalue.HDL, pvalue.TG, pvalue.TC)
74
+pvj <- apply(pv, 1, min)
75
+
76
+###################################################
77
+# Step2: retrieve causal variables from data: 139-145
78
+###################################################
79
+
80
+beta.LDL <- lpd$beta.LDL
81
+beta.HDL <- lpd$beta.HDL
82
+beta.TG <- lpd$beta.TG
83
+beta.TC <- lpd$beta.TC
84
+beta <- cbind(beta.LDL, beta.HDL, beta.TG, beta.TC)
85
+
86
+###################################################
87
+#     Step3: construct a matrix for allele1: 148-154
88
+###################################################
89
+
90
+a1.LDL <- lpd$A1.LDL
91
+a1.HDL <- lpd$A1.HDL
92
+a1.TG <- lpd$A1.TG
93
+a1.TC <- lpd$A1.TC
94
+alle1 <- cbind(a1.LDL, a1.HDL, a1.TG, a1.TC)
95
+
96
+###################################################
97
+# Step4:  calculate pcj:157-165
98
+###################################################
99
+
100
+N.LDL <- lpd$N.LDL
101
+N.HDL <- lpd$N.HDL
102
+N.TG <- lpd$N.TG
103
+N.TC <- lpd$N.TC
104
+ss <- cbind(N.LDL, N.HDL, N.TG, N.TC)
105
+sm <- apply(ss,1,sum)
106
+pcj <- round(sm/max(sm), 6)
107
+
108
+################################################### 
109
+# Step5: Construct matrix for frequency of allele1:168-174
110
+###################################################
111
+
112
+freq.LDL<-lpd$Freq.A1.1000G.EUR.LDL
113
+freq.HDL<-lpd$Freq.A1.1000G.EUR.HDL
114
+freq.TG<-lpd$Freq.A1.1000G.EUR.TG
115
+freq.TC<-lpd$Freq.A1.1000G.EUR.TC
116
+freq<-cbind(freq.LDL,freq.HDL,freq.TG,freq.TC)
117
+
118
+###################################################
119
+# Step6: construct matrix for sd of each causal variable: 177-183 
120
+###################################################
121
+
122
+sd.LDL <- rep(37.42, length(pvj))
123
+sd.HDL <- rep(14.87, length(pvj))
124
+sd.TG <-rep(92.73, length(pvj))
125
+sd.TC <- rep(42.74, length(pvj))
126
+sd <- cbind(sd.LDL, sd.HDL, sd.TG, sd.TC)
127
+
128
+###################################################
129
+# Step7: SNPID and position: 186-189
130
+###################################################
131
+<<Step7, keep.source=TRUE, eval=FALSE>>=
132
+hg19 <- lpd$SNP_hg19.HDL
133
+rsid <- lpd$rsid.HDL
134
+
135
+###################################################
136
+# Step8: separate chromosome number and SNP position: 192-194
137
+###################################################
138
+chr<-chrp(hg=hg19)
139
+
140
+###################################################
141
+# Step9: get new data: 197-201
142
+###################################################
143
+
144
+newdata<-cbind(freq,beta,sd,pvj,ss,pcj)
145
+newdata<-cbind(chr,rsid,alle1,as.data.frame(newdata))
146
+dim(newdata)
147
+
148
+
149
+###################################################
150
+# Step10: retrieve data from cad and calculate pdj:204-215
151
+###################################################
152
+
153
+hg18.d <- cad$chr_pos_b36
154
+SNP.d <- cad$SNP #SNPID
155
+a1.d<- tolower(cad$reference_allele)
156
+freq.d <- cad$ref_allele_frequency
157
+pvalue.d <- cad$pvalue
158
+beta.d <- cad$log_odds
159
+N.case <- cad$N_case
160
+N.ctr <- cad$N_control
161
+N.d <- N.case+N.ctr
162
+freq.case <- N.case/N.d
163
+
164
+###################################################
165
+ # Step11: combine these cad variables into new data sheet:218-222
166
+ ###################################################
167
+
168
+newcad <- cbind(freq.d, beta.d, N.case, N.ctr, freq.case)
169
+newcad <- cbind(hg18.d, SNP.d, a1.d, as.data.frame(newcad))
170
+dim(newcad)
171
+
172
+################################################### 
173
+#Step12: give name vector of causal variables: 225-227
174
+###################################################
175
+varname <-c("CAD", "LDL", "HDL", "TG", "TC")
176
+
177
+###################################################
178
+### Step 13: choose SNPs and make beta table
179
+###################################################
180
+mybeta <- mktable(cdata=newdata, ddata=newcad, rt="beta", 
181
+varname=varname, LG=1, Pv=0.00000005, Pc=0.979, Pd=0.979)
182
+dim(mybeta)
183
+beta <- mybeta[,4:8]   #  standard beta table for path analysis
184
+snp <- mybeta[,1:3]   #  snp data for annotation analysis
185
+beta<-DataFrame(beta)
186
+head(beta)
187
+
188
+###################################################
189
+### load beta data: 256-264
190
+###################################################
191
+data(beta.data)
192
+beta.data<-DataFrame(beta.data)
193
+CAD <- beta.data$cad
194
+LDL <- beta.data$ldl
195
+HDL <- beta.data$hdl
196
+TG <- beta.data$tg
197
+TC <- beta.data$tc
198
+
199
+###################################################
200
+### two way scatter: 266-280
201
+###################################################
202
+par(mfrow=c(2, 2), mar=c(5.1, 4.1, 4.1, 2.1), oma=c(0, 0, 0, 0))
203
+plot(LDL,CAD, pch=19, col="blue", xlab="beta of SNPs on LDL", 
204
+ylab="beta of SNP on CAD", cex.lab=1.5, cex.axis=1.5, cex.main=2)
205
+abline(lm(CAD~LDL), col="red", lwd=2)
206
+plot(HDL, CAD, pch=19,col="blue", xlab="beta of SNPs on HDL", 
207
+ylab="beta of SNP on CAD", cex.lab=1.5, cex.axis=1.5, cex.main=2)
208
+abline(lm(CAD~HDL), col="red", lwd=2)
209
+plot(TG, CAD, pch=19, col="blue", xlab="beta of SNPs on TG", 
210
+ylab="beta of SNP on CAD",cex.lab=1.5, cex.axis=1.5, cex.main=2)
211
+abline(lm(CAD~TG), col="red", lwd=2)
212
+plot(TC,CAD, pch=19, col="blue", xlab="beta of SNPs on TC", 
213
+ylab="beta of SNP on CAD", cex.lab=1.5, cex.axis=1.5, cex.main=2)
214
+abline(lm(CAD~TC), col="red", lwd=2)
215
+
216
+###################################################
217
+### MR and Path Analysis: 296-300
218
+###################################################
219
+data(beta.data)
220
+mybeta <- DataFrame(beta.data)
221
+mod <- CAD~LDL+HDL+TG+TC
222
+pathvalue <- path(betav=mybeta, model=mod, outcome="CAD")
223
+
224
+###################################################
225
+### Create Path Diagram: 305-312
226
+###################################################
227
+
228
+mypath <- matrix(NA,3,4)
229
+mypath[1,] <- c(1.000000, -0.066678, 0.420036, 0.764638)
230
+mypath[2,] <- c(-0.066678, 1.000000, -0.559718, 0.496831)
231
+mypath[3,] <- c(0.420036, -0.559718, 1.000000, 0.414346)
232
+colnames(mypath) <- c("LDL", "HDL", "TG", "path")
233
+mypath<-as.data.frame(mypath)
234
+mypath
235
+
236
+###################################################
237
+### load package: diagram: 324-326
238
+###################################################
239
+library(diagram)
240
+
241
+###################################################
242
+# make path diagram
243
+###################################################
244
+pathdiagram(pathdata=mypath, disease="CAD", R2=0.988243, range=c(1:3))
245
+
246
+pathD<-matrix(NA,4,5)
247
+pathD[1,] <- c(1,	-0.070161, 0.399038, 0.907127, 1.210474)
248
+pathD[2,] <- c(-0.070161,	1, -0.552106, 0.212201, 0.147933)
249
+pathD[3,] <- c(0.399038, -0.552106, 1, 0.44100, 0.64229)
250
+pathD[4,] <- c(0.907127, 0.212201, 0.441007, 1, -1.035677)
251
+colnames(pathD) <- c("LDL", "HDL", "TG", "TC", "path")
252
+pathD<-as.data.frame(pathD)
253
+pathD
254
+
255
+pathdiagram2(pathD=pathD,pathO=mypath,rangeD=c(1:4),rangeO=c(1:3),
256
+disease="CAD", R2D=0.536535,R2O=0.988243)
257
+
258
+###################################################
259
+# load SNP358 data
260
+###################################################
261
+
262
+data(SNP358.data)
263
+SNP358 <- DataFrame(SNP358.data)
264
+head(SNP358)
265
+
266
+###################################################
267
+#  load package library(graphics)
268
+###################################################
269
+
270
+library(graphics)
271
+
272
+###################################################
273
+# chromosome SNP position histogram
274
+##########################################
275
+snpPositAnnot(SNPdata=SNP358,SNP_hg19="chr",main="A")
276
+
277
+
278
+###################################################
279
+# SNP function annotation analysis
280
+###################################################
281
+
282
+###################################################
283
+# load SNP annotation data: 431-434
284
+###################################################
285
+
286
+data(SNP368annot.data)
287
+SNP368<-DataFrame(SNP368annot.data)
288
+SNP368[1:10, ]
289
+
290
+###################################################
291
+# load package plotrix
292
+###################################################
293
+
294
+###################################################
295
+# make 3D Pie
296
+#############################################
297
+ucscannot(UCSCannot=SNP368,SNPn=368)
298
+
299
+ucscannot(UCSCannot=SNP368,SNPn=368,A=3,B=2,C=1.3,method=2)
300
+
301
+
302
+sessionInfo()
303
+
304
+
... ...
@@ -7,7 +7,7 @@
7 7
 \documentclass[a4paper]{article}
8 8
 
9 9
 \title{GWAS-based Mendelian Randomization Path Analysis}
10
-\author{Yuan-De Tan and Dajiang Liu\ \
10
+\author{Yuan-De Tan \ \
11 11
 \texttt{tanyuande@gmail.com}}
12 12
 
13 13
 <<<style-Sweave, eval=TRUE, echo=FALSE, results=tex>>=
... ...
@@ -19,13 +19,13 @@ BiocStyle::latex()
19 19
 \maketitle
20 20
 
21 21
 \begin{abstract}
22
-\Rpackage{GMRP} can perform analyses of Mendelian randomization (\emph{MR}),correlation, path of causal variables onto disease of interest and \emph{SNP} annotation analysis. \emph{MR} includes \emph{SNP} selection with given criteria and regression analysis of causal variables on the disease to generate beta values of causal variables on the disease. Using the beta vectors, \Rpackage{GMRP} performs correlation and path analyses to construct  path diagrams of causal variables to the disease. \Rpackage{GMRP} consists of 8 $R$ functions: \Rfunction{chrp},\Rfunction{fmerg},\Rfunction{mktable},\Rfunction{pathdiagram},\Rfunction{pathdiagram2},\Rfunction{path},\Rfunction{snpposit},\Rfunction{ucscannot} and 5 datasets: \Robject{beta.data,cad.data,lpd.data,SNP358.data,SNP368annot.data}. \Rfunction{chrp} is used to separate string vector \texttt{hg19} into two numeric vectors: chromosome number and \emph{SNP} chromosome position. Function \Rfunction{fmerg} is used to merge two \emph{GWAS} result datasets into one dataset. Function \Rfunction{mktable} performs \emph{SNP} selection and creates a standard beta table for function \Rfunction{path} to do \emph{MR} and path analyses. Function \Rfunction{pathdiagram} is used to create a path diagram of causal variables onto the disease or onto outcome. Function \Rfunction{pathdiagram2} can merge two-level \emph{pathdiagrams} into one nested \emph{pathdiagram} where inner \emph{pathdiagram} is a \emph{pathdiagram} of causal variables contributing to outcome and the outside \emph{pathdiagram} is a path diagram of causal variables including outcome onto the disease. The five datasets provide examples for running these functions. \Robject{lpd.data} and \Robject{cad.data} provide an example to create a standard beta dataset for path function to do path analysis and \emph{SNP} data for \emph{SNP} annotation analysis by performing \Rfunction{mktable} and \Rfunction{fmerg}. \Robject{beta.data} are a standard beta dataset for path analysis. \Robject{SNP358.data} provide an example for \Rfunction{snpposit} to do \emph{SNP} position annotation analysis and \Robject{SNP368annot.data} are for \Rfunction{ucscannot} to do \emph{SNP} function annotation analysis.
22
+\Rpackage{GMRP} can perform analyses of Mendelian randomization (\emph{MR}),correlation, path of causal variables onto disease of interest and \emph{SNP} annotation analysis. \emph{MR} includes \emph{SNP} selection with given criteria and regression analysis of causal variables on the disease to generate beta values of causal variables on the disease. Using the beta vectors, \Rpackage{GMRP} performs correlation and path analyses to construct  path diagrams of causal variables to the disease. \Rpackage{GMRP} consists of 8 $R$ functions: \Rfunction{chrp},\Rfunction{fmerge},\Rfunction{mktable},\Rfunction{pathdiagram},\Rfunction{pathdiagram2},\Rfunction{path},\Rfunction{snpPositAnnot},\Rfunction{ucscannot} and 5 datasets: \Robject{beta.data,cad.data,lpd.data,SNP358.data,SNP368annot.data}. \Rfunction{chrp} is used to separate string vector \texttt{hg19} into two numeric vectors: chromosome number and \emph{SNP} chromosome position. Function \Rfunction{fmerge} is used to merge two \emph{GWAS} result datasets into one dataset. Function \Rfunction{mktable} performs \emph{SNP} selection and creates a standard beta table for function \Rfunction{path} to do \emph{MR} and path analyses. Function \Rfunction{pathdiagram} is used to create a path diagram of causal variables onto a given disease or onto outcome. Function \Rfunction{pathdiagram2} can merge two-level \emph{pathdiagrams} into one nested \emph{pathdiagram} where inner \emph{pathdiagram} is a \emph{pathdiagram} of causal variables contributing to outcome and the outside \emph{pathdiagram} is a path diagram of causal variables including outcome onto the disease. The five datasets provide examples for running these functions. \Robject{lpd.data} and \Robject{cad.data} provide an example to create a standard beta dataset for path function to do path analysis and \emph{SNP} data for \emph{SNP} annotation analysis by performing \Rfunction{mktable} and \Rfunction{fmerge}. \Robject{beta.data} are a standard beta dataset for path analysis. \Robject{SNP358.data} provide an example for \Rfunction{snpPositAnnot} to do \emph{SNP} position annotation analysis and \Robject{SNP368annot.data} are for \Rfunction{ucscannot} to perform \emph{SNP} function annotation analysis.
23 23
 \end{abstract}
24 24
 
25 25
 \tableofcontents
26 26
 
27 27
 \section{Introduction}
28
-As an example of human disease, coronary artery disease (\emph{CAD}) is one of the causes leading to death and infirmity worldwide~\cite{Murray1997}. Low-density lipoprotein cholesterol (\emph{LDL}) and triglycerides (\emph{TG}) are viewed as risk factors causing \emph{CAD}.  In epidemiological studies, plasma concentrations of increasing \emph{TG} and \emph{LDL} and deceasing \emph{HDL} have been observed to be associated with risk for \emph{CAD}~\cite{Angelantonio2009, Sarwar2007}. However, from observational studies, one could not directly infer that these cholesterol concentrations  in plasma are risk factors causing \emph{CAD}~\cite{Do2013, Sarwar2007, Sheehan2007, Voight2012}. A big limitation of observational studies is to difficultly distinguish between causal and spurious associations due to confounding~\cite{Pichler2013}. An efficient approach to overcome this limitation is \textit{Mendelian Randomization} (\emph{MR}) analysis ~\cite{Sheehan2007, Smith2003} where genetic variants are used as instrumental variables. For this reason, many investigators tried to use genetic variants to assess causality and estimate the causal effects on the diseases.    
28
+As an example of human disease, coronary artery disease (\emph{CAD}) is one of the causes leading to death and infirmity worldwide~\cite{Murray1997}. Low-density lipoprotein cholesterol (\emph{LDL}) and triglycerides (\emph{TG}) are viewed as risk factors causing \emph{CAD}.  In epidemiological studies, plasma concentrations of increasing \emph{TG} and \emph{LDL} and deceasing high-density lipoprotein cholesterol (\emph{HDL} ) have been observed to be associated with risk for \emph{CAD}~\cite{Angelantonio2009, Sarwar2007}. However, from observational studies, one could not directly infer that these cholesterol concentrations  in plasma are risk factors causing \emph{CAD}~\cite{Do2013, Sarwar2007, Sheehan2007, Voight2012}. A big limitation of observational studies is to difficultly distinguish between causal and spurious associations due to confounding~\cite{Pichler2013}. An efficient approach to overcome this limitation is \textit{Mendelian Randomization} (\emph{MR}) analysis ~\cite{Sheehan2007, Smith2003} where genetic variants are used as instrumental variables. For this reason, many investigators tried to use genetic variants to assess causality and estimate the causal effects on the diseases.    
29 29
 
30 30
 \emph{MR} analysis can perfectly exclude confounding factors associated with disease.  However, when we expand one causal variable to many, \emph{MR} analysis becomes challenged and complicated because the genetic variant would have additional effects on the other risk factors, which violate assumption of no pleiotropy. An unknown genetic variant in \emph{MR} analysis possibly provides a false instrument for causal effect assessment of risk factors on the disease. The reason is that if this genetic variant is in \emph{LD} with another gene that is not used but has effect on the disease of study~\cite{Sheehan2007, Sheehan2010}. It then violates the third assumption.  These two problems can be addressed by using multiple instrumental variables.  For this reason, Do \emph{et al} (2013)developed statistic approach to address this issue~\cite{Do2013}. However, method of Do \emph{et al} ~\cite{Do2013} cannot disentangle correlation effects among the multiple undefined risk factors on the disease of study.  The beta values obtained from regression analyses are not direct causal effects because their effects are entangled with correlations among these undefined risk factors. 
31 31
 
... ...
@@ -52,7 +52,7 @@ library(GMRP)
52 52
 
53 53
 \Robject{lpd.data} was a subset (1069 SNPs) of four GWAS result datasets for \emph{LDL}, \emph{HDL}, \emph{TG} and \emph{TC}. These \emph{GWAS} result data sheets were downloaded from the website\footnote{\url{http://csg.sph.umich.edu//abecasis/public/lipids2013/} } where there are 120165 SNPs scattered on 23 chromosomes and 40 variables. Four GWAS result datasets for \emph{LDL}, \emph{HDL}, \emph{TG} and \emph{TC} were merged into one data sheet by using \Rfunction{fmerg}\textit{(fl1,fl2,ID1,ID2,A,B,method)} where \textit{fl1} and \textit{fl2} are two \emph{GWAS} result data sheets. $ID1$ and $ID2$ are key $id$ in files \textit{fl1} and \textit{fl2}, respectively, and required. $A$ and $B$ are postfix for \textit{fl1} and \textit{fl2}. Default values are $A$="" and $B$="". \textit{method} is method for merging . In the current version, there are four methods: \textit{method}="No" or "no" or "NO" or "N" or "n" means that  the data with unmatched \emph{SNP}s in \textit{file1} and \textit{file 2} are not saved in the merged file; \textit{method}="ALL" or "All" or "all" or "A" or "a" indicates that the data with all unmatched \emph{SNP}s in \textit{file 1} and \textit{file 2} are saved in the unpaired way in the merged data file; if \textit{method}="\textit{file1}", then those with unmatched \emph{SNP}s only from file1 are saved or if \textit{method}="\textit{file2}", \Rfunction{fmerg} will save the data with unmatched \emph{SNP}s only from \textit{file2}". Here is a simple example:
54 54
 
55
-<<fmerg,keep.source=TRUE, eval=FALSE>>=
55
+<<fmerge,keep.source=TRUE, eval=FALSE>>=
56 56
 data1 <- matrix(NA, 20, 4)
57 57
 data2 <- matrix(NA, 30, 7)
58 58
 SNPID1 <- paste("rs", seq(1:20), sep="")
... ...
@@ -68,15 +68,15 @@ colnames(data1) <- c("SNP", "var1", "var2", "var3", "var4")
68 68
 colnames(data2) <- c("SNP", "var1", "var2", "var3", "var4", "V1", "V2", "V3")
69 69
 data1<-DataFrame(data1)
70 70
 data2<-DataFrame(data2)
71
-data12 <- fmerg(fl1=data1, fl2=data2, ID1="SNP", ID2="SNP", A=".dat1", B=".dat2", method="No")
71
+data12 <- fmerge(fl1=data1, fl2=data2, ID1="SNP", ID2="SNP", A=".dat1", B=".dat2", method="No")
72 72
 @
73 73
 
74 74
 User can take the following approach to merge all four lipid files into a data sheet:
75
-\textit{LDL\underline{}HDL<-fmerg(fl1=LDL\underline{}file,fl2=HDL\underline{}file, ID1="SNP", ID2="SNP", A=".LDL", B=".HDL", method="No")}
75
+\textit{LDL\underline{}HDL<-fmerge(fl1=LDL\underline{}file,fl2=HDL\underline{}file, ID1="SNP", ID2="SNP", A=".LDL", B=".HDL", method="No")}
76 76
 
77
-\textit{TG\underline{}TC<-fmerg(fl1=TG\underline{}file,fl2=TC\underline{}file, ID1="SNP", ID2="SNP", A=".TG", B=".TC", method="No")}
77
+\textit{TG\underline{}TC<-fmerge(fl1=TG\underline{}file,fl2=TC\underline{}file, ID1="SNP", ID2="SNP", A=".TG", B=".TC", method="No")}
78 78
 
79
-\textit{lpd<-fmerg(fl1=LDL\underline{}HDL,fl2=TG\underline{}TC,ID1="SNP",ID2="SNP",A="",B="",method="No")}
79
+\textit{lpd<-fmerge(fl1=LDL\underline{}HDL,fl2=TG\underline{}TC,ID1="SNP",ID2="SNP",A="",B="",method="No")}
80 80
 
81 81
 \Robject{cad.data} was also a subset (1069 SNPs) of original GWAS meta-analyzed dataset that was downloaded from the website\footnote{\url{http://www.cardiogramplusc4d.org/downloads/} } and contains 2420360 \emph{SNP}s and 12 variables. 
82 82
 
... ...
@@ -86,15 +86,17 @@ User can take the following approach to merge all four lipid files into a data s
86 86
 
87 87
 \Robject{SNP368annot.data} is the data obtained from function analysis with \href{http://snp-nexus.org/index.html}{\emph{SNP} Annotation Tool} and provides example of performing function \Rfunction{ucscanno} to draw a \texttt{3D} pie and output the results of proportions of \emph{SNP}s coming from gene function various elements. 
88 88
   
89
- <<>>=
89
+<<>>=
90 90
 data(cad.data)
91
-cad <- DataFrame(cad.data)
91
+#cad <- DataFrame(cad.data)
92
+cad<-cad.data
92 93
 head(cad)
93 94
 @
94 95
 
95 96
 <<>>=
96 97
 data(lpd.data)
97
-lpd <- DataFrame(lpd.data)
98
+#lpd <- DataFrame(lpd.data)
99
+lpd<-lpd.data
98 100
 head(lpd)
99 101
 @
100 102
 
... ...
@@ -406,7 +408,7 @@ With SNP data \Robject{SNP358}, we can perform \emph{SNP} position annotation us
406 408
 \textit{maxd} is a numeric parameter that is maximum distance for truncating chromosome columns. If there are not big differences among 23 chromosomes, then \textit{maxd} can be set to be larger than 2000$kbp$. Its default is  2000$kbp$.
407 409
 
408 410
 <<fig=FALSE,keep.source=TRUE, label=ChromHistogram>>=
409
-snpposit(SNPdata=SNP358,SNP_hg19="chr",LG=10,main="A",maxd=2000)
411
+snpPositAnnot(SNPdata=SNP358,SNP_hg19="chr",main="A")
410 412
 @
411 413
 \begin{figure}[ht]
412 414
 \begin{center}
... ...
@@ -457,7 +459,7 @@ ucscannot(UCSCannot=SNP368,SNPn=368)
457 459
 
458 460
 <<label=figPIE3D1, fig=TRUE,echo=FALSE>>=
459 461
 <<PIE3D1>>
460
-@  v
462
+@  
461 463
 \caption{ Distribution of the selected \emph{SNP}s  in gene function elements.}
462 464
 \label{figure5}
463 465
 \end{center}