\name{mktable}
\alias{mktable}
\title{
Selection of SNPs and Creation of a Standard Table for Mendelian Randomization and Path Analyses
}
\description{
\code{mktable} selects SNPs using the thresholds \code{LG, Pv, Pc} and \code{Pd} and creates a standard \verb{SNP} beta table for \code{Mendelian randomization} and \code{path analysis}; see Details.
}
\usage{
mktable(cdata, ddata, rt, varname, LG, Pv, Pc, Pd)
}
\arguments{
  \item{cdata}{
causal variable \bold{GWAS} data or \bold{GWAS} meta-analysed data containing \verb{SNP} \code{ID}, \verb{SNP} position, chromosome, allele, allelic frequency, beta value, \code{sd}, sample size, etc.
}
  \item{ddata}{
disease \bold{GWAS} data or \bold{GWAS} meta-analysed data containing \verb{SNP} \code{ID}, \verb{SNP} position, chromosome, allele, allelic frequency, beta value, \code{sd}, sample size, etc.
}
  \item{rt}{
a string that specifies the type of table to return. It has two options: \code{rt="beta"} returns a beta table and \code{rt="path"} returns a \verb{SNP} direct path coefficient table. Default is \code{"beta"}.
}
  \item{varname}{
a required character vector that lists the disease name followed by the names of the undefined causal variables for Mendelian randomization and path analyses. The first name is the disease name. For example, \code{varname <- c("CAD","LDL","HDL","TG","TC")}.
}
  \item{LG}{
a numeric parameter. \code{LG} is the given minimum interval distance between \verb{SNP}s and is used to choose \verb{SNP}s. Default \code{LG=1}.
}
  \item{Pv}{
a numeric parameter. \code{Pv} is the given maximum p-value that is used to choose \verb{SNP}s. Default \code{Pv=5e-8}.
}
  \item{Pc}{
a numeric parameter. \code{Pc} is a given proportion of sample size to the maximum sample size in the causal variable data and is used to choose \verb{SNP}s. Default \code{Pc=0.979}.
}
  \item{Pd}{
a numeric parameter. \code{Pd} is a given proportion of sample size to the maximum sample size in the disease data and is used to choose \verb{SNP}s. Default \code{Pd=0.979}.
}
}
\details{
The standard \bold{GWAS} \code{cdata} set should have the following columns, in this order: \code{chrn}, \code{posit}, \code{rsid}, \code{a1.x1, a1.x2}, \dots, \code{a1.xn}, \code{freq.x1, freq.x2}, \dots, \code{freq.xn}, \code{beta.x1, beta.x2}, \dots, \code{beta.xn}, \code{sd.x1, sd.x2}, \dots, \code{sd.xn}, \code{pvj}, \code{N.x1, N.x2}, \dots, \code{N.xn}, \code{pcj}, where \code{x1, x2}, \dots, \code{xn} are causal variables. The standard \bold{GWAS} \code{ddata} set should have the columns \code{hg.d}, \code{SNP.d}, \code{a1.d}, \code{freq.d}, \code{beta.d}, \code{N.case}, \code{N.ctr}, \code{freq.case}. See example.
\describe{
\item{beta}{ is a numeric vector that is a column of beta values for regression of SNPs on variable vector \code{X={x1, x2, \dots, xn}}.
}	
\item{freq}{ is a numeric vector that is a column of frequencies of allele 1 with respect to variable vector \code{X={x1, x2, \dots, xn}}.
}
\item{sd}{is a numeric vector that is a column of standard deviations of variable \code{x1, x2}, \dots, \code{xn} specific to \verb{SNP}. Note that here \code{sd} is not the standard deviation of beta. If \code{sd} is not specific to \verb{SNP}s, then \code{sd.xi} has the same value for all SNPs in variable \code{i}.
	}
\item{d}{denotes disease.}
\item{N}{is sample size.}
\item{freq.case}{is frequency of disease.}
\item{chrn}{is a numeric vector for chromosome #.}
\item{posit}{is a numeric vector for \verb{SNP} positions on chromosome #. Sometimes, \verb{chrn} and \verb{posit} are combined into a single string vector: \code{hg19/hg18}.
	}
\item{pvj}{is the p-value for \verb{SNP} \code{j}; \code{pcj} and \code{pdj} are the proportions of the sample size for \verb{SNP} \code{j} to the maximum sample size in the causal variable data and in the disease data, respectively.
  }
 }
}
\value{
Returns a standard \verb{SNP} beta or \verb{SNP} path table containing \code{m} \verb{SNP}s chosen with \code{LG, Pv, Pc} and \code{Pd}, together with \code{n} variables and the disease, for \code{Mendelian randomization} and \code{path analysis}.
}
\references{
Do, R. et al. 2013. Common variants associated with plasma triglycerides and risk for coronary artery disease. \emph{Nat Genet} \bold{45}: 1345-1352.
\cr
Sheehan, N.A. et al. 2008. Mendelian randomisation and causal inference in observational epidemiology. \emph{PLoS Med} \bold{5}: e177.
\cr
Sheehan, N.A. et al. 2010. Mendelian randomisation: a tool for assessing causality in observational epidemiology. \emph{Methods Mol Biol} \bold{713}: 153-166.
\cr
Willer, C.J. Schmidt, E.M. Sengupta, S. Peloso, G.M. Gustafsson, S. Kanoni, S. Ganna, A. Chen, J. Buchkovich, M.L. Mora, S. et al. 2013. Discovery and refinement of loci associated with lipid levels. \emph{Nat Genet} \bold{45}: 1274-1283.
}
\author{
Yuan-De Tan
\email{tanyuande@gmail.com}
}
\note{
The order of column variables must be \verb{chrn}, \verb{posit}, \verb{rsid}, \code{a1.x1}, \dots, \code{a1.xn}, \code{freq.x1}, \dots, \code{freq.xn}, \code{beta.x1}, \dots, \code{beta.xn}, \code{sd.x1}, \dots, \code{sd.xn}, \dots; otherwise, \code{mktable} will produce an error. See example.
}

\seealso{
     \code{\link{path}}
}
\examples{
# Worked example: build the causal-variable table (cdata) from the lipid GWAS
# data and the disease table (ddata) from the CAD GWAS data, then call mktable.
data(lpd.data)
#lpd<-DataFrame(lpd.data)
lpd<-lpd.data
data(cad.data)
#cad<-DataFrame(cad.data)
cad<-cad.data
# step 1: calculate pvj
# pvj is the smallest of the four lipid p-values in each row (one row per SNP)
pvalue.LDL<-lpd$P.value.LDL
pvalue.HDL<-lpd$P.value.HDL
pvalue.TG<-lpd$P.value.TG
pvalue.TC<-lpd$P.value.TC
pv<-cbind(pvalue.LDL,pvalue.HDL,pvalue.TG,pvalue.TC)
pvj<-apply(pv,1,min)

# step 2: construct beta table of undefined causal variables:
# cbind names each column after the variable passed in (beta.LDL, beta.HDL, ...)
beta.LDL<-lpd$beta.LDL
beta.HDL<-lpd$beta.HDL
beta.TG<-lpd$beta.TG
beta.TC<-lpd$beta.TC
beta<-cbind(beta.LDL,beta.HDL,beta.TG,beta.TC)

# step 3: construct a matrix for allele 1 in each undefined causal variable:
a1.LDL<-lpd$A1.LDL
a1.HDL<-lpd$A1.HDL
a1.TG<-lpd$A1.TG
a1.TC<-lpd$A1.TC
alle1<-cbind(a1.LDL,a1.HDL,a1.TG,a1.TC)

# step 4: calculate sample sizes of causal variables and calculate pcj
# sm is the row-wise sum of the four sample sizes; pcj scales it by the largest sum
N.LDL<-lpd$N.LDL
N.HDL<-lpd$N.HDL
N.TG<-lpd$N.TG
N.TC<-lpd$N.TC
ss<-cbind(N.LDL,N.HDL,N.TG,N.TC)
sm<-apply(ss,1,sum)
pcj<-sm/max(sm)

# step 5: construct a matrix for frequency of allele 1 in each undefined causal
# variable in 1000G.EUR
freq.LDL<-lpd$Freq.A1.1000G.EUR.LDL
freq.HDL<-lpd$Freq.A1.1000G.EUR.HDL
freq.TG<-lpd$Freq.A1.1000G.EUR.TG
freq.TC<-lpd$Freq.A1.1000G.EUR.TC
freq<-cbind(freq.LDL,freq.HDL,freq.TG,freq.TC)

# step 6: construct matrix for sd of each causal variable (here sd is not specific to SNPj)
# the sd values were averaged over 63 studies, see reference Willer et al. (2013);
# each constant is repeated once per SNP so that every sd column has length(pvj) rows
sd.LDL<-rep(37.42,length(pvj))
sd.HDL<-rep(14.87,length(pvj))
sd.TG<-rep(92.73,length(pvj))
sd.TC<-rep(42.74,length(pvj))
sd<-cbind(sd.LDL,sd.HDL,sd.TG,sd.TC)

# step 7: retrieve SNP ID and position (taken from the HDL columns of lpd):
hg19<-lpd$SNP_hg19.HDL
rsid<-lpd$rsid.HDL

# step 8: invoke chrp to separate chromosome number and SNP position:
chr<-chrp(hg=hg19)

# step 9: get new data of causal variables:
# columns are bound in the order documented for cdata:
# chromosome and position, rsid, allele 1, freq, beta, sd, pvj, sample sizes, pcj
newdata<-cbind(freq,beta,sd,pvj,ss,pcj)
newdata<-cbind(chr,rsid,alle1,as.data.frame(newdata))
dim(newdata)
#[1] 120165     25

# step 10: retrieve cad data from cad and calculate pdj and frequency of cad in population
# NOTE(review): pvalue.d is extracted but not used below, and no pdj column is
# built in this example - confirm whether mktable derives pdj itself
hg18.d<-cad$chr_pos_b36
SNP.d<-cad$SNP # SNP ID
# alleles are lower-cased, presumably to match the allele coding in lpd - confirm
a1.d<-tolower(cad$reference_allele)
freq.d<-cad$ref_allele_frequency
pvalue.d<-cad$pvalue
beta.d<-cad$log_odds
N.case<-cad$N_case
N.ctr<-cad$N_control
# total sample size per SNP and the proportion of cases in it
N.d<-N.case+N.ctr
freq.case<-N.case/N.d


# step 11: get new cad data:
newcad<-cbind(freq.d,beta.d,N.case,N.ctr,freq.case)
newcad<-cbind(hg18.d,SNP.d,a1.d,as.data.frame(newcad))
dim(newcad)

# step 12: give variable list (disease name first, then the causal variables)
varname<-c("CAD","LDL","HDL","TG","TC")
# step 13: create beta table with function mktable
# Pv=0.00000005 is the same value as 5e-8
mybeta<-mktable(cdata=newdata,ddata=newcad,rt="beta",varname=varname,LG=1, Pv=0.00000005,
Pc=0.979,Pd=0.979)

# note: this reuses the name beta from step 2 for the selected-SNP beta columns
beta<-mybeta[,4:8] # save beta for path analysis
snp<-mybeta[,1:3] # save snp for annotation analysis
# NOTE(review): DataFrame is not defined in this example; presumably it comes
# from another attached package - confirm it is available when the example runs
beta<-DataFrame(beta)
}
\keyword{Selection of SNPs}
\keyword{Mendelian Randomization}