}
nrow(vst)
## remove outlier code
if (cutoff == 'yes'){
# hard cut off
outliers <- read.csv('~/Desktop/R4RA/outliers_trimmed_1.csv',sep=',')
}
outliers$ID <- as.character(outliers$ID)
test <- lapply(strsplit(outliers$ID, "-"), `[`, 2:5)
for (i in seq(1,length(test))){
x <- paste(test[[i]],collapse='.')
test[[i]] <- x
}
outliers$ID <- unlist(test)
outliers$ID <- as.character(outliers$ID)
outliers$ID <- gsub('\\.','-',outliers$ID)
vst <- vst[ , -which(names(vst) %in% outliers$ID)]
if (selectgenes == 'yes'){
df <- read.csv('~/Desktop/PEAC/8_modularframework/singlecelldata/SCreadyforannotation.csv')
#df <- subset(df, df$avg_logFC > 0)
vst <- subset(vst, row.names(vst) %in% df$gene)
nrow(vst)
}
## get meta data
meta <- subset(meta_backup, meta_backup$Visit == 3)
## selection of patient groups
if (drug == 'rituximab'){
meta <- subset(meta, meta$Randomised.medication == 'Rituximab')
}else if (drug == 'tocilizumab'){
meta <- subset(meta, meta$Randomised.medication == 'Tocilizumab')
}
meta$Current.Medication <- droplevels(meta$Current.Medication)
## further reformatting, select matching patients between RNA-seq and meta data
meta$Seq_ID <- meta$Seq_ID.V2
meta$Seq_ID <- as.character(meta$Seq_ID)
meta$Seq_ID <- gsub('\\.','-',meta$Seq_ID)
meta$Seq_ID <- paste(meta$Seq_ID,'-Baseline',sep='')
## reformatting
meta$DAS28.CRP.EULARresp.V7 <- as.character(meta$DAS28.CRP.EULARresp.V7)
meta$DAS28.CRP.EULARresp.V7[meta$DAS28.CRP.EULARresp.V7 == 'Good.Responder'] <- 'Good'
meta$DAS28.CRP.EULARresp.V7[meta$DAS28.CRP.EULARresp.V7 == 'Moderate.Responder'] <- 'Moderate_or_none'
meta$DAS28.CRP.EULARresp.V7[meta$DAS28.CRP.EULARresp.V7 == 'Non.Responder'] <- 'Moderate_or_none'
meta$CDAI.response.status.V7 <- as.character(meta$CDAI.response.status.V7)
meta$CDAI.50.improvement <- meta$CDAI.response.status.V7
### remove extra patients for tocil
if (drug == 'tocilizumab'){
meta <- meta[-which(meta$super_ID %in% c("CAGL.R4RA.1136.3","GUYS.R4RA.1092.3","NEWC.R4RA.1118.3",
"OXFO.R4RA.1200.3","QMUL.R4RA.0444.3","SEND.R4RA.0892.3",
"WHIP.R4RA.1183.3")),]
}
### tx only
## select matching patients and transpose dataframe
meta <- subset(meta, meta$Seq_ID %in% colnames(vst))
vst <- vst[,meta$Seq_ID]
tvst <- data.frame(t(vst))
## add response variable
if (variable == 'DAS28.CRP.EULARresp.V7'){
tvst$DAS28.CRP.EULARresp.V7 <- meta$DAS28.CRP.EULARresp.V7
}else if (variable == 'CDAI.50.improvement'){
tvst$CDAI.50.improvement <- meta$CDAI.50.improvement
}
library(MLmetrics)
f1 <- function(data, lev = NULL, model = NULL) {
f1_val <- F1_Score(y_pred = data$pred, y_true = data$obs, positive = lev[1])
c(F1 = f1_val)
}
ctrl <- trainControl(method = 'repeatedcv', repeats=50, number=5,
summaryFunction = f1,
classProbs=T,
savePredictions = T,
verboseIter = T)
fit1 <- train(x = tvst[,-ncol(tvst),drop=FALSE],
y = factor(tvst[[variable]]),
method="glmnet",
trControl=ctrl, metric = "F1",
tuneGrid = expand.grid(alpha = 0.6, # was 0.6
lambda = seq(0.001,0.1,by = 0.001)))
fit1
plot(fit1)
## 2. custom metric
library(MLmetrics)
f1 <- function(data, lev = NULL, model = NULL) {
f1_val <- F1_Score(y_pred = data$pred, y_true = data$obs, positive = lev[2])
c(F1 = f1_val)
}
ctrl <- trainControl(method = 'repeatedcv', repeats=50, number=5,
summaryFunction = f1,
classProbs=T,
savePredictions = T,
verboseIter = T)
fit1 <- train(x = tvst[,-ncol(tvst),drop=FALSE],
y = factor(tvst[[variable]]),
method="glmnet",
trControl=ctrl, metric = "F1",
tuneGrid = expand.grid(alpha = 0.6, # was 0.6
lambda = seq(0.001,0.1,by = 0.001)))
plot(fit1)
install.packages('kernlab')
source('~/Desktop/R4RA/SCRIPTS/4_Machine_learning/experiment_scripts/svmcustom.R')
source('~/Desktop/R4RA/SCRIPTS/4_Machine_learning/experiment_scripts/svmcustom.R')
source('~/Desktop/R4RA/SCRIPTS/4_Machine_learning/experiment_scripts/svmcustom.R')
ctrl <- trainControl(method = 'repeatedcv', repeats=50, number=5,
summaryFunction = f1,
classProbs=T,
savePredictions = T,
verboseIter = T)
fit1 <- train(x = tvst[,-ncol(tvst),drop=FALSE],
y = factor(tvst[[variable]]),
method = lpSVM,
trControl=ctrl, metric = "F1",
tuneGrid = expand.grid(alpha = 0.6, # was 0.6
lambda = seq(0.001,0.1,by = 0.001)))
source('~/Desktop/R4RA/SCRIPTS/4_Machine_learning/experiment_scripts/svmcustom.R')
ctrl <- trainControl(method = 'repeatedcv', repeats=50, number=5,
summaryFunction = f1,
classProbs=T,
savePredictions = T,
verboseIter = T)
fit1 <- train(x = tvst[,-ncol(tvst),drop=FALSE],
y = factor(tvst[[variable]]),
method = lpSVM,
trControl=ctrl, metric = "F1")
plot(fit1)
c<-c('a','b')
c[c=='a']
sum(c=='a')
c<-c('a','a','b')
sum(c=='a')
library(caret)
source('~/Dropbox/evalV4.R')
im <- twoClassSim(200, intercept = -20, linearVars = 20)
table(im$Class)
fitControl <- trainControl(
method = "cv",
summaryFunction=twoClassSummary,
classProbs=T,
savePredictions = T,
verboseIter = T)
orig_fit <- train(Class ~ ., data = im,
method = "glmnet",
metric = "ROC",
trControl = fitControl)
x <- eval(orig_fit)
xr <- x$probs$`Group 1`
View(im)
dev.off()
library(caret)
source('~/Dropbox/evalV4.R')
im <- twoClassSim(200, intercept = -20, linearVars = 20)
table(im$Class)
View(im)
library(caret)
df<-data.frame(c='a')
View(df)
im <- twoClassSim(200, intercept = -20, linearVars = 20)
table(im$Class)
View(im)
source('~/Desktop/evalV4.R')
View(df)
View(df)
source('~/Desktop/evalV4.R')
View(df)
theme
library(ggplot2)
source('~/Desktop/evalV4.R')
x <- eval(orig_fit)
View(im)
df<-c('a')
source('~/Dropbox/evalV3.R')
df<-c('a')
df<-data.frame(c='aaa')
View(df)
source('~/Dropbox/evalV2.R')
df<-data.frame(c='aaa')
View(df)
df<-data.frame(c='aaa')
source('~/.active-rstudio-document')
c
c()
View(df)
source('~/Dropbox/evalV4_super_temp.R')
View(df)
source('~/Dropbox/evalV4_super_temp.R')
source('~/Dropbox/evalV4_super_temp.R')
df<-data.frame(c='aaa')
View(df)
source('~/.active-rstudio-document')
df<-data.frame(c='aaa')
View(df)
c(df)
View(df)
View(df)
library(M3C)
library(edgeR)
library(limma)
library(ComplexHeatmap)
library(plot3D)
load("~/Desktop/R4RA/R4RA_V2_data_trimmed.RData")
source('~/Desktop/R4RA/SCRIPTS/reformat_IDs_functions.R')
### parameters
# features
modules <- FALSE
custom_list <- FALSE # set to FALSE for unsupervised clustering
protein <- FALSE # set to TRUE for unsupervised clustering
# outliers
cutoff <- 'hard' # hard, soft, none
# variability and expression strength filtering
filtering <- TRUE # set to TRUE for unsupervised clustering
cvx <- 0.075
# 0.075,0.125 with PAM, method 2, 1000 reps == good p values
### main script
## get the data out
#data <- data.frame(log2(cpm(txiready$rawcounts+2))) ################################## Important
data <- txiready$vst
colnames(data) <- gsub('\\.','-',colnames(data))
colnames(data) <- substring(colnames(data), 2)
## remove outliers
if (cutoff == 'hard'){
outliers <- read.csv('~/Desktop/R4RA/outliers_trimmed_1.csv',sep=',')
outliers$ID <- as.character(outliers$ID)
data <- data[ , -which(names(data) %in% outliers$ID)] # 12 initial outliers
}
#
data <- fix_colnames(data)
# get the metadata
meta <- read.csv('/home/christopher/Desktop/R4RA/meta_data_V13_imp.csv')
meta <- subset(meta, meta$Visit == 3)
completeFun <- function(data, desiredCols) {
completeVec <- complete.cases(data[, desiredCols])
return(data[completeVec, ])
}
#meta <- completeFun(meta, "CDAI.response.status")
if (drugf){
# select based on medication, levels: Rituximab Tocilizumab
meta <- subset(meta, meta$Randomised.medication == drug)
}
meta$Seq_ID.V2 <- as.character(meta$Seq_ID.V2)
meta$Seq_ID.V2 <- gsub('\\.','-',meta$Seq_ID.V2)
meta$Seq_ID.V2 <- paste(meta$Seq_ID.V2,'-Baseline',sep='')
setdiff(colnames(data),meta$Seq_ID.V2)
meta <- subset(meta, meta$Seq_ID.V2 %in% colnames(data))
data <- data[,meta$Seq_ID.V2]
View(df)
# optionally just select the protein coding genes
if (protein){
mappings <- read.csv('~/Desktop/R4RA/gencode.v29.mappings.tsv',sep='\t',header=FALSE)
protein_coding <- subset(mappings, mappings$V3 == 'protein_coding')
protein_coding$V3 <- as.character(protein_coding$V3)
protein_coding$V2 <- as.character(protein_coding$V2)
row.names(protein_coding) <- NULL
data <- subset(data, row.names(data) %in% protein_coding$V2)
}
data_backup <- data # save for boxplots
## coefficient of variation
## second order coefficient of variation
mfilter <- 'cv'
if (filtering){
# filter for just the strongest expressed genes
tokeep <- row.names(txiraw$txistd$counts[ rowSums((cpm(txiraw$txistd$counts))>1)>=10, ])
data2 <- subset(data, row.names(data) %in% tokeep)
# variable gene filter
if (mfilter == 'cv'){
# co efficient of variation
CV <- apply(data2,1,sd)/(rowMeans(data2))
names <- names(CV)[CV>cvx]
}else if (mfilter == 'a2'){
CV <- apply(data2,1,sd)/(rowMeans(data2))
CV2 <- CV^2
A2 <- sqrt(CV2/(CV2+1))
names <- names(A2)[A2>0.8]
}else if (mfilter == 'var'){
xxx <- featurefilter(data2,method='var',percentile=40)
names <- row.names(xxx$filtered_data)
}
data3 <- subset(data2, row.names(data2) %in% names)
nrow(data3)
}else{
data3 <- data
}
#colnames(meta)[43] <- 'ID'
colnames(meta)[colnames(meta)=='Seq_ID.V2'] <- 'ID'
###
data3 <- t(scale(t(data3)))
### clustering
#source('~/Desktop/scripts/AEC_early.R')
#source('~/Desktop/scripts/AECv11_early.R')
set.seed(123)
library(DESeq2)
library(dplyr)
library(qusage)
library(edgeR)
source('~/Desktop/R4RA/SCRIPTS/reformat_IDs_functions.R')
load("/home/christopher/Desktop/R4RA/R4RA_V2_data_trimmed.RData")
# DAS28.CRP.EULARresp.V7.mod, DAS28.CRP.EULARresp.V7, Target.CDAI.response.V7, CDAI.response.status.V7
variable <- 'DAS28.CRP.EULARresp.V7.mod'
drug <- 'Tocilizumab' # Rituximab, Tocilizumab
protein_only <- FALSE
## get data out and remove outliers
counts <- txiready$lengthscaledtpmcounts # length scaled TPMs should be used for limma
counts <- data.frame(counts)
### to remove outliers
# put before fix_colnames
# 234-40=194
colnames(counts) <- gsub('\\.','-',colnames(counts))
colnames(counts) <- substring(colnames(counts), 2)
outliers <- read.csv('~/Desktop/R4RA/outliers_trimmed_1.csv',sep=',')
outliers$ID <- as.character(outliers$ID)
counts <- counts[ , -which(names(counts) %in% outliers$ID)] # 12 initial outliers
# end
#
counts <- fix_colnames(counts)
## always filter to remove low count features when running limma
tokeep <- row.names(txiraw$txistd$counts[ rowSums((cpm(txiraw$txistd$counts))>1)>=10, ])
counts <- subset(counts, row.names(counts) %in% tokeep)
## optionally select protein coding transcripts only
if (protein_only){
mappings <- read.csv('~/Desktop/R4RA/gencode.v29.mappings.tsv',sep='\t',header=FALSE)
protein_coding <- subset(mappings, mappings$V3 == 'protein_coding')
protein_coding$V3 <- as.character(protein_coding$V3)
protein_coding$V2 <- as.character(protein_coding$V2)
row.names(protein_coding) <- NULL
counts <- subset(counts, row.names(counts) %in% protein_coding$V2)
}
## get meta data and select visit 3 (baseline)
meta <- read.csv('/home/christopher/Desktop/R4RA/meta_data_V14_imp.csv')
# select visit 3, reformat, select rituximab patients
meta <- subset(meta, meta$Visit == 3)
completeFun <- function(data, desiredCols) {
completeVec <- complete.cases(data[, desiredCols])
return(data[completeVec, ])
}
# select variable to test for
if (variable == 'CDAI.response.status.V7'){
meta$CDAI.response.status.V7 <- as.character(meta$CDAI.response.status.V7)
meta <- completeFun(meta, "CDAI.response.status.V7")
}else if (variable == 'Target.CDAI.response.V7'){
meta$Target.CDAI.response.V7 <- as.character(meta$Target.CDAI.response.V7)
meta <- completeFun(meta, "Target.CDAI.response.V7")
}else if (variable == 'DAS28.CRP.EULARresp.V7'){
meta$DAS28.CRP.EULARresp.V7 <- as.character(meta$DAS28.CRP.EULARresp.V7)
#meta <- meta[!meta$DAS28.CRP.EULARresp.V7 == 'Moderate.Responder', ] # need to run this for binary status
meta <- meta[complete.cases(meta$DAS28.CRP.EULARresp.V7),]
}else if (variable == 'DAS28.CRP.EULARresp.V7.mod'){
meta$DAS28.CRP.EULARresp.V7 <- as.character(meta$DAS28.CRP.EULARresp.V7)
meta$DAS28.CRP.EULARresp.V7[meta$DAS28.CRP.EULARresp.V7 == 'Non.Responder'] <- 'Moderate_or_none'
meta$DAS28.CRP.EULARresp.V7[meta$DAS28.CRP.EULARresp.V7 == 'Moderate.Responder'] <- 'Moderate_or_none'
}
# select drug
if (drug == 'Rituximab'){
meta <- subset(meta, meta$Randomised.medication == 'Rituximab')
}else{
meta <- subset(meta, meta$Randomised.medication == 'Tocilizumab')
}
# further reformatting, select matching patients between RNA-seq and meta data
meta$Seq_ID.V2 <- as.character(meta$Seq_ID.V2)
meta$Seq_ID.V2 <- gsub('\\.','-',meta$Seq_ID.V2)
meta$Seq_ID.V2 <- paste(meta$Seq_ID.V2,'-Baseline',sep='')
meta <- subset(meta, meta$Seq_ID.V2 %in% colnames(counts))
countsn <- counts[,meta$Seq_ID.V2]
## get N
# select variable to test for
if (variable == 'CDAI.response.status.V7'){
table(meta$CDAI.response.status.V7)
}else if (variable == 'Target.CDAI.response.V7'){
table(meta$Target.CDAI.response.V7)
}else if (variable == 'DAS28.CRP.EULARresp.V7' | variable == 'DAS28.CRP.EULARresp.V7.mod'){
table(meta$DAS28.CRP.EULARresp.V7)
}
## run LIMMA
# countn = length scale tpm counts
# do contrast matrix for clusters
if (variable == 'CDAI.response.status.V7'){
design <- model.matrix(~0+meta$CDAI.response.status.V7+meta$Age+meta$Gender+meta$Ethnicity)
}else if (variable == 'DAS28.CRP.EULARresp.V7' | variable == 'DAS28.CRP.EULARresp.V7.mod'){
design <- model.matrix(~0+meta$DAS28.CRP.EULARresp.V7+meta$Age+meta$Gender+meta$Ethnicity)
}else if (variable == 'Target.CDAI.response.V7'){
design <- model.matrix(~0+meta$Target.CDAI.response.V7+meta$Age+meta$Gender+meta$Ethnicity)
}
#
if (variable == 'CDAI.response.status.V7' | variable == 'Target.CDAI.response.V7'){
colnames(design) <- c('Non.Responder','Responder','Age','Gender','Ethnicity_asian','Ethnicity_caucasian',
'Ethnicity_other')
}else if (variable == 'DAS28.CRP.EULARresp.V7'){
colnames(design) <- c('Good', 'Moderate', 'None','Age','Gender','Ethnicity_asian','Ethnicity_caucasian',
'Ethnicity_other')
}else if (variable == 'DAS28.CRP.EULARresp.V7.mod'){
colnames(design) <- c('Good', 'Moderate_or_none','Age','Gender','Ethnicity_asian','Ethnicity_caucasian',
'Ethnicity_other')
}
#
if (variable == 'CDAI.response.status.V7' | variable == 'Target.CDAI.response.V7'){
contrast.matrix <- makeContrasts('Responder-Non.Responder', levels = design)
}else if (variable == 'DAS28.CRP.EULARresp.V7'){
contrast.matrix <- makeContrasts('Good-None', levels = design)
}else if (variable == 'DAS28.CRP.EULARresp.V7.mod'){
contrast.matrix <- makeContrasts('Good-Moderate_or_none', levels = design)
}
## filter data and normalise
dge <- DGEList(counts=countsn)
keep <- filterByExpr(dge, design)
dge <- dge[keep,,keep.lib.sizes=FALSE]
dge <- calcNormFactors(dge)
# dge$samples$norm.factors ## normalisation factors
# barplot(colSums(txiraw$txistd$counts)) ## examine sequencing depth
## voom procedure
v <- voom(dge, design, plot=TRUE)
fit <- lmFit(v,design)
fit <- contrasts.fit(fit, contrast.matrix)
fit <- eBayes(fit)
top2 <- topTable(fit,coef=1,number=Inf,sort.by="P")
qs <- qvalue::qvalue(p = top2$P.Value)
top2$q.Val <- qs$qvalues
nrow(subset(top2, top2$q.Val < 0.05))
sig <- subset(top2, top2$q.Val < 0.05)
## optionally write at this point
#write.csv(top2, file='EULAR3.mod.tocil.limma.degs.csv')
## limma-trend
# logCPM <- cpm(dge, log=TRUE, prior.count=3)
# fit <- lmFit(logCPM, design)
# fit <- contrasts.fit(fit, contrast.matrix)
# fit <- eBayes(fit, trend=TRUE)
# top3 <- topTable(fit, coef=1,number=Inf,sort.by="P")
# qs <- qvalue::qvalue(p = top3$P.Value)
# top3$q.Val <- qs$qvalues
## quantile
# v <- voom(countsn, design, plot=TRUE, normalize="quantile")
# fit <- lmFit(v,design)
# fit <- contrasts.fit(fit, contrast.matrix)
# fit <- eBayes(fit)
# top4 <- topTable(fit,coef=1,number=Inf,sort.by="P")
# qs <- qvalue::qvalue(p = top2$P.Value)
# top2$q.Val <- qs$qvalues
### limma gene set analysis
# wgcna
# do fry analysis
source('~/Desktop/scripts/parsemodulesforcameraV3.R')
c2.indices <- loadmodules('wgcna')
fryresults <- fry(v,c2.indices,design,contrast=contrast.matrix[,1])
# do gene set direction analysis
c2.indices2 <- loadmodules('wgcna',keepsymbols = TRUE)
source('~/Desktop/scripts/calculategenesetmeanfoldchangeV2.R')
fryresults <- calculategenesetmeanfoldchange(top2,fryresults,c2.indices2)
# calculate q values
qs <- qvalue::qvalue(p = fryresults$PValue)
fryresults$q.Val <- qs$qvalues
### select modules for plotting
masterresults1 <- fryresults
masterresults1 <- masterresults1[order(masterresults1[,3]),]
# change name of long module
#masterresults1$genesetID[masterresults1$genesetID=='S220 MAPK activation, toll like receptor (TLR) signalling'] <- 'S220 MAPK activation'
View(masterresults1)
View(meta)
source('~/.active-rstudio-document')
View(meta)
source('~/.active-rstudio-document')
d<-data.frame(a='aaaa')
View(d)
source('~/.active-rstudio-document')
View(d)
source('~/Dropbox/evalV4_super_temp.R')
View(d)
library(MLeval)
fit1
im <- twoClassSim(2000, intercept = -25, linearVars = 20)
table(im$Class)
fitControl <- trainControl(
method = "cv",
summaryFunction=twoClassSummary,
classProbs=T,
savePredictions = T,
verboseIter = F)
im_fit <- train(Class ~ ., data = im,
method = "ranger",
metric = "ROC",
trControl = fitControl)
library(caret)
im <- twoClassSim(2000, intercept = -25, linearVars = 20)
table(im$Class)
fitControl <- trainControl(
method = "cv",
summaryFunction=twoClassSummary,
classProbs=T,
savePredictions = T,
verboseIter = F)
im_fit <- train(Class ~ ., data = im,
method = "ranger",
metric = "ROC",
trControl = fitControl)
?trainControl
?train
fitControl <- trainControl(
method = "cv",
summaryFunction=prSummary,
classProbs=T,
savePredictions = T,
verboseIter = F)
im_fit <- train(Class ~ ., data = im,
method = "ranger",
metric = "AUC",
trControl = fitControl)
im_fit
x <- evalm(im_fit,rlinethick=0.8,fsize=8,plots=c())
dev.off()
x <- evalm(im_fit,rlinethick=0.8,fsize=8,plots=c())
library(MLeval)
x <- evalm(im_fit,rlinethick=0.8,fsize=8,plots=c())
x$optres
setwd("~/M3C")
devtools::check()
setwd("~/M3C")
devtools::install()
devtools::check(document = FALSE)