} nrow(vst) ## remove outlier code if (cutoff == 'yes'){ # hard cut off outliers <- read.csv('~/Desktop/R4RA/outliers_trimmed_1.csv',sep=',') } outliers$ID <- as.character(outliers$ID) test <- lapply(strsplit(outliers$ID, "-"), `[`, 2:5) for (i in seq(1,length(test))){ x <- paste(test[[i]],collapse='.') test[[i]] <- x } outliers$ID <- unlist(test) outliers$ID <- as.character(outliers$ID) outliers$ID <- gsub('\\.','-',outliers$ID) vst <- vst[ , -which(names(vst) %in% outliers$ID)] if (selectgenes == 'yes'){ df <- read.csv('~/Desktop/PEAC/8_modularframework/singlecelldata/SCreadyforannotation.csv') #df <- subset(df, df$avg_logFC > 0) vst <- subset(vst, row.names(vst) %in% df$gene) nrow(vst) } ## get meta data meta <- subset(meta_backup, meta_backup$Visit == 3) ## selection of patient groups if (drug == 'rituximab'){ meta <- subset(meta, meta$Randomised.medication == 'Rituximab') }else if (drug == 'tocilizumab'){ meta <- subset(meta, meta$Randomised.medication == 'Tocilizumab') } meta$Current.Medication <- droplevels(meta$Current.Medication) ## further reformatting, select matching patients between RNA-seq and meta data meta$Seq_ID <- meta$Seq_ID.V2 meta$Seq_ID <- as.character(meta$Seq_ID) meta$Seq_ID <- gsub('\\.','-',meta$Seq_ID) meta$Seq_ID <- paste(meta$Seq_ID,'-Baseline',sep='') ## reformatting meta$DAS28.CRP.EULARresp.V7 <- as.character(meta$DAS28.CRP.EULARresp.V7) meta$DAS28.CRP.EULARresp.V7[meta$DAS28.CRP.EULARresp.V7 == 'Good.Responder'] <- 'Good' meta$DAS28.CRP.EULARresp.V7[meta$DAS28.CRP.EULARresp.V7 == 'Moderate.Responder'] <- 'Moderate_or_none' meta$DAS28.CRP.EULARresp.V7[meta$DAS28.CRP.EULARresp.V7 == 'Non.Responder'] <- 'Moderate_or_none' meta$CDAI.response.status.V7 <- as.character(meta$CDAI.response.status.V7) meta$CDAI.50.improvement <- meta$CDAI.response.status.V7 ### remove extra patients for tocil if (drug == 'tocilizumab'){ meta <- meta[-which(meta$super_ID %in% c("CAGL.R4RA.1136.3","GUYS.R4RA.1092.3","NEWC.R4RA.1118.3", "OXFO.R4RA.1200.3","QMUL.R4RA.0444.3","SEND.R4RA.0892.3", "WHIP.R4RA.1183.3")),] } ### tx only ## select matching patients and transpose dataframe meta <- subset(meta, meta$Seq_ID %in% colnames(vst)) vst <- vst[,meta$Seq_ID] tvst <- data.frame(t(vst)) ## add response variable if (variable == 'DAS28.CRP.EULARresp.V7'){ tvst$DAS28.CRP.EULARresp.V7 <- meta$DAS28.CRP.EULARresp.V7 }else if (variable == 'CDAI.50.improvement'){ tvst$CDAI.50.improvement <- meta$CDAI.50.improvement } library(MLmetrics) f1 <- function(data, lev = NULL, model = NULL) { f1_val <- F1_Score(y_pred = data$pred, y_true = data$obs, positive = lev[1]) c(F1 = f1_val) } ctrl <- trainControl(method = 'repeatedcv', repeats=50, number=5, summaryFunction = f1, classProbs=T, savePredictions = T, verboseIter = T) fit1 <- train(x = tvst[,-ncol(tvst),drop=FALSE], y = factor(tvst[[variable]]), method="glmnet", trControl=ctrl, metric = "F1", tuneGrid = expand.grid(alpha = 0.6, # was 0.6 lambda = seq(0.001,0.1,by = 0.001))) fit1 plot(fit1) ## 2. custom metric library(MLmetrics) f1 <- function(data, lev = NULL, model = NULL) { f1_val <- F1_Score(y_pred = data$pred, y_true = data$obs, positive = lev[2]) c(F1 = f1_val) } ctrl <- trainControl(method = 'repeatedcv', repeats=50, number=5, summaryFunction = f1, classProbs=T, savePredictions = T, verboseIter = T) fit1 <- train(x = tvst[,-ncol(tvst),drop=FALSE], y = factor(tvst[[variable]]), method="glmnet", trControl=ctrl, metric = "F1", tuneGrid = expand.grid(alpha = 0.6, # was 0.6 lambda = seq(0.001,0.1,by = 0.001))) plot(fit1) install.packages('kernlab') source('~/Desktop/R4RA/SCRIPTS/4_Machine_learning/experiment_scripts/svmcustom.R') source('~/Desktop/R4RA/SCRIPTS/4_Machine_learning/experiment_scripts/svmcustom.R') source('~/Desktop/R4RA/SCRIPTS/4_Machine_learning/experiment_scripts/svmcustom.R') ctrl <- trainControl(method = 'repeatedcv', repeats=50, number=5, summaryFunction = f1, classProbs=T, savePredictions = T, verboseIter = T) fit1 <- train(x = tvst[,-ncol(tvst),drop=FALSE], y = factor(tvst[[variable]]), method = lpSVM, trControl=ctrl, metric = "F1", tuneGrid = expand.grid(alpha = 0.6, # was 0.6 lambda = seq(0.001,0.1,by = 0.001))) source('~/Desktop/R4RA/SCRIPTS/4_Machine_learning/experiment_scripts/svmcustom.R') ctrl <- trainControl(method = 'repeatedcv', repeats=50, number=5, summaryFunction = f1, classProbs=T, savePredictions = T, verboseIter = T) fit1 <- train(x = tvst[,-ncol(tvst),drop=FALSE], y = factor(tvst[[variable]]), method = lpSVM, trControl=ctrl, metric = "F1") plot(fit1) c<-c('a','b') c[c=='a'] sum(c=='a') c<-c('a','a','b') sum(c=='a') library(caret) source('~/Dropbox/evalV4.R') im <- twoClassSim(200, intercept = -20, linearVars = 20) table(im$Class) fitControl <- trainControl( method = "cv", summaryFunction=twoClassSummary, classProbs=T, savePredictions = T, verboseIter = T) orig_fit <- train(Class ~ ., data = im, method = "glmnet", metric = "ROC", trControl = fitControl) x <- eval(orig_fit) xr <- x$probs$`Group 1` View(im) dev.off() library(caret) source('~/Dropbox/evalV4.R') im <- twoClassSim(200, intercept = -20, linearVars = 20) table(im$Class) View(im) library(caret) df<-data.frame(c='a') View(df) im <- twoClassSim(200, intercept = -20, linearVars = 20) table(im$Class) View(im) source('~/Desktop/evalV4.R') View(df) View(df) source('~/Desktop/evalV4.R') View(df) theme library(ggplot2) source('~/Desktop/evalV4.R') x <- eval(orig_fit) View(im) df<-c('a') source('~/Dropbox/evalV3.R') df<-c('a') df<-data.frame(c='aaa') View(df) source('~/Dropbox/evalV2.R') df<-data.frame(c='aaa') View(df) df<-data.frame(c='aaa') source('~/.active-rstudio-document') c c() View(df) source('~/Dropbox/evalV4_super_temp.R') View(df) source('~/Dropbox/evalV4_super_temp.R') source('~/Dropbox/evalV4_super_temp.R') df<-data.frame(c='aaa') View(df) source('~/.active-rstudio-document') df<-data.frame(c='aaa') View(df) c(df) View(df) View(df) library(M3C) library(edgeR) library(limma) library(ComplexHeatmap) library(plot3D) load("~/Desktop/R4RA/R4RA_V2_data_trimmed.RData") source('~/Desktop/R4RA/SCRIPTS/reformat_IDs_functions.R') ### parameters # features modules <- FALSE custom_list <- FALSE # set to FALSE for unsupervised clustering protein <- FALSE # set to TRUE for unsupervised clustering # outliers cutoff <- 'hard' # hard, soft, none # variability and expression strength filtering filtering <- TRUE # set to TRUE for unsupervised clustering cvx <- 0.075 # 0.075,0.125 with PAM, method 2, 1000 reps == good p values ### main script ## get the data out #data <- data.frame(log2(cpm(txiready$rawcounts+2))) ################################## Important data <- txiready$vst colnames(data) <- gsub('\\.','-',colnames(data)) colnames(data) <- substring(colnames(data), 2) ## remove outliers if (cutoff == 'hard'){ outliers <- read.csv('~/Desktop/R4RA/outliers_trimmed_1.csv',sep=',') outliers$ID <- as.character(outliers$ID) data <- data[ , -which(names(data) %in% outliers$ID)] # 12 initial outliers } # data <- fix_colnames(data) # get the metadata meta <- read.csv('/home/christopher/Desktop/R4RA/meta_data_V13_imp.csv') meta <- subset(meta, meta$Visit == 3) completeFun <- function(data, desiredCols) { completeVec <- complete.cases(data[, desiredCols]) return(data[completeVec, ]) } #meta <- completeFun(meta, "CDAI.response.status") if (drugf){ # select based on medication, levels: Rituximab Tocilizumab meta <- subset(meta, meta$Randomised.medication == drug) } meta$Seq_ID.V2 <- as.character(meta$Seq_ID.V2) meta$Seq_ID.V2 <- gsub('\\.','-',meta$Seq_ID.V2) meta$Seq_ID.V2 <- paste(meta$Seq_ID.V2,'-Baseline',sep='') setdiff(colnames(data),meta$Seq_ID.V2) meta <- subset(meta, meta$Seq_ID.V2 %in% colnames(data)) data <- data[,meta$Seq_ID.V2] View(df) # optionally just select the protein coding genes if (protein){ mappings <- read.csv('~/Desktop/R4RA/gencode.v29.mappings.tsv',sep='\t',header=FALSE) protein_coding <- subset(mappings, mappings$V3 == 'protein_coding') protein_coding$V3 <- as.character(protein_coding$V3) protein_coding$V2 <- as.character(protein_coding$V2) row.names(protein_coding) <- NULL data <- subset(data, row.names(data) %in% protein_coding$V2) } data_backup <- data # save for boxplots ## coefficient of variation ## second order coefficient of variation mfilter <- 'cv' if (filtering){ # filter for just the strongest expressed genes tokeep <- row.names(txiraw$txistd$counts[ rowSums((cpm(txiraw$txistd$counts))>1)>=10, ]) data2 <- subset(data, row.names(data) %in% tokeep) # variable gene filter if (mfilter == 'cv'){ # co efficient of variation CV <- apply(data2,1,sd)/(rowMeans(data2)) names <- names(CV)[CV>cvx] }else if (mfilter == 'a2'){ CV <- apply(data2,1,sd)/(rowMeans(data2)) CV2 <- CV^2 A2 <- sqrt(CV2/(CV2+1)) names <- names(A2)[A2>0.8] }else if (mfilter == 'var'){ xxx <- featurefilter(data2,method='var',percentile=40) names <- row.names(xxx$filtered_data) } data3 <- subset(data2, row.names(data2) %in% names) nrow(data3) }else{ data3 <- data } #colnames(meta)[43] <- 'ID' colnames(meta)[colnames(meta)=='Seq_ID.V2'] <- 'ID' ### data3 <- t(scale(t(data3))) ### clustering #source('~/Desktop/scripts/AEC_early.R') #source('~/Desktop/scripts/AECv11_early.R') set.seed(123) library(DESeq2) library(dplyr) library(qusage) library(edgeR) source('~/Desktop/R4RA/SCRIPTS/reformat_IDs_functions.R') load("/home/christopher/Desktop/R4RA/R4RA_V2_data_trimmed.RData") # DAS28.CRP.EULARresp.V7.mod, DAS28.CRP.EULARresp.V7, Target.CDAI.response.V7, CDAI.response.status.V7 variable <- 'DAS28.CRP.EULARresp.V7.mod' drug <- 'Tocilizumab' # Rituximab, Tocilizumab protein_only <- FALSE ## get data out and remove outliers counts <- txiready$lengthscaledtpmcounts # length scaled TPMs should be used for limma counts <- data.frame(counts) ### to remove outliers # put before fix_colnames # 234-40=194 colnames(counts) <- gsub('\\.','-',colnames(counts)) colnames(counts) <- substring(colnames(counts), 2) outliers <- read.csv('~/Desktop/R4RA/outliers_trimmed_1.csv',sep=',') outliers$ID <- as.character(outliers$ID) counts <- counts[ , -which(names(counts) %in% outliers$ID)] # 12 initial outliers # end # counts <- fix_colnames(counts) ## always filter to remove low count features when running limma tokeep <- row.names(txiraw$txistd$counts[ rowSums((cpm(txiraw$txistd$counts))>1)>=10, ]) counts <- subset(counts, row.names(counts) %in% tokeep) ## optionally select protein coding transcripts only if (protein_only){ mappings <- read.csv('~/Desktop/R4RA/gencode.v29.mappings.tsv',sep='\t',header=FALSE) protein_coding <- subset(mappings, mappings$V3 == 'protein_coding') protein_coding$V3 <- as.character(protein_coding$V3) protein_coding$V2 <- as.character(protein_coding$V2) row.names(protein_coding) <- NULL counts <- subset(counts, row.names(counts) %in% protein_coding$V2) } ## get meta data and select visit 3 (baseline) meta <- read.csv('/home/christopher/Desktop/R4RA/meta_data_V14_imp.csv') # select visit 3, reformat, select rituximab patients meta <- subset(meta, meta$Visit == 3) completeFun <- function(data, desiredCols) { completeVec <- complete.cases(data[, desiredCols]) return(data[completeVec, ]) } # select variable to test for if (variable == 'CDAI.response.status.V7'){ meta$CDAI.response.status.V7 <- as.character(meta$CDAI.response.status.V7) meta <- completeFun(meta, "CDAI.response.status.V7") }else if (variable == 'Target.CDAI.response.V7'){ meta$Target.CDAI.response.V7 <- as.character(meta$Target.CDAI.response.V7) meta <- completeFun(meta, "Target.CDAI.response.V7") }else if (variable == 'DAS28.CRP.EULARresp.V7'){ meta$DAS28.CRP.EULARresp.V7 <- as.character(meta$DAS28.CRP.EULARresp.V7) #meta <- meta[!meta$DAS28.CRP.EULARresp.V7 == 'Moderate.Responder', ] # need to run this for binary status meta <- meta[complete.cases(meta$DAS28.CRP.EULARresp.V7),] }else if (variable == 'DAS28.CRP.EULARresp.V7.mod'){ meta$DAS28.CRP.EULARresp.V7 <- as.character(meta$DAS28.CRP.EULARresp.V7) meta$DAS28.CRP.EULARresp.V7[meta$DAS28.CRP.EULARresp.V7 == 'Non.Responder'] <- 'Moderate_or_none' meta$DAS28.CRP.EULARresp.V7[meta$DAS28.CRP.EULARresp.V7 == 'Moderate.Responder'] <- 'Moderate_or_none' } # select drug if (drug == 'Rituximab'){ meta <- subset(meta, meta$Randomised.medication == 'Rituximab') }else{ meta <- subset(meta, meta$Randomised.medication == 'Tocilizumab') } # further reformatting, select matching patients between RNA-seq and meta data meta$Seq_ID.V2 <- as.character(meta$Seq_ID.V2) meta$Seq_ID.V2 <- gsub('\\.','-',meta$Seq_ID.V2) meta$Seq_ID.V2 <- paste(meta$Seq_ID.V2,'-Baseline',sep='') meta <- subset(meta, meta$Seq_ID.V2 %in% colnames(counts)) countsn <- counts[,meta$Seq_ID.V2] ## get N # select variable to test for if (variable == 'CDAI.response.status.V7'){ table(meta$CDAI.response.status.V7) }else if (variable == 'Target.CDAI.response.V7'){ table(meta$Target.CDAI.response.V7) }else if (variable == 'DAS28.CRP.EULARresp.V7' | variable == 'DAS28.CRP.EULARresp.V7.mod'){ table(meta$DAS28.CRP.EULARresp.V7) } ## run LIMMA # countn = length scale tpm counts # do contrast matrix for clusters if (variable == 'CDAI.response.status.V7'){ design <- model.matrix(~0+meta$CDAI.response.status.V7+meta$Age+meta$Gender+meta$Ethnicity) }else if (variable == 'DAS28.CRP.EULARresp.V7' | variable == 'DAS28.CRP.EULARresp.V7.mod'){ design <- model.matrix(~0+meta$DAS28.CRP.EULARresp.V7+meta$Age+meta$Gender+meta$Ethnicity) }else if (variable == 'Target.CDAI.response.V7'){ design <- model.matrix(~0+meta$Target.CDAI.response.V7+meta$Age+meta$Gender+meta$Ethnicity) } # if (variable == 'CDAI.response.status.V7' | variable == 'Target.CDAI.response.V7'){ colnames(design) <- c('Non.Responder','Responder','Age','Gender','Ethnicity_asian','Ethnicity_caucasian', 'Ethnicity_other') }else if (variable == 'DAS28.CRP.EULARresp.V7'){ colnames(design) <- c('Good', 'Moderate', 'None','Age','Gender','Ethnicity_asian','Ethnicity_caucasian', 'Ethnicity_other') }else if (variable == 'DAS28.CRP.EULARresp.V7.mod'){ colnames(design) <- c('Good', 'Moderate_or_none','Age','Gender','Ethnicity_asian','Ethnicity_caucasian', 'Ethnicity_other') } # if (variable == 'CDAI.response.status.V7' | variable == 'Target.CDAI.response.V7'){ contrast.matrix <- makeContrasts('Responder-Non.Responder', levels = design) }else if (variable == 'DAS28.CRP.EULARresp.V7'){ contrast.matrix <- makeContrasts('Good-None', levels = design) }else if (variable == 'DAS28.CRP.EULARresp.V7.mod'){ contrast.matrix <- makeContrasts('Good-Moderate_or_none', levels = design) } ## filter data and normalise dge <- DGEList(counts=countsn) keep <- filterByExpr(dge, design) dge <- dge[keep,,keep.lib.sizes=FALSE] dge <- calcNormFactors(dge) # dge$samples$norm.factors ## normalisation factors # barplot(colSums(txiraw$txistd$counts)) ## examine sequencing depth ## voom procedure v <- voom(dge, design, plot=TRUE) fit <- lmFit(v,design) fit <- contrasts.fit(fit, contrast.matrix) fit <- eBayes(fit) top2 <- topTable(fit,coef=1,number=Inf,sort.by="P") qs <- qvalue::qvalue(p = top2$P.Value) top2$q.Val <- qs$qvalues nrow(subset(top2, top2$q.Val < 0.05)) sig <- subset(top2, top2$q.Val < 0.05) ## optionally write at this point #write.csv(top2, file='EULAR3.mod.tocil.limma.degs.csv') ## limma-trend # logCPM <- cpm(dge, log=TRUE, prior.count=3) # fit <- lmFit(logCPM, design) # fit <- contrasts.fit(fit, contrast.matrix) # fit <- eBayes(fit, trend=TRUE) # top3 <- topTable(fit, coef=1,number=Inf,sort.by="P") # qs <- qvalue::qvalue(p = top3$P.Value) # top3$q.Val <- qs$qvalues ## quantile # v <- voom(countsn, design, plot=TRUE, normalize="quantile") # fit <- lmFit(v,design) # fit <- contrasts.fit(fit, contrast.matrix) # fit <- eBayes(fit) # top4 <- topTable(fit,coef=1,number=Inf,sort.by="P") # qs <- qvalue::qvalue(p = top2$P.Value) # top2$q.Val <- qs$qvalues ### limma gene set analysis # wgcna # do fry analysis source('~/Desktop/scripts/parsemodulesforcameraV3.R') c2.indices <- loadmodules('wgcna') fryresults <- fry(v,c2.indices,design,contrast=contrast.matrix[,1]) # do gene set direction analysis c2.indices2 <- loadmodules('wgcna',keepsymbols = TRUE) source('~/Desktop/scripts/calculategenesetmeanfoldchangeV2.R') fryresults <- calculategenesetmeanfoldchange(top2,fryresults,c2.indices2) # calculate q values qs <- qvalue::qvalue(p = fryresults$PValue) fryresults$q.Val <- qs$qvalues ### select modules for plotting masterresults1 <- fryresults masterresults1 <- masterresults1[order(masterresults1[,3]),] # change name of long module #masterresults1$genesetID[masterresults1$genesetID=='S220 MAPK activation, toll like receptor (TLR) signalling'] <- 'S220 MAPK activation' View(masterresults1) View(meta) source('~/.active-rstudio-document') View(meta) source('~/.active-rstudio-document') d<-data.frame(a='aaaa') View(d) source('~/.active-rstudio-document') View(d) source('~/Dropbox/evalV4_super_temp.R') View(d) library(MLeval) fit1 im <- twoClassSim(2000, intercept = -25, linearVars = 20) table(im$Class) fitControl <- trainControl( method = "cv", summaryFunction=twoClassSummary, classProbs=T, savePredictions = T, verboseIter = F) im_fit <- train(Class ~ ., data = im, method = "ranger", metric = "ROC", trControl = fitControl) library(caret) im <- twoClassSim(2000, intercept = -25, linearVars = 20) table(im$Class) fitControl <- trainControl( method = "cv", summaryFunction=twoClassSummary, classProbs=T, savePredictions = T, verboseIter = F) im_fit <- train(Class ~ ., data = im, method = "ranger", metric = "ROC", trControl = fitControl) ?trainControl ?train fitControl <- trainControl( method = "cv", summaryFunction=prSummary, classProbs=T, savePredictions = T, verboseIter = F) im_fit <- train(Class ~ ., data = im, method = "ranger", metric = "AUC", trControl = fitControl) im_fit x <- evalm(im_fit,rlinethick=0.8,fsize=8,plots=c()) dev.off() x <- evalm(im_fit,rlinethick=0.8,fsize=8,plots=c()) library(MLeval) x <- evalm(im_fit,rlinethick=0.8,fsize=8,plots=c()) x$optres setwd("~/M3C") devtools::check() setwd("~/M3C") devtools::install() devtools::check(document = FALSE)