Browse code

All Files in new Folder

Bhattacharya authored on 20/08/2021 20:18:52
Showing1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,770 @@
1
+#' Combining the RNAseq reads of family members in a 
2
+#' single file.
3
+#'
4
+#' @param RNASeqDir  character. Directory containing RNAseq reads.
5
+#' @param returnMethod  character. Method of returning Data.
6
+#' @param outpath  character. Contains file path if Method of return is chosen as 
7
+#' Text.
8
+#' @param outFileName  character. Output file name. 
9
+#' @return Text or Dataframe containing TPM read counts of genes in the family.
10
+#' @examples
11
+#' RNASeqDir = system.file("extdata", package="nanotatoR")
12
+#' returnMethod="dataFrame"
13
+#' datRNASeq <- RNAseqcombine_solo(RNASeqDir = RNASeqDir,
14
+#' returnMethod = returnMethod)
15
+#' @importFrom AnnotationDbi mapIds
16
+#' @importFrom stats na.omit 
17
+#' @import org.Hs.eg.db
18
+#' @export
19
+RNAseqcombine_solo<-function(RNASeqDir,returnMethod=c("Text","dataFrame"),
20
+                        outpath="",outFileName=""){ 
21
+    #library()
22
+    #setwd(RNASeqDir)
23
+    l <- list.files(path = RNASeqDir, 
24
+       pattern="*.genes.results", full.names = TRUE)
25
+    len<-length(l)
26
+    #dat<-listDatasets(ensembl)
27
+    #g1<-grep("sscrofa",listDatasets(ensembl)$dataset)
28
+    'grch37 = useMart(="ENSEMBL_MART_ENSEMBL", host="www.ensembl.org", 
29
+                    dataset="hsapiens_gene_ensembl")'
30
+    gen<-c(); 
31
+    ##Need to make this function dynamic
32
+    dat<-data.frame(matrix(ncol=len,nrow=nrow(r<-read.table(l[1],sep="\t",header=TRUE))))
33
+    cnam<-c()
34
+    for (ii in 1:length(l)){
35
+        r<-read.table(l[ii],sep="\t",header=TRUE)
36
+        gen<-c(gen,as.character(r$gene_id))
37
+        dat[,ii]<-as.numeric(r$TPM)
38
+        str<-strsplit(l[ii],split=".genes.results")
39
+        #print(str[[1]][1])
40
+        cnam<-c(cnam,str[[1]][1])
41
+    }
42
+    #datf<-data.frame(dat)
43
+    gen<-unique(as.character(gen))
44
+    st<-strsplit(gen,split="[.]")
45
+    genes<-c()
46
+    for(k in 1:length(gen)){
47
+        genes<-c(genes,as.character(st[[k]][1]))
48
+    }
49
+    data1 <- data.frame(dat[,1])
50
+    names(data1) <- cnam
51
+    genesym <- c()
52
+    ensemblid <- c()
53
+    'gene1 = getBM(attributes = c("external_gene_name", "ensembl_gene_id"), 
54
+                filters = "ensembl_gene_id", values = genes, mart = grch37)'
55
+    gn1 <- mapIds(org.Hs.eg.db, genes, "SYMBOL", "ENSEMBL")
56
+    'rn <- row.names(data.frame(gn1))
57
+    rn1 <- row.names(data.frame(gn2))'
58
+    gene1<-data.frame(
59
+        ensembl_gene_id = as.character(names((gn1))),
60
+        external_gene_name = as.character(data.frame(gn1)[,1])
61
+        )
62
+    genesym<-as.character(gene1$external_gene_name)
63
+    ensemblid<-as.character(gene1$ensembl_gene_id)
64
+    gene3<-c()
65
+    ens<-c()
66
+    ensemblid<-paste("^",ensemblid,"$",sep="")
67
+    for(kk in 1:length(genes)){
68
+        pag<-paste("^",genes[kk],"$",sep="")
69
+        val<-grep(pag,ensemblid,fixed=TRUE)
70
+        if(length(val)>0){
71
+            gene3<-c(gene3,as.character(unique(genesym[val])))
72
+            ens<-c(ens,as.character(genes[kk]))
73
+        }
74
+        else{
75
+            gene3<-c(gene3,as.character("-"))
76
+            ens<-c(ens,as.character(genes[kk]))
77
+        }
78
+    }
79
+    RNASeqDat<-data.frame(GeneName = as.character(gene3),
80
+            GeneID = as.character(ens),data1)
81
+    if(returnMethod=="Text"){
82
+        fname = file.path(outFileName,".csv",sep = "")
83
+        write.csv(RNASeqDat,file.path(outpath,fname), row.names = FALSE )
84
+    }
85
+    else if (returnMethod=="dataFrame"){
86
+        return (RNASeqDat)
87
+    }
88
+    else{
89
+        stop("Invalid ReturnMethod")
90
+    }
91
+}
92
+#' Annotating the Overlapping genes with RNAseq expression
93
+#'
94
+#' @param gnsOverlap  character. Vector containing overlapping genes.
95
+#' @param SVID  character. SV Index ID.
96
+#' @param RNASeqData  dataFrame. RNAseq data with gene names. 
97
+#' @param pattern_Proband  character. Pattern for proband. 
98
+#' @return Dataframe containing TPM read counts of overlapping genes.
99
+#' @examples
100
+#' RNASeqDir = system.file("extdata", package="nanotatoR")
101
+#' returnMethod="dataFrame"
102
+#' datRNASeq <- RNAseqcombine_solo(RNASeqDir = RNASeqDir,
103
+#' returnMethod = returnMethod)
104
+#' gnsOverlap <- c("AGL")
105
+#' SVID = 397
106
+#' datgnovrlap <- OverlapRNAseq_solo(gnsOverlap = gnsOverlap, 
107
+#' SVID = SVID, RNASeqData = datRNASeq,
108
+#' pattern_Proband = "*_P_*")
109
+#' @importFrom stats na.omit  
110
+#' @export
111
+OverlapRNAseq_solo<-function(gnsOverlap, SVID, RNASeqData,
112
+                pattern_Proband = NA){
113
+  
114
+  ###Finding the column names
115
+   #print(gnsOverlap);print(length(gnsOverlap))
116
+  'if(is.na(pattern_Father)==FALSE){
117
+    fatherInd<-grep(pattern_Father,names(RNASeqData))
118
+    } else{fatherInd<- NA}
119
+    if(is.na(pattern_Mother)==FALSE){
120
+    motherInd<-grep(pattern_Mother,names(RNASeqData))
121
+    } else{motherInd<- NA}'
122
+    if(is.na(pattern_Proband)==FALSE){
123
+        probandInd <- grep(pattern_Proband,names(RNASeqData))
124
+    } else{probandInd<- NA}
125
+    'if(is.na(pattern_Sibling)==FALSE){
126
+      siblingInd<-grep(pattern_Sibling,names(RNASeqData))
127
+    } 
128
+    else{
129
+      siblingInd<-NA
130
+    }'
131
+    sv<-c()
132
+    gene<-c()
133
+    gnsname<-as.character(RNASeqData$GeneName)
134
+    pasgnsname<-pasgnovlap<-paste("^",gnsname,"$",sep="")
135
+    'overlap_ensemblgenes = select(EnsDb.Hsapiens.v79, gnsOverlap, 
136
+                c("GENEID","GENENAME"), "SYMBOL")
137
+    gnsOverlapID<-as.character(overlap_ensemblgenes$GENEID)'
138
+    #print(paste("gnsOverlap:",gnsOverlap))
139
+    #print(paste("overlap_ensemblgenes:",overlap_ensemblgenes))
140
+    #print(paste("gnsOverlapID:",gnsOverlapID))
141
+    #genes<-as.character(overlap_ensemblgenes$SYMBOL)
142
+    ###Extracting Reads
143
+    ###
144
+    ###Genes Names Extraction
145
+    #print(paste("fatherInd :",fatherInd))
146
+    #print(paste("motherInd :",motherInd))
147
+    #print(paste("probandInd :",probandInd))
148
+    #print(paste("siblingInd :",siblingInd))
149
+    gnsOverlapID <- as.character(gnsOverlap)
150
+    #print(gnsOverlapID)
151
+    if(length(gnsOverlapID)>1){
152
+    
153
+        datGeneInfoTemp<-data.frame()
154
+        fatherReads<-c()
155
+        motherReads<-c()
156
+        probandReads<-c()
157
+        siblingReads<-c()
158
+        for (ki in 1:length(gnsOverlapID)){
159
+            pasgnovlap <- paste("^", gnsOverlapID[ki],"$", sep = "")
160
+            #print(ki)
161
+            gg<-grep(pasgnovlap, pasgnsname, fixed = TRUE)
162
+            dat_temp<-RNASeqData[gg, ]
163
+        
164
+            if(nrow(dat_temp)>1){
165
+                #print("FALSE")
166
+                #print(ki)
167
+                dat_temp_1 <- mean(dat_temp[,probandInd])
168
+                #dat_temp_1<-data.frame(dat_temp_1)
169
+                dat_temp1 <- dat_temp[1,]
170
+                dat_temp1[,probandInd] <- dat_temp_1
171
+                #print(dim(dat_temp1))
172
+            
173
+                if(is.na(probandInd[1])==FALSE){
174
+                    if(length(probandInd)>1){
175
+                        for(j in probandInd){
176
+                            probandcount<-c(probandcount,dat_temp1[,j])
177
+                        }
178
+                        probandReads<-c(
179
+                            probandReads,paste(probandcount,collapse = ":"))
180
+                    }
181
+                    else if(length(probandInd)==1){
182
+                        probandReads<-c(probandReads,dat_temp1[,probandInd])
183
+                    }
184
+                    else{
185
+                        probandReads<-c(probandReads,0)
186
+                    }
187
+                }
188
+                else{
189
+                    probandReads<-c(probandReads,"-")
190
+                }
191
+            }
192
+            else if (nrow(dat_temp)==1) {
193
+                #print("TRUE")
194
+                #print(dim(dat_temp1))
195
+                probandcount<-c()
196
+                if(is.na(probandInd [1])==FALSE){ 
197
+                    if(length(probandInd)>1){
198
+                        for(j in probandInd){
199
+                            probandcount<-c(probandcount,dat_temp[,j])
200
+                        }
201
+                        probandReads<-c(
202
+                        probandReads, paste(probandcount,collapse = ":"))
203
+                    }
204
+                    else if(length(probandInd)==1){
205
+                        probandReads<-c(
206
+                            probandReads, mean(dat_temp[,probandInd]))
207
+                    }
208
+                    else{
209
+                        probandReads<-c(probandReads,0)
210
+                    }
211
+                }
212
+                else{
213
+                    probandReads<-c(probandReads, "-")
214
+                }
215
+            }
216
+            else{
217
+                probandReads<-c(probandReads,"-")
218
+            }   
219
+            gene<-c(gene,as.character(gnsOverlapID[ki]))
220
+        }
221
+        if(is.na(probandInd[1])==FALSE){
222
+            ProbandGenes<-c()
223
+            for(ii in 1:length(gene)){
224
+                pasgene<-paste(gene[ii],"(", probandReads[ii],")", sep = "")
225
+                ProbandGenes<-c(ProbandGenes,pasgene)
226
+            }
227
+            ProbandTPM<-paste(ProbandGenes,collapse=";")
228
+            } else{
229
+                ProbandTPM <- "-"
230
+            }
231
+        datGeneInfo<-data.frame(SVID=SVID, ProbandTPM=ProbandTPM)     
232
+                
233
+    }
234
+    else if(length(gnsOverlapID)==1){
235
+        pasgnovlap<-paste("^",as.character(gnsOverlapID),"$",sep="")
236
+        #print(ki)
237
+        gg<-grep(pasgnovlap,pasgnsname,fixed=TRUE)
238
+      
239
+        dat_temp<-RNASeqData[gg,]
240
+        if(nrow(dat_temp)>1){
241
+            dat_temp_1 <- mean(dat_temp[,probandInd])
242
+            #dat_temp_1<-data.frame(dat_temp_1)
243
+            dat_temp1<-dat_temp[1,]
244
+            dat_temp1[,probandInd]<-dat_temp_1
245
+            #dat_temp1<-cbind(dat_temp[1,1:3],dat_temp1)
246
+            #print(dim(dat_temp1))
247
+            probandcount<-c()
248
+            if(is.na(probandInd[1])==FALSE){ 
249
+                if(length(probandInd)>1){
250
+                    for(j in probandInd){
251
+                        probandcount <- c(probandcount,mean(dat_temp1[,j]))
252
+                    }
253
+                probandReads <- paste(probandcount,collapse = ":")
254
+                }
255
+                else if(length(probandInd)==1){
256
+                    probandReads<-mean(dat_temp1[,probandInd])
257
+                }
258
+                else{
259
+                    probandReads <- 0
260
+                }
261
+            } 
262
+            else{
263
+                probandReads <- "-"
264
+            }
265
+          
266
+          
267
+        #motherReads<-dat_temp1[,motherInd]
268
+        #probandReads<-dat_temp1[,probandInd]
269
+        #if(is.na(siblingInd[1])==TRUE){
270
+        #siblingReads<-"-"
271
+        #}
272
+        #else{
273
+        #siblingReads<-dat_temp1[,siblingInd]
274
+        #}
275
+        }
276
+        else if (nrow(dat_temp)==1){
277
+       
278
+            #print(dim(dat_temp1))
279
+            if(is.na(probandInd[1])==FALSE){ 
280
+                if(length(probandInd)>1){
281
+                    for(j in probandInd){
282
+                        probandcount <- c(probandcount,mean(dat_temp[,j]))
283
+                    }
284
+                    probandReads<-paste(probandcount,collapse=":")
285
+                }
286
+                else if(length(probandInd)==1){
287
+                    probandReads<-mean(dat_temp[,probandInd])
288
+                }
289
+                else{
290
+                    probandReads<-0
291
+                }
292
+            } else{
293
+                probandReads<-"-"
294
+            }
295
+        }
296
+        else{
297
+            probandReads<-"-"
298
+        }
299
+        #gene<-overlap_ensemblgenes$SYMBOL
300
+        if(is.na(probandInd[1])==FALSE){
301
+            gene<- as.character(gnsOverlapID)
302
+        
303
+            ProbandGenes<-paste(gene,"(",probandReads,")",sep="")
304
+            #ProbandGenes<-c(ProbandGenes,pasgene)
305
+            ProbandTPM<-as.character(ProbandGenes)
306
+        }else{
307
+            ProbandTPM <- "-"
308
+        }
309
+        datGeneInfo<-data.frame(SVID = SVID,ProbandTPM = ProbandTPM)
310
+            
311
+    }
312
+    else{
313
+        datGeneInfo <- data.frame(SVID = SVID,ProbandTPM = "-")
314
+    }
315
+  
316
+  #print(warnings())
317
+  
318
+    return(datGeneInfo)
319
+}
320
+#' Annotating the Non-Overlapping genes with RNAseq expression
321
+#'
322
+#' @param gnsNonOverlap  character. Vector containing non-overlapping genes.
323
+#' @param SVID  character. SV Index ID.
324
+#' @param RNASeqData  dataFrame. RNAseq data with gene names. 
325
+#' @param pattern_Proband  character. Pattern for proband. 
326
+#' @return Dataframe containing TPM read counts of overlapping genes.
327
+#' @examples
328
+#' RNASeqDir = system.file("extdata", package="nanotatoR")
329
+#' returnMethod="dataFrame"
330
+#' datRNASeq <- RNAseqcombine_solo(RNASeqDir = RNASeqDir,
331
+#' returnMethod = returnMethod)
332
+#' gnsNonOverlap <- c("DDX11L1", "MIR1302-2HG", "OR4G4P")
333
+#' SVID = 397
334
+#' datgnnonovrlap <- nonOverlapRNAseq_solo(gnsNonOverlap = gnsNonOverlap, 
335
+#' SVID = SVID, RNASeqData = datRNASeq,
336
+#' pattern_Proband = "*_P_*")
337
+#' @importFrom stats na.omit  
338
+#' @export
339
+
340
+nonOverlapRNAseq_solo<-function(gnsNonOverlap,SVID,RNASeqData,
341
+                pattern_Proband = NA){
342
+  ## annotation
343
+  ###Checking if the input is empty; else if not empty add 
344
+  ###expression values for each genes
345
+    datGeneInfo<-data.frame()
346
+    SVID=SVID
347
+    ###Extracting the index for the the parents
348
+    'if(is.na(pattern_Father)==FALSE){
349
+    fatherInd<-grep(pattern_Father,names(RNASeqData))
350
+    } else{fatherInd <- NA}
351
+    if(is.na(pattern_Mother)==FALSE){
352
+    motherInd<-grep(pattern_Mother,names(RNASeqData))
353
+    } else{motherInd <- NA}'
354
+    if(is.na(pattern_Proband)==FALSE){
355
+         probandInd<-grep(pattern_Proband,names(RNASeqData))
356
+    } else{probandInd <- NA}
357
+    ##Checking for sibling
358
+    'if(is.na(pattern_Sibling)==FALSE){
359
+      siblingInd<-grep(pattern_Sibling,names(RNASeqData))
360
+    } 
361
+    else{
362
+      siblingInd<-NA
363
+    }'
364
+    
365
+    gene<-c()
366
+    gnsname<-as.character(RNASeqData$GeneName)
367
+    pasgnsname<-pasgnovlap<-paste("^",as.character(gnsname),"$",sep="")
368
+    'nonoverlap_ensemblgenes = select(EnsDb.Hsapiens.v79, gnsNonOverlap, 
369
+                c("GENEID","GENENAME"), "SYMBOL")
370
+    gnsnonOverlapID<-as.character(nonoverlap_ensemblgenes$GENEID)'    
371
+    ###Extracting Reads
372
+    ###
373
+    ###Genes Names Extraction
374
+    gnsnonOverlapID<- as.character(gnsNonOverlap)
375
+    if(length(gnsnonOverlapID)>1){
376
+        #datGeneInfoTemp<-data.frame()
377
+        probandReads<-c()
378
+      
379
+      
380
+        for (ki in 1:length(gnsnonOverlapID)){
381
+            
382
+            pasgnnonovlap <- paste("^",as.character(
383
+                gnsnonOverlapID[ki]),"$",sep = "")
384
+            gg<-grep(pasgnnonovlap,pasgnsname,fixed=TRUE)
385
+            dat_temp<-RNASeqData[gg,]
386
+            if(nrow(dat_temp)>1){
387
+                dat_temp_1 <- mean(dat_temp[,probandInd])
388
+                #dat_temp_1<-data.frame(dat_temp_1)
389
+                dat_temp1<-dat_temp[1,]
390
+                dat_temp1[,probandInd]<-dat_temp_1
391
+                #print(dim(dat_temp1))
392
+                probandcount<-c()
393
+                if(is.na(probandInd[1]) == FALSE){
394
+                    if(length(probandInd)>1){
395
+                        for(j in probandInd){
396
+                            probandcount <- c(probandcount,dat_temp1[,j])
397
+                        }
398
+                    probandReads<-c(
399
+                        probandReads,paste(probandcount,collapse=":")
400
+                        )
401
+                    }
402
+                    else if(length(probandInd)==1){
403
+                        probandReads<-c(probandReads,dat_temp1[,probandInd])
404
+                    }
405
+                    else{
406
+                        probandReads<-c(probandReads,0)
407
+                    }
408
+                } else{
409
+                    probandReads<-c(probandReads, "-")
410
+                }
411
+            }
412
+            else if (nrow(dat_temp)==1) {
413
+                #print(dim(dat_temp1))
414
+                probandcount<-c()
415
+                if(is.na(probandInd[1]) == FALSE){ 
416
+                    if(length(probandInd)>1){
417
+                        for(j in probandInd){
418
+                            probandcount <- c(probandcount, dat_temp[,j])
419
+                        }
420
+                    probandReads<-c(probandReads,paste(probandcount,collapse=":"))
421
+                }
422
+                else if(length(probandInd)==1){
423
+                    probandReads<-c(probandReads,dat_temp[,probandInd])
424
+                }
425
+                else{
426
+                    probandReads<-c(probandReads,0)
427
+                }
428
+            }
429
+            else{
430
+                probandReads <- c(probandReads,"-")
431
+            }
432
+        }
433
+        else{
434
+            probandReads<-c(probandReads,"-")
435
+        }
436
+        gene<-c(gene,as.character(gnsnonOverlapID[ki]))
437
+    }
438
+        if(is.na(probandInd[1])==FALSE){
439
+            ProbandGenes<-c()
440
+        
441
+            for(ii in 1:length(gene)){
442
+                pasgene<-paste(gene[ii],"(",probandReads[ii],")",sep="")
443
+                ProbandGenes<-c(ProbandGenes,pasgene)
444
+            }
445
+            ProbandTPM<-paste(ProbandGenes,collapse=";")
446
+            } else{
447
+                ProbandTPM<-"-"
448
+            }
449
+        datGeneInfo<-data.frame(SVID=SVID,ProbandTPM=ProbandTPM)
450
+    } 
451
+    else if(length(gnsnonOverlapID)==1){
452
+        pasgnnonovlap<-paste("^",as.character(gnsnonOverlapID),"$",sep="")
453
+        gg<-grep(pasgnnonovlap,pasgnsname,fixed=TRUE)
454
+        #gg<-grep(pasgnnonovlap,gnsname,fixed=TRUE)
455
+        dat_temp<-RNASeqData[gg,]
456
+      
457
+        if(nrow(dat_temp)>1){
458
+            dat_temp_1 <- mean(dat_temp[,probandInd])
459
+            #dat_temp_1<-data.frame(dat_temp_1)
460
+            dat_temp1<-dat_temp[1,]
461
+            dat_temp1[, probandInd]<-dat_temp_1
462
+           #print(dim(dat_temp1))
463
+            probandcount<-c()
464
+            if(is.na(probandInd[1])==FALSE){
465
+                if(length(probandInd)>1){
466
+                    for(j in probandInd){
467
+                        probandcount<-c(probandcount,dat_temp1[,j])
468
+                    }
469
+                probandReads<-paste(probandcount,collapse=":")
470
+            }
471
+            else if(length(probandInd)==1){
472
+                probandReads<-dat_temp1[,probandInd]
473
+            }
474
+            else{
475
+                probandReads<-0
476
+            }
477
+        }
478
+        else{
479
+            probandReads<- "-"
480
+        }
481
+          
482
+          
483
+        }
484
+        else if (nrow(dat_temp)==1){
485
+       
486
+         #print(dim(dat_temp1))
487
+            probandcount<-c()
488
+            if(is.na(probandInd)==FALSE){
489
+                if(length(probandInd)>1){
490
+                    for(j in probandInd){
491
+                        probandcount<-c(probandcount,dat_temp[,j])
492
+                    }
493
+                probandReads <- paste(probandcount,collapse=":")
494
+            }
495
+            else if(length(probandInd)==1){
496
+                probandReads <- dat_temp[,probandInd]
497
+            }
498
+            else{
499
+                probandReads<-0
500
+            }
501
+        }
502
+        else{
503
+            probandReads<- "-"
504
+        }
505
+        }
506
+        else{
507
+            probandReads<-"-"
508
+        }
509
+        gene<-c(gene,as.character(gnsnonOverlapID))
510
+        if(is.na(probandInd[1])==FALSE){
511
+            ProbandGenes <- c()
512
+            ProbandGenes <- paste(gene,"(",probandReads,")",sep="")
513
+            #ProbandGenes<-c(ProbandGenes,pasgene)
514
+        #}
515
+            ProbandTPM<-as.character(ProbandGenes)
516
+        }else{
517
+            ProbandTPM<-"-"
518
+        }
519
+        datGeneInfo <- data.frame(SVID = SVID,ProbandTPM = ProbandTPM)
520
+    }
521
+    else{
522
+        datGeneInfo < -data.frame(SVID = SVID,ProbandTPM = "-")
523
+    
524
+    }
525
+    return(datGeneInfo)
526
+}
527
+#' Annotating the Overlapping and Non-Overlapping genes with RNAseq expression
528
+#'
529
+#' @param input_fmt_SV  character. Input format of the SV data.Options 
530
+#' "Text" or "DataFrame".
531
+#' @param smapdata  dataframe. SV data dataframe.
532
+#' @param smappath  character. smap path.
533
+#' @param input_fmt_RNASeq  character. Input format of  
534
+#' the RNASeq data. Options "Text" or "DataFrame"..
535
+#' @param RNASeqData  dataFrame. RNAseq data with gene names.
536
+#' @param RNASeqPATH  character. RNAseq dataset path . 
537
+#' @param outputfmt  character. Output format of  
538
+#' the result. Options "Text" or "DataFrame"..
539
+#' @param pattern_Proband  character. Pattern for proband. 
540
+#' @param EnzymeType  character. Enzyme used. option "SVMerge" or "SE".
541
+#' @return Dataframe Annotated datafreme with RNASeq data.
542
+#' @examples
543
+#' RNASeqDir = system.file("extdata", package="nanotatoR")
544
+#' returnMethod="dataFrame"
545
+#' datRNASeq <- RNAseqcombine_solo(RNASeqDir = RNASeqDir,
546
+#' returnMethod = returnMethod)
547
+#' smapName="NA12878_DLE1_VAP_solo5.smap"
548
+#' smap = system.file("extdata", smapName, package="nanotatoR")
549
+#' smap = system.file("extdata", smapName, package="nanotatoR")
550
+#' bedFile <- system.file("extdata", "HomoSapienGRCH19_lift37.bed", package="nanotatoR")
551
+#' outpath <- system.file("extdata", package="nanotatoR")
552
+#' datcomp<-overlapnearestgeneSearch(smap = smap, 
553
+#'     bed=bedFile, inputfmtBed = "bed", outpath, 
554
+#'     n = 3, returnMethod_bedcomp = c("dataFrame"), 
555
+#'     input_fmt_SV = "Text",
556
+#'     EnzymeType = "SE", 
557
+#'     bperrorindel = 3000, 
558
+#'     bperrorinvtrans = 50000)
559
+#' datRNASeq1 <- SVexpression_solo (input_fmt_SV=c("dataFrame"),
560
+#'     smapdata = datcomp,
561
+#'     input_fmt_RNASeq=c("dataFrame"),
562
+#'     RNASeqData = datRNASeq,
563
+#'     outputfmt=c("datFrame"),
564
+#'     pattern_Proband = "*_P_*", EnzymeType = c("SE"))
565
+#' datRNASeq1[1,]
566
+#' @importFrom stats na.omit  
567
+#' @export
568
+SVexpression_solo <- function(input_fmt_SV=c("Text","dataFrame"),
569
+        smapdata,smappath, input_fmt_RNASeq=c("Text","dataFrame"),
570
+        RNASeqData,RNASeqPATH,outputfmt=c("Text","datFrame"),
571
+        pattern_Proband = NA, EnzymeType = c("SVMerge", "SE")){
572
+  ###RNASEQ Analysis data
573
+    if(input_fmt_RNASeq=="dataFrame"){
574
+        RNASeqData = RNASeqData
575
+    }
576
+    else if(input_fmt_RNASeq=="Text"){
577
+        RNASeqData=read.csv(RNASeqPATH)
578
+    }
579
+    else{
580
+        stop("Input format for RNASeq Data Incorrect")
581
+    }
582
+  
583
+    if(input_fmt_SV=="dataFrame"){
584
+        smapdata = smapdata
585
+        if(EnzymeType == "SVMerge"){
586
+            #smapdata <- readSMap(smap, input_fmt_smap = "Text")
587
+            SVID<-smapdata$SVIndex
588
+        }
589
+        else{
590
+            #smapdata <- readSMap_DLE(smap, input_fmt_smap)
591
+            SVID<-smapdata$SmapEntryID
592
+        }
593
+    }
594
+    else if(input_fmt_SV=="Text"){
595
+        if(EnzymeType == "SVMerge"){
596
+            smapdata <- readSMap(smappath, input_fmt_smap = "Text")
597
+            SVID<-smapdata$SVIndex
598
+        }
599
+        else{
600
+            smapdata <- readSMap_DLE(smappath, input_fmt_smap = "Text")
601
+            SVID<-smapdata$SmapEntryID
602
+        }
603
+    }
604
+    else{
605
+        stop("Input format for SMAP Incorrect")
606
+    }
607
+    ##Extracting Data
608
+    overlapgenes <- stringr::str_trim(smapdata$OverlapGenes_strand_perc)
609
+  
610
+    
611
+    dataOverLap <- data.frame(matrix(nrow = nrow(smapdata), ncol = 2))
612
+    ##Extracting Overlapped Genes
613
+    #dataOverLap<-data.frame(matrix(nrow=10,ncol=5))
614
+    names(dataOverLap)<-c("SVID", "OverlapProbandTPM")
615
+    print("###OverlapGenes###")
616
+    for(kk in 1:length(overlapgenes)){
617
+        #print(paste("kk:",kk))
618
+        #for(kk in 1:10){
619
+        #print(kk)
620
+        datOverLap<-data.frame()
621
+        #print(paste("kk:",kk,sep=""))
622
+        svID<-as.character(SVID[kk])
623
+        if(length(grep(";",overlapgenes[kk]))>=1){
624
+            st1<-strsplit(as.character(overlapgenes[kk]), split = ";")
625
+            sttemp<-as.character(st1[[1]])
626
+            #print("1")
627
+            gns_overlap<-c()
628
+            for (tt in 1:length(sttemp)){
629
+                gn_temp<-strsplit(sttemp[tt], split = "\\(")
630
+                gns_overlap<-c(gns_overlap,as.character(gn_temp[[1]][1]))
631
+            }
632
+            
633
+                datOverLap<-OverlapRNAseq_solo(
634
+                        gnsOverlap = as.character(gns_overlap),
635
+                        SVID = svID, 
636
+                        RNASeqData = RNASeqData,
637
+                        pattern_Proband = pattern_Proband)
638
+                
639
+        }
640
+        else if (length(grep("\\(", as.character(overlapgenes[kk]))) >= 1){
641
+            #print("2")
642
+            gnsOverlap<-strsplit(as.character(overlapgenes[kk]),split="\\(")[[1]][1]
643
+                datOverLap<-OverlapRNAseq_solo(gnsOverlap = as.character(gnsOverlap),
644
+                    SVID = svID,
645
+                    RNASeqData = RNASeqData,
646
+                    pattern_Proband = pattern_Proband)
647
+            }
648
+            else{
649
+                #print(paste("OverLapDNSVID:",svID))
650
+                datOverLap<-data.frame(
651
+                    SVID = svID, ProbandTPM = "-")
652
+                }
653
+                dataOverLap[kk,]<-c(
654
+                    as.character(datOverLap$SVID),
655
+                    Proband_OverlapGeneExpression_TPM = as.character(datOverLap$ProbandTPM))
656
+        }
657
+    
658
+            ##Extracting NonOverlapped Genes
659
+            nearestUPGenes<-smapdata$Upstream_nonOverlapGenes_dist_kb
660
+            #datanonOverLapUP<-data.frame(matrix(nrow=nrow(smapdata),ncol=5))
661
+            datanonOverLapUP<-data.frame(matrix(nrow = nrow(smapdata),ncol = 2))
662
+            names(datanonOverLapUP) <- c("SVID", "NonOverlapUPProbandTPM")
663
+            print("###NonOverlapUPStreamGenes###") 
664
+            for(ll in 1:length(nearestUPGenes))
665
+            {
666
+                #for(ll in 1:10){
667
+                #print(ll)
668
+                datNonOverLapUP <- data.frame()
669
+                #print(paste("llUP:",ll,sep=""))
670
+                svID<-as.character(SVID[ll])
671
+                if(length(grep(";",nearestUPGenes[ll]))>=1){
672
+                    st1<-strsplit(
673
+                        as.character(nearestUPGenes[ll]),
674
+                        split = ";")
675
+                        sttemp<-as.character(st1[[1]])
676
+                        #print("1")
677
+                        gns_nonoverlap_up<-c()
678
+                    for (mm in 1:length(sttemp)){
679
+                        gn_temp<-strsplit(sttemp[mm],split="\\(")
680
+                        gns_nonoverlap_up<-c(gns_nonoverlap_up, 
681
+                            as.character(gn_temp[[1]][1]))
682
+                    }
683
+                
684
+                    datNonOverLapUP<-nonOverlapRNAseq_solo(
685
+                        gnsNonOverlap = as.character(gns_nonoverlap_up),
686
+                        SVID = svID,
687
+                        RNASeqData = RNASeqData,
688
+                        pattern_Proband = pattern_Proband)
689
+                    
690
+                }
691
+                else if (length(grep(
692
+                "\\(",as.character(nearestUPGenes[ll]))) >=1
693
+                ){
694
+            #print("2")
695
+                        gnsNonOverlapUP<-strsplit(as.character(nearestUPGenes[ll]),split="\\(")[[1]][1]
696
+                        
697
+                        datNonOverLapUP<-nonOverlapRNAseq_solo(
698
+                            gnsNonOverlap = as.character(gnsNonOverlapUP),
699
+                            SVID = svID,RNASeqData = RNASeqData,
700
+                            pattern_Proband=pattern_Proband)
701
+                        
702
+                    }
703
+                else{
704
+                    #print(paste("NonOverLapUPSVID:",svID))
705
+                        datNonOverLapUP<-data.frame(
706
+                            SVID = svID, ProbandTPM="-"
707
+                        )
708
+        }
709
+        datanonOverLapUP[ll,]<-c(
710
+            as.character(datNonOverLapUP$SVID), 
711
+            Proband_Upstream_nonOverlapGeneExpression_TPM = as.character(datNonOverLapUP$ProbandTPM))
712
+                
713
+    }
714
+    
715
+  ##Extracting NonOverlapped Down Stream Genes
716
+    nearestDNGenes<-smapdata$Downstream_nonOverlapGenes_dist_kb
717
+    datanonOverLapDN<-data.frame(matrix(nrow = nrow(smapdata), ncol = 2))
718
+    names(datanonOverLapDN)<-c("SVID","NonOverlapDNProbandTPM")
719
+    print("###NonOverlapDNStreamGenes###") 
720
+    for(nn in 1:length(nearestDNGenes))
721
+    {
722
+        #for(nn in 1:10){
723
+        datNonOverLapDN<-data.frame()
724
+        # print(paste("llDN:",ll,sep=""))
725
+        svID<-as.character(SVID[nn])
726
+        if(length(grep(";",nearestDNGenes[nn]))>=1){
727
+            st1 <- strsplit(as.character(nearestDNGenes[nn]), split = ";")
728
+            sttemp<-as.character(st1[[1]])
729
+            #print("1")
730
+            gns_nonoverlap_dn<-c()
731
+            for (mm in 1:length(sttemp)){
732
+                gn_temp <- strsplit(sttemp[mm],split="\\(")
733
+                gns_nonoverlap_dn <- c(
734
+                        gns_nonoverlap_dn,as.character(gn_temp[[1]][1])
735
+                        )
736
+            }
737
+            datNonOverLapDN<-nonOverlapRNAseq_solo(
738
+                    gnsNonOverlap = as.character(gns_nonoverlap_dn),
739
+                    SVID = svID,RNASeqData = RNASeqData,
740
+                    pattern_Proband = pattern_Proband)
741
+            
742
+        }
743
+        else if (length(grep("\\(",as.character(nearestDNGenes[nn]))) >= 1){
744
+            # print("2")
745
+            gnsNonOverlapDN<-strsplit(as.character(
746
+                nearestDNGenes[nn]),split="\\(")[[1]][1]
747
+                datNonOverLapDN<-nonOverlapRNAseq_solo(
748
+                        gnsNonOverlap = as.character(gnsNonOverlapDN),
749
+                        SVID = svID,
750
+                        RNASeqData = RNASeqData,
751
+                        pattern_Proband = pattern_Proband)
752
+                
753
+        }
754
+        else{
755
+            #print(paste("NonOverLapDNSVID:",svID))
756
+            #print ("SVID")
757
+            datNonOverLapDN<-data.frame(
758
+            SVID = svID,ProbandTPM="-")
759
+        }
760
+            datanonOverLapDN[nn,]<-c(as.character(datNonOverLapDN$SVID),
761
+                Proband_Downstream_nonOverlapGeneExpression_TPM = as.character(datNonOverLapDN$ProbandTPM))
762
+            }
763
+  
764
+            dataFinal<-data.frame(smapdata, 
765
+                OverlapProbandEXP = dataOverLap[,2],
766
+                NonOverlapUPprobandEXP = datanonOverLapUP[,2],
767
+                NonOverlapDNprobandEXP = datanonOverLapDN[,2])
768
+    return(dataFinal)
769
+
770
+}
0 771
\ No newline at end of file