inst/scripts/import/jaspar2014/import.R
b599b5fb
 # jaspar2014/import.R
612c2460
 #-----------------------------------------------------------------------------------
a1478bee
 library (org.Hs.eg.db)
 library (org.Mm.eg.db)
 #------------------------------------------------------------------------------------------------------------------------
612c2460
 options (stringsAsFactors=FALSE)
a1478bee
 printf <- function(...) print(noquote(sprintf(...)))
 #------------------------------------------------------------------------------------------------------------------------
 run = function (dataDir)
 {
612c2460
   dataDir <- file.path(dataDir,"jaspar2014")
   rawMatrixList <- readRawMatrices (dataDir)
   matrices <- extractMatrices (rawMatrixList)
   tbl.anno <- createAnnotationTable(dataDir)
   tbl.md <- createMetadataTable (tbl.anno, matrices)
   matrices <- normalizeMatrices (matrices)
   matrices <- renameMatrices (matrices, tbl.md)
   serializedFile <- file.path(dataDir, "jaspar2014.RData")
a1478bee
   save (matrices, tbl.md, file=serializedFile)
   printf("saved %d matrices to %s", length(matrices), serializedFile)
612c2460
   printf("next step:  copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)
a1478bee
 
 } # run
 #------------------------------------------------------------------------------------------------------------------------
 readRawMatrices = function (dataDir)
 {
612c2460
     # our convention is that there is a shared "dataDir" visible to
     # the importer, and that within that directory there is one
     # subdirectory for each data source.
     # for this example importer, that directory will be <dataDir>/test
     # within which we will look for one small file "sample.pcm"
     
 
   filename <- file.path(dataDir, "matrix_data.txt")
a51f6d0f
     stopifnot(file.exists(filename))
a1478bee
   
612c2460
   all.lines = scan (filename, what=character(0), sep='\n', quiet=TRUE)
   title.lines = grep ('^>', all.lines)
   title.line.count <<- length (title.lines)
   max = title.line.count - 1
a1478bee
 
612c2460
   pwms = list ()
   
   for (i in 1:max) {
     start.line = title.lines [i]
     end.line = title.lines [i+1] - 1
     new.pwm = parsePwm (all.lines [start.line:end.line])
     pwms = c (pwms, list (new.pwm))
     } # for i
   invisible (pwms)
a1478bee
 } # readRawMatrices
 #------------------------------------------------------------------------------------------------------------------------
612c2460
 extractMatrices = function (pwm.list)
a1478bee
 {
612c2460
   matrices = sapply (pwm.list, function (element) element$matrix)
   matrix.names <- sapply (pwm.list, function (element) element$title)
   matrix.names <- sub("^>", "", matrix.names)
   names (matrices) <- matrix.names
   
   return (matrices)
 } # extractMatrices
a1478bee
 #------------------------------------------------------------------------------------------------------------------------
612c2460
 createAnnotationTable <- function(dataDir)
a1478bee
 {
565b437c
     file <- file.path(dataDir, "MATRIX.txt")
612c2460
     tbl.matrix <-  read.table(file, sep='\t', header=FALSE, as.is=TRUE)
     colnames(tbl.matrix) <- c('id', 'category', 'mID', 'version', 'binder')
       
     file <- file.path(dataDir, "MATRIX_PROTEIN.txt")
     stopifnot(file.exists(file))
     tbl.protein <- read.table(file, sep='\t', header=FALSE, as.is=TRUE)
     colnames(tbl.protein) <- c('id', 'proteinID')
 
     file <- file.path(dataDir, "MATRIX_SPECIES.txt")
     stopifnot(file.exists(file))
     tbl.species <- read.table(file, sep='\t', header=FALSE, as.is=TRUE)
     colnames(tbl.species) <- c('id', 'speciesID')
     
     file <- file.path(dataDir, "MATRIX_ANNOTATION.txt")
     stopifnot(file.exists(file))
     tbl.anno <- read.table(file, sep='\t', header=FALSE, as.is=TRUE, quote="")
     colnames(tbl.anno) <- c('id', 'attribute', 'value')
 
     file <- file.path(dataDir, "MATRIX_ANNOTATION.txt")
     tbl.family  <- subset(tbl.anno, attribute=='family') [, -2];   
     colnames(tbl.family) <- c('id', 'family')
        # create 5 2-column data.frames out of tbl.anno
        # which can all then be merged on the "id" column
     tbl.tax <- subset(tbl.anno, attribute=='tax_group') [,-2]; 
     colnames(tbl.tax) <- c('id', 'tax')
 
     tbl.class <- subset(tbl.anno, attribute=='class') [,-2];     
     colnames(tbl.class) <- c('id', 'class')
 
     tbl.comment <- subset(tbl.anno, attribute=='comment')[,-2];    
     colnames(tbl.comment) <- c('id', 'comment')
 
     tbl.pubmed  <- subset(tbl.anno, attribute=='medline')[,-2];    
     colnames(tbl.pubmed) <- c('id', 'pubmed')
 
     tbl.type    <- subset(tbl.anno, attribute=='type')[,-2];       
     colnames(tbl.type) <- c('id', 'type')
 
     tbl.md <- merge(tbl.matrix, tbl.species, all.x=TRUE)
     tbl.md <- merge(tbl.md, tbl.protein, all.x=TRUE)
     tbl.md <- merge(tbl.md, tbl.family, all.x=TRUE)
     tbl.md <- merge(tbl.md, tbl.tax, all.x=TRUE)
     tbl.md <- merge(tbl.md, tbl.class, all.x=TRUE)
     tbl.md <- merge(tbl.md, tbl.pubmed, all.x=TRUE)
     tbl.md <- merge(tbl.md, tbl.type, all.x=TRUE)
 
     fullID <- paste(tbl.md$mID, tbl.md$version, sep='.')
     tbl.md <- cbind(fullID, tbl.md, stringsAsFactors=FALSE)
 
     invisible(tbl.md)
     
a1478bee
 } # createAnnotationTable
612c2460
 #-------------------------------------------------------------------------------
a1478bee
 createMetadataTable = function (tbl.anno, matrices)
 {
   options (stringsAsFactors=FALSE)
   tbl.md = data.frame ()
   matrix.ids = names (matrices) 
612c2460
   dataSource = 'JASPAR_2014'
a1478bee
   
   for (matrix.id in matrix.ids) {
     matrix = matrices [[matrix.id]]
612c2460
     tbl.sub = subset (tbl.anno, fullID==substr(matrix.id,0,8))
a1478bee
     if (nrow (tbl.sub) > 1) {  
         # the binder is a dimer, perhaps a homodimer, and two proteinIDs are provided. Arnt::Ahr
         # some others, a sampling:  P05412;P01100, P08047, P22814;Q24040, EAW80806;EAW53510
       dimer = paste (unique (tbl.sub$proteinID), collapse=';')
       tbl.sub [1, 'proteinID'] = dimer
       }
     anno = as.list (tbl.sub [1,])
     taxon.code = anno$speciesID
     geneId.info = assignGeneId (anno$proteinID)
     new.row = list (providerName=anno$binder,
                     providerId=anno$fullID,
                     dataSource=dataSource,
                     geneSymbol=anno$binder,
                     geneId=geneId.info$geneId,
                     geneIdType=geneId.info$type,
                     proteinId=anno$proteinID,
                     proteinIdType=guessProteinIdentifierType (anno$proteinID),
                     organism=convertTaxonCode(anno$speciesID),
                     sequenceCount=as.integer (mean (colSums (matrix))),
                     bindingSequence=NA_character_,
                     bindingDomain=anno$class,
                     tfFamily=anno$family,
                     experimentType=anno$type,
                     pubmedID=anno$pubmed)
     tbl.md = rbind (tbl.md, data.frame (new.row, stringsAsFactors=FALSE))
     full.name = sprintf ('%s-%s-%s-%s', convertTaxonCode(anno$speciesID), dataSource, anno$binder, anno$fullID)
     rownames (tbl.md) [nrow (tbl.md)] = full.name
     } # for i
 
       # Mmusculus-JASPAR_CORE-NF-kappaB-MA0061.1 has 'NA' for proteinID, not <NA>
     NA.string.indices = grep ('NA', tbl.md$proteinId)
     if (length (NA.string.indices) > 0)
       tbl.md$proteinId [NA.string.indices] = NA
    invisible (tbl.md)
612c2460
 }#createMetadataTable
a1478bee
 #------------------------------------------------------------------------------------------------------------------------
 renameMatrices = function (matrices, tbl.md)
 {
   stopifnot (length (matrices) == nrow (tbl.md))
   names (matrices) = rownames (tbl.md)
   invisible (matrices)
612c2460
   
a1478bee
 } # renameMatrices
 #------------------------------------------------------------------------------------------------------------------------
612c2460
 # an empirical and not altogether trustworthy solution to identifying identifier types.
 guessProteinIdentifierType = function (moleculeName)
 {
   if (nchar (moleculeName) == 0)
     return (NA_character_)
   if (is.na (moleculeName))
     return (NA_character_) 
 
   first.char = substr (moleculeName, 1, 1)
 
   if (first.char == 'Y')
     return ('SGD')
 
   if (first.char %in% c ('P', 'Q', 'O', 'A', 'C'))
     return ('UNIPROT')
 
   if (length (grep ('^NP_', moleculeName)) == 1)
     return ('REFSEQ')
 
    return (NA_character_)
 
 } # guessProteinIdentifierType
 #------------------------------------------------------------------------------------------------------------------------
 normalizeMatrices = function (matrices)
 {
   mtx.normalized = sapply (matrices,
       function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector)))
 
   invisible (mtx.normalized)
 
 } # normalizeMatrices
 #------------------------------------------------------------------------------------------------------------------------
 parsePwm = function (text)
 {
    lines = strsplit (text, '\t')
    #browser()
    stopifnot(length(lines)==5) # title line, one line for each base
    title = lines [[1]][1]
    line.count = length(lines)
 
      # expect 4 rows, and a number of columns we can discern from
      # the incoming text.
   cols <- length(lines[[2]])
   result <- matrix (nrow=4, ncol=cols,
                    dimnames=list(c('A','C','G','T'),
                                  as.character(1:cols)))
   row = 1
   for(i in 2:line.count){
     result [row,] = as.numeric (lines[[i]])
     row = row + 1
     } # for i
 
   #result = t (result)
     
   #return (list (title=title, consensus.sequence=consensus.sequence, matrix=result))
   return (list (title=title, matrix=result))
 
 } # parsePwm
 #----------------------------------------------------------------------------------------------------
 #------------------------------------------------------------------------------------------------------------------------
a1478bee
 # full names:   ('Mus musculus', 'Rattus norvegicus', 'Rattus rattus', 'Arabidopsis thaliana', 'Pisum sativum', 
 #                'Nicotiana sylvestris', 'Petunia hybrida', 'Antirrhinum majus', 'Hordeum vulgare', 'Triticum aestivam',
 #                'Zea mays', 'Saccharomyces cerevisiae', 'Caenorhabditis elegans', 'Drosophila melanogaster',
 #                'Halocynthia roretzi', 'Vertebrata', 'Onchorhynchus mykiss', 'Xenopus laevis', 'Xenopus tropicalis', 
 #                'Gallus gallus', 'Homo sapiens', 'Bos taurus', 'Oryctolagus cuniculus'),
 convertTaxonCode = function (ncbi.code)
 {
   #if (is.na (ncbi.code))  
   #  return (NA_character_)
   ncbi.code = as.character (ncbi.code)
   if (ncbi.code %in% c ('-', NA_character_, 'NA'))
     return ('Vertebrata')
 
   tbl = data.frame (code= c('10090', '10116', '10117', '3702', '3888', '4094', '4102',
                             '4151', '4513', '4565', '4577', '4932', '6239', '7227', '7729',
                             '7742', '8022', '8355', '8364', '9031', '9606', '9913', '9986'),
                     name=c ('Mmusculus', 'Rnorvegicus', 'Rrattus', 'Athaliana', 'Psativum', 
                             'Nsylvestris', 'Phybrida', 'Amajus', 'Hvulgare', 'Taestivam',
                             'Zmays', 'Scerevisiae', 'Celegans', 'Dmelanogaster',
                             'Hroretzi', 'Vertebrata', 'Omykiss', 'Xlaevis', 'Xtropicalis', 
                             'Gallus', 'Hsapiens', 'Btaurus', 'Ocuniculus'),
                     stringsAsFactors=FALSE)
 
   ncbi.code = as.character (ncbi.code)
   index = which (tbl$code == ncbi.code)
   if (length (index) == 1)
     return (tbl$name [index])
612c2460
   if (nchar(ncbi.code)>6)
     return ('Vertebrata')
a1478bee
   else {
612c2460
     browser()
     write (sprintf ("unable to map organism code |%s|", ncbi.code), stderr ())
a1478bee
     return (NA_character_)
     }
 
 } # convertTaxonCode
612c2460
 #----------------------------------------------------------------------------------------------------
a1478bee
 assignGeneId = function (proteinId)
 {
   if (!exists ('id.uniprot.human')) {
 
     tbl = toTable (org.Hs.egUNIPROT)
     id.uniprot.human <<- as.character (tbl$gene_id)
     names (id.uniprot.human) <<- tbl$uniprot_id
 
     tbl = toTable (org.Hs.egREFSEQ)
     tbl = tbl [grep ('^NP_', tbl$accession),]
     id.refseq.human <<- as.character (tbl$gene_id)
     names (id.refseq.human) <<- tbl$accession
 
     tbl = toTable (org.Mm.egUNIPROT)
     id.uniprot.mouse <<- as.character (tbl$gene_id)
     names (id.uniprot.mouse) <<- tbl$uniprot_id
 
     tbl = toTable (org.Mm.egREFSEQ)
     tbl = tbl [grep ('^NP_', tbl$accession),]
     id.refseq.mouse <<- as.character (tbl$gene_id)
     names (id.refseq.mouse) <<- tbl$accession
     }
 
   proteinId = strsplit (proteinId, '\\.')[[1]][1]   # remove any trailing '.*'
 
   if (proteinId %in% names (id.uniprot.human))
     return (list (geneId=as.character (id.uniprot.human [proteinId]), type='ENTREZ'))
 
   if (proteinId %in% names (id.uniprot.mouse))
     return (list (geneId=as.character (id.uniprot.mouse [proteinId]), type='ENTREZ'))
 
   if (proteinId %in% names (id.refseq.human))
     return (list (geneId=as.character (id.refseq.human [proteinId]), type='ENTREZ'))
 
   if (proteinId %in% names (id.refseq.mouse))
     return (list (geneId=as.character (id.refseq.mouse [proteinId]), type='ENTREZ'))
 
   found.leading.Y = length (grep ("^Y", proteinId, perl=TRUE))
 
   if (found.leading.Y == 1)
     return (list (geneId=proteinId, type='SGD'))
 
   return (list (geneId=NA_character_, type=NA_character_))
 
 } # assignGeneId
612c2460
 #-----------------------------------------------------------------------------