inst/scripts/import/jaspar2014/import.R
a1478bee
 # jaspar/import.R
 # 
 #------------------------------------------------------------------------------------------------------------------------
 library (org.Hs.eg.db)
 library (org.Mm.eg.db)
 #------------------------------------------------------------------------------------------------------------------------
 printf <- function(...) print(noquote(sprintf(...)))
 #------------------------------------------------------------------------------------------------------------------------
 run = function (dataDir)
 {
   dataDir <- file.path(dataDir, "jaspar")
   tbl.rmat = readRawMatrices (dataDir)
   matrices = convertRawMatricesToStandard (tbl.rmat)
   tbl.anno = createAnnotationTable (dataDir)
   tbl.md = createMetadataTable (tbl.anno, matrices)
   matrices = renameMatrices (matrices, tbl.md)
   matrices = normalizeMatrices (matrices)
   serializedFile <- "jaspar.RData"
   save (matrices, tbl.md, file=serializedFile)
   printf("saved %d matrices to %s", length(matrices), serializedFile)
   printf("next step: copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)
 
 } # run
 #------------------------------------------------------------------------------------------------------------------------
 
 readRawMatrices = function (dataDir)
 {
   file <- file.path(dataDir, 'MATRIX_DATA.txt')
   #tbl.matrices = read.table (file, sep='\t', header=FALSE, as.is=TRUE, colClasses=c ('character', 'character', 'numeric', 'numeric'))
   #colnames (tbl.matrices) = c ('id', 'base', 'pos', 'count')
   tbl.matrices = read.table (file, sep='\n', header=FALSE )
   
   invisible (tbl.matrices)
 
 } # readRawMatrices
 #------------------------------------------------------------------------------------------------------------------------
 convertRawMatricesToStandard = function (tbl.rmat)
 {
   #matrix.ids = unique (tbl.rmat$id)
   #result =  vector ('list', length (matrix.ids))
 
   #i = 1
   #for (matrix.id in matrix.ids) {
   #  tbl.sub = subset (tbl.rmat, id == matrix.id)
   #    # sanity check this rather unusual representation of a position count matrix
   #  base.count = as.data.frame (table (tbl.sub$base))
   #  stopifnot (base.count$Var1 == c ('A', 'C', 'G', 'T'))
       # conservative length check.  actually expect sequence lengths of 6 - 20 bases
   #  if  (base.count$Freq [1] < 4 && base.count$Freq [1] > 30) {
   #    printf ('matrix.id %s has sequence of length %d', matrix.id, base.count$Freq [1])
   #    }
   #  stopifnot (all (base.count$Freq == base.count$Freq [1]))
   # nucleotide.counts = tbl.sub$count
   #  row.names = c('A', 'C', 'G', 'T'); 
   #  col.names = 1:(nrow (tbl.sub) / 4)
   #  m = matrix (nucleotide.counts, byrow=TRUE, nrow=4, dimnames=list(row.names, col.names))
   #  result [[i]] = m
   #  i = i + 1
   #  } # for matrix.id
 
   #names (result) = matrix.ids
 
   result<- vector("list",length=593)
   matrices.id <-grep('>',tbl.matrices[[1]],value=TRUE)
   names (result) = matrices.id
   #Get a list of the ID's and then assign them as names 
   m.d <-grep('>',tbl.matrices[[1]],value=TRUE,invert=TRUE)
   data.list=list()
   x = 1
   y = 1
   while (x < length(m.d))
     {   
       matrices.position <- rbind(m.d[x],m.d[x+1],m.d[x+2],m.d[x+3])
       matrices.formatted <- gsub("\t"," ",matrices.position)
       #matrices.formatted2 <- as.numeric(strsplit(matrices.position, "\t"))
       row.names = c('A', 'C', 'G', 'T')
       col.names = 1:(ncol (matrices.formatted))
       #matrices.matrix <- matrix (matrices.formatted, byrow=TRUE, nrow=4,dimnames=list(row.names,))
       result[[y]] <- matrices.formatted
       y=y+1
       x=x+4
     }
   browser()
   return (result)
 
 } # convertRawMatricesToStandard 
 #------------------------------------------------------------------------------------------------------------------------
 # read 'mysql' tables provide by jaspar: 
 #          MATRIX.txt:  9229	CORE	MA0001	1	AGL3
 #  MATRIX_PROTEIN.txt: 9229	P29383
 #  MATRIX_SPECIES.txt: 9229	3702
 #  MATRIX_ANNOTATION.txt: 
 #     9229	class	Other Alpha-Helix
 #     9229	comment	-
 #     9229	family	MADS
 #     9229	medline	7632923
 #     9229	tax_group	plants
 #     9229	type	SELEX
 createAnnotationTable = function (dataDir)
 {
   file <- file.path(dataDir, "MATRIX.txt")
   tbl.matrix =  read.table (file, sep='\t', header=F, as.is=TRUE)
   colnames (tbl.matrix) = c ('id', 'category', 'mID', 'version', 'binder')
 
   file <- file.path(dataDir, "MATRIX_PROTEIN.txt")
   tbl.protein = read.table (file, sep='\t', header=F, as.is=TRUE)
   colnames (tbl.protein) =  c ('id', 'proteinID')
 
   file <- file.path(dataDir, "MATRIX_SPECIES.txt")
   tbl.species = read.table (file, sep='\t', header=F, as.is=TRUE)
   colnames (tbl.species) = c ('id', 'speciesID')
 
   file <- file.path(dataDir, "MATRIX_ANNOTATION.txt")
   tbl.anno = read.table (file, sep='\t', header=F, as.is=TRUE, quote="")
   colnames (tbl.anno) = c ('id', 'attribute', 'value')
 
   tbl.family  = subset (tbl.anno, attribute=='family') [, -2];   
   colnames (tbl.family) = c ('id', 'family')
 
   tbl.tax     = subset (tbl.anno, attribute=='tax_group') [,-2]; 
   colnames (tbl.tax) = c ('id', 'tax')
 
   tbl.class   = subset (tbl.anno, attribute=='class') [,-2];     
   colnames (tbl.class) = c ('id', 'class')
 
   tbl.comment = subset (tbl.anno, attribute=='comment')[,-2];    
   colnames (tbl.comment) = c ('id', 'comment')
 
   tbl.pubmed  = subset (tbl.anno, attribute=='medline')[,-2];    
   colnames (tbl.pubmed) = c ('id', 'pubmed')
 
   tbl.type    = subset (tbl.anno, attribute=='type')[,-2];       
   colnames (tbl.type) = c ('id', 'type')
 
 
   tbl.md = merge (tbl.matrix, tbl.species, all.x=TRUE)
   tbl.md = merge (tbl.md, tbl.protein, all.x=TRUE)
   tbl.md = merge (tbl.md, tbl.family, all.x=TRUE)
   tbl.md = merge (tbl.md, tbl.tax, all.x=TRUE)
   tbl.md = merge (tbl.md, tbl.class, all.x=TRUE)
   tbl.md = merge (tbl.md, tbl.pubmed, all.x=TRUE)
   tbl.md = merge (tbl.md, tbl.type, all.x=TRUE)
 
   fullID = paste (tbl.md$mID, tbl.md$version, sep='.')
   tbl.md = cbind (fullID, tbl.md, stringsAsFactors=FALSE)
 
   invisible (tbl.md)
 
 } # createAnnotationTable
 #------------------------------------------------------------------------------------------------------------------------
 # assemble these columns:
 #                      names=character(),                    # source-species-gene: UniPROBE-Mmusculus-Rhox11-306b; ScerTF-Scerevisiae-ABF2-e73a
 #                      nativeNames=character(),              # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt
 #                      geneSymbols=character(),              # ABF2, Rhox11
 #                      sequenceCounts=integer(),             # often NA
 #                      organisms=character(),                # Scerevisiae, Mmusculus
 #                      bindingMolecules=character(),         # YMR072W, 194738
 #                      bindingMoleculeIdTypes=character(),   # SGD, entrez gene
 #                      bindingDomainTypes=character(),       # NA, Homeo
 #                      dataSources=character(),              # ScerTF, UniPROBE
 #                      experimentTypes=character(),          # NA, protein-binding microarray
 #                      pubmedIDs=character(),                # 19111667, 1858359
 #                      tfFamilies=character())               # NA, NA
 #
 # from these
 #
 createMetadataTable = function (tbl.anno, matrices)
 {
   options (stringsAsFactors=FALSE)
   tbl.md = data.frame ()
   matrix.ids = names (matrices) 
   dataSource = 'JASPAR_CORE'
   
   for (matrix.id in matrix.ids) {
     matrix = matrices [[matrix.id]]
     stopifnot (length (intersect (matrix.id, tbl.anno$id)) == 1)
     tbl.sub = subset (tbl.anno, id==matrix.id)
     if (nrow (tbl.sub) > 1) {  
         # the binder is a dimer, perhaps a homodimer, and two proteinIDs are provided. Arnt::Ahr
         # some others, a sampling:  P05412;P01100, P08047, P22814;Q24040, EAW80806;EAW53510
       dimer = paste (unique (tbl.sub$proteinID), collapse=';')
       tbl.sub [1, 'proteinID'] = dimer
       }
     anno = as.list (tbl.sub [1,])
     taxon.code = anno$speciesID
     geneId.info = assignGeneId (anno$proteinID)
     new.row = list (providerName=anno$binder,
                     providerId=anno$fullID,
                     dataSource=dataSource,
                     geneSymbol=anno$binder,
                     geneId=geneId.info$geneId,
                     geneIdType=geneId.info$type,
                     proteinId=anno$proteinID,
                     proteinIdType=guessProteinIdentifierType (anno$proteinID),
                     organism=convertTaxonCode(anno$speciesID),
                     sequenceCount=as.integer (mean (colSums (matrix))),
                     bindingSequence=NA_character_,
                     bindingDomain=anno$class,
                     tfFamily=anno$family,
                     experimentType=anno$type,
                     pubmedID=anno$pubmed)
     tbl.md = rbind (tbl.md, data.frame (new.row, stringsAsFactors=FALSE))
     full.name = sprintf ('%s-%s-%s-%s', convertTaxonCode(anno$speciesID), dataSource, anno$binder, anno$fullID)
     rownames (tbl.md) [nrow (tbl.md)] = full.name
     } # for i
 
       # Mmusculus-JASPAR_CORE-NF-kappaB-MA0061.1 has 'NA' for proteinID, not <NA>
     NA.string.indices = grep ('NA', tbl.md$proteinId)
     if (length (NA.string.indices) > 0)
       tbl.md$proteinId [NA.string.indices] = NA
      
    invisible (tbl.md)
 
 } # createMetadataTable
 #------------------------------------------------------------------------------------------------------------------------
 renameMatrices = function (matrices, tbl.md)
 {
   stopifnot (length (matrices) == nrow (tbl.md))
   names (matrices) = rownames (tbl.md)
   invisible (matrices)
 
 } # renameMatrices
 #------------------------------------------------------------------------------------------------------------------------
 # full names:   ('Mus musculus', 'Rattus norvegicus', 'Rattus rattus', 'Arabidopsis thaliana', 'Pisum sativum', 
 #                'Nicotiana sylvestris', 'Petunia hybrida', 'Antirrhinum majus', 'Hordeum vulgare', 'Triticum aestivam',
 #                'Zea mays', 'Saccharomyces cerevisiae', 'Caenorhabditis elegans', 'Drosophila melanogaster',
 #                'Halocynthia roretzi', 'Vertebrata', 'Onchorhynchus mykiss', 'Xenopus laevis', 'Xenopus tropicalis', 
 #                'Gallus gallus', 'Homo sapiens', 'Bos taurus', 'Oryctolagus cuniculus'),
 convertTaxonCode = function (ncbi.code)
 {
   #if (is.na (ncbi.code))  
   #  return (NA_character_)
   ncbi.code = as.character (ncbi.code)
   if (ncbi.code %in% c ('-', NA_character_, 'NA'))
     return ('Vertebrata')
 
   tbl = data.frame (code= c('10090', '10116', '10117', '3702', '3888', '4094', '4102',
                             '4151', '4513', '4565', '4577', '4932', '6239', '7227', '7729',
                             '7742', '8022', '8355', '8364', '9031', '9606', '9913', '9986'),
                     name=c ('Mmusculus', 'Rnorvegicus', 'Rrattus', 'Athaliana', 'Psativum', 
                             'Nsylvestris', 'Phybrida', 'Amajus', 'Hvulgare', 'Taestivam',
                             'Zmays', 'Scerevisiae', 'Celegans', 'Dmelanogaster',
                             'Hroretzi', 'Vertebrata', 'Omykiss', 'Xlaevis', 'Xtropicalis', 
                             'Gallus', 'Hsapiens', 'Btaurus', 'Ocuniculus'),
                     stringsAsFactors=FALSE)
 
   ncbi.code = as.character (ncbi.code)
   index = which (tbl$code == ncbi.code)
   if (length (index) == 1)
     return (tbl$name [index])
   else {
     write (sprintf (" unable to map organism code |%s|", ncbi.code), stderr ())
     return (NA_character_)
     }
 
 } # convertTaxonCode
 #------------------------------------------------------------------------------------------------------------------------
 # an empirical and not altogether trustworthy solution to identifying identifier types.
 guessProteinIdentifierType = function (moleculeName)
 {
   if (nchar (moleculeName) == 0)
     return (NA_character_)
   if (is.na (moleculeName))
     return (NA_character_) 
 
   first.char = substr (moleculeName, 1, 1)
 
   if (first.char == 'Y')
     return ('SGD')
 
   if (first.char %in% c ('P', 'Q', 'O', 'A', 'C'))
     return ('UNIPROT')
 
   if (length (grep ('^EAW', moleculeName)) == 1)
     return ('NCBI')
 
   if (length (grep ('^EAX', moleculeName)) == 1)
     return ('NCBI')
 
   if (length (grep ('^NP_', moleculeName)) == 1)
     return ('REFSEQ')
 
   if (length (grep ('^BAD', moleculeName)) == 1)
     return ('EMBL')
 
    return (NA_character_)
 
 } # guessProteinIdentifierType
 #------------------------------------------------------------------------------------------------------------------------
 normalizeMatrices = function (matrices)
 {
   mtx.normalized = sapply (matrices, function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector)))
   invisible (mtx.normalized)
 
 } # normalizeMatrices
 #------------------------------------------------------------------------------------------------------------------------
 assignGeneId = function (proteinId)
 {
   if (!exists ('id.uniprot.human')) {
 
     tbl = toTable (org.Hs.egUNIPROT)
     id.uniprot.human <<- as.character (tbl$gene_id)
     names (id.uniprot.human) <<- tbl$uniprot_id
 
     tbl = toTable (org.Hs.egREFSEQ)
     tbl = tbl [grep ('^NP_', tbl$accession),]
     id.refseq.human <<- as.character (tbl$gene_id)
     names (id.refseq.human) <<- tbl$accession
 
     tbl = toTable (org.Mm.egUNIPROT)
     id.uniprot.mouse <<- as.character (tbl$gene_id)
     names (id.uniprot.mouse) <<- tbl$uniprot_id
 
     tbl = toTable (org.Mm.egREFSEQ)
     tbl = tbl [grep ('^NP_', tbl$accession),]
     id.refseq.mouse <<- as.character (tbl$gene_id)
     names (id.refseq.mouse) <<- tbl$accession
     }
 
   proteinId = strsplit (proteinId, '\\.')[[1]][1]   # remove any trailing '.*'
 
   if (proteinId %in% names (id.uniprot.human))
     return (list (geneId=as.character (id.uniprot.human [proteinId]), type='ENTREZ'))
 
   if (proteinId %in% names (id.uniprot.mouse))
     return (list (geneId=as.character (id.uniprot.mouse [proteinId]), type='ENTREZ'))
 
   if (proteinId %in% names (id.refseq.human))
     return (list (geneId=as.character (id.refseq.human [proteinId]), type='ENTREZ'))
 
   if (proteinId %in% names (id.refseq.mouse))
     return (list (geneId=as.character (id.refseq.mouse [proteinId]), type='ENTREZ'))
 
   found.leading.Y = length (grep ("^Y", proteinId, perl=TRUE))
 
   if (found.leading.Y == 1)
     return (list (geneId=proteinId, type='SGD'))
 
   return (list (geneId=NA_character_, type=NA_character_))
 
 } # assignGeneId
 #------------------------------------------------------------------------------------------------------------------------