import os, glob, gzip, tarfile, shutil, time, sys, gc, pickle, tempfile
import urllib.request, urllib.error, urllib.parse
from contextlib import closing
from configparser import SafeConfigParser
from pyGeno.tools.ProgressBar import ProgressBar
import pyGeno.configuration as conf
from pyGeno.Genome import *
from pyGeno.Chromosome import *
from pyGeno.Gene import *
from pyGeno.Transcript import *
from pyGeno.Exon import *
from pyGeno.Protein import *
from pyGeno.tools.parsers.GTFTools import GTFFile
from pyGeno.tools.ProgressBar import ProgressBar
from pyGeno.tools.io import printf
import gc
#~ import objgraph
[docs]def backUpDB() :
"""backup the current database version. automatically called by importGenome(). Returns the filename of the backup"""
st = time.ctime().replace(' ', '_')
fn = conf.pyGeno_RABA_DBFILE.replace('.db', '_%s-bck.db' % st)
shutil.copy2(conf.pyGeno_RABA_DBFILE, fn)
return fn
def _decompressPackage(packageFile) :
pFile = tarfile.open(packageFile)
packageDir = tempfile.mkdtemp(prefix = "pyGeno_import_")
if os.path.isdir(packageDir) :
shutil.rmtree(packageDir)
os.makedirs(packageDir)
for mem in pFile :
pFile.extract(mem, packageDir)
return packageDir
def _getFile(fil, directory) :
if fil.find("http://") == 0 or fil.find("ftp://") == 0 :
printf("Downloading file: %s..." % fil)
finalFile = os.path.normpath('%s/%s' %(directory, fil.split('/')[-1]))
urllib.request.urlretrieve (fil, finalFile)
#with closing(urllib.request.urlopen(fil)) as r:
# with open(finalFile, 'wb') as f:
# shutil.copyfileobj(r, f)
printf('done.')
else :
finalFile = os.path.normpath('%s/%s' %(directory, fil))
return finalFile
[docs]def deleteGenome(species, name) :
"""Removes a genome from the database"""
printf('deleting genome (%s, %s)...' % (species, name))
conf.db.beginTransaction()
objs = []
allGood = True
try :
genome = Genome_Raba(name = name, species = species.lower())
objs.append(genome)
pBar = ProgressBar(label = 'preparing')
for typ in (Chromosome_Raba, Gene_Raba, Transcript_Raba, Exon_Raba, Protein_Raba) :
pBar.update()
f = RabaQuery(typ, namespace = genome._raba_namespace)
f.addFilter({'genome' : genome})
for e in f.run(gen=True) :
objs.append(e)
pBar.close()
pBar = ProgressBar(nbEpochs = len(objs), label = 'deleting objects')
for e in objs :
pBar.update()
e.delete()
pBar.close()
except KeyError as e :
#~ printf("\tWARNING, couldn't remove genome form db, maybe it's not there: ", e)
raise KeyError("\tWARNING, couldn't remove genome form db, maybe it's not there: ", e)
allGood = False
printf('\tdeleting folder')
try :
shutil.rmtree(conf.getGenomeSequencePath(species, name))
except OSError as e:
#~ printf('\tWARNING, Unable to delete folder: ', e)
OSError('\tWARNING, Unable to delete folder: ', e)
allGood = False
conf.db.endTransaction()
return allGood
[docs]def importGenome(packageFile, batchSize = 50, verbose = 0) :
"""Import a pyGeno genome package. A genome packages is folder or a tar.gz ball that contains at it's root:
* gziped fasta files for all chromosomes, or URLs from where them must be downloaded
* gziped GTF gene_set file from Ensembl, or an URL from where it must be downloaded
* a manifest.ini file such as::
[package_infos]
description = Test package. This package installs only chromosome Y of mus musculus
maintainer = Tariq Daouda
maintainer_contact = tariq.daouda [at] umontreal
version = GRCm38.73
[genome]
species = Mus_musculus
name = GRCm38_test
source = http://useast.ensembl.org/info/data/ftp/index.html
[chromosome_files]
Y = Mus_musculus.GRCm38.73.dna.chromosome.Y.fa.gz / or an url such as ftp://... or http://
[gene_set]
gtf = Mus_musculus.GRCm38.73_Y-only.gtf.gz / or an url such as ftp://... or http://
All files except the manifest can be downloaded from: http://useast.ensembl.org/info/data/ftp/index.html
A rollback is performed if an exception is caught during importation
batchSize sets the number of genes to parse before performing a database save. PCs with little ram like
small values, while those endowed with more memory may perform faster with higher ones.
Verbose must be an int [0, 4] for various levels of verbosity
"""
def reformatItems(items) :
s = str(items)
s = s.replace('[', '').replace(']', '').replace("',", ': ').replace('), ', '\n').replace("'", '').replace('(', '').replace(')', '')
return s
printf('Importing genome package: %s... (This may take a while)' % packageFile)
isDir = False
if not os.path.isdir(packageFile) :
packageDir = _decompressPackage(packageFile)
else :
isDir = True
packageDir = packageFile
parser = SafeConfigParser()
parser.read(os.path.normpath(packageDir+'/manifest.ini'))
packageInfos = parser.items('package_infos')
genomeName = parser.get('genome', 'name')
species = parser.get('genome', 'species')
genomeSource = parser.get('genome', 'source')
seqTargetDir = conf.getGenomeSequencePath(species.lower(), genomeName)
if os.path.isdir(seqTargetDir) :
raise KeyError("The directory %s already exists, Please call deleteGenome() first if you want to reinstall" % seqTargetDir)
gtfFile = _getFile(parser.get('gene_set', 'gtf'), packageDir)
chromosomesFiles = {}
chromosomeSet = set()
for key, fil in parser.items('chromosome_files') :
chromosomesFiles[key] = _getFile(fil, packageDir)
chromosomeSet.add(key)
try :
genome = Genome(name = genomeName, species = species)
except KeyError:
pass
else :
raise KeyError("There seems to be already a genome (%s, %s), please call deleteGenome() first if you want to reinstall it" % (genomeName, species))
genome = Genome_Raba()
genome.set(name = genomeName, species = species, source = genomeSource, packageInfos = packageInfos)
printf("Importing:\n\t%s\nGenome:\n\t%s\n..." % (reformatItems(packageInfos).replace('\n', '\n\t'), reformatItems(parser.items('genome')).replace('\n', '\n\t')))
chros = _importGenomeObjects(gtfFile, chromosomeSet, genome, batchSize, verbose)
os.makedirs(seqTargetDir)
startChro = 0
pBar = ProgressBar(nbEpochs = len(chros))
for chro in chros :
pBar.update(label = "Importing DNA, chro %s" % chro.number)
length = _importSequence(chro, chromosomesFiles[chro.number.lower()], seqTargetDir)
chro.start = startChro
chro.end = startChro+length
startChro = chro.end
chro.save()
pBar.close()
if not isDir :
shutil.rmtree(packageDir)
#~ objgraph.show_most_common_types(limit=20)
return True
#~ @profile
def _importGenomeObjects(gtfFilePath, chroSet, genome, batchSize, verbose = 0) :
"""verbose must be an int [0, 4] for various levels of verbosity"""
class Store(object) :
def __init__(self, conf) :
self.conf = conf
self.chromosomes = {}
self.genes = {}
self.transcripts = {}
self.proteins = {}
self.exons = {}
def batch_save(self) :
self.conf.db.beginTransaction()
for c in self.genes.values() :
c.save()
conf.removeFromDBRegistery(c)
for c in self.transcripts.values() :
c.save()
conf.removeFromDBRegistery(c.exons)
conf.removeFromDBRegistery(c)
for c in self.proteins.values() :
c.save()
conf.removeFromDBRegistery(c)
self.conf.db.endTransaction()
del(self.genes)
del(self.transcripts)
del(self.proteins)
del(self.exons)
self.genes = {}
self.transcripts = {}
self.proteins = {}
self.exons = {}
gc.collect()
def save_chros(self) :
pBar = ProgressBar(nbEpochs = len(self.chromosomes))
for c in self.chromosomes.values() :
pBar.update(label = 'Chr %s' % c.number)
c.save()
pBar.close()
printf('Importing gene set infos from %s...' % gtfFilePath)
printf('Backuping indexes...')
indexes = conf.db.getIndexes()
printf("Dropping all your indexes (don't worry I'll restore them later)...")
Genome_Raba.flushIndexes()
Chromosome_Raba.flushIndexes()
Gene_Raba.flushIndexes()
Transcript_Raba.flushIndexes()
Protein_Raba.flushIndexes()
Exon_Raba.flushIndexes()
printf("Parsing gene set...")
gtf = GTFFile(gtfFilePath, gziped = True)
printf('Done. Importation begins!')
store = Store(conf)
chroNumber = None
pBar = ProgressBar(nbEpochs = len(gtf))
for line in gtf :
chroN = line['seqname']
pBar.update(label = "Chr %s" % chroN)
if (chroN.upper() in chroSet or chroN.lower() in chroSet):
strand = line['strand']
gene_biotype = line['gene_biotype']
regionType = line['feature']
frame = line['frame']
start = int(line['start']) - 1
end = int(line['end'])
if start > end :
start, end = end, start
chroNumber = chroN.upper()
if chroNumber not in store.chromosomes :
store.chromosomes[chroNumber] = Chromosome_Raba()
store.chromosomes[chroNumber].set(genome = genome, number = chroNumber)
try :
geneId = line['gene_id']
geneName = line['gene_name']
except KeyError :
geneId = None
geneName = None
if verbose :
printf('Warning: no gene_id/name found in line %s' % gtf[i])
if geneId is not None :
if geneId not in store.genes :
if len(store.genes) > batchSize :
store.batch_save()
if verbose > 0 :
printf('\tGene %s, %s...' % (geneId, geneName))
store.genes[geneId] = Gene_Raba()
store.genes[geneId].set(genome = genome, id = geneId, chromosome = store.chromosomes[chroNumber], name = geneName, strand = strand, biotype = gene_biotype)
if store.genes[geneId].start is None or start < store.genes[geneId].start:
store.genes[geneId].start = start
if store.genes[geneId].end is None or end > store.genes[geneId].end:
store.genes[geneId].end = end
try :
transId = line['transcript_id']
transName = line['transcript_name']
try :
transcript_biotype = line['transcript_biotype']
except KeyError :
transcript_biotype = None
except KeyError :
transId = None
transName = None
if verbose > 2 :
printf('\t\tWarning: no transcript_id, name found in line %s' % gtf[i])
if transId is not None :
if transId not in store.transcripts :
if verbose > 1 :
printf('\t\tTranscript %s, %s...' % (transId, transName))
store.transcripts[transId] = Transcript_Raba()
store.transcripts[transId].set(genome = genome, id = transId, chromosome = store.chromosomes[chroNumber], gene = store.genes.get(geneId, None), name = transName, biotype=transcript_biotype)
if store.transcripts[transId].start is None or start < store.transcripts[transId].start:
store.transcripts[transId].start = start
if store.transcripts[transId].end is None or end > store.transcripts[transId].end:
store.transcripts[transId].end = end
try :
protId = line['protein_id']
except KeyError :
protId = None
if verbose > 2 :
printf('Warning: no protein_id found in line %s' % gtf[i])
# Store selenocysteine positions in transcript
if regionType == 'Selenocysteine':
store.transcripts[transId].selenocysteine.append(start)
if protId is not None and protId not in store.proteins :
if verbose > 1 :
printf('\t\tProtein %s...' % (protId))
store.proteins[protId] = Protein_Raba()
store.proteins[protId].set(genome = genome, id = protId, chromosome = store.chromosomes[chroNumber], gene = store.genes.get(geneId, None), transcript = store.transcripts.get(transId, None), name = transName)
store.transcripts[transId].protein = store.proteins[protId]
try :
exonNumber = int(line['exon_number']) - 1
exonKey = (transId, exonNumber)
except KeyError :
exonNumber = None
exonKey = None
if verbose > 2 :
printf('Warning: no exon number or id found in line %s' % gtf[i])
if exonKey is not None :
if verbose > 3 :
printf('\t\t\texon %s...' % (exonId))
if exonKey not in store.exons and regionType == 'exon' :
store.exons[exonKey] = Exon_Raba()
store.exons[exonKey].set(genome = genome, chromosome = store.chromosomes[chroNumber], gene = store.genes.get(geneId, None), transcript = store.transcripts.get(transId, None), protein = store.proteins.get(protId, None), strand = strand, number = exonNumber, start = start, end = end)
store.transcripts[transId].exons.append(store.exons[exonKey])
try :
store.exons[exonKey].id = line['exon_id']
except KeyError :
pass
if regionType == 'exon' :
if store.exons[exonKey].start is None or start < store.exons[exonKey].start:
store.exons[exonKey].start = start
if store.exons[exonKey].end is None or end > store.transcripts[transId].end:
store.exons[exonKey].end = end
elif regionType == 'CDS' :
store.exons[exonKey].CDS_start = start
store.exons[exonKey].CDS_end = end
store.exons[exonKey].frame = frame
elif regionType == 'stop_codon' :
if strand == '+' :
if store.exons[exonKey].CDS_end != None :
store.exons[exonKey].CDS_end += 3
if store.exons[exonKey].end < store.exons[exonKey].CDS_end :
store.exons[exonKey].end = store.exons[exonKey].CDS_end
if store.transcripts[transId].end < store.exons[exonKey].CDS_end :
store.transcripts[transId].end = store.exons[exonKey].CDS_end
if store.genes[geneId].end < store.exons[exonKey].CDS_end :
store.genes[geneId].end = store.exons[exonKey].CDS_end
if strand == '-' :
if store.exons[exonKey].CDS_start != None :
store.exons[exonKey].CDS_start -= 3
if store.exons[exonKey].start > store.exons[exonKey].CDS_start :
store.exons[exonKey].start = store.exons[exonKey].CDS_start
if store.transcripts[transId].start > store.exons[exonKey].CDS_start :
store.transcripts[transId].start = store.exons[exonKey].CDS_start
if store.genes[geneId].start > store.exons[exonKey].CDS_start :
store.genes[geneId].start = store.exons[exonKey].CDS_start
pBar.close()
store.batch_save()
conf.db.beginTransaction()
printf('almost done saving chromosomes...')
store.save_chros()
printf('saving genome object...')
genome.save()
conf.db.endTransaction()
conf.db.beginTransaction()
printf('restoring core indexes...')
# Genome.ensureGlobalIndex(('name', 'species'))
# Chromosome.ensureGlobalIndex('genome')
# Gene.ensureGlobalIndex('genome')
# Transcript.ensureGlobalIndex('genome')
# Protein.ensureGlobalIndex('genome')
# Exon.ensureGlobalIndex('genome')
Transcript.ensureGlobalIndex('exons')
printf('commiting changes...')
conf.db.endTransaction()
conf.db.beginTransaction()
printf('restoring user indexes')
pBar = ProgressBar(label = "restoring", nbEpochs = len(indexes))
for idx in indexes :
pBar.update()
conf.db.execute(idx[-1].replace('CREATE INDEX', 'CREATE INDEX IF NOT EXISTS'))
pBar.close()
printf('commiting changes...')
conf.db.endTransaction()
return list(store.chromosomes.values())
#~ @profile
def _importSequence(chromosome, fastaFile, targetDir) :
"Serializes fastas into .dat files"
f = gzip.open(fastaFile, 'rt')
header = f.readline()
strRes = f.read().upper().replace('\n', '').replace('\r', '')
f.close()
fn = '%s/chromosome%s.dat' % (targetDir, chromosome.number)
f = open(fn, 'w')
f.write(strRes)
f.close()
chromosome.dataFile = fn
chromosome.header = header
return len(strRes)