Source code for tools.UsefulFunctions

import os, copy

[docs]class UnknownNucleotide(Exception) : def __init__(self, nuc) : self.msg = 'Unknown nucleotides %s' % str(nuc) def __str__(self) : return self.msg
#This will probably be moved somewhere else in the futur def saveResults(directoryName, fileName, strResults, log = '', args = ''): if not os.path.exists(directoryName): os.makedirs(directoryName) resPath = "%s/%s"%(directoryName, fileName) resFile = open(resPath, 'w') print("Saving results :\n\t%s..."%resPath) resFile.write(strResults) resFile.close() if log != '' : errPath = "%s.err.txt"%(resPath) errFile = open(errPath, 'w') print("Saving log :\n\t%s..." %errPath) errFile.write(log) errFile.close() if args != '' : paramPath = "%s.args.txt"%(resPath) paramFile = open(paramPath, 'w') print("Saving arguments :\n\t%s..." %paramPath) paramFile.write(args) paramFile.close() return "%s/"%(directoryName) nucleotides = ['A', 'T', 'C', 'G'] polymorphicNucleotides = { 'R' : ['A','G'], 'Y' : ['C','T'], 'M': ['A','C'], 'K' : ['T','G'], 'W' : ['A','T'], 'S' : ['C','G'], 'B': ['C','G','T'], 'D' : ['A','G','T'], 'H' : ['A','C','T'], 'V' : ['A','C','G'], 'N': ['A','C','G','T'] } #<7iyed> #from Molecular Systems Biology 8; Article number 572; doi:10.1038/msb.2012.3 codonAffinity = {'CTT': 'low', 'ACC': 'high', 'ACA': 'low', 'ACG': 'high', 'ATC': 'high', 'AAC': 'high', 'ATA': 'low', 'AGG': 'high', 'CCT': 'low', 'ACT': 'low', 'AGC': 'high', 'AAG': 'high', 'AGA': 'low', 'CAT': 'low', 'AAT': 'low', 'ATT': 'low', 'CTG': 'high', 'CTA': 'low', 'CTC': 'high', 'CAC': 'high', 'AAA': 'low', 'CCG': 'high', 'AGT': 'low', 'CCA': 'low', 'CAA': 'low', 'CCC': 'high', 'TAT': 'low', 'GGT': 'low', 'TGT': 'low', 'CGA': 'low', 'CAG': 'high', 'TCT': 'low', 'GAT': 'low', 'CGG': 'high', 'TTT': 'low', 'TGC': 'high', 'GGG': 'high', 'TAG': 'high', 'GGA': 'low', 'TGG': 'high', 'GGC': 'high', 'TAC': 'high', 'TTC': 'high', 'TCG': 'high', 'TTA': 'low', 'TTG': 'high', 'TCC': 'high', 'GAA': 'low', 'TAA': 'high', 'GCA': 'low', 'GTA': 'low', 'GCC': 'high', 'GTC': 'high', 'GCG': 'high', 'GTG': 'high', 'GAG': 'high', 'GTT': 'low', 'GCT': 'low', 'TGA': 'high', 'GAC': 'high', 'CGT': 'low', 'TCA': 'low', 'ATG': 'high', 'CGC': 'high'} lowAffinityCodons = set(['CTT', 'ACA', 'AAA', 'ATA', 'CCT', 'AGA', 'CAT', 'AAT', 'ATT', 'CTA', 'ACT', 'CAA', 'AGT', 'CCA', 'TAT', 'GGT', 'TGT', 'CGA', 'TCT', 'GAT', 'TTT', 'GGA', 'TTA', 'CGT', 'GAA', 'TCA', 'GCA', 'GTA', 'GTT', 'GCT']) highAffinityCodons = set(['ACC', 'ATG', 'AAG', 'ACG', 'ATC', 'AAC', 'AGG', 'AGC', 'CTG', 'CTC', 'CAC', 'CCG', 'CAG', 'CCC', 'CGC', 'CGG', 'TGC', 'GGG', 'TAG', 'TGG', 'GGC', 'TAC', 'TTC', 'TCG', 'TTG', 'TCC', 'TAA', 'GCC', 'GTC', 'GCG', 'GTG', 'GAG', 'TGA', 'GAC']) #</7iyed> #Empiraclly calculated using genome GRCh37.74 and Ensembl annotations humanCodonCounts = {'CTT': 588990, 'ACC': 760250, 'ACA': 671093, 'ACG': 248588, 'ATC': 819539, 'AAC': 777291, 'ATA': 326568, 'AGG': 520514, 'CCT': 784233, 'ACT': 581281, 'AGC': 826157, 'AAG': 1373474, 'AGA': 560614, 'CAT': 487348, 'AAT': 745200, 'ATT': 685951, 'CTG': 1579105, 'CTA': 311963, 'CTC': 772503, 'CAC': 618558, 'AAA': 1111269, 'CCG': 285345, 'AGT': 558788, 'CCA': 771391, 'CAA': 572531, 'CCC': 809928, 'TAT': 507376, 'GGT': 459267, 'TGT': 443487, 'CGA': 276584, 'CAG': 1483627, 'TCT': 675336, 'GAT': 982540, 'CGG': 477748, 'TTT': 721642, 'TGC': 495033, 'GGG': 661842, 'TAG': 28685, 'GGA': 731598, 'TGG': 535340, 'GGC': 877641, 'TAC': 588108, 'TTC': 774303, 'TCG': 185384, 'TTA': 348372, 'TTG': 563764, 'TCC': 729893, 'GAA': 1355256, 'TAA': 37503, 'GCA': 718158, 'GTA': 316640, 'GCC': 1120424, 'GTC': 576027, 'GCG': 289438, 'GTG': 1119171, 'GAG': 1685297, 'GTT': 486471, 'GCT': 806491, 'TGA': 82954, 'GAC': 1033108, 'CGT': 200762, 'TCA': 569093, 'ATG': 935789, 'CGC': 404889} humanCodonCount = 42433513 humanCodonRatios = {'CTT': 0.013880302580651288, 'ACC': 0.017916263496731935, 'ACA': 0.01581516477318293, 'ACG': 0.005858294127097137, 'ATC': 0.019313484603549088, 'AAC': 0.018317856454637634, 'ATA': 0.007695992551924702, 'AGG': 0.012266578070026868, 'CCT': 0.018481453562423644, 'ACT': 0.0136986301369863, 'AGC': 0.01946944623698726, 'AAG': 0.03236767127906662, 'AGA': 0.013211585851965638, 'CAT': 0.011484978865643295, 'AAT': 0.017561591000019253, 'ATT': 0.016165312544356155, 'CTG': 0.0372136287655467, 'CTA': 0.007351807049300867, 'CTC': 0.018205021111497414, 'CAC': 0.014577110313727737, 'AAA': 0.02618847513285077, 'CCG': 0.006724519838835875, 'AGT': 0.01316855382678309, 'CCA': 0.018178815409414725, 'CAA': 0.013492425197037068, 'CCC': 0.01908698909750885, 'TAT': 0.011956964298477951, 'GGT': 0.010823214189218791, 'TGT': 0.010451338308944631, 'CGA': 0.006518055669819277, 'CAG': 0.034963567593378375, 'TCT': 0.015915156494349172, 'GAT': 0.023154811622596506, 'CGG': 0.011258742588670422, 'TTT': 0.017006416602839365, 'TGC': 0.01166608571861585, 'GGG': 0.015597153127529177, 'TAG': 0.0006759987088507143, 'GGA': 0.017241042475083315, 'TGG': 0.012615971720276849, 'GGC': 0.020682732537369696, 'TAC': 0.013859517122704406, 'TTC': 0.01824744041342983, 'TCG': 0.004368811038576985, 'TTA': 0.008209831695999339, 'TTG': 0.013285819630347362, 'TCC': 0.017200861969641778, 'GAA': 0.03193834081095289, 'TAA': 0.0008838061557618385, 'GCA': 0.01692431168732129, 'GTA': 0.007462026535488589, 'GCC': 0.026404224415734798, 'GTC': 0.013574812907901357, 'GCG': 0.006820976618174413, 'GTG': 0.026374695868334068, 'GAG': 0.039716179049328296, 'GTT': 0.011464311239090669, 'GCT': 0.01900599179709679, 'TGA': 0.0019549170958341345, 'GAC': 0.024346511211551115, 'CGT': 0.004731213274752906, 'TCA': 0.013411404330346158, 'ATG': 0.022053064520017467, 'CGC': 0.009541727077840574} synonymousCodonsFrequencies = {'A': {'GCA': 0.24472833804337418, 'GCC': 0.38180943946027124, 'GCT': 0.2748297757275403, 'GCG': 0.09863244676881429}, 'C': {'TGC': 0.5274613220815753, 'TGT': 0.47253867791842474}, 'E': {'GAG': 0.5542731864894314, 'GAA': 0.44572681351056864}, 'D': {'GAT': 0.48745614313610314, 'GAC': 0.5125438568638969}, 'G': {'GGT': 0.1682082284016543, 'GGG': 0.24240206742876733, 'GGA': 0.26795045906236126, 'GGC': 0.3214392451072171}, 'F': {'TTC': 0.51760124870901, 'TTT': 0.48239875129098997}, 'I': {'ATT': 0.3744155479793762, 'ATC': 0.4473324534485262, 'ATA': 0.17825199857209761}, 'H': {'CAC': 0.5593224017231121, 'CAT': 0.4406775982768879}, 'K': {'AAG': 0.552763002048904, 'AAA': 0.44723699795109595}, '*': {'TAG': 0.19233348084375965, 'TGA': 0.5562081774416328, 'TAA': 0.25145834171460757}, 'M': {'ATG': 1.0}, 'L': {'CTT': 0.14142445416797428, 'CTG': 0.37916443861342136, 'CTA': 0.07490652981477404, 'CTC': 0.18548840407837594, 'TTA': 0.08364882247135866, 'TTG': 0.13536735085409574}, 'N': {'AAT': 0.48946102144446174, 'AAC': 0.5105389785555383}, 'Q': {'CAA': 0.27844698705060605, 'CAG': 0.721553012949394}, 'P': {'CCT': 0.29583684315158226, 'CCG': 0.1076409230535928, 'CCA': 0.2909924451987384, 'CCC': 0.30552978859608654}, 'S': {'TCT': 0.19052256484488883, 'AGC': 0.23307146458142142, 'TCG': 0.05229964811768493, 'AGT': 0.15764260007543762, 'TCC': 0.2059139249534016, 'TCA': 0.1605497974271656}, 'R': {'AGG': 0.21322832103906786, 'CGC': 0.16586259289315397, 'CGG': 0.19570924878057572, 'CGA': 0.11330250857089251, 'AGA': 0.2296552676219967, 'CGT': 0.0822420610943132}, 'T': {'ACC': 0.33621349966301256, 'ACA': 0.2967846446949689, 'ACG': 0.10993573358004469, 'ACT': 0.2570661220619738}, 'W': {'TGG': 1.0}, 'V': {'GTA': 0.12674172810489015, 'GTC': 0.230566755353321, 'GTT': 0.19472010868151218, 'GTG': 0.44797140786027667}, 'Y': {'TAT': 0.46315236005272553, 'TAC': 0.5368476399472745}} # Translation tables # Ref: https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi translTable = dict() # Standard Code (NCBI transl_table=1) translTable['default'] = { 'TTT' : 'F', 'TCT' : 'S', 'TAT' : 'Y', 'TGT' : 'C', 'TTC' : 'F', 'TCC' : 'S', 'TAC' : 'Y', 'TGC' : 'C', 'TTA' : 'L', 'TCA' : 'S', 'TAA' : '*', 'TGA' : '*', 'TTG' : 'L', 'TCG' : 'S', 'TAG' : '*', 'TGG' : 'W', 'CTT' : 'L', 'CTC' : 'L', 'CTA' : 'L', 'CTG' : 'L', 'CCT' : 'P', 'CCC' : 'P', 'CCA' : 'P', 'CCG' : 'P', 'CAT' : 'H', 'CAC' : 'H', 'CAA' : 'Q', 'CAG' : 'Q', 'CGT' : 'R', 'CGC' : 'R', 'CGA' : 'R', 'CGG' : 'R', 'ATT' : 'I', 'ATC' : 'I', 'ATA' : 'I', 'ATG' : 'M', 'ACT' : 'T', 'ACC' : 'T', 'ACA' : 'T', 'ACG' : 'T', 'AAT' : 'N', 'AAC' : 'N', 'AAA' : 'K', 'AAG' : 'K', 'AGT' : 'S', 'AGC' : 'S', 'AGA' : 'R', 'AGG' : 'R', 'GTT' : 'V', 'GTC' : 'V', 'GTA' : 'V', 'GTG' : 'V', 'GCT' : 'A', 'GCC' : 'A', 'GCA' : 'A', 'GCG' : 'A', 'GAT' : 'D', 'GAC' : 'D', 'GAA' : 'E', 'GAG' : 'E', 'GGT' : 'G', 'GGC' : 'G', 'GGA' : 'G', 'GGG' : 'G', '!GA' : 'U' } codonTable = translTable['default'] # The Vertebrate Mitochondrial Code (NCBI transl_table=2) translTable['mt'] = { 'TTT' : 'F', 'TCT' : 'S', 'TAT' : 'Y', 'TGT' : 'C', 'TTC' : 'F', 'TCC' : 'S', 'TAC' : 'Y', 'TGC' : 'C', 'TTA' : 'L', 'TCA' : 'S', 'TAA' : '*', 'TGA' : 'W', 'TTG' : 'L', 'TCG' : 'S', 'TAG' : '*', 'TGG' : 'W', 'CTT' : 'L', 'CTC' : 'L', 'CTA' : 'L', 'CTG' : 'L', 'CCT' : 'P', 'CCC' : 'P', 'CCA' : 'P', 'CCG' : 'P', 'CAT' : 'H', 'CAC' : 'H', 'CAA' : 'Q', 'CAG' : 'Q', 'CGT' : 'R', 'CGC' : 'R', 'CGA' : 'R', 'CGG' : 'R', 'ATT' : 'I', 'ATC' : 'I', 'ATA' : 'M', 'ATG' : 'M', 'ACT' : 'T', 'ACC' : 'T', 'ACA' : 'T', 'ACG' : 'T', 'AAT' : 'N', 'AAC' : 'N', 'AAA' : 'K', 'AAG' : 'K', 'AGT' : 'S', 'AGC' : 'S', 'AGA' : '*', 'AGG' : '*', 'GTT' : 'V', 'GTC' : 'V', 'GTA' : 'V', 'GTG' : 'V', 'GCT' : 'A', 'GCC' : 'A', 'GCA' : 'A', 'GCG' : 'A', 'GAT' : 'D', 'GAC' : 'D', 'GAA' : 'E', 'GAG' : 'E', 'GGT' : 'G', 'GGC' : 'G', 'GGA' : 'G', 'GGG' : 'G' } AATable = {'A': ['GCA', 'GCC', 'GCG', 'GCT'], 'C': ['TGT', 'TGC'], 'E': ['GAG', 'GAA'], 'D': ['GAT', 'GAC'], 'G': ['GGT', 'GGG', 'GGA', 'GGC'], 'F': ['TTT', 'TTC'], 'I': ['ATC', 'ATA', 'ATT'], 'H': ['CAT', 'CAC'], 'K': ['AAG', 'AAA'], '*': ['TAG', 'TGA', 'TAA'], 'M': ['ATG'], 'L': ['CTT', 'CTG', 'CTA', 'CTC', 'TTA', 'TTG'], 'N': ['AAC', 'AAT'], 'Q': ['CAA', 'CAG'], 'P': ['CCT', 'CCG', 'CCA', 'CCC'], 'S': ['AGC', 'AGT', 'TCT', 'TCG', 'TCC', 'TCA'], 'R': ['AGG', 'AGA', 'CGA', 'CGG', 'CGT', 'CGC'], 'T': ['ACA', 'ACG', 'ACT', 'ACC'], 'W': ['TGG'], 'V': ['GTA', 'GTC', 'GTG', 'GTT'], 'Y': ['TAT', 'TAC']} AAs = ['A', 'C', 'E', 'D', 'G', 'F', 'I', 'H', 'K', '*', 'M', 'L', 'N', 'Q', 'P', 'S', 'R', 'T', 'W', 'V', 'Y'] toFloat = lambda x: float(x) toInt = lambda x: int(x) floatToStr = lambda x:"%f"%(x) intToStr = lambda x:"%d"%(x) splitStr = lambda x: x.split(';') stripSplitStr = lambda x: x.strip().split(';')
[docs]def findAll(haystack, needle) : """returns a list of all occurances of needle in haystack""" h = haystack res = [] f = haystack.find(needle) offset = 0 while (f >= 0) : #print h, needle, f, offset res.append(f+offset) offset += f+len(needle) h = h[f+len(needle):] f = h.find(needle) return res
[docs]def complementTab(seq=[]): """returns a list of complementary sequence without inversing it""" complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'R': 'Y', 'Y': 'R', 'M': 'K', 'K': 'M', 'W': 'W', 'S': 'S', 'B': 'V', 'D': 'H', 'H': 'D', 'V': 'B', 'N': 'N', 'a': 't', 'c': 'g', 'g': 'c', 't': 'a', 'r': 'y', 'y': 'r', 'm': 'k', 'k': 'm', 'w': 'w', 's': 's', 'b': 'v', 'd': 'h', 'h': 'd', 'v': 'b', 'n': 'n'} seq_tmp = [] for bps in seq: if len(bps) == 0: #Need manage '' for deletion seq_tmp.append('') elif len(bps) == 1: seq_tmp.append(complement[bps]) else: #Need manage 'ACT' for insertion #The insertion need to be reverse complement (like seq) seq_tmp.append(reverseComplement(bps)) #Doesn't work in the second for when bps=='' #seq = [complement[bp] if bp != '' else '' for bps in seq for bp in bps] return seq_tmp
[docs]def reverseComplementTab(seq): ''' Complements a DNA sequence, returning the reverse complement in a list to manage INDEL. ''' return complementTab(seq[::-1])
[docs]def reverseComplement(seq): ''' Complements a DNA sequence, returning the reverse complement. ''' return complement(seq)[::-1]
[docs]def complement(seq) : """returns the complementary sequence without inversing it""" tb = str.maketrans("ACGTRYMKWSBDHVNacgtrymkwsbdhvn", "TGCAYRKMWSVHDBNtgcayrkmwsvhdbn") #just to be sure that seq isn't unicode return str(seq).translate(tb)
[docs]def translateDNA_6Frames(sequence) : """returns 6 translation of sequence. One for each reading frame""" trans = ( translateDNA(sequence, 'f1'), translateDNA(sequence, 'f2'), translateDNA(sequence, 'f3'), translateDNA(sequence, 'r1'), translateDNA(sequence, 'r2'), translateDNA(sequence, 'r3'), ) return trans
[docs]def translateDNA(sequence, frame = 'f1', translTable_id='default') : """Translates DNA code, frame : fwd1, fwd2, fwd3, rev1, rev2, rev3""" protein = "" if frame == 'f1' : dna = sequence elif frame == 'f2': dna = sequence[1:] elif frame == 'f3' : dna = sequence[2:] elif frame == 'r1' : dna = reverseComplement(sequence) elif frame == 'r2' : dna = reverseComplement(sequence) dna = dna[1:] elif frame == 'r3' : dna = reverseComplement(sequence) dna = dna[2:] else : raise ValueError('unknown reading frame: %s, should be one of the following: fwd1, fwd2, fwd3, rev1, rev2, rev3' % frame) for i in range(0, len(dna), 3) : codon = dna[i:i+3] # Check if variant messed with selenocysteine codon if '!' in codon and codon != '!GA': codon = codon.replace('!', 'T') if (len(codon) == 3) : try : # MC protein += translTable[translTable_id][codon] except KeyError : combinaisons = polymorphicCodonCombinaisons(list(codon)) translations = set() for ci in range(len(combinaisons)): translations.add(translTable[translTable_id][combinaisons[ci]]) protein += '/'.join(translations) return protein
[docs]def getSequenceCombinaisons(polymorphipolymorphicDnaSeqSeq, pos = 0) : """Takes a dna sequence with polymorphismes and returns all the possible sequences that it can yield""" if type(polymorphipolymorphicDnaSeqSeq) is not list : seq = list(polymorphipolymorphicDnaSeqSeq) else : seq = polymorphipolymorphicDnaSeqSeq if pos >= len(seq) : return [''.join(seq)] variants = [] if seq[pos] in polymorphicNucleotides : chars = decodePolymorphicNucleotide(seq[pos]) else : chars = seq[pos]#.split('/') for c in chars : rseq = copy.copy(seq) rseq[pos] = c variants.extend(getSequenceCombinaisons(rseq, pos + 1)) return variants
[docs]def polymorphicCodonCombinaisons(codon) : """Returns all the possible amino acids encoded by codon""" return getSequenceCombinaisons(codon, 0)
[docs]def encodePolymorphicNucleotide(polySeq) : """returns a single character encoding all nucletides of polySeq in a single character. PolySeq must have one of the following forms: ['A', 'T', 'G'], 'ATG', 'A/T/G'""" if type(polySeq) is str : if polySeq.find("/") < 0 : sseq = list(polySeq) else : sseq = polySeq.split('/') else : sseq = polySeq seq = [] for n in sseq : try : for n2 in polymorphicNucleotides[n] : seq.append(n2) except KeyError : seq.append(n) seq = set(seq) if len(seq) == 4: return 'N' elif len(seq) == 3 : if 'T' not in seq : return 'V' elif 'G' not in seq : return 'H' elif 'C' not in seq : return 'D' elif 'A' not in seq : return 'B' elif len(seq) == 2 : if 'A' in seq and 'G' in seq : return 'R' elif 'C' in seq and 'T' in seq : return 'Y' elif 'A' in seq and 'C' in seq : return 'M' elif 'T' in seq and 'G' in seq : return 'K' elif 'A' in seq and 'T' in seq : return 'W' elif 'C' in seq and 'G' in seq : return 'S' elif polySeq[0] in nucleotides : return polySeq[0] else : raise UnknownNucleotide(polySeq)
[docs]def decodePolymorphicNucleotide(nuc) : """the opposite of encodePolymorphicNucleotide, from 'R' to ['A', 'G']""" if nuc in polymorphicNucleotides : return polymorphicNucleotides[nuc] if nuc in nucleotides : return nuc raise ValueError('nuc: %s, is not a valid nucleotide' % nuc)
[docs]def decodePolymorphicNucleotide_str(nuc) : """same as decodePolymorphicNucleotide but returns a string instead of a list, from 'R' to 'A/G""" return '/'.join(decodePolymorphicNucleotide(nuc))
[docs]def getNucleotideCodon(sequence, x1) : """Returns the entire codon of the nucleotide at pos x1 in sequence, and the position of that nocleotide in the codon in a tuple""" if 0 <= x1 < len(sequence): p = x1 % 3 return (sequence[x1 - p : x1 + 3 - p], p) else: return None
[docs]def showDifferences(seq1, seq2) : """Returns a string highligthing differences between seq1 and seq2: * Matches by '-' * Differences : 'A|T' * Exceeded length : '#' """ ret = [] for i in range(max(len(seq1), len(seq2))) : if i >= len(seq1) : c1 = '#' else : c1 = seq1[i] if i >= len(seq2) : c2 = '#' else : c2 = seq2[i] if c1 != c2 : ret.append('%s|%s' % (c1, c2)) else : ret.append('-') return ''.join(ret)
[docs]def highlightSubsequence(sequence, x1, x2, start=' [', stop = '] ') : """returns a sequence where the subsequence in [x1, x2[ is placed in bewteen 'start' and 'stop'""" seq = list(sequence) #print(x1, x2-1, len(seq)) seq[x1] = start + seq[x1] seq[x2-1] = seq[x2-1] + stop return ''.join(seq)
# def highlightSubsequence(sequence, x1, x2, start=' [', stop = '] ') : # """returns a sequence where the subsequence in [x1, x2[ is placed # in bewteen 'start' and 'stop'""" # seq = list(sequence) # ii = 0 # acc = True # for i in range(len(seq)) : # if ii == x1 : # seq[i] = start+seq[i] # if ii == x2-1 : # seq[i] = seq[i] + stop # if i < len(seq) - 1 : # if seq[i+1] == '/': # acc = False # else : # acc = True # if acc : # ii += 1 # return ''.join(seq)