Source code for tools.BinarySequence

import array, copy

[docs]class BinarySequence : """A class for representing sequences in a binary format""" ALPHABETA_SIZE = 32 ALPHABETA_KMP = range(ALPHABETA_SIZE) def __init__(self, sequence, arrayForma, charToBinDict) : self.forma = arrayForma self.charToBin = charToBinDict self.sequence = sequence self.binSequence, self.defaultSequence, self.polymorphisms = self.encode(sequence) self.itemsize = self.binSequence.itemsize self.typecode = self.binSequence.typecode #print 'bin', len(self.sequence), len(self.binSequence)
[docs] def encode(self, sequence): """Returns a tuple (binary reprensentation, default sequence, polymorphisms list)""" polymorphisms = [] defaultSequence = '' binSequence = array.array(self.forma.typecode) b = 0 i = 0 trueI = 0 #not inc in case if poly poly = set() # Handle stop variant at the end if sequence.endswith('/'): sequence += '*' while i < len(sequence)-1: b = b | self.forma[self.charToBin[sequence[i]]] if sequence[i+1] == '/' : poly.add(sequence[i]) i += 2 else : binSequence.append(b) if len(poly) > 0 : poly.add(sequence[i]) polymorphisms.append((trueI, poly)) poly = set() while b % 2 != 0 : b = b/2 defaultSequence += sequence[i] b = 0 i += 1 trueI += 1 if i < len(sequence) : b = b | self.forma[self.charToBin[sequence[i]]] binSequence.append(b) if len(poly) > 0 : if sequence[i] not in poly : poly.add(sequence[i]) polymorphisms.append((trueI, poly)) defaultSequence += sequence[i] return (binSequence, defaultSequence, polymorphisms)
def __testFind(self, arr) : if len(arr) == 0: raise TypeError ('binary find, needle is empty') if arr.itemsize != self.itemsize : raise TypeError ('binary find, both arrays must have same item size, arr: %d, self: %d' % (arr.itemsize, self.itemsize)) def __testBinary(self, arr) : if len(arr) != len(self) : raise TypeError ('bin operator, both arrays must be of same length') if arr.itemsize != self.itemsize : raise TypeError ('bin operator, both arrays must have same item size, arr: %d, self: %d' % (arr.itemsize, self.itemsize))
[docs] def findPolymorphisms(self, strSeq, strict = False): """ Compares strSeq with self.sequence. If not 'strict', this function ignores the cases of matching heterozygocity (ex: for a given position i, strSeq[i] = A and self.sequence[i] = 'A/G'). If 'strict' it returns all positions where strSeq differs self,sequence """ arr = self.encode(strSeq)[0] res = [] if not strict : for i in range(len(arr)+len(self)) : if i >= len(arr) or i > len(self) : break if arr[i] & self[i] == 0: res.append(i) else : for i in range(len(arr)+len(self)) : if i >= len(arr) or i > len(self) : break if arr[i] != self[i] : res.append(i) return res
[docs] def getPolymorphisms(self): """returns all polymorphsims in the form of a dict pos => alleles""" return self.polymorphisms
[docs] def getDefaultSequence(self) : """returns a default version of sequence where only the last allele of each polymorphisms is shown""" return self.defaultSequence
def __getSequenceVariants(self, x1, polyStart, polyStop, listSequence) : """polyStop, is the polymorphisme at wixh number where the calcul of combinaisons stops""" if polyStart < len(self.polymorphisms) and polyStart < polyStop: sequence = copy.copy(listSequence) ret = [] pk = self.polymorphisms[polyStart] posInSequence = pk[0]-x1 if posInSequence < len(listSequence) : for allele in sorted(pk[1]): sequence[posInSequence] = allele ret.extend(self.__getSequenceVariants(x1, polyStart+1, polyStop, sequence)) return ret else : return [''.join(listSequence)]
[docs] def getSequenceVariants(self, x1 = 0, x2 = -1, maxVariantNumber = 128) : """returns the sequences resulting from all combinaisons of all polymorphismes between x1 and x2. The results is a couple (bool, variants of sequence between x1 and x2), bool == true if there's more combinaisons than maxVariantNumber""" if x2 == -1 : xx2 = len(self.defaultSequence) else : xx2 = x2 polyStart = None nbP = 1 stopped = False j = 0 for p in self.polymorphisms : if p[0] >= xx2 : break if x1 <= p[0] : if polyStart == None : polyStart = j nbP *= len(p[1]) if nbP > maxVariantNumber : stopped = True break j += 1 if polyStart == None : return (stopped, [self.defaultSequence[x1:xx2]]) return (stopped, self.__getSequenceVariants(x1, polyStart, j, list(self.defaultSequence[x1:xx2])))
[docs] def getNbVariants(self, x1, x2 = -1) : """returns the nb of variants of sequences between x1 and x2""" if x2 == -1 : xx2 = len(self.defaultSequence) else : xx2 = x2 nbP = 1 for p in self.polymorphisms: if x1 <= p[0] and p[0] <= xx2 : nbP *= len(p[1]) return nbP
def _dichFind(self, needle, currHaystack, offset, lst = None) : """dichotomic search, if lst is None, will return the first position found. If it's a list, will return a list of all positions in lst. returns -1 or [] if no match found""" if len(currHaystack) == 1 : if (offset <= (len(self) - len(needle))) and (currHaystack[0] & needle[0]) > 0 and (self[offset+len(needle)-1] & needle[-1]) > 0 : found = True for i in range(1, len(needle)-1) : if self[offset + i] & needle[i] == 0 : found = False break if found : if lst is not None : lst.append(offset) else : return offset else : if lst is None : return -1 else : if (offset <= (len(self) - len(needle))) : if lst is not None : self._dichFind(needle, currHaystack[:len(currHaystack)/2], offset, lst) self._dichFind(needle, currHaystack[len(currHaystack)/2:], offset + len(currHaystack)/2, lst) else : v1 = self._dichFind(needle, currHaystack[:len(currHaystack)/2], offset, lst) if v1 > -1 : return v1 return self._dichFind(needle, currHaystack[len(currHaystack)/2:], offset + len(currHaystack)/2, lst) return -1 def _kmp_construct_next(self, pattern): """the helper function for KMP-string-searching is to construct the DFA. pattern should be an integer array. return a 2D array representing the DFA for moving the pattern.""" next = [[0 for state in pattern] for input_token in self.ALPHABETA_KMP] next[pattern[0]][0] = 1 restart_state = 0 for state in range(1, len(pattern)): for input_token in self.ALPHABETA_KMP: next[input_token][state] = next[input_token][restart_state] next[pattern[state]][state] = state + 1 restart_state = next[pattern[state]][restart_state] return next def _kmp_search_first(self, pInput_sequence, pPattern): """use KMP algorithm to search the first occurrence in the input_sequence of the pattern. both arguments are integer arrays. return the position of the occurence if found; otherwise, -1.""" input_sequence, pattern = pInput_sequence, [len(bin(e)) for e in pPattern] n, m = len(input_sequence), len(pattern) d = p = 0 next = self._kmp_construct_next(pattern) while d < n and p < m: p = next[len(bin(input_sequence[d]))][p] d += 1 if p == m: return d - p else: return -1 def _kmp_search_all(self, pInput_sequence, pPattern): """use KMP algorithm to search all occurrence in the input_sequence of the pattern. both arguments are integer arrays. return a list of the positions of the occurences if found; otherwise, [].""" r = [] input_sequence, pattern = [len(bin(e)) for e in pInput_sequence], [len(bin(e)) for e in pPattern] n, m = len(input_sequence), len(pattern) d = p = 0 next = self._kmp_construct_next(pattern) while d < n: p = next[input_sequence[d]][p] d += 1 if p == m: r.append(d - m) p = 0 return r def _kmp_find(self, needle, haystack, lst = None): """find with KMP-searching. needle is an integer array, reprensenting a pattern. haystack is an integer array, reprensenting the input sequence. if lst is None, return the first position found or -1 if no match found. If it's a list, will return a list of all positions in lst. returns -1 or [] if no match found.""" if lst != None: return self._kmp_search_all(haystack, needle) else: return self._kmp_search_first(haystack, needle)
[docs] def findByBiSearch(self, strSeq) : """returns the first occurence of strSeq in self. Takes polymorphisms into account""" arr = self.encode(strSeq) return self._dichFind(arr[0], self, 0, lst = None)
[docs] def findAllByBiSearch(self, strSeq) : """Same as find but returns a list of all occurences""" arr = self.encode(strSeq) lst = [] self._dichFind(arr[0], self, 0, lst) return lst
[docs] def find(self, strSeq) : """returns the first occurence of strSeq in self. Takes polymorphisms into account""" arr = self.encode(strSeq) return self._kmp_find(arr[0], self)
[docs] def findAll(self, strSeq) : """Same as find but returns a list of all occurences""" arr = self.encode(strSeq) lst = [] lst = self._kmp_find(arr[0], self, lst) return lst
def __and__(self, arr) : self.__testBinary(arr) ret = BinarySequence(self.typecode, self.forma, self.charToBin) for i in range(len(arr)) : ret.append(self[i] & arr[i]) return ret def __xor__(self, arr) : self.__testBinary(arr) ret = BinarySequence(self.typecode, self.forma, self.charToBin) for i in range(len(arr)) : ret.append(self[i] ^ arr[i]) return ret def __eq__(self, seq) : self.__testBinary(seq) if len(seq) != len(self) : return False return all( self[i] == seq[i] for i in range(len(self)) ) def append(self, arr) : self.binSequence.append(arr) def extend(self, arr) : self.binSequence.extend(arr)
[docs] def decode(self, binSequence): """decodes a binary sequence to return a string""" try: binSeq = iter(binSequence[0]) except TypeError: binSeq = binSequence ret = '' for b in binSeq : ch = '' for c in self.charToBin : if b & self.forma[self.charToBin[c]] > 0 : ch += c +'/' if ch == '' : raise KeyError('Key %d unkowom, bad format' % b) ret += ch[:-1] return ret
def getChar(self, i): return self.decode([self.binSequence[i]]) def __len__(self): return len(self.binSequence) def __getitem__(self,i): return self.binSequence[i] def __setitem__(self, i, v): self.binSequence[i] = v
[docs]class AABinarySequence(BinarySequence) : """A binary sequence of amino acids""" def __init__(self, sequence): f = array.array('I', [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152]) c = {'A': 17, 'C': 14, 'E': 19, 'D': 15, 'G': 13, 'F': 16, 'I': 3, 'H': 9, 'K': 8, '*': 1, 'M': 20, 'L': 0, 'N': 4, 'Q': 11, 'P': 6, 'S': 7, 'R': 5, 'T': 2, 'W': 10, 'V': 18, 'Y': 12, 'U': 21} BinarySequence.__init__(self, sequence, f, c)
[docs]class NucBinarySequence(BinarySequence) : """A binary sequence of nucleotides""" def __init__(self, sequence): f = array.array('B', [1, 2, 4, 8]) c = {'A': 0, 'T': 1, 'C': 2, 'G': 3} ce = { 'R' : 'A/G', 'Y' : 'C/T', 'M': 'A/C', 'K' : 'T/G', 'W' : 'A/T', 'S' : 'C/G', 'B': 'C/G/T', 'D' : 'A/G/T', 'H' : 'A/C/T', 'V' : 'A/C/G', 'N': 'A/C/G/T' } lstSeq = list(sequence) for i in range(len(lstSeq)) : if lstSeq[i] in ce : lstSeq[i] = ce[lstSeq[i]] lstSeq = ''.join(lstSeq) BinarySequence.__init__(self, lstSeq, f, c)
if __name__=="__main__": def test0() : #seq = 'ACED/E/GFIHK/MLMQPS/RTWVY' seq = 'ACED/E/GFIHK/MLMQPS/RTWVY/A/R' bSeq = AABinarySequence(seq) start = 0 stop = 4 rB = bSeq.getSequenceVariants_bck(start, stop) r = bSeq.getSequenceVariants(start, stop) #print start, stop, 'nb_comb_r', len(r[1]), set(rB[1])==set(r[1]) print(start, stop)#, 'nb_comb_r', len(r[1]), set(rB[1])==set(r[1]) #if set(rB[1])!=set(r[1]) : print('-AV-') print(start, stop, 'nb_comb_r', len(rB[1])) print('\n'.join(rB[1])) print('=AP========') print(start, stop, 'nb_comb_r', len(r[1])) print('\n'.join(r[1])) def testVariants() : seq = 'ATGAGTTTGCCGCGCN' bSeq = NucBinarySequence(seq) print(bSeq.getSequenceVariants()) testVariants() from random import randint alphabeta = ['A', 'C', 'G', 'T'] seq = '' for _ in range(8192): seq += alphabeta[randint(0, 3)] seq += 'ATGAGTTTGCCGCGCN' bSeq = NucBinarySequence(seq) ROUND = 512 PATTERN = 'GCGC' def testFind(): for i in range(ROUND): bSeq.find(PATTERN) def testFindByBiSearch(): for i in range(ROUND): bSeq.findByBiSearch(PATTERN) def testFindAll(): for i in range(ROUND): bSeq.findAll(PATTERN) def testFindAllByBiSearch(): for i in range(ROUND): bSeq.findAllByBiSearch(PATTERN) import cProfile print('find:\n') cProfile.run('testFind()') print('findAll:\n') cProfile.run('testFindAll()') print('findByBiSearch:\n') cProfile.run('testFindByBiSearch()') print('findAllByBiSearch:\n') cProfile.run('testFindAllByBiSearch()')