#!/usr/bin/env python # coding: utf-8 # In[1]: import bisect import sys # In[2]: class Index(object): def __init__(self, t, k): ''' Create index from all substrings of size 'length' ''' self.k = k # k-mer length (k) self.index = {} for i in range(len(t) - k + 1): # for each k-mer kmer = t[i:i+k] if kmer not in self.index: self.index[kmer] = [i] else: self.index[kmer].append(i) # could also have used collections.defaultdict def query(self, p): ''' Return index hits for first k-mer of P ''' kmer = p[:self.k] # query with first k-mer return self.index.get(kmer, [])[:] # In[3]: def queryIndex(p, t, index): k = index.k offsets = [] for i in index.query(p): if p[k:] == t[i+k:i+len(p)]: # verify that rest of P matches offsets.append(i) return offsets # In[4]: t = 'ACTTGGAGATCTTTGAGGCTAGGTATTCGGGATCGAAGCTCATTTCGGGGATCGATTACGATATGGTGGGTATTCGGGA' p = 'GGTATTCGGGA' # In[5]: index = Index(t, 4) print(queryIndex(p, t, index)) # In[6]: index = Index(t, 4) print(queryIndex('TTTT', t, index)) # In[7]: index = Index(t, 2) print(queryIndex('AT', t, index)) # In[8]: t = 'There would have been a time for such a word' p = 'word' # In[9]: index = Index(t, 4) print(queryIndex('word', t, index))