import bisect
import sys
class Index(object):
    """Sorted k-mer index over a text, used to seed exact-match searches."""

    def __init__(self, t, k):
        """Build the index from every substring of ``t`` of length ``k``.

        Args:
            t: Text to index.
            k: k-mer length.
        """
        self.k = k  # k-mer length
        # (k-mer, offset) pairs ordered by k-mer, then offset, so all entries
        # for one k-mer are contiguous and can be located by binary search.
        self.index = sorted((t[i:i + k], i) for i in range(len(t) - k + 1))

    def query(self, p):
        """Return the text offsets at which the first k-mer of ``p`` occurs.

        Only the first ``k`` characters of ``p`` are compared; the caller is
        responsible for verifying the remainder of the pattern.  When ``p``
        is shorter than ``k`` its prefix cannot equal any stored k-mer, so
        the result is empty.

        Returns:
            A new list of offsets in ascending order.
        """
        kmer = p[:self.k]
        # Offsets are never negative, so (kmer, -1) sorts just before the
        # first real entry for this k-mer.
        i = bisect.bisect_left(self.index, (kmer, -1))
        hits = []
        while i < len(self.index) and self.index[i][0] == kmer:
            hits.append(self.index[i][1])
            i += 1
        return hits
def queryIndex(p, t, index):
    """Return every offset in ``t`` at which pattern ``p`` occurs exactly.

    The first ``index.k`` characters of ``p`` are looked up in ``index`` to
    obtain candidate offsets; each candidate is then verified by comparing
    the rest of ``p`` against ``t``.

    Args:
        p: Pattern to search for.
        t: Text that ``index`` was built from.
        index: Object exposing ``k`` (k-mer length) and ``query(p)``
            (candidate offsets for the first k-mer of ``p``).

    Returns:
        List of matching offsets, in the order the candidates were produced.
    """
    k = index.k
    if 0 < len(p) < k:
        # The pattern has no complete k-mer, so the index cannot seed the
        # search (its lookup would find nothing); scan the text directly.
        return [i for i in range(len(t) - len(p) + 1)
                if t[i:i + len(p)] == p]
    rest = p[k:]  # part of the pattern the index lookup did not check
    return [i for i in index.query(p) if t[i + k:i + len(p)] == rest]
# Demo: exact matching of patterns against a text using a k-mer index.
t = 'ACTTGGAGATCTTTGAGGCTAGGTATTCGGGATCGAAGCTCATTTCGGGGATCGATTACGATATGGTGGGTATTCGGGA'
p = 'GGTATTCGGGA'
index = Index(t, 4)
print(queryIndex(p, t, index))  # recorded output: [21, 68]

# Same text and k as above, so the existing index is reused.
print(queryIndex('TTTT', t, index))  # recorded output: []

index = Index(t, 2)
print(queryIndex('AT', t, index))  # recorded output: [8, 24, 31, 41, 50, 54, 60, 62, 71]

# The same calls applied to an English sentence instead of a DNA string.
t = 'There would have been a time for such a word'
p = 'word'
index = Index(t, 4)
print(queryIndex(p, t, index))  # recorded output: [40]