In [1]:
import bisect
import sys
In [2]:
class Index(object):
    def __init__(self, t, k):
        ''' Create index from all substrings of size 'length' '''
        self.k = k  # k-mer length (k)
        self.index = []
        for i in range(len(t) - k + 1):  # for each k-mer
            self.index.append((t[i:i+k], i))  # add (k-mer, offset) pair
        self.index.sort()  # alphabetize by k-mer
    
    def query(self, p):
        ''' Return index hits for first k-mer of P '''
        kmer = p[:self.k]  # query with first k-mer
        i = bisect.bisect_left(self.index, (kmer, -1))  # binary search
        hits = []
        while i < len(self.index):  # collect matching index entries
            if self.index[i][0] != kmer:
                break
            hits.append(self.index[i][1])
            i += 1
        return hits[:]
In [3]:
def queryIndex(p, t, index):
    k = index.k
    offsets = []
    for i in index.query(p):
        if p[k:] == t[i+k:i+len(p)]:  # verify that rest of P matches
            offsets.append(i)
    return offsets
In [4]:
t = 'ACTTGGAGATCTTTGAGGCTAGGTATTCGGGATCGAAGCTCATTTCGGGGATCGATTACGATATGGTGGGTATTCGGGA'
p = 'GGTATTCGGGA'
In [5]:
index = Index(t, 4)
print(queryIndex(p, t, index))
[21, 68]
In [6]:
index = Index(t, 4)
print(queryIndex('TTTT', t, index))
[]
In [7]:
index = Index(t, 2)
print(queryIndex('AT', t, index))
[8, 24, 31, 41, 50, 54, 60, 62, 71]
In [8]:
t = 'There would have been a time for such a word'
p = 'word'
In [9]:
index = Index(t, 4)
print(queryIndex('word', t, index))
[40]