#!/usr/bin/env python # coding: utf-8 # In[1]: # Not a great way to build a suffix array, but we'll use it # for the small examples here def naiveBuildSA(t): satups = sorted([(t[i:], i) for i in range(len(t))]) return list(map(lambda x: x[1], satups)) # In[2]: naiveBuildSA('abaaba$') # works on a simple example # In[3]: def binarySearchSA(t, sa, p): assert t[-1] == '$' # t already has terminator assert len(t) == len(sa) # sa is the suffix array for t if len(t) == 1: return 1 l, r = 0, len(sa) # invariant: sa[l] < p < sa[r] while True: c = (l + r) // 2 # determine whether p < T[sa[c]:] by doing comparisons # starting from left-hand sides of p and T[sa[c]:] plt = True # assume p < T[sa[c]:] until proven otherwise i = 0 while i < len(p) and sa[c]+i < len(t): if p[i] < t[sa[c]+i]: break # p < T[sa[c]:] elif p[i] > t[sa[c]+i]: plt = False break # p > T[sa[c]:] i += 1 # tied so far if plt: if c == l + 1: return c r = c else: if c == r - 1: return r l = c # In[4]: t = 'abaaba$' sa = naiveBuildSA(t) binarySearchSA(t, sa, 'aba') # In[5]: binarySearchSA(t, sa, 'bb') # p is greater than all suffixes # In[6]: binarySearchSA(t, sa, 'aa') # In[7]: def suffixLcp(t, toff, p): i = 0 while i < len(p) and i + toff < len(t): if p[i] != t[i + toff]: return i i += 1 return i # In[8]: suffixLcp('abaaba$', 0, 'aba') # In[9]: suffixLcp('abaaba$', 0, 'abab') # In[10]: suffixLcp('abaaba$', 0, 'abaabaaba') # In[11]: def binarySearchSA_lcp1(t, sa, p): assert t[-1] == '$' # t already has terminator assert len(t) == len(sa) # sa is the suffix array for t if len(t) == 1: return 1 l, r = 0, len(sa) # invariant: sa[l] < p < sa[r] lcp_lp, lcp_rp = 0, 0 while True: c = (l + r) // 2 # determine whether p < T[sa[c]:] by doing comparisons # starting from left-hand sides of p and T[sa[c]:] plt = True # assume p < T[sa[c]:] until proven otherwise i = min(lcp_lp, lcp_rp) while i < len(p) and sa[c]+i < len(t): if p[i] < t[sa[c]+i]: break # p < T[sa[c]:] elif p[i] > t[sa[c]+i]: plt = False break # p > T[sa[c]:] i += 1 # tied so far if plt: if c == l + 1: return c r = c lcp_rp = i else: if c == r - 1: return r l = c lcp_lp = i # In[12]: binarySearchSA_lcp1(t, sa, 'aba') # In[13]: binarySearchSA_lcp1(t, sa, 'bb') # In[14]: binarySearchSA_lcp1(t, sa, 'aa')