import ( "bytes" "sort" ) func zArray(s string) []int { // Use Z-algorithm to preprocess given string. See // Gusfield for complete description of algorithm. Z := make([]int, len(s)+1) Z[0] = len(s) // Initial comparison of s[1:] with prefix for i := 1; i < len(s); i++ { if s[i] == s[i-1] { Z[1] += 1 } else { break } } r, l := 0, 0 if Z[1] > 0 { r, l = Z[1], 1 } for k := 2; k < len(s); k++ { if k > r { // Case 1 for i := k; i < len(s); i++ { if s[i] == s[i-k] { Z[k] += 1 } else { break } } r, l = k + Z[k] - 1, k } else { // Case 2 // Calculate length of beta nbeta := r - k + 1 Zkp := Z[k - l] if nbeta > Zkp { // Case 2a: Zkp wins Z[k] = Zkp } else { // Case 2b: Compare characters just past r nmatch := 0 for i := r+1; i < len(s); i++ { if s[i] == s[i - k] { nmatch += 1 } else { break } } l, r = k, r + nmatch Z[k] = r - k + 1 } } } return Z } // Helper function that returns a new string // that is the reverse of the argument func reverseString(s string) string { var buf bytes.Buffer for i := 0; i < len(s); i++ { buf.WriteByte(s[len(s)-i-1]) } return buf.String() } // Helper function that returns a new int slice // that is the reverse of the argument func reverseInts(is []int) []int { buf := make([]int, len(is)) for i := 0; i < len(is); i++ { buf[len(is)-i-1] = is[i] } return buf } // Make N array (Gusfield theorem 2.2.2) from Z array func nArray(s string) []int { return reverseInts(zArray(reverseString(s))) } // Make L' array (Gusfield theorem 2.2.2) using p and N array. // L'[i] = largest index j less than n such that N[j] = |P[i:]| func bigLPrimeArray(length int, n []int) []int { lp := make([]int, length) for j := 0; j < length-1; j++ { i := length - n[j] if i < length { lp[i] = j + 1 } } return lp } // Compile L array (Gusfield theorem 2.2.2) using p and L' array. // L[i] = largest index j less than n such that N[j] >= |P[i:]| func bigLArray(length int, lp []int) []int { l := make([]int, length) l[1] = lp[1] for i := 2; i < length; i++ { l[i] = l[i-1] if lp[i] > l[i] { l[i] = lp[i] } } return l } // Compile lp' array (Gusfield theorem 2.2.4) using N array func smallLPrimeArray(n []int) []int { smallLps := make([]int, len(n)) for i := 0; i < len(n); i++ { if n[i] == i+1 { // prefix matching a suffix smallLps[len(n)-i-1] = i+1 } } for i := len(n)-2; i > -1; i-- { // "smear" them out to the left if smallLps[i] == 0 { smallLps[i] = smallLps[i+1] } } return smallLps } // Return tables needed to apply good suffix rule func goodSuffixTable(p string) ([]int, []int) { n := nArray(p) lp := bigLPrimeArray(len(p), n) return bigLArray(len(p), lp), smallLPrimeArray(n) } // Given pattern string and list with ordered alphabet characters, create // and return a dense bad character table. Table is indexed by offset // then by character. func denseBadCharTab(p string, amap map[byte]int) [][]int { tab := make([][]int, len(p)) nxt := make([]int, len(amap)) for i, c := range []byte(p) { tab[i] = nxt[:] nxt[amap[c]] = i+1 } return tab } // Boyer-Moore preprocessing object type BoyerMoore struct { pattern string amap map[byte]int badChars [][]int bigLS, smallLPrimes []int } // Constructor for Boyer-Moore preprocessing object func NewBoyerMoore(p string, alphabet string) *BoyerMoore { m := new(BoyerMoore) m.pattern = p m.amap = make(map[byte]int) for i, c := range []byte(alphabet) { m.amap[c] = i } m.badChars = denseBadCharTab(p, m.amap) m.bigLS, m.smallLPrimes = goodSuffixTable(p) return m } // Return # skips given by bad character rule at offset i func (bm *BoyerMoore) badCharacterRule(i int, c byte) int { ci := bm.amap[c] return i - (bm.badChars[i][ci]-1) } // Given a mismatch at offset i, return amount to shift // as determined by (weak) good suffix rule. func (bm *BoyerMoore) goodSuffixRule(i int) int { length := len(bm.bigLS) if i == length - 1 { return 0 } i += 1 // i points to leftmost matching position of P if bm.bigLS[i] > 0 { return length - bm.bigLS[i] } return length - bm.smallLPrimes[i] } // Return amount to shift in case where P matches T func (bm *BoyerMoore) matchSkip() int { return len(bm.smallLPrimes) - bm.smallLPrimes[1] } // Utility function for taking max of integers func Max(x, y int) int { if x > y { return x } return y } // Do Boyer-Moore matching func (bm *BoyerMoore) match(t string) []int { i := 0 occurrences := make([]int, 0) p := bm.pattern for i < len(t) - len(p) + 1 { shift, skipGs := 1, 1 mismatched := false for j := len(p) - 1; j >= 0; j-- { if p[j] != t[i+j] { skipBc := bm.badCharacterRule(j, t[i+j]) skipGs = bm.goodSuffixRule(j) shift = Max(shift, Max(skipBc, skipGs)) mismatched = true break } } if ! mismatched { occurrences = append(occurrences, i) skipGs := Max(shift, skipGs) shift = Max(shift, skipGs) } i += shift } return occurrences } // Return a string splice containing non-overlapping, // non-empty substrings that cover p. They should be // as close to equal-length as possible. func partition(p string, pieces int) []string { base, mod := len(p) / pieces, len(p) % pieces idx := 0 ps := make([]string, 0) modAdjust := 1 for i := 0; i < pieces; i++ { if i >= mod { modAdjust = 0 } newIdx := idx + base + modAdjust ps = append(ps, p[idx:newIdx]) idx = newIdx } return ps } // Use the pigeonhole principle together with Boyer-Moore to find // approximate matches with up to a specified number of mismatches. func bmApproximate(p string, t string, k int, alph string) []int { ps := partition(p, k+1) // split p into k+1 non-empty, non-overlapping substrings off := 0 // offset into p of current partition occurrences := make(map[int]int) // we might see the same occurrence >1 time for _, pi := range ps { // for each partition bm := NewBoyerMoore(pi, alph) // BM preprocess the partition for _, hit := range bm.match(t) { if hit - off < 0 { continue // pattern falls off left end of T? } if hit + len(p) - off > len(t) { continue // falls off right end? } // Count mismatches to left and right of the matching partition nmm := 0 for i := 0; i < off; i++ { if t[hit-off+i] != p[i] { nmm++ if nmm > k { break // exceeded maximum # mismatches } } } if nmm <= k { for i := off+len(pi); i < len(p); i++ { if t[hit-off+i] != p[i] { nmm++ if nmm > k { break // exceeded maximum # mismatches } } } } if nmm <= k { occurrences[hit-off]++ // approximate match } } off += len(pi) // Update offset of current partition } var occurrence_list []int for k := range occurrences { occurrence_list = append(occurrence_list, k) } sort.Ints(occurrence_list) return occurrence_list } bmApproximate("needle", "needle noodle nargle", 2, "abcdefghijklmnopqrstuvwxyz ")