Notebook

In [1]:

import (
    "bytes"
    "fmt"
)

In [2]:

func zArray(s string) []int {
    // Use Z-algorithm to preprocess given string.  See
    // Gusfield for complete description of algorithm.
    Z := make([]int, len(s)+1)
    Z[0] = len(s)

    // Initial comparison of s[1:] with prefix
    for i := 1; i < len(s); i++ {
        if s[i] == s[i-1] {
            Z[1] += 1
        } else {
            break
        }
    }

    r, l := 0, 0
    if Z[1] > 0 {
        r, l = Z[1], 1
    }

    for k := 2; k < len(s); k++ {
        if k > r {
            // Case 1
            for i := k; i < len(s); i++ {
                if s[i] == s[i-k] {
                    Z[k] += 1
                } else {
                    break
                }
            }
            r, l = k + Z[k] - 1, k
        } else {
            // Case 2
            // Calculate length of beta
            nbeta := r - k + 1
            Zkp := Z[k - l]
            if nbeta > Zkp {
                // Case 2a: Zkp wins
                Z[k] = Zkp
            } else {
                // Case 2b: Compare characters just past r
                nmatch := 0
                for i := r+1; i < len(s); i++ {
                    if s[i] == s[i - k] {
                        nmatch += 1
                    } else {
                        break
                    }
                }
                l, r = k, r + nmatch
                Z[k] = r - k + 1
            }
        }
    }
    return Z
}

In [3]:

// Helper function that returns a new string
// that is the reverse of the argument
func reverseString(s string) string {
    var buf bytes.Buffer
    for i := 0; i < len(s); i++ {
        buf.WriteByte(s[len(s)-i-1])
    }
    return buf.String()
}

// Helper function that returns a new int slice
// that is the reverse of the argument
func reverseInts(is []int) []int {
    buf := make([]int, len(is))
    for i := 0; i < len(is); i++ {
        buf[len(is)-i-1] = is[i]
    }
    return buf
}

In [4]:

// Make N array (Gusfield theorem 2.2.2) from Z array
func nArray(s string) []int {
    return reverseInts(zArray(reverseString(s)))
}

In [5]:

// Make L' array (Gusfield theorem 2.2.2) using p and N array.
// L'[i] = largest index j less than n such that N[j] = |P[i:]|
func bigLPrimeArray(length int, n []int) []int {
    lp := make([]int, length)
    for j := 0; j < length-1; j++ {
        i := length - n[j]
        if i < length {
            lp[i] = j + 1
        }
    }
    return lp
}

In [6]:

// Compile L array (Gusfield theorem 2.2.2) using p and L' array.
// L[i] = largest index j less than n such that N[j] >= |P[i:]|
func bigLArray(length int, lp []int) []int {
    l := make([]int, length)
    l[1] = lp[1]
    for i := 2; i < length; i++ {
        l[i] = l[i-1]
        if lp[i] > l[i] {
            l[i] = lp[i]
        }
    }
    return l
}

In [7]:

// Compile lp' array (Gusfield theorem 2.2.4) using N array
func smallLPrimeArray(n []int) []int {
    smallLps := make([]int, len(n))
    for i := 0; i < len(n); i++ {
        if n[i] == i+1 {  // prefix matching a suffix
            smallLps[len(n)-i-1] = i+1
        }
    }
    for i := len(n)-2; i > -1; i-- {  // "smear" them out to the left
        if smallLps[i] == 0 {
            smallLps[i] = smallLps[i+1]
        }
    }
    return smallLps
}

In [8]:

// Return tables needed to apply good suffix rule
func goodSuffixTable(p string) ([]int, []int) {
    n := nArray(p)
    lp := bigLPrimeArray(len(p), n)
    return bigLArray(len(p), lp), smallLPrimeArray(n)
}

In [9]:

// Given pattern string and list with ordered alphabet characters, create
// and return a dense bad character table.  Table is indexed by offset
// then by character.
func denseBadCharTab(p string, amap map[byte]int) [][]int {
    tab := make([][]int, len(p))
    nxt := make([]int, len(amap))
    for i, c := range []byte(p) {
        tab[i] = nxt[:]
        nxt[amap[c]] = i+1
    }
    return tab
}

In [10]:

// Boyer-Moore preprocessing object
type BoyerMoore struct {
    amap                map[byte]int
    badChars            [][]int
    bigLS, smallLPrimes []int
}

// Constructor for Boyer-Moore preprocessing object
func NewBoyerMoore(p string, alphabet string) *BoyerMoore {
    m := new(BoyerMoore)
    m.amap = make(map[byte]int)
    for i, c := range []byte(alphabet) {
        m.amap[c] = i
    }
    m.badChars = denseBadCharTab(p, m.amap)
    m.bigLS, m.smallLPrimes = goodSuffixTable(p)
    return m
}

// Return # skips given by bad character rule at offset i
func (bm *BoyerMoore) badCharacterRule(i int, c byte) int {
    ci := bm.amap[c]
    return i - (bm.badChars[i][ci]-1)
}

// Given a mismatch at offset i, return amount to shift
// as determined by (weak) good suffix rule.
func (bm *BoyerMoore) goodSuffixRule(i int) int {
    length := len(bm.bigLS)
    if i == length - 1 {
        return 0
    }
    i += 1   // i points to leftmost matching position of P
    if bm.bigLS[i] > 0 {
        return length - bm.bigLS[i]
    }
    return length - bm.smallLPrimes[i]
}

// Return amount to shift in case where P matches T
func (bm *BoyerMoore) matchSkip() int {
    return len(bm.smallLPrimes) - bm.smallLPrimes[1]
}

In [11]:

// Utility function for taking max of integers
func Max(x, y int) int {
    if x > y {
        return x
    }
    return y
}

In [12]:

// Do Boyer-Moore matching
func boyerMoore(p string, bm *BoyerMoore, t string) []int {
    i := 0
    occurrences := make([]int, 0)
    for i < len(t)-len(p)+1 {
        shift, skipGs := 1, 1
        mismatched := false
        for j := len(p) - 1; j >= 0; j-- {
            if p[j] != t[i+j] {
                skipBc := bm.badCharacterRule(j, t[i+j])
                skipGs = bm.goodSuffixRule(j)
                shift = Max(shift, Max(skipBc, skipGs))
                mismatched = true
                break
            }
        }
        if ! mismatched {
            occurrences = append(occurrences, i)
            skipGs := Max(shift, skipGs)
            shift = Max(shift, skipGs)
        }
        i += shift
    }
    return occurrences
}

In [13]:

t := "haystack needle haystack"  // "text" - thing we search in
p := "needle"                    // "pattern" thing we search for

In [14]:

bm := NewBoyerMoore(p, "abcdefghijklmnopqrstuvwxyz")
boyerMoore(p, bm, t)

Out[14]:

[9]

In [15]:

t[9:9+len(p)]

Out[15]:

needle

In [16]:

boyerMoore(p, bm, "needleneedleneedle")

Out[16]:

[0 6 12]