%pylab inline

# We'll be using some pretty-printing later on
from pprint import pprint

from IPython.core.display import Image 
Image(url='http://labrosa.ee.columbia.edu/crucialpython/logo.png', width=600) 

# Let's make a function to generate some example data
def make_point_data(n, m):
    a = np.arange(-n/2.0, 1 + n / 2.0)
    b = np.random.randn(m)

    # For convenience, let's sort b
    b = np.sort(b)

    return a, b

# Build a list and iterate over all pairs
def find_similar_points(a, b, r):
    '''Given two lists of numbers a and b, and a threshold r,
       returns a matrix `hits` such that:
       
       |a[ hits[i, 0] ] - b[ hits[j, 1] ]| <= r
       
    '''
    
    hits = []
    for i, ai in enumerate(a):
        for j, bj in enumerate(b):
            if np.abs(ai - bj) <= r:
                hits.append( (i, j) )
                
    # Return indices in ndarray format
    return np.asarray(hits)

# Generate and display the example data
a, b = make_point_data(5, 10)
# And a threshold
r = 0.25

print 'a: ', a
print 'b: ', b
print 'r: ', r

# What'd we get?
print 'Hits: '
print find_similar_points(a, b, r)

def make_interval_data(n):
    # Generate random start times
    S = np.random.randn(n)
    
    # Sort it, for convenience
    S = np.sort(S)

    # Add a random positive length to each one
    T = S + np.abs(np.random.randn(n))
    
    return S, T

def find_interval_overlap(S, T):
    '''S and T are vectors that encode intervals (S[i], T[i])
    
    Returns an n-by-n matrix C such that C[i, j] is the length of the overlap between intervals i and j.
    '''
    
    # Get the shape
    n = S.shape[0]
    
    # Build an output matrix.
    C = np.zeros( (n, n) )
    
    for i in range(n):
        for j in range(n):
            C[i, j] = max(0, min(T[i], T[j]) - max(S[i], S[j]))
            
    return C

# Let's make up some test data
S, T = make_interval_data(5)

# What does it look like?
pprint( zip(S, T) )

print find_interval_overlap(S, T)

# Here's how to write our similar_points function using the subtraction ufunc
def fast_similar_points(a, b, r):
    
    # subtract.outer builds a matrix of a[i] - b[j]
    # abs computes the absolute value
    difference = np.abs(np.subtract.outer(a, b))
    
    # <= r does a pointwise comparison against the threshold
    # argwhere finds all indices (i, j) such that hits[i, j] == True
    return np.argwhere(difference <= r)

# Make some more test data, 5 points in a and 10 in b
a, b = make_point_data(5, 10)

print a
print b
print r

# For comparison purposes, our previous implementation
print find_similar_points(a, b, r)

# And the new one
print fast_similar_points(a, b, r)

# Timing benchmark
%timeit find_similar_points(a, b, r)

%timeit fast_similar_points(a, b, r)

# How does it scale? Let's increase the size of our data
a, b = make_point_data(500, 1000)

%timeit find_similar_points(a, b, r)

%timeit fast_similar_points(a, b, r)

# Similarly, our interval-overlap calculator can be simplified by using outers to compute the intermediate steps
def fast_interval_overlap(S, T):
    
    # For each pair of intervals, find the later start: max(S[i], S[j])
    max_start = np.maximum.outer(S, S)
    
    # And the earlier end: min(T[i], T[j])
    min_end   = np.minimum.outer(T, T)
    
    # Subtract min_end[i, j] - max_start[i, j]
    # If the result is negative, then the intersection is 0
    return np.maximum(0, min_end - max_start)

S, T = make_interval_data(5)
pprint( zip(S, T) )

print fast_interval_overlap(S, T)

%timeit find_interval_overlap(S, T)

%timeit fast_interval_overlap(S, T)

# Scaling up?
S, T = make_interval_data(1000)

%timeit find_interval_overlap(S, T)

%timeit fast_interval_overlap(S, T)