%pylab inline # We'll be using some pretty-printing later on from pprint import pprint from IPython.core.display import Image Image(url='http://labrosa.ee.columbia.edu/crucialpython/logo.png', width=600) # Let's make a function to generate some example data def make_point_data(n, m): a = np.arange(-n/2.0, 1 + n / 2.0) b = np.random.randn(m) # For convenience, let's sort b b = np.sort(b) return a, b # Build a list and iterate over all pairs def find_similar_points(a, b, r): '''Given two lists of numbers a and b, and a threshold r, returns a matrix `hits` such that: |a[ hits[i, 0] ] - b[ hits[j, 1] ]| <= r ''' hits = [] for i, ai in enumerate(a): for j, bj in enumerate(b): if np.abs(ai - bj) <= r: hits.append( (i, j) ) # Return indices in ndarray format return np.asarray(hits) # Generate and display the example data a, b = make_point_data(5, 10) # And a threshold r = 0.25 print 'a: ', a print 'b: ', b print 'r: ', r # What'd we get? print 'Hits: ' print find_similar_points(a, b, r) def make_interval_data(n): # Generate random start times S = np.random.randn(n) # Sort it, for convenience S = np.sort(S) # Add a random positive length to each one T = S + np.abs(np.random.randn(n)) return S, T def find_interval_overlap(S, T): '''S and T are vectors that encode intervals (S[i], T[i]) Returns an n-by-n matrix C such that C[i, j] is the length of the overlap between intervals i and j. ''' # Get the shape n = S.shape[0] # Build an output matrix. C = np.zeros( (n, n) ) for i in range(n): for j in range(n): C[i, j] = max(0, min(T[i], T[j]) - max(S[i], S[j])) return C # Let's make up some test data S, T = make_interval_data(5) # What does it look like? pprint( zip(S, T) ) print find_interval_overlap(S, T) # Here's how to write our similar_points function using the subtraction ufunc def fast_similar_points(a, b, r): # subtract.outer builds a matrix of a[i] - b[j] # abs computes the absolute value difference = np.abs(np.subtract.outer(a, b)) # <= r does a pointwise comparison against the threshold # argwhere finds all indices (i, j) such that hits[i, j] == True return np.argwhere(difference <= r) # Make some more test data, 5 points in a and 10 in b a, b = make_point_data(5, 10) print a print b print r # For comparison purposes, our previous implementation print find_similar_points(a, b, r) # And the new one print fast_similar_points(a, b, r) # Timing benchmark %timeit find_similar_points(a, b, r) %timeit fast_similar_points(a, b, r) # How does it scale? Let's increase the size of our data a, b = make_point_data(500, 1000) %timeit find_similar_points(a, b, r) %timeit fast_similar_points(a, b, r) # Similarly, our interval-overlap calculator can be simplified by using outers to compute the intermediate steps def fast_interval_overlap(S, T): # For each pair of intervals, find the later start: max(S[i], S[j]) max_start = np.maximum.outer(S, S) # And the earlier end: min(T[i], T[j]) min_end = np.minimum.outer(T, T) # Subtract min_end[i, j] - max_start[i, j] # If the result is negative, then the intersection is 0 return np.maximum(0, min_end - max_start) S, T = make_interval_data(5) pprint( zip(S, T) ) print fast_interval_overlap(S, T) %timeit find_interval_overlap(S, T) %timeit fast_interval_overlap(S, T) # Scaling up? S, T = make_interval_data(1000) %timeit find_interval_overlap(S, T) %timeit fast_interval_overlap(S, T)