import matplotlib.pyplot as plt from collections import deque import heapq import time ## Limit the number of reads to load, -1 for unlimited MAXREADS = -1 ## Limit the number of reads to plot MAX_READS_LAYOUT = 500 ## Path to reads file READFILE = "/Users/mschatz/build/planesweep/readid.start.stop.txt" print "Loading reads from " + READFILE f = open(READFILE) ## This will have a list of tuples containing (readid, start, end, rc) ## rc is 0 if the read was originally oriented forward, 1 for reverse reads = [] totallen = 0 readidx = 0 for line in f: line = line.rstrip() fields = line.split() readid = fields[0] start = int(fields[1]) end = int(fields[2]) rc = 0 if (start > end): start = int(fields[2]) end = int(fields[1]) rc = 1 readinfo = (readid, start, end, rc) reads.append(readinfo) if (end > totallen): totallen = end readidx += 1 if (readidx == MAXREADS): break print "Loaded layout information for %d reads" % (len(reads)) for i in xrange(3): print " %d: %s [%d - %d] %d" % (i, reads[i][0], reads[i][1], reads[i][2], reads[i][3]) print " ..." for i in xrange(len(reads)-3, len(reads)): print " %d: %s [%d - %d] %d" % (i, reads[i][0], reads[i][1], reads[i][2], reads[i][3]) print "\n\n" ## Note to keep it tractable, we only plot the first MAX_READ_LAYOUT reads plt.figure(figsize=(15,4), dpi=100) print "Plotting layout" ## draw the layout of reads for i in xrange(min(MAX_READS_LAYOUT, len(reads))): r = reads[i] readid = r[0] start = r[1] end = r[2] rc = r[3] color = "blue" if (rc == 1): color = "red" plt.plot ([start,end], [-2*i, -2*i], lw=4, color=color) plt.draw() plt.show() print "Brute force computing coverage over %d bp" % (totallen) starttime = time.time() brutecov = [0] * totallen for r in reads: # print " -- [%d, %d]" % (r[1], r[2]) for i in xrange(r[1], r[2]): brutecov[i] += 1 brutetime = (time.time() - starttime) * 1000.0 print " Brute force complete in %0.02f ms" % (brutetime) print brutecov[0:10] maxcov = 0 lowcov = 0 LOW_COV_THRESHOLD = 10 for c in brutecov: if c > maxcov: maxcov = c if c <= LOW_COV_THRESHOLD: lowcov += 1 print "max cov: %d, there are %d low coverage bases (<= %d depth)" % (maxcov, lowcov, LOW_COV_THRESHOLD) print "\n\n" plt.figure() plt.hist(brutecov) plt.show() print "Delta encoding coverage plot" starttime = time.time() deltacov = [] curcov = -1 for i in xrange(0, len(brutecov)): if brutecov[i] != curcov: curcov = brutecov[i] delta = (i, curcov) deltacov.append(delta) ## Finish up with the last position deltacov.append((totallen, 0)) print "Delta encoding required only %d steps, saving %0.02f%% of the space in %0.02f ms" % (len(deltacov), (100.0*float(totallen-len(deltacov))/totallen), (time.time()-starttime) * 1000.0) for i in xrange(3): print " %d: [%d,%d]" % (i, deltacov[i][0], deltacov[i][1]) print " ..." for i in xrange(len(deltacov)-3, len(deltacov)): print " %d: [%d,%d]" % (i, deltacov[i][0], deltacov[i][1]) print "\n\n" plt.figure(figsize=(15,4), dpi=100) print "Plotting coverage profile" ## expand the coverage profile by this amount so that it is easier to see YSCALE = 5 ## draw the layout of reads for i in xrange(min(MAX_READS_LAYOUT, len(reads))): r = reads[i] readid = r[0] start = r[1] end = r[2] rc = r[3] color = "blue" if (rc == 1): color = "red" plt.plot ([start,end], [-2*i, -2*i], lw=4, color=color) ## draw the base of the coverage plot plt.plot([0, totallen], [0,0], color="black") ## draw the coverage plot for i in xrange(len(deltacov)-1): x1 = deltacov[i][0] x2 = deltacov[i+1][0] y1 = YSCALE*deltacov[i][1] y2 = YSCALE*deltacov[i+1][1] ## draw the horizonal line plt.plot([x1, x2], [y1, y1], color="black") ## and now the right vertical to the new coverage level plt.plot([x2, x2], [y1, y2], color="black") plt.draw() plt.show() print "Beginning list-based plane sweep over %d reads" % (len(reads)) starttime = time.time() ## record the delta encoded depth using a plane sweep deltacovplane = [] ## use a list to record the end positions of the elements currently in plane planelist = [] ## BEGIN SWEEP for r in reads: startpos = r[1] endpos = r[2] ## clear out any positions from the plane that we have already moved past while (len(planelist) > 0): if (planelist[0] <= startpos): ## the coverage steps down, extract it from the front of the list oldend = planelist.pop(0) deltacovplane.append((oldend, len(planelist))) else: break ## Now insert the current endpos into the correct position into the list insertpos = -1 for i in xrange(len(planelist)): if (endpos < planelist[i]): insertpos = i break if (insertpos > 0): planelist.insert(insertpos, endpos) else: planelist.append(endpos) ## Finally record that the coverage has increased deltacovplane.append((startpos, len(planelist))) ## Flush any remaining end positions while (len(planelist) > 0): oldend = planelist.pop(0) deltacovplane.append((oldend, len(planelist))) ## Report statistics planelisttime = (time.time() - starttime) * 1000.0 print "Plane sweep found %d steps, saving %0.02f%% of the space in %0.2f ms (%0.02f speedup)!" % (len(deltacovplane), (100.0*float(totallen-len(deltacovplane))/totallen), planelisttime, brutetime/planelisttime) for i in xrange(3): print " %d: [%d,%d]" % (i, deltacovplane[i][0], deltacovplane[i][1]) print " ..." for i in xrange(len(deltacovplane)-3, len(deltacovplane)): print " %d: [%d,%d]" % (i, deltacovplane[i][0], deltacovplane[i][1]) print "\n\n" print "Checking for duplicates:" lastpos = -1 for i in xrange(len(deltacovplane)): if deltacovplane[i][0] == lastpos: print " Found duplicate: %d: [%d,%d] == %d: [%d,%d]" % (i-1, deltacovplane[i-1][0], deltacovplane[i-1][1], i, deltacovplane[i][0], deltacovplane[i][1]) lastpos = deltacovplane[i][0] print "\n\n" print "Beginning correct list-based plane sweep over %d reads" % (len(reads)) starttime = time.time() ## record the delta encoded depth using a plane sweep deltacovplane = [] ## use a list to record the end positions of the elements currently in plane planelist = [] ## BEGIN SWEEP (note change to index based so can peek ahead) for rr in xrange(len(reads)): r = reads[rr] startpos = r[1] endpos = r[2] ## clear out any positions from the plane that we have already moved past while (len(planelist) > 0): if (planelist[0] <= startpos): ## the coverage steps down, extract it from the front of the list oldend = planelist.pop(0) nextend = -1 if (len(planelist) > 0): nextend = planelist[0] ## only record this transition if it is not the same as a start pos ## and only if not the same as the next end point if ((oldend != startpos) and (oldend != nextend)): deltacovplane.append((oldend, len(planelist))) else: break ## Now insert the current endpos into the correct position into the list insertpos = -1 for i in xrange(len(planelist)): if (endpos < planelist[i]): insertpos = i break if (insertpos > 0): planelist.insert(insertpos, endpos) else: planelist.append(endpos) ## Finally record that the coverage has increased ## But make sure the current read does not start at the same position as the next if ((rr == len(reads)-1) or (startpos != reads[rr+1][1])): deltacovplane.append((startpos, len(planelist))) ## if it is at the same place, it will get reported in the next cycle ## Flush any remaining end positions while (len(planelist) > 0): oldend = planelist.pop(0) deltacovplane.append((oldend, len(planelist))) ## Report statistics planelisttime = (time.time() - starttime) * 1000.0 print "Plane sweep found %d steps, saving %0.02f%% of the space in %0.2f ms (%0.02f speedup)!" % (len(deltacovplane), (100.0*float(totallen-len(deltacovplane))/totallen), planelisttime, brutetime/planelisttime) for i in xrange(3): print " %d: [%d,%d]" % (i, deltacovplane[i][0], deltacovplane[i][1]) print " ..." for i in xrange(len(deltacovplane)-3, len(deltacovplane)): print " %d: [%d,%d]" % (i, deltacovplane[i][0], deltacovplane[i][1]) print "\n\n" print "Checking for duplicates in corrected version:" lastpos = -1 for i in xrange(len(deltacovplane)): if deltacovplane[i][0] == lastpos: print " Found duplicate: %d: [%d,%d] == %d: [%d,%d]" % (i-1, deltacovplane[i-1][0], deltacovplane[i-1][1], i, deltacovplane[i][0], deltacovplane[i][1]) lastpos = deltacovplane[i][0] print "\n\n" print "Beginning heap-based plane sweep over %d reads" % (len(reads)) starttime = time.time() ## record the delta encoded depth using a plane sweep deltacovplane = [] ## use a list to record the end positions of the elements currently in plane planeheap = [] ## BEGIN SWEEP (note change to index based so can peek ahead) for rr in xrange(len(reads)): r = reads[rr] startpos = r[1] endpos = r[2] ## clear out any positions from the plane that we have already moved past while (len(planeheap) > 0): if (planeheap[0] <= startpos): ## the coverage steps down, extract it from the front of the list ## oldend = planelist.pop(0) oldend = heapq.heappop(planeheap) nextend = -1 if (len(planeheap) > 0): nextend = planeheap[0] ## only record this transition if it is not the same as a start pos ## and only if not the same as the next end point if ((oldend != startpos) and (oldend != nextend)): deltacovplane.append((oldend, len(planeheap))) else: break ## Now insert the current endpos into the correct position into the list ##insertpos = -1 ##for i in xrange(len(planelist)): ## if (endpos < planelist[i]): ## insertpos = i ## break ##if (insertpos > 0): ## planelist.insert(insertpos, endpos) ##else: ## planelist.append(endpos) heapq.heappush(planeheap, endpos) ## Finally record that the coverage has increased ## But make sure the current read does not start at the same position as the next if ((rr == len(reads)-1) or (startpos != reads[rr+1][1])): deltacovplane.append((startpos, len(planeheap))) ## if it is at the same place, it will get reported in the next cycle ## Flush any remaining end positions while (len(planeheap) > 0): ##oldend = planelist.pop(0) oldend = heapq.heappop(planeheap) deltacovplane.append((oldend, len(planeheap))) ## Report statistics planeheaptime = (time.time()-starttime) * 1000.0 print "Heap-Plane sweep found %d steps, saving %0.02f%% of the space in %0.02f ms (%0.02f speedup)!" % (len(deltacovplane), (100.0*float(totallen-len(deltacovplane))/totallen), planeheaptime, brutetime/planeheaptime) for i in xrange(3): print " %d: [%d,%d]" % (i, deltacovplane[i][0], deltacovplane[i][1]) print " ..." for i in xrange(len(deltacovplane)-3, len(deltacovplane)): print " %d: [%d,%d]" % (i, deltacovplane[i][0], deltacovplane[i][1]) print "\n\n" print "Checking for duplicates in heap-ified version:" lastpos = -1 for i in xrange(len(deltacovplane)): if deltacovplane[i][0] == lastpos: print " Found duplicate: %d: [%d,%d] == %d: [%d,%d]" % (i-1, deltacovplane[i-1][0], deltacovplane[i-1][1], i, deltacovplane[i][0], deltacovplane[i][1]) lastpos = deltacovplane[i][0] print "\n\n" print "Beginning heap-based plane sweep over %d reads to search for max reads" % (len(reads)) starttime = time.time() ## record the delta encoded depth using a plane sweep deltacovplane = [] ## use a list to record the end positions of the elements currently in plane planeheap = [] ## use a list to record the ids at max coverage maxcov = -1 maxcovpos = -1 maxcovreads = [] ## BEGIN SWEEP (note change to index based so can peek ahead) for rr in xrange(len(reads)): r = reads[rr] readid = r[0] startpos = r[1] endpos = r[2] ## clear out any positions from the plane that we have already moved past while (len(planeheap) > 0): if (planeheap[0][0] <= startpos): ## the coverage steps down, extract it from the front of the list oldend = heapq.heappop(planeheap)[0] nextend = -1 if (len(planeheap) > 0): nextend = planeheap[0][0] ## only record this transition if it is not the same as a start pos ## and only if not the same as the next end point if ((oldend != startpos) and (oldend != nextend)): deltacovplane.append((oldend, len(planeheap))) else: break ## Now insert the current endpos into the correct position into the list heapq.heappush(planeheap, (endpos, readid)) ## Finally record that the coverage has increased ## But make sure the current read does not start at the same position as the next if ((rr == len(reads)-1) or (startpos != reads[rr+1][1])): cov = len(planeheap) if (cov > maxcov): maxcov = cov maxcovpos = startpos maxcovreads = [] for rr in planeheap: maxcovreads.append(rr[1]) deltacovplane.append((startpos, len(planeheap))) ## Flush any remaining end positions while (len(planeheap) > 0): oldend = heapq.heappop(planeheap)[0] deltacovplane.append((oldend, len(planeheap))) ## Report statistics planeheaptime = (time.time()-starttime) * 1000.0 print "Heap-Plane sweep found %d steps, saving %0.02f%% of the space in %0.02f ms (%0.02f speedup)!" % (len(deltacovplane), (100.0*float(totallen-len(deltacovplane))/totallen), planeheaptime, brutetime/planeheaptime) print "The %d reads at the position of maximum depth (%d) are:" % (maxcov, maxcovpos) print maxcovreads print "\n\n" for i in xrange(3): print " %d: [%d,%d]" % (i, deltacovplane[i][0], deltacovplane[i][1]) print " ..." for i in xrange(len(deltacovplane)-3, len(deltacovplane)): print " %d: [%d,%d]" % (i, deltacovplane[i][0], deltacovplane[i][1]) print "\n\n" print "Checking for duplicates in heap-ified version:" lastpos = -1 for i in xrange(len(deltacovplane)): if deltacovplane[i][0] == lastpos: print " Found duplicate: %d: [%d,%d] == %d: [%d,%d]" % (i-1, deltacovplane[i-1][0], deltacovplane[i-1][1], i, deltacovplane[i][0], deltacovplane[i][1]) lastpos = deltacovplane[i][0] print "\n\n" f = plt.figure(figsize=(15,4), dpi=100) print "Plotting max depth\n\n" lastend = -1 lastidx = -1 ## draw the layout of reads for i in xrange(min(MAX_READS_LAYOUT, len(reads))): r = reads[i] readid = r[0] start = r[1] end = r[2] rc = r[3] color = "blue" if (rc == 1): color = "red" plt.plot ([start,end], [-2*i, -2*i], lw=4, color=color) lastend = end lastidx = i ## draw the base of the coverage plot plt.plot([0, lastend], [0,0], color="black") ## draw the coverage profile for i in xrange(len(deltacov)-1): x1 = deltacov[i][0] x2 = deltacov[i+1][0] y1 = YSCALE * deltacov[i][1] y2 = YSCALE * deltacov[i+1][1] ## draw the horizonal line plt.plot([x1, x2], [y1, y1], color="black") ## and now the right vertical to the new coverage level plt.plot([x2, x2], [y1, y2], color="black") if (x2 >= lastend): break ## draw a vertical bar with the max coverage plt.plot([maxcovpos, maxcovpos], [YSCALE*maxcov, -(2*lastidx+20)], color="green", lw=2) plt.draw() plt.show()