Figure generation for "A probabilistic approach to k-mer counting"
==================================================================
# Imports. The original was an IPython --pylab notebook session, so
# figsize()/pwd were notebook magics and numpy/pyplot were implicit;
# they must be imported explicitly for this to run as a script.
import numpy
import numpy as np
import matplotlib.pyplot as plt

# Directory containing the benchmark .time / .hist / count output files.
datadir = '../data/'
def get_time_mem(filename):
    """Parse a GNU ``time`` output file; return (wall_seconds, max_mem).

    Scans for the resource summary line (the one containing 'system'), e.g.::

        100.00user 5.00system 1:02:03.50elapsed ... 0avgdata 123456maxresident)k

    Returns:
        tuple(float, float): elapsed wall-clock time in seconds, and the
        maximum-resident-memory figure that follows 'avgdata' (kilobytes,
        as reported by ``time``).

    Raises:
        Exception: if no summary line is found in the file.
    """
    with open(filename) as fp:  # original never closed the file handle
        for line in fp:
            line = line.rstrip()
            if 'system' not in line:
                continue
            # elapsed time is formatted as [[H:]M:]S.ss
            walltime = line.split('elapsed')[0].split()[-1].rsplit(':')
            assert len(walltime) <= 3
            hours = 0.
            minutes = 0.
            if len(walltime) == 3:
                hours = float(walltime[0])
                minutes = float(walltime[1])
            elif len(walltime) == 2:
                minutes = float(walltime[0])
            # BUG FIX: the original summed `float(walltime[1])` as the seconds
            # term, but when hours are present walltime[1] is the *minutes*
            # field.  walltime[-1] is always the seconds field.
            seconds = float(walltime[-1])
            wall_seconds = hours * 60 * 60 + minutes * 60 + seconds
            # max resident memory appears between 'avgdata ' and 'max'
            mem = line.split('avgdata ')[1].split('max')[0]
            return wall_seconds, float(mem)
    raise Exception(filename)
# read time files
# Tallymer runtime info: datasets 1..5, part=1, k=22.
mkindex_time = {}
mkindex_mem = {}
suffix_time = {}
suffix_mem = {}
part = 1
ksize = 22
for dataset in range(1, 6):
    key3 = '%d_%d_%d' % (dataset, part, ksize)
    path = datadir + 'mkindex_%d_part%d_%d.time' % (dataset, part, ksize)
    mkindex_time[key3], mkindex_mem[key3] = get_time_mem(path)
    key2 = '%d_%d' % (dataset, part)
    path = datadir + 'suffix_%d_part%d.time' % (dataset, part)
    suffix_time[key2], suffix_mem[key2] = get_time_mem(path)
# read Jellyfish timings, k=22 and k=31
# .time1 is the counting stage, .time2 the histogram stage.
jelly_count_mem = {}
jelly_count_time = {}
for dataset in range(1, 6):
    for ksize in (22, 31):
        key = '%d_%d' % (dataset, ksize)
        fname = datadir + 'jelly_%d_%d.time1' % (dataset, ksize)
        jelly_count_time[key], jelly_count_mem[key] = get_time_mem(fname)
jelly_hist_mem = {}
jelly_hist_time = {}
for dataset in range(1, 6):
    for ksize in (22, 31):
        key = '%d_%d' % (dataset, ksize)
        fname = datadir + 'jelly_%d_%d.time2' % (dataset, ksize)
        jelly_hist_time[key], jelly_hist_mem[key] = get_time_mem(fname)
# DSK timings, k=22 only
dsk_mem = {}
dsk_time = {}
for dataset in range(1, 6):
    key = '%d_%d' % (dataset, 22)
    fname = datadir + 'dsk_%d_%d.time' % (dataset, 22)
    dsk_time[key], dsk_mem[key] = get_time_mem(fname)
# khmer timings: k=22 only; false-positive rates 1%, 5% and 20%
khmer_mem1 = {}
khmer_time1 = {}
for dataset in range(1, 6):
    for rate in (1, 5, 20):
        key = '%d_%d_%d' % (dataset, rate, 22)
        fname = datadir + 'bloom_%d_%d_%d.time1' % (dataset, rate, 22)
        khmer_time1[key], khmer_mem1[key] = get_time_mem(fname)
khmer_mem2 = {}
khmer_time2 = {}
for dataset in range(1, 6):
    for rate in (1, 5, 20):
        key = '%d_%d_%d' % (dataset, rate, 22)
        fname = datadir + 'bloom_%d_%d_%d.time2' % (dataset, rate, 22)
        # NOTE(review): the call below was already commented out in the
        # original, so khmer_time2/khmer_mem2 stay empty -- presumably
        # disabled on purpose; confirm before re-enabling.
        # khmer_time2[key], khmer_mem2[key] = get_time_mem(fname)
# number of unique 22-mers in different data sets
def get_total_kmers(filename):
    """Return the sum of the second column of a histogram file.

    Each line is expected to look like ``<abundance> <count>``; the counts
    (column 1) are accumulated and returned as an int.  The file handle is
    closed deterministically (the original left it to the garbage collector).
    """
    with open(filename) as fp:
        return sum(int(line.split()[1]) for line in fp)
# total number of distinct 22-mers per dataset (x-axis for the later plots)
total_list = [get_total_kmers(datadir + 'jelly_%d_22.hist' % dataset)
              for dataset in range(1, 6)]
Time usage of different k-mer counting tools
# 1. wall-time comparison of the k-mer counting tools
fig = plt.figure()
ax = fig.add_subplot(111)
fig.set_size_inches(6.83, 4)
read_counts = [9744399, 19488798, 29233197, 38977596, 48721995]
L_khmer = []          # khmer, 1% error rate, k=22
L_tallymer = []       # suffix + mkindex; k=22 (time is the same for part=1 or 4, use part=1)
L_jellyfish_k22 = []  # Jellyfish count + hist stages, k=22
L_dsk = []            # DSK, k=22
for i in range(1, 6):
    L_khmer.append(khmer_time1[str(i) + '_1_22'])
    L_tallymer.append(suffix_time[str(i) + '_1'] + mkindex_time[str(i) + '_1_22'])
    L_jellyfish_k22.append(jelly_count_time[str(i) + '_22'] + jelly_hist_time[str(i) + '_22'])
    L_dsk.append(dsk_time[str(i) + '_22'])
ax.set_xlabel('Total number of reads')
ax.set_ylabel('Time (s)')
ax.plot(read_counts, L_khmer, 'bo-', label='khmer (1% error rate)')
ax.plot(read_counts, L_tallymer, 'go-', label='Tallymer')
ax.plot(read_counts, L_jellyfish_k22, 'ro-', label='Jellyfish')
ax.plot(read_counts, L_dsk, 'yo-', label='DSK')
#ax.set_xlim(0,3e9)
#ax.set_ylim(0,6000)
ax.legend(loc='best', prop={'size': 8})
fig_file = '../figure/time_benchmark.eps'
plt.savefig(fig_file, dpi=300)
#fig_file = '../figure/time_benchmark.pdf'
#plt.savefig(fig_file,dpi=300)
# parenthesized so it works under both Python 2 and Python 3; the stray
# notebook output line that followed (a bare list literal) was removed.
print(L_khmer)
Memory usage of different k-mer counting tools
# 2. memory usage of different k-mer counting tools
fig = plt.figure()
ax = fig.add_subplot(111)
fig.set_size_inches(6.83, 4)
L_khmer_1p = []       # khmer, 1% error rate, k=22
L_khmer_5p = []       # khmer, 5% error rate, k=22
L_khmer_20p = []      # khmer, 20% error rate, k=22
L_tallymer = []       # k=22; the mkindex step dominates the suffix step (part 1 or 4)
L_jellyfish_k22 = []  # Jellyfish, k=22 (memory depends on k)
L_jellyfish_k31 = []  # Jellyfish, k=31
L_dsk = []            # DSK, k=22
for ds in range(1, 6):
    L_khmer_1p.append(khmer_mem1['%d_1_22' % ds] / 1000000)
    L_khmer_5p.append(khmer_mem1['%d_5_22' % ds] / 1000000)
    L_khmer_20p.append(khmer_mem1['%d_20_22' % ds] / 1000000)
    # mkindex memory is always larger than the suffix step's (part 1 or 4)
    L_tallymer.append(mkindex_mem['%d_1_22' % ds] / 1000000)
    # take whichever Jellyfish stage (count or hist) used more memory
    key22 = '%d_22' % ds
    L_jellyfish_k22.append(max(jelly_count_mem[key22], jelly_hist_mem[key22]) / 1000000)
    key31 = '%d_31' % ds
    L_jellyfish_k31.append(max(jelly_count_mem[key31], jelly_hist_mem[key31]) / 1000000)
    L_dsk.append(dsk_mem['%d_22' % ds] / 1000000)
#print L_khmer,L_tallymer,L_jellyfish_k22,L_jellyfish_k31,L_dsk
ax.set_xlabel('Total number of distinct k-mers')
ax.set_ylabel('Memory usage(G)')
ax.plot(total_list, L_khmer_1p, 'bo-', label='khmer (1% error rate)')
ax.plot(total_list, L_khmer_5p, 'bo--', label='khmer (5% error rate)')
ax.plot(total_list, L_khmer_20p, 'bo:', label='khmer (20% error rate)')
ax.plot(total_list, L_tallymer, 'go-', label='Tallymer')
ax.plot(total_list, L_jellyfish_k22, 'ro--', label='Jellyfish k=22')
ax.plot(total_list, L_jellyfish_k31, 'ro-', label='Jellyfish k=31')
ax.plot(total_list, L_dsk, 'yo-', label='DSK')
ax.set_xlim(0, 2.5e9)
#ax.set_ylim(0,40)
ax.legend(loc='best', prop={'size': 8})
fig_file = '../figure/memory_benchmark.eps'
plt.savefig(fig_file, dpi=300)
Disk storage usage of different k-mer counting tools
# 3. disk usage of different k-mer counting tools
fig = plt.figure()
ax = fig.add_subplot(111)
fig.set_size_inches(6.83, 4)
# Sizes of intermediate files on disk, read off manually for now
# (TODO: generate these automatically).
L_khmer = [5.5, 11, 15, 18, 21]            # khmer, 1% error rate, k=22
L_tallymer = [7.8, 17, 23, 28, 36]         # Tallymer, k=22 (largest intermediate file)
L_jellyfish_k22 = [5.3, 9.9, 14, 17, 20]   # Jellyfish, k=22
L_dsk = [1.1, 2.1, 3.0, 3.8, 4.7]          # DSK, k=22 (defaults to reads-file size on disk)
# gzip-compressed khmer output, bytes -> GB
L_khmer_gz = [nbytes / 1e9 for nbytes in
              (1284585409., 2491325096., 3439743439., 4254327173., 5151176537.)]
ax.set_xlabel('Total number of distinct k-mers')
ax.set_ylabel('Disk usage(GB)')
ax.plot(total_list, L_khmer, 'bo-', label='khmer (1%)*')
ax.plot(total_list, L_khmer_gz, 'bo--', label='khmer (1%), compressed')
ax.plot(total_list, L_tallymer, 'go-', label='Tallymer')
ax.plot(total_list, L_jellyfish_k22, 'ro-', label='Jellyfish')
ax.plot(total_list, L_dsk, 'yo-', label='DSK')
ax.set_ylim(0, 50)
ax.legend(loc='best', prop={'size': 8})
fig_file = '../figure/disk_benchmark.eps'
plt.savefig(fig_file, dpi=300)
Comparison of time it takes to count k-mers.
# 4. time to count k-mers against an existing database.
# The original repeated each get_time_mem call five times per tool and
# hard-coded '../data/' instead of using the datadir constant.
def _count_times(pattern):
    """Wall-clock seconds for datasets 1..5; pattern has one %d placeholder."""
    return [get_time_mem(datadir + pattern % ds)[0] for ds in range(1, 6)]

tally_count = _count_times('%d_part1_22.count.time')
khmer_count = _count_times('bloom_%d_1_22.count.time')
jelly_count = _count_times('jelly_%d_22.count.time')

fig = plt.figure()
ax = fig.add_subplot(111)
fig.set_size_inches(6.83, 4)
ax.plot(total_list, khmer_count, 'bo-', label='khmer (1%)')
ax.plot(total_list, tally_count, 'go-', label='Tallymer')
ax.plot(total_list, jelly_count, 'ro-', label='Jellyfish')
ax.legend(loc='best', prop={'size': 8})
ax.set_ylim(0, 1600)
ax.set_xlabel('Number of k-mers in database')
ax.set_ylabel('Time (s)')
fig_file = '../figure/count_benchmark.eps'
plt.savefig(fig_file, dpi=300)
Relation between average miscount and counting error rate
# 5a. average miscount (offset) vs counting false-positive rate.
# Fixed: map() is a lazy iterator under Python 3 (breaks reuse/plotting),
# and the file object was never closed.
ht_size = "100000,200000,400000,600000,800000,1000000,1200000,1400000,1800000,2200000,2600000,3000000,4000000,6000000"
HT_SIZE_array = [int(s) for s in ht_size.split(',')]

with open(datadir + 'fp_assessment.out', 'r') as file_obj:
    lines = file_obj.readlines()

def _offset_vs_rate(idx):
    """[avg-offset row idx, error-rate row idx+1], both as float lists.

    NOTE(review): assumes fp_assessment.out holds 4-line records per
    dataset -- confirm against the file layout.
    """
    return [[float(v) for v in lines[idx].split()],
            [float(v) for v in lines[idx + 1].split()]]

result1 = _offset_vs_rate(1)    # metagenome data
result2 = _offset_vs_rate(5)    # random k-mers
result3 = _offset_vs_rate(9)    # reads with error
result4 = _offset_vs_rate(13)   # reads without error
result5 = _offset_vs_rate(17)   # E. coli reads

fig = plt.figure()
ax = fig.add_subplot(111)
fig.set_size_inches(6.83, 4)
ax.set_xlabel('counting error rate (offset>0)')
ax.set_ylabel('average offset')
ax.plot(result1[1], result1[0], 'ro-', result2[1], result2[0], 'go-',
        result3[1], result3[0], 'bo-', result4[1], result4[0], 'yo-',
        result5[1], result5[0], 'ko-')
#ax.set_xlim(0,0.2)
#ax.set_ylim(0,20)
ax.axis(ymax=10, ymin=-2)
ax.legend(('metagenome data','random k-mers','reads with error','reads without error','E.coli reads'),
          loc='upper left', prop={'size': 8})
fig_file = '../figure/average_offset_vs_fpr.eps'
plt.savefig(fig_file, dpi=300)
#print result1
#print result2
#print result3
#print result4
The y-axis is the average of (offset / correct_count) over all k-mers.
Because this is a high-diversity dataset, most of the true counts are 1, so with a smaller hash table (higher error rate) the offset (miscount) may be 2 or 3. The relative offset is then 100%–200%, which is what the figure shows once the counting error rate exceeds 0.7.
# 5b. average relative offset (offset / true count, in percent) vs error rate.
# Fixed: numpy.array(map(...)) produces a 0-d object array under Python 3;
# the file object was never closed; the five copy-pasted record reads are
# factored into a helper.
ht_size = "100000,200000,400000,600000,800000,1000000,1200000"
HT_SIZE_array = [int(s) for s in ht_size.split(',')]

with open(datadir + 'fp_assessment.out', 'r') as file_obj:
    lines = file_obj.readlines()

def _pct_vs_rate(idx):
    """2-row array: [relative offsets scaled to percent, error rates].

    Row idx holds the relative offset (fraction of the true count), row
    idx-1 the counting error rate.  NOTE(review): assumes 4-line records
    per dataset in fp_assessment.out -- confirm against the file layout.
    """
    rec = numpy.array([[float(v) for v in lines[idx].split()],
                       [float(v) for v in lines[idx - 1].split()]])
    rec[0] *= 100.
    return rec

result1 = _pct_vs_rate(3)    # metagenome data
result2 = _pct_vs_rate(7)    # random k-mers
result3 = _pct_vs_rate(11)   # reads with 1% error
result4 = _pct_vs_rate(15)   # reads without error
result5 = _pct_vs_rate(19)   # E. coli reads

fig = plt.figure()
ax = fig.add_subplot(111)
fig.set_size_inches(6.83, 4)
ax.set_xlabel('counting error rate (offset > 0)')
ax.set_ylabel('Average offset (percent)')
ax.plot(result1[1], result1[0], 'ro-', result2[1], result2[0], 'go-',
        result3[1], result3[0], 'bo-', result4[1], result4[0], 'yo-',
        result5[1], result5[0], 'ko-')
#ax.set_xlim(0,1)
ax.set_ylim(0, 100)
ax.legend(('metagenome data','random k-mers','reads with 1% error','reads without error','E.coli reads'),
          loc='upper left', prop={'size': 8})
fig_file = '../figure/percent_offset_vs_fpr.eps'
plt.savefig(fig_file, dpi=300)
#ax.axis(ymax=1)
Percentage of the unique k-mers starting in different position in reads
# Positions in reads where abundance-1 (unique) k-mers start, k=17 vs k=32.
# Fixed: the two identical parse loops are factored into a helper, the file
# handles are closed, print is parenthesized for Python 2/3, and the stray
# notebook output line (a huge bare list literal) was removed.
def _read_nonzero_positions(path):
    """Return ([position...], [count...]) for rows whose count column != '0'."""
    positions, counts = [], []
    with open(path) as fp:
        for line in fp:
            fields = line.split()
            if fields[1] != '0':
                positions.append(float(fields[0]))
                counts.append(float(fields[1]))
    return positions, counts

x1, list1 = _read_nonzero_positions(datadir + "ecoli_ref.fastq.pos17.abund1")
x2, list2 = _read_nonzero_positions(datadir + "ecoli_ref.fastq.pos32.abund1")

fig = plt.figure()
ax = fig.add_subplot(111)
fig.set_size_inches(6.83, 4)
ax.set_xlabel("Starting position of k-mer in read")
ax.set_ylabel("Number of abund=1 k-mers at that position")
ax.plot(x1, list1, 'r-')
ax.plot(x2, list2, 'b-')
ax.legend(('k=17','k=32'), loc='upper left', prop={'size': 12})
#plt.ylim(0,10000000)
ax.set_xlim(0, 100)
plt.savefig("../figure/perc_unique_pos.eps", dpi=300)
print(list1)
Number of remaining reads after iteratively filtering out low-abundance reads
# 6. abundance filtering: number of reads remaining after each iteration.
# Fixed: int(...)/2 becomes a float under Python 3 -- use // to keep the
# original integer semantics; the file object was never closed.
fig = plt.figure()
ax = fig.add_subplot(111)
list2_x = np.arange(4) * 4

with open(datadir + "remaining_reads.wc", 'r') as file_remaining:
    lines = file_remaining.readlines()

def _reads_from_wc(rows):
    """Read counts from wc output lines: column 0 // 2.

    NOTE(review): the halving presumably converts line counts to read
    counts (two lines per read) -- confirm against the wc input.
    """
    return [int(lines[i].split()[0]) // 2 for i in rows]

list2_y2 = _reads_from_wc([5, 6, 7, 8])  # hashtable size: 1e9
list2_y3 = _reads_from_wc([1, 2, 3, 4])  # hashtable size: 1e10

ax.set_ylabel('number of remaining reads')
width = 1
#rects1 = ax.bar(list2_x+1,list2_y1,width,color='g')
rects2 = ax.bar(list2_x + width * 1, list2_y2, width, color='b')
rects3 = ax.bar(list2_x + width * 2, list2_y3, width, color='c')
# horizontal reference line at the original total read count
l = ax.axhline(linewidth=4, color='k', y=42458402)
ax.set_xticks(list2_x + 2.5)
ax.set_xticklabels(('1st run','2nd run','3rd run','4th run'))
ax.set_ylim((1, 6e7))
ax.set_xlim(0, 17)
ax.annotate('number of original total reads',
            xy=(3, 4.3e7),
            xytext=(1, 5e7),
            xycoords='data',
            textcoords='data',
            arrowprops=dict(facecolor='black', shrink=0.01, width=2),
            horizontalalignment='left',
            verticalalignment='top',
            clip_on=True,  # clip to the axes bounding box
            )
plt.legend((rects2[0], rects3[0]), ('hashtable size: 1e9','hashtable size: 1e10'), loc='best')
fig_file = '../figure/num_remaining_reads.pdf'
plt.savefig(fig_file)
Number of reads with low-abundance k-mers after iteratively filtering out low-abundance reads
# 7. abundance filtering: reads still containing low-abundance k-mers
fig = plt.figure()
ax = fig.add_subplot(111)
list2_x = np.arange(4) * 3
file_bad = open(datadir + "bad_reads.wc", 'r')
lines = file_bad.readlines()
# rows are stored newest-first in the wc file, hence the reversed indices
list2_y2 = [int(lines[i].split()[0]) for i in (7, 6, 5, 4)]  # hashtable size: 1e9
list2_y3 = [int(lines[i].split()[0]) for i in (3, 2, 1, 0)]  # hashtable size: 1e10
ax.set_ylabel('number of reads with low-abundance k-mers')
width = 1
rects1 = ax.bar(list2_x + 1, list2_y2, width, color='b')
rects2 = ax.bar(list2_x + 1 + width * 1, list2_y3, width, color='c')
ax.set_xticks(list2_x + 2)
ax.set_xticklabels(('1st run','2nd run','3rd run','4th run'))
ax.set_ylim((0, 4e6))
ax.set_xlim(0, 13)
plt.legend((rects1[0], rects2[0]), ('hashtable size: 1e9','hashtable size: 1e10'), loc='best')
fig_file = '../figure/num_bad_reads.pdf'
plt.savefig(fig_file)