Figure generation for

## "A Reference-Free Algorithm for Computational Normalization of Shotgun Sequencing Data"¶

See http://arxiv.org/abs/1203.4802 for more information.

In [1]:
import numpy
datadir = '../data/'
num_points_to_plot = 50000  # affects size, rendering time for PDF plots.


# Figure 1¶

k-mer rank abundance plots

In [2]:
rr1 = [ x.split() for x in open(datadir + 'read2.ranks') ]
rr1 = numpy.array([ (int(a), int(b)) for (a,b) in rr1 ])
rr1_x = [ a for (a, b) in rr1 ]
rr1_y = [ b for (a, b) in rr1 ]

rr2 = [ x.split() for x in open(datadir + 'read3.ranks') ]
rr2 = numpy.array([ (int(a), int(b)) for (a,b) in rr2 ])
rr2_x = [ a for (a, b) in rr2 ]
rr2_y = [ b for (a, b) in rr2 ]

rr3 = [ x.split() for x in open(datadir + 'read15.ranks') ]
rr3 = numpy.array([ (int(a), int(b)) for (a,b) in rr3 ])
rr3_x = [ a for (a, b) in rr3 ]
rr3_y = [ b for (a, b) in rr3 ]

In [3]:
clf()
plot(rr1[:,0], rr1[:,1])
plot(rr2[:,0], rr2[:,1])
plot(rr3[:,0], rr3[:,1])
xlabel('k-mer rank')
ylabel('k-mer abundance')
legend(['no errors', 'single substitution error', 'multiple substitution errors'])
axis([0, 80, 0, 250])
savefig('diginorm-ranks.pdf')
show()


# Figures 2 and 3¶

Correlation between median k-mer count and read coverage calculated from mapping, from both simulated data and genomic & transcriptomic data.

In [4]:
import itertools, gzip
def load_cmp_file(filename, limit=None):
if filename.endswith('.gz'):
fp = gzip.open(filename)
else:
fp = open(filename)
if limit:
lines = [ line.split() for i, line in itertools.izip(range(limit), fp) ]
else:
lines = [ line.split() for line in fp ]
lines = [ (float(a[0]), float(a[1])) for a in lines ]

print 'loaded %d lines' % len(lines)
return numpy.array(lines)

In [5]:
genome_counts = load_cmp_file(datadir + 'genome-reads.fa.counts.cmp', limit=num_points_to_plot)

loaded 50000 lines

In [6]:
clf()
axis([0, 300, 0, 300])
#axis(ymax=1000)

xlabel('read coverage by mapping')
ylabel('read coverage by median k-mer count')
plot(genome_counts[:,0], genome_counts[:,1], 'b.', rasterized=True)

savefig('diginorm-sim-genome.pdf')
show()

c = numpy.corrcoef(genome_counts[:,0], genome_counts[:,1])[0,1]
print c**2

0.786287931714

In [7]:
ecoli_counts = load_cmp_file(datadir + 'ecoli_ref-5m.fastq.counts.cmp', limit=num_points_to_plot)

# remove points from Illumina crap; see text for details.
ecoli_counts = numpy.array([ (a,b) for (a,b) in ecoli_counts if b < 8000 ])

loaded 50000 lines

In [8]:
clf()
#axis([0, 300, 0, 300])
#axis(xmax=200, ymax=200)
#axis(ymax=10500,ymin=10000)
#title('ecoli reads')
xlabel('read coverage by mapping')
ylabel('read coverage by median k-mer count')
plot(ecoli_counts[:,0], ecoli_counts[:,1], 'b.', rasterized=True)
savefig('diginorm-ecoli-genome.pdf')
show()

c = numpy.corrcoef(ecoli_counts[:,0], ecoli_counts[:,1])[0,1]
print c**2

0.798609411012

In [9]:
transcript_counts = load_cmp_file(datadir + 'transcript-reads.fa.counts.cmp', limit=num_points_to_plot)

# remove points from Illumina crap; see text for details.
transcript_counts = numpy.array([ (a,b) for (a,b) in transcript_counts if b < 20000 ])

loaded 50000 lines

In [10]:
clf()
#axis([0, 250, 0, 250])
#title('transcript reads')
xlabel('read coverage by mapping')
ylabel('read coverage by median k-mer count')
loglog(transcript_counts[:,0], transcript_counts[:,1], 'b.', rasterized=True)

savefig('diginorm-sim-transcr.pdf')
show()

c = numpy.corrcoef(transcript_counts[:,0], transcript_counts[:,1])[0,1]
print 'linear r^2', c**2

c = numpy.corrcoef([ math.log(z) for z in transcript_counts[:,0] ], [ math.log(z) for z in transcript_counts[:,1] ])[0,1]
print 'log r^2', c**2

linear r^2 0.931638777367
log r^2 0.956816045607

In [11]:
mouse_counts = load_cmp_file(datadir + 'mouse-5m.fq.counts.cmp', limit=num_points_to_plot)

# remove points from Illumina crap; see text for details.
mouse_counts = numpy.array([ (a,b) for (a,b) in mouse_counts if b < 20000 ])

loaded 50000 lines

In [12]:
clf()
#axis([0, 250, 0, 250])
#title('mouse reads')
xlabel('read coverage by mapping')
ylabel('read coverage by median k-mer count')
loglog(mouse_counts[:,0], mouse_counts[:,1], 'b.', rasterized=True)

savefig('diginorm-mouse-transcr.pdf')
show()

c = numpy.corrcoef(mouse_counts[:,0], mouse_counts[:,1])[0,1]
print 'linear r^2', c**2

za = []
zb = []
for a, b in mouse_counts:
if a and b:
za.append(math.log(a))
zb.append(math.log(b))

c = numpy.corrcoef(za, zb)[0,1]
print 'log r^2', c**2

linear r^2 0.904776248799
log r^2 0.872367665664


# Figure 4¶

In [13]:
ecoli_cov = numpy.loadtxt(datadir + 'ecoli.rawreads.map.gz.cov')
ecoli_cov[:,1] /= sum(ecoli_cov[:,1])

staph_cov = numpy.loadtxt(datadir + 'staph.rawreads.map.gz.cov')
staph_cov[:,1] /= sum(staph_cov[:,1])

sar_cov = numpy.loadtxt(datadir + 'sar324.rawreads.map.gz.cov')
sar_cov[:,1] /= sum(sar_cov[:,1])

In [14]:
mapcov = numpy.loadtxt(datadir + 'genome-reads.fa.map.cov')
keepcov = numpy.loadtxt(datadir + 'genome-reads.fa.keep.map.cov')
ecolicov = numpy.loadtxt(datadir + 'ecoli_ref-5m.fastq.map.cov')
ecoli20cov = numpy.loadtxt(datadir + 'ecoli_ref.fastq.bz2.keep.map.cov')

transcript_cov = numpy.loadtxt(datadir + 'transcript-reads.fa.map.cov')
mousecov = numpy.loadtxt(datadir + 'mouse-5m.fq.map.cov')

transcript_keepcov = numpy.loadtxt(datadir + 'transcript-reads.fa.keep.map.cov')
mousekeepcov = numpy.loadtxt(datadir + 'mouse-5m.fq.keep.map.cov')

In [15]:
mapcov[:,1] /= sum(mapcov[:,1])
keepcov[:,1] /= sum(keepcov[:,1])
ecolicov[:,1] /= sum(ecolicov[:,1])
ecoli20cov[:,1] /= sum(ecoli20cov[:,1])

transcript_cov[:,1] /= sum(transcript_cov[:,1])
mousecov[:,1] /= sum(mousecov[:,1])

transcript_keepcov[:,1] /= sum(transcript_keepcov[:,1])
mousekeepcov[:,1] /= sum(mousekeepcov[:,1])

In [16]:
plot(ecoli_cov[:,0], ecoli_cov[:,1])
plot(staph_cov[:,0], staph_cov[:,1])
plot(sar_cov[:,0], sar_cov[:,1])
xlabel("Per-base coverage of reference genome")
ylabel("Fraction of total bases with that coverage")
legend(["E. coli", "S. aureus single cell MDA", "SAR324 single cell MDA"])
axis(xmax=2000)
savefig('/tmp/diginorm-coverage2-raw.pdf')

In [17]:
ecoli_kcov = numpy.loadtxt(datadir + 'ecoli.keep.rawreads.map.gz.cov')
ecoli_kcov[:,1] /= sum(ecoli_kcov[:,1])

staph_kcov = numpy.loadtxt(datadir + 'staph.keep.rawreads.map.gz.cov')
staph_kcov[:,1] /= sum(staph_kcov[:,1])

sar_kcov = numpy.loadtxt(datadir + 'sar324.keep.rawreads.map.gz.cov')
sar_kcov[:,1] /= sum(sar_kcov[:,1])

In [18]:
plot(ecoli_kcov[:,0], ecoli_kcov[:,1])
plot(staph_kcov[:,0], staph_kcov[:,1])
plot(sar_kcov[:,0], sar_kcov[:,1])
xlabel("Per-base coverage of reference genome")
ylabel("Fraction of total bases with that coverage")
legend(["E. coli", "S. aureus single cell MDA", "SAR324 single cell MDA"])
axis(xmax=100)
savefig('/tmp/diginorm-coverage2-dn.pdf')


# Figure 5¶

In [19]:
fp = open(datadir + 'ecoli_ref.report')
report = [ x.split()[:2] for x in fp ]
report = [ (float(a) / 1e6, int(b)) for (a,b) in report[1:] ]
report = [ (a, b, float(b)/float(a)/1e6) for (a,b) in report if a ]
print report[0]

(0.2, 199444, 0.99722)

In [20]:
x = [ a for (a,b,c) in report ]
y = [ c for (a,b,c) in report ]

In [21]:
clf()
xlabel("Number of reads (m)")
ylabel("Total fraction of reads kept")
axis([0, 30, 0, 1.0])
plot(x,y)
savefig('diginorm-accumulation.pdf')


# Figure 6 - end bias stuff¶

In [22]:
def load_endbias(filename):
data = [ i.split() for i in open(filename) ]
x = [ int(a) for (a,_,b,_) in data ]
y = [ float(b) for (a,_,b,_) in data ]

return (x, y)

end_tr_x, end_tr_y = load_endbias(datadir + 'endbias-transcripts.txt')
end_g_x, end_g_y = load_endbias(datadir + 'endbias-genome.txt')

In [23]:
clf()
xlabel("Distance from k-mer to end of source sequence")
ylabel("Fraction of positions with missing k-mers")
axis([-5,50,-.05, 1.1])
plot(end_tr_x, end_tr_y)
plot(end_g_x, end_g_y)
legend(['simulated transcripts', 'simulated genome'])
savefig('diginorm-endbias.pdf')
show()