Figure generation for

"A Reference-Free Algorithm for Computational Normalization of Shotgun Sequencing Data"¶

See http://arxiv.org/abs/1203.4802 for more information.

In [1]:

import numpy
datadir = '../data/'
num_points_to_plot = 50000  # affects size, rendering time for PDF plots.

Figure 1¶

k-mer rank abundance plots

In [2]:

rr1 = [ x.split() for x in open(datadir + 'read2.ranks') ]
rr1 = numpy.array([ (int(a), int(b)) for (a,b) in rr1 ])
rr1_x = [ a for (a, b) in rr1 ]
rr1_y = [ b for (a, b) in rr1 ]

rr2 = [ x.split() for x in open(datadir + 'read3.ranks') ]
rr2 = numpy.array([ (int(a), int(b)) for (a,b) in rr2 ])
rr2_x = [ a for (a, b) in rr2 ]
rr2_y = [ b for (a, b) in rr2 ]

rr3 = [ x.split() for x in open(datadir + 'read15.ranks') ]
rr3 = numpy.array([ (int(a), int(b)) for (a,b) in rr3 ])
rr3_x = [ a for (a, b) in rr3 ]
rr3_y = [ b for (a, b) in rr3 ]

In [3]:

clf()
plot(rr1[:,0], rr1[:,1])
plot(rr2[:,0], rr2[:,1])
plot(rr3[:,0], rr3[:,1])
xlabel('k-mer rank')
ylabel('k-mer abundance')
legend(['no errors', 'single substitution error', 'multiple substitution errors'])
axis([0, 80, 0, 250])
savefig('diginorm-ranks.pdf')
show()

Figures 2 and 3¶

Correlation between median k-mer count and read coverage calculated from mapping, from both simulated data and genomic & transcriptomic data.

In [4]:

import itertools, gzip
def load_cmp_file(filename, limit=None):
    if filename.endswith('.gz'):
        fp = gzip.open(filename)
    else:
        fp = open(filename)
    if limit:
        lines = [ line.split() for i, line in itertools.izip(range(limit), fp) ]
    else:
        lines = [ line.split() for line in fp ]
    lines = [ (float(a[0]), float(a[1])) for a in lines ]

    print 'loaded %d lines' % len(lines)
    return numpy.array(lines)

In [5]:

genome_counts = load_cmp_file(datadir + 'genome-reads.fa.counts.cmp', limit=num_points_to_plot)

loaded 50000 lines

In [6]:

clf()
axis([0, 300, 0, 300])
#axis(ymax=1000)

xlabel('read coverage by mapping')
ylabel('read coverage by median k-mer count')
plot(genome_counts[:,0], genome_counts[:,1], 'b.', rasterized=True)

savefig('diginorm-sim-genome.pdf')
show()

c = numpy.corrcoef(genome_counts[:,0], genome_counts[:,1])[0,1]
print c**2

0.786287931714

In [7]:

ecoli_counts = load_cmp_file(datadir + 'ecoli_ref-5m.fastq.counts.cmp', limit=num_points_to_plot)

# remove points from Illumina crap; see text for details.
ecoli_counts = numpy.array([ (a,b) for (a,b) in ecoli_counts if b < 8000 ])

loaded 50000 lines

In [8]:

clf()
#axis([0, 300, 0, 300])
#axis(xmax=200, ymax=200)
#axis(ymax=10500,ymin=10000)
#title('ecoli reads')
xlabel('read coverage by mapping')
ylabel('read coverage by median k-mer count')
plot(ecoli_counts[:,0], ecoli_counts[:,1], 'b.', rasterized=True)
savefig('diginorm-ecoli-genome.pdf')
show()

c = numpy.corrcoef(ecoli_counts[:,0], ecoli_counts[:,1])[0,1]
print c**2

0.798609411012

In [9]:

transcript_counts = load_cmp_file(datadir + 'transcript-reads.fa.counts.cmp', limit=num_points_to_plot)

# remove points from Illumina crap; see text for details.
transcript_counts = numpy.array([ (a,b) for (a,b) in transcript_counts if b < 20000 ])

loaded 50000 lines

In [10]:

clf()
#axis([0, 250, 0, 250])
#title('transcript reads')
xlabel('read coverage by mapping')
ylabel('read coverage by median k-mer count')
loglog(transcript_counts[:,0], transcript_counts[:,1], 'b.', rasterized=True)

savefig('diginorm-sim-transcr.pdf')
show()

c = numpy.corrcoef(transcript_counts[:,0], transcript_counts[:,1])[0,1]
print 'linear r^2', c**2

c = numpy.corrcoef([ math.log(z) for z in transcript_counts[:,0] ], [ math.log(z) for z in transcript_counts[:,1] ])[0,1]
print 'log r^2', c**2

linear r^2 0.931638777367
log r^2 0.956816045607

In [11]:

mouse_counts = load_cmp_file(datadir + 'mouse-5m.fq.counts.cmp', limit=num_points_to_plot)

# remove points from Illumina crap; see text for details.
mouse_counts = numpy.array([ (a,b) for (a,b) in mouse_counts if b < 20000 ])

loaded 50000 lines

In [12]:

clf()
#axis([0, 250, 0, 250])
#title('mouse reads')
xlabel('read coverage by mapping')
ylabel('read coverage by median k-mer count')
loglog(mouse_counts[:,0], mouse_counts[:,1], 'b.', rasterized=True)

savefig('diginorm-mouse-transcr.pdf')
show()

c = numpy.corrcoef(mouse_counts[:,0], mouse_counts[:,1])[0,1]
print 'linear r^2', c**2

za = []
zb = []
for a, b in mouse_counts:
    if a and b:
        za.append(math.log(a))
        zb.append(math.log(b))

c = numpy.corrcoef(za, zb)[0,1]
print 'log r^2', c**2

linear r^2 0.904776248799
log r^2 0.872367665664

Figure 4¶

In [13]:

ecoli_cov = numpy.loadtxt(datadir + 'ecoli.rawreads.map.gz.cov')
ecoli_cov[:,1] /= sum(ecoli_cov[:,1])

staph_cov = numpy.loadtxt(datadir + 'staph.rawreads.map.gz.cov')
staph_cov[:,1] /= sum(staph_cov[:,1])

sar_cov = numpy.loadtxt(datadir + 'sar324.rawreads.map.gz.cov')
sar_cov[:,1] /= sum(sar_cov[:,1])

In [14]:

mapcov = numpy.loadtxt(datadir + 'genome-reads.fa.map.cov')
keepcov = numpy.loadtxt(datadir + 'genome-reads.fa.keep.map.cov')
ecolicov = numpy.loadtxt(datadir + 'ecoli_ref-5m.fastq.map.cov')
ecoli20cov = numpy.loadtxt(datadir + 'ecoli_ref.fastq.bz2.keep.map.cov')

transcript_cov = numpy.loadtxt(datadir + 'transcript-reads.fa.map.cov')
mousecov = numpy.loadtxt(datadir + 'mouse-5m.fq.map.cov')

transcript_keepcov = numpy.loadtxt(datadir + 'transcript-reads.fa.keep.map.cov')
mousekeepcov = numpy.loadtxt(datadir + 'mouse-5m.fq.keep.map.cov')

In [15]:

mapcov[:,1] /= sum(mapcov[:,1])
keepcov[:,1] /= sum(keepcov[:,1])
ecolicov[:,1] /= sum(ecolicov[:,1])
ecoli20cov[:,1] /= sum(ecoli20cov[:,1])

transcript_cov[:,1] /= sum(transcript_cov[:,1])
mousecov[:,1] /= sum(mousecov[:,1])

transcript_keepcov[:,1] /= sum(transcript_keepcov[:,1])
mousekeepcov[:,1] /= sum(mousekeepcov[:,1])

In [16]:

plot(ecoli_cov[:,0], ecoli_cov[:,1])
plot(staph_cov[:,0], staph_cov[:,1])
plot(sar_cov[:,0], sar_cov[:,1])
xlabel("Per-base coverage of reference genome")
ylabel("Fraction of total bases with that coverage")
legend(["E. coli", "S. aureus single cell MDA", "SAR324 single cell MDA"])
axis(xmax=2000)
savefig('/tmp/diginorm-coverage2-raw.pdf')

In [17]:

ecoli_kcov = numpy.loadtxt(datadir + 'ecoli.keep.rawreads.map.gz.cov')
ecoli_kcov[:,1] /= sum(ecoli_kcov[:,1])

staph_kcov = numpy.loadtxt(datadir + 'staph.keep.rawreads.map.gz.cov')
staph_kcov[:,1] /= sum(staph_kcov[:,1])

sar_kcov = numpy.loadtxt(datadir + 'sar324.keep.rawreads.map.gz.cov')
sar_kcov[:,1] /= sum(sar_kcov[:,1])

In [18]:

plot(ecoli_kcov[:,0], ecoli_kcov[:,1])
plot(staph_kcov[:,0], staph_kcov[:,1])
plot(sar_kcov[:,0], sar_kcov[:,1])
xlabel("Per-base coverage of reference genome")
ylabel("Fraction of total bases with that coverage")
legend(["E. coli", "S. aureus single cell MDA", "SAR324 single cell MDA"])
axis(xmax=100)
savefig('/tmp/diginorm-coverage2-dn.pdf')

Figure 5¶

In [19]:

fp = open(datadir + 'ecoli_ref.report')
report = [ x.split()[:2] for x in fp ]
report = [ (float(a) / 1e6, int(b)) for (a,b) in report[1:] ]
report = [ (a, b, float(b)/float(a)/1e6) for (a,b) in report if a ]
print report[0]

(0.2, 199444, 0.99722)

In [20]:

x = [ a for (a,b,c) in report ]
y = [ c for (a,b,c) in report ]

In [21]:

clf()
xlabel("Number of reads (m)")
ylabel("Total fraction of reads kept")
axis([0, 30, 0, 1.0])
plot(x,y)
savefig('diginorm-accumulation.pdf')

Figure 6 - end bias stuff¶

In [22]:

def load_endbias(filename):
    data = [ i.split() for i in open(filename) ]
    x = [ int(a) for (a,_,b,_) in data ]
    y = [ float(b) for (a,_,b,_) in data ]
    
    return (x, y)

end_tr_x, end_tr_y = load_endbias(datadir + 'endbias-transcripts.txt')
end_g_x, end_g_y = load_endbias(datadir + 'endbias-genome.txt')

In [23]:

clf()
xlabel("Distance from k-mer to end of source sequence")
ylabel("Fraction of positions with missing k-mers")
axis([-5,50,-.05, 1.1])
plot(end_tr_x, end_tr_y)
plot(end_g_x, end_g_y)
legend(['simulated transcripts', 'simulated genome'])
savefig('diginorm-endbias.pdf')
show()