%matplotlib inline
import numpy
import scipy
import pylab
import scipy.cluster.hierarchy as sch
def load_matrix_and_labels(basefile):
    """Load a saved matrix and its accompanying labels from disk.

    Parameters:
        basefile: path to a file readable by ``numpy.load``. The labels are
            read from ``basefile + '.labels.txt'``, one label per line.

    Returns:
        A ``(D, labeltext)`` tuple: the loaded array, and a list with one
        whitespace-stripped string per line of the labels file.
    """
    # Use context managers so both file handles are closed promptly instead
    # of being left for the garbage collector.
    with open(basefile, 'rb') as matrix_fp:
        D = numpy.load(matrix_fp)
    with open(basefile + '.labels.txt') as labels_fp:
        labeltext = [line.strip() for line in labels_fp]
    return (D, labeltext)
def plot_composite_matrix(D, labeltext, show_labels=True, show_indices=True,
                          vmin=0, vmax=1):
    """Build a composite figure: dendrogram on the left, heatmap on the right.

    Parameters:
        D: 2-D array-like; it is copied, then its rows and columns are
            reordered by the dendrogram leaf order before being drawn.
            Presumably the square matrix written by ``sourmash compare``,
            with one row/column per label -- confirm against the caller.
        labeltext: list of labels, one per row of ``D``.
        show_labels: if true, label the dendrogram leaves with ``labeltext``
            (this also forces ``show_indices`` on). If false, the leaves are
            labelled with their integer index instead and an index-to-label
            table is printed to stdout.
        show_indices: if false (and ``show_labels`` is false), draw the
            dendrogram without any leaf labels.
        vmin, vmax: lower/upper bounds of the heatmap colour scale. The
            defaults (0 and 1) reproduce the previously hard-coded range.

    Returns:
        The matplotlib figure that was created.
    """
    if show_labels:
        show_indices = True

    # Work on a private copy so the caller's matrix is never touched.
    D = numpy.array(D, copy=True)

    fig = pylab.figure(figsize=(11, 8))
    ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])

    # plot dendrogram
    # NOTE(review): D is handed to linkage() as a 2-D array, which scipy
    # documents as a set of observation vectors (one per row), not as a
    # condensed distance matrix -- confirm this is the intended clustering.
    Y = sch.linkage(D, method='single')  # centroid

    dendrolabels = labeltext
    if not show_labels:
        dendrolabels = [str(i) for i, _ in enumerate(labeltext)]

    # No explicit `ax` is passed, so the dendrogram is expected to land on
    # the current axes, i.e. the ax1 created just above.
    Z1 = sch.dendrogram(Y, orientation='right', labels=dendrolabels,
                        no_labels=not show_indices)
    ax1.set_xticks([])

    # Without the long text labels the heatmap can sit closer to the
    # dendrogram.
    xstart = 0.45
    width = 0.45
    if not show_labels:
        xstart = 0.315
    scale_xstart = xstart + width + 0.01

    # plot matrix
    axmatrix = fig.add_axes([xstart, 0.1, width, 0.6])

    # (this reorders D by the clustering in Z1)
    idx1 = Z1['leaves']
    D = D[idx1, :]
    D = D[:, idx1]

    # show matrix
    im = axmatrix.matshow(D, aspect='auto', origin='lower',
                          cmap=pylab.cm.YlGnBu, vmin=vmin, vmax=vmax)
    axmatrix.set_xticks([])
    axmatrix.set_yticks([])

    # Plot colorbar.
    axcolor = fig.add_axes([scale_xstart, 0.1, 0.02, 0.6])
    pylab.colorbar(im, cax=axcolor)

    # print labels, if not shown
    if not show_labels:
        for i, label in enumerate(labeltext):
            print('%d\t%s' % (i, label))

    return fig
Conclusion: removing low-abundance k-mers from raw reads yields better clusters.
!sourmash compare SRR2*.sig -o trim.npy > /dev/null && echo success || echo fail
# running sourmash subcommand: compare loading SRR2060939_1.fastq.gz.sig loading SRR2060939_1.fastq.gz.trimnoV.fq.gz.sig loading SRR2060939_2.fastq.gz.sig loading SRR2060939_2.fastq.gz.trimnoV.fq.gz.sig loading SRR2241509_1.fastq.gz.sig loading SRR2241509_1.fastq.gz.trimnoV.fq.gz.sig loading SRR2255622_1.fastq.gz.sig loading SRR2255622_1.fastq.gz.trimnoV.fq.gz.sig min similarity in matrix: 0.0120000001043 saving labels to: trim.npy.labels.txt saving distance matrix to: trim.npy success
# Load the comparison matrix written by the `sourmash compare` step above.
D_filename = 'trim.npy'
D, labeltext = load_matrix_and_labels(D_filename)

# Relabel the trimmed samples as '(trim)'. They are picked out by the 'trim'
# in their filename rather than by list position, so the tagging stays
# correct even if the order of the signature files changes.
for i, label in enumerate(labeltext):
    if 'trim' in label:
        labeltext[i] = '(trim) ' + label

# The returned figure is not needed here; savefig writes the current figure.
_ = plot_composite_matrix(D, labeltext)
pylab.savefig('sourmash-trimming.png')
print(labeltext)
['SRR2060939_1.fastq.gz', '(trim) SRR2060939_1.fastq.gz.trimnoV.fq.gz', 'SRR2060939_2.fastq.gz', '(trim) SRR2060939_2.fastq.gz.trimnoV.fq.gz', 'SRR2241509_1.fastq.gz', '(trim) SRR2241509_1.fastq.gz.trimnoV.fq.gz', 'SRR2255622_1.fastq.gz', '(trim) SRR2255622_1.fastq.gz.trimnoV.fq.gz']
Conclusion: echinoderm data sets cluster almost entirely by species, with little to no cross-species similarity.
!sourmash compare urchin/*.sig -o urchin.npy > /dev/null && echo success || echo fail
# running sourmash subcommand: compare loading urchin/abyssicola-SRR3217899.sig loading urchin/agassizii-SRR1695485.sig loading urchin/amurensis-SRR1139201.sig loading urchin/amurensis-SRR1141046.sig loading urchin/amurensis-SRR1642063.sig loading urchin/angulata-SRR3217897.sig loading urchin/annulatus-SRR1695480.sig loading urchin/annulatus-SRR1695481.sig loading urchin/bispinosus-SRR3217921.sig loading urchin/brevispina-SRR2845428.sig loading urchin/briareus-SRR1139189.sig loading urchin/californicus-SRR1139198.sig loading urchin/californicus-SRR1695477.sig loading urchin/chloroticus-SRR1014618.sig loading urchin/chloroticus-SRR1014619.sig loading urchin/chloroticus-SRR1014624.sig loading urchin/chloroticus-SRR1014627.sig loading urchin/chloroticus-SRR1014631.sig loading urchin/chloroticus-SRR1014633.sig loading urchin/chloroticus-SRR1205884.sig loading urchin/chloroticus-SRR1205886.sig loading urchin/chloroticus-SRR1205888.sig loading urchin/chloroticus-SRR1205890.sig loading urchin/chloroticus-SRR1205894.sig loading urchin/chloroticus-SRR1205895.sig loading urchin/clarki-SRR1695478.sig loading urchin/clathrata-SRR1139195.sig loading urchin/echinata-SRR1138707.sig loading urchin/erythrogramma-SRR1211015.sig loading urchin/erythrogramma-SRR1211262.sig loading urchin/erythrogramma-SRR1211268.sig loading urchin/erythrogramma-SRR1211283.sig loading urchin/erythrogramma-SRR1211414.sig loading urchin/erythrogramma-SRR1211420.sig loading urchin/erythrogramma-SRR1211439.sig loading urchin/erythrogramma-SRR1211445.sig loading urchin/erythrogramma-SRR1211507.sig loading urchin/erythrogramma-SRR1212571.sig loading urchin/erythrogramma-SRR1212573.sig loading urchin/erythrogramma-SRR1212574.sig loading urchin/erythrogramma-SRR1212919.sig loading urchin/erythrogramma-SRR1212920.sig loading urchin/excentricus-SRR2844623.sig loading urchin/eximia-SRR3217906.sig loading urchin/filiformis-SRR1523743.sig loading urchin/filiformis-SRR1533125.sig loading 
urchin/filiformis-SRR789489.sig loading urchin/filiformis-SRR794568.sig loading urchin/filiformis-SRR794587.sig loading urchin/forbesi-SRR1138708.sig loading urchin/glacialis-SRR1139196.sig loading urchin/granularis-SRR1139199.sig loading urchin/helianthoides-SRR1708165.sig loading urchin/helianthoides-SRR1708168.sig loading urchin/helianthoides-SRR1708189.sig loading urchin/helianthoides-SRR1708266.sig loading urchin/helianthoides-SRR1708282.sig loading urchin/helianthoides-SRR1708289.sig loading urchin/helianthoides-SRR1708296.sig loading urchin/heros-SRR3217908.sig loading urchin/intermedius-SRR1061354.sig loading urchin/intermedius-SRR1061355.sig loading urchin/intermedius-SRR1061356.sig loading urchin/japonica-SRR1138706.sig loading urchin/japonicus-ERR1193930.sig loading urchin/japonicus-ERR1193931.sig loading urchin/japonicus-ERR1193932.sig loading urchin/japonicus-SRR1139215.sig loading urchin/japonicus-SRR1185973.sig loading urchin/japonicus-SRR1216681.sig loading urchin/japonicus-SRR2089767.sig loading urchin/japonicus-SRR2128002.sig loading urchin/japonicus-SRR2442873.sig loading urchin/japonicus-SRR2442874.sig loading urchin/japonicus-SRR2442875.sig loading urchin/japonicus-SRR414926.sig loading urchin/japonicus-SRR414927.sig loading urchin/japonicus-SRR414928.sig loading urchin/japonicus-SRR414929.sig loading urchin/japonicus-SRR414930.sig loading urchin/laudata-SRR3217920.sig loading urchin/leucospilota-DRR023762.sig loading urchin/leucospilota-DRR023763.sig loading urchin/lividus-SRR1664663.sig loading urchin/lividus-SRR1735496.sig loading urchin/lividus-SRR1735497.sig loading urchin/lividus-SRR1735498.sig loading urchin/lividus-SRR1735499.sig loading urchin/lividus-SRR1735500.sig loading urchin/lividus-SRR1735501.sig loading urchin/longicauda-SRR1325046.sig loading urchin/longicauda-SRR1325047.sig loading urchin/longicauda-SRR1325048.sig loading urchin/longicauda-SRR1325049.sig loading urchin/longicauda-SRR1325050.sig loading 
urchin/longicauda-SRR1325051.sig loading urchin/longicauda-SRR1325052.sig loading urchin/loveni-SRR3217902.sig loading urchin/lymani-SRR3217922.sig loading urchin/miniata-SRR1138705.sig loading urchin/miniata-SRR2454338.sig loading urchin/miniata-SRR573675.sig loading urchin/miniata-SRR573705.sig loading urchin/miniata-SRR573706.sig loading urchin/miniata-SRR573707.sig loading urchin/miniata-SRR573708.sig loading urchin/miniata-SRR573709.sig loading urchin/miniata-SRR573710.sig loading urchin/ochraceus-SRR1139197.sig loading urchin/parma-SRR1139193.sig loading urchin/parvimensis-SRR496203.sig loading urchin/parvimensis-SRR496204.sig loading urchin/pectinifera-DRR023760.sig loading urchin/pectinifera-DRR023761.sig loading urchin/pectinifera-SRR1139200.sig loading urchin/pectinifera-SRR1141045.sig loading urchin/planci-SRR1197243.sig loading urchin/purpuratus-SRR1012313.sig loading urchin/purpuratus-SRR1012339.sig loading urchin/purpuratus-SRR1012340.sig loading urchin/purpuratus-SRR1012342.sig loading urchin/purpuratus-SRR1012401.sig loading urchin/purpuratus-SRR1012403.sig loading urchin/purpuratus-SRR1041572.sig loading urchin/purpuratus-SRR1041798.sig loading urchin/purpuratus-SRR1041901.sig loading urchin/purpuratus-SRR1042009.sig loading urchin/purpuratus-SRR1042830.sig loading urchin/purpuratus-SRR1042834.sig loading urchin/purpuratus-SRR1042838.sig loading urchin/purpuratus-SRR1042899.sig loading urchin/purpuratus-SRR1043060.sig loading urchin/purpuratus-SRR1043069.sig loading urchin/purpuratus-SRR1139792.sig loading urchin/purpuratus-SRR1765910.sig loading urchin/purpuratus-SRR1765938.sig loading urchin/purpuratus-SRR1765978.sig loading urchin/purpuratus-SRR1765979.sig loading urchin/purpuratus-SRR1765980.sig loading urchin/purpuratus-SRR1765981.sig loading urchin/purpuratus-SRR1765982.sig loading urchin/purpuratus-SRR1765983.sig loading urchin/purpuratus-SRR1765984.sig loading urchin/purpuratus-SRR1765986.sig loading urchin/purpuratus-SRR1765988.sig loading 
urchin/purpuratus-SRR1765991.sig loading urchin/purpuratus-SRR3017856.sig loading urchin/purpuratus-SRR3017857.sig loading urchin/purpuratus-SRR531843.sig loading urchin/purpuratus-SRR531853.sig loading urchin/purpuratus-SRR531860.sig loading urchin/purpuratus-SRR531949.sig loading urchin/purpuratus-SRR531950.sig loading urchin/purpuratus-SRR531951.sig loading urchin/purpuratus-SRR531952.sig loading urchin/purpuratus-SRR531953.sig loading urchin/purpuratus-SRR531954.sig loading urchin/purpuratus-SRR531955.sig loading urchin/purpuratus-SRR531956.sig loading urchin/purpuratus-SRR531957.sig loading urchin/purpuratus-SRR531958.sig loading urchin/purpuratus-SRR531964.sig loading urchin/purpuratus-SRR531996.sig loading urchin/purpuratus-SRR532046.sig loading urchin/purpuratus-SRR532055.sig loading urchin/purpuratus-SRR532074.sig loading urchin/purpuratus-SRR532121.sig loading urchin/purpuratus-SRR532143.sig loading urchin/purpuratus-SRR532151.sig loading urchin/purpuratus-SRR533746.sig loading urchin/resiliens-SRR3217919.sig loading urchin/rotundus-DRR023764.sig loading urchin/rubens-SRR1139190.sig loading urchin/rubens-SRR1685733.sig loading urchin/rubens-SRR3087891.sig loading urchin/schayeri-SRR3217901.sig loading urchin/serratissima-SRR3097584.sig loading urchin/sp.-SRR1139191.sig loading urchin/sp.-SRR1139194.sig loading urchin/sp.-SRR1695483.sig loading urchin/spinulosus-SRR1139455.sig loading urchin/tenuis-SRR3217898.sig loading urchin/tribuloides-SRR1138704.sig loading urchin/variegatus-SRR1139214.sig loading urchin/variegatus-SRR1660831.sig loading urchin/variegatus-SRR1660833.sig loading urchin/variegatus-SRR1661075.sig loading urchin/variegatus-SRR1661077.sig loading urchin/variegatus-SRR1661079.sig loading urchin/variegatus-SRR1661081.sig loading urchin/variegatus-SRR1661090.sig loading urchin/variegatus-SRR1661111.sig loading urchin/variegatus-SRR1661112.sig loading urchin/variegatus-SRR1661113.sig loading urchin/variegatus-SRR1661363.sig loading 
urchin/variegatus-SRR1661395.sig loading urchin/variegatus-SRR1661397.sig loading urchin/variegatus-SRR1661399.sig loading urchin/variegatus-SRR1661401.sig loading urchin/variegatus-SRR1661406.sig loading urchin/variegatus-SRR1661409.sig loading urchin/wendtii-SRR2845427.sig loading urchin/wilsoni-SRR3217896.sig min similarity in matrix: 0.0 saving labels to: urchin.npy.labels.txt saving distance matrix to: urchin.npy success
%%capture
# Load the all-vs-all urchin matrix written by the `sourmash compare` step
# above, together with its labels file.
D_filename = 'urchin.npy'
D, labeltext = load_matrix_and_labels(D_filename)
# Both label options are switched off, so the dendrogram is drawn without
# leaf labels and the function prints an index -> label table instead.
fig = plot_composite_matrix(D, labeltext, show_labels=False, show_indices=False)
pylab.savefig('sourmash-urchin.png')
# Bare expression: evaluates to the figure as the cell's result.
fig
Conclusion: Patiria miniata and Patiria pectinifera group together, agreeing with recent reclassifications (h/t Dan Rokhsar).
!sourmash compare urchin/miniata*.sig urchin/pectinifera*.sig -o patiria.npy > /dev/null \
&& echo success || echo fail
# running sourmash subcommand: compare loading urchin/miniata-SRR1138705.sig loading urchin/miniata-SRR2454338.sig loading urchin/miniata-SRR573675.sig loading urchin/miniata-SRR573705.sig loading urchin/miniata-SRR573706.sig loading urchin/miniata-SRR573707.sig loading urchin/miniata-SRR573708.sig loading urchin/miniata-SRR573709.sig loading urchin/miniata-SRR573710.sig loading urchin/pectinifera-DRR023760.sig loading urchin/pectinifera-DRR023761.sig loading urchin/pectinifera-SRR1139200.sig loading urchin/pectinifera-SRR1141045.sig min similarity in matrix: 0.0719999969006 saving labels to: patiria.npy.labels.txt saving distance matrix to: patiria.npy success
# Load the miniata-vs-pectinifera matrix written by the `sourmash compare`
# step above, together with its labels file.
D_filename = 'patiria.npy'
D, labeltext = load_matrix_and_labels(D_filename)
# Default options: dendrogram leaves are labelled with the full label text.
fig = plot_composite_matrix(D, labeltext)
pylab.savefig('sourmash-patiria.png')