#!/usr/bin/env python
# coding: utf-8

# ### Intro to downstream ipyrad data visualization

# All of the modules below are included in the ipyrad conda installation.

# In[36]:

## import libraries
import ipyrad as ip      ## ipyrad
import numpy as np       ## array operations
import h5py              ## access hdf5 database file
import toyplot           ## my fav new plotting library
import toyplot.html      ## toyplot sublib for saving html plots

## print versions for posterity
print 'ipyrad', ip.__version__
print 'numpy', np.__version__
print 'h5py', h5py.__version__
print 'toyplot', toyplot.__version__


# ### Import the ipyrad Assembly class object

# I assembled the data set under three minimum cluster depth settings (6, 10, and 20). Below I load one of
# these assemblies as `data`. This is an ipyrad Assembly class object, which has many features and functions
# available to it. To see them, type the object name (e.g., data) followed by a period (.) and then press tab
# to list all of the available options.

# In[37]:

## import an Assembly object
data = ip.load_json("/home/deren/Downloads/pedicularis/pedictrim5.json")


# ### Open database files

# We also open the database file for the data set. This is the file with the suffix ".hdf5" that should have
# a path like: "[project_dir]/[name]_consens/[name].hdf5". The Assembly object saves the database file path
# under the attribute `.database`, which makes it easy to access. Below we open a read-only view to the hdf5
# database file.

# In[38]:

## load the hdf5 database
io5 = h5py.File(data.database, 'r')


# ### What's in the database file?

# The hdf5 database is compressed and sometimes quite large. If you moved your JSON file from a remote
# machine (e.g., an HPC cluster) to a local machine, you will have to update the `data.database` path to the
# location of the database file on your local machine.

# In[39]:

print 'location of my database file:\n ', data.database
print 'keys in the hdf5 database\n ', io5.keys()


# ### Access arrays

# The hdf5 database contains the following five arrays with the following dimensions.

# In[40]:

## This doesn't actually load the arrays into memory (they can be very large);
## it just stores a reference so the keys are easier to call.
#hcatg = io5["catgs"]    ## depth information (not edge filtered)
#hseqs = io5["seqs"]     ## sequence data (not edge filtered)
hsnps = io5["snps"]      ## snp locations (edge filtered)
hfilt = io5["filters"]   ## locus filters
hedge = io5["edges"]     ## edge filters

## array shapes and dtypes
#print hcatg
#print hseqs
print hsnps
print hfilt
print hedge
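# As a quick aside, h5py datasets support numpy-style slicing, so you can pull a small piece of an array out
# of the database without reading the whole thing into memory. Below is a minimal sketch; the locus index 0
# is arbitrary, and any valid index would work the same way.

# In[ ]:

## read just the first locus from the snps array (shape: maxlen x 2)
loc0 = io5["snps"][0]

## positions of variable sites in that locus
print 'variable site positions:', np.where(loc0[:, 0])[0]

## edge coordinates stored for the same locus
print 'edges for locus 0:', io5["edges"][0]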
# ### A function to filter the SNPs array

# Two versions are defined below: the first applies the per-locus edge mask before filtering, while the
# second (used for the analyses that follow) applies only the locus filters.

# In[41]:

def filter_snps(data):
    ## get h5 database
    io5 = h5py.File(data.database, 'r')
    hsnps = io5["snps"]      ## snp locations
    hfilt = io5["filters"]   ## locus filters
    hedge = io5["edges"]     ## edge filters

    ## read in a local copy of the full snps and edge arrays
    snps = hsnps[:]
    edge = hedge[:]

    ## print status
    print "prefilter {}\nshape {} = (nloci, maxlen, [var,pis])"\
          .format(data.name, snps.shape)
    print "total vars = {}".format(snps[:, :, 0].sum())

    ## apply edge filters to all loci in the snps array
    for loc in xrange(snps.shape[0]):
        a, b = edge[loc, :2]
        ## mask (zero out) all sites outside of the interval [a, b)
        mask = np.ones(snps.shape[1], dtype=bool)
        mask[a:b] = False
        snps[loc, mask, :] = 0

    ## get locus filter by summing across all filters
    locfilter = hfilt[:].sum(axis=1).astype(bool)

    ## apply filter to snps array
    fsnps = snps[~locfilter, ...]

    ## print new shape and sum
    print "postfilter {}\nshape {} = (nloci, maxlen, [var,pis])"\
          .format(data.name, fsnps.shape)
    print "total vars = {}".format(fsnps[:, :, 0].sum())

    ## clean up big objects
    del snps
    del edge

    ## return what we want
    return fsnps


# In[42]:

## a simpler version used below: it applies the locus filters but skips
## the per-locus edge mask (the edge lines are left commented out).
def filter_snps(data):
    ## get h5 database
    io5 = h5py.File(data.database, 'r')
    hsnps = io5["snps"]      ## snp locations
    hfilt = io5["filters"]   ## locus filters
    #hedge = io5["edges"]    ## edge filters

    ## read in a local copy of the full snps array
    snps = hsnps[:]
    #edge = hedge[:]

    ## get locus filter by summing across all filters
    locfilter = hfilt[:].sum(axis=1).astype(bool)

    ## apply filter to snps array
    fsnps = snps[~locfilter, ...]

    ## clean up big objects
    del snps

    ## return what we want
    return fsnps


# In[43]:

## apply the filter to the data set
fsnps = filter_snps(data)


# In[44]:

## sum of (var, pis) counts at each position across loci
fsnps.sum(axis=0)


# ### The snps array

# In[46]:

## the last dimension has two columns (var, pis)

## how many snps per locus
varlocs = fsnps[:, :, 0].sum(axis=1)
pislocs = fsnps[:, :, 1].sum(axis=1)
print varlocs[:5]

## how many snps per site (across loci)
persite = fsnps[:, :, :].sum(axis=0)
print persite[10:15]


# ### Get a color map

# In[47]:

colormap = toyplot.color.Palette()
colormap


# In[48]:

## deconstruct array into bins
vbars, vbins = np.histogram(varlocs, bins=range(0, varlocs.max()+2))
pbars, pbins = np.histogram(pislocs, bins=range(0, varlocs.max()+2))

## setup canvas and axes
canvas = toyplot.Canvas(width=350, height=300)
axes = canvas.cartesian(xlabel="n variable (or pis) sites",
                        ylabel="n loci w/ n var (or pis) sites",
                        gutter=50)

## set up x axis
axes.x.domain.max = 16
axes.x.spine.show = False
axes.x.ticks.labels.style = {"baseline-shift":"10px"}
axes.x.ticks.locator = toyplot.locator.Explicit(
    range(0, 16, 2),
    map(str, range(0, 16, 2)))

## set up y axis
axes.y.ticks.show = True
axes.y.label.style = {"baseline-shift":"35px"}
axes.y.ticks.labels.style = {"baseline-shift":"5px"}
axes.y.ticks.below = 0
axes.y.ticks.above = 5
axes.y.domain.min = 0
axes.y.domain.max = 10000
axes.y.ticks.locator = toyplot.locator.Explicit(
    range(0, 11000, 2500),
    map(str, range(0, 11000, 2500)))

## add bars
axes.bars(vbars, color=colormap[1], opacity=0.5)
axes.bars(pbars, color=colormap[0], opacity=0.5)

## or as a filled/smoothed plot
#x = np.arange(0, len(pbars))
#fill = axes.fill(x, vbars, color=colormap[0], opacity=0.5)
#fill = axes.fill(x, pbars, color=colormap[1], opacity=0.5)
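# A few quick summary numbers to go with the histogram above. This is a minimal sketch using only the
# `varlocs` and `pislocs` arrays computed earlier; nothing here is ipyrad-specific.

# In[ ]:

## mean and median number of variable sites per locus
print 'mean vars/locus   = {:.2f}'.format(varlocs.mean())
print 'median vars/locus = {}'.format(np.median(varlocs))

## proportion of loci with no variation at all
print 'invariant loci    = {:.2%}'.format((varlocs == 0).mean())

## proportion of variable sites that are parsimony informative
print 'pis / var         = {:.2%}'.format(float(pislocs.sum()) / varlocs.sum())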
# ### Variation along the length of reads

# This data set includes 75bp reads sequenced on an Illumina GAIIx. We know that the error rate increases
# along the length of reads, and that the error rate was a bit higher in this older type of data than it is
# in more recent sequencing technology.

# In[49]:

## the snps array is longer than the actual seq length (it's a bit padded),
## so we want to lop the extra off the end. Get the last position w/ data.
maxend = np.where(fsnps[:, :, :].sum(axis=0).sum(axis=1) != 0)[0].max()

## all variable sites (including autapomorphies)
distvar = np.sum(fsnps[:, :maxend+1, 0].astype(int), axis=0)
print(distvar)

## synapomorphies (just pis)
distpis = fsnps[:, :maxend+1, 1].sum(axis=0)
print(distpis)

## index of the last position with data
print 'maxend = ', maxend


# ### Plot distribution of variation along length of RAD loci

# In[50]:

def SNP_position_plot(distvar, distpis):
    ## set color theme
    colormap = toyplot.color.Palette()

    ## make a canvas
    canvas = toyplot.Canvas(width=800, height=300)

    ## make axes
    axes = canvas.cartesian(xlabel="Position along RAD loci",
                            ylabel="N variable sites",
                            gutter=65)

    ## x-axis
    axes.x.ticks.show = True
    axes.x.label.style = {"baseline-shift":"-40px", "font-size":"16px"}
    axes.x.ticks.labels.style = {"baseline-shift":"-2.5px", "font-size":"12px"}
    axes.x.ticks.below = 5
    axes.x.ticks.above = 0
    axes.x.domain.max = maxend
    axes.x.ticks.locator = toyplot.locator.Explicit(
        range(0, maxend, 5),
        map(str, range(0, maxend, 5)))

    ## y-axis
    axes.y.ticks.show = True
    axes.y.label.style = {"baseline-shift":"40px", "font-size":"16px"}
    axes.y.ticks.labels.style = {"baseline-shift":"5px", "font-size":"12px"}
    axes.y.ticks.below = 0
    axes.y.ticks.above = 5

    ## add fill plots
    x = np.arange(0, maxend+1)
    f1 = axes.fill(x, distvar, color=colormap[0], opacity=0.5,
                   title="total variable sites")
    f2 = axes.fill(x, distpis, color=colormap[1], opacity=0.5,
                   title="parsimony informative sites")

    ## add horizontal dashed lines at the median N snps per site
    axes.hlines(np.median(distvar), opacity=0.9, style={"stroke-dasharray":"4, 4"})
    axes.hlines(np.median(distpis), opacity=0.9, style={"stroke-dasharray":"4, 4"})

    return canvas, axes


# In[51]:

canvas, axes = SNP_position_plot(distvar, distpis)

## save fig
toyplot.html.render(canvas, 'snp_positions.html')

## show fig
canvas
#axes


# ### Possible reasons for the uptick in SNPs towards the end of reads:

# + Sequencing error rate increases along the length of reads
# + Poor muscle alignment at the end of reads
#
# I think the more likely explanation is poor alignment towards the end of reads, for two reasons. First, if
# the cause were sequencing errors, then we would expect an excess of autapomorphies at the end of reads at
# least as large as the excess of synapomorphies we observe, but that is not the case. And second, increasing
# the minimum depth does not fix the problem.

# ### Apply a more stringent edge filter

# The edge filter takes four values in the order (R1left, R1right, R2left, R2right). Setting R1right to -10
# trims 10 bases from the right side of R1 reads.

# In[ ]:

## make a new branch of cyatho-d6-min4
## (assumes the cyatho-d6-min4 assembly was loaded earlier as data1)
data5 = data1.branch('cyatho-d6-min4-trim')

## edge filter order is (R1left, R1right, R2left, R2right);
## we set R1right to -10, which trims 10 bases from the right side
data5.set_params('edge_filter', "4, -10, 4, 4")

## run step 7 to fill a new database w/ filters
data5.run('7', force=True)


# In[169]:

data5 = ip.load_json("~/Downloads/pedicularis/cyatho-d12-min4")


# In[170]:

## filter the snps
fsnps = filter_snps(data5)

## trim non-data
maxend = np.where(fsnps[:, :, :].sum(axis=0).sum(axis=1) != 0)[0].max()

## all variable sites (including autapomorphies)
distvar = np.sum(fsnps[:, :maxend+1, 0].astype(int), axis=0)
print(distvar)

## synapomorphies (just pis)
distpis = fsnps[:, :maxend+1, 1].sum(axis=0)
print(distpis)

## index of the last position with data
print 'maxend = ', maxend


# In[171]:

SNP_position_plot(distvar, distpis)
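# As a sanity check, you can also approximate a right-side trim directly in numpy on any filtered snps
# array, without re-running any assembly steps. This is only a rough sketch (it clips a fixed number of
# columns rather than re-aligning anything), not how ipyrad applies the edge filter internally; `trim` and
# `tsnps` are just local names used for the comparison.

# In[ ]:

## drop the last 10 positions from the filtered snps array
trim = 10
tsnps = fsnps[:, :maxend+1-trim, :]

## recompute the per-position distributions on the clipped array
tdistvar = tsnps[:, :, 0].sum(axis=0)
tdistpis = tsnps[:, :, 1].sum(axis=0)

## compare total variable sites before and after the clip
print 'vars before clip =', int(distvar.sum())
print 'vars after clip  =', int(tdistvar.sum())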
# ### Why the right side but not the left?

# I found early on that leaving the cut site attached to the left side of reads improved assemblies by
# acting as an anchor, which allowed gap openings to arise in the early parts of the read without being
# treated differently, i.e., as terminal gap openings. For some reason it didn't occur to me to similarly
# anchor the right side of reads. Let's see what happens if I add an invariant anchor to the right side of
# reads.
#
# Another point worth noting: this increase in variation along the length of reads is not observed in
# simulated data (checked below), suggesting it is inherent to real data.

# In[15]:

simdata = ip.load_json("~/Documents/ipyrad/tests/cli/cli.json")

## filter the snps
fsnps = filter_snps(simdata)

## trim non-data
maxend = np.where(fsnps[:, :, :].sum(axis=0).sum(axis=1) != 0)[0].max()

## all variable sites (including autapomorphies)
distvar = np.sum(fsnps[:, :maxend+1, 0].astype(int), axis=0)

## synapomorphies (just pis)
distpis = fsnps[:, :maxend+1, 1].sum(axis=0)

SNP_position_plot(distvar, distpis)


# ### Results when I add a right-side anchor

# I now add a five-base anchor just before muscle alignment, and then strip it off again after aligning. I
# include an added check to make sure that the anchor is definitely not left on the read, in case muscle
# tried to open a gap inside the anchor.

# In[172]:

## import the new version of ipyrad w/ this update
import ipyrad as ip
print ip.__version__
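# To repeat this comparison across several assemblies, the two functions defined above can be chained in a
# small helper. This is a minimal sketch; `plot_snp_positions` is a hypothetical wrapper, and the JSON paths
# in the usage example are placeholders for your own assemblies. Note that `SNP_position_plot` reads
# `maxend` from the global scope, so the helper assigns it there before plotting.

# In[ ]:

def plot_snp_positions(json_path, out_html):
    ## load the assembly and filter its snps array
    tmpdata = ip.load_json(json_path)
    tmpsnps = filter_snps(tmpdata)

    ## trim padded, data-free columns from the end;
    ## SNP_position_plot uses the global `maxend` for its axis setup
    global maxend
    maxend = np.where(tmpsnps.sum(axis=0).sum(axis=1) != 0)[0].max()

    ## per-position counts of all variable sites and of pis only
    dvar = tmpsnps[:, :maxend+1, 0].astype(int).sum(axis=0)
    dpis = tmpsnps[:, :maxend+1, 1].sum(axis=0)

    ## build and save the plot
    canvas, axes = SNP_position_plot(dvar, dpis)
    toyplot.html.render(canvas, out_html)
    return canvas

## example usage with placeholder paths
#for name in ['assembly1', 'assembly2']:
#    plot_snp_positions("~/Downloads/pedicularis/{}.json".format(name),
#                       "{}_snp_positions.html".format(name))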