#!/usr/bin/env python # coding: utf-8 #

ipyrad-analysis toolkit: Dimensionality reduction

# # The `pca` tool can be used to implement a number of dimensionality reduction methods on SNP data (PCA, t-SNE, UMAP) and to filter and/or impute missing data in genotype matrices to reduce the effects of missing data. # ### Load libraries # In[2]: # conda install ipyrad -c conda-forge -c bioconda # conda install ipcoal -c conda-forge # conda install scikit-learn -c conda-forge # In[3]: import ipyrad.analysis as ipa import toyplot # In[4]: print(ipa.__version__) print(toyplot.__version__) # ### The input data # In[5]: # the simulated SNP database file SNPS = "/tmp/oaks.snps.hdf5" # In[7]: # download example hdf5 dataset (158Mb, takes ~2-3 minutes) URL = "https://www.dropbox.com/s/x6a4i47xqum27fo/virentes_ref.snps.hdf5?raw=1" ipa.download(url=URL, path=SNPS); # ### Make an IMAP dictionary (map popnames to list of samplenames) # In[111]: IMAP = { "virg": ["LALC2", "TXWV2", "FLBA140", "FLSF33", "SCCU3"], "mini": ["FLSF47", "FLMO62", "FLSA185", "FLCK216"], "gemi": ["FLCK18", "FLSF54", "FLWO6", "FLAB109"], "bran": ["BJSL25", "BJSB3", "BJVL19"], "fusi": ["MXED8", "MXGT4", "TXMD3", "TXGR3"], "sagr": ["CUCA4", "CUSV6", "CUVN10"], "oleo": ["MXSA3017", "BZBB1", "HNDA09", "CRL0030", "CRL0001"], } MINMAP = { "virg": 3, "mini": 3, "gemi": 3, "bran": 2, "fusi": 2, "sagr": 2, "oleo": 3, } # ### Initiate tool with filtering options # In[112]: tool = ipa.pca(data=SNPS, minmaf=0.05, imap=IMAP, minmap=MINMAP, impute_method="sample") # ### Run PCA # Unlinked SNPs are automatically sampled from each locus. By setting `nreplicates=N` the subsampling procedure is repeated N times to show variation over the subsampled SNPs. The imap dictionary is used in the `.draw()` function to color points, and can be overriden to color points differently from the IMAP used in the tool above. # In[42]: tool.run(nreplicates=10) tool.draw(imap=IMAP); # In[43]: # a convenience function for plotting across three axes tool.draw_panels(0, 1, 2, imap=IMAP); # ### Run TSNE # t-SNE is a manifold learning algorithm that can sometimes better project data into a 2-dimensional plane. The distances between points in this space are harder to interpret. # In[44]: tool.run_tsne(perplexity=5, seed=333) tool.draw(imap=IMAP); # ### Run UMAP # UMAP is similar to t-SNE but the distances between clusters are more representative of the differences betwen groups. This requires another package that if it is not yet installed it will ask you to install. # In[52]: tool.run_umap(n_neighbors=13, seed=333) tool.draw(imap=IMAP); # ### Missing data with imputation # Missing data has large effects on dimensionality reduction methods, and it is best to (1) minimize the amount of missing data in your input data set by using filtering, and (2) impute missing data values. In the examples above data is imputed using the 'sample' method, which probabilistically samples alleles for based on the allele frequency in the group that a taxon is assigned to in IMAP. It is good to compare this to a case where imputation is performed without IMAP assignments, to assess the impact of the *a priori* assignments. Although this comparison is useful, assigning taxa to groups with IMAP dictionaries for imputation is expected to yield more accurate imputation. # In[61]: # allow very little missing data import itertools tool = ipa.pca( data=SNPS, imap={'samples': list(itertools.chain(*[i for i in IMAP.values()]))}, minmaf=0.05, mincov=0.9, impute_method="sample", quiet=True, ) tool.run(nreplicates=10, seed=123) tool.draw(imap=IMAP); # ### Statistics # In[65]: # variance explained by each PC axes in the first replicate run tool.variances[0].round(2) # In[74]: # PC loadings in the first replicate tool.pcs(0) # ### Styling plots (see toyplot documentation) # The `.draw()` function returns a canvas and axes object from toyplot which can be further modified and styled. # In[115]: # get plot objects, several styling options to draw canvas, axes = tool.draw(imap=IMAP, size=8, width=400); # various axes styling options shown for x axis axes.x.ticks.show = True axes.x.spine.style['stroke-width'] = 1.5 axes.x.ticks.labels.style['font-size'] = '13px' axes.x.label.style['font-size'] = "15px" axes.x.label.offset = "22px"