#!/usr/bin/env python
# coding: utf-8

# ipyrad-analysis toolkit: STRUCTURE
#
# Structure v.2.3.4 is a standard tool for examining population genetic
# structure based on allele frequencies within and among populations.
# Although many new implementations of the structure algorithm have been
# developed in recent years offering improvements to speed, the classic
# tool offers a number of useful options that keep it relevant to this day.

# Required software (conda):
#   conda install ipyrad -c bioconda
#   conda install -c bioconda -c ipyrad structure clumpp
#   conda install toyplot -c eaton-lab

import ipyrad.analysis as ipa
import toyplot

# Required input data: a .snps.hdf5 database file produced by ipyrad.
# If you do not have this you can generate it from any VCF file following
# the vcf2hdf5 tool tutorial:
# https://ipyrad.readthedocs.io/en/latest/API-analysis/cookbook-vcf2hdf5.html
# The database file contains the genotype calls as well as linkage
# information used for subsampling unlinked SNPs and bootstrap resampling.

# the path to your .snps.hdf5 database file
data = "/home/deren/Downloads/ref_pop2.snps.hdf5"

# NOTE on missing data in STRUCTURE analyses:
# Structure infers the values of missing data while it runs the MCMC chain.
# No imputation is required, but it will perform more accurately when there
# is less missing data and base calls are more accurate. Recommendation:
# do not impute; simply filter fairly stringently.
#
# Approximate run times: this example data set should probably be run for a
# longer burnin and more reps for publication. For reference, it takes about
# 2.5 hours to run 12 jobs on a 4-core laptop with 27 samples and ~125K SNPs.
# More samples or SNPs take longer; 2X as many cores runs ~2X faster.
# Population assignments (the imap dictionary). With the "sample" input
# method these groupings are used for filtering, color coding plots, and
# imputation; with the "kmeans" imputing method they are used only for
# filtering and color coding plots.
imap = {
    "virg": ["TXWV2", "LALC2", "SCCU3", "FLSF33", "FLBA140"],
    "mini": ["FLSF47", "FLMO62", "FLSA185", "FLCK216"],
    "gemi": ["FLCK18", "FLSF54", "FLWO6", "FLAB109"],
    "sagr": ["CUVN10", "CUCA4", "CUSV6"],
    "oleo": ["CRL0030", "HNDA09", "BZBB1", "MXSA3017"],
    "fusi": ["MXED8", "MXGT4", "TXGR3", "TXMD3"],
    "bran": ["BJSL25", "BJSB3", "BJVL19"],
}

# require that 50% of samples have data in each group
minmap = {pop: 0.5 for pop in imap}

# Enter data file and params. The struct analysis object takes input data
# as the .snps.hdf5 file produced by ipyrad; all other parameters are
# optional. minmap filters SNPs to those with data for at least some
# proportion of samples in every imap group; mincov filters SNPs shared
# across less than some proportion of ALL samples (no imap groupings).
# On init the object loads the data and applies filtering, printing how
# many SNPs each filter removed and the remaining missing data — those
# remaining missing values are the ones filled by imputation.
struct = ipa.structure(
    name="test",
    data=data,
    imap=imap,
    minmap=minmap,
    mincov=0.9,
)
# Run STRUCTURE and plot results. burnin and numreps determine the length
# of the run; for analyses with many samples and larger values of K you
# should use much larger values than these.
struct.mainparams.burnin = 5000
struct.mainparams.numreps = 10000

struct.run(nreps=3, kpop=[2, 3, 4, 5], auto=True)

# --- Analyze results: choosing K ---
etable = struct.get_evanno_table([2, 3, 4, 5])
etable

# get canvas object and set size
canvas = toyplot.Canvas(width=400, height=300)

# plot the mean log probability of the models in red
axes = canvas.cartesian(ylabel="estLnProbMean")
axes.plot(etable.estLnProbMean * -1, color="darkred", marker="o")
axes.y.spine.style = {"stroke": "darkred"}

# plot delta K on a shared x-axis with its own y scale on the other side,
# in blue; pad ymax by 25% so the curve doesn't touch the frame
axes = axes.share(
    "x",
    ylabel="deltaK",
    ymax=etable.deltaK.max() + etable.deltaK.max() * .25,
)
axes.plot(etable.deltaK, color="steelblue", marker="o")
axes.y.spine.style = {"stroke": "steelblue"}

# set x labels
axes.x.ticks.locator = toyplot.locator.Explicit(
    range(len(etable.index)), etable.index)
axes.x.label.text = "K (N ancestral populations)"

# --- Analyze results: barplots ---
k = 3
table = struct.get_clumpp_table(k)

# sort rows by the k ancestry-proportion columns
table.sort_values(by=list(range(k)), inplace=True)

# or, sort by a list of names (here taken from imap)
import itertools
onames = list(itertools.chain(*imap.values()))
table = table.loc[onames]

# build barplot
canvas = toyplot.Canvas(width=500, height=250)
axes = canvas.cartesian(bounds=("10%", "90%", "10%", "45%"))
axes.bars(table)

# add labels to x-axis
# FIX: the original wrapped tolist() in a no-op list comprehension
# ([i for i in ...]); tolist() already returns a fresh list.
ticklabels = table.index.tolist()
axes.x.ticks.locator = toyplot.locator.Explicit(labels=ticklabels)
axes.x.ticks.labels.angle = -60
axes.x.ticks.show = True
axes.x.ticks.labels.offset = 10
axes.x.ticks.labels.style = {"font-size": "12px"}

# Cookbook