#!/usr/bin/env python
# coding: utf-8

# # sourmash: working with private collections of signatures
#
# ### Running this notebook.
#
# You can run this notebook interactively via mybinder; click on this button:
# [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/dib-lab/sourmash/master?filepath=doc%2Fsourmash-collections.ipynb)
#
# A rendered version of this notebook is available at [sourmash.readthedocs.io](https://sourmash.readthedocs.io) under "Tutorials and notebooks".
#
# You can also get this notebook from the [doc/ subdirectory of the sourmash github repository](https://github.com/dib-lab/sourmash/tree/master/doc). See [binder/environment.yml](https://github.com/dib-lab/sourmash/blob/master/binder/environment.yml) for installation dependencies.
#
# ### What is this?
#
# This is a Jupyter Notebook using Python 3. If you are running this via [binder](https://mybinder.org), you can use Shift-ENTER to run cells, and double click on code cells to edit them.
#
# Contact: C. Titus Brown, ctbrown@ucdavis.edu. Please [file issues on GitHub](https://github.com/dib-lab/sourmash/issues/) if you have any questions or comments!

# NOTE: get_ipython() is not defined or imported in this script; it is
# supplied by the IPython/Jupyter runtime, so run this export with `ipython`
# (or as a notebook) rather than plain `python`.

# ## download a bunch of genomes

# In[1]:


# Create the target directory, then stream the downloaded archive straight
# into tar so it is unpacked inside big_genomes/.
get_ipython().system('mkdir -p big_genomes')
get_ipython().system('curl -L https://osf.io/8uxj9/?action=download | (cd big_genomes && tar xzf -)')


# ## compute signatures for each file

# In[2]:


# Runs inside big_genomes/ so the *.fa glob matches the unpacked genomes.
get_ipython().system('cd big_genomes/ && sourmash compute -k 31 --scaled=1000 --name-from-first *.fa')


# ## Compare them all

# In[3]:


# Write the comparison to compare_all.mat, then plot from that file.
get_ipython().system('sourmash compare big_genomes/*.sig -o compare_all.mat')
get_ipython().system('sourmash plot compare_all.mat')


# In[4]:


from IPython.display import Image
# Last expression of the cell: rendered as the cell output in a notebook.
Image(filename='compare_all.mat.matrix.png')


# ## make a fast(er) search database for all of them

# In[5]:


get_ipython().system('sourmash index -k 31 all-genomes big_genomes/*.sig')


# You can now use this to search and gather.
# In[6]:


# Query one signature against the 'all-genomes' database.
get_ipython().system('sourmash search shew_os185.fa.sig all-genomes --threshold=0.001')


# In[7]:


# (make fake metagenome again, just in case)
# Concatenate every genome under genomes/ into one FASTA, then compute its
# signature with the same -k / --scaled values used elsewhere in this script.
get_ipython().system('cat genomes/*.fa > fake-metagenome.fa')
get_ipython().system('sourmash compute -k 31 --scaled=1000 fake-metagenome.fa')


# In[8]:


get_ipython().system('sourmash gather fake-metagenome.fa.sig all-genomes')


# # build a database with taxonomic information --
#
# for this, we need to provide a metadata file that contains accession => tax information.

# In[9]:


import pandas
df = pandas.read_csv('podar-lineage.csv')
# Last expression of the cell: rendered as the cell output in a notebook.
df


# In[10]:


# Build the 'taxdb' LCA database from the same lineage CSV loaded above.
get_ipython().system('sourmash lca index podar-lineage.csv taxdb big_genomes/*.sig -C 3 --split-identifiers')


# This database 'taxdb.lca.json' can be used for search and gather as above:

# In[11]:


get_ipython().system('sourmash gather fake-metagenome.fa.sig taxdb.lca.json')


# ...but can also be used for taxonomic summarization:

# In[12]:


get_ipython().system('sourmash lca summarize --query fake-metagenome.fa.sig --db taxdb.lca.json')


# ## A full list of notebooks
#
# [An introduction to k-mers for genome comparison and analysis](kmers-and-minhash.ipynb)
#
# [Some sourmash command line examples!](sourmash-examples.ipynb)
#
# [Working with private collections of signatures.](sourmash-collections.ipynb)