#!/usr/bin/env python
# coding: utf-8

# # sourmash: working with private collections of signatures

# ### What is this?
# 
# This is a Jupyter Notebook using Python 3. If you are running this via [binder](https://mybinder.org), you can use Shift-ENTER to run cells, and double-click on code cells to edit them.

# ## download a bunch of genomes

# In[1]:


# Fetch the example genome archive and unpack it into big_genomes/.
# `mkdir -p` keeps this cell re-runnable when the directory already exists.
get_ipython().system('mkdir -p big_genomes')
# The URL is double-quoted so the shell can never treat its `?` as a glob
# character, and -f makes curl report an HTTP error itself instead of
# streaming the server's error page into tar.
get_ipython().system('curl -fL "https://osf.io/8uxj9/?action=download" | (cd big_genomes && tar xzf -)')


# ## compute signatures for each file

# In[2]:


# Sketch every FASTA file in big_genomes/ (DNA, k=31, scaled=1000). The
# command runs from inside that directory, which is where later cells look
# for the resulting big_genomes/*.sig files.
sketch_cmd = 'cd big_genomes/ && sourmash sketch dna -p k=31,scaled=1000 --name-from-first *.fa'
get_ipython().system(sketch_cmd)


# ## Compare them all

# In[3]:


# Compare all signatures against each other, saving the matrix to
# compare_all.mat, then hand that matrix to `sourmash plot`.
for compare_step in (
    'sourmash compare big_genomes/*.sig -o compare_all.mat',
    'sourmash plot compare_all.mat',
):
    get_ipython().system(compare_step)


# In[4]:


from IPython.display import Image

# Show the comparison matrix image inline (presumably written by the
# `sourmash plot` step above). The Image object must stay the last
# expression of the cell for the notebook to render it.
matrix_png = 'compare_all.mat.matrix.png'
Image(filename=matrix_png)


# ## make a fast(er) search database for all of them

# In[5]:


# Index every signature in big_genomes/ (k=31) into a database named
# all-genomes, which the search and gather cells below query.
index_cmd = 'sourmash index -k 31 all-genomes big_genomes/*.sig'
get_ipython().system(index_cmd)


# You can now use this database with `sourmash search` and `sourmash gather`.

# In[6]:


# Search the all-genomes database with a single query signature, using
# --threshold=0.001.
# NOTE(review): shew_os185.fa.sig is read from the working directory, but the
# sketches made in this file land in big_genomes/ -- presumably the query
# comes from an earlier notebook; confirm.
ipython_shell = get_ipython()
ipython_shell.system('sourmash search shew_os185.fa.sig all-genomes --threshold=0.001')


# In[7]:


# Recreate the fake metagenome in case it is missing or stale: concatenate
# the genomes, remove any old signature, then sketch the combined file.
# NOTE(review): genomes/ is not created anywhere in this file (only
# big_genomes/ is) -- presumably it comes from an earlier notebook; confirm.
for metagenome_step in (
    'cat genomes/*.fa > fake-metagenome.fa',
    'rm -f fake-metagenome.fa.sig',
    'sourmash sketch dna -p k=31,scaled=1000 fake-metagenome.fa',
):
    get_ipython().system(metagenome_step)


# In[8]:


# Run gather with the fake metagenome signature as the query against the
# all-genomes database.
gather_cmd = 'sourmash gather fake-metagenome.fa.sig all-genomes'
get_ipython().system(gather_cmd)


# ## build a database with taxonomic information
# 
# For this, we need to provide a metadata file that maps each accession to its taxonomic information.

# In[9]:


import pandas

# Load the lineage spreadsheet and leave the DataFrame as the cell's last
# expression so the notebook renders it as a table.
lineage_csv = 'podar-lineage.csv'
df = pandas.read_csv(lineage_csv)
df


# In[10]:


# Build the LCA database `taxdb` from the lineage spreadsheet and every
# signature in big_genomes/. See `sourmash lca index --help` for the meaning
# of -C and --split-identifiers.
lca_index_cmd = 'sourmash lca index podar-lineage.csv taxdb big_genomes/*.sig -C 3 --split-identifiers'
get_ipython().system(lca_index_cmd)


# The resulting database, 'taxdb.lca.json', can be used for search and gather as above:

# In[11]:


# Same gather query as before, this time against the LCA database file.
lca_gather_cmd = 'sourmash gather fake-metagenome.fa.sig taxdb.lca.json'
get_ipython().system(lca_gather_cmd)


# ...but can also be used for taxonomic summarization:

# In[12]:


# Summarize the fake metagenome taxonomically using the LCA database.
summarize_shell = get_ipython()
summarize_shell.system('sourmash lca summarize --query fake-metagenome.fa.sig --db taxdb.lca.json')