#!/usr/bin/env python # coding: utf-8 # # Using the `LCA_Database` API # `LCA_Database` objects combine a fast in-memory storage of signatures # indexed by their hash values, with taxonomic lineage storage. They are # limited to storing scaled DNA signatures with a single ksize; the scaled # and ksize values are specified at creation. # ### Running this notebook. # # You can run this notebook interactively via mybinder; click on this button: # [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/dib-lab/sourmash/latest?filepath=doc%2Fusing-LCA-database-API.ipynb) # # A rendered version of this notebook is available at [sourmash.readthedocs.io](https://sourmash.readthedocs.io) under "Tutorials and notebooks". # # You can also get this notebook from the [doc/ subdirectory of the sourmash github repository](https://github.com/dib-lab/sourmash/tree/latest/doc). See [binder/environment.yaml](https://github.com/dib-lab/sourmash/blob/latest/binder/environment.yml) for installation dependencies. # # ### What is this? # # This is a Jupyter Notebook using Python 3. If you are running this via [binder](https://mybinder.org), you can use Shift-ENTER to run cells, and double click on code cells to edit them. # # Contact: C. Titus Brown, ctbrown@ucdavis.edu. Please [file issues on GitHub](https://github.com/dib-lab/sourmash/issues/) if you have any questions or comments! # ## Creating an `LCA_Database` object # # Create an `LCA_Database` like so: # In[1]: import sourmash db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) # Create signatures for some genomes, load them, and add them: # In[2]: get_ipython().system('sourmash compute --name-from-first -k 31 --scaled=1000 genomes/*') # In[3]: sig1 = sourmash.load_one_signature('akkermansia.fa.sig', ksize=31) sig2 = sourmash.load_one_signature('shew_os185.fa.sig', ksize=31) sig3 = sourmash.load_one_signature('shew_os223.fa.sig', ksize=31) # In[4]: db.insert(sig1, ident='akkermansia') db.insert(sig2, ident='shew_os185') db.insert(sig3, ident='shew_os223') # ## Run `search` and `gather` via the `Index` API # In[5]: from pprint import pprint pprint(db.search(sig1, threshold=0.1)) # In[6]: pprint(db.search(sig2, threshold=0.1)) # In[7]: pprint(db.gather(sig3)) # ## Retrieve all signatures with `signatures()` # In[8]: for i in db.signatures(): print(i) # ## Access identifiers and names # The list of (unique) identifiers in the database can be accessed via the attribute `ident_to_idx`, which maps to integer identifiers; identifiers can also retrieve full names, which are taken from `sig.name()` upon insertion. # In[9]: pprint(db.ident_to_idx.keys()) # In[10]: pprint(db.ident_to_name) # ## Access hash values directly # The attribute `hashval_to_idx` contains a mapping from individual hash values to sets of `idx` indices. # # See the method `_find_signatures()` for an example of how this is used in `search` and `gather`. # In[11]: print('{} hash values total in this database'.format(len(db.hashval_to_idx))) # In[12]: all_idx = set() for idx_set in db.hashval_to_idx.values(): all_idx.update(idx_set) print('belonging to signatures with idx {}'.format(all_idx)) # In[13]: first_three_hashvals = list(db.hashval_to_idx)[:3] # In[14]: for hashval in first_three_hashvals: print('hashval {} belongs to idxs {}'.format(hashval, db.hashval_to_idx[hashval])) # In[15]: query_idx = 2 hashval_set = set() for hashval, idx_set in db.hashval_to_idx.items(): if query_idx in idx_set: hashval_set.add(hashval) print('{} hashvals belong to query idx {}'.format(len(hashval_set), query_idx)) ident = db.idx_to_ident[query_idx] print('query idx {} matches to ident {}'.format(query_idx, ident)) name = db.ident_to_name[ident] print('query idx {} matches to name {}'.format(query_idx, name)) # ## Lineage storage and retrieval # In[16]: from sourmash.lca import LineagePair # In[17]: superkingdom = LineagePair('superkingdom', 'Bacteria') phylum = LineagePair('phylum', 'Verrucomicrobia') klass = LineagePair('class', 'Verrucomicrobiae') lineage = (superkingdom, phylum, klass) # In[18]: db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) db.insert(sig1, lineage=lineage) # In[19]: # by default, the identifier is the signature name -- ident = sig1.name() idx = db.ident_to_idx[ident] print("ident '{}' has idx {}".format(ident, idx)) lid = db.idx_to_lid[idx] print("lid for idx {} is {}".format(idx, lid)) lineage = db.lid_to_lineage[lid] display = sourmash.lca.display_lineage(lineage) print("lineage for lid {} is {}".format(lid, display)) # ## Lineage manipulation # ### Default taxonomy ranks for lineages # # While sourmash lineage functions can work with any taxonomy ranks and any taxonomy names, both the NCBI and GTDB taxonomies use superkingdom/phylum/etc, so there is a hard coded list availalbe via `sourmash.lca.taxlist()`. # In[20]: print(list(sourmash.lca.taxlist())) # Given a taxonomy as a list, you can then construct a lineage like so: # In[21]: linstr1 = ["Bacteria", "Verrucomicrobia", "Verrucomicrobiae", "Verrucomicrobiales", "Akkermansiaceae", "Akkermansia", "Akkermansia muciniphila", "Akkermansia muciniphila ATCC BAA-835"] lineage1 = [ LineagePair(*pair) for pair in zip(sourmash.lca.taxlist(), linstr1) ] pprint(lineage1) # In[22]: # display lineages as strings with 'sourmash.lca.display_lineage()' sourmash.lca.display_lineage(lineage1) # ## sourmash lowest-common-ancestor functions # The LCA functionality available in sourmash is built around some simple lineage manipulation functions -- `build_tree` and `find_lca`. # # First, let's define some more lineages -- # In[23]: linstr2 = ["Bacteria", "Proteobacteria", "Gammaproteobacteria", "Alteromonadales", "Shewanellaceae", "Shewanella", "Shewanella baltica", "Shewanella baltica OS185"] lineage2 = [ LineagePair(*pair) for pair in zip(sourmash.lca.taxlist(), linstr2) ] linstr3 = ["Bacteria", "Proteobacteria", "Gammaproteobacteria", "Alteromonadales", "Shewanellaceae", "Shewanella", "Shewanella baltica", "Shewanella baltica OS223"] lineage3 = [ LineagePair(*pair) for pair in zip(sourmash.lca.taxlist(), linstr3) ] print('lineage2 is', sourmash.lca.display_lineage(lineage2)) print('lineage3 is', sourmash.lca.display_lineage(lineage3)) # Now, build a tree structure that collapses these lineages where it can, and run some LCA analyses. Lineages 1 and 2 collapse to superkingdom: # In[24]: tree = sourmash.lca.build_tree([lineage1, lineage2]) sourmash.lca.find_lca(tree) # while lineages 2 and 3 collapse to species: # In[25]: tree = sourmash.lca.build_tree([lineage2, lineage3]) sourmash.lca.find_lca(tree) # ## Convenience functions let you make use of LCA_Database stored lineages # First, let's create a database from 3 signatures, and this time we'll store lineages in there: # In[26]: db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) db.insert(sig1, lineage=lineage1) db.insert(sig2, lineage=lineage2) db.insert(sig3, lineage=lineage3) # Now, for any collection of hashes, you can retrieve all the lineage assignments into a dictionary: # `{ hashval: set of lineages }` # In[27]: assignments = sourmash.lca.gather_assignments(sig2.minhash.get_mins(), [db]) print('num hashvals:', len(assignments)) # For example, this particular hashvalue belongs to two different lineages: # In[28]: assignments[196037984804395] # sourmash also includes functions to summarize assignments by counting the number of # times a particular lineage occurs; this is # used by `sourmash lca classify` and `sourmash lca summarize`. # In[29]: counter = sourmash.lca.count_lca_for_assignments(assignments) # count_lca_for_assignments returns a collections.Counter object for lineage, count in counter.most_common(): print('{} hashes have LCA: {}'.format(count, sourmash.lca.display_lineage(lineage))) # ## Other pointers # # [Sourmash: a practical guide](https://sourmash.readthedocs.io/en/latest/using-sourmash-a-guide.html) # # [Classifying signatures taxonomically](https://sourmash.readthedocs.io/en/latest/classifying-signatures.html) # # [Pre-built search databases](https://sourmash.readthedocs.io/en/latest/databases.html) # # ## A full list of notebooks # # [An introduction to k-mers for genome comparison and analysis](kmers-and-minhash.ipynb) # # [Some sourmash command line examples!](sourmash-examples.ipynb) # # [Working with private collections of signatures.](sourmash-collections.ipynb) # # [Using the LCA_Database API.](using-LCA-database-API.ipynb)