This tutorial shows how to find proteins for a specific organism, calculate protein-protein interactions, and visualize the results.
from pyspark.sql import SparkSession
from pyspark.sql.functions import substring_index
from mmtfPyspark.datasets import pdbjMineDataset
from mmtfPyspark.webfilters import PdbjMineSearch
from mmtfPyspark.interactions import InteractionFilter, InteractionFingerprinter
from mmtfPyspark.io import mmtfReader
from ipywidgets import interact, IntSlider
import py3Dmol
# Start (or reuse) the Spark session that the rest of this notebook runs on.
spark = (
    SparkSession.builder
    .appName("Problem-1")
    .getOrCreate()
)
For our first task, we need to run a taxonomy query using SIFTS data. See examples and SIFTS demo
To figure out how to query for taxonomy, the command below lists the first 10 entries of the SIFTS taxonomy table. As you can see, we can use the scientific_name field to query for a specific organism.
# Preview query: fetch the first 10 rows of the SIFTS chain-to-taxonomy table.
taxonomy_query = "SELECT * FROM sifts.pdb_chain_taxonomy LIMIT 10"
# Run the query through pdbjMineDataset and keep the returned dataset.
taxonomy = pdbjMineDataset.get_dataset(taxonomy_query)
# Print the rows so the available columns can be inspected.
taxonomy.show()
+-----+-----+------+--------------------+----------------+ |pdbid|chain|tax_id| scientific_name|structureChainId| +-----+-----+------+--------------------+----------------+ | 101M| A| 9755| PHYCD| 101M.A| | 101M| A| 9755| Physeter catodon| 101M.A| | 101M| A| 9755|Physeter catodon ...| 101M.A| | 101M| A| 9755|Physeter catodon ...| 101M.A| | 101M| A| 9755|Physeter macrocep...| 101M.A| | 101M| A| 9755| Sperm whale| 101M.A| | 101M| A| 9755| sperm whale| 101M.A| | 102L| A| 10665| BPT4| 102L.A| | 102L| A| 10665| Bacteriophage T4| 102L.A| | 102L| A| 10665|Enterobacteria ph...| 102L.A| +-----+-----+------+--------------------+----------------+
### TODO-1: Write a query that selects the taxonomy entries for one specific organism
taxonomy_query = ... your code here ...
# Run the organism-specific query and print the first 10 matching rows.
taxonomy = pdbjMineDataset.get_dataset(taxonomy_query)
taxonomy.show(10)
+-----+-----+------+---------------+----------------+ |pdbid|chain|tax_id|scientific_name|structureChainId| +-----+-----+------+---------------+----------------+ | 12E8| H| 10090| Mus musculus| 12E8.H| | 12E8| L| 10090| Mus musculus| 12E8.L| | 12E8| M| 10090| Mus musculus| 12E8.M| | 12E8| P| 10090| Mus musculus| 12E8.P| | 15C8| H| 10090| Mus musculus| 15C8.H| | 15C8| L| 10090| Mus musculus| 15C8.L| | 1914| A| 10090| Mus musculus| 1914.A| | 1A0Q| H| 10090| Mus musculus| 1A0Q.H| | 1A0Q| L| 10090| Mus musculus| 1A0Q.L| | 1A14| H| 10090| Mus musculus| 1A14.H| +-----+-----+------+---------------+----------------+ only showing top 10 rows
# Relative path of the directory holding the MMTF sample data.
path = "../resources/mmtf_full_sample/"
# Read structures from `path`; fraction=0.1 presumably samples about 10% of
# the entries -- confirm against the mmtfReader documentation.
pdb = mmtfReader.read_sequence_file(path, fraction=0.1)
### TODO-2: Take the taxonomy query from above and use it to filter the pdb structures
# The filtered result is marked for caching and rebinds `pdb`.
pdb = ... your code here ... .cache()
Find protein-protein interactions with a 6 Å distance cutoff
# Distance cutoff handed to the interaction filter (6 Å in this tutorial).
distance_cutoff = 6.0
# minInteractions=10 is presumably the minimum number of interactions a chain
# pair needs in order to be reported -- confirm against InteractionFilter docs.
interactionFilter = InteractionFilter(distance_cutoff, minInteractions=10)
# Compute the polymer interactions for the structures in `pdb`. This
# intermediate DataFrame is deliberately NOT cached: it is replaced on the
# next line, and caching it as well would keep a second copy in memory that
# can no longer be unpersisted by name.
interactions = InteractionFingerprinter.get_polymer_interactions(pdb, interactionFilter)
# Add a structureId column holding the part of structureChainId before the
# first '.', then cache the final DataFrame that the following cells reuse.
interactions = interactions.withColumn("structureId", substring_index(interactions.structureChainId, '.', 1)).cache()
# Display the first 10 rows as a pandas table (last expression of the cell).
interactions.toPandas().head(10)
| | structureChainId | queryChainId | targetChainId | groupNumbers | sequenceIndices | sequence | structureId |
---|---|---|---|---|---|---|---|
0 | 4M48.A | H | A | [337, 338, 498, 501, 502, 503, 504, 505, 506, ... | [70, 274, 275, 435, 438, 439, 440, 441, 442, 4... | MNSISDERETWSGKVDFLLSVIGFAVDLANVWRFPYLCYKNGGGAF... | 4M48 |
1 | 4M48.H | A | H | [100, 101, 102, 103, 31, 33, 50, 52, 53, 54, 5... | [49, 51, 68, 70, 71, 72, 73, 74, 75, 77, 117, ... | MNFGLRLVFLVLILKGVQCEVQLVESGGGLVKPGGSLKLSCAASGF... | 4M48 |
2 | 4M48.L | H | L | [1, 100, 101, 115, 117, 118, 119, 120, 121, 12... | [22, 53, 54, 56, 58, 60, 63, 64, 65, 66, 67, 6... | MDFQVQIFSFLLISASVAMSRGENVLTQSPAIMSTSPGEKVTMTCR... | 4M48 |
3 | 4M48.H | L | H | [100, 101, 102, 103, 104, 105, 106, 107, 108, ... | [53, 55, 57, 60, 61, 62, 63, 64, 65, 68, 77, 7... | MNFGLRLVFLVLILKGVQCEVQLVESGGGLVKPGGSLKLSCAASGF... | 4M48 |
4 | 4NN5.A | C | A | [126, 127, 129, 130, 131, 132, 133, 134, 136, ... | [11, 14, 15, 16, 19, 20, 23, 28, 30, 31, 32, 3... | YNFSNCNFTSITKIYCNIIFHDLTGDLKGAKFEQIEDCESKPACLL... | 4NN5 |
5 | 4NN5.C | A | C | [106, 107, 108, 109, 110, 112, 113, 143, 144, ... | [16, 41, 42, 68, 69, 70, 71, 73, 74, 86, 87, 8... | AAAVTSRGDVTVVCHDLETVEVTWGSGPDHHGANLSLEFRYGTGAL... | 4NN5 |
6 | 2QDQ.A | B | A | [2496, 2497, 2498, 2500, 2501, 2502, 2504, 250... | [4, 5, 6, 8, 9, 10, 12, 13, 15, 16, 17, 19, 20... | GAMVGGIAQIIAAQEEMLRKERELEEARKKLAQIRQQQYKFLPSEL... | 2QDQ |
7 | 2QDQ.B | A | B | [2497, 2498, 2500, 2501, 2504, 2505, 2507, 250... | [5, 6, 8, 9, 12, 13, 15, 16, 17, 19, 20, 22, 2... | GAMVGGIAQIIAAQEEMLRKERELEEARKKLAQIRQQQYKFLPSEL... | 2QDQ |
8 | 4P3A.C | D | C | [698, 701, 702, 704, 705, 706, 708, 709, 710, ... | [21, 24, 25, 27, 28, 29, 31, 32, 33, 34, 35, 3... | GANLHLLRQKIEEQAAKYKHSVPKKCCYDGARVNFYETCEERVARV... | 4P3A |
9 | 4P3A.D | C | D | [698, 701, 702, 704, 705, 706, 708, 709, 710, ... | [21, 24, 25, 27, 28, 29, 31, 32, 33, 34, 35, 3... | GANLHLLRQKIEEQAAKYKHSVPKKCCYDGARVNFYETCEERVARV... | 4P3A |
# Pull the four columns needed for visualization back to the driver in ONE
# collect, so the resulting lists are guaranteed to be index-aligned (entry i
# of every list comes from the same row). Column names are spelled exactly as
# in the DataFrame schema (queryChainId / targetChainId) so the lookup does
# not depend on Spark's case-insensitive column resolution.
_interaction_rows = interactions.select(
    "structureId", "queryChainId", "targetChainId", "groupNumbers"
).collect()
structure_ids = [row["structureId"] for row in _interaction_rows]
query_chain_ids = [row["queryChainId"] for row in _interaction_rows]
target_chain_ids = [row["targetChainId"] for row in _interaction_rows]
# Each entry is the row's groupNumbers value (one list per interaction).
target_groups = [row["groupNumbers"] for row in _interaction_rows]
Disable scrollbar for the visualization below
%%javascript
// Replace the output area's scroll check with a function that always returns false.
IPython.OutputArea.prototype._should_scroll = function(lines) {return false;}
def view_protein_protein_interactions(structure_ids, query_chain_ids, target_chain_ids, target_groups, distance=4.5):
    """Build a slider-driven py3Dmol view over a list of chain-chain interactions.

    The four sequences are indexed in parallel: entry ``i`` of each one
    describes the same interaction. A slider picks which entry is rendered.

    Args:
        structure_ids: PDB IDs; each is appended to ``'pdb:'`` to form the
            viewer query.
        query_chain_ids: Query chain ID for each entry.
        target_chain_ids: Target chain ID for each entry.
        target_groups: Per-entry value used as the ``resi`` selection on the
            target chain.
        distance: Distance used in the ``within`` selection that picks the
            query-chain atoms drawn as spheres.

    Returns:
        The result of ``interact`` for the rendering callback and slider.
    """

    def view3d(i=0):
        pdb_id = structure_ids[i]
        query_chain = query_chain_ids[i]
        target_chain = target_chain_ids[i]
        print(f"PDB: {pdb_id}, query: {query_chain}, target: {target_chain}")

        # Selection of the interacting residues on the target chain.
        target_residues = {'chain': target_chain, 'resi': target_groups[i]}
        # Selection of query-chain atoms within `distance` of the target chain.
        query_near_target = {
            'chain': query_chain,
            'within': {'distance': distance, 'sel': {'chain': target_chain}},
        }

        view = py3Dmol.view(query='pdb:' + pdb_id, width=600, height=600)
        # Reset all styles first, then style query and target chains in turn.
        view.setStyle({})
        view.setStyle({'chain': query_chain}, {'line': {'colorscheme': 'orangeCarbon'}})
        view.setStyle(query_near_target, {'sphere': {'colorscheme': 'orangeCarbon'}})
        view.setStyle({'chain': target_chain}, {'line': {'colorscheme': 'lightblueCarbon'}})
        view.setStyle(target_residues, {'stick': {'colorscheme': 'lightblueCarbon'}})
        view.zoomTo(target_residues)
        return view.show()

    # One slider position per entry; the view is redrawn only on release.
    last_index = len(structure_ids) - 1
    slider = IntSlider(min=0, max=last_index, description='Structure', continuous_update=False)
    return interact(view3d, i=slider)
# Show the interactions, using the same cutoff that was used to compute them.
# The trailing semicolon keeps the notebook from echoing the return value.
view_protein_protein_interactions(
    structure_ids,
    query_chain_ids,
    target_chain_ids,
    target_groups,
    distance=distance_cutoff,
);
interactive(children=(IntSlider(value=0, continuous_update=False, description='Structure', max=47), Output()),…
# Shut down the Spark session now that the notebook is finished with it.
spark.stop()