#!/usr/bin/env python
# coding: utf-8

# # 1-Metadata
# This tutorial shows how to use Spark datasets to retrieve metadata about
# PDB structures. mmtfPyspark provides a number of modules to fetch data from
# [external resources](https://github.com/sbl-sdsc/mmtf-pyspark/tree/master/mmtfPyspark/datasets).
#
# It shows how to download and analyze PDB metadata from the
# [SIFTS project](https://www.ebi.ac.uk/pdbe/docs/sifts/methodology.html)
# as Spark Datasets.

# In[1]:


from pyspark.sql import SparkSession
from pyspark.sql.functions import substring_index
from mmtfPyspark.datasets import pdbjMineDataset
import matplotlib.pyplot as plt


# #### Configure Spark

# In[2]:


spark = SparkSession.builder.appName("1-Metadata").getOrCreate()

# All remaining steps run inside try/finally so that the Spark session is
# stopped even when retrieving the dataset or any later step raises.
try:
    # ## Download up-to-date EC classification data
    # The SIFTS project maintains up-to-date mappings of protein chains in
    # the PDB to Enzyme Classifications
    # [EC](http://www.sbcs.qmul.ac.uk/iubmb/enzyme/). We use the
    # [pdbjMineDataset class](https://github.com/sbl-sdsc/mmtf-pyspark/blob/master/mmtfPyspark/datasets/pdbjMineDataset.py)
    # to retrieve these mappings. An extensive
    # [demo](https://nbviewer.jupyter.org/github/sbl-sdsc/mmtf-pyspark/blob/master/demos/datasets/SiftsDataDemo.ipynb)
    # shows how to query SIFTS data with pdbjMineDataset.

    # #### Query EC data

    # In[3]:

    query = "SELECT * FROM sifts.pdb_chain_enzyme"
    # cache() because the dataset is reused by several steps below.
    enzymes = pdbjMineDataset.get_dataset(query).cache()

    # In[4]:

    enzymes.show()

    # #### For better formatting, we can convert the dataset to pandas
    # NOTE: a bare expression like the one below is only rendered when the
    # code is run cell-by-cell in a notebook; in a plain script its value is
    # discarded.

    # In[5]:

    enzymes.toPandas().head(20)

    # ## Remove redundancy
    # Here we select a single protein chain for each unique UniProt
    # accession number.

    # In[6]:

    enzymes = enzymes.dropDuplicates(["accession"])

    # ## Add columns for enzyme type and subtype
    # We use the
    # [withColumn](http://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.withColumn)
    # method to add a new column and the
    # [substring_index](http://spark.apache.org/docs/2.3.0/api/python/pyspark.sql.html#pyspark.sql.functions.substring_index)
    # method to extract the first two levels from the EC number hierarchy.

    # In[7]:

    # substring_index(col, '.', n) keeps the text before the n-th '.', i.e.
    # the first n levels of the EC number.
    enzymes = enzymes.withColumn(
        "enzymeType", substring_index(enzymes.ec_number, '.', 1)
    )
    enzymes = enzymes.withColumn(
        "enzymeSubtype", substring_index(enzymes.ec_number, '.', 2)
    )

    # In[8]:

    enzymes.toPandas().head(20)

    # ## Count the occurrences of the enzyme types

    # In[9]:

    counts = (
        enzymes.groupBy("enzymeType")
        .count()
        .sort("count", ascending=False)
        .toPandas()
    )
    counts

    # ## Use pandas to plot the occurrences with Matplotlib

    # In[10]:

    counts.plot(x='enzymeType', y='count', kind='bar')
finally:
    # In[11]:

    spark.stop()