Here we combine what you've learned about ColumnarStructure with skills from previous tutorials. We use ColumnarStructure to calculate the average number of water molecules per amino acid residue (waterRatio) and the average b-factor (temperature factor). We capture the information in a dataset and then plot the waterRatio against the resolution to see if there is a trend in the data.
from pyspark.sql import Row, SparkSession
from mmtfPyspark.filters import ContainsLProteinChain, ExperimentalMethods
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.ml import pythonRDDToDataset
from mmtfPyspark.utils import ColumnarStructure
import matplotlib.pyplot as plt
import numpy as np
spark = SparkSession.builder.appName("Solution-2").getOrCreate()
path = "../resources/mmtf_full_sample"
pdb = mmtfReader.read_sequence_file(path).cache()
pdb = ... your code here ...
pdb = ... your code here ...
pdb = pdb.filter(lambda t: t[1].num_models ==1)
def calcProperties(s):
# s[0] pdb id
# s[1] mmtf structure record
arrays = ColumnarStructure(s[1], firstModelOnly=True)
# we column arrays
atom_names = arrays.get_atom_names()
entity_types = arrays.get_entity_types()
# TODO-3: get array of b-factors
b_factors = ... your code here ...
# calculate number of protein residues using boolean indexing
pro_idx = (entity_types == 'PRO') & (atom_names == 'CA')
num_pro = int(np.sum(pro_idx))
# TODO-4: calculate number of water residues using boolean indexing
wat_idx = ... your code here ...
num_wat = int(np.sum(wat_idx))
# calculate average B-factor for protein atoms
pro_atom_idx = (entity_types == 'PRO')
pro_b_factors = b_factors[pro_atom_idx]
ave_b = float(np.average(pro_b_factors))
return Row(s[0], s[1].resolution, ave_b, num_pro, num_wat, num_wat/num_pro)
rows = pdb.map(lambda s: calcProperties(s))
col_names = ["pdbId", "resolution", "ave_b", "numPro", "numWat", "waterRatio"]
summary = pythonRDDToDataset.get_dataset(rows, col_names).cache()
summary.printSchema()
root |-- pdbId: string (nullable = false) |-- resolution: float (nullable = false) |-- ave_b: float (nullable = false) |-- numPro: integer (nullable = false) |-- numWat: integer (nullable = false) |-- waterRatio: float (nullable = false)
summary.show()
+-----+----------+---------+------+------+----------+ |pdbId|resolution| ave_b|numPro|numWat|waterRatio| +-----+----------+---------+------+------+----------+ | 1LBU| 1.8| 15.45025| 214| 229| 1.0700935| | 1LC0| 1.2|17.563633| 290| 407| 1.4034482| | 1LC5| 1.46| 16.42075| 355| 271| 0.7633803| | 1LFP| 1.72| 35.73947| 243| 350| 1.4403292| | 1LFW| 1.8|13.556223| 468| 570| 1.2179487| | 1LGH| 2.4| 37.60027| 396| 64|0.16161616| | 1LH0| 2.0|31.355368| 419| 278| 0.6634845| | 1LJ8| 1.7| 18.55931| 492| 437|0.88821137| | 1LKI| 2.0|22.673143| 172| 50|0.29069766| | 1LMI| 1.5| 19.07365| 131| 172| 1.3129771| | 1LML| 1.86|23.073887| 465| 212| 0.455914| | 1LO7| 1.5| 18.83036| 140| 173| 1.2357143| | 1LQ9| 1.3| 13.54245| 224| 258| 1.1517857| | 1LQV| 1.6|26.535683| 411| 429| 1.0437956| | 1LR0| 1.914|26.655485| 126| 122|0.96825397| | 1LRI| 1.45|21.217518| 98| 99| 1.0102041| | 1LRZ| 2.1|27.266705| 400| 318| 0.795| | 1LS1| 1.1|21.858797| 316| 316| 1.0| | 1LTS| 1.95| 32.31854| 741| 293| 0.3954116| | 1LU0| 1.03|15.596555| 61| 76| 1.2459016| +-----+----------+---------+------+------+----------+ only showing top 20 rows
data = summary.toPandas()
data.plot(x='resolution', y='waterRatio', kind='scatter');
spark.stop()