This tutorial shows a basic template to create a dataset computationally.
from pyspark.sql import Row, SparkSession
from mmtfPyspark.ml import pythonRDDToDataset
from mmtfPyspark.io import mmtfReader
# Get the active SparkSession, or create one named "4-CreateDatasets" if none exists.
spark = SparkSession.builder.appName("4-CreateDatasets").getOrCreate()
Reading a random fraction of the input file is a good strategy to test some new functionality.
# Location of the sample MMTF sequence file (relative path).
path = "../resources/mmtf_full_sample"
# Read only a fraction (0.1) of the input to keep test runs quick.
pdb = mmtfReader.read_sequence_file(path, fraction=0.1)
def calcProperties(s):
    """Summarize one structure record as a Row of its size properties.

    Args:
        s: Indexable pair where ``s[0]`` is the PDB ID and ``s[1]`` is the
            MMTF structure record.

    Returns:
        A positional (unnamed-field) ``Row`` holding, in order: the PDB ID
        followed by the record's ``num_models``, ``num_chains``,
        ``num_groups``, ``num_atoms`` and ``num_bonds`` values.
    """
    pdb_id = s[0]
    structure = s[1]
    return Row(
        pdb_id,
        structure.num_models,
        structure.num_chains,
        structure.num_groups,
        structure.num_atoms,
        structure.num_bonds,
    )
Here we use a lambda expression to apply calcProperties to every record and collect the resulting rows.
# Apply calcProperties to every record of the input.
rows = pdb.map(lambda s: calcProperties(s))
# Column names, in the same order as the fields of each Row built by calcProperties.
col_names = ["pdbId", "models", "chains", "groups", "atoms", "bonds"]
# Build the dataset from the rows, labelling the columns with col_names.
summary = pythonRDDToDataset.get_dataset(rows, col_names)
# summary = spark.createDataFrame(rows, col_names) # alternative method, converts int to long
# Show the column names of the dataset.
summary.columns
['pdbId', 'models', 'chains', 'groups', 'atoms', 'bonds']
# Print each column's name, data type, and nullability.
summary.printSchema()
root
 |-- pdbId: string (nullable = false)
 |-- models: integer (nullable = false)
 |-- chains: integer (nullable = false)
 |-- groups: integer (nullable = false)
 |-- atoms: integer (nullable = false)
 |-- bonds: integer (nullable = false)
# Print the first rows of the dataset as a text table.
summary.show()
+-----+------+------+------+-----+-----+
|pdbId|models|chains|groups|atoms|bonds|
+-----+------+------+------+-----+-----+
| 1LBU|     1|     3|   443| 1793| 1602|
| 1LC0|     1|     5|   700| 2731| 2358|
| 1LC5|     1|     4|   628| 3056| 2848|
| 1LFP|     1|     2|   593| 2275| 1958|
| 1LFW|     1|     5|  1041| 4238| 3750|
| 1LGH|     1|    68|   512| 5436| 5526|
| 1LH0|     1|     8|   701| 3596| 3375|
| 1LJ8|     1|     3|   930| 4310| 3965|
| 1LKI|     1|     2|   222| 1386| 1364|
| 1LMI|     1|     2|   303| 1139|  989|
| 1LML|     1|     3|   678| 3738| 3616|
| 1LO7|     1|     5|   316| 1375| 1229|
| 1LQ9|     1|     5|   483| 2006| 1794|
| 1LQV|     1|    30|   862| 4048| 3695|
| 1LR0|     1|     5|   251| 1100|  992|
| 1LR5|     1|    16|  1379| 6071| 5531|
| 1LRI|     1|     4|   199|  861|  777|
| 1LRZ|     1|     2|   718| 3631| 3399|
| 1LS1|     1|     6|   577| 5396| 5073|
| 1LTS|     1|    14|  1034| 6271| 6091|
+-----+------+------+------+-----+-----+
only showing top 20 rows
# Summary statistics for every column except the first (pdbId), shown as a pandas DataFrame.
summary.describe(col_names[1:]).toPandas()
 | summary | models | chains | groups | atoms | bonds |
---|---|---|---|---|---|---|
0 | count | 9756 | 9756 | 9756 | 9756 | 9756 |
1 | mean | 1.0003075030750308 | 8.567343173431734 | 699.720377203772 | 3510.4634071340715 | 3252.99651496515 |
2 | stddev | 0.01753396788544404 | 7.177280313219018 | 437.73900408050133 | 2140.150369170067 | 2015.1217534374905 |
3 | min | 1 | 1 | 21 | 154 | 144 |
4 | max | 2 | 91 | 3026 | 9995 | 10077 |
# Stop the Spark session now that the tutorial is finished.
spark.stop()