#!/usr/bin/env python
# coding: utf-8

# # 4-CreateDatasets
#
# This tutorial shows a basic template for creating a dataset computationally.

# In[12]:


from pyspark.sql import Row, SparkSession
from mmtfPyspark.ml import pythonRDDToDataset
from mmtfPyspark.io import mmtfReader


# #### Configure Spark Session and Spark Context

# In[13]:


spark = SparkSession.builder.appName("4-CreateDatasets").getOrCreate()


# ## Read a 10% fraction of the sample file
#
# Reading a random fraction of the input file is a good strategy for testing new functionality on a small subset of the data.

# In[14]:


path = "../resources/mmtf_full_sample"

pdb = mmtfReader.read_sequence_file(path, fraction=0.1)


# # Creating a dataset in 3 simple steps

# ## Step 1: calculate properties for a structure and add them to a Row object

# In[15]:


def calcProperties(s):
    # s[0] pdb id
    # s[1] mmtf structure record
    return Row(s[0], s[1].num_models, s[1].num_chains, s[1].num_groups,
               s[1].num_atoms, s[1].num_bonds)


# ## Step 2: map structures to rows
#
# Here we use a lambda expression to calculate the properties for each structure.

# In[16]:


rows = pdb.map(lambda s: calcProperties(s))


# ## Step 3: convert the RDD of Rows to a dataset

# In[17]:


col_names = ["pdbId", "models", "chains", "groups", "atoms", "bonds"]
summary = pythonRDDToDataset.get_dataset(rows, col_names)
# summary = spark.createDataFrame(rows, col_names)  # alternative method; converts int to long


# ## Done: Show some details about this dataset

# In[18]:


summary.columns


# In[19]:


summary.printSchema()


# In[20]:


summary.show()


# #### Print statistics for the numerical columns

# In[21]:


summary.describe(col_names[1:]).toPandas()


# In[22]:


spark.stop()
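

# ## Appendix: sketch of the alternative `spark.createDataFrame` approach
#
# The cell below is a minimal, self-contained sketch of the alternative mentioned in Step 3.
# It maps each structure to a `Row` with named fields so that `createDataFrame` can infer
# the column names and types directly (Python integers become `long` columns). The function
# name `calcPropertiesNamed` and the app name are illustrative choices, not part of the
# original tutorial; the path and fraction mirror the values used above.

# In[ ]:


def calcPropertiesNamed(s):
    # s[0] pdb id, s[1] mmtf structure record; the named fields define the schema
    return Row(pdbId=s[0], models=s[1].num_models, chains=s[1].num_chains,
               groups=s[1].num_groups, atoms=s[1].num_atoms, bonds=s[1].num_bonds)


spark = SparkSession.builder.appName("4-CreateDatasets-Appendix").getOrCreate()
pdb = mmtfReader.read_sequence_file("../resources/mmtf_full_sample", fraction=0.1)

# Schema is inferred from the named Row fields; field order may vary with the Spark version.
summary_alt = spark.createDataFrame(pdb.map(calcPropertiesNamed))
summary_alt.printSchema()
summary_alt.show()

spark.stop()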