#!/usr/bin/env python
# coding: utf-8

# # 4-CreateDatasets
#
# This tutorial shows a basic template for creating a dataset computationally.

# In[12]:


from pyspark.sql import Row, SparkSession
from mmtfPyspark.ml import pythonRDDToDataset
from mmtfPyspark.io import mmtfReader


# #### Configure Spark Session and Spark Context

# In[13]:


spark = SparkSession.builder.appName("4-CreateDatasets").getOrCreate()


# ## Read a 10% fraction of the sample file
#
# Reading a random fraction of the input file is a good strategy for testing new functionality on a small subset of the data.

# In[14]:


path = "../resources/mmtf_full_sample"

pdb = mmtfReader.read_sequence_file(path, fraction=0.1)


# # Creating a dataset in 3 simple steps

# ## Step 1: calculate properties for a structure and add them to a Row object

# In[15]:


def calcProperties(s):
    # s[0] pdb id
    # s[1] mmtf structure record
    return Row(s[0], s[1].num_models, s[1].num_chains, s[1].num_groups,
               s[1].num_atoms, s[1].num_bonds)


# ## Step 2: map structures to rows
#
# Here we use a lambda expression to calculate the properties for each structure.

# In[16]:


rows = pdb.map(lambda s: calcProperties(s))


# ## Step 3: convert the RDD of Rows to a dataset

# In[17]:


col_names = ["pdbId", "models", "chains", "groups", "atoms", "bonds"]
summary = pythonRDDToDataset.get_dataset(rows, col_names)
# summary = spark.createDataFrame(rows, col_names)  # alternative method; converts int to long


# ## Done: Show some details about this dataset

# In[18]:


summary.columns


# In[19]:


summary.printSchema()


# In[20]:


summary.show()


# #### Print statistics for the numerical columns

# In[21]:


summary.describe(col_names[1:]).toPandas()


# In[22]:


spark.stop()
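

# ## Appendix: sketch of the alternative `spark.createDataFrame` approach
#
# The cell below is a minimal, self-contained sketch of the alternative mentioned in Step 3.
# It maps each structure to a `Row` with named fields so that `createDataFrame` can infer
# the column names and types directly (Python integers become `long` columns). The function
# name `calcPropertiesNamed` and the app name are illustrative choices, not part of the
# original tutorial; the path and fraction mirror the values used above.

# In[ ]:


def calcPropertiesNamed(s):
    # s[0] pdb id, s[1] mmtf structure record; the named fields define the schema
    return Row(pdbId=s[0], models=s[1].num_models, chains=s[1].num_chains,
               groups=s[1].num_groups, atoms=s[1].num_atoms, bonds=s[1].num_bonds)


spark = SparkSession.builder.appName("4-CreateDatasets-Appendix").getOrCreate()
pdb = mmtfReader.read_sequence_file("../resources/mmtf_full_sample", fraction=0.1)

# Schema is inferred from the named Row fields; field order may vary with the Spark version.
summary_alt = spark.createDataFrame(pdb.map(calcPropertiesNamed))
summary_alt.printSchema()
summary_alt.show()

spark.stop()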