#!/usr/bin/env python
# coding: utf-8

# # Problem 1
# Apply your skills to classify protein foldType with Decision Tree Classifier
#
# This file is an exercise template: every step marked "Your Code Here" is a
# placeholder to be replaced with a working implementation. Until a step is
# filled in, the script stops at that step with a NotImplementedError that
# names it, instead of failing to parse.

# ## Imports

# In[1]:


from mmtfPyspark.ml import SparkMultiClassClassifier, datasetBalancer
from pyspark.sql import SparkSession
# NOTE(review): wildcard import kept from the original notebook; it puts every
# pyspark.sql.functions name into the module namespace.
from pyspark.sql.functions import *
import mltoolkit


def _todo(step):
    """Stop the script at an exercise step that has not been implemented yet.

    Args:
        step: Label of the unfinished step (for example ``"TODO-1"``), used
            in the error message.

    Raises:
        NotImplementedError: Always; the placeholder must be replaced.
    """
    raise NotImplementedError(f"{step}: replace this placeholder with your code")


# ## Configure Spark Session

# In[2]:


spark = SparkSession.builder.appName("Problem-1").getOrCreate()


# ## TODO-1: Read in data from parquet file

# In[3]:


parquetFile = './input_features/'
data = _todo("TODO-1")  # Your Code Here #


# ## TODO-2: Select alpha, beta, alpha+beta foldtypes

# In[ ]:


data = _todo("TODO-2")  # Your Code Here #

print(f"Total number of data: {data.count()}")


# ## TODO-3: Downsample data

# In[ ]:


label = 'foldType'

data = _todo("TODO-3")  # Your Code Here #

print(f"Dataset size (balanced)  : {data.count()}")
data.groupby(label).count().show()


# ## TODO-4: Decision Tree Classifier with PySpark

# In[ ]:


from pyspark.ml.classification import DecisionTreeClassifier

dtc = _todo("TODO-4 (classifier)")  # Your Code Here: Make Decision Tree Classifier Class #
mcc = _todo("TODO-4 (wrapper)")  # Your Code Here: Use MulticlassClassifier wrapper on dtc#
metrics = _todo("TODO-4 (fit)")  # Your Code Here: fit data#

# Print one "name<TAB>value" line per entry returned by the fit step.
for k, v in metrics.items():
    print(f"{k}\t{v}")


# ## BONUS: Decision Tree Classifier with sklearn

# In[ ]:


# This import deliberately rebinds DecisionTreeClassifier: from here on the
# name refers to the sklearn class, not the pyspark.ml one used in TODO-4.
from sklearn.tree import DecisionTreeClassifier

df = _todo("BONUS (to pandas)")  # Your Code Here: convert data to Pandas Dataframe #

dtc = _todo("BONUS (classifier)")  # Your Code Here: Make Decision Tree Classifier Class #
mcc = _todo("BONUS (wrapper)")  # Your Code Here: Use MulticlassClassifier wrapper on dtc#
metrics = _todo("BONUS (fit)")  # Your Code Here: fit data#

for k, v in metrics.items():
    print(f"{k}\t{v}")


# In[ ]:


spark.stop()