from mmtfPyspark.ml import SparkMultiClassClassifier, datasetBalancer
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import mltoolkit
# Problem 1: train a decision-tree classifier for protein fold type twice,
# once with Spark ML and once with scikit-learn, on the same balanced dataset.
#
# The original exercise template left every assignment as
# `name = # Your Code Here #`, which is a SyntaxError; the placeholders are
# filled in below so the script actually runs end to end.

spark = SparkSession.builder.appName("Problem-1").getOrCreate()

# --- Load the pre-computed feature table -----------------------------------
parquetFile = './input_features/'
data = spark.read.parquet(parquetFile)
print(f"Total number of data: {data.count()}")

# --- Balance the classes ---------------------------------------------------
# Downsample the majority classes so every value of `label` is equally
# represented; the third positional argument is the sampling seed.
label = 'foldType'
BALANCE_SEED = 1
data = datasetBalancer.downsample(data, label, BALANCE_SEED)
print(f"Dataset size (balanced) : {data.count()}")
data.groupby(label).count().show()

# --- Decision tree with Spark ML -------------------------------------------
# Fraction of rows held out for testing and the seed for the train/test split,
# shared by both classifier wrappers below.
TEST_FRACTION = 0.3
SPLIT_SEED = 1

from pyspark.ml.classification import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
# NOTE(review): the wrapper presumably expects `data` to carry a `features`
# vector column next to `label` — confirm against the parquet schema.
mcc = SparkMultiClassClassifier(dtc, label, TEST_FRACTION, SPLIT_SEED)
matrics = mcc.fit(data)
for k, v in matrics.items():
    print(f"{k}\t{v}")

# --- Decision tree with scikit-learn ---------------------------------------
# This import intentionally rebinds `DecisionTreeClassifier` to the sklearn
# class; the pyspark class of the same name is no longer reachable below.
from sklearn.tree import DecisionTreeClassifier
df = data.toPandas()
dtc = DecisionTreeClassifier()
# NOTE(review): assumes mltoolkit.MultiClassClassifier takes
# (predictor, label, testFraction) like its Spark counterpart and that
# fit() returns a dict of metrics — verify against mltoolkit.py.
mcc = mltoolkit.MultiClassClassifier(dtc, label, TEST_FRACTION)
matrics = mcc.fit(df)
for k, v in matrics.items():
    print(f"{k}\t{v}")

spark.stop()