#!/usr/bin/env python
# coding: utf-8

# # Decision-tree classification of the Mushroom data set
# ### 1) Load the Mushroom data set and split it into features/target for scikit-learn

# In[2]:

# Python 3: `urllib2` was merged into `urllib.request`.
from urllib.request import urlopen

from scipy import stats
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

# UCI "agaricus-lepiota" data: 23 comma-separated categorical columns,
# column 0 is the edible/poisonous class label.
path = 'http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
raw_csv = urlopen(path)
col_names = list(range(23))  # the file has no header row; name columns 0..22
df = pd.read_csv(raw_csv, names=col_names)

# In[3]:

df.head()

# - Convert the categorical data to integer-coded (ordered) data.

# In[4]:

# Class label: 'p' (poisonous) -> 1, 'e' (edible) -> 0.
df[0] = df[0].map({'p': 1, 'e': 0})
df.head()

# In[5]:

# Integer-encode every *feature* column by enumeration order of its unique
# values.  NOTE: the loop starts at 1 so the explicit p/e -> 1/0 label
# mapping above is preserved.  (The original loop started at 0, which
# re-encoded the target column by `unique()` order -- the first sample is
# 'p', so unique() yields [1, 0] and the labels were silently inverted.)
map_dic = {}
num_columns = df.shape[1]
for i in range(1, num_columns):
    unique_array = df[i].unique()
    map_dic_sub = {}
    for j in range(len(unique_array)):
        map_dic_sub[unique_array[j]] = j
    df[i] = df[i].map(map_dic_sub)
df.head()

# In[6]:

# Feature matrix: columns 1..22.
attributes = df.iloc[:, 1:23]
attributes.head()

# In[7]:

mushroom_data = attributes.values
mushroom_data

# In[8]:

# Target vector: column 0.
target_series = df.iloc[:, 0]
target_series.head()

# In[9]:

mushroom_target = target_series.values
mushroom_target

# ### 2) Decision-tree classification with scikit-learn's DecisionTreeClassifier

# In[10]:

from sklearn import tree

clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(mushroom_data, mushroom_target)

# In[11]:

# Export the fitted tree in Graphviz .dot format for visualisation.
with open("mushroom.dot", 'w') as f2:
    tree.export_graphviz(clf, out_file=f2)

# - Classify new samples with the fitted classifier.

# In[12]:

mushroom_data[-1]

# In[13]:

# predict() expects a 2-D array: one row per sample.
mushroom_data[-1].reshape(1, -1)

# In[14]:

clf.predict(mushroom_data[-1].reshape(1, -1))

# In[15]:

clf.predict(mushroom_data[-2].reshape(1, -1))

# ### 3) Mushroom classification with Spark

# In[81]:

import findspark
findspark.init()

from pyspark import SparkContext, SparkFiles, SQLContext

# Reuse an existing SparkContext if the notebook cell is re-run.
if not 'sc' in locals():
    sc = SparkContext()
sqlCtx = SQLContext(sc)

sdf = sqlCtx.createDataFrame(df)
sdf.show()
# sdf.printSchema()
print("Raw data size is %s" % sdf.count())

# In[82]:

from pyspark.mllib.tree import DecisionTree, LabeledPoint

# One LabeledPoint per row: column 0 is the label, columns 1..22 the features.
result = sdf.rdd.map(lambda row: LabeledPoint(row[0], row[1:23]))
# Split into 70% training / 30% test data.
(trainingData, testData) = result.randomSplit([0.7, 0.3])

featuresTrainingData = trainingData.map(lambda x: x.features)
labelTrainingData = trainingData.map(lambda x: x.label)
print(featuresTrainingData.take(10))
print(labelTrainingData.take(10))
print(testData.count())

# In[83]:

# Train a single decision tree.  categoricalFeaturesInfo={} treats every
# feature as continuous; maxBins=200 comfortably exceeds the largest
# category count in this data set.
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=6, maxBins=200)

# In[84]:

# Evaluate on the held-out test split: pair each true label with the
# model's prediction and count the mismatches.
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda x: x.label).zip(predictions)
# print(labelsAndPredictions.take(100))
# Tuple-parameter lambdas (`lambda (v, p): ...`) were removed in Python 3
# (PEP 3113); index into the (label, prediction) pair instead.
errorCount = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count()
print("errorCount = %s" % errorCount)
testErr = errorCount / float(testData.count())
print('Test Error = %s' % testErr)
print('Learned classification tree model:')
print(model.toDebugString())

# ### 4) Using Spark's Random Forest library

# In[85]:

from pyspark.mllib.tree import RandomForest

# In[87]:

# Random forest of 100 trees with the same per-tree settings as above.
model2 = RandomForest.trainClassifier(trainingData, numClasses=2,
                                      categoricalFeaturesInfo={}, numTrees=100,
                                      impurity='gini', maxDepth=6, maxBins=200)

# In[88]:

# Same evaluation as the single tree, now for the forest.
predictions = model2.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda x: x.label).zip(predictions)
# print(labelsAndPredictions.take(100))
errorCount = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count()
print("errorCount = %s" % errorCount)
testErr = errorCount / float(testData.count())
print('Test Error = %s' % testErr)
print('Learned classification tree model2:')
print(model2.toDebugString())

# In[ ]: