# Custom libraries
from datascienceutils import clusteringModels as cm
from datascienceutils import analyze
# Standard-library and third-party packages
import json
%matplotlib inline
import datetime
import numpy as np
import pandas as pd
from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource
from bokeh.charts import Histogram
import bokeh
# Configure Bokeh so that subsequent plots are displayed inline in the notebook.
output_notebook()
/home/anand/playspace/data-science-utils/.eggs/statsmodels-0.8.0-py3.6-linux-x86_64.egg/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
/home/anand/anaconda3/envs/analytics/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
# Load the Iris CSV into a DataFrame.
irisDf = pd.read_csv('~/DataScientist/data/Iris.csv')
# Call the project's dimension_analyze helper twice on the full frame:
# first with clustering requested (3 clusters), then with the PCA plot requested.
analyze.dimension_analyze(irisDf, cluster=True, n_clusters=3)
analyze.dimension_analyze(irisDf, pca_plot=True)
# Preview the first rows; as a bare expression the notebook renders the result.
irisDf.head()
| | SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Class |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
# Keep the species labels aside, then remove the 'Class' column in place so
# the clustering calls below receive only the four measurement columns.
target = irisDf.Class
# `axis` is passed by keyword: the positional form drop('Class', 1) was
# deprecated in pandas 1.3 and raises TypeError on pandas 2.0+, while
# axis=1 is accepted by every pandas version.
irisDf.drop('Class', axis=1, inplace=True)
# Run the project's cluster_analyze helper on the label-free frame.
cm.cluster_analyze(irisDf)
# Silhouette analysis with cluster_type='KMeans'.
cm.silhouette_analyze(irisDf, cluster_type='KMeans')
For clusters = 2 The average silhouette_score is : 0.363119994814
For clusters = 4 The average silhouette_score is : 0.753523569925
For clusters = 6 The average silhouette_score is : 0.622085988116
# Silhouette analysis on the same label-free frame with cluster_type='dbscan'.
cm.silhouette_analyze(irisDf, cluster_type='dbscan')
For clusters = 2 The average silhouette_score is : 0.485842354601
For clusters = 4 The average silhouette_score is : 0.607665314687
For clusters = 6 The average silhouette_score is : 0.607665314687
# Silhouette analysis with cluster_type='spectral' and then 'birch'.
# NOTE(review): only one set of scores is recorded after these two calls;
# it presumably belongs to the 'birch' run -- confirm by re-running the cells.
cm.silhouette_analyze(irisDf, cluster_type='spectral')
cm.silhouette_analyze(irisDf, cluster_type='birch')
For clusters = 2 The average silhouette_score is : 0.501699257107
For clusters = 4 The average silhouette_score is : 0.57351529012
For clusters = 6 The average silhouette_score is : 0.659228815789
# Self-organising-map analysis with mapsize (10, 10) and algo_type='som'.
# som_analyze imports `sompy` when called, and that package in turn imports
# `ipdb` at module load, so both must be installed or the call fails with
# ModuleNotFoundError (as in the recorded run).
cm.som_analyze(irisDf, (10,10), algo_type='som')
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-7-afbf7e8103c0> in <module>()
----> 1 cm.som_analyze(irisDf, (10,10), algo_type='som')

~/playspace/data-science-utils/datascienceutils/clusteringModels.py in som_analyze(dataframe, mapsize, algo_type)
    163 
    164 def som_analyze(dataframe, mapsize, algo_type='som'):
--> 165     import sompy
    166     som_factory = sompy.SOMFactory()
    167     data = dataframe.as_matrix()

~/playspace/data-science-utils/src/sompy/sompy/__init__.py in <module>()
     28 
     29 
---> 30 from .sompy import SOMFactory
     31 from .visualization import *

~/playspace/data-science-utils/src/sompy/sompy/sompy.py in <module>()
     30 
     31 #lbugnon
---> 32 import sompy,ipdb
     33 #
     34 

ModuleNotFoundError: No module named 'ipdb'