In [1]:

# Project-local helpers from the datascienceutils package
from datascienceutils import clusteringModels as cm
from datascienceutils import analyze

# Standard library (json, datetime), the IPython matplotlib magic, and
# third-party numeric packages (numpy, pandas)
import json
%matplotlib inline
import datetime
import numpy as np
import pandas as pd

# Bokeh plotting helpers.
# NOTE(review): bokeh.charts has been removed from later bokeh releases (it
# moved to the separate bkcharts package), so the Histogram import will fail
# on an upgraded environment; Histogram is not used in the visible cells.
from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource
from bokeh.charts import Histogram
import bokeh
# Configure bokeh to render its plots inside notebook output cells.
output_notebook()

/home/anand/playspace/data-science-utils/.eggs/statsmodels-0.8.0-py3.6-linux-x86_64.egg/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
/home/anand/anaconda3/envs/analytics/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

BokehJS successfully loaded.

In [2]:

# Load the Iris data set from a CSV file under the current user's home
# directory (pandas expands the leading "~").
irisDf = pd.read_csv(
    filepath_or_buffer='~/DataScientist/data/Iris.csv',
)

In [ ]:

# Dimension analysis of the full frame (species label still present) with
# clustering switched on and three clusters requested.
analyze.dimension_analyze(
    irisDf,
    cluster=True,
    n_clusters=3,
)

In [ ]:

# Same dimension analysis helper, this time asking for the PCA plot.
analyze.dimension_analyze(
    irisDf,
    pca_plot=True,
)

In [ ]:

# Preview the first five rows (the default row count, spelled out).
irisDf.head(n=5)

Out[ ]:

	SepalLengthCm	SepalWidthCm	PetalLengthCm	PetalWidthCm	Class
0	5.1	3.5	1.4	0.2	Iris-setosa
1	4.9	3.0	1.4	0.2	Iris-setosa
2	4.7	3.2	1.3	0.2	Iris-setosa
3	4.6	3.1	1.5	0.2	Iris-setosa
4	5.0	3.6	1.4	0.2	Iris-setosa

In [4]:

# Split the species label off from the measurements so the clustering cells
# below receive only the feature columns.
#
# The guard keeps the cell safe to re-run: once 'Class' has been removed, a
# second execution leaves `target` and `irisDf` as they are instead of
# failing on the missing column.
if 'Class' in irisDf.columns:
    target = irisDf['Class']
    # `axis` is passed by keyword: the positional form drop('Class', 1) is
    # rejected by newer pandas releases. Rebinding the name (rather than
    # inplace=True) avoids mutating the frame that earlier cells displayed.
    irisDf = irisDf.drop('Class', axis=1)

In [ ]:

# Run the project's cluster_analyze helper on irisDf; by this point the
# 'Class' column has been dropped, so only the feature columns are passed.
cm.cluster_analyze(irisDf)

In [5]:

# Silhouette analysis of the feature columns using KMeans.
# NOTE(review): 'KMeans' is mixed-case while the other cluster_type values
# in this notebook are lowercase - confirm which spellings the helper accepts.
cm.silhouette_analyze(
    irisDf,
    cluster_type='KMeans',
)

For clusters = 2 The average silhouette_score is : 0.363119994814

For clusters = 4 The average silhouette_score is : 0.753523569925

For clusters = 6 The average silhouette_score is : 0.622085988116

In [4]:

# Silhouette analysis of the feature columns using DBSCAN.
# NOTE(review): the recorded run printed the same score for 4 and 6
# clusters - check whether the cluster count actually reaches this algorithm.
cm.silhouette_analyze(
    irisDf,
    cluster_type='dbscan',
)

For clusters = 2 The average silhouette_score is : 0.485842354601

For clusters = 4 The average silhouette_score is : 0.607665314687

For clusters = 6 The average silhouette_score is : 0.607665314687

In [ ]:

# Silhouette analysis of the feature columns using spectral clustering.
cm.silhouette_analyze(
    irisDf,
    cluster_type='spectral',
)

In [6]:

# Silhouette analysis of the feature columns using Birch.
cm.silhouette_analyze(
    irisDf,
    cluster_type='birch',
)

For clusters = 2 The average silhouette_score is : 0.501699257107

For clusters = 4 The average silhouette_score is : 0.57351529012

For clusters = 6 The average silhouette_score is : 0.659228815789

In [7]:

# Self-organising-map analysis of the feature columns on a 10 x 10 map.
# NOTE(review): the saved run of this cell failed with
# "ModuleNotFoundError: No module named 'ipdb'" because sompy imports ipdb
# at module load; install ipdb in the kernel environment (or drop that debug
# import from sompy) before re-running.
cm.som_analyze(
    irisDf,
    mapsize=(10, 10),
    algo_type='som',
)

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-7-afbf7e8103c0> in <module>()
----> 1 cm.som_analyze(irisDf, (10,10), algo_type='som')

~/playspace/data-science-utils/datascienceutils/clusteringModels.py in som_analyze(dataframe, mapsize, algo_type)
    163 
    164 def som_analyze(dataframe, mapsize, algo_type='som'):
--> 165     import sompy
    166     som_factory = sompy.SOMFactory()
    167     data = dataframe.as_matrix()

~/playspace/data-science-utils/src/sompy/sompy/__init__.py in <module>()
     28 
     29 
---> 30 from .sompy import SOMFactory
     31 from .visualization import *

~/playspace/data-science-utils/src/sompy/sompy/sompy.py in <module>()
     30 
     31 #lbugnon
---> 32 import sompy,ipdb
     33 #
     34 

ModuleNotFoundError: No module named 'ipdb'