In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import plotly.graph_objs as go
from sklearn.cluster import KMeans
import warnings
import os
warnings.filterwarnings("ignore")
py.offline.init_notebook_mode(connected = True)
#print(os.listdir(""))

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-4-6d6b15785353> in <module>()
3 import matplotlib.pyplot as plt
4 import seaborn as sns
----> 5 import plotly as py
6 import plotly.graph_objs as go
7 from sklearn.cluster import KMeans

ModuleNotFoundError: No module named 'plotly'

## Resumen¶

El informe corresponde a una segmentaciÃ³n de clientes basado en una lista de correos a manera de clasificarlos y analizar su comportamiento

In [6]:
df = pd.read_csv(r'Mall_Customers.csv')

Out[6]:
CustomerID Gender Age Annual Income (k$) Spending Score (1-100) 0 1 Male 19 15 39 1 2 Male 21 15 81 2 3 Female 20 16 6 3 4 Female 23 16 77 4 5 Female 31 17 40 In [7]: df.shape  Out[7]: (200, 5) In [8]: df.describe()  Out[8]: CustomerID Age Annual Income (k$) Spending Score (1-100)
count 200.000000 200.000000 200.000000 200.000000
mean 100.500000 38.850000 60.560000 50.200000
std 57.879185 13.969007 26.264721 25.823522
min 1.000000 18.000000 15.000000 1.000000
25% 50.750000 28.750000 41.500000 34.750000
50% 100.500000 36.000000 61.500000 50.000000
75% 150.250000 49.000000 78.000000 73.000000
max 200.000000 70.000000 137.000000 99.000000
In [9]:
df.dtypes

Out[9]:
CustomerID                 int64
Gender                    object
Age                        int64
Annual Income (k$) int64 Spending Score (1-100) int64 dtype: object In [10]: df.isnull().sum()  Out[10]: CustomerID 0 Gender 0 Age 0 Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64
In [11]:
plt.style.use('fivethirtyeight')

In [12]:
plt.figure(1 , figsize = (15 , 6))
n = 0
for x in ['Age' , 'Annual Income (k$)' , 'Spending Score (1-100)']: n += 1 plt.subplot(1 , 3 , n) plt.subplots_adjust(hspace =0.5 , wspace = 0.5) sns.distplot(df[x] , bins = 20) plt.title('Distplot of {}'.format(x)) plt.show()  C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\matplotlib\axes\_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg. warnings.warn("The 'normed' kwarg is deprecated, and has been " C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\scipy\stats\stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use arr[tuple(seq)] instead of arr[seq]. In the future this will be interpreted as an array index, arr[np.array(seq)], which will result either in an error or a different result. return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\matplotlib\axes\_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg. warnings.warn("The 'normed' kwarg is deprecated, and has been " C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\matplotlib\axes\_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg. warnings.warn("The 'normed' kwarg is deprecated, and has been "  In [13]: plt.figure(1 , figsize = (15 , 5)) sns.countplot(y = 'Gender' , data = df) plt.show()  In [26]: plt.figure(1 , figsize = (15 , 7)) n = 0 for x in ['Age' , 'Annual Income (k$)' , 'Spending Score (1-100)']:
for y in ['Age' , 'Annual Income (k$)' , 'Spending Score (1-100)']: n += 1 plt.subplot(3 , 3 , n) plt.subplots_adjust(hspace = 0.5 , wspace = 0.5) sns.regplot(x = x , y = y , data = df) plt.ylabel(y.split()[0]+' '+y.split()[1] if len(y.split()) > 1 else y ) plt.show()  C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\scipy\stats\stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use arr[tuple(seq)] instead of arr[seq]. In the future this will be interpreted as an array index, arr[np.array(seq)], which will result either in an error or a different result. return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval  In [15]: plt.figure(1 , figsize = (15 , 6)) for gender in ['Male' , 'Female']: plt.scatter(x = 'Age' , y = 'Annual Income (k$)' , data = df[df['Gender'] == gender] ,
s = 200 , alpha = 0.5 , label = gender)
plt.xlabel('Age'), plt.ylabel('Annual Income (k$)') plt.title('Age vs Annual Income w.r.t Gender') plt.legend() plt.show()  In [16]: plt.figure(1 , figsize = (15 , 6)) for gender in ['Male' , 'Female']: plt.scatter(x = 'Annual Income (k$)',y = 'Spending Score (1-100)' ,
data = df[df['Gender'] == gender] ,s = 200 , alpha = 0.5 , label = gender)
plt.xlabel('Annual Income (k$)'), plt.ylabel('Spending Score (1-100)') plt.title('Annual Income vs Spending Score w.r.t Gender') plt.legend() plt.show()  In [17]: plt.figure(1 , figsize = (15 , 7)) n = 0 for cols in ['Age' , 'Annual Income (k$)' , 'Spending Score (1-100)']:
n += 1
plt.subplot(1 , 3 , n)
plt.subplots_adjust(hspace = 0.5 , wspace = 0.5)
sns.violinplot(x = cols , y = 'Gender' , data = df , palette = 'vlag')
sns.swarmplot(x = cols , y = 'Gender' , data = df)
plt.ylabel('Gender' if n == 1 else '')
plt.title('Boxplots & Swarmplots' if n == 2 else '')
plt.show()

C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\scipy\stats\stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use arr[tuple(seq)] instead of arr[seq]. In the future this will be interpreted as an array index, arr[np.array(seq)], which will result either in an error or a different result.
return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval

In [ ]: