Description¶

This tutorial shows how to use my module `bhishan`, which provides a plotly-based extension for pandas DataFrames.

Imports¶

In [1]:

import numpy as np
import pandas as pd
import seaborn as sns
import bhishan
from bhishan import bp
import matplotlib.pyplot as plt

# Load the IPython extensions used in this notebook.
# (Comments are kept on their own lines: text after a line magic is parsed as its argument.)
%load_ext autoreload
%load_ext watermark

# Mode 2: reload modules before running each cell, so edits to `bhishan` are picked up live.
%autoreload 2
# Record author, date, Python/IPython versions and machine details for reproducibility ...
%watermark -a "Bhishan Poudel" -d -v -m
# ... followed by the versions of the imported packages.
%watermark -iv

Bhishan Poudel 2020-09-28 

CPython 3.7.7
IPython 7.18.1

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit
numpy    1.18.4
pandas   1.1.0
autopep8 1.5.2
seaborn  0.11.0
bhishan  0.3.1
json     2.0.9

Using plotly api in module bhishan¶

In [2]:

# Disabled helper: uncomment to print the dataset names that seaborn can load.
# print(sns.get_dataset_names())

In [3]:

# Load seaborn's 'titanic' example dataset into `df`; every later cell works on this frame.
df = sns.load_dataset('titanic')
# Preview the first rows.
df.head()

Out[3]:

	survived	pclass	sex	age	sibsp	fare	embarked	class	who	adult_male	deck	embark_town	alive	alone
0	0	3	male	22.0	1	7.2500	S	Third	man	True	NaN	Southampton	no	False
1	1	1	female	38.0	1	71.2833	C	First	woman	False	C	Cherbourg	yes	False
2	1	3	female	26.0	0	7.9250	S	Third	woman	False	NaN	Southampton	yes	True
3	1	1	female	35.0	1	53.1000	S	First	woman	False	C	Southampton	yes	False
4	0	3	male	35.0	0	8.0500	S	Third	man	True	NaN	Southampton	no	True

Correlation¶

In [4]:

# Correlation plot for `df` through the `bp` accessor (imported from `bhishan`).
# NOTE(review): `xrot=0` presumably sets the x tick-label rotation — confirm in bhishan.
df.bp.plot_corr(xrot=0)

In [5]:

# Same accessor call, now passing a target column.
# NOTE(review): presumably limits the plot to correlations with 'survived' — confirm in bhishan.
df.bp.plot_corr(target='survived')

In [6]:

# Correlation plot for a hand-picked feature subset plus the target column.
target = 'survived'
df_few_cols = df[['age', 'fare']]

# Re-attach the target by aligning on the row index, then plot.
(
    df_few_cols
    .merge(df[target], left_index=True, right_index=True)
    .bp.plot_corr()
)

In [7]:

# Styled-table view of the correlation matrix; the rendered output is a table rounded to two decimals.
df.bp.plot_corr_style()

Out[7]:

Hover to magify
	survived	pclass	age	sibsp	parch	fare	adult_male	alone
survived	1.00	-0.34	-0.08	-0.04	0.08	0.26	-0.56	-0.20
pclass	-0.34	1.00	-0.37	0.08	0.02	-0.55	0.09	0.14
age	-0.08	-0.37	1.00	-0.31	-0.19	0.10	0.28	0.20
sibsp	-0.04	0.08	-0.31	1.00	0.41	0.16	-0.25	-0.58
parch	0.08	0.02	-0.19	0.41	1.00	0.22	-0.35	-0.58
fare	0.26	-0.55	0.10	0.16	0.22	1.00	-0.18	-0.27
adult_male	-0.56	0.09	0.28	-0.25	-0.35	-0.18	1.00	0.40
alone	-0.20	0.14	0.20	-0.58	-0.58	-0.27	0.40	1.00

In [8]:

# Disabled call. NOTE(review): presumably the seaborn-based variant of plot_corr — confirm before re-enabling.
# df.bp.plot_corr_sns()

In [9]:

def highlight_strong_corr(row, thr=0.5):
    """Return one CSS string per value in `row`, shading strong correlations.

    A value gets a salmon background when its absolute value exceeds `thr`,
    unless it is exactly 1 (the self-correlations on the diagonal). Every
    other value, including NaN, gets an empty style.
    """
    return ["background: salmon" if (abs(v) > thr and v != 1) else "" for v in row]


# Pick the numeric and boolean columns explicitly before calling .corr().
# Older pandas silently skipped string/category columns, but pandas >= 2.0
# raises on them; this selection yields the same columns on both.
(
    df.select_dtypes(include=['number', 'bool'])
      .corr()
      .style.apply(highlight_strong_corr, axis=1)
)

Out[9]:

	survived	pclass	age	sibsp	parch	fare	adult_male	alone
survived	1.000000	-0.338481	-0.077221	-0.035322	0.081629	0.257307	-0.557080	-0.203367
pclass	-0.338481	1.000000	-0.369226	0.083081	0.018443	-0.549500	0.094035	0.135207
age	-0.077221	-0.369226	1.000000	-0.308247	-0.189119	0.096067	0.280328	0.198270
sibsp	-0.035322	0.083081	-0.308247	1.000000	0.414838	0.159651	-0.253586	-0.584471
parch	0.081629	0.018443	-0.189119	0.414838	1.000000	0.216225	-0.349943	-0.583398
fare	0.257307	-0.549500	0.096067	0.159651	0.216225	1.000000	-0.182024	-0.271832
adult_male	-0.557080	0.094035	0.280328	-0.253586	-0.349943	-0.182024	1.000000	0.404744
alone	-0.203367	0.135207	0.198270	-0.584471	-0.583398	-0.271832	0.404744	1.000000

In [10]:

# Report highly correlated feature pairs; with thr=0.5 the output lists four pairs
# and prints the `cols_high_corr*` helper lists.
# NOTE(review): the threshold appears to apply to |corr| (every listed value is below -0.5) — confirm in bhishan.
df.bp.corr_high(thr=0.5,disp=True)

cols_high_corr = ['sibsp', 'pclass', 'adult_male', 'survived', 'fare', 'alone', 'parch']
cols_high_corr1 = ['fare', 'survived', 'parch', 'alone']
cols_high_corr2 = ['pclass', 'adult_male', 'alone', 'sibsp']
cols_high_corr_drop = ['fare', 'parch', 'survived']

	feature1	feature2	corr
0	fare	pclass	-0.549500
1	survived	adult_male	-0.557080
2	parch	alone	-0.583398
3	alone	sibsp	-0.584471

In [11]:

# Columns to drop, copied from the `cols_high_corr_drop` line printed by corr_high.
cols_high_corr_drop = [
    "fare",
    "parch",
    "survived",
]

In [12]:

# Remove one feature from each highly correlated pair. The result gets its own
# descriptive name because `df1` is rebound to an unrelated frame in the
# outlier section further down.
df_low_corr = df.drop(columns=cols_high_corr_drop)

# Restrict to numeric/boolean columns explicitly: pandas >= 2.0 no longer skips
# string/category columns in .corr() and raises instead.
df_low_corr.select_dtypes(include=['number', 'bool']).corr()

Out[12]:

	pclass	age	sibsp	adult_male	alone
pclass	1.000000	-0.369226	0.083081	0.094035	0.135207
age	-0.369226	1.000000	-0.308247	0.280328	0.198270
sibsp	0.083081	-0.308247	1.000000	-0.253586	-0.584471
adult_male	0.094035	0.280328	-0.253586	1.000000	0.404744
alone	0.135207	0.198270	-0.584471	0.404744	1.000000

Partial Correlation¶

In [13]:

# Partial-correlation matrix computed by the `bp` accessor.
# NOTE(review): the effect of `thr=0.3` / `disp=True` cannot be read off the output — confirm in bhishan.
df.bp.partial_corr(thr=0.3,disp=True)

Out[13]:

	survived	pclass	age	sibsp	parch	fare
survived	1.000000	0.115371	0.111542	-0.028674	0.082447	0.301210
pclass	0.115371	1.000000	0.571910	0.306844	0.173285	-0.315652
age	0.111542	0.571910	1.000000	-0.192800	-0.095105	0.392652
sibsp	-0.028674	0.306844	-0.192800	1.000000	0.324787	0.211403
parch	0.082447	0.173285	-0.095105	0.324787	1.000000	0.199136
fare	0.301210	-0.315652	0.392652	0.211403	0.199136	1.000000

In [14]:

# Partial correlation restricted to the listed columns.
# NOTE(review): the 0.096067 shown for this two-column case equals the plain
# age/fare correlation in Out[9] — presumably because no other columns are left to control for.
df.bp.partial_corr(['fare','age'])

Out[14]:

	fare	age
fare	1.000000	0.096067
age	0.096067	1.000000

Outliers¶

In [15]:

# Flag outliers in 'age' using the accessor's Tukey method (as named); the output is keyed by row label.
# NOTE(review): the variable is named `ser_*`, but the output renders with a column header like a DataFrame — confirm the return type.
ser_outliers = df.bp.outliers_tukey('age')
ser_outliers

Out[15]:

	age
33	66.0
54	65.0
96	71.0
116	70.5
280	65.0
456	65.0
493	71.0
630	80.0
672	70.0
745	70.0
851	74.0

In [16]:

col = 'age'

# Keep only rows where `col` is present and renumber them from 0, so the labels
# shown below refer to this filtered frame rather than to the original `df`.
df1 = (
    df
    .dropna(subset=[col])
    .reset_index(drop=True)
)

# outliers_kde hands back a pair: the outlier row labels and their values.
idx_outliers, val_outliers = df1.bp.outliers_kde(col)

# Display the flagged rows, restricted to the inspected column.
df1.loc[idx_outliers, [col]]

Out[16]:

	age
498	80.0
679	74.0
74	71.0
393	71.0
91	70.5
531	70.0
592	70.0
25	66.0
366	65.0
225	65.0
40	65.0
351	64.0
433	64.0
385	63.0
221	63.0

Kernel Density¶

In [17]:

# Kernel-density comparison for 'age'.
# NOTE(review): presumably one density per value of the binary 'survived' column — confirm in bhishan.
df.bp.compare_kde_binn('age','survived')

In [ ]:

Table of Contents