Description

This tutorial demonstrates how to use the plotly extension for pandas provided by my module `bhishan`, which is available on any DataFrame through the `df.bp` accessor.

Imports

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import bhishan
from bhishan import bp
import matplotlib.pyplot as plt

%load_ext autoreload
%load_ext watermark

%autoreload 2
%watermark -a "Bhishan Poudel" -d -v -m
%watermark -iv
Bhishan Poudel 2020-09-28 

CPython 3.7.7
IPython 7.18.1

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit
numpy    1.18.4
pandas   1.1.0
autopep8 1.5.2
seaborn  0.11.0
bhishan  0.3.1
json     2.0.9

Using the plotly API in the module bhishan

In [2]:
# Optional: uncomment to list the example dataset names that `sns.load_dataset` accepts.
# print(sns.get_dataset_names())
In [3]:
# Load seaborn's example Titanic dataset; `df` is the frame used by every cell below.
df = sns.load_dataset('titanic')
# Preview the first rows to see the available columns (numeric, boolean, string and categorical).
df.head()
Out[3]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True

Correlation

In [4]:
# Correlation plot of `df` via the bhishan `bp` accessor.
# NOTE(review): `xrot=0` presumably sets the x tick-label rotation — confirm in bhishan.
df.bp.plot_corr(xrot=0)
In [5]:
# Same accessor, now passing the target column.
# NOTE(review): presumably restricts the plot to correlations with 'survived' — confirm in bhishan.
df.bp.plot_corr(target='survived')
In [6]:
target = 'survived'
df_few_cols = df[['age', 'fare']]

# Re-attach the target column to the feature subset by index alignment,
# then plot the correlations of the resulting three-column frame.
(
    df_few_cols
    .merge(df[target], left_index=True, right_index=True)
    .bp.plot_corr()
)
In [7]:
# Show the correlation matrix as a styled table; the output below lists the
# coefficients of the numeric/boolean columns rounded to two decimals.
df.bp.plot_corr_style()
Out[7]:
Hover to magify
survived pclass age sibsp parch fare adult_male alone
survived 1.00 -0.34 -0.08 -0.04 0.08 0.26 -0.56 -0.20
pclass -0.34 1.00 -0.37 0.08 0.02 -0.55 0.09 0.14
age -0.08 -0.37 1.00 -0.31 -0.19 0.10 0.28 0.20
sibsp -0.04 0.08 -0.31 1.00 0.41 0.16 -0.25 -0.58
parch 0.08 0.02 -0.19 0.41 1.00 0.22 -0.35 -0.58
fare 0.26 -0.55 0.10 0.16 0.22 1.00 -0.18 -0.27
adult_male -0.56 0.09 0.28 -0.25 -0.35 -0.18 1.00 0.40
alone -0.20 0.14 0.20 -0.58 -0.58 -0.27 0.40 1.00
In [8]:
# Disabled alternative correlation plot.
# NOTE(review): the name suggests a seaborn-based variant — confirm in bhishan before enabling.
# df.bp.plot_corr_sns()
In [9]:
# Highlight (salmon background) every coefficient whose magnitude exceeds 0.5,
# skipping the diagonal entries, which are exactly 1.
#
# The numeric/boolean columns are selected explicitly: a bare `df.corr()` only
# works when pandas silently drops the string/categorical columns, and
# pandas >= 2.0 raises on them instead. The selected columns are the same
# eight shown in the output below, so the result is unchanged on older pandas.
(
    df.select_dtypes(include=['number', 'bool'])
      .corr()
      .style.apply(
          lambda x: ["background: salmon" if (abs(v) > 0.5 and v != 1) else "" for v in x],
          axis=1,
      )
)
Out[9]:
survived pclass age sibsp parch fare adult_male alone
survived 1.000000 -0.338481 -0.077221 -0.035322 0.081629 0.257307 -0.557080 -0.203367
pclass -0.338481 1.000000 -0.369226 0.083081 0.018443 -0.549500 0.094035 0.135207
age -0.077221 -0.369226 1.000000 -0.308247 -0.189119 0.096067 0.280328 0.198270
sibsp -0.035322 0.083081 -0.308247 1.000000 0.414838 0.159651 -0.253586 -0.584471
parch 0.081629 0.018443 -0.189119 0.414838 1.000000 0.216225 -0.349943 -0.583398
fare 0.257307 -0.549500 0.096067 0.159651 0.216225 1.000000 -0.182024 -0.271832
adult_male -0.557080 0.094035 0.280328 -0.253586 -0.349943 -0.182024 1.000000 0.404744
alone -0.203367 0.135207 0.198270 -0.584471 -0.583398 -0.271832 0.404744 1.000000
In [10]:
# Report highly correlated feature pairs. The output below prints several column
# lists (including `cols_high_corr_drop`) and a table of pairs, all with |corr| > 0.5.
# NOTE(review): presumably `thr` is the magnitude cut-off and `disp=True` enables the
# printed lists/table — confirm in bhishan.
df.bp.corr_high(thr=0.5,disp=True)
cols_high_corr = ['sibsp', 'pclass', 'adult_male', 'survived', 'fare', 'alone', 'parch']
cols_high_corr1 = ['fare', 'survived', 'parch', 'alone']
cols_high_corr2 = ['pclass', 'adult_male', 'alone', 'sibsp']
cols_high_corr_drop = ['fare', 'parch', 'survived']
feature1 feature2 corr
0 fare pclass -0.549500
1 survived adult_male -0.557080
2 parch alone -0.583398
3 alone sibsp -0.584471
In [11]:
# Copied by hand from the `cols_high_corr_drop` list printed by `corr_high(thr=0.5)` above.
# NOTE(review): the list contains 'survived', which other cells use as the target
# column — confirm dropping it is intended before reusing this list for modelling.
cols_high_corr_drop = ['fare', 'parch', 'survived']
In [12]:
# Remove one column from each highly correlated pair, then recompute the
# correlation matrix of what is left to check the strong pairs are gone.
df1 = df.drop(columns=cols_high_corr_drop)

# Select numeric/boolean columns explicitly: a bare `.corr()` depends on pandas
# silently dropping string/categorical columns, which pandas >= 2.0 no longer does.
df1.select_dtypes(include=['number', 'bool']).corr()
Out[12]:
pclass age sibsp adult_male alone
pclass 1.000000 -0.369226 0.083081 0.094035 0.135207
age -0.369226 1.000000 -0.308247 0.280328 0.198270
sibsp 0.083081 -0.308247 1.000000 -0.253586 -0.584471
adult_male 0.094035 0.280328 -0.253586 1.000000 0.404744
alone 0.135207 0.198270 -0.584471 0.404744 1.000000

Partial Correlation

In [13]:
# Partial correlation matrix; the output below covers the six numeric columns
# survived, pclass, age, sibsp, parch and fare.
# NOTE(review): the effect of `thr=0.3` and `disp=True` is not visible from the
# output alone — confirm their meaning in bhishan.
df.bp.partial_corr(thr=0.3,disp=True)
Out[13]:
survived pclass age sibsp parch fare
survived 1.000000 0.115371 0.111542 -0.028674 0.082447 0.301210
pclass 0.115371 1.000000 0.571910 0.306844 0.173285 -0.315652
age 0.111542 0.571910 1.000000 -0.192800 -0.095105 0.392652
sibsp -0.028674 0.306844 -0.192800 1.000000 0.324787 0.211403
parch 0.082447 0.173285 -0.095105 0.324787 1.000000 0.199136
fare 0.301210 -0.315652 0.392652 0.211403 0.199136 1.000000
In [14]:
# Partial correlation restricted to two columns. With no other column in the
# selection, the result below (0.096067) matches the plain fare/age correlation
# shown earlier by `df.corr()`.
df.bp.partial_corr(['fare','age'])
Out[14]:
fare age
fare 1.000000 0.096067
age 0.096067 1.000000

Outliers

In [15]:
# Flag outliers in 'age'. The output below lists the flagged rows by their index
# label in `df` together with their 'age' value (all 65 or older).
# NOTE(review): the method name suggests Tukey's IQR fences — confirm in bhishan.
ser_outliers = df.bp.outliers_tukey('age')
ser_outliers
Out[15]:
age
33 66.0
54 65.0
96 71.0
116 70.5
280 65.0
456 65.0
493 71.0
630 80.0
672 70.0
745 70.0
851 74.0
In [16]:
col = 'age'

# Work on the rows that actually have a value in `col`, renumbered 0..n-1.
# NOTE(review): the reset presumably keeps the indices returned by `outliers_kde`
# aligned with `.loc` labels — confirm in bhishan.
df1 = (
    df.dropna(subset=[col])
      .reset_index(drop=True)
)

# `outliers_kde` returns a pair: the outlier indices and the outlier values.
idx_outliers, val_outliers = df1.bp.outliers_kde(col)

# Display the flagged rows, keeping only the inspected column.
df1.loc[idx_outliers, [col]]
Out[16]:
age
498 80.0
679 74.0
74 71.0
393 71.0
91 70.5
531 70.0
592 70.0
25 66.0
366 65.0
225 65.0
40 65.0
351 64.0
433 64.0
385 63.0
221 63.0

Kernel Density

In [17]:
# Kernel-density comparison of 'age' with respect to 'survived'.
# NOTE(review): presumably draws one 'age' density per 'survived' class
# — confirm the argument roles in bhishan.
df.bp.compare_kde_binn('age','survived')
In [ ]: