Description

This is an example tutorial showing how to use my module bhishan, which provides the plotly extension for pandas.

Imports

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%load_ext autoreload
%load_ext watermark

%autoreload 2
%watermark -a "Bhishan Poudel" -d -v -m
%watermark -iv
Bhishan Poudel 2020-09-28 

CPython 3.7.7
IPython 7.18.1

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit
pandas   1.1.0
bhishan  0.3.1
autopep8 1.5.2
seaborn  0.11.0
json     2.0.9
numpy    1.18.4

In [ ]:
# my local library
import os
import sys

# Folder that contains the local `bhishan` package. The default is the
# author's machine; set the BHISHAN_MODULES_DIR environment variable to
# point elsewhere without editing the notebook.
BHISHAN_MODULES_DIR = os.environ.get(
    "BHISHAN_MODULES_DIR", "/Users/poudel/Dropbox/a00_Bhishan_Modules/"
)

# Register both the parent folder and the package folder, skipping entries
# that are already present so re-running this cell does not grow sys.path.
for _module_dir in (BHISHAN_MODULES_DIR, os.path.join(BHISHAN_MODULES_DIR, "bhishan")):
    if _module_dir not in sys.path:
        sys.path.append(_module_dir)

# presumably importing `bp` is what makes the `df.bp` accessor used in the
# cells below available — TODO confirm against the bhishan package
from bhishan import bp

Using the plotly API in the bhishan module

In [2]:
# print(sns.get_dataset_names())
In [3]:
# Load seaborn's example 'titanic' dataset and preview its first rows.
df = sns.load_dataset('titanic')
df.head()
Out[3]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True
In [4]:
# Frequency table for the 'embarked' column: the output lists count, percent
# and their cumulative values for each category.
df.bp.freq(['embarked'],style=True)
Out[4]:
embarked Count Percent Cumulative Count Cumulative Percent
0 S 644 72.44% 644 72.44%
1 C 168 18.90% 812 91.34%
2 Q 77 8.66% 889 100.00%
In [5]:
# Per-feature summary: the output shows dtype, counts of unique/missing/zero
# values and, where available, descriptive statistics (mean, std, quartiles).
df.bp.describe()
Out[5]:
Feature Type N Count Unique Missing MissingPct Zeros ZerosPct mean std min max 25% 50% 75%
11 deck category 891 203 7 688 77.22 0 0.00
3 age float64 891 714 88 177 19.87 0 0.00 29.70 14.53 0.42 80.00 20.12 28.00 38.00
7 embarked object 891 889 3 2 0.22 0 0.00
12 embark_town object 891 889 3 2 0.22 0 0.00
5 parch int64 891 891 7 0 0.00 678 76.09 0.38 0.81 0.00 6.00 0.00 0.00 0.00
4 sibsp int64 891 891 7 0 0.00 608 68.24 0.52 1.10 0.00 8.00 0.00 0.00 1.00
0 survived int64 891 891 2 0 0.00 549 61.62 0.38 0.49 0.00 1.00 0.00 0.00 1.00
10 adult_male bool 891 891 2 0 0.00 354 39.73
14 alone bool 891 891 2 0 0.00 354 39.73
6 fare float64 891 891 248 0 0.00 15 1.68 32.20 49.69 0.00 512.33 7.91 14.45 31.00
1 pclass int64 891 891 3 0 0.00 0 0.00 2.31 0.84 1.00 3.00 2.00 3.00 3.00
2 sex object 891 891 2 0 0.00 0 0.00
8 class category 891 891 3 0 0.00 0 0.00
9 who object 891 891 3 0 0.00 0 0.00
13 alive object 891 891 2 0 0.00 0 0.00
In [6]:
# Look for duplicated columns in the titanic frame; the result here is an
# empty list, i.e. none were found.
df.bp.get_duplicate_columns()
Out[6]:
[]
In [7]:
# Column dtypes of the titanic frame.
# NOTE(review): 'class' and 'deck' print as object here, while the describe()
# output earlier in the notebook reports them as category — check whether an
# earlier bp call changes dtypes in place or the outputs come from different runs.
df.dtypes
Out[7]:
survived         int64
pclass           int64
sex             object
age            float64
sibsp            int64
parch            int64
fare           float64
embarked        object
class           object
who             object
adult_male        bool
deck            object
embark_town     object
alive           object
alone             bool
dtype: object
In [8]:
# Small toy frame in which 'a_dup' carries exactly the same values as 'a'.
df1 = pd.DataFrame(
    {
        'a': range(3),
        'b': range(1, 4),
        'a_dup': range(3),
    }
)
df1
Out[8]:
a b a_dup
0 0 1 0
1 1 2 1
2 2 3 2
In [9]:
# Same check on the toy frame: 'a_dup' matches 'a', so the call prints the
# matching pair and returns ['a_dup'].
df1.bp.get_duplicate_columns()
a == a_dup
Out[9]:
['a_dup']
In [10]:
# Report on features with missing values: the printed text splits them into
# high/low groups around a threshold (80% in this run) and the returned table
# lists the per-feature missing counts and percentages.
df.bp.missing()
Missing values high threshold = 80%

Number of missing values features: 4
cols_missing_high = []
cols_missing_low = ['deck', 'age', 'embarked', 'embark_town']
Out[10]:
Feature Type Count Missing Zeros Unique MissingPct ZerosPct count mean std min 25% 50% 75% max
11 deck object 891 688 0 7 77.216611 0.000000
3 age float64 891 177 0 88 19.865320 0.000000 714.000000 29.699118 14.526497 0.420000 20.125000 28.000000 38.000000 80.000000
7 embarked object 891 2 0 3 0.224467 0.000000
12 embark_town object 891 2 0 3 0.224467 0.000000
In [ ]: