import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# from beakerx import *
# Silence all warnings for the rest of the session so library notices do not
# clutter the cell outputs (this also hides deprecation warnings).
warnings.filterwarnings('ignore')
# IPython/Jupyter magic (not plain Python): render matplotlib figures inline.
%matplotlib inline
# Keep a handle on seaborn's current colour palette; it is not referenced again
# in the visible part of this notebook.
color = sns.color_palette()
The dataset used here is taken from Kaggle Datasets: https://www.kaggle.com/sekarmg/pokemon
# Load the Pokemon stats table; the path is relative to the notebook's working
# directory.
data = pd.read_csv('data/pokemon/pokemon.csv')
# Preview the first rows to sanity-check how the columns were parsed.
data.head()
# | Name | Type 1 | Type 2 | HP | Attack | Defense | Sp. Atk | Sp. Def | Speed | Generation | Legendary | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Bulbasaur | Grass | Poison | 45 | 49 | 49 | 65 | 65 | 45 | 1 | False |
1 | 2 | Ivysaur | Grass | Poison | 60 | 62 | 63 | 80 | 80 | 60 | 1 | False |
2 | 3 | Venusaur | Grass | Poison | 80 | 82 | 83 | 100 | 100 | 80 | 1 | False |
3 | 4 | Mega Venusaur | Grass | Poison | 80 | 100 | 123 | 122 | 120 | 80 | 1 | False |
4 | 5 | Charmander | Fire | NaN | 39 | 52 | 43 | 60 | 50 | 65 | 1 | False |
# Show the column labels of the loaded frame.
data.columns
Index(['#', 'Name', 'Type 1', 'Type 2', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed', 'Generation', 'Legendary'], dtype='object')
# Show the frame's dimensions as (rows, columns).
data.shape
(800, 12)
# Count missing values per column.
data.isnull().sum()
# 0 Name 1 Type 1 0 Type 2 386 HP 0 Attack 0 Defense 0 Sp. Atk 0 Sp. Def 0 Speed 0 Generation 0 Legendary 0 dtype: int64
# Count distinct values per column.
data.nunique()
# 800 Name 799 Type 1 18 Type 2 18 HP 94 Attack 111 Defense 103 Sp. Atk 105 Sp. Def 92 Speed 108 Generation 6 Legendary 2 dtype: int64
lmplot()
and regplot()
We'll try to observe the relationship between two continuous variables: Attack and Defense. We can also differentiate based on a categorical variable, i.e. whether the pokemon is legendary or not. For this we can use lmplot()
or regplot()
# Scatter of Attack vs. Defense, coloured by Legendary status.
# The plt.figure() call below is intentionally left disabled: lmplot returns a
# FacetGrid (see the cell output), i.e. it manages its own figure.
# plt.figure(figsize=(14,6))
sns.set_style('whitegrid')
sns.lmplot(
x="Attack",
y="Defense",
data=data,
# fit_reg=False: draw only the points, no regression line.
fit_reg=False,
hue='Legendary',
palette="Set1")
<seaborn.axisgrid.FacetGrid at 0x1a704f7780>
We can see clearly that legendary pokemons have both high defense and attack
# Attack vs. Defense for all pokemon, with a fitted regression line.
sns.set_style('darkgrid')  # changes the background style of subsequent plots
plt.figure(figsize=(14, 6))
sns.regplot(
x="Attack", y="Defense", data=data,
fit_reg=True)  # fit_reg=True overlays a fitted regression line
<matplotlib.axes._subplots.AxesSubplot at 0x1a3e363828>
The relationship between Attack and Defense seems to be linear, but there are a few outliers too
We can make faceted plots where we can segment plots based on another categorical variable: Generation
in this case
# Small multiples: one Attack-vs-Defense scatter per Generation, coloured by
# Legendary status.
# lmplot returns a FacetGrid and builds its own figure, so no plt.figure() call
# precedes it -- the earlier call only left an empty, unused figure behind.
# Panel dimensions are controlled through `size` and `aspect` instead.
sns.set_style('whitegrid')
sns.lmplot(
    x="Attack",
    y="Defense",
    data=data,
    fit_reg=False,  # points only, no regression line
    hue='Legendary',
    col="Generation",
    aspect=0.4,
    # NOTE(review): seaborn >= 0.9 renamed `size` to `height`; switch the
    # keyword if the environment is upgraded.
    size=10)
<seaborn.axisgrid.FacetGrid at 0x1a44f4da90>
<matplotlib.figure.Figure at 0x1a44f38550>
We can also plot a continuous variable against a categorical column. Below we look at the relationship between Speed
and Legendary
status
# Speed against the Legendary flag, with seaborn's default regression fit.
plt.figure(figsize=(14, 6))
sns.set_style('whitegrid')
sns.regplot(x="Legendary", y="Speed", data=data)
<matplotlib.axes._subplots.AxesSubplot at 0x1a3eb0e828>
One issue with this plot is that we cannot see the distribution of Speed within each Legendary category because the points overlap. This can be fixed with the x_jitter option, which spreads the points out horizontally
# Same plot as the previous cell, but with x_jitter so that points sharing an
# x value no longer sit exactly on top of each other.
plt.figure(figsize=(14, 6))
sns.set_style("ticks")
sns.regplot(x="Legendary", y="Speed", data=data, x_jitter=0.3)
<matplotlib.axes._subplots.AxesSubplot at 0x1a57737cf8>
We can also fit a logistic relationship
# Fit a logistic curve of the Legendary flag (y) against Attack (x).
# NOTE(review): logistic=True presumably needs statsmodels installed -- confirm.
plt.figure(figsize=(14, 6))
sns.set_style("ticks")
sns.regplot(x="Attack", y="Legendary", data=data, logistic=True)
<matplotlib.axes._subplots.AxesSubplot at 0x1a46146080>
distplot()
and kdeplot()
This series of plots is good for observing the distributions of variables
# Histogram of Attack; kde=False turns off the density curve.
plt.figure(figsize=(12, 6))
ax = sns.distplot(data['Attack'], kde=False)
# distplot returns the Axes it drew on, which is used here to set the title.
ax.set_title('Attack')
Text(0.5,1,'Attack')
Distribution of Attack
seems to be close to normal
kde = True
option tries to estimate the density based on gaussian kernel
# Histogram of Defense with a kernel density estimate overlaid.
plt.figure(figsize=(12, 6))
ax = sns.distplot(
data['Defense'], kde=True,
# NOTE(review): seaborn documents norm_hist as implied whenever a KDE is drawn,
# so norm_hist=False probably has no effect together with kde=True -- confirm.
norm_hist=False) #norm_hist normalizes the count
ax.set_title('Defense')
plt.show()
Defense
seems to have thinner tails and values are more centered around the mean
# Distribution of Speed; rug=True additionally requests a rug of per-observation
# tick marks.
plt.figure(figsize=(12, 6))
ax = sns.distplot(data['Speed'], rug=True)
ax.set_title('Speed')
plt.show()
We can also just use kdeplot()
if we are only interested in the density function
# Density estimate of HP only (no histogram), drawn in green with the area
# under the curve shaded.
plt.figure(figsize=(12, 6))
ax = sns.kdeplot(data['HP'], shade=True, color='g')
ax.set_title('HP')
plt.show()
Other ways to visualize distributions are stripplot()
and boxplot()
# One point per pokemon along the HP axis (no grouping variable).
plt.figure(figsize=(12, 6))
sns.stripplot(
y='HP', data=data, jitter=0.1,
color='g')  # jitter spreads the points sideways so they overlap less
<matplotlib.axes._subplots.AxesSubplot at 0x1a47b59080>
# Single box plot of Speed across all pokemon; width sets the box width.
plt.figure(figsize=(12, 6))
sns.boxplot(y='Speed', data=data, width=.6)
<matplotlib.axes._subplots.AxesSubplot at 0x1a47f457f0>
jointplot()
¶Another way to make scatterplot is jointplot()
# Joint plot of HP against Speed using jointplot's default kind.
# jointplot returns a JointGrid and creates its own figure, so no plt.figure()
# call precedes it -- the earlier call only produced an empty, unused figure.
sns.jointplot(x='HP', y='Speed', data=data)
<seaborn.axisgrid.JointGrid at 0x1a48b015f8>
<matplotlib.figure.Figure at 0x1a48b011d0>
There are different varieties to the scatterplot
# Joint plot of HP against Speed rendered as a kernel density estimate.
# jointplot returns a JointGrid and creates its own figure, so no plt.figure()
# call precedes it -- the earlier call only produced an empty, unused figure.
sns.jointplot(x='HP', y='Speed', data=data, kind='kde')
<seaborn.axisgrid.JointGrid at 0x1a49132940>
<matplotlib.figure.Figure at 0x1a49132390>
In the above plot we can see 2 prominent regions of high density
# Joint plot of HP against Speed rendered with hexagonal bins.
# jointplot returns a JointGrid and creates its own figure, so no plt.figure()
# call precedes it -- the earlier call only produced an empty, unused figure.
sns.jointplot(x='HP', y='Speed', data=data, kind='hex')
<seaborn.axisgrid.JointGrid at 0x1a48f857f0>
<matplotlib.figure.Figure at 0x1a48f850f0>
pairplot()
To see the relationships between all pairwise combinations of variables, we can use pairplot
# Grid of pairwise plots over the frame's columns, with seaborn's defaults.
sns.pairplot(data)
<seaborn.axisgrid.PairGrid at 0x1a4915ea58>
We can also segment data based on another variable. In this case it is Legendary
# Same pairwise grid, with points coloured by Legendary status.
sns.pairplot(data, hue='Legendary')
<seaborn.axisgrid.PairGrid at 0x1a4407ab00>
Using the vars
option you can make pairplots for a selected group of variables. Also, on the diagonal you can see the KDE estimate of each distribution
# Pairwise grid restricted to five selected columns, coloured by Legendary,
# with a KDE on the diagonal panels.
sns.pairplot(
data,
hue='Legendary',
# vars limits the grid to these columns.
vars=['Speed', 'HP', 'Attack', 'Defense', 'Generation'],
diag_kind='kde')
<seaborn.axisgrid.PairGrid at 0x1a4ebaf6a0>
Clearly attack and defense seems to be linearly related and legendary pokemons have stronger attributes
The following series of plots is mostly used for analysing categorical columns
countplot()
# Number of pokemon per primary type, drawn as cyan bars.
# (A stray heading anchor character had been fused onto the first statement,
# which made the cell a syntax error; it is removed here.)
plt.figure(figsize=(20, 6))
ax = sns.countplot(x="Type 1", data=data, color='c')
<matplotlib.axes._subplots.AxesSubplot at 0x1a48b4a898>
The most common type is water
pokemons, and flying
pokemons are the rarest
# Count of pokemon per primary type, split by Legendary status.
plt.figure(figsize=(20, 6))
sns.countplot(
x="Type 1", data=data, hue='Legendary',
# NOTE(review): dodge=False draws both hue levels at the same x position, so
# the bars overlap from zero rather than being summed into a true stacked
# bar -- confirm this is the intended look.
dodge=False) #dodge = False option is used to make stacked plots
<matplotlib.axes._subplots.AxesSubplot at 0x1a66e5b390>
Psychic
and dragon
pokemons have the highest number of Legendary
pokemons
barplot()
Barplots are used to aggregate a continuous variable based on a categorical column
In this plot, the value is average Speed by Type of the pokemon and the black line indicates the confidence interval
# Aggregated Speed per primary type, drawn as cyan bars.
sns.set_style('darkgrid')
plt.figure(figsize=(20, 6))
sns.barplot(x="Type 1", y='Speed', data=data, color='c')
<matplotlib.axes._subplots.AxesSubplot at 0x1a51020390>
Obviously Flying
pokemons have the highest mean Speed
but also high variability
# Aggregated Speed per primary type, with separate bars per Legendary status.
sns.set_style('darkgrid')
plt.figure(figsize=(20, 6))
sns.barplot(x="Type 1", y='Speed', data=data, hue='Legendary')
<matplotlib.axes._subplots.AxesSubplot at 0x1a516e4978>
pointplot()
¶Another way to display the same information as the previous plot is the pointplot
# Aggregated Speed per Generation as a point plot, one series per Legendary
# status.
plt.figure(figsize=(20, 6))
sns.pointplot(x="Generation", y='Speed', data=data, hue='Legendary')
<matplotlib.axes._subplots.AxesSubplot at 0x1a67689d68>
The following series of plots is generally used to visualize the distribution of a continuous variable within each category of a categorical variable
stripplot()
# One point per pokemon: Speed within each Generation.
# (A stray heading anchor character had been fused onto the first statement,
# which made the cell a syntax error; it is removed here.)
plt.figure(figsize=(12, 6))
sns.stripplot(x="Generation", y="Speed", data=data)
<matplotlib.axes._subplots.AxesSubplot at 0x1a523df320>
# Same strip plot with an explicit jitter amount to spread the points sideways
# within each Generation.
plt.figure(figsize=(12, 6))
sns.stripplot(x="Generation", y="Speed", data=data, jitter=0.3)
<matplotlib.axes._subplots.AxesSubplot at 0x1a52914f98>
swarmplot()
Swarmplot goes one step further by displaying all the points so that there is no overlap at all
# Swarm plot of Speed per Generation, with points coloured by Legendary status.
sns.set_style('ticks')
plt.figure(figsize=(12, 6))
sns.swarmplot(x="Generation", y="Speed", data=data, hue='Legendary')
<matplotlib.axes._subplots.AxesSubplot at 0x1a531bf4e0>
boxplot()
¶Good old boxplot to visualize quantiles
and outliers
# Box plots of Speed per Generation, with one box per Legendary status.
plt.figure(figsize=(12, 6))
sns.boxplot(x="Generation", y="Speed", data=data, hue='Legendary')
<matplotlib.axes._subplots.AxesSubplot at 0x1a52d5f4e0>
violinplot()
¶One of the more interesting plots are the violinplots. Here you can see the quantiles as well as the distribution estimate using kde. This can be an effective and attractive way to show multiple distributions of data at once, but keep in mind that the estimation procedure is influenced by the sample size, and violins for relatively small samples might look misleadingly smooth.
# Violin plots of Speed per Generation, with one violin per Legendary status.
plt.figure(figsize=(12, 6))
sns.violinplot(x="Generation", y="Speed", data=data, hue='Legendary')
<matplotlib.axes._subplots.AxesSubplot at 0x1a53345c50>
Combining the violins
# split=True combines the two Legendary levels into a single violin per
# Generation.
plt.figure(figsize=(12, 6))
sns.violinplot(
x="Generation", y="Speed", data=data, hue='Legendary', split=True)
<matplotlib.axes._subplots.AxesSubplot at 0x1a534b8e48>
If you pass 'count'
as scale
, the width of the violin plot is based on the actual count in the bin
# Split violins whose widths are scaled by the number of observations.
plt.figure(figsize=(12, 6))
sns.violinplot(
x="Generation",
y="Speed",
data=data,
hue='Legendary',
split=True,
# scale='count': violin width reflects the observation count.
scale='count')
<matplotlib.axes._subplots.AxesSubplot at 0x1a67f23518>
You can also visualize the quartiles
# Split violins with the quartiles drawn inside each half.
plt.figure(figsize=(12, 6))
sns.violinplot(
x="Generation",
y="Speed",
data=data,
hue='Legendary',
split=True,
# inner='quartile': mark the quartiles inside the violin.
inner='quartile')
<matplotlib.axes._subplots.AxesSubplot at 0x1a531c0588>
# Split violins with every observation drawn inside the violin.
plt.figure(figsize=(12, 6))
sns.violinplot(
x="Generation",
y="Speed",
data=data,
hue='Legendary',
split=True,
inner='stick')  # show each data point
<matplotlib.axes._subplots.AxesSubplot at 0x1a55022780>
# Same stick-style split violins with an explicit KDE bandwidth setting.
plt.figure(figsize=(12, 6))
sns.violinplot(
x="Generation",
y="Speed",
data=data,
hue='Legendary',
split=True,
inner='stick',
# NOTE(review): bw=.2 presumably narrows the kernel bandwidth so the outline
# is less smoothed -- confirm against the installed seaborn version's docs.
bw=.2)
<matplotlib.axes._subplots.AxesSubplot at 0x1a54f9a7f0>
The following series of plots is used to show multiple plots in one grid
pairgrid()
¶Pairgrid is used for pairwise plots of y_vars and x_vars
# Grid with one column per x variable and one row per y variable
# (2 columns x 3 rows here).
g = sns.PairGrid(
data,
x_vars=["Generation", "Legendary"],
y_vars=["Speed", "HP", "Attack"],
aspect=.75,
size=8)
# Draw a violin plot in every cell of the grid.
g.map(sns.violinplot, palette="pastel")
Facetgrid()
¶Facetgrid is used to plot small multiples i.e separate plot for each category. Variables can be represented in cols
or rows
or hue
Only cols
# One panel per Generation, wrapped so that each row of the grid holds at most
# three panels.
g = sns.FacetGrid(data=data, col='Generation', col_wrap=3)
# Histogram of Speed in each panel.
g.map(plt.hist, "Speed")
<seaborn.axisgrid.FacetGrid at 0x1a5d2774e0>
cols
+ hue
# One panel per Generation (three per row), with Legendary status as colour.
g = sns.FacetGrid(data=data, col='Generation', col_wrap=3, hue="Legendary")
# Speed vs. HP scatter without a regression line, plus a legend for the hue.
g.map(sns.regplot, "Speed", "HP", fit_reg=False).add_legend()
<seaborn.axisgrid.FacetGrid at 0x1a68285940>
cols
+ rows
# Grid with Generation across the columns and Legendary status down the rows.
g = sns.FacetGrid(
data=data, col='Generation', row='Legendary', margin_titles=True)
# Speed vs. HP scatter without a regression line in every panel.
g.map(sns.regplot, "Speed", "HP", fit_reg=False)
<seaborn.axisgrid.FacetGrid at 0x1a5a577a58>
# One panel per Generation with explicit panel size and aspect ratio.
g = sns.FacetGrid(
data=data, col='Generation', margin_titles=True, size=4, aspect=.8)
# Distribution of Speed in each panel.
g.map(sns.distplot, "Speed")
<seaborn.axisgrid.FacetGrid at 0x1a5c2129e8>
# One panel per Generation, with Legendary status as the hue variable.
g = sns.FacetGrid(
data=data,
col='Generation',
margin_titles=True,
size=4,
aspect=.8,
hue='Legendary')
# Speed is passed positionally as the violin plot's first data argument; the
# plot function is called separately for each hue level within a panel.
g.map(sns.violinplot, "Speed")
<seaborn.axisgrid.FacetGrid at 0x1a578970b8>
# Keep only the rows whose primary type is Dragon.
dragon = data.loc[data['Type 1'].eq('Dragon')]
Mapping HP to the size of the bubble
# Pairwise grid of Attack and Defense for the Dragon-type subset.
g = sns.PairGrid(
dragon,
vars=["Attack", "Defense"],
size=5,
)
# Scatter in every cell; the marker size argument is 4 x HP for each row, so
# larger bubbles mean higher HP. alpha makes overlapping bubbles visible.
g.map(plt.scatter, s=4 * dragon.HP, alpha=.5)
<seaborn.axisgrid.PairGrid at 0x1a719468d0>
# Libraries
import seaborn as sns
import pandas as pd
from matplotlib import pyplot as plt
# Data set: fetched over the network, so this cell needs internet access.
url = 'https://python-graph-gallery.com/wp-content/uploads/mtcars.csv'
df = pd.read_csv(url)
# Use the car model name as the row label.
df = df.set_index('model')
# Display the frame.
df
# Prepare a vector of colours mapped to the 'cyl' column.
# zip() pairs each distinct cyl value with one colour and stops at the shorter
# input, so this assumes at most three distinct values.
# NOTE(review): any further cyl value would presumably map to NaN -- confirm.
my_palette = dict(zip(df.cyl.unique(), ["orange","yellow","brown"]))
row_colors = df.cyl.map(my_palette)
# Plot: clustered heatmap using correlation distance and single linkage, with
# the per-row cyl colours passed as row_colors.
sns.clustermap(df, metric="correlation", method="single", cmap="Blues", standard_scale=1, row_colors=row_colors)
<seaborn.matrix.ClusterGrid at 0x1a20758898>