import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# plotting function
def f(flip=1):
    """Plot seven phase-shifted sine curves with matplotlib's pyplot.

    Curve ``i`` (for ``i`` in 0..6) is ``sin(x + 0.5 * i) * (7 - i) * flip``,
    sampled at 100 evenly spaced values of ``x`` in [0, 14].

    Args:
        flip: Factor multiplied into every curve; a negative value mirrors
            the curves about the x-axis.
    """
    x = np.linspace(0, 14, 100)
    for i in range(7):
        # Each successive curve is shifted by 0.5 and has a smaller amplitude.
        plt.plot(x, np.sin(x + i * .5) * (7 - i) * flip)
f()
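There are two sets of functions you can use:
`axes_style()` and `set_style()` control the style of the plot;
`plotting_context()` and `set_context()` scale the plot elements.
The first function of each set returns a dict of parameters, and it can be used in a `with` statement to apply those parameters temporarily.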
# To use seaborn defaults
sns.set()
f()
There are five preset themes: darkgrid (the default), whitegrid, dark, white, and ticks.
# Activate each of these preset styles in turn and redraw the demo curves.
for style_name in ("whitegrid", "dark", "white", "ticks"):
    sns.set_style(style_name)
    f()
The white and ticks styles benefit from removing the top and right axes spines with `despine()`.
# despine() with its arguments spelled out: remove the top and right spines,
# keep the bottom and left ones, no offset, no trimming.
f()
sns.despine(top=True, right=True, bottom=False, left=False, offset=0, trim=False)
# The bare call relies on despine()'s default arguments instead.
f()
sns.despine()
# offset moves the spines away from the data and trim shortens them
f()
sns.despine(offset=10, trim=True)
# Draw the demo curves in a 2x2 grid, one style per panel.
# axes_style() is used as a context manager, so each style is only active
# while the subplot inside its `with` block is created and drawn.
fig = plt.figure(figsize=(6, 6))
gs = fig.add_gridspec(2, 2)
panel_styles = (
    ((0, 0), "darkgrid"),
    ((0, 1), "white"),
    ((1, 0), "ticks"),
    ((1, 1), "whitegrid"),
)
for (row, col), style_name in panel_styles:
    with sns.axes_style(style_name):
        ax = fig.add_subplot(gs[row, col])
        f()
fig.tight_layout()
sns.axes_style()
{'axes.facecolor': 'white', 'axes.edgecolor': '.15', 'axes.grid': False, 'axes.axisbelow': True, 'axes.labelcolor': '.15', 'figure.facecolor': 'white', 'grid.color': '.8', 'grid.linestyle': '-', 'text.color': '.15', 'xtick.color': '.15', 'ytick.color': '.15', 'xtick.direction': 'out', 'ytick.direction': 'out', 'lines.solid_capstyle': 'round', 'patch.edgecolor': 'w', 'image.cmap': 'rocket', 'font.family': ['sans-serif'], 'font.sans-serif': ['Arial', 'DejaVu Sans', 'Liberation Sans', 'Bitstream Vera Sans', 'sans-serif'], 'patch.force_edgecolor': True, 'xtick.bottom': True, 'xtick.top': False, 'ytick.left': True, 'ytick.right': False, 'axes.spines.left': True, 'axes.spines.bottom': True, 'axes.spines.right': True, 'axes.spines.top': True}
sns.plotting_context()
{'font.size': 12.0, 'axes.labelsize': 12.0, 'axes.titlesize': 12.0, 'xtick.labelsize': 11.0, 'ytick.labelsize': 11.0, 'legend.fontsize': 11.0, 'axes.linewidth': 1.25, 'grid.linewidth': 1.0, 'lines.linewidth': 1.5, 'lines.markersize': 6.0, 'patch.linewidth': 1.0, 'xtick.major.width': 1.25, 'ytick.major.width': 1.25, 'xtick.minor.width': 1.0, 'ytick.minor.width': 1.0, 'xtick.major.size': 6.0, 'ytick.major.size': 6.0, 'xtick.minor.size': 4.0, 'ytick.minor.size': 4.0}
# Reset to the seaborn defaults, then redraw the demo curves under each
# plotting context.
sns.set()
for context_name in ("paper", "notebook", "talk", "poster"):
    sns.set_context(context_name)
    f()
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()
sns.color_palette()
[(0.2980392156862745, 0.4470588235294118, 0.6901960784313725), (0.8666666666666667, 0.5176470588235295, 0.3215686274509804), (0.3333333333333333, 0.6588235294117647, 0.40784313725490196), (0.7686274509803922, 0.3058823529411765, 0.3215686274509804), (0.5058823529411764, 0.4470588235294118, 0.7019607843137254), (0.5764705882352941, 0.47058823529411764, 0.3764705882352941), (0.8549019607843137, 0.5450980392156862, 0.7647058823529411), (0.5490196078431373, 0.5490196078431373, 0.5490196078431373), (0.8, 0.7254901960784313, 0.4549019607843137), (0.39215686274509803, 0.7098039215686275, 0.803921568627451)]
# Grab the currently active color palette.
current_pallete = sns.color_palette()
# palplot() draws the colors of a palette as a row of swatches
sns.palplot(current_pallete)
Qualitative palettes are best when you have to distinguish discrete chunks of data that do not have an inherent ordering.
There are six variations of the default theme:
# Show the swatches for each of the six variations of the default palette.
for palette_name in ("deep", "muted", "pastel", "bright", "dark", "colorblind"):
    sns.palplot(sns.color_palette(palette_name))
When you have an arbitrary number of categories to distinguish without emphasizing any one, the easiest approach is to draw evenly-spaced colors in a circular color space (one where the hue changes while keeping the brightness and saturation constant).
sns.palplot(sns.color_palette("hls", 8))
This kind of color mapping is appropriate when data range from relatively low or uninteresting values to relatively high or interesting values. The `jet` colormap is commonly used here, but it is not recommended: it can introduce misleading information because the brightest colors are used for intermediate data values. This can emphasize uninteresting (and arbitrary) values while de-emphasizing the extremes.
To use these features, the data should be in pandas dataframe and in tidy format (each column is a variable and each row is an observation).
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style='ticks')
The `FacetGrid` class is useful when you want to visualize the distribution of a variable or the relationship between multiple variables separately within subsets of your dataset. It can be drawn with up to three dimensions: `row`, `col`, and `hue`.
tips = sns.load_dataset('tips')
tips.head()
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
# Constructing a FacetGrid only initializes the grid; nothing is drawn yet.
# `g` is rebound on every line, so only the last grid is used by map() below.
# One column of facets per value of 'time'.
g = sns.FacetGrid(tips, col='time')
# One row of facets per value of 'time'.
g = sns.FacetGrid(tips, row='time')
# 'time' mapped to hue instead of to a row or column of facets.
g = sns.FacetGrid(tips, hue='time')
# Back to the column layout, then draw a histogram of 'tip' in each facet.
g = sns.FacetGrid(tips, col='time')
g.map(plt.hist, 'tip')
<seaborn.axisgrid.FacetGrid at 0x7f3249e79040>
g = sns.FacetGrid(tips, col='sex', hue='smoker')
g.map(plt.scatter, "total_bill", "tip", alpha=.7)
g.add_legend()
<seaborn.axisgrid.FacetGrid at 0x7f324bce2310>
In FacetGrid we pass in the variables we want to facet on. In this case we selected 'sex', which means it will look up the unique values of that column in the dataframe and plot a figure for every unique value. You can also pass 'hue', which specifies the variable to distinguish by color inside each figure. In the map function you pass the plotting function followed by the column names and keyword arguments that the plotting function takes.
g = sns.FacetGrid(tips, row='smoker', col='time', margin_titles=True)
g.map(sns.regplot, 'size', 'total_bill', color='.3', fit_reg=False, x_jitter=.1)
<seaborn.axisgrid.FacetGrid at 0x7f324a345760>
g = sns.FacetGrid(tips, row='smoker', col='time', margin_titles=True, height=4)
g.map(sns.regplot, 'size', 'total_bill', color='.3', fit_reg=False, x_jitter=.1)
<seaborn.axisgrid.FacetGrid at 0x7f324a30e3d0>
g = sns.FacetGrid(tips, row='day', row_order=['Thur', 'Fri', 'Sat', 'Sun'], height=1.7, aspect=4,)
g.map(sns.distplot, 'total_bill', hist=False, rug=True)
<seaborn.axisgrid.FacetGrid at 0x7f32483e1dc0>
# To vary plot attributes across the hue levels, pass a dictionary to `hue_kws`
g = sns.FacetGrid(tips, hue='sex', height=5, hue_kws={"marker": ["^", "v"]})
g.map(plt.scatter, 'total_bill', 'tip', s=100, linewidth=.5, edgecolor='white')
g.add_legend();
# If you have many levels of one variable, you can plot it along the columns but 'wrap' them so that they span multiple rows.
# 'row' cannot be used in this case.
attend = sns.load_dataset('attention').query('subject <= 12')
g = sns.FacetGrid(attend, col='subject', col_wrap=4, height=2, ylim=(0,10))
g.map(sns.pointplot, 'solutions', 'score', order=[1, 2, 3], color='.3', ci=None)
<seaborn.axisgrid.FacetGrid at 0x7f3248379c40>
# Use FacetGrid.set() and the other grid-level methods to tweak all facets at once
g = sns.FacetGrid(tips, row='sex', col='smoker', margin_titles=True, height=2.5)
g.map(plt.scatter, 'total_bill', 'tip', color='#334488', edgecolor='white', lw=.5)
# Axis labels for the grid (x label first, then y label).
g.set_axis_labels('Total bill (US Dollars)', 'Tip')
# Same tick positions on every facet.
g.set(xticks=[10, 40, 50], yticks=[2, 6, 10])
# Shrink the horizontal and vertical gaps between the facets.
g.fig.subplots_adjust(wspace=.02, hspace=.02)
# The underlying matplotlib figure and axes are available as `g.fig` and `g.axes`
g = sns.FacetGrid(tips, col='smoker', margin_titles=True, height=4)
g.map(plt.scatter, 'total_bill', 'tip', color='#334488', edgecolor='white', lw=.5)
# Draw the dashed reference line tip = 0.2 * total_bill on every facet.
for ax in g.axes.flat:
    ax.plot((0, 50), (0, .2 * 50), c=".2", ls="--")
# xlim must be a (low, high) pair; `(0.60)` was just the float 0.6.
g.set(xlim=(0, 60), ylim=(0, 14))
<seaborn.axisgrid.FacetGrid at 0x7f3241779460>
`PairGrid` is used to visualize the relationship between all pairs of variables, with the scatterplot being the most common choice.
iris = sns.load_dataset('iris')
g = sns.PairGrid(iris)
g.map(plt.scatter)
<seaborn.axisgrid.PairGrid at 0x7f3248379040>
# It is possible to plot a different function on the diagonal to show the univariate distribution of the variable in each column
# Note that the axis ticks will not correspond to the density or count.
g = sns.PairGrid(iris)
g.map_diag(plt.hist)
g.map_offdiag(plt.scatter)
<seaborn.axisgrid.PairGrid at 0x7f32410a8dc0>
# Now further we can color all the categorical variables with different colors
g = sns.PairGrid(iris, hue='species', height=3)
g.map_diag(plt.hist)
g.map_offdiag(plt.scatter)
g.add_legend()
<seaborn.axisgrid.PairGrid at 0x7f32407c5400>
# If you want to focus on particular variables only
g = sns.PairGrid(iris, vars=['sepal_length', 'sepal_width'], hue='species')
g.map(plt.scatter);
# It is also possible to use different functions in the upper and lower triangles
g = sns.PairGrid(iris, height=3)
g.map_diag(sns.kdeplot, lw=3, legend=False)
g.map_upper(plt.scatter)
g.map_lower(sns.kdeplot)
<seaborn.axisgrid.PairGrid at 0x7f323f4a8d30>
# To plot relationship between given x and y
g = sns.PairGrid(tips, y_vars=['tip'], x_vars=["total_bill", "size"], height=4)
g.map(sns.regplot, color=".3")
g.set(ylim=(-1, 11), yticks=[0,5,10])
<seaborn.axisgrid.PairGrid at 0x7f323ec860d0>
# To quickly look at a dataset, a PairGrid with univariate distributions on the diagonal is a good choice.
# To make the process easier we have a shortcut function
sns.pairplot(iris, hue='species', height=3)
<seaborn.axisgrid.PairGrid at 0x7f323ec640a0>
g = sns.PairGrid(iris, hue='species', height=3)
g.map_diag(plt.hist)
g.map_offdiag(plt.scatter)
g.add_legend()
<seaborn.axisgrid.PairGrid at 0x7f323c8d6850>
Statistical analysis is a process of understanding how variables in a dataset relate to each other and how these relationships depend on other variables.
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
sns.set(style='darkgrid')
Use the `scatterplot` function when both variables are numeric. Also, it is the default kind in `relplot`.
tips = sns.load_dataset('tips')
sns.relplot(x='total_bill', y='tip', data=tips)
<seaborn.axisgrid.FacetGrid at 0x7f322fa42b80>
sns.scatterplot(x='total_bill', y='tip', data=tips)
<matplotlib.axes._subplots.AxesSubplot at 0x7f322f93b4c0>
sns.relplot(x='total_bill', y='tip', hue='smoker', data=tips)
<seaborn.axisgrid.FacetGrid at 0x7f322f197220>
# To use different marker style for each class
sns.relplot(x='total_bill', y='tip', hue='smoker', style='smoker', data=tips)
<seaborn.axisgrid.FacetGrid at 0x7f322f545be0>
Use a line plot when you want to understand changes in one variable as a function of time, or of a similarly continuous variable. The `lineplot` function can be used, or `relplot` with `kind='line'`.
# `time` is the integer range 0..499 and `value` is the cumulative sum of
# 500 standard-normal draws.
df = pd.DataFrame(dict(time=np.arange(500), value=np.random.randn(500).cumsum()))
# Line plot of value against time.
g = sns.relplot(x='time', y='value', kind='line', data=df)
# The same plot again, this time followed by autofmt_xdate() on its figure.
g = sns.relplot(x='time', y='value', kind='line', data=df)
# NOTE(review): autofmt_xdate() is meant for date tick labels, but `time` is integer here; confirm intent.
g.fig.autofmt_xdate()
`lineplot` by default assumes you are trying to draw y as a function of x, and for this reason it sorts the data by the x values before plotting.
# To disable sorting
df = pd.DataFrame(np.random.randn(500, 2).cumsum(axis=0), columns=["x", "y"])
sns.relplot(data=df, x='x', y='y', sort=False, kind='line')
<seaborn.axisgrid.FacetGrid at 0x7f322f02fe80>
For more complex datasets there will be multiple measurements for the same value of the `x` variable. The default behaviour is to aggregate the multiple measurements at each `x` value by plotting the mean and the 95% confidence interval around the mean.
fmri = sns.load_dataset('fmri')
sns.relplot(x='timepoint', y='signal', kind='line', data=fmri)
<seaborn.axisgrid.FacetGrid at 0x7f322eff9b20>
# The confidence intervals are computed using bootstrapping, which can be time-intensive
# To disable them
sns.relplot(x='timepoint', y='signal', kind='line', data=fmri, ci=None)
<seaborn.axisgrid.FacetGrid at 0x7f322f0fa520>
# The standard deviation can also be plotted instead of confidence intervals
sns.relplot(x='timepoint', y='signal', kind='line', data=fmri, ci='sd')
<seaborn.axisgrid.FacetGrid at 0x7f322efeb2b0>
# To disable aggregation
sns.relplot(x='timepoint', y='signal', kind='line', data=fmri, estimator=None)
<seaborn.axisgrid.FacetGrid at 0x7f322eef8c10>
sns.relplot(x='timepoint', y='signal', hue='event', style='event', kind='line', data=fmri, )
<seaborn.axisgrid.FacetGrid at 0x7f322ee5f160>
sns.relplot(x='timepoint', y='signal', hue='region', style='event', kind='line', data=fmri)
<seaborn.axisgrid.FacetGrid at 0x7f322ee9edf0>
If you are working with repeated measurements, i.e. you have units that were sampled multiple times, you can also plot each sampling unit separately without distinguishing them through semantics.
sns.relplot(x='timepoint', y='signal', hue='region', units='subject', estimator=None, kind='line', data=fmri.query("event == 'stim'"))
<seaborn.axisgrid.FacetGrid at 0x7f322ed5ee20>
You can also use the FacetGrid semantics in `relplot`, because `relplot` is based on FacetGrid.
`catplot()` is the higher-level, unified access to all of these functions.
Categorical scatterplots:
Categorical distribution plots:
Categorical estimate plots
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style='ticks', color_codes=True)
# stripplot() - adjusts the positions of points on the categorical axis with a small amount of random jitter (along the x-axis here)
tips = sns.load_dataset('tips')
sns.catplot(x='day', y='total_bill', data=tips)
<seaborn.axisgrid.FacetGrid at 0x7f322ee5aa30>
sns.catplot(x='day', y='total_bill', jitter=False, data=tips)
<seaborn.axisgrid.FacetGrid at 0x7f322edc39d0>
sns.catplot(x='day', y='total_bill', kind='swarm', data=tips)
<seaborn.axisgrid.FacetGrid at 0x7f322ec503d0>
sns.catplot(x='day', y='total_bill', hue='sex', kind='swarm', data=tips)
<seaborn.axisgrid.FacetGrid at 0x7f322eb97940>
sns.catplot(x='day', y='total_bill', kind='swarm', data=tips, order=['Fri', 'Sat', 'Sun', 'Thur'])
<seaborn.axisgrid.FacetGrid at 0x7f322ebd5550>
# It is often helpful to put the categorical variable on the y-axis (e.g. when the category names are long or there are many categories)
sns.catplot(x='total_bill', y='day', kind='swarm', data=tips)
<seaborn.axisgrid.FacetGrid at 0x7f322ead4c10>
# Boxplots :- whiskers extend to points that lie within 1.5 IQRs of the lower and upper quartile
sns.catplot(x='day', y='total_bill', kind='box', data=tips)
<seaborn.axisgrid.FacetGrid at 0x7f322eaaa1f0>
sns.catplot(x='day', y='total_bill', hue='sex', kind='box', data=tips)
<seaborn.axisgrid.FacetGrid at 0x7f322eaa6160>
# The above behavior is called dodging and is turned on by default
sns.catplot(x='day', y='total_bill', hue='sex', kind='box', data=tips, dodge=False)
<seaborn.axisgrid.FacetGrid at 0x7f322eadbca0>
# boxenplot :- similar to boxplot but shows more information about the shape of the distribution, i.e. use it for large datasets
diamonds = sns.load_dataset('diamonds')
sns.catplot(x='color', y='price', kind='boxen', data=diamonds.sort_values('color'))
<seaborn.axisgrid.FacetGrid at 0x7f322ea64e20>
# violinplot :- combines a boxplot with kernel density estimation
sns.catplot(x='total_bill', y='day', hue='sex', kind='violin', data=tips)
<seaborn.axisgrid.FacetGrid at 0x7f322ed024c0>
# You can also split the violin into 2 parts when hue is 2 levels
sns.catplot(x='total_bill', y='day', hue='sex', kind='violin', data=tips, split=True)
<seaborn.axisgrid.FacetGrid at 0x7f322e31ecd0>
# Show each individual observation instead of the summary boxplot values
sns.catplot(x='total_bill', y='day', hue='sex', kind='violin', data=tips, split=True, palette='pastel', inner='stick')
<seaborn.axisgrid.FacetGrid at 0x7f322e299760>
# Combine swarmplot or stripplot with the violin by drawing it inside the violin
g = sns.catplot(x='total_bill', y='day', kind='violin', inner=None, data=tips)
sns.swarmplot(x='total_bill', y='day', color='k', size=3, data=tips, ax=g.ax)
<matplotlib.axes._subplots.AxesSubplot at 0x7f322e2967f0>
g = sns.catplot(x='total_bill', y='day', kind='violin', inner=None, data=tips)
sns.swarmplot(x='total_bill', y='day', color='k', size=7, data=tips, ax=g.ax)
<matplotlib.axes._subplots.AxesSubplot at 0x7f322dee3550>
# barplot :- apply a function to all the variables (mean by default) or you can show the count of variables
titanic = sns.load_dataset('titanic')
sns.catplot(x='sex', y='survived', hue='class', kind='bar', data=titanic)
<seaborn.axisgrid.FacetGrid at 0x7f322dece580>
sns.catplot(x='deck', kind='count', data=titanic)
<seaborn.axisgrid.FacetGrid at 0x7f322df26eb0>
# pointplot : same as barplot, but instead of showing the bars only a line is drawn between the centers of the different categories
sns.catplot(x='sex', y='survived', hue='class', kind='point', data=titanic)
<seaborn.axisgrid.FacetGrid at 0x7f322dda8310>
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
sns.set(color_codes=True)
# distplot is the most basic. It will draw a histogram and fit a kernel density estimate
x = np.random.normal(size=100)
sns.distplot(x);
# add rugplot i.e. ticks for each observation in data
sns.distplot(x, bins=20, rug=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7f322c0928b0>
sns.distplot(x, hist=False, kde=True, rug=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7f322c080fa0>
How is the KDE computed? First a normal (Gaussian) curve is centered at each value in the data, then all of these curves are summed, and finally the result is normalized so that the area under it equals 1.
# You can also fit a function to see how good it fits to the given data
x = np.random.gamma(6, size=200)
sns.distplot(x, kde=False, fit=stats.gamma)
<matplotlib.axes._subplots.AxesSubplot at 0x7f322bdaec70>
# jointplot : creates a multi-panel figure that shows both the bivariate relationship between two variables
# along with the univariate distribution of each on separate axes.
# Parameters of a 2-D normal: means (0, 1), unit variances, covariance 0.5.
mean, cov = [0, 1], [(1, .5), (.5, 1)]
# 200 draws, one row per observation, wrapped in a DataFrame with columns x and y.
data = np.random.multivariate_normal(mean, cov, 200)
df = pd.DataFrame(data, columns=["x", "y"])
# scatterplot is the default
sns.jointplot(x="x", y="y", data=df)
<seaborn.axisgrid.JointGrid at 0x7f322cfd5730>
# hexbin: bivariate analog of a histogram
# 1000 draws from the same distribution, transposed into separate x and y arrays.
x, y = np.random.multivariate_normal(mean, cov, 1000).T
# Apply the "white" style only while this figure is being created.
with sns.axes_style("white"):
    sns.jointplot(x=x, y=y, kind="hex", color="k")
# kde for bivariate distribution
sns.jointplot(x='x', y='y', data=df, kind='kde')
<seaborn.axisgrid.JointGrid at 0x7f322bb6ab50>
# NOTE(review): `f` is rebound to a Figure here, shadowing the plotting helper `f` defined at the top of the file; confirm no later `f()` call relies on it.
f, ax = plt.subplots(figsize=(6, 6))
# Bivariate KDE of x and y, drawn on the axes created above.
sns.kdeplot(df.x, df.y, ax=ax)
# Rug ticks for x, and for y with vertical=True, on the same axes.
sns.rugplot(df.x, color="g", ax=ax)
sns.rugplot(df.y, vertical=True, ax=ax);
# use n_levels to increase the continuity
f, ax = plt.subplots(figsize=(6, 6))
# Cubehelix palette returned as a matplotlib colormap.
cmap = sns.cubehelix_palette(as_cmap=True, dark=0, light=1, reverse=True)
# No ax= is passed here, so this relies on the figure just created being the current one.
sns.kdeplot(df.x, df.y, cmap=cmap, n_levels=60, shade=True);
# KDE joint plot with the raw observations overlaid as white "+" markers.
g = sns.jointplot(x="x", y="y", data=df, kind="kde", color="m")
g.plot_joint(plt.scatter, c="w", s=30, linewidth=1, marker="+")
# Make the first collection on the joint axes fully transparent.
g.ax_joint.collections[0].set_alpha(0)
g.set_axis_labels("$X$", "$Y$");
# regplot and lmplot are called with the same x, y and data here.
sns.regplot(x="total_bill", y="tip", data=tips);
sns.lmplot(x="total_bill", y="tip", data=tips);
# Same kind of fit, but with `size` on the x-axis.
sns.lmplot(x="size", y="tip", data=tips);
# x_jitter adds a little random noise to the x values of the plotted points.
sns.lmplot(x="size", y="tip", data=tips, x_jitter=.05);
# In the above case a scatter with a linear fit is not a good representation of the data. So instead
# we can plot the linear relationship between the central tendency of the categories
sns.lmplot(x="size", y="tip", data=tips, x_estimator=np.mean);
# Each plot below uses one subset of the anscombe data, selected by its `dataset` label.
anscombe = sns.load_dataset("anscombe")
# ci=None turns the confidence band off; scatter_kws={"s": 80} enlarges the markers.
sns.lmplot(x="x", y="y", data=anscombe.query("dataset == 'I'"), ci=None, scatter_kws={"s": 80});
sns.lmplot(x="x", y="y", data=anscombe.query("dataset == 'II'"), ci=None, scatter_kws={"s": 80});
# A straight line is not always a good model for the data (see dataset 'II' above)
# Use order to fit a higher degree polynomial
sns.lmplot(x="x", y="y", data=anscombe.query("dataset == 'II'"), order=2, ci=None, scatter_kws={"s": 80});
# The next problem is the outlier problem
sns.lmplot(x="x", y="y", data=anscombe.query("dataset == 'III'"), ci=None, scatter_kws={"s": 80});
# Add robust=True, which uses a different loss function to downweight relatively large residuals
sns.lmplot(x="x", y="y", data=anscombe.query("dataset == 'III'"), robust=True, ci=None, scatter_kws={"s": 80});
# Boolean column: True where the tip is more than 15% of the total bill.
tips["big_tip"] = (tips.tip / tips.total_bill) > .15
# Linear fit against the binary column; y_jitter adds a little noise to the plotted y values.
sns.lmplot(x="total_bill", y="big_tip", data=tips,
y_jitter=.03);
# Use logistic instead of linear when y is binary
sns.lmplot(x="total_bill", y="big_tip", data=tips, logistic=True, y_jitter=.03);
# Use ci=None to turn the bootstrapping off, i.e. to not compute the confidence intervals
# Fit a nonparametric regression using a lowess smoother. This approach has the fewest assumptions, but is computationally very intensive
sns.lmplot(x="total_bill", y="tip", data=tips,
lowess=True);
# To check whether simple regression techniques can be used, use residplot. If there is some structure in the
# residuals, it means that simple regression is not appropriate.
# In a good fit, like in this case, the residuals should be randomly distributed around y=0
sns.residplot(x="x", y="y", data=anscombe.query("dataset == 'I'"),
scatter_kws={"s": 80});
# Same residual plot for dataset 'II'.
sns.residplot(x="x", y="y", data=anscombe.query("dataset == 'II'"),
scatter_kws={"s": 80});
# The difference between regplot and lmplot is that lmplot combines regplot with FacetGrid
# Here hue splits the data by `smoker`, with one marker per level.
sns.lmplot(x="total_bill", y="tip", hue="smoker", data=tips,
markers=["o", "x"], palette="Set1");