seaborn
, pandas
and pylab
¶seaborn
when loaded, tries to override certain function calls of pylab (or matplotlib).
seaborn module
seaborn works well with pandas.DataFrame.
%pylab inline --no-import
import pylab as pl
import seaborn as sns
import numpy as np
from scipy import stats
import pandas as pd
from IPython.display import HTML, display
from sklearn.preprocessing import label_binarize, LabelEncoder
Populating the interactive namespace from numpy and matplotlib
# Apply the seaborn notebook context and white-grid style to all later figures.
sns.set(context='notebook', style='whitegrid')

# Two phase-shifted sine curves sampled on a common grid; the second one is
# scaled to three times the amplitude.
x = np.linspace(0, 14, 100)
y1 = np.sin(x + .5)
y2 = np.sin(x + 4*.5) * 3

# Optional explicit colors: c1, c2 = sns.color_palette("deep", n_colors=2)
# Draw each curve followed by a translucent band of +/- 0.5 around it.
for curve in (y1, y2):
    pl.plot(x, curve)
    pl.fill_between(x, curve - .5, curve + .5, alpha=.2)
<matplotlib.collections.PolyCollection at 0x10b49f510>
# Load the weather table from CSV; the 'Date' column is parsed as dates.
weather = pd.read_csv('../data/weather.csv', parse_dates=['Date'])
# Show the column names (displayed as the cell result).
weather.columns
Index([u'Date', u'Location', u'MinTemp', u'MaxTemp', u'Rainfall', u'Evaporation', u'Sunshine', u'WindGustDir', u'WindGustSpeed', u'WindDir9am', u'WindDir3pm', u'WindSpeed9am', u'WindSpeed3pm', u'Humidity9am', u'Humidity3pm', u'Pressure9am', u'Pressure3pm', u'Cloud9am', u'Cloud3pm', u'Temp9am', u'Temp3pm', u'RainToday', u'RISK_MM', u'RainTomorrow'], dtype=object)
boxplot
- numeric values¶sns.boxplot(weather.loc[:, ['MinTemp', 'MaxTemp', 'Humidity9am', 'Humidity3pm']])
<matplotlib.axes.AxesSubplot at 0x10b58b790>
# Box plot of MinTemp, grouped by the RainTomorrow column via `groupby`.
sns.boxplot(weather.loc[:, 'MinTemp'], groupby=weather.RainTomorrow)
# Title the axes with the relationship being shown.
pl.title('MinTemp ~ RainTomorrow')
<matplotlib.text.Text at 0x10c871550>
regplot
- regression plot for numeric values¶If you feel that it intuitively doesn't look right, recall that it is an RMSE fit rather than PCA (which passes right through the cluster of data)
# Regression plot from two explicit Series; the axis labels are passed by hand.
sns.regplot(x = weather.Pressure9am, y = weather.Temp9am,
xlabel='Pressure9am', ylabel = 'Temp9am',)
## A more data frame friendly way
# Columns are referenced by name and looked up in `data`. corr_func is set to
# scipy.stats.kendalltau -- presumably the statistic used for the plot's
# correlation annotation; confirm against the seaborn version in use.
sns.regplot(x = 'Pressure3pm', y = 'Temp3pm',
data = weather, corr_func=stats.kendalltau,)
lmplot
¶lmplot
is exactly the same as regplot
lmplot
allows *CONDITIONAL* regplot
- by using the color
parameter & col
parameter (facet),lmplot
includes many more features, such as (1) higher-order polynomial fits, (2) logistic regression, (3) non-numeric x values (which must be encoded as discrete numbers rather than strings), and (4) partial regression.
lmplot works only with pd.DataFrame, instead of general x, y as in regplot
# Second-order (order = 2) polynomial fit, conditioned on RainToday through
# the color parameter.
sns.lmplot(x = 'Pressure3pm', y = 'Temp3pm', color = 'RainToday', order = 2,
data = weather, )
# Facet by RainToday through the col parameter; fit_reg=False presumably
# leaves only the scatter without a fitted line -- confirm.
sns.lmplot(x = 'Pressure3pm', y = 'Temp3pm', col = 'RainToday', fit_reg=False,
data = weather, )
# Facet by RainToday (col) and additionally condition on RainTomorrow (color).
sns.lmplot(x = 'Pressure3pm', y = 'Temp3pm', col = 'RainToday', color = 'RainTomorrow',
data = weather, )
# Two-way facet: RainToday on col, RainTomorrow on row.
sns.lmplot(x = 'Pressure3pm', y = 'Temp3pm', col = 'RainToday', row = 'RainTomorrow',
data = weather, )
# Work on a copy so the added helper column does not touch `weather`.
df = weather.copy()
# Turn the RainToday labels into integer codes so they can serve as x.
df['RainTodayEncoded'] = LabelEncoder().fit_transform(df.RainToday)
sns.lmplot(x = 'RainTodayEncoded', y = 'Temp3pm', color = 'RainTomorrow',
data = df, )
# Fresh copy for the logistic example (drops the RainTodayEncoded column).
df = weather.copy()
# Binarize RainTomorrow into a single flat indicator column for the logistic fit.
# NOTE(review): classes=['Yes', 'No'] fixes the class order; confirm which of
# the two labels ends up coded as 1 before interpreting the fitted curve.
df['RainTomorrowBinary'] = label_binarize(weather.RainTomorrow, classes=['Yes', 'No']).ravel()
sns.lmplot(x = 'Pressure3pm', y = 'RainTomorrowBinary',
logistic=True,
data = df, )
## partial regression of Temp9am, Temp3pm, conditioning on MaxTemp
# Plain regression first, for comparison with the partial one below.
sns.lmplot('Temp9am', 'Temp3pm', weather)
sns.lmplot('Temp9am', 'Temp3pm', weather, x_partial='MaxTemp')
corrplot
- auto filter out non-numeric values¶pl.figure(figsize=(25, 25))
sns.corrplot(weather)
<matplotlib.axes.AxesSubplot at 0x10c5e0c50>
coefplot
- multivariate fit and plot by Patsy
formula¶sns.coefplot('Temp3pm ~ RainToday + Pressure3pm + RainToday * Pressure3pm + MinTemp + MaxTemp + MinTemp * MaxTemp',
data = weather, intercept=True)
# Sample skewness of the MinTemp and MaxTemp columns (one value per column).
print stats.skew(weather.loc[:, ['MinTemp', 'MaxTemp']])
[-0.00379527 0.34893973]
# Overlay kernel density estimates of the two temperature columns, each
# labelled with its own column name, then add the title and legend.
for column in ('MinTemp', 'MaxTemp'):
    sns.kdeplot(weather.loc[:, [column]], label=column)
pl.title('kde')
pl.legend(loc='best')
<matplotlib.legend.Legend at 0x111551c90>
## estimate lambda for Box-Cox transform
# Scan candidate exponents and print, for each one, the skewness of the
# Box-Cox transformed MaxTemp column. The exponent whose skewness is closest
# to zero is the one to pick.
max_temp = np.array(weather.loc[:, ['MaxTemp']])  # extract the column once
# NOTE: the 50 evenly spaced points on [-1, 1] never land on lamb == 0, so the
# division below is safe; lamb == 0 would need the log(x) limit instead.
for lamb in np.linspace(-1, 1, 50):
    # Vectorized Box-Cox transform (x**lamb - 1) / lamb over the whole column.
    # The single formatted string prints "lamb skew" exactly like the
    # two-argument print statement, and is also valid under Python 3.
    print('%s %s' % (lamb, stats.skew((max_temp**lamb - 1.) / lamb)))
-1.0 [-0.93058773] -0.959183673469 [-0.8990553] -0.918367346939 [-0.86783628] -0.877551020408 [-0.8369237] -0.836734693878 [-0.80631068] -0.795918367347 [-0.77599043] -0.755102040816 [-0.74595628] -0.714285714286 [-0.71620168] -0.673469387755 [-0.68672017] -0.632653061224 [-0.65750544] -0.591836734694 [-0.62855128] -0.551020408163 [-0.59985161] -0.510204081633 [-0.5714005] -0.469387755102 [-0.54319212] -0.428571428571 [-0.5152208] -0.387755102041 [-0.48748098] -0.34693877551 [-0.45996726] -0.30612244898 [-0.43267435] -0.265306122449 [-0.40559713] -0.224489795918 [-0.37873057] -0.183673469388 [-0.35206982] -0.142857142857 [-0.32561013] -0.102040816327 [-0.29934692] -0.0612244897959 [-0.2732757] -0.0204081632653 [-0.24739216] 0.0204081632653 [-0.22169207] 0.0612244897959 [-0.19617138] 0.102040816327 [-0.17082611] 0.142857142857 [-0.14565245] 0.183673469388 [-0.1206467] 0.224489795918 [-0.09580527] 0.265306122449 [-0.07112469] 0.30612244898 [-0.04660161] 0.34693877551 [-0.02223279] 0.387755102041 [ 0.0019849] 0.428571428571 [ 0.02605448] 0.469387755102 [ 0.04997887] 0.510204081633 [ 0.07376089] 0.551020408163 [ 0.09740326] 0.591836734694 [ 0.1209086] 0.632653061224 [ 0.14427945] 0.673469387755 [ 0.16751824] 0.714285714286 [ 0.19062732] 0.755102040816 [ 0.21360898] 0.795918367347 [ 0.23646538] 0.836734693878 [ 0.25919863] 0.877551020408 [ 0.28181078] 0.918367346939 [ 0.30430377] 0.959183673469 [ 0.32667948] 1.0 [ 0.34893973]
# Box-Cox transform of MaxTemp with exponent 0.3, computed once (vectorized)
# and reused for both the density plot and the skewness check.
# NOTE(review): the label says log(MaxTemp), but an exponent of 0.3 is not the
# log transform (that is the lamb -> 0 limit); the label text is kept as-is.
boxcox_max_temp = (np.array(weather.loc[:, ['MaxTemp']])**.3 - 1.) / .3
sns.kdeplot(boxcox_max_temp, label = '$log(MaxTemp)$')
print(stats.skew(boxcox_max_temp))
[-0.05027016]
tsplot
for timeplot¶n_pts = 29
# Synthetic repeated-measures data: every subject observes the same sine
# signal, shifted by a per-subject offset and perturbed by per-sample noise
# whose scale differs from subject to subject.
x = np.linspace(0, 14, n_pts)
true_data = np.sin(x)
np.random.seed(9221999)  # fixed seed so the figure is reproducible
n_subjs = 20
# One noise scale per subject. Sized from n_subjs (rather than a repeated
# literal) so the two cannot drift apart when the subject count is changed.
subj_noise = np.random.rand(n_subjs) * 2
# The draw order inside each row (scalar offset first, then the n_pts noise
# vector) is kept so the seeded random stream yields the same data.
subj_data = np.array([
    true_data                                  # real signal
    + np.random.randn()                        # subject specific offset from real signal
    + np.random.randn(n_pts) * subj_noise[s]   # sample specific error with subject specific variance
    for s in range(n_subjs)
])
# subj_data has one row per subject and one column per time point.
sns.tsplot(x, subj_data, err_style="ci_bars")
<matplotlib.axes.AxesSubplot at 0x10fbaf1d0>
df = weather.copy()
# DataFrame.sort returns a new, sorted frame rather than sorting in place, so
# the result must be bound; otherwise df silently keeps its original order.
# (DataFrame.sort is the legacy spelling; newer pandas names it sort_values.)
df = df.sort('Date')
# tsplot takes `data` as a 2-D array of observations x time points. A bare 1-D
# Series collapses to a scalar under the estimator and makes ax.plot raise
# "x and y must have same first dimension". Present the series as a single
# observation row instead.
sns.tsplot(df.Date, df.Cloud3pm.values[np.newaxis, :], interpolate=False, )
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-145-7813858ce637> in <module>() 3 print df.Date 4 print df.Cloud3pm ----> 5 sns.tsplot(df.Date, df.Cloud3pm, interpolate=False, ) /Library/Python/2.7/site-packages/seaborn/plotobjs.pyc in tsplot(x, data, err_style, ci, interpolate, estimator, n_boot, smooth, err_palette, ax, err_kws, **kwargs) 78 79 # Plot the timeseries line to get its color ---> 80 line, = ax.plot(x, central_data, **kwargs) 81 color = line.get_color() 82 line.remove() /opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/matplotlib/axes.pyc in plot(self, *args, **kwargs) 3994 lines = [] 3995 -> 3996 for line in self._get_lines(*args, **kwargs): 3997 self.add_line(line) 3998 lines.append(line) /opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/matplotlib/axes.pyc in _grab_next_args(self, *args, **kwargs) 328 return 329 if len(remaining) <= 3: --> 330 for seg in self._plot_args(remaining, kwargs): 331 yield seg 332 return /opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/matplotlib/axes.pyc in _plot_args(self, tup, kwargs) 306 x = np.arange(y.shape[0], dtype=float) 307 --> 308 x, y = self._xy_from_xy(x, y) 309 310 if self.command == 'plot': /opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/matplotlib/axes.pyc in _xy_from_xy(self, x, y) 246 y = np.atleast_1d(y) 247 if x.shape[0] != y.shape[0]: --> 248 raise ValueError("x and y must have same first dimension") 249 if x.ndim > 2 or y.ndim > 2: 250 raise ValueError("x and y can be no greater than 2-D") ValueError: x and y must have same first dimension
0 2007-11-01 00:00:00 1 2007-11-02 00:00:00 2 2007-11-03 00:00:00 3 2007-11-04 00:00:00 4 2007-11-05 00:00:00 5 2007-11-06 00:00:00 6 2007-11-07 00:00:00 7 2007-11-08 00:00:00 8 2007-11-09 00:00:00 9 2007-11-10 00:00:00 10 2007-11-11 00:00:00 11 2007-11-12 00:00:00 12 2007-11-13 00:00:00 13 2007-11-14 00:00:00 14 2007-11-15 00:00:00 ... 351 2008-10-17 00:00:00 352 2008-10-18 00:00:00 353 2008-10-19 00:00:00 354 2008-10-20 00:00:00 355 2008-10-21 00:00:00 356 2008-10-22 00:00:00 357 2008-10-23 00:00:00 358 2008-10-24 00:00:00 359 2008-10-25 00:00:00 360 2008-10-26 00:00:00 361 2008-10-27 00:00:00 362 2008-10-28 00:00:00 363 2008-10-29 00:00:00 364 2008-10-30 00:00:00 365 2008-10-31 00:00:00 Name: Date, Length: 366, dtype: datetime64[ns] 0 7 1 3 2 7 3 7 4 7 5 5 6 6 7 7 8 7 9 1 10 2 11 3 12 1 13 4 14 1 ... 351 1 352 7 353 6 354 1 355 5 356 5 357 2 358 8 359 3 360 8 361 3 362 1 363 2 364 7 365 1 Name: Cloud3pm, Length: 366, dtype: int64