"""Plotting walkthrough with pandas and matplotlib.

Draws a few random-number demos, then explores the Titanic, baseball and
cervical-dystonia data sets, which are read from the ``data/`` directory
relative to the working directory.

NOTE(review): the script still targets the legacy APIs it was written
against (``pandas.tools.rplot``, ``pd.scatter_matrix``, ``hist(normed=...)``).
These are believed to be unavailable in current pandas/matplotlib releases;
confirm the pinned versions before upgrading.
"""
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import kurtosis

# Set some Pandas options
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 25)

# --- Plain matplotlib ------------------------------------------------------

plt.plot(np.random.normal(size=100), np.random.normal(size=100), 'ro')

# rc_context limits the font overrides to the figure built inside the block.
with mpl.rc_context(rc={'font.family': 'serif',
                        'font.weight': 'bold',
                        'font.size': 8}):
    fig = plt.figure(figsize=(6, 3))

    ax1 = fig.add_subplot(121)
    ax1.set_xlabel('some random numbers')
    ax1.set_ylabel('more random numbers')
    ax1.set_title("Random scatterplot")
    plt.plot(np.random.normal(size=100), np.random.normal(size=100), 'r.')

    ax2 = fig.add_subplot(122)
    plt.hist(np.random.normal(size=100), bins=15)
    ax2.set_xlabel('sample')
    # Histogram bars are per-bin counts, not a cumulative sum.
    ax2.set_ylabel('count')
    ax2.set_title("Normal distribution")

    plt.tight_layout()
    plt.savefig("normalvars.png", dpi=150)

# --- Plotting straight from pandas objects ---------------------------------

normals = pd.Series(np.random.normal(size=10))
normals.plot()
normals.cumsum().plot(grid=False)

variables = pd.DataFrame({'normal': np.random.normal(size=100),
                          'gamma': np.random.gamma(1, size=100),
                          'poisson': np.random.poisson(size=100)})
variables.cumsum(0).plot()
variables.cumsum(0).plot(subplots=True)
variables.cumsum(0).plot(secondary_y='normal')

# One panel per variable, laid out side by side.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12, 4))
for i, var in enumerate(['normal', 'gamma', 'poisson']):
    variables[var].cumsum(0).plot(ax=axes[i], title=var)
axes[0].set_ylabel('cumulative sum')

# --- Bar plots: Titanic survival -------------------------------------------

titanic = pd.read_excel("data/titanic.xls", "titanic")
# Notebook-style preview; a bare expression shows nothing when run as a script.
titanic.head()

titanic.groupby('pclass').survived.sum().plot(kind='bar')
titanic.groupby(['sex', 'pclass']).survived.sum().plot(kind='barh')

death_counts = pd.crosstab([titanic.pclass, titanic.sex],
                           titanic.survived.astype(bool))
death_counts.plot(kind='bar', stacked=True, color=['black', 'gold'],
                  grid=False)
# Divide each row by its total so the stacked bars show proportions.
death_counts.div(death_counts.sum(1).astype(float), axis=0).plot(
    kind='barh', stacked=True, color=['black', 'gold'])

# --- Histograms and density estimates --------------------------------------

titanic.fare.hist(grid=False)
titanic.fare.hist(bins=30)


def sturges(n):
    """Return ``int(log2(n) + 1)`` as a histogram bin count for *n* points."""
    return int(np.log2(n) + 1)


def square_root(n):
    """Return ``int(sqrt(n))`` as a histogram bin count for *n* points."""
    return int(np.sqrt(n))


def doanes(data):
    """Return a kurtosis-adjusted histogram bin count for *data*.

    Computes ``int(1 + log(n) + log(1 + kurtosis(data) * sqrt(n / 6)))``
    with natural logarithms, where ``n = len(data)``.

    NOTE(review): the arithmetic is kept exactly as originally written; it
    appears to differ from the published Doane rule (log2, skewness) --
    confirm which form is intended.
    """
    n_obs = len(data)
    return int(1 + np.log(n_obs)
               + np.log(1 + kurtosis(data) * (n_obs / 6.) ** 0.5))


n = len(titanic)
fares = titanic.fare.dropna()
fare_bins = doanes(fares)

# Notebook-style comparison of the three rules (no output in a script).
sturges(n), square_root(n), fare_bins

titanic.fare.hist(bins=fare_bins)
fares.plot(kind='kde', xlim=(0, 600))

# Normalised histogram with the density estimate drawn on top.
titanic.fare.hist(bins=fare_bins, normed=True, color='lightseagreen')
fares.plot(kind='kde', xlim=(0, 600), style='r--')

# --- Boxplots ---------------------------------------------------------------

titanic.boxplot(column='fare', by='pclass', grid=False)

bp = titanic.boxplot(column='age', by='pclass', grid=False)
for i in [1, 2, 3]:
    y = titanic.age[titanic.pclass == i].dropna()
    # Add some random "jitter" to the x-axis
    x = np.random.normal(i, 0.04, size=len(y))
    plt.plot(x, y, 'r.', alpha=0.2)

fare_by_class = titanic.groupby('pclass')['fare']
fare_by_class.mean().plot(kind='bar', yerr=fare_by_class.std())

# Mean +/- sd bars with the raw observations overlaid as points.
data1 = [150, 155, 175, 200, 245, 255, 395, 300, 305, 320, 375, 400, 420,
         430, 440]
data2 = [225, 380]

fake_data = pd.DataFrame([data1, data2]).transpose()
p = fake_data.mean().plot(kind='bar', yerr=fake_data.std(), grid=False)

x1, x2 = p.xaxis.get_majorticklocs()
plt.plot(np.random.normal(x1, 0.01, size=len(data1)), data1, 'ro')
plt.plot([x2] * len(data2), data2, 'ro')

# --- Scatterplots: baseball -------------------------------------------------

baseball = pd.read_csv("data/baseball.csv")
baseball.head()

plt.scatter(baseball.ab, baseball.h)
plt.xlim(0, 700)
plt.ylim(0, 200)

# Marker size scaled by the `hr` column.
plt.scatter(baseball.ab, baseball.h, s=baseball.hr * 10, alpha=0.5)
plt.xlim(0, 700)
plt.ylim(0, 200)

# Marker colour mapped from the `hr` column.
plt.scatter(baseball.ab, baseball.h, c=baseball.hr, s=40, cmap='hot')
plt.xlim(0, 700)
plt.ylim(0, 200)

_ = pd.scatter_matrix(baseball.loc[:, 'r':'sb'], figsize=(12, 8),
                      diagonal='kde')

# --- Trellis plots (legacy pandas rplot) ------------------------------------

# Imported here rather than at the top so the sections above do not depend
# on this legacy module being importable.
from pandas.tools.rplot import (
    GeomDensity,
    GeomPoint,
    GeomPolyFit,
    GeomScatter,
    RPlot,
    ScaleGradient,
    ScaleShape,
    ScaleSize,
    TrellisGrid,
)

# Keep only the rows where both age and fare are present.
titanic = titanic[titanic.age.notnull() & titanic.fare.notnull()]

tp = RPlot(titanic, x='age')
tp.add(TrellisGrid(['pclass', 'sex']))
tp.add(GeomDensity())
_ = tp.render(plt.gcf())

cdystonia = pd.read_csv("data/cdystonia.csv", index_col=None)
cdystonia.head()

plt.figure(figsize=(12, 12))
bbp = RPlot(cdystonia, x='age', y='twstrs')
bbp.add(TrellisGrid(['week', 'treat']))
bbp.add(GeomScatter())
bbp.add(GeomPolyFit(degree=2))
_ = bbp.render(plt.gcf())

# Cast `site` to float before it is passed to ScaleGradient below.
cdystonia['site'] = cdystonia.site.astype(float)

plt.figure(figsize=(6, 6))
cp = RPlot(cdystonia, x='age', y='twstrs')
cp.add(GeomPoint(
    colour=ScaleGradient('site',
                         colour1=(1.0, 1.0, 0.5),
                         colour2=(1.0, 0.0, 0.0)),
    size=ScaleSize('week', min_size=10.0, max_size=200.0),
    shape=ScaleShape('treat')))
_ = cp.render(plt.gcf())