A Gallery of Statistical Graphs in Matplotlib (Matplotlib Defaults)¶

For hints on improving on matplotlib's default style, see this alternate notebook

In [1]:

%matplotlib inline
from urllib import urlopen

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from matplotlib import rcParams
rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150

Example Data¶

In [2]:

file = urlopen('https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/ggplot2/diamonds.csv')
diamonds = pd.read_csv(file)

file = urlopen('http://www.columbia.edu/~cjd11/charles_dimaggio/DIRE/resources/R/titanic.csv')
titanic = pd.read_csv(file)

In [3]:

change = [23.2, 22.7, 19.7, 13.9, 13.1, 12.8, 12.7,
        12.6, 12.0, 11.5, 10.8, 10.4, 10.4, 9.8, 9.2,
        9.2, 8.8, 7.7, 6.9, 6.9, 6.4, 5.6, 5.3, 5.3, 5.2, 4.9,
        4.8, 4.6, 3.6, 3.1, 0.7, -.3, -.7, -1.2, -1.5, -1.7, 
        -1.7, -1.8, -2, -2.3, -2.4, -3.6, -3.7,
        -4.9, -6.5, -6.6, -11.6, -14.8, -17.6, -23.1]
city = ['Philadelphia', 'Tucson', 'Kansas City, MO',
        'El Paso', 'Portland, Ore.', 'New York', 'Dallas',
        'Columbus', 'Mesa', 'Austin', 'Atlanta', 'Fort Worth',
        'Miami', 'Houston', 'Chicago', 'Oakland', 'Virginia Beach',
        'Baltimore', 'Denver', 'Detroit', 'San Antonio', 'Phoenix',
        'Oklahoma City', 'Indianapolis', 'Milwaukee', 'Sacramento',
        'Washington, D.C.', 'Colorado Springs', 'Honolulu', 'Nashville',
        'Jacksonville', 'Louisville', 'Seattle', 
        'Memphis', 'Fresno', 'Boston', 'Mineappolis',
        'San Jose', 'Tulsa', 'Charlotte', 'San Diego', 'Los Angeles',
        'Long Beach', 'Cleveland', 'San Francisco', 'Albuquerque',
        'Arlington, TX', 'Omaha', 'Wichita', 'Las Vegas']

grad = pd.DataFrame({'change' : change, 'city': city})

Bar Chart¶

In [4]:

plt.figure(figsize=(3, 8))

change = grad.change[grad.change > 0]
city = grad.city[grad.change > 0]
pos = np.arange(len(change))

plt.title('1995-2005 Change in HS graduation rate')
plt.barh(pos, change)

#add the numbers to the side of each bar
for p, c, ch in zip(pos, city, change):
    plt.annotate(str(ch), xy=(ch + 1, p + .5), va='center')

#set plot limits
plt.ylim(pos.max() + 1, pos.min() - 1)
plt.xlim(0, 30)

Out[4]:

(0, 30)

In [5]:

change = grad.change[grad.change < 0].values
city = grad.city[grad.change < 0].values

pos = np.arange(len(change))
red = (0.78, 0.22, 0.18) # RGB triplet

plt.figure(figsize=(3, 6), dpi=200)
plt.barh(pos, change, color=red)
plt.yticks(pos + .5, city)

#add the numbers to the side of each bar
for p, c, ch in zip(pos, city, change):
    plt.annotate(str(ch), xy=(ch - 1, p + .5), va='center', ha='right')

plt.ylim(pos.max() + 1, pos.min()- .5)
plt.xlim(-30, 0)
plt.title('1995-2005 Change in HS graduation rate')

Out[5]:

<matplotlib.text.Text at 0x10a9636d0>

In [6]:

years = np.arange(2004, 2009)
heights = np.random.random(years.shape) * 7000 + 3000

box_colors = ['r', 'g', 'b', 'c', 'm']

plt.bar(years - .4, heights, color=box_colors)
plt.yticks([2000, 4000, 6000, 8000])

for x, y in zip(years, heights):
    plt.annotate("%i" % y, (x, y + 200), ha='center')

Dot Plots¶

Scatterplots¶

In [7]:

plt.figure(tight_layout=True, figsize=(6, 4))
plt.subplot(121)
plt.scatter(diamonds.carat, diamonds.price, color='k')
plt.ylim(0, diamonds.price.max())
plt.xlim(0, 5)
plt.xlabel("Carat")
plt.ylabel("Price")

plt.subplot(122)
plt.scatter(diamonds.carat, diamonds.price, color='k', alpha=.01)
plt.ylim(0, diamonds.price.max())
plt.xlim(0, 5)

plt.xlabel("Carat")
plt.ylabel("Price")

Out[7]:

<matplotlib.text.Text at 0x102157d10>

/Users/beaumont/anaconda/lib/python2.7/site-packages/matplotlib/figure.py:1595: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "

Trend Lines¶

In [8]:

# the raw data
x = diamonds.carat[diamonds.carat < 2]
y = diamonds.price[diamonds.carat < 2]
plt.plot(x, y, 'o', mec='none', alpha=.05)

#fit and overplot a 2nd order polynomial
params = np.polyfit(x, y, 2)
xp = np.linspace(x.min(), 2, 20)
yp = np.polyval(params, xp)
plt.plot(xp, yp, 'k')

#overplot an error band
sig = np.std(y - np.polyval(params, x))
plt.fill_between(xp, yp - sig, yp + sig, 
                 color='k', alpha=0.2)

plt.xlabel("Carat")
plt.ylabel("Price")
plt.xlim(0, 2)

Out[8]:

(0, 2)

Bubble Charts¶

Pie Charts¶

In [9]:

t = titanic.groupby(['pclass']).size()
print t

plt.subplot(aspect=True)
plt.pie(t, labels=t.index.values, autopct='%i%%')
plt.title("Passenger Class on the Titanic")

pclass
1st       323
2nd       277
3rd       709
dtype: int64

Out[9]:

<matplotlib.text.Text at 0x10bd87c50>

Donut Charts¶

Stacked Bar Chart¶

In [10]:

tclass = titanic.groupby(['pclass', 'survived']).size().unstack()
print tclass

red, blue = '#B2182B', '#2166AC'

plt.subplot(121)
plt.bar([0, 1, 2], tclass[0], color=red, label='Died')
plt.bar([0, 1, 2], tclass[1], bottom=tclass[0], color=blue, label='Survived')
plt.xticks([0.5, 1.5, 2.5], ['1st Class', '2nd Class', '3rd Class'], rotation='horizontal')
plt.ylabel("Number")
plt.xlabel("")
plt.legend(loc='upper left')

#normalize each row by transposing, normalizing each column, and un-transposing
tclass = (1. * tclass.T / tclass.T.sum()).T

plt.subplot(122)
plt.bar([0, 1, 2], tclass[0], color=red, label='Died')
plt.bar([0, 1, 2], tclass[1], bottom=tclass[0], color=blue, label='Survived')
plt.xticks([0.5, 1.5, 2.5], ['1st Class', '2nd Class', '3rd Class'], rotation='horizontal')
plt.ylabel("Fraction")
plt.xlabel("")

plt.show()

survived    0    1
pclass            
1st       123  200
2nd       158  119
3rd       528  181

Small Multiples¶

Waterfall Chart¶

Stacked Area Chart¶

Histogram¶

In [11]:

plt.hist(diamonds.depth, bins=np.linspace(50, 70, 200))
plt.xlabel("Depth")
plt.xlim(55, 70)
plt.show()


plt.hist(diamonds.depth, bins=np.linspace(50, 70, 40))
plt.xlabel("Depth")
plt.xlim(55, 70)
plt.show()

Density Plots¶

In [12]:

#KernelDensity objects estimate the (log of the) density of points
#see http://scikit-learn.org/stable/modules/density.html
from sklearn.neighbors.kde import KernelDensity

age = titanic.age.dropna().values  # drop missing values, turn to normal numpy array
age = age.reshape(-1, 1)  # scikit-learn expects data matrices of shape [ndata, ndim]

kde = KernelDensity(bandwidth=2).fit(age)
x = np.linspace(age.min(), age.max(), 100).reshape(-1, 1)
density = np.exp(kde.score_samples(x))

plt.plot(x, density)
plt.plot(age, age * 0, 'ok', alpha=.03)
plt.ylim(-.001, .035)

plt.xlabel("Age")
plt.ylabel("Density")

Out[12]:

<matplotlib.text.Text at 0x10bb74610>

Box and Whisker Plots¶

In [13]:

male_age = titanic.age[titanic.sex == 'male']
female_age = titanic.age[titanic.sex == 'female']
                       
plt.boxplot([male_age, female_age])
plt.ylabel("Titanic Passanger Age")
plt.xticks([1, 2], ["Male", "Female"])
plt.ylim(0, 85)

Out[13]:

(0, 85)

Heat Maps (2D Density Plots)¶

In [14]:

from sklearn.datasets import make_blobs
from matplotlib.colors import LogNorm

X, _ = make_blobs(n_samples=20000, centers=3, random_state=42, cluster_std=2)

plt.scatter(X[:, 0], X[:, 1], 2, color='k')
plt.title("Points")
plt.xlim(-15, 15)
plt.ylim(-15, 15)
plt.gca().set_position([.125, .125, .62, .775])
plt.show()

plt.hist2d(X[:, 0], X[:, 1], bins=40, norm=LogNorm())
ax = plt.gca()
plt.title("Heatmap")
plt.colorbar()
plt.xlim(-15, 15)
plt.ylim(-15, 15)
plt.show()