import pandas as pd
import matplotlib.pyplot as plt
# display plots in the notebook
%matplotlib inline
# increase default figure and font sizes for easier viewing
plt.rcParams['figure.figsize'] = (8, 6)
plt.rcParams['font.size'] = 14
# load the drinks dataset, replacing the file's header row with shorter column names
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/drinks.csv'
drink_cols = ['country', 'beer', 'spirit', 'wine', 'liters', 'continent']
# na_filter=False turns off missing-value detection, so the continent code 'NA'
# is read as an ordinary string instead of being converted to NaN
drinks = pd.read_csv(
    url,
    header=0,
    names=drink_cols,
    na_filter=False,
)
# sort the beer column and mentally split it into 3 groups
# (Series.order() has been removed from pandas; sort_values() is its replacement)
drinks.beer.sort_values().values
array([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 3, 5, 5, 5, 5, 5, 6, 6, 6, 6, 8, 8, 8, 9, 9, 9, 9, 12, 13, 15, 15, 16, 16, 17, 18, 19, 19, 20, 20, 21, 21, 21, 21, 22, 23, 25, 25, 25, 25, 26, 28, 31, 31, 31, 31, 32, 32, 34, 36, 36, 36, 37, 42, 42, 43, 44, 45, 47, 49, 51, 51, 52, 52, 52, 53, 56, 56, 57, 58, 60, 62, 62, 63, 64, 69, 71, 76, 76, 77, 77, 77, 78, 79, 82, 82, 85, 88, 89, 90, 92, 93, 93, 98, 99, 102, 105, 106, 109, 111, 115, 120, 122, 124, 127, 128, 130, 133, 140, 142, 143, 144, 147, 149, 149, 152, 157, 159, 162, 163, 167, 169, 171, 173, 185, 188, 192, 193, 193, 194, 194, 196, 197, 199, 203, 206, 213, 217, 219, 224, 224, 225, 230, 231, 233, 234, 236, 238, 240, 245, 245, 247, 249, 251, 261, 263, 263, 270, 279, 281, 283, 284, 285, 295, 297, 306, 313, 333, 343, 343, 346, 347, 361, 376], dtype=int64)
# histogram of beer servings using three bins, to compare with the sorted values
drinks['beer'].plot.hist(bins=3)
<matplotlib.axes._subplots.AxesSubplot at 0xbfb55c0>
# same histogram with 20 bins for a finer-grained view
drinks['beer'].plot.hist(bins=20)
<matplotlib.axes._subplots.AxesSubplot at 0xc1c7e10>
# 20-bin histogram again, this time with a title and axis labels
drinks['beer'].plot.hist(bins=20, title='Histogram of Beer Servings')
plt.xlabel('Beer Servings')
plt.ylabel('Frequency')
<matplotlib.text.Text at 0xc3936d8>
# density plot (a smoothed version of the histogram), x axis limited to 0-500
drinks['beer'].plot.density(xlim=(0, 500))
<matplotlib.axes._subplots.AxesSubplot at 0xc6bf898>
# select the beer and wine columns and sort by beer
# (DataFrame.sort() has been removed from pandas; sort_values() is its replacement)
drinks[['beer', 'wine']].sort_values('beer').values
array([[ 0, 0], [ 0, 74], [ 0, 0], [ 0, 0], [ 0, 0], [ 0, 0], [ 0, 0], [ 0, 0], [ 0, 0], [ 0, 0], [ 0, 0], [ 0, 0], [ 0, 0], [ 0, 0], [ 0, 0], [ 1, 7], [ 1, 1], [ 1, 4], [ 1, 1], [ 2, 0], [ 3, 1], [ 5, 0], [ 5, 0], [ 5, 16], [ 5, 1], [ 5, 0], [ 6, 1], [ 6, 0], [ 6, 1], [ 6, 9], [ 8, 0], [ 8, 1], [ 8, 1], [ 9, 2], [ 9, 0], [ 9, 7], [ 9, 0], [ 12, 10], [ 13, 0], [ 15, 3], [ 15, 1], [ 16, 5], [ 16, 0], [ 17, 1], [ 18, 0], [ 19, 32], [ 19, 2], [ 20, 0], [ 20, 31], [ 21, 11], [ 21, 11], [ 21, 5], [ 21, 1], [ 22, 1], [ 23, 0], [ 25, 8], [ 25, 14], [ 25, 2], [ 25, 7], [ 26, 4], [ 28, 21], [ 31, 128], [ 31, 6], [ 31, 10], [ 31, 1], [ 32, 4], [ 32, 1], [ 34, 13], [ 36, 19], [ 36, 5], [ 36, 1], [ 37, 7], [ 42, 2], [ 42, 7], [ 43, 0], [ 44, 1], [ 45, 0], [ 47, 5], [ 49, 8], [ 51, 20], [ 51, 7], [ 52, 2], [ 52, 149], [ 52, 26], [ 53, 2], [ 56, 140], [ 56, 1], [ 57, 1], [ 58, 2], [ 60, 11], [ 62, 18], [ 62, 123], [ 63, 9], [ 64, 4], [ 69, 2], [ 71, 1], [ 76, 8], [ 76, 9], [ 77, 8], [ 77, 16], [ 77, 1], [ 78, 1], [ 79, 8], [ 82, 9], [ 82, 0], [ 85, 237], [ 88, 0], [ 89, 54], [ 90, 2], [ 92, 233], [ 93, 5], [ 93, 1], [ 98, 18], [ 99, 1], [102, 45], [105, 24], [106, 86], [109, 18], [111, 1], [115, 220], [120, 11], [122, 51], [124, 12], [127, 370], [128, 7], [130, 172], [133, 218], [140, 9], [142, 42], [143, 36], [144, 16], [147, 4], [149, 120], [149, 11], [152, 186], [157, 51], [159, 3], [162, 3], [163, 21], [167, 8], [169, 129], [171, 71], [173, 35], [185, 280], [188, 7], [192, 113], [193, 9], [193, 221], [194, 339], [194, 32], [196, 116], [197, 7], [199, 28], [203, 175], [206, 45], [213, 74], [217, 45], [219, 195], [224, 59], [224, 278], [225, 81], [230, 254], [231, 94], [233, 78], [234, 185], [236, 271], [238, 5], [240, 100], [245, 312], [245, 16], [247, 73], [249, 84], [251, 190], [261, 212], [263, 97], [263, 8], [270, 276], [279, 191], [281, 62], [283, 127], [284, 112], [285, 18], [295, 212], [297, 167], [306, 23], [313, 165], [333, 3], [343, 56], [343, 56], [346, 175], [347, 
59], [361, 134], [376, 1]], dtype=int64)
# scatter plot of wine servings against beer servings
drinks.plot.scatter(x='beer', y='wine')
<matplotlib.axes._subplots.AxesSubplot at 0xdbbe390>
# make the points semi-transparent so overlapping points stand out
drinks.plot.scatter(x='beer', y='wine', alpha=0.3)
<matplotlib.axes._subplots.AxesSubplot at 0xdcbabe0>
# color each point by its spirit servings using the 'Blues' colormap
drinks.plot.scatter(x='beer', y='wine', c='spirit', colormap='Blues')
<matplotlib.axes._subplots.AxesSubplot at 0xdfc2048>
# scatter matrix of three numerical columns
# (scatter_matrix now lives in pandas.plotting; the top-level pd.scatter_matrix was removed)
pd.plotting.scatter_matrix(drinks[['beer', 'spirit', 'wine']])
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000000018B46AC8>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000018D82240>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000019098710>], [<matplotlib.axes._subplots.AxesSubplot object at 0x00000000190B8320>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000191DDA90>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000191EBA58>], [<matplotlib.axes._subplots.AxesSubplot object at 0x000000001932E0F0>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000193565C0>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000019479400>]], dtype=object)
# same scatter matrix with a larger figure size
# (scatter_matrix now lives in pandas.plotting; the top-level pd.scatter_matrix was removed)
pd.plotting.scatter_matrix(drinks[['beer', 'spirit', 'wine']], figsize=(10, 8))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x00000000196C0AC8>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000019D2B320>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000019D53160>], [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000019EAE8D0>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000019FD4940>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000019FE76D8>], [<matplotlib.axes._subplots.AxesSubplot object at 0x000000001A11E2B0>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001A144780>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001A2A52B0>]], dtype=object)
# number of rows (countries) per continent
drinks['continent'].value_counts()
AF 53 EU 45 AS 44 NA 23 OC 16 SA 12 dtype: int64
# show the same per-continent counts as a bar plot
drinks['continent'].value_counts().plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x1a5ef3c8>
# calculate the mean alcohol amounts for each continent
# (numeric_only=True keeps the string column 'country' out of the aggregation;
# pandas 2.0+ raises a TypeError instead of silently dropping it)
drinks.groupby('continent').mean(numeric_only=True)
beer | spirit | wine | liters | |
---|---|---|---|---|
continent | ||||
AF | 61.471698 | 16.339623 | 16.264151 | 3.007547 |
AS | 37.045455 | 60.840909 | 9.068182 | 2.170455 |
EU | 193.777778 | 132.555556 | 142.222222 | 8.617778 |
NA | 145.434783 | 165.739130 | 24.521739 | 5.995652 |
OC | 89.687500 | 58.437500 | 35.625000 | 3.381250 |
SA | 175.083333 | 114.750000 | 62.416667 | 6.308333 |
# side-by-side bar plots of the per-continent means
# (numeric_only=True keeps the string column 'country' out of the aggregation;
# pandas 2.0+ raises a TypeError instead of silently dropping it)
drinks.groupby('continent').mean(numeric_only=True).plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x1a5ef358>
# drop the liters column so only the three serving columns are plotted
# (numeric_only=True keeps the string column 'country' out of the aggregation;
# pandas 2.0+ raises a TypeError instead of silently dropping it)
drinks.groupby('continent').mean(numeric_only=True).drop('liters', axis=1).plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x1a8a4080>
# stacked bar plots of the per-continent means (liters excluded)
# (numeric_only=True keeps the string column 'country' out of the aggregation;
# pandas 2.0+ raises a TypeError instead of silently dropping it)
drinks.groupby('continent').mean(numeric_only=True).drop('liters', axis=1).plot(kind='bar', stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x1ae85d30>
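Five-number summary: minimum, first quartile (Q1), median (Q2), third quartile (Q3), maximum
(More useful than mean and standard deviation for describing skewed distributions)
Interquartile Range (IQR) = Q3 - Q1
Outliers: values below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR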
# sort the spirit column
# (Series.order() has been removed from pandas; sort_values() is its replacement)
drinks.spirit.sort_values().values
array([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 9, 11, 11, 12, 13, 15, 15, 16, 16, 18, 18, 18, 18, 19, 21, 21, 22, 22, 25, 25, 27, 29, 31, 31, 34, 35, 35, 35, 35, 38, 39, 41, 41, 42, 42, 44, 46, 50, 51, 55, 56, 57, 60, 61, 63, 63, 65, 67, 68, 69, 69, 69, 71, 71, 72, 74, 75, 76, 76, 79, 81, 84, 87, 87, 88, 97, 97, 98, 98, 100, 100, 100, 100, 101, 104, 104, 112, 114, 114, 114, 117, 117, 118, 118, 122, 122, 124, 126, 128, 131, 132, 133, 133, 135, 137, 138, 145, 147, 151, 152, 154, 156, 157, 158, 160, 170, 173, 173, 176, 178, 179, 186, 189, 192, 194, 200, 202, 205, 215, 215, 216, 221, 226, 237, 244, 246, 252, 254, 258, 286, 293, 302, 315, 326, 326, 373, 438], dtype=int64)
# summary statistics for spirit; min, 25%, 50%, 75% and max form the "five-number summary"
drinks['spirit'].describe()
count 193.000000 mean 80.994819 std 88.284312 min 0.000000 25% 4.000000 50% 56.000000 75% 128.000000 max 438.000000 Name: spirit, dtype: float64
# box plot of spirit servings, to compare with the summary statistics
drinks['spirit'].plot.box()
<matplotlib.axes._subplots.AxesSubplot at 0x1b301a20>
# box plots for several variables at once (liters is dropped first)
drinks.drop(columns='liters').plot.box()
<matplotlib.axes._subplots.AxesSubplot at 0x1b6d47f0>
# load the ufo sightings dataset
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/ufo.csv'
ufo = pd.read_csv(url)
# convert the Time column to datetimes, then store each report's year in its own column
ufo['Time'] = pd.to_datetime(ufo['Time'])
ufo['Year'] = ufo['Time'].dt.year
# number of ufo reports per year, ordered by year rather than by count
ufo['Year'].value_counts().sort_index()
1930 2 1931 2 1933 1 1934 1 1935 1 1936 2 1937 2 1939 3 1941 2 1942 3 1943 5 1944 8 1945 9 1946 8 1947 41 1948 9 1949 19 1950 31 1951 21 1952 52 1953 36 1954 55 1955 33 1956 46 1957 78 1958 53 1959 57 1960 67 1961 50 1962 72 ... 1985 211 1986 186 1987 210 1988 232 1989 247 1990 237 1991 220 1992 245 1993 292 1994 406 1995 1344 1996 851 1997 1237 1998 1743 1999 2774 2000 2635 2001 2925 2002 2933 2003 3507 2004 3850 2005 3787 2006 3445 2007 4058 2008 4655 2009 4251 2010 4154 2011 5089 2012 7263 2013 7003 2014 5382 dtype: int64
# line plot of the yearly report counts
ufo['Year'].value_counts().sort_index().plot.line()
<matplotlib.axes._subplots.AxesSubplot at 0x1b6e74a8>
# counter-example: a line plot is misleading when the categories have no logical ordering
drinks['continent'].value_counts().plot.line()
<matplotlib.axes._subplots.AxesSubplot at 0x1c2d0b38>
# reminder: box plot of beer servings across all countries
drinks['beer'].plot.box()
<matplotlib.axes._subplots.AxesSubplot at 0x1969d898>
# box plot of beer servings, with one box per continent
drinks.boxplot(column='beer', by='continent')
<matplotlib.axes._subplots.AxesSubplot at 0x1c2dfcf8>
# box plots of all numeric columns, each grouped by continent
# (no column= argument, so every numeric column gets its own subplot)
drinks.boxplot(by='continent')
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000000001C27B128>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001C86DC18>], [<matplotlib.axes._subplots.AxesSubplot object at 0x000000001CB19668>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001CB38208>]], dtype=object)
# reminder: histogram of beer servings with the default number of bins
drinks['beer'].plot.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x1c7be550>
# histogram of beer servings, one subplot per continent
drinks.hist(column='beer', by='continent')
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000000001F838390>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001FA829B0>], [<matplotlib.axes._subplots.AxesSubplot object at 0x000000001FC287F0>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001CB23D68>], [<matplotlib.axes._subplots.AxesSubplot object at 0x000000001FC5C518>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001FDE2278>]], dtype=object)
# share the x axes so every continent's subplot covers the same range of beer servings
drinks.hist(column='beer', by='continent', sharex=True)
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000000001F865F60>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000204A8320>], [<matplotlib.axes._subplots.AxesSubplot object at 0x00000000202DC0B8>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000202F6860>], [<matplotlib.axes._subplots.AxesSubplot object at 0x000000002070E828>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000020721518>]], dtype=object)
# share both the x and y axes so bar heights are also directly comparable across continents
drinks.hist(column='beer', by='continent', sharex=True, sharey=True)
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000000020DF74A8>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000020CE1128>], [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000020D03D68>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000021392588>], [<matplotlib.axes._subplots.AxesSubplot object at 0x00000000213BB080>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000214E2550>]], dtype=object)
# change the layout: arrange the subplots in a grid of 2 rows by 3 columns (x axes shared)
drinks.hist(column='beer', by='continent', sharex=True, layout=(2, 3))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000000002103D358>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000021A4B470>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000021A71208>], [<matplotlib.axes._subplots.AxesSubplot object at 0x00000000221EAB00>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000022210AC8>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000223237B8>]], dtype=object)
# draw the labeled beer histogram, then write the current figure to a PNG file
drinks['beer'].plot.hist(bins=20, title='Histogram of Beer Servings')
plt.xlabel('Beer Servings')
plt.ylabel('Frequency')
plt.savefig('beer_histogram.png')
# list the names of the plot styles matplotlib has available
plt.style.available
[u'dark_background', u'bmh', u'grayscale', u'ggplot', u'fivethirtyeight']
# switch to the 'ggplot' style (one of the names listed by plt.style.available)
plt.style.use('ggplot')